Ebooks: handle multiple dupe options
author Dan Scott <dscott@laurentian.ca>
Thu, 4 Oct 2012 21:17:04 +0000 (17:17 -0400)
committer Dan Scott <dscott@laurentian.ca>
Tue, 7 May 2013 18:58:12 +0000 (14:58 -0400)
process_fields() modifies the base record, and we were inadvertently
running the base record through the wringer every time we checked for
dupes - which would be evidenced by the addition of one 506 and 590 per
consortial member per dupe check we ran.

In the process of cleaning this up, try to dedupe some of our own code
(heh). We can probably go further but this is a good start.

Signed-off-by: Dan Scott <dscott@laurentian.ca>
tools/ebooks/prep_ebook_records.py

index 21b7a46..ee11a94 100644 (file)
@@ -15,6 +15,7 @@ be accommodated in batch load.
 """
 
 import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2, json
+import copy
 from urllib import quote
 from datetime import date
 from BeautifulSoup import BeautifulSoup
@@ -397,10 +398,7 @@ def process_records(options):
 
     global RECORD_COUNT
     global DUP_COUNT
-    sample = ''
-    duplicate = ''
-    tcn = ''
-    url = ''
+    files = {}
 
     try:
         reader = pymarc.MARCReader(
@@ -409,31 +407,15 @@ def process_records(options):
     except Exception, ex:
         print("Could not open input file [%s]" % options['input'])
 
-    try:
-        writer = pymarc.MARCWriter(open(options['output'], mode='wb'))
-    except Exception, ex:
-        print("Could not open output file [%s]" % options['output'])
+    for fname in ('duplicate', 'tcn', 'url', 'sample', 'output'):
+        if fname in options:
+            _fname = options[fname]
+            try:
+                files[fname] = pymarc.MARCWriter(open(_fname, mode='wb'))
+            except Exception, ex:
+                print("Could not open output file [%s]: %s" % (_fname, ex))
 
-    if 'duplicate' in options:
-        try:
-            duplicate = pymarc.MARCWriter(open(options['duplicate'], mode='wb'))
-        except Exception, ex:
-            print("Could not open output file [%s]" % options['duplicate'])
-
-    if 'tcn' in options:
-        try:
-            tcn = pymarc.MARCWriter(open(options['tcn'], mode='wb'))
-        except Exception, ex:
-            print("Could not open output file [%s]" % options['tcn'])
-
-    if 'url' in options:
-        try:
-            url = pymarc.MARCWriter(open(options['url'], mode='wb'))
-        except Exception, ex:
-            print("Could not open output file [%s]" % options['url'])
-
-    if 'sample' in options:
-        sample = pymarc.MARCWriter(open(options['sample'], mode='wb'))
+    writer = files['output']
 
     for record in reader:
         RECORD_COUNT += 1
@@ -445,41 +427,42 @@ def process_records(options):
             else:
                 print ("%d - %s\n" % (RECORD_COUNT, record['856']))
 
-
-            new_record = ''
-            dup_flag = False
-
-            if duplicate:
-                bib_id, dup_flag = isbn_check(record)
-                new_record = process_fields(record, options, bib_id, dup_flag)
-                if dup_flag:
-                    duplicate.write(new_record)
-            if tcn:
-                bib_id, dup_flag = tcn_check(record)
-                new_record = process_fields(record, options, bib_id, dup_flag)
-                if dup_flag:
-                    tcn.write(new_record)
-            if url:
-                bib_id, dup_flag = url_check(record, options)
-                new_record = process_fields(record, options, bib_id, dup_flag)
-                if dup_flag:
-                    url.write(new_record)
-
-            if not dup_flag:
-                new_record = process_fields(record, options, 0, False)
-            else:
+            dupe_flags = {}
+
+            if files['duplicate']:
+                tmp_record = process_fields(copy.deepcopy(record), options)
+                bib_id, dupe_flags['isbn'] = isbn_check(tmp_record)
+                if dupe_flags['isbn']:
+                    tmp_record = add_dupe_field(tmp_record, bib_id)
+                    files['duplicate'].write(tmp_record)
+
+            if files['tcn']:
+                tmp_record = process_fields(copy.deepcopy(record), options)
+                bib_id, dupe_flags['tcn'] = tcn_check(tmp_record)
+                if dupe_flags['tcn']:
+                    tmp_record = add_dupe_field(tmp_record, bib_id)
+                    files['tcn'].write(tmp_record)
+
+            if files['url']:
+                tmp_record = process_fields(copy.deepcopy(record), options)
+                bib_id, dupe_flags['url'] = url_check(tmp_record, options)
+                if dupe_flags['url']:
+                    tmp_record = add_dupe_field(tmp_record, bib_id)
+                    files['url'].write(tmp_record)
+
+            if len(dupe_flags):
                 DUP_COUNT += 1
-
-            if new_record:
+            else:
+                new_record = process_fields(record, options)
                 writer.write(new_record)
-                if (sample and (
+                if (files['sample'] and (
                     (RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0)
                 )):
-                    sample.write(new_record)
+                    files['sample'].write(new_record)
         except Exception, ex:
             print("* Error processing record %d - %s" % (RECORD_COUNT, ex))
 
-def process_fields(record, options, bib_id, dup_flag):
+def process_fields(record, options):
     """Decide which fields to add, delete, and keep"""
 
     new_record = pymarc.Record(to_unicode=True, force_utf8=True)
@@ -498,17 +481,6 @@ def process_fields(record, options, bib_id, dup_flag):
         )
         record.add_field(note)
 
-    # 909
-    if dup_flag:
-        dup_value = str(bib_id)
-        dup = pymarc.Field(tag = '909',
-            indicators = [' ', ' '],
-            subfields = [
-                'a', dup_value
-            ]
-        )
-        record.add_field(dup)
-
     add_marc_source(record, options) # 598
     publisher = add_publisher(record, options) # 710
     add_restriction(record, options, publisher) # 506
@@ -556,6 +528,19 @@ def process_fields(record, options, bib_id, dup_flag):
 
     return new_record
 
+def add_dupe_field(record, bib_id):
+    """Add a 909 field marking the duplicate record"""
+
+    dup_value = str(bib_id)
+    dup = pymarc.Field(tag = '909',
+        indicators = [' ', ' '],
+        subfields = [
+            'a', dup_value
+        ]
+    )
+    record.add_field(dup)
+    return record
+
 def clean_diacritics(field):
     """
     Change specific patterns of bytes into other patterns of bytes