Ebooks: handle multiple dupe options

author Dan Scott <dscott@laurentian.ca>

Thu, 4 Oct 2012 21:17:04 +0000 (17:17 -0400)

committer Dan Scott <dscott@laurentian.ca>

Tue, 7 May 2013 18:58:12 +0000 (14:58 -0400)
author Dan Scott <dscott@laurentian.ca>
Thu, 4 Oct 2012 21:17:04 +0000 (17:17 -0400)
committer Dan Scott <dscott@laurentian.ca>
Tue, 7 May 2013 18:58:12 +0000 (14:58 -0400)
diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py

index 21b7a46..ee11a94 100644 (file)
--- a/tools/ebooks/prep_ebook_records.py
+++ b/tools/ebooks/prep_ebook_records.py
@@ -15,6 +15,7 @@ be accommodated in batch load.
  """
  
  import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2, json
+import copy
  from urllib import quote
  from datetime import date
  from BeautifulSoup import BeautifulSoup
@@ -397,10 +398,7 @@ def process_records(options):
  
      global RECORD_COUNT
      global DUP_COUNT
-    sample = ''
-    duplicate = ''
-    tcn = ''
-    url = ''
+    files = {}
  
      try:
          reader = pymarc.MARCReader(
@@ -409,31 +407,15 @@ def process_records(options):
      except Exception, ex:
          print("Could not open input file [%s]" % options['input'])
  
-    try:
-        writer = pymarc.MARCWriter(open(options['output'], mode='wb'))
-    except Exception, ex:
-        print("Could not open output file [%s]" % options['output'])
+    for fname in ('duplicate', 'tcn', 'url', 'sample', 'output'):
+        if fname in options:
+            _fname = options[fname]
+            try:
+                files[fname] = pymarc.MARCWriter(open(_fname, mode='wb'))
+            except Exception, ex:
+                print("Could not open output file [%s]: %s" % (_fname, ex))
  
-    if 'duplicate' in options:
-        try:
-            duplicate = pymarc.MARCWriter(open(options['duplicate'], mode='wb'))
-        except Exception, ex:
-            print("Could not open output file [%s]" % options['duplicate'])
-
-    if 'tcn' in options:
-        try:
-            tcn = pymarc.MARCWriter(open(options['tcn'], mode='wb'))
-        except Exception, ex:
-            print("Could not open output file [%s]" % options['tcn'])
-
-    if 'url' in options:
-        try:
-            url = pymarc.MARCWriter(open(options['url'], mode='wb'))
-        except Exception, ex:
-            print("Could not open output file [%s]" % options['url'])
-
-    if 'sample' in options:
-        sample = pymarc.MARCWriter(open(options['sample'], mode='wb'))
+    writer = files['output']
  
      for record in reader:
          RECORD_COUNT += 1
@@ -445,41 +427,42 @@ def process_records(options):
              else:
                  print ("%d - %s\n" % (RECORD_COUNT, record['856']))
  
-
-            new_record = ''
-            dup_flag = False
-
-            if duplicate:
-                bib_id, dup_flag = isbn_check(record)
-                new_record = process_fields(record, options, bib_id, dup_flag)
-                if dup_flag:
-                    duplicate.write(new_record)
-            if tcn:
-                bib_id, dup_flag = tcn_check(record)
-                new_record = process_fields(record, options, bib_id, dup_flag)
-                if dup_flag:
-                    tcn.write(new_record)
-            if url:
-                bib_id, dup_flag = url_check(record, options)
-                new_record = process_fields(record, options, bib_id, dup_flag)
-                if dup_flag:
-                    url.write(new_record)
-
-            if not dup_flag:
-                new_record = process_fields(record, options, 0, False)
-            else:
+            dupe_flags = {}
+
+            if files['duplicate']:
+                tmp_record = process_fields(copy.deepcopy(record), options)
+                bib_id, dupe_flags['isbn'] = isbn_check(tmp_record)
+                if dupe_flags['isbn']:
+                    tmp_record = add_dupe_field(tmp_record, bib_id)
+                    files['duplicate'].write(tmp_record)
+
+            if files['tcn']:
+                tmp_record = process_fields(copy.deepcopy(record), options)
+                bib_id, dupe_flags['tcn'] = tcn_check(tmp_record)
+                if dupe_flags['tcn']:
+                    tmp_record = add_dupe_field(tmp_record, bib_id)
+                    files['tcn'].write(tmp_record)
+
+            if files['url']:
+                tmp_record = process_fields(copy.deepcopy(record), options)
+                bib_id, dupe_flags['url'] = url_check(tmp_record, options)
+                if dupe_flags['url']:
+                    tmp_record = add_dupe_field(tmp_record, bib_id)
+                    files['url'].write(tmp_record)
+
+            if len(dupe_flags):
                  DUP_COUNT += 1
-
-            if new_record:
+            else:
+                new_record = process_fields(record, options)
                  writer.write(new_record)
-                if (sample and (
+                if (files['sample'] and (
                      (RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0)
                  )):
-                    sample.write(new_record)
+                    files['sample'].write(new_record)
          except Exception, ex:
              print("* Error processing record %d - %s" % (RECORD_COUNT, ex))
  
-def process_fields(record, options, bib_id, dup_flag):
+def process_fields(record, options):
      """Decide which fields to add, delete, and keep"""
  
      new_record = pymarc.Record(to_unicode=True, force_utf8=True)
@@ -498,17 +481,6 @@ def process_fields(record, options, bib_id, dup_flag):
          )
          record.add_field(note)
  
-    # 909
-    if dup_flag:
-        dup_value = str(bib_id)
-        dup = pymarc.Field(tag = '909',
-            indicators = [' ', ' '],
-            subfields = [
-                'a', dup_value
-            ]
-        )
-        record.add_field(dup)
-
      add_marc_source(record, options) # 598
      publisher = add_publisher(record, options) # 710
      add_restriction(record, options, publisher) # 506
@@ -556,6 +528,19 @@ def process_fields(record, options, bib_id, dup_flag):
  
      return new_record
  
+def add_dupe_field(record, bib_id):
+    """Add a 909 field whose $a is str(bib_id) to record; return that same record"""
+
+    dup_value = str(bib_id)
+    dup = pymarc.Field(tag = '909',
+        indicators = [' ', ' '],
+        subfields = [
+            'a', dup_value
+        ]
+    )
+    record.add_field(dup)
+    return record
+
  def clean_diacritics(field):
      """
      Change specific patterns of bytes into other patterns of bytes
author	Dan Scott <dscott@laurentian.ca>
	Thu, 4 Oct 2012 21:17:04 +0000 (17:17 -0400)
committer	Dan Scott <dscott@laurentian.ca>
	Tue, 7 May 2013 18:58:12 +0000 (14:58 -0400)