EResource handling improvements
authorDan Scott <dscott@laurentian.ca>
Tue, 20 Nov 2012 18:55:02 +0000 (13:55 -0500)
committerDan Scott <dscott@laurentian.ca>
Tue, 7 May 2013 18:58:23 +0000 (14:58 -0400)
* Add the ability to read (via -F xml) and write (via -T xml) MARCXML
  records. Writing XML records places one record per line, making the
  output suitable for bulk loading into a database via the SQL COPY command.
* Make the "Mark ISBN for SFX" functionality optional via the -I flag,
  as not all electronic resources have ISBNs. (Hi, journals and music
  records).
* Reduce code duplication significantly

Signed-off-by: Dan Scott <dscott@laurentian.ca>
tools/ebooks/prep_ebook_records.py

index 7e483be..f3220cd 100755 (executable)
@@ -15,7 +15,7 @@ be accommodated in batch load.
 """
 
 import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2, json
-import copy
+import codecs, copy
 from urllib import quote
 from datetime import date
 from BeautifulSoup import BeautifulSoup
@@ -26,6 +26,8 @@ GATEWAY_URL = "http://www.concat.ca/osrf-gateway-v1"
 OPENSRF_ISBN_CALL = "open-ils.search.biblio.isbn"
 OPENSRF_TCN_CALL = "open-ils.search.biblio.tcn"
 OPENSRF_KEYWORD_CALL = "open-ils.search.biblio.multiclass.query"
+OPTIONS = {}
+FILES = {}
 
 class Institution():
     """Defines standard settings for each Conifer institution"""
@@ -140,12 +142,18 @@ Optional arguments:
 
     -d / --duplicate : The name of the file to route ISBN duplicates to.
 
+    -F / --from-format : The format ('xml' or 'marc21') of the input file
+
+    -T / --to-format : The format ('xml' or 'marc21') of the output file
+
     -t / --tcn : The name of the file to route TCN duplicates to.
 
     -u / --url : The name of the file to route URL duplicates to.
 
     -e / --ebrary : Put Ebrary 001 into 924 as a URN for SFX lookup purposes
 
+    -I / --isbn-sfx: Mark the ISBN if found in SFX
+
     -n / --note : The text of the internal note to be inserted into a 590 field.
 
     -s / --sample : The name of the sample output MARC file (generates
@@ -169,10 +177,13 @@ def consolidate_options(opts):
         '-C': '--clean',
         '-d': '--duplicate',
         '-e': '--ebrary',
+        '-F': '--from-format',
+        '-I': '--isbn-sfx',
         '-p': '--publisher',
         '-P': '--platform',
         '-n': '--note',
         '-t': '--tcn',
+        '-T': '--to-format',
         '-u': '--url',
         '-A': '--algoma',
         '-B': '--boreal',
@@ -233,35 +244,35 @@ def check_options(options):
         print("* Cannot write to output path %s" % (os.path.dirname(_output)))
         sys.exit(0)
 
-    clean_opts = dict()
-    clean_opts['publisher'] = append_period(options['--publisher'])
-
-    clean_opts['consortium'] = options['--consortium'].decode('utf-8')
-    clean_opts['authorization'] = options['--authorization'].decode('utf-8')
-
-    if '--clean' in options:
-        clean_opts['clean'] = True
-
-    if '--duplicate' in options:
-        clean_opts['duplicate'] = options['--duplicate']
-
-    if '--tcn' in options:
-        clean_opts['tcn'] = options['--tcn']
-
-    if '--url' in options:
-        clean_opts['url'] = options['--url']
+    _bool_opts = {
+        '--clean': 'clean',
+        '--ebrary': 'ebrary',
+        '--isbn-sfx': 'isbn-sfx',
+    }
 
-    if '--ebrary' in options:
-        clean_opts['ebrary'] = True
+    _string_opts = {
+        '--authorization': 'authorization',
+        '--consortium': 'consortium',
+        '--duplicate': 'duplicate',
+        '--from-format': 'from-format',
+        '--note': 'note',
+        '--platform': 'platform',
+        '--sample': 'sample',
+        '--tcn': 'tcn',
+        '--to-format': 'to-format',
+        '--url': 'url',
+    }
 
-    if '--sample' in options:
-        clean_opts['sample'] = options['--sample']
+    clean_opts = dict()
+    clean_opts['publisher'] = append_period(options['--publisher'])
 
-    if '--note' in options:
-        clean_opts['note'] = options['--note']
+    for optkey, optval in _bool_opts.items():
+        if optkey in options:
+            clean_opts[optval] = True
 
-    if '--platform' in options:
-        clean_opts['platform'] = append_period(options['--platform'])
+    for optkey, optval in _string_opts.items():
+        if optkey in options:
+            clean_opts[optval] = options[optkey].decode('utf-8')
 
     clean_opts['libraries'] = _libraries
     clean_opts['input'] = _input
@@ -379,11 +390,12 @@ def check_libraries(options):
 def parse_opts():
     """Get command-line arguments from the script"""
     try:
-        _short_opts = 'i:o:a:c:p:P:ABCLWe:d:t:u:n:s:h'
+        _short_opts = 'i:o:a:c:p:P:ABLWeCI:d:F:T:t:u:n:s:h'
         _long_opts = ['input=', 'output=', 'authorization=', 'consortium=',
             'publisher=', 'platform=', 'algoma', 'boreal', 'laurentian',
-            'windsor', 'ebrary', 'clean', 'duplicate=', 'tcn=', 'url=', 'note=',
-            'sample=', 'help'
+            'windsor', 'ebrary', 'clean', 'isbn-sfx', 'duplicate=',
+            'from-format=', 'to-format=', 'tcn=', 'url=', 'note=', 'sample=',
+            'help'
         ]
         opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts)
     except getopt.GetoptError, ex:
@@ -393,80 +405,79 @@ def parse_opts():
     _options = consolidate_options(opts[0])
     return check_options(_options)
 
-def process_records(options):
+def process_marc(options):
     """Converts raw ebook MARC records to Conifer-ready MARC records"""
 
+    global FILES
+    files = FILES
+
+    if 'from-format' in options and options['from-format'] == 'xml':
+        pymarc.map_xml(process_xml, options['input'])
+    else:
+        try:
+            reader = pymarc.MARCReader(
+                open(options['input'], mode='rb'), to_unicode=True
+            )
+        except Exception, ex:
+            print("Could not open input file [%s]" % options['input'])
+
+        for record in reader:
+            process_record(record, options, files)
+
+def process_record(record, options, files):
     global RECORD_COUNT
     global DUP_COUNT
-    files = {}
-
+    RECORD_COUNT += 1
     try:
-        reader = pymarc.MARCReader(
-            open(options['input'], mode='rb'), to_unicode=True
-        )
-    except Exception, ex:
-        print("Could not open input file [%s]" % options['input'])
-
-    for fname in ('duplicate', 'tcn', 'url', 'sample', 'output'):
-        if fname in options:
-            _fname = options[fname]
-            try:
-                files[fname] = pymarc.MARCWriter(open(_fname, mode='wb'))
-            except Exception, ex:
-                print("Could not open output file [%s]: %s" % (_fname, ex))
+        if not (record['856'] and record['856']['u']):
+            print("* No 856 for record # %s in file %s"
+                    % (RECORD_COUNT, options['input'])
+            )
+        else:
+            print ("%d - %s" % (RECORD_COUNT, record['856']))
 
-    writer = files['output']
+        dupe_flags = {}
 
-    for record in reader:
-        RECORD_COUNT += 1
-        try:
-            if not (record['856'] and record['856']['u']):
-                print("* No 856 for record # %s in file %s"
-                        % (RECORD_COUNT, options['input'])
-                )
+        if 'duplicate' in files:
+            tmp_record = process_fields(copy.deepcopy(record), options)
+            bib_id, dupe_flags['isbn'] = isbn_check(tmp_record)
+            if dupe_flags['isbn']:
+                tmp_record = add_dupe_field(tmp_record, bib_id)
+                files['duplicate'].write(tmp_record)
             else:
-                print ("%d - %s\n" % (RECORD_COUNT, record['856']))
-
-            dupe_flags = {}
-
-            if 'duplicate' in files:
-                tmp_record = process_fields(copy.deepcopy(record), options)
-                bib_id, dupe_flags['isbn'] = isbn_check(tmp_record)
-                if dupe_flags['isbn']:
-                    tmp_record = add_dupe_field(tmp_record, bib_id)
-                    files['duplicate'].write(tmp_record)
-                else:
-                    del(dupe_flags['isbn'])
-
-            if 'tcn' in files and len(dupe_flags) == 0:
-                tmp_record = process_fields(copy.deepcopy(record), options)
-                bib_id, dupe_flags['tcn'] = tcn_check(tmp_record)
-                if dupe_flags['tcn']:
-                    tmp_record = add_dupe_field(tmp_record, bib_id)
-                    files['tcn'].write(tmp_record)
-                else:
-                    del(dupe_flags['tcn'])
-
-            if 'url' in files and len(dupe_flags) == 0:
-                tmp_record = process_fields(copy.deepcopy(record), options)
-                bib_id, dupe_flags['url'] = url_check(tmp_record, options)
-                if dupe_flags['url']:
-                    tmp_record = add_dupe_field(tmp_record, bib_id)
-                    files['url'].write(tmp_record)
-                else:
-                    del(dupe_flags['url'])
-
-            if len(dupe_flags):
-                DUP_COUNT += 1
+                del(dupe_flags['isbn'])
+
+        if 'tcn' in files and len(dupe_flags) == 0:
+            tmp_record = process_fields(copy.deepcopy(record), options)
+            bib_id, dupe_flags['tcn'] = tcn_check(tmp_record)
+            if dupe_flags['tcn']:
+                tmp_record = add_dupe_field(tmp_record, bib_id)
+                files['tcn'].write(tmp_record)
             else:
-                new_record = process_fields(record, options)
-                writer.write(new_record)
-                if ('sample' in files and (
-                    (RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0)
-                )):
-                    files['sample'].write(new_record)
-        except Exception, ex:
-            print("* Error processing record %d - %s" % (RECORD_COUNT, ex))
+                del(dupe_flags['tcn'])
+
+        if 'url' in files and len(dupe_flags) == 0:
+            tmp_record = process_fields(copy.deepcopy(record), options)
+            bib_id, dupe_flags['url'] = url_check(tmp_record, options)
+            if dupe_flags['url']:
+                tmp_record = add_dupe_field(tmp_record, bib_id)
+                files['url'].write(tmp_record)
+            else:
+                del(dupe_flags['url'])
+
+        if len(dupe_flags):
+            DUP_COUNT += 1
+        else:
+            new_record = process_fields(record, options)
+            if 'to-format' in options and options['to-format'] == 'xml':
+                new_record = pymarc.record_to_xml(new_record) + '\n'
+            files['output'].write(new_record)
+            if ('sample' in files and (
+                (RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0)
+            )):
+                files['sample'].write(new_record)
+    except Exception, ex:
+        print("* Error processing record %d - %s" % (RECORD_COUNT, ex))
 
 def process_fields(record, options):
     """Decide which fields to add, delete, and keep"""
@@ -492,9 +503,8 @@ def process_fields(record, options):
     add_restriction(record, options, publisher) # 506
     add_platform(record, options) # 710
 
-    # USE FOR SKIPPING SFX
-    # marked_isbn = False
-    marked_isbn = mark_isbn_for_sfx(record, options)
+    if 'isbn-sfx' in options:
+        marked_isbn = mark_isbn_for_sfx(record, options)
 
     for field in record.get_fields():
         if 'clean' in options:
@@ -521,7 +531,7 @@ def process_fields(record, options):
         else:
             new_record.add_ordered_field(field)
 
-    if not marked_isbn:
+    if 'isbn-sfx' in options and not marked_isbn:
         try:
             isbn = record['020']['a']
             print("ISBN: [%s] - no matching ISBN target found in SFX for %s" %
@@ -1104,6 +1114,24 @@ def get_subfields(field, data):
 
     return subs
 
+def process_xml(record):
+    global OPTIONS
+    global FILES
+    options = OPTIONS
+    files = FILES
+    process_record(record, options, files)
 
 if __name__ == '__main__':
-    process_records(parse_opts())
+    OPTIONS = parse_opts()
+    for fname in ('duplicate', 'tcn', 'url', 'sample', 'output'):
+        if fname in OPTIONS:
+            try:
+                if 'to-format' in OPTIONS and OPTIONS['to-format'] == 'xml':
+                    FILES[fname] = codecs.open(OPTIONS[fname], 'w', 'utf-8')
+                else:
+                    FILES[fname] = pymarc.MARCWriter(open(OPTIONS[fname], 'w'))
+            except Exception, ex:
+                print("Could not open output file [%s]: %s" % (OPTIONS[fname], ex))
+
+    process_marc(OPTIONS)
+    #pymarc.map_xml(process_xml, '/home/dan/Downloads/AlexanderStreetPress_JazzMusicLibrary_Canada_MONOSER_2012-05-23.xml')