Check for 020$z, 024$a,$z for ISBNs
author Dan Scott <dan@coffeecode.net>
Thu, 28 Jul 2011 15:16:42 +0000 (11:16 -0400)
committer Dan Scott <dscott@laurentian.ca>
Tue, 7 May 2013 18:36:33 +0000 (14:36 -0400)
Either our MARC record sources are horribly corrupted, or the SFX
knowledge base is questionable, or both: we have to search on
cancelled, invalid, and non-ISBN identifiers (020 $z, 024 $a and $z)
to get matches in SFX for our electronic records.

Also, use 506 $9 to record our institutional ID in authorization notes.

Signed-off-by: Dan Scott <dscott@laurentian.ca>
tools/ebooks/prep_ebook_records.py

index 7a607e9..e753c9c 100644 (file)
@@ -14,11 +14,8 @@ be accommodated in batch load.
 """
 
 import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2
-from datetime import date
 from BeautifulSoup import BeautifulSoup
 
-RECORD_COUNT = 0
-
 class Institution():
     """Defines standard settings for each Conifer institution"""
 
@@ -30,7 +27,7 @@ class Institution():
             "proxy": "http://libproxy.auc.ca/login?url=", \
             "link_text": "Available online", \
             "sfx_url": "http://sfx.scholarsportal.info/algoma", \
-            "access_note": "Access restricted to users with a valid Algoma University ID ;" \
+            "access_note": "Access restricted to users with a valid Algoma University ID" \
         }
         
         self.laurentian = { \
@@ -39,7 +36,7 @@ class Institution():
             "proxy": "https://librweb.laurentian.ca/login?url=", \
             "link_text": "Available online / disponible en ligne", \
             "sfx_url": "http://sfx.scholarsportal.info/laurentian", \
-            "access_note": "Access restricted to users with a valid Laurentian University ID ;" \
+            "access_note": "Access restricted to users with a valid Laurentian University ID" \
         }
 
         self.windsor = { \
@@ -48,7 +45,7 @@ class Institution():
             "proxy": "http://ezproxy.uwindsor.ca/login?url=", \
             "link_text": "Available online", \
             "sfx_url": "http://sfx.scholarsportal.info/windsor", \
-            "access_note": "Access restricted to users with a valid University of Windsor ID ;" \
+            "access_note": "Access restricted to users with a valid University of Windsor ID" \
         }
 
     def get_settings(self, lib):
@@ -95,11 +92,7 @@ Required arguments:
     -c / --consortium : The name of the consortial license to be inserted in
                         each 506$b access restriction note.
 
-    -p / --publisher : The name of the publisher to be inserted in a 710 field
-                       with a subfield 4 relator code 'pbl'.
-
-    -P / --platform: The name of the digital platform to be inserted in a 710
-                     field.
+    -p / --publisher : The name of the publisher to be inserted in a 710 field.
 
     -A / --algoma: Add an 856 for Algoma University
 
@@ -136,8 +129,6 @@ def consolidate_options(opts):
             _options['--consortium'] = val
         elif key == '-p':
             _options['--publisher'] = val
-        elif key == '-P':
-            _options['--platform'] = val
         elif key == '-n':
             _options['--note'] = val
         elif key == '-A':
@@ -215,9 +206,6 @@ def check_options(options):
     if '--note' in options:
         clean_opts['note'] = options['--note']
 
-    if '--platform' in options:
-        clean_opts['platform'] = options['--platform']
-
     clean_opts['libraries'] = _libraries
     clean_opts['input'] = _input
     clean_opts['output'] = _output
@@ -244,10 +232,10 @@ def check_libraries(options):
 def parse_opts():
     """Get command-line arguments from the script"""
     try:
-        _short_opts = 'i:o:a:c:p:ALWn:P:s:h'
+        _short_opts = 'i:o:a:c:p:ALWn:s:h'
         _long_opts = ['input=', 'output=', 'authorization=', 'consortium=', 
             'publisher=', 'algoma', 'laurentian', 'windsor', 'note=',
-            'platform=', 'sample=', 'help'
+            'sample=', 'help'
         ]
         opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts) 
     except getopt.GetoptError, ex:
@@ -260,7 +248,6 @@ def parse_opts():
 def process_records(options):
     """Converts raw ebook MARC records to Conifer-ready MARC records"""
 
-    global RECORD_COUNT
     sample = ''
     reader = pymarc.MARCReader(
         open(options['input'], mode='rb'), to_unicode=True
@@ -269,50 +256,32 @@ def process_records(options):
     if ('sample' in options):
         sample = pymarc.MARCWriter(open(options['sample'], mode='wb'))
 
+    cnt = 0
     for record in reader:
-        RECORD_COUNT += 1
+        cnt = cnt + 1
         try:
             if not (record['856'] and record['856']['u']):
-                print("* No 856 for record # %d in file %s"
-                        % (RECORD_COUNT, options['input'])
+                print("* No 856 for record # %s in file %s"
+                        % (cnt, options['input'])
                 )
 
             new_record = process_fields(record, options)
 
             writer.write(new_record)
-            if (sample and ((RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0))):
+            if (sample and ((cnt == 1) or (cnt % 100 == 0))):
                 sample.write(new_record)
         except Exception, ex:
-            print("* Error processing record %d - %s" % (RECORD_COUNT, ex))
+            print("* Error processing record %s - %s" % (cnt, ex))
 
 def process_fields(record, options):
     """Decide which fields to add, delete, and keep"""
 
     new_record = pymarc.Record(to_unicode=True, force_utf8=True)
 
-    add_cat_source(new_record, options) # 040
-    add_restriction(record, options) # 506
-
-    # 590
-    if 'note' in options:
-        note = pymarc.Field(tag = '590',
-            indicators = [' ', ' '],
-            subfields = [
-                'a', options['note']
-            ]
-        )
-        record.add_field(note)
-
-    add_marc_source(record, options) # 598
-    publisher = add_publisher(record, options) # 710
-    add_platform(record, options) # 710
-
-    marked_isbn = mark_isbn_for_sfx(record, options)
-
     for field in record.get_fields():
         # Process all of the 856 fields
         if field.tag == '856':
-            new_fields = process_urls(field, options, publisher)
+            new_fields = process_urls(field, options)
             if new_fields:
                 for new_856 in new_fields:
                     new_record.add_field(new_856)
@@ -325,46 +294,41 @@ def process_fields(record, options):
         else:
             new_record.add_field(field)
 
+    add_publisher(record, new_record, options)
+    add_restriction(new_record, options)
+    marked_isbn = mark_isbn_for_sfx(new_record, options)
     if not marked_isbn:
-        try:
-            isbn = record['020']['a']
-            print("ISBN: [%s] - no matching ISBN target found in SFX for %s" %
-                (isbn, new_record['856']['u'])
-            )
-        except:
-            print("No matching ISBN target found in SFX for %s" %
-                (new_record['856']['u'])
-            )
+        print("No matching ISBN target found in SFX for %s" %
+            (new_record['856']['u'])
+        )
+
+    if 'note' in options:
+        note = pymarc.Field(tag = '590',
+            indicators = [' ', ' '],
+            subfields = [
+                'a', options['note']
+            ]
+        )
+        new_record.add_field(note)
+
+    add_cat_source(new_record, options)
 
     return new_record
 
-def add_publisher(record, options):
+def add_publisher(record, new_record, options):
     """
     This is a convoluted way to avoid creating a new 710 if we already
     have a matching 710 and just need to add the publisher relator code.
     """
 
-    publisher = options['publisher']
     munge_publisher = False
     need_publisher = True
     need_relator = True
 
-    raw_publisher = None
-    try:
-        raw_publisher = record['260']['b']
-    except:
-        pass
-
-    if raw_publisher:
-        if 'Oxford' in raw_publisher or 'Clarendon' in raw_publisher:
-            publisher = 'Oxford University Press'
-        elif 'Cambridge' in raw_publisher:
-            publisher = 'Cambridge University Press'
-
     # Iterate through all of the existing 710 fields
     for sten in record.get_fields('710'):
         for pub in sten.get_subfields('a'):
-            if pub == publisher:
+            if pub == options['publisher']:
                 munge_publisher = True
                 for rel in sten.get_subfields('4'): 
                     if rel == 'pbl':
@@ -380,38 +344,11 @@ def add_publisher(record, options):
         seven_ten = pymarc.Field(tag = '710',
             indicators = ['2', ' '],
             subfields = [
-                'a', publisher,
+                'a', options['publisher'],
                 '4', 'pbl'
             ]
         )
-        record.add_field(seven_ten)
-
-    return publisher
-
-def add_platform(record, options):
-    """
-    This is a convoluted way to avoid creating a new 710 if we already
-    have a matching 710 for digital platform.
-    """
-
-    platform = options['platform']
-    need_platform = True
-
-    # Iterate through all of the existing 710 fields
-    for sten in record.get_fields('710'):
-        for pub in sten.get_subfields('a'):
-            if pub == platform:
-                need_platform = False
-
-    if need_platform:
-        # Add the platform
-        seven_ten = pymarc.Field(tag = '710',
-            indicators = ['2', ' '],
-            subfields = [
-                'a', platform
-            ]
-        )
-        record.add_field(seven_ten)
+        new_record.add_field(seven_ten)
 
 def mark_isbn_for_sfx(record, options):
     """
@@ -423,42 +360,52 @@ def mark_isbn_for_sfx(record, options):
     """
 
     # For every ISBN in the record
-    for isbn in record.get_fields('020'):
-        for isbnval in isbn.get_subfields('a'):
+    for isbn in record.get_fields('020', '024'):
+        for isbnval in isbn.get_subfields('a', 'z'):
             isbnval = clean_isbn(isbnval)
             # And for every library we have enabled
             for lib in options['libraries']:
-                sfx = options['settings'].get_settings(lib)['sfx_url']
-                url = "%s?url_ver=Z39.88-2004&url_ctx_fmt=infofi/fmt:kev:mtx:ctx&" \
-                    "ctx_enc=UTF-8&ctx_ver=Z39.88-2004&rfr_id=info:sid/evergreen&" \
-                    "sfx.ignore_date_threshold=1&" \
-                    "sfx.response_type=multi_obj_detailed_xml" \
-                    "&__service_type=getFullTxt&rft.isbn=%s" % (sfx, isbnval)
-
-                try:
-                    req = urllib2.urlopen(url)
-                    sfx_res = BeautifulSoup(req.read())
-                except urllib2.HTTPError, ex:
-                    print("%s for URL %s" % (ex, url))
-                    continue
-                except urllib2.URLError, ex:
-                    print("%s for URL %s" % (ex, url))
-                    continue
-            
-                # We want a target with a service_type element of 'getFullTxt'
-                targets = sfx_res.ctx_obj.ctx_obj_targets.findAll(
-                    'target', recursive=False
-                )
+                found = check_for_isbn(options, lib, isbnval)
+                if found:
+                    # Add the $9 subfield to mark this as a good one
+                    isbn.add_subfield('9', 'SFX')
+                    return True
+    return False
+
+def check_for_isbn(options, lib, isbnval):
+    """
+    Given an ISBN value, check SFX at the specified library for a match
+    """
+    sfx = options['settings'].get_settings(lib)['sfx_url']
+    url = "%s?url_ver=Z39.88-2004&url_ctx_fmt=infofi/fmt:kev:mtx:ctx&" \
+        "ctx_enc=UTF-8&ctx_ver=Z39.88-2004&rfr_id=info:sid/evergreen&" \
+        "sfx.ignore_date_threshold=1&" \
+        "sfx.response_type=multi_obj_detailed_xml" \
+        "&__service_type=getFullTxt&rft.isbn=%s" % (sfx, isbnval)
+
+    try:
+        req = urllib2.urlopen(url)
+        sfx_res = BeautifulSoup(req.read())
+    except urllib2.HTTPError, ex:
+        print("%s for URL %s" % (ex, url))
+        return False
+    except urllib2.URLError, ex:
+        print("%s for URL %s" % (ex, url))
+        return False
 
-                if len(targets) == 0:
-                    # No SFX targets found for this ISBN - next!
-                    continue
+    # We want a target with a service_type element of 'getFullTxt'
+    targets = sfx_res.ctx_obj.ctx_obj_targets.findAll(
+        'target', recursive=False
+    )
+
+    if len(targets) == 0:
+        # No SFX targets found for this ISBN - next!
+        return False
+
+    for target in targets:
+        if target.service_type.renderContents() == 'getFullTxt':
+            return True
 
-                for target in targets:
-                    if target.service_type.renderContents() == 'getFullTxt':
-                        # Add the $9 subfield to mark this as a good one
-                        isbn.add_subfield('9', 'SFX')
-                        return True
     return False
 
 def clean_isbn(isbn):
@@ -494,11 +441,6 @@ def add_restriction(new_record, options):
       * $9 - Institutional code to which this note applies
     """
 
-    # Add a period if the authorization ends with a number or letter
-    authnote = options['authorization']
-    if authnote[-1] not in '.)]':
-        authnote += '.'
-
     for library in options['libraries']:
         libopts = options['settings'].get_settings(library)
         # Add the access restriction note
@@ -506,8 +448,8 @@ def add_restriction(new_record, options):
             indicators = ['1', ' '],
             subfields = [
                 'a', libopts['access_note'],
-                'b', options['consortium'] + ' ; ',
-                'e', authnote,
+                'b', options['consortium'],
+                'e', options['authorization'],
                 '9', libopts['code']
             ]
         )
@@ -532,26 +474,8 @@ def add_cat_source(record, options):
         )
         record.add_field(forty)
 
-def add_marc_source(record, options):
-    """
-    Add a 598 field identifying the source MARC file name and processing date
-    """
-
-    global RECORD_COUNT
 
-    source = os.path.basename(options['input'])
-
-    marc_source = pymarc.Field(tag = '598',
-        indicators = [' ', ' '],
-        subfields = [
-            'a', source,
-            'b', date.today().isoformat(),
-            'c', str(RECORD_COUNT)
-        ]
-    )
-    record.add_field(marc_source)
-
-def process_urls(field, options, publisher):
+def process_urls(field, options):
     """Creates 856 fields required by Conifer"""
 
     new_fields = []
@@ -567,10 +491,6 @@ def process_urls(field, options, publisher):
             new_fields.append(enrich)
     else:
         for lib in options['libraries']:
-
-            # Tweak for Algoma for combined CUP/OUP
-            if lib == 'algoma' and 'Cambridge' in publisher:
-                continue
             data = options['settings'].get_settings(lib)
             subs = get_subfields(field, data)
             eight_five_six = pymarc.Field(tag = '856',
@@ -697,5 +617,4 @@ def get_subfields(field, data):
 
 
 if __name__ == '__main__':
-    
     process_records(parse_opts())