Check 020 $a/$z and 024 $a/$z for ISBNs when looking for a match in SFX

author Dan Scott <dan@coffeecode.net>

Thu, 28 Jul 2011 15:16:42 +0000 (11:16 -0400)

committer Dan Scott <dscott@laurentian.ca>

Tue, 7 May 2013 18:36:33 +0000 (14:36 -0400)
author Dan Scott <dan@coffeecode.net>
Thu, 28 Jul 2011 15:16:42 +0000 (11:16 -0400)
committer Dan Scott <dscott@laurentian.ca>
Tue, 7 May 2013 18:36:33 +0000 (14:36 -0400)
diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py

index 7a607e9..e753c9c 100644 (file)
--- a/tools/ebooks/prep_ebook_records.py
+++ b/tools/ebooks/prep_ebook_records.py
@@ -14,11 +14,8 @@ be accommodated in batch load.
  """
  
  import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2
-from datetime import date
  from BeautifulSoup import BeautifulSoup
  
-RECORD_COUNT = 0
-
  class Institution():
      """Defines standard settings for each Conifer institution"""
  
@@ -30,7 +27,7 @@ class Institution():
              "proxy": "http://libproxy.auc.ca/login?url=", \
              "link_text": "Available online", \
              "sfx_url": "http://sfx.scholarsportal.info/algoma", \
-            "access_note": "Access restricted to users with a valid Algoma University ID ;" \
+            "access_note": "Access restricted to users with a valid Algoma University ID" \
          }
          
          self.laurentian = { \
@@ -39,7 +36,7 @@ class Institution():
              "proxy": "https://librweb.laurentian.ca/login?url=", \
              "link_text": "Available online / disponible en ligne", \
              "sfx_url": "http://sfx.scholarsportal.info/laurentian", \
-            "access_note": "Access restricted to users with a valid Laurentian University ID ;" \
+            "access_note": "Access restricted to users with a valid Laurentian University ID" \
          }
  
          self.windsor = { \
@@ -48,7 +45,7 @@ class Institution():
              "proxy": "http://ezproxy.uwindsor.ca/login?url=", \
              "link_text": "Available online", \
              "sfx_url": "http://sfx.scholarsportal.info/windsor", \
-            "access_note": "Access restricted to users with a valid University of Windsor ID ;" \
+            "access_note": "Access restricted to users with a valid University of Windsor ID" \
          }
  
      def get_settings(self, lib):
@@ -95,11 +92,7 @@ Required arguments:
      -c / --consortium : The name of the consortial license to be inserted in
                          each 506$b access restriction note.
  
-    -p / --publisher : The name of the publisher to be inserted in a 710 field
-                       with a subfield 4 relator code 'pbl'.
-
-    -P / --platform: The name of the digital platform to be inserted in a 710
-                     field.
+    -p / --publisher : The name of the publisher to be inserted in a 710 field.
  
      -A / --algoma: Add an 856 for Algoma University
  
@@ -136,8 +129,6 @@ def consolidate_options(opts):
              _options['--consortium'] = val
          elif key == '-p':
              _options['--publisher'] = val
-        elif key == '-P':
-            _options['--platform'] = val
          elif key == '-n':
              _options['--note'] = val
          elif key == '-A':
@@ -215,9 +206,6 @@ def check_options(options):
      if '--note' in options:
          clean_opts['note'] = options['--note']
  
-    if '--platform' in options:
-        clean_opts['platform'] = options['--platform']
-
      clean_opts['libraries'] = _libraries
      clean_opts['input'] = _input
      clean_opts['output'] = _output
@@ -244,10 +232,10 @@ def check_libraries(options):
  def parse_opts():
      """Get command-line arguments from the script"""
      try:
-        _short_opts = 'i:o:a:c:p:ALWn:P:s:h'
+        _short_opts = 'i:o:a:c:p:ALWn:s:h'
          _long_opts = ['input=', 'output=', 'authorization=', 'consortium=', 
              'publisher=', 'algoma', 'laurentian', 'windsor', 'note=',
-            'platform=', 'sample=', 'help'
+            'sample=', 'help'
          ]
          opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts) 
      except getopt.GetoptError, ex:
@@ -260,7 +248,6 @@ def parse_opts():
  def process_records(options):
      """Converts raw ebook MARC records to Conifer-ready MARC records"""
  
-    global RECORD_COUNT
      sample = ''
      reader = pymarc.MARCReader(
          open(options['input'], mode='rb'), to_unicode=True
@@ -269,50 +256,32 @@ def process_records(options):
      if ('sample' in options):
          sample = pymarc.MARCWriter(open(options['sample'], mode='wb'))
  
+    cnt = 0
      for record in reader:
-        RECORD_COUNT += 1
+        cnt = cnt + 1
          try:
              if not (record['856'] and record['856']['u']):
-                print("* No 856 for record # %d in file %s"
-                        % (RECORD_COUNT, options['input'])
+                print("* No 856 for record # %s in file %s"
+                        % (cnt, options['input'])
                  )
  
              new_record = process_fields(record, options)
  
              writer.write(new_record)
-            if (sample and ((RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0))):
+            if (sample and ((cnt == 1) or (cnt % 100 == 0))):
                  sample.write(new_record)
          except Exception, ex:
-            print("* Error processing record %d - %s" % (RECORD_COUNT, ex))
+            print("* Error processing record %s - %s" % (cnt, ex))
  
  def process_fields(record, options):
      """Decide which fields to add, delete, and keep"""
  
      new_record = pymarc.Record(to_unicode=True, force_utf8=True)
  
-    add_cat_source(new_record, options) # 040
-    add_restriction(record, options) # 506
-
-    # 590
-    if 'note' in options:
-        note = pymarc.Field(tag = '590',
-            indicators = [' ', ' '],
-            subfields = [
-                'a', options['note']
-            ]
-        )
-        record.add_field(note)
-
-    add_marc_source(record, options) # 598
-    publisher = add_publisher(record, options) # 710
-    add_platform(record, options) # 710
-
-    marked_isbn = mark_isbn_for_sfx(record, options)
-
      for field in record.get_fields():
          # Process all of the 856 fields
          if field.tag == '856':
-            new_fields = process_urls(field, options, publisher)
+            new_fields = process_urls(field, options)
              if new_fields:
                  for new_856 in new_fields:
                      new_record.add_field(new_856)
@@ -325,46 +294,41 @@ def process_fields(record, options):
          else:
              new_record.add_field(field)
  
+    add_publisher(record, new_record, options)
+    add_restriction(new_record, options)
+    marked_isbn = mark_isbn_for_sfx(new_record, options)
      if not marked_isbn:
-        try:
-            isbn = record['020']['a']
-            print("ISBN: [%s] - no matching ISBN target found in SFX for %s" %
-                (isbn, new_record['856']['u'])
-            )
-        except:
-            print("No matching ISBN target found in SFX for %s" %
-                (new_record['856']['u'])
-            )
+        print("No matching ISBN target found in SFX for %s" %
+            (new_record['856']['u'])
+        )
+
+    if 'note' in options:
+        note = pymarc.Field(tag = '590',
+            indicators = [' ', ' '],
+            subfields = [
+                'a', options['note']
+            ]
+        )
+        new_record.add_field(note)
+
+    add_cat_source(new_record, options)
  
      return new_record
  
-def add_publisher(record, options):
+def add_publisher(record, new_record, options):
      """
      This is a convoluted way to avoid creating a new 710 if we already
      have a matching 710 and just need to add the publisher relator code.
      """
  
-    publisher = options['publisher']
      munge_publisher = False
      need_publisher = True
      need_relator = True
  
-    raw_publisher = None
-    try:
-        raw_publisher = record['260']['b']
-    except:
-        pass
-
-    if raw_publisher:
-        if 'Oxford' in raw_publisher or 'Clarendon' in raw_publisher:
-            publisher = 'Oxford University Press'
-        elif 'Cambridge' in raw_publisher:
-            publisher = 'Cambridge University Press'
-
      # Iterate through all of the existing 710 fields
      for sten in record.get_fields('710'):
          for pub in sten.get_subfields('a'):
-            if pub == publisher:
+            if pub == options['publisher']:
                  munge_publisher = True
                  for rel in sten.get_subfields('4'): 
                      if rel == 'pbl':
@@ -380,38 +344,11 @@ def add_publisher(record, options):
          seven_ten = pymarc.Field(tag = '710',
              indicators = ['2', ' '],
              subfields = [
-                'a', publisher,
+                'a', options['publisher'],
                  '4', 'pbl'
              ]
          )
-        record.add_field(seven_ten)
-
-    return publisher
-
-def add_platform(record, options):
-    """
-    This is a convoluted way to avoid creating a new 710 if we already
-    have a matching 710 for digital platform.
-    """
-
-    platform = options['platform']
-    need_platform = True
-
-    # Iterate through all of the existing 710 fields
-    for sten in record.get_fields('710'):
-        for pub in sten.get_subfields('a'):
-            if pub == platform:
-                need_platform = False
-
-    if need_platform:
-        # Add the platform
-        seven_ten = pymarc.Field(tag = '710',
-            indicators = ['2', ' '],
-            subfields = [
-                'a', platform
-            ]
-        )
-        record.add_field(seven_ten)
+        new_record.add_field(seven_ten)
  
  def mark_isbn_for_sfx(record, options):
      """
@@ -423,42 +360,52 @@ def mark_isbn_for_sfx(record, options):
      """
  
      # For every ISBN in the record
-    for isbn in record.get_fields('020'):
-        for isbnval in isbn.get_subfields('a'):
+    for isbn in record.get_fields('020', '024'):
+        for isbnval in isbn.get_subfields('a', 'z'):
              isbnval = clean_isbn(isbnval)
              # And for every library we have enabled
              for lib in options['libraries']:
-                sfx = options['settings'].get_settings(lib)['sfx_url']
-                url = "%s?url_ver=Z39.88-2004&url_ctx_fmt=infofi/fmt:kev:mtx:ctx&" \
-                    "ctx_enc=UTF-8&ctx_ver=Z39.88-2004&rfr_id=info:sid/evergreen&" \
-                    "sfx.ignore_date_threshold=1&" \
-                    "sfx.response_type=multi_obj_detailed_xml" \
-                    "&__service_type=getFullTxt&rft.isbn=%s" % (sfx, isbnval)
-
-                try:
-                    req = urllib2.urlopen(url)
-                    sfx_res = BeautifulSoup(req.read())
-                except urllib2.HTTPError, ex:
-                    print("%s for URL %s" % (ex, url))
-                    continue
-                except urllib2.URLError, ex:
-                    print("%s for URL %s" % (ex, url))
-                    continue
-            
-                # We want a target with a service_type element of 'getFullTxt'
-                targets = sfx_res.ctx_obj.ctx_obj_targets.findAll(
-                    'target', recursive=False
-                )
+                found = check_for_isbn(options, lib, isbnval)
+                if found:
+                    # Add the $9 subfield to mark this as a good one
+                    isbn.add_subfield('9', 'SFX')
+                    return True
+    return False
+
+def check_for_isbn(options, lib, isbnval):
+    """
+    Given an ISBN value, check SFX at the specified library for a match
+    """
+    sfx = options['settings'].get_settings(lib)['sfx_url']
+    url = "%s?url_ver=Z39.88-2004&url_ctx_fmt=infofi/fmt:kev:mtx:ctx&" \
+        "ctx_enc=UTF-8&ctx_ver=Z39.88-2004&rfr_id=info:sid/evergreen&" \
+        "sfx.ignore_date_threshold=1&" \
+        "sfx.response_type=multi_obj_detailed_xml" \
+        "&__service_type=getFullTxt&rft.isbn=%s" % (sfx, isbnval)
+
+    try:
+        req = urllib2.urlopen(url)
+        sfx_res = BeautifulSoup(req.read())
+    except urllib2.HTTPError, ex:
+        print("%s for URL %s" % (ex, url))
+        return False
+    except urllib2.URLError, ex:
+        print("%s for URL %s" % (ex, url))
+        return False
  
-                if len(targets) == 0:
-                    # No SFX targets found for this ISBN - next!
-                    continue
+    # We want a target with a service_type element of 'getFullTxt'
+    targets = sfx_res.ctx_obj.ctx_obj_targets.findAll(
+        'target', recursive=False
+    )
+
+    if len(targets) == 0:
+        # No SFX targets found for this ISBN - next!
+        return False
+
+    for target in targets:
+        if target.service_type.renderContents() == 'getFullTxt':
+            return True
  
-                for target in targets:
-                    if target.service_type.renderContents() == 'getFullTxt':
-                        # Add the $9 subfield to mark this as a good one
-                        isbn.add_subfield('9', 'SFX')
-                        return True
      return False
  
  def clean_isbn(isbn):
@@ -494,11 +441,6 @@ def add_restriction(new_record, options):
        * $9 - Institutional code to which this note applies
      """
  
-    # Add a period if the authorization ends with a number or letter
-    authnote = options['authorization']
-    if authnote[-1] not in '.)]':
-        authnote += '.'
-
      for library in options['libraries']:
          libopts = options['settings'].get_settings(library)
          # Add the access restriction note
@@ -506,8 +448,8 @@ def add_restriction(new_record, options):
              indicators = ['1', ' '],
              subfields = [
                  'a', libopts['access_note'],
-                'b', options['consortium'] + ' ; ',
-                'e', authnote,
+                'b', options['consortium'],
+                'e', options['authorization'],
                  '9', libopts['code']
              ]
          )
@@ -532,26 +474,8 @@ def add_cat_source(record, options):
          )
          record.add_field(forty)
  
-def add_marc_source(record, options):
-    """
-    Add a 598 field identifying the source MARC file name and processing date
-    """
-
-    global RECORD_COUNT
  
-    source = os.path.basename(options['input'])
-
-    marc_source = pymarc.Field(tag = '598',
-        indicators = [' ', ' '],
-        subfields = [
-            'a', source,
-            'b', date.today().isoformat(),
-            'c', str(RECORD_COUNT)
-        ]
-    )
-    record.add_field(marc_source)
-
-def process_urls(field, options, publisher):
+def process_urls(field, options):
      """Creates 856 fields required by Conifer"""
  
      new_fields = []
@@ -567,10 +491,6 @@ def process_urls(field, options, publisher):
              new_fields.append(enrich)
      else:
          for lib in options['libraries']:
-
-            # Tweak for Algoma for combined CUP/OUP
-            if lib == 'algoma' and 'Cambridge' in publisher:
-                continue
              data = options['settings'].get_settings(lib)
              subs = get_subfields(field, data)
              eight_five_six = pymarc.Field(tag = '856',
@@ -697,5 +617,4 @@ def get_subfields(field, data):
  
  
  if __name__ == '__main__':
-    
      process_records(parse_opts())
author	Dan Scott <dan@coffeecode.net>
	Thu, 28 Jul 2011 15:16:42 +0000 (11:16 -0400)
committer	Dan Scott <dscott@laurentian.ca>
	Tue, 7 May 2013 18:36:33 +0000 (14:36 -0400)