Adding 2 options to prep_ebook_records.py script

author Art Rhyno <art632000@yahoo.ca>

Thu, 4 Oct 2012 18:32:46 +0000 (14:32 -0400)

committer Dan Scott <dscott@laurentian.ca>

Tue, 7 May 2013 18:58:04 +0000 (14:58 -0400)
author Art Rhyno <art632000@yahoo.ca>
Thu, 4 Oct 2012 18:32:46 +0000 (14:32 -0400)
committer Dan Scott <dscott@laurentian.ca>
Tue, 7 May 2013 18:58:04 +0000 (14:58 -0400)
diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py

index 0884634..7ce07e1 100644 (file)
--- a/tools/ebooks/prep_ebook_records.py
+++ b/tools/ebooks/prep_ebook_records.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python
  # -*- coding: utf-8 -*-
  """
  Prepare sets of electronic resource MARC records for loading into Evergreen
@@ -14,11 +14,17 @@ requirements that would be the same for each record and therefore can
  be accommodated in batch load.
  """
  
-import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2
+import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2, json
+from urllib import quote
  from datetime import date
  from BeautifulSoup import BeautifulSoup
  
  RECORD_COUNT = 0
+DUP_COUNT = 0
+GATEWAY_URL = "http://www.concat.ca/osrf-gateway-v1"
+OPENSRF_ISBN_CALL = "open-ils.search.biblio.isbn"
+OPENSRF_TCN_CALL = "open-ils.search.biblio.tcn"
+OPENSRF_KEYWORD_CALL = "open-ils.search.biblio.multiclass.query"
  
  class Institution():
      """Defines standard settings for each Conifer institution"""
@@ -27,6 +33,8 @@ class Institution():
          """Initialize the Institution object"""
          self.algoma = { \
              "code": "ALGOMASYS", \
+            "lac_symbol": "OSTMA", \
+            "org_unit": "111", \
              "ebrary_code": "algomauca", \
              "proxy": "http://libproxy.auc.ca/login?url=", \
              "link_text": "Available online", \
@@ -36,6 +44,8 @@ class Institution():
  
          self.boreal = { \
              "code": "BOREALSYS", \
+            "lac_symbol": "BOREALSYS", \
+            "org_unit": "135", \
              "ebrary_code": "ocls", \
              "proxy": "http://ra.ocls.ca/ra/login.aspx?url=", \
              "link_text": "Disponible en ligne", \
@@ -44,6 +54,8 @@ class Institution():
          
          self.laurentian = { \
              "code": "LUSYS", \
+            "lac_symbol": "OSUL", \
+            "org_unit": "105", \
              "ebrary_code": "jndlu", \
              "gale_code": "sudb78095", \
              "proxy": "https://librweb.laurentian.ca/login?url=", \
@@ -54,6 +66,8 @@ class Institution():
  
          self.windsor = { \
              "code": "WINDSYS", \
+            "lac_symbol": "OWA", \
+            "org_unit": "106", \
              "ebrary_code": "oculwindsor", \
              "gale_code": "wind05901", \
              "proxy": "http://ezproxy.uwindsor.ca/login?url=", \
@@ -121,6 +135,12 @@ Required arguments:
      -W / --windsor : Add an 856 for University of Windsor
  
  Optional arguments:
+    -d / --duplicate : The name of the file to route ISBN duplicates to.
+
+    -t / --tcn : The name of the file to route TCN duplicates to.
+
+    -u / --url : The name of the file to route URL duplicates to.
+
      -e / --ebrary : Put Ebrary 001 into 924 as a URN for SFX lookup purposes
  
      -n / --note : The text of the internal note to be inserted into a 590 field.
@@ -143,10 +163,13 @@ def consolidate_options(opts):
          '-o': '--output',
          '-a': '--authorization',
          '-c': '--consortium',
+        '-d': '--duplicate',
          '-e': '--ebrary',
          '-p': '--publisher',
          '-P': '--platform',
          '-n': '--note',
+        '-t': '--tcn',
+        '-u': '--url',
          '-A': '--algoma',
          '-B': '--boreal',
          '-L': '--laurentian',
@@ -213,6 +236,15 @@ def check_options(options):
      clean_opts['authorization'] = options['--authorization'].decode('utf-8')
  
  
+    if '--duplicate' in options:
+        clean_opts['duplicate'] = options['--duplicate']
+
+    if '--tcn' in options:
+        clean_opts['tcn'] = options['--tcn']
+
+    if '--url' in options:
+        clean_opts['url'] = options['--url']
+
      if '--ebrary' in options:
          clean_opts['ebrary'] = True
  
@@ -232,6 +264,84 @@ def check_options(options):
  
      return clean_opts
  
+def evergreen_request(method, *args, **kwargs):
+    service = '.'.join(method.split('.')[:2])
+    kwargs.update({'service':service, 'method':method})
+    params =  ['%s=%s' % (k,quote(v)) for k,v in kwargs.items()]
+    params += ['param=%s' % quote(json.dumps(a)) for a in args]
+    url = '%s?%s' % (GATEWAY_URL, '&'.join(params)) 
+    #print '--->', url
+    req = urllib2.urlopen(url)
+    resp = json.load(req)
+    if resp['status'] != 200:
+        raise Exception('error during evergreen request', resp)
+    payload = resp['payload']
+    #print '<---', payload
+    return payload
+
+def url_check(record, options):
+    global DUP_COUNT, RECORD_COUNT
+
+    match = False
+    match_id = 0
+    for url in record.get_fields('856'):
+        for urlval in url.get_subfields('u'):
+            # print "urlval", urlval
+            for library in options['libraries']:
+                libopts = options['settings'].get_settings(library)
+                keyword_info = evergreen_request(OPENSRF_KEYWORD_CALL,
+                    {'org_unit': libopts['org_unit'],
+                    'depth': 1, 'limit': 5, 'offset': 0,
+                    'visibility_limit': 3000,
+                    'default_class': 'keyword'},
+                    urlval, 1)
+                bib_ids = keyword_info[0]['ids']
+                for bib_id in bib_ids:
+                    match_id = bib_id
+                    print("* %d of %d - URL match on %s for %s"
+                       % (DUP_COUNT + 1, RECORD_COUNT, urlval, bib_id[0])
+                    )
+                    match = True
+    return match_id, match
+
+def tcn_check(record):
+    global DUP_COUNT, RECORD_COUNT
+
+    match = False
+    match_id = 0
+    for tcn in record.get_fields('001'):
+        tcn_val = tcn.value()
+        tcn_info = evergreen_request(OPENSRF_TCN_CALL,tcn_val)
+        bib_ids = tcn_info[0]['ids']
+        # print "tcn_info", tcn_info
+        for bib_id in bib_ids:
+            match_id = bib_id
+            print("* %d of %d - TCN match on %s for %s"
+                % (DUP_COUNT + 1, RECORD_COUNT, tcn_val, bib_id)
+            )
+            match = True
+    return match_id, match
+
+def isbn_check(record):
+    global DUP_COUNT, RECORD_COUNT
+
+    match = False
+    match_id = 0
+    for isbn in record.get_fields('020', '024'):
+        for isbnval in isbn.get_subfields('a', 'z'):
+            isbn_val = clean_isbn(isbnval)
+            isbn_info = evergreen_request(OPENSRF_ISBN_CALL,isbnval)
+            match_count = isbn_info[0]['count']
+            #print "count", isbn_info[0]['count']
+            bib_ids = isbn_info[0]['ids']
+            for bib_id in bib_ids:
+                match_id = bib_id
+                print("* %d of %d - ISBN match on %s for %s"
+                    % (DUP_COUNT + 1, RECORD_COUNT, isbn_val, bib_id)
+                )
+                match = True
+    return match_id, match
+
  def append_period(text):
      """
      Append a period to the incoming text if required
@@ -256,10 +366,10 @@ def check_libraries(options):
  def parse_opts():
      """Get command-line arguments from the script"""
      try:
-        _short_opts = 'i:o:a:c:p:ABLWen:P:s:h'
+        _short_opts = 'i:o:a:c:p:P:ABLWe:d:t:u:n:s:h'
          _long_opts = ['input=', 'output=', 'authorization=', 'consortium=', 
-            'publisher=', 'algoma', 'boreal', 'laurentian', 'windsor', 'ebrary',
-            'note=', 'platform=', 'sample=', 'help'
+            'publisher=', 'platform=', 'algoma', 'boreal', 'laurentian', 'windsor', 'ebrary',
+            'duplicate=', 'tcn=', 'url=', 'note=','sample=', 'help'
          ]
          opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts) 
      except getopt.GetoptError, ex:
@@ -273,7 +383,11 @@ def process_records(options):
      """Converts raw ebook MARC records to Conifer-ready MARC records"""
  
      global RECORD_COUNT
+    global DUP_COUNT
      sample = ''
+    duplicate = ''
+    tcn = ''
+    url = ''
  
      try:
          reader = pymarc.MARCReader(
@@ -286,8 +400,26 @@ def process_records(options):
          writer = pymarc.MARCWriter(open(options['output'], mode='wb'))
      except Exception, ex:
          print("Could not open output file [%s]" % options['output'])
-
-    if ('sample' in options):
+        
+    if 'duplicate' in options:
+       try:
+            duplicate = pymarc.MARCWriter(open(options['duplicate'], mode='wb'))
+       except Exception, ex:
+           print("Could not open output file [%s]" % options['duplicate'])
+
+    if 'tcn' in options:
+       try:
+            tcn = pymarc.MARCWriter(open(options['tcn'], mode='wb'))
+       except Exception, ex:
+           print("Could not open output file [%s]" % options['tcn'])
+
+    if 'url' in options:
+       try:
+            url = pymarc.MARCWriter(open(options['url'], mode='wb'))
+       except Exception, ex:
+           print("Could not open output file [%s]" % options['url'])
+
+    if 'sample' in options:
          sample = pymarc.MARCWriter(open(options['sample'], mode='wb'))
  
      for record in reader:
@@ -297,16 +429,42 @@ def process_records(options):
                  print("* No 856 for record # %s in file %s"
                          % (RECORD_COUNT, options['input'])
                  )
-
-            new_record = process_fields(record, options)
-
-            writer.write(new_record)
-            if (sample and ((RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0))):
-                sample.write(new_record)
+            else:
+                print ("%d - %s\n" % (RECORD_COUNT,record['856']))
+
+
+            new_record = ''
+            dup_flag = False
+
+            if duplicate:
+               bib_id, dup_flag = isbn_check(record)
+               new_record = process_fields(record, options, bib_id, dup_flag)
+               if dup_flag:
+                  duplicate.write(new_record)
+            if tcn:
+               bib_id, dup_flag = tcn_check(record)
+               new_record = process_fields(record, options, bib_id, dup_flag)
+               if dup_flag:
+                  tcn.write(new_record)
+            if url:
+               bib_id, dup_flag = url_check(record, options)
+               new_record = process_fields(record, options, bib_id, dup_flag)
+               if dup_flag:
+                  url.write(new_record)
+
+            if not dup_flag:
+                new_record = process_fields(record, options, 0, False)
+            else:
+                DUP_COUNT += 1
+            
+            if new_record:
+                writer.write(new_record)
+                if (sample and ((RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0))):
+                    sample.write(new_record)
          except Exception, ex:
              print("* Error processing record %d - %s" % (RECORD_COUNT, ex))
  
-def process_fields(record, options):
+def process_fields(record, options, bib_id, dup_flag):
      """Decide which fields to add, delete, and keep"""
  
      new_record = pymarc.Record(to_unicode=True, force_utf8=True)
@@ -316,19 +474,33 @@ def process_fields(record, options):
  
      # 590
      if 'note' in options:
+        note_value = options['note']
          note = pymarc.Field(tag = '590',
              indicators = [' ', ' '],
              subfields = [
-                'a', options['note']
+                'a', note_value
              ]
          )
          record.add_field(note)
  
+    # 909
+    if dup_flag: 
+        dup_value = bib_id + ""
+        dup = pymarc.Field(tag = '909',
+            indicators = [' ', ' '],
+            subfields = [
+                'a', dup_value
+            ]
+        )
+        record.add_field(dup)
+
      add_marc_source(record, options) # 598
      publisher = add_publisher(record, options) # 710
      add_restriction(record, options, publisher) # 506
      add_platform(record, options) # 710
  
+    # USE FOR SKIPPING SFX
+    # marked_isbn = False
      marked_isbn = mark_isbn_for_sfx(record, options)
  
      for field in record.get_fields():
@@ -605,7 +777,7 @@ def mark_isbn_for_sfx(record, options):
                      return True
  
      # For ebrary records, add a 924 for the custom URN
-    if options['ebrary'] is True:
+    if 'ebrary' in options:
          urn = None
          for scn in record.get_fields('001'):
              urn = pymarc.Field(tag = '924',
@@ -710,7 +882,7 @@ def add_restriction(new_record, options, publisher):
                  'a', append_space_semi_space(libopts['access_note']),
                  'b', append_space_semi_space(options['consortium']),
                  'e', authnote,
-                '9', libopts['code']
+                '9', libopts['lac_symbol']
              ]
          )
          new_record.add_field(note)
author	Art Rhyno <art632000@yahoo.ca>
	Thu, 4 Oct 2012 18:32:46 +0000 (14:32 -0400)
committer	Dan Scott <dscott@laurentian.ca>
	Tue, 7 May 2013 18:58:04 +0000 (14:58 -0400)