Adding 2 options to prep_ebook_records.py script
author Art Rhyno <art632000@yahoo.ca>
Thu, 4 Oct 2012 18:32:46 +0000 (14:32 -0400)
committer Dan Scott <dscott@laurentian.ca>
Tue, 7 May 2013 18:58:04 +0000 (14:58 -0400)
This supports 2 additional options to Dan Scott's ebook script:

    -t / --tcn : The name of the file to route TCN duplicates to.

The TCN search takes the 001 field value and makes an OpenSRF
open-ils.search.biblio.tcn call.

    -u / --url : The name of the file to route URL duplicates to.

The URL search is based on the 856 $u subfield and is done with the OpenSRF
open-ils.search.biblio.multiclass.query call.

Signed-off-by: Art Rhyno <art632000@yahoo.ca>
Signed-off-by: Dan Scott <dscott@laurentian.ca>
tools/ebooks/prep_ebook_records.py

index 0884634..7ce07e1 100644 (file)
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 """
 Prepare sets of electronic resource MARC records for loading into Evergreen
@@ -14,11 +14,17 @@ requirements that would be the same for each record and therefore can
 be accommodated in batch load.
 """
 
-import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2
+import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2, json
+from urllib import quote
 from datetime import date
 from BeautifulSoup import BeautifulSoup
 
 RECORD_COUNT = 0
+DUP_COUNT = 0
+GATEWAY_URL = "http://www.concat.ca/osrf-gateway-v1"
+OPENSRF_ISBN_CALL = "open-ils.search.biblio.isbn"
+OPENSRF_TCN_CALL = "open-ils.search.biblio.tcn"
+OPENSRF_KEYWORD_CALL = "open-ils.search.biblio.multiclass.query"
 
 class Institution():
     """Defines standard settings for each Conifer institution"""
@@ -27,6 +33,8 @@ class Institution():
         """Initialize the Institution object"""
         self.algoma = { \
             "code": "ALGOMASYS", \
+            "lac_symbol": "OSTMA", \
+            "org_unit": "111", \
             "ebrary_code": "algomauca", \
             "proxy": "http://libproxy.auc.ca/login?url=", \
             "link_text": "Available online", \
@@ -36,6 +44,8 @@ class Institution():
 
         self.boreal = { \
             "code": "BOREALSYS", \
+            "lac_symbol": "BOREALSYS", \
+            "org_unit": "135", \
             "ebrary_code": "ocls", \
             "proxy": "http://ra.ocls.ca/ra/login.aspx?url=", \
             "link_text": "Disponible en ligne", \
@@ -44,6 +54,8 @@ class Institution():
         
         self.laurentian = { \
             "code": "LUSYS", \
+            "lac_symbol": "OSUL", \
+            "org_unit": "105", \
             "ebrary_code": "jndlu", \
             "gale_code": "sudb78095", \
             "proxy": "https://librweb.laurentian.ca/login?url=", \
@@ -54,6 +66,8 @@ class Institution():
 
         self.windsor = { \
             "code": "WINDSYS", \
+            "lac_symbol": "OWA", \
+            "org_unit": "106", \
             "ebrary_code": "oculwindsor", \
             "gale_code": "wind05901", \
             "proxy": "http://ezproxy.uwindsor.ca/login?url=", \
@@ -121,6 +135,12 @@ Required arguments:
     -W / --windsor : Add an 856 for University of Windsor
 
 Optional arguments:
+    -d / --duplicate : The name of the file to route ISBN duplicates to.
+
+    -t / --tcn : The name of the file to route TCN duplicates to.
+
+    -u / --url : The name of the file to route URL duplicates to.
+
     -e / --ebrary : Put Ebrary 001 into 924 as a URN for SFX lookup purposes
 
     -n / --note : The text of the internal note to be inserted into a 590 field.
@@ -143,10 +163,13 @@ def consolidate_options(opts):
         '-o': '--output',
         '-a': '--authorization',
         '-c': '--consortium',
+        '-d': '--duplicate',
         '-e': '--ebrary',
         '-p': '--publisher',
         '-P': '--platform',
         '-n': '--note',
+        '-t': '--tcn',
+        '-u': '--url',
         '-A': '--algoma',
         '-B': '--boreal',
         '-L': '--laurentian',
@@ -213,6 +236,15 @@ def check_options(options):
     clean_opts['authorization'] = options['--authorization'].decode('utf-8')
 
 
+    if '--duplicate' in options:
+        clean_opts['duplicate'] = options['--duplicate']
+
+    if '--tcn' in options:
+        clean_opts['tcn'] = options['--tcn']
+
+    if '--url' in options:
+        clean_opts['url'] = options['--url']
+
     if '--ebrary' in options:
         clean_opts['ebrary'] = True
 
@@ -232,6 +264,84 @@ def check_options(options):
 
     return clean_opts
 
+def evergreen_request(method, *args, **kwargs):
+    """
+    Call an OpenSRF method through the HTTP gateway and return its payload
+
+    The service name is taken from the first two dotted components of
+    'method'. Keyword arguments are sent as URL-quoted query parameters and
+    each positional argument is sent as a JSON-encoded 'param' value.
+
+    Raises an Exception if the gateway response status is not 200.
+    """
+    service = '.'.join(method.split('.')[:2])
+    kwargs.update({'service': service, 'method': method})
+    params = ['%s=%s' % (k, quote(v)) for k, v in kwargs.items()]
+    params += ['param=%s' % quote(json.dumps(a)) for a in args]
+    url = '%s?%s' % (GATEWAY_URL, '&'.join(params))
+    req = urllib2.urlopen(url)
+    try:
+        resp = json.load(req)
+    finally:
+        # Close the HTTP response even when the body is not valid JSON;
+        # the duplicate checkers call this function inside loops, so an
+        # unclosed response per call adds up over a large batch
+        req.close()
+    if resp['status'] != 200:
+        raise Exception('error during evergreen request', resp)
+    return resp['payload']
+
+def url_check(record, options):
+    """
+    Search Evergreen for records matching this record's 856 $u values
+
+    Runs a keyword query for every URL against every library listed in
+    options['libraries'], scoped to that library's 'org_unit' setting, and
+    prints a line for each hit.
+
+    Returns a (match_id, match) tuple: the last matching entry from the
+    search results (0 if there were none) and whether anything matched.
+    """
+    global DUP_COUNT, RECORD_COUNT
+
+    found = False
+    last_hit = 0
+    for field in record.get_fields('856'):
+        for link in field.get_subfields('u'):
+            for library in options['libraries']:
+                settings = options['settings'].get_settings(library)
+                search_args = {
+                    'org_unit': settings['org_unit'],
+                    'depth': 1,
+                    'limit': 5,
+                    'offset': 0,
+                    'visibility_limit': 3000,
+                    'default_class': 'keyword'
+                }
+                result = evergreen_request(
+                    OPENSRF_KEYWORD_CALL, search_args, link, 1
+                )
+                for hit in result[0]['ids']:
+                    # NOTE(review): each entry is indexed below, so it is
+                    # presumably a list whose first element is the bib ID;
+                    # the whole entry (not hit[0]) is what gets returned -
+                    # confirm that callers expect that
+                    last_hit = hit
+                    message = "* %d of %d - URL match on %s for %s" % (
+                        DUP_COUNT + 1, RECORD_COUNT, link, hit[0]
+                    )
+                    print(message)
+                    found = True
+    return last_hit, found
+
+def tcn_check(record):
+    """
+    Search Evergreen for records whose TCN matches this record's 001 value
+
+    Prints a line for each hit and returns a (match_id, match) tuple: the
+    last matching ID from the search results (0 if there were none) and
+    whether anything matched.
+    """
+    global DUP_COUNT, RECORD_COUNT
+
+    found = False
+    last_hit = 0
+    for control_field in record.get_fields('001'):
+        tcn_val = control_field.value()
+        result = evergreen_request(OPENSRF_TCN_CALL, tcn_val)
+        for hit in result[0]['ids']:
+            last_hit = hit
+            message = "* %d of %d - TCN match on %s for %s" % (
+                DUP_COUNT + 1, RECORD_COUNT, tcn_val, hit
+            )
+            print(message)
+            found = True
+    return last_hit, found
+
+def isbn_check(record):
+    """
+    Search Evergreen for records matching this record's ISBNs
+
+    Each 020/024 $a and $z value is normalized with clean_isbn() and looked
+    up with the ISBN search call; a line is printed for each hit.
+
+    Returns a (match_id, match) tuple: the last matching ID from the search
+    results (0 if there were none) and whether anything matched.
+    """
+    global DUP_COUNT, RECORD_COUNT
+
+    match = False
+    match_id = 0
+    for isbn in record.get_fields('020', '024'):
+        for isbnval in isbn.get_subfields('a', 'z'):
+            isbn_val = clean_isbn(isbnval)
+            if not isbn_val:
+                # NOTE(review): assumes clean_isbn() returns a falsy value
+                # when the subfield holds nothing usable - confirm
+                continue
+            # Search on the normalized value: the raw subfield used to be
+            # sent here even though the match message reports the cleaned
+            # one
+            isbn_info = evergreen_request(OPENSRF_ISBN_CALL, isbn_val)
+            for bib_id in isbn_info[0]['ids']:
+                match_id = bib_id
+                print("* %d of %d - ISBN match on %s for %s"
+                    % (DUP_COUNT + 1, RECORD_COUNT, isbn_val, bib_id)
+                )
+                match = True
+    return match_id, match
+
 def append_period(text):
     """
     Append a period to the incoming text if required
@@ -256,10 +366,10 @@ def check_libraries(options):
 def parse_opts():
     """Get command-line arguments from the script"""
     try:
-        _short_opts = 'i:o:a:c:p:ABLWen:P:s:h'
+        _short_opts = 'i:o:a:c:p:P:ABLWe:d:t:u:n:s:h'
         _long_opts = ['input=', 'output=', 'authorization=', 'consortium=', 
-            'publisher=', 'algoma', 'boreal', 'laurentian', 'windsor', 'ebrary',
-            'note=', 'platform=', 'sample=', 'help'
+            'publisher=', 'platform=', 'algoma', 'boreal', 'laurentian', 'windsor', 'ebrary',
+            'duplicate=', 'tcn=', 'url=', 'note=','sample=', 'help'
         ]
         opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts) 
     except getopt.GetoptError, ex:
@@ -273,7 +383,11 @@ def process_records(options):
     """Converts raw ebook MARC records to Conifer-ready MARC records"""
 
     global RECORD_COUNT
+    global DUP_COUNT
     sample = ''
+    duplicate = ''
+    tcn = ''
+    url = ''
 
     try:
         reader = pymarc.MARCReader(
@@ -286,8 +400,26 @@ def process_records(options):
         writer = pymarc.MARCWriter(open(options['output'], mode='wb'))
     except Exception, ex:
         print("Could not open output file [%s]" % options['output'])
-
-    if ('sample' in options):
+        
+    if 'duplicate' in options:
+       try:
+            duplicate = pymarc.MARCWriter(open(options['duplicate'], mode='wb'))
+       except Exception, ex:
+           print("Could not open output file [%s]" % options['duplicate'])
+
+    if 'tcn' in options:
+       try:
+            tcn = pymarc.MARCWriter(open(options['tcn'], mode='wb'))
+       except Exception, ex:
+           print("Could not open output file [%s]" % options['tcn'])
+
+    if 'url' in options:
+       try:
+            url = pymarc.MARCWriter(open(options['url'], mode='wb'))
+       except Exception, ex:
+           print("Could not open output file [%s]" % options['url'])
+
+    if 'sample' in options:
         sample = pymarc.MARCWriter(open(options['sample'], mode='wb'))
 
     for record in reader:
@@ -297,16 +429,42 @@ def process_records(options):
                 print("* No 856 for record # %s in file %s"
                         % (RECORD_COUNT, options['input'])
                 )
-
-            new_record = process_fields(record, options)
-
-            writer.write(new_record)
-            if (sample and ((RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0))):
-                sample.write(new_record)
+            else:
+                print ("%d - %s\n" % (RECORD_COUNT,record['856']))
+
+
+            new_record = ''
+            dup_flag = False
+
+            if duplicate:
+               bib_id, dup_flag = isbn_check(record)
+               new_record = process_fields(record, options, bib_id, dup_flag)
+               if dup_flag:
+                  duplicate.write(new_record)
+            if tcn:
+               bib_id, dup_flag = tcn_check(record)
+               new_record = process_fields(record, options, bib_id, dup_flag)
+               if dup_flag:
+                  tcn.write(new_record)
+            if url:
+               bib_id, dup_flag = url_check(record, options)
+               new_record = process_fields(record, options, bib_id, dup_flag)
+               if dup_flag:
+                  url.write(new_record)
+
+            if not dup_flag:
+                new_record = process_fields(record, options, 0, False)
+            else:
+                DUP_COUNT += 1
+            
+            if new_record:
+                writer.write(new_record)
+                if (sample and ((RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0))):
+                    sample.write(new_record)
         except Exception, ex:
             print("* Error processing record %d - %s" % (RECORD_COUNT, ex))
 
-def process_fields(record, options):
+def process_fields(record, options, bib_id, dup_flag):
     """Decide which fields to add, delete, and keep"""
 
     new_record = pymarc.Record(to_unicode=True, force_utf8=True)
@@ -316,19 +474,33 @@ def process_fields(record, options):
 
     # 590
     if 'note' in options:
+        note_value = options['note']
         note = pymarc.Field(tag = '590',
             indicators = [' ', ' '],
             subfields = [
-                'a', options['note']
+                'a', note_value
             ]
         )
         record.add_field(note)
 
+    # 909
+    if dup_flag: 
+        dup_value = bib_id + ""
+        dup = pymarc.Field(tag = '909',
+            indicators = [' ', ' '],
+            subfields = [
+                'a', dup_value
+            ]
+        )
+        record.add_field(dup)
+
     add_marc_source(record, options) # 598
     publisher = add_publisher(record, options) # 710
     add_restriction(record, options, publisher) # 506
     add_platform(record, options) # 710
 
+    # USE FOR SKIPPING SFX
+    # marked_isbn = False
     marked_isbn = mark_isbn_for_sfx(record, options)
 
     for field in record.get_fields():
@@ -605,7 +777,7 @@ def mark_isbn_for_sfx(record, options):
                     return True
 
     # For ebrary records, add a 924 for the custom URN
-    if options['ebrary'] is True:
+    if 'ebrary' in options:
         urn = None
         for scn in record.get_fields('001'):
             urn = pymarc.Field(tag = '924',
@@ -710,7 +882,7 @@ def add_restriction(new_record, options, publisher):
                 'a', append_space_semi_space(libopts['access_note']),
                 'b', append_space_semi_space(options['consortium']),
                 'e', authnote,
-                '9', libopts['code']
+                '9', libopts['lac_symbol']
             ]
         )
         new_record.add_field(note)