From: Dan Scott Date: Wed, 22 Jun 2011 16:04:10 +0000 (-0400) Subject: Implement ISBN lookup in SFX X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=624ed5975f1872933b723250b359493c5916b0ae;p=contrib%2FConifer.git Implement ISBN lookup in SFX Check the SFX knowledgebase for a matching ISBN and mark it via the magical $9 SFX subfield so that we can key off of that for subsequent queries via the catalogue. Given that only one library may be running this script for a set of ebook records, ensure that we're checking the results. Signed-off-by: Dan Scott --- diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py index 92fd8d1c05..b18c54858e 100644 --- a/tools/ebooks/prep_ebook_records.py +++ b/tools/ebooks/prep_ebook_records.py @@ -26,6 +26,7 @@ class Institution(): "ebrary_code": "algomauca", \ "proxy": "http://libproxy.auc.ca/login?url=", \ "link_text": "Available online", \ + "sfx_url": "http://sfx.scholarsportal.info/algoma", \ "access_note": "Access restricted to users with a valid Algoma University ID" \ } @@ -34,6 +35,7 @@ class Institution(): "ebrary_code": "jndlu", \ "proxy": "https://librweb.laurentian.ca/login?url=", \ "link_text": "Available online / disponible en ligne", \ + "sfx_url": "http://sfx.scholarsportal.info/laurentian", \ "access_note": "Access restricted to users with a valid Laurentian University ID" \ } @@ -42,6 +44,7 @@ class Institution(): "ebrary_code": "oculwindsor", \ "proxy": "http://ezproxy.uwindsor.ca/login?url=", \ "link_text": "To view Windsor's electronic resource click here.", \ + "sfx_url": "http://sfx.scholarsportal.info/windsor", \ "access_note": "Access restricted to users with a valid University of Windsor ID" \ } @@ -161,7 +164,7 @@ def check_options(options): print "* Missing -c / --consortium argument!" _help = True - if '--restriction' not in options: + if '--authorization' not in options: print "* Missing -a / --authorization argument!" _help = True @@ -229,9 +232,11 @@ def check_libraries(options): def parse_opts(): """Get command-line arguments from the script""" try: - _short_opts = 'i:o:p:ALWn:s:h' - _long_opts = ['input=', 'output=', 'publisher=', 'algoma', \ - 'laurentian', 'windsor', 'note=', 'sample=', 'help'] + _short_opts = 'i:o:a:c:p:ALWn:s:h' + _long_opts = ['input=', 'output=', 'authorization=', 'consortium=', + 'publisher=', 'algoma', 'laurentian', 'windsor', 'note=', + 'sample=', 'help' + ] opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts) except getopt.GetoptError, ex: print "* %s" % str(ex) @@ -291,6 +296,11 @@ def process_fields(record, options): add_publisher(record, new_record, options) add_restriction(new_record, options) + marked_isbn = mark_isbn_for_sfx(record, options) + if not marked_isbn: + print("No matching ISBN target found in SFX for %s" % + (new_record['856']['u']) + ) if 'note' in options: note = pymarc.Field(tag = '590', @@ -340,6 +350,82 @@ def add_publisher(record, new_record, options): ) new_record.add_field(seven_ten) +def mark_isbn_for_sfx(record, options): + """ + Adds a $9 subfield to the 020 (ISBN) field to use for SFX look-ups + + Assumes that the holdings in the SFX knowledgebase have been enabled + before the ebook processing script runs, or else we will not find any + matches. + """ + + for isbn in record.get_fields('020'): + for isbnval in isbn.get_subfields('a'): + isbnval = clean_isbn(isbnval) + sfx = 'http://sfx.scholarsportal.info/windsor' + # check to see if there are holdings in SFX knowledgebase + # use one of the participating libraries + if 'windsor' in options['libraries']: + sfx = options['settings'].get_settings('windsor')['sfx_url'] + elif 'laurentian' in options['libraries']: + sfx = options['settings'].get_settings('laurentian')['sfx_url'] + elif 'algoma' in options['libraries']: + sfx = options['settings'].get_settings('algoma')['sfx_url'] + + url = "%s?url_ver=Z39.88-2004&url_ctx_fmt=infofi/fmt:kev:mtx:ctx&" \ + "ctx_enc=UTF-8&ctx_ver=Z39.88-2004&rfr_id=info:sid/evergreen&" \ + "sfx.ignore_date_threshold=1&" \ + "sfx.response_type=multi_obj_detailed_xml" \ + "&__service_type=getFullTxt&rft.isbn=%s" % (sfx, isbnval) + + try: + req = urllib2.urlopen(url) + sfx_res = BeautifulSoup(req.read()) + except urllib2.HTTPError, ex: + print("%s for URL %s" % (ex, url)) + return None + except urllib2.URLError, ex: + print("%s for URL %s" % (ex, url)) + return None + + # We want a target with a service_type element of 'getFullTxt' + targets = sfx_res.ctx_obj.ctx_obj_targets.findAll( + 'target', recursive=False + ) + + if len(targets) == 0: + # No SFX targets found for this ISBN - next! + continue + + for target in targets: + if target.service_type.renderContents() == 'getFullTxt': + # Add the $9 subfield to mark this as a good one + isbn.add_subfield('9', 'SFX') + return True + return False + +def clean_isbn(isbn): + """ + Return a normalized ISBN from a MARC subfield + + Trims whitespace, removes hyphens, and removes trailing descriptions + like '(pbk)' and the like so that the ISBN can be reliably used in lookups + """ + + # Remove whitespace from both sides + isbn = isbn.strip() + + # Grab the first string beginning with a digit + isbn_match = re.search(r'^[\D]*([\d]+\S+).*?$', isbn) + + if not isbn_match.group(1): + return None + + # Replace hyphens + isbn = isbn_match.group(1).replace('-', ''); + + return isbn + def add_restriction(new_record, options): """ Adds a 506 access restriction note per institution @@ -527,5 +613,5 @@ def get_subfields(field, data): if __name__ == '__main__': - + process_records(parse_opts())