From: Art Rhyno
Date: Thu, 4 Oct 2012 18:32:46 +0000 (-0400)
Subject: Adding 2 options to prep_ebook_records.py script
X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=e60ce16e1fc7cebdb8da57e81d659e5990ad5c07;p=contrib%2FConifer.git

Adding 2 options to prep_ebook_records.py script

This adds 2 options to Dan Scott's ebook script:

-t / --tcn : The name of the file to route TCN duplicates to. The TCN
search takes the 001 field and does an OpenSRF
open-ils.search.biblio.tcn call.

-u / --url : The name of the file to route URL duplicates to. The URL
search is based on the 856 $u field and is done with the OpenSRF
open-ils.search.biblio.multiclass.query call.

Signed-off-by: Art Rhyno
Signed-off-by: Dan Scott
---

diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py
index 0884634e58..7ce07e1406 100644
--- a/tools/ebooks/prep_ebook_records.py
+++ b/tools/ebooks/prep_ebook_records.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 """
 Prepare sets of electronic resource MARC records for loading into Evergreen
@@ -14,11 +14,17 @@ requirements that would be the same for each record and therefore can be
 accommodated in batch load.
 """

-import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2
+import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2, json
+from urllib import quote
 from datetime import date
 from BeautifulSoup import BeautifulSoup

 RECORD_COUNT = 0
+DUP_COUNT = 0
+GATEWAY_URL = "http://www.concat.ca/osrf-gateway-v1"
+OPENSRF_ISBN_CALL = "open-ils.search.biblio.isbn"
+OPENSRF_TCN_CALL = "open-ils.search.biblio.tcn"
+OPENSRF_KEYWORD_CALL = "open-ils.search.biblio.multiclass.query"

 class Institution():
     """Defines standard settings for each Conifer institution"""
@@ -27,6 +33,8 @@ class Institution():
         """Initialize the Institution object"""
         self.algoma = { \
             "code": "ALGOMASYS", \
+            "lac_symbol": "OSTMA", \
+            "org_unit": "111", \
             "ebrary_code": "algomauca", \
             "proxy": "http://libproxy.auc.ca/login?url=", \
             "link_text": "Available online", \
@@ -36,6 +44,8 @@ class Institution():

         self.boreal = { \
             "code": "BOREALSYS", \
+            "lac_symbol": "BOREALSYS", \
+            "org_unit": "135", \
             "ebrary_code": "ocls", \
             "proxy": "http://ra.ocls.ca/ra/login.aspx?url=", \
             "link_text": "Disponible en ligne", \
@@ -44,6 +54,8 @@ class Institution():

         self.laurentian = { \
             "code": "LUSYS", \
+            "lac_symbol": "OSUL", \
+            "org_unit": "105", \
             "ebrary_code": "jndlu", \
             "gale_code": "sudb78095", \
             "proxy": "https://librweb.laurentian.ca/login?url=", \
@@ -54,6 +66,8 @@ class Institution():

         self.windsor = { \
             "code": "WINDSYS", \
+            "lac_symbol": "OWA", \
+            "org_unit": "106", \
             "ebrary_code": "oculwindsor", \
             "gale_code": "wind05901", \
             "proxy": "http://ezproxy.uwindsor.ca/login?url=", \
@@ -121,6 +135,12 @@ Required arguments:
     -W / --windsor : Add an 856 for University of Windsor

 Optional arguments:
+    -d / --duplicate : The name of the file to route ISBN duplicates to.
+
+    -t / --tcn : The name of the file to route TCN duplicates to.
+
+    -u / --url : The name of the file to route URL duplicates to.
+
     -e / --ebrary : Put Ebrary 001 into 924 as a URN for SFX lookup purposes

     -n / --note : The text of the internal note to be inserted into a 590 field.
@@ -143,10 +163,13 @@ def consolidate_options(opts):
                 '-o': '--output',
                 '-a': '--authorization',
                 '-c': '--consortium',
+                '-d': '--duplicate',
                 '-e': '--ebrary',
                 '-p': '--publisher',
                 '-P': '--platform',
                 '-n': '--note',
+                '-t': '--tcn',
+                '-u': '--url',
                 '-A': '--algoma',
                 '-B': '--boreal',
                 '-L': '--laurentian',
@@ -213,6 +236,15 @@ def check_options(options):
         clean_opts['authorization'] = options['--authorization'].decode('utf-8')

+    if '--duplicate' in options:
+        clean_opts['duplicate'] = options['--duplicate']
+
+    if '--tcn' in options:
+        clean_opts['tcn'] = options['--tcn']
+
+    if '--url' in options:
+        clean_opts['url'] = options['--url']
+
     if '--ebrary' in options:
         clean_opts['ebrary'] = True

@@ -232,6 +264,84 @@ def check_options(options):

     return clean_opts

+def evergreen_request(method, *args, **kwargs):
+    service = '.'.join(method.split('.')[:2])
+    kwargs.update({'service':service, 'method':method})
+    params = ['%s=%s' % (k,quote(v)) for k,v in kwargs.items()]
+    params += ['param=%s' % quote(json.dumps(a)) for a in args]
+    url = '%s?%s' % (GATEWAY_URL, '&'.join(params))
+    #print '--->', url
+    req = urllib2.urlopen(url)
+    resp = json.load(req)
+    if resp['status'] != 200:
+        raise Exception('error during evergreen request', resp)
+    payload = resp['payload']
+    #print '<---', payload
+    return payload
+
+def url_check(record, options):
+    global DUP_COUNT, RECORD_COUNT
+
+    match = False
+    match_id = 0
+    for url in record.get_fields('856'):
+        for urlval in url.get_subfields('u'):
+            # print "urlval", urlval
+            for library in options['libraries']:
+                libopts = options['settings'].get_settings(library)
+                keyword_info = evergreen_request(OPENSRF_KEYWORD_CALL,
+                    {'org_unit': libopts['org_unit'],
+                     'depth': 1, 'limit': 5, 'offset': 0,
+                     'visibility_limit': 3000,
+                     'default_class': 'keyword'},
+                    urlval, 1)
+                bib_ids = keyword_info[0]['ids']
+                for bib_id in bib_ids:
+                    match_id = bib_id
+                    print("* %d of %d - URL match on %s for %s"
+                        % (DUP_COUNT + 1, RECORD_COUNT, urlval, bib_id[0])
+                    )
+                    match = True
+    return match_id, match
+
+def tcn_check(record):
+    global DUP_COUNT, RECORD_COUNT
+
+    match = False
+    match_id = 0
+    for tcn in record.get_fields('001'):
+        tcn_val = tcn.value()
+        tcn_info = evergreen_request(OPENSRF_TCN_CALL,tcn_val)
+        bib_ids = tcn_info[0]['ids']
+        # print "tcn_info", tcn_info
+        for bib_id in bib_ids:
+            match_id = bib_id
+            print("* %d of %d - TCN match on %s for %s"
+                % (DUP_COUNT + 1, RECORD_COUNT, tcn_val, bib_id)
+            )
+            match = True
+    return match_id, match
+
+def isbn_check(record):
+    global DUP_COUNT, RECORD_COUNT
+
+    match = False
+    match_id = 0
+    for isbn in record.get_fields('020', '024'):
+        for isbnval in isbn.get_subfields('a', 'z'):
+            isbn_val = clean_isbn(isbnval)
+            isbn_info = evergreen_request(OPENSRF_ISBN_CALL,isbnval)
+            match_count = isbn_info[0]['count']
+            #print "count", isbn_info[0]['count']
+            bib_ids = isbn_info[0]['ids']
+            for bib_id in bib_ids:
+                match_id = bib_id
+                print("* %d of %d - ISBN match on %s for %s"
+                    % (DUP_COUNT + 1, RECORD_COUNT, isbn_val, bib_id)
+                )
+                match = True
+    return match_id, match
+
 def append_period(text):
     """
     Append a period to the incoming text if required
@@ -256,10 +366,10 @@ def check_libraries(options):
 def parse_opts():
     """Get command-line arguments from the script"""
     try:
-        _short_opts = 'i:o:a:c:p:ABLWen:P:s:h'
+        _short_opts = 'i:o:a:c:p:P:ABLWe:d:t:u:n:s:h'
         _long_opts = ['input=', 'output=', 'authorization=', 'consortium=',
-            'publisher=', 'algoma', 'boreal', 'laurentian', 'windsor', 'ebrary',
-            'note=', 'platform=', 'sample=', 'help'
+            'publisher=', 'platform=', 'algoma', 'boreal', 'laurentian', 'windsor', 'ebrary',
+            'duplicate=', 'tcn=', 'url=', 'note=','sample=', 'help'
         ]
         opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts)
     except getopt.GetoptError, ex:
@@ -273,7 +383,11 @@ def process_records(options):
     """Converts raw ebook MARC records to Conifer-ready MARC records"""

     global RECORD_COUNT
+    global DUP_COUNT
     sample = ''
+    duplicate = ''
+    tcn = ''
+    url = ''

     try:
         reader = pymarc.MARCReader(
@@ -286,8 +400,26 @@ def process_records(options):
         writer = pymarc.MARCWriter(open(options['output'], mode='wb'))
     except Exception, ex:
         print("Could not open output file [%s]" % options['output'])
-
-    if ('sample' in options):
+
+    if 'duplicate' in options:
+        try:
+            duplicate = pymarc.MARCWriter(open(options['duplicate'], mode='wb'))
+        except Exception, ex:
+            print("Could not open output file [%s]" % options['duplicate'])
+
+    if 'tcn' in options:
+        try:
+            tcn = pymarc.MARCWriter(open(options['tcn'], mode='wb'))
+        except Exception, ex:
+            print("Could not open output file [%s]" % options['tcn'])
+
+    if 'url' in options:
+        try:
+            url = pymarc.MARCWriter(open(options['url'], mode='wb'))
+        except Exception, ex:
+            print("Could not open output file [%s]" % options['url'])
+
+    if 'sample' in options:
         sample = pymarc.MARCWriter(open(options['sample'], mode='wb'))

     for record in reader:
@@ -297,16 +429,42 @@ def process_records(options):
                 print("* No 856 for record # %s in file %s"
                     % (RECORD_COUNT, options['input'])
                 )
-
-            new_record = process_fields(record, options)
-
-            writer.write(new_record)
-            if (sample and ((RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0))):
-                sample.write(new_record)
+            else:
+                print ("%d - %s\n" % (RECORD_COUNT,record['856']))
+
+
+            new_record = ''
+            dup_flag = False
+
+            if duplicate:
+                bib_id, dup_flag = isbn_check(record)
+                new_record = process_fields(record, options, bib_id, dup_flag)
+                if dup_flag:
+                    duplicate.write(new_record)
+            if tcn:
+                bib_id, dup_flag = tcn_check(record)
+                new_record = process_fields(record, options, bib_id, dup_flag)
+                if dup_flag:
+                    tcn.write(new_record)
+            if url:
+                bib_id, dup_flag = url_check(record, options)
+                new_record = process_fields(record, options, bib_id, dup_flag)
+                if dup_flag:
+                    url.write(new_record)
+
+            if not dup_flag:
+                new_record = process_fields(record, options, 0, False)
+            else:
+                DUP_COUNT += 1
+
+            if new_record:
+                writer.write(new_record)
+                if (sample and ((RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0))):
+                    sample.write(new_record)
         except Exception, ex:
             print("* Error processing record %d - %s" % (RECORD_COUNT, ex))

-def process_fields(record, options):
+def process_fields(record, options, bib_id, dup_flag):
     """Decide which fields to add, delete, and keep"""

     new_record = pymarc.Record(to_unicode=True, force_utf8=True)
@@ -316,19 +474,33 @@ def process_fields(record, options):

     # 590
     if 'note' in options:
+        note_value = options['note']
         note = pymarc.Field(tag = '590',
             indicators = [' ', ' '],
             subfields = [
-                'a', options['note']
+                'a', note_value
            ]
        )
        record.add_field(note)

+    # 909
+    if dup_flag:
+        dup_value = bib_id + ""
+        dup = pymarc.Field(tag = '909',
+            indicators = [' ', ' '],
+            subfields = [
+                'a', dup_value
+            ]
+        )
+        record.add_field(dup)
+
     add_marc_source(record, options) # 598
     publisher = add_publisher(record, options) # 710
     add_restriction(record, options, publisher) # 506
     add_platform(record, options) # 710
+    # USE FOR SKIPPING SFX
+    # marked_isbn = False
     marked_isbn = mark_isbn_for_sfx(record, options)

     for field in record.get_fields():
@@ -605,7 +777,7 @@ def mark_isbn_for_sfx(record, options):
                 return True

     # For ebrary records, add a 924 for the custom URN
-    if options['ebrary'] is True:
+    if 'ebrary' in options:
         urn = None
         for scn in record.get_fields('001'):
             urn = pymarc.Field(tag = '924',
@@ -710,7 +882,7 @@ def add_restriction(new_record, options, publisher):
             'a', append_space_semi_space(libopts['access_note']),
             'b', append_space_semi_space(options['consortium']),
             'e', authnote,
-            '9', libopts['code']
+            '9', libopts['lac_symbol']
         ]
     )
     new_record.add_field(note)
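
For reference, a minimal standalone sketch of the OpenSRF gateway round trip
that the new tcn_check()/url_check()/isbn_check() helpers build on via
evergreen_request(). This is an illustration only: it assumes Python 2
(urllib2 and urllib.quote, as in the patch) and that the Conifer gateway at
www.concat.ca is reachable; the sample TCN value is hypothetical.

# -*- coding: utf-8 -*-
# Sketch of the gateway lookup behind the new -t/--tcn duplicate check.
import json, urllib2
from urllib import quote

GATEWAY_URL = "http://www.concat.ca/osrf-gateway-v1"
OPENSRF_TCN_CALL = "open-ils.search.biblio.tcn"

def gateway_request(method, *args):
    # The gateway takes service=, method=, and one JSON-encoded,
    # URL-escaped param= entry per positional argument.
    service = '.'.join(method.split('.')[:2])
    params = ['service=%s' % quote(service), 'method=%s' % quote(method)]
    params += ['param=%s' % quote(json.dumps(a)) for a in args]
    resp = json.load(urllib2.urlopen('%s?%s' % (GATEWAY_URL, '&'.join(params))))
    if resp['status'] != 200:
        raise Exception('error during gateway request', resp)
    return resp['payload']

# A TCN (001) lookup returns a payload whose first element carries the
# matching bib IDs; tcn_check() treats any hit as a duplicate.
payload = gateway_request(OPENSRF_TCN_CALL, 'ocm00012345')  # hypothetical TCN
print payload[0]['ids']

Records flagged as duplicates also receive a 909 $a holding the matching bib
ID in process_fields(), so the files named by -d, -t, and -u give a per-class
view of what matched while the records themselves still flow to the main
output file.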