From: Dan Scott Date: Thu, 28 Jul 2011 15:16:42 +0000 (-0400) Subject: Check for 020$z, 024$a,$z for ISBNs X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=ab315a0eee453dba7a32e8afb645d4586ff6c9c1;p=contrib%2FConifer.git Check for 020$z, 024$a,$z for ISBNs Either our MARC record sources are horribly corrupted, or the SFX knowledge base is questionable, or both, as we're having to search cancelled/invalid/non-ISBNs to get matches in SFX for our electronic records. Also, use 506 $9 to record our institutional ID in authorization notes. Signed-off-by: Dan Scott --- diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py index 7a607e9ba2..e753c9c2ff 100644 --- a/tools/ebooks/prep_ebook_records.py +++ b/tools/ebooks/prep_ebook_records.py @@ -14,11 +14,8 @@ be accommodated in batch load. """ import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2 -from datetime import date from BeautifulSoup import BeautifulSoup -RECORD_COUNT = 0 - class Institution(): """Defines standard settings for each Conifer institution""" @@ -30,7 +27,7 @@ class Institution(): "proxy": "http://libproxy.auc.ca/login?url=", \ "link_text": "Available online", \ "sfx_url": "http://sfx.scholarsportal.info/algoma", \ - "access_note": "Access restricted to users with a valid Algoma University ID ;" \ + "access_note": "Access restricted to users with a valid Algoma University ID" \ } self.laurentian = { \ @@ -39,7 +36,7 @@ class Institution(): "proxy": "https://librweb.laurentian.ca/login?url=", \ "link_text": "Available online / disponible en ligne", \ "sfx_url": "http://sfx.scholarsportal.info/laurentian", \ - "access_note": "Access restricted to users with a valid Laurentian University ID ;" \ + "access_note": "Access restricted to users with a valid Laurentian University ID" \ } self.windsor = { \ @@ -48,7 +45,7 @@ class Institution(): "proxy": "http://ezproxy.uwindsor.ca/login?url=", \ "link_text": "Available online", \ "sfx_url": "http://sfx.scholarsportal.info/windsor", \ - "access_note": "Access restricted to users with a valid University of Windsor ID ;" \ + "access_note": "Access restricted to users with a valid University of Windsor ID" \ } def get_settings(self, lib): @@ -95,11 +92,7 @@ Required arguments: -c / --consortium : The name of the consortial license to be inserted in each 506$b access restriction note. - -p / --publisher : The name of the publisher to be inserted in a 710 field - with a subfield 4 relator code 'pbl'. - - -P / --platform: The name of the digital platform to be inserted in a 710 - field. + -p / --publisher : The name of the publisher to be inserted in a 710 field. -A / --algoma: Add an 856 for Algoma University @@ -136,8 +129,6 @@ def consolidate_options(opts): _options['--consortium'] = val elif key == '-p': _options['--publisher'] = val - elif key == '-P': - _options['--platform'] = val elif key == '-n': _options['--note'] = val elif key == '-A': @@ -215,9 +206,6 @@ def check_options(options): if '--note' in options: clean_opts['note'] = options['--note'] - if '--platform' in options: - clean_opts['platform'] = options['--platform'] - clean_opts['libraries'] = _libraries clean_opts['input'] = _input clean_opts['output'] = _output @@ -244,10 +232,10 @@ def check_libraries(options): def parse_opts(): """Get command-line arguments from the script""" try: - _short_opts = 'i:o:a:c:p:ALWn:P:s:h' + _short_opts = 'i:o:a:c:p:ALWn:s:h' _long_opts = ['input=', 'output=', 'authorization=', 'consortium=', 'publisher=', 'algoma', 'laurentian', 'windsor', 'note=', - 'platform=', 'sample=', 'help' + 'sample=', 'help' ] opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts) except getopt.GetoptError, ex: @@ -260,7 +248,6 @@ def parse_opts(): def process_records(options): """Converts raw ebook MARC records to Conifer-ready MARC records""" - global RECORD_COUNT sample = '' reader = pymarc.MARCReader( open(options['input'], mode='rb'), to_unicode=True @@ -269,50 +256,32 @@ def process_records(options): if ('sample' in options): sample = pymarc.MARCWriter(open(options['sample'], mode='wb')) + cnt = 0 for record in reader: - RECORD_COUNT += 1 + cnt = cnt + 1 try: if not (record['856'] and record['856']['u']): - print("* No 856 for record # %d in file %s" - % (RECORD_COUNT, options['input']) + print("* No 856 for record # %s in file %s" + % (cnt, options['input']) ) new_record = process_fields(record, options) writer.write(new_record) - if (sample and ((RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0))): + if (sample and ((cnt == 1) or (cnt % 100 == 0))): sample.write(new_record) except Exception, ex: - print("* Error processing record %d - %s" % (RECORD_COUNT, ex)) + print("* Error processing record %s - %s" % (cnt, ex)) def process_fields(record, options): """Decide which fields to add, delete, and keep""" new_record = pymarc.Record(to_unicode=True, force_utf8=True) - add_cat_source(new_record, options) # 040 - add_restriction(record, options) # 506 - - # 590 - if 'note' in options: - note = pymarc.Field(tag = '590', - indicators = [' ', ' '], - subfields = [ - 'a', options['note'] - ] - ) - record.add_field(note) - - add_marc_source(record, options) # 598 - publisher = add_publisher(record, options) # 710 - add_platform(record, options) # 710 - - marked_isbn = mark_isbn_for_sfx(record, options) - for field in record.get_fields(): # Process all of the 856 fields if field.tag == '856': - new_fields = process_urls(field, options, publisher) + new_fields = process_urls(field, options) if new_fields: for new_856 in new_fields: new_record.add_field(new_856) @@ -325,46 +294,41 @@ def process_fields(record, options): else: new_record.add_field(field) + add_publisher(record, new_record, options) + add_restriction(new_record, options) + marked_isbn = mark_isbn_for_sfx(new_record, options) if not marked_isbn: - try: - isbn = record['020']['a'] - print("ISBN: [%s] - no matching ISBN target found in SFX for %s" % - (isbn, new_record['856']['u']) - ) - except: - print("No matching ISBN target found in SFX for %s" % - (new_record['856']['u']) - ) + print("No matching ISBN target found in SFX for %s" % + (new_record['856']['u']) + ) + + if 'note' in options: + note = pymarc.Field(tag = '590', + indicators = [' ', ' '], + subfields = [ + 'a', options['note'] + ] + ) + new_record.add_field(note) + + add_cat_source(new_record, options) return new_record -def add_publisher(record, options): +def add_publisher(record, new_record, options): """ This is a convoluted way to avoid creating a new 710 if we already have a matching 710 and just need to add the publisher relator code. """ - publisher = options['publisher'] munge_publisher = False need_publisher = True need_relator = True - raw_publisher = None - try: - raw_publisher = record['260']['b'] - except: - pass - - if raw_publisher: - if 'Oxford' in raw_publisher or 'Clarendon' in raw_publisher: - publisher = 'Oxford University Press' - elif 'Cambridge' in raw_publisher: - publisher = 'Cambridge University Press' - # Iterate through all of the existing 710 fields for sten in record.get_fields('710'): for pub in sten.get_subfields('a'): - if pub == publisher: + if pub == options['publisher']: munge_publisher = True for rel in sten.get_subfields('4'): if rel == 'pbl': @@ -380,38 +344,11 @@ def add_publisher(record, options): seven_ten = pymarc.Field(tag = '710', indicators = ['2', ' '], subfields = [ - 'a', publisher, + 'a', options['publisher'], '4', 'pbl' ] ) - record.add_field(seven_ten) - - return publisher - -def add_platform(record, options): - """ - This is a convoluted way to avoid creating a new 710 if we already - have a matching 710 for digital platform. - """ - - platform = options['platform'] - need_platform = True - - # Iterate through all of the existing 710 fields - for sten in record.get_fields('710'): - for pub in sten.get_subfields('a'): - if pub == platform: - need_platform = False - - if need_platform: - # Add the platform - seven_ten = pymarc.Field(tag = '710', - indicators = ['2', ' '], - subfields = [ - 'a', platform - ] - ) - record.add_field(seven_ten) + new_record.add_field(seven_ten) def mark_isbn_for_sfx(record, options): """ @@ -423,42 +360,52 @@ def mark_isbn_for_sfx(record, options): """ # For every ISBN in the record - for isbn in record.get_fields('020'): - for isbnval in isbn.get_subfields('a'): + for isbn in record.get_fields('020', '024'): + for isbnval in isbn.get_subfields('a', 'z'): isbnval = clean_isbn(isbnval) # And for every library we have enabled for lib in options['libraries']: - sfx = options['settings'].get_settings(lib)['sfx_url'] - url = "%s?url_ver=Z39.88-2004&url_ctx_fmt=infofi/fmt:kev:mtx:ctx&" \ - "ctx_enc=UTF-8&ctx_ver=Z39.88-2004&rfr_id=info:sid/evergreen&" \ - "sfx.ignore_date_threshold=1&" \ - "sfx.response_type=multi_obj_detailed_xml" \ - "&__service_type=getFullTxt&rft.isbn=%s" % (sfx, isbnval) - - try: - req = urllib2.urlopen(url) - sfx_res = BeautifulSoup(req.read()) - except urllib2.HTTPError, ex: - print("%s for URL %s" % (ex, url)) - continue - except urllib2.URLError, ex: - print("%s for URL %s" % (ex, url)) - continue - - # We want a target with a service_type element of 'getFullTxt' - targets = sfx_res.ctx_obj.ctx_obj_targets.findAll( - 'target', recursive=False - ) + found = check_for_isbn(options, lib, isbnval) + if found: + # Add the $9 subfield to mark this as a good one + isbn.add_subfield('9', 'SFX') + return True + return False + +def check_for_isbn(options, lib, isbnval): + """ + Given an ISBN value, check SFX at the specified library for a match + """ + sfx = options['settings'].get_settings(lib)['sfx_url'] + url = "%s?url_ver=Z39.88-2004&url_ctx_fmt=infofi/fmt:kev:mtx:ctx&" \ + "ctx_enc=UTF-8&ctx_ver=Z39.88-2004&rfr_id=info:sid/evergreen&" \ + "sfx.ignore_date_threshold=1&" \ + "sfx.response_type=multi_obj_detailed_xml" \ + "&__service_type=getFullTxt&rft.isbn=%s" % (sfx, isbnval) + + try: + req = urllib2.urlopen(url) + sfx_res = BeautifulSoup(req.read()) + except urllib2.HTTPError, ex: + print("%s for URL %s" % (ex, url)) + return False + except urllib2.URLError, ex: + print("%s for URL %s" % (ex, url)) + return False - if len(targets) == 0: - # No SFX targets found for this ISBN - next! - continue + # We want a target with a service_type element of 'getFullTxt' + targets = sfx_res.ctx_obj.ctx_obj_targets.findAll( + 'target', recursive=False + ) + + if len(targets) == 0: + # No SFX targets found for this ISBN - next! + return False + + for target in targets: + if target.service_type.renderContents() == 'getFullTxt': + return True - for target in targets: - if target.service_type.renderContents() == 'getFullTxt': - # Add the $9 subfield to mark this as a good one - isbn.add_subfield('9', 'SFX') - return True return False def clean_isbn(isbn): @@ -494,11 +441,6 @@ def add_restriction(new_record, options): * $9 - Institutional code to which this note applies """ - # Add a period if the authorization ends with a number or letter - authnote = options['authorization'] - if authnote[-1] not in '.)]': - authnote += '.' - for library in options['libraries']: libopts = options['settings'].get_settings(library) # Add the access restriction note @@ -506,8 +448,8 @@ def add_restriction(new_record, options): indicators = ['1', ' '], subfields = [ 'a', libopts['access_note'], - 'b', options['consortium'] + ' ; ', - 'e', authnote, + 'b', options['consortium'], + 'e', options['authorization'], '9', libopts['code'] ] ) @@ -532,26 +474,8 @@ def add_cat_source(record, options): ) record.add_field(forty) -def add_marc_source(record, options): - """ - Add a 598 field identifying the source MARC file name and processing date - """ - - global RECORD_COUNT - source = os.path.basename(options['input']) - - marc_source = pymarc.Field(tag = '598', - indicators = [' ', ' '], - subfields = [ - 'a', source, - 'b', date.today().isoformat(), - 'c', str(RECORD_COUNT) - ] - ) - record.add_field(marc_source) - -def process_urls(field, options, publisher): +def process_urls(field, options): """Creates 856 fields required by Conifer""" new_fields = [] @@ -567,10 +491,6 @@ def process_urls(field, options, publisher): new_fields.append(enrich) else: for lib in options['libraries']: - - # Tweak for Algoma for combined CUP/OUP - if lib == 'algoma' and 'Cambridge' in publisher: - continue data = options['settings'].get_settings(lib) subs = get_subfields(field, data) eight_five_six = pymarc.Field(tag = '856', @@ -697,5 +617,4 @@ def get_subfields(field, data): if __name__ == '__main__': - process_records(parse_opts())