From 319aacb0d404fc8324b9e7cad340094e2810f41a Mon Sep 17 00:00:00 2001 From: Dan Scott Date: Mon, 20 Jan 2020 21:35:35 -0500 Subject: [PATCH] Run black and pylint-3 against prep_ebook_records.py Signed-off-by: Dan Scott --- tools/ebooks/prep_ebook_records.py | 998 ++++++++++++++++++++----------------- 1 file changed, 535 insertions(+), 463 deletions(-) diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py index d9fae25905..3c9b8b7bf5 100755 --- a/tools/ebooks/prep_ebook_records.py +++ b/tools/ebooks/prep_ebook_records.py @@ -14,8 +14,15 @@ requirements that would be the same for each record and therefore can be accommodated in batch load. """ -import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, json -import codecs, copy +import os +import os.path +import sys +import getopt +import pymarc +import re +import json +import codecs +import copy import requests from datetime import date from bs4 import BeautifulSoup @@ -23,61 +30,64 @@ import traceback RECORD_COUNT = 0 DUP_COUNT = 0 -GATEWAY_URL = "http://www.concat.ca/osrf-gateway-v1" +GATEWAY_URL = "https://www.concat.ca/osrf-gateway-v1" OPENSRF_ISBN_CALL = "open-ils.search.biblio.isbn" OPENSRF_TCN_CALL = "open-ils.search.biblio.tcn" OPENSRF_KEYWORD_CALL = "open-ils.search.biblio.multiclass.query" OPTIONS = {} FILES = {} -class Institution(): + +class Institution: """Defines standard settings for each Conifer institution""" def __init__(self): """Initialize the Institution object""" - self.algoma = { \ - "code": "ALGOMASYS", \ - "lac_symbol": "OSTMA", \ - "org_unit": "111", \ - "ebrary_code": "algomauca", \ - "proxy": "http://libproxy.auc.ca/login?url=", \ - "link_text": "Available online", \ - "sfx_url": "http://sfx.scholarsportal.info/algoma", \ - "access_note": "Access restricted to users with a valid Algoma University ID ;" \ + self.algoma = { + "code": "ALGOMASYS", + "lac_symbol": "OSTMA", + "org_unit": "111", + "ebrary_code": "algomauca", + "proxy": "http://libproxy.auc.ca/login?url=", + "link_text": "Available online", + "sfx_url": "http://sfx.scholarsportal.info/algoma", + "access_note": "Access restricted to users with a valid Algoma University ID ;", } - self.boreal = { \ - "code": "BOREALSYS", \ - "lac_symbol": "BOREALSYS", \ - "org_unit": "135", \ - "ebrary_code": "ocls", \ - "proxy": "http://ra.ocls.ca/ra/login.aspx?url=", \ - "link_text": "Disponible en ligne", \ - "access_note": u"Accès réservé aux utilisateurs avec un ID valide Collège Boréal ;" \ + self.boreal = { + "code": "BOREALSYS", + "lac_symbol": "BOREALSYS", + "org_unit": "135", + "ebrary_code": "ocls", + "proxy": "http://ra.ocls.ca/ra/login.aspx?url=", + "link_text": "Disponible en ligne", + "access_note": u"Accès réservé aux utilisateurs avec un ID valide Collège Boréal ;", } - self.laurentian = { \ - "code": "LUSYS", \ - "lac_symbol": "OSUL", \ - "org_unit": "105", \ - "ebrary_code": "jndlu", \ - "gale_code": "sudb78095", \ - "proxy": "https://login.librweb.laurentian.ca/login?url=", \ - "link_text": "Available online / disponible en ligne", \ - "sfx_url": "https://sfx.scholarsportal.info/laurentian", \ - "access_note": "Access restricted to users with a valid Laurentian University ID ;" \ + self.laurentian = { + "code": "LUSYS", + "lac_symbol": "OSUL", + "org_unit": "105", + "ebrary_code": "jndlu", + "gale_code": "sudb78095", + "proxy": "https://login.librweb.laurentian.ca/login?url=", + "link_text": "Available online / disponible en ligne", + "sfx_url": "https://sfx.scholarsportal.info/laurentian", + "access_note": "Access 
restricted to users with a valid Laurentian University ID ;", } def get_settings(self, lib): """Return the settings for a library by name""" return getattr(self, lib) + def do_help(): - ''' + """ Print help for the Conifer ebook MARC processor - ''' + """ - print(''' + print( + """ Conifer ebook MARC processor This script takes a set of MARC records and processes them to generate a set @@ -152,34 +162,37 @@ Optional arguments: Examples: %s --algoma -i crkn.mrc -o /tmp/crkn_out.mrc -p "eBrary Inc." - ''' % (sys.argv[0],)) + """ + % (sys.argv[0],) + ) sys.exit(0) + def consolidate_options(opts): """Make long arguments the standard form in command line options""" _shortlong = { - '-i': '--input', - '-o': '--output', - '-a': '--authorization', - '-c': '--consortium', - '-C': '--clean', - '-d': '--duplicate', - '-e': '--ebrary', - '-F': '--from-format', - '-I': '--isbn-sfx', - '-p': '--publisher', - '-P': '--platform', - '-n': '--note', - '-t': '--tcn', - '-T': '--to-format', - '-u': '--url', - '-A': '--algoma', - '-B': '--boreal', - '-L': '--laurentian', - '-s': '--sample', - '-x': '--cut-field', - '-h': '--help' + "-i": "--input", + "-o": "--output", + "-a": "--authorization", + "-c": "--consortium", + "-C": "--clean", + "-d": "--duplicate", + "-e": "--ebrary", + "-F": "--from-format", + "-I": "--isbn-sfx", + "-p": "--publisher", + "-P": "--platform", + "-n": "--note", + "-t": "--tcn", + "-T": "--to-format", + "-u": "--url", + "-A": "--algoma", + "-B": "--boreal", + "-L": "--laurentian", + "-s": "--sample", + "-x": "--cut-field", + "-h": "--help", } _options = dict(opts) @@ -190,18 +203,19 @@ def consolidate_options(opts): return _options + def check_options(options): """Check the validity of options that were passed in""" _help = False _req = { - '--input': "* Missing -i / --input argument!", - '--output': "* Missing -o / --output argument!", - '--authorization': "* Missing -a / --authorization argument!", - '--publisher': "* Missing -p / --publisher argument!" 
+ "--input": "* Missing -i / --input argument!", + "--output": "* Missing -o / --output argument!", + "--authorization": "* Missing -a / --authorization argument!", + "--publisher": "* Missing -p / --publisher argument!", } - if '--help' in options: + if "--help" in options: do_help() for reqkey, reqwarn in _req.items(): @@ -210,15 +224,15 @@ def check_options(options): _help = True _libraries = check_libraries(options) - if len(_libraries.keys()) == 0: + if not _libraries.keys(): _help = True - if _help == True: + if _help: do_help() # Get the input and output files - _input = options['--input'] - _output = options['--output'] + _input = options["--input"] + _output = options["--output"] try: os.stat(_input) @@ -233,27 +247,27 @@ def check_options(options): sys.exit(0) _bool_opts = { - '--clean': 'clean', - '--ebrary': 'ebrary', - '--isbn-sfx': 'isbn-sfx', + "--clean": "clean", + "--ebrary": "ebrary", + "--isbn-sfx": "isbn-sfx", } _string_opts = { - '--authorization': 'authorization', - '--consortium': 'consortium', - '--duplicate': 'duplicate', - '--from-format': 'from-format', - '--note': 'note', - '--platform': 'platform', - '--sample': 'sample', - '--cut-field': 'cut-field', - '--tcn': 'tcn', - '--to-format': 'to-format', - '--url': 'url', + "--authorization": "authorization", + "--consortium": "consortium", + "--duplicate": "duplicate", + "--from-format": "from-format", + "--note": "note", + "--platform": "platform", + "--sample": "sample", + "--cut-field": "cut-field", + "--tcn": "tcn", + "--to-format": "to-format", + "--url": "url", } clean_opts = dict() - clean_opts['publisher'] = append_period(options['--publisher']) + clean_opts["publisher"] = append_period(options["--publisher"]) for optkey, optval in _bool_opts.items(): if optkey in options: @@ -263,30 +277,33 @@ def check_options(options): if optkey in options: clean_opts[optval] = options[optkey] - clean_opts['libraries'] = _libraries - clean_opts['input'] = _input - clean_opts['output'] = _output - clean_opts['settings'] = Institution() + clean_opts["libraries"] = _libraries + clean_opts["input"] = _input + clean_opts["output"] = _output + clean_opts["settings"] = Institution() return clean_opts + def evergreen_request(method, *args, **kwargs): """Issue a basic gateway request against Evergreen""" - - service = '.'.join(method.split('.')[:2]) - kwargs.update({'service':service, 'method':method}) - params = ['%s=%s' % (k, quote(v)) for k, v in kwargs.items()] - params += ['param=%s' % quote(json.dumps(a)) for a in args] - url = '%s?%s' % (GATEWAY_URL, '&'.join(params)) - #print '--->', url + from urllib.parse import quote + + service = ".".join(method.split(".")[:2]) + kwargs.update({"service": service, "method": method}) + params = ["%s=%s" % (k, quote(v)) for k, v in kwargs.items()] + params += ["param=%s" % quote(json.dumps(a)) for a in args] + url = "%s?%s" % (GATEWAY_URL, "&".join(params)) + # print '--->', url req = requests.get(url) resp = req.json() - if resp['status'] != 200: - raise Exception('error during evergreen request', resp) - payload = resp['payload'] - #print '<---', payload + if resp["status"] != 200: + raise Exception("error during evergreen request", resp) + payload = resp["payload"] + # print '<---', payload return payload + def url_check(record, options): """Check for a matching URL in Evergreen""" @@ -295,26 +312,35 @@ def url_check(record, options): match = False match_id = 0 # Oxford MARC files from ScholarsPortal have DOIs in 956(!) 
- for url in record.get_fields('856','956'): - for urlval in url.get_subfields('u'): + for url in record.get_fields("856", "956"): + for urlval in url.get_subfields("u"): # print "urlval", urlval - for library in options['libraries']: - libopts = options['settings'].get_settings(library) - keyword_info = evergreen_request(OPENSRF_KEYWORD_CALL, - {'org_unit': libopts['org_unit'], - 'depth': 1, 'limit': 5, 'offset': 0, - 'visibility_limit': 3000, - 'default_class': 'keyword'}, - urlval, 1) - bib_ids = keyword_info[0]['ids'] + for library in options["libraries"]: + libopts = options["settings"].get_settings(library) + keyword_info = evergreen_request( + OPENSRF_KEYWORD_CALL, + { + "org_unit": libopts["org_unit"], + "depth": 1, + "limit": 5, + "offset": 0, + "visibility_limit": 3000, + "default_class": "keyword", + }, + urlval, + 1, + ) + bib_ids = keyword_info[0]["ids"] for bib_id in bib_ids: match_id = bib_id - print("* %d of %d - URL match on %s for %s" - % (DUP_COUNT + 1, RECORD_COUNT, urlval, bib_id[0]) + print( + "* %d of %d - URL match on %s for %s" + % (DUP_COUNT + 1, RECORD_COUNT, urlval, bib_id[0]) ) match = True return match_id, match + def tcn_check(record): """Check for a matching TCN in Evergreen""" @@ -322,19 +348,21 @@ def tcn_check(record): match = False match_id = 0 - for tcn in record.get_fields('001'): + for tcn in record.get_fields("001"): tcn_val = tcn.value() tcn_info = evergreen_request(OPENSRF_TCN_CALL, tcn_val) - bib_ids = tcn_info[0]['ids'] + bib_ids = tcn_info[0]["ids"] # print "tcn_info", tcn_info for bib_id in bib_ids: match_id = bib_id - print("* %d of %d - TCN match on %s for %s" + print( + "* %d of %d - TCN match on %s for %s" % (DUP_COUNT + 1, RECORD_COUNT, tcn_val, bib_id) ) match = True return match_id, match + def isbn_check(record): """Check for a matching ISBN in Evergreen""" @@ -342,36 +370,39 @@ def isbn_check(record): match = False match_id = 0 - for isbn in record.get_fields('020', '024'): - for isbnval in isbn.get_subfields('a', 'z'): + for isbn in record.get_fields("020", "024"): + for isbnval in isbn.get_subfields("a", "z"): isbn_val = clean_isbn(isbnval) isbn_info = evergreen_request(OPENSRF_ISBN_CALL, isbnval) - #print "count", isbn_info[0]['count'] - bib_ids = isbn_info[0]['ids'] + # print "count", isbn_info[0]['count'] + bib_ids = isbn_info[0]["ids"] for bib_id in bib_ids: match_id = bib_id - print("* %d of %d - ISBN match on %s for %s" + print( + "* %d of %d - ISBN match on %s for %s" % (DUP_COUNT + 1, RECORD_COUNT, isbn_val, bib_id) ) match = True return match_id, match + def append_period(text): """ Append a period to the incoming text if required """ - if text[-1] != '.': - text += '.' + if text[-1] != ".": + text += "." 
return text + def check_libraries(options): """Build a dict of the libraries that were requested for this batch""" _libraries = dict() - for lib in ['algoma', 'boreal', 'laurentian']: - if '--' + lib in options: + for lib in ["algoma", "boreal", "laurentian"]: + if "--" + lib in options: _libraries[lib] = True return _libraries @@ -380,12 +411,29 @@ def check_libraries(options): def parse_opts(): """Get command-line arguments from the script""" try: - _short_opts = 'i:o:a:p:P:ABLc:eCId:F:T:t:u:n:s:x:h' - _long_opts = ['input=', 'output=', 'authorization=', - 'publisher=', 'platform=', 'algoma', 'boreal', 'laurentian', - 'consortium=', 'ebrary', 'clean', 'isbn-sfx', 'duplicate=', - 'from-format=', 'to-format=', 'tcn=', 'url=', 'note=', 'sample=', - 'cut-field=', 'help' + _short_opts = "i:o:a:p:P:ABLc:eCId:F:T:t:u:n:s:x:h" + _long_opts = [ + "input=", + "output=", + "authorization=", + "publisher=", + "platform=", + "algoma", + "boreal", + "laurentian", + "consortium=", + "ebrary", + "clean", + "isbn-sfx", + "duplicate=", + "from-format=", + "to-format=", + "tcn=", + "url=", + "note=", + "sample=", + "cut-field=", + "help", ] opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts) except getopt.GetoptError as ex: @@ -395,181 +443,177 @@ def parse_opts(): _options = consolidate_options(opts[0]) return check_options(_options) + def process_marc(options): """Converts raw ebook MARC records to Conifer-ready MARC records""" global FILES files = FILES - if 'from-format' in options and options['from-format'] == 'xml': - pymarc.map_xml(process_xml, options['input']) + if "from-format" in options and options["from-format"] == "xml": + pymarc.map_xml(process_xml, options["input"]) else: try: reader = pymarc.MARCReader( - open(options['input'], mode='rb'), to_unicode=True + open(options["input"], mode="rb"), to_unicode=True ) except Exception as ex: - print("Could not open input file [%s]" % options['input']) + print("Could not open input file [%s]" % options["input"]) for record in reader: process_record(record, options, files) + def process_record(record, options, files): global RECORD_COUNT global DUP_COUNT RECORD_COUNT += 1 try: - if not (record['856'] and record['856']['u']): - print("* No 856 for record # %s in file %s" - % (RECORD_COUNT, options['input']) + if not (record["856"] and record["856"]["u"]): + print( + "* No 856 for record # %s in file %s" % (RECORD_COUNT, options["input"]) ) else: - print("%d - %s" % (RECORD_COUNT, record['856'])) + print("%d - %s" % (RECORD_COUNT, record["856"])) dupe_flags = {} - if 'duplicate' in files: + if "duplicate" in files: tmp_record = process_fields(copy.deepcopy(record), options) - bib_id, dupe_flags['isbn'] = isbn_check(tmp_record) - if dupe_flags['isbn']: + bib_id, dupe_flags["isbn"] = isbn_check(tmp_record) + if dupe_flags["isbn"]: tmp_record = add_dupe_field(tmp_record, bib_id) - files['duplicate'].write(tmp_record) + files["duplicate"].write(tmp_record) else: - del(dupe_flags['isbn']) + del dupe_flags["isbn"] - if 'tcn' in files and len(dupe_flags) == 0: + if "tcn" in files and not dupe_flags: tmp_record = process_fields(copy.deepcopy(record), options) - bib_id, dupe_flags['tcn'] = tcn_check(tmp_record) - if dupe_flags['tcn']: + bib_id, dupe_flags["tcn"] = tcn_check(tmp_record) + if dupe_flags["tcn"]: tmp_record = add_dupe_field(tmp_record, bib_id) - files['tcn'].write(tmp_record) + files["tcn"].write(tmp_record) else: - del(dupe_flags['tcn']) + del dupe_flags["tcn"] - if 'url' in files and len(dupe_flags) == 0: + if "url" in files and not 
dupe_flags: tmp_record = process_fields(copy.deepcopy(record), options) - bib_id, dupe_flags['url'] = url_check(tmp_record, options) - if dupe_flags['url']: + bib_id, dupe_flags["url"] = url_check(tmp_record, options) + if dupe_flags["url"]: tmp_record = add_dupe_field(tmp_record, bib_id) - files['url'].write(tmp_record) + files["url"].write(tmp_record) else: - del(dupe_flags['url']) + del dupe_flags["url"] - if len(dupe_flags): + if dupe_flags: DUP_COUNT += 1 else: new_record = process_fields(record, options) - if 'to-format' in options and options['to-format'] == 'xml': - new_record = pymarc.record_to_xml(new_record) + '\n' - files['output'].write(new_record) - if ('sample' in files and ( - (RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0) - )): - files['sample'].write(new_record) + if "to-format" in options and options["to-format"] == "xml": + new_record = pymarc.record_to_xml(new_record) + "\n" + files["output"].write(new_record) + if "sample" in files and ((RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0)): + files["sample"].write(new_record) except Exception as ex: print("* Error processing record %d - %s" % (RECORD_COUNT, ex)) traceback.print_exc() + def process_fields(record, options): """Decide which fields to add, delete, and keep""" new_record = pymarc.Record(to_unicode=True, force_utf8=True) - leader = record.leader[:6] + 'a' + record.leader[7:] + leader = record.leader[:6] + "a" + record.leader[7:] new_record.leader = leader - add_cat_source(record, options) # 040 + add_cat_source(record, options) # 040 # 590 - if 'note' in options: - note_value = options['note'] - note = pymarc.Field(tag = '590', - indicators = [' ', ' '], - subfields = [ - 'a', note_value - ] + if "note" in options: + note_value = options["note"] + note = pymarc.Field( + tag="590", indicators=[" ", " "], subfields=["a", note_value] ) record.add_ordered_field(note) - add_marc_source(record, options) # 598 - if record.get_fields('336') is None: - add_rda_fields(record, options) # 336,337,338 - publisher = add_publisher(record, options) # 710 - add_restriction(record, options, publisher) # 506 - add_platform(record, options) # 710 + add_marc_source(record, options) # 598 + if record.get_fields("336") is None: + add_rda_fields(record, options) # 336,337,338 + publisher = add_publisher(record, options) # 710 + add_restriction(record, options, publisher) # 506 + add_platform(record, options) # 710 - if 'isbn-sfx' in options: + if "isbn-sfx" in options: marked_isbn = mark_isbn_for_sfx(record, options) for field in record.get_fields(): - if 'clean' in options: + if "clean" in options: field = clean_diacritics(field) # Process all of the 856 fields # Oxford MARC files from ScholarsPortal have DOIs in 956(!) - if field.tag == '856' or field.tag == '956': + if field.tag == "856" or field.tag == "956": new_fields = process_urls(field, options, publisher) if new_fields: for new_856 in new_fields: new_record.add_ordered_field(new_856) # Strip out 9xx fields: we don't want local fields in our records # except for 924 fields that we create - elif field.tag[0] == '9' and field.tag != '924': + elif field.tag[0] == "9" and field.tag != "924": pass # ISBN cleanup - elif field.tag == '020': + elif field.tag == "020": new_isbn = create_clean_isbn(field) new_record.add_ordered_field(new_isbn) # Strip out 300 fields that only contain placeholders - elif field.tag == '300' and field['a'] == 'p. cm.': + elif field.tag == "300" and field["a"] == "p. 
cm.": pass # Add relator URIs - elif field.tag.startswith('33') and field['0'] is None: + elif field.tag.startswith("33") and field["0"] is None: field = add_relator_uri(field) new_record.add_ordered_field(field) # Strip out useless fields - elif 'cut-field' in options and field.tag in options['cut-field']: + elif "cut-field" in options and field.tag in options["cut-field"]: pass - elif field.tag == '008' and field.value()[23] != 's': + elif field.tag == "008" and field.value()[23] != "s": fixed_field = pymarc.Field( - tag='008', - data=field.value()[:23] + 's' + field.value()[24:] + tag="008", data=field.value()[:23] + "s" + field.value()[24:] ) new_record.add_ordered_field(fixed_field) # Strip out GMD - elif field.tag == '245': - if 'h' in field: - suffix = field['h'][-3:] - field.delete_subfield('h') - field['a'] = field['a'] + suffix + elif field.tag == "245": + if "h" in field: + # Grab the trailing " /" + suffix = field["h"][-2:] + field.delete_subfield("h") + field["a"] = field["a"] + suffix new_record.add_ordered_field(field) else: new_record.add_ordered_field(field) - if 'isbn-sfx' in options and not marked_isbn: + if "isbn-sfx" in options and not marked_isbn: try: - isbn = record['020']['a'] - print("ISBN: [%s] - no matching ISBN target found in SFX for %s" % - (isbn, new_record['856']['u']) + isbn = record["020"]["a"] + print( + "ISBN: [%s] - no matching ISBN target found in SFX for %s" + % (isbn, new_record["856"]["u"]) ) except: - print("No matching ISBN target found in SFX for %s" % - (new_record['856']['u']) + print( + "No matching ISBN target found in SFX for %s" % (new_record["856"]["u"]) ) return new_record + def add_dupe_field(record, bib_id): """Add a 909 field marking the duplicate record""" dup_value = str(bib_id) - dup = pymarc.Field(tag = '909', - indicators = [' ', ' '], - subfields = [ - 'a', dup_value - ] - ) + dup = pymarc.Field(tag="909", indicators=[" ", " "], subfields=["a", dup_value]) record.add_ordered_field(dup) return record + def clean_diacritics(field): """ Change specific patterns of bytes into other patterns of bytes @@ -582,135 +626,142 @@ def clean_diacritics(field): return field new_field = pymarc.Field( - tag=field.tag, - indicators=[field.indicator1, field.indicator2] + tag=field.tag, indicators=[field.indicator1, field.indicator2] ) for subfield in field: - if r'\x' not in repr(subfield[1]): + if r"\x" not in repr(subfield[1]): new_field.add_subfield(subfield[0], subfield[1]) continue # Let the substitutions commence - maybe move to a map table? 
# COMBINING MACRON - tmpsf = subfield[1].replace(u'\xd5A', u'A\u0304') - tmpsf = tmpsf.replace(u'\xd5a', u'a\u0304') - tmpsf = tmpsf.replace(u'\xd5E', u'E\u0304') - tmpsf = tmpsf.replace(u'\xd5e', u'e\u0304') - tmpsf = tmpsf.replace(u'\xd5I', u'I\u0304') - tmpsf = tmpsf.replace(u'\xd5i', u'i\u0304') - tmpsf = tmpsf.replace(u'\xd5O', u'O\u0304') - tmpsf = tmpsf.replace(u'\xd5o', u'o\u0304') - tmpsf = tmpsf.replace(u'\xd5U', u'U\u0304') - tmpsf = tmpsf.replace(u'\xd5u', u'u\u0304') + tmpsf = subfield[1].replace(u"\xd5A", u"A\u0304") + tmpsf = tmpsf.replace(u"\xd5a", u"a\u0304") + tmpsf = tmpsf.replace(u"\xd5E", u"E\u0304") + tmpsf = tmpsf.replace(u"\xd5e", u"e\u0304") + tmpsf = tmpsf.replace(u"\xd5I", u"I\u0304") + tmpsf = tmpsf.replace(u"\xd5i", u"i\u0304") + tmpsf = tmpsf.replace(u"\xd5O", u"O\u0304") + tmpsf = tmpsf.replace(u"\xd5o", u"o\u0304") + tmpsf = tmpsf.replace(u"\xd5U", u"U\u0304") + tmpsf = tmpsf.replace(u"\xd5u", u"u\u0304") # LATIN LETTER C WITH ACUTE - tmpsf = tmpsf.replace(u'\xd4C', u'\u0106') - tmpsf = tmpsf.replace(u'\xd4c', u'\u0107') + tmpsf = tmpsf.replace(u"\xd4C", u"\u0106") + tmpsf = tmpsf.replace(u"\xd4c", u"\u0107") # LATIN LETTER L WITH STROKE - tmpsf = tmpsf.replace(u'\u00b0', u'\u0141') + tmpsf = tmpsf.replace(u"\u00b0", u"\u0141") - lstroke = tmpsf.find(u'\00b1') - if lstroke and tmpsf[lstroke + 1] == 'i': + lstroke = tmpsf.find(u"\00b1") + if lstroke and tmpsf[lstroke + 1] == "i": # Modifier prime instead - tmpsf = tmpsf.replace(u'\u00b1', u'\u02b9') + tmpsf = tmpsf.replace(u"\u00b1", u"\u02b9") else: - tmpsf = tmpsf.replace(u'\u00b1', u'\u0142') + tmpsf = tmpsf.replace(u"\u00b1", u"\u0142") # COMBINING MODIFIER LETTER HALF RING - tmpsf = tmpsf.replace(u'\xb1', u'\u02be') + tmpsf = tmpsf.replace(u"\xb1", u"\u02be") # COMBINING TILDE - tmpsf = tmpsf.replace(u'\xf5n', u'n\u0303') + tmpsf = tmpsf.replace(u"\xf5n", u"n\u0303") # COMBINING CEDILLA - tmpsf = tmpsf.replace(u'\xb0c', u'c\u0327') - tmpsf = tmpsf.replace(u'\u01afS', u'S\u0327') - tmpsf = tmpsf.replace(u'\u01afs', u's\u0327') + tmpsf = tmpsf.replace(u"\xb0c", u"c\u0327") + tmpsf = tmpsf.replace(u"\u01afS", u"S\u0327") + tmpsf = tmpsf.replace(u"\u01afs", u"s\u0327") # S WITH COMBINING ACUTE ACCENT - tmpsf = tmpsf.replace(u'\xd4S', u'\u015a') - tmpsf = tmpsf.replace(u'\xd4s', u'\u015b') + tmpsf = tmpsf.replace(u"\xd4S", u"\u015a") + tmpsf = tmpsf.replace(u"\xd4s", u"\u015b") # A CARON - tmpsf = tmpsf.replace(u'\xdaA', u'\u0100') - tmpsf = tmpsf.replace(u'\xdaa', u'\u0101') + tmpsf = tmpsf.replace(u"\xdaA", u"\u0100") + tmpsf = tmpsf.replace(u"\xdaa", u"\u0101") # C CARON - tmpsf = tmpsf.replace(u'\xdaC', u'\u010c') - tmpsf = tmpsf.replace(u'\xdac', u'\u010d') + tmpsf = tmpsf.replace(u"\xdaC", u"\u010c") + tmpsf = tmpsf.replace(u"\xdac", u"\u010d") # R CARON - tmpsf = tmpsf.replace(u'\xdaR', u'\u0158') - tmpsf = tmpsf.replace(u'\xdar', u'\u0159') + tmpsf = tmpsf.replace(u"\xdaR", u"\u0158") + tmpsf = tmpsf.replace(u"\xdar", u"\u0159") # E BREVE - tmpsf = tmpsf.replace(u'\xe6E', u'\u0114') - tmpsf = tmpsf.replace(u'\xe6e', u'\u0115') + tmpsf = tmpsf.replace(u"\xe6E", u"\u0114") + tmpsf = tmpsf.replace(u"\xe6e", u"\u0115") # S CARON - tmpsf = tmpsf.replace(u'\xdaS', u'\u0160') - tmpsf = tmpsf.replace(u'\xdas', u'\u0161') + tmpsf = tmpsf.replace(u"\xdaS", u"\u0160") + tmpsf = tmpsf.replace(u"\xdas", u"\u0161") # U CARON - tmpsf = tmpsf.replace(u'\u00e6U', u'\u01d3') - tmpsf = tmpsf.replace(u'\u00e6u', u'\u01d4') + tmpsf = tmpsf.replace(u"\u00e6U", u"\u01d3") + tmpsf = tmpsf.replace(u"\u00e6u", u"\u01d4") # G 
BREVE - tmpsf = tmpsf.replace(u'\xe6G', u'\u011e') - tmpsf = tmpsf.replace(u'\xe6g', u'\u011f') + tmpsf = tmpsf.replace(u"\xe6G", u"\u011e") + tmpsf = tmpsf.replace(u"\xe6g", u"\u011f") # I BREVE - tmpsf = tmpsf.replace(u'\xe6I', u'\u012c') - tmpsf = tmpsf.replace(u'\xe6i', u'\u012d') + tmpsf = tmpsf.replace(u"\xe6I", u"\u012c") + tmpsf = tmpsf.replace(u"\xe6i", u"\u012d") # COMBINING DOT ABOVE - tmpsf = tmpsf.replace(u'\xfeI', u'I\u0307') + tmpsf = tmpsf.replace(u"\xfeI", u"I\u0307") # COMBINING LIGATURE LEFT HALF - tmpsf = tmpsf.replace(u'\xd9i', u'i\ufe20') - tmpsf = tmpsf.replace(u'\xd9I', u'I\ufe20') - tmpsf = tmpsf.replace(u'\xd9t', u't\ufe20') + tmpsf = tmpsf.replace(u"\xd9i", u"i\ufe20") + tmpsf = tmpsf.replace(u"\xd9I", u"I\ufe20") + tmpsf = tmpsf.replace(u"\xd9t", u"t\ufe20") # COMBINING LIGATURE RIGHT HALF - tmpsf = tmpsf.replace(u'\xfda', u'a\ufe21') - tmpsf = tmpsf.replace(u'\xfds', u's\ufe21') - tmpsf = tmpsf.replace(u'\xfdU', u'U\ufe21') + tmpsf = tmpsf.replace(u"\xfda", u"a\ufe21") + tmpsf = tmpsf.replace(u"\xfds", u"s\ufe21") + tmpsf = tmpsf.replace(u"\xfdU", u"U\ufe21") # MODIFIER LETTER PRIME - tmpsf = tmpsf.replace(u'\xf0', u'\u02b9') + tmpsf = tmpsf.replace(u"\xf0", u"\u02b9") # LATIN SMALL LETTER DOTLESS I - tmpsf = tmpsf.replace(u'\u00a9', u'\u0131') + tmpsf = tmpsf.replace(u"\u00a9", u"\u0131") # LATIN LETTER E WITH DOT ABOVE - tmpsf = tmpsf.replace(u'\u00feE', u'\u0116') - tmpsf = tmpsf.replace(u'\u00fee', u'\u0117') + tmpsf = tmpsf.replace(u"\u00feE", u"\u0116") + tmpsf = tmpsf.replace(u"\u00fee", u"\u0117") new_field.add_subfield(subfield[0], tmpsf) global RECORD_COUNT - if r'\x' in repr(tmpsf): - print(" * %d Hex value found in %s:%s - [%s] [%s]" % ( - RECORD_COUNT, field.tag, subfield[0], - tmpsf.encode('utf8'), repr(tmpsf) - )) + if r"\x" in repr(tmpsf): + print( + " * %d Hex value found in %s:%s - [%s] [%s]" + % ( + RECORD_COUNT, + field.tag, + subfield[0], + tmpsf.encode("utf8"), + repr(tmpsf), + ) + ) - if (repr(subfield[1]) != repr(tmpsf)): - print("* %d\tOld: [%s]\tNew: [%s]" % ( - RECORD_COUNT, subfield[1].encode('utf8'), tmpsf.encode('utf8') - )) + if repr(subfield[1]) != repr(tmpsf): + print( + "* %d\tOld: [%s]\tNew: [%s]" + % (RECORD_COUNT, subfield[1].encode("utf8"), tmpsf.encode("utf8")) + ) return new_field + def add_publisher(record, options): """ This is a convoluted way to avoid creating a new 710 if we already have a matching 710 and just need to add the publisher relator code. """ - publisher = options['publisher'] + publisher = options["publisher"] munge_publisher = False need_publisher = True @@ -718,75 +769,79 @@ def add_publisher(record, options): raw_publisher = None try: - raw_publisher = record['260']['b'] + raw_publisher = record["260"]["b"] except: pass if raw_publisher: - if 'Oxford' in raw_publisher or 'Clarendon' in raw_publisher: - publisher = 'Oxford University Press.' - elif 'Cambridge' in raw_publisher: - publisher = 'Cambridge University Press.' + if "Oxford" in raw_publisher or "Clarendon" in raw_publisher: + publisher = "Oxford University Press." + elif "Cambridge" in raw_publisher: + publisher = "Cambridge University Press." 
# Iterate through all of the existing 710 fields - for sten in record.get_fields('710'): - for pub in sten.get_subfields('a'): + for sten in record.get_fields("710"): + for pub in sten.get_subfields("a"): if pub == publisher: munge_publisher = True - for rel in sten.get_subfields('4'): - if rel == 'pbl': + for rel in sten.get_subfields("4"): + if rel == "pbl": uri_for_relator = True need_publisher = False need_relator = False if munge_publisher: if need_relator: - sten.add_subfield('4', 'http://id.loc.gov/vocabulary/relators/pbl') + sten.add_subfield( + "4", "http://id.loc.gov/vocabulary/relators/pbl" + ) elif uri_for_relator: - sten['4'] = 'http://id.loc.gov/vocabulary/relators/pbl' + sten["4"] = "http://id.loc.gov/vocabulary/relators/pbl" need_publisher = False if need_publisher: # Add the publisher, with relator code - seven_ten = pymarc.Field(tag = '710', - indicators = ['2', ' '], - subfields = [ - 'a', publisher, - '4', 'http://id.loc.gov/vocabulary/relators/pbl' - ] + seven_ten = pymarc.Field( + tag="710", + indicators=["2", " "], + subfields=[ + "a", + publisher, + "4", + "http://id.loc.gov/vocabulary/relators/pbl", + ], ) record.add_ordered_field(seven_ten) return publisher + def add_platform(record, options): """ This is a convoluted way to avoid creating a new 710 if we already have a matching 710 for digital platform. """ - if not 'platform' in options: + if not "platform" in options: return False - platform = options['platform'] + platform = options["platform"] need_platform = True # Iterate through all of the existing 710 fields - for sten in record.get_fields('710'): - for pub in sten.get_subfields('a'): - if pub == platform or (pub == platform + '.'): + for sten in record.get_fields("710"): + for pub in sten.get_subfields("a"): + if pub == platform or (pub == platform + "."): need_platform = False if need_platform: # Add the platform - seven_ten = pymarc.Field(tag = '710', - indicators = ['2', ' '], - subfields = [ - 'a', platform - ] + seven_ten = pymarc.Field( + tag="710", indicators=["2", " "], subfields=["a", platform] ) record.add_ordered_field(seven_ten) + def mark_isbn_for_sfx(record, options): """ Adds a $9 subfield to the 020 (ISBN) field to use for SFX look-ups @@ -797,29 +852,27 @@ def mark_isbn_for_sfx(record, options): """ # For every ISBN in the record - for isbn in record.get_fields('020', '024'): - for isbnval in isbn.get_subfields('a', 'z'): + for isbn in record.get_fields("020", "024"): + for isbnval in isbn.get_subfields("a", "z"): isbnval = clean_isbn(isbnval) # And for every library we have enabled - for lib in options['libraries']: - if lib == 'boreal': + for lib in options["libraries"]: + if lib == "boreal": return False found = check_for_isbn(options, lib, isbnval) if found: # Add the $9 subfield to mark this as a good one - isbn.add_subfield('9', 'SFX') + isbn.add_subfield("9", "SFX") return True # For ebrary records, add a 924 for the custom URN - if 'ebrary' in options: + if "ebrary" in options: urn = None - for scn in record.get_fields('001'): - urn = pymarc.Field(tag = '924', - indicators = ['8', ' '], - subfields = [ - 'a', scn.value(), - '9', 'SFX' - ] + for scn in record.get_fields("001"): + urn = pymarc.Field( + tag="924", + indicators=["8", " "], + subfields=["a", scn.value(), "9", "SFX"], ) if urn is not None: @@ -828,18 +881,23 @@ def mark_isbn_for_sfx(record, options): return False + def check_for_isbn(options, lib, isbnval): """ Given an ISBN value, check SFX at the specified library for a match """ - sfx = 
options['settings'].get_settings(lib)['sfx_url'] - url = "%s?url_ver=Z39.88-2004&url_ctx_fmt=infofi/fmt:kev:mtx:ctx&" \ - "ctx_enc=UTF-8&ctx_ver=Z39.88-2004&rfr_id=info:sid/evergreen&" \ - "sfx.ignore_date_threshold=1&" \ - "sfx.response_type=multi_obj_detailed_xml" \ + sfx = options["settings"].get_settings(lib)["sfx_url"] + url = ( + "%s?url_ver=Z39.88-2004&url_ctx_fmt=infofi/fmt:kev:mtx:ctx&" + "ctx_enc=UTF-8&ctx_ver=Z39.88-2004&rfr_id=info:sid/evergreen&" + "sfx.ignore_date_threshold=1&" + "sfx.response_type=multi_obj_detailed_xml" "&__service_type=getFullTxt&rft.isbn=%s" % (sfx, isbnval) + ) - headers = {'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'} + headers = { + "user-agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0" + } req = requests.get(url, headers=headers) try: req.raise_for_status() @@ -853,20 +911,19 @@ def check_for_isbn(options, lib, isbnval): sfx_res = BeautifulSoup(req.text, "html.parser") # We want a target with a service_type element of 'getFullTxt' - targets = sfx_res.ctx_obj.ctx_obj_targets.findAll( - 'target', recursive=False - ) + targets = sfx_res.ctx_obj.ctx_obj_targets.findAll("target", recursive=False) if len(targets) == 0: # No SFX targets found for this ISBN - next! return False for target in targets: - if target.service_type.renderContents() == 'getFullTxt': + if target.service_type.renderContents() == "getFullTxt": return True return False + def clean_isbn(isbn): """ Return a normalized ISBN from a MARC subfield @@ -879,16 +936,17 @@ def clean_isbn(isbn): isbn = isbn.strip() # Grab the first string beginning with a digit - isbn_match = re.search(r'^[\D]*([\d]+\S+).*?$', isbn) + isbn_match = re.search(r"^[\D]*([\d]+\S+).*?$", isbn) if not isbn_match.group(1): return None # Replace hyphens - isbn = isbn_match.group(1).replace('-', '') + isbn = isbn_match.group(1).replace("-", "") return isbn + def add_restriction(new_record, options, publisher): """ Adds a 506 access restriction note per institution @@ -901,123 +959,148 @@ def add_restriction(new_record, options, publisher): """ # Add a period if the authorization ends with a number or letter - authnote = options['authorization'] - if authnote[-1] not in '.)]': - authnote += '.' + authnote = options["authorization"] + if authnote[-1] not in ".)]": + authnote += "." 
- for library in options['libraries']: + for library in options["libraries"]: # Skip auth note if Algoma + CUP - if library == 'algoma' and 'Cambridge' in publisher: + if library == "algoma" and "Cambridge" in publisher: continue - libopts = options['settings'].get_settings(library) + libopts = options["settings"].get_settings(library) # Add the access restriction note - if 'consortium' in options: + if "consortium" in options: subfields = [ - 'a', append_space_semi_space(libopts['access_note']), - 'b', append_space_semi_space(options['consortium']), - 'e', authnote, - '9', libopts['lac_symbol'] + "a", + append_space_semi_space(libopts["access_note"]), + "b", + append_space_semi_space(options["consortium"]), + "e", + authnote, + "9", + libopts["lac_symbol"], ] else: subfields = [ - 'a', append_space_semi_space(libopts['access_note']), - 'e', authnote, - '9', libopts['lac_symbol'] + "a", + append_space_semi_space(libopts["access_note"]), + "e", + authnote, + "9", + libopts["lac_symbol"], ] - note = pymarc.Field(tag = '506', - indicators = ['1', ' '], - subfields = subfields - ) + note = pymarc.Field(tag="506", indicators=["1", " "], subfields=subfields) new_record.add_ordered_field(note) + def append_space_semi_space(note): """ Try to ensure the given text ends with ' ; ' """ - if note[-3:] == ' ; ': + if note[-3:] == " ; ": pass - elif note[-1] == ';': - note += ' ' - elif note[-1] == ' ': - note += '; ' + elif note[-1] == ";": + note += " " + elif note[-1] == " ": + note += "; " else: - note += ' ; ' + note += " ; " return note + def add_cat_source(record, options): """Add or extend the 040 field to identify the cataloguing source""" # Only do this for Laurentian - if 'laurentian' not in options['libraries']: + if "laurentian" not in options["libraries"]: return - cat_source = record['040'] + cat_source = record["040"] if cat_source: # Add subfield 'd' identifying Laurentian - cat_source.add_subfield('d', 'CaOSUL') + cat_source.add_subfield("d", "CaOSUL") else: # Add a 040 with subfield 'd' identifying Laurentian - forty = pymarc.Field(tag = '040', - indicators = [' ', ' '], - subfields = [ 'd', 'CaOSUL' ] + forty = pymarc.Field( + tag="040", indicators=[" ", " "], subfields=["d", "CaOSUL"] ) record.add_ordered_field(forty) + def add_relator_uri(field): """ Add URIs to RDA 33x fields """ - if 'b' not in field: + if "b" not in field: pass - elif field.tag == '336': - field.add_subfield('0', 'http://id.loc.gov/vocabulary/contentTypes/' + field['b']) - elif field.tag == '337': - field.add_subfield('0', 'http://id.loc.gov/vocabulary/mediaTypes/' + field['b']) - elif field.tag == '338': - field.add_subfield('0', 'http://id.loc.gov/vocabulary/carriers/' + field['b']) + elif field.tag == "336": + field.add_subfield( + "0", "http://id.loc.gov/vocabulary/contentTypes/" + field["b"] + ) + elif field.tag == "337": + field.add_subfield("0", "http://id.loc.gov/vocabulary/mediaTypes/" + field["b"]) + elif field.tag == "338": + field.add_subfield("0", "http://id.loc.gov/vocabulary/carriers/" + field["b"]) return field + def add_rda_fields(record): """ Add 336,337,338 fields identifying the content as an ebook """ - content = pymarc.Field(tag = '336', - indicators = [' ', ' '], - subfields = [ - 'a', 'text', - 'b', 'txt', - '2', 'rdacontent', - '0', 'http://id.loc.gov/vocabulary/contentTypes/txt' - ] + content = pymarc.Field( + tag="336", + indicators=[" ", " "], + subfields=[ + "a", + "text", + "b", + "txt", + "2", + "rdacontent", + "0", + "http://id.loc.gov/vocabulary/contentTypes/txt", + ], ) - media 
= pymarc.Field(tag = '337', - indicators = [' ', ' '], - subfields = [ - 'a', 'computer', - 'b', 'c', - '2', 'rdamedia', - '0', 'http://id.loc.gov/vocabulary/mediaTypes/c' - ] + media = pymarc.Field( + tag="337", + indicators=[" ", " "], + subfields=[ + "a", + "computer", + "b", + "c", + "2", + "rdamedia", + "0", + "http://id.loc.gov/vocabulary/mediaTypes/c", + ], ) - carrier = pymarc.Field(tag = '338', - indicators = [' ', ' '], - subfields = [ - 'a', 'online resource', - 'b', 'cr', - '2', 'rdacarrier', - '0', 'http://id.loc.gov/vocabulary/carriers/cr' - ] + carrier = pymarc.Field( + tag="338", + indicators=[" ", " "], + subfields=[ + "a", + "online resource", + "b", + "cr", + "2", + "rdacarrier", + "0", + "http://id.loc.gov/vocabulary/carriers/cr", + ], ) record.add_ordered_field(content) record.add_ordered_field(media) record.add_ordered_field(carrier) + def add_marc_source(record, options): """ Add a 598 field identifying the source MARC file name and processing date @@ -1025,87 +1108,85 @@ def add_marc_source(record, options): global RECORD_COUNT - source = os.path.basename(options['input']) + source = os.path.basename(options["input"]) - marc_source = pymarc.Field(tag = '598', - indicators = [' ', ' '], - subfields = [ - 'a', source, - 'b', date.today().isoformat(), - 'c', str(RECORD_COUNT) - ] + marc_source = pymarc.Field( + tag="598", + indicators=[" ", " "], + subfields=["a", source, "b", date.today().isoformat(), "c", str(RECORD_COUNT)], ) record.add_ordered_field(marc_source) + def create_clean_isbn(field): """Move 020a junk to 020q""" - - if not field.get_subfields('a') or ' ' not in field['a']: + + if not field.get_subfields("a") or " " not in field["a"]: return field - isbn = pymarc.Field( - tag = '020', - indicators=[field.indicator1, field.indicator2] - ) + isbn = pymarc.Field(tag="020", indicators=[field.indicator1, field.indicator2]) for sf in field: - if sf[0] == 'a' and ' ' in sf[1]: + if sf[0] == "a" and " " in sf[1]: junk = sf[1].strip() - junk = junk[junk.find(' '):].strip() - isbn.add_subfield('a', clean_isbn(sf[1])) - isbn.add_subfield('q', junk) + junk = junk[junk.find(" ") :].strip() + isbn.add_subfield("a", clean_isbn(sf[1])) + isbn.add_subfield("q", junk) else: isbn.add_subfield(sf[0], sf[1]) return isbn + def process_urls(field, options, publisher): """Creates 856 fields required by Conifer""" new_fields = [] - if not field['u']: + if not field["u"]: print("* No subfield 'u' found in this 856") return None # If we have a ToC or author notes or whatever, replace with content - if field['u'].find('.loc.gov') > -1: + if field["u"].find(".loc.gov") > -1: enrich = substitute_content(field) if enrich and isinstance(enrich, pymarc.field.Field): new_fields.append(enrich) else: - for lib in options['libraries']: + for lib in options["libraries"]: # Tweak for Algoma for combined CUP/OUP - if lib == 'algoma' and 'Cambridge' in publisher: + if lib == "algoma" and "Cambridge" in publisher: continue - data = options['settings'].get_settings(lib) + data = options["settings"].get_settings(lib) - platform = options['platform'] - if field['u'].find('books.scholarsportal') > -1: - platform = 'ScholarsPortal' + platform = options["platform"] + if field["u"].find("books.scholarsportal") > -1: + platform = "ScholarsPortal" subs = get_subfields(field, data, platform) - eight_five_six = pymarc.Field(tag = '856', - indicators = ['4', '0'], - subfields = subs + eight_five_six = pymarc.Field( + tag="856", indicators=["4", "0"], subfields=subs ) new_fields.append(eight_five_six) return 
new_fields + def substitute_content(field): """Parses a ToC or author notes URL and generates a field""" - url = field['u'] + url = field["u"] content_field = None - raw_content = '' + raw_content = "" # Skip machine-generated tables of contents - if url.find('/toc/') > -1: + if url.find("/toc/") > -1: return None # Get the data from the supplied URL - headers = {'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'} + headers = { + "user-agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0" + } req = requests.get(url, headers=headers) try: req.raise_for_status() @@ -1121,56 +1202,49 @@ def substitute_content(field): if not content: return None - if url.endswith('-b.html'): - # Biographical note + if url.endswith("-b.html"): + # Biographical note content_field = pymarc.Field( - tag = '545', - indicators = ['1', ' '], - subfields = ['a', content] + tag="545", indicators=["1", " "], subfields=["a", content] ) - elif url.endswith('-d.html'): - # Summary written by publisher + elif url.endswith("-d.html"): + # Summary written by publisher content_field = pymarc.Field( - tag = '520', - indicators = ['3', ' '], - subfields = ['a', content] + tag="520", indicators=["3", " "], subfields=["a", content] ) - elif url.endswith('-t.html'): - # Table of contents + elif url.endswith("-t.html"): + # Table of contents content_field = pymarc.Field( - tag = '505', - indicators = [' ', ' '], - subfields = ['a', content] + tag="505", indicators=[" ", " "], subfields=["a", content] ) else: print("URL %s didn't match known LoC type" % (url)) return content_field + def process_loc_data(raw_content): """Given the LoC enriched data, make it usable""" # Short-circuit if we have an OCRed ToC; the quality is terrible - if raw_content.find(text='Electronic data is machine generated'): + if raw_content.find(text="Electronic data is machine generated"): return None - elif raw_content.find('
<pre>'):
+    elif raw_content.find("<pre>"):
         return None
 
     # Get all of the text after the horizontal rule
-    content = ' '.join(
-        raw_content.find('hr').findAllNext(text=True)
-    )
+    content = " ".join(raw_content.find("hr").findAllNext(text=True))
 
     # Remove linefeeds
-    content = content.replace('\n', ' ')
-    content = content.replace('\r', ' ')
+    content = content.replace("\n", " ")
+    content = content.replace("\r", " ")
 
     # Replace multiple contiguous whitespace with a single space
-    content = re.sub(r'\s+', r' ', content)
+    content = re.sub(r"\s+", r" ", content)
 
     # Remove inline subject headings to avoid too much indexing boost
-    lcsh = content.find('Library of Congress subject headings')
+    lcsh = content.find("Library of Congress subject headings")
     if lcsh > -1:
         content = content[0:lcsh]
 
@@ -1179,57 +1253,54 @@ def process_loc_data(raw_content):
 
     return content
 
+
 def get_subfields(field, data, platform):
     """Creates 856 subfields required by Conifer"""
 
     subs = []
-    url = field['u']
+    url = field["u"]
 
     # Is this an ebrary URL?
     ebrary = False
-    if url.find('.ebrary.com') > -1:
+    if url.find(".ebrary.com") > -1:
         ebrary = True
 
     # ebrary URLs look like: http://site.ebrary.com/lib/<channel>/Doc?id=2001019
     # we need to replace <channel> with the library-specific channel
     if ebrary:
-        ebrary_url = re.search(r'^(.+?/lib/).+?(/.+?)$', url)
-        url = ebrary_url.group(1) + data['ebrary_code'] + ebrary_url.group(2)
+        ebrary_url = re.search(r"^(.+?/lib/).+?(/.+?)$", url)
+        url = ebrary_url.group(1) + data["ebrary_code"] + ebrary_url.group(2)
 
     # Only Boreal still wants proxied ebrary links
-    if ebrary and data['ebrary_code'] != 'ocls':
-        subs.extend(['u', url])
+    if ebrary and data["ebrary_code"] != "ocls":
+        subs.extend(["u", url])
     else:
-        if (data['ebrary_code'] == 'ocls' and
-            re.search(r'ra.ocls.ca', field['u'])
-        ):
-            subs.extend(['u', field['u']])
+        if data["ebrary_code"] == "ocls" and re.search(r"ra.ocls.ca", field["u"]):
+            subs.extend(["u", field["u"]])
         else:
-            subs.extend(['u', data['proxy'] + field['u']])
+            subs.extend(["u", data["proxy"] + field["u"]])
 
     # Check for a $z as the first 856; in Springer records, at least, this
     # indicates a multi-volume set that requires keeping the $z around
-    if field.subfields[0] == 'z' and (
+    if field.subfields[0] == "z" and (
         # However, we don't want to keep garbage-y public notes
-        not field.get_subfields('z')[0].startswith('Connect to MyiLibrary')
+        not field.get_subfields("z")[0].startswith("Connect to MyiLibrary")
     ):
         subs.extend([field.subfields[0], field.subfields[1]])
 
-    link_text = data['link_text']
+    link_text = data["link_text"]
     # We don't know what the 956 platform is
-    if platform and field.tag != '956':
-        link_text = "%s (%s)" % (data['link_text'], platform)
-    elif url.find('springer.com') > -1:
-        link_text = "%s (%s)" % (data['link_text'], 'Springer')
+    if platform and field.tag != "956":
+        link_text = "%s (%s)" % (data["link_text"], platform)
+    elif url.find("springer.com") > -1:
+        link_text = "%s (%s)" % (data["link_text"], "Springer")
     else:
-        link_text = "%s" % (data['link_text'])
-    subs.extend([
-            'y', link_text,
-            '9', data['code']
-    ])
+        link_text = "%s" % (data["link_text"])
+    subs.extend(["y", link_text, "9", data["code"]])
 
     return subs
 
+
 def process_xml(record):
     global OPTIONS
     global FILES
@@ -1237,17 +1308,18 @@ def process_xml(record):
     files = FILES
     process_record(record, options, files)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     OPTIONS = parse_opts()
-    for fname in ('duplicate', 'tcn', 'url', 'sample', 'output'):
+    for fname in ("duplicate", "tcn", "url", "sample", "output"):
         if fname in OPTIONS:
             try:
-                if 'to-format' in OPTIONS and OPTIONS['to-format'] == 'xml':
-                    FILES[fname] = codecs.open(OPTIONS[fname], 'wb', 'utf-8')
+                if "to-format" in OPTIONS and OPTIONS["to-format"] == "xml":
+                    FILES[fname] = codecs.open(OPTIONS[fname], "wb", "utf-8")
                 else:
-                    FILES[fname] = pymarc.MARCWriter(open(OPTIONS[fname], 'wb'))
+                    FILES[fname] = pymarc.MARCWriter(open(OPTIONS[fname], "wb"))
             except Exception as ex:
                 print("Could not open output file [%s]: %s" % (OPTIONS[fname], ex))
 
     process_marc(OPTIONS)
-    #pymarc.map_xml(process_xml, '/home/dan/Downloads/AlexanderStreetPress_JazzMusicLibrary_Canada_MONOSER_2012-05-23.xml')
+    # pymarc.map_xml(process_xml, '/home/dan/Downloads/AlexanderStreetPress_JazzMusicLibrary_Canada_MONOSER_2012-05-23.xml')
-- 
2.11.0