be accommodated in batch load.
"""
-import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, json
-import codecs, copy
+import os
+import os.path
+import sys
+import getopt
+import pymarc
+import re
+import json
+import codecs
+import copy
+import traceback
import requests
from datetime import date
from bs4 import BeautifulSoup
RECORD_COUNT = 0
DUP_COUNT = 0
-GATEWAY_URL = "http://www.concat.ca/osrf-gateway-v1"
+GATEWAY_URL = "https://www.concat.ca/osrf-gateway-v1"
OPENSRF_ISBN_CALL = "open-ils.search.biblio.isbn"
OPENSRF_TCN_CALL = "open-ils.search.biblio.tcn"
OPENSRF_KEYWORD_CALL = "open-ils.search.biblio.multiclass.query"
OPTIONS = {}
FILES = {}
-class Institution():
+
+class Institution:
"""Defines standard settings for each Conifer institution"""
def __init__(self):
"""Initialize the Institution object"""
- self.algoma = { \
- "code": "ALGOMASYS", \
- "lac_symbol": "OSTMA", \
- "org_unit": "111", \
- "ebrary_code": "algomauca", \
- "proxy": "http://libproxy.auc.ca/login?url=", \
- "link_text": "Available online", \
- "sfx_url": "http://sfx.scholarsportal.info/algoma", \
- "access_note": "Access restricted to users with a valid Algoma University ID ;" \
+ self.algoma = {
+ "code": "ALGOMASYS",
+ "lac_symbol": "OSTMA",
+ "org_unit": "111",
+ "ebrary_code": "algomauca",
+ "proxy": "http://libproxy.auc.ca/login?url=",
+ "link_text": "Available online",
+ "sfx_url": "http://sfx.scholarsportal.info/algoma",
+ "access_note": "Access restricted to users with a valid Algoma University ID ;",
}
- self.boreal = { \
- "code": "BOREALSYS", \
- "lac_symbol": "BOREALSYS", \
- "org_unit": "135", \
- "ebrary_code": "ocls", \
- "proxy": "http://ra.ocls.ca/ra/login.aspx?url=", \
- "link_text": "Disponible en ligne", \
- "access_note": u"Accès réservé aux utilisateurs avec un ID valide Collège Boréal ;" \
+ self.boreal = {
+ "code": "BOREALSYS",
+ "lac_symbol": "BOREALSYS",
+ "org_unit": "135",
+ "ebrary_code": "ocls",
+ "proxy": "http://ra.ocls.ca/ra/login.aspx?url=",
+ "link_text": "Disponible en ligne",
+ "access_note": u"Accès réservé aux utilisateurs avec un ID valide Collège Boréal ;",
}
- self.laurentian = { \
- "code": "LUSYS", \
- "lac_symbol": "OSUL", \
- "org_unit": "105", \
- "ebrary_code": "jndlu", \
- "gale_code": "sudb78095", \
- "proxy": "https://login.librweb.laurentian.ca/login?url=", \
- "link_text": "Available online / disponible en ligne", \
- "sfx_url": "https://sfx.scholarsportal.info/laurentian", \
- "access_note": "Access restricted to users with a valid Laurentian University ID ;" \
+ self.laurentian = {
+ "code": "LUSYS",
+ "lac_symbol": "OSUL",
+ "org_unit": "105",
+ "ebrary_code": "jndlu",
+ "gale_code": "sudb78095",
+ "proxy": "https://login.librweb.laurentian.ca/login?url=",
+ "link_text": "Available online / disponible en ligne",
+ "sfx_url": "https://sfx.scholarsportal.info/laurentian",
+ "access_note": "Access restricted to users with a valid Laurentian University ID ;",
}
def get_settings(self, lib):
"""Return the settings for a library by name"""
return getattr(self, lib)
+
def do_help():
- '''
+ """
Print help for the Conifer ebook MARC processor
- '''
+ """
- print('''
+ print(
+ """
Conifer ebook MARC processor
This script takes a set of MARC records and processes them to generate a set
Examples:
%s --algoma -i crkn.mrc -o /tmp/crkn_out.mrc -p "eBrary Inc."
- ''' % (sys.argv[0],))
+ """
+ % (sys.argv[0],)
+ )
sys.exit(0)
+
def consolidate_options(opts):
"""Make long arguments the standard form in command line options"""
_shortlong = {
- '-i': '--input',
- '-o': '--output',
- '-a': '--authorization',
- '-c': '--consortium',
- '-C': '--clean',
- '-d': '--duplicate',
- '-e': '--ebrary',
- '-F': '--from-format',
- '-I': '--isbn-sfx',
- '-p': '--publisher',
- '-P': '--platform',
- '-n': '--note',
- '-t': '--tcn',
- '-T': '--to-format',
- '-u': '--url',
- '-A': '--algoma',
- '-B': '--boreal',
- '-L': '--laurentian',
- '-s': '--sample',
- '-x': '--cut-field',
- '-h': '--help'
+ "-i": "--input",
+ "-o": "--output",
+ "-a": "--authorization",
+ "-c": "--consortium",
+ "-C": "--clean",
+ "-d": "--duplicate",
+ "-e": "--ebrary",
+ "-F": "--from-format",
+ "-I": "--isbn-sfx",
+ "-p": "--publisher",
+ "-P": "--platform",
+ "-n": "--note",
+ "-t": "--tcn",
+ "-T": "--to-format",
+ "-u": "--url",
+ "-A": "--algoma",
+ "-B": "--boreal",
+ "-L": "--laurentian",
+ "-s": "--sample",
+ "-x": "--cut-field",
+ "-h": "--help",
}
_options = dict(opts)
return _options
+
def check_options(options):
"""Check the validity of options that were passed in"""
_help = False
_req = {
- '--input': "* Missing -i / --input argument!",
- '--output': "* Missing -o / --output argument!",
- '--authorization': "* Missing -a / --authorization argument!",
- '--publisher': "* Missing -p / --publisher argument!"
+ "--input": "* Missing -i / --input argument!",
+ "--output": "* Missing -o / --output argument!",
+ "--authorization": "* Missing -a / --authorization argument!",
+ "--publisher": "* Missing -p / --publisher argument!",
}
- if '--help' in options:
+ if "--help" in options:
do_help()
for reqkey, reqwarn in _req.items():
_help = True
_libraries = check_libraries(options)
- if len(_libraries.keys()) == 0:
+    if not _libraries:
_help = True
- if _help == True:
+ if _help:
do_help()
# Get the input and output files
- _input = options['--input']
- _output = options['--output']
+ _input = options["--input"]
+ _output = options["--output"]
try:
os.stat(_input)
sys.exit(0)
_bool_opts = {
- '--clean': 'clean',
- '--ebrary': 'ebrary',
- '--isbn-sfx': 'isbn-sfx',
+ "--clean": "clean",
+ "--ebrary": "ebrary",
+ "--isbn-sfx": "isbn-sfx",
}
_string_opts = {
- '--authorization': 'authorization',
- '--consortium': 'consortium',
- '--duplicate': 'duplicate',
- '--from-format': 'from-format',
- '--note': 'note',
- '--platform': 'platform',
- '--sample': 'sample',
- '--cut-field': 'cut-field',
- '--tcn': 'tcn',
- '--to-format': 'to-format',
- '--url': 'url',
+ "--authorization": "authorization",
+ "--consortium": "consortium",
+ "--duplicate": "duplicate",
+ "--from-format": "from-format",
+ "--note": "note",
+ "--platform": "platform",
+ "--sample": "sample",
+ "--cut-field": "cut-field",
+ "--tcn": "tcn",
+ "--to-format": "to-format",
+ "--url": "url",
}
clean_opts = dict()
- clean_opts['publisher'] = append_period(options['--publisher'])
+ clean_opts["publisher"] = append_period(options["--publisher"])
for optkey, optval in _bool_opts.items():
if optkey in options:
-            if optkey in options:
-                clean_opts[optval] = options[optkey]
+            clean_opts[optval] = options[optkey]
- clean_opts['libraries'] = _libraries
- clean_opts['input'] = _input
- clean_opts['output'] = _output
- clean_opts['settings'] = Institution()
+ clean_opts["libraries"] = _libraries
+ clean_opts["input"] = _input
+ clean_opts["output"] = _output
+ clean_opts["settings"] = Institution()
return clean_opts
+
def evergreen_request(method, *args, **kwargs):
"""Issue a basic gateway request against Evergreen"""
-
- service = '.'.join(method.split('.')[:2])
- kwargs.update({'service':service, 'method':method})
- params = ['%s=%s' % (k, quote(v)) for k, v in kwargs.items()]
- params += ['param=%s' % quote(json.dumps(a)) for a in args]
- url = '%s?%s' % (GATEWAY_URL, '&'.join(params))
- #print '--->', url
+ from urllib.parse import quote
+
+ service = ".".join(method.split(".")[:2])
+ kwargs.update({"service": service, "method": method})
+ params = ["%s=%s" % (k, quote(v)) for k, v in kwargs.items()]
+ params += ["param=%s" % quote(json.dumps(a)) for a in args]
+ url = "%s?%s" % (GATEWAY_URL, "&".join(params))
+ # print '--->', url
req = requests.get(url)
resp = req.json()
- if resp['status'] != 200:
- raise Exception('error during evergreen request', resp)
- payload = resp['payload']
- #print '<---', payload
+ if resp["status"] != 200:
+ raise Exception("error during evergreen request", resp)
+ payload = resp["payload"]
+ # print '<---', payload
return payload
+
def url_check(record, options):
"""Check for a matching URL in Evergreen"""
match = False
match_id = 0
# Oxford MARC files from ScholarsPortal have DOIs in 956(!)
- for url in record.get_fields('856','956'):
- for urlval in url.get_subfields('u'):
+ for url in record.get_fields("856", "956"):
+ for urlval in url.get_subfields("u"):
# print "urlval", urlval
- for library in options['libraries']:
- libopts = options['settings'].get_settings(library)
- keyword_info = evergreen_request(OPENSRF_KEYWORD_CALL,
- {'org_unit': libopts['org_unit'],
- 'depth': 1, 'limit': 5, 'offset': 0,
- 'visibility_limit': 3000,
- 'default_class': 'keyword'},
- urlval, 1)
- bib_ids = keyword_info[0]['ids']
+ for library in options["libraries"]:
+ libopts = options["settings"].get_settings(library)
+ keyword_info = evergreen_request(
+ OPENSRF_KEYWORD_CALL,
+ {
+ "org_unit": libopts["org_unit"],
+ "depth": 1,
+ "limit": 5,
+ "offset": 0,
+ "visibility_limit": 3000,
+ "default_class": "keyword",
+ },
+ urlval,
+ 1,
+ )
+ bib_ids = keyword_info[0]["ids"]
for bib_id in bib_ids:
match_id = bib_id
- print("* %d of %d - URL match on %s for %s"
- % (DUP_COUNT + 1, RECORD_COUNT, urlval, bib_id[0])
+ print(
+ "* %d of %d - URL match on %s for %s"
+ % (DUP_COUNT + 1, RECORD_COUNT, urlval, bib_id[0])
)
match = True
return match_id, match
+
def tcn_check(record):
"""Check for a matching TCN in Evergreen"""
match = False
match_id = 0
- for tcn in record.get_fields('001'):
+ for tcn in record.get_fields("001"):
tcn_val = tcn.value()
tcn_info = evergreen_request(OPENSRF_TCN_CALL, tcn_val)
- bib_ids = tcn_info[0]['ids']
+ bib_ids = tcn_info[0]["ids"]
# print "tcn_info", tcn_info
for bib_id in bib_ids:
match_id = bib_id
- print("* %d of %d - TCN match on %s for %s"
+ print(
+ "* %d of %d - TCN match on %s for %s"
% (DUP_COUNT + 1, RECORD_COUNT, tcn_val, bib_id)
)
match = True
return match_id, match
+
def isbn_check(record):
"""Check for a matching ISBN in Evergreen"""
match = False
match_id = 0
- for isbn in record.get_fields('020', '024'):
- for isbnval in isbn.get_subfields('a', 'z'):
+ for isbn in record.get_fields("020", "024"):
+ for isbnval in isbn.get_subfields("a", "z"):
isbn_val = clean_isbn(isbnval)
-            isbn_info = evergreen_request(OPENSRF_ISBN_CALL, isbnval)
+            isbn_info = evergreen_request(OPENSRF_ISBN_CALL, isbn_val)
- #print "count", isbn_info[0]['count']
- bib_ids = isbn_info[0]['ids']
+ # print "count", isbn_info[0]['count']
+ bib_ids = isbn_info[0]["ids"]
for bib_id in bib_ids:
match_id = bib_id
- print("* %d of %d - ISBN match on %s for %s"
+ print(
+ "* %d of %d - ISBN match on %s for %s"
% (DUP_COUNT + 1, RECORD_COUNT, isbn_val, bib_id)
)
match = True
return match_id, match
+
def append_period(text):
"""
Append a period to the incoming text if required
"""
- if text[-1] != '.':
- text += '.'
+ if text[-1] != ".":
+ text += "."
return text
+
def check_libraries(options):
"""Build a dict of the libraries that were requested for this batch"""
_libraries = dict()
- for lib in ['algoma', 'boreal', 'laurentian']:
- if '--' + lib in options:
+ for lib in ["algoma", "boreal", "laurentian"]:
+ if "--" + lib in options:
_libraries[lib] = True
return _libraries
def parse_opts():
"""Get command-line arguments from the script"""
try:
- _short_opts = 'i:o:a:p:P:ABLc:eCId:F:T:t:u:n:s:x:h'
- _long_opts = ['input=', 'output=', 'authorization=',
- 'publisher=', 'platform=', 'algoma', 'boreal', 'laurentian',
- 'consortium=', 'ebrary', 'clean', 'isbn-sfx', 'duplicate=',
- 'from-format=', 'to-format=', 'tcn=', 'url=', 'note=', 'sample=',
- 'cut-field=', 'help'
+ _short_opts = "i:o:a:p:P:ABLc:eCId:F:T:t:u:n:s:x:h"
+ _long_opts = [
+ "input=",
+ "output=",
+ "authorization=",
+ "publisher=",
+ "platform=",
+ "algoma",
+ "boreal",
+ "laurentian",
+ "consortium=",
+ "ebrary",
+ "clean",
+ "isbn-sfx",
+ "duplicate=",
+ "from-format=",
+ "to-format=",
+ "tcn=",
+ "url=",
+ "note=",
+ "sample=",
+ "cut-field=",
+ "help",
]
opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts)
except getopt.GetoptError as ex:
_options = consolidate_options(opts[0])
return check_options(_options)
+
def process_marc(options):
"""Converts raw ebook MARC records to Conifer-ready MARC records"""
global FILES
files = FILES
- if 'from-format' in options and options['from-format'] == 'xml':
- pymarc.map_xml(process_xml, options['input'])
+ if "from-format" in options and options["from-format"] == "xml":
+ pymarc.map_xml(process_xml, options["input"])
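+        # pymarc.map_xml() parses the MARCXML file and calls process_xml()
+        # once per record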
else:
try:
reader = pymarc.MARCReader(
- open(options['input'], mode='rb'), to_unicode=True
+ open(options["input"], mode="rb"), to_unicode=True
)
except Exception as ex:
- print("Could not open input file [%s]" % options['input'])
+ print("Could not open input file [%s]" % options["input"])
for record in reader:
process_record(record, options, files)
+
def process_record(record, options, files):
global RECORD_COUNT
global DUP_COUNT
RECORD_COUNT += 1
try:
- if not (record['856'] and record['856']['u']):
- print("* No 856 for record # %s in file %s"
- % (RECORD_COUNT, options['input'])
+ if not (record["856"] and record["856"]["u"]):
+ print(
+ "* No 856 for record # %s in file %s" % (RECORD_COUNT, options["input"])
)
else:
- print("%d - %s" % (RECORD_COUNT, record['856']))
+ print("%d - %s" % (RECORD_COUNT, record["856"]))
dupe_flags = {}
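+            # Dedupe cascade: ISBN first, then TCN, then URL. A record that
+            # matches is tagged with a 909 and written to that check's own
+            # file; only unmatched records fall through to the main output.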
- if 'duplicate' in files:
+ if "duplicate" in files:
tmp_record = process_fields(copy.deepcopy(record), options)
- bib_id, dupe_flags['isbn'] = isbn_check(tmp_record)
- if dupe_flags['isbn']:
+ bib_id, dupe_flags["isbn"] = isbn_check(tmp_record)
+ if dupe_flags["isbn"]:
tmp_record = add_dupe_field(tmp_record, bib_id)
- files['duplicate'].write(tmp_record)
+ files["duplicate"].write(tmp_record)
else:
- del(dupe_flags['isbn'])
+ del dupe_flags["isbn"]
- if 'tcn' in files and len(dupe_flags) == 0:
+ if "tcn" in files and not dupe_flags:
tmp_record = process_fields(copy.deepcopy(record), options)
- bib_id, dupe_flags['tcn'] = tcn_check(tmp_record)
- if dupe_flags['tcn']:
+ bib_id, dupe_flags["tcn"] = tcn_check(tmp_record)
+ if dupe_flags["tcn"]:
tmp_record = add_dupe_field(tmp_record, bib_id)
- files['tcn'].write(tmp_record)
+ files["tcn"].write(tmp_record)
else:
- del(dupe_flags['tcn'])
+ del dupe_flags["tcn"]
- if 'url' in files and len(dupe_flags) == 0:
+ if "url" in files and not dupe_flags:
tmp_record = process_fields(copy.deepcopy(record), options)
- bib_id, dupe_flags['url'] = url_check(tmp_record, options)
- if dupe_flags['url']:
+ bib_id, dupe_flags["url"] = url_check(tmp_record, options)
+ if dupe_flags["url"]:
tmp_record = add_dupe_field(tmp_record, bib_id)
- files['url'].write(tmp_record)
+ files["url"].write(tmp_record)
else:
- del(dupe_flags['url'])
+ del dupe_flags["url"]
- if len(dupe_flags):
+ if dupe_flags:
DUP_COUNT += 1
else:
new_record = process_fields(record, options)
- if 'to-format' in options and options['to-format'] == 'xml':
- new_record = pymarc.record_to_xml(new_record) + '\n'
- files['output'].write(new_record)
- if ('sample' in files and (
- (RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0)
- )):
- files['sample'].write(new_record)
+ if "to-format" in options and options["to-format"] == "xml":
+ new_record = pymarc.record_to_xml(new_record) + "\n"
+ files["output"].write(new_record)
+ if "sample" in files and ((RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0)):
+ files["sample"].write(new_record)
except Exception as ex:
print("* Error processing record %d - %s" % (RECORD_COUNT, ex))
traceback.print_exc()
+
def process_fields(record, options):
"""Decide which fields to add, delete, and keep"""
new_record = pymarc.Record(to_unicode=True, force_utf8=True)
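+    # Force Leader/06 (type of record) to "a": language material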
- leader = record.leader[:6] + 'a' + record.leader[7:]
+ leader = record.leader[:6] + "a" + record.leader[7:]
new_record.leader = leader
- add_cat_source(record, options) # 040
+ add_cat_source(record, options) # 040
# 590
- if 'note' in options:
- note_value = options['note']
- note = pymarc.Field(tag = '590',
- indicators = [' ', ' '],
- subfields = [
- 'a', note_value
- ]
+ if "note" in options:
+ note_value = options["note"]
+ note = pymarc.Field(
+ tag="590", indicators=[" ", " "], subfields=["a", note_value]
)
record.add_ordered_field(note)
- add_marc_source(record, options) # 598
- if record.get_fields('336') is None:
- add_rda_fields(record, options) # 336,337,338
- publisher = add_publisher(record, options) # 710
- add_restriction(record, options, publisher) # 506
- add_platform(record, options) # 710
+ add_marc_source(record, options) # 598
+    if not record.get_fields("336"):
+        add_rda_fields(record)  # 336,337,338
+ publisher = add_publisher(record, options) # 710
+ add_restriction(record, options, publisher) # 506
+ add_platform(record, options) # 710
- if 'isbn-sfx' in options:
+ if "isbn-sfx" in options:
marked_isbn = mark_isbn_for_sfx(record, options)
for field in record.get_fields():
- if 'clean' in options:
+ if "clean" in options:
field = clean_diacritics(field)
# Process all of the 856 fields
# Oxford MARC files from ScholarsPortal have DOIs in 956(!)
- if field.tag == '856' or field.tag == '956':
+ if field.tag == "856" or field.tag == "956":
new_fields = process_urls(field, options, publisher)
if new_fields:
for new_856 in new_fields:
new_record.add_ordered_field(new_856)
# Strip out 9xx fields: we don't want local fields in our records
# except for 924 fields that we create
- elif field.tag[0] == '9' and field.tag != '924':
+ elif field.tag[0] == "9" and field.tag != "924":
pass
# ISBN cleanup
- elif field.tag == '020':
+ elif field.tag == "020":
new_isbn = create_clean_isbn(field)
new_record.add_ordered_field(new_isbn)
# Strip out 300 fields that only contain placeholders
- elif field.tag == '300' and field['a'] == 'p. cm.':
+ elif field.tag == "300" and field["a"] == "p. cm.":
pass
# Add relator URIs
- elif field.tag.startswith('33') and field['0'] is None:
+ elif field.tag.startswith("33") and field["0"] is None:
field = add_relator_uri(field)
new_record.add_ordered_field(field)
# Strip out useless fields
- elif 'cut-field' in options and field.tag in options['cut-field']:
+ elif "cut-field" in options and field.tag in options["cut-field"]:
pass
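+        # Force 008/23 (form of item) to "s": electronic resource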
- elif field.tag == '008' and field.value()[23] != 's':
+ elif field.tag == "008" and field.value()[23] != "s":
fixed_field = pymarc.Field(
- tag='008',
- data=field.value()[:23] + 's' + field.value()[24:]
+ tag="008", data=field.value()[:23] + "s" + field.value()[24:]
)
new_record.add_ordered_field(fixed_field)
# Strip out GMD
- elif field.tag == '245':
- if 'h' in field:
- suffix = field['h'][-3:]
- field.delete_subfield('h')
- field['a'] = field['a'] + suffix
+ elif field.tag == "245":
+ if "h" in field:
+ # Grab the trailing " /"
+ suffix = field["h"][-2:]
+ field.delete_subfield("h")
+ field["a"] = field["a"] + suffix
new_record.add_ordered_field(field)
else:
new_record.add_ordered_field(field)
- if 'isbn-sfx' in options and not marked_isbn:
+ if "isbn-sfx" in options and not marked_isbn:
try:
- isbn = record['020']['a']
- print("ISBN: [%s] - no matching ISBN target found in SFX for %s" %
- (isbn, new_record['856']['u'])
+ isbn = record["020"]["a"]
+ print(
+ "ISBN: [%s] - no matching ISBN target found in SFX for %s"
+ % (isbn, new_record["856"]["u"])
)
-    except:
+    except Exception:
- print("No matching ISBN target found in SFX for %s" %
- (new_record['856']['u'])
+ print(
+ "No matching ISBN target found in SFX for %s" % (new_record["856"]["u"])
)
return new_record
+
def add_dupe_field(record, bib_id):
"""Add a 909 field marking the duplicate record"""
dup_value = str(bib_id)
- dup = pymarc.Field(tag = '909',
- indicators = [' ', ' '],
- subfields = [
- 'a', dup_value
- ]
- )
+ dup = pymarc.Field(tag="909", indicators=[" ", " "], subfields=["a", dup_value])
record.add_ordered_field(dup)
return record
+
def clean_diacritics(field):
"""
Change specific patterns of bytes into other patterns of bytes
return field
new_field = pymarc.Field(
- tag=field.tag,
- indicators=[field.indicator1, field.indicator2]
+ tag=field.tag, indicators=[field.indicator1, field.indicator2]
)
for subfield in field:
- if r'\x' not in repr(subfield[1]):
+ if r"\x" not in repr(subfield[1]):
new_field.add_subfield(subfield[0], subfield[1])
continue
# Let the substitutions commence - maybe move to a map table?
# COMBINING MACRON
- tmpsf = subfield[1].replace(u'\xd5A', u'A\u0304')
- tmpsf = tmpsf.replace(u'\xd5a', u'a\u0304')
- tmpsf = tmpsf.replace(u'\xd5E', u'E\u0304')
- tmpsf = tmpsf.replace(u'\xd5e', u'e\u0304')
- tmpsf = tmpsf.replace(u'\xd5I', u'I\u0304')
- tmpsf = tmpsf.replace(u'\xd5i', u'i\u0304')
- tmpsf = tmpsf.replace(u'\xd5O', u'O\u0304')
- tmpsf = tmpsf.replace(u'\xd5o', u'o\u0304')
- tmpsf = tmpsf.replace(u'\xd5U', u'U\u0304')
- tmpsf = tmpsf.replace(u'\xd5u', u'u\u0304')
+ tmpsf = subfield[1].replace(u"\xd5A", u"A\u0304")
+ tmpsf = tmpsf.replace(u"\xd5a", u"a\u0304")
+ tmpsf = tmpsf.replace(u"\xd5E", u"E\u0304")
+ tmpsf = tmpsf.replace(u"\xd5e", u"e\u0304")
+ tmpsf = tmpsf.replace(u"\xd5I", u"I\u0304")
+ tmpsf = tmpsf.replace(u"\xd5i", u"i\u0304")
+ tmpsf = tmpsf.replace(u"\xd5O", u"O\u0304")
+ tmpsf = tmpsf.replace(u"\xd5o", u"o\u0304")
+ tmpsf = tmpsf.replace(u"\xd5U", u"U\u0304")
+ tmpsf = tmpsf.replace(u"\xd5u", u"u\u0304")
# LATIN LETTER C WITH ACUTE
- tmpsf = tmpsf.replace(u'\xd4C', u'\u0106')
- tmpsf = tmpsf.replace(u'\xd4c', u'\u0107')
+ tmpsf = tmpsf.replace(u"\xd4C", u"\u0106")
+ tmpsf = tmpsf.replace(u"\xd4c", u"\u0107")
# LATIN LETTER L WITH STROKE
- tmpsf = tmpsf.replace(u'\u00b0', u'\u0141')
+ tmpsf = tmpsf.replace(u"\u00b0", u"\u0141")
- lstroke = tmpsf.find(u'\00b1')
- if lstroke and tmpsf[lstroke + 1] == 'i':
+    lstroke = tmpsf.find(u"\u00b1")
+    if lstroke > -1 and tmpsf[lstroke + 1 : lstroke + 2] == "i":
# Modifier prime instead
- tmpsf = tmpsf.replace(u'\u00b1', u'\u02b9')
+ tmpsf = tmpsf.replace(u"\u00b1", u"\u02b9")
else:
- tmpsf = tmpsf.replace(u'\u00b1', u'\u0142')
+ tmpsf = tmpsf.replace(u"\u00b1", u"\u0142")
# COMBINING MODIFIER LETTER HALF RING
- tmpsf = tmpsf.replace(u'\xb1', u'\u02be')
+ tmpsf = tmpsf.replace(u"\xb1", u"\u02be")
# COMBINING TILDE
- tmpsf = tmpsf.replace(u'\xf5n', u'n\u0303')
+ tmpsf = tmpsf.replace(u"\xf5n", u"n\u0303")
# COMBINING CEDILLA
- tmpsf = tmpsf.replace(u'\xb0c', u'c\u0327')
- tmpsf = tmpsf.replace(u'\u01afS', u'S\u0327')
- tmpsf = tmpsf.replace(u'\u01afs', u's\u0327')
+ tmpsf = tmpsf.replace(u"\xb0c", u"c\u0327")
+ tmpsf = tmpsf.replace(u"\u01afS", u"S\u0327")
+ tmpsf = tmpsf.replace(u"\u01afs", u"s\u0327")
# S WITH COMBINING ACUTE ACCENT
- tmpsf = tmpsf.replace(u'\xd4S', u'\u015a')
- tmpsf = tmpsf.replace(u'\xd4s', u'\u015b')
+ tmpsf = tmpsf.replace(u"\xd4S", u"\u015a")
+ tmpsf = tmpsf.replace(u"\xd4s", u"\u015b")
# A CARON
- tmpsf = tmpsf.replace(u'\xdaA', u'\u0100')
- tmpsf = tmpsf.replace(u'\xdaa', u'\u0101')
+ tmpsf = tmpsf.replace(u"\xdaA", u"\u0100")
+ tmpsf = tmpsf.replace(u"\xdaa", u"\u0101")
# C CARON
- tmpsf = tmpsf.replace(u'\xdaC', u'\u010c')
- tmpsf = tmpsf.replace(u'\xdac', u'\u010d')
+ tmpsf = tmpsf.replace(u"\xdaC", u"\u010c")
+ tmpsf = tmpsf.replace(u"\xdac", u"\u010d")
# R CARON
- tmpsf = tmpsf.replace(u'\xdaR', u'\u0158')
- tmpsf = tmpsf.replace(u'\xdar', u'\u0159')
+ tmpsf = tmpsf.replace(u"\xdaR", u"\u0158")
+ tmpsf = tmpsf.replace(u"\xdar", u"\u0159")
# E BREVE
- tmpsf = tmpsf.replace(u'\xe6E', u'\u0114')
- tmpsf = tmpsf.replace(u'\xe6e', u'\u0115')
+ tmpsf = tmpsf.replace(u"\xe6E", u"\u0114")
+ tmpsf = tmpsf.replace(u"\xe6e", u"\u0115")
# S CARON
- tmpsf = tmpsf.replace(u'\xdaS', u'\u0160')
- tmpsf = tmpsf.replace(u'\xdas', u'\u0161')
+ tmpsf = tmpsf.replace(u"\xdaS", u"\u0160")
+ tmpsf = tmpsf.replace(u"\xdas", u"\u0161")
# U CARON
- tmpsf = tmpsf.replace(u'\u00e6U', u'\u01d3')
- tmpsf = tmpsf.replace(u'\u00e6u', u'\u01d4')
+ tmpsf = tmpsf.replace(u"\u00e6U", u"\u01d3")
+ tmpsf = tmpsf.replace(u"\u00e6u", u"\u01d4")
# G BREVE
- tmpsf = tmpsf.replace(u'\xe6G', u'\u011e')
- tmpsf = tmpsf.replace(u'\xe6g', u'\u011f')
+ tmpsf = tmpsf.replace(u"\xe6G", u"\u011e")
+ tmpsf = tmpsf.replace(u"\xe6g", u"\u011f")
# I BREVE
- tmpsf = tmpsf.replace(u'\xe6I', u'\u012c')
- tmpsf = tmpsf.replace(u'\xe6i', u'\u012d')
+ tmpsf = tmpsf.replace(u"\xe6I", u"\u012c")
+ tmpsf = tmpsf.replace(u"\xe6i", u"\u012d")
# COMBINING DOT ABOVE
- tmpsf = tmpsf.replace(u'\xfeI', u'I\u0307')
+ tmpsf = tmpsf.replace(u"\xfeI", u"I\u0307")
# COMBINING LIGATURE LEFT HALF
- tmpsf = tmpsf.replace(u'\xd9i', u'i\ufe20')
- tmpsf = tmpsf.replace(u'\xd9I', u'I\ufe20')
- tmpsf = tmpsf.replace(u'\xd9t', u't\ufe20')
+ tmpsf = tmpsf.replace(u"\xd9i", u"i\ufe20")
+ tmpsf = tmpsf.replace(u"\xd9I", u"I\ufe20")
+ tmpsf = tmpsf.replace(u"\xd9t", u"t\ufe20")
# COMBINING LIGATURE RIGHT HALF
- tmpsf = tmpsf.replace(u'\xfda', u'a\ufe21')
- tmpsf = tmpsf.replace(u'\xfds', u's\ufe21')
- tmpsf = tmpsf.replace(u'\xfdU', u'U\ufe21')
+ tmpsf = tmpsf.replace(u"\xfda", u"a\ufe21")
+ tmpsf = tmpsf.replace(u"\xfds", u"s\ufe21")
+ tmpsf = tmpsf.replace(u"\xfdU", u"U\ufe21")
# MODIFIER LETTER PRIME
- tmpsf = tmpsf.replace(u'\xf0', u'\u02b9')
+ tmpsf = tmpsf.replace(u"\xf0", u"\u02b9")
# LATIN SMALL LETTER DOTLESS I
- tmpsf = tmpsf.replace(u'\u00a9', u'\u0131')
+ tmpsf = tmpsf.replace(u"\u00a9", u"\u0131")
# LATIN LETTER E WITH DOT ABOVE
- tmpsf = tmpsf.replace(u'\u00feE', u'\u0116')
- tmpsf = tmpsf.replace(u'\u00fee', u'\u0117')
+ tmpsf = tmpsf.replace(u"\u00feE", u"\u0116")
+ tmpsf = tmpsf.replace(u"\u00fee", u"\u0117")
new_field.add_subfield(subfield[0], tmpsf)
global RECORD_COUNT
- if r'\x' in repr(tmpsf):
- print(" * %d Hex value found in %s:%s - [%s] [%s]" % (
- RECORD_COUNT, field.tag, subfield[0],
- tmpsf.encode('utf8'), repr(tmpsf)
- ))
+ if r"\x" in repr(tmpsf):
+ print(
+ " * %d Hex value found in %s:%s - [%s] [%s]"
+ % (
+ RECORD_COUNT,
+ field.tag,
+ subfield[0],
+ tmpsf.encode("utf8"),
+ repr(tmpsf),
+ )
+ )
- if (repr(subfield[1]) != repr(tmpsf)):
- print("* %d\tOld: [%s]\tNew: [%s]" % (
- RECORD_COUNT, subfield[1].encode('utf8'), tmpsf.encode('utf8')
- ))
+ if repr(subfield[1]) != repr(tmpsf):
+ print(
+ "* %d\tOld: [%s]\tNew: [%s]"
+ % (RECORD_COUNT, subfield[1].encode("utf8"), tmpsf.encode("utf8"))
+ )
return new_field
+
def add_publisher(record, options):
"""
This is a convoluted way to avoid creating a new 710 if we already
have a matching 710 and just need to add the publisher relator code.
"""
- publisher = options['publisher']
+ publisher = options["publisher"]
munge_publisher = False
need_publisher = True
raw_publisher = None
try:
- raw_publisher = record['260']['b']
+ raw_publisher = record["260"]["b"]
-    except:
+    except Exception:
pass
if raw_publisher:
- if 'Oxford' in raw_publisher or 'Clarendon' in raw_publisher:
- publisher = 'Oxford University Press.'
- elif 'Cambridge' in raw_publisher:
- publisher = 'Cambridge University Press.'
+ if "Oxford" in raw_publisher or "Clarendon" in raw_publisher:
+ publisher = "Oxford University Press."
+ elif "Cambridge" in raw_publisher:
+ publisher = "Cambridge University Press."
# Iterate through all of the existing 710 fields
- for sten in record.get_fields('710'):
- for pub in sten.get_subfields('a'):
+ for sten in record.get_fields("710"):
+ for pub in sten.get_subfields("a"):
if pub == publisher:
munge_publisher = True
- for rel in sten.get_subfields('4'):
- if rel == 'pbl':
+ for rel in sten.get_subfields("4"):
+ if rel == "pbl":
uri_for_relator = True
need_publisher = False
need_relator = False
if munge_publisher:
if need_relator:
- sten.add_subfield('4', 'http://id.loc.gov/vocabulary/relators/pbl')
+ sten.add_subfield(
+ "4", "http://id.loc.gov/vocabulary/relators/pbl"
+ )
elif uri_for_relator:
- sten['4'] = 'http://id.loc.gov/vocabulary/relators/pbl'
+ sten["4"] = "http://id.loc.gov/vocabulary/relators/pbl"
need_publisher = False
if need_publisher:
# Add the publisher, with relator code
- seven_ten = pymarc.Field(tag = '710',
- indicators = ['2', ' '],
- subfields = [
- 'a', publisher,
- '4', 'http://id.loc.gov/vocabulary/relators/pbl'
- ]
+ seven_ten = pymarc.Field(
+ tag="710",
+ indicators=["2", " "],
+ subfields=[
+ "a",
+ publisher,
+ "4",
+ "http://id.loc.gov/vocabulary/relators/pbl",
+ ],
)
record.add_ordered_field(seven_ten)
return publisher
+
def add_platform(record, options):
"""
This is a convoluted way to avoid creating a new 710 if we already
have a matching 710 for digital platform.
"""
- if not 'platform' in options:
+ if not "platform" in options:
return False
- platform = options['platform']
+ platform = options["platform"]
need_platform = True
# Iterate through all of the existing 710 fields
- for sten in record.get_fields('710'):
- for pub in sten.get_subfields('a'):
- if pub == platform or (pub == platform + '.'):
+ for sten in record.get_fields("710"):
+ for pub in sten.get_subfields("a"):
+ if pub == platform or (pub == platform + "."):
need_platform = False
if need_platform:
# Add the platform
- seven_ten = pymarc.Field(tag = '710',
- indicators = ['2', ' '],
- subfields = [
- 'a', platform
- ]
+ seven_ten = pymarc.Field(
+ tag="710", indicators=["2", " "], subfields=["a", platform]
)
record.add_ordered_field(seven_ten)
+
def mark_isbn_for_sfx(record, options):
"""
Adds a $9 subfield to the 020 (ISBN) field to use for SFX look-ups
"""
# For every ISBN in the record
- for isbn in record.get_fields('020', '024'):
- for isbnval in isbn.get_subfields('a', 'z'):
+ for isbn in record.get_fields("020", "024"):
+ for isbnval in isbn.get_subfields("a", "z"):
isbnval = clean_isbn(isbnval)
# And for every library we have enabled
- for lib in options['libraries']:
- if lib == 'boreal':
+ for lib in options["libraries"]:
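+                # Boréal has no sfx_url configured, so skip SFX marking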
+ if lib == "boreal":
return False
found = check_for_isbn(options, lib, isbnval)
if found:
# Add the $9 subfield to mark this as a good one
- isbn.add_subfield('9', 'SFX')
+ isbn.add_subfield("9", "SFX")
return True
# For ebrary records, add a 924 for the custom URN
- if 'ebrary' in options:
+ if "ebrary" in options:
urn = None
- for scn in record.get_fields('001'):
- urn = pymarc.Field(tag = '924',
- indicators = ['8', ' '],
- subfields = [
- 'a', scn.value(),
- '9', 'SFX'
- ]
+ for scn in record.get_fields("001"):
+ urn = pymarc.Field(
+ tag="924",
+ indicators=["8", " "],
+ subfields=["a", scn.value(), "9", "SFX"],
)
if urn is not None:
return False
+
def check_for_isbn(options, lib, isbnval):
"""
Given an ISBN value, check SFX at the specified library for a match
"""
- sfx = options['settings'].get_settings(lib)['sfx_url']
- url = "%s?url_ver=Z39.88-2004&url_ctx_fmt=infofi/fmt:kev:mtx:ctx&" \
- "ctx_enc=UTF-8&ctx_ver=Z39.88-2004&rfr_id=info:sid/evergreen&" \
- "sfx.ignore_date_threshold=1&" \
- "sfx.response_type=multi_obj_detailed_xml" \
+ sfx = options["settings"].get_settings(lib)["sfx_url"]
+ url = (
+ "%s?url_ver=Z39.88-2004&url_ctx_fmt=infofi/fmt:kev:mtx:ctx&"
+ "ctx_enc=UTF-8&ctx_ver=Z39.88-2004&rfr_id=info:sid/evergreen&"
+ "sfx.ignore_date_threshold=1&"
+ "sfx.response_type=multi_obj_detailed_xml"
"&__service_type=getFullTxt&rft.isbn=%s" % (sfx, isbnval)
+ )
- headers = {'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'}
+ headers = {
+ "user-agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0"
+ }
req = requests.get(url, headers=headers)
try:
req.raise_for_status()
sfx_res = BeautifulSoup(req.text, "html.parser")
# We want a target with a service_type element of 'getFullTxt'
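+        # The multi_obj_detailed_xml response is assumed to look roughly like
+        #   <ctx_obj><ctx_obj_targets><target><service_type>getFullTxt</...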
- targets = sfx_res.ctx_obj.ctx_obj_targets.findAll(
- 'target', recursive=False
- )
+ targets = sfx_res.ctx_obj.ctx_obj_targets.findAll("target", recursive=False)
if len(targets) == 0:
# No SFX targets found for this ISBN - next!
return False
for target in targets:
- if target.service_type.renderContents() == 'getFullTxt':
+ if target.service_type.renderContents() == "getFullTxt":
return True
return False
+
def clean_isbn(isbn):
"""
Return a normalized ISBN from a MARC subfield
isbn = isbn.strip()
# Grab the first string beginning with a digit
- isbn_match = re.search(r'^[\D]*([\d]+\S+).*?$', isbn)
+ isbn_match = re.search(r"^[\D]*([\d]+\S+).*?$", isbn)
-    if not isbn_match.group(1):
+    if not isbn_match or not isbn_match.group(1):
return None
# Replace hyphens
- isbn = isbn_match.group(1).replace('-', '')
+ isbn = isbn_match.group(1).replace("-", "")
return isbn
+
def add_restriction(new_record, options, publisher):
"""
Adds a 506 access restriction note per institution
"""
# Add a period if the authorization ends with a number or letter
- authnote = options['authorization']
- if authnote[-1] not in '.)]':
- authnote += '.'
+ authnote = options["authorization"]
+ if authnote[-1] not in ".)]":
+ authnote += "."
- for library in options['libraries']:
+ for library in options["libraries"]:
# Skip auth note if Algoma + CUP
- if library == 'algoma' and 'Cambridge' in publisher:
+ if library == "algoma" and "Cambridge" in publisher:
continue
- libopts = options['settings'].get_settings(library)
+ libopts = options["settings"].get_settings(library)
# Add the access restriction note
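+        # 506 subfields: $a terms governing access, $b jurisdiction,
+        # $e authorization, $9 institution symbol (local)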
- if 'consortium' in options:
+ if "consortium" in options:
subfields = [
- 'a', append_space_semi_space(libopts['access_note']),
- 'b', append_space_semi_space(options['consortium']),
- 'e', authnote,
- '9', libopts['lac_symbol']
+ "a",
+ append_space_semi_space(libopts["access_note"]),
+ "b",
+ append_space_semi_space(options["consortium"]),
+ "e",
+ authnote,
+ "9",
+ libopts["lac_symbol"],
]
else:
subfields = [
- 'a', append_space_semi_space(libopts['access_note']),
- 'e', authnote,
- '9', libopts['lac_symbol']
+ "a",
+ append_space_semi_space(libopts["access_note"]),
+ "e",
+ authnote,
+ "9",
+ libopts["lac_symbol"],
]
- note = pymarc.Field(tag = '506',
- indicators = ['1', ' '],
- subfields = subfields
- )
+ note = pymarc.Field(tag="506", indicators=["1", " "], subfields=subfields)
new_record.add_ordered_field(note)
+
def append_space_semi_space(note):
"""
Try to ensure the given text ends with ' ; '
"""
- if note[-3:] == ' ; ':
+ if note[-3:] == " ; ":
pass
- elif note[-1] == ';':
- note += ' '
- elif note[-1] == ' ':
- note += '; '
+ elif note[-1] == ";":
+ note += " "
+ elif note[-1] == " ":
+ note += "; "
else:
- note += ' ; '
+ note += " ; "
return note
+
def add_cat_source(record, options):
"""Add or extend the 040 field to identify the cataloguing source"""
# Only do this for Laurentian
- if 'laurentian' not in options['libraries']:
+ if "laurentian" not in options["libraries"]:
return
- cat_source = record['040']
+ cat_source = record["040"]
if cat_source:
# Add subfield 'd' identifying Laurentian
- cat_source.add_subfield('d', 'CaOSUL')
+ cat_source.add_subfield("d", "CaOSUL")
else:
# Add a 040 with subfield 'd' identifying Laurentian
- forty = pymarc.Field(tag = '040',
- indicators = [' ', ' '],
- subfields = [ 'd', 'CaOSUL' ]
+ forty = pymarc.Field(
+ tag="040", indicators=[" ", " "], subfields=["d", "CaOSUL"]
)
record.add_ordered_field(forty)
+
def add_relator_uri(field):
"""
Add URIs to RDA 33x fields
"""
- if 'b' not in field:
+ if "b" not in field:
pass
- elif field.tag == '336':
- field.add_subfield('0', 'http://id.loc.gov/vocabulary/contentTypes/' + field['b'])
- elif field.tag == '337':
- field.add_subfield('0', 'http://id.loc.gov/vocabulary/mediaTypes/' + field['b'])
- elif field.tag == '338':
- field.add_subfield('0', 'http://id.loc.gov/vocabulary/carriers/' + field['b'])
+ elif field.tag == "336":
+ field.add_subfield(
+ "0", "http://id.loc.gov/vocabulary/contentTypes/" + field["b"]
+ )
+ elif field.tag == "337":
+ field.add_subfield("0", "http://id.loc.gov/vocabulary/mediaTypes/" + field["b"])
+ elif field.tag == "338":
+ field.add_subfield("0", "http://id.loc.gov/vocabulary/carriers/" + field["b"])
return field
+
def add_rda_fields(record):
"""
Add 336,337,338 fields identifying the content as an ebook
"""
- content = pymarc.Field(tag = '336',
- indicators = [' ', ' '],
- subfields = [
- 'a', 'text',
- 'b', 'txt',
- '2', 'rdacontent',
- '0', 'http://id.loc.gov/vocabulary/contentTypes/txt'
- ]
+ content = pymarc.Field(
+ tag="336",
+ indicators=[" ", " "],
+ subfields=[
+ "a",
+ "text",
+ "b",
+ "txt",
+ "2",
+ "rdacontent",
+ "0",
+ "http://id.loc.gov/vocabulary/contentTypes/txt",
+ ],
)
- media = pymarc.Field(tag = '337',
- indicators = [' ', ' '],
- subfields = [
- 'a', 'computer',
- 'b', 'c',
- '2', 'rdamedia',
- '0', 'http://id.loc.gov/vocabulary/mediaTypes/c'
- ]
+ media = pymarc.Field(
+ tag="337",
+ indicators=[" ", " "],
+ subfields=[
+ "a",
+ "computer",
+ "b",
+ "c",
+ "2",
+ "rdamedia",
+ "0",
+ "http://id.loc.gov/vocabulary/mediaTypes/c",
+ ],
)
- carrier = pymarc.Field(tag = '338',
- indicators = [' ', ' '],
- subfields = [
- 'a', 'online resource',
- 'b', 'cr',
- '2', 'rdacarrier',
- '0', 'http://id.loc.gov/vocabulary/carriers/cr'
- ]
+ carrier = pymarc.Field(
+ tag="338",
+ indicators=[" ", " "],
+ subfields=[
+ "a",
+ "online resource",
+ "b",
+ "cr",
+ "2",
+ "rdacarrier",
+ "0",
+ "http://id.loc.gov/vocabulary/carriers/cr",
+ ],
)
record.add_ordered_field(content)
record.add_ordered_field(media)
record.add_ordered_field(carrier)
+
def add_marc_source(record, options):
"""
Add a 598 field identifying the source MARC file name and processing date
global RECORD_COUNT
- source = os.path.basename(options['input'])
+ source = os.path.basename(options["input"])
- marc_source = pymarc.Field(tag = '598',
- indicators = [' ', ' '],
- subfields = [
- 'a', source,
- 'b', date.today().isoformat(),
- 'c', str(RECORD_COUNT)
- ]
+ marc_source = pymarc.Field(
+ tag="598",
+ indicators=[" ", " "],
+ subfields=["a", source, "b", date.today().isoformat(), "c", str(RECORD_COUNT)],
)
record.add_ordered_field(marc_source)
+
def create_clean_isbn(field):
"""Move 020a junk to 020q"""
-
- if not field.get_subfields('a') or ' ' not in field['a']:
+
+ if not field.get_subfields("a") or " " not in field["a"]:
return field
- isbn = pymarc.Field(
- tag = '020',
- indicators=[field.indicator1, field.indicator2]
- )
+ isbn = pymarc.Field(tag="020", indicators=[field.indicator1, field.indicator2])
for sf in field:
- if sf[0] == 'a' and ' ' in sf[1]:
+ if sf[0] == "a" and " " in sf[1]:
junk = sf[1].strip()
- junk = junk[junk.find(' '):].strip()
- isbn.add_subfield('a', clean_isbn(sf[1]))
- isbn.add_subfield('q', junk)
+ junk = junk[junk.find(" ") :].strip()
+ isbn.add_subfield("a", clean_isbn(sf[1]))
+ isbn.add_subfield("q", junk)
else:
isbn.add_subfield(sf[0], sf[1])
return isbn
+
def process_urls(field, options, publisher):
"""Creates 856 fields required by Conifer"""
new_fields = []
- if not field['u']:
+ if not field["u"]:
print("* No subfield 'u' found in this 856")
return None
# If we have a ToC or author notes or whatever, replace with content
- if field['u'].find('.loc.gov') > -1:
+ if field["u"].find(".loc.gov") > -1:
enrich = substitute_content(field)
if enrich and isinstance(enrich, pymarc.field.Field):
new_fields.append(enrich)
else:
- for lib in options['libraries']:
+ for lib in options["libraries"]:
# Tweak for Algoma for combined CUP/OUP
- if lib == 'algoma' and 'Cambridge' in publisher:
+ if lib == "algoma" and "Cambridge" in publisher:
continue
- data = options['settings'].get_settings(lib)
+ data = options["settings"].get_settings(lib)
- platform = options['platform']
- if field['u'].find('books.scholarsportal') > -1:
- platform = 'ScholarsPortal'
+            # "platform" is optional, so avoid a KeyError when it is absent
+            platform = options.get("platform")
+ if field["u"].find("books.scholarsportal") > -1:
+ platform = "ScholarsPortal"
subs = get_subfields(field, data, platform)
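+            # 856 ind1=4 (HTTP), ind2=0 (link points to the resource itself)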
- eight_five_six = pymarc.Field(tag = '856',
- indicators = ['4', '0'],
- subfields = subs
+ eight_five_six = pymarc.Field(
+ tag="856", indicators=["4", "0"], subfields=subs
)
new_fields.append(eight_five_six)
return new_fields
+
def substitute_content(field):
"""Parses a ToC or author notes URL and generates a field"""
- url = field['u']
+ url = field["u"]
content_field = None
- raw_content = ''
+ raw_content = ""
# Skip machine-generated tables of contents
- if url.find('/toc/') > -1:
+ if url.find("/toc/") > -1:
return None
# Get the data from the supplied URL
- headers = {'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'}
+ headers = {
+ "user-agent": "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0"
+ }
req = requests.get(url, headers=headers)
try:
req.raise_for_status()
if not content:
return None
- if url.endswith('-b.html'):
- # Biographical note
+ if url.endswith("-b.html"):
+ # Biographical note
content_field = pymarc.Field(
- tag = '545',
- indicators = ['1', ' '],
- subfields = ['a', content]
+ tag="545", indicators=["1", " "], subfields=["a", content]
)
- elif url.endswith('-d.html'):
- # Summary written by publisher
+ elif url.endswith("-d.html"):
+ # Summary written by publisher
content_field = pymarc.Field(
- tag = '520',
- indicators = ['3', ' '],
- subfields = ['a', content]
+ tag="520", indicators=["3", " "], subfields=["a", content]
)
- elif url.endswith('-t.html'):
- # Table of contents
+ elif url.endswith("-t.html"):
+ # Table of contents
content_field = pymarc.Field(
- tag = '505',
- indicators = [' ', ' '],
- subfields = ['a', content]
+ tag="505", indicators=[" ", " "], subfields=["a", content]
)
else:
print("URL %s didn't match known LoC type" % (url))
return content_field
+
def process_loc_data(raw_content):
"""Given the LoC enriched data, make it usable"""
# Short-circuit if we have an OCRed ToC; the quality is terrible
- if raw_content.find(text='Electronic data is machine generated'):
+ if raw_content.find(text="Electronic data is machine generated"):
return None
- elif raw_content.find('<pre>'):
+ elif raw_content.find("<pre>"):
return None
# Get all of the text after the horizontal rule
- content = ' '.join(
- raw_content.find('hr').findAllNext(text=True)
- )
+ content = " ".join(raw_content.find("hr").findAllNext(text=True))
# Remove linefeeds
- content = content.replace('\n', ' ')
- content = content.replace('\r', ' ')
+ content = content.replace("\n", " ")
+ content = content.replace("\r", " ")
# Replace multiple contiguous whitespace with a single space
- content = re.sub(r'\s+', r' ', content)
+ content = re.sub(r"\s+", r" ", content)
# Remove inline subject headings to avoid too much indexing boost
- lcsh = content.find('Library of Congress subject headings')
+ lcsh = content.find("Library of Congress subject headings")
if lcsh > -1:
content = content[0:lcsh]
return content
+
def get_subfields(field, data, platform):
"""Creates 856 subfields required by Conifer"""
subs = []
- url = field['u']
+ url = field["u"]
# Is this an ebrary URL?
ebrary = False
- if url.find('.ebrary.com') > -1:
+ if url.find(".ebrary.com") > -1:
ebrary = True
# ebrary URLs look like: http://site.ebrary.com/lib/<channel>/Doc?id=2001019
# we need to replace <channel> with the library-specific channel
if ebrary:
- ebrary_url = re.search(r'^(.+?/lib/).+?(/.+?)$', url)
- url = ebrary_url.group(1) + data['ebrary_code'] + ebrary_url.group(2)
+ ebrary_url = re.search(r"^(.+?/lib/).+?(/.+?)$", url)
+ url = ebrary_url.group(1) + data["ebrary_code"] + ebrary_url.group(2)
# Only Boreal still wants proxied ebrary links
- if ebrary and data['ebrary_code'] != 'ocls':
- subs.extend(['u', url])
+ if ebrary and data["ebrary_code"] != "ocls":
+ subs.extend(["u", url])
else:
- if (data['ebrary_code'] == 'ocls' and
- re.search(r'ra.ocls.ca', field['u'])
- ):
- subs.extend(['u', field['u']])
+ if data["ebrary_code"] == "ocls" and re.search(r"ra.ocls.ca", field["u"]):
+ subs.extend(["u", field["u"]])
else:
- subs.extend(['u', data['proxy'] + field['u']])
+ subs.extend(["u", data["proxy"] + field["u"]])
# Check for a $z as the first 856; in Springer records, at least, this
# indicates a multi-volume set that requires keeping the $z around
- if field.subfields[0] == 'z' and (
+ if field.subfields[0] == "z" and (
# However, we don't want to keep garbage-y public notes
- not field.get_subfields('z')[0].startswith('Connect to MyiLibrary')
+ not field.get_subfields("z")[0].startswith("Connect to MyiLibrary")
):
subs.extend([field.subfields[0], field.subfields[1]])
- link_text = data['link_text']
+ link_text = data["link_text"]
# We don't know what the 956 platform is
- if platform and field.tag != '956':
- link_text = "%s (%s)" % (data['link_text'], platform)
- elif url.find('springer.com') > -1:
- link_text = "%s (%s)" % (data['link_text'], 'Springer')
+ if platform and field.tag != "956":
+ link_text = "%s (%s)" % (data["link_text"], platform)
+ elif url.find("springer.com") > -1:
+ link_text = "%s (%s)" % (data["link_text"], "Springer")
else:
- link_text = "%s" % (data['link_text'])
- subs.extend([
- 'y', link_text,
- '9', data['code']
- ])
+ link_text = "%s" % (data["link_text"])
+ subs.extend(["y", link_text, "9", data["code"]])
return subs
+
def process_xml(record):
global OPTIONS
global FILES
files = FILES
-    process_record(record, options, files)
+    process_record(record, OPTIONS, files)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
OPTIONS = parse_opts()
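+    # Open a writer per requested output stream; MARCXML output goes through
+    # a codecs UTF-8 stream, binary MARC through pymarc.MARCWriter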
- for fname in ('duplicate', 'tcn', 'url', 'sample', 'output'):
+ for fname in ("duplicate", "tcn", "url", "sample", "output"):
if fname in OPTIONS:
try:
- if 'to-format' in OPTIONS and OPTIONS['to-format'] == 'xml':
- FILES[fname] = codecs.open(OPTIONS[fname], 'wb', 'utf-8')
+ if "to-format" in OPTIONS and OPTIONS["to-format"] == "xml":
+ FILES[fname] = codecs.open(OPTIONS[fname], "wb", "utf-8")
else:
- FILES[fname] = pymarc.MARCWriter(open(OPTIONS[fname], 'wb'))
+ FILES[fname] = pymarc.MARCWriter(open(OPTIONS[fname], "wb"))
except Exception as ex:
print("Could not open output file [%s]: %s" % (OPTIONS[fname], ex))
process_marc(OPTIONS)
- #pymarc.map_xml(process_xml, '/home/dan/Downloads/AlexanderStreetPress_JazzMusicLibrary_Canada_MONOSER_2012-05-23.xml')
+ # pymarc.map_xml(process_xml, '/home/dan/Downloads/AlexanderStreetPress_JazzMusicLibrary_Canada_MONOSER_2012-05-23.xml')