From 0258109e4998b57571ea1dd000a8a2c7d6c7689c Mon Sep 17 00:00:00 2001 From: Dan Scott Date: Wed, 8 May 2013 09:41:09 -0400 Subject: [PATCH] Add initial version of eresource processing script Signed-off-by: Dan Scott --- tools/ebooks/prep_ebook_records.py | 482 +++++++++++++++++++++++++++++++++++++ 1 file changed, 482 insertions(+) create mode 100644 tools/ebooks/prep_ebook_records.py diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py new file mode 100644 index 0000000000..ff72922465 --- /dev/null +++ b/tools/ebooks/prep_ebook_records.py @@ -0,0 +1,482 @@ +#!/usr/bin/env python +""" +Prepare sets of electronic resource MARC records for loading into Evergreen + +To avoid duplicating MARC records in Conifer, to minimize manual labour, +and to make records as consistent as possible, we want to automate the +processing of electronic resource MARC records purchased by two or more +Conifer institutions. + +Each institution must confirm the standard data they require to be added +to e-book MARC records. The principle here is to identify standard +requirements that would be the same for each record and therefore can +be accommodated in batch load. 
+""" + +import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2 +from BeautifulSoup import BeautifulSoup + +class Institution(): + """Defines standard settings for each Conifer institution""" + + def __init__(self): + """Initialize the Institution object""" + self.algoma = { \ + "code": "OSTMA", \ + "ebrary_code": "algomauca", \ + "proxy": "http://libproxy.auc.ca/login?url=", \ + "link_text": "Available online" \ + } + + self.laurentian = { \ + "code": "OSUL", \ + "ebrary_code": "jndlu", \ + "proxy": "https://librweb.laurentian.ca/login?url=", \ + "link_text": "Available online / disponible en ligne" \ + } + + self.windsor = { \ + "code": "OWA", \ + "ebrary_code": "oculwindsor", \ + "proxy": "http://ezproxy.uwindsor.ca/login?url=", \ + "link_text": "To view Windsor's electronic resource click here." \ + } + + def get_settings(self, lib): + """Return the settings for a library by name""" + return getattr(self, lib) + + +def do_help(): + ''' + Print help for the Conifer ebook MARC processor + ''' + + print ''' +Conifer ebook MARC processor + +This script takes a set of MARC records and processes them to generate a set +of MARC records ready for loading into the Conifer consortial library +system. The processing consists of taking the existing 856 field and creating +one or more new 856 fields for each Conifer institution that should have access +to these resources. + +The script customizes the following aspects of each record: + + * Adds one 856 per institution specified at the command line: + * $u (URL) - prepends the institutional proxy and, for eBrary records, + changes the insitutional code + * $y (link text) - sets preferred text of the link to the resource + * $z (public note) - sets public note for the resource + + * Adds a 710 field to identify the publisher using the value specified + at the command line + * Adds a 590 internal note field using the value specified at the command + line. 
+ +Required arguments: + -i / --input : The name of the input MARC file. + + -o / --output : The name of the output MARC file. + + -p / --publisher : The name of the publisher to be inserted in a 710 field. + + -A / --algoma: Add an 856 for Algoma University + + -L / --laurentian: Add an 856 for Laurentian University + + -W / --windsor : Add an 856 for University of Windsor + +Optional arguments: + -n / --note : The text of the internal note to be inserted into a 590 field. + + -s / --sample : The name of the sample output MARC file (generates + 1 sample record for every 100 records processed) + + -h / --help : Prints help message + +Examples: + %s --algoma --windsor -i crkn.mrc -o /tmp/crkn_out.mrc -p "eBrary Inc." + ''' % sys.argv[0] + sys.exit(0) + +def consolidate_options(opts): + """Make long arguments the standard form in command line options""" + + _options = dict(opts) + + for key, val in opts: + if key == '-i': + _options['--input'] = val + elif key == '-o': + _options['--output'] = val + elif key == '-p': + _options['--publisher'] = val + elif key == '-n': + _options['--note'] = val + elif key == '-A': + _options['--algoma'] = val + elif key == '-L': + _options['--laurentian'] = val + elif key == '-W': + _options['--windsor'] = val + elif key == '-s': + _options['--sample'] = val + elif key == '-h': + _options['--help'] = val + + return _options + +def check_options(options): + """Check the validity of options that were passed in""" + + _help = False + + if '--help' in options: + do_help() + + if '--input' not in options: + print "* Missing -i / --input argument!" + _help = True + + if '--output' not in options: + print "* Missing -o / --output argument!" + _help = True + + if '--publisher' not in options: + print "* Missing -p / --publisher argument!" 
+ _help = True + + _libraries = check_libraries(options) + if len(_libraries.keys()) == 0: + _help = True + + if _help == True: + do_help() + + # Get the input and output files + _input = options['--input'] + _output = options['--output'] + + try: + os.stat(_input) + except OSError: + print("* Cannot read input file %s" % (_input)) + sys.exit(0) + + try: + os.access(os.path.dirname(_output), os.W_OK) + except OSError: + print("* Cannot write to output path %s" % (os.path.dirname(_output))) + sys.exit(0) + + clean_opts = dict() + clean_opts['publisher'] = options['--publisher'] + + if '--sample' in options: + clean_opts['sample'] = options['--sample'] + + if '--note' in options: + clean_opts['note'] = options['--note'] + + clean_opts['libraries'] = _libraries + clean_opts['input'] = _input + clean_opts['output'] = _output + clean_opts['settings'] = Institution() + + return clean_opts + +def check_libraries(options): + """Build a dict of the libraries that were requested for this batch""" + + _libraries = dict() + if '--algoma' in options: + _libraries['algoma'] = True + + if '--laurentian' in options: + _libraries['laurentian'] = True + + if '--windsor' in options: + _libraries['windsor'] = True + + return _libraries + + +def parse_opts(): + """Get command-line arguments from the script""" + try: + _short_opts = 'i:o:p:ALWn:s:h' + _long_opts = ['input=', 'output=', 'publisher=', 'algoma', \ + 'laurentian', 'windsor', 'note=', 'sample=', 'help'] + opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts) + except getopt.GetoptError, ex: + print "* %s" % str(ex) + do_help() + + _options = consolidate_options(opts[0]) + return check_options(_options) + +def process_records(options): + """Converts raw ebook MARC records to Conifer-ready MARC records""" + + sample = '' + reader = pymarc.MARCReader( + open(options['input'], mode='rb'), to_unicode=True + ) + writer = pymarc.MARCWriter(open(options['output'], mode='wb')) + if ('sample' in options): + sample = 
pymarc.MARCWriter(open(options['sample'], mode='wb')) + + cnt = 0 + for record in reader: + cnt = cnt + 1 + try: + if not (record['856'] and record['856']['u']): + print("* No 856 for record # %s in file %s" + % (cnt, options['input']) + ) + + new_record = process_fields(record, options) + + writer.write(new_record) + if (sample and ((cnt == 1) or (cnt % 100 == 0))): + sample.write(new_record) + except Exception, ex: + print("* Error processing record %s - %s" % (cnt, ex)) + +def process_fields(record, options): + """Decide which fields to add, delete, and keep""" + + new_record = pymarc.Record(to_unicode=True, force_utf8=True) + + for field in record.get_fields(): + # Process all of the 856 fields + if field.tag == '856': + new_fields = process_urls(field, options) + if new_fields: + for new_856 in new_fields: + new_record.add_field(new_856) + # Strip out 9xx fields: we don't want local fields in our records + elif field.tag[0] == '9': + pass + # Strip out 300 fields that only contain placeholders + elif field.tag == '300' and field['a'] == 'p. cm.': + pass + else: + new_record.add_field(field) + + add_publisher(record, new_record, options) + + if 'note' in options: + note = pymarc.Field(tag = '590', + indicators = [' ', ' '], + subfields = [ + 'a', options['note'] + ] + ) + new_record.add_field(note) + + add_cat_source(new_record, options) + + return new_record + +def add_publisher(record, new_record, options): + """ + This is a convoluted way to avoid creating a new 710 if we already + have a matching 710 and just need to add the publisher relator code. 
+ """ + + munge_publisher = False + need_publisher = True + need_relator = True + + # Iterate through all of the existing 710 fields + for sten in record.get_fields('710'): + for pub in sten.get_subfields('a'): + if pub == options['publisher']: + munge_publisher = True + for rel in sten.get_subfields('4'): + if rel == 'pbl': + need_publisher = False + need_relator = False + + if munge_publisher and need_relator: + sten.add_subfield('4', 'pbl') + need_publisher = False + + if need_publisher: + # Add the publisher, with relator code + seven_ten = pymarc.Field(tag = '710', + indicators = ['2', ' '], + subfields = [ + 'a', options['publisher'], + '4', 'pbl' + ] + ) + new_record.add_field(seven_ten) + + +def add_cat_source(record, options): + """Add or extend the 040 field to identify the cataloguing source""" + + # Only Windsor wants to do this + if 'windsor' not in options['libraries']: + return + + cat_source = record['040'] + if cat_source: + # Add subfield 'd' identifying Windsor + cat_source.add_subfield('d', 'CaOWA') + else: + # Add a 040 with subfield 'd' identifying Windsor + forty = pymarc.Field(tag = '040', + indicators = [' ', ' '], + subfields = [ 'd', 'CaOWA' ] + ) + record.add_field(forty) + + +def process_urls(field, options): + """Creates 856 fields required by Conifer""" + + new_fields = [] + + if not field['u']: + print "* No subfield 'u' found in this 856" + return None + + # If we have a ToC or author notes or whatever, replace with content + if field['u'].find('.loc.gov') > -1: + enrich = substitute_content(field) + if enrich and isinstance(enrich, pymarc.field.Field): + new_fields.append(enrich) + else: + for lib in options['libraries']: + data = options['settings'].get_settings(lib) + subs = get_subfields(field, data) + eight_five_six = pymarc.Field(tag = '856', + indicators = ['4', '0'], + subfields = subs + ) + new_fields.append(eight_five_six) + + return new_fields + +def substitute_content(field): + """Parses a ToC or author notes URL and 
generates a field""" + + url = field['u'] + + content_field = None + raw_content = '' + + # Skip machine-generated tables of contents + if url.find('/toc/') > -1: + return None + + # Get the data from the supplied URL + try: + req = urllib2.urlopen(url) + raw_content = BeautifulSoup(req.read()) + except urllib2.HTTPError, ex: + print("%s for URL %s" % (ex, url)) + return None + except urllib2.URLError, ex: + print("%s for URL %s" % (ex, url)) + return None + + content = process_loc_data(raw_content) + if not content: + return None + + if url.endswith('-b.html'): + # Biographical note + content_field = pymarc.Field( + tag = '545', + indicators = ['1', ' '], + subfields = ['a', content] + ) + elif url.endswith('-d.html'): + # Summary written by publisher + content_field = pymarc.Field( + tag = '520', + indicators = ['3', ' '], + subfields = ['a', content] + ) + + elif url.endswith('-t.html'): + # Table of contents + content_field = pymarc.Field( + tag = '505', + indicators = [' ', ' '], + subfields = ['a', content] + ) + else: + print("URL %s didn't match known LoC type" % (url)) + + return content_field + +def process_loc_data(raw_content): + """Given the LoC enriched data, make it usable""" + + # Short-circuit if we have an OCRed ToC; the quality is terrible + if raw_content.find(text='Electronic data is machine generated'): + return None + elif raw_content.find('
'):
+        return None
+
+    # Get all of the text after the horizontal rule
+    content = ' '.join(
+        raw_content.find('hr').findAllNext(text=True)
+    ).encode('utf8')
+
+    # Remove linefeeds
+    content = content.replace('\n', ' ')
+    content = content.replace('\r', ' ')
+
+    # Remove inline subject headings to avoid too much indexing boost
+    lcsh = content.find('Library of Congress subject headings')
+    if lcsh > -1:
+        content = content[0:lcsh]
+
+    # Farewell, starting and ending whitespace
+    content = content.strip().decode('utf8')
+
+    return content
+
def get_subfields(field, data):
    """Creates 856 subfields required by Conifer"""

    raw_url = field['u']
    result = []

    # ebrary URLs look like http://site.ebrary.com/lib/<channel>/Doc?id=...;
    # swap <channel> for this library's ebrary channel code. Any other URL
    # simply gets the institutional proxy prefix.
    if '.ebrary.com' in raw_url:
        parts = re.search(r'^(.+?/lib/).+?(/.+?)$', raw_url)
        result += ['u', parts.group(1) + data['ebrary_code'] + parts.group(2)]
    else:
        result += ['u', data['proxy'] + raw_url]

    # Check for a $z as the first 856 subfield; in Springer records, at
    # least, this indicates a multi-volume set that requires keeping the $z
    if field.subfields[0] == 'z':
        result += [field.subfields[0], field.subfields[1]]

    # Preferred link text and the owning-institution code
    result += [
        'y', data['link_text'],
        '9', data['code']
    ]

    return result
+
+
if __name__ == '__main__':
    # Script entry point: validate the command-line options, then rewrite
    # the input MARC records for loading into Conifer.
    process_records(parse_opts())
-- 
2.11.0