--- /dev/null
+#!/usr/bin/env python
+"""
+Prepare sets of electronic resource MARC records for loading into Evergreen
+
+To avoid duplicating MARC records in Conifer, to minimize manual labour,
+and to make records as consistent as possible, we want to automate the
+processing of electronic resource MARC records purchased by two or more
+Conifer institutions.
+
+Each institution must confirm the standard data they require to be added
+to e-book MARC records. The principle here is to identify standard
+requirements that would be the same for each record and therefore can
+be accommodated in batch load.
+"""
+
+import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2
+from BeautifulSoup import BeautifulSoup
+
+class Institution():
+ """Defines standard settings for each Conifer institution"""
+
+ def __init__(self):
+ """Initialize the Institution object"""
+ self.algoma = { \
+ "code": "OSTMA", \
+ "ebrary_code": "algomauca", \
+ "proxy": "http://libproxy.auc.ca/login?url=", \
+ "link_text": "Available online" \
+ }
+
+ self.laurentian = { \
+ "code": "OSUL", \
+ "ebrary_code": "jndlu", \
+ "proxy": "https://librweb.laurentian.ca/login?url=", \
+ "link_text": "Available online / disponible en ligne" \
+ }
+
+ self.windsor = { \
+ "code": "OWA", \
+ "ebrary_code": "oculwindsor", \
+ "proxy": "http://ezproxy.uwindsor.ca/login?url=", \
+ "link_text": "To view Windsor's electronic resource click here." \
+ }
+
+ def get_settings(self, lib):
+ """Return the settings for a library by name"""
+ return getattr(self, lib)
+
+
+def do_help():
+ '''
+ Print help for the Conifer ebook MARC processor
+ '''
+
+ print '''
+Conifer ebook MARC processor
+
+This script takes a set of MARC records and processes them to generate a set
+of MARC records ready for loading into the Conifer consortial library
+system. The processing consists of taking the existing 856 field and creating
+one or more new 856 fields for each Conifer institution that should have access
+to these resources.
+
+The script customizes the following aspects of each record:
+
+ * Adds one 856 per institution specified at the command line:
+ * $u (URL) - prepends the institutional proxy and, for eBrary records,
+ changes the insitutional code
+ * $y (link text) - sets preferred text of the link to the resource
+ * $z (public note) - sets public note for the resource
+
+ * Adds a 710 field to identify the publisher using the value specified
+ at the command line
+ * Adds a 590 internal note field using the value specified at the command
+ line.
+
+Required arguments:
+ -i / --input : The name of the input MARC file.
+
+ -o / --output : The name of the output MARC file.
+
+ -p / --publisher : The name of the publisher to be inserted in a 710 field.
+
+ -A / --algoma: Add an 856 for Algoma University
+
+ -L / --laurentian: Add an 856 for Laurentian University
+
+ -W / --windsor : Add an 856 for University of Windsor
+
+Optional arguments:
+ -n / --note : The text of the internal note to be inserted into a 590 field.
+
+ -s / --sample : The name of the sample output MARC file (generates
+ 1 sample record for every 100 records processed)
+
+ -h / --help : Prints help message
+
+Examples:
+ %s --algoma --windsor -i crkn.mrc -o /tmp/crkn_out.mrc -p "eBrary Inc."
+ ''' % sys.argv[0]
+ sys.exit(0)
+
+def consolidate_options(opts):
+ """Make long arguments the standard form in command line options"""
+
+ _options = dict(opts)
+
+ for key, val in opts:
+ if key == '-i':
+ _options['--input'] = val
+ elif key == '-o':
+ _options['--output'] = val
+ elif key == '-p':
+ _options['--publisher'] = val
+ elif key == '-n':
+ _options['--note'] = val
+ elif key == '-A':
+ _options['--algoma'] = val
+ elif key == '-L':
+ _options['--laurentian'] = val
+ elif key == '-W':
+ _options['--windsor'] = val
+ elif key == '-s':
+ _options['--sample'] = val
+ elif key == '-h':
+ _options['--help'] = val
+
+ return _options
+
+def check_options(options):
+ """Check the validity of options that were passed in"""
+
+ _help = False
+
+ if '--help' in options:
+ do_help()
+
+ if '--input' not in options:
+ print "* Missing -i / --input argument!"
+ _help = True
+
+ if '--output' not in options:
+ print "* Missing -o / --output argument!"
+ _help = True
+
+ if '--publisher' not in options:
+ print "* Missing -p / --publisher argument!"
+ _help = True
+
+ _libraries = check_libraries(options)
+ if len(_libraries.keys()) == 0:
+ _help = True
+
+ if _help == True:
+ do_help()
+
+ # Get the input and output files
+ _input = options['--input']
+ _output = options['--output']
+
+ try:
+ os.stat(_input)
+ except OSError:
+ print("* Cannot read input file %s" % (_input))
+ sys.exit(0)
+
+ try:
+ os.access(os.path.dirname(_output), os.W_OK)
+ except OSError:
+ print("* Cannot write to output path %s" % (os.path.dirname(_output)))
+ sys.exit(0)
+
+ clean_opts = dict()
+ clean_opts['publisher'] = options['--publisher']
+
+ if '--sample' in options:
+ clean_opts['sample'] = options['--sample']
+
+ if '--note' in options:
+ clean_opts['note'] = options['--note']
+
+ clean_opts['libraries'] = _libraries
+ clean_opts['input'] = _input
+ clean_opts['output'] = _output
+ clean_opts['settings'] = Institution()
+
+ return clean_opts
+
+def check_libraries(options):
+ """Build a dict of the libraries that were requested for this batch"""
+
+ _libraries = dict()
+ if '--algoma' in options:
+ _libraries['algoma'] = True
+
+ if '--laurentian' in options:
+ _libraries['laurentian'] = True
+
+ if '--windsor' in options:
+ _libraries['windsor'] = True
+
+ return _libraries
+
+
+def parse_opts():
+ """Get command-line arguments from the script"""
+ try:
+ _short_opts = 'i:o:p:ALWn:s:h'
+ _long_opts = ['input=', 'output=', 'publisher=', 'algoma', \
+ 'laurentian', 'windsor', 'note=', 'sample=', 'help']
+ opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts)
+ except getopt.GetoptError, ex:
+ print "* %s" % str(ex)
+ do_help()
+
+ _options = consolidate_options(opts[0])
+ return check_options(_options)
+
+def process_records(options):
+ """Converts raw ebook MARC records to Conifer-ready MARC records"""
+
+ sample = ''
+ reader = pymarc.MARCReader(
+ open(options['input'], mode='rb'), to_unicode=True
+ )
+ writer = pymarc.MARCWriter(open(options['output'], mode='wb'))
+ if ('sample' in options):
+ sample = pymarc.MARCWriter(open(options['sample'], mode='wb'))
+
+ cnt = 0
+ for record in reader:
+ cnt = cnt + 1
+ try:
+ if not (record['856'] and record['856']['u']):
+ print("* No 856 for record # %s in file %s"
+ % (cnt, options['input'])
+ )
+
+ new_record = process_fields(record, options)
+
+ writer.write(new_record)
+ if (sample and ((cnt == 1) or (cnt % 100 == 0))):
+ sample.write(new_record)
+ except Exception, ex:
+ print("* Error processing record %s - %s" % (cnt, ex))
+
+def process_fields(record, options):
+ """Decide which fields to add, delete, and keep"""
+
+ new_record = pymarc.Record(to_unicode=True, force_utf8=True)
+
+ for field in record.get_fields():
+ # Process all of the 856 fields
+ if field.tag == '856':
+ new_fields = process_urls(field, options)
+ if new_fields:
+ for new_856 in new_fields:
+ new_record.add_field(new_856)
+ # Strip out 9xx fields: we don't want local fields in our records
+ elif field.tag[0] == '9':
+ pass
+ # Strip out 300 fields that only contain placeholders
+ elif field.tag == '300' and field['a'] == 'p. cm.':
+ pass
+ else:
+ new_record.add_field(field)
+
+ add_publisher(record, new_record, options)
+
+ if 'note' in options:
+ note = pymarc.Field(tag = '590',
+ indicators = [' ', ' '],
+ subfields = [
+ 'a', options['note']
+ ]
+ )
+ new_record.add_field(note)
+
+ add_cat_source(new_record, options)
+
+ return new_record
+
+def add_publisher(record, new_record, options):
+    """
+    This is a convoluted way to avoid creating a new 710 if we already
+    have a matching 710 and just need to add the publisher relator code.
+
+    Mutates a matching 710 on the original `record` in place (adding a
+    $4 'pbl' relator); only appends a brand-new 710 to `new_record`
+    when no existing 710 covers the requested publisher.
+    """
+
+    munge_publisher = False   # saw a 710 $a equal to the requested publisher
+    need_publisher = True     # still need to emit a brand-new 710
+    need_relator = True       # no 710 carries the $4 'pbl' relator yet
+
+    # Iterate through all of the existing 710 fields
+    for sten in record.get_fields('710'):
+        for pub in sten.get_subfields('a'):
+            if pub == options['publisher']:
+                munge_publisher = True
+        for rel in sten.get_subfields('4'):
+            if rel == 'pbl':
+                need_publisher = False
+                need_relator = False
+
+        # NOTE(review): munge_publisher is never reset between 710s, so
+        # once any earlier 710 matched the publisher, a later 710 without
+        # a matching $a can also receive the relator here -- confirm
+        # this is the intended behavior
+        if munge_publisher and need_relator:
+            sten.add_subfield('4', 'pbl')
+            need_publisher = False
+
+    if need_publisher:
+        # Add the publisher, with relator code
+        seven_ten = pymarc.Field(tag = '710',
+            indicators = ['2', ' '],
+            subfields = [
+                'a', options['publisher'],
+                '4', 'pbl'
+            ]
+        )
+        new_record.add_field(seven_ten)
+
+
+def add_cat_source(record, options):
+ """Add or extend the 040 field to identify the cataloguing source"""
+
+ # Only Windsor wants to do this
+ if 'windsor' not in options['libraries']:
+ return
+
+ cat_source = record['040']
+ if cat_source:
+ # Add subfield 'd' identifying Windsor
+ cat_source.add_subfield('d', 'CaOWA')
+ else:
+ # Add a 040 with subfield 'd' identifying Windsor
+ forty = pymarc.Field(tag = '040',
+ indicators = [' ', ' '],
+ subfields = [ 'd', 'CaOWA' ]
+ )
+ record.add_field(forty)
+
+
+def process_urls(field, options):
+ """Creates 856 fields required by Conifer"""
+
+ new_fields = []
+
+ if not field['u']:
+ print "* No subfield 'u' found in this 856"
+ return None
+
+ # If we have a ToC or author notes or whatever, replace with content
+ if field['u'].find('.loc.gov') > -1:
+ enrich = substitute_content(field)
+ if enrich and isinstance(enrich, pymarc.field.Field):
+ new_fields.append(enrich)
+ else:
+ for lib in options['libraries']:
+ data = options['settings'].get_settings(lib)
+ subs = get_subfields(field, data)
+ eight_five_six = pymarc.Field(tag = '856',
+ indicators = ['4', '0'],
+ subfields = subs
+ )
+ new_fields.append(eight_five_six)
+
+ return new_fields
+
+def substitute_content(field):
+ """Parses a ToC or author notes URL and generates a field"""
+
+ url = field['u']
+
+ content_field = None
+ raw_content = ''
+
+ # Skip machine-generated tables of contents
+ if url.find('/toc/') > -1:
+ return None
+
+ # Get the data from the supplied URL
+ try:
+ req = urllib2.urlopen(url)
+ raw_content = BeautifulSoup(req.read())
+ except urllib2.HTTPError, ex:
+ print("%s for URL %s" % (ex, url))
+ return None
+ except urllib2.URLError, ex:
+ print("%s for URL %s" % (ex, url))
+ return None
+
+ content = process_loc_data(raw_content)
+ if not content:
+ return None
+
+ if url.endswith('-b.html'):
+ # Biographical note
+ content_field = pymarc.Field(
+ tag = '545',
+ indicators = ['1', ' '],
+ subfields = ['a', content]
+ )
+ elif url.endswith('-d.html'):
+ # Summary written by publisher
+ content_field = pymarc.Field(
+ tag = '520',
+ indicators = ['3', ' '],
+ subfields = ['a', content]
+ )
+
+ elif url.endswith('-t.html'):
+ # Table of contents
+ content_field = pymarc.Field(
+ tag = '505',
+ indicators = [' ', ' '],
+ subfields = ['a', content]
+ )
+ else:
+ print("URL %s didn't match known LoC type" % (url))
+
+ return content_field
+
+def process_loc_data(raw_content):
+ """Given the LoC enriched data, make it usable"""
+
+ # Short-circuit if we have an OCRed ToC; the quality is terrible
+ if raw_content.find(text='Electronic data is machine generated'):
+ return None
+ elif raw_content.find('<pre>'):
+ return None
+
+ # Get all of the text after the horizontal rule
+ content = ' '.join(
+ raw_content.find('hr').findAllNext(text=True)
+ ).encode('utf8')
+
+ # Remove linefeeds
+ content = content.replace('\n', ' ')
+ content = content.replace('\r', ' ')
+
+ # Remove inline subject headings to avoid too much indexing boost
+ lcsh = content.find('Library of Congress subject headings')
+ if lcsh > -1:
+ content = content[0:lcsh]
+
+ # Farewell, starting and ending whitespace
+ content = content.strip().decode('utf8')
+
+ return content
+
+def get_subfields(field, data):
+ """Creates 856 subfields required by Conifer"""
+
+ subs = []
+ url = field['u']
+
+ # Is this an ebrary URL?
+ ebrary = False
+ if url.find('.ebrary.com') > -1:
+ ebrary = True
+
+ # ebrary URLs look like: http://site.ebrary.com/lib/<channel>/Doc?id=2001019
+ # we need to replace <channel> with the library-specific channel
+ if ebrary:
+ ebrary_url = re.search(r'^(.+?/lib/).+?(/.+?)$', url)
+ url = ebrary_url.group(1) + data['ebrary_code'] + ebrary_url.group(2)
+ subs.extend(['u', url])
+ else:
+ subs.extend(['u', data['proxy'] + field['u']])
+
+ # Check for a $z as the first 856; in Springer records, at least, this
+ # indicates a multi-volume set that requires keeping the $z around
+ if field.subfields[0] == 'z':
+ subs.extend([field.subfields[0], field.subfields[1]])
+
+ subs.extend([
+ 'y', data['link_text'],
+ '9', data['code']
+ ])
+
+ return subs
+
+
+if __name__ == '__main__':
+
+    # Parse and validate the command line, then convert the input MARC file
+    process_records(parse_opts())