Add initial version of eresource processing script
author: Dan Scott <dscott@laurentian.ca>
    Wed, 8 May 2013 13:41:09 +0000 (09:41 -0400)
committer: Dan Scott <dscott@laurentian.ca>
    Wed, 8 May 2013 13:41:31 +0000 (09:41 -0400)
Signed-off-by: Dan Scott <dscott@laurentian.ca>
tools/ebooks/prep_ebook_records.py [new file with mode: 0644]

diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py
new file mode 100644 (file)
index 0000000..ff72922
--- /dev/null
@@ -0,0 +1,482 @@
+#!/usr/bin/env python
+"""
+Prepare sets of electronic resource MARC records for loading into Evergreen
+
+To avoid duplicating MARC records in Conifer, to minimize manual labour,
+and to make records as consistent as possible, we want to automate the
+processing of electronic resource MARC records purchased by two or more
+Conifer institutions.
+
+Each institution must confirm the standard data they require to be added
+to e-book MARC records. The principle here is to identify standard
+requirements that would be the same for each record and therefore can
+be accommodated in batch load.
+"""
+
+import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2
+from BeautifulSoup import BeautifulSoup
+
class Institution(object):
    """Defines standard settings for each Conifer institution.

    Each institution is a dict with the keys:
      * code        - Conifer institution code (goes into 856 $9)
      * ebrary_code - institution-specific ebrary channel name
      * proxy       - proxy prefix prepended to non-ebrary URLs
      * link_text   - preferred 856 $y link text
    """

    def __init__(self):
        """Initialize the per-institution settings."""
        # Bracketed literals continue implicitly across lines, so the
        # original trailing backslashes were unnecessary.
        self.algoma = {
            "code": "OSTMA",
            "ebrary_code": "algomauca",
            "proxy": "http://libproxy.auc.ca/login?url=",
            "link_text": "Available online"
        }

        self.laurentian = {
            "code": "OSUL",
            "ebrary_code": "jndlu",
            "proxy": "https://librweb.laurentian.ca/login?url=",
            "link_text": "Available online / disponible en ligne"
        }

        self.windsor = {
            "code": "OWA",
            "ebrary_code": "oculwindsor",
            "proxy": "http://ezproxy.uwindsor.ca/login?url=",
            "link_text": "To view Windsor's electronic resource click here."
        }

    def get_settings(self, lib):
        """Return the settings dict for a library by name.

        Raises AttributeError for an unknown library name.
        """
        return getattr(self, lib)
+    
+
+def do_help():
+    '''
+    Print help for the Conifer ebook MARC processor
+    '''
+
+    print '''
+Conifer ebook MARC processor
+
+This script takes a set of MARC records and processes them to generate a set
+of MARC records ready for loading into the Conifer consortial library
+system. The processing consists of taking the existing 856 field and creating
+one or more new 856 fields for each Conifer institution that should have access
+to these resources.
+
+The script customizes the following aspects of each record:
+
+  * Adds one 856 per institution specified at the command line:
+      * $u (URL) - prepends the institutional proxy and, for eBrary records,
+        changes the insitutional code
+      * $y (link text) - sets preferred text of the link to the resource
+      * $z (public note) - sets public note for the resource
+
+  * Adds a 710 field to identify the publisher using the value specified
+    at the command line
+  * Adds a 590 internal note field using the value specified at the command
+    line.
+
+Required arguments:
+    -i / --input : The name of the input MARC file.
+
+    -o / --output : The name of the output MARC file.
+
+    -p / --publisher : The name of the publisher to be inserted in a 710 field.
+
+    -A / --algoma: Add an 856 for Algoma University
+
+    -L / --laurentian: Add an 856 for Laurentian University
+
+    -W / --windsor : Add an 856 for University of Windsor
+
+Optional arguments:
+    -n / --note : The text of the internal note to be inserted into a 590 field.
+
+    -s / --sample : The name of the sample output MARC file (generates
+                    1 sample record for every 100 records processed)
+
+    -h / --help : Prints help message
+
+Examples:
+    %s --algoma --windsor -i crkn.mrc -o /tmp/crkn_out.mrc -p "eBrary Inc."
+    ''' % sys.argv[0]
+    sys.exit(0)
+
def consolidate_options(opts):
    """Make long arguments the standard form in command line options"""

    # Canonical long form for every supported short option
    short_to_long = {
        '-i': '--input',
        '-o': '--output',
        '-p': '--publisher',
        '-n': '--note',
        '-A': '--algoma',
        '-L': '--laurentian',
        '-W': '--windsor',
        '-s': '--sample',
        '-h': '--help',
    }

    _options = dict(opts)

    # Register each short option's value under its long name as well,
    # so later code only has to look for the long form
    for key, val in opts:
        if key in short_to_long:
            _options[short_to_long[key]] = val

    return _options
+
+def check_options(options):
+    """Check the validity of options that were passed in"""
+
+    _help = False
+
+    if '--help' in options:
+        do_help()
+
+    if '--input' not in options:
+        print "* Missing -i / --input argument!"
+        _help = True
+
+    if '--output' not in options:
+        print "* Missing -o / --output argument!"
+        _help = True
+
+    if '--publisher' not in options:
+        print "* Missing -p / --publisher argument!"
+        _help = True
+
+    _libraries = check_libraries(options)
+    if len(_libraries.keys()) == 0:
+        _help = True
+
+    if _help == True:
+        do_help()
+
+    # Get the input and output files
+    _input = options['--input']
+    _output = options['--output']
+
+    try:
+        os.stat(_input)
+    except OSError:
+        print("* Cannot read input file %s" % (_input))
+        sys.exit(0)
+
+    try:
+        os.access(os.path.dirname(_output), os.W_OK)
+    except OSError:
+        print("* Cannot write to output path %s" % (os.path.dirname(_output)))
+        sys.exit(0)
+
+    clean_opts = dict()
+    clean_opts['publisher'] = options['--publisher']
+
+    if '--sample' in options:
+        clean_opts['sample'] = options['--sample']
+
+    if '--note' in options:
+        clean_opts['note'] = options['--note']
+
+    clean_opts['libraries'] = _libraries
+    clean_opts['input'] = _input
+    clean_opts['output'] = _output
+    clean_opts['settings'] = Institution()
+
+    return clean_opts
+
def check_libraries(options):
    """Build a dict of the libraries that were requested for this batch"""

    # A library participates when its long option appears on the
    # command line; the value is always True for requested libraries
    requested = {}
    for lib in ('algoma', 'laurentian', 'windsor'):
        if ('--' + lib) in options:
            requested[lib] = True

    return requested
+
+
+def parse_opts():
+    """Get command-line arguments from the script"""
+    try:
+        _short_opts = 'i:o:p:ALWn:s:h'
+        _long_opts = ['input=', 'output=', 'publisher=', 'algoma', \
+                'laurentian', 'windsor', 'note=', 'sample=', 'help']
+        opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts) 
+    except getopt.GetoptError, ex:
+        print "* %s" % str(ex)
+        do_help()
+
+    _options = consolidate_options(opts[0])
+    return check_options(_options)    
+
def process_records(options):
    """Converts raw ebook MARC records to Conifer-ready MARC records.

    Reads MARC from options['input'], writes the transformed records to
    options['output'], and optionally writes the first and every 100th
    record to options['sample']. A record that fails to process is
    reported and skipped rather than aborting the whole batch.
    """

    # Use None (not '') as the "no sample requested" sentinel
    sample = None
    reader = pymarc.MARCReader(
        open(options['input'], mode='rb'), to_unicode=True
    )
    writer = pymarc.MARCWriter(open(options['output'], mode='wb'))
    if 'sample' in options:
        sample = pymarc.MARCWriter(open(options['sample'], mode='wb'))

    try:
        cnt = 0
        for record in reader:
            cnt = cnt + 1
            try:
                # Warn about records lacking a URL, but keep processing them
                if not (record['856'] and record['856']['u']):
                    print("* No 856 for record # %s in file %s"
                            % (cnt, options['input'])
                    )

                new_record = process_fields(record, options)

                writer.write(new_record)
                if sample and ((cnt == 1) or (cnt % 100 == 0)):
                    sample.write(new_record)
            except Exception as ex:
                print("* Error processing record %s - %s" % (cnt, ex))
    finally:
        # BUG FIX: the original never closed the output writers, risking
        # unflushed/truncated MARC output; close them even on error
        writer.close()
        if sample:
            sample.close()
+
def process_fields(record, options):
    """Decide which fields to add, delete, and keep"""

    new_record = pymarc.Record(to_unicode=True, force_utf8=True)

    for field in record.get_fields():
        tag = field.tag
        # 856 fields are replaced by one or more institution-specific 856s
        if tag == '856':
            for new_856 in (process_urls(field, options) or []):
                new_record.add_field(new_856)
            continue
        # Strip out 9xx fields: we don't want local fields in our records
        if tag.startswith('9'):
            continue
        # Strip out 300 fields that only contain placeholders
        if tag == '300' and field['a'] == 'p. cm.':
            continue
        # Everything else is carried over unchanged
        new_record.add_field(field)

    add_publisher(record, new_record, options)

    # The optional internal note becomes a 590 field
    if 'note' in options:
        new_record.add_field(pymarc.Field(tag = '590',
            indicators = [' ', ' '],
            subfields = ['a', options['note']]
        ))

    add_cat_source(new_record, options)

    return new_record
+
def add_publisher(record, new_record, options):
    """
    This is a convoluted way to avoid creating a new 710 if we already
    have a matching 710 and just need to add the publisher relator code.

    record     -- the original record, whose 710 fields are inspected and
                  possibly modified in place (by adding a $4 'pbl')
    new_record -- the record being built; receives a brand-new 710 only
                  when no existing 710 $a matches options['publisher']

    NOTE: process_fields() copies field objects from record into
    new_record by reference, so the in-place $4 addition below is
    visible in new_record as well.
    """

    # munge_publisher: some existing 710 $a matches the publisher name
    # need_publisher:  no suitable 710 found yet; create one at the end
    # need_relator:    matching 710 exists but lacks the $4 'pbl' code
    munge_publisher = False
    need_publisher = True
    need_relator = True

    # Iterate through all of the existing 710 fields
    for sten in record.get_fields('710'):
        for pub in sten.get_subfields('a'):
            if pub == options['publisher']:
                munge_publisher = True
                # Does this matching 710 already carry the publisher
                # relator code?
                for rel in sten.get_subfields('4'):
                    if rel == 'pbl':
                        need_publisher = False
                        need_relator = False

                if munge_publisher and need_relator:
                    # Matching 710 without a relator code: add $4 'pbl'
                    # in place rather than creating a duplicate 710
                    sten.add_subfield('4', 'pbl')
                    need_publisher = False

    if need_publisher:
        # Add the publisher, with relator code
        seven_ten = pymarc.Field(tag = '710',
            indicators = ['2', ' '],
            subfields = [
                'a', options['publisher'],
                '4', 'pbl'
            ]
        )
        new_record.add_field(seven_ten)
+
+
def add_cat_source(record, options):
    """Add or extend the 040 field to identify the cataloguing source"""

    # Only Windsor wants to do this
    if 'windsor' not in options['libraries']:
        return

    existing = record['040']
    if not existing:
        # No 040 yet: create one whose subfield 'd' identifies Windsor
        record.add_field(pymarc.Field(tag = '040',
            indicators = [' ', ' '],
            subfields = ['d', 'CaOWA']
        ))
    else:
        # Extend the existing 040 with subfield 'd' identifying Windsor
        existing.add_subfield('d', 'CaOWA')
+
+
+def process_urls(field, options):
+    """Creates 856 fields required by Conifer"""
+
+    new_fields = []
+
+    if not field['u']:
+        print "* No subfield 'u' found in this 856"
+        return None
+
+    # If we have a ToC or author notes or whatever, replace with content
+    if field['u'].find('.loc.gov') > -1:
+        enrich = substitute_content(field)
+        if enrich and isinstance(enrich, pymarc.field.Field):
+            new_fields.append(enrich)
+    else:
+        for lib in options['libraries']:
+            data = options['settings'].get_settings(lib)
+            subs = get_subfields(field, data)
+            eight_five_six = pymarc.Field(tag = '856',
+                indicators = ['4', '0'],
+                subfields = subs 
+            )
+            new_fields.append(eight_five_six)
+
+    return new_fields
+
def substitute_content(field):
    """Parses a ToC or author notes URL and generates a field.

    Fetches the enriched-content page named in the 856 $u, extracts its
    text, and wraps it in the MARC field matching the LoC URL suffix.
    Returns None for machine-generated ToCs, fetch failures, empty
    content, or an unrecognized URL type.
    """

    url = field['u']

    # Skip machine-generated tables of contents
    if url.find('/toc/') > -1:
        return None

    # Get the data from the supplied URL
    try:
        req = urllib2.urlopen(url)
        raw_content = BeautifulSoup(req.read())
    # 'as' form is valid on Python 2.6+ and 3; both error classes get
    # the same report-and-skip treatment
    except (urllib2.HTTPError, urllib2.URLError) as ex:
        print("%s for URL %s" % (ex, url))
        return None

    content = process_loc_data(raw_content)
    if not content:
        return None

    # Map each LoC URL suffix to its MARC (tag, indicators):
    #   -b = biographical note (545), -d = publisher summary (520),
    #   -t = table of contents (505)
    loc_types = {
        '-b.html': ('545', ['1', ' ']),
        '-d.html': ('520', ['3', ' ']),
        '-t.html': ('505', [' ', ' ']),
    }
    for suffix, (tag, indicators) in loc_types.items():
        if url.endswith(suffix):
            return pymarc.Field(
                tag = tag,
                indicators = indicators,
                subfields = ['a', content]
            )

    print("URL %s didn't match known LoC type" % (url))
    return None
+
def process_loc_data(raw_content):
    """Given the LoC enriched data, make it usable.

    raw_content is the BeautifulSoup parse of an LoC enrichment page.
    Returns the cleaned text that follows the horizontal rule, or None
    when the page holds only low-quality OCRed content.
    """

    # Short-circuit if we have an OCRed ToC; the quality is terrible.
    if raw_content.find(text='Electronic data is machine generated'):
        return None
    # BUG FIX: BeautifulSoup matches tags by bare name ('pre'), so the
    # original find('<pre>') could never match and this guard was dead.
    elif raw_content.find('pre'):
        return None

    # Get all of the text after the horizontal rule. The NavigableString
    # pieces are unicode, so no encode/decode round-trip is needed; the
    # replace/find/strip calls below all operate on unicode directly.
    content = ' '.join(
        raw_content.find('hr').findAllNext(text=True)
    )

    # Remove linefeeds
    content = content.replace('\n', ' ')
    content = content.replace('\r', ' ')

    # Remove inline subject headings to avoid too much indexing boost
    lcsh = content.find('Library of Congress subject headings')
    if lcsh > -1:
        content = content[0:lcsh]

    # Farewell, starting and ending whitespace
    content = content.strip()

    return content
+
def get_subfields(field, data):
    """Creates 856 subfields required by Conifer"""

    url = field['u']
    subs = []

    # ebrary URLs look like: http://site.ebrary.com/lib/<channel>/Doc?id=2001019
    # we need to replace <channel> with the library-specific channel
    if url.find('.ebrary.com') > -1:
        match = re.search(r'^(.+?/lib/).+?(/.+?)$', url)
        rewritten = match.group(1) + data['ebrary_code'] + match.group(2)
        subs.extend(['u', rewritten])
    else:
        # Non-ebrary URLs just get the institutional proxy prefix
        subs.extend(['u', data['proxy'] + field['u']])

    # Check for a $z as the first 856; in Springer records, at least, this
    # indicates a multi-volume set that requires keeping the $z around
    if field.subfields[0] == 'z':
        subs.extend(field.subfields[0:2])

    subs.extend([
        'y', data['link_text'],
        '9', data['code']
    ])

    return subs
+
+
if __name__ == '__main__':
    # Parse and validate command-line options, then convert the batch
    batch_options = parse_opts()
    process_records(batch_options)