#!/usr/bin/env python
#
# tools/ebooks/prep_ebook_records.py
#
# Provenance (from the patch that introduced this file):
#   commit 39dfd0c590cf4d5d2a05d8706797b7d94bbc6747
#   From: dbs
#   Date: Fri, 19 Nov 2010 22:33:03 +0000
#   Subject: [PATCH] Close to working ebook MARC record processing script
#
#   Needs some massive refactoring for a smarter, more object-oriented
#   approach. But some days you just need to bash out something that mostly
#   works.
#
#   Also needs the ebrary URL distinction and Algoma's settings.
#
#   git-svn-id: svn://svn.open-ils.org/ILS-Contrib/conifer/branches/rel_1_6_1@1076 6d9bc8c9-1ec2-4278-b937-99fde70a366f
"""Prepare vendor ebook MARC records for loading into Conifer.

For every input record the first 856 field is replaced by one 856 per
selected institution (proxy-prefixed URL, link text, public note and owning
library code), a 710 field naming the publisher is appended and, optionally,
a 590 internal note is appended.
"""

import codecs  # noqa: F401 -- unused here; kept from the original import list
import getopt
import os
import os.path
import sys

# pymarc is only needed once records are actually processed.  Guarding the
# import lets --help and argument validation work on a machine without it;
# process_records() reports the missing dependency explicitly.
try:
    import pymarc
    import pymarc.marc8  # noqa: F401 -- kept from the original import list
except ImportError:
    pymarc = None


class Institution(object):
    """Defines standard settings for each Conifer institution.

    Each institution is exposed as an instance attribute (``algoma``,
    ``laurentian``, ``windsor``) holding a dict with the keys ``code``,
    ``ebrary_code``, ``proxy``, ``public_note`` and ``link_text``.
    """

    def __init__(self):
        """Initialize the Institution object."""
        # Algoma's values are still placeholders, as in the original commit.
        self.algoma = {
            "code": "OSTMA",
            "ebrary_code": "XXX",
            "proxy": "XXX",
            "public_note": "XXX",
            "link_text": "XXX",
        }

        self.laurentian = {
            "code": "OSUL",
            "ebrary_code": "jndlu",
            "proxy": "https://librweb.laurentian.ca/login?url=",
            "public_note": "Available online / disponible en ligne",
            "link_text": "Available online / disponible en ligne",
        }

        self.windsor = {
            "code": "OWA",
            "ebrary_code": "oculwindsor",
            "proxy": "http://ezproxy.uwindsor.ca/login?url=",
            "public_note": "To view Windsor's electronic resource click here.",
            "link_text": "To view Windsor's electronic resource click here.",
        }

    def get_settings(self, lib):
        """Return the settings dict for a library by attribute name.

        Raises AttributeError if ``lib`` is not a known institution.
        """
        return getattr(self, lib)


def do_help(exit_code=0):
    """Print help for the Conifer ebook MARCXML processor, then exit.

    exit_code -- process exit status; defaults to 0 (help was requested).
                 Callers reporting a usage error pass a non-zero value.
    """

    print('''
Conifer ebook MARCXML processor

This script takes a set of MARCXML records and processes them to generate a set
of MARCXML records ready for loading into the Conifer consortial library
system. The processing consists of taking the existing 856 field and creating
one or more new 856 fields for each Conifer institution that should have access
to these resources.

The script customizes the following aspects of each record:

  * Adds one 856 per institution specified at the command line:
    * $u (URL) - prepends the institutional proxy and, for eBrary records,
      changes the insitutional code
    * $y (link text) - sets preferred text of the link to the resource
    * $z (public note) - sets public note for the resource

  * Adds a 710 field to identify the publisher using the value specified
    at the command line
  * Adds a 590 internal note field using the value specified at the command
    line.

Required arguments:
    -i / --input : The name of the input MARCXML file.

    -o / --output : The name of the output MARCXML file.

    -p / --publisher : The name of the publisher to be inserted in a 710 field.

    -A / --algoma: Add an 856 for Algoma University

    -L / --laurentian: Add an 856 for Laurentian University

    -W / --windsor : Add an 856 for University of Windsor

Optional arguments:
    -n / --note : The text of the internal note to be inserted into a 590 field.

    -h / --help : Prints help message

Examples:
    %s --algoma --windsor -i crkn.xml -o /tmp/crkn_out.xml -p "eBrary Inc."
    ''' % sys.argv[0])
    sys.exit(exit_code)


def consolidate_options(opts):
    """Make long arguments the standard form in command line options.

    opts -- list of (option, value) pairs as returned by getopt.getopt().

    Returns a dict containing every pair from ``opts`` plus, for each short
    option, an entry under the equivalent long option name.
    """

    short_to_long = {
        '-i': '--input',
        '-o': '--output',
        '-p': '--publisher',
        '-n': '--note',
        '-A': '--algoma',
        '-L': '--laurentian',
        '-W': '--windsor',
        '-h': '--help',
    }

    _options = dict(opts)
    for key, val in opts:
        if key in short_to_long:
            _options[short_to_long[key]] = val

    return _options


def check_options(options):
    """Check the validity of options that were passed in.

    options -- dict produced by consolidate_options().

    Returns a dict with the keys 'publisher', 'libraries', 'input', 'output',
    'settings' and, when supplied, 'note'.  Prints a message and exits with a
    non-zero status when a required option is missing, the input file cannot
    be read, or the output directory is not writable.
    """

    if '--help' in options:
        do_help()

    problems = []
    required = (('--input', '-i'), ('--output', '-o'), ('--publisher', '-p'))
    for long_opt, short_opt in required:
        if long_opt not in options:
            problems.append("* Missing %s / %s argument!" % (short_opt, long_opt))

    _libraries = dict(
        (lib, True)
        for lib in ('algoma', 'laurentian', 'windsor')
        if '--' + lib in options
    )
    if not _libraries:
        problems.append("* Must specify at least one of -A / --algoma, "
                        "-L / --laurentian, -W / --windsor!")

    if problems:
        for problem in problems:
            print(problem)
        do_help(1)

    # Get the input and output files
    _input = options['--input']
    _output = options['--output']

    if not (os.path.isfile(_input) and os.access(_input, os.R_OK)):
        print("* Cannot read input file %s" % (_input))
        sys.exit(1)

    # os.access() reports failure through its return value, not an
    # exception, and a bare file name has an empty dirname meaning "here".
    _output_dir = os.path.dirname(_output) or '.'
    if not os.access(_output_dir, os.W_OK):
        print("* Cannot write to output path %s" % (_output_dir))
        sys.exit(1)

    clean_opts = dict()
    clean_opts['publisher'] = options['--publisher']

    if '--note' in options:
        clean_opts['note'] = options['--note']

    clean_opts['libraries'] = _libraries
    clean_opts['input'] = _input
    clean_opts['output'] = _output
    clean_opts['settings'] = Institution()

    return clean_opts


def parse_opts():
    """Get command-line arguments from the script.

    Returns the validated options dict from check_options(); exits with the
    help text when the command line cannot be parsed.
    """
    _short_opts = 'i:o:p:ALWn:h'
    _long_opts = ['input=', 'output=', 'publisher=', 'algoma',
                  'laurentian', 'windsor', 'note=', 'help']
    try:
        opts, _args = getopt.getopt(sys.argv[1:], _short_opts, _long_opts)
    except getopt.GetoptError as ex:
        print("* %s" % str(ex))
        do_help(1)

    _options = consolidate_options(opts)
    return check_options(_options)


def _convert_record(record, cnt, options):
    """Build the Conifer-ready copy of one source record.

    record  -- source pymarc record.
    cnt     -- 1-based position of the record in the input (for messages).
    options -- validated options dict from check_options().
    """
    if not record.get_fields('856'):
        print("* No 856 for record # %s" % (cnt))

    new_record = pymarc.Record()
    # Field data is copied as-is, so the leader (record type, encoding
    # flag) must travel with it.
    new_record.leader = record.leader

    seen_856 = False
    for field in record.get_fields():
        if field.tag != '856':
            new_record.add_field(field)
        elif not seen_856:
            # Only process the first 856 field, for better or worse;
            # any further 856 fields are dropped.
            seen_856 = True
            for new_field in process_urls(field, options):
                new_record.add_field(new_field)

    seven_ten = pymarc.Field(
        tag='710',
        indicators=['2', ' '],
        subfields=['a', options['publisher']]
    )
    new_record.add_field(seven_ten)

    if 'note' in options:
        note = pymarc.Field(
            tag='590',
            indicators=[' ', ' '],
            subfields=['a', options['note']]
        )
        new_record.add_field(note)

    return new_record


def process_records(options):
    """Converts raw ebook MARC records to Conifer-ready MARC records.

    options -- validated options dict from check_options().

    Reads every record from options['input'] and writes the converted record
    to options['output'].  Exits with status 1 if pymarc is not installed.
    """
    if pymarc is None:
        print("* The pymarc module is required to process MARC records")
        sys.exit(1)

    # NOTE(review): the help text talks about MARCXML, but MARCReader and
    # MARCWriter handle binary MARC (ISO 2709) -- confirm the intended format.
    with open(options['input'], 'rb') as marc_in:
        with open(options['output'], 'wb') as marc_out:
            reader = pymarc.MARCReader(marc_in)
            writer = pymarc.MARCWriter(marc_out)
            for cnt, record in enumerate(reader, 1):
                writer.write(_convert_record(record, cnt, options))


def process_urls(field, options):
    """Creates 856 fields required by Conifer.

    field   -- the source 856 field; its $u subfield supplies the URL.
    options -- validated options dict; 'libraries' selects the institutions
               and 'settings' is the Institution instance to read from.

    Returns a list with one new 856 field per selected library (ordered by
    library name), or an empty list when the source field has no $u.
    """

    # Depending on the pymarc version a missing subfield either yields None
    # or raises KeyError; treat both as "no URL".
    try:
        url = field['u']
    except KeyError:
        url = None

    if not url:
        print("* No subfield 'u' found in this 856")
        return []

    # TODO: ebrary URLs still need the institution-specific rewrite that is
    # sketched in the notes at the bottom of this file.
    new_fields = []
    for lib in sorted(options['libraries']):
        data = options['settings'].get_settings(lib)
        eight_five_six = pymarc.Field(
            tag='856',
            indicators=['4', '0'],
            subfields=[
                'u', data['proxy'] + url,
                'y', data['link_text'],
                'z', data['public_note'],
                '9', data['code']
            ]
        )
        new_fields.append(eight_five_six)

    return new_fields


if __name__ == '__main__':

    options = parse_opts()
    process_records(options)

## Okay, made it through the basic invocation requirements; moving on
#
#For each MARC record:
#
#Find the 856 u ($url)
#for each institution:
#create a new 856 40
#if $url =~ /\.ebrary\./, then:
#$url =~ s/^.*?id=(\d+)\s*$/$1/
#$url = http://site.ebrary.com/lib/ + institution.ebrary_code + "/Doc?id=" + $url
#else:
#$url = institution.proxy + $url
#$u = $url
#$y = institution.link_text
#$z = institution.public_note
#$9 = institution.code