From 1d655492bd29c6659d13a2589e4d7b74dcf28529 Mon Sep 17 00:00:00 2001 From: Dan Scott Date: Tue, 20 Nov 2012 13:55:02 -0500 Subject: [PATCH] EResource handling improvements * Add the ability to read (via -F xml) and write (via -T xml) MARCXML records. Writing XML records will place one per line, making it ideal for inserting into a database via COPY(). * Make the "Mark ISBN for SFX" functionality optional via the -I flag, as not all electronic resources have ISBNs. (Hi, journals and music records). * Reduce code duplication significantly Signed-off-by: Dan Scott --- tools/ebooks/prep_ebook_records.py | 226 +++++++++++++++++++++---------------- 1 file changed, 127 insertions(+), 99 deletions(-) diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py index 7e483be8c7..f3220cd95b 100755 --- a/tools/ebooks/prep_ebook_records.py +++ b/tools/ebooks/prep_ebook_records.py @@ -15,7 +15,7 @@ be accommodated in batch load. """ import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2, json -import copy +import codecs, copy from urllib import quote from datetime import date from BeautifulSoup import BeautifulSoup @@ -26,6 +26,8 @@ GATEWAY_URL = "http://www.concat.ca/osrf-gateway-v1" OPENSRF_ISBN_CALL = "open-ils.search.biblio.isbn" OPENSRF_TCN_CALL = "open-ils.search.biblio.tcn" OPENSRF_KEYWORD_CALL = "open-ils.search.biblio.multiclass.query" +OPTIONS = {} +FILES = {} class Institution(): """Defines standard settings for each Conifer institution""" @@ -140,12 +142,18 @@ Optional arguments: -d / --duplicate : The name of the file to route ISBN duplicates to. + -F / --from-format : The format ('xml' or 'marc21') of the input file + + -T / --to-format : The format ('xml' or 'marc21') of the output file + -t / --tcn : The name of the file to route TCN duplicates to. -u / --url : The name of the file to route URL duplicates to. -e / --ebrary : Put Ebrary 001 into 924 as a URN for SFX lookup purposes + -I / --isbn-sfx: Mark the ISBN if found in SFX + -n / --note : The text of the internal note to be inserted into a 590 field. -s / --sample : The name of the sample output MARC file (generates @@ -169,10 +177,13 @@ def consolidate_options(opts): '-C': '--clean', '-d': '--duplicate', '-e': '--ebrary', + '-F': '--from-format', + '-I': '--isbn-sfx', '-p': '--publisher', '-P': '--platform', '-n': '--note', '-t': '--tcn', + '-T': '--to-format', '-u': '--url', '-A': '--algoma', '-B': '--boreal', @@ -233,35 +244,35 @@ def check_options(options): print("* Cannot write to output path %s" % (os.path.dirname(_output))) sys.exit(0) - clean_opts = dict() - clean_opts['publisher'] = append_period(options['--publisher']) - - clean_opts['consortium'] = options['--consortium'].decode('utf-8') - clean_opts['authorization'] = options['--authorization'].decode('utf-8') - - if '--clean' in options: - clean_opts['clean'] = True - - if '--duplicate' in options: - clean_opts['duplicate'] = options['--duplicate'] - - if '--tcn' in options: - clean_opts['tcn'] = options['--tcn'] - - if '--url' in options: - clean_opts['url'] = options['--url'] + _bool_opts = { + '--clean': 'clean', + '--ebrary': 'ebrary', + '--isbn-sfx': 'isbn-sfx', + } - if '--ebrary' in options: - clean_opts['ebrary'] = True + _string_opts = { + '--authorization': 'authorization', + '--consortium': 'consortium', + '--duplicate': 'duplicate', + '--from-format': 'from-format', + '--note': 'note', + '--platform': 'platform', + '--sample': 'sample', + '--tcn': 'tcn', + '--to-format': 'to-format', + '--url': 'url', + } - if '--sample' in options: - clean_opts['sample'] = options['--sample'] + clean_opts = dict() + clean_opts['publisher'] = append_period(options['--publisher']) - if '--note' in options: - clean_opts['note'] = options['--note'] + for optkey, optval in _bool_opts.items(): + if optkey in options: + clean_opts[optval] = True - if '--platform' in options: - clean_opts['platform'] = append_period(options['--platform']) + for optkey, optval in _string_opts.items(): + if optkey in options: + clean_opts[optval] = options[optkey].decode('utf-8') clean_opts['libraries'] = _libraries clean_opts['input'] = _input @@ -379,11 +390,12 @@ def check_libraries(options): def parse_opts(): """Get command-line arguments from the script""" try: - _short_opts = 'i:o:a:c:p:P:ABCLWe:d:t:u:n:s:h' + _short_opts = 'i:o:a:c:p:P:ABLWeCI:d:F:T:t:u:n:s:h' _long_opts = ['input=', 'output=', 'authorization=', 'consortium=', 'publisher=', 'platform=', 'algoma', 'boreal', 'laurentian', - 'windsor', 'ebrary', 'clean', 'duplicate=', 'tcn=', 'url=', 'note=', - 'sample=', 'help' + 'windsor', 'ebrary', 'clean', 'isbn-sfx', 'duplicate=', + 'from-format=', 'to-format=', 'tcn=', 'url=', 'note=', 'sample=', + 'help' ] opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts) except getopt.GetoptError, ex: @@ -393,80 +405,79 @@ def parse_opts(): _options = consolidate_options(opts[0]) return check_options(_options) -def process_records(options): +def process_marc(options): """Converts raw ebook MARC records to Conifer-ready MARC records""" + global FILES + files = FILES + + if 'from-format' in options and options['from-format'] == 'xml': + pymarc.map_xml(process_xml, options['input']) + else: + try: + reader = pymarc.MARCReader( + open(options['input'], mode='rb'), to_unicode=True + ) + except Exception, ex: + print("Could not open input file [%s]" % options['input']) + + for record in reader: + process_record(record, options, files) + +def process_record(record, options, files): global RECORD_COUNT global DUP_COUNT - files = {} - + RECORD_COUNT += 1 try: - reader = pymarc.MARCReader( - open(options['input'], mode='rb'), to_unicode=True - ) - except Exception, ex: - print("Could not open input file [%s]" % options['input']) - - for fname in ('duplicate', 'tcn', 'url', 'sample', 'output'): - if fname in options: - _fname = options[fname] - try: - files[fname] = pymarc.MARCWriter(open(_fname, mode='wb')) - except Exception, ex: - print("Could not open output file [%s]: %s" % (_fname, ex)) + if not (record['856'] and record['856']['u']): + print("* No 856 for record # %s in file %s" + % (RECORD_COUNT, options['input']) + ) + else: + print ("%d - %s" % (RECORD_COUNT, record['856'])) - writer = files['output'] + dupe_flags = {} - for record in reader: - RECORD_COUNT += 1 - try: - if not (record['856'] and record['856']['u']): - print("* No 856 for record # %s in file %s" - % (RECORD_COUNT, options['input']) - ) + if 'duplicate' in files: + tmp_record = process_fields(copy.deepcopy(record), options) + bib_id, dupe_flags['isbn'] = isbn_check(tmp_record) + if dupe_flags['isbn']: + tmp_record = add_dupe_field(tmp_record, bib_id) + files['duplicate'].write(tmp_record) else: - print ("%d - %s\n" % (RECORD_COUNT, record['856'])) - - dupe_flags = {} - - if 'duplicate' in files: - tmp_record = process_fields(copy.deepcopy(record), options) - bib_id, dupe_flags['isbn'] = isbn_check(tmp_record) - if dupe_flags['isbn']: - tmp_record = add_dupe_field(tmp_record, bib_id) - files['duplicate'].write(tmp_record) - else: - del(dupe_flags['isbn']) - - if 'tcn' in files and len(dupe_flags) == 0: - tmp_record = process_fields(copy.deepcopy(record), options) - bib_id, dupe_flags['tcn'] = tcn_check(tmp_record) - if dupe_flags['tcn']: - tmp_record = add_dupe_field(tmp_record, bib_id) - files['tcn'].write(tmp_record) - else: - del(dupe_flags['tcn']) - - if 'url' in files and len(dupe_flags) == 0: - tmp_record = process_fields(copy.deepcopy(record), options) - bib_id, dupe_flags['url'] = url_check(tmp_record, options) - if dupe_flags['url']: - tmp_record = add_dupe_field(tmp_record, bib_id) - files['url'].write(tmp_record) - else: - del(dupe_flags['url']) - - if len(dupe_flags): - DUP_COUNT += 1 + del(dupe_flags['isbn']) + + if 'tcn' in files and len(dupe_flags) == 0: + tmp_record = process_fields(copy.deepcopy(record), options) + bib_id, dupe_flags['tcn'] = tcn_check(tmp_record) + if dupe_flags['tcn']: + tmp_record = add_dupe_field(tmp_record, bib_id) + files['tcn'].write(tmp_record) else: - new_record = process_fields(record, options) - writer.write(new_record) - if ('sample' in files and ( - (RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0) - )): - files['sample'].write(new_record) - except Exception, ex: - print("* Error processing record %d - %s" % (RECORD_COUNT, ex)) + del(dupe_flags['tcn']) + + if 'url' in files and len(dupe_flags) == 0: + tmp_record = process_fields(copy.deepcopy(record), options) + bib_id, dupe_flags['url'] = url_check(tmp_record, options) + if dupe_flags['url']: + tmp_record = add_dupe_field(tmp_record, bib_id) + files['url'].write(tmp_record) + else: + del(dupe_flags['url']) + + if len(dupe_flags): + DUP_COUNT += 1 + else: + new_record = process_fields(record, options) + if 'to-format' in options and options['to-format'] == 'xml': + new_record = pymarc.record_to_xml(new_record) + '\n' + files['output'].write(new_record) + if ('sample' in files and ( + (RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0) + )): + files['sample'].write(new_record) + except Exception, ex: + print("* Error processing record %d - %s" % (RECORD_COUNT, ex)) def process_fields(record, options): """Decide which fields to add, delete, and keep""" @@ -492,9 +503,8 @@ def process_fields(record, options): add_restriction(record, options, publisher) # 506 add_platform(record, options) # 710 - # USE FOR SKIPPING SFX - # marked_isbn = False - marked_isbn = mark_isbn_for_sfx(record, options) + if 'isbn-sfx' in options: + marked_isbn = mark_isbn_for_sfx(record, options) for field in record.get_fields(): if 'clean' in options: @@ -521,7 +531,7 @@ def process_fields(record, options): else: new_record.add_ordered_field(field) - if not marked_isbn: + if 'isbn-sfx' in options and not marked_isbn: try: isbn = record['020']['a'] print("ISBN: [%s] - no matching ISBN target found in SFX for %s" % @@ -1104,6 +1114,24 @@ def get_subfields(field, data): return subs +def process_xml(record): + global OPTIONS + global FILES + options = OPTIONS + files = FILES + process_record(record, options, files) if __name__ == '__main__': - process_records(parse_opts()) + OPTIONS = parse_opts() + for fname in ('duplicate', 'tcn', 'url', 'sample', 'output'): + if fname in OPTIONS: + try: + if 'to-format' in OPTIONS and OPTIONS['to-format'] == 'xml': + FILES[fname] = codecs.open(OPTIONS[fname], 'w', 'utf-8') + else: + FILES[fname] = pymarc.MARCWriter(open(OPTIONS[fname], 'w')) + except Exception, ex: + print("Could not open output file [%s]: %s" % (OPTIONS[fname], ex)) + + process_marc(OPTIONS) + #pymarc.map_xml(process_xml, '/home/dan/Downloads/AlexanderStreetPress_JazzMusicLibrary_Canada_MONOSER_2012-05-23.xml') -- 2.11.0