From b5bdb67a8adad7f07b1d7eb5fb13d6220c16811c Mon Sep 17 00:00:00 2001 From: Dan Scott Date: Thu, 4 Oct 2012 17:17:04 -0400 Subject: [PATCH] Ebooks: handle multiple dupe options process_fields() modifies the base record, and we were inadvertently running the base record through the wringer every time we checked for dupes - which would be evidenced by the addition of one 506 and 590 per consortial member per dupe check we ran. In the process of cleaning this up, try to dedupe some of our own code (heh). We can probably go further but this is a good start. Signed-off-by: Dan Scott --- tools/ebooks/prep_ebook_records.py | 119 ++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 67 deletions(-) diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py index 21b7a46e74..ee11a94e7e 100644 --- a/tools/ebooks/prep_ebook_records.py +++ b/tools/ebooks/prep_ebook_records.py @@ -15,6 +15,7 @@ be accommodated in batch load. """ import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2, json +import copy from urllib import quote from datetime import date from BeautifulSoup import BeautifulSoup @@ -397,10 +398,7 @@ def process_records(options): global RECORD_COUNT global DUP_COUNT - sample = '' - duplicate = '' - tcn = '' - url = '' + files = {} try: reader = pymarc.MARCReader( @@ -409,31 +407,15 @@ def process_records(options): except Exception, ex: print("Could not open input file [%s]" % options['input']) - try: - writer = pymarc.MARCWriter(open(options['output'], mode='wb')) - except Exception, ex: - print("Could not open output file [%s]" % options['output']) + for fname in ('duplicate', 'tcn', 'url', 'sample', 'output'): + if fname in options: + _fname = options[fname] + try: + files[fname] = pymarc.MARCWriter(open(_fname, mode='wb')) + except Exception, ex: + print("Could not open output file [%s]: %s" % (_fname, ex)) - if 'duplicate' in options: - try: - duplicate = pymarc.MARCWriter(open(options['duplicate'], 
mode='wb')) - except Exception, ex: - print("Could not open output file [%s]" % options['duplicate']) - - if 'tcn' in options: - try: - tcn = pymarc.MARCWriter(open(options['tcn'], mode='wb')) - except Exception, ex: - print("Could not open output file [%s]" % options['tcn']) - - if 'url' in options: - try: - url = pymarc.MARCWriter(open(options['url'], mode='wb')) - except Exception, ex: - print("Could not open output file [%s]" % options['url']) - - if 'sample' in options: - sample = pymarc.MARCWriter(open(options['sample'], mode='wb')) + writer = files['output'] for record in reader: RECORD_COUNT += 1 @@ -445,41 +427,42 @@ def process_records(options): else: print ("%d - %s\n" % (RECORD_COUNT, record['856'])) - - new_record = '' - dup_flag = False - - if duplicate: - bib_id, dup_flag = isbn_check(record) - new_record = process_fields(record, options, bib_id, dup_flag) - if dup_flag: - duplicate.write(new_record) - if tcn: - bib_id, dup_flag = tcn_check(record) - new_record = process_fields(record, options, bib_id, dup_flag) - if dup_flag: - tcn.write(new_record) - if url: - bib_id, dup_flag = url_check(record, options) - new_record = process_fields(record, options, bib_id, dup_flag) - if dup_flag: - url.write(new_record) - - if not dup_flag: - new_record = process_fields(record, options, 0, False) - else: + dupe_flags = {} + + if files['duplicate']: + tmp_record = process_fields(copy.deepcopy(record), options) + bib_id, dupe_flags['isbn'] = isbn_check(tmp_record) + if dupe_flags['isbn']: + tmp_record = add_dupe_field(tmp_record, bib_id) + files['duplicate'].write(tmp_record) + + if files['tcn']: + tmp_record = process_fields(copy.deepcopy(record), options) + bib_id, dupe_flags['tcn'] = tcn_check(tmp_record) + if dupe_flags['tcn']: + tmp_record = add_dupe_field(tmp_record, bib_id) + files['tcn'].write(tmp_record) + + if files['url']: + tmp_record = process_fields(copy.deepcopy(record), options) + bib_id, dupe_flags['url'] = url_check(tmp_record, options) + if 
dupe_flags['url']: + tmp_record = add_dupe_field(tmp_record, bib_id) + files['url'].write(tmp_record) + + if len(dupe_flags): DUP_COUNT += 1 - - if new_record: + else: + new_record = process_fields(record, options) writer.write(new_record) - if (sample and ( + if (files['sample'] and ( (RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0) )): - sample.write(new_record) + files['sample'].write(new_record) except Exception, ex: print("* Error processing record %d - %s" % (RECORD_COUNT, ex)) -def process_fields(record, options, bib_id, dup_flag): +def process_fields(record, options): """Decide which fields to add, delete, and keep""" new_record = pymarc.Record(to_unicode=True, force_utf8=True) @@ -498,17 +481,6 @@ def process_fields(record, options, bib_id, dup_flag): ) record.add_field(note) - # 909 - if dup_flag: - dup_value = str(bib_id) - dup = pymarc.Field(tag = '909', - indicators = [' ', ' '], - subfields = [ - 'a', dup_value - ] - ) - record.add_field(dup) - add_marc_source(record, options) # 598 publisher = add_publisher(record, options) # 710 add_restriction(record, options, publisher) # 506 @@ -556,6 +528,19 @@ def process_fields(record, options, bib_id, dup_flag): return new_record +def add_dupe_field(record, bib_id): + """Add a 909 field marking the duplicate record""" + + dup_value = str(bib_id) + dup = pymarc.Field(tag = '909', + indicators = [' ', ' '], + subfields = [ + 'a', dup_value + ] + ) + record.add_field(dup) + return record + def clean_diacritics(field): """ Change specific patterns of bytes into other patterns of bytes -- 2.11.0