"""
import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2, json
+import copy
from urllib import quote
from datetime import date
from BeautifulSoup import BeautifulSoup
global RECORD_COUNT
global DUP_COUNT
- sample = ''
- duplicate = ''
- tcn = ''
- url = ''
+ files = {}
try:
reader = pymarc.MARCReader(
except Exception, ex:
print("Could not open input file [%s]" % options['input'])
- try:
- writer = pymarc.MARCWriter(open(options['output'], mode='wb'))
- except Exception, ex:
- print("Could not open output file [%s]" % options['output'])
+ for fname in ('duplicate', 'tcn', 'url', 'sample', 'output'):
+ if fname in options:
+ _fname = options[fname]
+ try:
+ files[fname] = pymarc.MARCWriter(open(_fname, mode='wb'))
+ except Exception, ex:
+ print("Could not open output file [%s]: %s" % (_fname, ex))
- if 'duplicate' in options:
- try:
- duplicate = pymarc.MARCWriter(open(options['duplicate'], mode='wb'))
- except Exception, ex:
- print("Could not open output file [%s]" % options['duplicate'])
-
- if 'tcn' in options:
- try:
- tcn = pymarc.MARCWriter(open(options['tcn'], mode='wb'))
- except Exception, ex:
- print("Could not open output file [%s]" % options['tcn'])
-
- if 'url' in options:
- try:
- url = pymarc.MARCWriter(open(options['url'], mode='wb'))
- except Exception, ex:
- print("Could not open output file [%s]" % options['url'])
-
- if 'sample' in options:
- sample = pymarc.MARCWriter(open(options['sample'], mode='wb'))
+ writer = files['output']
for record in reader:
RECORD_COUNT += 1
else:
print ("%d - %s\n" % (RECORD_COUNT, record['856']))
-
- new_record = ''
- dup_flag = False
-
- if duplicate:
- bib_id, dup_flag = isbn_check(record)
- new_record = process_fields(record, options, bib_id, dup_flag)
- if dup_flag:
- duplicate.write(new_record)
- if tcn:
- bib_id, dup_flag = tcn_check(record)
- new_record = process_fields(record, options, bib_id, dup_flag)
- if dup_flag:
- tcn.write(new_record)
- if url:
- bib_id, dup_flag = url_check(record, options)
- new_record = process_fields(record, options, bib_id, dup_flag)
- if dup_flag:
- url.write(new_record)
-
- if not dup_flag:
- new_record = process_fields(record, options, 0, False)
- else:
+ dupe_flags = {}
+
+ if files.get('duplicate'):
+ tmp_record = process_fields(copy.deepcopy(record), options)
+ bib_id, dupe_flags['isbn'] = isbn_check(tmp_record)
+ if dupe_flags['isbn']:
+ tmp_record = add_dupe_field(tmp_record, bib_id)
+ files['duplicate'].write(tmp_record)
+
+ if files.get('tcn'):
+ tmp_record = process_fields(copy.deepcopy(record), options)
+ bib_id, dupe_flags['tcn'] = tcn_check(tmp_record)
+ if dupe_flags['tcn']:
+ tmp_record = add_dupe_field(tmp_record, bib_id)
+ files['tcn'].write(tmp_record)
+
+ if files.get('url'):
+ tmp_record = process_fields(copy.deepcopy(record), options)
+ bib_id, dupe_flags['url'] = url_check(tmp_record, options)
+ if dupe_flags['url']:
+ tmp_record = add_dupe_field(tmp_record, bib_id)
+ files['url'].write(tmp_record)
+
+ if any(dupe_flags.values()):
DUP_COUNT += 1
-
- if new_record:
+ else:
+ new_record = process_fields(record, options)
writer.write(new_record)
- if (sample and (
+ if (files.get('sample') and (
(RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0)
)):
- sample.write(new_record)
+ files['sample'].write(new_record)
except Exception, ex:
print("* Error processing record %d - %s" % (RECORD_COUNT, ex))
-def process_fields(record, options, bib_id, dup_flag):
+def process_fields(record, options):
"""Decide which fields to add, delete, and keep"""
new_record = pymarc.Record(to_unicode=True, force_utf8=True)
)
record.add_field(note)
- # 909
- if dup_flag:
- dup_value = str(bib_id)
- dup = pymarc.Field(tag = '909',
- indicators = [' ', ' '],
- subfields = [
- 'a', dup_value
- ]
- )
- record.add_field(dup)
-
add_marc_source(record, options) # 598
publisher = add_publisher(record, options) # 710
add_restriction(record, options, publisher) # 506
return new_record
+def add_dupe_field(record, bib_id):
+ """Add a 909 field marking the duplicate record"""
+
+ dup_value = str(bib_id)
+ dup = pymarc.Field(tag = '909',
+ indicators = [' ', ' '],
+ subfields = [
+ 'a', dup_value
+ ]
+ )
+ record.add_field(dup)
+ return record
+
def clean_diacritics(field):
"""
Change specific patterns of bytes into other patterns of bytes