"""
import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2, json
-import copy
+import codecs, copy
from urllib import quote
from datetime import date
from BeautifulSoup import BeautifulSoup
OPENSRF_ISBN_CALL = "open-ils.search.biblio.isbn"
OPENSRF_TCN_CALL = "open-ils.search.biblio.tcn"
OPENSRF_KEYWORD_CALL = "open-ils.search.biblio.multiclass.query"
+OPTIONS = {}
+FILES = {}
class Institution():
"""Defines standard settings for each Conifer institution"""
-d / --duplicate : The name of the file to route ISBN duplicates to.
+ -F / --from-format : The format ('xml' or 'marc21') of the input file
+
+ -T / --to-format : The format ('xml' or 'marc21') of the output file
+
-t / --tcn : The name of the file to route TCN duplicates to.
-u / --url : The name of the file to route URL duplicates to.
-e / --ebrary : Put Ebrary 001 into 924 as a URN for SFX lookup purposes
+    -I / --isbn-sfx : Mark the ISBN if found in SFX
+
-n / --note : The text of the internal note to be inserted into a 590 field.
-s / --sample : The name of the sample output MARC file (generates
'-C': '--clean',
'-d': '--duplicate',
'-e': '--ebrary',
+ '-F': '--from-format',
+ '-I': '--isbn-sfx',
'-p': '--publisher',
'-P': '--platform',
'-n': '--note',
'-t': '--tcn',
+ '-T': '--to-format',
'-u': '--url',
'-A': '--algoma',
'-B': '--boreal',
print("* Cannot write to output path %s" % (os.path.dirname(_output)))
sys.exit(0)
- clean_opts = dict()
- clean_opts['publisher'] = append_period(options['--publisher'])
-
- clean_opts['consortium'] = options['--consortium'].decode('utf-8')
- clean_opts['authorization'] = options['--authorization'].decode('utf-8')
-
- if '--clean' in options:
- clean_opts['clean'] = True
-
- if '--duplicate' in options:
- clean_opts['duplicate'] = options['--duplicate']
-
- if '--tcn' in options:
- clean_opts['tcn'] = options['--tcn']
-
- if '--url' in options:
- clean_opts['url'] = options['--url']
+ _bool_opts = {
+ '--clean': 'clean',
+ '--ebrary': 'ebrary',
+ '--isbn-sfx': 'isbn-sfx',
+ }
- if '--ebrary' in options:
- clean_opts['ebrary'] = True
+ _string_opts = {
+ '--authorization': 'authorization',
+ '--consortium': 'consortium',
+ '--duplicate': 'duplicate',
+ '--from-format': 'from-format',
+ '--note': 'note',
+ '--platform': 'platform',
+ '--sample': 'sample',
+ '--tcn': 'tcn',
+ '--to-format': 'to-format',
+ '--url': 'url',
+ }
- if '--sample' in options:
- clean_opts['sample'] = options['--sample']
+ clean_opts = dict()
+ clean_opts['publisher'] = append_period(options['--publisher'])
- if '--note' in options:
- clean_opts['note'] = options['--note']
+ for optkey, optval in _bool_opts.items():
+ if optkey in options:
+ clean_opts[optval] = True
- if '--platform' in options:
- clean_opts['platform'] = append_period(options['--platform'])
+ for optkey, optval in _string_opts.items():
+ if optkey in options:
+ clean_opts[optval] = options[optkey].decode('utf-8')
clean_opts['libraries'] = _libraries
clean_opts['input'] = _input
def parse_opts():
"""Get command-line arguments from the script"""
try:
- _short_opts = 'i:o:a:c:p:P:ABCLWe:d:t:u:n:s:h'
+        _short_opts = 'i:o:a:c:p:P:ABLWeCId:F:T:t:u:n:s:h'
_long_opts = ['input=', 'output=', 'authorization=', 'consortium=',
'publisher=', 'platform=', 'algoma', 'boreal', 'laurentian',
- 'windsor', 'ebrary', 'clean', 'duplicate=', 'tcn=', 'url=', 'note=',
- 'sample=', 'help'
+ 'windsor', 'ebrary', 'clean', 'isbn-sfx', 'duplicate=',
+ 'from-format=', 'to-format=', 'tcn=', 'url=', 'note=', 'sample=',
+ 'help'
]
opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts)
except getopt.GetoptError, ex:
_options = consolidate_options(opts[0])
return check_options(_options)
-def process_records(options):
+def process_marc(options):
"""Converts raw ebook MARC records to Conifer-ready MARC records"""
+ global FILES
+ files = FILES
+
+ if 'from-format' in options and options['from-format'] == 'xml':
+ pymarc.map_xml(process_xml, options['input'])
+ else:
+ try:
+ reader = pymarc.MARCReader(
+ open(options['input'], mode='rb'), to_unicode=True
+ )
+ except Exception, ex:
+ print("Could not open input file [%s]" % options['input'])
+
+ for record in reader:
+ process_record(record, options, files)
+
+def process_record(record, options, files):
global RECORD_COUNT
global DUP_COUNT
- files = {}
-
+ RECORD_COUNT += 1
try:
- reader = pymarc.MARCReader(
- open(options['input'], mode='rb'), to_unicode=True
- )
- except Exception, ex:
- print("Could not open input file [%s]" % options['input'])
-
- for fname in ('duplicate', 'tcn', 'url', 'sample', 'output'):
- if fname in options:
- _fname = options[fname]
- try:
- files[fname] = pymarc.MARCWriter(open(_fname, mode='wb'))
- except Exception, ex:
- print("Could not open output file [%s]: %s" % (_fname, ex))
+ if not (record['856'] and record['856']['u']):
+ print("* No 856 for record # %s in file %s"
+ % (RECORD_COUNT, options['input'])
+ )
+ else:
+ print ("%d - %s" % (RECORD_COUNT, record['856']))
- writer = files['output']
+ dupe_flags = {}
- for record in reader:
- RECORD_COUNT += 1
- try:
- if not (record['856'] and record['856']['u']):
- print("* No 856 for record # %s in file %s"
- % (RECORD_COUNT, options['input'])
- )
+ if 'duplicate' in files:
+ tmp_record = process_fields(copy.deepcopy(record), options)
+ bib_id, dupe_flags['isbn'] = isbn_check(tmp_record)
+ if dupe_flags['isbn']:
+ tmp_record = add_dupe_field(tmp_record, bib_id)
+ files['duplicate'].write(tmp_record)
else:
- print ("%d - %s\n" % (RECORD_COUNT, record['856']))
-
- dupe_flags = {}
-
- if 'duplicate' in files:
- tmp_record = process_fields(copy.deepcopy(record), options)
- bib_id, dupe_flags['isbn'] = isbn_check(tmp_record)
- if dupe_flags['isbn']:
- tmp_record = add_dupe_field(tmp_record, bib_id)
- files['duplicate'].write(tmp_record)
- else:
- del(dupe_flags['isbn'])
-
- if 'tcn' in files and len(dupe_flags) == 0:
- tmp_record = process_fields(copy.deepcopy(record), options)
- bib_id, dupe_flags['tcn'] = tcn_check(tmp_record)
- if dupe_flags['tcn']:
- tmp_record = add_dupe_field(tmp_record, bib_id)
- files['tcn'].write(tmp_record)
- else:
- del(dupe_flags['tcn'])
-
- if 'url' in files and len(dupe_flags) == 0:
- tmp_record = process_fields(copy.deepcopy(record), options)
- bib_id, dupe_flags['url'] = url_check(tmp_record, options)
- if dupe_flags['url']:
- tmp_record = add_dupe_field(tmp_record, bib_id)
- files['url'].write(tmp_record)
- else:
- del(dupe_flags['url'])
-
- if len(dupe_flags):
- DUP_COUNT += 1
+ del(dupe_flags['isbn'])
+
+ if 'tcn' in files and len(dupe_flags) == 0:
+ tmp_record = process_fields(copy.deepcopy(record), options)
+ bib_id, dupe_flags['tcn'] = tcn_check(tmp_record)
+ if dupe_flags['tcn']:
+ tmp_record = add_dupe_field(tmp_record, bib_id)
+ files['tcn'].write(tmp_record)
else:
- new_record = process_fields(record, options)
- writer.write(new_record)
- if ('sample' in files and (
- (RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0)
- )):
- files['sample'].write(new_record)
- except Exception, ex:
- print("* Error processing record %d - %s" % (RECORD_COUNT, ex))
+ del(dupe_flags['tcn'])
+
+ if 'url' in files and len(dupe_flags) == 0:
+ tmp_record = process_fields(copy.deepcopy(record), options)
+ bib_id, dupe_flags['url'] = url_check(tmp_record, options)
+ if dupe_flags['url']:
+ tmp_record = add_dupe_field(tmp_record, bib_id)
+ files['url'].write(tmp_record)
+ else:
+ del(dupe_flags['url'])
+
+ if len(dupe_flags):
+ DUP_COUNT += 1
+ else:
+ new_record = process_fields(record, options)
+ if 'to-format' in options and options['to-format'] == 'xml':
+ new_record = pymarc.record_to_xml(new_record) + '\n'
+ files['output'].write(new_record)
+ if ('sample' in files and (
+ (RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0)
+ )):
+ files['sample'].write(new_record)
+ except Exception, ex:
+ print("* Error processing record %d - %s" % (RECORD_COUNT, ex))
def process_fields(record, options):
"""Decide which fields to add, delete, and keep"""
add_restriction(record, options, publisher) # 506
add_platform(record, options) # 710
- # USE FOR SKIPPING SFX
- # marked_isbn = False
- marked_isbn = mark_isbn_for_sfx(record, options)
+ if 'isbn-sfx' in options:
+ marked_isbn = mark_isbn_for_sfx(record, options)
for field in record.get_fields():
if 'clean' in options:
else:
new_record.add_ordered_field(field)
- if not marked_isbn:
+ if 'isbn-sfx' in options and not marked_isbn:
try:
isbn = record['020']['a']
print("ISBN: [%s] - no matching ISBN target found in SFX for %s" %
return subs
+def process_xml(record):
+ global OPTIONS
+ global FILES
+ options = OPTIONS
+ files = FILES
+ process_record(record, options, files)
if __name__ == '__main__':
- process_records(parse_opts())
+ OPTIONS = parse_opts()
+ for fname in ('duplicate', 'tcn', 'url', 'sample', 'output'):
+ if fname in OPTIONS:
+ try:
+ if 'to-format' in OPTIONS and OPTIONS['to-format'] == 'xml':
+ FILES[fname] = codecs.open(OPTIONS[fname], 'w', 'utf-8')
+ else:
+                FILES[fname] = pymarc.MARCWriter(open(OPTIONS[fname], 'wb'))
+ except Exception, ex:
+ print("Could not open output file [%s]: %s" % (OPTIONS[fname], ex))
+
+ process_marc(OPTIONS)
+ #pymarc.map_xml(process_xml, '/home/dan/Downloads/AlexanderStreetPress_JazzMusicLibrary_Canada_MONOSER_2012-05-23.xml')