From: dbs
Date: Thu, 16 Dec 2010 16:42:24 +0000 (+0000)
Subject: Convert uniformly to Unicode output
X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=8527821cbe87a891c7bcf4c38e1508d0740347b5;p=contrib%2FConifer.git

Convert uniformly to Unicode output

LoC added content appears to come in ISO-8859-1, so decode that accordingly.
Then generate UTF8-encoded Unicode MARC records on output.

git-svn-id: svn://svn.open-ils.org/ILS-Contrib/conifer/branches/rel_1_6_1@1106 6d9bc8c9-1ec2-4278-b937-99fde70a366f
---

diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py
index 4b20b20149..f1177f295d 100644
--- a/tools/ebooks/prep_ebook_records.py
+++ b/tools/ebooks/prep_ebook_records.py
@@ -220,29 +220,34 @@ def process_records(options):
     """Converts raw ebook MARC records to Conifer-ready MARC records"""
 
     sample = ''
-    reader = pymarc.MARCReader(open(options['input'], 'rb'))
-    writer = pymarc.MARCWriter(open(options['output'], 'wb'))
+    reader = pymarc.MARCReader(
+        open(options['input'], mode='rb'), to_unicode=True
+    )
+    writer = pymarc.MARCWriter(open(options['output'], mode='wb'))
     if ('sample' in options):
-        sample = pymarc.MARCWriter(open(options['sample'], 'wb'))
+        sample = pymarc.MARCWriter(open(options['sample'], mode='wb'))
 
     cnt = 0
     for record in reader:
         cnt = cnt + 1
-        if not (record['856'] and record['856']['u']):
-            print("* No 856 for record # %s in file %s"
-                % (cnt, options['input'])
-            )
+        try:
+            if not (record['856'] and record['856']['u']):
+                print("* No 856 for record # %s in file %s"
+                    % (cnt, options['input'])
+                )
 
-        new_record = process_fields(record, options)
+            new_record = process_fields(record, options)
 
-        writer.write(new_record)
-        if (sample and ((cnt == 1) or (cnt % 100 == 0))):
-            sample.write(new_record)
+            writer.write(new_record)
+            if (sample and ((cnt == 1) or (cnt % 100 == 0))):
+                sample.write(new_record)
+        except Exception, ex:
+            print("* Error processing record %s - %s" % (cnt, ex))
 
 def process_fields(record, options):
     """Decide which fields to add, delete, and keep"""
 
-    new_record = pymarc.Record()
+    new_record = pymarc.Record(to_unicode=True, force_utf8=True)
 
     for field in record.get_fields():
         # Process all of the 856 fields
@@ -435,7 +440,7 @@ def process_loc_data(raw_content):
         content = content[0:lcsh]
 
     # Farewell, starting and ending whitespace
-    content = content.strip()
+    content = content.strip().decode('iso8859-1')
 
     return content
 