Convert uniformly to Unicode output
author: dbs <dbs@6d9bc8c9-1ec2-4278-b937-99fde70a366f>
Thu, 16 Dec 2010 16:42:24 +0000 (16:42 +0000)
committer: dbs <dbs@6d9bc8c9-1ec2-4278-b937-99fde70a366f>
Thu, 16 Dec 2010 16:42:24 +0000 (16:42 +0000)
LoC added content appears to be encoded in ISO-8859-1, so decode it accordingly.

Then generate UTF-8-encoded Unicode MARC records on output.

git-svn-id: svn://svn.open-ils.org/ILS-Contrib/conifer/branches/rel_1_6_1@1106 6d9bc8c9-1ec2-4278-b937-99fde70a366f

tools/ebooks/prep_ebook_records.py

index 4b20b20..f1177f2 100644 (file)
@@ -220,29 +220,34 @@ def process_records(options):
     """Converts raw ebook MARC records to Conifer-ready MARC records"""
 
     sample = ''
-    reader = pymarc.MARCReader(open(options['input'], 'rb'))
-    writer = pymarc.MARCWriter(open(options['output'], 'wb'))
+    reader = pymarc.MARCReader(
+        open(options['input'], mode='rb'), to_unicode=True
+    )
+    writer = pymarc.MARCWriter(open(options['output'], mode='wb'))
     if ('sample' in options):
-        sample = pymarc.MARCWriter(open(options['sample'], 'wb'))
+        sample = pymarc.MARCWriter(open(options['sample'], mode='wb'))
 
     cnt = 0
     for record in reader:
         cnt = cnt + 1
-        if not (record['856'] and record['856']['u']):
-            print("* No 856 for record # %s in file %s"
-                    % (cnt, options['input'])
-            )
+        try:
+            if not (record['856'] and record['856']['u']):
+                print("* No 856 for record # %s in file %s"
+                        % (cnt, options['input'])
+                )
 
-        new_record = process_fields(record, options)
+            new_record = process_fields(record, options)
 
-        writer.write(new_record)
-        if (sample and ((cnt == 1) or (cnt % 100 == 0))):
-            sample.write(new_record)
+            writer.write(new_record)
+            if (sample and ((cnt == 1) or (cnt % 100 == 0))):
+                sample.write(new_record)
+        except Exception, ex:
+            print("* Error processing record %s - %s" % (cnt, ex))
 
 def process_fields(record, options):
     """Decide which fields to add, delete, and keep"""
 
-    new_record = pymarc.Record()
+    new_record = pymarc.Record(to_unicode=True, force_utf8=True)
 
     for field in record.get_fields():
         # Process all of the 856 fields
@@ -435,7 +440,7 @@ def process_loc_data(raw_content):
         content = content[0:lcsh]
 
     # Farewell, starting and ending whitespace
-    content = content.strip()
+    content = content.strip().decode('iso8859-1')
 
     return content