Begin addressing the specific manglings of the CRKN records
author Dan Scott <dan@coffeecode.net>
Fri, 29 Jul 2011 15:58:36 +0000 (11:58 -0400)
committer Dan Scott <dscott@laurentian.ca>
Tue, 7 May 2013 18:36:43 +0000 (14:36 -0400)
To begin with, correct the mangled macrons and modifier letter half rings
that are common in romanized Arabic. Here's hoping that correcting
these byte sequences doesn't lead to corruption elsewhere.

Signed-off-by: Dan Scott <dscott@laurentian.ca>
tools/ebooks/prep_ebook_records.py

index 993a4c5..c49e0da 100644 (file)
@@ -310,6 +310,7 @@ def process_fields(record, options):
     marked_isbn = mark_isbn_for_sfx(record, options)
 
     for field in record.get_fields():
+        field = clean_diacritics(field)
         # Process all of the 856 fields
         if field.tag == '856':
             new_fields = process_urls(field, options, publisher)
@@ -338,6 +339,42 @@ def process_fields(record, options):
 
     return new_record
 
+def clean_diacritics(field):
+    """
+    Change specific patterns of bytes into other patterns of bytes
+
+    We get some horribly corrupted records. This is an attempt to reverse the
+    horror via equally horrible byte-matching for known messed up conditions.
+    """
+
+    if field.is_control_field():
+        return field
+
+    new_field = pymarc.Field(
+        tag=field.tag,
+        indicators=[field.indicator1, field.indicator2]
+    )
+
+    for subfield in field:
+        if r'\x' not in repr(subfield[1]):
+            new_field.add_subfield(subfield[0], subfield[1])
+            continue
+
+        # Let the substitutions commence - maybe move to a map table?
+        tmpsf = subfield[1].replace(u'\xd5a', u'a\u0304')
+        tmpsf = tmpsf.replace(u'\xd5i', u'i\u0304')
+        tmpsf = tmpsf.replace(u'i\xb1', u'i\u02be')
+
+        new_field.add_subfield(subfield[0], tmpsf)
+        if r'\x' in repr(tmpsf):
+            global RECORD_COUNT
+            print " * %d Hex value found in %s:%s - [%s] [%s]" % (
+                RECORD_COUNT, field.tag, subfield[0], tmpsf, repr(tmpsf)
+            )
+
+    return new_field
+        
+
 def add_publisher(record, options):
     """
     This is a convoluted way to avoid creating a new 710 if we already