Iterating through CRKN_OUP.mrc for more corruption to fix
author Dan Scott <dan@coffeecode.net>
Fri, 29 Jul 2011 17:21:30 +0000 (13:21 -0400)
committer Dan Scott <dscott@laurentian.ca>
Tue, 7 May 2013 18:36:45 +0000 (14:36 -0400)
Tildes, acute accents, cedillas, we got 'em all.

Signed-off-by: Dan Scott <dscott@laurentian.ca>
tools/ebooks/prep_ebook_records.py

index c49e0da..f81554c 100644 (file)
@@ -361,15 +361,31 @@ def clean_diacritics(field):
             continue
 
         # Let the substitutions commence - maybe move to a map table?
+
+        # COMBINING MACRON
         tmpsf = subfield[1].replace(u'\xd5a', u'a\u0304')
+        tmpsf = tmpsf.replace(u'\xd5e', u'e\u0304')
         tmpsf = tmpsf.replace(u'\xd5i', u'i\u0304')
+        tmpsf = tmpsf.replace(u'\xd5o', u'o\u0304')
+        tmpsf = tmpsf.replace(u'\xd5u', u'u\u0304')
+
+        # MODIFIER LETTER RIGHT HALF RING (U+02BE; a spacing modifier letter, not a combining mark)
         tmpsf = tmpsf.replace(u'i\xb1', u'i\u02be')
 
+        # COMBINING TILDE
+        tmpsf = tmpsf.replace(u'\xf5n', u'n\u0303')
+
+        # COMBINING CEDILLA
+        tmpsf = tmpsf.replace(u'\xb0c', u'c\u0327')
+
+        # COMBINING ACUTE ACCENT
+        tmpsf = tmpsf.replace(u'\xd4s', u's\u0301')
+
         new_field.add_subfield(subfield[0], tmpsf)
         if r'\x' in repr(tmpsf):
             global RECORD_COUNT
             print " * %d Hex value found in %s:%s - [%s] [%s]" % (
-                RECORD_COUNT, field.tag, subfield[0], tmpsf, repr(tmpsf)
+                RECORD_COUNT, field.tag, subfield[0], tmpsf.encode('utf8'), repr(tmpsf)
             )
 
     return new_field