Correct I BREVE for Mr. Sakharov
author Dan Scott <dan@coffeecode.net>
Tue, 2 Aug 2011 19:44:36 +0000 (15:44 -0400)
committer Dan Scott <dscott@laurentian.ca>
Mon, 12 Nov 2012 17:57:54 +0000 (12:57 -0500)
We were prepending an extra "i" to the precomposed I BREVE characters.

Also log diffs so we can eyeball the changes and know that they are
good.

Signed-off-by: Dan Scott <dscott@laurentian.ca>
tools/ebooks/prep_ebook_records.py

index 43d756e..ef39467 100644 (file)
@@ -401,8 +401,8 @@ def clean_diacritics(field):
         tmpsf = tmpsf.replace(u'\xe6g', u'\u011f')
 
         # I BREVE
-        tmpsf = tmpsf.replace(u'\xe6I', u'i\u012c')
-        tmpsf = tmpsf.replace(u'\xe6i', u'i\u012d')
+        tmpsf = tmpsf.replace(u'\xe6I', u'\u012c')
+        tmpsf = tmpsf.replace(u'\xe6i', u'\u012d')
 
         # COMBINING DOT ABOVE
         tmpsf = tmpsf.replace(u'\xfeI', u'I\u0307')
@@ -419,12 +419,17 @@ def clean_diacritics(field):
         tmpsf = tmpsf.replace(u'\xf0', u'\u02b9')
 
         new_field.add_subfield(subfield[0], tmpsf)
+        global RECORD_COUNT
         if r'\x' in repr(tmpsf):
-            global RECORD_COUNT
             print " * %d Hex value found in %s:%s - [%s] [%s]" % (
                 RECORD_COUNT, field.tag, subfield[0], tmpsf.encode('utf8'), repr(tmpsf)
             )
 
+        if (repr(subfield[1]) != repr(tmpsf)):
+            print "* %d\tOld: [%s]\tNew: [%s]" % (
+                RECORD_COUNT, subfield[1].encode('utf8'), tmpsf.encode('utf8')
+            )
+
     return new_field