From: Dan Scott <dan@coffeecode.net>
Date: Thu, 11 Aug 2011 20:45:45 +0000 (-0400)
Subject: Add a few more T&F data uncorruption fixes
X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=790c2fbf6e935beace2df53b4019ff78ff5baf39;p=contrib%2FConifer.git

Add a few more T&F data uncorruption fixes

Note that we're now seeing conflicting patterns, so some
legitimate characters are getting corrupted by the script,
while others could go either way (l with stroke vs. modifier prime).
As long as the fix rate is 95% or better, I'm happy.

Signed-off-by: Dan Scott <dscott@laurentian.ca>
---

diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py
index 745ed589f5..242eea2bf4 100644
--- a/tools/ebooks/prep_ebook_records.py
+++ b/tools/ebooks/prep_ebook_records.py
@@ -389,6 +389,16 @@ def clean_diacritics(field):
         tmpsf = tmpsf.replace(u'\xd4C', u'\u0106')
         tmpsf = tmpsf.replace(u'\xd4c', u'\u0107')
 
+        # LATIN LETTER L WITH STROKE
+        tmpsf = tmpsf.replace(u'\u00b0', u'\u0141')
+
+        lstroke = tmpsf.find(u'\00b1')
+        if lstroke and tmpsf[lstroke + 1] == 'i':
+            # Modifier prime instead
+            tmpsf = tmpsf.replace(u'\u00b1', u'\u02b9')
+        else:
+            tmpsf = tmpsf.replace(u'\u00b1', u'\u0142')
+
         # COMBINING MODIFIER LETTER HALF RING
         tmpsf = tmpsf.replace(u'\xb1', u'\u02be')
 
@@ -398,6 +408,7 @@ def clean_diacritics(field):
         # COMBINING CEDILLA
         tmpsf = tmpsf.replace(u'\xb0c', u'c\u0327')
         tmpsf = tmpsf.replace(u'\u01afS', u'S\u0327')
+        tmpsf = tmpsf.replace(u'\u01afs', u's\u0327')
 
         # S WITH COMBINING ACUTE ACCENT
         tmpsf = tmpsf.replace(u'\xd4S', u'\u015a')
@@ -415,10 +426,18 @@ def clean_diacritics(field):
         tmpsf = tmpsf.replace(u'\xdaR', u'\u0158')
         tmpsf = tmpsf.replace(u'\xdar', u'\u0159')
 
+        # E BREVE
+        tmpsf = tmpsf.replace(u'\xe6E', u'\u0114')
+        tmpsf = tmpsf.replace(u'\xe6e', u'\u0115')
+
         # S CARON
         tmpsf = tmpsf.replace(u'\xdaS', u'\u0160')
         tmpsf = tmpsf.replace(u'\xdas', u'\u0161')
 
+        # U CARON
+        tmpsf = tmpsf.replace(u'\u00e6U', u'\u01d3')
+        tmpsf = tmpsf.replace(u'\u00e6u', u'\u01d4')
+
         # G BREVE
         tmpsf = tmpsf.replace(u'\xe6G', u'\u011e')
         tmpsf = tmpsf.replace(u'\xe6g', u'\u011f')
@@ -432,15 +451,24 @@ def clean_diacritics(field):
 
         # COMBINING LIGATURE LEFT HALF
         tmpsf = tmpsf.replace(u'\xd9i', u'i\ufe20')
+        tmpsf = tmpsf.replace(u'\xd9I', u'I\ufe20')
         tmpsf = tmpsf.replace(u'\xd9t', u't\ufe20')
 
         # COMBINING LIGATURE RIGHT HALF
         tmpsf = tmpsf.replace(u'\xfda', u'a\ufe21')
         tmpsf = tmpsf.replace(u'\xfds', u's\ufe21')
+        tmpsf = tmpsf.replace(u'\xfdU', u'U\ufe21')
 
         # MODIFIER LETTER PRIME
         tmpsf = tmpsf.replace(u'\xf0', u'\u02b9')
 
+        # LATIN SMALL LETTER DOTLESS I
+        tmpsf = tmpsf.replace(u'\u00a9', u'\u0131')
+
+        # LATIN LETTER E WITH DOT ABOVE
+        tmpsf = tmpsf.replace(u'\u00feE', u'\u0116')
+        tmpsf = tmpsf.replace(u'\u00fee', u'\u0117')
+
         new_field.add_subfield(subfield[0], tmpsf)
         global RECORD_COUNT
         if r'\x' in repr(tmpsf):