From 790c2fbf6e935beace2df53b4019ff78ff5baf39 Mon Sep 17 00:00:00 2001
From: Dan Scott <dan@coffeecode.net>
Date: Thu, 11 Aug 2011 16:45:45 -0400
Subject: [PATCH] Add a few more T&F data uncorruption fixes

Note that we're now seeing conflicting patterns: some legitimate
characters are getting corrupted by the script, while others are
ambiguous and could go either way (l with stroke vs. modifier prime).
As long as the fix rate stays at 95% or better, I'm happy.

Signed-off-by: Dan Scott <dscott@laurentian.ca>
---
 tools/ebooks/prep_ebook_records.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py
index 745ed589f5..242eea2bf4 100644
--- a/tools/ebooks/prep_ebook_records.py
+++ b/tools/ebooks/prep_ebook_records.py
@@ -389,6 +389,16 @@ def clean_diacritics(field):
         tmpsf = tmpsf.replace(u'\xd4C', u'\u0106')
         tmpsf = tmpsf.replace(u'\xd4c', u'\u0107')
 
+        # LATIN LETTER L WITH STROKE
+        tmpsf = tmpsf.replace(u'\u00b0', u'\u0141')
+
+        lstroke = tmpsf.find(u'\00b1')
+        if lstroke and tmpsf[lstroke + 1] == 'i':
+            # Followed by 'i': map to MODIFIER LETTER PRIME instead
+            tmpsf = tmpsf.replace(u'\u00b1', u'\u02b9')
+        else:
+            tmpsf = tmpsf.replace(u'\u00b1', u'\u0142')
+
         # COMBINING MODIFIER LETTER HALF RING
         tmpsf = tmpsf.replace(u'\xb1', u'\u02be')
 
@@ -398,6 +408,7 @@ def clean_diacritics(field):
         # COMBINING CEDILLA
         tmpsf = tmpsf.replace(u'\xb0c', u'c\u0327')
         tmpsf = tmpsf.replace(u'\u01afS', u'S\u0327')
+        tmpsf = tmpsf.replace(u'\u01afs', u's\u0327')
 
         # S WITH COMBINING ACUTE ACCENT
         tmpsf = tmpsf.replace(u'\xd4S', u'\u015a')
@@ -415,10 +426,18 @@ def clean_diacritics(field):
         tmpsf = tmpsf.replace(u'\xdaR', u'\u0158')
         tmpsf = tmpsf.replace(u'\xdar', u'\u0159')
 
+        # E BREVE
+        tmpsf = tmpsf.replace(u'\xe6E', u'\u0114')
+        tmpsf = tmpsf.replace(u'\xe6e', u'\u0115')
+
         # S CARON
         tmpsf = tmpsf.replace(u'\xdaS', u'\u0160')
         tmpsf = tmpsf.replace(u'\xdas', u'\u0161')
 
+        # U CARON
+        tmpsf = tmpsf.replace(u'\u00e6U', u'\u01d3')
+        tmpsf = tmpsf.replace(u'\u00e6u', u'\u01d4')
+
         # G BREVE
         tmpsf = tmpsf.replace(u'\xe6G', u'\u011e')
         tmpsf = tmpsf.replace(u'\xe6g', u'\u011f')
@@ -432,15 +451,24 @@ def clean_diacritics(field):
 
         # COMBINING LIGATURE LEFT HALF
         tmpsf = tmpsf.replace(u'\xd9i', u'i\ufe20')
+        tmpsf = tmpsf.replace(u'\xd9I', u'I\ufe20')
         tmpsf = tmpsf.replace(u'\xd9t', u't\ufe20')
 
         # COMBINING LIGATURE RIGHT HALF
         tmpsf = tmpsf.replace(u'\xfda', u'a\ufe21')
         tmpsf = tmpsf.replace(u'\xfds', u's\ufe21')
+        tmpsf = tmpsf.replace(u'\xfdU', u'U\ufe21')
 
         # MODIFIER LETTER PRIME
         tmpsf = tmpsf.replace(u'\xf0', u'\u02b9')
 
+        # LATIN SMALL LETTER DOTLESS I
+        tmpsf = tmpsf.replace(u'\u00a9', u'\u0131')
+
+        # LATIN LETTER E WITH DOT ABOVE
+        tmpsf = tmpsf.replace(u'\u00feE', u'\u0116')
+        tmpsf = tmpsf.replace(u'\u00fee', u'\u0117')
+
         new_field.add_subfield(subfield[0], tmpsf)
         global RECORD_COUNT
         if r'\x' in repr(tmpsf):
-- 
2.11.0