Add a few more T&F data uncorruption fixes

author Dan Scott <dan@coffeecode.net>

Thu, 11 Aug 2011 20:45:45 +0000 (16:45 -0400)

committer Dan Scott <dscott@laurentian.ca>

Mon, 12 Nov 2012 17:57:54 +0000 (12:57 -0500)
author Dan Scott <dan@coffeecode.net>
Thu, 11 Aug 2011 20:45:45 +0000 (16:45 -0400)
committer Dan Scott <dscott@laurentian.ca>
Mon, 12 Nov 2012 17:57:54 +0000 (12:57 -0500)
diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py

index 745ed58..242eea2 100644 (file)
--- a/tools/ebooks/prep_ebook_records.py
+++ b/tools/ebooks/prep_ebook_records.py
@@ -389,6 +389,16 @@ def clean_diacritics(field):
          tmpsf = tmpsf.replace(u'\xd4C', u'\u0106')
          tmpsf = tmpsf.replace(u'\xd4c', u'\u0107')
  
+        # LATIN LETTER L WITH STROKE
+        tmpsf = tmpsf.replace(u'\u00b0', u'\u0141')
+
+        lstroke = tmpsf.find(u'\00b1')
+        if lstroke and tmpsf[lstroke + 1] == 'i':
+            # Modifier prime instead
+            tmpsf = tmpsf.replace(u'\u00b1', u'\u02b9')
+        else:
+            tmpsf = tmpsf.replace(u'\u00b1', u'\u0142')
+
          # COMBINING MODIFIER LETTER HALF RING
          tmpsf = tmpsf.replace(u'\xb1', u'\u02be')
  
@@ -398,6 +408,7 @@ def clean_diacritics(field):
          # COMBINING CEDILLA
          tmpsf = tmpsf.replace(u'\xb0c', u'c\u0327')
          tmpsf = tmpsf.replace(u'\u01afS', u'S\u0327')
+        tmpsf = tmpsf.replace(u'\u01afs', u's\u0327')
  
          # S WITH COMBINING ACUTE ACCENT
          tmpsf = tmpsf.replace(u'\xd4S', u'\u015a')
@@ -415,10 +426,18 @@ def clean_diacritics(field):
          tmpsf = tmpsf.replace(u'\xdaR', u'\u0158')
          tmpsf = tmpsf.replace(u'\xdar', u'\u0159')
  
+        # E BREVE
+        tmpsf = tmpsf.replace(u'\xe6E', u'\u0114')
+        tmpsf = tmpsf.replace(u'\xe6e', u'\u0115')
+
          # S CARON
          tmpsf = tmpsf.replace(u'\xdaS', u'\u0160')
          tmpsf = tmpsf.replace(u'\xdas', u'\u0161')
  
+        # U CARON
+        tmpsf = tmpsf.replace(u'\u00e6U', u'\u01d3')
+        tmpsf = tmpsf.replace(u'\u00e6u', u'\u01d4')
+
          # G BREVE
          tmpsf = tmpsf.replace(u'\xe6G', u'\u011e')
          tmpsf = tmpsf.replace(u'\xe6g', u'\u011f')
@@ -432,15 +451,24 @@ def clean_diacritics(field):
  
          # COMBINING LIGATURE LEFT HALF
          tmpsf = tmpsf.replace(u'\xd9i', u'i\ufe20')
+        tmpsf = tmpsf.replace(u'\xd9I', u'I\ufe20')
          tmpsf = tmpsf.replace(u'\xd9t', u't\ufe20')
  
          # COMBINING LIGATURE RIGHT HALF
          tmpsf = tmpsf.replace(u'\xfda', u'a\ufe21')
          tmpsf = tmpsf.replace(u'\xfds', u's\ufe21')
+        tmpsf = tmpsf.replace(u'\xfdU', u'U\ufe21')
  
          # MODIFIER LETTER PRIME
          tmpsf = tmpsf.replace(u'\xf0', u'\u02b9')
  
+        # LATIN SMALL LETTER DOTLESS I
+        tmpsf = tmpsf.replace(u'\u00a9', u'\u0131')
+
+        # LATIN LETTER E WITH DOT ABOVE
+        tmpsf = tmpsf.replace(u'\u00feE', u'\u0116')
+        tmpsf = tmpsf.replace(u'\u00fee', u'\u0117')
+
          new_field.add_subfield(subfield[0], tmpsf)
          global RECORD_COUNT
          if r'\x' in repr(tmpsf):
author	Dan Scott <dan@coffeecode.net>
	Thu, 11 Aug 2011 20:45:45 +0000 (16:45 -0400)
committer	Dan Scott <dscott@laurentian.ca>
	Mon, 12 Nov 2012 17:57:54 +0000 (12:57 -0500)