From 790c2fbf6e935beace2df53b4019ff78ff5baf39 Mon Sep 17 00:00:00 2001 From: Dan Scott Date: Thu, 11 Aug 2011 16:45:45 -0400 Subject: [PATCH] Add a few more T&F data uncorruption fixes Note that we're now seeing conflicting patterns, so some legitimate characters are getting corrupted by the script, while others could go either way (l stroke vs modifier prime). As long as the fix rate is 95%+ then I'm happy. Signed-off-by: Dan Scott --- tools/ebooks/prep_ebook_records.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py index 745ed589f5..242eea2bf4 100644 --- a/tools/ebooks/prep_ebook_records.py +++ b/tools/ebooks/prep_ebook_records.py @@ -389,6 +389,16 @@ def clean_diacritics(field): tmpsf = tmpsf.replace(u'\xd4C', u'\u0106') tmpsf = tmpsf.replace(u'\xd4c', u'\u0107') + # LATIN LETTER L WITH STROKE + tmpsf = tmpsf.replace(u'\u00b0', u'\u0141') + + lstroke = tmpsf.find(u'\00b1') + if lstroke and tmpsf[lstroke + 1] == 'i': + # Modifier prime instead + tmpsf = tmpsf.replace(u'\u00b1', u'\u02b9') + else: + tmpsf = tmpsf.replace(u'\u00b1', u'\u0142') + # COMBINING MODIFIER LETTER HALF RING tmpsf = tmpsf.replace(u'\xb1', u'\u02be') @@ -398,6 +408,7 @@ def clean_diacritics(field): # COMBINING CEDILLA tmpsf = tmpsf.replace(u'\xb0c', u'c\u0327') tmpsf = tmpsf.replace(u'\u01afS', u'S\u0327') + tmpsf = tmpsf.replace(u'\u01afs', u's\u0327') # S WITH COMBINING ACUTE ACCENT tmpsf = tmpsf.replace(u'\xd4S', u'\u015a') @@ -415,10 +426,18 @@ def clean_diacritics(field): tmpsf = tmpsf.replace(u'\xdaR', u'\u0158') tmpsf = tmpsf.replace(u'\xdar', u'\u0159') + # E BREVE + tmpsf = tmpsf.replace(u'\xe6E', u'\u0114') + tmpsf = tmpsf.replace(u'\xe6e', u'\u0115') + # S CARON tmpsf = tmpsf.replace(u'\xdaS', u'\u0160') tmpsf = tmpsf.replace(u'\xdas', u'\u0161') + # U CARON + tmpsf = tmpsf.replace(u'\u00e6U', u'\u01d3') + tmpsf = tmpsf.replace(u'\u00e6u', u'\u01d4') + # G BREVE tmpsf = tmpsf.replace(u'\xe6G', u'\u011e') tmpsf = tmpsf.replace(u'\xe6g', u'\u011f') @@ -432,15 +451,24 @@ def clean_diacritics(field): # COMBINING LIGATURE LEFT HALF tmpsf = tmpsf.replace(u'\xd9i', u'i\ufe20') + tmpsf = tmpsf.replace(u'\xd9I', u'I\ufe20') tmpsf = tmpsf.replace(u'\xd9t', u't\ufe20') # COMBINING LIGATURE RIGHT HALF tmpsf = tmpsf.replace(u'\xfda', u'a\ufe21') tmpsf = tmpsf.replace(u'\xfds', u's\ufe21') + tmpsf = tmpsf.replace(u'\xfdU', u'U\ufe21') # MODIFIER LETTER PRIME tmpsf = tmpsf.replace(u'\xf0', u'\u02b9') + # LATIN SMALL LETTER DOTLESS I + tmpsf = tmpsf.replace(u'\u00a9', u'\u0131') + + # LATIN LETTER E WITH DOT ABOVE + tmpsf = tmpsf.replace(u'\u00feE', u'\u0116') + tmpsf = tmpsf.replace(u'\u00fee', u'\u0117') + new_field.add_subfield(subfield[0], tmpsf) global RECORD_COUNT if r'\x' in repr(tmpsf): -- 2.11.0