From: Dan Scott Date: Thu, 11 Aug 2011 20:45:45 +0000 (-0400) Subject: Add a few more T&F data uncorruption fixes X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=790c2fbf6e935beace2df53b4019ff78ff5baf39;p=contrib%2FConifer.git Add a few more T&F data uncorruption fixes Note that we're now seeing conflicting patterns, so some legitimate characters are getting corrupted by the script, while others could go either way (l stroke vs modifier prime). As long as the fix rate is 95%+ then I'm happy. Signed-off-by: Dan Scott --- diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py index 745ed589f5..242eea2bf4 100644 --- a/tools/ebooks/prep_ebook_records.py +++ b/tools/ebooks/prep_ebook_records.py @@ -389,6 +389,16 @@ def clean_diacritics(field): tmpsf = tmpsf.replace(u'\xd4C', u'\u0106') tmpsf = tmpsf.replace(u'\xd4c', u'\u0107') + # LATIN LETTER L WITH STROKE + tmpsf = tmpsf.replace(u'\u00b0', u'\u0141') + + lstroke = tmpsf.find(u'\00b1') + if lstroke and tmpsf[lstroke + 1] == 'i': + # Modifier prime instead + tmpsf = tmpsf.replace(u'\u00b1', u'\u02b9') + else: + tmpsf = tmpsf.replace(u'\u00b1', u'\u0142') + # COMBINING MODIFIER LETTER HALF RING tmpsf = tmpsf.replace(u'\xb1', u'\u02be') @@ -398,6 +408,7 @@ def clean_diacritics(field): # COMBINING CEDILLA tmpsf = tmpsf.replace(u'\xb0c', u'c\u0327') tmpsf = tmpsf.replace(u'\u01afS', u'S\u0327') + tmpsf = tmpsf.replace(u'\u01afs', u's\u0327') # S WITH COMBINING ACUTE ACCENT tmpsf = tmpsf.replace(u'\xd4S', u'\u015a') @@ -415,10 +426,18 @@ def clean_diacritics(field): tmpsf = tmpsf.replace(u'\xdaR', u'\u0158') tmpsf = tmpsf.replace(u'\xdar', u'\u0159') + # E BREVE + tmpsf = tmpsf.replace(u'\xe6E', u'\u0114') + tmpsf = tmpsf.replace(u'\xe6e', u'\u0115') + # S CARON tmpsf = tmpsf.replace(u'\xdaS', u'\u0160') tmpsf = tmpsf.replace(u'\xdas', u'\u0161') + # U CARON + tmpsf = tmpsf.replace(u'\u00e6U', u'\u01d3') + tmpsf = tmpsf.replace(u'\u00e6u', u'\u01d4') + # G BREVE tmpsf = tmpsf.replace(u'\xe6G', u'\u011e') tmpsf = tmpsf.replace(u'\xe6g', u'\u011f') @@ -432,15 +451,24 @@ def clean_diacritics(field): # COMBINING LIGATURE LEFT HALF tmpsf = tmpsf.replace(u'\xd9i', u'i\ufe20') + tmpsf = tmpsf.replace(u'\xd9I', u'I\ufe20') tmpsf = tmpsf.replace(u'\xd9t', u't\ufe20') # COMBINING LIGATURE RIGHT HALF tmpsf = tmpsf.replace(u'\xfda', u'a\ufe21') tmpsf = tmpsf.replace(u'\xfds', u's\ufe21') + tmpsf = tmpsf.replace(u'\xfdU', u'U\ufe21') # MODIFIER LETTER PRIME tmpsf = tmpsf.replace(u'\xf0', u'\u02b9') + # LATIN SMALL LETTER DOTLESS I + tmpsf = tmpsf.replace(u'\u00a9', u'\u0131') + + # LATIN LETTER E WITH DOT ABOVE + tmpsf = tmpsf.replace(u'\u00feE', u'\u0116') + tmpsf = tmpsf.replace(u'\u00fee', u'\u0117') + new_field.add_subfield(subfield[0], tmpsf) global RECORD_COUNT if r'\x' in repr(tmpsf):