From 64df62ee28320f7df4c013bbf9ca65a99e68a748 Mon Sep 17 00:00:00 2001 From: Dan Scott Date: Fri, 29 Jul 2011 13:21:30 -0400 Subject: [PATCH] Iterating through CRKN_OUP.mrc for more corruption to fix Tildes, acute accents, cedillas, we got em all. Signed-off-by: Dan Scott --- tools/ebooks/prep_ebook_records.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py index c49e0dab9b..f81554c2ff 100644 --- a/tools/ebooks/prep_ebook_records.py +++ b/tools/ebooks/prep_ebook_records.py @@ -361,15 +361,31 @@ def clean_diacritics(field): continue # Let the substitutions commence - maybe move to a map table? + + # COMBINING MACRON tmpsf = subfield[1].replace(u'\xd5a', u'a\u0304') + tmpsf = tmpsf.replace(u'\xd5e', u'e\u0304') tmpsf = tmpsf.replace(u'\xd5i', u'i\u0304') + tmpsf = tmpsf.replace(u'\xd5o', u'o\u0304') + tmpsf = tmpsf.replace(u'\xd5u', u'u\u0304') + + # COMBINING MODIFIER LETTER HALF RING tmpsf = tmpsf.replace(u'i\xb1', u'i\u02be') + # COMBINING TILDE + tmpsf = tmpsf.replace(u'\xf5n', u'n\u0303') + + # COMBINING CEDILLA + tmpsf = tmpsf.replace(u'\xb0c', u'c\u0327') + + # COMBINING ACUTE ACCENT + tmpsf = tmpsf.replace(u'\xd4s', u's\u0301') + new_field.add_subfield(subfield[0], tmpsf) if r'\x' in repr(tmpsf): global RECORD_COUNT print " * %d Hex value found in %s:%s - [%s] [%s]" % ( - RECORD_COUNT, field.tag, subfield[0], tmpsf, repr(tmpsf) + RECORD_COUNT, field.tag, subfield[0], tmpsf.encode('utf8'), repr(tmpsf) ) return new_field -- 2.11.0