# Provenance: introduced by the Conifer commit "Begin addressing the specific
# manglings of the CRKN records" (Dan Scott, 2011-07-29) in
# tools/ebooks/prep_ebook_records.py. The same commit makes process_fields()
# pass every field through this function:
#     for field in record.get_fields():
#         field = clean_diacritics(field)

# Known mangled character sequences and their repairs, applied in this order.
_DIACRITIC_FIXES = (
    (u'\xd5a', u'a\u0304'),  # a + U+0304 COMBINING MACRON
    (u'\xd5i', u'i\u0304'),  # i + U+0304 COMBINING MACRON
    (u'i\xb1', u'i\u02be'),  # i + U+02BE MODIFIER LETTER RIGHT HALF RING
)


def _has_suspect_chars(value):
    """Return True if value holds a control or Latin-1 range character."""
    # This mirrors the characters Python 2's repr() renders as hex escapes
    # (what the original substring test on repr() was looking for), without
    # the false positive on a literal backslash followed by "x" and without
    # depending on the interpreter's repr() escaping rules.
    for char in value:
        code_point = ord(char)
        if code_point < 0x20:
            if char not in u'\t\n\r':
                return True
        elif 0x7f <= code_point <= 0xff:
            return True
    return False


def clean_diacritics(field):
    """
    Change specific patterns of bytes into other patterns of bytes

    We get some horribly corrupted records. This is an attempt to reverse the
    horror via equally horrible byte-matching for known messed up conditions.

    Control fields are returned untouched. For any other field a new
    pymarc.Field is built with the same tag and indicators; each subfield is
    copied across, and values containing suspect characters first have the
    substitutions in _DIACRITIC_FIXES applied. If suspect characters remain
    after the substitutions, a diagnostic line is printed so that the
    remaining corruption patterns can be identified.

    Arguments:
    field -- the field to clean; iterating it must yield subfields whose
             item [0] is passed as the first argument of add_subfield() and
             whose item [1] is the text value

    Returns the original field (control fields) or the rebuilt field.
    """

    if field.is_control_field():
        return field

    new_field = pymarc.Field(
        tag=field.tag,
        indicators=[field.indicator1, field.indicator2]
    )

    for subfield in field:
        code, value = subfield[0], subfield[1]

        # Empty values and clean values are copied across as-is.
        if value and _has_suspect_chars(value):
            for mangled, fixed in _DIACRITIC_FIXES:
                value = value.replace(mangled, fixed)

            # Report whatever the known substitutions could not repair.
            # RECORD_COUNT is a module-level name defined outside this
            # function; it is only read here, so no "global" statement is
            # needed.
            if _has_suspect_chars(value):
                print(" * %d Hex value found in %s:%s - [%s] [%s]" % (
                    RECORD_COUNT, field.tag, code, value, repr(value)
                ))

        new_field.add_subfield(code, value)

    return new_field