#!/usr/bin/env python
# Provenance: reconstructed from git patch
# 4bb5a91f6c31145205c823a47d6e030392863baf ("Add a script for quickly
# checking for encoding errors", Dan Scott, 2011-08-24), which created
# tools/ebooks/check_encoding.py.
r"""
Flag potentially corrupted characters in MARC records.

For every data-field subfield whose value contains a character that an
ASCII-only repr() renders as a \xNN escape (control characters other than
tab, newline and carriage return, plus everything from U+007F to U+00FF),
print one tab-delimited line made of: record number, tag, subfield code,
the value encoded as UTF-8, and the escaped ASCII rendering of the value.

Usage: check_encoding.py [MARCFILE]

Shortcomings: characters above U+00FF (rendered as \u#### escapes) are
assumed to be valid and are never flagged.
"""

import re
import sys

# Input used when no file is named on the command line.
marcfile = 'gibson_cppc-2011-05-24_362.mrc.utf8'

# Exactly the characters Python 2's repr() of a unicode string writes as
# \xNN.  Testing the value itself (instead of searching its repr for the
# text "\x") avoids flagging a literal backslash followed by "x", and gives
# the same answer on Python 3, whose repr() leaves printable Latin-1
# characters unescaped.
_SUSPECT_CHAR = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]')

try:
    _escape = ascii  # Python 3: ASCII-only repr
except NameError:
    _escape = repr  # Python 2: repr() of unicode is already ASCII-only


def is_suspect(value):
    """Return True if *value* contains a character rendered as \\xNN."""
    return _SUSPECT_CHAR.search(value) is not None


def format_row(record_cnt, tag, code, value):
    """Return the tab-delimited report line (no newline) for one subfield."""
    return "%d\t%s\t%s\t%s\t%s" % (
        record_cnt, tag, code, value, _escape(value)
    )


def find_suspect_subfields(path):
    """Yield (record number, tag, code, value) for each suspect subfield.

    Records are numbered from 1 in file order.  Control fields are skipped;
    only subfields of data fields are examined.
    """
    # Third-party dependency, imported here so the pure helpers above stay
    # importable on machines without pymarc.
    import pymarc

    with open(path, mode='rb') as handle:
        reader = pymarc.MARCReader(handle, to_unicode=True)
        for record_cnt, record in enumerate(reader, 1):
            if record is None:
                # NOTE(review): newer pymarc releases are documented to
                # yield None for a record they cannot parse; skip it but
                # keep counting so record numbers still match the file.
                continue
            for field in record.get_fields():
                if field.is_control_field():
                    continue
                for subfield in field:
                    code, value = subfield[0], subfield[1]
                    if is_suspect(value):
                        yield record_cnt, field.tag, code, value


def main(argv=None):
    """Print the report for the file named in *argv* (default: marcfile)."""
    args = sys.argv[1:] if argv is None else argv
    path = args[0] if args else marcfile
    # Always emit UTF-8 bytes, whatever the locale of the terminal or pipe.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    for row in find_suspect_subfields(path):
        out.write((format_row(*row) + '\n').encode('utf-8'))
    return 0


if __name__ == '__main__':
    sys.exit(main())