Flag potential corrupted characters in MARC records
This is a very simple script that simply prints out the subfields
-containing values matching \x in the repr() representation of the subfield
+containing values matching \\x or \\u in the repr() representation of the subfield
using the tab-delimited format of record number, tag, code, Unicode value,
and repr() value of the matching subfield.
import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2
-marcfile = 'gibson_cppc-2011-05-24_362.mrc.utf8'
+marcfile = '/tmp/ravit/duke-2011-06-06_21.mrc.utf8'
+marcfile = '/tmp/ravit/Duke_MARC_records_April_2011.mrc.utf8'
record_cnt = 0
# print "Field %d, tag %s" % (field_cnt, field.tag)
for subfield in field:
# print repr(subfield[1])
- if r'\x' not in repr(subfield[1]):
+ if r'\x' not in repr(subfield[1]) and r'\u' not in repr(subfield[1]):
continue
print "%d\t%s\t%s\t%s\t%s" % (