--- /dev/null
+#!/usr/bin/env python
+r"""
+Flag potentially corrupted characters in MARC records
+
+This is a simple script that prints out the subfields whose repr()
+representation contains \x. Each match is written as one tab-delimited
+line: record number, tag, code, UTF-8 encoded value, and repr() value of
+the matching subfield.
+
+Shortcomings: if the subfield contains Unicode sequences (\u####) then the
+script assumes that they're valid and does not flag them.
+"""
+
+import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2
+
+marcfile = 'gibson_cppc-2011-05-24_362.mrc.utf8'
+
+record_cnt = 0
+
+reader = pymarc.MARCReader(
+ open(marcfile, mode='rb'), to_unicode=True
+)
+for record in reader:
+ record_cnt += 1
+ # print "Record %d" % record_cnt
+
+ field_cnt = 0
+ for field in record.get_fields():
+ if field.is_control_field():
+ continue
+
+ # print "Field %d, tag %s" % (field_cnt, field.tag)
+ for subfield in field:
+ # print repr(subfield[1])
+ if r'\x' not in repr(subfield[1]):
+ continue
+
+ print "%d\t%s\t%s\t%s\t%s" % (
+ record_cnt, field.tag, subfield[0], subfield[1].encode('utf8'), repr(subfield[1])
+ )
+