From 4bb5a91f6c31145205c823a47d6e030392863baf Mon Sep 17 00:00:00 2001
From: Dan Scott <dan@coffeecode.net>
Date: Wed, 24 Aug 2011 13:00:16 -0400
Subject: [PATCH] Add a script for quickly checking for encoding errors

Signed-off-by: Dan Scott <dscott@laurentian.ca>
---
 tools/ebooks/check_encoding.py | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 tools/ebooks/check_encoding.py

diff --git a/tools/ebooks/check_encoding.py b/tools/ebooks/check_encoding.py
new file mode 100644
index 0000000000..42f147c204
--- /dev/null
+++ b/tools/ebooks/check_encoding.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+"""
+Flag potential corrupted characters in MARC records
+
+This is a very simple script that simply prints out the subfields
+containing values matching \x in the repr() representation of the subfield
+using the tab-delimited format of record number, tag, code, Unicode value,
+and repr() value of the matching subfield.
+
+Shortcomings: if the subfield contains Unicode sequences (\u####) then the
+script assumes that they're valid and does not flag them.
+"""
+
+import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2
+
+marcfile = 'gibson_cppc-2011-05-24_362.mrc.utf8'
+
+record_cnt = 0
+
+reader = pymarc.MARCReader(
+    open(marcfile, mode='rb'), to_unicode=True
+)
+for record in reader:
+    record_cnt += 1
+    # print "Record %d" % record_cnt
+
+    field_cnt = 0
+    for field in record.get_fields():
+        if field.is_control_field():
+            continue
+
+        # print "Field %d, tag %s" % (field_cnt, field.tag)
+        for subfield in field:
+            # print repr(subfield[1])
+            if r'\x' not in repr(subfield[1]):
+                continue
+
+            print "%d\t%s\t%s\t%s\t%s" % (
+                record_cnt, field.tag, subfield[0], subfield[1].encode('utf8'), repr(subfield[1])
+            )
+
-- 
2.11.0