Add a script for quickly checking for encoding errors
author Dan Scott <dan@coffeecode.net>
Wed, 24 Aug 2011 17:00:16 +0000 (13:00 -0400)
committer Dan Scott <dscott@laurentian.ca>
Wed, 8 May 2013 13:43:18 +0000 (09:43 -0400)
Signed-off-by: Dan Scott <dscott@laurentian.ca>
tools/ebooks/check_encoding.py [new file with mode: 0644]

diff --git a/tools/ebooks/check_encoding.py b/tools/ebooks/check_encoding.py
new file mode 100644 (file)
index 0000000..42f147c
--- /dev/null
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+"""
+Flag potential corrupted characters in MARC records
+
+This is a very simple script that simply prints out the subfields
+containing values matching \x in the repr() representation of the subfield
+using the tab-delimited format of record number, tag, code, Unicode value,
+and repr() value of the matching subfield.
+
+Shortcomings: if the subfield contains Unicode sequences (\u####) then the
+script assumes that they're valid and does not flag them.
+"""
+
+import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2
+
+marcfile = 'gibson_cppc-2011-05-24_362.mrc.utf8'
+
+record_cnt = 0
+
+reader = pymarc.MARCReader(
+    open(marcfile, mode='rb'), to_unicode=True
+)
+for record in reader:
+    record_cnt += 1
+    # print "Record %d" % record_cnt
+
+    field_cnt = 0
+    for field in record.get_fields():
+        if field.is_control_field():
+            continue
+
+        # print "Field %d, tag %s" % (field_cnt, field.tag)
+        for subfield in field:
+            # print repr(subfield[1])
+            if r'\x' not in repr(subfield[1]):
+                continue
+
+            print "%d\t%s\t%s\t%s\t%s" % (
+                record_cnt, field.tag, subfield[0], subfield[1].encode('utf8'), repr(subfield[1])
+            )
+