From 85794c0d7ba78023c807e55fda0a4d2d6c4bde1b Mon Sep 17 00:00:00 2001 From: Dan Scott Date: Thu, 1 Sep 2011 14:06:04 -0400 Subject: [PATCH] It's nice when a script actually runs Signed-off-by: Dan Scott --- tools/ebooks/check_encoding.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/ebooks/check_encoding.py b/tools/ebooks/check_encoding.py index 42f147c204..c0c7cc006b 100644 --- a/tools/ebooks/check_encoding.py +++ b/tools/ebooks/check_encoding.py @@ -3,7 +3,7 @@ Flag potential corrupted characters in MARC records This is a very simple script that simply prints out the subfields -containing values matching \x in the repr() representation of the subfield +containing values matching \\x in the repr() representation of the subfield using the tab-delimited format of record number, tag, code, Unicode value, and repr() value of the matching subfield. @@ -13,7 +13,8 @@ script assumes that they're valid and does not flag them. import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2 -marcfile = 'gibson_cppc-2011-05-24_362.mrc.utf8' +marcfile = '/tmp/ravit/duke-2011-06-06_21.mrc.utf8' +marcfile = '/tmp/ravit/Duke_MARC_records_April_2011.mrc.utf8' record_cnt = 0 @@ -32,7 +33,7 @@ for record in reader: # print "Field %d, tag %s" % (field_cnt, field.tag) for subfield in field: # print repr(subfield[1]) - if r'\x' not in repr(subfield[1]): + if r'\x' not in repr(subfield[1]) and r'\u' not in repr(subfield[1]): continue print "%d\t%s\t%s\t%s\t%s" % ( -- 2.11.0