It's nice when a script actually runs

author Dan Scott <dan@coffeecode.net>

Thu, 1 Sep 2011 18:06:04 +0000 (14:06 -0400)

committer Dan Scott <dscott@laurentian.ca>

Tue, 7 May 2013 18:38:17 +0000 (14:38 -0400)
author Dan Scott <dan@coffeecode.net>
Thu, 1 Sep 2011 18:06:04 +0000 (14:06 -0400)
committer Dan Scott <dscott@laurentian.ca>
Tue, 7 May 2013 18:38:17 +0000 (14:38 -0400)
diff --git a/tools/ebooks/check_encoding.py b/tools/ebooks/check_encoding.py

index 42f147c..c0c7cc0 100644 (file)
--- a/tools/ebooks/check_encoding.py
+++ b/tools/ebooks/check_encoding.py
@@ -3,7 +3,7 @@
  Flag potential corrupted characters in MARC records
  
  This is a very simple script that simply prints out the subfields
-containing values matching \x in the repr() representation of the subfield
+containing values matching \\x in the repr() representation of the subfield
  using the tab-delimited format of record number, tag, code, Unicode value,
  and repr() value of the matching subfield.
  
@@ -13,7 +13,8 @@ script assumes that they're valid and does not flag them.
  
  import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2
  
-marcfile = 'gibson_cppc-2011-05-24_362.mrc.utf8'
+marcfile = '/tmp/ravit/duke-2011-06-06_21.mrc.utf8'
+marcfile = '/tmp/ravit/Duke_MARC_records_April_2011.mrc.utf8'
  
  record_cnt = 0
  
@@ -32,7 +33,7 @@ for record in reader:
          # print "Field %d, tag %s" % (field_cnt, field.tag)
          for subfield in field:
              # print repr(subfield[1])
-            if r'\x' not in repr(subfield[1]):
+            if r'\x' not in repr(subfield[1]) and r'\u' not in repr(subfield[1]):
                  continue
  
              print "%d\t%s\t%s\t%s\t%s" % (
author	Dan Scott <dan@coffeecode.net>
	Thu, 1 Sep 2011 18:06:04 +0000 (14:06 -0400)
committer	Dan Scott <dscott@laurentian.ca>
	Tue, 7 May 2013 18:38:17 +0000 (14:38 -0400)