Ebooks: add an option to clean diacritics

author Dan Scott <dscott@laurentian.ca>

Thu, 4 Oct 2012 21:16:15 +0000 (17:16 -0400)

committer Dan Scott <dscott@laurentian.ca>

Tue, 7 May 2013 18:58:10 +0000 (14:58 -0400)
author Dan Scott <dscott@laurentian.ca>
Thu, 4 Oct 2012 21:16:15 +0000 (17:16 -0400)
committer Dan Scott <dscott@laurentian.ca>
Tue, 7 May 2013 18:58:10 +0000 (14:58 -0400)
diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py

index 6908a7e..21b7a46 100644 (file)
--- a/tools/ebooks/prep_ebook_records.py
+++ b/tools/ebooks/prep_ebook_records.py
@@ -134,6 +134,9 @@ Required arguments:
      -W / --windsor : Add an 856 for University of Windsor
  
  Optional arguments:
+    -C / --clean : Try to clean up diacritics - some of the records we get
+                   have corrupted diacritics.
+
      -d / --duplicate : The name of the file to route ISBN duplicates to.
  
      -t / --tcn : The name of the file to route TCN duplicates to.
@@ -162,6 +165,7 @@ def consolidate_options(opts):
          '-o': '--output',
          '-a': '--authorization',
          '-c': '--consortium',
+        '-C': '--clean',
          '-d': '--duplicate',
          '-e': '--ebrary',
          '-p': '--publisher',
@@ -234,6 +238,8 @@ def check_options(options):
      clean_opts['consortium'] = options['--consortium'].decode('utf-8')
      clean_opts['authorization'] = options['--authorization'].decode('utf-8')
  
+    if '--clean' in options:
+        clean_opts['clean'] = True
  
      if '--duplicate' in options:
          clean_opts['duplicate'] = options['--duplicate']
@@ -372,10 +378,10 @@ def check_libraries(options):
  def parse_opts():
      """Get command-line arguments from the script"""
      try:
-        _short_opts = 'i:o:a:c:p:P:ABLWe:d:t:u:n:s:h'
+        _short_opts = 'i:o:a:c:p:P:ABCLWe:d:t:u:n:s:h'
          _long_opts = ['input=', 'output=', 'authorization=', 'consortium=',
              'publisher=', 'platform=', 'algoma', 'boreal', 'laurentian',
-            'windsor', 'ebrary', 'duplicate=', 'tcn=', 'url=', 'note=',
+            'windsor', 'ebrary', 'clean', 'duplicate=', 'tcn=', 'url=', 'note=',
              'sample=', 'help'
          ]
          opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts)
@@ -513,7 +519,8 @@ def process_fields(record, options, bib_id, dup_flag):
      marked_isbn = mark_isbn_for_sfx(record, options)
  
      for field in record.get_fields():
-        field = clean_diacritics(field)
+        if 'clean' in options:
+            field = clean_diacritics(field)
          # Process all of the 856 fields
          if field.tag == '856':
              new_fields = process_urls(field, options, publisher)
author	Dan Scott <dscott@laurentian.ca>
	Thu, 4 Oct 2012 21:16:15 +0000 (17:16 -0400)
committer	Dan Scott <dscott@laurentian.ca>
	Tue, 7 May 2013 18:58:10 +0000 (14:58 -0400)