From: Dan Scott Date: Thu, 4 Oct 2012 21:16:15 +0000 (-0400) Subject: Ebooks: add an option to clean diacritics X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=7ecaf3385c207cc6a41a6fb64ad1fd826b025cd4;p=contrib%2FConifer.git Ebooks: add an option to clean diacritics Some sets of records seem to come with cleaner diacritics than others. Huzzah for an increasingly UTF8 world? Signed-off-by: Dan Scott --- diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py index 6908a7eccd..21b7a46e74 100644 --- a/tools/ebooks/prep_ebook_records.py +++ b/tools/ebooks/prep_ebook_records.py @@ -134,6 +134,9 @@ Required arguments: -W / --windsor : Add an 856 for University of Windsor Optional arguments: + -C / --clean : Try to clean up diacritics - some of the records we get + have corrupted diacritics. + -d / --duplicate : The name of the file to route ISBN duplicates to. -t / --tcn : The name of the file to route TCN duplicates to. @@ -162,6 +165,7 @@ def consolidate_options(opts): '-o': '--output', '-a': '--authorization', '-c': '--consortium', + '-C': '--clean', '-d': '--duplicate', '-e': '--ebrary', '-p': '--publisher', @@ -234,6 +238,8 @@ def check_options(options): clean_opts['consortium'] = options['--consortium'].decode('utf-8') clean_opts['authorization'] = options['--authorization'].decode('utf-8') + if '--clean' in options: + clean_opts['clean'] = True if '--duplicate' in options: clean_opts['duplicate'] = options['--duplicate'] @@ -372,10 +378,10 @@ def check_libraries(options): def parse_opts(): """Get command-line arguments from the script""" try: - _short_opts = 'i:o:a:c:p:P:ABLWe:d:t:u:n:s:h' + _short_opts = 'i:o:a:c:p:P:ABCLWe:d:t:u:n:s:h' _long_opts = ['input=', 'output=', 'authorization=', 'consortium=', 'publisher=', 'platform=', 'algoma', 'boreal', 'laurentian', - 'windsor', 'ebrary', 'duplicate=', 'tcn=', 'url=', 'note=', + 'windsor', 'ebrary', 'clean', 'duplicate=', 'tcn=', 'url=', 'note=', 'sample=', 'help' ] opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts) @@ -513,7 +519,8 @@ def process_fields(record, options, bib_id, dup_flag): marked_isbn = mark_isbn_for_sfx(record, options) for field in record.get_fields(): - field = clean_diacritics(field) + if 'clean' in options: + field = clean_diacritics(field) # Process all of the 856 fields if field.tag == '856': new_fields = process_urls(field, options, publisher)