From: Dan Scott
Date: Tue, 24 Apr 2018 16:36:56 +0000 (-0400)
Subject: Support Python 3
X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=226e5cc11c964d807a48839348d8275e4233d70c;p=contrib%2FConifer.git

Support Python 3

Switch to the requests and beautifulsoup4 modules, use the interoperable
print() function, move to the Python 3 "except ... as ex" exception
syntax, and use dict.items() instead of the (gone) dict.iteritems().
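As a rough sketch of the pattern applied throughout (fetch_json() and the
sample options dict below are illustrative stand-ins, not code from the
script):

    import requests

    def fetch_json(url, params=None):
        """Python 3 replacement for the urllib2.urlopen()/json.load() pair."""
        resp = requests.get(url, params=params)
        resp.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx
        return resp.json()

    options = {'input': 'crkn.mrc', 'publisher': 'eBrary Inc.'}
    # dict.items() replaces the Python-2-only dict.iteritems()
    for optkey, optval in options.items():
        print("%s => %s" % (optkey, optval))

    try:
        fetch_json('https://example.org/gateway')
    except requests.exceptions.RequestException as ex:
        # "except ... as ex" replaces the old "except ExceptionType, ex" syntax
        print("* %s" % (str(ex),))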
Signed-off-by: Dan Scott
---

diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py
index 6de5e7ad06..b9a4a02a06 100755
--- a/tools/ebooks/prep_ebook_records.py
+++ b/tools/ebooks/prep_ebook_records.py
@@ -14,11 +14,13 @@
 requirements that would be the same for each record and therefore can be
 accommodated in batch load.
 """
-import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2, json
+import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, json
 import codecs, copy
-from urllib import quote
+import requests
+from urllib.parse import quote
 from datetime import date
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
+import traceback
 
 RECORD_COUNT = 0
 DUP_COUNT = 0
@@ -76,7 +77,7 @@ def do_help():
     Print help for the Conifer ebook MARC processor
     '''
-    print '''
+    print('''
 
 Conifer ebook MARC processor
 
 This script takes a set of MARC records and processes them to generate a set
@@ -151,7 +152,7 @@ Optional arguments:
 Examples:
     %s --algoma -i crkn.mrc -o /tmp/crkn_out.mrc -p "eBrary Inc."
-    ''' % sys.argv[0]
+    ''' % (sys.argv[0],))
 
     sys.exit(0)
 
 def consolidate_options(opts):
@@ -203,9 +204,9 @@
     if '--help' in options:
         do_help()
 
-    for reqkey, reqwarn in _req.iteritems():
+    for reqkey, reqwarn in _req.items():
         if reqkey not in options:
-            print reqwarn
+            print(reqwarn)
             _help = True
 
     _libraries = check_libraries(options)
@@ -260,7 +261,7 @@
 
     for optkey, optval in _string_opts.items():
         if optkey in options:
-            clean_opts[optval] = options[optkey].decode('utf-8')
+            clean_opts[optval] = options[optkey]
 
     clean_opts['libraries'] = _libraries
     clean_opts['input'] = _input
@@ -278,8 +279,8 @@ def evergreen_request(method, *args, **kwargs):
     params += ['param=%s' % quote(json.dumps(a)) for a in args]
     url = '%s?%s' % (GATEWAY_URL, '&'.join(params))
     #print '--->', url
-    req = urllib2.urlopen(url)
-    resp = json.load(req)
+    req = requests.get(url)
+    resp = req.json()
     if resp['status'] != 200:
         raise Exception('error during evergreen request', resp)
     payload = resp['payload']
@@ -387,8 +388,8 @@
             'cut-field=', 'help'
         ]
         opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts)
-    except getopt.GetoptError, ex:
-        print "* %s" % str(ex)
+    except getopt.GetoptError as ex:
+        print("* %s" % (str(ex),))
         do_help()
 
     _options = consolidate_options(opts[0])
@@ -407,7 +408,7 @@
         reader = pymarc.MARCReader(
             open(options['input'], mode='rb'), to_unicode=True
         )
-    except Exception, ex:
+    except Exception as ex:
         print("Could not open input file [%s]" % options['input'])
 
     for record in reader:
@@ -423,7 +424,7 @@
             % (RECORD_COUNT, options['input'])
         )
     else:
-        print ("%d - %s" % (RECORD_COUNT, record['856']))
+        print("%d - %s" % (RECORD_COUNT, record['856']))
 
     dupe_flags = {}
@@ -465,8 +466,9 @@
                 (RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0)
             )):
                 files['sample'].write(new_record)
-    except Exception, ex:
+    except Exception as ex:
         print("* Error processing record %d - %s" % (RECORD_COUNT, ex))
+        traceback.print_exc()
 
 def process_fields(record, options):
     """Decide which fields to add, delete, and keep"""
@@ -685,15 +687,15 @@ def clean_diacritics(field):
         new_field.add_subfield(subfield[0], tmpsf)
         global RECORD_COUNT
         if r'\x' in repr(tmpsf):
-            print " * %d Hex value found in %s:%s - [%s] [%s]" % (
+            print(" * %d Hex value found in %s:%s - [%s] [%s]" % (
                 RECORD_COUNT, field.tag, subfield[0],
                 tmpsf.encode('utf8'), repr(tmpsf)
-            )
+            ))
 
         if (repr(subfield[1]) != repr(tmpsf)):
-            print "* %d\tOld: [%s]\tNew: [%s]" % (
+            print("* %d\tOld: [%s]\tNew: [%s]" % (
                 RECORD_COUNT, subfield[1].encode('utf8'),
                 tmpsf.encode('utf8')
-            )
+            ))
 
     return new_field
@@ -703,7 +705,7 @@ def add_publisher(record, options):
     have a matching 710 and just need to add the publisher relator code.
     """
 
-    publisher = options['publisher'].decode('utf-8')
+    publisher = options['publisher']
 
     munge_publisher = False
     need_publisher = True
@@ -757,7 +759,7 @@ def add_platform(record, options):
     if not 'platform' in options:
         return False
 
-    platform = options['platform'].decode('utf-8')
+    platform = options['platform']
     need_platform = True
 
     # Iterate through all of the existing 710 fields
@@ -828,16 +830,19 @@ def check_for_isbn(options, lib, isbnval):
         "sfx.response_type=multi_obj_detailed_xml" \
         "&__service_type=getFullTxt&rft.isbn=%s" % (sfx, isbnval)
 
+    headers = {'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'}
     try:
-        req = urllib2.urlopen(url)
-        sfx_res = BeautifulSoup(req.read())
-    except urllib2.HTTPError, ex:
+        req = requests.get(url, headers=headers)
+        req.raise_for_status()
+    except requests.exceptions.HTTPError as ex:
         print("%s for URL %s" % (ex, url))
         return False
-    except urllib2.URLError, ex:
+    except requests.exceptions.RequestException as ex:
         print("%s for URL %s" % (ex, url))
         return False
 
+    sfx_res = BeautifulSoup(req.text, "html.parser")
+
     # We want a target with a service_type element of 'getFullTxt'
     targets = sfx_res.ctx_obj.ctx_obj_targets.findAll(
         'target', recursive=False
@@ -1033,7 +1038,7 @@
     new_fields = []
 
     if not field['u']:
-        print "* No subfield 'u' found in this 856"
+        print("* No subfield 'u' found in this 856")
         return None
 
     # If we have a ToC or author notes or whatever, replace with content
@@ -1075,16 +1080,18 @@ def substitute_content(field):
         return None
 
     # Get the data from the supplied URL
+    headers = {'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'}
     try:
-        req = urllib2.urlopen(url)
-        raw_content = BeautifulSoup(req.read())
-    except urllib2.HTTPError, ex:
+        req = requests.get(url, headers=headers)
+        req.raise_for_status()
+    except requests.exceptions.HTTPError as ex:
         print("%s for URL %s" % (ex, url))
         return None
-    except urllib2.URLError, ex:
+    except requests.exceptions.RequestException as ex:
         print("%s for URL %s" % (ex, url))
         return None
 
+    raw_content = BeautifulSoup(req.text, "html.parser")
     content = process_loc_data(raw_content)
     if not content:
         return None
@@ -1128,19 +1135,22 @@ def process_loc_data(raw_content):
 
     # Get all of the text after the horizontal rule
     content = ' '.join(
         raw_content.find('hr').findAllNext(text=True)
-    ).encode('utf8')
+    )
 
     # Remove linefeeds
     content = content.replace('\n', ' ')
     content = content.replace('\r', ' ')
 
+    # Replace multiple contiguous whitespace with a single space
+    content = re.sub(r'\s+', r' ', content)
+
    # Remove inline subject headings to avoid too much indexing boost
     lcsh = content.find('Library of Congress subject headings')
     if lcsh > -1:
         content = content[0:lcsh]
 
     # Farewell, starting and ending whitespace
-    content = content.strip().decode('utf8')
+    content = content.strip()
 
     return content
@@ -1208,10 +1218,10 @@ if __name__ == '__main__':
         if fname in OPTIONS:
             try:
                 if 'to-format' in OPTIONS and OPTIONS['to-format'] == 'xml':
-                    FILES[fname] = codecs.open(OPTIONS[fname], 'w', 'utf-8')
+                    FILES[fname] = codecs.open(OPTIONS[fname], 'wb', 'utf-8')
                 else:
-                    FILES[fname] = pymarc.MARCWriter(open(OPTIONS[fname], 'w'))
-            except Exception, ex:
+                    FILES[fname] = pymarc.MARCWriter(open(OPTIONS[fname], 'wb'))
+            except Exception as ex:
                 print("Could not open output file [%s]: %s" % (OPTIONS[fname], ex))
 
     process_marc(OPTIONS)
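A minimal standalone sketch of the fetch-and-parse pattern the patch adopts
in check_for_isbn() and substitute_content(); the URL below is a placeholder
and fetch_soup() is an illustrative name, not a function in the script:

    import requests
    from bs4 import BeautifulSoup

    HEADERS = {'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; '
                             'rv:59.0) Gecko/20100101 Firefox/59.0'}

    def fetch_soup(url):
        """Return parsed HTML for url, or None on any request failure."""
        try:
            req = requests.get(url, headers=HEADERS)
            req.raise_for_status()
        except requests.exceptions.RequestException as ex:
            # RequestException also covers the connection and timeout
            # failures that the old urllib2.URLError handler caught
            print("%s for URL %s" % (ex, url))
            return None
        # Naming the parser explicitly avoids bs4's parser-guessing
        # warning; html.parser ships with the standard library
        return BeautifulSoup(req.text, "html.parser")

    soup = fetch_soup('https://www.example.com/')
    if soup and soup.title:
        print(soup.title.string)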