Support Python 3

author Dan Scott <dscott@laurentian.ca>

Tue, 24 Apr 2018 16:36:56 +0000 (12:36 -0400)

committer Dan Scott <dscott@laurentian.ca>

Tue, 24 Apr 2018 16:36:56 +0000 (12:36 -0400)
author Dan Scott <dscott@laurentian.ca>
Tue, 24 Apr 2018 16:36:56 +0000 (12:36 -0400)
committer Dan Scott <dscott@laurentian.ca>
Tue, 24 Apr 2018 16:36:56 +0000 (12:36 -0400)
diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py

index 6de5e7a..b9a4a02 100755 (executable)
--- a/tools/ebooks/prep_ebook_records.py
+++ b/tools/ebooks/prep_ebook_records.py
@@ -14,11 +14,12 @@ requirements that would be the same for each record and therefore can
  be accommodated in batch load.
  """
  
-import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2, json
+import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, json
  import codecs, copy
-from urllib import quote
+import requests
  from datetime import date
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
+import traceback
  
  RECORD_COUNT = 0
  DUP_COUNT = 0
@@ -76,7 +77,7 @@ def do_help():
      Print help for the Conifer ebook MARC processor
      '''
  
-    print '''
+    print('''
  Conifer ebook MARC processor
  
  This script takes a set of MARC records and processes them to generate a set
@@ -151,7 +152,7 @@ Optional arguments:
  
  Examples:
      %s --algoma -i crkn.mrc -o /tmp/crkn_out.mrc -p "eBrary Inc."
-    ''' % sys.argv[0]
+    ''' % (sys.argv[0],))
      sys.exit(0)
  
  def consolidate_options(opts):
@@ -203,9 +204,9 @@ def check_options(options):
      if '--help' in options:
          do_help()
  
-    for reqkey, reqwarn in _req.iteritems():
+    for reqkey, reqwarn in _req.items():
          if reqkey not in options:
-            print reqwarn
+            print(reqwarn)
              _help = True
  
      _libraries = check_libraries(options)
@@ -260,7 +261,7 @@ def check_options(options):
  
      for optkey, optval in _string_opts.items():
          if optkey in options:
-            clean_opts[optval] = options[optkey].decode('utf-8')
+            clean_opts[optval] = options[optkey]
  
      clean_opts['libraries'] = _libraries
      clean_opts['input'] = _input
@@ -278,8 +279,8 @@ def evergreen_request(method, *args, **kwargs):
      params += ['param=%s' % quote(json.dumps(a)) for a in args]
      url = '%s?%s' % (GATEWAY_URL, '&'.join(params))
      #print '--->', url
-    req = urllib2.urlopen(url)
-    resp = json.load(req)
+    req = requests.get(url)
+    resp = req.json()
      if resp['status'] != 200:
          raise Exception('error during evergreen request', resp)
      payload = resp['payload']
@@ -387,8 +388,8 @@ def parse_opts():
              'cut-field=', 'help'
          ]
          opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts)
-    except getopt.GetoptError, ex:
-        print "* %s" % str(ex)
+    except getopt.GetoptError as ex:
+        print("* %s" % (str(ex),))
          do_help()
  
      _options = consolidate_options(opts[0])
@@ -407,7 +408,7 @@ def process_marc(options):
              reader = pymarc.MARCReader(
                  open(options['input'], mode='rb'), to_unicode=True
              )
-        except Exception, ex:
+        except Exception as ex:
              print("Could not open input file [%s]" % options['input'])
  
          for record in reader:
@@ -423,7 +424,7 @@ def process_record(record, options, files):
                      % (RECORD_COUNT, options['input'])
              )
          else:
-            print ("%d - %s" % (RECORD_COUNT, record['856']))
+            print("%d - %s" % (RECORD_COUNT, record['856']))
  
          dupe_flags = {}
  
@@ -465,8 +466,9 @@ def process_record(record, options, files):
                  (RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0)
              )):
                  files['sample'].write(new_record)
-    except Exception, ex:
+    except Exception as ex:
          print("* Error processing record %d - %s" % (RECORD_COUNT, ex))
+        traceback.print_exc()
  
  def process_fields(record, options):
      """Decide which fields to add, delete, and keep"""
@@ -685,15 +687,15 @@ def clean_diacritics(field):
          new_field.add_subfield(subfield[0], tmpsf)
          global RECORD_COUNT
          if r'\x' in repr(tmpsf):
-            print " * %d Hex value found in %s:%s - [%s] [%s]" % (
+            print(" * %d Hex value found in %s:%s - [%s] [%s]" % (
                  RECORD_COUNT, field.tag, subfield[0],
                  tmpsf.encode('utf8'), repr(tmpsf)
-            )
+            ))
  
          if (repr(subfield[1]) != repr(tmpsf)):
-            print "* %d\tOld: [%s]\tNew: [%s]" % (
+            print("* %d\tOld: [%s]\tNew: [%s]" % (
                  RECORD_COUNT, subfield[1].encode('utf8'), tmpsf.encode('utf8')
-            )
+            ))
  
      return new_field
  
@@ -703,7 +705,7 @@ def add_publisher(record, options):
      have a matching 710 and just need to add the publisher relator code.
      """
  
-    publisher = options['publisher'].decode('utf-8')
+    publisher = options['publisher']
  
      munge_publisher = False
      need_publisher = True
@@ -757,7 +759,7 @@ def add_platform(record, options):
      if not 'platform' in options:
          return False
  
-    platform = options['platform'].decode('utf-8')
+    platform = options['platform']
      need_platform = True
  
      # Iterate through all of the existing 710 fields
@@ -828,16 +830,19 @@ def check_for_isbn(options, lib, isbnval):
          "sfx.response_type=multi_obj_detailed_xml" \
          "&__service_type=getFullTxt&rft.isbn=%s" % (sfx, isbnval)
  
+    headers = {'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'}
+    req = requests.get(url, headers=headers)
      try:
-        req = urllib2.urlopen(url)
-        sfx_res = BeautifulSoup(req.read())
-    except urllib2.HTTPError, ex:
+        req.raise_for_status()
+    except requests.exceptions.HTTPError as ex:
          print("%s for URL %s" % (ex, url))
          return False
-    except urllib2.URLError, ex:
+    except requests.exceptions.URLError as ex:
          print("%s for URL %s" % (ex, url))
          return False
  
+    sfx_res = BeautifulSoup(req.text, "html.parser")
+
      # We want a target with a service_type element of 'getFullTxt'
      targets = sfx_res.ctx_obj.ctx_obj_targets.findAll(
          'target', recursive=False
@@ -1033,7 +1038,7 @@ def process_urls(field, options, publisher):
      new_fields = []
  
      if not field['u']:
-        print "* No subfield 'u' found in this 856"
+        print("* No subfield 'u' found in this 856")
          return None
  
      # If we have a ToC or author notes or whatever, replace with content
@@ -1075,16 +1080,18 @@ def substitute_content(field):
          return None
  
      # Get the data from the supplied URL
+    headers = {'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'}
+    req = requests.get(url, headers=headers)
      try:
-        req = urllib2.urlopen(url)
-        raw_content = BeautifulSoup(req.read())
-    except urllib2.HTTPError, ex:
+        req.raise_for_status()
+    except requests.exceptions.HTTPError as ex:
          print("%s for URL %s" % (ex, url))
          return None
-    except urllib2.URLError, ex:
+    except requests.exceptions.URLError as ex:
          print("%s for URL %s" % (ex, url))
          return None
  
+    raw_content = BeautifulSoup(req.text, "html.parser")
      content = process_loc_data(raw_content)
      if not content:
          return None
@@ -1128,19 +1135,22 @@ def process_loc_data(raw_content):
      # Get all of the text after the horizontal rule
      content = ' '.join(
          raw_content.find('hr').findAllNext(text=True)
-    ).encode('utf8')
+    )
  
      # Remove linefeeds
      content = content.replace('\n', ' ')
      content = content.replace('\r', ' ')
  
+    # Replace multiple contiguous whitespace with a single space
+    content = re.sub(r'\s+', r' ', content)
+
      # Remove inline subject headings to avoid too much indexing boost
      lcsh = content.find('Library of Congress subject headings')
      if lcsh > -1:
          content = content[0:lcsh]
  
      # Farewell, starting and ending whitespace
-    content = content.strip().decode('utf8')
+    content = content.strip()
  
      return content
  
@@ -1208,10 +1218,10 @@ if __name__ == '__main__':
          if fname in OPTIONS:
              try:
                  if 'to-format' in OPTIONS and OPTIONS['to-format'] == 'xml':
-                    FILES[fname] = codecs.open(OPTIONS[fname], 'w', 'utf-8')
+                    FILES[fname] = codecs.open(OPTIONS[fname], 'wb', 'utf-8')
                  else:
-                    FILES[fname] = pymarc.MARCWriter(open(OPTIONS[fname], 'w'))
-            except Exception, ex:
+                    FILES[fname] = pymarc.MARCWriter(open(OPTIONS[fname], 'wb'))
+            except Exception as ex:
                  print("Could not open output file [%s]: %s" % (OPTIONS[fname], ex))
  
      process_marc(OPTIONS)
author	Dan Scott <dscott@laurentian.ca>
	Tue, 24 Apr 2018 16:36:56 +0000 (12:36 -0400)
committer	Dan Scott <dscott@laurentian.ca>
	Tue, 24 Apr 2018 16:36:56 +0000 (12:36 -0400)