be accommodated in batch load.
"""
-import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2, json
+import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, json
import codecs, copy
-from urllib import quote
+from urllib.parse import quote
+import requests
from datetime import date
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
+import traceback
RECORD_COUNT = 0
DUP_COUNT = 0
Print help for the Conifer ebook MARC processor
'''
- print '''
+ print('''
Conifer ebook MARC processor
This script takes a set of MARC records and processes them to generate a set
Examples:
%s --algoma -i crkn.mrc -o /tmp/crkn_out.mrc -p "eBrary Inc."
- ''' % sys.argv[0]
+ ''' % (sys.argv[0],))
sys.exit(0)
def consolidate_options(opts):
if '--help' in options:
do_help()
- for reqkey, reqwarn in _req.iteritems():
+ for reqkey, reqwarn in _req.items():
if reqkey not in options:
- print reqwarn
+ print(reqwarn)
_help = True
_libraries = check_libraries(options)
for optkey, optval in _string_opts.items():
if optkey in options:
- clean_opts[optval] = options[optkey].decode('utf-8')
+ clean_opts[optval] = options[optkey]
clean_opts['libraries'] = _libraries
clean_opts['input'] = _input
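+ # Each positional arg is JSON-encoded, then URL-escaped, for the gateway.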
params += ['param=%s' % quote(json.dumps(a)) for a in args]
url = '%s?%s' % (GATEWAY_URL, '&'.join(params))
- #print '--->', url
+ #print('--->', url)
- req = urllib2.urlopen(url)
- resp = json.load(req)
+ req = requests.get(url)
+ resp = req.json()
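+ # The gateway wraps results in a JSON envelope of the form
+ # {"status": ..., "payload": [...]}; any status other than 200 is an error.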
if resp['status'] != 200:
raise Exception('error during evergreen request', resp)
payload = resp['payload']
'cut-field=', 'help'
]
opts = getopt.getopt(sys.argv[1:], _short_opts, _long_opts)
- except getopt.GetoptError, ex:
- print "* %s" % str(ex)
+ except getopt.GetoptError as ex:
+ print("* %s" % (str(ex),))
do_help()
_options = consolidate_options(opts[0])
reader = pymarc.MARCReader(
open(options['input'], mode='rb'), to_unicode=True
)
- except Exception, ex:
+ except Exception as ex:
print("Could not open input file [%s]" % options['input'])
for record in reader:
% (RECORD_COUNT, options['input'])
)
else:
- print ("%d - %s" % (RECORD_COUNT, record['856']))
+ print("%d - %s" % (RECORD_COUNT, record['856']))
dupe_flags = {}
(RECORD_COUNT == 1) or (RECORD_COUNT % 100 == 0)
)):
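+ # Copy the first record and every 100th one into the sample output file.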
files['sample'].write(new_record)
- except Exception, ex:
+ except Exception as ex:
print("* Error processing record %d - %s" % (RECORD_COUNT, ex))
+ traceback.print_exc()
def process_fields(record, options):
"""Decide which fields to add, delete, and keep"""
new_field.add_subfield(subfield[0], tmpsf)
global RECORD_COUNT
if r'\x' in repr(tmpsf):
- print " * %d Hex value found in %s:%s - [%s] [%s]" % (
+ print(" * %d Hex value found in %s:%s - [%s] [%s]" % (
RECORD_COUNT, field.tag, subfield[0],
tmpsf.encode('utf8'), repr(tmpsf)
- )
+ ))
if (repr(subfield[1]) != repr(tmpsf)):
- print "* %d\tOld: [%s]\tNew: [%s]" % (
+ print("* %d\tOld: [%s]\tNew: [%s]" % (
RECORD_COUNT, subfield[1].encode('utf8'), tmpsf.encode('utf8')
- )
+ ))
return new_field
have a matching 710 and just need to add the publisher relator code.
"""
- publisher = options['publisher'].decode('utf-8')
+ publisher = options['publisher']
munge_publisher = False
need_publisher = True
if not 'platform' in options:
return False
- platform = options['platform'].decode('utf-8')
+ platform = options['platform']
need_platform = True
# Iterate through all of the existing 710 fields
"sfx.response_type=multi_obj_detailed_xml" \
"&__service_type=getFullTxt&rft.isbn=%s" % (sfx, isbnval)
+ headers = {'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'}
try:
- req = urllib2.urlopen(url)
- sfx_res = BeautifulSoup(req.read())
- except urllib2.HTTPError, ex:
+ req = requests.get(url, headers=headers)
+ req.raise_for_status()
+ except requests.exceptions.HTTPError as ex:
print("%s for URL %s" % (ex, url))
return False
- except urllib2.URLError, ex:
+ except requests.exceptions.RequestException as ex:
print("%s for URL %s" % (ex, url))
return False
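+ # Name the parser explicitly: "html.parser" ships with Python, and
+ # bs4 emits a warning when the parser is left implicit.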
+ sfx_res = BeautifulSoup(req.text, "html.parser")
+
# We want a target with a service_type element of 'getFullTxt'
targets = sfx_res.ctx_obj.ctx_obj_targets.findAll(
'target', recursive=False
new_fields = []
if not field['u']:
- print "* No subfield 'u' found in this 856"
+ print("* No subfield 'u' found in this 856")
return None
# If we have a ToC or author notes or whatever, replace with content
return None
# Get the data from the supplied URL
+ headers = {'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'}
try:
- req = urllib2.urlopen(url)
- raw_content = BeautifulSoup(req.read())
- except urllib2.HTTPError, ex:
+ req = requests.get(url, headers=headers)
+ req.raise_for_status()
+ except requests.exceptions.HTTPError as ex:
print("%s for URL %s" % (ex, url))
return None
- except urllib2.URLError, ex:
+ except requests.exceptions.RequestException as ex:
print("%s for URL %s" % (ex, url))
return None
+ raw_content = BeautifulSoup(req.text, "html.parser")
content = process_loc_data(raw_content)
if not content:
return None
# Get all of the text after the horizontal rule
content = ' '.join(
raw_content.find('hr').findAllNext(text=True)
- ).encode('utf8')
+ )
# Remove linefeeds
content = content.replace('\n', ' ')
content = content.replace('\r', ' ')
+ # Replace multiple contiguous whitespace with a single space
+ content = re.sub(r'\s+', r' ', content)
+
# Remove inline subject headings to avoid too much indexing boost
lcsh = content.find('Library of Congress subject headings')
if lcsh > -1:
content = content[0:lcsh]
# Farewell, starting and ending whitespace
- content = content.strip().decode('utf8')
+ content = content.strip()
return content
if fname in OPTIONS:
try:
if 'to-format' in OPTIONS and OPTIONS['to-format'] == 'xml':
- FILES[fname] = codecs.open(OPTIONS[fname], 'w', 'utf-8')
+ FILES[fname] = codecs.open(OPTIONS[fname], 'wb', 'utf-8')
else:
- FILES[fname] = pymarc.MARCWriter(open(OPTIONS[fname], 'w'))
- except Exception, ex:
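+ # MARC21 output is bytes in Python 3, so open the file in binary mode.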
+ FILES[fname] = pymarc.MARCWriter(open(OPTIONS[fname], 'wb'))
+ except Exception as ex:
print("Could not open output file [%s]: %s" % (OPTIONS[fname], ex))
process_marc(OPTIONS)