be accommodated in batch load.
"""
-import os, os.path, sys, getopt, pymarc, pymarc.marc8, re
+import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2
+from BeautifulSoup import BeautifulSoup
class Institution():
"""Defines standard settings for each Conifer institution"""
def process_fields(record, options):
"""Decide which fields to add, delete, and keep"""
- url = False
new_record = pymarc.Record()
for field in record.get_fields():
- # Only process the first 856 field, for better or worse
+ # Process all of the 856 fields
if field.tag == '856':
- if url == False:
- url = True
- new_fields = process_urls(field, options)
+ new_fields = process_urls(field, options)
+ if new_fields:
for new_856 in new_fields:
new_record.add_field(new_856)
# Strip out 9xx fields: we don't want local fields in our records
print "* No subfield 'u' found in this 856"
return None
+ # If we have a ToC or author notes or whatever, replace with content
+ if field['u'].find('.loc.gov') > -1:
+ content = substitute_content(field)
+ if (content):
+ new_fields.append(content)
+ else:
+ for lib in options['libraries']:
+ data = options['settings'].get_settings(lib)
+ subs = get_subfields(field, data)
+ eight_five_six = pymarc.Field(tag = '856',
+ indicators = ['4', '0'],
+ subfields = subs
+ )
+ new_fields.append(eight_five_six)
+
+ return new_fields
+
def substitute_content(field):
    """Fetch a Library of Congress enhancement page and turn it into a field.

    The 856 subfield 'u' URL is inspected to decide what kind of content it
    points to, the page is downloaded, and everything following the first
    <hr> element is flattened into a single-line string.

    Arguments:
    field -- an 856 field; only field['u'] (the URL) is read

    Returns a new pymarc.Field (545 for '-b.html', 520 for '-d.html',
    505 for '-t.html') whose subfield 'a' holds the page text, or None when
    the URL is a machine-generated ToC, is of an unknown type, cannot be
    retrieved, or the page looks like OCRed text.
    """

    url = field['u']

    # Skip machine-generated tables of contents
    if '/toc/' in url:
        return None

    # (URL marker, MARC tag, indicators), tested in this order.
    # NOTE(review): MARC 21 defines 545 first indicator '0' as a biographical
    # sketch and '1' as an administrative history -- confirm '1' is intended.
    loc_types = (
        ('-b.html', '545', ['1', ' ']),  # Biographical note
        ('-d.html', '520', ['3', ' ']),  # Summary written by publisher
        ('-t.html', '505', [' ', ' ']),  # Table of contents
    )

    # Classify the URL before touching the network: an unrecognised URL
    # yields None regardless of what the page contains.
    for marker, tag, indicators in loc_types:
        if marker in url:
            break
    else:
        print("URL %s didn't match known LoC type" % (url))
        return None

    # URLError is the base class of HTTPError, so this also covers DNS and
    # connection failures that would otherwise abort the whole batch.
    try:
        response = urllib2.urlopen(url)
        try:
            raw_content = BeautifulSoup(response.read())
        finally:
            response.close()
    except urllib2.URLError as ex:
        print("%s for URL %s" % (ex, url))
        return None

    # Short-circuit if we have an OCRed ToC; the quality is terrible.
    # A compiled pattern matches the notice as a substring of a text node;
    # a plain string would have to equal the entire node.
    if raw_content.find(text=re.compile('Electronic data is machine generated')):
        return None
    # find() takes a tag *name*; '<pre>' would never match anything.
    if raw_content.find('pre'):
        return None

    # The useful text is whatever follows the first horizontal rule
    rule = raw_content.find('hr')
    if rule is None:
        print("No <hr> marker found in content of URL %s" % (url))
        return None

    content = ''.join(rule.findAllNext(text=True)).encode('utf8')
    content = content.replace('\n', ' ')

    return pymarc.Field(
        tag = tag,
        indicators = indicators,
        subfields = ['a', content]
    )
def get_subfields(field, data):
"""Creates 856 subfields required by Conifer"""