From: dbs Date: Wed, 8 Dec 2010 17:06:18 +0000 (+0000) Subject: Teach the ebook processing script how to turn LoC URLs into indexable content X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=a2ca339db754295b410e861931fa80b14d812e99;p=contrib%2FConifer.git Teach the ebook processing script how to turn LoC URLs into indexable content There is a risk that we will duplicate content already in the MARC record, but so far that doesn't appear to be the case. Also, we skip the OCRed machine-generated tables of contents because their content is entirely untrustworthy. git-svn-id: svn://svn.open-ils.org/ILS-Contrib/conifer/branches/rel_1_6_1@1101 6d9bc8c9-1ec2-4278-b937-99fde70a366f --- diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py index a02075585b..d14548acc2 100644 --- a/tools/ebooks/prep_ebook_records.py +++ b/tools/ebooks/prep_ebook_records.py @@ -13,7 +13,8 @@ requirements that would be the same for each record and therefore can be accommodated in batch load. 
""" -import os, os.path, sys, getopt, pymarc, pymarc.marc8, re +import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2 +from BeautifulSoup import BeautifulSoup class Institution(): """Defines standard settings for each Conifer institution""" @@ -241,15 +242,13 @@ def process_records(options): def process_fields(record, options): """Decide which fields to add, delete, and keep""" - url = False new_record = pymarc.Record() for field in record.get_fields(): - # Only process the first 856 field, for better or worse + # Process all of the 856 fields if field.tag == '856': - if url == False: - url = True - new_fields = process_urls(field, options) + new_fields = process_urls(field, options) + if new_fields: for new_856 in new_fields: new_record.add_field(new_856) # Strip out 9xx fields: we don't want local fields in our records @@ -313,17 +312,78 @@ def process_urls(field, options): print "* No subfield 'u' found in this 856" return None + # If we have a ToC or author notes or whatever, replace with content + if field['u'].find('.loc.gov') > -1: + content = substitute_content(field) + if (content): + new_fields.append(content) + else: + for lib in options['libraries']: + data = options['settings'].get_settings(lib) + subs = get_subfields(field, data) + eight_five_six = pymarc.Field(tag = '856', + indicators = ['4', '0'], + subfields = subs + ) + new_fields.append(eight_five_six) + + return new_fields + +def substitute_content(field): + """Parses a ToC or author notes URL and generates a field""" + + content_field = None + raw_content = '' + + url = field['u'] + + # Skip machine-generated tables of contents + if url.find('/toc/') > -1: + return None + + try: + req = urllib2.urlopen(url) + raw_content = BeautifulSoup(req.read()) + except urllib2.HTTPError, ex: + print("%s for URL %s" % (ex, url)) + return None + + # Short-circuit if we have an OCRed ToC; the quality is terrible + if raw_content.find(text='Electronic data is machine generated'): + return None + 
 elif raw_content.find('<pre>'):
+        return None
+
+    content = ''.join(raw_content.find('hr').findAllNext(text=True)).encode('utf8')
+    content = content.replace('\n', ' ')
+
+    if url.find('-b.html') > -1:
+    # Biographical note
+        content_field = pymarc.Field(
+            tag = '545',
+            indicators = ['1', ' '],
+            subfields = ['a', content]
+        )
+    elif url.find('-d.html') > -1:
+    # Summary written by publisher
+        content_field = pymarc.Field(
+            tag = '520',
+            indicators = ['3', ' '],
+            subfields = ['a', content]
+        )
 
-    for lib in options['libraries']:
-        data = options['settings'].get_settings(lib)
-        subs = get_subfields(field, data)
-        eight_five_six = pymarc.Field(tag = '856',
-            indicators = ['4', '0'],
-            subfields = subs 
+    elif url.find('-t.html') > -1:
+    # Table of contents
+        content_field = pymarc.Field(
+            tag = '505',
+            indicators = [' ', ' '],
+            subfields = ['a', content]
         )
-        new_fields.append(eight_five_six)
+    else:
+        print("URL %s didn't match known LoC type" % (url))
+        return None
 
-    return new_fields
+    return content_field
 
 def get_subfields(field, data):
     """Creates 856 subfields required by Conifer"""