Teach the ebook processing script how to turn LoC URLs into indexable content
author dbs <dbs@6d9bc8c9-1ec2-4278-b937-99fde70a366f>
Wed, 8 Dec 2010 17:06:18 +0000 (17:06 +0000)
committer dbs <dbs@6d9bc8c9-1ec2-4278-b937-99fde70a366f>
Wed, 8 Dec 2010 17:06:18 +0000 (17:06 +0000)
There is a risk that we will duplicate content already in the MARC record,
but so far that doesn't appear to be the case. Also, we skip the
OCRed machine-generated tables of contents because their content is
entirely untrustworthy.
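
For reference, the replacement is keyed entirely off the suffix of the LoC
enhancement URL. A minimal sketch of that mapping, reflecting only what the
change below implements (the dict itself is illustrative, not part of the
script):

    # Sketch: which MARC field carries the fetched LoC content, by URL suffix.
    # Mirrors the branches of substitute_content() in the diff below.
    LOC_SUFFIX_TO_MARC = {
        '-b.html': ('545', ['1', ' ']),  # biographical note
        '-d.html': ('520', ['3', ' ']),  # summary written by the publisher
        '-t.html': ('505', [' ', ' ']),  # table of contents
    }
    # URLs containing '/toc/' point at machine-generated tables of contents
    # and are skipped outright, as are pages flagged as OCRed.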

git-svn-id: svn://svn.open-ils.org/ILS-Contrib/conifer/branches/rel_1_6_1@1101 6d9bc8c9-1ec2-4278-b937-99fde70a366f

tools/ebooks/prep_ebook_records.py

index a020755..d14548a 100644
@@ -13,7 +13,8 @@ requirements that would be the same for each record and therefore can
 be accommodated in batch load.
 """
 
-import os, os.path, sys, getopt, pymarc, pymarc.marc8, re
+import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2
+from BeautifulSoup import BeautifulSoup
 
 class Institution():
     """Defines standard settings for each Conifer institution"""
@@ -241,15 +242,13 @@ def process_records(options):
 def process_fields(record, options):
     """Decide which fields to add, delete, and keep"""
 
-    url = False
     new_record = pymarc.Record()
 
     for field in record.get_fields():
-        # Only process the first 856 field, for better or worse
+        # Process all of the 856 fields
         if field.tag == '856':
-            if url == False:
-                url = True
-                new_fields = process_urls(field, options)
+            new_fields = process_urls(field, options)
+            if new_fields:
                 for new_856 in new_fields:
                     new_record.add_field(new_856)
         # Strip out 9xx fields: we don't want local fields in our records
@@ -313,17 +312,78 @@ def process_urls(field, options):
         print "* No subfield 'u' found in this 856"
         return None
 
+    # If this is an LoC ToC, author notes, or description URL, replace it with the content
+    if field['u'].find('.loc.gov') > -1:
+        content = substitute_content(field)
+        if (content):
+            new_fields.append(content)
+    else:
+        for lib in options['libraries']:
+            data = options['settings'].get_settings(lib)
+            subs = get_subfields(field, data)
+            eight_five_six = pymarc.Field(tag = '856',
+                indicators = ['4', '0'],
+                subfields = subs 
+            )
+            new_fields.append(eight_five_six)
+
+    return new_fields
+
+def substitute_content(field):
+    """Parses a ToC or author notes URL and generates a field"""
+
+    content_field = None
+    raw_content = ''
+
+    url = field['u']
+
+    # Skip machine-generated tables of contents
+    if url.find('/toc/') > -1:
+        return None
+
+    try:
+        req = urllib2.urlopen(url)
+        raw_content = BeautifulSoup(req.read())
+    except urllib2.HTTPError, ex:
+        print("%s for URL %s" % (ex, url))
+        return None
+
+    # Short-circuit if we have an OCRed ToC; the quality is terrible
+    if raw_content.find(text='Electronic data is machine generated'):
+        return None
+    elif raw_content.find('pre'):
+        return None
+
+    content = ''.join(raw_content.find('hr').findAllNext(text=True)).encode('utf8')
+    content = content.replace('\n', ' ')
+
+    if url.find('-b.html') > -1:
+        # Biographical note
+        content_field = pymarc.Field(
+            tag = '545',
+            indicators = ['1', ' '],
+            subfields = ['a', content]
+        )
+    elif url.find('-d.html') > -1:
+        # Summary written by publisher
+        content_field = pymarc.Field(
+            tag = '520',
+            indicators = ['3', ' '],
+            subfields = ['a', content]
+        )
 
-    for lib in options['libraries']:
-        data = options['settings'].get_settings(lib)
-        subs = get_subfields(field, data)
-        eight_five_six = pymarc.Field(tag = '856',
-            indicators = ['4', '0'],
-            subfields = subs 
+    elif url.find('-t.html') > -1:
+        # Table of contents
+        content_field = pymarc.Field(
+            tag = '505',
+            indicators = [' ', ' '],
+            subfields = ['a', content]
         )
-        new_fields.append(eight_five_six)
+    else:
+        print("URL %s didn't match known LoC type" % (url))
+        return None
 
-    return new_fields
+    return content_field
 
 def get_subfields(field, data):
     """Creates 856 subfields required by Conifer"""