Teach the ebook processing script how to turn LoC URLs into indexable content
author dbs <dbs@6d9bc8c9-1ec2-4278-b937-99fde70a366f>
Wed, 8 Dec 2010 17:06:18 +0000 (17:06 +0000)
committer dbs <dbs@6d9bc8c9-1ec2-4278-b937-99fde70a366f>
Wed, 8 Dec 2010 17:06:18 +0000 (17:06 +0000)
There is a risk that we will duplicate content already in the MARC record,
but so far that doesn't appear to be the case. Also, we skip the
OCRed machine-generated tables of contents because their content is
entirely untrustworthy.
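
For reference, the replacement is keyed entirely off the suffix of the LoC
enhancement URL. A minimal sketch of that mapping, reflecting only what the
change below implements (the dict itself is illustrative, not part of the
script):

    # Sketch: which MARC field carries the fetched LoC content, by URL suffix.
    # Mirrors the branches of substitute_content() in the diff below.
    LOC_SUFFIX_TO_MARC = {
        '-b.html': ('545', ['1', ' ']),  # biographical note
        '-d.html': ('520', ['3', ' ']),  # summary written by the publisher
        '-t.html': ('505', [' ', ' ']),  # table of contents
    }
    # URLs containing '/toc/' point at machine-generated tables of contents
    # and are skipped outright, as are pages flagged as OCRed.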

git-svn-id: svn://svn.open-ils.org/ILS-Contrib/conifer/branches/rel_1_6_1@1101 6d9bc8c9-1ec2-4278-b937-99fde70a366f

tools/ebooks/prep_ebook_records.py

index a020755..d14548a 100644
@@ -13,7 +13,8 @@ requirements that would be the same for each record and therefore can
 be accommodated in batch load.
 """
 
-import os, os.path, sys, getopt, pymarc, pymarc.marc8, re
+import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2
+from BeautifulSoup import BeautifulSoup
 
 class Institution():
     """Defines standard settings for each Conifer institution"""
@@ -241,15 +242,13 @@ def process_records(options):
 def process_fields(record, options):
     """Decide which fields to add, delete, and keep"""
 
-    url = False
     new_record = pymarc.Record()
 
     for field in record.get_fields():
-        # Only process the first 856 field, for better or worse
+        # Process all of the 856 fields
         if field.tag == '856':
-            if url == False:
-                url = True
-                new_fields = process_urls(field, options)
+            new_fields = process_urls(field, options)
+            if new_fields:
                 for new_856 in new_fields:
                     new_record.add_field(new_856)
         # Strip out 9xx fields: we don't want local fields in our records
@@ -313,17 +312,78 @@ def process_urls(field, options):
         print "* No subfield 'u' found in this 856"
         return None
 
+    # If this is an LoC ToC, author notes, or description URL, replace it with the content
+    if field['u'].find('.loc.gov') > -1:
+        content = substitute_content(field)
+        if (content):
+            new_fields.append(content)
+    else:
+        for lib in options['libraries']:
+            data = options['settings'].get_settings(lib)
+            subs = get_subfields(field, data)
+            eight_five_six = pymarc.Field(tag = '856',
+                indicators = ['4', '0'],
+                subfields = subs 
+            )
+            new_fields.append(eight_five_six)
+
+    return new_fields
+
+def substitute_content(field):
+    """Parses a ToC or author notes URL and generates a field"""
+
+    content_field = None
+    raw_content = ''
+
+    url = field['u']
+
+    # Skip machine-generated tables of contents
+    if url.find('/toc/') > -1:
+        return None
+
+    try:
+        req = urllib2.urlopen(url)
+        raw_content = BeautifulSoup(req.read())
+    except urllib2.HTTPError, ex:
+        print("%s for URL %s" % (ex, url))
+        return None
+
+    # Short-circuit if we have an OCRed ToC; the quality is terrible
+    if raw_content.find(text='Electronic data is machine generated'):
+        return None
+    elif raw_content.find('pre'):
+        return None
+
+    content = ''.join(raw_content.find('hr').findAllNext(text=True)).encode('utf8')
+    content = content.replace('\n', ' ')
+
+    if url.find('-b.html') > -1:
+        # Biographical note
+        content_field = pymarc.Field(
+            tag = '545',
+            indicators = ['1', ' '],
+            subfields = ['a', content]
+        )
+    elif url.find('-d.html') > -1:
+        # Summary written by publisher
+        content_field = pymarc.Field(
+            tag = '520',
+            indicators = ['3', ' '],
+            subfields = ['a', content]
+        )
 
-    for lib in options['libraries']:
-        data = options['settings'].get_settings(lib)
-        subs = get_subfields(field, data)
-        eight_five_six = pymarc.Field(tag = '856',
-            indicators = ['4', '0'],
-            subfields = subs 
+    elif url.find('-t.html') > -1:
+        # Table of contents
+        content_field = pymarc.Field(
+            tag = '505',
+            indicators = [' ', ' '],
+            subfields = ['a', content]
         )
-        new_fields.append(eight_five_six)
+    else:
+        print("URL %s didn't match known LoC type" % (url))
+        return None
 
-    return new_fields
+    return content_field
 
 def get_subfields(field, data):
     """Creates 856 subfields required by Conifer"""