From cd76f069126bf52fbc1630228775357c487849df Mon Sep 17 00:00:00 2001
From: dbs <dbs@6d9bc8c9-1ec2-4278-b937-99fde70a366f>
Date: Wed, 8 Dec 2010 19:18:27 +0000
Subject: [PATCH] Better error handling & pull LoC data handling into its own
 function

git-svn-id: svn://svn.open-ils.org/ILS-Contrib/conifer/branches/rel_1_6_1@1102 6d9bc8c9-1ec2-4278-b937-99fde70a366f
---
 tools/ebooks/prep_ebook_records.py | 58 +++++++++++++++++++++++++++-----------
 1 file changed, 42 insertions(+), 16 deletions(-)
diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py
index d14548acc2..a914f9eb7b 100644
--- a/tools/ebooks/prep_ebook_records.py
+++ b/tools/ebooks/prep_ebook_records.py
@@ -314,9 +314,9 @@ def process_urls(field, options):
 
     # If we have a ToC or author notes or whatever, replace with content
     if field['u'].find('.loc.gov') > -1:
-        content = substitute_content(field)
-        if (content):
-            new_fields.append(content)
+        enrich = substitute_content(field)
+        if enrich and isinstance(enrich, pymarc.field.Field):
+            new_fields.append(enrich)
     else:
         for lib in options['libraries']:
             data = options['settings'].get_settings(lib)
@@ -332,39 +332,38 @@ def process_urls(field, options):
 def substitute_content(field):
     """Parses a ToC or author notes URL and generates a field"""
 
+    url = field['u']
+
     content_field = None
     raw_content = ''
 
-    url = field['u']
-
     # Skip machine-generated tables of contents
     if url.find('/toc/') > -1:
         return None
 
+    # Get the data from the supplied URL
     try:
         req = urllib2.urlopen(url)
         raw_content = BeautifulSoup(req.read())
     except urllib2.HTTPError, ex:
         print("%s for URL %s" % (ex, url))
         return None
-
-    # Short-circuit if we have an OCRed ToC; the quality is terrible
-    if raw_content.find(text='Electronic data is machine generated'):
-        return None
-    elif raw_content.find('<pre>'):
+    except urllib2.URLError, ex:
+        print("%s for URL %s" % (ex, url))
         return None
 
-    content = ''.join(raw_content.find('hr').findAllNext(text=True)).encode('utf8')
-    content = content.replace('\n', ' ')
+    content = process_loc_data(raw_content)
+    if not content:
+        return None
 
-    if url.find('-b.html') > -1:
+    if url.endswith('-b.html'):
     # Biographical note
         content_field = pymarc.Field(
             tag = '545',
             indicators = ['1', ' '],
             subfields = ['a', content]
         )
-    elif url.find('-d.html') > -1:
+    elif url.endswith('-d.html'):
     # Summary written by publisher
         content_field = pymarc.Field(
             tag = '520',
@@ -372,7 +371,7 @@ def substitute_content(field):
             subfields = ['a', content]
         )
 
-    elif url.find('-t.html') > -1:
+    elif url.endswith('-t.html'):
     # Table of contents
         content_field = pymarc.Field(
             tag = '505',
@@ -381,10 +380,37 @@ def substitute_content(field):
         )
     else:
         print("URL %s didn't match known LoC type" % (url))
-        return None
 
     return content_field
 
+def process_loc_data(raw_content):
+    """Return the cleaned-up text of a parsed LoC page, or None to skip it
+
+    raw_content is the BeautifulSoup parse of the page. None is returned for
+    machine-generated or preformatted content and for pages without an <hr>.
+    """
+    # Short-circuit if we have an OCRed ToC; the quality is terrible
+    if raw_content.find(text='Electronic data is machine generated'):
+        return None
+    # find() takes a bare tag name; '<pre>' could never match an element
+    if raw_content.find('pre'):
+        return None
+
+    # Get all of the text after the horizontal rule, if the page has one
+    rule = raw_content.find('hr')
+    if rule is None:
+        return None
+    content = ' '.join(rule.findAllNext(text=True)).encode('utf8')
+    # Remove linefeeds
+    content = content.replace('\n', ' ').replace('\r', ' ')
+
+    # Remove inline subject headings to avoid too much indexing boost
+    lcsh = content.find('Library of Congress subject headings')
+    if lcsh > -1:
+        content = content[0:lcsh]
+    # Farewell, starting and ending whitespace
+    return content.strip()
+
 def get_subfields(field, data):
     """Creates 856 subfields required by Conifer"""
 
-- 
2.11.0