From cd76f069126bf52fbc1630228775357c487849df Mon Sep 17 00:00:00 2001 From: dbs Date: Wed, 8 Dec 2010 19:18:27 +0000 Subject: [PATCH] Better error handling & pull LoC data handling into its own function git-svn-id: svn://svn.open-ils.org/ILS-Contrib/conifer/branches/rel_1_6_1@1102 6d9bc8c9-1ec2-4278-b937-99fde70a366f --- tools/ebooks/prep_ebook_records.py | 58 +++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/tools/ebooks/prep_ebook_records.py b/tools/ebooks/prep_ebook_records.py index d14548acc2..a914f9eb7b 100644 --- a/tools/ebooks/prep_ebook_records.py +++ b/tools/ebooks/prep_ebook_records.py @@ -314,9 +314,9 @@ def process_urls(field, options): # If we have a ToC or author notes or whatever, replace with content if field['u'].find('.loc.gov') > -1: - content = substitute_content(field) - if (content): - new_fields.append(content) + enrich = substitute_content(field) + if enrich and isinstance(enrich, pymarc.field.Field): + new_fields.append(enrich) else: for lib in options['libraries']: data = options['settings'].get_settings(lib) @@ -332,39 +332,38 @@ def process_urls(field, options): def substitute_content(field): """Parses a ToC or author notes URL and generates a field""" + url = field['u'] + content_field = None raw_content = '' - url = field['u'] - # Skip machine-generated tables of contents if url.find('/toc/') > -1: return None + # Get the data from the supplied URL try: req = urllib2.urlopen(url) raw_content = BeautifulSoup(req.read()) except urllib2.HTTPError, ex: print("%s for URL %s" % (ex, url)) return None - - # Short-circuit if we have an OCRed ToC; the quality is terrible - if raw_content.find(text='Electronic data is machine generated'): - return None - elif raw_content.find('
'):
+    except urllib2.URLError, ex:
+        print("%s for URL %s" % (ex, url))
         return None
 
-    content = ''.join(raw_content.find('hr').findAllNext(text=True)).encode('utf8')
-    content = content.replace('\n', ' ')
+    content = process_loc_data(raw_content)
+    if not content:
+        return None
 
-    if url.find('-b.html') > -1:
+    if url.endswith('-b.html'):
     # Biographical note
         content_field = pymarc.Field(
             tag = '545',
             indicators = ['1', ' '],
             subfields = ['a', content]
         )
-    elif url.find('-d.html') > -1:
+    elif url.endswith('-d.html'):
     # Summary written by publisher
         content_field = pymarc.Field(
             tag = '520',
@@ -372,7 +371,7 @@ def substitute_content(field):
             subfields = ['a', content]
         )
 
-    elif url.find('-t.html') > -1:
+    elif url.endswith('-t.html'):
     # Table of contents
         content_field = pymarc.Field(
             tag = '505',
@@ -381,10 +380,37 @@ def substitute_content(field):
         )
     else:
         print("URL %s didn't match known LoC type" % (url))
-        return None
 
     return content_field
 
+def process_loc_data(raw_content):
+    """Return cleaned text from a parsed LoC page, or None for OCRed data"""
+
+    # Short-circuit if we have an OCRed ToC; the quality is terrible
+    if raw_content.find(text='Electronic data is machine generated'):
+        return None
+    elif raw_content.find('<pre>'):
+        return None
+
+    # Get all of the text after the horizontal rule (nothing if it is absent)
+    rule = raw_content.find('hr')
+    strings = rule.findAllNext(text=True) if rule is not None else []
+    content = ' '.join(strings).encode('utf8')
+
+    # Replace line breaks (both LF and CR) with spaces
+    content = content.replace('\n', ' ')
+    content = content.replace('\r', ' ')
+
+    # Remove inline subject headings to avoid too much indexing boost
+    lcsh = content.find('Library of Congress subject headings')
+    if lcsh > -1:
+        content = content[0:lcsh]
+
+    # Farewell, starting and ending whitespace
+    content = content.strip()
+
+    return content
+
 def get_subfields(field, data):
     """Creates 856 subfields required by Conifer"""
 
-- 
2.11.0