From: gfawcett
Date: Sun, 9 Jan 2011 00:56:07 +0000 (+0000)
Subject: uwindsor_migration/, containing some migrational stuff. (Remove this later.)
X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=37fff9beadeb57e6e7082488951019afb26aadf4;p=Syrup.git

uwindsor_migration/, containing some migrational stuff. (Remove this later.)

git-svn-id: svn://svn.open-ils.org/ILS-Contrib/servres/trunk@1170 6d9bc8c9-1ec2-4278-b937-99fde70a366f
---

diff --git a/.gitignore b/.gitignore
index 4def612..df280c4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,4 @@ private_local_settings.py
 *~
 /conifer/test.db
 /conifer/syrup/test.db
+/conifer/uwindsor_migration/data/
diff --git a/conifer/uwindsor_migration/eres.py b/conifer/uwindsor_migration/eres.py
new file mode 100755
index 0000000..808cb9e
--- /dev/null
+++ b/conifer/uwindsor_migration/eres.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+# This script scrapes ERES and saves raw content to the 'data' directory.
+
+from subprocess import *
+import os
+import re
+import sys
+
+import warnings
+warnings.filterwarnings('ignore')  # to avoid some twill import noise.
+
+from twill.commands import *
+from twill import get_browser
+
+try:
+    username = os.environ['ERESUSER']
+    password = os.environ['ERESPASS']
+except KeyError:
+    print
+    print 'Example usage:'
+    print '  ERESUSER=xxx ERESPASS=xxx %s <course-code>' % sys.argv[0]
+    print
+    print 'Course codes are like CRIM48-567, as they appear in the ERES interface.'
+    print
+    print 'Fancier usage:'
+    print '  export ERESUSER=xxx; export ERESPASS=xxx'
+    print '  export CODES="coursecode1 coursecode2 coursecode3 ..."'
+    print '  for code in $CODES; do %s $code; done' % sys.argv[0]
+    raise SystemExit
+
+browser = get_browser()
+
+redirect_output('/dev/null')        # silence twill's page-fetch chatter
+go('http://ereserves.uwindsor.ca/eres/login.aspx')
+
+fv(1, 3, username)                  # form 1, field 3: username
+fv(1, 4, password)                  # form 1, field 4: password
+submit(5)
+
+go('http://ereserves.uwindsor.ca/eres/courses.aspx')
+
+COURSE = sys.argv[1]
+
+follow(COURSE)
+
+PATH = 'data/%s' % COURSE
+
+try:
+    os.makedirs(PATH)
+except OSError:                     # directory already exists
+    pass
+
+submit(3)                           # 'accept' on the License page
+
+follow('Documents')
+BASE = url('.*').rsplit('/', 1)[0]  # current URL minus the last path segment
+
+filename = '%s/items.html' % PATH
+save_html(filename)
+html = open(filename).read()
+
+save_cookies('%s/c' % PATH)
+log = open('%s/log' % PATH, 'w')
+
+itemlinkpat = re.compile(r"documentview.aspx\?cid=(\d+)&associd=(\d+)")
+done = set()
+
+n = 0
+for (cid, aid) in itemlinkpat.findall(html):
+    if (cid, aid) in done:
+        continue
+
+    itemurl = "%s/documentview.aspx?cid=%s&associd=%s" % (BASE, cid, aid)
+    print n, itemurl
+    go(itemurl)
+
+    filename = '%s/item%03d.html' % (PATH, n)
+    save_html(filename)
+    html = open(filename).read()
+
+    linkpat = re.compile(r"""onClick="javascript:popall\('(.*)'.*?">Click here for more information""")
+    m = linkpat.search(html)
+    if m:
+        print >> log, (n, 'link', m.groups())
+    else:
+        filepat = re.compile(r"""onClick="javascript:pop\('(download.aspx\?docID=(\d+)&shortname=(.*?))'""")
+        m = filepat.search(html)
+        if m:
+            print >> log, (n, 'file', m.groups())
+            urlpath, itemid, origfile = m.groups()
+            binary_url = '%s/%s' % (BASE, urlpath)
+            cookie = browser.cj[0]  # session cookie, handed off to curl
+            destfile = '%s/data%03d' % (PATH, n)
+            cmd = 'curl -s -b "%s=%s" "%s" > %s' % (cookie.name, cookie.value, binary_url, destfile)
+            os.system(cmd)
+    back()
+    done.add((cid, aid))
+    n += 1
+
+log.close()
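The only step in eres.py that leaves twill is the curl handoff: binary attachments are fetched in a subprocess, with the ASP.NET session cookie passed along via -b. For reference, a minimal in-process sketch of the same fetch using urllib2; the helper name fetch_binary is illustrative, and it assumes (as eres.py does) that the relevant cookie is the first entry in browser.cj:

    # Sketch only: the equivalent of eres.py's curl call, done in-process.
    # Assumes the session cookie is browser.cj[0], as eres.py does.
    import urllib2

    def fetch_binary(browser, binary_url, destfile):
        cookie = browser.cj[0]
        req = urllib2.Request(binary_url)
        req.add_header('Cookie', '%s=%s' % (cookie.name, cookie.value))
        out = open(destfile, 'wb')
        out.write(urllib2.urlopen(req).read())
        out.close()
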
diff --git a/conifer/uwindsor_migration/metadata.py b/conifer/uwindsor_migration/metadata.py
new file mode 100644
index 0000000..0c09fc4
--- /dev/null
+++ b/conifer/uwindsor_migration/metadata.py
@@ -0,0 +1,73 @@
+# After having scraped ERES, the Metadata class can extract items'
+# metadata from the scraped HTML.
+
+from pprint import pprint
+import re
+import os
+
+class Metadata(object):
+
+    def __init__(self, path):
+        self._path = path
+        self.html = open(path).read()
+        self.localid = re.search(r'item(\d+)', self._path).group(1)
+        self._scrape()
+        del self.html               # drop the raw page once scraped
+
+    @property
+    def data(self):
+        return self.__dict__
+
+    def __scrape(self, **kwargs):
+        for name, pat in kwargs.items():
+            try:
+                setattr(self, name, re.search(pat, self.html).group(1).strip())
+            except AttributeError:  # pattern not found on this page
+                pass
+
+    def _scrape(self):
+        self.__scrape(
+            title=r'Title:(.*?)<',
+            source_title=r'Title Primary:(.*?)<',
+            journal=r'Journal:(.*?)<',
+            volume=r'Volume:(.*?)<',
+            issue=r'Issue:(.*?)<',
+            author=r'Author Primary:(.*?)<',
+            author2=r'Author Secondary:(.*?)<',
+            pages='Page Range / Chapter:(.*?)<',
+            publisher='Publisher:(.*?)<',
+            published='Date Published:(.*?)<',
+            course='  (.*?) -',
+            instructor='  .*? - .*? - (.*?)<',
+            term='  .*? - .*? \((.*?)\)',
+        )
+        if hasattr(self, 'journal'):
+            self.source_title = self.journal
+            del self.journal
+
+        pat = re.compile(r"""onClick="javascript:popall\('(.*)'.*?">Click here for more information""")
+        m = pat.search(self.html)
+        if m:
+            self.type = 'url'
+            self.url = m.group(1)
+        else:
+            pat = re.compile(r"""onClick="javascript:pop\('(download.aspx\?docID=(\d+)&shortname=(.*?))'""")
+            m = pat.search(self.html)
+            if m:
+                self.type = 'file'
+                urlpath, itemid, origfile = m.groups()
+                self.filename = origfile
+                datafile = re.sub(r'(.*)/item(\d+).html',
+                                  r'\1/data\2', self._path)
+                datafile = os.path.abspath(datafile)
+                self.datafile = datafile
+
+
+
+if __name__ == '__main__':
+    items = []
+    for name in os.popen('find data -name "item0*.html"').readlines():
+        name = name.strip()
+        m = Metadata(name)
+        items.append(m)
+        pprint(m.data)
diff --git a/eres/.gitignore b/eres/.gitignore
new file mode 100644
index 0000000..46bf302
--- /dev/null
+++ b/eres/.gitignore
@@ -0,0 +1,5 @@
+/data/
+*#
+*~
+*.pyc
+.#*
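Taken together, a course migrates in two passes: eres.py logs in and mirrors one course into data/<course-code>/, and metadata.py turns each saved itemNNN.html into a metadata record. A minimal sketch of driving Metadata programmatically; the course code CRIM48-567 and the glob pattern are illustrative, and it assumes the interpreter is started in the uwindsor_migration directory so that metadata is importable:

    # Illustrative only: collect records for one scraped course and split
    # them by type ('url' vs. 'file'); 'type' is left unset when neither
    # pattern matched, hence the getattr default.
    import glob
    from metadata import Metadata

    records = [Metadata(p) for p in sorted(glob.glob('data/CRIM48-567/item0*.html'))]
    links = [r for r in records if getattr(r, 'type', None) == 'url']
    files = [r for r in records if getattr(r, 'type', None) == 'file']
    print '%d items: %d links, %d files' % (len(records), len(links), len(files))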