From: gfawcett
Date: Sun, 9 Jan 2011 00:56:07 +0000 (+0000)
Subject: uwindsor_migration/, containing some migrational stuff. (Remove this later.)
X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=37fff9beadeb57e6e7082488951019afb26aadf4;p=Syrup.git

uwindsor_migration/, containing some migrational stuff. (Remove this later.)

git-svn-id: svn://svn.open-ils.org/ILS-Contrib/servres/trunk@1170 6d9bc8c9-1ec2-4278-b937-99fde70a366f
---

diff --git a/.gitignore b/.gitignore
index 4def612..df280c4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,4 @@ private_local_settings.py
 *~
 /conifer/test.db
 /conifer/syrup/test.db
+/conifer/uwindsor_migration/data/
diff --git a/conifer/uwindsor_migration/eres.py b/conifer/uwindsor_migration/eres.py
new file mode 100755
index 0000000..808cb9e
--- /dev/null
+++ b/conifer/uwindsor_migration/eres.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+# This script scrapes ERES and saves raw content to the 'data' directory.
+
+from subprocess import *
+import os
+import re
+import sys
+
+import warnings
+warnings.filterwarnings('ignore')  # to avoid some twill import noise.
+
+from twill.commands import *
+from twill import get_browser
+
+try:
+    username = os.environ['ERESUSER']
+    password = os.environ['ERESPASS']
+except KeyError:
+    print
+    print 'Example usage:'
+    print '  ERESUSER=xxx ERESPASS=xxx %s <course-code>' % sys.argv[0]
+    print
+    print 'Course codes are like CRIM48-567, as they appear in the ERES interface.'
+    print
+    print 'Fancier usage:'
+    print '  export ERESUSER=xxx; export ERESPASS=xxx'
+    print '  export CODES="coursecode1 coursecode2 coursecode3 ..."'
+    print '  for code in $CODES; do %s $code; done' % sys.argv[0]
+    raise SystemExit
+
+browser = get_browser()
+
+redirect_output('/dev/null')        # silence twill's page-fetch chatter
+go('http://ereserves.uwindsor.ca/eres/login.aspx')
+
+fv(1, 3, username)                  # form 1, field 3: username
+fv(1, 4, password)                  # form 1, field 4: password
+submit(5)
+
+go('http://ereserves.uwindsor.ca/eres/courses.aspx')
+
+COURSE = sys.argv[1]
+
+follow(COURSE)
+
+PATH = 'data/%s' % COURSE
+
+try:
+    os.makedirs(PATH)
+except OSError:                     # directory already exists
+    pass
+
+submit(3)                           # 'accept' on the License page
+
+follow('Documents')
+BASE = url('.*').rsplit('/', 1)[0]  # current URL minus the last path segment
+
+filename = '%s/items.html' % PATH
+save_html(filename)
+html = open(filename).read()
+
+save_cookies('%s/c' % PATH)
+log = open('%s/log' % PATH, 'w')
+
+itemlinkpat = re.compile(r"documentview.aspx\?cid=(\d+)&associd=(\d+)")
+done = set()
+
+n = 0
+for (cid, aid) in itemlinkpat.findall(html):
+    if (cid, aid) in done:
+        continue
+
+    itemurl = "%s/documentview.aspx?cid=%s&associd=%s" % (BASE, cid, aid)
+    print n, itemurl
+    go(itemurl)
+
+    filename = '%s/item%03d.html' % (PATH, n)
+    save_html(filename)
+    html = open(filename).read()
+
+    linkpat = re.compile(r"""onClick="javascript:popall\('(.*)'.*?">Click here for more information""")
+    m = linkpat.search(html)
+    if m:
+        print >> log, (n, 'link', m.groups())
+    else:
+        filepat = re.compile(r"""onClick="javascript:pop\('(download.aspx\?docID=(\d+)&shortname=(.*?))'""")
+        m = filepat.search(html)
+        if m:
+            print >> log, (n, 'file', m.groups())
+            urlpath, itemid, origfile = m.groups()
+            binary_url = '%s/%s' % (BASE, urlpath)
+            cookie = browser.cj[0]  # session cookie, handed off to curl
+            destfile = '%s/data%03d' % (PATH, n)
+            cmd = 'curl -s -b "%s=%s" "%s" > %s' % (cookie.name, cookie.value, binary_url, destfile)
+            os.system(cmd)
+    back()
+    done.add((cid, aid))
+    n += 1
+
+log.close()
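The only step in eres.py that leaves twill is the curl handoff: binary attachments are fetched in a subprocess, with the ASP.NET session cookie passed along via -b. For reference, a minimal in-process sketch of the same fetch using urllib2; the helper name fetch_binary is illustrative, and it assumes (as eres.py does) that the relevant cookie is the first entry in browser.cj:

    # Sketch only: the equivalent of eres.py's curl call, done in-process.
    # Assumes the session cookie is browser.cj[0], as eres.py does.
    import urllib2

    def fetch_binary(browser, binary_url, destfile):
        cookie = browser.cj[0]
        req = urllib2.Request(binary_url)
        req.add_header('Cookie', '%s=%s' % (cookie.name, cookie.value))
        out = open(destfile, 'wb')
        out.write(urllib2.urlopen(req).read())
        out.close()
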
diff --git a/conifer/uwindsor_migration/metadata.py b/conifer/uwindsor_migration/metadata.py
new file mode 100644
index 0000000..0c09fc4
--- /dev/null
+++ b/conifer/uwindsor_migration/metadata.py
@@ -0,0 +1,73 @@
+# After having scraped ERES, the Metadata class can extract items'
+# metadata from the scraped HTML.
+
+from pprint import pprint
+import re
+import os
+
+class Metadata(object):
+
+    def __init__(self, path):
+        self._path = path
+        self.html = open(path).read()
+        self.localid = re.search(r'item(\d+)', self._path).group(1)
+        self._scrape()
+        del self.html               # drop the raw page once scraped
+
+    @property
+    def data(self):
+        return self.__dict__
+
+    def __scrape(self, **kwargs):
+        for name, pat in kwargs.items():
+            try:
+                setattr(self, name, re.search(pat, self.html).group(1).strip())
+            except AttributeError:  # pattern not found on this page
+                pass
+
+    def _scrape(self):
+        self.__scrape(
+            title=r'Title:(.*?)<',
+            source_title=r'Title Primary:(.*?)<',
+            journal=r'Journal:(.*?)<',
+            volume=r'Volume:(.*?)<',
+            issue=r'Issue:(.*?)<',
+            author=r'Author Primary:(.*?)<',
+            author2=r'Author Secondary:(.*?)<',
+            pages='Page Range / Chapter:(.*?)<',
+            publisher='Publisher:(.*?)<',
+            published='Date Published:(.*?)<',
+            course='  (.*?) -',
+            instructor='  .*? - .*? - (.*?)<',
+            term='  .*? - .*? \((.*?)\)',
+        )
+        if hasattr(self, 'journal'):
+            self.source_title = self.journal
+            del self.journal
+
+        pat = re.compile(r"""onClick="javascript:popall\('(.*)'.*?">Click here for more information""")
+        m = pat.search(self.html)
+        if m:
+            self.type = 'url'
+            self.url = m.group(1)
+        else:
+            pat = re.compile(r"""onClick="javascript:pop\('(download.aspx\?docID=(\d+)&shortname=(.*?))'""")
+            m = pat.search(self.html)
+            if m:
+                self.type = 'file'
+                urlpath, itemid, origfile = m.groups()
+                self.filename = origfile
+                datafile = re.sub(r'(.*)/item(\d+).html',
+                                  r'\1/data\2', self._path)
+                datafile = os.path.abspath(datafile)
+                self.datafile = datafile
+
+
+
+if __name__ == '__main__':
+    items = []
+    for name in os.popen('find data -name "item0*.html"').readlines():
+        name = name.strip()
+        m = Metadata(name)
+        items.append(m)
+        pprint(m.data)
diff --git a/eres/.gitignore b/eres/.gitignore
new file mode 100644
index 0000000..46bf302
--- /dev/null
+++ b/eres/.gitignore
@@ -0,0 +1,5 @@
+/data/
+*#
+*~
+*.pyc
+.#*
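Taken together, a course migrates in two passes: eres.py logs in and mirrors one course into data/<course-code>/, and metadata.py turns each saved itemNNN.html into a metadata record. A minimal sketch of driving Metadata programmatically; the course code CRIM48-567 and the glob pattern are illustrative, and it assumes the interpreter is started in the uwindsor_migration directory so that metadata is importable:

    # Illustrative only: collect records for one scraped course and split
    # them by type ('url' vs. 'file'); 'type' is left unset when neither
    # pattern matched, hence the getattr default.
    import glob
    from metadata import Metadata

    records = [Metadata(p) for p in sorted(glob.glob('data/CRIM48-567/item0*.html'))]
    links = [r for r in records if getattr(r, 'type', None) == 'url']
    files = [r for r in records if getattr(r, 'type', None) == 'file']
    print '%d items: %d links, %d files' % (len(records), len(links), len(files))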