From a8ee794685bf00996ba51099034f4417a39c84a9 Mon Sep 17 00:00:00 2001 From: gfawcett Date: Sun, 9 Jan 2011 01:00:03 +0000 Subject: [PATCH] uwindsor: migration of content from ERES to Syrup actually works! git-svn-id: svn://svn.open-ils.org/ILS-Contrib/servres/trunk@1172 6d9bc8c9-1ec2-4278-b937-99fde70a366f --- conifer/syrup/models.py | 1 + conifer/uwindsor_migration/eres-into-syrup.py | 106 ++++++++++++++++++++++++++ conifer/uwindsor_migration/eres.py | 4 + conifer/uwindsor_migration/metadata.py | 30 +++++--- 4 files changed, 132 insertions(+), 9 deletions(-) create mode 100644 conifer/uwindsor_migration/eres-into-syrup.py diff --git a/conifer/syrup/models.py b/conifer/syrup/models.py index 3c4cdf8..9f62257 100644 --- a/conifer/syrup/models.py +++ b/conifer/syrup/models.py @@ -602,6 +602,7 @@ class Item(BaseModel): ('FD', 'fair dealing'), ('PG', 'permission granted'), ('LC', 'licensed content'), + ('AV', 'available to students'), ] copyright_status = m.CharField(max_length=2, diff --git a/conifer/uwindsor_migration/eres-into-syrup.py b/conifer/uwindsor_migration/eres-into-syrup.py new file mode 100644 index 0000000..d789cc1 --- /dev/null +++ b/conifer/uwindsor_migration/eres-into-syrup.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python-django + +from conifer.syrup.models import * + +from django.core.files import File +import shutil +import re +import hashlib +import os, sys +from os.path import * +from metadata import Metadata +from pprint import pprint +from django.conf import settings + +upload_dir = Item._meta.get_field('fileobj').upload_to + +known_profs = dict([ + ("Burgess","aburgess"), + ("Fitzgerald","afitz"), + ("Burr","burrc"), + ("Jacobs","djacobs"), + ("Gannage","gannage"), + ("Huffaker","huffaker"), + ("Carter","icarter"), + ("Lewis","lewis3"), + ("Parr","parr1"), + ("McKay","pmckay"), + ("Phipps","pphipps"), + ("Samson","psamson"), + ("Dienesch","rdienesc"), + ("Orsini","sorsini"), + ("Yun","yshhsy"),]) + +def ensure_user(username): + user, created = User.objects.get_or_create(username=username) + user.maybe_decorate() + return user + +def site_for_item(item): + termcode, prof = item.term, item.instructor + termcode = termcode.split(' ')[-1] + termcode[0] # Winter 2011 -> 2011W + coursecode = re.search('\d\d-\d\d\d', item.course).group(0) + profs = [ensure_user(known_profs[p.strip()]) + for p in prof.split(',')] + primary = profs[0] + course = Course.objects.get(code__contains=coursecode) + term = Term.objects.get(code=termcode) + site, created = Site.objects.get_or_create( + owner = primary, + start_term = term, + course = course, + defaults = dict(service_desk = ServiceDesk.default(), + end_term = term)) + return site + +DATA = 'data/' +COURSES = os.listdir(DATA) + + +for course in COURSES: + items = list(Metadata.find_all(join(DATA, course))) + if not items: + continue + _item = items[0] + + site = site_for_item(_item) + print site + + Item.objects.filter(site=site).delete() # fixme, just for testing. + + for m in items: + d = m.data.copy() + + if 'author2' in d: + d['author'] = '%s;%s' % (d['author'], d['author2']) + + for key in ['_path', 'author2', 'course', 'datafile', 'filename', 'instructor', + 'localid', 'term', 'type']: + if key in d: + del d[key] + + if m.type == 'url': + assert 'url' in d, ('No URL', m.data) + Item.objects.create(site=site, item_type='URL', **d) + + elif m.type == 'file': + if m.mimetype is None: + pprint(m.data) + raise Exception('stop: a bad file?') + + with open(m.datafile) as f: + digest = hashlib.md5(f.read()).hexdigest() + dest = digest + i = Item.objects.create(site=site, item_type='ELEC', + fileobj_mimetype = m.mimetype, + fileobj_origname = m.filename, + copyright_status='AV', + **d) + + fullpath = os.path.join(settings.MEDIA_ROOT, upload_dir, dest) + if os.path.isfile(fullpath): + i.fileobj.name = os.path.join(upload_dir, dest) + else: + with open(m.datafile) as f: + i.fileobj.save(dest, File(f), save=False) + i.save() diff --git a/conifer/uwindsor_migration/eres.py b/conifer/uwindsor_migration/eres.py index 808cb9e..06b4130 100755 --- a/conifer/uwindsor_migration/eres.py +++ b/conifer/uwindsor_migration/eres.py @@ -68,6 +68,8 @@ done = set() n = 0 for (cid, aid) in itemlinkpat.findall(html): + print (n, cid, aid) + if (cid, aid) in done: continue @@ -90,9 +92,11 @@ for (cid, aid) in itemlinkpat.findall(html): print >> log, (n, 'file', m.groups()) urlpath, itemid, origfile = m.groups() binary_url = '%s/%s' % (BASE, urlpath) + binary_url = binary_url.replace('[', r'\[').replace(']', r'\]') cookie = browser.cj[0] destfile = '%s/data%03d' % (PATH, n) cmd = 'curl -s -b "%s=%s" "%s" > %s' % (cookie.name, cookie.value, binary_url, destfile) + #print cmd os.system(cmd) back() done.add((cid, aid)) diff --git a/conifer/uwindsor_migration/metadata.py b/conifer/uwindsor_migration/metadata.py index 0c09fc4..d054f22 100644 --- a/conifer/uwindsor_migration/metadata.py +++ b/conifer/uwindsor_migration/metadata.py @@ -9,7 +9,7 @@ class Metadata(object): def __init__(self, path): self._path = path - self.html = open(name).read() + self.html = open(path).read() self.localid = re.search(r'item(\d+)', self._path).group(1) self._scrape() del self.html @@ -39,13 +39,13 @@ class Metadata(object): published='Date Published:(.*?)<', course='  (.*?) -', instructor='  .*? - .*? - (.*?)<', - term='  .*? - .*? \((.*?)\)', + term='  .* - .* \((.*?)\)', ) if hasattr(self, 'journal'): self.source_title = self.journal del self.journal - pat = re.compile(r"""onClick="javascript:popall\('(.*)'.*?">Click here for more information""") + pat = re.compile(r"""onClick="javascript:popall\('(.*?)'.*?">Click here for more information""") m = pat.search(self.html) if m: self.type = 'url' @@ -62,12 +62,24 @@ class Metadata(object): datafile = os.path.abspath(datafile) self.datafile = datafile + @property + def mimetype(self): + assert self.datafile + with os.popen('file -i ' + self.datafile) as f: + tmp = f.readline() + try: + return re.search(r': (\w+/\w+);', tmp).group(1) + except: + return None + + @classmethod + def find_all(cls, path): + for name in os.popen('find "%s" -name "item0*.html"' % path).readlines(): + yield Metadata(name.strip()) if __name__ == '__main__': - items = [] - for name in os.popen('find data -name "item0*.html"').readlines(): - name = name.strip() - m = Metadata(name) - items.append(m) - pprint(m.data) + for m in Metadata.find_all('data/'): + #pprint(m.data) + if m.type == 'file': + pprint(m.mimetype) -- 2.11.0