--- /dev/null
+#!/usr/bin/env python-django
+
+from conifer.syrup.models import *
+
+from django.core.files import File
+import shutil
+import re
+import hashlib
+import os, sys
+from os.path import *
+from metadata import Metadata
+from pprint import pprint
+from django.conf import settings
+
+# Media subdirectory (the upload_to of Item.fileobj) where file items land.
+upload_dir = Item._meta.get_field('fileobj').upload_to
+
+# Instructor surname, as it appears in the scraped pages, -> local username.
+# site_for_item() indexes this directly, so an unlisted surname raises KeyError.
+known_profs = dict([
+    ("Burgess","aburgess"),
+    ("Fitzgerald","afitz"),
+    ("Burr","burrc"),
+    ("Jacobs","djacobs"),
+    ("Gannage","gannage"),
+    ("Huffaker","huffaker"),
+    ("Carter","icarter"),
+    ("Lewis","lewis3"),
+    ("Parr","parr1"),
+    ("McKay","pmckay"),
+    ("Phipps","pphipps"),
+    ("Samson","psamson"),
+    ("Dienesch","rdienesc"),
+    ("Orsini","sorsini"),
+    ("Yun","yshhsy"),])
+
+def ensure_user(username):
+    # Look the user up, creating a stub record on first sight, then give
+    # the model a chance to fill in profile details before returning it.
+    user, _created = User.objects.get_or_create(username=username)
+    user.maybe_decorate()
+    return user
+
+def site_for_item(item):
+    """Find or create the Site a scraped reserve item belongs to.
+
+    Derives the term code and course from the item's scraped metadata and
+    maps instructor surnames to users via known_profs.  Raises KeyError
+    for instructors missing from known_profs, and DoesNotExist when the
+    course or term is not already in the database.
+    """
+    termcode, prof = item.term, item.instructor
+    # 'Winter 2011' -> '2011W' (year + first letter of the season)
+    termcode = termcode.split(' ')[-1] + termcode[0]
+    # course codes look like '03-250' somewhere in the scraped course string
+    # (raw string so the \d escapes are not interpreted by Python first)
+    coursecode = re.search(r'\d\d-\d\d\d', item.course).group(0)
+    profs = [ensure_user(known_profs[p.strip()])
+             for p in prof.split(',')]
+    # first-listed instructor owns the site
+    primary = profs[0]
+    course = Course.objects.get(code__contains=coursecode)
+    term = Term.objects.get(code=termcode)
+    site, created = Site.objects.get_or_create(
+        owner=primary,
+        start_term=term,
+        course=course,
+        defaults=dict(service_desk=ServiceDesk.default(),
+                      end_term=term))
+    return site
+
+DATA = 'data/'
+COURSES = os.listdir(DATA)
+
+
+# Walk every scraped course directory and load its items into the site.
+for course in COURSES:
+    items = list(Metadata.find_all(join(DATA, course)))
+    if not items:
+        continue
+    # any one item carries the course/term/instructor info for the site
+    _item = items[0]
+
+    site = site_for_item(_item)
+    print site
+
+    Item.objects.filter(site=site).delete() # fixme, just for testing.
+
+    for m in items:
+        d = m.data.copy()
+
+        # fold a secondary author into the single 'author' column
+        if 'author2' in d:
+            d['author'] = '%s;%s' % (d['author'], d['author2'])
+
+        # drop scrape-only keys that Item has no fields for
+        for key in ['_path', 'author2', 'course', 'datafile', 'filename',
+                    'instructor', 'localid', 'term', 'type']:
+            if key in d:
+                del d[key]
+
+        if m.type == 'url':
+            assert 'url' in d, ('No URL', m.data)
+            Item.objects.create(site=site, item_type='URL', **d)
+
+        elif m.type == 'file':
+            if m.mimetype is None:
+                pprint(m.data)
+                raise Exception('stop: a bad file?')
+
+            # content-addressed destination name: open in binary mode so
+            # the digest is computed over the file's actual bytes
+            with open(m.datafile, 'rb') as f:
+                digest = hashlib.md5(f.read()).hexdigest()
+            dest = digest
+            i = Item.objects.create(site=site, item_type='ELEC',
+                                    fileobj_mimetype = m.mimetype,
+                                    fileobj_origname = m.filename,
+                                    copyright_status='AV',
+                                    **d)
+
+            fullpath = os.path.join(settings.MEDIA_ROOT, upload_dir, dest)
+            if os.path.isfile(fullpath):
+                # identical content already uploaded: just point at it
+                i.fileobj.name = os.path.join(upload_dir, dest)
+            else:
+                # binary mode here too, so the stored copy is byte-exact
+                with open(m.datafile, 'rb') as f:
+                    i.fileobj.save(dest, File(f), save=False)
+            i.save()
n = 0
for (cid, aid) in itemlinkpat.findall(html):
+ print (n, cid, aid)
+
if (cid, aid) in done:
continue
print >> log, (n, 'file', m.groups())
urlpath, itemid, origfile = m.groups()
binary_url = '%s/%s' % (BASE, urlpath)
+ binary_url = binary_url.replace('[', r'\[').replace(']', r'\]')
cookie = browser.cj[0]
destfile = '%s/data%03d' % (PATH, n)
cmd = 'curl -s -b "%s=%s" "%s" > %s' % (cookie.name, cookie.value, binary_url, destfile)
+ #print cmd
os.system(cmd)
back()
done.add((cid, aid))
     def __init__(self, path):
         self._path = path
-        self.html = open(name).read()
+        # fix: 'name' was undefined here -- read from the path argument
+        self.html = open(path).read()
+        # e.g. '.../item0042.html' -> localid '0042'
         self.localid = re.search(r'item(\d+)', self._path).group(1)
         self._scrape()
+        # raw page text is only needed while scraping; free it afterwards
         del self.html
published='<td align="left" nowrap="nowrap">Date Published:</td><td align="left" width="100%">(.*?)<',
course='<td class="HEADER1" valign="middle" align="left" height="25"> (.*?) -',
instructor='<td class="HEADER1" valign="middle" align="left" height="25"> .*? - .*? - (.*?)<',
- term='<td class="HEADER1" valign="middle" align="left" height="25"> .*? - .*? \((.*?)\)',
+ term='<td class="HEADER1" valign="middle" align="left" height="25"> .* - .* \((.*?)\)',
)
if hasattr(self, 'journal'):
self.source_title = self.journal
del self.journal
- pat = re.compile(r"""onClick="javascript:popall\('(.*)'.*?">Click here for more information</a>""")
+ pat = re.compile(r"""onClick="javascript:popall\('(.*?)'.*?">Click here for more information</a>""")
m = pat.search(self.html)
if m:
self.type = 'url'
datafile = os.path.abspath(datafile)
self.datafile = datafile
+    @property
+    def mimetype(self):
+        # MIME type ('type/subtype') of the downloaded file as reported
+        # by `file -i`, or None when it cannot be determined.
+        assert self.datafile
+        # NOTE(review): datafile is interpolated unquoted into a shell
+        # command; acceptable only because these are trusted local paths.
+        with os.popen('file -i ' + self.datafile) as f:
+            tmp = f.readline()
+        try:
+            return re.search(r': (\w+/\w+);', tmp).group(1)
+        except AttributeError:
+            # re.search found no match: `file` produced unexpected output
+            return None
+
+    @classmethod
+    def find_all(cls, path):
+        # Yield a Metadata instance for every item0*.html file that
+        # find(1) locates anywhere under *path*.
+        cmd = 'find "%s" -name "item0*.html"' % path
+        for line in os.popen(cmd).readlines():
+            yield Metadata(line.strip())
 if __name__ == '__main__':
-    items = []
-    for name in os.popen('find data -name "item0*.html"').readlines():
-        name = name.strip()
-        m = Metadata(name)
-        items.append(m)
-        pprint(m.data)
+    # smoke test: scrape everything under data/ and report the detected
+    # MIME type of each file-type item
+    for m in Metadata.find_all('data/'):
+        #pprint(m.data)
+        if m.type == 'file':
+            pprint(m.mimetype)