--- /dev/null
+#!/usr/bin/env python-django
+
+from conifer.syrup.models import *
+
+from django.core.files import File
+import shutil
+import re
+import hashlib
+import os, sys
+from os.path import *
+from metadata import Metadata
+from pprint import pprint
+from django.conf import settings
+
+# Media subdirectory (the upload_to of Item.fileobj) where file items land.
+upload_dir = Item._meta.get_field('fileobj').upload_to
+
+# Instructor surname, as it appears in the scraped pages, -> local username.
+# site_for_item() indexes this directly, so an unlisted surname raises KeyError.
+known_profs = dict([
+    ("Burgess","aburgess"),
+    ("Fitzgerald","afitz"),
+    ("Burr","burrc"),
+    ("Jacobs","djacobs"),
+    ("Gannage","gannage"),
+    ("Huffaker","huffaker"),
+    ("Carter","icarter"),
+    ("Lewis","lewis3"),
+    ("Parr","parr1"),
+    ("McKay","pmckay"),
+    ("Phipps","pphipps"),
+    ("Samson","psamson"),
+    ("Dienesch","rdienesc"),
+    ("Orsini","sorsini"),
+    ("Yun","yshhsy"),])
+
+def ensure_user(username):
+    # Look the user up, creating a stub record on first sight, then give
+    # the model a chance to fill in profile details before returning it.
+    user, _created = User.objects.get_or_create(username=username)
+    user.maybe_decorate()
+    return user
+
+def site_for_item(item):
+    """Find or create the Site a scraped reserve item belongs to.
+
+    Derives the term code and course from the item's scraped metadata and
+    maps instructor surnames to users via known_profs.  Raises KeyError
+    for instructors missing from known_profs, and DoesNotExist when the
+    course or term is not already in the database.
+    """
+    termcode, prof = item.term, item.instructor
+    # 'Winter 2011' -> '2011W' (year + first letter of the season)
+    termcode = termcode.split(' ')[-1] + termcode[0]
+    # course codes look like '03-250' somewhere in the scraped course string
+    # (raw string so the \d escapes are not interpreted by Python first)
+    coursecode = re.search(r'\d\d-\d\d\d', item.course).group(0)
+    profs = [ensure_user(known_profs[p.strip()])
+             for p in prof.split(',')]
+    # first-listed instructor owns the site
+    primary = profs[0]
+    course = Course.objects.get(code__contains=coursecode)
+    term = Term.objects.get(code=termcode)
+    site, created = Site.objects.get_or_create(
+        owner=primary,
+        start_term=term,
+        course=course,
+        defaults=dict(service_desk=ServiceDesk.default(),
+                      end_term=term))
+    return site
+
+DATA = 'data/'
+COURSES = os.listdir(DATA)
+
+
+# Walk every scraped course directory and load its items into the site.
+for course in COURSES:
+    items = list(Metadata.find_all(join(DATA, course)))
+    if not items:
+        continue
+    # any one item carries the course/term/instructor info for the site
+    _item = items[0]
+
+    site = site_for_item(_item)
+    print site
+
+    Item.objects.filter(site=site).delete() # fixme, just for testing.
+
+    for m in items:
+        d = m.data.copy()
+
+        # fold a secondary author into the single 'author' column
+        if 'author2' in d:
+            d['author'] = '%s;%s' % (d['author'], d['author2'])
+
+        # drop scrape-only keys that Item has no fields for
+        for key in ['_path', 'author2', 'course', 'datafile', 'filename',
+                    'instructor', 'localid', 'term', 'type']:
+            if key in d:
+                del d[key]
+
+        if m.type == 'url':
+            assert 'url' in d, ('No URL', m.data)
+            Item.objects.create(site=site, item_type='URL', **d)
+
+        elif m.type == 'file':
+            if m.mimetype is None:
+                pprint(m.data)
+                raise Exception('stop: a bad file?')
+
+            # content-addressed destination name: open in binary mode so
+            # the digest is computed over the file's actual bytes
+            with open(m.datafile, 'rb') as f:
+                digest = hashlib.md5(f.read()).hexdigest()
+            dest = digest
+            i = Item.objects.create(site=site, item_type='ELEC',
+                                    fileobj_mimetype = m.mimetype,
+                                    fileobj_origname = m.filename,
+                                    copyright_status='AV',
+                                    **d)
+
+            fullpath = os.path.join(settings.MEDIA_ROOT, upload_dir, dest)
+            if os.path.isfile(fullpath):
+                # identical content already uploaded: just point at it
+                i.fileobj.name = os.path.join(upload_dir, dest)
+            else:
+                # binary mode here too, so the stored copy is byte-exact
+                with open(m.datafile, 'rb') as f:
+                    i.fileobj.save(dest, File(f), save=False)
+            i.save()
n = 0
for (cid, aid) in itemlinkpat.findall(html):
+ print (n, cid, aid)
+
if (cid, aid) in done:
continue
print >> log, (n, 'file', m.groups())
urlpath, itemid, origfile = m.groups()
binary_url = '%s/%s' % (BASE, urlpath)
+ binary_url = binary_url.replace('[', r'\[').replace(']', r'\]')
cookie = browser.cj[0]
destfile = '%s/data%03d' % (PATH, n)
cmd = 'curl -s -b "%s=%s" "%s" > %s' % (cookie.name, cookie.value, binary_url, destfile)
+ #print cmd
os.system(cmd)
back()
done.add((cid, aid))
     def __init__(self, path):
         self._path = path
-        self.html = open(name).read()
+        # fix: 'name' was undefined here -- read from the path argument
+        self.html = open(path).read()
+        # e.g. '.../item0042.html' -> localid '0042'
         self.localid = re.search(r'item(\d+)', self._path).group(1)
         self._scrape()
+        # raw page text is only needed while scraping; free it afterwards
         del self.html
published='<td align="left" nowrap="nowrap">Date Published:</td><td align="left" width="100%">(.*?)<',
course='<td class="HEADER1" valign="middle" align="left" height="25"> (.*?) -',
instructor='<td class="HEADER1" valign="middle" align="left" height="25"> .*? - .*? - (.*?)<',
- term='<td class="HEADER1" valign="middle" align="left" height="25"> .*? - .*? \((.*?)\)',
+ term='<td class="HEADER1" valign="middle" align="left" height="25"> .* - .* \((.*?)\)',
)
if hasattr(self, 'journal'):
self.source_title = self.journal
del self.journal
- pat = re.compile(r"""onClick="javascript:popall\('(.*)'.*?">Click here for more information</a>""")
+ pat = re.compile(r"""onClick="javascript:popall\('(.*?)'.*?">Click here for more information</a>""")
m = pat.search(self.html)
if m:
self.type = 'url'
datafile = os.path.abspath(datafile)
self.datafile = datafile
+    @property
+    def mimetype(self):
+        # MIME type ('type/subtype') of the downloaded file as reported
+        # by `file -i`, or None when it cannot be determined.
+        assert self.datafile
+        # NOTE(review): datafile is interpolated unquoted into a shell
+        # command; acceptable only because these are trusted local paths.
+        with os.popen('file -i ' + self.datafile) as f:
+            tmp = f.readline()
+        try:
+            return re.search(r': (\w+/\w+);', tmp).group(1)
+        except AttributeError:
+            # re.search found no match: `file` produced unexpected output
+            return None
+
+    @classmethod
+    def find_all(cls, path):
+        # Yield a Metadata instance for every item0*.html file that
+        # find(1) locates anywhere under *path*.
+        cmd = 'find "%s" -name "item0*.html"' % path
+        for line in os.popen(cmd).readlines():
+            yield Metadata(line.strip())
 if __name__ == '__main__':
-    items = []
-    for name in os.popen('find data -name "item0*.html"').readlines():
-        name = name.strip()
-        m = Metadata(name)
-        items.append(m)
-        pprint(m.data)
+    # smoke test: scrape everything under data/ and report the detected
+    # MIME type of each file-type item
+    for m in Metadata.find_all('data/'):
+        #pprint(m.data)
+        if m.type == 'file':
+            pprint(m.mimetype)