working on Mark Physical Items As Arrived: fuzzy match is working.
author gfawcett <gfawcett@6d9bc8c9-1ec2-4278-b937-99fde70a366f>
Sat, 4 Apr 2009 02:42:24 +0000 (02:42 +0000)
committer gfawcett <gfawcett@6d9bc8c9-1ec2-4278-b937-99fde70a366f>
Sat, 4 Apr 2009 02:42:24 +0000 (02:42 +0000)
The fuzzy-match is in place; not tuned yet, but it's there. It's a
relevance-engine based on Levenshtein-distance comparison of the
title, author (and to a lesser degree the publisher and
pubdate). Ideas for improvements are most welcome.

Note that this version of the code takes a full snapshot of the MARC
record when a Physical Item is requested from the catalogue. So there
are more opportunities for item comparison.

The code is a horrible mess. Much cleanup to do.

git-svn-id: svn://svn.open-ils.org/ILS-Contrib/servres/trunk@258 6d9bc8c9-1ec2-4278-b937-99fde70a366f

conifer/custom/lib_integration.py
conifer/libsystems/evergreen/item_status.py
conifer/libsystems/z3950/marctools.py
conifer/libsystems/z3950/marcxml.py [new file with mode: 0644]
conifer/libsystems/z3950/yaz_search.py
conifer/syrup/fuzzy_match.py [new file with mode: 0644]
conifer/syrup/models.py
conifer/syrup/urls.py
conifer/syrup/views.py
conifer/templates/phys/mark_arrived.xhtml [new file with mode: 0644]
conifer/templates/phys/mark_arrived_choose.xhtml [new file with mode: 0644]

index 1f89406..2aa0732 100644 (file)
 # SIP for patron and item_info, and for item checkout and checkin,
 # OpenSRF for extended item info.
 
+
+# define a @caching decorator to exploit the Django cache. Fixme, move
+# this somewhere else.
+from django.core.cache import cache
+def caching(prefix, timeout=60):
+    def g(func):
+        def f(*args):
+            v = cache.get((prefix, args))
+            if v:
+                return v
+            else:
+                v = func(*args)
+                if v:
+                    cache.set((prefix, args), v, timeout)
+                    return v
+        return f
+    return g
+
+
 from django.conf import settings
 #LIBINT = settings.LIBRARY_INTEGRATION # more on this later.
 
+
+from conifer.libsystems.evergreen import item_status as I
 from conifer.libsystems.sip.sipclient import SIP
 
 
@@ -34,6 +55,15 @@ def checkout(conn, patron_barcode, item_barcode):
 def checkin(conn, item_barcode):
     return conn.checkin(item_barcode, institution='', location='')
 
-    
 
+@caching('bcbi', timeout=3600)
+def barcode_to_bib_id(barcode):
+    return I.barcode_to_bib_id(barcode)
+
+@caching('bccp', timeout=3600)
+def barcode_to_copy(barcode):
+    return I.barcode_to_copy(barcode)
 
+@caching('bimx', timeout=3600)
+def bib_id_to_marcxml(bib_id):
+    return I.bib_id_to_marcxml(bib_id)
index 4d5f34e..026607f 100644 (file)
@@ -1,29 +1,15 @@
-import warnings
 from support import ER, E1
-from pprint import pprint
 
-# Proposing this as an interface method. Given a bib ID, return a dict
-# giving the item's bibid, barcode, availability (boolean),
-# holdability (boolean), and location (a string description). If the
-# bib ID is invalid, return None.
-
-def lookup_availability(bib_id):
-    rec = E1('open-ils.search.asset.copy.fleshed2.retrieve', bib_id)
-    if 'stacktrace' in rec:
-        warnings.warn(repr(('no such bib id', bib_id, repr(rec))))
+def barcode_to_bib_id(barcode):
+    bib_id = (E1('open-ils.search.bib_id.by_barcode', barcode))
+    if isinstance(bib_id, basestring): # it would be a dict if barcode not found.
+        return bib_id
+    else:
         return None
-    resp = {
-        'bibid':     bib_id,
-        'barcode':   rec['barcode'],
-        'available': rec['status']['name'] == 'Available',
-        'holdable':  rec['status']['holdable'] == 't',
-        'location':  rec['location']['name']}
-    return resp
 
+def bib_id_to_marcxml(bib_id):
+    return E1('open-ils.supercat.record.marcxml.retrieve', bib_id)
 
 if __name__ == '__main__':
-    DYLAN = 1321798
-    #print lookup_availability(DYLAN)
-
-    MISCHIEF = 2063351
-    pprint(E1('open-ils.search.biblio.record.mods_slim.retrieve', MISCHIEF))
+    from pprint import pprint
+    print bib_id_to_marcxml(barcode_to_bib_id(31862016799294))
index e6ed72c..a06f14c 100644 (file)
@@ -91,6 +91,8 @@ class locToUTF8(object):
 
     def replace(self, str):
         "Given string str, returns unicode string with correct character replcements"
+        if isinstance(str, unicode): # added by Graham
+            return str
         searchchars = []
         # build subset of search/replace pairs to use based on if first char of search appears in str
         prev = range(0,3)
diff --git a/conifer/libsystems/z3950/marcxml.py b/conifer/libsystems/z3950/marcxml.py
new file mode 100644 (file)
index 0000000..6f54d74
--- /dev/null
@@ -0,0 +1,32 @@
+from xml.etree import ElementTree
+import marctools
+
+loc_to_unicode = marctools.locToUTF8().replace
+
+def marcxml_to_dictionary(rec):
+    tree = ElementTree.fromstring(rec)
+    dct = {}
+    for df in tree.findall('{http://www.loc.gov/MARC21/slim}datafield'):
+        t = df.attrib['tag']
+        for sf in df.findall('{http://www.loc.gov/MARC21/slim}subfield'):
+            c = sf.attrib['code']
+            v = sf.text
+            dct[t+c] = loc_to_unicode(v)
+    return dct
+
+def marcxml_dictionary_to_dc(dct):
+    """Take a dictionary generated by marcxml_to_dictionary, and
+    extract some Dublin Core elements from it. Fixme, I'm sure this
+    could be way improved."""
+    out = {}
+    meta = [('245a', 'dc:title'), ('100a', 'dc:creator'), ('260b', 'dc:publisher'),
+            ('260c', 'dc:date'), ('700a', 'dc:contributor')]
+    for marc, dc in meta:
+        value = dct.get(marc)
+        if value:
+            out[dc] = value
+    if '245b' in meta and 'dc:title' in out:
+        out['dc:title'] += (' %s' % meta['245b'])
+    return out
+
+    
index 1daf6ed..64f2996 100644 (file)
@@ -6,19 +6,15 @@
 
 import warnings
 import re
-from xml.etree import ElementTree
 import pexpect
-import marctools
 import sys
-
-loc_to_unicode = marctools.locToUTF8().replace
+from marcxml import marcxml_to_dictionary
 
 LOG = sys.stderr #None              #  for pexpect debugging, try LOG = sys.stderr
 YAZ_CLIENT = 'yaz-client'
 GENERAL_TIMEOUT = 10
 PRESENT_TIMEOUT = 30
 
-
 def search(host, database, query, start=1, limit=None):
 
     server = pexpect.spawn('yaz-client', timeout=GENERAL_TIMEOUT, logfile=LOG)
@@ -60,26 +56,18 @@ def search(host, database, query, start=1, limit=None):
 
     parsed = []
     for rec in raw_records:
-        dct = {}
-        parsed.append(dct)
         try:
-            tree = ElementTree.fromstring(rec)
+            dct = marcxml_to_dictionary(rec)
         except:
             raise rec
-        for df in tree.findall('{http://www.loc.gov/MARC21/slim}datafield'):
-            t = df.attrib['tag']
-            for sf in df.findall('{http://www.loc.gov/MARC21/slim}subfield'):
-                c = sf.attrib['code']
-                v = sf.text
-                dct[t+c] = loc_to_unicode(v)
-
+        parsed.append(dct)
     return parsed
 
+
 #------------------------------------------------------------
 # some tests
 
 if __name__ == '__main__':
-    print loc_to_unicode('A\\XCC\\X81n')
     tests = [
         ('dwarf.cs.uoguelph.ca:2210', 'conifer', '@and "Musson" "Evil"'),
         ('dwarf.cs.uoguelph.ca:2210', 'conifer', '@and "Denis" "Gravel"'),
diff --git a/conifer/syrup/fuzzy_match.py b/conifer/syrup/fuzzy_match.py
new file mode 100644 (file)
index 0000000..a40ab87
--- /dev/null
@@ -0,0 +1,46 @@
+from conifer.syrup import models
+
+#http://www.poromenos.org/node/87. Credit to Poromenos. It's under BSD.
+def levenshtein_distance(first, second):
+    """Find the Levenshtein distance between two strings."""
+    if len(first) > len(second):
+        first, second = second, first
+    if len(second) == 0:
+        return len(first)
+    first_length = len(first) + 1
+    second_length = len(second) + 1
+    distance_matrix = [range(second_length) for x in range(first_length)]
+    for i in xrange(1, first_length):
+        for j in range(1, second_length):
+            deletion = distance_matrix[i-1][j] + 1
+            insertion = distance_matrix[i][j-1] + 1
+            substitution = distance_matrix[i-1][j-1]
+            if first[i-1] != second[j-1]:
+                substitution += 1
+            distance_matrix[i][j] = min(insertion, deletion, substitution)
+
+    return distance_matrix[first_length-1][second_length-1]
+
+def rank_pending_items(dct):
+    title = dct.get('dc:title','')
+    author = dct.get('dc:creator','')
+    publisher = dct.get('dc:publisher','')
+    pubdate  = dct.get('dc:pubdate','')
+
+    all_pending_items = models.Item.objects.filter(item_type='PHYS') # not right... also, prefetch metadata
+    results = []
+    # not sure I like these weights, but let's play a bit.
+    METRICS = (('dc:title', 1), ('dc:creator', 1), ('dc:publisher', 0.5), ('dc:pubdate', 0.25))
+    for item in all_pending_items:
+        scores = []
+        for heading, weight in METRICS:
+            try:
+                ival = item.metadata_set.get(name=heading).value or ''
+            except:
+                ival = ''
+            dist = levenshtein_distance(dct.get(heading) or '', ival)
+            scores.append(dist/weight)
+        score = sum(scores)
+        results.append((score, item))
+    results.sort()
+    return results
index 69ab4a5..448dba7 100644 (file)
@@ -281,6 +281,10 @@ class Course(m.Model):
     def get_students(self):
         return User.objects.filter(member__course__exact=self, member__role__exact='STUDT') \
             .order_by('last_name', 'first_name')
+    
+    def get_instructors(self):
+        return User.objects.filter(member__course__exact=self, member__role__exact='INSTR') \
+            .order_by('last_name', 'first_name')
 
 def _merge_sections(secs):
     delim = course_sections.sections_tuple_delimiter
@@ -544,3 +548,4 @@ class Checkout(m.Model):
     completed = m.DateTimeField(default=None, null=True)
     outcome  = m.CharField(max_length=100, null=True)
     
+
index 17cdb53..3b0da4a 100644 (file)
@@ -48,6 +48,7 @@ urlpatterns = patterns('conifer.syrup.views',
 
     (r'^phys/$', 'phys_index'),
     (r'^phys/checkout/$', 'phys_checkout'),
+    (r'^phys/mark_arrived/$', 'phys_mark_arrived'),
 
     (r'^course/(?P<course_id>\d+)/reseq$', 'course_reseq'),
     (ITEM_PREFIX + r'reseq', 'item_heading_reseq'),
index cc708ad..ba49688 100644 (file)
@@ -35,6 +35,8 @@ import re
 import sys
 from django.forms.models import modelformset_factory
 from conifer.custom import lib_integration
+from conifer.libsystems.z3950.marcxml import marcxml_to_dictionary, marcxml_dictionary_to_dc
+from fuzzy_match import rank_pending_items
 
 #-----------------------------------------------------------------------------
 # Z39.50 Support
@@ -268,6 +270,7 @@ def z3950_test(request):
     return g.render('z3950_test.xhtml', res_str=res_str)
 
 def graham_z3950_test(request):
+    raise NotImplementedError   # delete this function, its template, etc.
     query = request.GET.get('query', '@and "Denis" "Gravel"')
     from conifer.libsystems.z3950 import yaz_search
     from conifer.libsystems.evergreen.item_status import lookup_availability
@@ -743,18 +746,18 @@ def item_add_cat_search(request, course_id, item_id):
             return _access_denied(_('You are not an editor.'))
 
         pickitem = eval(_pickitem) # fixme, dangerous. cache result server-side instead, or encrypt it.
+        dublin = marcxml_dictionary_to_dc(pickitem)
+
         item = course.item_set.create(parent_heading=parent_item,
                                       title=pickitem.get('245a', 'Untitled'),
                                       item_type='PHYS')
         item.save()
-        # these are a temporary hack, must replace
-        meta = [('245a', 'dc:title'), ('100a', 'dc:creator'), ('260b', 'dc:publisher'),
-                ('260c', 'dc:date'), ('700a', 'dc:contributor')]
-        for marc, dc in meta:
-            value = pickitem.get(marc)
-            if value:
-                md = item.metadata_set.create(item=item, name=dc, value=value)
-        item.metadata_set.create(item=item, name='syrup:marc', value=simplejson.dumps(pickitem))
+
+        for dc, value in dublin.items():
+            md = item.metadata_set.create(item=item, name=dc, value=value)
+        # store the whole darn MARC-dict as well.
+        json = simplejson.dumps(pickitem)
+        item.metadata_set.create(item=item, name='syrup:marc', value=json)
         item.save()
         return HttpResponseRedirect('../../../%d/' % item.id)
 
@@ -1264,4 +1267,24 @@ def phys_checkout(request):
             return g.render('phys/checkout.xhtml', step=2, 
                             patron=patron,
                             patron_descrip=post('patron_descrip'))
+
         
+def phys_mark_arrived(request):
+    if request.method != 'POST':
+        return g.render('phys/mark_arrived.xhtml')
+    else:
+        barcode = request.POST.get('item', '').strip()
+        bib_id  = lib_integration.barcode_to_bib_id(barcode)
+        marcxml = lib_integration.bib_id_to_marcxml(bib_id)
+        dct     = marcxml_to_dictionary(marcxml)
+        dublin  = marcxml_dictionary_to_dc(dct)
+        # merge them
+        dct.update(dublin)
+        ranked = rank_pending_items(dct)
+        return g.render('phys/mark_arrived_choose.xhtml', 
+                        barcode=barcode,
+                        bib_id=bib_id,
+                        ranked=ranked,
+                        metadata=dct)
+
+    
diff --git a/conifer/templates/phys/mark_arrived.xhtml b/conifer/templates/phys/mark_arrived.xhtml
new file mode 100644 (file)
index 0000000..fdaf920
--- /dev/null
@@ -0,0 +1,38 @@
+<?python
+sample_item = '31862016799294'  # fixme, just for testing.
+title = _('Mark Items as Arrived')
+?>
+<html xmlns="http://www.w3.org/1999/xhtml"
+      xmlns:xi="http://www.w3.org/2001/XInclude"
+      xmlns:py="http://genshi.edgewall.org/">
+<xi:include href="../master.xhtml"/>
+<head>
+  <title>${title}</title>
+  <script>
+    $(function() { $('form:last input:visible:first').focus(); });
+  </script>
+  <style>
+    .success { background-color: #dfd; }
+    .failure { background-color: #fdd; }
+  </style>
+</head>
+<body>
+  <h1>${title}</h1>
+  <form action="." method="POST">
+    <div>
+    <table class="metadata_table">
+      <tr>
+       <th>Item Barcode</th>
+       <td>
+         <input type="text" id="item" name="item" style="width: 400;" value="${sample_item}"/>
+       </td>
+      </tr>
+      <tr>
+       <th/>
+       <td><input type="submit" value="${False and _('Check out another item') or _('Continue')}"/></td>
+      </tr>
+    </table>
+  </div>
+  </form>
+</body>
+</html>
diff --git a/conifer/templates/phys/mark_arrived_choose.xhtml b/conifer/templates/phys/mark_arrived_choose.xhtml
new file mode 100644 (file)
index 0000000..b3b8df1
--- /dev/null
@@ -0,0 +1,58 @@
+<?python
+import re
+sample_item = '31862016799294'  # fixme, just for testing.
+title = _('Mark Items as Arrived: Choose Match')
+?>
+<html xmlns="http://www.w3.org/1999/xhtml"
+      xmlns:xi="http://www.w3.org/2001/XInclude"
+      xmlns:py="http://genshi.edgewall.org/">
+<xi:include href="../master.xhtml"/>
+<head>
+  <title>${title}</title>
+  <script>
+    $(function() { $('form:last input:visible:first').focus(); });
+  </script>
+  <style>
+    .success { background-color: #dfd; }
+    .failure { background-color: #fdd; }
+    .likely { background-color: #dfd; }
+    .doubtful { font-size: 75%; }
+  </style>
+</head>
+<body>
+  <h1>${title}</h1>
+  <form action="." method="POST">
+    <div>
+      <table class="metadata_table">
+       <tr><th>Item Barcode</th><td>${barcode}</td></tr>
+       <tr py:for="k in ['dc:title', 'dc:creator', 'dc:publisher', 'dc:date']" py:if="k in metadata">
+         <th>${k}</th><td>${metadata[k]}</td>
+       </tr>
+       <tr><th/><td><a href="javascript:$('#more_detail').toggle(); void(0);">Detail</a></td></tr>
+      </table>
+      <table id="more_detail" style="display: none;" class="metadata_table">
+      <tr py:for="k in metadata">
+       <th>${k}</th><td>${metadata[k]}</td>
+      </tr>
+    </table>
+    <h2>Matches</h2>
+    <p><button>Associate with matches selected below</button></p>
+    <table class="metadata_table">
+      <thead style="font-size: 70%;">
+       <tr><th py:for="v in 'Select Title Author Course Instructor Score'.split(' ')">${v}</th></tr>
+      </thead>
+      <tbody>
+      <tr py:for="score, item in ranked" class="${score &lt; 5 and 'likely' or score &lt; 50 and 'maybe' or 'doubtful'}">
+       <td><input type="checkbox" name="choose_${item.id}" id="choose_${item.id}"/></td>
+       <td><label for="choose_${item.id}">${item}</label></td>
+       <td>${item.author()}</td>
+       <td>${item.course.title}</td>
+       <td>${','.join(n.last_name for n in item.course.get_instructors())}</td>
+       <td>${repr(score)}</td>
+       </tr>
+       </tbody>
+    </table>
+  </div>
+  </form>
+</body>
+</html>