From 764aa2f9e1cb1c1a05b4d072eac8eee0bdcede9b Mon Sep 17 00:00:00 2001 From: gfawcett Date: Sat, 4 Apr 2009 02:42:24 +0000 Subject: [PATCH] working on Mark Physical Items As Arrived: fuzzy match is working. The fuzzy-match is in place; not tuned yet, but it's there. It's a relevance-engine based on Levenshtein-distance comparison of the title, author (and to a lesser degree the publisher and pubdate). Ideas for improvements are most welcome. Note that this version of the code takes a full snapshot of the MARC record when a Physical Item is requested from the catalogue. So there are more opportunities for item comparison. The code is a horrible mess. Much cleanup to do. git-svn-id: svn://svn.open-ils.org/ILS-Contrib/servres/trunk@258 6d9bc8c9-1ec2-4278-b937-99fde70a366f --- conifer/custom/lib_integration.py | 32 ++++++++++++- conifer/libsystems/evergreen/item_status.py | 32 ++++--------- conifer/libsystems/z3950/marctools.py | 2 + conifer/libsystems/z3950/marcxml.py | 32 +++++++++++++ conifer/libsystems/z3950/yaz_search.py | 20 ++------ conifer/syrup/fuzzy_match.py | 46 +++++++++++++++++++ conifer/syrup/models.py | 5 ++ conifer/syrup/urls.py | 1 + conifer/syrup/views.py | 39 ++++++++++++---- conifer/templates/phys/mark_arrived.xhtml | 38 ++++++++++++++++ conifer/templates/phys/mark_arrived_choose.xhtml | 58 ++++++++++++++++++++++++ 11 files changed, 257 insertions(+), 48 deletions(-) create mode 100644 conifer/libsystems/z3950/marcxml.py create mode 100644 conifer/syrup/fuzzy_match.py create mode 100644 conifer/templates/phys/mark_arrived.xhtml create mode 100644 conifer/templates/phys/mark_arrived_choose.xhtml diff --git a/conifer/custom/lib_integration.py b/conifer/custom/lib_integration.py index 1f89406..2aa0732 100644 --- a/conifer/custom/lib_integration.py +++ b/conifer/custom/lib_integration.py @@ -12,9 +12,30 @@ # SIP for patron and item_info, and for item checkout and checkin, # OpenSRF for extended item info. + +# define a @caching decorator to exploit the Django cache. Fixme, move +# this somewhere else. +from django.core.cache import cache +def caching(prefix, timeout=60): + def g(func): + def f(*args): + v = cache.get((prefix, args)) + if v: + return v + else: + v = func(*args) + if v: + cache.set((prefix, args), v, timeout) + return v + return f + return g + + from django.conf import settings #LIBINT = settings.LIBRARY_INTEGRATION # more on this later. + +from conifer.libsystems.evergreen import item_status as I from conifer.libsystems.sip.sipclient import SIP @@ -34,6 +55,15 @@ def checkout(conn, patron_barcode, item_barcode): def checkin(conn, item_barcode): return conn.checkin(item_barcode, institution='', location='') - +@caching('bcbi', timeout=3600) +def barcode_to_bib_id(barcode): + return I.barcode_to_bib_id(barcode) + +@caching('bccp', timeout=3600) +def barcode_to_copy(barcode): + return I.barcode_to_copy(barcode) +@caching('bimx', timeout=3600) +def bib_id_to_marcxml(bib_id): + return I.bib_id_to_marcxml(bib_id) diff --git a/conifer/libsystems/evergreen/item_status.py b/conifer/libsystems/evergreen/item_status.py index 4d5f34e..026607f 100644 --- a/conifer/libsystems/evergreen/item_status.py +++ b/conifer/libsystems/evergreen/item_status.py @@ -1,29 +1,15 @@ -import warnings from support import ER, E1 -from pprint import pprint -# Proposing this as an interface method. Given a bib ID, return a dict -# giving the item's bibid, barcode, availability (boolean), -# holdability (boolean), and location (a string description). If the -# bib ID is invalid, return None. - -def lookup_availability(bib_id): - rec = E1('open-ils.search.asset.copy.fleshed2.retrieve', bib_id) - if 'stacktrace' in rec: - warnings.warn(repr(('no such bib id', bib_id, repr(rec)))) +def barcode_to_bib_id(barcode): + bib_id = (E1('open-ils.search.bib_id.by_barcode', barcode)) + if isinstance(bib_id, basestring): # it would be a dict if barcode not found. + return bib_id + else: return None - resp = { - 'bibid': bib_id, - 'barcode': rec['barcode'], - 'available': rec['status']['name'] == 'Available', - 'holdable': rec['status']['holdable'] == 't', - 'location': rec['location']['name']} - return resp +def bib_id_to_marcxml(bib_id): + return E1('open-ils.supercat.record.marcxml.retrieve', bib_id) if __name__ == '__main__': - DYLAN = 1321798 - #print lookup_availability(DYLAN) - - MISCHIEF = 2063351 - pprint(E1('open-ils.search.biblio.record.mods_slim.retrieve', MISCHIEF)) + from pprint import pprint + print bib_id_to_marcxml(barcode_to_bib_id(31862016799294)) diff --git a/conifer/libsystems/z3950/marctools.py b/conifer/libsystems/z3950/marctools.py index e6ed72c..a06f14c 100644 --- a/conifer/libsystems/z3950/marctools.py +++ b/conifer/libsystems/z3950/marctools.py @@ -91,6 +91,8 @@ class locToUTF8(object): def replace(self, str): "Given string str, returns unicode string with correct character replcements" + if isinstance(str, unicode): # added by Graham + return str searchchars = [] # build subset of search/replace pairs to use based on if first char of search appears in str prev = range(0,3) diff --git a/conifer/libsystems/z3950/marcxml.py b/conifer/libsystems/z3950/marcxml.py new file mode 100644 index 0000000..6f54d74 --- /dev/null +++ b/conifer/libsystems/z3950/marcxml.py @@ -0,0 +1,32 @@ +from xml.etree import ElementTree +import marctools + +loc_to_unicode = marctools.locToUTF8().replace + +def marcxml_to_dictionary(rec): + tree = ElementTree.fromstring(rec) + dct = {} + for df in tree.findall('{http://www.loc.gov/MARC21/slim}datafield'): + t = df.attrib['tag'] + for sf in df.findall('{http://www.loc.gov/MARC21/slim}subfield'): + c = sf.attrib['code'] + v = sf.text + dct[t+c] = loc_to_unicode(v) + return dct + +def marcxml_dictionary_to_dc(dct): + """Take a dictionary generated by marcxml_to_dictionary, and + extract some Dublin Core elements from it. Fixme, I'm sure this + could be way improved.""" + out = {} + meta = [('245a', 'dc:title'), ('100a', 'dc:creator'), ('260b', 'dc:publisher'), + ('260c', 'dc:date'), ('700a', 'dc:contributor')] + for marc, dc in meta: + value = dct.get(marc) + if value: + out[dc] = value + if '245b' in meta and 'dc:title' in out: + out['dc:title'] += (' %s' % meta['245b']) + return out + + diff --git a/conifer/libsystems/z3950/yaz_search.py b/conifer/libsystems/z3950/yaz_search.py index 1daf6ed..64f2996 100644 --- a/conifer/libsystems/z3950/yaz_search.py +++ b/conifer/libsystems/z3950/yaz_search.py @@ -6,19 +6,15 @@ import warnings import re -from xml.etree import ElementTree import pexpect -import marctools import sys - -loc_to_unicode = marctools.locToUTF8().replace +from marcxml import marcxml_to_dictionary LOG = sys.stderr #None # for pexpect debugging, try LOG = sys.stderr YAZ_CLIENT = 'yaz-client' GENERAL_TIMEOUT = 10 PRESENT_TIMEOUT = 30 - def search(host, database, query, start=1, limit=None): server = pexpect.spawn('yaz-client', timeout=GENERAL_TIMEOUT, logfile=LOG) @@ -60,26 +56,18 @@ def search(host, database, query, start=1, limit=None): parsed = [] for rec in raw_records: - dct = {} - parsed.append(dct) try: - tree = ElementTree.fromstring(rec) + dct = marcxml_to_dictionary(rec) except: raise rec - for df in tree.findall('{http://www.loc.gov/MARC21/slim}datafield'): - t = df.attrib['tag'] - for sf in df.findall('{http://www.loc.gov/MARC21/slim}subfield'): - c = sf.attrib['code'] - v = sf.text - dct[t+c] = loc_to_unicode(v) - + parsed.append(dct) return parsed + #------------------------------------------------------------ # some tests if __name__ == '__main__': - print loc_to_unicode('A\\XCC\\X81n') tests = [ ('dwarf.cs.uoguelph.ca:2210', 'conifer', '@and "Musson" "Evil"'), ('dwarf.cs.uoguelph.ca:2210', 'conifer', '@and "Denis" "Gravel"'), diff --git a/conifer/syrup/fuzzy_match.py b/conifer/syrup/fuzzy_match.py new file mode 100644 index 0000000..a40ab87 --- /dev/null +++ b/conifer/syrup/fuzzy_match.py @@ -0,0 +1,46 @@ +from conifer.syrup import models + +#http://www.poromenos.org/node/87. Credit to Poromenos. It's under BSD. +def levenshtein_distance(first, second): + """Find the Levenshtein distance between two strings.""" + if len(first) > len(second): + first, second = second, first + if len(second) == 0: + return len(first) + first_length = len(first) + 1 + second_length = len(second) + 1 + distance_matrix = [range(second_length) for x in range(first_length)] + for i in xrange(1, first_length): + for j in range(1, second_length): + deletion = distance_matrix[i-1][j] + 1 + insertion = distance_matrix[i][j-1] + 1 + substitution = distance_matrix[i-1][j-1] + if first[i-1] != second[j-1]: + substitution += 1 + distance_matrix[i][j] = min(insertion, deletion, substitution) + + return distance_matrix[first_length-1][second_length-1] + +def rank_pending_items(dct): + title = dct.get('dc:title','') + author = dct.get('dc:creator','') + publisher = dct.get('dc:publisher','') + pubdate = dct.get('dc:pubdate','') + + all_pending_items = models.Item.objects.filter(item_type='PHYS') # not right... also, prefetch metadata + results = [] + # not sure I like these weights, but let's play a bit. + METRICS = (('dc:title', 1), ('dc:creator', 1), ('dc:publisher', 0.5), ('dc:pubdate', 0.25)) + for item in all_pending_items: + scores = [] + for heading, weight in METRICS: + try: + ival = item.metadata_set.get(name=heading).value or '' + except: + ival = '' + dist = levenshtein_distance(dct.get(heading) or '', ival) + scores.append(dist/weight) + score = sum(scores) + results.append((score, item)) + results.sort() + return results diff --git a/conifer/syrup/models.py b/conifer/syrup/models.py index 69ab4a5..448dba7 100644 --- a/conifer/syrup/models.py +++ b/conifer/syrup/models.py @@ -281,6 +281,10 @@ class Course(m.Model): def get_students(self): return User.objects.filter(member__course__exact=self, member__role__exact='STUDT') \ .order_by('last_name', 'first_name') + + def get_instructors(self): + return User.objects.filter(member__course__exact=self, member__role__exact='INSTR') \ + .order_by('last_name', 'first_name') def _merge_sections(secs): delim = course_sections.sections_tuple_delimiter @@ -544,3 +548,4 @@ class Checkout(m.Model): completed = m.DateTimeField(default=None, null=True) outcome = m.CharField(max_length=100, null=True) + diff --git a/conifer/syrup/urls.py b/conifer/syrup/urls.py index 17cdb53..3b0da4a 100644 --- a/conifer/syrup/urls.py +++ b/conifer/syrup/urls.py @@ -48,6 +48,7 @@ urlpatterns = patterns('conifer.syrup.views', (r'^phys/$', 'phys_index'), (r'^phys/checkout/$', 'phys_checkout'), + (r'^phys/mark_arrived/$', 'phys_mark_arrived'), (r'^course/(?P\d+)/reseq$', 'course_reseq'), (ITEM_PREFIX + r'reseq', 'item_heading_reseq'), diff --git a/conifer/syrup/views.py b/conifer/syrup/views.py index cc708ad..ba49688 100644 --- a/conifer/syrup/views.py +++ b/conifer/syrup/views.py @@ -35,6 +35,8 @@ import re import sys from django.forms.models import modelformset_factory from conifer.custom import lib_integration +from conifer.libsystems.z3950.marcxml import marcxml_to_dictionary, marcxml_dictionary_to_dc +from fuzzy_match import rank_pending_items #----------------------------------------------------------------------------- # Z39.50 Support @@ -268,6 +270,7 @@ def z3950_test(request): return g.render('z3950_test.xhtml', res_str=res_str) def graham_z3950_test(request): + raise NotImplementedError # delete this function, its template, etc. query = request.GET.get('query', '@and "Denis" "Gravel"') from conifer.libsystems.z3950 import yaz_search from conifer.libsystems.evergreen.item_status import lookup_availability @@ -743,18 +746,18 @@ def item_add_cat_search(request, course_id, item_id): return _access_denied(_('You are not an editor.')) pickitem = eval(_pickitem) # fixme, dangerous. cache result server-side instead, or encrypt it. + dublin = marcxml_dictionary_to_dc(pickitem) + item = course.item_set.create(parent_heading=parent_item, title=pickitem.get('245a', 'Untitled'), item_type='PHYS') item.save() - # these are a temporary hack, must replace - meta = [('245a', 'dc:title'), ('100a', 'dc:creator'), ('260b', 'dc:publisher'), - ('260c', 'dc:date'), ('700a', 'dc:contributor')] - for marc, dc in meta: - value = pickitem.get(marc) - if value: - md = item.metadata_set.create(item=item, name=dc, value=value) - item.metadata_set.create(item=item, name='syrup:marc', value=simplejson.dumps(pickitem)) + + for dc, value in dublin.items(): + md = item.metadata_set.create(item=item, name=dc, value=value) + # store the whole darn MARC-dict as well. + json = simplejson.dumps(pickitem) + item.metadata_set.create(item=item, name='syrup:marc', value=json) item.save() return HttpResponseRedirect('../../../%d/' % item.id) @@ -1264,4 +1267,24 @@ def phys_checkout(request): return g.render('phys/checkout.xhtml', step=2, patron=patron, patron_descrip=post('patron_descrip')) + +def phys_mark_arrived(request): + if request.method != 'POST': + return g.render('phys/mark_arrived.xhtml') + else: + barcode = request.POST.get('item', '').strip() + bib_id = lib_integration.barcode_to_bib_id(barcode) + marcxml = lib_integration.bib_id_to_marcxml(bib_id) + dct = marcxml_to_dictionary(marcxml) + dublin = marcxml_dictionary_to_dc(dct) + # merge them + dct.update(dublin) + ranked = rank_pending_items(dct) + return g.render('phys/mark_arrived_choose.xhtml', + barcode=barcode, + bib_id=bib_id, + ranked=ranked, + metadata=dct) + + diff --git a/conifer/templates/phys/mark_arrived.xhtml b/conifer/templates/phys/mark_arrived.xhtml new file mode 100644 index 0000000..fdaf920 --- /dev/null +++ b/conifer/templates/phys/mark_arrived.xhtml @@ -0,0 +1,38 @@ + + + + + ${title} + + + + +

${title}

+
+
+ + + + + + + + + +
+
+ + diff --git a/conifer/templates/phys/mark_arrived_choose.xhtml b/conifer/templates/phys/mark_arrived_choose.xhtml new file mode 100644 index 0000000..b3b8df1 --- /dev/null +++ b/conifer/templates/phys/mark_arrived_choose.xhtml @@ -0,0 +1,58 @@ + + + + + ${title} + + + + +

${title}

+
+
+ + + + + + + + + + + + +

Matches

+

+ + + + + + + + + + + + + + + +
+
+ + -- 2.11.0