From f2c789472aecfe1da4a3a2aead524e5877f7b268 Mon Sep 17 00:00:00 2001 From: Dan Scott Date: Wed, 24 Aug 2011 13:00:16 -0400 Subject: [PATCH] Add a script for quickly checking for encoding errors Also update the ldap_sync script to be more grown-up. Signed-off-by: Dan Scott --- tools/ebooks/check_encoding.py | 41 +++++++++ tools/patron-load/ldap_sync | 193 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 214 insertions(+), 20 deletions(-) create mode 100644 tools/ebooks/check_encoding.py diff --git a/tools/ebooks/check_encoding.py b/tools/ebooks/check_encoding.py new file mode 100644 index 0000000000..42f147c204 --- /dev/null +++ b/tools/ebooks/check_encoding.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python +""" +Flag potential corrupted characters in MARC records + +This is a very simple script that simply prints out the subfields +containing values matching \x in the repr() representation of the subfield +using the tab-delimited format of record number, tag, code, Unicode value, +and repr() value of the matching subfield. + +Shortcomings: if the subfield contains Unicode sequences (\u####) then the +script assumes that they're valid and does not flag them. +""" + +import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2 + +marcfile = 'gibson_cppc-2011-05-24_362.mrc.utf8' + +record_cnt = 0 + +reader = pymarc.MARCReader( + open(marcfile, mode='rb'), to_unicode=True +) +for record in reader: + record_cnt += 1 + # print "Record %d" % record_cnt + + field_cnt = 0 + for field in record.get_fields(): + if field.is_control_field(): + continue + + # print "Field %d, tag %s" % (field_cnt, field.tag) + for subfield in field: + # print repr(subfield[1]) + if r'\x' not in repr(subfield[1]): + continue + + print "%d\t%s\t%s\t%s\t%s" % ( + record_cnt, field.tag, subfield[0], subfield[1].encode('utf8'), repr(subfield[1]) + ) + diff --git a/tools/patron-load/ldap_sync b/tools/patron-load/ldap_sync index 32b8888bae..aca97de1c8 100644 --- a/tools/patron-load/ldap_sync +++ b/tools/patron-load/ldap_sync @@ -1,22 +1,175 @@ #!/usr/bin/env python -import ldap, sys - -con = ldap.initialize('ldap://142.51.1.188') - -#try: -# con.start_tls_s() -#except ldap.LDAPError, e: -# print e.message['info'] -# if type(e.message) == dict and e.message.has_key('desc'): -# print e.message['desc'] -# else: -# print e -# sys.exit() -# - -dn = "uid=Libr_LDAP;ou=EMPL;o=LUL" -pw = "" - -auth = con.simple_bind_s(dn, pw) -print auth +""" +ldap_sync: create and update Evergreen accounts based on an LDAP directory + +LDAP authentication information is stored in a separate Python file and +imported to avoid storing credentials in the VCS. + +Rough plan: + +1. Create new accounts + a. Pull new LDAP records since a given time from the LDAP directory + using the filter (createTimestamp>=$time) and insert into a + staging table with the following columns; included is a sample + mapping to the LU LDAP attributes: + * first_given_name (givenName) + * family_name (sn) + * ident_value (lulColleagueId) + * usrname (cn) + * language (preferredLanguage) + * profile (lulPrimaryAffiliation) + * datatel_barcode (datatel_to_barcode(lulColleagueId) + b. For each LDAP record, create a new library system account if it + does not already exist (check for matches based on usrname, email + address, datatel_barcode). Map LDAP attributes to account profile, + first and last names, email address. + * Set passwd to a randomly generated value; first time users can + reset via email + * Set ident_type = 2, ident_value = ident_value + * Set home_ou appropriately + * Set expire_date to next September for students, 20 years from + now (?) for faculty / staff + * Set preferred language stat cat + c. Create a new barcode for the user via a PostgreSQL routine; draw + the base number from a database series. We no longer want to use + barcodes based on the Datatel number. This routine should update + the actor.usr.card column with the appropriate card ID. +2. Update existing accounts + a. If we found a match in 1(b), then update attributes accordingly: + * Set preferred language stat cat + * Update ident_type / ident_value to Datatel ID + * Set email address based on cn +""" + +import sys +import ldap, luauth + +def datatel_to_barcode(datatel): + """ + Converts a Datatel Colleague ID into a barcode + + Used only for matching legacy barcodes for the purposes of updates. + New users will get a barcode generated for them from a database series. + + >>> datatel_to_barcode('0104923') + '00007001049233' + """ + + barcode = '000070%s' % (datatel) + barcode = '%s%d' % (barcode, mod10_checksum(barcode)) + + return barcode + +def mod10_checksum(barcode): + """ + Calculates the mod10 checksum for a given string of digits + + This checksum algorithm is used for Code 3 of 9 barcodes. + """ + + total, position = 0, 0 + for digit in barcode: + digit = int(digit) + position += 1 + if (position % 2): + digit *= 2 + if digit < 10: + total += digit + else: + total += digit - 9 + else: + total += digit + + rem = total % 10 + if rem: + return 10 - rem + return rem + +def database_mod10(): + """ + Define a PostgreSQL function for generating mod10 check digits + """ + + print """CREATE OR REPLACE FUNCTION evergreen.mod10(TEXT) RETURNS TEXT AS $$ + use strict; + use warnings; + + my $barcode = shift; + my $total = 0; + my $position = 0; + foreach my $digit (split('', $barcode)) { + $position++; + if ($position % 2) { + # Double it + $digit *= 2; + # If less than 10, add to the total + if ($digit < 10) { + $total += $digit; + } else { + $total += $digit - 9; + } + } else { + $total += $digit; + } + } + my $rem = $total % 10; + if ($rem) { + return 10 - $rem; + } + return $rem; +$$ LANGUAGE PLPERLU STRICT IMMUTABLE; +""" + +def create_staging_table(): + """ + Create a staging table for creating or updating user accounts + """ + + print """ +DROP TABLE IF EXISTS scratchpad.usr_staging; +CREATE TABLE scratchpad.usr_staging (usrname TEXT, family_name TEXT, first_given_name TEXT, ident_value TEXT, lang TEXT); +""" + +def search_for_students(con): + base_dn = 'o=lul' + search_scope = ldap.SCOPE_SUBTREE + attributes = ['lulPrimaryAffiliation', 'cn', 'mail', 'givenName', 'sn', 'lulColleagueId', 'preferredLanguage'] + filter = '(&(objectclass=lulEduPerson))' + filter = '(&(objectclass=lulEduPerson)(lulPrimaryAffiliation=*))' + + try: + result_id = con.search(base_dn, search_scope, filter, attributes) + result_set = [] + while 1: + result_type, result_data = con.result(result_id, 0) + if result_data == []: + break + else: + print result_data[0][0] + for key in result_data[0][1]: + print key, result_data[0][1][key] + except ldap.LDAPError, e: + print e + +if __name__ == '__main__': + import doctest + doctest.testmod() + + con = ldap.initialize(luauth.hostname) + con.set_option(ldap.OPT_REFERRALS, 0) + + try: + con.simple_bind_s(luauth.dn, luauth.pw) + search_for_students(con) + except ldap.LDAPError, e: + print "Could not connect: " + e.message['info'] + if type(e.message) == dict and e.message.has_key('desc'): + print e.message['desc'] + else: + print e + sys.exit() + finally: + con.unbind() + +# vim: et:ts=4:sw=4:tw=78: -- 2.11.0