From b5dd05aa72c091e73f2344138df4afbed7fa7fc2 Mon Sep 17 00:00:00 2001 From: dbs Date: Tue, 3 Aug 2010 03:14:49 +0000 Subject: [PATCH] Slight overhaul of ISBN indexing focused on the translate_isbn1013 function * Switch to a PLPERLU function built on Business::ISBN for more robust indexing of records with problematic ISBNs; thanks to Jason Stephenson for the initial implementation * Remove the first_word, naco_normalize, and split_date_range normalizers from the indexing chain for ISBNs, as these unnecessarily munge the ISBNs. We can trust Business::ISBN to determine what is a real ISBN, hyphens and all. * Index all ISBNs in a given record and generate the corresponding ISBN10/ISBN13 variations. Also, fix the checksum if given an ISBN with a bad checksum, but be sure to index the original bad-checksum ISBN as well. * Add a set of 10 MARC records with bad ISBNs (thanks to Jason Stephenson again for these) for testing purposes to Open-ILS/tests/datasets * Add the Business::ISBN Perl module as a prerequisite to the installer * Remove contrib/isn from the PostgreSQL install instructions git-svn-id: svn://svn.open-ils.org/ILS/trunk@17066 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- Open-ILS/src/extras/Makefile.install | 6 ++ Open-ILS/src/sql/Pg/002.schema.config.sql | 56 +++++++++++++++++- Open-ILS/src/sql/Pg/950.data.seed-values.sql | 13 +---- .../upgrade/0357.schema.isbn1013-translation.sql | 68 ++++++++++++++++++++++ Open-ILS/tests/datasets/README | 1 + Open-ILS/tests/datasets/badisbns.xml | 13 +++++ README | 1 - 7 files changed, 143 insertions(+), 15 deletions(-) create mode 100644 Open-ILS/src/sql/Pg/upgrade/0357.schema.isbn1013-translation.sql create mode 100644 Open-ILS/tests/datasets/badisbns.xml diff --git a/Open-ILS/src/extras/Makefile.install b/Open-ILS/src/extras/Makefile.install index 7f2ef0904..ee29b7ddd 100644 --- a/Open-ILS/src/extras/Makefile.install +++ b/Open-ILS/src/extras/Makefile.install @@ -101,6 +101,8 @@ DEBS = \ # Debian Lenny and Ubuntu Intrepid 
bundle recent versions of yaz EXTRA_DEBS = \ + libbusiness-isbn-perl\ + libbusiness-isbn-data-perl\ libmarc-charset-perl \ libmarc-xml-perl \ libnet-z3950-zoom-perl \ @@ -158,6 +160,8 @@ FEDORA_13_RPMS = \ ncurses-devel \ ncurses-libs \ perl-Business-CreditCard \ + perl-Business-ISBN \ + perl-Business-ISBN-Data \ perl-Email-Send \ perl-GDGraph3d \ perl-MARC-Record \ @@ -248,6 +252,8 @@ CPAN_MODULES_SAFE = \ # Recent Debian/Ubuntus have libmarc-charset-perl, libmarc-xml-perl, libnet-z3950-zoom-perl CPAN_MODULES_MARC = \ + Business::ISBN \ + Business::ISBN::Data \ MARC::Charset \ MARC::File::XML \ Net::Z3950::ZOOM diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql index 17839a59a..020b9e4f4 100644 --- a/Open-ILS/src/sql/Pg/002.schema.config.sql +++ b/Open-ILS/src/sql/Pg/002.schema.config.sql @@ -68,7 +68,7 @@ CREATE TABLE config.upgrade_log ( install_date TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() ); -INSERT INTO config.upgrade_log (version) VALUES ('0356'); -- miker +INSERT INTO config.upgrade_log (version) VALUES ('0357'); -- dbs CREATE TABLE config.bib_source ( id SERIAL PRIMARY KEY, @@ -724,8 +724,58 @@ CREATE OR REPLACE FUNCTION public.split_date_range( TEXT ) RETURNS TEXT AS $func $func$ LANGUAGE SQL STRICT IMMUTABLE; CREATE OR REPLACE FUNCTION public.translate_isbn1013( TEXT ) RETURNS TEXT AS $func$ - SELECT isbn FROM (SELECT isn_weak(true), $1 || ' ' || REPLACE( CASE WHEN length($1) = 10 THEN isbn13($1)::TEXT WHEN length($1) = 13 THEN isbn($1)::TEXT ELSE '' END, '-', '') AS isbn)x; -$func$ LANGUAGE SQL STRICT IMMUTABLE; + use Business::ISBN; + use strict; + use warnings; + + # For each ISBN found in a single string containing a set of ISBNs: + # * Normalize an incoming ISBN to have the correct checksum and no hyphens + # * Convert an incoming ISBN10 or ISBN13 to its counterpart and return + + my $input = shift; + my $output = ''; + + foreach my $word (split(' ', $input)) { # ' ' splits on whitespace runs, so no empty words + my $isbn = 
Business::ISBN->new($word); + + # First check the checksum; if it is not valid, fix it and add the original + # bad-checksum ISBN to the output + if ($isbn && $isbn->is_valid_checksum() == Business::ISBN::BAD_CHECKSUM) { + $output .= $isbn->isbn() . " "; + $isbn->fix_checksum(); + } + + # If we now have a valid ISBN, convert it to its counterpart ISBN10/ISBN13 + # and add the normalized original ISBN to the output + if ($isbn && $isbn->is_valid()) { + my $isbn_xlated = ($isbn->type eq "ISBN13") ? $isbn->as_isbn10 : $isbn->as_isbn13; + $output .= $isbn->isbn . " "; + + # If we successfully converted the ISBN to its counterpart, add the + # converted ISBN to the output as well + $output .= ($isbn_xlated->isbn . " ") if ($isbn_xlated); + } + } + return $output if $output; + + # If there were no valid ISBNs, just return the raw input + return $input; +$func$ LANGUAGE PLPERLU STRICT; -- STRICT: NULL input returns NULL without entering Perl + +COMMENT ON FUNCTION public.translate_isbn1013(TEXT) IS $$ +/* + * Copyright (C) 2010 Merrimack Valley Library Consortium + * Jason Stephenson + * Copyright (C) 2010 Laurentian University + * Dan Scott + * + * The translate_isbn1013 function takes an input ISBN and returns the + * following in a single space-delimited string if the input ISBN is valid: + * - The normalized input ISBN (hyphens stripped) + * - The normalized input ISBN with a fixed checksum if the checksum was bad + * - The ISBN converted to its ISBN10 or ISBN13 counterpart, if possible + */ +$$; -- And ... 
a table in which to register them diff --git a/Open-ILS/src/sql/Pg/950.data.seed-values.sql b/Open-ILS/src/sql/Pg/950.data.seed-values.sql index 9876e41a8..01bf79ad0 100644 --- a/Open-ILS/src/sql/Pg/950.data.seed-values.sql +++ b/Open-ILS/src/sql/Pg/950.data.seed-values.sql @@ -4226,7 +4226,7 @@ INSERT INTO config.index_normalizer (name, description, func, param_count) VALUE INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( 'Extract Dewey-like number', - 'Extract a string of numeric characters ther resembles a DDC number.', + 'Extract a string of numeric characters that resembles a DDC number.', 'call_number_dewey', 0 ); @@ -4281,16 +4281,7 @@ INSERT INTO config.metabib_field_index_norm_map (field,norm) FROM config.metabib_field m, config.index_normalizer i WHERE i.func IN ('naco_normalize','split_date_range') - AND m.id NOT IN (19); - -INSERT INTO config.metabib_field_index_norm_map (field,norm,pos) - SELECT m.id, - i.id, - 1 - FROM config.metabib_field m, - config.index_normalizer i - WHERE i.func IN ('first_word') - AND m.id IN (18); + AND m.id NOT IN (18, 19); INSERT INTO config.metabib_field_index_norm_map (field,norm,pos) SELECT m.id, diff --git a/Open-ILS/src/sql/Pg/upgrade/0357.schema.isbn1013-translation.sql b/Open-ILS/src/sql/Pg/upgrade/0357.schema.isbn1013-translation.sql new file mode 100644 index 000000000..b389bb5cb --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/0357.schema.isbn1013-translation.sql @@ -0,0 +1,68 @@ +BEGIN; + +INSERT INTO config.upgrade_log (version) VALUES ('0357'); -- dbs + +DELETE FROM config.metabib_field_index_norm_map + WHERE norm IN ( + SELECT id + FROM config.index_normalizer + WHERE func IN ('first_word', 'naco_normalize', 'split_date_range') + ) + AND field = 18 +; + +CREATE OR REPLACE FUNCTION public.translate_isbn1013( TEXT ) RETURNS TEXT AS $func$ + use Business::ISBN; + use strict; + use warnings; + + # For each ISBN found in a single string containing a set of ISBNs: + # * Normalize an 
incoming ISBN to have the correct checksum and no hyphens + # * Convert an incoming ISBN10 or ISBN13 to its counterpart and return + + my $input = shift; + my $output = ''; + + foreach my $word (split(' ', $input)) { # ' ' splits on whitespace runs, so no empty words + my $isbn = Business::ISBN->new($word); + + # First check the checksum; if it is not valid, fix it and add the original + # bad-checksum ISBN to the output + if ($isbn && $isbn->is_valid_checksum() == Business::ISBN::BAD_CHECKSUM) { + $output .= $isbn->isbn() . " "; + $isbn->fix_checksum(); + } + + # If we now have a valid ISBN, convert it to its counterpart ISBN10/ISBN13 + # and add the normalized original ISBN to the output + if ($isbn && $isbn->is_valid()) { + my $isbn_xlated = ($isbn->type eq "ISBN13") ? $isbn->as_isbn10 : $isbn->as_isbn13; + $output .= $isbn->isbn . " "; + + # If we successfully converted the ISBN to its counterpart, add the + # converted ISBN to the output as well + $output .= ($isbn_xlated->isbn . " ") if ($isbn_xlated); + } + } + return $output if $output; + + # If there were no valid ISBNs, just return the raw input + return $input; +$func$ LANGUAGE PLPERLU STRICT; -- STRICT: NULL input returns NULL without entering Perl + +COMMENT ON FUNCTION public.translate_isbn1013(TEXT) IS $$ +/* + * Copyright (C) 2010 Merrimack Valley Library Consortium + * Jason Stephenson + * Copyright (C) 2010 Laurentian University + * Dan Scott + * + * The translate_isbn1013 function takes an input ISBN and returns the + * following in a single space-delimited string if the input ISBN is valid: + * - The normalized input ISBN (hyphens stripped) + * - The normalized input ISBN with a fixed checksum if the checksum was bad + * - The ISBN converted to its ISBN10 or ISBN13 counterpart, if possible + */ +$$; + +COMMIT; diff --git a/Open-ILS/tests/datasets/README b/Open-ILS/tests/datasets/README index ef88a1829..1e13e4f96 100644 --- a/Open-ILS/tests/datasets/README +++ b/Open-ILS/tests/datasets/README @@ -3,6 +3,7 
@@ The following table lists the data sets we have collected for testing purposes. 
^ File name ^ Format ^ Encoding ^ Source system ^ Description ^ | auth-1066.mrc | MARC21 | UTF8 | Aleph | Dutch authority records from IISH (missing 0 in leader 22) | | auth-subset100.mrc| MARC21 | UTF8 | Aleph | Dutch authority records from IISH (missing 0 in leader 22) | +| badisbns.xml | MARC21XML | UTF8 | | 10 records with known bad ISBNs | | FSL.marc | MARC21 | UTF8 | Aleph | Armenian and Cyrillic scripts, collected from the Fundamental Science Library in Yerevan, Armenia | | hebrew.marc | MARC21 | MARC8 | III | Hebrew scripts, 25 records | | lul_fre_100.marc | MARC21 | MARC8 | Unicorn GL3.1 | 100 records, French, pre-1923 | diff --git a/Open-ILS/tests/datasets/badisbns.xml b/Open-ILS/tests/datasets/badisbns.xml new file mode 100644 index 000000000..a1c28beb8 --- /dev/null +++ b/Open-ILS/tests/datasets/badisbns.xml @@ -0,0 +1,13 @@ + + +00663ncm a2200229Ia 4500ocm2061267120090604115400.0891108s1989 mnupp i n eng dHL00361126 :$14.95TULTULUtOrBLWMRQA782.421630264S6986 1989Songs of the 60's :piano, vocal, guitar.Winona, MN :H. Leonard,c1989.1 score (175 p.) :ill. ;31 cm.Decade seriesWith a preface by Stanley Green.Popular music1961-1970Songs of the sixties.DE (Series)*SOT60S99000 +00631nam a2200217Ii 4500ocm0570287420080722155500.0791116s1978 wiua j 000 1 eng d089542262X295LSMLSMm.c.UtOrBLWMRQAFicDicJDickens, Charles,1812-1870A Christmas carol /by Charles Dickens.Milwaukee, Wis. :Ideals Pub. Corp.,c1978.[46] p. :ill. (part col.) ;28 cm.On cover: Dickens' Christmas carol.Dickens' Christmas carol.DICCCAR15000 +00882cam a2200277Ma 4500ocm2510451120090602130300.0830627t19751972onca 000 1 eng d055310131517505532043000553268473 (pbk.) :$6.99TEUTEUMRQUtOrBLWMRQAJLe Guin, Ursula K.,1929-The farthest shore /Ursula K. Le Guin ; illustrated by Gail Garraty.Bantam ed.Toronto [Ont.] ;New York, N.Y. :Bantam Books,1975, c1972.197 p. :ill. ;18 cm.Earthsea trilogy ; Bk. 3.A Bantam book.10131-5This is the third of the author's Earthsea trilogy.Earthsea trilogy ; Bk. 
3.Bantam book10131-5 ;10131-5.LE*FSHO97000 +00624nam a22002050 4500mrq00398759MAnMC20090602092700.0880222s1947 nyu a 0 eng d4980858214bbd :$9.00MVLCengUtOrBLWCTL0210694Mason, Bernard Sterling,1896-1953Cabins, cottages and summer homes /by Bernard S. Mason and Frederick H. Kock.New York :A.S. Barnes,[1947]168p. :ill., plans ;29 cm.Log cabinsCottagesKock, Frederic H.,1902-MASCCAS99000 +00684nam a22002410i 4500mrq00459563MAnMC20090604115300.0840313r19791958nyu a f eng c0440487331pbk.MVLCengUtOrBLWPZ7.P3145To2CTL0242335JPearce, PhilippaTom's midnight garden /by A. Philippa Pearce ; illustrated by Susan Linzig. --.New York :Dell Publishing,1979.229 p. :ill. --A Yearling BookReprint of 1958 edition.Linzig, Susan,ill.Yearling bookPEATMGA98000 +00926nam a2200265Ka 4500ocm51762602OCoLC20080729125900.0030228s2003 vau 000 0 eng dJL03022802 set :$800.00IJVIJVMRQUtOrBLWMRQAR 332.67S785Standard & Poor's register of corporations, directors and executives, 2003.v. 2.New York, N.Y. :Standard & Poor's Corp.,c2003.3 v. ;29 cm.v. 2. Directors and Executives.Directors of corporationsUnited StatesDirectories.CorporationsUnited StatesDirectoriesExecutivesUnited StatesDirectoriesStandard and Poor's Corporation[2003 ed.]Standard & Poor's register of corporations, directors and executives, 2003.X0MRQ +01199pam a2200373 a 4500ocm69733027OCoLC20070917154400.0060403s2006 enka 001 0 eng GBA651402bnb013480703Uk978184533213X (hbk.)184533213X (hbk.)9781845332136 (hbk.)(OCoLC)69733027UKMUKMBAKERCPLVP@QBXBTCTANZFEPMRQukscpMRQANK4660.M55 2006738.8/2/07522738.07522Miller's ceramic figures.Ceramic figuresLondon :Miller's,2006.288 p. :ill. (some col.) ;24 cm.Previous ed.: Tenterden: Millers, 2000.Includes bibliographical references (p. 
284) and indexes.Porcelain figuresPrices.Porcelain figuresCollectors and collecting.Baker & TaylorBKTY24.9518.71184533213X0006763197activeQuality Books, Inc.QUALqbi07800321Baker and TaylorBTCPBK0006763197C0MRQ +00474nam a2200169 a 4500mrq01096224MAnMC20080804205500.0071109s2005 nyu 000 1 eng d(0373197527pbk.) :$4.25UtOrBLWRussell, RebeccaThe substitute fiancee /Rebecca Russell.New York :Silhouette,2005.186 p. ;18 cm.Silhouette romance1752Silhouette romance1752. +01474cgm a2200433Ka 4500ocn214285386OCoLC20100628155400.0vd cvaizq080327s2007 cc --- vlchi d978788430094XCNE220704980(OCoLC)214285386MBTMBTMRQMRQA$1Chuang guan dong.Disc 1-3, Episode 1-9 /[videorecording]Zhongyang dian shi tai wen yi zhong xin ying shi bu, Shangdong dian ying dian shi ju zhi zuo zhong xin, Dalian dian shi tai lian he she zhi ; bian ju: Gao Mantang, Sun Jianye ; dao yan: Kong Sheng, Wang Bing ; zong dao yan: Zhang Xinjian.Jinan :Qi lu dian zi yin xiang chu ban she,2007.3 videodiscs :sd., col. ;4 3/4 in.Chang pian (52 ji) dian shi lian xu ju.Li Youbin, Sa Rina, Song Jia, Niu Li, Ma Enran, Wang Quirong.Summary in vernacular field only.DVD.In Mandarin with Chinese subtitles.ChinaRural conditionsDrama.Television programsGao, MantangSun, JianyeKong, Sheng.Wang, BingZhang, Xinjian.Li, Youbin.Sa, Rina.Song, Jia.Niu, Li.Ma, Enran.Wang, Quirong.C0MRQ +00490ngm a2200169 a 4500mrq01239723MAnMC20100616133100.0vd gaizu s2010 v eng d8901452100503DHIF2338SNH 6-11-10 mbu (Length of dvd & pub date not listed AM)Teen PattiWS[videorecording]WS.[s.l.], India :Serendipity Fims,c2010?1 videodisc (? min.) :sd., col. ;4 3/4 in. 
+ \ No newline at end of file diff --git a/README b/README index 201ffbb19..d83fb359a 100644 --- a/README +++ b/README @@ -132,7 +132,6 @@ createlang plpgsql evergreen psql -f /usr/share/postgresql/8.4/contrib/tablefunc.sql -d evergreen psql -f /usr/share/postgresql/8.4/contrib/tsearch2.sql -d evergreen psql -f /usr/share/postgresql/8.4/contrib/pgxml.sql -d evergreen -psql -f /usr/share/postgresql/8.4/contrib/isn.sql -d evergreen Once you have created the Evergreen database, you need to create a PostgreSQL user to access the database. Issue the following command as the "postgres" -- 2.11.0