From 452f1b30d0746063b22c1506eab628485604fbe5 Mon Sep 17 00:00:00 2001 From: miker Date: Mon, 12 Oct 2009 17:06:01 +0000 Subject: [PATCH] This patch moves towards in-database indexed value normalization. The eventual goal of this is to move to pure database ingest handled by triggers. This will free us from the grip of the Ingest server, speed up ingest altogether, and cause ingest to occur entirely within the same database transaction as the INSERT or UPDATE to the MARC that is the cause of the ingest. This means no more potential for race conditions on ingest, and simpler data import. In this first step, we add some normalization routines for dealing with basic string data. NACO normalization and the like. With these functions we can do everything that the Ingest server can do with regard to munging indexed strings. You can register these normalizers with specific indexed fields, and define the order in which they are to be applied. Next up: work on the scaffolding to actually apply the functions, define the IDL entries, and create MARC-handling functions to do the xpath dances. This functionallity will require either: * a custom (I have a patch) pgxml contrib module or * Postgresql 8.3+ XML/XPath support git-svn-id: svn://svn.open-ils.org/ILS/trunk@14375 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- Open-ILS/src/sql/Pg/002.functions.config.sql | 4 + Open-ILS/src/sql/Pg/002.schema.config.sql | 35 ++++++++- Open-ILS/src/sql/Pg/950.data.seed-values.sql | 84 +++++++++++++++++++++ .../src/sql/Pg/upgrade/0032.ingest-normalizers.sql | 41 +++++++++++ .../Pg/upgrade/0033.data.ingest-normalizers.sql | 85 ++++++++++++++++++++++ 5 files changed, 248 insertions(+), 1 deletion(-) create mode 100644 Open-ILS/src/sql/Pg/upgrade/0032.ingest-normalizers.sql create mode 100644 Open-ILS/src/sql/Pg/upgrade/0033.data.ingest-normalizers.sql diff --git a/Open-ILS/src/sql/Pg/002.functions.config.sql b/Open-ILS/src/sql/Pg/002.functions.config.sql index 38d426c133..e8740b3b7e 100644 --- a/Open-ILS/src/sql/Pg/002.functions.config.sql +++ b/Open-ILS/src/sql/Pg/002.functions.config.sql @@ -120,5 +120,9 @@ CREATE OR REPLACE FUNCTION public.first_word ( TEXT ) RETURNS TEXT AS $$ SELECT SUBSTRING( $1 FROM $_$^\S+$_$); $$ LANGUAGE SQL; +CREATE OR REPLACE FUNCTION public.naco_normalize_keep_comma( TEXT ) RETURNS TEXT AS $func$ + SELECT public.naco_normalize($1,'a'); +$func$ LANGUAGE SQL STRICT IMMUTABLE; + COMMIT; diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql index c63a5ccb56..de80e10d8b 100644 --- a/Open-ILS/src/sql/Pg/002.schema.config.sql +++ b/Open-ILS/src/sql/Pg/002.schema.config.sql @@ -51,7 +51,7 @@ CREATE TABLE config.upgrade_log ( install_date TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() ); -INSERT INTO config.upgrade_log (version) VALUES ('0031'); -- miker +INSERT INTO config.upgrade_log (version) VALUES ('0033'); -- miker CREATE TABLE config.bib_source ( id SERIAL PRIMARY KEY, @@ -562,5 +562,38 @@ CREATE TABLE config.org_unit_setting_type ( ( datatype <> 'link' AND fm_class IS NULL ) ) ); + +-- Some handy functions, based on existing ones, to provide optional ingest normalization + +CREATE OR REPLACE FUNCTION public.left_trunc( TEXT, INT ) RETURNS TEXT AS $func$ + SELECT SUBSTRING($1,$2); +$func$ LANGUAGE SQL STRICT IMMUTABLE; + +CREATE OR REPLACE FUNCTION public.right_trunc( TEXT, INT ) RETURNS TEXT AS $func$ + SELECT SUBSTRING($1,1,$2); +$func$ LANGUAGE SQL STRICT IMMUTABLE; + +CREATE OR REPLACE FUNCTION public.split_date_range( TEXT ) RETURNS TEXT AS $func$ + SELECT REGEXP_REPLACE( $1, E'(\\d{4})-(\\d{4})', E'\\1 \\2', 'g' ); +$func$ LANGUAGE SQL STRICT IMMUTABLE; + +-- And ... a table in which to register them + +CREATE TABLE config.index_normalizer ( + id SERIAL PRIMARY KEY, + name TEXT UNIQUE NOT NULL, + func TEXT NOT NULL, + param_count INT NOT NULL DEFAULT 0 +); + +CREATE TABLE config.metabib_field_index_norm_map ( + id SERIAL PRIMARY KEY, + field INT NOT NULL REFERENCES config.metabib_field (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + norm INT NOT NULL REFERENCES config.index_normalizer (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + params TEXT, + pos INT NOT NULL DEFAULT 0 +); + + COMMIT; diff --git a/Open-ILS/src/sql/Pg/950.data.seed-values.sql b/Open-ILS/src/sql/Pg/950.data.seed-values.sql index 1239c1c3ab..9e3b7b07ba 100644 --- a/Open-ILS/src/sql/Pg/950.data.seed-values.sql +++ b/Open-ILS/src/sql/Pg/950.data.seed-values.sql @@ -769,6 +769,8 @@ INSERT INTO config.z3950_attr (id, source, name, label, code, format) INSERT INTO config.z3950_attr (id, source, name, label, code, format) VALUES (9, 'loc', 'item_type', oils_i18n_gettext(9, 'Item Type', 'cza', 'label'), 1001, 1); +UPDATE config.z3950_attr SET truncation = 1 WHERE source = 'loc'; + INSERT INTO config.z3950_attr (id, source, name, label, code, format) VALUES (10, 'oclc', 'tcn', oils_i18n_gettext(10, 'Title Control Number', 'cza', 'label'), 12, 1); INSERT INTO config.z3950_attr (id, source, name, label, code, format) @@ -2117,3 +2119,85 @@ INSERT INTO config.org_unit_setting_type (name, label, description, datatype) VA 'bool' ); + +-- in-db indexing normalizers +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'NACO Normalize', + 'Apply NACO normalization rules to the extracted text. See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.', + 'naco_normalize', + 0 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'Normalize date range', + 'Split date ranges in the form of "XXXX-YYYY" into "XXXX YYYY" for proper index.', + 'split_date_range', + 1 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'NACO Normalize -- retain first comma', + 'Apply NACO normalization rules to the extracted text, retaining the first comma. See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.', + 'naco_normalize_keep_comma', + 0 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'Strip Diacritics', + 'Convert text to NFD form and remove non-spacing combining marks.', + 'remove_diacritics', + 0 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'Up-case', + 'Convert text upper case.', + 'uppercase', + 0 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'Down-case', + 'Convert text lower case.', + 'lowercase', + 0 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'Extract Dewey-like number', + 'Extract a string of numeric characters ther resembles a DDC number.', + 'call_number_dewey', + 0 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'Left truncation', + 'Discard the specified number of characters from the left side of the string.', + 'left_trunc', + 1 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'Right truncation', + 'Include only the specified number of characters from the left side of the string.', + 'right_trunc', + 1 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'First word', + 'Include only the first space-separated word of a string.', + 'first_word', + 0 +); + +-- make use of the index normalizers + +INSERT INTO config.metabib_field_index_norm_map (field,norm) + SELECT m.id, + i.id + FROM config.metabib_field m, + config.index_normalizer i + WHERE i.func IN ('naco_normalize','split_date_range'); + + diff --git a/Open-ILS/src/sql/Pg/upgrade/0032.ingest-normalizers.sql b/Open-ILS/src/sql/Pg/upgrade/0032.ingest-normalizers.sql new file mode 100644 index 0000000000..2fdd6cd5f5 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/0032.ingest-normalizers.sql @@ -0,0 +1,41 @@ +BEGIN; + +INSERT INTO config.upgrade_log (version) VALUES ('0032'); -- miker + +-- Some handy functions, based on existing ones, to provide optional ingest normalization + +CREATE OR REPLACE FUNCTION public.left_trunc( TEXT, INT ) RETURNS TEXT AS $func$ + SELECT SUBSTRING($1,$2); +$func$ LANGUAGE SQL STRICT IMMUTABLE; + +CREATE OR REPLACE FUNCTION public.right_trunc( TEXT, INT ) RETURNS TEXT AS $func$ + SELECT SUBSTRING($1,1,$2); +$func$ LANGUAGE SQL STRICT IMMUTABLE; + +CREATE OR REPLACE FUNCTION public.naco_normalize_keep_comma( TEXT ) RETURNS TEXT AS $func$ + SELECT public.naco_normalize($1,'a'); +$func$ LANGUAGE SQL STRICT IMMUTABLE; + +CREATE OR REPLACE FUNCTION public.split_date_range( TEXT ) RETURNS TEXT AS $func$ + SELECT REGEXP_REPLACE( $1, E'(\\d{4})-(\\d{4})', E'\\1 \\2', 'g' ); +$func$ LANGUAGE SQL STRICT IMMUTABLE; + +-- And ... a table in which to register them + +CREATE TABLE config.index_normalizer ( + id SERIAL PRIMARY KEY, + name TEXT UNIQUE NOT NULL, + func TEXT NOT NULL, + param_count INT NOT NULL DEFAULT 0 +); + +CREATE TABLE config.metabib_field_index_norm_map ( + id SERIAL PRIMARY KEY, + field INT NOT NULL REFERENCES config.metabib_field (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + norm INT NOT NULL REFERENCES config.index_normalizer (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + params TEXT, + pos INT NOT NULL DEFAULT 0 +); + +COMMIT; + diff --git a/Open-ILS/src/sql/Pg/upgrade/0033.data.ingest-normalizers.sql b/Open-ILS/src/sql/Pg/upgrade/0033.data.ingest-normalizers.sql new file mode 100644 index 0000000000..12d31e796a --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/0033.data.ingest-normalizers.sql @@ -0,0 +1,85 @@ +BEGIN; + +INSERT INTO config.upgrade_log (version) VALUES ('0033'); -- miker + + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'NACO Normalize', + 'Apply NACO normalization rules to the extracted text. See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.', + 'naco_normalize', + 0 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'Normalize date range', + 'Split date ranges in the form of "XXXX-YYYY" into "XXXX YYYY" for proper index.', + 'split_date_range', + 1 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'NACO Normalize -- retain first comma', + 'Apply NACO normalization rules to the extracted text, retaining the first comma. See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.', + 'naco_normalize_keep_comma', + 0 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'Strip Diacritics', + 'Convert text to NFD form and remove non-spacing combining marks.', + 'remove_diacritics', + 0 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'Up-case', + 'Convert text upper case.', + 'uppercase', + 0 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'Down-case', + 'Convert text lower case.', + 'lowercase', + 0 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'Extract Dewey-like number', + 'Extract a string of numeric characters ther resembles a DDC number.', + 'call_number_dewey', + 0 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'Left truncation', + 'Discard the specified number of characters from the left side of the string.', + 'left_trunc', + 1 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'Right truncation', + 'Include only the specified number of characters from the left side of the string.', + 'right_trunc', + 1 +); + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'First word', + 'Include only the first space-separated word of a string.', + 'first_word', + 0 +); + + +INSERT INTO config.metabib_field_index_norm_map (field,norm) + SELECT m.id, + i.id + FROM config.metabib_field m, + config.index_normalizer i + WHERE i.func IN ('naco_normalize','split_date_range'); + +COMMIT; + -- 2.11.0