From 452f1b30d0746063b22c1506eab628485604fbe5 Mon Sep 17 00:00:00 2001
From: miker <miker@dcc99617-32d9-48b4-a31d-7c20da2025e4>
Date: Mon, 12 Oct 2009 17:06:01 +0000
Subject: [PATCH] This patch moves towards in-database indexed value
 normalization.  The eventual goal of this is to move to pure database ingest
 handled by triggers.  This will free us from the grip of the Ingest server,
 speed up ingest altogether, and cause ingest to occur entirely within the
 same database transaction as the INSERT or UPDATE to the MARC that is the
 cause of the ingest.  This means no more potential for race conditions on
 ingest, and simpler data import.

In this first step, we add some normalization routines for dealing with basic
string data.  NACO normalization and the like.  With these functions we can
do everything that the Ingest server can do with regard to munging indexed
strings.

You can register these normalizers with specific indexed fields, and define the
order in which they are to be applied.

Next up: work on the scaffolding to actually apply the functions, define the
IDL entries, and create MARC-handling functions to do the xpath dances.

This functionallity will require either:

 * a custom (I have a patch) pgxml contrib module or
 * Postgresql 8.3+ XML/XPath support




git-svn-id: svn://svn.open-ils.org/ILS/trunk@14375 dcc99617-32d9-48b4-a31d-7c20da2025e4
---
 Open-ILS/src/sql/Pg/002.functions.config.sql       |  4 +
 Open-ILS/src/sql/Pg/002.schema.config.sql          | 35 ++++++++-
 Open-ILS/src/sql/Pg/950.data.seed-values.sql       | 84 +++++++++++++++++++++
 .../src/sql/Pg/upgrade/0032.ingest-normalizers.sql | 41 +++++++++++
 .../Pg/upgrade/0033.data.ingest-normalizers.sql    | 85 ++++++++++++++++++++++
 5 files changed, 248 insertions(+), 1 deletion(-)
 create mode 100644 Open-ILS/src/sql/Pg/upgrade/0032.ingest-normalizers.sql
 create mode 100644 Open-ILS/src/sql/Pg/upgrade/0033.data.ingest-normalizers.sql

diff --git a/Open-ILS/src/sql/Pg/002.functions.config.sql b/Open-ILS/src/sql/Pg/002.functions.config.sql
index 38d426c133..e8740b3b7e 100644
--- a/Open-ILS/src/sql/Pg/002.functions.config.sql
+++ b/Open-ILS/src/sql/Pg/002.functions.config.sql
@@ -120,5 +120,9 @@ CREATE OR REPLACE FUNCTION public.first_word ( TEXT ) RETURNS TEXT AS $$
         SELECT SUBSTRING( $1 FROM $_$^\S+$_$);
 $$ LANGUAGE SQL;
 
+CREATE OR REPLACE FUNCTION public.naco_normalize_keep_comma( TEXT ) RETURNS TEXT AS $func$
+        SELECT public.naco_normalize($1,'a');
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
 COMMIT;
 
diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql
index c63a5ccb56..de80e10d8b 100644
--- a/Open-ILS/src/sql/Pg/002.schema.config.sql
+++ b/Open-ILS/src/sql/Pg/002.schema.config.sql
@@ -51,7 +51,7 @@ CREATE TABLE config.upgrade_log (
     install_date    TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
 );
 
-INSERT INTO config.upgrade_log (version) VALUES ('0031'); -- miker
+INSERT INTO config.upgrade_log (version) VALUES ('0033'); -- miker
 
 CREATE TABLE config.bib_source (
 	id		SERIAL	PRIMARY KEY,
@@ -562,5 +562,38 @@ CREATE TABLE config.org_unit_setting_type (
       ( datatype <> 'link' AND fm_class IS NULL ) )
 );
 
+
+-- Some handy functions, based on existing ones, to provide optional ingest normalization
+
+CREATE OR REPLACE FUNCTION public.left_trunc( TEXT, INT ) RETURNS TEXT AS $func$
+        SELECT SUBSTRING($1,$2);
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION public.right_trunc( TEXT, INT ) RETURNS TEXT AS $func$
+        SELECT SUBSTRING($1,1,$2);
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION public.split_date_range( TEXT ) RETURNS TEXT AS $func$
+        SELECT REGEXP_REPLACE( $1, E'(\\d{4})-(\\d{4})', E'\\1 \\2', 'g' );
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+-- And ... a table in which to register them
+
+CREATE TABLE config.index_normalizer (
+        id              SERIAL  PRIMARY KEY,
+        name            TEXT    UNIQUE NOT NULL,
+        func            TEXT    NOT NULL,
+        param_count     INT     NOT NULL DEFAULT 0
+);
+
+CREATE TABLE config.metabib_field_index_norm_map (
+        id      SERIAL  PRIMARY KEY,
+        field   INT     NOT NULL REFERENCES config.metabib_field (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
+        norm    INT     NOT NULL REFERENCES config.index_normalizer (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
+        params  TEXT,
+        pos     INT     NOT NULL DEFAULT 0
+);
+
+
 COMMIT;
 
diff --git a/Open-ILS/src/sql/Pg/950.data.seed-values.sql b/Open-ILS/src/sql/Pg/950.data.seed-values.sql
index 1239c1c3ab..9e3b7b07ba 100644
--- a/Open-ILS/src/sql/Pg/950.data.seed-values.sql
+++ b/Open-ILS/src/sql/Pg/950.data.seed-values.sql
@@ -769,6 +769,8 @@ INSERT INTO config.z3950_attr (id, source, name, label, code, format)
 INSERT INTO config.z3950_attr (id, source, name, label, code, format)
 	VALUES (9, 'loc', 'item_type', oils_i18n_gettext(9, 'Item Type', 'cza', 'label'), 1001, 1);
 
+UPDATE config.z3950_attr SET truncation = 1 WHERE source = 'loc';
+
 INSERT INTO config.z3950_attr (id, source, name, label, code, format)
 	VALUES (10, 'oclc', 'tcn', oils_i18n_gettext(10, 'Title Control Number', 'cza', 'label'), 12, 1);
 INSERT INTO config.z3950_attr (id, source, name, label, code, format)
@@ -2117,3 +2119,85 @@ INSERT INTO config.org_unit_setting_type (name, label, description, datatype) VA
     'bool'
 );
 
+
+-- in-db indexing normalizers
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'NACO Normalize',
+	'Apply NACO normalization rules to the extracted text.  See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.',
+	'naco_normalize',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Normalize date range',
+	'Split date ranges in the form of "XXXX-YYYY" into "XXXX YYYY" for proper index.',
+	'split_date_range',
+	1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'NACO Normalize -- retain first comma',
+	'Apply NACO normalization rules to the extracted text, retaining the first comma.  See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.',
+	'naco_normalize_keep_comma',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Strip Diacritics',
+	'Convert text to NFD form and remove non-spacing combining marks.',
+	'remove_diacritics',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Up-case',
+	'Convert text upper case.',
+	'uppercase',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Down-case',
+	'Convert text lower case.',
+	'lowercase',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Extract Dewey-like number',
+	'Extract a string of numeric characters ther resembles a DDC number.',
+	'call_number_dewey',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Left truncation',
+	'Discard the specified number of characters from the left side of the string.',
+	'left_trunc',
+	1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Right truncation',
+	'Include only the specified number of characters from the left side of the string.',
+	'right_trunc',
+	1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'First word',
+	'Include only the first space-separated word of a string.',
+	'first_word',
+	0
+);
+
+-- make use of the index normalizers
+
+INSERT INTO config.metabib_field_index_norm_map (field,norm)
+	SELECT	m.id,
+		i.id
+	  FROM	config.metabib_field m,
+		config.index_normalizer i
+	  WHERE	i.func IN ('naco_normalize','split_date_range');
+
+
diff --git a/Open-ILS/src/sql/Pg/upgrade/0032.ingest-normalizers.sql b/Open-ILS/src/sql/Pg/upgrade/0032.ingest-normalizers.sql
new file mode 100644
index 0000000000..2fdd6cd5f5
--- /dev/null
+++ b/Open-ILS/src/sql/Pg/upgrade/0032.ingest-normalizers.sql
@@ -0,0 +1,41 @@
+BEGIN;
+
+INSERT INTO config.upgrade_log (version) VALUES ('0032'); -- miker
+
+-- Some handy functions, based on existing ones, to provide optional ingest normalization
+
+CREATE OR REPLACE FUNCTION public.left_trunc( TEXT, INT ) RETURNS TEXT AS $func$
+	SELECT SUBSTRING($1,$2);
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION public.right_trunc( TEXT, INT ) RETURNS TEXT AS $func$
+	SELECT SUBSTRING($1,1,$2);
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION public.naco_normalize_keep_comma( TEXT ) RETURNS TEXT AS $func$
+	SELECT public.naco_normalize($1,'a');
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION public.split_date_range( TEXT ) RETURNS TEXT AS $func$
+	SELECT REGEXP_REPLACE( $1, E'(\\d{4})-(\\d{4})', E'\\1 \\2', 'g' );
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+-- And ... a table in which to register them
+
+CREATE TABLE config.index_normalizer (
+	id		SERIAL	PRIMARY KEY,
+	name		TEXT	UNIQUE NOT NULL,
+	func		TEXT	NOT NULL,
+	param_count	INT	NOT NULL DEFAULT 0
+);
+
+CREATE TABLE config.metabib_field_index_norm_map (
+	id	SERIAL	PRIMARY KEY,
+	field	INT	NOT NULL REFERENCES config.metabib_field (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
+	norm	INT	NOT NULL REFERENCES config.index_normalizer (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
+	params	TEXT,
+	pos	INT	NOT NULL DEFAULT 0
+);
+
+COMMIT;
+
diff --git a/Open-ILS/src/sql/Pg/upgrade/0033.data.ingest-normalizers.sql b/Open-ILS/src/sql/Pg/upgrade/0033.data.ingest-normalizers.sql
new file mode 100644
index 0000000000..12d31e796a
--- /dev/null
+++ b/Open-ILS/src/sql/Pg/upgrade/0033.data.ingest-normalizers.sql
@@ -0,0 +1,85 @@
+BEGIN;
+
+INSERT INTO config.upgrade_log (version) VALUES ('0033'); -- miker
+
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'NACO Normalize',
+	'Apply NACO normalization rules to the extracted text.  See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.',
+	'naco_normalize',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Normalize date range',
+	'Split date ranges in the form of "XXXX-YYYY" into "XXXX YYYY" for proper index.',
+	'split_date_range',
+	1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'NACO Normalize -- retain first comma',
+	'Apply NACO normalization rules to the extracted text, retaining the first comma.  See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.',
+	'naco_normalize_keep_comma',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Strip Diacritics',
+	'Convert text to NFD form and remove non-spacing combining marks.',
+	'remove_diacritics',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Up-case',
+	'Convert text upper case.',
+	'uppercase',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Down-case',
+	'Convert text lower case.',
+	'lowercase',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Extract Dewey-like number',
+	'Extract a string of numeric characters ther resembles a DDC number.',
+	'call_number_dewey',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Left truncation',
+	'Discard the specified number of characters from the left side of the string.',
+	'left_trunc',
+	1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Right truncation',
+	'Include only the specified number of characters from the left side of the string.',
+	'right_trunc',
+	1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'First word',
+	'Include only the first space-separated word of a string.',
+	'first_word',
+	0
+);
+
+
+INSERT INTO config.metabib_field_index_norm_map (field,norm)
+	SELECT	m.id,
+		i.id
+	  FROM	config.metabib_field m,
+		config.index_normalizer i
+	  WHERE	i.func IN ('naco_normalize','split_date_range');
+
+COMMIT;
+
-- 
2.11.0