The goal of this is to move to pure database ingest handled by triggers. This will
free us from the grip of the Ingest server, speed up ingest altogether, and
cause ingest to occur entirely within the same database transaction as the
INSERT or UPDATE to the MARC that is the cause of the ingest. This means
no more potential for race conditions on ingest, and simpler data import.
In this first step, we add some normalization routines for dealing with basic
string data. NACO normalization and the like. With these functions we can
do everything that the Ingest server can do with regard to munging indexed
strings.
You can register these normalizers with specific indexed fields, and define the
order in which they are to be applied.
Next up: work on the scaffolding to actually apply the functions, define the
IDL entries, and create MARC-handling functions to do the xpath dances.
This functionality will require either:
* a custom (I have a patch) pgxml contrib module or
* Postgresql 8.3+ XML/XPath support
git-svn-id: svn://svn.open-ils.org/ILS/trunk@14375 dcc99617-32d9-48b4-a31d-7c20da2025e4
SELECT SUBSTRING( $1 FROM $_$^\S+$_$);
$$ LANGUAGE SQL;
+-- Single-argument wrapper around the two-argument public.naco_normalize:
+-- forwards the text with 'a' as the second argument.
+-- NOTE(review): presumably 'a' selects the "retain first comma" mode of
+-- naco_normalize -- confirm against that function's definition.
+-- STRICT: a NULL argument yields NULL without running the body.
+CREATE OR REPLACE FUNCTION public.naco_normalize_keep_comma( TEXT )
+RETURNS TEXT AS $body$
+    SELECT public.naco_normalize( $1, 'a' );
+$body$ LANGUAGE SQL IMMUTABLE STRICT;
+
COMMIT;
install_date TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
);
-INSERT INTO config.upgrade_log (version) VALUES ('0031'); -- miker
+INSERT INTO config.upgrade_log (version) VALUES ('0033'); -- miker
CREATE TABLE config.bib_source (
id SERIAL PRIMARY KEY,
( datatype <> 'link' AND fm_class IS NULL ) )
);
+
+-- Some handy functions, based on existing ones, to provide optional ingest normalization
+
+-- Discard the first $2 characters of $1 and return the remainder.
+-- SUBSTRING positions are 1-based, so the remainder starts at position
+-- $2 + 1; starting at $2 itself would drop one character too few and
+-- disagree with the "Left truncation" normalizer description ("Discard the
+-- specified number of characters from the left side of the string").
+-- A count of 0 returns the text unchanged.
+-- STRICT: any NULL argument yields NULL without running the body.
+CREATE OR REPLACE FUNCTION public.left_trunc( TEXT, INT ) RETURNS TEXT AS $func$
+    SELECT SUBSTRING($1, $2 + 1);
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+-- Keep only the first $2 characters of $1 (everything to the right of that
+-- point is cut off).
+-- STRICT: any NULL argument yields NULL without running the body.
+CREATE OR REPLACE FUNCTION public.right_trunc( TEXT, INT )
+RETURNS TEXT AS $body$
+    SELECT SUBSTRING( $1 FROM 1 FOR $2 );
+$body$ LANGUAGE SQL IMMUTABLE STRICT;
+
+-- Rewrite each "NNNN-NNNN" run (two four-digit groups joined by a hyphen)
+-- as "NNNN NNNN", leaving the rest of the text untouched.
+-- STRICT: a NULL argument yields NULL without running the body.
+CREATE OR REPLACE FUNCTION public.split_date_range( TEXT )
+RETURNS TEXT AS $body$
+    SELECT REGEXP_REPLACE(
+        $1,
+        E'(\\d{4})-(\\d{4})',   -- capture the two four-digit groups
+        E'\\1 \\2',             -- re-emit them separated by a space
+        'g'                     -- replace all non-overlapping matches
+    );
+$body$ LANGUAGE SQL IMMUTABLE STRICT;
+
+-- And ... a table in which to register them
+
+-- Registry of text-normalization functions that can be attached to indexed
+-- fields through config.metabib_field_index_norm_map.
+--   name        : unique label for the normalizer
+--   description : free-text explanation; the seed INSERTs for this table
+--                 supply it, so the column must exist or they fail
+--   func        : name of the SQL function implementing the normalizer
+--   param_count : defaults to 0; presumably the number of arguments the
+--                 function takes beyond the text itself -- TODO confirm
+CREATE TABLE config.index_normalizer (
+    id          SERIAL  PRIMARY KEY,
+    name        TEXT    UNIQUE NOT NULL,
+    description TEXT,
+    func        TEXT    NOT NULL,
+    param_count INT     NOT NULL DEFAULT 0
+);
+
+-- Pairs one config.metabib_field row with one config.index_normalizer row.
+-- Deleting either parent removes the pairing (ON DELETE CASCADE); both
+-- foreign keys are DEFERRABLE INITIALLY DEFERRED.
+--   params : optional text for the normalizer (format not defined here)
+--   pos    : integer defaulting to 0; presumably the application order of
+--            the normalizers on a field -- TODO confirm
+CREATE TABLE config.metabib_field_index_norm_map (
+    id      SERIAL  PRIMARY KEY,
+    field   INT     NOT NULL
+                    REFERENCES config.metabib_field (id)
+                    ON DELETE CASCADE
+                    DEFERRABLE INITIALLY DEFERRED,
+    norm    INT     NOT NULL
+                    REFERENCES config.index_normalizer (id)
+                    ON DELETE CASCADE
+                    DEFERRABLE INITIALLY DEFERRED,
+    params  TEXT,
+    pos     INT     NOT NULL DEFAULT 0
+);
+
+
COMMIT;
INSERT INTO config.z3950_attr (id, source, name, label, code, format)
VALUES (9, 'loc', 'item_type', oils_i18n_gettext(9, 'Item Type', 'cza', 'label'), 1001, 1);
+UPDATE config.z3950_attr SET truncation = 1 WHERE source = 'loc'; -- affects every 'loc' row present when this statement runs
+
INSERT INTO config.z3950_attr (id, source, name, label, code, format)
VALUES (10, 'oclc', 'tcn', oils_i18n_gettext(10, 'Title Control Number', 'cza', 'label'), 12, 1);
INSERT INTO config.z3950_attr (id, source, name, label, code, format)
'bool'
);
+
+-- in-db indexing normalizers
+-- Seed the normalizer registry in a single statement.
+-- param_count is 1 only for left_trunc and right_trunc, whose functions take
+-- (TEXT, INT); every other function listed here is called with the text
+-- alone, so split_date_range( TEXT ) is registered with 0 as well.
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES
+    (
+        'NACO Normalize',
+        'Apply NACO normalization rules to the extracted text. See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.',
+        'naco_normalize',
+        0
+    ),
+    (
+        'Normalize date range',
+        'Split date ranges in the form of "XXXX-YYYY" into "XXXX YYYY" for proper index.',
+        'split_date_range',
+        0
+    ),
+    (
+        'NACO Normalize -- retain first comma',
+        'Apply NACO normalization rules to the extracted text, retaining the first comma. See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.',
+        'naco_normalize_keep_comma',
+        0
+    ),
+    (
+        'Strip Diacritics',
+        'Convert text to NFD form and remove non-spacing combining marks.',
+        'remove_diacritics',
+        0
+    ),
+    (
+        'Up-case',
+        'Convert text upper case.',
+        'uppercase',
+        0
+    ),
+    (
+        'Down-case',
+        'Convert text lower case.',
+        'lowercase',
+        0
+    ),
+    (
+        'Extract Dewey-like number',
+        'Extract a string of numeric characters ther resembles a DDC number.',
+        'call_number_dewey',
+        0
+    ),
+    (
+        'Left truncation',
+        'Discard the specified number of characters from the left side of the string.',
+        'left_trunc',
+        1
+    ),
+    (
+        'Right truncation',
+        'Include only the specified number of characters from the left side of the string.',
+        'right_trunc',
+        1
+    ),
+    (
+        'First word',
+        'Include only the first space-separated word of a string.',
+        'first_word',
+        0
+    );
+
+-- make use of the index normalizers
+
+-- Attach the 'naco_normalize' and 'split_date_range' normalizers to every
+-- row of config.metabib_field: one map row per (field, normalizer) pair.
+-- params and pos are not listed, so they take their column defaults.
+-- NOTE(review): both normalizers therefore get the same pos on each field,
+-- so this statement does not pin their relative order.
+INSERT INTO config.metabib_field_index_norm_map (field, norm)
+    SELECT  fld.id,
+            nrm.id
+      FROM  config.metabib_field AS fld
+            CROSS JOIN config.index_normalizer AS nrm
+      WHERE nrm.func IN ('naco_normalize', 'split_date_range');
+
+
--- /dev/null
+BEGIN; -- opens the transaction that the COMMIT at the end of this script closes
+
+INSERT INTO config.upgrade_log (version) VALUES ('0032'); -- miker: logs version '0032' in config.upgrade_log
+
+-- Some handy functions, based on existing ones, to provide optional ingest normalization
+
+-- Discard the first $2 characters of $1 and return the remainder.
+-- SUBSTRING positions are 1-based, so the remainder starts at position
+-- $2 + 1; starting at $2 itself would drop one character too few and
+-- disagree with the "Left truncation" normalizer description ("Discard the
+-- specified number of characters from the left side of the string").
+-- A count of 0 returns the text unchanged.
+-- STRICT: any NULL argument yields NULL without running the body.
+CREATE OR REPLACE FUNCTION public.left_trunc( TEXT, INT ) RETURNS TEXT AS $func$
+    SELECT SUBSTRING($1, $2 + 1);
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+-- Keep only the first $2 characters of $1 (everything to the right of that
+-- point is cut off).
+-- STRICT: any NULL argument yields NULL without running the body.
+CREATE OR REPLACE FUNCTION public.right_trunc( TEXT, INT )
+RETURNS TEXT AS $body$
+    SELECT SUBSTRING( $1 FROM 1 FOR $2 );
+$body$ LANGUAGE SQL IMMUTABLE STRICT;
+
+-- Single-argument wrapper around the two-argument public.naco_normalize:
+-- forwards the text with 'a' as the second argument.
+-- NOTE(review): presumably 'a' selects the "retain first comma" mode of
+-- naco_normalize -- confirm against that function's definition.
+-- STRICT: a NULL argument yields NULL without running the body.
+CREATE OR REPLACE FUNCTION public.naco_normalize_keep_comma( TEXT )
+RETURNS TEXT AS $body$
+    SELECT public.naco_normalize( $1, 'a' );
+$body$ LANGUAGE SQL IMMUTABLE STRICT;
+
+-- Rewrite each "NNNN-NNNN" run (two four-digit groups joined by a hyphen)
+-- as "NNNN NNNN", leaving the rest of the text untouched.
+-- STRICT: a NULL argument yields NULL without running the body.
+CREATE OR REPLACE FUNCTION public.split_date_range( TEXT )
+RETURNS TEXT AS $body$
+    SELECT REGEXP_REPLACE(
+        $1,
+        E'(\\d{4})-(\\d{4})',   -- capture the two four-digit groups
+        E'\\1 \\2',             -- re-emit them separated by a space
+        'g'                     -- replace all non-overlapping matches
+    );
+$body$ LANGUAGE SQL IMMUTABLE STRICT;
+
+-- And ... a table in which to register them
+
+-- Registry of text-normalization functions that can be attached to indexed
+-- fields through config.metabib_field_index_norm_map.
+--   name        : unique label for the normalizer
+--   description : free-text explanation; the seed INSERTs for this table
+--                 supply it, so the column must exist or they fail
+--   func        : name of the SQL function implementing the normalizer
+--   param_count : defaults to 0; presumably the number of arguments the
+--                 function takes beyond the text itself -- TODO confirm
+CREATE TABLE config.index_normalizer (
+    id          SERIAL  PRIMARY KEY,
+    name        TEXT    UNIQUE NOT NULL,
+    description TEXT,
+    func        TEXT    NOT NULL,
+    param_count INT     NOT NULL DEFAULT 0
+);
+
+-- Pairs one config.metabib_field row with one config.index_normalizer row.
+-- Deleting either parent removes the pairing (ON DELETE CASCADE); both
+-- foreign keys are DEFERRABLE INITIALLY DEFERRED.
+--   params : optional text for the normalizer (format not defined here)
+--   pos    : integer defaulting to 0; presumably the application order of
+--            the normalizers on a field -- TODO confirm
+CREATE TABLE config.metabib_field_index_norm_map (
+    id      SERIAL  PRIMARY KEY,
+    field   INT     NOT NULL
+                    REFERENCES config.metabib_field (id)
+                    ON DELETE CASCADE
+                    DEFERRABLE INITIALLY DEFERRED,
+    norm    INT     NOT NULL
+                    REFERENCES config.index_normalizer (id)
+                    ON DELETE CASCADE
+                    DEFERRABLE INITIALLY DEFERRED,
+    params  TEXT,
+    pos     INT     NOT NULL DEFAULT 0
+);
+
+COMMIT;
+
--- /dev/null
+BEGIN; -- opens the transaction that the COMMIT at the end of this script closes
+
+INSERT INTO config.upgrade_log (version) VALUES ('0033'); -- miker: logs version '0033' in config.upgrade_log
+
+
+-- Seed the normalizer registry in a single statement.
+-- param_count is 1 only for left_trunc and right_trunc, whose functions take
+-- (TEXT, INT); every other function listed here is called with the text
+-- alone, so split_date_range( TEXT ) is registered with 0 as well.
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES
+    (
+        'NACO Normalize',
+        'Apply NACO normalization rules to the extracted text. See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.',
+        'naco_normalize',
+        0
+    ),
+    (
+        'Normalize date range',
+        'Split date ranges in the form of "XXXX-YYYY" into "XXXX YYYY" for proper index.',
+        'split_date_range',
+        0
+    ),
+    (
+        'NACO Normalize -- retain first comma',
+        'Apply NACO normalization rules to the extracted text, retaining the first comma. See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.',
+        'naco_normalize_keep_comma',
+        0
+    ),
+    (
+        'Strip Diacritics',
+        'Convert text to NFD form and remove non-spacing combining marks.',
+        'remove_diacritics',
+        0
+    ),
+    (
+        'Up-case',
+        'Convert text upper case.',
+        'uppercase',
+        0
+    ),
+    (
+        'Down-case',
+        'Convert text lower case.',
+        'lowercase',
+        0
+    ),
+    (
+        'Extract Dewey-like number',
+        'Extract a string of numeric characters ther resembles a DDC number.',
+        'call_number_dewey',
+        0
+    ),
+    (
+        'Left truncation',
+        'Discard the specified number of characters from the left side of the string.',
+        'left_trunc',
+        1
+    ),
+    (
+        'Right truncation',
+        'Include only the specified number of characters from the left side of the string.',
+        'right_trunc',
+        1
+    ),
+    (
+        'First word',
+        'Include only the first space-separated word of a string.',
+        'first_word',
+        0
+    );
+
+
+-- Attach the 'naco_normalize' and 'split_date_range' normalizers to every
+-- row of config.metabib_field: one map row per (field, normalizer) pair.
+-- params and pos are not listed, so they take their column defaults.
+-- NOTE(review): both normalizers therefore get the same pos on each field,
+-- so this statement does not pin their relative order.
+INSERT INTO config.metabib_field_index_norm_map (field, norm)
+    SELECT  fld.id,
+            nrm.id
+      FROM  config.metabib_field AS fld
+            CROSS JOIN config.index_normalizer AS nrm
+      WHERE nrm.func IN ('naco_normalize', 'split_date_range');
+
+COMMIT;
+