This patch moves towards in-database indexed value normalization. The eventual

author miker <miker@dcc99617-32d9-48b4-a31d-7c20da2025e4>

Mon, 12 Oct 2009 17:06:01 +0000 (17:06 +0000)

committer miker <miker@dcc99617-32d9-48b4-a31d-7c20da2025e4>

Mon, 12 Oct 2009 17:06:01 +0000 (17:06 +0000)
author miker <miker@dcc99617-32d9-48b4-a31d-7c20da2025e4>
Mon, 12 Oct 2009 17:06:01 +0000 (17:06 +0000)
committer miker <miker@dcc99617-32d9-48b4-a31d-7c20da2025e4>
Mon, 12 Oct 2009 17:06:01 +0000 (17:06 +0000)
diff --git a/Open-ILS/src/sql/Pg/002.functions.config.sql b/Open-ILS/src/sql/Pg/002.functions.config.sql

index 38d426c..e8740b3 100644 (file)
--- a/Open-ILS/src/sql/Pg/002.functions.config.sql
+++ b/Open-ILS/src/sql/Pg/002.functions.config.sql
@@ -120,5 +120,9 @@ CREATE OR REPLACE FUNCTION public.first_word ( TEXT ) RETURNS TEXT AS $$
          SELECT SUBSTRING( $1 FROM $_$^\S+$_$);
  $$ LANGUAGE SQL;
  
+CREATE OR REPLACE FUNCTION public.naco_normalize_keep_comma( TEXT ) RETURNS TEXT AS $func$
+        SELECT public.naco_normalize($1,'a');
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
  COMMIT;
  
diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql

index c63a5cc..de80e10 100644 (file)
--- a/Open-ILS/src/sql/Pg/002.schema.config.sql
+++ b/Open-ILS/src/sql/Pg/002.schema.config.sql
@@ -51,7 +51,7 @@ CREATE TABLE config.upgrade_log (
      install_date    TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
  );
  
-INSERT INTO config.upgrade_log (version) VALUES ('0031'); -- miker
+INSERT INTO config.upgrade_log (version) VALUES ('0033'); -- miker
  
  CREATE TABLE config.bib_source (
         id              SERIAL  PRIMARY KEY,
@@ -562,5 +562,38 @@ CREATE TABLE config.org_unit_setting_type (
        ( datatype <> 'link' AND fm_class IS NULL ) )
  );
  
+
+-- Some handy functions, based on existing ones, to provide optional ingest normalization
+
+CREATE OR REPLACE FUNCTION public.left_trunc( TEXT, INT ) RETURNS TEXT AS $func$
+        SELECT SUBSTRING($1,$2);
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION public.right_trunc( TEXT, INT ) RETURNS TEXT AS $func$
+        SELECT SUBSTRING($1,1,$2);
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION public.split_date_range( TEXT ) RETURNS TEXT AS $func$
+        SELECT REGEXP_REPLACE( $1, E'(\\d{4})-(\\d{4})', E'\\1 \\2', 'g' );
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+-- And ... a table in which to register them
+
+CREATE TABLE config.index_normalizer (
+        id              SERIAL  PRIMARY KEY,
+        name            TEXT    UNIQUE NOT NULL,
+        func            TEXT    NOT NULL,
+        param_count     INT     NOT NULL DEFAULT 0
+);
+
+CREATE TABLE config.metabib_field_index_norm_map (
+        id      SERIAL  PRIMARY KEY,
+        field   INT     NOT NULL REFERENCES config.metabib_field (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
+        norm    INT     NOT NULL REFERENCES config.index_normalizer (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
+        params  TEXT,
+        pos     INT     NOT NULL DEFAULT 0
+);
+
+
  COMMIT;
  
diff --git a/Open-ILS/src/sql/Pg/950.data.seed-values.sql b/Open-ILS/src/sql/Pg/950.data.seed-values.sql

index 1239c1c..9e3b7b0 100644 (file)
--- a/Open-ILS/src/sql/Pg/950.data.seed-values.sql
+++ b/Open-ILS/src/sql/Pg/950.data.seed-values.sql
@@ -769,6 +769,8 @@ INSERT INTO config.z3950_attr (id, source, name, label, code, format)
  INSERT INTO config.z3950_attr (id, source, name, label, code, format)
         VALUES (9, 'loc', 'item_type', oils_i18n_gettext(9, 'Item Type', 'cza', 'label'), 1001, 1);
  
+UPDATE config.z3950_attr SET truncation = 1 WHERE source = 'loc';
+
  INSERT INTO config.z3950_attr (id, source, name, label, code, format)
         VALUES (10, 'oclc', 'tcn', oils_i18n_gettext(10, 'Title Control Number', 'cza', 'label'), 12, 1);
  INSERT INTO config.z3950_attr (id, source, name, label, code, format)
@@ -2117,3 +2119,85 @@ INSERT INTO config.org_unit_setting_type (name, label, description, datatype) VA
      'bool'
  );
  
+
+-- in-db indexing normalizers
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'NACO Normalize',
+       'Apply NACO normalization rules to the extracted text.  See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.',
+       'naco_normalize',
+       0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'Normalize date range',
+       'Split date ranges in the form of "XXXX-YYYY" into "XXXX YYYY" for proper index.',
+       'split_date_range',
+       1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'NACO Normalize -- retain first comma',
+       'Apply NACO normalization rules to the extracted text, retaining the first comma.  See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.',
+       'naco_normalize_keep_comma',
+       0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'Strip Diacritics',
+       'Convert text to NFD form and remove non-spacing combining marks.',
+       'remove_diacritics',
+       0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'Up-case',
+       'Convert text upper case.',
+       'uppercase',
+       0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'Down-case',
+       'Convert text lower case.',
+       'lowercase',
+       0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'Extract Dewey-like number',
+       'Extract a string of numeric characters ther resembles a DDC number.',
+       'call_number_dewey',
+       0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'Left truncation',
+       'Discard the specified number of characters from the left side of the string.',
+       'left_trunc',
+       1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'Right truncation',
+       'Include only the specified number of characters from the left side of the string.',
+       'right_trunc',
+       1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'First word',
+       'Include only the first space-separated word of a string.',
+       'first_word',
+       0
+);
+
+-- make use of the index normalizers
+
+INSERT INTO config.metabib_field_index_norm_map (field,norm)
+       SELECT  m.id,
+               i.id
+         FROM  config.metabib_field m,
+               config.index_normalizer i
+         WHERE i.func IN ('naco_normalize','split_date_range');
+
+
diff --git a/Open-ILS/src/sql/Pg/upgrade/0032.ingest-normalizers.sql b/Open-ILS/src/sql/Pg/upgrade/0032.ingest-normalizers.sql

new file mode 100644 (file)

index 0000000..2fdd6cd
--- /dev/null
+++ b/Open-ILS/src/sql/Pg/upgrade/0032.ingest-normalizers.sql
@@ -0,0 +1,41 @@
+BEGIN;
+
+INSERT INTO config.upgrade_log (version) VALUES ('0032'); -- miker
+
+-- Some handy functions, based on existing ones, to provide optional ingest normalization
+
+CREATE OR REPLACE FUNCTION public.left_trunc( TEXT, INT ) RETURNS TEXT AS $func$
+       SELECT SUBSTRING($1,$2);
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION public.right_trunc( TEXT, INT ) RETURNS TEXT AS $func$
+       SELECT SUBSTRING($1,1,$2);
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION public.naco_normalize_keep_comma( TEXT ) RETURNS TEXT AS $func$
+       SELECT public.naco_normalize($1,'a');
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION public.split_date_range( TEXT ) RETURNS TEXT AS $func$
+       SELECT REGEXP_REPLACE( $1, E'(\\d{4})-(\\d{4})', E'\\1 \\2', 'g' );
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+-- And ... a table in which to register them
+
+CREATE TABLE config.index_normalizer (
+       id              SERIAL  PRIMARY KEY,
+       name            TEXT    UNIQUE NOT NULL,
+       func            TEXT    NOT NULL,
+       param_count     INT     NOT NULL DEFAULT 0
+);
+
+CREATE TABLE config.metabib_field_index_norm_map (
+       id      SERIAL  PRIMARY KEY,
+       field   INT     NOT NULL REFERENCES config.metabib_field (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
+       norm    INT     NOT NULL REFERENCES config.index_normalizer (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
+       params  TEXT,
+       pos     INT     NOT NULL DEFAULT 0
+);
+
+COMMIT;
+
diff --git a/Open-ILS/src/sql/Pg/upgrade/0033.data.ingest-normalizers.sql b/Open-ILS/src/sql/Pg/upgrade/0033.data.ingest-normalizers.sql

new file mode 100644 (file)

index 0000000..12d31e7
--- /dev/null
+++ b/Open-ILS/src/sql/Pg/upgrade/0033.data.ingest-normalizers.sql
@@ -0,0 +1,85 @@
+BEGIN;
+
+INSERT INTO config.upgrade_log (version) VALUES ('0033'); -- miker
+
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'NACO Normalize',
+       'Apply NACO normalization rules to the extracted text.  See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.',
+       'naco_normalize',
+       0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'Normalize date range',
+       'Split date ranges in the form of "XXXX-YYYY" into "XXXX YYYY" for proper index.',
+       'split_date_range',
+       1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'NACO Normalize -- retain first comma',
+       'Apply NACO normalization rules to the extracted text, retaining the first comma.  See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.',
+       'naco_normalize_keep_comma',
+       0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'Strip Diacritics',
+       'Convert text to NFD form and remove non-spacing combining marks.',
+       'remove_diacritics',
+       0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'Up-case',
+       'Convert text upper case.',
+       'uppercase',
+       0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'Down-case',
+       'Convert text lower case.',
+       'lowercase',
+       0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'Extract Dewey-like number',
+       'Extract a string of numeric characters ther resembles a DDC number.',
+       'call_number_dewey',
+       0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'Left truncation',
+       'Discard the specified number of characters from the left side of the string.',
+       'left_trunc',
+       1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'Right truncation',
+       'Include only the specified number of characters from the left side of the string.',
+       'right_trunc',
+       1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+       'First word',
+       'Include only the first space-separated word of a string.',
+       'first_word',
+       0
+);
+
+
+INSERT INTO config.metabib_field_index_norm_map (field,norm)
+       SELECT  m.id,
+               i.id
+         FROM  config.metabib_field m,
+               config.index_normalizer i
+         WHERE i.func IN ('naco_normalize','split_date_range');
+
+COMMIT;
+
author	miker <miker@dcc99617-32d9-48b4-a31d-7c20da2025e4>
	Mon, 12 Oct 2009 17:06:01 +0000 (17:06 +0000)
committer	miker <miker@dcc99617-32d9-48b4-a31d-7c20da2025e4>
	Mon, 12 Oct 2009 17:06:01 +0000 (17:06 +0000)
Open-ILS/src/sql/Pg/002.functions.config.sql		patch \| blob \| history
Open-ILS/src/sql/Pg/002.schema.config.sql		patch \| blob \| history
Open-ILS/src/sql/Pg/950.data.seed-values.sql		patch \| blob \| history
Open-ILS/src/sql/Pg/upgrade/0032.ingest-normalizers.sql	[new file with mode: 0644]	patch \| blob
Open-ILS/src/sql/Pg/upgrade/0033.data.ingest-normalizers.sql	[new file with mode: 0644]	patch \| blob