From 46644424389001a5242b54c6c14372818ce21a64 Mon Sep 17 00:00:00 2001 From: Jeffrey Bond Date: Thu, 24 Jan 2013 15:43:53 -0800 Subject: [PATCH] Search Normalization Options - Added the option for DBAs to modify an internal flag to normalize meta data differently. See release notes for more information. Signed-off-by: Jeffrey Bond --- Open-ILS/src/sql/Pg/002.functions.config.sql | 34 ++- Open-ILS/src/sql/Pg/002.schema.config.sql | 31 +++ ...XX.schema.config.search_normalize_functions.sql | 282 +++++++++++++++++++++ .../RELEASE_NOTES_NEXT/search_normalize_option.txt | 48 ++++ 4 files changed, 392 insertions(+), 3 deletions(-) create mode 100644 Open-ILS/src/sql/Pg/upgrade/XXXX.schema.config.search_normalize_functions.sql create mode 100644 docs/RELEASE_NOTES_NEXT/search_normalize_option.txt diff --git a/Open-ILS/src/sql/Pg/002.functions.config.sql b/Open-ILS/src/sql/Pg/002.functions.config.sql index 87b6e285d0..00488266a6 100644 --- a/Open-ILS/src/sql/Pg/002.functions.config.sql +++ b/Open-ILS/src/sql/Pg/002.functions.config.sql @@ -771,7 +771,7 @@ CREATE OR REPLACE FUNCTION public.search_normalize( TEXT, TEXT ) RETURNS TEXT AS # transformations based on Unicode category codes $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g; - if ($sf && $sf =~ /^a/o) { + if ($sf && $sf =~ /^a/o && $sf ne 'altered_search_normalize') { my $commapos = index($str, ','); if ($commapos > -1) { if ($commapos != length($str) - 1) { @@ -780,6 +780,11 @@ CREATE OR REPLACE FUNCTION public.search_normalize( TEXT, TEXT ) RETURNS TEXT AS } } + if($sf eq 'altered_search_normalize'){ + $str =~ s/\'//g; + $str =~ s/\,//g; + } + # since we've stripped out the control characters, we can now # use a few as placeholders temporarily $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/; @@ -813,9 +818,32 @@ CREATE OR REPLACE FUNCTION public.search_normalize_keep_comma( TEXT ) RETURNS TE $func$ LANGUAGE SQL STRICT IMMUTABLE; CREATE OR REPLACE FUNCTION public.search_normalize( TEXT ) RETURNS TEXT AS $func$ - SELECT public.search_normalize($1,''); -$func$ LANGUAGE 'sql' STRICT IMMUTABLE; +DECLARE + var text := ''; + glob boolean := public.get_internal_flag('ingest.altered_search_normalize'); +BEGIN + + IF glob = 't' THEN + var := 'altered_search_normalize'; + END IF; + + RETURN (SELECT public.search_normalize($1,var)); +END; +$func$ LANGUAGE 'plpgsql' STRICT IMMUTABLE; + + +CREATE OR REPLACE FUNCTION public.get_internal_flag(arg text) + RETURNS boolean AS +$func$ +DECLARE + var boolean := 'f'; +BEGIN + var := (SELECT enabled FROM config.internal_flag AS cfg WHERE cfg.name = arg); + RETURN var; +END; +$func$ + LANGUAGE plpgsql VOLATILE; COMMIT; diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql index 25fa29294f..b5d18a1c51 100644 --- a/Open-ILS/src/sql/Pg/002.schema.config.sql +++ b/Open-ILS/src/sql/Pg/002.schema.config.sql @@ -47,6 +47,7 @@ INSERT INTO config.internal_flag (name) VALUES ('ingest.disable_metabib_rec_desc INSERT INTO config.internal_flag (name) VALUES ('ingest.disable_metabib_field_entry'); INSERT INTO config.internal_flag (name) VALUES ('ingest.assume_inserts_only'); INSERT INTO config.internal_flag (name) VALUES ('serial.rematerialize_on_same_holding_code'); +INSERT INTO config.internal_flag (name, enabled) VALUES ('ingest.altered_search_normalize', 'f'); CREATE TABLE config.global_flag ( label TEXT NOT NULL @@ -788,9 +789,11 @@ CREATE OR REPLACE FUNCTION oils_tsearch2 () RETURNS TRIGGER AS $$ DECLARE normalizer RECORD; value TEXT := ''; + glob_setting boolean := 'f'; BEGIN value := NEW.value; + glob_setting := public.get_internal_flag('ingest.altered_search_normalize'); IF TG_TABLE_NAME::TEXT ~ 'field_entry$' THEN FOR normalizer IN @@ -847,6 +850,10 @@ BEGIN value := public.search_normalize(value); END IF; + IF glob_setting = 't' THEN + value := public.search_normalize(value); + END IF; + NEW.index_vector = to_tsvector((TG_ARGV[0])::regconfig, value); RETURN NEW; @@ -1001,5 +1008,29 @@ CREATE TABLE config.usr_activity_type ( CREATE UNIQUE INDEX unique_wwh ON config.usr_activity_type (COALESCE(ewho,''), COALESCE (ewhat,''), COALESCE(ehow,'')); +CREATE OR REPLACE FUNCTION config.altered_search_normalize() + RETURNS trigger AS +$func$ +DECLARE + ingest text := 'ingest.reingest.force_on_same_marc'; + norm_search text := 'ingest.altered_search_normalize'; + var boolean := public.get_internal_flag(norm_search); +BEGIN + IF NEW.name = norm_search THEN + UPDATE config.internal_flag SET enabled = 't' WHERE name = ingest; + UPDATE biblio.record_entry SET tcn_source = tcn_source WHERE 1=1; + UPDATE config.internal_flag SET enabled = var WHERE name = ingest; + END IF; + RETURN NULL; +END; +$func$ + LANGUAGE plpgsql VOLATILE; + + +CREATE TRIGGER altered_search_normalize + AFTER UPDATE + ON config.internal_flag + FOR EACH ROW + EXECUTE PROCEDURE config.altered_search_normalize(); COMMIT; diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.config.search_normalize_functions.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.config.search_normalize_functions.sql new file mode 100644 index 0000000000..4829c1500f --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.config.search_normalize_functions.sql @@ -0,0 +1,282 @@ +BEGIN; + +INSERT INTO config.internal_flag (name, enabled) VALUES ('ingest.altered_search_normalize', 'f'); + +CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$ + + use strict; + use Unicode::Normalize; + use Encode; + + my $str = decode_utf8(shift); + my $sf = shift; + + # Apply NACO normalization to input string; based on + # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf + # + # Note that unlike a strict reading of the NACO normalization rules, + # output is returned as lowercase instead of uppercase for compatibility + # with previous versions of the Evergreen naco_normalize routine. + + # Convert to upper-case first; even though final output will be lowercase, doing this will + # ensure that the German eszett (ß) and certain ligatures (ff, fi, ffl, etc.) will be handled correctly. + # If there are any bugs in Perl's implementation of upcasing, they will be passed through here. + $str = uc $str; + + # remove non-filing strings + $str =~ s/\x{0098}.*?\x{009C}//g; + + $str = NFKD($str); + + # additional substitutions - 3.6. + $str =~ s/\x{00C6}/AE/g; + $str =~ s/\x{00DE}/TH/g; + $str =~ s/\x{0152}/OE/g; + $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}]['/DDOLl/d; + + # transformations based on Unicode category codes + $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g; + + if ($sf && $sf =~ /^a/o) { + my $commapos = index($str, ','); + if ($commapos > -1) { + if ($commapos != length($str) - 1) { + $str =~ s/,/\x07/; # preserve first comma + } + } + } + + # since we've stripped out the control characters, we can now + # use a few as placeholders temporarily + $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/; + $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g; + $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/; + + # decimal digits + $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/; + + # intentionally skipping step 8 of the NACO algorithm; if the string + # gets normalized away, that's fine. + + # leading and trailing spaces + $str =~ s/\s+/ /g; + $str =~ s/^\s+//; + $str =~ s/\s+$//g; + + return lc $str; +$func$ LANGUAGE 'plperlu' STRICT IMMUTABLE; + +-- Currently, the only difference from naco_normalize is that search_normalize +-- turns apostrophes into spaces, while naco_normalize collapses them. +CREATE OR REPLACE FUNCTION public.search_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$ + + use strict; + use Unicode::Normalize; + use Encode; + + my $str = decode_utf8(shift); + my $sf = shift; + + # Apply NACO normalization to input string; based on + # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf + # + # Note that unlike a strict reading of the NACO normalization rules, + # output is returned as lowercase instead of uppercase for compatibility + # with previous versions of the Evergreen naco_normalize routine. + + # Convert to upper-case first; even though final output will be lowercase, doing this will + # ensure that the German eszett (ß) and certain ligatures (ff, fi, ffl, etc.) will be handled correctly. + # If there are any bugs in Perl's implementation of upcasing, they will be passed through here. + $str = uc $str; + + # remove non-filing strings + $str =~ s/\x{0098}.*?\x{009C}//g; + + $str = NFKD($str); + + # additional substitutions - 3.6. + $str =~ s/\x{00C6}/AE/g; + $str =~ s/\x{00DE}/TH/g; + $str =~ s/\x{0152}/OE/g; + $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}][/DDOLl/d; + + # transformations based on Unicode category codes + $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g; + + if ($sf && $sf =~ /^a/o && $sf ne 'altered_search_normalize') { + my $commapos = index($str, ','); + if ($commapos > -1) { + if ($commapos != length($str) - 1) { + $str =~ s/,/\x07/; # preserve first comma + } + } + } + + if($sf eq 'altered_search_normalize'){ + $str =~ s/\'//g; + $str =~ s/\,//g; + } + + # since we've stripped out the control characters, we can now + # use a few as placeholders temporarily + $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/; + $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g; + $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/; + + # decimal digits + $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/; + + # intentionally skipping step 8 of the NACO algorithm; if the string + # gets normalized away, that's fine. + + # leading and trailing spaces + $str =~ s/\s+/ /g; + $str =~ s/^\s+//; + $str =~ s/\s+$//g; + + return lc $str; +$func$ LANGUAGE 'plperlu' STRICT IMMUTABLE; + +CREATE OR REPLACE FUNCTION public.naco_normalize_keep_comma( TEXT ) RETURNS TEXT AS $func$ + SELECT public.naco_normalize($1,'a'); +$func$ LANGUAGE SQL STRICT IMMUTABLE; + +CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT ) RETURNS TEXT AS $func$ + SELECT public.naco_normalize($1,''); +$func$ LANGUAGE 'sql' STRICT IMMUTABLE; + +CREATE OR REPLACE FUNCTION public.search_normalize_keep_comma( TEXT ) RETURNS TEXT AS $func$ + SELECT public.search_normalize($1,'a'); +$func$ LANGUAGE SQL STRICT IMMUTABLE; + +CREATE OR REPLACE FUNCTION public.search_normalize( TEXT ) RETURNS TEXT AS $func$ + +DECLARE + var text := ''; + glob boolean := public.get_internal_flag('ingest.altered_search_normalize'); +BEGIN + + IF glob = 't' THEN + var := 'altered_search_normalize'; + END IF; + + RETURN (SELECT public.search_normalize($1,var)); +END; +$func$ LANGUAGE 'plpgsql' STRICT IMMUTABLE; + + +CREATE OR REPLACE FUNCTION public.get_internal_flag(arg text) + RETURNS boolean AS +$func$ +DECLARE + var boolean := 'f'; +BEGIN + var := (SELECT enabled FROM config.internal_flag AS cfg WHERE cfg.name = arg); + RETURN var; +END; +$func$ + LANGUAGE plpgsql VOLATILE; + + + +CREATE OR REPLACE FUNCTION oils_tsearch2 () RETURNS TRIGGER AS $$ +DECLARE + normalizer RECORD; + value TEXT := ''; + glob_setting boolean := 'f'; +BEGIN + + value := NEW.value; + glob_setting := public.get_internal_flag('ingest.altered_search_normalize'); + + IF TG_TABLE_NAME::TEXT ~ 'field_entry$' THEN + FOR normalizer IN + SELECT n.func AS func, + n.param_count AS param_count, + m.params AS params + FROM config.index_normalizer n + JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id) + WHERE field = NEW.field AND m.pos < 0 + ORDER BY m.pos LOOP + EXECUTE 'SELECT ' || normalizer.func || '(' || + quote_literal( value ) || + CASE + WHEN normalizer.param_count > 0 + THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') + ELSE '' + END || + ')' INTO value; + + END LOOP; + + NEW.value := value; + END IF; + + IF NEW.index_vector = ''::tsvector THEN + RETURN NEW; + END IF; + + IF TG_TABLE_NAME::TEXT ~ 'field_entry$' THEN + FOR normalizer IN + SELECT n.func AS func, + n.param_count AS param_count, + m.params AS params + FROM config.index_normalizer n + JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id) + WHERE field = NEW.field AND m.pos >= 0 + ORDER BY m.pos LOOP + EXECUTE 'SELECT ' || normalizer.func || '(' || + quote_literal( value ) || + CASE + WHEN normalizer.param_count > 0 + THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') + ELSE '' + END || + ')' INTO value; + + END LOOP; + END IF; + + IF TG_TABLE_NAME::TEXT ~ 'browse_entry$' THEN + value := ARRAY_TO_STRING( + evergreen.regexp_split_to_array(value, E'\\W+'), ' ' + ); + value := public.search_normalize(value); + END IF; + + IF glob_setting = 't' THEN + value := public.search_normalize(value); + END IF; + + NEW.index_vector = to_tsvector((TG_ARGV[0])::regconfig, value); + + RETURN NEW; +END; +$$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION config.altered_search_normalize() + RETURNS trigger AS +$func$ +DECLARE + ingest text := 'ingest.reingest.force_on_same_marc'; + norm_search text := 'ingest.altered_search_normalize'; + var boolean := public.get_internal_flag(norm_search); +BEGIN + IF NEW.name = norm_search THEN + UPDATE config.internal_flag SET enabled = 't' WHERE name = ingest; + UPDATE biblio.record_entry SET tcn_source = tcn_source WHERE 1=1; + UPDATE config.internal_flag SET enabled = var WHERE name = ingest; + END IF; + RETURN NULL; +END; +$func$ + LANGUAGE plpgsql VOLATILE; + +CREATE TRIGGER altered_search_normalize + AFTER UPDATE + ON config.internal_flag + FOR EACH ROW + EXECUTE PROCEDURE config.altered_search_normalize(); + + +COMMIT; diff --git a/docs/RELEASE_NOTES_NEXT/search_normalize_option.txt b/docs/RELEASE_NOTES_NEXT/search_normalize_option.txt new file mode 100644 index 0000000000..5d09686f90 --- /dev/null +++ b/docs/RELEASE_NOTES_NEXT/search_normalize_option.txt @@ -0,0 +1,48 @@ +== Search Normalization + +Jeffrey Bond + +=== Summary This feature would give DBAs an internal value to set to +normalize meta data differently. + +NACO normalization standards can be found +[[http://www.oclc.org/resources/research/publications/library/2006/naco- +lrts.pdf|here]] + +The change to normalization is minor. According to NACO standards commas +and apostrophes are normalized to white space. This search normalization +feature converts commas and apostrophes to nothing rather than white +space. The feature can be turned on or off using an internal flag. Using +this feature affects the way all data is normalized when using the +search_normalize stored procedure. + +The following stored procedures have been modified: + +* search_normalize(text) - Modified wrapper function to check if flag +was turned on in database. + +* search_normalize(text, text) - Modified main search normalization +function to delete commas and apostrophes when proper flags were on. + +* get_internal_flag - New stored procedure added. Takes a line of text +representing the name field for the config.internal_flag table. Returns +the enabled field of the matching name. + +* oils_tsearch2 - Modified the way the tsearch2 reads in new ts_vectors. +Now normalizes the value before throwing it into to_tsvector(). + +* config.altered_search_normalize - New trigger function added to handle +updating meta data when flag is changed inside internal_flag table. + +The following tables have been modified: + +* config.internal_flag - Added row to handle altered search +normalization rules. + +=== Blue Print + +https://bugs.launchpad.net/evergreen/+bug/1083261 + +=== Deliverable + +* An option to alter search normalization functionality. -- 2.11.0