From: Mike Rylander
Date: Sat, 21 May 2011 14:16:03 +0000 (-0400)
Subject: Add a transliteration search normalizer
X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=82e2f8e6a645fc82d482e9d00c62db66c38bd1db;p=working%2FEvergreen.git

Add a transliteration search normalizer

Signed-off-by: Mike Rylander
---

diff --git a/Open-ILS/src/sql/Pg/000.functions.general.sql b/Open-ILS/src/sql/Pg/000.functions.general.sql
index 2bf6340fff..2dd3e5c5b2 100644
--- a/Open-ILS/src/sql/Pg/000.functions.general.sql
+++ b/Open-ILS/src/sql/Pg/000.functions.general.sql
@@ -26,4 +26,11 @@ CREATE OR REPLACE FUNCTION evergreen.xml_escape(str TEXT) RETURNS text AS $$
        '>', '&gt;');
 $$ LANGUAGE SQL IMMUTABLE;
 
+-- Transliterate text to plain ASCII by handing it to Perl's Text::Unidecode.
+-- The module must be installed on the database server; PLPERLU (untrusted) is used so it can be loaded.
+CREATE OR REPLACE FUNCTION evergreen.unidecode (TEXT) RETURNS TEXT AS $func$
+    use Text::Unidecode;
+    return unidecode(shift());
+$func$ LANGUAGE PLPERLU IMMUTABLE;
+
 COMMIT;
diff --git a/Open-ILS/src/sql/Pg/950.data.seed-values.sql b/Open-ILS/src/sql/Pg/950.data.seed-values.sql
index e8f0f621cb..83858ed46c 100644
--- a/Open-ILS/src/sql/Pg/950.data.seed-values.sql
+++ b/Open-ILS/src/sql/Pg/950.data.seed-values.sql
@@ -5656,6 +5656,13 @@ INSERT INTO config.org_unit_setting_type (name, label, description, datatype) VA
 
 -- in-db indexing normalizers
 INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+    'Unicode Transliteration',
+    'Transliterate non-ASCII Unicode code points to ASCII text',
+    'evergreen.unidecode',
+    0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
     'NACO Normalize',
     'Apply NACO normalization rules to the extracted text. See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.',
     'naco_normalize',
@@ -5760,7 +5767,8 @@ INSERT INTO config.metabib_field_index_norm_map (field,norm)
         i.id
   FROM  config.metabib_field m,
         config.index_normalizer i
-  WHERE i.func IN ('naco_normalize','split_date_range')
+  WHERE i.func IN ('evergreen.unidecode','naco_normalize','split_date_range')
+        AND m.search_field
         AND m.id NOT IN (18, 19);
 
 INSERT INTO config.metabib_field_index_norm_map (field,norm,pos)
diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.sql.transliteration-index-normalizer.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.sql.transliteration-index-normalizer.sql
new file mode 100644
index 0000000000..fb5c007056
--- /dev/null
+++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.sql.transliteration-index-normalizer.sql
@@ -0,0 +1,78 @@
+/*
+ * First, install the required Perl module as root:
+ *
+ *   # cpan Text::Unidecode
+ *
+ * and after that is successfully installed, apply the following SQL script.
+ *
+ */
+
+BEGIN;
+
+SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version);
+
+-- A simple function using Text::Unidecode
+CREATE OR REPLACE FUNCTION evergreen.unidecode (TEXT) RETURNS TEXT AS $func$
+    use Text::Unidecode;
+    return unidecode(shift());
+$func$ LANGUAGE PLPERLU IMMUTABLE;
+
+-- Register that function. The func value must be exactly the schema-qualified
+-- name created above: the mapping INSERT below finds this row by that string.
+INSERT INTO config.index_normalizer (
+    name,
+    description,
+    func,
+    param_count
+) VALUES (
+    'Unicode Transliteration',
+    'Transliterate non-ASCII Unicode code points to ASCII text',
+    'evergreen.unidecode',
+    0
+);
+
+-- Apply unidecode to all search fields
+-- NOTE(review): field ids 18 and 19 are skipped, mirroring the seed data; what they are is not visible here.
+INSERT INTO config.metabib_field_index_norm_map (field, norm)
+    SELECT  f.id, n.id
+      FROM  config.metabib_field f,
+            config.index_normalizer n
+      WHERE n.func = 'evergreen.unidecode'
+            AND f.search_field
+            AND f.id NOT IN (18,19);
+
+-- Re-index all search fields to apply the new normalization. This could take a while...
+-- Each UPDATE is a deliberate no-op write (value = value); presumably it re-fires the indexing triggers -- TODO confirm.
+UPDATE metabib.title_field_entry
+    SET value = value
+    WHERE field IN (SELECT id FROM config.metabib_field WHERE search_field);
+
+UPDATE metabib.author_field_entry
+    SET value = value
+    WHERE field IN (SELECT id FROM config.metabib_field WHERE search_field);
+
+UPDATE metabib.subject_field_entry
+    SET value = value
+    WHERE field IN (SELECT id FROM config.metabib_field WHERE search_field);
+
+UPDATE metabib.keyword_field_entry
+    SET value = value
+    WHERE field IN (SELECT id FROM config.metabib_field WHERE search_field);
+
+UPDATE metabib.series_field_entry
+    SET value = value
+    WHERE field IN (SELECT id FROM config.metabib_field WHERE search_field);
+
+UPDATE metabib.identifier_field_entry
+    SET value = value
+    WHERE field IN (SELECT id FROM config.metabib_field WHERE search_field);
+
+COMMIT;
+
+-- Clean up a bit after ourselves. VACUUM cannot run inside a transaction block, so this must follow COMMIT.
+VACUUM FULL VERBOSE ANALYZE metabib.title_field_entry;
+VACUUM FULL VERBOSE ANALYZE metabib.author_field_entry;
+VACUUM FULL VERBOSE ANALYZE metabib.subject_field_entry;
+VACUUM FULL VERBOSE ANALYZE metabib.keyword_field_entry;
+VACUUM FULL VERBOSE ANALYZE metabib.series_field_entry;
+VACUUM FULL VERBOSE ANALYZE metabib.identifier_field_entry;