From: Dan Scott <dscott@laurentian.ca> Date: Sun, 4 Mar 2012 17:28:51 +0000 (-0500) Subject: Number the autosuggest normalization upgrade script X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=d3bf88739ea88768d92831062219f76f58b6a1e3;p=evergreen%2Fmasslnc.git Number the autosuggest normalization upgrade script Signed-off-by: Dan Scott <dscott@laurentian.ca> --- diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql index 66d37a8d12..4c1bc1e89b 100644 --- a/Open-ILS/src/sql/Pg/002.schema.config.sql +++ b/Open-ILS/src/sql/Pg/002.schema.config.sql @@ -86,7 +86,7 @@ CREATE TRIGGER no_overlapping_deps BEFORE INSERT OR UPDATE ON config.db_patch_dependencies FOR EACH ROW EXECUTE PROCEDURE evergreen.array_overlap_check ('deprecates'); -INSERT INTO config.upgrade_log (version, applied_to) VALUES ('0678', :eg_version); -- berick/miker +INSERT INTO config.upgrade_log (version, applied_to) VALUES ('0679', :eg_version); -- berick/miker CREATE TABLE config.bib_source ( id SERIAL PRIMARY KEY, diff --git a/Open-ILS/src/sql/Pg/upgrade/0679.schema.autosuggest.search-normalize.sql b/Open-ILS/src/sql/Pg/upgrade/0679.schema.autosuggest.search-normalize.sql new file mode 100644 index 0000000000..d82af47df9 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/0679.schema.autosuggest.search-normalize.sql @@ -0,0 +1,263 @@ +BEGIN; + +SELECT evergreen.upgrade_deps_block_check('0679', :eg_version); + +-- Address typo in column name +ALTER TABLE config.metabib_class ADD COLUMN buoyant BOOL DEFAULT FALSE NOT NULL; +UPDATE config.metabib_class SET buoyant = bouyant; +ALTER TABLE config.metabib_class DROP COLUMN bouyant; + +CREATE OR REPLACE FUNCTION oils_tsearch2 () RETURNS TRIGGER AS $$ +DECLARE + normalizer RECORD; + value TEXT := ''; +BEGIN + + value := NEW.value; + + IF TG_TABLE_NAME::TEXT ~ 'field_entry$' THEN + FOR normalizer IN + SELECT n.func AS func, + n.param_count AS param_count, + m.params AS params + FROM config.index_normalizer n + JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id) + WHERE field = NEW.field AND m.pos < 0 + ORDER BY m.pos LOOP + EXECUTE 'SELECT ' || normalizer.func || '(' || + quote_literal( value ) || + CASE + WHEN normalizer.param_count > 0 + THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') + ELSE '' + END || + ')' INTO value; + + END LOOP; + + NEW.value := value; + END IF; + + IF NEW.index_vector = ''::tsvector THEN + RETURN NEW; + END IF; + + IF TG_TABLE_NAME::TEXT ~ 'field_entry$' THEN + FOR normalizer IN + SELECT n.func AS func, + n.param_count AS param_count, + m.params AS params + FROM config.index_normalizer n + JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id) + WHERE field = NEW.field AND m.pos >= 0 + ORDER BY m.pos LOOP + EXECUTE 'SELECT ' || normalizer.func || '(' || + quote_literal( value ) || + CASE + WHEN normalizer.param_count > 0 + THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') + ELSE '' + END || + ')' INTO value; + + END LOOP; + END IF; + + IF TG_TABLE_NAME::TEXT ~ 'browse_entry$' THEN + value := ARRAY_TO_STRING( + evergreen.regexp_split_to_array(value, E'\\W+'), ' ' + ); + value := public.search_normalize(value); + END IF; + + NEW.index_vector = to_tsvector((TG_ARGV[0])::regconfig, value); + + RETURN NEW; +END; +$$ LANGUAGE PLPGSQL; + +-- Given a string such as a user might type into a search box, prepare +-- two changed variants for TO_TSQUERY(). See +-- http://www.postgresql.org/docs/9.0/static/textsearch-controls.html +-- The first variant is normalized to match indexed documents regardless +-- of diacritics. The second variant keeps its diacritics for proper +-- highlighting via TS_HEADLINE(). +CREATE OR REPLACE + FUNCTION metabib.autosuggest_prepare_tsquery(orig TEXT) RETURNS TEXT[] AS +$$ +DECLARE + orig_ended_in_space BOOLEAN; + result RECORD; + plain TEXT; + normalized TEXT; +BEGIN + orig_ended_in_space := orig ~ E'\\s$'; + + orig := ARRAY_TO_STRING( + evergreen.regexp_split_to_array(orig, E'\\W+'), ' ' + ); + + normalized := public.search_normalize(orig); -- also trim()s + plain := trim(orig); + + IF NOT orig_ended_in_space THEN + plain := plain || ':*'; + normalized := normalized || ':*'; + END IF; + + plain := ARRAY_TO_STRING( + evergreen.regexp_split_to_array(plain, E'\\s+'), ' & ' + ); + normalized := ARRAY_TO_STRING( + evergreen.regexp_split_to_array(normalized, E'\\s+'), ' & ' + ); + + RETURN ARRAY[normalized, plain]; +END; +$$ LANGUAGE PLPGSQL; + + +-- Definition of OUT parameters changes, so must drop first +DROP FUNCTION IF EXISTS metabib.suggest_browse_entries (TEXT, TEXT, TEXT, INTEGER, INTEGER, INTEGER); + +CREATE OR REPLACE + FUNCTION metabib.suggest_browse_entries( + raw_query_text TEXT, -- actually typed by humans at the UI level + search_class TEXT, -- 'alias' or 'class' or 'class|field..', etc + headline_opts TEXT, -- markup options for ts_headline() + visibility_org INTEGER,-- null if you don't want opac visibility test + query_limit INTEGER,-- use in LIMIT clause of interal query + normalization INTEGER -- argument to TS_RANK_CD() + ) RETURNS TABLE ( + value TEXT, -- plain + field INTEGER, + buoyant_and_class_match BOOL, + field_match BOOL, + field_weight INTEGER, + rank REAL, + buoyant BOOL, + match TEXT -- marked up + ) AS $func$ +DECLARE + prepared_query_texts TEXT[]; + query TSQUERY; + plain_query TSQUERY; + opac_visibility_join TEXT; + search_class_join TEXT; + r_fields RECORD; +BEGIN + prepared_query_texts := metabib.autosuggest_prepare_tsquery(raw_query_text); + + query := TO_TSQUERY('keyword', prepared_query_texts[1]); + plain_query := TO_TSQUERY('keyword', prepared_query_texts[2]); + + IF visibility_org IS NOT NULL THEN + opac_visibility_join := ' + JOIN asset.opac_visible_copies aovc ON ( + aovc.record = mbedm.source AND + aovc.circ_lib IN (SELECT id FROM actor.org_unit_descendants($4)) + )'; + ELSE + opac_visibility_join := ''; + END IF; + + -- The following determines whether we only provide suggestsons matching + -- the user's selected search_class, or whether we show other suggestions + -- too. The reason for MIN() is that for search_classes like + -- 'title|proper|uniform' you would otherwise get multiple rows. The + -- implication is that if title as a class doesn't have restrict, + -- nor does the proper field, but the uniform field does, you're going + -- to get 'false' for your overall evaluation of 'should we restrict?' + -- To invert that, change from MIN() to MAX(). + + SELECT + INTO r_fields + MIN(cmc.restrict::INT) AS restrict_class, + MIN(cmf.restrict::INT) AS restrict_field + FROM metabib.search_class_to_registered_components(search_class) + AS _registered (field_class TEXT, field INT) + JOIN + config.metabib_class cmc ON (cmc.name = _registered.field_class) + LEFT JOIN + config.metabib_field cmf ON (cmf.id = _registered.field); + + -- evaluate 'should we restrict?' + IF r_fields.restrict_field::BOOL OR r_fields.restrict_class::BOOL THEN + search_class_join := ' + JOIN + metabib.search_class_to_registered_components($2) + AS _registered (field_class TEXT, field INT) ON ( + (_registered.field IS NULL AND + _registered.field_class = cmf.field_class) OR + (_registered.field = cmf.id) + ) + '; + ELSE + search_class_join := ' + LEFT JOIN + metabib.search_class_to_registered_components($2) + AS _registered (field_class TEXT, field INT) ON ( + _registered.field_class = cmc.name + ) + '; + END IF; + + RETURN QUERY EXECUTE 'SELECT *, TS_HEADLINE(value, $7, $3) FROM (SELECT DISTINCT + mbe.value, + cmf.id, + cmc.buoyant AND _registered.field_class IS NOT NULL, + _registered.field = cmf.id, + cmf.weight, + TS_RANK_CD(mbe.index_vector, $1, $6), + cmc.buoyant + FROM metabib.browse_entry_def_map mbedm + JOIN metabib.browse_entry mbe ON (mbe.id = mbedm.entry) + JOIN config.metabib_field cmf ON (cmf.id = mbedm.def) + JOIN config.metabib_class cmc ON (cmf.field_class = cmc.name) + ' || search_class_join || opac_visibility_join || + ' WHERE $1 @@ mbe.index_vector + ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC + LIMIT $5) x + ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC + ' -- sic, repeat the order by clause in the outer select too + USING + query, search_class, headline_opts, + visibility_org, query_limit, normalization, plain_query + ; + + -- sort order: + -- buoyant AND chosen class = match class + -- chosen field = match field + -- field weight + -- rank + -- buoyancy + -- value itself + +END; +$func$ LANGUAGE PLPGSQL; + + +\qecho +\qecho The following takes about a minute per 100,000 rows in +\qecho metabib.browse_entry on my development system, which is only a VM with +\qecho 4 GB of memory and 2 cores. +\qecho +\qecho The following is a very loose estimate of how long the next UPDATE +\qecho statement would take to finish on MY machine, based on YOUR number +\qecho of rows in metabib.browse_entry: +\qecho + +SELECT (COUNT(id) / 100000.0) * INTERVAL '1 minute' + AS "approximate duration of following UPDATE statement" + FROM metabib.browse_entry; + +UPDATE metabib.browse_entry SET index_vector = TO_TSVECTOR( + 'keyword', + public.search_normalize( + ARRAY_TO_STRING( + evergreen.regexp_split_to_array(value, E'\\W+'), ' ' + ) + ) +); + +COMMIT; diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.autosuggest.search-normalize.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.autosuggest.search-normalize.sql deleted file mode 100644 index 4aca7139c7..0000000000 --- a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.autosuggest.search-normalize.sql +++ /dev/null @@ -1,258 +0,0 @@ -BEGIN; - -SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version); - -CREATE OR REPLACE FUNCTION oils_tsearch2 () RETURNS TRIGGER AS $$ -DECLARE - normalizer RECORD; - value TEXT := ''; -BEGIN - - value := NEW.value; - - IF TG_TABLE_NAME::TEXT ~ 'field_entry$' THEN - FOR normalizer IN - SELECT n.func AS func, - n.param_count AS param_count, - m.params AS params - FROM config.index_normalizer n - JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id) - WHERE field = NEW.field AND m.pos < 0 - ORDER BY m.pos LOOP - EXECUTE 'SELECT ' || normalizer.func || '(' || - quote_literal( value ) || - CASE - WHEN normalizer.param_count > 0 - THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') - ELSE '' - END || - ')' INTO value; - - END LOOP; - - NEW.value := value; - END IF; - - IF NEW.index_vector = ''::tsvector THEN - RETURN NEW; - END IF; - - IF TG_TABLE_NAME::TEXT ~ 'field_entry$' THEN - FOR normalizer IN - SELECT n.func AS func, - n.param_count AS param_count, - m.params AS params - FROM config.index_normalizer n - JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id) - WHERE field = NEW.field AND m.pos >= 0 - ORDER BY m.pos LOOP - EXECUTE 'SELECT ' || normalizer.func || '(' || - quote_literal( value ) || - CASE - WHEN normalizer.param_count > 0 - THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') - ELSE '' - END || - ')' INTO value; - - END LOOP; - END IF; - - IF TG_TABLE_NAME::TEXT ~ 'browse_entry$' THEN - value := ARRAY_TO_STRING( - evergreen.regexp_split_to_array(value, E'\\W+'), ' ' - ); - value := public.search_normalize(value); - END IF; - - NEW.index_vector = to_tsvector((TG_ARGV[0])::regconfig, value); - - RETURN NEW; -END; -$$ LANGUAGE PLPGSQL; - --- Given a string such as a user might type into a search box, prepare --- two changed variants for TO_TSQUERY(). See --- http://www.postgresql.org/docs/9.0/static/textsearch-controls.html --- The first variant is normalized to match indexed documents regardless --- of diacritics. The second variant keeps its diacritics for proper --- highlighting via TS_HEADLINE(). -CREATE OR REPLACE - FUNCTION metabib.autosuggest_prepare_tsquery(orig TEXT) RETURNS TEXT[] AS -$$ -DECLARE - orig_ended_in_space BOOLEAN; - result RECORD; - plain TEXT; - normalized TEXT; -BEGIN - orig_ended_in_space := orig ~ E'\\s$'; - - orig := ARRAY_TO_STRING( - evergreen.regexp_split_to_array(orig, E'\\W+'), ' ' - ); - - normalized := public.search_normalize(orig); -- also trim()s - plain := trim(orig); - - IF NOT orig_ended_in_space THEN - plain := plain || ':*'; - normalized := normalized || ':*'; - END IF; - - plain := ARRAY_TO_STRING( - evergreen.regexp_split_to_array(plain, E'\\s+'), ' & ' - ); - normalized := ARRAY_TO_STRING( - evergreen.regexp_split_to_array(normalized, E'\\s+'), ' & ' - ); - - RETURN ARRAY[normalized, plain]; -END; -$$ LANGUAGE PLPGSQL; - - --- Definition of OUT parameters changes, so must drop first -DROP FUNCTION IF EXISTS metabib.suggest_browse_entries (TEXT, TEXT, TEXT, INTEGER, INTEGER, INTEGER); - -CREATE OR REPLACE - FUNCTION metabib.suggest_browse_entries( - raw_query_text TEXT, -- actually typed by humans at the UI level - search_class TEXT, -- 'alias' or 'class' or 'class|field..', etc - headline_opts TEXT, -- markup options for ts_headline() - visibility_org INTEGER,-- null if you don't want opac visibility test - query_limit INTEGER,-- use in LIMIT clause of interal query - normalization INTEGER -- argument to TS_RANK_CD() - ) RETURNS TABLE ( - value TEXT, -- plain - field INTEGER, - buoyant_and_class_match BOOL, - field_match BOOL, - field_weight INTEGER, - rank REAL, - buoyant BOOL, - match TEXT -- marked up - ) AS $func$ -DECLARE - prepared_query_texts TEXT[]; - query TSQUERY; - plain_query TSQUERY; - opac_visibility_join TEXT; - search_class_join TEXT; - r_fields RECORD; -BEGIN - prepared_query_texts := metabib.autosuggest_prepare_tsquery(raw_query_text); - - query := TO_TSQUERY('keyword', prepared_query_texts[1]); - plain_query := TO_TSQUERY('keyword', prepared_query_texts[2]); - - IF visibility_org IS NOT NULL THEN - opac_visibility_join := ' - JOIN asset.opac_visible_copies aovc ON ( - aovc.record = mbedm.source AND - aovc.circ_lib IN (SELECT id FROM actor.org_unit_descendants($4)) - )'; - ELSE - opac_visibility_join := ''; - END IF; - - -- The following determines whether we only provide suggestsons matching - -- the user's selected search_class, or whether we show other suggestions - -- too. The reason for MIN() is that for search_classes like - -- 'title|proper|uniform' you would otherwise get multiple rows. The - -- implication is that if title as a class doesn't have restrict, - -- nor does the proper field, but the uniform field does, you're going - -- to get 'false' for your overall evaluation of 'should we restrict?' - -- To invert that, change from MIN() to MAX(). - - SELECT - INTO r_fields - MIN(cmc.restrict::INT) AS restrict_class, - MIN(cmf.restrict::INT) AS restrict_field - FROM metabib.search_class_to_registered_components(search_class) - AS _registered (field_class TEXT, field INT) - JOIN - config.metabib_class cmc ON (cmc.name = _registered.field_class) - LEFT JOIN - config.metabib_field cmf ON (cmf.id = _registered.field); - - -- evaluate 'should we restrict?' - IF r_fields.restrict_field::BOOL OR r_fields.restrict_class::BOOL THEN - search_class_join := ' - JOIN - metabib.search_class_to_registered_components($2) - AS _registered (field_class TEXT, field INT) ON ( - (_registered.field IS NULL AND - _registered.field_class = cmf.field_class) OR - (_registered.field = cmf.id) - ) - '; - ELSE - search_class_join := ' - LEFT JOIN - metabib.search_class_to_registered_components($2) - AS _registered (field_class TEXT, field INT) ON ( - _registered.field_class = cmc.name - ) - '; - END IF; - - RETURN QUERY EXECUTE 'SELECT *, TS_HEADLINE(value, $7, $3) FROM (SELECT DISTINCT - mbe.value, - cmf.id, - cmc.buoyant AND _registered.field_class IS NOT NULL, - _registered.field = cmf.id, - cmf.weight, - TS_RANK_CD(mbe.index_vector, $1, $6), - cmc.buoyant - FROM metabib.browse_entry_def_map mbedm - JOIN metabib.browse_entry mbe ON (mbe.id = mbedm.entry) - JOIN config.metabib_field cmf ON (cmf.id = mbedm.def) - JOIN config.metabib_class cmc ON (cmf.field_class = cmc.name) - ' || search_class_join || opac_visibility_join || - ' WHERE $1 @@ mbe.index_vector - ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC - LIMIT $5) x - ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC - ' -- sic, repeat the order by clause in the outer select too - USING - query, search_class, headline_opts, - visibility_org, query_limit, normalization, plain_query - ; - - -- sort order: - -- buoyant AND chosen class = match class - -- chosen field = match field - -- field weight - -- rank - -- buoyancy - -- value itself - -END; -$func$ LANGUAGE PLPGSQL; - - -\qecho -\qecho The following takes about a minute per 100,000 rows in -\qecho metabib.browse_entry on my development system, which is only a VM with -\qecho 4 GB of memory and 2 cores. -\qecho -\qecho The following is a very loose estimate of how long the next UPDATE -\qecho statement would take to finish on MY machine, based on YOUR number -\qecho of rows in metabib.browse_entry: -\qecho - -SELECT (COUNT(id) / 100000.0) * INTERVAL '1 minute' - AS "approximate duration of following UPDATE statement" - FROM metabib.browse_entry; - -UPDATE metabib.browse_entry SET index_vector = TO_TSVECTOR( - 'keyword', - public.search_normalize( - ARRAY_TO_STRING( - evergreen.regexp_split_to_array(value, E'\\W+'), ' ' - ) - ) -); - -COMMIT;