From dd21cdc01d2b3e6139714b2196c8c9d274c856a5 Mon Sep 17 00:00:00 2001 From: Lebbeous Fogle-Weekley Date: Wed, 29 Feb 2012 14:30:59 -0500 Subject: [PATCH] =?utf8?q?AutoSuggest:=20suggest=20"jos=C3=A9"=20when=20th?= =?utf8?q?e=20user=20types=20"jose"?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit "josé" will serve as my example of a term with diacritics in it. I don't mean that there is special handling just for the word "josé" per se. Before now, the user could type "josé" and see suggestions containing exactly that. You could not type "jose" and expect to see "josé". Now you can. In other words, typing "jose" and "josé" should now produce the exact same set of suggestions. Only the version you actually typed will be *highlighted* in the suggestions, however. Signed-off-by: Lebbeous Fogle-Weekley Signed-off-by: Dan Scott --- .../src/perlmods/lib/OpenILS/WWW/AutoSuggest.pm | 14 +- Open-ILS/src/sql/Pg/002.schema.config.sql | 1 + Open-ILS/src/sql/Pg/030.schema.metabib.sql | 54 ++++- .../XXXX.schema.autosuggest.search-normalize.sql | 255 +++++++++++++++++++++ 4 files changed, 307 insertions(+), 17 deletions(-) create mode 100644 Open-ILS/src/sql/Pg/upgrade/XXXX.schema.autosuggest.search-normalize.sql diff --git a/Open-ILS/src/perlmods/lib/OpenILS/WWW/AutoSuggest.pm b/Open-ILS/src/perlmods/lib/OpenILS/WWW/AutoSuggest.pm index cf8644bc1b..7ed9bf7c65 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/WWW/AutoSuggest.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/WWW/AutoSuggest.pm @@ -49,18 +49,6 @@ my @_output_handler_types = sort { # END package globals -# Given a string such as a user might type into a search box, prepare -# it for to_tsquery(). See -# http://www.postgresql.org/docs/9.0/static/textsearch-controls.html -sub prepare_for_tsquery { - my ($str) = shift; - - $str =~ s/[^\w\s]/ /ig; - $str .= ":*" unless $str =~ /\s$/; - - return join(" & ", grep(length, split(/\s+/, $str))); -} - # The third argument to our stored procedure, metabib.suggest_browse_entries(), # is passed through directly to ts_headline() as the 'options' arugment. sub prepare_headline_opts { @@ -106,7 +94,7 @@ sub get_suggestions { return $editor->json_query({ "from" => [ "metabib.suggest_browse_entries", - prepare_for_tsquery($query), + $query, $search_class, $headline_opts, $org_unit, diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql index 3eba0105c3..40e3af0eea 100644 --- a/Open-ILS/src/sql/Pg/002.schema.config.sql +++ b/Open-ILS/src/sql/Pg/002.schema.config.sql @@ -811,6 +811,7 @@ BEGIN value := ARRAY_TO_STRING( evergreen.regexp_split_to_array(value, E'\\W+'), ' ' ); + value := public.search_normalize(value); END IF; NEW.index_vector = to_tsvector((TG_ARGV[0])::regconfig, value); diff --git a/Open-ILS/src/sql/Pg/030.schema.metabib.sql b/Open-ILS/src/sql/Pg/030.schema.metabib.sql index 4cc72c0266..04b9f56e55 100644 --- a/Open-ILS/src/sql/Pg/030.schema.metabib.sql +++ b/Open-ILS/src/sql/Pg/030.schema.metabib.sql @@ -1297,9 +1297,50 @@ END; $func$ LANGUAGE PLPGSQL; +-- Given a string such as a user might type into a search box, prepare +-- two changed variants for TO_TSQUERY(). See +-- http://www.postgresql.org/docs/9.0/static/textsearch-controls.html +-- The first variant is normalized to match indexed documents regardless +-- of diacritics. The second variant keeps its diacritics for proper +-- highlighting via TS_HEADLINE(). +CREATE OR REPLACE + FUNCTION metabib.autosuggest_prepare_tsquery(orig TEXT) RETURNS TEXT[] AS +$$ +DECLARE + orig_ended_in_space BOOLEAN; + result RECORD; + plain TEXT; + normalized TEXT; +BEGIN + orig_ended_in_space := orig ~ E'\\s$'; + + orig := ARRAY_TO_STRING( + evergreen.regexp_split_to_array(orig, E'\\W+'), ' ' + ); + + normalized := public.search_normalize(orig); -- also trim()s + plain := trim(orig); + + IF NOT orig_ended_in_space THEN + plain := plain || ':*'; + normalized := normalized || ':*'; + END IF; + + plain := ARRAY_TO_STRING( + evergreen.regexp_split_to_array(plain, E'\\s+'), ' & ' + ); + normalized := ARRAY_TO_STRING( + evergreen.regexp_split_to_array(normalized, E'\\s+'), ' & ' + ); + + RETURN ARRAY[normalized, plain]; +END; +$$ LANGUAGE PLPGSQL; + + CREATE OR REPLACE FUNCTION metabib.suggest_browse_entries( - query_text TEXT, -- 'foo' or 'foo & ba:*',ready for to_tsquery() + raw_query_text TEXT, -- actually typed by humans at the UI level search_class TEXT, -- 'alias' or 'class' or 'class|field..', etc headline_opts TEXT, -- markup options for ts_headline() visibility_org INTEGER,-- null if you don't want opac visibility test @@ -1316,12 +1357,17 @@ CREATE OR REPLACE match TEXT -- marked up ) AS $func$ DECLARE + prepared_query_texts TEXT[]; query TSQUERY; + plain_query TSQUERY; opac_visibility_join TEXT; search_class_join TEXT; r_fields RECORD; BEGIN - query := TO_TSQUERY('keyword', query_text); + prepared_query_texts := metabib.autosuggest_prepare_tsquery(raw_query_text); + + query := TO_TSQUERY('keyword', prepared_query_texts[1]); + plain_query := TO_TSQUERY('keyword', prepared_query_texts[2]); IF visibility_org IS NOT NULL THEN opac_visibility_join := ' @@ -1374,7 +1420,7 @@ BEGIN '; END IF; - RETURN QUERY EXECUTE 'SELECT *, TS_HEADLINE(value, $1, $3) FROM (SELECT DISTINCT + RETURN QUERY EXECUTE 'SELECT *, TS_HEADLINE(value, $7, $3) FROM (SELECT DISTINCT mbe.value, cmf.id, cmc.bouyant AND _registered.field_class IS NOT NULL, @@ -1394,7 +1440,7 @@ BEGIN ' -- sic, repeat the order by clause in the outer select too USING query, search_class, headline_opts, - visibility_org, query_limit, normalization + visibility_org, query_limit, normalization, plain_query ; -- sort order: diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.autosuggest.search-normalize.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.autosuggest.search-normalize.sql new file mode 100644 index 0000000000..14284b5710 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.autosuggest.search-normalize.sql @@ -0,0 +1,255 @@ +BEGIN; + +SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version); + +CREATE OR REPLACE FUNCTION oils_tsearch2 () RETURNS TRIGGER AS $$ +DECLARE + normalizer RECORD; + value TEXT := ''; +BEGIN + + value := NEW.value; + + IF TG_TABLE_NAME::TEXT ~ 'field_entry$' THEN + FOR normalizer IN + SELECT n.func AS func, + n.param_count AS param_count, + m.params AS params + FROM config.index_normalizer n + JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id) + WHERE field = NEW.field AND m.pos < 0 + ORDER BY m.pos LOOP + EXECUTE 'SELECT ' || normalizer.func || '(' || + quote_literal( value ) || + CASE + WHEN normalizer.param_count > 0 + THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') + ELSE '' + END || + ')' INTO value; + + END LOOP; + + NEW.value := value; + END IF; + + IF NEW.index_vector = ''::tsvector THEN + RETURN NEW; + END IF; + + IF TG_TABLE_NAME::TEXT ~ 'field_entry$' THEN + FOR normalizer IN + SELECT n.func AS func, + n.param_count AS param_count, + m.params AS params + FROM config.index_normalizer n + JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id) + WHERE field = NEW.field AND m.pos >= 0 + ORDER BY m.pos LOOP + EXECUTE 'SELECT ' || normalizer.func || '(' || + quote_literal( value ) || + CASE + WHEN normalizer.param_count > 0 + THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') + ELSE '' + END || + ')' INTO value; + + END LOOP; + END IF; + + IF TG_TABLE_NAME::TEXT ~ 'browse_entry$' THEN + value := ARRAY_TO_STRING( + evergreen.regexp_split_to_array(value, E'\\W+'), ' ' + ); + value := public.search_normalize(value); + END IF; + + NEW.index_vector = to_tsvector((TG_ARGV[0])::regconfig, value); + + RETURN NEW; +END; +$$ LANGUAGE PLPGSQL; + +-- Given a string such as a user might type into a search box, prepare +-- two changed variants for TO_TSQUERY(). See +-- http://www.postgresql.org/docs/9.0/static/textsearch-controls.html +-- The first variant is normalized to match indexed documents regardless +-- of diacritics. The second variant keeps its diacritics for proper +-- highlighting via TS_HEADLINE(). +CREATE OR REPLACE + FUNCTION metabib.autosuggest_prepare_tsquery(orig TEXT) RETURNS TEXT[] AS +$$ +DECLARE + orig_ended_in_space BOOLEAN; + result RECORD; + plain TEXT; + normalized TEXT; +BEGIN + orig_ended_in_space := orig ~ E'\\s$'; + + orig := ARRAY_TO_STRING( + evergreen.regexp_split_to_array(orig, E'\\W+'), ' ' + ); + + normalized := public.search_normalize(orig); -- also trim()s + plain := trim(orig); + + IF NOT orig_ended_in_space THEN + plain := plain || ':*'; + normalized := normalized || ':*'; + END IF; + + plain := ARRAY_TO_STRING( + evergreen.regexp_split_to_array(plain, E'\\s+'), ' & ' + ); + normalized := ARRAY_TO_STRING( + evergreen.regexp_split_to_array(normalized, E'\\s+'), ' & ' + ); + + RETURN ARRAY[normalized, plain]; +END; +$$ LANGUAGE PLPGSQL; + + +CREATE OR REPLACE + FUNCTION metabib.suggest_browse_entries( + raw_query_text TEXT, -- actually typed by humans at the UI level + search_class TEXT, -- 'alias' or 'class' or 'class|field..', etc + headline_opts TEXT, -- markup options for ts_headline() + visibility_org INTEGER,-- null if you don't want opac visibility test + query_limit INTEGER,-- use in LIMIT clause of interal query + normalization INTEGER -- argument to TS_RANK_CD() + ) RETURNS TABLE ( + value TEXT, -- plain + field INTEGER, + bouyant_and_class_match BOOL, + field_match BOOL, + field_weight INTEGER, + rank REAL, + bouyant BOOL, + match TEXT -- marked up + ) AS $func$ +DECLARE + prepared_query_texts TEXT[]; + query TSQUERY; + plain_query TSQUERY; + opac_visibility_join TEXT; + search_class_join TEXT; + r_fields RECORD; +BEGIN + prepared_query_texts := metabib.autosuggest_prepare_tsquery(raw_query_text); + + query := TO_TSQUERY('keyword', prepared_query_texts[1]); + plain_query := TO_TSQUERY('keyword', prepared_query_texts[2]); + + IF visibility_org IS NOT NULL THEN + opac_visibility_join := ' + JOIN asset.opac_visible_copies aovc ON ( + aovc.record = mbedm.source AND + aovc.circ_lib IN (SELECT id FROM actor.org_unit_descendants($4)) + )'; + ELSE + opac_visibility_join := ''; + END IF; + + -- The following determines whether we only provide suggestsons matching + -- the user's selected search_class, or whether we show other suggestions + -- too. The reason for MIN() is that for search_classes like + -- 'title|proper|uniform' you would otherwise get multiple rows. The + -- implication is that if title as a class doesn't have restrict, + -- nor does the proper field, but the uniform field does, you're going + -- to get 'false' for your overall evaluation of 'should we restrict?' + -- To invert that, change from MIN() to MAX(). + + SELECT + INTO r_fields + MIN(cmc.restrict::INT) AS restrict_class, + MIN(cmf.restrict::INT) AS restrict_field + FROM metabib.search_class_to_registered_components(search_class) + AS _registered (field_class TEXT, field INT) + JOIN + config.metabib_class cmc ON (cmc.name = _registered.field_class) + LEFT JOIN + config.metabib_field cmf ON (cmf.id = _registered.field); + + -- evaluate 'should we restrict?' + IF r_fields.restrict_field::BOOL OR r_fields.restrict_class::BOOL THEN + search_class_join := ' + JOIN + metabib.search_class_to_registered_components($2) + AS _registered (field_class TEXT, field INT) ON ( + (_registered.field IS NULL AND + _registered.field_class = cmf.field_class) OR + (_registered.field = cmf.id) + ) + '; + ELSE + search_class_join := ' + LEFT JOIN + metabib.search_class_to_registered_components($2) + AS _registered (field_class TEXT, field INT) ON ( + _registered.field_class = cmc.name + ) + '; + END IF; + + RETURN QUERY EXECUTE 'SELECT *, TS_HEADLINE(value, $7, $3) FROM (SELECT DISTINCT + mbe.value, + cmf.id, + cmc.bouyant AND _registered.field_class IS NOT NULL, + _registered.field = cmf.id, + cmf.weight, + TS_RANK_CD(mbe.index_vector, $1, $6), + cmc.bouyant + FROM metabib.browse_entry_def_map mbedm + JOIN metabib.browse_entry mbe ON (mbe.id = mbedm.entry) + JOIN config.metabib_field cmf ON (cmf.id = mbedm.def) + JOIN config.metabib_class cmc ON (cmf.field_class = cmc.name) + ' || search_class_join || opac_visibility_join || + ' WHERE $1 @@ mbe.index_vector + ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC + LIMIT $5) x + ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC + ' -- sic, repeat the order by clause in the outer select too + USING + query, search_class, headline_opts, + visibility_org, query_limit, normalization, plain_query + ; + + -- sort order: + -- bouyant AND chosen class = match class + -- chosen field = match field + -- field weight + -- rank + -- bouyancy + -- value itself + +END; +$func$ LANGUAGE PLPGSQL; + + +\qecho +\qecho The following takes about a minute per 100,000 rows in +\qecho metabib.browse_entry on my development system, which is only a VM with +\qecho 4 GB of memory and 2 cores. +\qecho +\qecho The following is a very loose estimate of how long the next UPDATE +\qecho statement would take to finish on MY machine, based on YOUR number +\qecho of rows in metabib.browse_entry: +\qecho + +SELECT (COUNT(id) / 100000.0) * INTERVAL '1 minute' + AS "approximate duration of following UPDATE statement" + FROM metabib.browse_entry; + +UPDATE metabib.browse_entry SET index_vector = TO_TSVECTOR( + 'keyword', + public.search_normalize( + ARRAY_TO_STRING( + evergreen.regexp_split_to_array(value, E'\\W+'), ' ' + ) + ) +); + +COMMIT; -- 2.11.0