From 1c4eb99c4cd1de9b079f5728e79ccd06df234768 Mon Sep 17 00:00:00 2001 From: Dan Scott Date: Sun, 4 Mar 2012 14:10:48 -0500 Subject: [PATCH] Merge autosuggest normalization changes into version upgrade tsbere was working on an updated version_upgrade/2.1-2.2 script in a separate branch, so sign off on his work (based on bshum's assertion that it was good) and merge in the autosuggest normalization changes to get things back into sync. Signed-off-by: Dan Scott --- .../sql/Pg/version-upgrade/2.1-2.2-upgrade-db.sql | 84 +++++++++++++++++----- 1 file changed, 65 insertions(+), 19 deletions(-) diff --git a/Open-ILS/src/sql/Pg/version-upgrade/2.1-2.2-upgrade-db.sql b/Open-ILS/src/sql/Pg/version-upgrade/2.1-2.2-upgrade-db.sql index ded1aacdc5..ff322e0699 100644 --- a/Open-ILS/src/sql/Pg/version-upgrade/2.1-2.2-upgrade-db.sql +++ b/Open-ILS/src/sql/Pg/version-upgrade/2.1-2.2-upgrade-db.sql @@ -11711,7 +11711,7 @@ CREATE TABLE metabib.browse_entry_def_map ( ALTER TABLE config.metabib_field ADD COLUMN browse_field BOOLEAN DEFAULT TRUE NOT NULL; ALTER TABLE config.metabib_field ADD COLUMN browse_xpath TEXT; -ALTER TABLE config.metabib_class ADD COLUMN bouyant BOOLEAN DEFAULT FALSE NOT NULL; +ALTER TABLE config.metabib_class ADD COLUMN buoyant BOOLEAN DEFAULT FALSE NOT NULL; ALTER TABLE config.metabib_class ADD COLUMN restrict BOOLEAN DEFAULT FALSE NOT NULL; ALTER TABLE config.metabib_field ADD COLUMN restrict BOOLEAN DEFAULT FALSE NOT NULL; @@ -12200,10 +12200,58 @@ BEGIN END; $func$ LANGUAGE PLPGSQL; +SELECT evergreen.upgrade_deps_block_check('0679', :eg_version); + +-- The advantage of this over the stock regexp_split_to_array() is that it +-- won't degrade unicode strings. +CREATE OR REPLACE FUNCTION evergreen.regexp_split_to_array(TEXT, TEXT) +RETURNS TEXT[] AS $$ + return encode_array_literal([split $_[1], $_[0]]); +$$ LANGUAGE PLPERLU STRICT IMMUTABLE; + +-- Given a string such as a user might type into a search box, prepare +-- two changed variants for TO_TSQUERY(). See +-- http://www.postgresql.org/docs/9.0/static/textsearch-controls.html +-- The first variant is normalized to match indexed documents regardless +-- of diacritics. The second variant keeps its diacritics for proper +-- highlighting via TS_HEADLINE(). +CREATE OR REPLACE + FUNCTION metabib.autosuggest_prepare_tsquery(orig TEXT) RETURNS TEXT[] AS +$$ +DECLARE + orig_ended_in_space BOOLEAN; + result RECORD; + plain TEXT; + normalized TEXT; +BEGIN + orig_ended_in_space := orig ~ E'\\s$'; + + orig := ARRAY_TO_STRING( + evergreen.regexp_split_to_array(orig, E'\\W+'), ' ' + ); + + normalized := public.search_normalize(orig); -- also trim()s + plain := trim(orig); + + IF NOT orig_ended_in_space THEN + plain := plain || ':*'; + normalized := normalized || ':*'; + END IF; + + plain := ARRAY_TO_STRING( + evergreen.regexp_split_to_array(plain, E'\\s+'), ' & ' + ); + normalized := ARRAY_TO_STRING( + evergreen.regexp_split_to_array(normalized, E'\\s+'), ' & ' + ); + + RETURN ARRAY[normalized, plain]; +END; +$$ LANGUAGE PLPGSQL; CREATE OR REPLACE FUNCTION metabib.suggest_browse_entries( - query_text TEXT, -- 'foo' or 'foo & ba:*',ready for to_tsquery() + raw_query_text TEXT, -- actually typed by humans at the UI level search_class TEXT, -- 'alias' or 'class' or 'class|field..', etc headline_opts TEXT, -- markup options for ts_headline() visibility_org INTEGER,-- null if you don't want opac visibility test @@ -12212,20 +12260,25 @@ CREATE OR REPLACE ) RETURNS TABLE ( value TEXT, -- plain field INTEGER, - bouyant_and_class_match BOOL, + buoyant_and_class_match BOOL, field_match BOOL, field_weight INTEGER, rank REAL, - bouyant BOOL, + buoyant BOOL, match TEXT -- marked up ) AS $func$ DECLARE + prepared_query_texts TEXT[]; query TSQUERY; + plain_query TSQUERY; opac_visibility_join TEXT; search_class_join TEXT; r_fields RECORD; BEGIN - query := TO_TSQUERY('keyword', query_text); + prepared_query_texts := metabib.autosuggest_prepare_tsquery(raw_query_text); + + query := TO_TSQUERY('keyword', prepared_query_texts[1]); + plain_query := TO_TSQUERY('keyword', prepared_query_texts[2]); IF visibility_org IS NOT NULL THEN opac_visibility_join := ' @@ -12278,14 +12331,14 @@ BEGIN '; END IF; - RETURN QUERY EXECUTE 'SELECT *, TS_HEADLINE(value, $1, $3) FROM (SELECT DISTINCT + RETURN QUERY EXECUTE 'SELECT *, TS_HEADLINE(value, $7, $3) FROM (SELECT DISTINCT mbe.value, cmf.id, - cmc.bouyant AND _registered.field_class IS NOT NULL, + cmc.buoyant AND _registered.field_class IS NOT NULL, _registered.field = cmf.id, cmf.weight, TS_RANK_CD(mbe.index_vector, $1, $6), - cmc.bouyant + cmc.buoyant FROM metabib.browse_entry_def_map mbedm JOIN metabib.browse_entry mbe ON (mbe.id = mbedm.entry) JOIN config.metabib_field cmf ON (cmf.id = mbedm.def) @@ -12298,28 +12351,20 @@ BEGIN ' -- sic, repeat the order by clause in the outer select too USING query, search_class, headline_opts, - visibility_org, query_limit, normalization + visibility_org, query_limit, normalization, plain_query ; -- sort order: - -- bouyant AND chosen class = match class + -- buoyant AND chosen class = match class -- chosen field = match field -- field weight -- rank - -- bouyancy + -- buoyancy -- value itself END; $func$ LANGUAGE PLPGSQL; --- The advantage of this over the stock regexp_split_to_array() is that it --- won't degrade unicode strings. -CREATE OR REPLACE FUNCTION evergreen.regexp_split_to_array(TEXT, TEXT) -RETURNS TEXT[] AS $$ - return encode_array_literal([split $_[1], $_[0]]); -$$ LANGUAGE PLPERLU STRICT IMMUTABLE; - - -- Adds some logic for browse_entry to split on non-word chars for index_vector, post-normalize CREATE OR REPLACE FUNCTION oils_tsearch2 () RETURNS TRIGGER AS $$ DECLARE @@ -12381,6 +12426,7 @@ BEGIN value := ARRAY_TO_STRING( evergreen.regexp_split_to_array(value, E'\\W+'), ' ' ); + value := public.search_normalize(value); END IF; NEW.index_vector = to_tsvector((TG_ARGV[0])::regconfig, value); -- 2.11.0