From dd21cdc01d2b3e6139714b2196c8c9d274c856a5 Mon Sep 17 00:00:00 2001
From: Lebbeous Fogle-Weekley <lebbeous@esilibrary.com>
Date: Wed, 29 Feb 2012 14:30:59 -0500
Subject: [PATCH] =?utf8?q?AutoSuggest:=20suggest=20"jos=C3=A9"=20when=20th?=
 =?utf8?q?e=20user=20types=20"jose"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

"josÃ©" will serve as my example of a term with diacritics in it.  I
don't mean that there is special handling just for the word "josÃ©" per se.

Before now, the user could type "josÃ©" and see suggestions containing
exactly that.  You could not type "jose" and expect to see "josÃ©".  Now
you can.

In other words, typing "jose" and "josÃ©" should now produce the exact same
set of suggestions.  Only the version you actually typed will be
*highlighted* in the suggestions, however.

Signed-off-by: Lebbeous Fogle-Weekley <lebbeous@esilibrary.com>
Signed-off-by: Dan Scott <dscott@laurentian.ca>
---
 .../src/perlmods/lib/OpenILS/WWW/AutoSuggest.pm    |  14 +-
 Open-ILS/src/sql/Pg/002.schema.config.sql          |   1 +
 Open-ILS/src/sql/Pg/030.schema.metabib.sql         |  54 ++++-
 .../XXXX.schema.autosuggest.search-normalize.sql   | 255 +++++++++++++++++++++
 4 files changed, 307 insertions(+), 17 deletions(-)
 create mode 100644 Open-ILS/src/sql/Pg/upgrade/XXXX.schema.autosuggest.search-normalize.sql

diff --git a/Open-ILS/src/perlmods/lib/OpenILS/WWW/AutoSuggest.pm b/Open-ILS/src/perlmods/lib/OpenILS/WWW/AutoSuggest.pm
index cf8644bc1b..7ed9bf7c65 100644
--- a/Open-ILS/src/perlmods/lib/OpenILS/WWW/AutoSuggest.pm
+++ b/Open-ILS/src/perlmods/lib/OpenILS/WWW/AutoSuggest.pm
@@ -49,18 +49,6 @@ my @_output_handler_types = sort {
 
 # END package globals
 
-# Given a string such as a user might type into a search box, prepare
-# it for to_tsquery(). See
-# http://www.postgresql.org/docs/9.0/static/textsearch-controls.html
-sub prepare_for_tsquery {
-    my ($str) = shift;
-
-    $str =~ s/[^\w\s]/ /ig;
-    $str .= ":*" unless $str =~ /\s$/;
-
-    return join(" & ", grep(length, split(/\s+/, $str)));
-}
-
 # The third argument to our stored procedure, metabib.suggest_browse_entries(),
 # is passed through directly to ts_headline() as the 'options' arugment.
 sub prepare_headline_opts {
@@ -106,7 +94,7 @@ sub get_suggestions {
     return $editor->json_query({
         "from" => [
             "metabib.suggest_browse_entries",
-            prepare_for_tsquery($query),
+            $query,
             $search_class,
             $headline_opts,
             $org_unit,
diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql
index 3eba0105c3..40e3af0eea 100644
--- a/Open-ILS/src/sql/Pg/002.schema.config.sql
+++ b/Open-ILS/src/sql/Pg/002.schema.config.sql
@@ -811,6 +811,7 @@ BEGIN
         value :=  ARRAY_TO_STRING(
             evergreen.regexp_split_to_array(value, E'\\W+'), ' '
         );
+        value := public.search_normalize(value);
     END IF;
 
     NEW.index_vector = to_tsvector((TG_ARGV[0])::regconfig, value);
diff --git a/Open-ILS/src/sql/Pg/030.schema.metabib.sql b/Open-ILS/src/sql/Pg/030.schema.metabib.sql
index 4cc72c0266..04b9f56e55 100644
--- a/Open-ILS/src/sql/Pg/030.schema.metabib.sql
+++ b/Open-ILS/src/sql/Pg/030.schema.metabib.sql
@@ -1297,9 +1297,50 @@ END;
 $func$ LANGUAGE PLPGSQL;
 
 
+-- Given a string such as a user might type into a search box, prepare
+-- two changed variants for TO_TSQUERY(). See
+-- http://www.postgresql.org/docs/9.0/static/textsearch-controls.html
+-- The first variant is normalized to match indexed documents regardless
+-- of diacritics.  The second variant keeps its diacritics for proper
+-- highlighting via TS_HEADLINE().
+CREATE OR REPLACE
+    FUNCTION metabib.autosuggest_prepare_tsquery(orig TEXT) RETURNS TEXT[] AS
+$$
+DECLARE
+    orig_ended_in_space     BOOLEAN;
+    result                  RECORD;
+    plain                   TEXT;
+    normalized              TEXT;
+BEGIN
+    orig_ended_in_space := orig ~ E'\\s$';
+
+    orig := ARRAY_TO_STRING(
+        evergreen.regexp_split_to_array(orig, E'\\W+'), ' '
+    );
+
+    normalized := public.search_normalize(orig); -- also trim()s
+    plain := trim(orig);
+
+    IF NOT orig_ended_in_space THEN
+        plain := plain || ':*';
+        normalized := normalized || ':*';
+    END IF;
+
+    plain := ARRAY_TO_STRING(
+        evergreen.regexp_split_to_array(plain, E'\\s+'), ' & '
+    );
+    normalized := ARRAY_TO_STRING(
+        evergreen.regexp_split_to_array(normalized, E'\\s+'), ' & '
+    );
+
+    RETURN ARRAY[normalized, plain];
+END;
+$$ LANGUAGE PLPGSQL;
+
+
 CREATE OR REPLACE
     FUNCTION metabib.suggest_browse_entries(
-        query_text      TEXT,   -- 'foo' or 'foo & ba:*',ready for to_tsquery()
+        raw_query_text  TEXT,   -- actually typed by humans at the UI level
         search_class    TEXT,   -- 'alias' or 'class' or 'class|field..', etc
         headline_opts   TEXT,   -- markup options for ts_headline()
         visibility_org  INTEGER,-- null if you don't want opac visibility test
@@ -1316,12 +1357,17 @@ CREATE OR REPLACE
         match                   TEXT    -- marked up
     ) AS $func$
 DECLARE
+    prepared_query_texts    TEXT[];
     query                   TSQUERY;
+    plain_query             TSQUERY;
     opac_visibility_join    TEXT;
     search_class_join       TEXT;
     r_fields                RECORD;
 BEGIN
-    query := TO_TSQUERY('keyword', query_text);
+    prepared_query_texts := metabib.autosuggest_prepare_tsquery(raw_query_text);
+
+    query := TO_TSQUERY('keyword', prepared_query_texts[1]);
+    plain_query := TO_TSQUERY('keyword', prepared_query_texts[2]);
 
     IF visibility_org IS NOT NULL THEN
         opac_visibility_join := '
@@ -1374,7 +1420,7 @@ BEGIN
     ';
     END IF;
 
-    RETURN QUERY EXECUTE 'SELECT *, TS_HEADLINE(value, $1, $3) FROM (SELECT DISTINCT
+    RETURN QUERY EXECUTE 'SELECT *, TS_HEADLINE(value, $7, $3) FROM (SELECT DISTINCT
         mbe.value,
         cmf.id,
         cmc.bouyant AND _registered.field_class IS NOT NULL,
@@ -1394,7 +1440,7 @@ BEGIN
     '   -- sic, repeat the order by clause in the outer select too
     USING
         query, search_class, headline_opts,
-        visibility_org, query_limit, normalization
+        visibility_org, query_limit, normalization, plain_query
         ;
 
     -- sort order:
diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.autosuggest.search-normalize.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.autosuggest.search-normalize.sql
new file mode 100644
index 0000000000..14284b5710
--- /dev/null
+++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.autosuggest.search-normalize.sql
@@ -0,0 +1,255 @@
+BEGIN;
+
+SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version);
+
+CREATE OR REPLACE FUNCTION oils_tsearch2 () RETURNS TRIGGER AS $$
+DECLARE
+    normalizer      RECORD;
+    value           TEXT := '';
+BEGIN
+
+    value := NEW.value;
+
+    IF TG_TABLE_NAME::TEXT ~ 'field_entry$' THEN
+        FOR normalizer IN
+            SELECT  n.func AS func,
+                    n.param_count AS param_count,
+                    m.params AS params
+              FROM  config.index_normalizer n
+                    JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id)
+              WHERE field = NEW.field AND m.pos < 0
+              ORDER BY m.pos LOOP
+                EXECUTE 'SELECT ' || normalizer.func || '(' ||
+                    quote_literal( value ) ||
+                    CASE
+                        WHEN normalizer.param_count > 0
+                            THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'')
+                            ELSE ''
+                        END ||
+                    ')' INTO value;
+
+        END LOOP;
+
+        NEW.value := value;
+    END IF;
+
+    IF NEW.index_vector = ''::tsvector THEN
+        RETURN NEW;
+    END IF;
+
+    IF TG_TABLE_NAME::TEXT ~ 'field_entry$' THEN
+        FOR normalizer IN
+            SELECT  n.func AS func,
+                    n.param_count AS param_count,
+                    m.params AS params
+              FROM  config.index_normalizer n
+                    JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id)
+              WHERE field = NEW.field AND m.pos >= 0
+              ORDER BY m.pos LOOP
+                EXECUTE 'SELECT ' || normalizer.func || '(' ||
+                    quote_literal( value ) ||
+                    CASE
+                        WHEN normalizer.param_count > 0
+                            THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'')
+                            ELSE ''
+                        END ||
+                    ')' INTO value;
+
+        END LOOP;
+    END IF;
+
+    IF TG_TABLE_NAME::TEXT ~ 'browse_entry$' THEN
+        value :=  ARRAY_TO_STRING(
+            evergreen.regexp_split_to_array(value, E'\\W+'), ' '
+        );
+        value := public.search_normalize(value);
+    END IF;
+
+    NEW.index_vector = to_tsvector((TG_ARGV[0])::regconfig, value);
+
+    RETURN NEW;
+END;
+$$ LANGUAGE PLPGSQL;
+
+-- Given a string such as a user might type into a search box, prepare
+-- two changed variants for TO_TSQUERY(). See
+-- http://www.postgresql.org/docs/9.0/static/textsearch-controls.html
+-- The first variant is normalized to match indexed documents regardless
+-- of diacritics.  The second variant keeps its diacritics for proper
+-- highlighting via TS_HEADLINE().
+CREATE OR REPLACE
+    FUNCTION metabib.autosuggest_prepare_tsquery(orig TEXT) RETURNS TEXT[] AS
+$$
+DECLARE
+    orig_ended_in_space     BOOLEAN;
+    result                  RECORD;
+    plain                   TEXT;
+    normalized              TEXT;
+BEGIN
+    orig_ended_in_space := orig ~ E'\\s$';
+
+    orig := ARRAY_TO_STRING(
+        evergreen.regexp_split_to_array(orig, E'\\W+'), ' '
+    );
+
+    normalized := public.search_normalize(orig); -- also trim()s
+    plain := trim(orig);
+
+    IF NOT orig_ended_in_space THEN
+        plain := plain || ':*';
+        normalized := normalized || ':*';
+    END IF;
+
+    plain := ARRAY_TO_STRING(
+        evergreen.regexp_split_to_array(plain, E'\\s+'), ' & '
+    );
+    normalized := ARRAY_TO_STRING(
+        evergreen.regexp_split_to_array(normalized, E'\\s+'), ' & '
+    );
+
+    RETURN ARRAY[normalized, plain];
+END;
+$$ LANGUAGE PLPGSQL;
+
+
+CREATE OR REPLACE
+    FUNCTION metabib.suggest_browse_entries(
+        raw_query_text  TEXT,   -- actually typed by humans at the UI level
+        search_class    TEXT,   -- 'alias' or 'class' or 'class|field..', etc
+        headline_opts   TEXT,   -- markup options for ts_headline()
+        visibility_org  INTEGER,-- null if you don't want opac visibility test
+        query_limit     INTEGER,-- use in LIMIT clause of interal query
+        normalization   INTEGER -- argument to TS_RANK_CD()
+    ) RETURNS TABLE (
+        value                   TEXT,   -- plain
+        field                   INTEGER,
+        bouyant_and_class_match BOOL,
+        field_match             BOOL,
+        field_weight            INTEGER,
+        rank                    REAL,
+        bouyant                 BOOL,
+        match                   TEXT    -- marked up
+    ) AS $func$
+DECLARE
+    prepared_query_texts    TEXT[];
+    query                   TSQUERY;
+    plain_query             TSQUERY;
+    opac_visibility_join    TEXT;
+    search_class_join       TEXT;
+    r_fields                RECORD;
+BEGIN
+    prepared_query_texts := metabib.autosuggest_prepare_tsquery(raw_query_text);
+
+    query := TO_TSQUERY('keyword', prepared_query_texts[1]);
+    plain_query := TO_TSQUERY('keyword', prepared_query_texts[2]);
+
+    IF visibility_org IS NOT NULL THEN
+        opac_visibility_join := '
+    JOIN asset.opac_visible_copies aovc ON (
+        aovc.record = mbedm.source AND
+        aovc.circ_lib IN (SELECT id FROM actor.org_unit_descendants($4))
+    )';
+    ELSE
+        opac_visibility_join := '';
+    END IF;
+
+    -- The following determines whether we only provide suggestsons matching
+    -- the user's selected search_class, or whether we show other suggestions
+    -- too. The reason for MIN() is that for search_classes like
+    -- 'title|proper|uniform' you would otherwise get multiple rows.  The
+    -- implication is that if title as a class doesn't have restrict,
+    -- nor does the proper field, but the uniform field does, you're going
+    -- to get 'false' for your overall evaluation of 'should we restrict?'
+    -- To invert that, change from MIN() to MAX().
+
+    SELECT
+        INTO r_fields
+            MIN(cmc.restrict::INT) AS restrict_class,
+            MIN(cmf.restrict::INT) AS restrict_field
+        FROM metabib.search_class_to_registered_components(search_class)
+            AS _registered (field_class TEXT, field INT)
+        JOIN
+            config.metabib_class cmc ON (cmc.name = _registered.field_class)
+        LEFT JOIN
+            config.metabib_field cmf ON (cmf.id = _registered.field);
+
+    -- evaluate 'should we restrict?'
+    IF r_fields.restrict_field::BOOL OR r_fields.restrict_class::BOOL THEN
+        search_class_join := '
+    JOIN
+        metabib.search_class_to_registered_components($2)
+        AS _registered (field_class TEXT, field INT) ON (
+            (_registered.field IS NULL AND
+                _registered.field_class = cmf.field_class) OR
+            (_registered.field = cmf.id)
+        )
+    ';
+    ELSE
+        search_class_join := '
+    LEFT JOIN
+        metabib.search_class_to_registered_components($2)
+        AS _registered (field_class TEXT, field INT) ON (
+            _registered.field_class = cmc.name
+        )
+    ';
+    END IF;
+
+    RETURN QUERY EXECUTE 'SELECT *, TS_HEADLINE(value, $7, $3) FROM (SELECT DISTINCT
+        mbe.value,
+        cmf.id,
+        cmc.bouyant AND _registered.field_class IS NOT NULL,
+        _registered.field = cmf.id,
+        cmf.weight,
+        TS_RANK_CD(mbe.index_vector, $1, $6),
+        cmc.bouyant
+    FROM metabib.browse_entry_def_map mbedm
+    JOIN metabib.browse_entry mbe ON (mbe.id = mbedm.entry)
+    JOIN config.metabib_field cmf ON (cmf.id = mbedm.def)
+    JOIN config.metabib_class cmc ON (cmf.field_class = cmc.name)
+    '  || search_class_join || opac_visibility_join ||
+    ' WHERE $1 @@ mbe.index_vector
+    ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC
+    LIMIT $5) x
+    ORDER BY 3 DESC, 4 DESC NULLS LAST, 5 DESC, 6 DESC, 7 DESC, 1 ASC
+    '   -- sic, repeat the order by clause in the outer select too
+    USING
+        query, search_class, headline_opts,
+        visibility_org, query_limit, normalization, plain_query
+        ;
+
+    -- sort order:
+    --  bouyant AND chosen class = match class
+    --  chosen field = match field
+    --  field weight
+    --  rank
+    --  bouyancy
+    --  value itself
+
+END;
+$func$ LANGUAGE PLPGSQL;
+
+
+\qecho 
+\qecho The following takes about a minute per 100,000 rows in
+\qecho metabib.browse_entry on my development system, which is only a VM with
+\qecho 4 GB of memory and 2 cores.
+\qecho 
+\qecho The following is a very loose estimate of how long the next UPDATE
+\qecho statement would take to finish on MY machine, based on YOUR number
+\qecho of rows in metabib.browse_entry:
+\qecho 
+
+SELECT (COUNT(id) / 100000.0) * INTERVAL '1 minute'
+    AS "approximate duration of following UPDATE statement"
+    FROM metabib.browse_entry;
+
+UPDATE metabib.browse_entry SET index_vector = TO_TSVECTOR(
+    'keyword',
+    public.search_normalize(
+        ARRAY_TO_STRING(
+            evergreen.regexp_split_to_array(value, E'\\W+'), ' '
+        )
+    )
+);
+
+COMMIT;
-- 
2.11.0