JBAS-1050 Avoid stripping periods from search indexes
authorBill Erickson <berickxx@gmail.com>
Mon, 2 Apr 2018 19:13:41 +0000 (15:13 -0400)
committerBill Erickson <berickxx@gmail.com>
Thu, 21 Mar 2019 19:46:23 +0000 (15:46 -0400)
Roll back an old customization that stripped periods from search index
values.  Removing the periods is unnecessary becuase the values are also
normalized by the search_normalize() function, which replaces periods
with spaces.

Adds an additional keyword index using the 'replace' normalizer to strip
periods from indexed values (without replacing them with a space) so
that we can additionaly index (for example) "g.i. joe" as "gi joe".

Signed-off-by: Bill Erickson <berickxx@gmail.com>
KCLS/sql/schema/deploy/search-index-keep-periods.sql [new file with mode: 0644]
KCLS/sql/schema/revert/search-index-keep-periods.sql [new file with mode: 0644]
KCLS/sql/schema/sqitch.plan

diff --git a/KCLS/sql/schema/deploy/search-index-keep-periods.sql b/KCLS/sql/schema/deploy/search-index-keep-periods.sql
new file mode 100644 (file)
index 0000000..fbe1d19
--- /dev/null
@@ -0,0 +1,230 @@
+-- Deploy kcls-evergreen:search-index-keep-periods to pg
+-- requires: vand-import-edit-date-fix
+
+BEGIN;
+
+INSERT INTO config.metabib_field
+    (field_class, name, label, xpath, weight, format, search_field, browse_field)
+VALUES (
+    'keyword',
+    'keyword_strip_periods',
+    'General Keywords With Periods Removed',
+    $$//mods32:mods/*[not(local-name()='originInfo')]$$, -- for vim */
+    5, 'mods32', TRUE, FALSE
+);  
+
+-- call replace('.', '') before search_normalize on keyword index
+INSERT INTO config.metabib_field_index_norm_map (field, norm, pos, params)
+VALUES ( 
+    (SELECT id FROM config.metabib_field WHERE name = 'keyword_strip_periods'),
+    12, -- replace
+    0,
+    $$[".",""]$$
+), ( 
+    (SELECT id FROM config.metabib_field WHERE name = 'keyword_strip_periods'),
+    2, -- split_date_range
+    1,
+    NULL
+), ( 
+    (SELECT id FROM config.metabib_field WHERE name = 'keyword_strip_periods'),
+    19, -- search_normalize
+    1,
+    NULL
+);
+
+
+CREATE OR REPLACE FUNCTION biblio.extract_metabib_field_entry ( rid BIGINT, default_joiner TEXT ) RETURNS SETOF metabib.field_entry_template AS $func$
+DECLARE
+    bib     biblio.record_entry%ROWTYPE;
+    idx     config.metabib_field%ROWTYPE;
+    xfrm        config.xml_transform%ROWTYPE;
+    prev_xfrm   TEXT;
+    transformed_xml TEXT;
+    xml_node    TEXT;
+    xml_node_list   TEXT[];
+    facet_text  TEXT;
+    browse_text TEXT;
+    sort_value  TEXT;
+    raw_text    TEXT;
+    curr_text   TEXT;
+    joiner      TEXT := default_joiner; -- XXX will index defs supply a joiner?
+    authority_text TEXT;
+    authority_link BIGINT;
+    output_row  metabib.field_entry_template%ROWTYPE;
+BEGIN
+
+    -- Start out with no field-use bools set
+    output_row.browse_field = FALSE;
+    output_row.facet_field = FALSE;
+    output_row.search_field = FALSE;
+
+    -- Get the record
+    SELECT INTO bib * FROM biblio.record_entry WHERE id = rid;
+
+    -- Loop over the indexing entries
+    FOR idx IN SELECT * FROM config.metabib_field ORDER BY format LOOP
+
+        joiner := COALESCE(idx.joiner, default_joiner);
+
+        SELECT INTO xfrm * from config.xml_transform WHERE name = idx.format;
+
+        -- See if we can skip the XSLT ... it's expensive
+        IF prev_xfrm IS NULL OR prev_xfrm <> xfrm.name THEN
+            -- Can't skip the transform
+            IF xfrm.xslt <> '---' THEN
+                transformed_xml := oils_xslt_process(bib.marc,xfrm.xslt);
+            ELSE
+                transformed_xml := bib.marc;
+            END IF;
+
+            prev_xfrm := xfrm.name;
+        END IF;
+
+        xml_node_list := oils_xpath( idx.xpath, transformed_xml, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] );
+
+        raw_text := NULL;
+        FOR xml_node IN SELECT x FROM unnest(xml_node_list) AS x LOOP
+            CONTINUE WHEN xml_node !~ E'^\\s*<';
+
+            -- XXX much of this should be moved into oils_xpath_string...
+            curr_text := ARRAY_TO_STRING(evergreen.array_remove_item_by_value(evergreen.array_remove_item_by_value(
+                oils_xpath( '//text()', -- get the content of all the nodes within the main selected node
+                    REGEXP_REPLACE( xml_node, E'\\s+', ' ', 'g' ) -- Translate adjacent whitespace to a single space
+                ), ' '), ''),  -- throw away morally empty (bankrupt?) strings
+                joiner
+            );
+
+            CONTINUE WHEN curr_text IS NULL OR curr_text = '';
+
+            IF raw_text IS NOT NULL THEN
+                raw_text := raw_text || joiner;
+            END IF;
+
+            raw_text := COALESCE(raw_text,'') || curr_text;
+
+            -- autosuggest/metabib.browse_entry
+            IF idx.browse_field THEN
+
+                IF idx.browse_xpath IS NOT NULL AND idx.browse_xpath <> '' THEN
+                    browse_text := oils_xpath_string( idx.browse_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] );
+                ELSE
+                    browse_text := curr_text;
+                END IF;
+
+                IF idx.browse_sort_xpath IS NOT NULL AND
+                    idx.browse_sort_xpath <> '' THEN
+
+                    sort_value := oils_xpath_string(
+                        idx.browse_sort_xpath, xml_node, joiner,
+                        ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]]
+                    );
+                ELSE
+                    sort_value := browse_text;
+                END IF;
+
+                output_row.field_class = idx.field_class;
+                output_row.field = idx.id;
+                output_row.source = rid;
+                -- KCLS / Catalyst
+                -- output_row.value = BTRIM(REGEXP_REPLACE(browse_text, E'\\s+', ' ', 'g'));
+                -- outer regexp_replace keeps all '.' expect the last one.
+                -- inner regexp_replace removes all connecting whitespace and replaces it with a single space
+                output_row.value = BTRIM(REGEXP_REPLACE(REGEXP_REPLACE(browse_text, E'\\s+', ' ', 'g'), E'\\.$', '', 'g'));
+                -- /KCLS 
+
+                output_row.sort_value :=
+                    public.naco_normalize(sort_value);
+
+                output_row.authority := NULL;
+
+                IF idx.authority_xpath IS NOT NULL AND idx.authority_xpath <> '' THEN
+                    authority_text := oils_xpath_string(
+                        idx.authority_xpath, xml_node, joiner,
+                        ARRAY[
+                            ARRAY[xfrm.prefix, xfrm.namespace_uri],
+                            ARRAY['xlink','http://www.w3.org/1999/xlink']
+                        ]
+                    );
+
+                    IF authority_text ~ '^\d+$' THEN
+                        authority_link := authority_text::BIGINT;
+                        PERFORM * FROM authority.record_entry WHERE id = authority_link;
+                        IF FOUND THEN
+                            output_row.authority := authority_link;
+                        END IF;
+                    END IF;
+
+                END IF;
+
+                output_row.browse_field = TRUE;
+                -- Returning browse rows with search_field = true for search+browse
+                -- configs allows us to retain granularity of being able to search
+                -- browse fields with "starts with" type operators (for example, for
+                -- titles of songs in music albums)
+                IF idx.search_field THEN
+                    output_row.search_field = TRUE;
+                END IF;
+                RETURN NEXT output_row;
+                output_row.browse_field = FALSE;
+                output_row.search_field = FALSE;
+                output_row.sort_value := NULL;
+            END IF;
+
+            -- insert raw node text for faceting
+            IF idx.facet_field THEN
+
+                IF idx.facet_xpath IS NOT NULL AND idx.facet_xpath <> '' THEN
+                    facet_text := oils_xpath_string( idx.facet_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] );
+                ELSE
+                    facet_text := curr_text;
+                END IF;
+
+                output_row.field_class = idx.field_class;
+                output_row.field = -1 * idx.id;
+                output_row.source = rid;
+                output_row.value = BTRIM(REGEXP_REPLACE(facet_text, E'\\s+', ' ', 'g'));
+
+                output_row.facet_field = TRUE;
+                RETURN NEXT output_row;
+                output_row.facet_field = FALSE;
+            END IF;
+
+        END LOOP;
+
+        CONTINUE WHEN raw_text IS NULL OR raw_text = '';
+
+        -- insert combined node text for searching
+        IF idx.search_field THEN
+            -- KCLS
+            -- TODO: move these mods into configs / search normalizers
+            IF idx.field_class = 'identifier' AND idx.name = 'bibcn' THEN
+                output_row.field_class = 'call_number';
+                output_row.browse_field = TRUE;
+                output_row.sort_value = public.naco_normalize_keep_decimal(raw_text,'');
+                output_row.value = raw_text;
+            ELSE
+                output_row.field_class = idx.field_class;
+                output_row.value = BTRIM(REGEXP_REPLACE(raw_text, E'\\s+', ' ', 'g'));
+            END IF;
+            -- /KCLS
+
+            output_row.field = idx.id;
+            output_row.source = rid;
+            -- KCLS -- value is set above
+            -- output_row.value = BTRIM(REGEXP_REPLACE(raw_text, E'\\s+', ' ', 'g'));
+            -- /KCLS
+
+            output_row.search_field = TRUE;
+            RETURN NEXT output_row;
+            output_row.search_field = FALSE;
+        END IF;
+
+    END LOOP;
+
+END;
+
+$func$ LANGUAGE PLPGSQL;
+
+
+
+COMMIT;
diff --git a/KCLS/sql/schema/revert/search-index-keep-periods.sql b/KCLS/sql/schema/revert/search-index-keep-periods.sql
new file mode 100644 (file)
index 0000000..94e5b35
--- /dev/null
@@ -0,0 +1,204 @@
+-- Revert kcls-evergreen:search-index-keep-periods from pg
+
+BEGIN;
+
+-- remove 'replace' normalizer
+DELETE FROM config.metabib_field_index_norm_map WHERE field = 
+    (SELECT id FROM config.metabib_field WHERE name = 'keyword_strip_periods');
+
+DELETE FROM config.metabib_field WHERE name = 'keyword_strip_periods';
+
+CREATE OR REPLACE FUNCTION biblio.extract_metabib_field_entry ( rid BIGINT, default_joiner TEXT ) RETURNS SETOF metabib.field_entry_template AS $func$
+DECLARE
+    bib     biblio.record_entry%ROWTYPE;
+    idx     config.metabib_field%ROWTYPE;
+    xfrm        config.xml_transform%ROWTYPE;
+    prev_xfrm   TEXT;
+    transformed_xml TEXT;
+    xml_node    TEXT;
+    xml_node_list   TEXT[];
+    facet_text  TEXT;
+    browse_text TEXT;
+    sort_value  TEXT;
+    raw_text    TEXT;
+    curr_text   TEXT;
+    joiner      TEXT := default_joiner; -- XXX will index defs supply a joiner?
+    authority_text TEXT;
+    authority_link BIGINT;
+    output_row  metabib.field_entry_template%ROWTYPE;
+BEGIN
+
+    -- Start out with no field-use bools set
+    output_row.browse_field = FALSE;
+    output_row.facet_field = FALSE;
+    output_row.search_field = FALSE;
+
+    -- Get the record
+    SELECT INTO bib * FROM biblio.record_entry WHERE id = rid;
+
+    -- Loop over the indexing entries
+    FOR idx IN SELECT * FROM config.metabib_field ORDER BY format LOOP
+
+        joiner := COALESCE(idx.joiner, default_joiner);
+
+        SELECT INTO xfrm * from config.xml_transform WHERE name = idx.format;
+
+        -- See if we can skip the XSLT ... it's expensive
+        IF prev_xfrm IS NULL OR prev_xfrm <> xfrm.name THEN
+            -- Can't skip the transform
+            IF xfrm.xslt <> '---' THEN
+                transformed_xml := oils_xslt_process(bib.marc,xfrm.xslt);
+            ELSE
+                transformed_xml := bib.marc;
+            END IF;
+
+            prev_xfrm := xfrm.name;
+        END IF;
+
+        xml_node_list := oils_xpath( idx.xpath, transformed_xml, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] );
+
+        raw_text := NULL;
+        FOR xml_node IN SELECT x FROM unnest(xml_node_list) AS x LOOP
+            CONTINUE WHEN xml_node !~ E'^\\s*<';
+
+            -- XXX much of this should be moved into oils_xpath_string...
+            curr_text := ARRAY_TO_STRING(evergreen.array_remove_item_by_value(evergreen.array_remove_item_by_value(
+                oils_xpath( '//text()', -- get the content of all the nodes within the main selected node
+                    REGEXP_REPLACE( xml_node, E'\\s+', ' ', 'g' ) -- Translate adjacent whitespace to a single space
+                ), ' '), ''),  -- throw away morally empty (bankrupt?) strings
+                joiner
+            );
+
+            CONTINUE WHEN curr_text IS NULL OR curr_text = '';
+
+            IF raw_text IS NOT NULL THEN
+                raw_text := raw_text || joiner;
+            END IF;
+
+            raw_text := COALESCE(raw_text,'') || curr_text;
+
+            -- autosuggest/metabib.browse_entry
+            IF idx.browse_field THEN
+
+                IF idx.browse_xpath IS NOT NULL AND idx.browse_xpath <> '' THEN
+                    browse_text := oils_xpath_string( idx.browse_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] );
+                ELSE
+                    browse_text := curr_text;
+                END IF;
+
+                IF idx.browse_sort_xpath IS NOT NULL AND
+                    idx.browse_sort_xpath <> '' THEN
+
+                    sort_value := oils_xpath_string(
+                        idx.browse_sort_xpath, xml_node, joiner,
+                        ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]]
+                    );
+                ELSE
+                    sort_value := browse_text;
+                END IF;
+
+                output_row.field_class = idx.field_class;
+                output_row.field = idx.id;
+                output_row.source = rid;
+                -- KCLS / Catalyst
+                -- output_row.value = BTRIM(REGEXP_REPLACE(browse_text, E'\\s+', ' ', 'g'));
+                -- outer regexp_replace keeps all '.' expect the last one.
+                -- inner regexp_replace removes all connecting whitespace and replaces it with a single space
+                output_row.value = BTRIM(REGEXP_REPLACE(REGEXP_REPLACE(browse_text, E'\\s+', ' ', 'g'), E'\\.$', '', 'g'));
+                -- /KCLS 
+
+                output_row.sort_value :=
+                    public.naco_normalize(sort_value);
+
+                output_row.authority := NULL;
+
+                IF idx.authority_xpath IS NOT NULL AND idx.authority_xpath <> '' THEN
+                    authority_text := oils_xpath_string(
+                        idx.authority_xpath, xml_node, joiner,
+                        ARRAY[
+                            ARRAY[xfrm.prefix, xfrm.namespace_uri],
+                            ARRAY['xlink','http://www.w3.org/1999/xlink']
+                        ]
+                    );
+
+                    IF authority_text ~ '^\d+$' THEN
+                        authority_link := authority_text::BIGINT;
+                        PERFORM * FROM authority.record_entry WHERE id = authority_link;
+                        IF FOUND THEN
+                            output_row.authority := authority_link;
+                        END IF;
+                    END IF;
+
+                END IF;
+
+                output_row.browse_field = TRUE;
+                -- Returning browse rows with search_field = true for search+browse
+                -- configs allows us to retain granularity of being able to search
+                -- browse fields with "starts with" type operators (for example, for
+                -- titles of songs in music albums)
+                IF idx.search_field THEN
+                    output_row.search_field = TRUE;
+                END IF;
+                RETURN NEXT output_row;
+                output_row.browse_field = FALSE;
+                output_row.search_field = FALSE;
+                output_row.sort_value := NULL;
+            END IF;
+
+            -- insert raw node text for faceting
+            IF idx.facet_field THEN
+
+                IF idx.facet_xpath IS NOT NULL AND idx.facet_xpath <> '' THEN
+                    facet_text := oils_xpath_string( idx.facet_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] );
+                ELSE
+                    facet_text := curr_text;
+                END IF;
+
+                output_row.field_class = idx.field_class;
+                output_row.field = -1 * idx.id;
+                output_row.source = rid;
+                output_row.value = BTRIM(REGEXP_REPLACE(facet_text, E'\\s+', ' ', 'g'));
+
+                output_row.facet_field = TRUE;
+                RETURN NEXT output_row;
+                output_row.facet_field = FALSE;
+            END IF;
+
+        END LOOP;
+
+        CONTINUE WHEN raw_text IS NULL OR raw_text = '';
+
+        -- insert combined node text for searching
+        IF idx.search_field THEN
+            -- KCLS
+            IF idx.field_class = 'identifier' AND idx.name = 'bibcn' THEN
+                output_row.field_class = 'call_number';
+                output_row.browse_field = TRUE;
+                output_row.sort_value = public.naco_normalize_keep_decimal(raw_text,'');
+                output_row.value = raw_text;
+            ELSE
+                output_row.field_class = idx.field_class;
+                output_row.value = BTRIM(REGEXP_REPLACE(
+                    REGEXP_REPLACE(raw_text, E'\\s+', ' ', 'g'), E'\\.+', '', 'g'));
+            END IF;
+            -- /KCLS
+
+            output_row.field = idx.id;
+            output_row.source = rid;
+            -- KCLS -- value is set above
+            -- output_row.value = BTRIM(REGEXP_REPLACE(raw_text, E'\\s+', ' ', 'g'));
+            -- /KCLS
+
+            output_row.search_field = TRUE;
+            RETURN NEXT output_row;
+            output_row.search_field = FALSE;
+        END IF;
+
+    END LOOP;
+
+END;
+
+$func$ LANGUAGE PLPGSQL;
+
+
+COMMIT;
index e079a91..40b672d 100644 (file)
@@ -67,3 +67,4 @@ search-strip-apos-recover [hold-queue-pos-materialized] 2018-03-30T16:15:21Z Bil
 vand-import-edit-date-fix [hold-queue-pos-materialized] 2018-03-30T14:50:53Z Bill Erickson,,, <berick@kcls-dev-local> # Recover lost vandelay edit date changes
 vand-preserve-all-fields [vand-import-edit-date-fix] 2018-04-09T20:15:27Z Bill Erickson,,, <berick@kcls-dev-local> # Bug in vandelay.replace_fields()
 track-bib-merges [vand-import-edit-date-fix] 2018-04-03T15:04:00Z Bill Erickson,,, <berick@kcls-dev-local> # Track bib merges
+search-index-keep-periods [vand-import-edit-date-fix] 2018-04-02T19:10:31Z Bill Erickson,,, <berick@kcls-dev-local> # Avoid stripping periods in search indexes