From c1495303dc627a6bd03a8e493b507b291d50e2d0 Mon Sep 17 00:00:00 2001 From: Bill Erickson Date: Mon, 2 Apr 2018 15:13:41 -0400 Subject: [PATCH] JBAS-1050 Avoid stripping periods from search indexes Roll back an old customization that stripped periods from search index values. Removing the periods is unnecessary becuase the values are also normalized by the search_normalize() function, which replaces periods with spaces. Adds an additional keyword index using the 'replace' normalizer to strip periods from indexed values (without replacing them with a space) so that we can additionaly index (for example) "g.i. joe" as "gi joe". Signed-off-by: Bill Erickson --- .../schema/deploy/search-index-keep-periods.sql | 230 +++++++++++++++++++++ .../schema/revert/search-index-keep-periods.sql | 204 ++++++++++++++++++ KCLS/sql/schema/sqitch.plan | 1 + 3 files changed, 435 insertions(+) create mode 100644 KCLS/sql/schema/deploy/search-index-keep-periods.sql create mode 100644 KCLS/sql/schema/revert/search-index-keep-periods.sql diff --git a/KCLS/sql/schema/deploy/search-index-keep-periods.sql b/KCLS/sql/schema/deploy/search-index-keep-periods.sql new file mode 100644 index 0000000000..fbe1d19bd7 --- /dev/null +++ b/KCLS/sql/schema/deploy/search-index-keep-periods.sql @@ -0,0 +1,230 @@ +-- Deploy kcls-evergreen:search-index-keep-periods to pg +-- requires: vand-import-edit-date-fix + +BEGIN; + +INSERT INTO config.metabib_field + (field_class, name, label, xpath, weight, format, search_field, browse_field) +VALUES ( + 'keyword', + 'keyword_strip_periods', + 'General Keywords With Periods Removed', + $$//mods32:mods/*[not(local-name()='originInfo')]$$, -- for vim */ + 5, 'mods32', TRUE, FALSE +); + +-- call replace('.', '') before search_normalize on keyword index +INSERT INTO config.metabib_field_index_norm_map (field, norm, pos, params) +VALUES ( + (SELECT id FROM config.metabib_field WHERE name = 'keyword_strip_periods'), + 12, -- replace + 0, + $$[".",""]$$ +), ( + (SELECT id FROM config.metabib_field WHERE name = 'keyword_strip_periods'), + 2, -- split_date_range + 1, + NULL +), ( + (SELECT id FROM config.metabib_field WHERE name = 'keyword_strip_periods'), + 19, -- search_normalize + 1, + NULL +); + + +CREATE OR REPLACE FUNCTION biblio.extract_metabib_field_entry ( rid BIGINT, default_joiner TEXT ) RETURNS SETOF metabib.field_entry_template AS $func$ +DECLARE + bib biblio.record_entry%ROWTYPE; + idx config.metabib_field%ROWTYPE; + xfrm config.xml_transform%ROWTYPE; + prev_xfrm TEXT; + transformed_xml TEXT; + xml_node TEXT; + xml_node_list TEXT[]; + facet_text TEXT; + browse_text TEXT; + sort_value TEXT; + raw_text TEXT; + curr_text TEXT; + joiner TEXT := default_joiner; -- XXX will index defs supply a joiner? + authority_text TEXT; + authority_link BIGINT; + output_row metabib.field_entry_template%ROWTYPE; +BEGIN + + -- Start out with no field-use bools set + output_row.browse_field = FALSE; + output_row.facet_field = FALSE; + output_row.search_field = FALSE; + + -- Get the record + SELECT INTO bib * FROM biblio.record_entry WHERE id = rid; + + -- Loop over the indexing entries + FOR idx IN SELECT * FROM config.metabib_field ORDER BY format LOOP + + joiner := COALESCE(idx.joiner, default_joiner); + + SELECT INTO xfrm * from config.xml_transform WHERE name = idx.format; + + -- See if we can skip the XSLT ... it's expensive + IF prev_xfrm IS NULL OR prev_xfrm <> xfrm.name THEN + -- Can't skip the transform + IF xfrm.xslt <> '---' THEN + transformed_xml := oils_xslt_process(bib.marc,xfrm.xslt); + ELSE + transformed_xml := bib.marc; + END IF; + + prev_xfrm := xfrm.name; + END IF; + + xml_node_list := oils_xpath( idx.xpath, transformed_xml, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + + raw_text := NULL; + FOR xml_node IN SELECT x FROM unnest(xml_node_list) AS x LOOP + CONTINUE WHEN xml_node !~ E'^\\s*<'; + + -- XXX much of this should be moved into oils_xpath_string... + curr_text := ARRAY_TO_STRING(evergreen.array_remove_item_by_value(evergreen.array_remove_item_by_value( + oils_xpath( '//text()', -- get the content of all the nodes within the main selected node + REGEXP_REPLACE( xml_node, E'\\s+', ' ', 'g' ) -- Translate adjacent whitespace to a single space + ), ' '), ''), -- throw away morally empty (bankrupt?) strings + joiner + ); + + CONTINUE WHEN curr_text IS NULL OR curr_text = ''; + + IF raw_text IS NOT NULL THEN + raw_text := raw_text || joiner; + END IF; + + raw_text := COALESCE(raw_text,'') || curr_text; + + -- autosuggest/metabib.browse_entry + IF idx.browse_field THEN + + IF idx.browse_xpath IS NOT NULL AND idx.browse_xpath <> '' THEN + browse_text := oils_xpath_string( idx.browse_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + ELSE + browse_text := curr_text; + END IF; + + IF idx.browse_sort_xpath IS NOT NULL AND + idx.browse_sort_xpath <> '' THEN + + sort_value := oils_xpath_string( + idx.browse_sort_xpath, xml_node, joiner, + ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] + ); + ELSE + sort_value := browse_text; + END IF; + + output_row.field_class = idx.field_class; + output_row.field = idx.id; + output_row.source = rid; + -- KCLS / Catalyst + -- output_row.value = BTRIM(REGEXP_REPLACE(browse_text, E'\\s+', ' ', 'g')); + -- outer regexp_replace keeps all '.' expect the last one. + -- inner regexp_replace removes all connecting whitespace and replaces it with a single space + output_row.value = BTRIM(REGEXP_REPLACE(REGEXP_REPLACE(browse_text, E'\\s+', ' ', 'g'), E'\\.$', '', 'g')); + -- /KCLS + + output_row.sort_value := + public.naco_normalize(sort_value); + + output_row.authority := NULL; + + IF idx.authority_xpath IS NOT NULL AND idx.authority_xpath <> '' THEN + authority_text := oils_xpath_string( + idx.authority_xpath, xml_node, joiner, + ARRAY[ + ARRAY[xfrm.prefix, xfrm.namespace_uri], + ARRAY['xlink','http://www.w3.org/1999/xlink'] + ] + ); + + IF authority_text ~ '^\d+$' THEN + authority_link := authority_text::BIGINT; + PERFORM * FROM authority.record_entry WHERE id = authority_link; + IF FOUND THEN + output_row.authority := authority_link; + END IF; + END IF; + + END IF; + + output_row.browse_field = TRUE; + -- Returning browse rows with search_field = true for search+browse + -- configs allows us to retain granularity of being able to search + -- browse fields with "starts with" type operators (for example, for + -- titles of songs in music albums) + IF idx.search_field THEN + output_row.search_field = TRUE; + END IF; + RETURN NEXT output_row; + output_row.browse_field = FALSE; + output_row.search_field = FALSE; + output_row.sort_value := NULL; + END IF; + + -- insert raw node text for faceting + IF idx.facet_field THEN + + IF idx.facet_xpath IS NOT NULL AND idx.facet_xpath <> '' THEN + facet_text := oils_xpath_string( idx.facet_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + ELSE + facet_text := curr_text; + END IF; + + output_row.field_class = idx.field_class; + output_row.field = -1 * idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(facet_text, E'\\s+', ' ', 'g')); + + output_row.facet_field = TRUE; + RETURN NEXT output_row; + output_row.facet_field = FALSE; + END IF; + + END LOOP; + + CONTINUE WHEN raw_text IS NULL OR raw_text = ''; + + -- insert combined node text for searching + IF idx.search_field THEN + -- KCLS + -- TODO: move these mods into configs / search normalizers + IF idx.field_class = 'identifier' AND idx.name = 'bibcn' THEN + output_row.field_class = 'call_number'; + output_row.browse_field = TRUE; + output_row.sort_value = public.naco_normalize_keep_decimal(raw_text,''); + output_row.value = raw_text; + ELSE + output_row.field_class = idx.field_class; + output_row.value = BTRIM(REGEXP_REPLACE(raw_text, E'\\s+', ' ', 'g')); + END IF; + -- /KCLS + + output_row.field = idx.id; + output_row.source = rid; + -- KCLS -- value is set above + -- output_row.value = BTRIM(REGEXP_REPLACE(raw_text, E'\\s+', ' ', 'g')); + -- /KCLS + + output_row.search_field = TRUE; + RETURN NEXT output_row; + output_row.search_field = FALSE; + END IF; + + END LOOP; + +END; + +$func$ LANGUAGE PLPGSQL; + + + +COMMIT; diff --git a/KCLS/sql/schema/revert/search-index-keep-periods.sql b/KCLS/sql/schema/revert/search-index-keep-periods.sql new file mode 100644 index 0000000000..94e5b35845 --- /dev/null +++ b/KCLS/sql/schema/revert/search-index-keep-periods.sql @@ -0,0 +1,204 @@ +-- Revert kcls-evergreen:search-index-keep-periods from pg + +BEGIN; + +-- remove 'replace' normalizer +DELETE FROM config.metabib_field_index_norm_map WHERE field = + (SELECT id FROM config.metabib_field WHERE name = 'keyword_strip_periods'); + +DELETE FROM config.metabib_field WHERE name = 'keyword_strip_periods'; + +CREATE OR REPLACE FUNCTION biblio.extract_metabib_field_entry ( rid BIGINT, default_joiner TEXT ) RETURNS SETOF metabib.field_entry_template AS $func$ +DECLARE + bib biblio.record_entry%ROWTYPE; + idx config.metabib_field%ROWTYPE; + xfrm config.xml_transform%ROWTYPE; + prev_xfrm TEXT; + transformed_xml TEXT; + xml_node TEXT; + xml_node_list TEXT[]; + facet_text TEXT; + browse_text TEXT; + sort_value TEXT; + raw_text TEXT; + curr_text TEXT; + joiner TEXT := default_joiner; -- XXX will index defs supply a joiner? + authority_text TEXT; + authority_link BIGINT; + output_row metabib.field_entry_template%ROWTYPE; +BEGIN + + -- Start out with no field-use bools set + output_row.browse_field = FALSE; + output_row.facet_field = FALSE; + output_row.search_field = FALSE; + + -- Get the record + SELECT INTO bib * FROM biblio.record_entry WHERE id = rid; + + -- Loop over the indexing entries + FOR idx IN SELECT * FROM config.metabib_field ORDER BY format LOOP + + joiner := COALESCE(idx.joiner, default_joiner); + + SELECT INTO xfrm * from config.xml_transform WHERE name = idx.format; + + -- See if we can skip the XSLT ... it's expensive + IF prev_xfrm IS NULL OR prev_xfrm <> xfrm.name THEN + -- Can't skip the transform + IF xfrm.xslt <> '---' THEN + transformed_xml := oils_xslt_process(bib.marc,xfrm.xslt); + ELSE + transformed_xml := bib.marc; + END IF; + + prev_xfrm := xfrm.name; + END IF; + + xml_node_list := oils_xpath( idx.xpath, transformed_xml, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + + raw_text := NULL; + FOR xml_node IN SELECT x FROM unnest(xml_node_list) AS x LOOP + CONTINUE WHEN xml_node !~ E'^\\s*<'; + + -- XXX much of this should be moved into oils_xpath_string... + curr_text := ARRAY_TO_STRING(evergreen.array_remove_item_by_value(evergreen.array_remove_item_by_value( + oils_xpath( '//text()', -- get the content of all the nodes within the main selected node + REGEXP_REPLACE( xml_node, E'\\s+', ' ', 'g' ) -- Translate adjacent whitespace to a single space + ), ' '), ''), -- throw away morally empty (bankrupt?) strings + joiner + ); + + CONTINUE WHEN curr_text IS NULL OR curr_text = ''; + + IF raw_text IS NOT NULL THEN + raw_text := raw_text || joiner; + END IF; + + raw_text := COALESCE(raw_text,'') || curr_text; + + -- autosuggest/metabib.browse_entry + IF idx.browse_field THEN + + IF idx.browse_xpath IS NOT NULL AND idx.browse_xpath <> '' THEN + browse_text := oils_xpath_string( idx.browse_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + ELSE + browse_text := curr_text; + END IF; + + IF idx.browse_sort_xpath IS NOT NULL AND + idx.browse_sort_xpath <> '' THEN + + sort_value := oils_xpath_string( + idx.browse_sort_xpath, xml_node, joiner, + ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] + ); + ELSE + sort_value := browse_text; + END IF; + + output_row.field_class = idx.field_class; + output_row.field = idx.id; + output_row.source = rid; + -- KCLS / Catalyst + -- output_row.value = BTRIM(REGEXP_REPLACE(browse_text, E'\\s+', ' ', 'g')); + -- outer regexp_replace keeps all '.' expect the last one. + -- inner regexp_replace removes all connecting whitespace and replaces it with a single space + output_row.value = BTRIM(REGEXP_REPLACE(REGEXP_REPLACE(browse_text, E'\\s+', ' ', 'g'), E'\\.$', '', 'g')); + -- /KCLS + + output_row.sort_value := + public.naco_normalize(sort_value); + + output_row.authority := NULL; + + IF idx.authority_xpath IS NOT NULL AND idx.authority_xpath <> '' THEN + authority_text := oils_xpath_string( + idx.authority_xpath, xml_node, joiner, + ARRAY[ + ARRAY[xfrm.prefix, xfrm.namespace_uri], + ARRAY['xlink','http://www.w3.org/1999/xlink'] + ] + ); + + IF authority_text ~ '^\d+$' THEN + authority_link := authority_text::BIGINT; + PERFORM * FROM authority.record_entry WHERE id = authority_link; + IF FOUND THEN + output_row.authority := authority_link; + END IF; + END IF; + + END IF; + + output_row.browse_field = TRUE; + -- Returning browse rows with search_field = true for search+browse + -- configs allows us to retain granularity of being able to search + -- browse fields with "starts with" type operators (for example, for + -- titles of songs in music albums) + IF idx.search_field THEN + output_row.search_field = TRUE; + END IF; + RETURN NEXT output_row; + output_row.browse_field = FALSE; + output_row.search_field = FALSE; + output_row.sort_value := NULL; + END IF; + + -- insert raw node text for faceting + IF idx.facet_field THEN + + IF idx.facet_xpath IS NOT NULL AND idx.facet_xpath <> '' THEN + facet_text := oils_xpath_string( idx.facet_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + ELSE + facet_text := curr_text; + END IF; + + output_row.field_class = idx.field_class; + output_row.field = -1 * idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(facet_text, E'\\s+', ' ', 'g')); + + output_row.facet_field = TRUE; + RETURN NEXT output_row; + output_row.facet_field = FALSE; + END IF; + + END LOOP; + + CONTINUE WHEN raw_text IS NULL OR raw_text = ''; + + -- insert combined node text for searching + IF idx.search_field THEN + -- KCLS + IF idx.field_class = 'identifier' AND idx.name = 'bibcn' THEN + output_row.field_class = 'call_number'; + output_row.browse_field = TRUE; + output_row.sort_value = public.naco_normalize_keep_decimal(raw_text,''); + output_row.value = raw_text; + ELSE + output_row.field_class = idx.field_class; + output_row.value = BTRIM(REGEXP_REPLACE( + REGEXP_REPLACE(raw_text, E'\\s+', ' ', 'g'), E'\\.+', '', 'g')); + END IF; + -- /KCLS + + output_row.field = idx.id; + output_row.source = rid; + -- KCLS -- value is set above + -- output_row.value = BTRIM(REGEXP_REPLACE(raw_text, E'\\s+', ' ', 'g')); + -- /KCLS + + output_row.search_field = TRUE; + RETURN NEXT output_row; + output_row.search_field = FALSE; + END IF; + + END LOOP; + +END; + +$func$ LANGUAGE PLPGSQL; + + +COMMIT; diff --git a/KCLS/sql/schema/sqitch.plan b/KCLS/sql/schema/sqitch.plan index e079a9160a..40b672df8c 100644 --- a/KCLS/sql/schema/sqitch.plan +++ b/KCLS/sql/schema/sqitch.plan @@ -67,3 +67,4 @@ search-strip-apos-recover [hold-queue-pos-materialized] 2018-03-30T16:15:21Z Bil vand-import-edit-date-fix [hold-queue-pos-materialized] 2018-03-30T14:50:53Z Bill Erickson,,, # Recover lost vandelay edit date changes vand-preserve-all-fields [vand-import-edit-date-fix] 2018-04-09T20:15:27Z Bill Erickson,,, # Bug in vandelay.replace_fields() track-bib-merges [vand-import-edit-date-fix] 2018-04-03T15:04:00Z Bill Erickson,,, # Track bib merges +search-index-keep-periods [vand-import-edit-date-fix] 2018-04-02T19:10:31Z Bill Erickson,,, # Avoid stripping periods in search indexes -- 2.11.0