From: Mike Rylander Date: Mon, 24 Feb 2020 17:01:49 +0000 (-0500) Subject: LP#1864516: case-insensitive browse entry display value comparison X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=fe23d265f02bd4ef9a5bf61b2abca0898f74c7be;p=evergreen%2Ftadl.git LP#1864516: case-insensitive browse entry display value comparison Following up on bug 1350831, there remains one complaint relating to "bad data makes browse suboptimal" that can't actually be handled today by the existing Evergreen code or a pullrequested branch: case differences, particularly in titles. We should allow an Evergreen admin to decide on a field-by-field basis whether case should be considered when determining the uniqueness of a browse entry coming from a bibliographic record, effectively allowing case folding. Note that authority fields, being by definition the authorized value that should be used, do not support this case folding. However, if authority records are processed first, then bibliographic fields can fold /into/ those authority fields, achieving the desired result and, in fact, using the best possible case-preserving display value. Signed-off-by: Mike Rylander Signed-off-by: Elaine Hardy Signed-off-by: Chris Sharp --- diff --git a/Open-ILS/examples/fm_IDL.xml b/Open-ILS/examples/fm_IDL.xml index e03ebbbf61..fc6ea58759 100644 --- a/Open-ILS/examples/fm_IDL.xml +++ b/Open-ILS/examples/fm_IDL.xml @@ -3120,6 +3120,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql index 0c57c8dbf7..7d78804f36 100644 --- a/Open-ILS/src/sql/Pg/002.schema.config.sql +++ b/Open-ILS/src/sql/Pg/002.schema.config.sql @@ -208,6 +208,7 @@ CREATE TABLE config.metabib_field ( search_field BOOL NOT NULL DEFAULT TRUE, facet_field BOOL NOT NULL DEFAULT FALSE, browse_field BOOL NOT NULL DEFAULT TRUE, + browse_nocase BOOL NOT NULL DEFAULT FALSE, browse_xpath TEXT, browse_sort_xpath TEXT, facet_xpath TEXT, diff --git a/Open-ILS/src/sql/Pg/030.schema.metabib.sql b/Open-ILS/src/sql/Pg/030.schema.metabib.sql index 2cede48920..7cb4e8c729 100644 --- a/Open-ILS/src/sql/Pg/030.schema.metabib.sql +++ b/Open-ILS/src/sql/Pg/030.schema.metabib.sql @@ -739,7 +739,8 @@ CREATE TYPE metabib.field_entry_template AS ( source BIGINT, value TEXT, authority BIGINT, - sort_value TEXT + sort_value TEXT, + browse_nocase BOOL ); CREATE OR REPLACE FUNCTION biblio.extract_metabib_field_entry ( @@ -770,6 +771,7 @@ DECLARE BEGIN -- Start out with no field-use bools set + output_row.browse_nocase = FALSE; output_row.browse_field = FALSE; output_row.facet_field = FALSE; output_row.display_field = FALSE; @@ -829,6 +831,7 @@ BEGIN -- autosuggest/metabib.browse_entry IF idx.browse_field THEN + output_row.browse_nocase = idx.browse_nocase; IF idx.browse_xpath IS NOT NULL AND idx.browse_xpath <> '' THEN browse_text := oils_xpath_string( idx.browse_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); @@ -884,6 +887,7 @@ BEGIN output_row.search_field = TRUE; END IF; RETURN NEXT output_row; + output_row.browse_nocase = FALSE; output_row.browse_field = FALSE; output_row.search_field = FALSE; output_row.sort_value := NULL; @@ -1120,8 +1124,14 @@ BEGIN CONTINUE WHEN ind_data.sort_value IS NULL; value_prepped := metabib.browse_normalize(ind_data.value, ind_data.field); - SELECT INTO mbe_row * FROM metabib.browse_entry - WHERE value = value_prepped AND sort_value = ind_data.sort_value; + IF ind_data.browse_nocase THEN + SELECT INTO mbe_row * FROM metabib.browse_entry + WHERE evergreen.lowercase(value) = evergreen.lowercase(value_prepped) AND sort_value = ind_data.sort_value + ORDER BY sort_value, value LIMIT 1; -- gotta pick something, I guess + ELSE + SELECT INTO mbe_row * FROM metabib.browse_entry + WHERE value = value_prepped AND sort_value = ind_data.sort_value; + END IF; IF FOUND THEN mbe_id := mbe_row.id; diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.case-insensitive-browse-field-match.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.case-insensitive-browse-field-match.sql new file mode 100644 index 0000000000..de482c3196 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.case-insensitive-browse-field-match.sql @@ -0,0 +1,353 @@ +BEGIN; + +ALTER TYPE metabib.field_entry_template ADD ATTRIBUTE browse_nocase BOOL CASCADE; + +ALTER TABLE config.metabib_field ADD COLUMN browse_nocase BOOL NOT NULL DEFAULT FALSE; + +CREATE OR REPLACE FUNCTION biblio.extract_metabib_field_entry ( + rid BIGINT, + default_joiner TEXT, + field_types TEXT[], + only_fields INT[] +) RETURNS SETOF metabib.field_entry_template AS $func$ +DECLARE + bib biblio.record_entry%ROWTYPE; + idx config.metabib_field%ROWTYPE; + xfrm config.xml_transform%ROWTYPE; + prev_xfrm TEXT; + transformed_xml TEXT; + xml_node TEXT; + xml_node_list TEXT[]; + facet_text TEXT; + display_text TEXT; + browse_text TEXT; + sort_value TEXT; + raw_text TEXT; + curr_text TEXT; + joiner TEXT := default_joiner; -- XXX will index defs supply a joiner? + authority_text TEXT; + authority_link BIGINT; + output_row metabib.field_entry_template%ROWTYPE; + process_idx BOOL; +BEGIN + + -- Start out with no field-use bools set + output_row.browse_nocase = FALSE; + output_row.browse_field = FALSE; + output_row.facet_field = FALSE; + output_row.display_field = FALSE; + output_row.search_field = FALSE; + + -- Get the record + SELECT INTO bib * FROM biblio.record_entry WHERE id = rid; + + -- Loop over the indexing entries + FOR idx IN SELECT * FROM config.metabib_field WHERE id = ANY (only_fields) ORDER BY format LOOP + CONTINUE WHEN idx.xpath IS NULL OR idx.xpath = ''; -- pure virtual field + + process_idx := FALSE; + IF idx.display_field AND 'display' = ANY (field_types) THEN process_idx = TRUE; END IF; + IF idx.browse_field AND 'browse' = ANY (field_types) THEN process_idx = TRUE; END IF; + IF idx.search_field AND 'search' = ANY (field_types) THEN process_idx = TRUE; END IF; + IF idx.facet_field AND 'facet' = ANY (field_types) THEN process_idx = TRUE; END IF; + CONTINUE WHEN process_idx = FALSE; -- disabled for all types + + joiner := COALESCE(idx.joiner, default_joiner); + + SELECT INTO xfrm * from config.xml_transform WHERE name = idx.format; + + -- See if we can skip the XSLT ... it's expensive + IF prev_xfrm IS NULL OR prev_xfrm <> xfrm.name THEN + -- Can't skip the transform + IF xfrm.xslt <> '---' THEN + transformed_xml := oils_xslt_process(bib.marc,xfrm.xslt); + ELSE + transformed_xml := bib.marc; + END IF; + + prev_xfrm := xfrm.name; + END IF; + + xml_node_list := oils_xpath( idx.xpath, transformed_xml, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + + raw_text := NULL; + FOR xml_node IN SELECT x FROM unnest(xml_node_list) AS x LOOP + CONTINUE WHEN xml_node !~ E'^\\s*<'; + + -- XXX much of this should be moved into oils_xpath_string... + curr_text := ARRAY_TO_STRING(evergreen.array_remove_item_by_value(evergreen.array_remove_item_by_value( + oils_xpath( '//text()', -- get the content of all the nodes within the main selected node + REGEXP_REPLACE( xml_node, E'\\s+', ' ', 'g' ) -- Translate adjacent whitespace to a single space + ), ' '), ''), -- throw away morally empty (bankrupt?) strings + joiner + ); + + CONTINUE WHEN curr_text IS NULL OR curr_text = ''; + + IF raw_text IS NOT NULL THEN + raw_text := raw_text || joiner; + END IF; + + raw_text := COALESCE(raw_text,'') || curr_text; + + -- autosuggest/metabib.browse_entry + IF idx.browse_field THEN + output_row.browse_nocase = idx.browse_nocase; + + IF idx.browse_xpath IS NOT NULL AND idx.browse_xpath <> '' THEN + browse_text := oils_xpath_string( idx.browse_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + ELSE + browse_text := curr_text; + END IF; + + IF idx.browse_sort_xpath IS NOT NULL AND + idx.browse_sort_xpath <> '' THEN + + sort_value := oils_xpath_string( + idx.browse_sort_xpath, xml_node, joiner, + ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] + ); + ELSE + sort_value := browse_text; + END IF; + + output_row.field_class = idx.field_class; + output_row.field = idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(browse_text, E'\\s+', ' ', 'g')); + output_row.sort_value := + public.naco_normalize(sort_value); + + output_row.authority := NULL; + + IF idx.authority_xpath IS NOT NULL AND idx.authority_xpath <> '' THEN + authority_text := oils_xpath_string( + idx.authority_xpath, xml_node, joiner, + ARRAY[ + ARRAY[xfrm.prefix, xfrm.namespace_uri], + ARRAY['xlink','http://www.w3.org/1999/xlink'] + ] + ); + + IF authority_text ~ '^\d+$' THEN + authority_link := authority_text::BIGINT; + PERFORM * FROM authority.record_entry WHERE id = authority_link; + IF FOUND THEN + output_row.authority := authority_link; + END IF; + END IF; + + END IF; + + output_row.browse_field = TRUE; + -- Returning browse rows with search_field = true for search+browse + -- configs allows us to retain granularity of being able to search + -- browse fields with "starts with" type operators (for example, for + -- titles of songs in music albums) + IF idx.search_field THEN + output_row.search_field = TRUE; + END IF; + RETURN NEXT output_row; + output_row.browse_nocase = FALSE; + output_row.browse_field = FALSE; + output_row.search_field = FALSE; + output_row.sort_value := NULL; + END IF; + + -- insert raw node text for faceting + IF idx.facet_field THEN + + IF idx.facet_xpath IS NOT NULL AND idx.facet_xpath <> '' THEN + facet_text := oils_xpath_string( idx.facet_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + ELSE + facet_text := curr_text; + END IF; + + output_row.field_class = idx.field_class; + output_row.field = -1 * idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(facet_text, E'\\s+', ' ', 'g')); + + output_row.facet_field = TRUE; + RETURN NEXT output_row; + output_row.facet_field = FALSE; + END IF; + + -- insert raw node text for display + IF idx.display_field THEN + + IF idx.display_xpath IS NOT NULL AND idx.display_xpath <> '' THEN + display_text := oils_xpath_string( idx.display_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + ELSE + display_text := curr_text; + END IF; + + output_row.field_class = idx.field_class; + output_row.field = -1 * idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(display_text, E'\\s+', ' ', 'g')); + + output_row.display_field = TRUE; + RETURN NEXT output_row; + output_row.display_field = FALSE; + END IF; + + END LOOP; + + CONTINUE WHEN raw_text IS NULL OR raw_text = ''; + + -- insert combined node text for searching + IF idx.search_field THEN + output_row.field_class = idx.field_class; + output_row.field = idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(raw_text, E'\\s+', ' ', 'g')); + + output_row.search_field = TRUE; + RETURN NEXT output_row; + output_row.search_field = FALSE; + END IF; + + END LOOP; + +END; +$func$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION metabib.reingest_metabib_field_entries( + bib_id BIGINT, + skip_facet BOOL DEFAULT FALSE, + skip_display BOOL DEFAULT FALSE, + skip_browse BOOL DEFAULT FALSE, + skip_search BOOL DEFAULT FALSE, + only_fields INT[] DEFAULT '{}'::INT[] +) RETURNS VOID AS $func$ +DECLARE + fclass RECORD; + ind_data metabib.field_entry_template%ROWTYPE; + mbe_row metabib.browse_entry%ROWTYPE; + mbe_id BIGINT; + b_skip_facet BOOL; + b_skip_display BOOL; + b_skip_browse BOOL; + b_skip_search BOOL; + value_prepped TEXT; + field_list INT[] := only_fields; + field_types TEXT[] := '{}'::TEXT[]; +BEGIN + + IF field_list = '{}'::INT[] THEN + SELECT ARRAY_AGG(id) INTO field_list FROM config.metabib_field; + END IF; + + SELECT COALESCE(NULLIF(skip_facet, FALSE), EXISTS (SELECT enabled FROM config.internal_flag WHERE name = 'ingest.skip_facet_indexing' AND enabled)) INTO b_skip_facet; + SELECT COALESCE(NULLIF(skip_display, FALSE), EXISTS (SELECT enabled FROM config.internal_flag WHERE name = 'ingest.skip_display_indexing' AND enabled)) INTO b_skip_display; + SELECT COALESCE(NULLIF(skip_browse, FALSE), EXISTS (SELECT enabled FROM config.internal_flag WHERE name = 'ingest.skip_browse_indexing' AND enabled)) INTO b_skip_browse; + SELECT COALESCE(NULLIF(skip_search, FALSE), EXISTS (SELECT enabled FROM config.internal_flag WHERE name = 'ingest.skip_search_indexing' AND enabled)) INTO b_skip_search; + + IF NOT b_skip_facet THEN field_types := field_types || '{facet}'; END IF; + IF NOT b_skip_display THEN field_types := field_types || '{display}'; END IF; + IF NOT b_skip_browse THEN field_types := field_types || '{browse}'; END IF; + IF NOT b_skip_search THEN field_types := field_types || '{search}'; END IF; + + PERFORM * FROM config.internal_flag WHERE name = 'ingest.assume_inserts_only' AND enabled; + IF NOT FOUND THEN + IF NOT b_skip_search THEN + FOR fclass IN SELECT * FROM config.metabib_class LOOP + -- RAISE NOTICE 'Emptying out %', fclass.name; + EXECUTE $$DELETE FROM metabib.$$ || fclass.name || $$_field_entry WHERE source = $$ || bib_id; + END LOOP; + END IF; + IF NOT b_skip_facet THEN + DELETE FROM metabib.facet_entry WHERE source = bib_id; + END IF; + IF NOT b_skip_display THEN + DELETE FROM metabib.display_entry WHERE source = bib_id; + END IF; + IF NOT b_skip_browse THEN + DELETE FROM metabib.browse_entry_def_map WHERE source = bib_id; + END IF; + END IF; + + FOR ind_data IN SELECT * FROM biblio.extract_metabib_field_entry( bib_id, ' ', field_types, field_list ) LOOP + + -- don't store what has been normalized away + CONTINUE WHEN ind_data.value IS NULL; + + IF ind_data.field < 0 THEN + ind_data.field = -1 * ind_data.field; + END IF; + + IF ind_data.facet_field AND NOT b_skip_facet THEN + INSERT INTO metabib.facet_entry (field, source, value) + VALUES (ind_data.field, ind_data.source, ind_data.value); + END IF; + + IF ind_data.display_field AND NOT b_skip_display THEN + INSERT INTO metabib.display_entry (field, source, value) + VALUES (ind_data.field, ind_data.source, ind_data.value); + END IF; + + + IF ind_data.browse_field AND NOT b_skip_browse THEN + -- A caveat about this SELECT: this should take care of replacing + -- old mbe rows when data changes, but not if normalization (by + -- which I mean specifically the output of + -- evergreen.oils_tsearch2()) changes. It may or may not be + -- expensive to add a comparison of index_vector to index_vector + -- to the WHERE clause below. + + CONTINUE WHEN ind_data.sort_value IS NULL; + + value_prepped := metabib.browse_normalize(ind_data.value, ind_data.field); + IF ind_data.browse_nocase THEN + SELECT INTO mbe_row * FROM metabib.browse_entry + WHERE evergreen.lowercase(value) = evergreen.lowercase(value_prepped) AND sort_value = ind_data.sort_value + ORDER BY sort_value, value LIMIT 1; -- gotta pick something, I guess + ELSE + SELECT INTO mbe_row * FROM metabib.browse_entry + WHERE value = value_prepped AND sort_value = ind_data.sort_value; + END IF; + + IF FOUND THEN + mbe_id := mbe_row.id; + ELSE + INSERT INTO metabib.browse_entry + ( value, sort_value ) VALUES + ( value_prepped, ind_data.sort_value ); + + mbe_id := CURRVAL('metabib.browse_entry_id_seq'::REGCLASS); + END IF; + + INSERT INTO metabib.browse_entry_def_map (entry, def, source, authority) + VALUES (mbe_id, ind_data.field, ind_data.source, ind_data.authority); + END IF; + + IF ind_data.search_field AND NOT b_skip_search THEN + -- Avoid inserting duplicate rows + EXECUTE 'SELECT 1 FROM metabib.' || ind_data.field_class || + '_field_entry WHERE field = $1 AND source = $2 AND value = $3' + INTO mbe_id USING ind_data.field, ind_data.source, ind_data.value; + -- RAISE NOTICE 'Search for an already matching row returned %', mbe_id; + IF mbe_id IS NULL THEN + EXECUTE $$ + INSERT INTO metabib.$$ || ind_data.field_class || $$_field_entry (field, source, value) + VALUES ($$ || + quote_literal(ind_data.field) || $$, $$ || + quote_literal(ind_data.source) || $$, $$ || + quote_literal(ind_data.value) || + $$);$$; + END IF; + END IF; + + END LOOP; + + IF NOT b_skip_search THEN + PERFORM metabib.update_combined_index_vectors(bib_id); + END IF; + + RETURN; +END; +$func$ LANGUAGE PLPGSQL; + +COMMIT; +