From 57718f2c31aab68b1ae3ce9e4eb7572bcaf03c39 Mon Sep 17 00:00:00 2001 From: Dan Scott Date: Thu, 13 Jun 2013 11:00:30 -0400 Subject: [PATCH] Add upgrades for apostrophe and phrase search See user/dbs/another_metabib_ingest_fix in working for the details. Signed-off-by: Dan Scott --- .../conifer-2_4_step1-single-updates.sql | 132 ++++++++++++++++++++- .../sql/Pg/version-upgrade/conifer-2_4_step4c.sql | 41 ++++++- 2 files changed, 167 insertions(+), 6 deletions(-) diff --git a/Open-ILS/src/sql/Pg/version-upgrade/conifer-2_4_step1-single-updates.sql b/Open-ILS/src/sql/Pg/version-upgrade/conifer-2_4_step1-single-updates.sql index da118ae8cd..34eb5a6496 100644 --- a/Open-ILS/src/sql/Pg/version-upgrade/conifer-2_4_step1-single-updates.sql +++ b/Open-ILS/src/sql/Pg/version-upgrade/conifer-2_4_step1-single-updates.sql @@ -605,7 +605,137 @@ BEGIN END; $f$ LANGUAGE PLPGSQL; +CREATE OR REPLACE FUNCTION biblio.extract_metabib_field_entry ( rid BIGINT, default_joiner TEXT ) RETURNS SETOF metabib.field_entry_template AS $func$ +DECLARE + bib biblio.record_entry%ROWTYPE; + idx config.metabib_field%ROWTYPE; + xfrm config.xml_transform%ROWTYPE; + prev_xfrm TEXT; + transformed_xml TEXT; + xml_node TEXT; + xml_node_list TEXT[]; + facet_text TEXT; + browse_text TEXT; + raw_text TEXT; + curr_text TEXT; + joiner TEXT := default_joiner; -- XXX will index defs supply a joiner? + output_row metabib.field_entry_template%ROWTYPE; +BEGIN + + -- Start out with no field-use bools set + output_row.browse_field = FALSE; + output_row.facet_field = FALSE; + output_row.search_field = FALSE; + + -- Get the record + SELECT INTO bib * FROM biblio.record_entry WHERE id = rid; + + -- Loop over the indexing entries + FOR idx IN SELECT * FROM config.metabib_field ORDER BY format LOOP + + SELECT INTO xfrm * from config.xml_transform WHERE name = idx.format; + + -- See if we can skip the XSLT ... it's expensive + IF prev_xfrm IS NULL OR prev_xfrm <> xfrm.name THEN + -- Can't skip the transform + IF xfrm.xslt <> '---' THEN + transformed_xml := oils_xslt_process(bib.marc,xfrm.xslt); + ELSE + transformed_xml := bib.marc; + END IF; --- done! we hope... + prev_xfrm := xfrm.name; + END IF; + + xml_node_list := oils_xpath( idx.xpath, transformed_xml, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + + raw_text := NULL; + FOR xml_node IN SELECT x FROM unnest(xml_node_list) AS x LOOP + CONTINUE WHEN xml_node !~ E'^\\s*<'; + + curr_text := ARRAY_TO_STRING( + oils_xpath( '//text()', + REGEXP_REPLACE( -- This escapes all &s not followed by "amp;". Data ise returned from oils_xpath (above) in UTF-8, not entity encoded + REGEXP_REPLACE( -- This escapes embeded [^<]+)(<)([^>]+<)$re$, + E'\\1<\\3', + 'g' + ), + '&(?!amp;)', + '&', + 'g' + ) + ), + ' ' + ); + + CONTINUE WHEN curr_text IS NULL OR curr_text = ''; + + IF raw_text IS NOT NULL THEN + raw_text := raw_text || joiner; + END IF; + + raw_text := COALESCE(raw_text,'') || curr_text; + + -- autosuggest/metabib.browse_entry + IF idx.browse_field THEN + + IF idx.browse_xpath IS NOT NULL AND idx.browse_xpath <> '' THEN + browse_text := oils_xpath_string( idx.browse_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + ELSE + browse_text := curr_text; + END IF; + + output_row.field_class = idx.field_class; + output_row.field = idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(browse_text, E'\\s+', ' ', 'g')); + + output_row.browse_field = TRUE; + RETURN NEXT output_row; + output_row.browse_field = FALSE; + END IF; + + -- insert raw node text for faceting + IF idx.facet_field THEN + + IF idx.facet_xpath IS NOT NULL AND idx.facet_xpath <> '' THEN + facet_text := oils_xpath_string( idx.facet_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + ELSE + facet_text := curr_text; + END IF; + + output_row.field_class = idx.field_class; + output_row.field = -1 * idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(facet_text, E'\\s+', ' ', 'g')); + + output_row.facet_field = TRUE; + RETURN NEXT output_row; + output_row.facet_field = FALSE; + END IF; + + END LOOP; + + CONTINUE WHEN raw_text IS NULL OR raw_text = ''; + + -- insert combined node text for searching + IF idx.search_field THEN + output_row.field_class = idx.field_class; + output_row.field = idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(raw_text, E'\\s+', ' ', 'g')); + + output_row.search_field = TRUE; + RETURN NEXT output_row; + output_row.search_field = FALSE; + END IF; + + END LOOP; + +END; + +$func$ LANGUAGE PLPGSQL; COMMIT; diff --git a/Open-ILS/src/sql/Pg/version-upgrade/conifer-2_4_step4c.sql b/Open-ILS/src/sql/Pg/version-upgrade/conifer-2_4_step4c.sql index fcc701832e..50521b5c8d 100644 --- a/Open-ILS/src/sql/Pg/version-upgrade/conifer-2_4_step4c.sql +++ b/Open-ILS/src/sql/Pg/version-upgrade/conifer-2_4_step4c.sql @@ -786,6 +786,7 @@ $func$ LANGUAGE PLPGSQL; DROP FUNCTION IF EXISTS evergreen.oils_tsearch2() CASCADE; DROP FUNCTION IF EXISTS public.oils_tsearch2() CASCADE; +-- commit 1d05b0bc14b4e91 - bring back split brain indexing CREATE OR REPLACE FUNCTION public.oils_tsearch2 () RETURNS TRIGGER AS $$ DECLARE normalizer RECORD; @@ -794,6 +795,7 @@ DECLARE ts_rec RECORD; cur_weight "char"; BEGIN + value := NEW.value; NEW.index_vector = ''::tsvector; @@ -804,7 +806,7 @@ BEGIN m.params AS params FROM config.index_normalizer n JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id) - WHERE field = NEW.field + WHERE field = NEW.field AND m.pos < 0 ORDER BY m.pos LOOP EXECUTE 'SELECT ' || normalizer.func || '(' || quote_literal( value ) || @@ -816,8 +818,28 @@ BEGIN ')' INTO value; END LOOP; + NEW.value = value; - END IF; + + FOR normalizer IN + SELECT n.func AS func, + n.param_count AS param_count, + m.params AS params + FROM config.index_normalizer n + JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id) + WHERE field = NEW.field AND m.pos >= 0 + ORDER BY m.pos LOOP + EXECUTE 'SELECT ' || normalizer.func || '(' || + quote_literal( value ) || + CASE + WHEN normalizer.param_count > 0 + THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') + ELSE '' + END || + ')' INTO value; + + END LOOP; + END IF; IF TG_TABLE_NAME::TEXT ~ 'browse_entry$' THEN value := ARRAY_TO_STRING( @@ -1249,6 +1271,7 @@ INSERT INTO config.internal_flag (name) VALUES ('ingest.skip_browse_indexing'); INSERT INTO config.internal_flag (name) VALUES ('ingest.skip_search_indexing'); INSERT INTO config.internal_flag (name) VALUES ('ingest.skip_facet_indexing'); +-- commit 8d772605b30b - reduce index bloat but keep search granularity CREATE OR REPLACE FUNCTION metabib.reingest_metabib_field_entries( bib_id BIGINT, skip_facet BOOL DEFAULT FALSE, skip_browse BOOL DEFAULT FALSE, skip_search BOOL DEFAULT FALSE ) RETURNS VOID AS $func$ DECLARE fclass RECORD; @@ -1310,14 +1333,23 @@ BEGIN VALUES (mbe_id, ind_data.field, ind_data.source); END IF; - IF ind_data.search_field AND NOT b_skip_search THEN - EXECUTE $$ + -- Avoid inserting duplicate rows, but retain granularity of being + -- able to search browse fields with "starts with" type operators + -- (for example, for titles of songs in music albums) + IF (ind_data.search_field OR ind_data.browse_field) AND NOT b_skip_search THEN + EXECUTE 'SELECT 1 FROM metabib.' || ind_data.field_class || + '_field_entry WHERE field = $1 source = $2 AND value = $3' + INTO mbe_id USING ind_data.field, ind_data.source, ind_data.value; + -- RAISE NOTICE 'Search for an already matching row returned %', mbe_id; + IF mbe_id IS NULL THEN + EXECUTE $$ INSERT INTO metabib.$$ || ind_data.field_class || $$_field_entry (field, source, value) VALUES ($$ || quote_literal(ind_data.field) || $$, $$ || quote_literal(ind_data.source) || $$, $$ || quote_literal(ind_data.value) || $$);$$; + END IF; END IF; END LOOP; @@ -1330,7 +1362,6 @@ BEGIN END; $func$ LANGUAGE PLPGSQL; - SELECT evergreen.upgrade_deps_block_check('0763', :eg_version); INSERT INTO config.org_unit_setting_type ( -- 2.11.0