From deaf96795d2a0793dee1425e2edca1949ba27fc9 Mon Sep 17 00:00:00 2001 From: Bob Wicksall Date: Fri, 10 Aug 2012 10:53:11 -0400 Subject: [PATCH] Reduce index bloat involving non-search_field values Rows in metabib.title_field_entry, subject_field_entry, series_field_entry, and author_field_entry are doubled or tripled due to bad logic in biblio.extract_metabib_field_entry. This results in these tables being 2 or more times their correct size. This was introduced in 2.2.0 when the logic for browse_field and facet_field were added to biblio.extract_metabib_field_entry. 2.1 is not affected. The duplicates are caused when biblio.extract_metabib_field_entry returns TRUE in the search_field column for all rows even if they should just be facet_field or browse_field after the first search_field value is returned. Signed-off-by: Mike Rylander Signed-off-by: Dan Scott --- Open-ILS/src/sql/Pg/030.schema.metabib.sql | 6 + .../XXXX.function.extract_metabib_field_entry.sql | 134 +++++++++++++++++++++ 2 files changed, 140 insertions(+) create mode 100644 Open-ILS/src/sql/Pg/upgrade/XXXX.function.extract_metabib_field_entry.sql diff --git a/Open-ILS/src/sql/Pg/030.schema.metabib.sql b/Open-ILS/src/sql/Pg/030.schema.metabib.sql index dbf3ba81b5..178e5a4dbd 100644 --- a/Open-ILS/src/sql/Pg/030.schema.metabib.sql +++ b/Open-ILS/src/sql/Pg/030.schema.metabib.sql @@ -349,6 +349,11 @@ DECLARE output_row metabib.field_entry_template%ROWTYPE; BEGIN + -- Start out with no field-use bools set + output_row.browse_field = FALSE; + output_row.facet_field = FALSE; + output_row.search_field = FALSE; + -- Get the record SELECT INTO bib * FROM biblio.record_entry WHERE id = rid; @@ -451,6 +456,7 @@ BEGIN output_row.search_field = TRUE; RETURN NEXT output_row; + output_row.search_field = FALSE; END IF; END LOOP; diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.function.extract_metabib_field_entry.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.function.extract_metabib_field_entry.sql new file mode 100644 index 0000000000..251796e451 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.function.extract_metabib_field_entry.sql @@ -0,0 +1,134 @@ + +CREATE OR REPLACE FUNCTION biblio.extract_metabib_field_entry(rid bigint, default_joiner text) RETURNS SETOF metabib.field_entry_template AS $func$ +DECLARE + bib biblio.record_entry%ROWTYPE; + idx config.metabib_field%ROWTYPE; + xfrm config.xml_transform%ROWTYPE; + prev_xfrm TEXT; + transformed_xml TEXT; + xml_node TEXT; + xml_node_list TEXT[]; + facet_text TEXT; + browse_text TEXT; + raw_text TEXT; + curr_text TEXT; + joiner TEXT := default_joiner; -- XXX will index defs supply a joiner? + output_row metabib.field_entry_template%ROWTYPE; +BEGIN + + -- Start out with no field-use bools set + output_row.browse_field = FALSE; + output_row.facet_field = FALSE; + output_row.search_field = FALSE; + + -- Get the record + SELECT INTO bib * FROM biblio.record_entry WHERE id = rid; + + -- Loop over the indexing entries + FOR idx IN SELECT * FROM config.metabib_field ORDER BY format LOOP + + SELECT INTO xfrm * from config.xml_transform WHERE name = idx.format; + + -- See if we can skip the XSLT ... it's expensive + IF prev_xfrm IS NULL OR prev_xfrm <> xfrm.name THEN + -- Can't skip the transform + IF xfrm.xslt <> '---' THEN + transformed_xml := oils_xslt_process(bib.marc,xfrm.xslt); + ELSE + transformed_xml := bib.marc; + END IF; + + prev_xfrm := xfrm.name; + END IF; + + xml_node_list := oils_xpath( idx.xpath, transformed_xml, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + + raw_text := NULL; + FOR xml_node IN SELECT x FROM unnest(xml_node_list) AS x LOOP + CONTINUE WHEN xml_node !~ E'^\\s*<'; + + curr_text := ARRAY_TO_STRING( + oils_xpath( '//text()', + REGEXP_REPLACE( -- This escapes all &s not followed by "amp;". Data ise returned from oils_xpath (above) in UTF-8, not entity encoded + REGEXP_REPLACE( -- This escapes embeded [^<]+)(<)([^>]+<)$re$, + E'\\1<\\3', + 'g' + ), + '&(?!amp;)', + '&', + 'g' + ) + ), + ' ' + ); + + CONTINUE WHEN curr_text IS NULL OR curr_text = ''; + + IF raw_text IS NOT NULL THEN + raw_text := raw_text || joiner; + END IF; + + raw_text := COALESCE(raw_text,'') || curr_text; + + -- autosuggest/metabib.browse_entry + IF idx.browse_field THEN + + IF idx.browse_xpath IS NOT NULL AND idx.browse_xpath <> '' THEN + browse_text := oils_xpath_string( idx.browse_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + ELSE + browse_text := curr_text; + END IF; + + output_row.field_class = idx.field_class; + output_row.field = idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(browse_text, E'\\s+', ' ', 'g')); + + output_row.browse_field = TRUE; + RETURN NEXT output_row; + output_row.browse_field = FALSE; + END IF; + + -- insert raw node text for faceting + IF idx.facet_field THEN + + IF idx.facet_xpath IS NOT NULL AND idx.facet_xpath <> '' THEN + facet_text := oils_xpath_string( idx.facet_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + ELSE + facet_text := curr_text; + END IF; + + output_row.field_class = idx.field_class; + output_row.field = -1 * idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(facet_text, E'\\s+', ' ', 'g')); + + output_row.facet_field = TRUE; + RETURN NEXT output_row; + output_row.facet_field = FALSE; + END IF; + + END LOOP; + + CONTINUE WHEN raw_text IS NULL OR raw_text = ''; + + -- insert combined node text for searching + IF idx.search_field THEN + output_row.field_class = idx.field_class; + output_row.field = idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(raw_text, E'\\s+', ' ', 'g')); + + output_row.search_field = TRUE; + RETURN NEXT output_row; + output_row.search_field = FALSE; + END IF; + + END LOOP; + +END; + +$func$ LANGUAGE PLPGSQL; + -- 2.11.0