From: Lebbeous Fogle-Weekley Date: Thu, 30 Jun 2011 18:31:00 +0000 (-0400) Subject: Correct facet entry extraction from bibs upon ingest X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=refs%2Fheads%2Fuser%2Fsenator%2Fextract-metabib-field-entry-fix;p=working%2FEvergreen.git Correct facet entry extraction from bibs upon ingest Previously, any normalizers that should apply to facets (defined by rows in config.metabib_field_index_norm_map) were forgotten. Signed-off-by: Lebbeous Fogle-Weekley --- diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql index 4c4b95ad2e..6128dbc8a6 100644 --- a/Open-ILS/src/sql/Pg/002.schema.config.sql +++ b/Open-ILS/src/sql/Pg/002.schema.config.sql @@ -86,7 +86,7 @@ CREATE TRIGGER no_overlapping_deps BEFORE INSERT OR UPDATE ON config.db_patch_dependencies FOR EACH ROW EXECUTE PROCEDURE evergreen.array_overlap_check ('deprecates'); -INSERT INTO config.upgrade_log (version, applied_to) VALUES ('0569', :eg_version); -- miker +INSERT INTO config.upgrade_log (version, applied_to) VALUES ('0xxx', :eg_version); -- senator CREATE TABLE config.bib_source ( id SERIAL PRIMARY KEY, diff --git a/Open-ILS/src/sql/Pg/030.schema.metabib.sql b/Open-ILS/src/sql/Pg/030.schema.metabib.sql index 17a238332d..24fa87d4b5 100644 --- a/Open-ILS/src/sql/Pg/030.schema.metabib.sql +++ b/Open-ILS/src/sql/Pg/030.schema.metabib.sql @@ -276,6 +276,7 @@ DECLARE facet_text TEXT; raw_text TEXT; curr_text TEXT; + normalizer RECORD; joiner TEXT := default_joiner; -- XXX will index defs supply a joiner? output_row metabib.field_entry_template%ROWTYPE; BEGIN @@ -331,7 +332,7 @@ BEGIN raw_text := COALESCE(raw_text,'') || curr_text; - -- insert raw node text for faceting + -- insert node text for faceting IF idx.facet_field THEN IF idx.facet_xpath IS NOT NULL AND idx.facet_xpath <> '' THEN @@ -340,6 +341,26 @@ BEGIN facet_text := curr_text; END IF; + -- apply any normalizers + FOR normalizer IN + SELECT n.func AS func, + n.param_count AS param_count, + m.params AS params + FROM config.index_normalizer n + JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id) + WHERE m.field = idx.id AND m.pos < 0 + ORDER BY m.pos LOOP + EXECUTE 'SELECT ' || normalizer.func || '(' || + quote_literal( facet_text ) || + CASE + WHEN normalizer.param_count > 0 + THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') + ELSE '' + END || + ')' INTO facet_text; + + END LOOP; + output_row.field_class = idx.field_class; output_row.field = -1 * idx.id; output_row.source = rid; diff --git a/Open-ILS/src/sql/Pg/upgrade/0xxx.schema.extract-metabib-field-entry-fix.sql b/Open-ILS/src/sql/Pg/upgrade/0xxx.schema.extract-metabib-field-entry-fix.sql new file mode 100644 index 0000000000..cf8195f0c9 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/0xxx.schema.extract-metabib-field-entry-fix.sql @@ -0,0 +1,129 @@ +BEGIN; + +SELECT evergreen.upgrade_deps_block_check('0xxx', :eg_version); + +CREATE OR REPLACE FUNCTION biblio.extract_metabib_field_entry ( rid BIGINT, default_joiner TEXT ) RETURNS SETOF metabib.field_entry_template AS $func$ +DECLARE + bib biblio.record_entry%ROWTYPE; + idx config.metabib_field%ROWTYPE; + xfrm config.xml_transform%ROWTYPE; + prev_xfrm TEXT; + transformed_xml TEXT; + xml_node TEXT; + xml_node_list TEXT[]; + facet_text TEXT; + raw_text TEXT; + curr_text TEXT; + normalizer RECORD; + joiner TEXT := default_joiner; -- XXX will index defs supply a joiner? + output_row metabib.field_entry_template%ROWTYPE; +BEGIN + + -- Get the record + SELECT INTO bib * FROM biblio.record_entry WHERE id = rid; + + -- Loop over the indexing entries + FOR idx IN SELECT * FROM config.metabib_field ORDER BY format LOOP + + SELECT INTO xfrm * from config.xml_transform WHERE name = idx.format; + + -- See if we can skip the XSLT ... it's expensive + IF prev_xfrm IS NULL OR prev_xfrm <> xfrm.name THEN + -- Can't skip the transform + IF xfrm.xslt <> '---' THEN + transformed_xml := oils_xslt_process(bib.marc,xfrm.xslt); + ELSE + transformed_xml := bib.marc; + END IF; + + prev_xfrm := xfrm.name; + END IF; + + xml_node_list := oils_xpath( idx.xpath, transformed_xml, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + + raw_text := NULL; + FOR xml_node IN SELECT x FROM unnest(xml_node_list) AS x LOOP + CONTINUE WHEN xml_node !~ E'^\\s*<'; + + curr_text := ARRAY_TO_STRING( + oils_xpath( '//text()', + REGEXP_REPLACE( -- This escapes all &s not followed by "amp;". Data ise returned from oils_xpath (above) in UTF-8, not entity encoded + REGEXP_REPLACE( -- This escapes embeded [^<]+)(<)([^>]+<)$re$, + E'\\1<\\3', + 'g' + ), + '&(?!amp;)', + '&', + 'g' + ) + ), + ' ' + ); + + CONTINUE WHEN curr_text IS NULL OR curr_text = ''; + + IF raw_text IS NOT NULL THEN + raw_text := raw_text || joiner; + END IF; + + raw_text := COALESCE(raw_text,'') || curr_text; + + -- insert node text for faceting + IF idx.facet_field THEN + + IF idx.facet_xpath IS NOT NULL AND idx.facet_xpath <> '' THEN + facet_text := oils_xpath_string( idx.facet_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + ELSE + facet_text := curr_text; + END IF; + + -- apply any normalizers + FOR normalizer IN + SELECT n.func AS func, + n.param_count AS param_count, + m.params AS params + FROM config.index_normalizer n + JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id) + WHERE m.field = idx.id AND m.pos < 0 + ORDER BY m.pos LOOP + EXECUTE 'SELECT ' || normalizer.func || '(' || + quote_literal( facet_text ) || + CASE + WHEN normalizer.param_count > 0 + THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') + ELSE '' + END || + ')' INTO facet_text; + + END LOOP; + + output_row.field_class = idx.field_class; + output_row.field = -1 * idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(facet_text, E'\\s+', ' ', 'g')); + + RETURN NEXT output_row; + END IF; + + END LOOP; + + CONTINUE WHEN raw_text IS NULL OR raw_text = ''; + + -- insert combined node text for searching + IF idx.search_field THEN + output_row.field_class = idx.field_class; + output_row.field = idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(raw_text, E'\\s+', ' ', 'g')); + + RETURN NEXT output_row; + END IF; + + END LOOP; + +END; +$func$ LANGUAGE PLPGSQL; + +COMMIT;