From 3c11e7f64ae95fc0fa1c74c466e111097a9b7177 Mon Sep 17 00:00:00 2001 From: miker Date: Fri, 26 Mar 2010 17:23:43 +0000 Subject: [PATCH] add support for an XPath expression to pull out just what is needed for facet values git-svn-id: svn://svn.open-ils.org/ILS/trunk@16009 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- Open-ILS/examples/fm_IDL.xml | 1 + .../Application/Storage/Driver/Pg/QueryParser.pm | 2 - Open-ILS/src/sql/Pg/002.schema.config.sql | 5 +- Open-ILS/src/sql/Pg/030.schema.metabib.sql | 11 +- Open-ILS/src/sql/Pg/950.data.seed-values.sql | 39 ++++--- .../src/sql/Pg/upgrade/0217.schema.facet_xpath.sql | 123 +++++++++++++++++++++ 6 files changed, 157 insertions(+), 24 deletions(-) create mode 100644 Open-ILS/src/sql/Pg/upgrade/0217.schema.facet_xpath.sql diff --git a/Open-ILS/examples/fm_IDL.xml b/Open-ILS/examples/fm_IDL.xml index 1fe8b27203..65464ae808 100644 --- a/Open-ILS/examples/fm_IDL.xml +++ b/Open-ILS/examples/fm_IDL.xml @@ -1514,6 +1514,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + diff --git a/Open-ILS/src/perlmods/OpenILS/Application/Storage/Driver/Pg/QueryParser.pm b/Open-ILS/src/perlmods/OpenILS/Application/Storage/Driver/Pg/QueryParser.pm index 1995c9da8f..c240d0a5ad 100644 --- a/Open-ILS/src/perlmods/OpenILS/Application/Storage/Driver/Pg/QueryParser.pm +++ b/Open-ILS/src/perlmods/OpenILS/Application/Storage/Driver/Pg/QueryParser.pm @@ -209,8 +209,6 @@ sub initialize_aliases { __PACKAGE__->add_search_field_alias( $cmsa->field_class, $c->{field}, $cmsa->alias ); } } - - return $self->relevance_bumps; } sub initialize_relevance_bumps { diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql index 75bb60bc6e..441ab66012 100644 --- a/Open-ILS/src/sql/Pg/002.schema.config.sql +++ b/Open-ILS/src/sql/Pg/002.schema.config.sql @@ -60,7 +60,7 @@ CREATE TABLE config.upgrade_log ( install_date TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() ); -INSERT INTO config.upgrade_log 
(version) VALUES ('0216'); -- miker +INSERT INTO config.upgrade_log (version) VALUES ('0217'); -- miker CREATE TABLE config.bib_source ( id SERIAL PRIMARY KEY, @@ -204,7 +204,8 @@ CREATE TABLE config.metabib_field ( weight INT NOT NULL DEFAULT 1, format TEXT NOT NULL DEFAULT 'mods33', search_field BOOL NOT NULL DEFAULT TRUE, - facet_field BOOL NOT NULL DEFAULT FALSE + facet_field BOOL NOT NULL DEFAULT FALSE, + facet_xpath TEXT ); COMMENT ON TABLE config.metabib_field IS $$ /* diff --git a/Open-ILS/src/sql/Pg/030.schema.metabib.sql b/Open-ILS/src/sql/Pg/030.schema.metabib.sql index 73a50db2ce..17ed7dcc61 100644 --- a/Open-ILS/src/sql/Pg/030.schema.metabib.sql +++ b/Open-ILS/src/sql/Pg/030.schema.metabib.sql @@ -240,6 +240,7 @@ DECLARE transformed_xml TEXT; xml_node TEXT; xml_node_list TEXT[]; + facet_text TEXT; raw_text TEXT; curr_text TEXT; joiner TEXT := default_joiner; -- XXX will index defs supply a joiner? @@ -300,10 +301,16 @@ BEGIN -- insert raw node text for faceting IF idx.facet_field THEN + IF idx.facet_xpath IS NOT NULL AND idx.facet_xpath <> '' THEN + facet_text := oils_xpath_string( idx.facet_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + ELSE + facet_text := curr_text; + END IF; + output_row.field_class = idx.field_class; - output_row.field = -1 *idx.id; + output_row.field = -1 * idx.id; output_row.source = rid; - output_row.value = BTRIM(REGEXP_REPLACE(curr_text, E'\\s+', ' ', 'g')); + output_row.value = BTRIM(REGEXP_REPLACE(facet_text, E'\\s+', ' ', 'g')); RETURN NEXT output_row; END IF; diff --git a/Open-ILS/src/sql/Pg/950.data.seed-values.sql b/Open-ILS/src/sql/Pg/950.data.seed-values.sql index 3d5abc9344..3ce194942c 100644 --- a/Open-ILS/src/sql/Pg/950.data.seed-values.sql +++ b/Open-ILS/src/sql/Pg/950.data.seed-values.sql @@ -17,8 +17,9 @@ INSERT INTO config.metabib_class ( name, label ) VALUES ( 'author', oils_i18n_ge INSERT INTO config.metabib_class ( name, label ) VALUES ( 'subject', oils_i18n_gettext('subject', 
'Subject', 'cmc', 'name') ); INSERT INTO config.metabib_class ( name, label ) VALUES ( 'series', oils_i18n_gettext('series', 'Series', 'cmc', 'name') ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (1, 'series', 'seriestitle', oils_i18n_gettext(1, 'Series Title', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:relatedItem[@type="series"]/mods32:titleInfo$$ ); +INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, facet_field ) VALUES + (1, 'series', 'seriestitle', oils_i18n_gettext(1, 'Series Title', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:relatedItem[@type="series"]/mods32:titleInfo$$, TRUE ); + INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES (2, 'title', 'abbreviated', oils_i18n_gettext(2, 'Abbreviated Title', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:titleInfo[mods32:title and (@type='abbreviated')]$$ ); INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES @@ -29,22 +30,24 @@ INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) (5, 'title', 'uniform', oils_i18n_gettext(5, 'Uniform Title', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:titleInfo[mods32:title and (@type='uniform')]$$ ); INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES (6, 'title', 'proper', oils_i18n_gettext(6, 'Title Proper', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:titleInfo[mods32:title and not (@type)]$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (7, 'author', 'corporate', oils_i18n_gettext(7, 'Corporate Author', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:name[@type='corporate']/mods32:namePart[../mods32:role/mods32:roleTerm[text()='creator']]$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (8, 'author', 'personal', oils_i18n_gettext(8, 'Personal 
Author', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:name[@type='personal']/mods32:namePart[../mods32:role/mods32:roleTerm[text()='creator']]$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (9, 'author', 'conference', oils_i18n_gettext(9, 'Conference Author', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:name[@type='conference']/mods32:namePart[../mods32:role/mods32:roleTerm[text()='creator']]$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (10, 'author', 'other', oils_i18n_gettext(10, 'Other Author', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:name[@type='personal']/mods32:namePart[not(../mods32:role)]$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (11, 'subject', 'geographic', oils_i18n_gettext(11, 'Geographic Subject', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:subject/mods32:geographic$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (12, 'subject', 'name', oils_i18n_gettext(12, 'Name Subject', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:subject/mods32:name$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (13, 'subject', 'temporal', oils_i18n_gettext(13, 'Temporal Subject', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:subject/mods32:temporal$$ ); -INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES - (14, 'subject', 'topic', oils_i18n_gettext(14, 'Topic Subject', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:subject/mods32:topic$$ ); + +INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, facet_xpath, facet_field ) VALUES + (7, 'author', 'corporate', oils_i18n_gettext(7, 'Corporate Author', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:name[@type='corporate' and mods32:role/mods32:roleTerm[text()='creator']]$$, 
$$*[local-name()='namePart']$$, TRUE );
+INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, facet_xpath, facet_field ) VALUES
+    (8, 'author', 'personal', oils_i18n_gettext(8, 'Personal Author', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:name[@type='personal' and mods32:role/mods32:roleTerm[text()='creator']]$$, $$*[local-name()='namePart']$$, TRUE );
+INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, facet_xpath, facet_field ) VALUES
+    (9, 'author', 'conference', oils_i18n_gettext(9, 'Conference Author', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:name[@type='conference' and mods32:role/mods32:roleTerm[text()='creator']]$$, $$*[local-name()='namePart']$$, TRUE );
+INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, facet_xpath, facet_field ) VALUES
+    (10, 'author', 'other', oils_i18n_gettext(10, 'Other Author', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:name[@type='personal' and not(mods32:role)]$$, $$*[local-name()='namePart']$$, TRUE );
+
+INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, facet_field ) VALUES
+    (11, 'subject', 'geographic', oils_i18n_gettext(11, 'Geographic Subject', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:subject/mods32:geographic$$, TRUE );
+INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, facet_xpath, facet_field ) VALUES
+    (12, 'subject', 'name', oils_i18n_gettext(12, 'Name Subject', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:subject/mods32:name$$, $$*[local-name()='namePart']$$, TRUE );
+INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, facet_field ) VALUES -- no facet_xpath given: facet_xpath stays NULL for this row
+    (13, 'subject', 'temporal', oils_i18n_gettext(13, 'Temporal Subject', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:subject/mods32:temporal$$, TRUE );
+INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, facet_field ) VALUES
+    (14, 'subject', 
'topic', oils_i18n_gettext(14, 'Topic Subject', 'cmf', 'label'), 'mods32', $$//mods32:mods/mods32:subject/mods32:topic$$, TRUE ); --INSERT INTO config.metabib_field ( id, field_class, name, format, xpath ) VALUES -- ( id, field_class, name, xpath ) VALUES ( 'subject', 'genre', 'mods32', $$//mods32:mods/mods32:genre$$ ); INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath ) VALUES diff --git a/Open-ILS/src/sql/Pg/upgrade/0217.schema.facet_xpath.sql b/Open-ILS/src/sql/Pg/upgrade/0217.schema.facet_xpath.sql new file mode 100644 index 0000000000..b5f5ebbefd --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/0217.schema.facet_xpath.sql @@ -0,0 +1,123 @@ + +BEGIN; + +INSERT INTO config.upgrade_log (version) VALUES ('0217'); --miker + +ALTER TABLE config.metabib_field ADD COLUMN facet_xpath TEXT; + +UPDATE config.metabib_field SET facet_field=TRUE WHERE id = 1; +UPDATE config.metabib_field SET xpath=$$//mods32:mods/mods32:name[@type='corporate' and mods32:role/mods32:roleTerm[text()='creator']]$$, facet_field=TRUE, facet_xpath=$$*[local-name()='namePart']$$ WHERE id = 7; +UPDATE config.metabib_field SET xpath=$$//mods32:mods/mods32:name[@type='personal' and mods32:role/mods32:roleTerm[text()='creator']]$$, facet_field=TRUE, facet_xpath=$$*[local-name()='namePart']$$ WHERE id = 8; +UPDATE config.metabib_field SET xpath=$$//mods32:mods/mods32:name[@type='conference' and mods32:role/mods32:roleTerm[text()='creator']]$$, facet_field=TRUE, facet_xpath=$$*[local-name()='namePart']$$ WHERE id = 9; +UPDATE config.metabib_field SET xpath=$$//mods32:mods/mods32:name[@type='personal' and not(mods32:role)]$$, facet_field=TRUE, facet_xpath=$$*[local-name()='namePart']$$ WHERE id = 10; + +UPDATE config.metabib_field SET facet_field=TRUE WHERE id = 11; +UPDATE config.metabib_field SET facet_field=TRUE , facet_xpath=$$*[local-name()='namePart']$$ WHERE id = 12; +UPDATE config.metabib_field SET facet_field=TRUE WHERE id = 13; +UPDATE config.metabib_field SET 
facet_field=TRUE WHERE id = 14;
+
+CREATE OR REPLACE FUNCTION biblio.extract_metabib_field_entry ( rid BIGINT, default_joiner TEXT ) RETURNS SETOF metabib.field_entry_template AS $func$ -- For bib record rid: emits one facet row per matched node (field = -idx.id) and one joined search row per index definition (field = idx.id)
+DECLARE
+    bib             biblio.record_entry%ROWTYPE;
+    idx             config.metabib_field%ROWTYPE;
+    xfrm            config.xml_transform%ROWTYPE;
+    prev_xfrm       TEXT;
+    transformed_xml TEXT;
+    xml_node        TEXT;
+    xml_node_list   TEXT[];
+    facet_text      TEXT;
+    raw_text        TEXT;
+    curr_text       TEXT;
+    joiner          TEXT := default_joiner; -- XXX will index defs supply a joiner?
+    output_row      metabib.field_entry_template%ROWTYPE;
+BEGIN
+
+    -- Get the record
+    SELECT INTO bib * FROM biblio.record_entry WHERE id = rid;
+
+    -- Loop over the indexing entries (ordered by format so consecutive entries can reuse one transform)
+    FOR idx IN SELECT * FROM config.metabib_field ORDER BY format LOOP
+
+        SELECT INTO xfrm * from config.xml_transform WHERE name = idx.format;
+
+        -- See if we can skip the XSLT ... it's expensive
+        IF prev_xfrm IS NULL OR prev_xfrm <> xfrm.name THEN
+            -- Can't skip the transform
+            IF xfrm.xslt <> '---' THEN
+                transformed_xml := oils_xslt_process(bib.marc,xfrm.xslt);
+            ELSE
+                transformed_xml := bib.marc;
+            END IF;
+
+            prev_xfrm := xfrm.name;
+        END IF;
+
+        xml_node_list := oils_xpath( idx.xpath, transformed_xml, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] );
+
+        raw_text := NULL;
+        FOR xml_node IN SELECT x FROM explode_array(xml_node_list) AS x LOOP
+            CONTINUE WHEN xml_node !~ E'^\\s*<'; -- skip results that do not start with markup
+
+            curr_text := ARRAY_TO_STRING(
+                oils_xpath( '//text()',
+                    REGEXP_REPLACE( -- This escapes all &s not followed by "amp;".  Data is returned from oils_xpath (above) in UTF-8, not entity encoded
+                        REGEXP_REPLACE( -- This escapes embedded <s
+                            xml_node,
+                            $re$(>[^<]+)(<)([^>]+<)$re$,
+                            E'\\1&lt;\\3',
+                            'g'
+                        ),
+                        '&(?!amp;)',
+                        '&amp;',
+                        'g'
+                    )
+                ),
+                ' '
+            );
+
+            CONTINUE WHEN curr_text IS NULL OR curr_text = '';
+
+            IF raw_text IS NOT NULL THEN
+                raw_text := raw_text || joiner;
+            END IF;
+
+            raw_text := COALESCE(raw_text,'') || curr_text;
+
+            -- insert raw node text for faceting
+            IF idx.facet_field THEN
+
+                IF idx.facet_xpath IS NOT NULL AND idx.facet_xpath <> '' THEN
+                    facet_text := oils_xpath_string( idx.facet_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] );
+                ELSE
+                    facet_text := curr_text; -- no facet_xpath configured: facet on the node's full text
+                END IF;
+
+                output_row.field_class = idx.field_class;
+                output_row.field = -1 * idx.id; -- negated id distinguishes facet rows from the search row below
+                output_row.source = rid;
+                output_row.value = BTRIM(REGEXP_REPLACE(facet_text, E'\\s+', ' ', 'g'));
+
+                RETURN NEXT output_row;
+            END IF;
+
+        END LOOP;
+
+        CONTINUE WHEN raw_text IS NULL OR raw_text = '';
+
+        -- insert combined node text for searching
+        IF idx.search_field THEN
+            output_row.field_class = idx.field_class;
+            output_row.field = idx.id;
+            output_row.source = rid;
+            output_row.value = BTRIM(REGEXP_REPLACE(raw_text, E'\\s+', ' ', 'g'));
+
+            RETURN NEXT output_row;
+        END IF;
+
+    END LOOP;
+
+END;
+$func$ LANGUAGE PLPGSQL;
+
+COMMIT;
+
-- 
2.11.0