From a0d7fbd80dfab2451920e097d4bfec5ef2088acf Mon Sep 17 00:00:00 2001 From: Lebbeous Fogle-Weekley Date: Mon, 19 Dec 2011 17:34:10 -0500 Subject: [PATCH] DB layer for autosuggest: seems to mostly work! Would like Mike's review generally, and if nothing else, we probably should exempty just a couple of fields from suggest_search = true? Signed-off-by: Lebbeous Fogle-Weekley --- Open-ILS/src/sql/Pg/002.schema.config.sql | 2 + Open-ILS/src/sql/Pg/030.schema.metabib.sql | 167 ++++++++++--- .../sql/Pg/upgrade/YYYY.schema.bib_autosuggest.sql | 258 +++++++++++++++++++++ 3 files changed, 392 insertions(+), 35 deletions(-) create mode 100644 Open-ILS/src/sql/Pg/upgrade/YYYY.schema.bib_autosuggest.sql diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql index 8eba8b3104..40a4a5b310 100644 --- a/Open-ILS/src/sql/Pg/002.schema.config.sql +++ b/Open-ILS/src/sql/Pg/002.schema.config.sql @@ -200,6 +200,8 @@ CREATE TABLE config.metabib_field ( format TEXT NOT NULL REFERENCES config.xml_transform (name) DEFAULT 'mods33', search_field BOOL NOT NULL DEFAULT TRUE, facet_field BOOL NOT NULL DEFAULT FALSE, + suggest_field BOOL NOT NULL DEFAULT TRUE, + suggest_xpath TEXT, facet_xpath TEXT ); COMMENT ON TABLE config.metabib_field IS $$ diff --git a/Open-ILS/src/sql/Pg/030.schema.metabib.sql b/Open-ILS/src/sql/Pg/030.schema.metabib.sql index 0ea948ea78..37e23be591 100644 --- a/Open-ILS/src/sql/Pg/030.schema.metabib.sql +++ b/Open-ILS/src/sql/Pg/030.schema.metabib.sql @@ -301,10 +301,14 @@ CREATE INDEX metabib_metarecord_source_map_source_record_idx ON metabib.metareco CREATE TYPE metabib.field_entry_template AS ( field_class TEXT, field INT, + facet_field BOOL, + search_field BOOL, + suggest_field BOOL, source BIGINT, value TEXT ); + CREATE OR REPLACE FUNCTION biblio.extract_metabib_field_entry ( rid BIGINT, default_joiner TEXT ) RETURNS SETOF metabib.field_entry_template AS $func$ DECLARE bib biblio.record_entry%ROWTYPE; @@ -315,6 +319,7 @@ DECLARE xml_node TEXT; xml_node_list TEXT[]; facet_text TEXT; + suggest_text TEXT; raw_text TEXT; curr_text TEXT; joiner TEXT := default_joiner; -- XXX will index defs supply a joiner? @@ -327,6 +332,10 @@ BEGIN -- Loop over the indexing entries FOR idx IN SELECT * FROM config.metabib_field ORDER BY format LOOP + output_row.facet_field = idx.facet_field; + output_row.suggest_field = idx.suggest_field; + output_row.search_field = idx.search_field; + SELECT INTO xfrm * from config.xml_transform WHERE name = idx.format; -- See if we can skip the XSLT ... it's expensive @@ -372,6 +381,23 @@ BEGIN raw_text := COALESCE(raw_text,'') || curr_text; + -- autosuggest/metabib.browse_entry + IF idx.suggest_field THEN + + IF idx.suggest_xpath IS NOT NULL AND idx.suggest_xpath <> '' THEN + suggest_text := oils_xpath_string( idx.suggest_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + ELSE + suggest_text := curr_text; + END IF; + + output_row.field_class = idx.field_class; + output_row.field = idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(suggest_text, E'\\s+', ' ', 'g')); + + RETURN NEXT output_row; + END IF; + -- insert raw node text for faceting IF idx.facet_field THEN @@ -406,6 +432,64 @@ BEGIN END LOOP; END; + +$func$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION metabib.reingest_metabib_field_entries( bib_id BIGINT ) RETURNS VOID AS $func$ +DECLARE + fclass RECORD; + ind_data metabib.field_entry_template%ROWTYPE; + mbe_row metabib.browse_entry%ROWTYPE; + mbe_id BIGINT; +BEGIN + PERFORM * FROM config.internal_flag WHERE name = 'ingest.assume_inserts_only' AND enabled; + IF NOT FOUND THEN + FOR fclass IN SELECT * FROM config.metabib_class LOOP + -- RAISE NOTICE 'Emptying out %', fclass.name; + EXECUTE $$DELETE FROM metabib.$$ || fclass.name || $$_field_entry WHERE source = $$ || bib_id; + END LOOP; + DELETE FROM metabib.facet_entry WHERE source = bib_id; + DELETE FROM metabib.browse_entry_def_map WHERE source = bib_id; + END IF; + + FOR ind_data IN SELECT * FROM biblio.extract_metabib_field_entry( bib_id ) LOOP + IF ind_data.field < 0 THEN + ind_data.field = -1 * ind_data.field; + END IF; + + IF ind_data.facet_field THEN + INSERT INTO metabib.facet_entry (field, source, value) + VALUES (ind_data.field, ind_data.source, ind_data.value); + END IF; + + IF ind_data.suggest_field THEN + SELECT INTO mbe_row * FROM metabib.browse_entry WHERE value = ind_data.value; + IF FOUND THEN + mbe_id := mbe_row.id; + ELSE + INSERT INTO metabib.browse_entry (value) VALUES + (metabib.browse_normalize(ind_data.value, ind_data.field)); + mbe_id := CURRVAL('metabib.browse_entry_id_seq'::REGCLASS); + END IF; + + INSERT INTO metabib.browse_entry_def_map (entry, def, source) + VALUES (mbe_id, ind_data.field, ind_data.source); + END IF; + + IF ind_data.search_field THEN + EXECUTE $$ + INSERT INTO metabib.$$ || ind_data.field_class || $$_field_entry (field, source, value) + VALUES ($$ || + quote_literal(ind_data.field) || $$, $$ || + quote_literal(ind_data.source) || $$, $$ || + quote_literal(ind_data.value) || + $$);$$; + END IF; + + END LOOP; + + RETURN; +END; $func$ LANGUAGE PLPGSQL; -- default to a space joiner @@ -736,41 +820,6 @@ BEGIN END; $func$ LANGUAGE PLPGSQL; -CREATE OR REPLACE FUNCTION metabib.reingest_metabib_field_entries( bib_id BIGINT ) RETURNS VOID AS $func$ -DECLARE - fclass RECORD; - ind_data metabib.field_entry_template%ROWTYPE; -BEGIN - PERFORM * FROM config.internal_flag WHERE name = 'ingest.assume_inserts_only' AND enabled; - IF NOT FOUND THEN - FOR fclass IN SELECT * FROM config.metabib_class LOOP - -- RAISE NOTICE 'Emptying out %', fclass.name; - EXECUTE $$DELETE FROM metabib.$$ || fclass.name || $$_field_entry WHERE source = $$ || bib_id; - END LOOP; - DELETE FROM metabib.facet_entry WHERE source = bib_id; - END IF; - - FOR ind_data IN SELECT * FROM biblio.extract_metabib_field_entry( bib_id ) LOOP - IF ind_data.field < 0 THEN - ind_data.field = -1 * ind_data.field; - INSERT INTO metabib.facet_entry (field, source, value) - VALUES (ind_data.field, ind_data.source, ind_data.value); - ELSE - EXECUTE $$ - INSERT INTO metabib.$$ || ind_data.field_class || $$_field_entry (field, source, value) - VALUES ($$ || - quote_literal(ind_data.field) || $$, $$ || - quote_literal(ind_data.source) || $$, $$ || - quote_literal(ind_data.value) || - $$);$$; - END IF; - - END LOOP; - - RETURN; -END; -$func$ LANGUAGE PLPGSQL; - CREATE OR REPLACE FUNCTION biblio.extract_located_uris( bib_id BIGINT, marcxml TEXT, editor_id INT ) RETURNS VOID AS $func$ DECLARE uris TEXT[]; @@ -1109,4 +1158,52 @@ BEGIN END; $func$ LANGUAGE PLPGSQL; +CREATE TABLE metabib.browse_entry ( + id BIGSERIAL PRIMARY KEY, + value TEXT unique, + index_vector tsvector +); + +CREATE TABLE metabib.browse_entry_def_map ( + id BIGSERIAL PRIMARY KEY, + entry BIGINT REFERENCES metabib.browse_entry (id), + def INT REFERENCES config.metabib_field (id), + source BIGINT REFERENCES biblio.record_entry (id) +); + +CREATE OR REPLACE FUNCTION metabib.browse_normalize(facet_text TEXT, mapped_field INT) RETURNS TEXT AS $$ +DECLARE + normalizer RECORD; +BEGIN + + FOR normalizer IN + SELECT n.func AS func, + n.param_count AS param_count, + m.params AS params + FROM config.index_normalizer n + JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id) + WHERE m.field = mapped_field AND m.pos < 0 + ORDER BY m.pos LOOP + + EXECUTE 'SELECT ' || normalizer.func || '(' || + quote_literal( facet_text ) || + CASE + WHEN normalizer.param_count > 0 + THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') + ELSE '' + END || + ')' INTO facet_text; + + END LOOP; + + RETURN facet_text; +END; + +$$ LANGUAGE PLPGSQL; + +CREATE TRIGGER metabib_browse_entry_fti_trigger + BEFORE INSERT OR UPDATE ON metabib.browse_entry + FOR EACH ROW EXECUTE PROCEDURE oils_tsearch2('default'); + + COMMIT; diff --git a/Open-ILS/src/sql/Pg/upgrade/YYYY.schema.bib_autosuggest.sql b/Open-ILS/src/sql/Pg/upgrade/YYYY.schema.bib_autosuggest.sql new file mode 100644 index 0000000000..502e88e1b9 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/YYYY.schema.bib_autosuggest.sql @@ -0,0 +1,258 @@ +BEGIN; + +SELECT evergreen.upgrade_deps_block_check('YYYY', :eg_version); + +CREATE TABLE metabib.browse_entry ( + id BIGSERIAL PRIMARY KEY, + value TEXT unique, + index_vector tsvector +); + +CREATE TABLE metabib.browse_entry_def_map ( + id BIGSERIAL PRIMARY KEY, + entry BIGINT REFERENCES metabib.browse_entry (id), + def INT REFERENCES config.metabib_field (id), + source BIGINT REFERENCES biblio.record_entry (id) +); + +ALTER TABLE config.metabib_field ADD COLUMN suggest_field BOOLEAN DEFAULT TRUE NOT NULL; +ALTER TABLE config.metabib_field ADD COLUMN suggest_xpath TEXT; + +CREATE OR REPLACE FUNCTION metabib.browse_normalize(facet_text TEXT, mapped_field INT) RETURNS TEXT AS $$ +DECLARE + normalizer RECORD; +BEGIN + + FOR normalizer IN + SELECT n.func AS func, + n.param_count AS param_count, + m.params AS params + FROM config.index_normalizer n + JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id) + WHERE m.field = mapped_field AND m.pos < 0 + ORDER BY m.pos LOOP + + EXECUTE 'SELECT ' || normalizer.func || '(' || + quote_literal( facet_text ) || + CASE + WHEN normalizer.param_count > 0 + THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') + ELSE '' + END || + ')' INTO facet_text; + + END LOOP; + + RETURN facet_text; +END; + +$$ LANGUAGE PLPGSQL; + +CREATE TRIGGER metabib_browse_entry_fti_trigger + BEFORE INSERT OR UPDATE ON metabib.browse_entry + FOR EACH ROW EXECUTE PROCEDURE oils_tsearch2('default'); + +DROP FUNCTION biblio.extract_metabib_field_entry(bigint, text); +DROP FUNCTION biblio.extract_metabib_field_entry(bigint); + +DROP TYPE metabib.field_entry_template; +CREATE TYPE metabib.field_entry_template AS ( + field_class TEXT, + field INT, + facet_field BOOL, + search_field BOOL, + suggest_field BOOL, + source BIGINT, + value TEXT +); + + +CREATE OR REPLACE FUNCTION biblio.extract_metabib_field_entry ( rid BIGINT, default_joiner TEXT ) RETURNS SETOF metabib.field_entry_template AS $func$ +DECLARE + bib biblio.record_entry%ROWTYPE; + idx config.metabib_field%ROWTYPE; + xfrm config.xml_transform%ROWTYPE; + prev_xfrm TEXT; + transformed_xml TEXT; + xml_node TEXT; + xml_node_list TEXT[]; + facet_text TEXT; + suggest_text TEXT; + raw_text TEXT; + curr_text TEXT; + joiner TEXT := default_joiner; -- XXX will index defs supply a joiner? + output_row metabib.field_entry_template%ROWTYPE; +BEGIN + + -- Get the record + SELECT INTO bib * FROM biblio.record_entry WHERE id = rid; + + -- Loop over the indexing entries + FOR idx IN SELECT * FROM config.metabib_field ORDER BY format LOOP + + output_row.facet_field = idx.facet_field; + output_row.suggest_field = idx.suggest_field; + output_row.search_field = idx.search_field; + + SELECT INTO xfrm * from config.xml_transform WHERE name = idx.format; + + -- See if we can skip the XSLT ... it's expensive + IF prev_xfrm IS NULL OR prev_xfrm <> xfrm.name THEN + -- Can't skip the transform + IF xfrm.xslt <> '---' THEN + transformed_xml := oils_xslt_process(bib.marc,xfrm.xslt); + ELSE + transformed_xml := bib.marc; + END IF; + + prev_xfrm := xfrm.name; + END IF; + + xml_node_list := oils_xpath( idx.xpath, transformed_xml, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + + raw_text := NULL; + FOR xml_node IN SELECT x FROM unnest(xml_node_list) AS x LOOP + CONTINUE WHEN xml_node !~ E'^\\s*<'; + + curr_text := ARRAY_TO_STRING( + oils_xpath( '//text()', + REGEXP_REPLACE( -- This escapes all &s not followed by "amp;". Data ise returned from oils_xpath (above) in UTF-8, not entity encoded + REGEXP_REPLACE( -- This escapes embeded [^<]+)(<)([^>]+<)$re$, + E'\\1<\\3', + 'g' + ), + '&(?!amp;)', + '&', + 'g' + ) + ), + ' ' + ); + + CONTINUE WHEN curr_text IS NULL OR curr_text = ''; + + IF raw_text IS NOT NULL THEN + raw_text := raw_text || joiner; + END IF; + + raw_text := COALESCE(raw_text,'') || curr_text; + + -- autosuggest/metabib.browse_entry + IF idx.suggest_field THEN + + IF idx.suggest_xpath IS NOT NULL AND idx.suggest_xpath <> '' THEN + suggest_text := oils_xpath_string( idx.suggest_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + ELSE + suggest_text := curr_text; + END IF; + + output_row.field_class = idx.field_class; + output_row.field = idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(suggest_text, E'\\s+', ' ', 'g')); + + RETURN NEXT output_row; + END IF; + + -- insert raw node text for faceting + IF idx.facet_field THEN + + IF idx.facet_xpath IS NOT NULL AND idx.facet_xpath <> '' THEN + facet_text := oils_xpath_string( idx.facet_xpath, xml_node, joiner, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + ELSE + facet_text := curr_text; + END IF; + + output_row.field_class = idx.field_class; + output_row.field = -1 * idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(facet_text, E'\\s+', ' ', 'g')); + + RETURN NEXT output_row; + END IF; + + END LOOP; + + CONTINUE WHEN raw_text IS NULL OR raw_text = ''; + + -- insert combined node text for searching + IF idx.search_field THEN + output_row.field_class = idx.field_class; + output_row.field = idx.id; + output_row.source = rid; + output_row.value = BTRIM(REGEXP_REPLACE(raw_text, E'\\s+', ' ', 'g')); + + RETURN NEXT output_row; + END IF; + + END LOOP; + +END; +$func$ LANGUAGE PLPGSQL; + +-- default to a space joiner +CREATE OR REPLACE FUNCTION biblio.extract_metabib_field_entry ( BIGINT ) RETURNS SETOF metabib.field_entry_template AS $func$ + SELECT * FROM biblio.extract_metabib_field_entry($1, ' '); + $func$ LANGUAGE SQL; + + +CREATE OR REPLACE FUNCTION metabib.reingest_metabib_field_entries( bib_id BIGINT ) RETURNS VOID AS $func$ +DECLARE + fclass RECORD; + ind_data metabib.field_entry_template%ROWTYPE; + mbe_row metabib.browse_entry%ROWTYPE; + mbe_id BIGINT; +BEGIN + PERFORM * FROM config.internal_flag WHERE name = 'ingest.assume_inserts_only' AND enabled; + IF NOT FOUND THEN + FOR fclass IN SELECT * FROM config.metabib_class LOOP + -- RAISE NOTICE 'Emptying out %', fclass.name; + EXECUTE $$DELETE FROM metabib.$$ || fclass.name || $$_field_entry WHERE source = $$ || bib_id; + END LOOP; + DELETE FROM metabib.facet_entry WHERE source = bib_id; + DELETE FROM metabib.browse_entry_def_map WHERE source = bib_id; + END IF; + + FOR ind_data IN SELECT * FROM biblio.extract_metabib_field_entry( bib_id ) LOOP + IF ind_data.field < 0 THEN + ind_data.field = -1 * ind_data.field; + END IF; + + IF ind_data.facet_field THEN + INSERT INTO metabib.facet_entry (field, source, value) + VALUES (ind_data.field, ind_data.source, ind_data.value); + END IF; + + IF ind_data.suggest_field THEN + SELECT INTO mbe_row * FROM metabib.browse_entry WHERE value = ind_data.value; + IF FOUND THEN + mbe_id := mbe_row.id; + ELSE + INSERT INTO metabib.browse_entry (value) VALUES + (metabib.browse_normalize(ind_data.value, ind_data.field)); + mbe_id := CURRVAL('metabib.browse_entry_id_seq'::REGCLASS); + END IF; + + INSERT INTO metabib.browse_entry_def_map (entry, def, source) + VALUES (mbe_id, ind_data.field, ind_data.source); + END IF; + + IF ind_data.search_field THEN + EXECUTE $$ + INSERT INTO metabib.$$ || ind_data.field_class || $$_field_entry (field, source, value) + VALUES ($$ || + quote_literal(ind_data.field) || $$, $$ || + quote_literal(ind_data.source) || $$, $$ || + quote_literal(ind_data.value) || + $$);$$; + END IF; + + END LOOP; + + RETURN; +END; +$func$ LANGUAGE PLPGSQL; + +COMMIT; -- 2.11.0