From c7b99f8d90aa0b3a2be4fc609bfa83e5da261431 Mon Sep 17 00:00:00 2001 From: senator <lebbeous@esilibrary.com> Date: Fri, 22 Apr 2011 11:29:24 -0400 Subject: [PATCH] Replace vandelay.match_bib_record() with the new tree-y version --- Open-ILS/src/sql/Pg/012.schema.vandelay.sql | 276 +++++++++++++++++++++------- 1 file changed, 211 insertions(+), 65 deletions(-) diff --git a/Open-ILS/src/sql/Pg/012.schema.vandelay.sql b/Open-ILS/src/sql/Pg/012.schema.vandelay.sql index f64cc06d82..aee60a6f53 100644 --- a/Open-ILS/src/sql/Pg/012.schema.vandelay.sql +++ b/Open-ILS/src/sql/Pg/012.schema.vandelay.sql @@ -473,23 +473,216 @@ CREATE OR REPLACE FUNCTION vandelay.extract_rec_attrs ( xml TEXT ) RETURNS hstor SELECT vandelay.extract_rec_attrs( $1, (SELECT ARRAY_ACCUM(name) FROM config.record_attr_definition)); $_$ LANGUAGE SQL; -CREATE OR REPLACE FUNCTION vandelay.match_bib_record ( ) RETURNS TRIGGER AS $func$ +-- Everything between this comment and the beginning of the definition of +-- vandelay.match_bib_record() is strictly in service of that function. +CREATE TYPE vandelay.match_set_test_result AS (record BIGINT, quality INTEGER); + +CREATE OR REPLACE FUNCTION vandelay.match_set_test_marcxml( + match_set_id INTEGER, record_xml TEXT +) RETURNS SETOF vandelay.match_set_test_result AS $$ +DECLARE + tags_rstore HSTORE; + svf_rstore HSTORE; + coal TEXT; + joins TEXT; + query_ TEXT; + wq TEXT; + qvalue INTEGER; + rec RECORD; +BEGIN + tags_rstore := vandelay.flatten_marc_hstore(record_xml); + svf_rstore := vandelay.extract_rec_attrs(record_xml); + + CREATE TEMPORARY TABLE _vandelay_tmp_qrows (q INTEGER); + CREATE TEMPORARY TABLE _vandelay_tmp_jrows (j TEXT); + + -- generate the where clause and return that directly (into wq), and as + -- a side-effect, populate the _vandelay_tmp_[qj]rows tables. + wq := vandelay.get_expr_from_match_set(match_set_id); + + query_ := 'SELECT bre.id AS record, '; + + -- qrows table is for the quality bits we add to the SELECT clause + SELECT ARRAY_TO_STRING( + ARRAY_ACCUM('COALESCE(n' || q::TEXT || '.quality, 0)'), ' + ' + ) INTO coal FROM _vandelay_tmp_qrows; + + -- our query string so far is the SELECT clause and the inital FROM. + -- no JOINs yet nor the WHERE clause + query_ := query_ || coal || ' AS quality ' || E'\n' || + 'FROM biblio.record_entry bre '; + + -- jrows table is for the joins we must make (and the real text conditions) + SELECT ARRAY_TO_STRING(ARRAY_ACCUM(j), E'\n') INTO joins + FROM _vandelay_tmp_jrows; + + -- add those joins and the where clause to our query. + query_ := query_ || joins || E'\n' || 'WHERE ' || wq; + + -- this will return rows of record,quality + FOR rec IN EXECUTE query_ USING tags_rstore, svf_rstore LOOP + RETURN NEXT rec; + END LOOP; + + DROP TABLE _vandelay_tmp_qrows; + DROP TABLE _vandelay_tmp_jrows; + RETURN; +END; + +$$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION vandelay.flatten_marc_hstore( + record_xml TEXT +) RETURNS HSTORE AS $$ +BEGIN + RETURN (SELECT + HSTORE( + ARRAY_ACCUM(tag || (COALESCE(subfield, ''))), + ARRAY_ACCUM(value) + ) + FROM ( + SELECT tag, subfield, ARRAY_ACCUM(value)::TEXT AS value + FROM vandelay.flatten_marc(record_xml) + GROUP BY tag, subfield ORDER BY tag, subfield + ) subquery + ); +END; +$$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION vandelay.get_expr_from_match_set( + match_set_id INTEGER +) RETURNS TEXT AS $$ +DECLARE + root vandelay.match_set_point; +BEGIN + SELECT * INTO root FROM vandelay.match_set_point + WHERE parent IS NULL AND match_set = match_set_id; + + RETURN vandelay.get_expr_from_match_set_point(root); +END; +$$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION vandelay.get_expr_from_match_set_point( + node vandelay.match_set_point +) RETURNS TEXT AS $$ +DECLARE + q TEXT; + i INTEGER; + this_op TEXT; + children INTEGER[]; + child vandelay.match_set_point; +BEGIN + SELECT ARRAY_ACCUM(id) INTO children FROM vandelay.match_set_point + WHERE parent = node.id; + + IF ARRAY_LENGTH(children, 1) > 0 THEN + this_op := vandelay._get_expr_render_one(node); + q := '('; + i := 1; + WHILE children[i] IS NOT NULL LOOP + SELECT * INTO child FROM vandelay.match_set_point + WHERE id = children[i]; + IF i > 1 THEN + q := q || ' ' || this_op || ' '; + END IF; + i := i + 1; + q := q || vandelay.get_expr_from_match_set_point(child); + END LOOP; + q := q || ')'; + RETURN q; + ELSIF node.bool_op IS NULL THEN + PERFORM vandelay._get_expr_push_qrow(node); + PERFORM vandelay._get_expr_push_jrow(node); + RETURN vandelay._get_expr_render_one(node); + ELSE + RETURN ''; + END IF; +END; +$$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION vandelay._get_expr_push_qrow( + node vandelay.match_set_point +) RETURNS VOID AS $$ +DECLARE +BEGIN + INSERT INTO _vandelay_tmp_qrows (q) VALUES (node.id); +END; +$$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION vandelay._get_expr_push_jrow( + node vandelay.match_set_point +) RETURNS VOID AS $$ +DECLARE + jrow TEXT; + my_alias TEXT; + op TEXT; + tagkey TEXT; +BEGIN + IF node.negate THEN + op := '<>'; + ELSE + op := '='; + END IF; + + IF node.tag IS NOT NULL THEN + tagkey := node.tag; + IF node.subfield IS NOT NULL THEN + tagkey := tagkey || node.subfield; + END IF; + END IF; + + my_alias := 'n' || node.id::TEXT; + + jrow := 'LEFT JOIN (SELECT *, ' || node.quality || + ' AS quality FROM metabib.'; + IF node.tag IS NOT NULL THEN + jrow := jrow || 'full_rec) ' || my_alias || ' ON (' || + my_alias || '.record = bre.id AND ' || my_alias || '.tag = ''' || + node.tag || ''''; + IF node.subfield IS NOT NULL THEN + jrow := jrow || ' AND ' || my_alias || '.subfield = ''' || + node.subfield || ''''; + END IF; + jrow := jrow || ' AND (' || my_alias || '.value ' || op || + ' ANY(($1->''' || tagkey || ''')::TEXT[])))'; + ELSE -- svf + jrow := jrow || 'record_attr) ' || my_alias || ' ON (' || + my_alias || '.id = bre.id AND (' || + my_alias || '.attrs->''' || node.svf || + ''' ' || op || ' $2->''' || node.svf || '''))'; + END IF; + INSERT INTO _vandelay_tmp_jrows (j) VALUES (jrow); +END; +$$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION vandelay._get_expr_render_one( + node vandelay.match_set_point +) RETURNS TEXT AS $$ +DECLARE + s TEXT; +BEGIN + IF node.bool_op IS NOT NULL THEN + RETURN node.bool_op; + ELSE + RETURN '(n' || node.id::TEXT || '.id IS ' || + (CASE WHEN node.negate THEN '' ELSE 'NOT ' END) || -- sic! + 'NULL)'; + END IF; +END; +$$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION vandelay.match_bib_record() RETURNS TRIGGER AS $func$ DECLARE incoming_existing_id TEXT; my_bib_queue vandelay.bib_queue%ROWTYPE; - my_match_set vandelay.match_set%ROWTYPE; - test vandelay.match_set_point%ROWTYPE; - potential_matches BIGINT[]; - matches BIGINT[]; - rvalue TEXT; - quality_set hstore; + test_result vandelay.match_set_test_result%ROWTYPE; tmp_rec BIGINT; - tmp_quality INT; - first_round BOOL; BEGIN DELETE FROM vandelay.bib_match WHERE queued_record = NEW.id; - incoming_existing_id := oils_xpath_string('//*[@tag="901"]/*[@code="c"][1]',NEW.marc); + -- Perfect matches on 901$c exit early with a match with high quality. + incoming_existing_id := + oils_xpath_string('//*[@tag="901"]/*[@code="c"][1]', NEW.marc); IF incoming_existing_id IS NOT NULL AND incoming_existing_id != '' THEN SELECT id INTO tmp_rec FROM biblio.record_entry WHERE id = incoming_existing_id::bigint; @@ -501,64 +694,17 @@ BEGIN SELECT * INTO my_bib_queue FROM vandelay.bib_queue WHERE id = NEW.queue; - first_round := TRUE; - -- whew ... here we go ... - - - -- Commented out until replaced by tree-ish version -/* - FOR test IN SELECT * FROM vandelay.match_set_point WHERE match_set = my_bib_queue.match_set ORDER BY required DESC LOOP - IF test.tag IS NOT NULL THEN - FOR rvalue IN SELECT value FROM vandelay.flatten_marc( xml ) WHERE tag = test.tag AND subfield = test.subfield LOOP - SELECT ARRAY_ACCUM(DISTINCT record) INTO potential_matches FROM metabib.real_full_rec WHERE tag = test.tag AND subfield = test.subfield AND value = rvalue; - - IF first_round THEN - matches := potential_matches; - first_round := FALSE; - ELSIF test.required THEN - FOR tmp_rec IN SELECT * FROM UNNEST(matches) LOOP - IF tmp_rec NOT IN (SELECT * FROM UNNEST(potential_matches)) THEN - matches := evergreen.array_remove_item_by_value(matches, tmp_rec); - potential_matches := evergreen.array_remove_item_by_value(potential_matches, tmp_rec); - END IF; - END LOOP; - END IF; + FOR test_result IN SELECT * FROM + vandelay.match_set_test_marcxml(my_bib_queue.match_set, NEW.marc) LOOP - -- add the quality for this match - FOR tmp_rec IN SELECT * FROM UNNEST(potential_matches) LOOP - tmp_quality := COALESCE((quality_set -> tmp_rec::TEXT)::INT, 0); - quality_set := quality_set || hstore(tmp_rec::TEXT, (tmp_quality + test.quality)::TEXT); - END LOOP; - - END LOOP; - ELSE - rvalue := vandelay.vandelay.extract_rec_attrs(xml, ARRAY[test.svf]); - - IF first_round THEN - matches := potential_matches; - first_round := FALSE; - ELSIF test.required THEN - FOR tmp_rec IN SELECT * FROM UNNEST(matches) LOOP - IF tmp_rec NOT IN (SELECT * FROM UNNEST(potential_matches)) THEN - matches := evergreen.array_remove_item_by_value(matches, tmp_rec); - potential_matches := evergreen.array_remove_item_by_value(potential_matches, tmp_rec); - END IF; - END LOOP; - END IF; - - -- add the quality for this match - FOR tmp_rec IN SELECT * FROM UNNEST(potential_matches) LOOP - tmp_quality := COALESCE((quality_set -> tmp_rec::TEXT)::INT, 0); - quality_set := quality_set || hstore(tmp_rec::TEXT, (tmp_quality + test.quality)::TEXT); - END LOOP; - - END IF; - END LOOP; + INSERT INTO vandelay.bib_match ( + matched_set, queued_record, eg_record, quality + ) VALUES ( + my_bib_queue.match_set, NEW.id, test_result.record, + test_result.quality + ); - FOR tmp_rec IN SELECT * FROM UNNEST(matches) LOOP - INSERT INTO vandelay.bib_match (matched_set, queued_record, eg_record, quality) VALUES (my_bib_queue.match_set, NEW.id, tmp_rec, (quality_set -> tmp_rec::TEXT)); END LOOP; -*/ RETURN NEW; END; -- 2.11.0