From 0c094180d2dc176550ce59b553835c7da5565f64 Mon Sep 17 00:00:00 2001 From: Mike Rylander Date: Wed, 15 Jan 2014 12:07:59 -0500 Subject: [PATCH] New intarray-focused attribute extraction We rewrite the record attribute extraction to capture all the record-supplied values for each attribute (where multi is true) and store that list in the new (fkey-corrected) metabib.record_attr_vector_list table. Only filters make it into this table. We also insert a parameter after the record id to accept a list of record attributes we want to rewrite. This defaults to NULL to rewrite all of them. Sorters are stored in a new, separate table built specifically for them. metabib.record_attr becomes a vew atop metabib.record_attr_vector_list which expands the intarray stored therein into an hstore. For multi=true attributes, only one will be returned through this view, as is HSTORE's way, and which will be returned is undefined. However this view is only provided for the purpose of backward compat with reports or other locally defined logic. Signed-off-by: Mike Rylander --- Open-ILS/src/sql/Pg/030.schema.metabib.sql | 186 ++++++++++++++++++++++------- 1 file changed, 146 insertions(+), 40 deletions(-) diff --git a/Open-ILS/src/sql/Pg/030.schema.metabib.sql b/Open-ILS/src/sql/Pg/030.schema.metabib.sql index 6fbbf5d747..5a0fd26c53 100644 --- a/Open-ILS/src/sql/Pg/030.schema.metabib.sql +++ b/Open-ILS/src/sql/Pg/030.schema.metabib.sql @@ -273,17 +273,18 @@ CREATE SEQUENCE metabib.uncontrolled_record_attr_value_id_seq INCREMENT BY -1; CREATE TABLE metabib.uncontrolled_record_attr_value ( id BIGINT PRIMARY KEY DEFAULT nextval('metabib.uncontrolled_record_attr_value_id_seq'), - attr INT NOT NULL REFERENCES config.record_attr_definition (id), - value text NOT NULL + attr TEXT NOT NULL REFERENCES config.record_attr_definition (name), + value TEXT NOT NULL ); CREATE UNIQUE INDEX muv_once_idx ON metabib.uncontrolled_record_attr_value (attr,value); -create table metabib.record_attr_vector_list ( +CREATE TABLE metabib.record_attr_vector_list ( source BIGINT PRIMARY KEY REFERNECES biblio.record_entry (id), vlist INT[] NOT NULL -- stores id from ccvm AND murav ); CREATE INDEX mrca_vlist_idx ON metabib.record_attr_vector_list USING gin ( vlist gin__int_ops ); +/* This becomes a view, and we do sorters differently ... CREATE TABLE metabib.record_attr ( id BIGINT PRIMARY KEY REFERENCES biblio.record_entry (id) ON DELETE CASCADE, attrs HSTORE NOT NULL DEFAULT ''::HSTORE @@ -291,8 +292,36 @@ CREATE TABLE metabib.record_attr ( CREATE INDEX metabib_svf_attrs_idx ON metabib.record_attr USING GIST (attrs); CREATE INDEX metabib_svf_date1_idx ON metabib.record_attr ((attrs->'date1')); CREATE INDEX metabib_svf_dates_idx ON metabib.record_attr ((attrs->'date1'),(attrs->'date2')); +*/ + +/* ... like this */ +CREATE TABLE metabib.record_sorter ( + id BIGSERIAL PRIMARY KEY, + source BIGINT NOT NULL REFERENCES biblio.record_entry (id) ON DELETE CASCADE, + attr TEXT NOT NULL REFERENCES config.record_attr_definition (name) ON DELETE CASCADE, + value TEXT NOT NULL +); +CREATE INDEX metabib_sorter_source_idx ON metabib.record_sorter (source); +CREATE INDEX metabib_sorter_a_v_idx ON metabib.record_sorter (attr, value); + + +CREATE TYPE metabib.record_attr_type AS ( + id BIGINT, + attrs HSTORE +); + +-- Back-compat view ... we're moving to an INTARRAY world +CREATE VIEW metabib.record_attr AS + SELECT v.source AS id, + hstore( ARRAY_AGG( ARRAY[ COALESCE(c.ctype,u.attr), COALESCE(c.value,u.value) ] ) ) + FROM metabib.record_attr_vector_list v + LEFT JOIN metabib.uncontrolled_record_attr_value u ON ( u.id = ANY( v.vlist ) ) + LEFT JOIN config.coded_value_map c ON ( c.id = ANY( v.vlist ) ) + WHERE c.id IS NOT NULL OR u.id IS NOT NULL + GROUP BY 1; + --- Back-compat view ... we're moving to an HSTORE world +-- Back-back-compat view ... we use to live in an HSTORE world CREATE TYPE metabib.rec_desc_type AS ( item_type TEXT, item_form TEXT, @@ -1300,26 +1329,43 @@ CREATE OR REPLACE FUNCTION biblio.map_authority_linking (bibid BIGINT, marc TEXT SELECT $1; $func$ LANGUAGE SQL; -CREATE OR REPLACE FUNCTION metabib.reingest_record_attributes (rid BIGINT, prmarc TEXT DEFAULT NULL, rdeleted BOOL DEFAULT TRUE) RETURNS VOID AS $func$ +CREATE OR REPLACE FUNCTION metabib.reingest_record_attributes (rid BIGINT, pattr_list TEXT[] DEFAULT NULL, prmarc TEXT DEFAULT NULL, rdeleted BOOL DEFAULT TRUE) RETURNS VOID AS $func$ DECLARE transformed_xml TEXT; rmarc TEXT := prmarc; + tmp_val TEXT; prev_xfrm TEXT; normalizer RECORD; xfrm config.xml_transform%ROWTYPE; - attr_value TEXT; - new_attrs HSTORE := ''::HSTORE; + attr_vector INT[] := '{}'::INT[]; + attr_vector_tmp INT[]; + attr_list TEXT[] := pattr_list; + attr_value TEXT[]; + norm_attr_value TEXT[]; + tmp_xml XML; attr_def config.record_attr_definition%ROWTYPE; + ccvm_row config.code_value_map%ROWTYPE; BEGIN + IF attr_list IS NULL OR rdeleted THEN -- need to do the full dance on INSERT or undelete + SELECT ARRAY_AGG(name) INTO attr_list FROM config.record_attr_definition; + END IF; + IF rmarc IS NULL THEN SELECT marc INTO rmarc FROM biblio.record_entry WHERE id = rid; END IF; - FOR attr_def IN SELECT * FROM config.record_attr_definition ORDER BY format LOOP + FOR attr_def IN SELECT * FROM config.record_attr_definition WHERE name = ANY( attr_list ) ORDER BY format LOOP + + attr_value := '{}'::TEXT[] + norm_attr_value := '{}'::TEXT[] + attr_vector_tmp := '{}'::INT[] + SELECT * INTO ccvm_row FROM config.code_value_map c WHERE c.ctype = attr_def.name; + + -- tag+sf attrs only support SVF IF attr_def.tag IS NOT NULL THEN -- tag (and optional subfield list) selection - SELECT ARRAY_TO_STRING(ARRAY_ACCUM(value), COALESCE(attr_def.joiner,' ')) INTO attr_value + SELECT ARRAY[ARRAY_TO_STRING(ARRAY_AGG(value), COALESCE(attr_def.joiner,' '))] INTO attr_value FROM (SELECT * FROM metabib.full_rec ORDER BY tag, subfield) AS x WHERE record = rid AND tag LIKE attr_def.tag @@ -1333,7 +1379,11 @@ BEGIN LIMIT 1; ELSIF attr_def.fixed_field IS NOT NULL THEN -- a named fixed field, see config.marc21_ff_pos_map.fixed_field - attr_value := biblio.marc21_extract_fixed_field(rid, attr_def.fixed_field); + attr_value := vandelay.marc21_extract_fixed_field_list(rmarc, attr_def.fixed_field); + + IF NOT attr_def.multi THEN + attr_value := ARRAY[attr_value[1]]; + END IF; ELSIF attr_def.xpath IS NOT NULL THEN -- and xpath expression @@ -1357,47 +1407,103 @@ BEGIN prev_xfrm := xfrm.name; END IF; - attr_value := oils_xpath_string(attr_def.xpath, transformed_xml, COALESCE(attr_def.joiner,' '), ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]]); + FOR tmp_xml IN SELECT XPATH(attr_def.xpath, transformed_xml, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]]) LOOP + attr_value := attr_value || + oils_xpath_string( + '//*', + tmp_xml::TEXT, + COALESCE(attr_def.joiner,' '), + ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] + ); + EXIT WHEN NOT attr_def.multi; + END LOOP; ELSIF attr_def.phys_char_sf IS NOT NULL THEN -- a named Physical Characteristic, see config.marc21_physical_characteristic_*_map - SELECT m.value INTO attr_value - FROM biblio.marc21_physical_characteristics(rid) v - JOIN config.marc21_physical_characteristic_value_map m ON (m.id = v.value) + SELECT ARRAY_AGG(m.value) INTO attr_vlue + FROM vandelay.marc21_physical_characteristics(rmarc) v + LEFT JOIN config.marc21_physical_characteristic_value_map m ON (m.id = v.value) WHERE v.subfield = attr_def.phys_char_sf - LIMIT 1; -- Just in case ... + AND ( ccvm.id IS NULL OR ( ccvm.id IS NOT NULL AND v.id IS NOT NULL) ); + + IF NOT attr_def.multi THEN + attr_value := ARRAY[attr_value[1]]; + END IF; END IF; -- apply index normalizers to attr_value - FOR normalizer IN - SELECT n.func AS func, - n.param_count AS param_count, - m.params AS params - FROM config.index_normalizer n - JOIN config.record_attr_index_norm_map m ON (m.norm = n.id) - WHERE attr = attr_def.name - ORDER BY m.pos LOOP - EXECUTE 'SELECT ' || normalizer.func || '(' || - COALESCE( quote_literal( attr_value ), 'NULL' ) || - CASE - WHEN normalizer.param_count > 0 - THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') - ELSE '' - END || - ')' INTO attr_value; + FOR tmp_val IN SELECT value FROM UNNEST(attr_value) x(value); + FOR normalizer IN + SELECT n.func AS func, + n.param_count AS param_count, + m.params AS params + FROM config.index_normalizer n + JOIN config.record_attr_index_norm_map m ON (m.norm = n.id) + WHERE attr = attr_def.name + ORDER BY m.pos LOOP + EXECUTE 'SELECT ' || normalizer.func || '(' || + COALESCE( quote_literal( tmp_val ), 'NULL' ) || + CASE + WHEN normalizer.param_count > 0 + THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') + ELSE '' + END || + ')' INTO tmp_val; + END LOOP; + norm_attr_value := norm_attr_value || tmp_val; END LOOP; - -- Add the new value to the hstore - new_attrs := new_attrs || hstore( attr_def.name, attr_value ); + IF attr_def.filter THEN + -- Create unknown uncontrolled values and find the IDs of the values + IF ccvm.id IS NULL THEN + FOR tmp_val FROM SELECT value FROM UNNEST(norm_attr_value) x(value) LOOP + BEGIN; -- use subtransaction to isolate unique constraint violations + INSERT INTO metabib.uncontrolled_record_attr_value ( attr, value ) VALUES ( attr_def.name, tmp_val ); + EXCEPTION WHEN unique_violation THEN END; + END LOOP; + + SELECT ARRAY_AGG(id) INTO attr_vector_tmp FROM metabib.uncontrolled_record_attr_value WHERE attr = attr_def.name AND value = ANY( norm_attr_value ); + ELSE + SELECT ARRAY_AGG(id) INTO attr_vector_tmp FROM config.coded_value_map WHERE ctype = attr_def.name AND value = ANY( norm_attr_value ); + END IF; + + -- Add the new value to the vector + attr_vector := attr_vector || attr_vector_tmp; + END IF; + + IF attr_def.sorter THEN + IF rdeleted THEN -- initial insert OR revivication + DELETE FROM metabib.record_sorter WHERE source = rid AND attr = attr_def.name; + INSERT INTO metabib.record_sorter (source, attr, value) VALUES (rid, attr_def.name, norm_attr_value[1]); + ELSE + UPDATE metabib.record_sorter SET value = norm_attr_value[1] WHERE source = rid AND attr = attr_def.name; + END IF; + END IF; END LOOP; - IF rdeleted THEN -- initial insert OR revivication - DELETE FROM metabib.record_attr WHERE id = rid; - INSERT INTO metabib.record_attr (id, attrs) VALUES (rid, new_attrs); - ELSE - UPDATE metabib.record_attr SET attrs = new_attrs WHERE id = rid; + IF ARRAY_LENGTH(vlist, 1) > 0 THEN +/* We may need to rewrite the vlist to contain + the intersection of new values for requested + attrs and old values for ignored attrs. To + do this, we take the old attr vlist and + subtract any values that are valid for the + requested attrs, and then add back the new + set of attr values. */ + IF ARRAY_LENGTH(pattr_list, 1) > 0 THEN + SELECT vlist INTO attr_vector_tmp FROM metabib.record_attr_vector_list WHERE source = rid; + SELECT attr_vector_tmp - ARRAY_AGG(id) INTO attr_vector_tmp FROM metabib.uncontrolled_record_attr_value WHERE attr = ANY (pattr_list); + SELECT attr_vector_tmp - ARRAY_AGG(id) INTO attr_vector_tmp FROM config.coded_value_map WHERE ctype = ANY (pattr_list); + attr_vector := attr_vector || attr_vector_tmp; + END IF; + + IF rdeleted THEN -- initial insert OR revivication + DELETE FROM metabib.record_attr_vector_list WHERE source = rid; + INSERT INTO metabib.record_attr_vector_list (source, vlist) VALUES (rid, attr_vector); + ELSE + UPDATE metabib.record_attr_vector_list SET vlist = attr_vector WHERE source = rid; + END IF; END IF; END; @@ -1417,7 +1523,7 @@ BEGIN -- with the #deleted modifier, so one should turn on the named -- internal flag for that functionality. DELETE FROM metabib.metarecord_source_map WHERE source = NEW.id; - DELETE FROM metabib.record_attr WHERE id = NEW.id; + DELETE FROM metabib.record_attr_vector_list WHERE source = NEW.id; END IF; DELETE FROM authority.bib_linking WHERE bib = NEW.id; -- Avoid updating fields in bibs that are no longer visible @@ -1448,7 +1554,7 @@ BEGIN -- Now we pull out attribute data, which is dependent on the mfr for all but XPath-based fields PERFORM * FROM config.internal_flag WHERE name = 'ingest.disable_metabib_rec_descriptor' AND enabled; IF NOT FOUND THEN - PERFORM metabib.reingest_record_attributes(NEW.id, NEW.marc, TG_OP = 'INSERT' OR OLD.deleted); + PERFORM metabib.reingest_record_attributes(NEW.id, NULL, NEW.marc, TG_OP = 'INSERT' OR OLD.deleted); END IF; END IF; -- 2.11.0