From: Mike Rylander Date: Thu, 10 Mar 2011 21:35:44 +0000 (-0500) Subject: moving functions around; tables for configuring match points and quality metrics X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=57b7f22475b6b3cdbe3514546c82eb5b2947041c;p=evergreen%2Ftadl.git moving functions around; tables for configuring match points and quality metrics --- diff --git a/Open-ILS/src/sql/Pg/012.schema.vandelay.sql b/Open-ILS/src/sql/Pg/012.schema.vandelay.sql index 784117002e..9b3e3cf16e 100644 --- a/Open-ILS/src/sql/Pg/012.schema.vandelay.sql +++ b/Open-ILS/src/sql/Pg/012.schema.vandelay.sql @@ -18,7 +18,8 @@ CREATE TABLE vandelay.queued_record ( create_time TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), import_time TIMESTAMP WITH TIME ZONE, purpose TEXT NOT NULL DEFAULT 'import' CHECK (purpose IN ('import','overlay')), - marc TEXT NOT NULL + marc TEXT NOT NULL, + quality INT NOT NULL DEFAULT 0 ); @@ -95,7 +96,8 @@ CREATE TABLE vandelay.bib_match ( field_type TEXT NOT NULL CHECK (field_type in ('isbn','tcn_value','id')), matched_attr INT REFERENCES vandelay.queued_bib_record_attr (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, queued_record BIGINT REFERENCES vandelay.queued_bib_record (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, - eg_record BIGINT REFERENCES biblio.record_entry (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED + eg_record BIGINT REFERENCES biblio.record_entry (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + quality INT NOT NULL DEFAULT 0 ); -- DROP TABLE vandelay.import_item CASCADE; @@ -143,6 +145,251 @@ CREATE TABLE vandelay.merge_profile ( CONSTRAINT add_replace_strip_or_preserve CHECK ((preserve_spec IS NOT NULL OR replace_spec IS NOT NULL) OR (preserve_spec IS NULL AND replace_spec IS NULL)) ); +CREATE TABLE vandelay.match_set ( + id SERIAL PRIMARY KEY, + name TEXT NOT NULL, + owner INT NOT NULL REFERENCES actor.org_unit (id) ON DELETE CASCADE, + mtype TEXT NOT NULL DEFAULT 'biblio', -- 'biblio','authority','mfhd'?, others? + CONSTRAINT name_once_per_owner_mtype UNIQUE (name, owner, mtype) +); + +-- Table to define match points, either FF via SVF or tag+subfield +CREATE TABLE vandelay.match_set_point ( + id SERIAL PRIMARY KEY, + match_set INT NOT NULL REFERENCES vandelay.match_set (id), + fixed_field TEXT, -- should exist in config.marc21_ff_pos_map.fixed_field + tag TEXT, + subfield TEXT, + required BOOL NOT NULL DEFAULT TRUE, + quality INT NOT NULL DEFAULT 1, -- higher is better + CONSTRAINT vmsp_need_a_subfield_with_a_tag CHECK ((tag IS NOT NULL AND subfield IS NOT NULL) OR tag IS NULL), + CONSTRAINT vmsp_need_a_tag_or_a_ff CHECK (tag IS NOT NULL AND fixed_field IS NULL) OR (tag IS NULL AND fixed_field IS NOT NULL)), + CONSTRAINT vmsp_def_once_per_set UNIQUE (match_set, COALESCE(tag,''), COALESCE(subfield,''), COALESCE(fixed_field,'')) +); + +CREATE TABLE vandelay.match_set_quality ( + id SERIAL PRIMARY KEY, + match_set INT NOT NULL REFERENCES vandelay.match_set (id), + fixed_field TEXT, -- should exist in config.marc21_ff_pos_map.fixed_field + tag TEXT, + subfield TEXT, + value TEXT NOT NULL, + quality INT NOT NULL DEFAULT 1, -- higher is better + CONSTRAINT vmsq_need_a_subfield_with_a_tag CHECK ((tag IS NOT NULL AND subfield IS NOT NULL) OR tag IS NULL), + CONSTRAINT vmsq_need_a_tag_or_a_ff CHECK (tag IS NOT NULL AND fixed_field IS NULL) OR (tag IS NULL AND fixed_field IS NOT NULL)), + CONSTRAINT vmsq_def_once_per_set UNIQUE (match_set, COALESCE(tag,''), COALESCE(subfield,''), COALESCE(fixed_field,'')) +); + +CREATE OR REPLACE FUNCTION vandelay.marc21_record_type( marc TEXT ) RETURNS config.marc21_rec_type_map AS $func$ +DECLARE + ldr TEXT; + tval TEXT; + tval_rec RECORD; + bval TEXT; + bval_rec RECORD; + retval config.marc21_rec_type_map%ROWTYPE; +BEGIN + ldr := oils_xpath_string( '//*[local-name()="leader"]', marc ); + + IF ldr IS NULL OR ldr = '' THEN + SELECT * INTO retval FROM config.marc21_rec_type_map WHERE code = 'BKS'; + RETURN retval; + END IF; + + SELECT * INTO tval_rec FROM config.marc21_ff_pos_map WHERE fixed_field = 'Type' LIMIT 1; -- They're all the same + SELECT * INTO bval_rec FROM config.marc21_ff_pos_map WHERE fixed_field = 'BLvl' LIMIT 1; -- They're all the same + + + tval := SUBSTRING( ldr, tval_rec.start_pos + 1, tval_rec.length ); + bval := SUBSTRING( ldr, bval_rec.start_pos + 1, bval_rec.length ); + + -- RAISE NOTICE 'type %, blvl %, ldr %', tval, bval, ldr; + + SELECT * INTO retval FROM config.marc21_rec_type_map WHERE type_val LIKE '%' || tval || '%' AND blvl_val LIKE '%' || bval || '%'; + + + IF retval.code IS NULL THEN + SELECT * INTO retval FROM config.marc21_rec_type_map WHERE code = 'BKS'; + END IF; + + RETURN retval; +END; +$func$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION vandelay.marc21_extract_fixed_field( marc TEXT, ff TEXT ) RETURNS TEXT AS $func$ +DECLARE + rtype TEXT; + ff_pos RECORD; + tag_data RECORD; + val TEXT; +BEGIN + rtype := (vandelay.marc21_record_type( marc )).code; + FOR ff_pos IN SELECT * FROM config.marc21_ff_pos_map WHERE fixed_field = ff AND rec_type = rtype ORDER BY tag DESC LOOP + FOR tag_data IN SELECT value FROM UNNEST( oils_xpath( '//*[@tag="' || UPPER(ff_pos.tag) || '"]/text()', marc ) ) x(value) LOOP + val := SUBSTRING( tag_data.value, ff_pos.start_pos + 1, ff_pos.length ); + RETURN val; + END LOOP; + val := REPEAT( ff_pos.default_val, ff_pos.length ); + RETURN val; + END LOOP; + + RETURN NULL; +END; +$func$ LANGUAGE PLPGSQL; + +CREATE TYPE biblio.record_ff_map AS (record BIGINT, ff_name TEXT, ff_value TEXT); +CREATE OR REPLACE FUNCTION vandelay.marc21_extract_all_fixed_fields( marc TEXT ) RETURNS SETOF biblio.record_ff_map AS $func$ +DECLARE + tag_data TEXT; + rtype TEXT; + ff_pos RECORD; + output biblio.record_ff_map%ROWTYPE; +BEGIN + rtype := (vandelay.marc21_record_type( marc )).code; + + FOR ff_pos IN SELECT * FROM config.marc21_ff_pos_map WHERE rec_type = rtype ORDER BY tag DESC LOOP + output.ff_name := ff_pos.fixed_field; + output.ff_value := NULL; + + FOR tag_data IN SELECT value FROM UNNEST( oils_xpath( '//*[@tag="' || UPPER(tag) || '"]/text()', marc ) ) x(value) LOOP + output.ff_value := SUBSTRING( tag_data.value, ff_pos.start_pos + 1, ff_pos.length ); + IF output.ff_value IS NULL THEN output.ff_value := REPEAT( ff_pos.default_val, ff_pos.length ); END IF; + RETURN NEXT output; + output.ff_value := NULL; + END LOOP; + + END LOOP; + + RETURN; +END; +$func$ LANGUAGE PLPGSQL; + +CREATE TYPE biblio.marc21_physical_characteristics AS ( id INT, record BIGINT, ptype TEXT, subfield INT, value INT ); +CREATE OR REPLACE FUNCTION vandelay.marc21_physical_characteristics( marc TEXT) RETURNS SETOF biblio.marc21_physical_characteristics AS $func$ +DECLARE + rowid INT := 0; + _007 TEXT; + ptype config.marc21_physical_characteristic_type_map%ROWTYPE; + psf config.marc21_physical_characteristic_subfield_map%ROWTYPE; + pval config.marc21_physical_characteristic_value_map%ROWTYPE; + retval biblio.marc21_physical_characteristics%ROWTYPE; +BEGIN + + _007 := oils_xpath_string( '//*[@tag="007"]', marc ); + + IF _007 IS NOT NULL AND _007 <> '' THEN + SELECT * INTO ptype FROM config.marc21_physical_characteristic_type_map WHERE ptype_key = SUBSTRING( _007, 1, 1 ); + + IF ptype.ptype_key IS NOT NULL THEN + FOR psf IN SELECT * FROM config.marc21_physical_characteristic_subfield_map WHERE ptype_key = ptype.ptype_key LOOP + SELECT * INTO pval FROM config.marc21_physical_characteristic_value_map WHERE ptype_subfield = psf.id AND value = SUBSTRING( _007, psf.start_pos + 1, psf.length ); + + IF pval.id IS NOT NULL THEN + rowid := rowid + 1; + retval.id := rowid; + retval.ptype := ptype.ptype_key; + retval.subfield := psf.id; + retval.value := pval.id; + RETURN NEXT retval; + END IF; + + END LOOP; + END IF; + END IF; + + RETURN; +END; +$func$ LANGUAGE PLPGSQL; + +CREAT TYPE vandelay.flat_marc AS ( tag CHAR(3), ind1 TEXT, ind2 TEXT, subfield TEXT, value TEXT ); +CREATE OR REPLACE FUNCTION vandelay.flay_marc ( TEXT ) RETURNS SETOF vandelay.flat_marc AS $func$ + +use MARC::Record; +use MARC::File::XML (BinaryEncoding => 'UTF-8'); + +my $xml = shift; +my $r = MARC::Record->new_from_xml( $xml ); + +return_next( { tag => 'LDR', value => $r->leader } ); + +for my $f ( $r->fields ) { + if ($f->is_control_field) { + return_next({ tag => $f->tag, value => $f->data }); + } else { + for my $s ($f->subfields) { + return_next({ + tag => $f->tag, + ind1 => $f->indicator(1), + ind2 => $f->indicator(2), + subfield => $s->[0], + value => $s->[1] + }); + + if ( $f->tag eq '245' and $s->[0] eq 'a' ) { + my $trim = $f->indicator(2) || 0; + return_next({ + tag => 'tnf', + ind1 => $f->indicator(1), + ind2 => $f->indicator(2), + subfield => 'a', + value => substr( $s->[1], $trim ) + }); + } + } + } +} + +return undef; + +$func$ LANGUAGE PLPERLU; + +CREATE OR REPLACE FUNCTION vandelay.flatten_marc ( marc TEXT ) RETURNS SETOF vandelay.flat_marc AS $func$ +DECLARE + output vandelay.flat_marc%ROWTYPE; + field RECORD; +BEGIN + FOR field IN SELECT * FROM vandelay.flay_marc( marc ) LOOP + output.ind1 := field.ind1; + output.ind2 := field.ind2; + output.tag := field.tag; + output.subfield := field.subfield; + IF field.subfield IS NOT NULL AND field.tag NOT IN ('020','022','024') THEN -- exclude standard numbers and control fields + output.value := naco_normalize(field.value, field.subfield); + ELSE + output.value := field.value; + END IF; + + CONTINUE WHEN output.value IS NULL; + + RETURN NEXT output; + END LOOP; +END; +$func$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION vandelay.incoming_record_quality ( xml TEXT, match_set_id INT ) RETURNS INT AS $_$ +DECLARE + out_q INT := 0; + rvalue TEXT; + test vandelay.match_set_quality%ROWTYPE; +BEGIN + + FOR test IN SELECT * FROM vandelay.match_set_quality WHERE match_set = match_set_id LOOP + IF test.tag IS NOT NULL THEN + FOR rvalue IN SELECT value FROM vandelay.flatten_marc( xml ) WHERE tag = test.tag AND subfield = test.subfield LOOP + IF test.value = rvalue THEN + out_q := out_q + test.quality; + END IF; + END LOOP; + ELSIF test.fixed_field IS NOT NULL THEN -- a named fixed field, see config.marc21_ff_pos_map.fixed_field + IF test.value = vandelay.marc21_extract_fixed_field(xml, test.fixed_field) THEN + out_q := out_q + test.quality; + END IF; + END IF; + END LOOP; + + RETURN out_q; +END; +$_$ LANGUAGE PLPGSQL; CREATE TYPE vandelay.tcn_data AS (tcn TEXT, tcn_source TEXT, used BOOL); CREATE OR REPLACE FUNCTION vandelay.find_bib_tcn_data ( xml TEXT ) RETURNS SETOF vandelay.tcn_data AS $_$ diff --git a/Open-ILS/src/sql/Pg/030.schema.metabib.sql b/Open-ILS/src/sql/Pg/030.schema.metabib.sql index fb2c4cb7f4..3cbdd49dfe 100644 --- a/Open-ILS/src/sql/Pg/030.schema.metabib.sql +++ b/Open-ILS/src/sql/Pg/030.schema.metabib.sql @@ -417,19 +417,13 @@ DECLARE BEGIN SELECT INTO bib * FROM biblio.record_entry WHERE id = rid; - FOR field IN SELECT * FROM biblio.flatten_marc( bib.marc ) LOOP + FOR field IN SELECT * FROM vandelay.flatten_marc( bib.marc ) LOOP output.record := rid; output.ind1 := field.ind1; output.ind2 := field.ind2; output.tag := field.tag; output.subfield := field.subfield; - IF field.subfield IS NOT NULL AND field.tag NOT IN ('020','022','024') THEN -- exclude standard numbers and control fields - output.value := naco_normalize(field.value, field.subfield); - ELSE - output.value := field.value; - END IF; - - CONTINUE WHEN output.value IS NULL; + output.value := field.value; RETURN NEXT output; END LOOP; @@ -620,43 +614,6 @@ CREATE OR REPLACE FUNCTION biblio.marc21_extract_all_fixed_fields( rid BIGINT ) SELECT $1 AS record, ff_name, ff_value FROM vandelay.marc21_extract_all_fixed_fields( (SELECT marc FROM biblio.record_entry WHERE id = $1) ); $func$ LANGUAGE SQL; -CREATE TYPE biblio.marc21_physical_characteristics AS ( id INT, record BIGINT, ptype TEXT, subfield INT, value INT ); -CREATE OR REPLACE FUNCTION vandelay.marc21_physical_characteristics( marc TEXT) RETURNS SETOF biblio.marc21_physical_characteristics AS $func$ -DECLARE - rowid INT := 0; - _007 TEXT; - ptype config.marc21_physical_characteristic_type_map%ROWTYPE; - psf config.marc21_physical_characteristic_subfield_map%ROWTYPE; - pval config.marc21_physical_characteristic_value_map%ROWTYPE; - retval biblio.marc21_physical_characteristics%ROWTYPE; -BEGIN - - _007 := oils_xpath_string( '//*[@tag="007"]', marc ); - - IF _007 IS NOT NULL AND _007 <> '' THEN - SELECT * INTO ptype FROM config.marc21_physical_characteristic_type_map WHERE ptype_key = SUBSTRING( _007, 1, 1 ); - - IF ptype.ptype_key IS NOT NULL THEN - FOR psf IN SELECT * FROM config.marc21_physical_characteristic_subfield_map WHERE ptype_key = ptype.ptype_key LOOP - SELECT * INTO pval FROM config.marc21_physical_characteristic_value_map WHERE ptype_subfield = psf.id AND value = SUBSTRING( _007, psf.start_pos + 1, psf.length ); - - IF pval.id IS NOT NULL THEN - rowid := rowid + 1; - retval.id := rowid; - retval.ptype := ptype.ptype_key; - retval.subfield := psf.id; - retval.value := pval.id; - RETURN NEXT retval; - END IF; - - END LOOP; - END IF; - END IF; - - RETURN; -END; -$func$ LANGUAGE PLPGSQL; - CREATE OR REPLACE FUNCTION biblio.marc21_physical_characteristics( rid BIGINT ) RETURNS SETOF biblio.marc21_physical_characteristics AS $func$ SELECT id, $1 AS record, ptype, subfield, value FROM vandelay.marc21_physical_characteristics( (SELECT marc FROM biblio.record_entry WHERE id = $1) ); $func$ LANGUAGE SQL;