From: Mike Rylander Date: Fri, 15 Apr 2011 16:22:18 +0000 (-0400) Subject: Merge fixed field extraction from trunk X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=a4d1a96d689f320946018c012a85ee5514ca7c3c;p=evergreen%2Fequinox.git Merge fixed field extraction from trunk --- a4d1a96d689f320946018c012a85ee5514ca7c3c diff --cc Open-ILS/src/sql/Pg/012.schema.vandelay.sql index cfa325d966,476ee07745..c6b73708c6 --- a/Open-ILS/src/sql/Pg/012.schema.vandelay.sql +++ b/Open-ILS/src/sql/Pg/012.schema.vandelay.sql @@@ -193,401 -143,6 +193,418 @@@ CREATE TABLE vandelay.merge_profile CONSTRAINT add_replace_strip_or_preserve CHECK ((preserve_spec IS NOT NULL OR replace_spec IS NOT NULL) OR (preserve_spec IS NULL AND replace_spec IS NULL)) ); +CREATE OR REPLACE FUNCTION vandelay.marc21_record_type( marc TEXT ) RETURNS config.marc21_rec_type_map AS $func$ +DECLARE + ldr TEXT; + tval TEXT; + tval_rec RECORD; + bval TEXT; + bval_rec RECORD; + retval config.marc21_rec_type_map%ROWTYPE; +BEGIN + ldr := oils_xpath_string( '//*[local-name()="leader"]', marc ); + + IF ldr IS NULL OR ldr = '' THEN + SELECT * INTO retval FROM config.marc21_rec_type_map WHERE code = 'BKS'; + RETURN retval; + END IF; + + SELECT * INTO tval_rec FROM config.marc21_ff_pos_map WHERE fixed_field = 'Type' LIMIT 1; -- They're all the same + SELECT * INTO bval_rec FROM config.marc21_ff_pos_map WHERE fixed_field = 'BLvl' LIMIT 1; -- They're all the same + + + tval := SUBSTRING( ldr, tval_rec.start_pos + 1, tval_rec.length ); + bval := SUBSTRING( ldr, bval_rec.start_pos + 1, bval_rec.length ); + + -- RAISE NOTICE 'type %, blvl %, ldr %', tval, bval, ldr; + + SELECT * INTO retval FROM config.marc21_rec_type_map WHERE type_val LIKE '%' || tval || '%' AND blvl_val LIKE '%' || bval || '%'; + + + IF retval.code IS NULL THEN + SELECT * INTO retval FROM config.marc21_rec_type_map WHERE code = 'BKS'; + END IF; + + RETURN retval; +END; +$func$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION vandelay.marc21_extract_fixed_field( marc TEXT, ff TEXT ) RETURNS TEXT AS $func$ +DECLARE + rtype TEXT; + ff_pos RECORD; + tag_data RECORD; + val TEXT; +BEGIN + rtype := (vandelay.marc21_record_type( marc )).code; + FOR ff_pos IN SELECT * FROM config.marc21_ff_pos_map WHERE fixed_field = ff AND rec_type = rtype ORDER BY tag DESC LOOP - FOR tag_data IN SELECT value FROM UNNEST( oils_xpath( '//*[@tag="' || UPPER(ff_pos.tag) || '"]/text()', marc ) ) x(value) LOOP - val := SUBSTRING( tag_data.value, ff_pos.start_pos + 1, ff_pos.length ); - RETURN val; - END LOOP; ++ IF ff_pos.tag = 'ldr' THEN ++ val := oils_xpath_string('//*[local-name()="leader"]', marc); ++ IF val IS NOT NULL THEN ++ val := SUBSTRING( val, ff_pos.start_pos + 1, ff_pos.length ); ++ RETURN val; ++ END IF; ++ ELSE ++ FOR tag_data IN SELECT value FROM UNNEST( oils_xpath( '//*[@tag="' || UPPER(ff_pos.tag) || '"]/text()', marc ) ) x(value) LOOP ++ val := SUBSTRING( tag_data.value, ff_pos.start_pos + 1, ff_pos.length ); ++ RETURN val; ++ END LOOP; ++ END IF; + val := REPEAT( ff_pos.default_val, ff_pos.length ); + RETURN val; + END LOOP; + + RETURN NULL; +END; +$func$ LANGUAGE PLPGSQL; + +CREATE TYPE biblio.record_ff_map AS (record BIGINT, ff_name TEXT, ff_value TEXT); +CREATE OR REPLACE FUNCTION vandelay.marc21_extract_all_fixed_fields( marc TEXT ) RETURNS SETOF biblio.record_ff_map AS $func$ +DECLARE + tag_data TEXT; + rtype TEXT; + ff_pos RECORD; + output biblio.record_ff_map%ROWTYPE; +BEGIN + rtype := (vandelay.marc21_record_type( marc )).code; + + FOR ff_pos IN SELECT * FROM config.marc21_ff_pos_map WHERE rec_type = rtype ORDER BY tag DESC LOOP + output.ff_name := ff_pos.fixed_field; + output.ff_value := NULL; + - FOR tag_data IN SELECT value FROM UNNEST( oils_xpath( '//*[@tag="' || UPPER(tag) || '"]/text()', marc ) ) x(value) LOOP - output.ff_value := SUBSTRING( tag_data.value, ff_pos.start_pos + 1, ff_pos.length ); - IF output.ff_value IS NULL THEN output.ff_value := REPEAT( ff_pos.default_val, ff_pos.length ); END IF; - RETURN NEXT output; - output.ff_value := NULL; - END LOOP; - ++ IF ff_pos.tag = 'ldr' THEN ++ output.ff_value := oils_xpath_string('//*[local-name()="leader"]', marc); ++ IF output.ff_value IS NOT NULL THEN ++ output.ff_value := SUBSTRING( output.ff_value, ff_pos.start_pos + 1, ff_pos.length ); ++ RETURN NEXT output; ++ output.ff_value := NULL; ++ END IF; ++ ELSE ++ FOR tag_data IN SELECT value FROM UNNEST( oils_xpath( '//*[@tag="' || UPPER(ff_pos.tag) || '"]/text()', marc ) ) x(value) LOOP ++ output.ff_value := SUBSTRING( tag_data, ff_pos.start_pos + 1, ff_pos.length ); ++ IF output.ff_value IS NULL THEN output.ff_value := REPEAT( ff_pos.default_val, ff_pos.length ); END IF; ++ RETURN NEXT output; ++ output.ff_value := NULL; ++ END LOOP; ++ END IF; ++ + END LOOP; + + RETURN; +END; +$func$ LANGUAGE PLPGSQL; + +CREATE TYPE biblio.marc21_physical_characteristics AS ( id INT, record BIGINT, ptype TEXT, subfield INT, value INT ); +CREATE OR REPLACE FUNCTION vandelay.marc21_physical_characteristics( marc TEXT) RETURNS SETOF biblio.marc21_physical_characteristics AS $func$ +DECLARE + rowid INT := 0; + _007 TEXT; + ptype config.marc21_physical_characteristic_type_map%ROWTYPE; + psf config.marc21_physical_characteristic_subfield_map%ROWTYPE; + pval config.marc21_physical_characteristic_value_map%ROWTYPE; + retval biblio.marc21_physical_characteristics%ROWTYPE; +BEGIN + + _007 := oils_xpath_string( '//*[@tag="007"]', marc ); + + IF _007 IS NOT NULL AND _007 <> '' THEN + SELECT * INTO ptype FROM config.marc21_physical_characteristic_type_map WHERE ptype_key = SUBSTRING( _007, 1, 1 ); + + IF ptype.ptype_key IS NOT NULL THEN + FOR psf IN SELECT * FROM config.marc21_physical_characteristic_subfield_map WHERE ptype_key = ptype.ptype_key LOOP + SELECT * INTO pval FROM config.marc21_physical_characteristic_value_map WHERE ptype_subfield = psf.id AND value = SUBSTRING( _007, psf.start_pos + 1, psf.length ); + + IF pval.id IS NOT NULL THEN + rowid := rowid + 1; + retval.id := rowid; + retval.ptype := ptype.ptype_key; + retval.subfield := psf.id; + retval.value := pval.id; + RETURN NEXT retval; + END IF; + + END LOOP; + END IF; + END IF; + + RETURN; +END; +$func$ LANGUAGE PLPGSQL; + +CREATE TYPE vandelay.flat_marc AS ( tag CHAR(3), ind1 TEXT, ind2 TEXT, subfield TEXT, value TEXT ); +CREATE OR REPLACE FUNCTION vandelay.flay_marc ( TEXT ) RETURNS SETOF vandelay.flat_marc AS $func$ + +use MARC::Record; +use MARC::File::XML (BinaryEncoding => 'UTF-8'); + +my $xml = shift; +my $r = MARC::Record->new_from_xml( $xml ); + +return_next( { tag => 'LDR', value => $r->leader } ); + +for my $f ( $r->fields ) { + if ($f->is_control_field) { + return_next({ tag => $f->tag, value => $f->data }); + } else { + for my $s ($f->subfields) { + return_next({ + tag => $f->tag, + ind1 => $f->indicator(1), + ind2 => $f->indicator(2), + subfield => $s->[0], + value => $s->[1] + }); + + if ( $f->tag eq '245' and $s->[0] eq 'a' ) { + my $trim = $f->indicator(2) || 0; + return_next({ + tag => 'tnf', + ind1 => $f->indicator(1), + ind2 => $f->indicator(2), + subfield => 'a', + value => substr( $s->[1], $trim ) + }); + } + } + } +} + +return undef; + +$func$ LANGUAGE PLPERLU; + +CREATE OR REPLACE FUNCTION vandelay.flatten_marc ( marc TEXT ) RETURNS SETOF vandelay.flat_marc AS $func$ +DECLARE + output vandelay.flat_marc%ROWTYPE; + field RECORD; +BEGIN + FOR field IN SELECT * FROM vandelay.flay_marc( marc ) LOOP + output.ind1 := field.ind1; + output.ind2 := field.ind2; + output.tag := field.tag; + output.subfield := field.subfield; + IF field.subfield IS NOT NULL AND field.tag NOT IN ('020','022','024') THEN -- exclude standard numbers and control fields + output.value := naco_normalize(field.value, field.subfield); + ELSE + output.value := field.value; + END IF; + + CONTINUE WHEN output.value IS NULL; + + RETURN NEXT output; + END LOOP; +END; +$func$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION vandelay.extract_rec_attrs ( xml TEXT, attr_defs TEXT[]) RETURNS hstore AS $_$ +DECLARE + transformed_xml TEXT; + prev_xfrm TEXT; + normalizer RECORD; + xfrm config.xml_transform%ROWTYPE; + attr_value TEXT; + new_attrs HSTORE := ''::HSTORE; + attr_def config.record_attr_definition%ROWTYPE; +BEGIN + + FOR attr_def IN SELECT * FROM config.record_attr_definition WHERE name IN (SELECT * FROM UNNEST(attr_defs)) ORDER BY format LOOP + + IF attr_def.tag IS NOT NULL THEN -- tag (and optional subfield list) selection + SELECT ARRAY_TO_STRING(ARRAY_ACCUM(x.value), COALESCE(attr_def.joiner,' ')) INTO attr_value + FROM vandelay.flatten_marc(xml) AS x + WHERE x.tag LIKE attr_def.tag + AND CASE + WHEN attr_def.sf_list IS NOT NULL + THEN POSITION(x.subfield IN attr_def.sf_list) > 0 + ELSE TRUE + END + GROUP BY x.tag + ORDER BY x.tag + LIMIT 1; + + ELSIF attr_def.fixed_field IS NOT NULL THEN -- a named fixed field, see config.marc21_ff_pos_map.fixed_field + attr_value := vandelay.marc21_extract_fixed_field(xml, attr_def.fixed_field); + + ELSIF attr_def.xpath IS NOT NULL THEN -- and xpath expression + + SELECT INTO xfrm * FROM config.xml_transform WHERE name = attr_def.format; + + -- See if we can skip the XSLT ... it's expensive + IF prev_xfrm IS NULL OR prev_xfrm <> xfrm.name THEN + -- Can't skip the transform + IF xfrm.xslt <> '---' THEN + transformed_xml := oils_xslt_process(xml,xfrm.xslt); + ELSE + transformed_xml := xml; + END IF; + + prev_xfrm := xfrm.name; + END IF; + + IF xfrm.name IS NULL THEN + -- just grab the marcxml (empty) transform + SELECT INTO xfrm * FROM config.xml_transform WHERE xslt = '---' LIMIT 1; + prev_xfrm := xfrm.name; + END IF; + + attr_value := oils_xpath_string(attr_def.xpath, transformed_xml, COALESCE(attr_def.joiner,' '), ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]]); + + ELSIF attr_def.phys_char_sf IS NOT NULL THEN -- a named Physical Characteristic, see config.marc21_physical_characteristic_*_map + SELECT value::TEXT INTO attr_value + FROM vandelay.marc21_physical_characteristics(xml) + WHERE subfield = attr_def.phys_char_sf + LIMIT 1; -- Just in case ... + + END IF; + + -- apply index normalizers to attr_value + FOR normalizer IN + SELECT n.func AS func, + n.param_count AS param_count, + m.params AS params + FROM config.index_normalizer n + JOIN config.record_attr_index_norm_map m ON (m.norm = n.id) + WHERE attr = attr_def.name + ORDER BY m.pos LOOP + EXECUTE 'SELECT ' || normalizer.func || '(' || + quote_literal( attr_value ) || + CASE + WHEN normalizer.param_count > 0 + THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') + ELSE '' + END || + ')' INTO attr_value; + + END LOOP; + + -- Add the new value to the hstore + new_attrs := new_attrs || hstore( attr_def.name, attr_value ); + + END LOOP; + + RETURN new_attrs; +END; +$_$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION vandelay.extract_rec_attrs ( xml TEXT ) RETURNS hstore AS $_$ + SELECT vandelay.extract_rec_attrs( $1, (SELECT ARRAY_ACCUM(name) FROM config.record_attr_definition)); +$_$ LANGUAGE SQL; + +CREATE OR REPLACE FUNCTION vandelay.match_bib_record ( ) RETURNS TRIGGER AS $func$ +DECLARE + incoming_existing_id TEXT; + my_bib_queue vandelay.bib_queue%ROWTYPE; + my_match_set vandelay.match_set%ROWTYPE; + test vandelay.match_set_point%ROWTYPE; + potential_matches BIGINT[]; + matches BIGINT[]; + rvalue TEXT; + quality_set hstore; + tmp_rec BIGINT; + tmp_quality INT; + first_round BOOL; +BEGIN + DELETE FROM vandelay.bib_match WHERE queued_record = NEW.id; + + incoming_existing_id := oils_xpath_string('//*[@tag="901"]/*[@code="c"][1]',NEW.marc); + + IF incoming_existing_id IS NOT NULL THEN + SELECT id INTO tmp_rec FROM biblio.record_entry WHERE id = exact_id; + IF tmp_rec IS NOT NULL THEN + INSERT INTO vandelay.bib_match (queued_record, eg_record, quality) VALUES ( NEW.id, exact_id, 9999); + RETURN NEW; + END IF; + END IF; + + SELECT * INTO my_bib_queue FROM vandelay.bib_queue WHERE id = NEW.queue; + + first_round := TRUE; + -- whew ... here we go ... + + + -- Commented out until replaced by tree-ish version +/* + FOR test IN SELECT * FROM vandelay.match_set_point WHERE match_set = my_bib_queue.match_set ORDER BY required DESC LOOP + IF test.tag IS NOT NULL THEN + FOR rvalue IN SELECT value FROM vandelay.flatten_marc( xml ) WHERE tag = test.tag AND subfield = test.subfield LOOP + SELECT ARRAY_ACCUM(DISTINCT record) INTO potential_matches FROM metabib.real_full_rec WHERE tag = test.tag AND subfield = test.subfield AND value = rvalue; + + IF first_round THEN + matches := potential_matches; + first_round := FALSE; + ELSIF test.required THEN + FOR tmp_rec IN SELECT * FROM UNNEST(matches) LOOP + IF tmp_rec NOT IN (SELECT * FROM UNNEST(potential_matches)) THEN + matches := evergreen.array_remove_item_by_value(matches, tmp_rec); + potential_matches := evergreen.array_remove_item_by_value(potential_matches, tmp_rec); + END IF; + END LOOP; + END IF; + + -- add the quality for this match + FOR tmp_rec IN SELECT * FROM UNNEST(potential_matches) LOOP + tmp_quality := COALESCE((quality_set -> tmp_rec::TEXT)::INT, 0); + quality_set := quality_set || hstore(tmp_rec::TEXT, (tmp_quality + test.quality)::TEXT); + END LOOP; + + END LOOP; + ELSE + rvalue := vandelay.vandelay.extract_rec_attrs(xml, ARRAY[test.svf]); + + IF first_round THEN + matches := potential_matches; + first_round := FALSE; + ELSIF test.required THEN + FOR tmp_rec IN SELECT * FROM UNNEST(matches) LOOP + IF tmp_rec NOT IN (SELECT * FROM UNNEST(potential_matches)) THEN + matches := evergreen.array_remove_item_by_value(matches, tmp_rec); + potential_matches := evergreen.array_remove_item_by_value(potential_matches, tmp_rec); + END IF; + END LOOP; + END IF; + + -- add the quality for this match + FOR tmp_rec IN SELECT * FROM UNNEST(potential_matches) LOOP + tmp_quality := COALESCE((quality_set -> tmp_rec::TEXT)::INT, 0); + quality_set := quality_set || hstore(tmp_rec::TEXT, (tmp_quality + test.quality)::TEXT); + END LOOP; + + END IF; + END LOOP; + + FOR tmp_rec IN SELECT * FROM UNNEST(matches) LOOP + INSERT INTO vandelay.bib_match (matched_set, queued_record, eg_record, quality) VALUES (my_bib_queue.match_set, NEW.id, tmp_rec, (quality_set -> tmp_rec::TEXT)); + END LOOP; +*/ + + RETURN NEW; +END; +$func$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION vandelay.incoming_record_quality ( xml TEXT, match_set_id INT ) RETURNS INT AS $_$ +DECLARE + out_q INT := 0; + rvalue TEXT; + test vandelay.match_set_quality%ROWTYPE; +BEGIN + + FOR test IN SELECT * FROM vandelay.match_set_quality WHERE match_set = match_set_id LOOP + IF test.tag IS NOT NULL THEN + FOR rvalue IN SELECT value FROM vandelay.flatten_marc( xml ) WHERE tag = test.tag AND subfield = test.subfield LOOP + IF test.value = rvalue THEN + out_q := out_q + test.quality; + END IF; + END LOOP; + ELSE + IF test.value = vandelay.extract_rec_attrs(xml, ARRAY[test.svf]) THEN + out_q := out_q + test.quality; + END IF; + END IF; + END LOOP; + + RETURN out_q; +END; +$_$ LANGUAGE PLPGSQL; CREATE TYPE vandelay.tcn_data AS (tcn TEXT, tcn_source TEXT, used BOOL); CREATE OR REPLACE FUNCTION vandelay.find_bib_tcn_data ( xml TEXT ) RETURNS SETOF vandelay.tcn_data AS $_$