From: Galen Charlton Date: Fri, 6 Jan 2017 22:43:34 +0000 (-0500) Subject: WIP: start work on schema X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=ba821441329081fbe67d34e5dfc6cf8e44fa789a;p=working%2FEvergreen.git WIP: start work on schema This patch starts work on the configuration tables, seed data, and functions for extracting headings from authority records based on (usually) the MARCXML to MADS XSLT. For the purpose of the WIP, everything is being stuck in the same upgrade script; this will be broken out later. TODO: * verify that MARC and MADS will produce results that are the same for uniform titles * add hooks for assigning normalizers * see if there is a reasonable way for the MADS stylesheet to generate http://www.loc.gov/mads/rdf/v1#authoritativeLabel values for display purposes * work out how to extract standard identifiers in the face of the typical 001/035 => 035$a during authority ingest (note that the default MARCXML2MADS stylesheet does *not* consider the 035) * fully populate the seed data * make per-heading thesaurus extraction smarter. *Most* of the time, variant and related headings will be in the same thesaurus as the main heading; if they're not, they explicitly will have overrides. Some implementation notes: * The authority.variant_heading_type and authority.related_heading_type enums are arguably too strict, but may protect us against errors later * The configuration tables are a lie. By this, I mean that authority.heading_field will likely embed assumptions that the format will always be MADS-like; while the mechanism should be flexible enough to accommodate MADS extensions and new headings types, there will undoubtedly be edge cases that will live in store procedure code. Signed-off-by: Galen Charlton --- diff --git a/Open-ILS/src/sql/Pg/upgrade/YYYY.schema.authority.sql b/Open-ILS/src/sql/Pg/upgrade/YYYY.schema.authority.sql new file mode 100644 index 0000000000..ed0113228d --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/YYYY.schema.authority.sql @@ -0,0 +1,182 @@ +BEGIN; + +-- subset of types listed in https://www.loc.gov/marc/authority/ad1xx3xx.html +-- for now, ignoring subdivisions +CREATE TYPE authority.heading_type AS ENUM ( + 'personal_name', + 'corporate_name', + 'meeting_name', + 'uniform_title', + 'named_event', + 'chronological_term', + 'topical_term', + 'geographic_name', + 'genre_form_term', + 'medium_of_performance_term' +); + +CREATE TYPE authority.variant_heading_type AS ENUM ( + 'abbreviation', + 'acronym', + 'translation', + 'expansion', + 'other', + 'hidden' +); + +CREATE TYPE authority.related_heading_type AS ENUM ( + 'earlier', + 'later', + 'parent organization', + 'broader', + 'narrower', + 'equivalent' +); + +CREATE TYPE authority.heading_purpose AS ENUM ( + 'main', + 'variant', + 'related' +); + +CREATE TABLE authority.heading_field ( + id SERIAL PRIMARY KEY, + heading_type authority.heading_type NOT NULL, + heading_purpose authority.heading_purpose NOT NULL, + label TEXT NOT NULL, + format TEXT NOT NULL REFERENCES config.xml_transform (name) DEFAULT 'mads21', + heading_xpath TEXT NOT NULL, + component_xpath TEXT NOT NULL, + type_xpath TEXT NULL, -- to extract related or variant type + thesaurus_xpath TEXT NULL, + joiner TEXT NULL +); + +INSERT INTO authority.heading_field(heading_type, heading_purpose, label, heading_xpath, component_xpath, type_xpath, thesaurus_xpath) VALUES +( 'topical_term', 'main', 'Main Topical Term', '/mads21:mads/mads21:authority', '//mads21:topic', NULL, '//mads21:topic[1]/@authority' ), +( 'topical_term', 'variant', 'Variant Topical Term', '/mads21:mads/mads21:variant', '//mads21:topic', '/mads21:variant/@type', '//mads21:topic[1]/@authority'), +( 'topical_term', 'related', 'Broader Topical Term', '/mads21:mads/mads21:related', '//mads21:topic', '/mads21:related/@type', '//mads21:topic[1]/@authority'), +( 'personal_name', 'main', 'Main Personal name', '/mads21:mads/mads21:authority', '//mads21:name', NULL, NULL ), +( 'personal_name', 'variant', 'Variant Personal name', '/mads21:mads/mads21:variant', '//mads21:name', NULL, NULL ) +; + +CREATE TYPE authority.heading AS ( + type authority.heading_type, + purpose authority.heading_purpose, + variant_type authority.variant_heading_type, + related_type authority.related_heading_type, + thesaurus TEXT, + heading TEXT, + normalized_heading TEXT +); + +CREATE OR REPLACE FUNCTION authority.extract_headings(rid BIGINT) RETURNS SETOF authority.heading AS $func$ +DECLARE + auth authority.record_entry%ROWTYPE; + idx authority.heading_field%ROWTYPE; + xfrm config.xml_transform%ROWTYPE; + prev_xfrm TEXT; + transformed_xml TEXT; + heading_node TEXT; + heading_node_list TEXT[]; + component_node TEXT; + component_node_list TEXT[]; + raw_text TEXT; + curr_text TEXT; + joiner TEXT; + type_value TEXT; + output_row authority.heading; +BEGIN + + -- Get the record + SELECT INTO auth * FROM authority.record_entry WHERE id = rid; + + -- Loop over the indexing entries + FOR idx IN SELECT * FROM authority.heading_field ORDER BY format LOOP + + output_row.type := idx.heading_type; + output_row.purpose := idx.heading_purpose; + + joiner := COALESCE(idx.joiner, ' '); + + SELECT INTO xfrm * from config.xml_transform WHERE name = idx.format; + + -- See if we can skip the XSLT ... it's expensive + IF prev_xfrm IS NULL OR prev_xfrm <> xfrm.name THEN + -- Can't skip the transform + IF xfrm.xslt <> '---' THEN + transformed_xml := oils_xslt_process(auth.marc, xfrm.xslt); + ELSE + transformed_xml := auth.marc; + END IF; + + prev_xfrm := xfrm.name; + END IF; + + heading_node_list := oils_xpath( idx.heading_xpath, transformed_xml, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + + FOR heading_node IN SELECT x FROM unnest(heading_node_list) AS x LOOP + + CONTINUE WHEN heading_node !~ E'^\\s*<'; + + output_row.variant_type := NULL; + output_row.related_type := NULL; + output_row.thesaurus := NULL; + output_row.heading := NULL; + + IF idx.heading_purpose = 'variant' AND idx.type_xpath IS NOT NULL THEN + type_value := ARRAY_TO_STRING(oils_xpath(idx.type_xpath, heading_node, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]]), ''); + BEGIN + output_row.variant_type := type_value; + EXCEPTION WHEN invalid_text_representation THEN + RAISE NOTICE 'Do not recognize variant heading type %', type_value; + END; + END IF; + IF idx.heading_purpose = 'related' AND idx.type_xpath IS NOT NULL THEN + type_value := ARRAY_TO_STRING(oils_xpath(idx.type_xpath, heading_node, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]]), ''); + BEGIN + output_row.related_type := type_value; + EXCEPTION WHEN invalid_text_representation THEN + RAISE NOTICE 'Do not recognize related heading type %', type_value; + END; + END IF; + + IF idx.thesaurus_xpath IS NOT NULL THEN + output_row.thesaurus = ARRAY_TO_STRING(oils_xpath(idx.thesaurus_xpath, heading_node, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]]), ''); + END IF; + + raw_text := NULL; + + -- now iterate over components of heading + component_node_list := oils_xpath( idx.component_xpath, heading_node, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + FOR component_node IN SELECT x FROM unnest(component_node_list) AS x LOOP + -- XXX much of this should be moved into oils_xpath_string... + curr_text := ARRAY_TO_STRING(evergreen.array_remove_item_by_value(evergreen.array_remove_item_by_value( + oils_xpath( '//text()', -- get the content of all the nodes within the main selected node + REGEXP_REPLACE( component_node, E'\\s+', ' ', 'g' ) -- Translate adjacent whitespace to a single space + ), ' '), ''), -- throw away morally empty (bankrupt?) strings + joiner + ); + + CONTINUE WHEN curr_text IS NULL OR curr_text = ''; + + IF raw_text IS NOT NULL THEN + raw_text := raw_text || joiner; + END IF; + + raw_text := COALESCE(raw_text,'') || curr_text; + END LOOP; + + IF raw_text IS NOT NULL THEN + output_row.heading = raw_text; + RETURN NEXT output_row; + END IF; + END LOOP; + + END LOOP; +END; +$func$ LANGUAGE PLPGSQL; + +SELECT * FROM authority.extract_headings(152); +SELECT * FROM authority.extract_headings(164); +SELECT * FROM authority.extract_headings(149);