From f6f69bfc2f62a1c5c4dfa2837307950d414a98e6 Mon Sep 17 00:00:00 2001 From: Galen Charlton Date: Thu, 31 Aug 2017 18:30:14 -0400 Subject: [PATCH] LP#1638299: reify schema and seed data for new databases Signed-off-by: Galen Charlton Signed-off-by: Kathy Lussier --- Open-ILS/src/sql/Pg/011.schema.authority.sql | 225 ++++++++++++++++++++++++++- Open-ILS/src/sql/Pg/950.data.seed-values.sql | 186 ++++++++++++++++++++++ Open-ILS/src/sql/Pg/955.data.MADS21-xsl.sql | 5 +- Open-ILS/src/sql/Pg/sql_file_manifest | 1 + 4 files changed, 414 insertions(+), 3 deletions(-) diff --git a/Open-ILS/src/sql/Pg/011.schema.authority.sql b/Open-ILS/src/sql/Pg/011.schema.authority.sql index 9afb840004..4387417973 100644 --- a/Open-ILS/src/sql/Pg/011.schema.authority.sql +++ b/Open-ILS/src/sql/Pg/011.schema.authority.sql @@ -22,6 +22,79 @@ DROP SCHEMA IF EXISTS authority CASCADE; BEGIN; CREATE SCHEMA authority; +-- subset of types listed in https://www.loc.gov/marc/authority/ad1xx3xx.html +-- for now, ignoring subdivisions +CREATE TYPE authority.heading_type AS ENUM ( + 'personal_name', + 'corporate_name', + 'meeting_name', + 'uniform_title', + 'named_event', + 'chronological_term', + 'topical_term', + 'geographic_name', + 'genre_form_term', + 'medium_of_performance_term' +); + +CREATE TYPE authority.variant_heading_type AS ENUM ( + 'abbreviation', + 'acronym', + 'translation', + 'expansion', + 'other', + 'hidden' +); + +CREATE TYPE authority.related_heading_type AS ENUM ( + 'earlier', + 'later', + 'parent organization', + 'broader', + 'narrower', + 'equivalent', + 'other' +); + +CREATE TYPE authority.heading_purpose AS ENUM ( + 'main', + 'variant', + 'related' +); + +CREATE TABLE authority.heading_field ( + id SERIAL PRIMARY KEY, + heading_type authority.heading_type NOT NULL, + heading_purpose authority.heading_purpose NOT NULL, + label TEXT NOT NULL, + format TEXT NOT NULL REFERENCES config.xml_transform (name) DEFAULT 'mads21', + heading_xpath TEXT NOT NULL, + component_xpath TEXT NOT NULL, + type_xpath TEXT NULL, -- to extract related or variant type + thesaurus_xpath TEXT NULL, + thesaurus_override_xpath TEXT NULL, + joiner TEXT NULL +); + +CREATE TABLE authority.heading_field_norm_map ( + id SERIAL PRIMARY KEY, + field INT NOT NULL REFERENCES authority.heading_field (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + norm INT NOT NULL REFERENCES config.index_normalizer (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED, + params TEXT, + pos INT NOT NULL DEFAULT 0 +); + +CREATE TYPE authority.heading AS ( + field INT, + type authority.heading_type, + purpose authority.heading_purpose, + variant_type authority.variant_heading_type, + related_type authority.related_heading_type, + thesaurus TEXT, + heading TEXT, + normalized_heading TEXT +); + CREATE TABLE authority.control_set ( id SERIAL PRIMARY KEY, name TEXT NOT NULL UNIQUE, -- i18n @@ -39,7 +112,8 @@ CREATE TABLE authority.control_set_authority_field ( name TEXT NOT NULL, -- i18n description TEXT, -- i18n joiner TEXT, - linking_subfield CHAR(1) + linking_subfield CHAR(1), + heading_field INTEGER REFERENCES authority.heading_field(id) ); CREATE TABLE authority.control_set_bib_field ( @@ -971,6 +1045,155 @@ CREATE OR REPLACE FUNCTION authority.atag_search_heading_refs( a TEXT, q TEXT, p SELECT * FROM authority.simple_heading_search_heading(authority.atag_authority_tags_refs($1), $2, $3, $4, $5) $$ LANGUAGE SQL ROWS 10; +CREATE OR REPLACE FUNCTION authority.extract_headings(marc TEXT, restrict INT[] DEFAULT NULL) RETURNS SETOF authority.heading AS $func$ +DECLARE + idx authority.heading_field%ROWTYPE; + xfrm config.xml_transform%ROWTYPE; + prev_xfrm TEXT; + transformed_xml TEXT; + heading_node TEXT; + heading_node_list TEXT[]; + component_node TEXT; + component_node_list TEXT[]; + raw_text TEXT; + normalized_text TEXT; + normalizer RECORD; + curr_text TEXT; + joiner TEXT; + type_value TEXT; + base_thesaurus TEXT := NULL; + output_row authority.heading; +BEGIN + + -- Loop over the indexing entries + FOR idx IN SELECT * FROM authority.heading_field WHERE restrict IS NULL OR id = ANY (restrict) ORDER BY format LOOP + + output_row.field := idx.id; + output_row.type := idx.heading_type; + output_row.purpose := idx.heading_purpose; + + joiner := COALESCE(idx.joiner, ' '); + + SELECT INTO xfrm * from config.xml_transform WHERE name = idx.format; + + -- See if we can skip the XSLT ... it's expensive + IF prev_xfrm IS NULL OR prev_xfrm <> xfrm.name THEN + -- Can't skip the transform + IF xfrm.xslt <> '---' THEN + transformed_xml := oils_xslt_process(marc, xfrm.xslt); + ELSE + transformed_xml := marc; + END IF; + + prev_xfrm := xfrm.name; + END IF; + + IF idx.thesaurus_xpath IS NOT NULL THEN + base_thesaurus := ARRAY_TO_STRING(oils_xpath(idx.thesaurus_xpath, transformed_xml, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]]), ''); + END IF; + + heading_node_list := oils_xpath( idx.heading_xpath, transformed_xml, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + + FOR heading_node IN SELECT x FROM unnest(heading_node_list) AS x LOOP + + CONTINUE WHEN heading_node !~ E'^\\s*<'; + + output_row.variant_type := NULL; + output_row.related_type := NULL; + output_row.thesaurus := NULL; + output_row.heading := NULL; + + IF idx.heading_purpose = 'variant' AND idx.type_xpath IS NOT NULL THEN + type_value := ARRAY_TO_STRING(oils_xpath(idx.type_xpath, heading_node, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]]), ''); + BEGIN + output_row.variant_type := type_value; + EXCEPTION WHEN invalid_text_representation THEN + RAISE NOTICE 'Do not recognize variant heading type %', type_value; + END; + END IF; + IF idx.heading_purpose = 'related' AND idx.type_xpath IS NOT NULL THEN + type_value := ARRAY_TO_STRING(oils_xpath(idx.type_xpath, heading_node, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]]), ''); + BEGIN + output_row.related_type := type_value; + EXCEPTION WHEN invalid_text_representation THEN + RAISE NOTICE 'Do not recognize related heading type %', type_value; + END; + END IF; + + IF idx.thesaurus_override_xpath IS NOT NULL THEN + output_row.thesaurus := ARRAY_TO_STRING(oils_xpath(idx.thesaurus_override_xpath, heading_node, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]]), ''); + END IF; + IF output_row.thesaurus IS NULL THEN + output_row.thesaurus := base_thesaurus; + END IF; + + raw_text := NULL; + + -- now iterate over components of heading + component_node_list := oils_xpath( idx.component_xpath, heading_node, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] ); + FOR component_node IN SELECT x FROM unnest(component_node_list) AS x LOOP + -- XXX much of this should be moved into oils_xpath_string... + curr_text := ARRAY_TO_STRING(evergreen.array_remove_item_by_value(evergreen.array_remove_item_by_value( + oils_xpath( '//text()', -- get the content of all the nodes within the main selected node + REGEXP_REPLACE( component_node, E'\\s+', ' ', 'g' ) -- Translate adjacent whitespace to a single space + ), ' '), ''), -- throw away morally empty (bankrupt?) strings + joiner + ); + + CONTINUE WHEN curr_text IS NULL OR curr_text = ''; + + IF raw_text IS NOT NULL THEN + raw_text := raw_text || joiner; + END IF; + + raw_text := COALESCE(raw_text,'') || curr_text; + END LOOP; + + IF raw_text IS NOT NULL THEN + output_row.heading := raw_text; + normalized_text := raw_text; + + FOR normalizer IN + SELECT n.func AS func, + n.param_count AS param_count, + m.params AS params + FROM config.index_normalizer n + JOIN authority.heading_field_norm_map m ON (m.norm = n.id) + WHERE m.field = idx.id + ORDER BY m.pos LOOP + + EXECUTE 'SELECT ' || normalizer.func || '(' || + quote_literal( normalized_text ) || + CASE + WHEN normalizer.param_count > 0 + THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') + ELSE '' + END || + ')' INTO normalized_text; + + END LOOP; + + output_row.normalized_heading := normalized_text; + + RETURN NEXT output_row; + END IF; + END LOOP; + + END LOOP; +END; +$func$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION authority.extract_headings(rid BIGINT, restrict INT[] DEFAULT NULL) RETURNS SETOF authority.heading AS $func$ +DECLARE + auth authority.record_entry%ROWTYPE; + output_row authority.heading; +BEGIN + -- Get the record + SELECT INTO auth * FROM authority.record_entry WHERE id = rid; + + RETURN QUERY SELECT * FROM authority.extract_headings(auth.marc, restrict); +END; +$func$ LANGUAGE PLPGSQL; COMMIT; diff --git a/Open-ILS/src/sql/Pg/950.data.seed-values.sql b/Open-ILS/src/sql/Pg/950.data.seed-values.sql index 3ff267542e..cbb08fb7ac 100644 --- a/Open-ILS/src/sql/Pg/950.data.seed-values.sql +++ b/Open-ILS/src/sql/Pg/950.data.seed-values.sql @@ -111,6 +111,7 @@ INSERT INTO config.xml_transform VALUES ( 'mods3', 'http://www.loc.gov/mods/v3', INSERT INTO config.xml_transform VALUES ( 'mods32', 'http://www.loc.gov/mods/v3', 'mods32', ''); INSERT INTO config.xml_transform VALUES ( 'mods33', 'http://www.loc.gov/mods/v3', 'mods33', ''); INSERT INTO config.xml_transform VALUES ( 'marc21expand880', 'http://www.loc.gov/MARC21/slim', 'marc', '' ); +INSERT INTO config.xml_transform VALUES ( 'mads21', 'http://www.loc.gov/mads/v2', 'mads21', '' ); -- Index Definitions INSERT INTO config.metabib_field ( id, field_class, name, label, format, xpath, facet_field, authority_xpath, browse_field ) VALUES @@ -17772,3 +17773,188 @@ INSERT INTO config.i18n_core (fq_field, identity_value, translation, string ) AND t.xlate IS NOT NULL AND t.name <> (xlate->value); +INSERT INTO authority.heading_field(heading_type, heading_purpose, label, heading_xpath, component_xpath, type_xpath, thesaurus_xpath, thesaurus_override_xpath) VALUES + ( 'topical_term', 'main', 'Main Topical Term', '/mads21:mads/mads21:authority', '//mads21:topic', NULL, '/mads21:mads/mads21:authority/mads21:topic[1]/@authority', NULL ) +,( 'topical_term', 'variant', 'Variant Topical Term', '/mads21:mads/mads21:variant', '//mads21:topic', '/mads21:variant/@type', '/mads21:mads/mads21:authority/mads21:topic[1]/@authority', '//mads21:topic[1]/@authority') +,( 'topical_term', 'related', 'Related Topical Term', '/mads21:mads/mads21:related', '//mads21:topic', '/mads21:related/@type', '/mads21:mads/mads21:authority/mads21:topic[1]/@authority', '//mads21:topic[1]/@authority') +,( 'personal_name', 'main', 'Main Personal Name', '/mads21:mads/mads21:authority', '//mads21:name[@type="personal"]', NULL, NULL, NULL ) +,( 'personal_name', 'variant', 'Variant Personal Name', '/mads21:mads/mads21:variant', '//mads21:name[@type="personal"]', NULL, NULL, NULL ) +,( 'personal_name', 'related', 'Related Personal Name', '/mads21:mads/mads21:related', '//mads21:name[@type="personal"]', '/mads21:related/@type', NULL, NULL ) +,( 'corporate_name', 'main', 'Main Corporate name', '/mads21:mads/mads21:authority', '//mads21:name[@type="corporate"]', NULL, NULL, NULL ) +,( 'corporate_name', 'variant', 'Variant Corporate Name', '/mads21:mads/mads21:variant', '//mads21:name[@type="corporate"]', NULL, NULL, NULL ) +,( 'corporate_name', 'related', 'Related Corporate Name', '/mads21:mads/mads21:related', '//mads21:name[@type="corporate"]', '/mads21:related/@type', NULL, NULL ) +,( 'meeting_name', 'main', 'Main Meeting name', '/mads21:mads/mads21:authority', '//mads21:name[@type="conference"]', NULL, NULL, NULL ) +,( 'meeting_name', 'variant', 'Variant Meeting Name', '/mads21:mads/mads21:variant', '//mads21:name[@type="conference"]', NULL, NULL, NULL ) +,( 'meeting_name', 'related', 'Related Meeting Name', '/mads21:mads/mads21:related', '//mads21:name[@type="meeting"]', '/mads21:related/@type', NULL, NULL ) +,( 'geographic_name', 'main', 'Main Geographic Term', '/mads21:mads/mads21:authority', '//mads21:geographic', NULL, '/mads21:mads/mads21:authority/mads21:geographic[1]/@authority', NULL ) +,( 'geographic_name', 'variant', 'Variant Geographic Term', '/mads21:mads/mads21:variant', '//mads21:geographic', '/mads21:variant/@type', '/mads21:mads/mads21:authority/mads21:geographic[1]/@authority', '//mads21:geographic[1]/@authority') +,( 'geographic_name', 'related', 'Related Geographic Term', '/mads21:mads/mads21:related', '//mads21:geographic', '/mads21:related/@type', '/mads21:mads/mads21:authority/mads21:geographic[1]/@authority', '//mads21:geographic[1]/@authority') +,( 'genre_form_term', 'main', 'Main Genre/Form Term', '/mads21:mads/mads21:authority', '//mads21:genre', NULL, '/mads21:mads/mads21:authority/mads21:genre[1]/@authority', NULL ) +,( 'genre_form_term', 'variant', 'Variant Genre/Form Term', '/mads21:mads/mads21:variant', '//mads21:genre', '/mads21:variant/@type', '/mads21:mads/mads21:authority/mads21:genre[1]/@authority', '//mads21:genre[1]/@authority') +,( 'genre_form_term', 'related', 'Related Genre/Form Term', '/mads21:mads/mads21:related', '//mads21:genre', '/mads21:related/@type', '/mads21:mads/mads21:authority/mads21:genre[1]/@authority', '//mads21:genre[1]/@authority') +,( 'chronological_term', 'main', 'Main Chronological Term', '/mads21:mads/mads21:authority', '//mads21:temporal', NULL, '/mads21:mads/mads21:authority/mads21:temporal[1]/@authority', NULL ) +,( 'chronological_term', 'variant', 'Variant Chronological Term', '/mads21:mads/mads21:variant', '//mads21:temporal', '/mads21:variant/@type', '/mads21:mads/mads21:authority/mads21:temporal[1]/@authority', '//mads21:temporal[1]/@authority') +,( 'chronological_term', 'related', 'Related Chronological Term', '/mads21:mads/mads21:related', '//mads21:temporal', '/mads21:related/@type', '/mads21:mads/mads21:authority/mads21:temporal[1]/@authority', '//mads21:temporal[1]/@authority') +,( 'uniform_title', 'main', 'Main Uniform Title', '/mads21:mads/mads21:authority', '//mads21:title', NULL, '/mads21:mads/mads21:authority/mads21:title[1]/@authority', NULL ) +,( 'uniform_title', 'variant', 'Variant Uniform Title', '/mads21:mads/mads21:variant', '//mads21:title', '/mads21:variant/@type', '/mads21:mads/mads21:authority/mads21:title[1]/@authority', '//mads21:title[1]/@authority') +,( 'uniform_title', 'related', 'Related Uniform Title', '/mads21:mads/mads21:related', '//mads21:title', '/mads21:related/@type', '/mads21:mads/mads21:authority/mads21:title[1]/@authority', '//mads21:title[1]/@authority') +; + +-- NACO normalize all the things +INSERT INTO authority.heading_field_norm_map (field, norm, pos) +SELECT id, 1, 0 +FROM authority.heading_field; + +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '100' +AND control_set = 1 +AND ahf.heading_purpose = 'main' +AND ahf.heading_type = 'personal_name'; +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '400' +AND control_set = 1 +AND ahf.heading_purpose = 'variant' +AND ahf.heading_type = 'personal_name'; +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '500' +AND control_set = 1 +AND ahf.heading_purpose = 'related' +AND ahf.heading_type = 'personal_name'; + +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '110' +AND control_set = 1 +AND ahf.heading_purpose = 'main' +AND ahf.heading_type = 'corporate_name'; +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '410' +AND control_set = 1 +AND ahf.heading_purpose = 'variant' +AND ahf.heading_type = 'corporate_name'; +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '510' +AND control_set = 1 +AND ahf.heading_purpose = 'related' +AND ahf.heading_type = 'corporate_name'; + +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '111' +AND control_set = 1 +AND ahf.heading_purpose = 'main' +AND ahf.heading_type = 'meeting_name'; +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '411' +AND control_set = 1 +AND ahf.heading_purpose = 'variant' +AND ahf.heading_type = 'meeting_name'; +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '511' +AND control_set = 1 +AND ahf.heading_purpose = 'related' +AND ahf.heading_type = 'meeting_name'; + +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '130' +AND control_set = 1 +AND ahf.heading_purpose = 'main' +AND ahf.heading_type = 'uniform_title'; +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '430' +AND control_set = 1 +AND ahf.heading_purpose = 'variant' +AND ahf.heading_type = 'uniform_title'; +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '530' +AND control_set = 1 +AND ahf.heading_purpose = 'related' +AND ahf.heading_type = 'uniform_title'; + +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '150' +AND control_set = 1 +AND ahf.heading_purpose = 'main' +AND ahf.heading_type = 'topical_term'; +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '450' +AND control_set = 1 +AND ahf.heading_purpose = 'variant' +AND ahf.heading_type = 'topical_term'; +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '550' +AND control_set = 1 +AND ahf.heading_purpose = 'related' +AND ahf.heading_type = 'topical_term'; + +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '151' +AND control_set = 1 +AND ahf.heading_purpose = 'main' +AND ahf.heading_type = 'geographic_name'; +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '451' +AND control_set = 1 +AND ahf.heading_purpose = 'variant' +AND ahf.heading_type = 'geographic_name'; +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '551' +AND control_set = 1 +AND ahf.heading_purpose = 'related' +AND ahf.heading_type = 'geographic_name'; + +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '155' +AND control_set = 1 +AND ahf.heading_purpose = 'main' +AND ahf.heading_type = 'genre_form_term'; +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '455' +AND control_set = 1 +AND ahf.heading_purpose = 'variant' +AND ahf.heading_type = 'genre_form_term'; +UPDATE authority.control_set_authority_field acsaf +SET heading_field = ahf.id +FROM authority.heading_field ahf +WHERE tag = '555' +AND control_set = 1 +AND ahf.heading_purpose = 'related' +AND ahf.heading_type = 'genre_form_term'; diff --git a/Open-ILS/src/sql/Pg/955.data.MADS21-xsl.sql b/Open-ILS/src/sql/Pg/955.data.MADS21-xsl.sql index 225be32e52..94fb59a32b 100644 --- a/Open-ILS/src/sql/Pg/955.data.MADS21-xsl.sql +++ b/Open-ILS/src/sql/Pg/955.data.MADS21-xsl.sql @@ -1,4 +1,4 @@ -INSERT INTO config.xml_transform (name,namespace_uri,prefix,xslt) VALUES ('mads21','http://www.loc.gov/mads/v2','mads21',$XSLT$ +UPDATE config.xml_transform SET xslt=$XSLT$ @@ -1621,4 +1621,5 @@ INSERT INTO config.xml_transform (name,namespace_uri,prefix,xslt) VALUES ('mads2 -$XSLT$); +$XSLT$ +WHERE name = 'mads21';; diff --git a/Open-ILS/src/sql/Pg/sql_file_manifest b/Open-ILS/src/sql/Pg/sql_file_manifest index 9b16ab1f2d..6e01622c46 100644 --- a/Open-ILS/src/sql/Pg/sql_file_manifest +++ b/Open-ILS/src/sql/Pg/sql_file_manifest @@ -56,6 +56,7 @@ FTS_CONFIG_FILE 953.data.MODS32-xsl.sql 954.data.MODS33-xsl.sql 954.data.marc21expand880.sql +955.data.MADS21-xsl.sql 960.data.marc21-tag-tables.sql -- 2.11.0