WIP: start work on schema
authorGalen Charlton <gmc@esilibrary.com>
Fri, 6 Jan 2017 22:43:34 +0000 (17:43 -0500)
committerGalen Charlton <gmc@esilibrary.com>
Fri, 6 Jan 2017 22:43:34 +0000 (17:43 -0500)
This patch starts work on the configuration tables, seed data,
and functions for extracting headings from authority records based on
(usually) the MARCXML to MADS XSLT.  For the purpose of the WIP, everything
is being stuck in the same upgrade script; this will be broken out later.

TODO:

* verify that MARC and MADS will produce results that are the same
  for uniform titles
* add hooks for assigning normalizers
* see if there is a reasonable way for the MADS stylesheet to
  generate http://www.loc.gov/mads/rdf/v1#authoritativeLabel values
  for display purposes
* work out how to extract standard identifiers in the face of
  the typical 001/035 => 035$a during authority ingest (note that
  the default MARCXML2MADS stylesheet does *not* consider the 035)
* fully populate the seed data
* make per-heading thesaurus extraction smarter. *Most* of the time,
  variant and related headings will be in the same thesaurus as the
  main heading; if they're not, they explicitly will have overrides.

Some implementation notes:

* The authority.variant_heading_type and authority.related_heading_type
  enums are arguably too strict, but may protect us against errors
  later
* The configuration tables are a lie. By this, I mean that authority.heading_field
  will likely embed assumptions that the format will always be MADS-like;
  while the mechanism should be flexible enough to accommodate MADS
  extensions and new headings types, there will undoubtedly be
  edge cases that will live in store procedure code.

Signed-off-by: Galen Charlton <gmc@esilibrary.com>
Open-ILS/src/sql/Pg/upgrade/YYYY.schema.authority.sql [new file with mode: 0644]

diff --git a/Open-ILS/src/sql/Pg/upgrade/YYYY.schema.authority.sql b/Open-ILS/src/sql/Pg/upgrade/YYYY.schema.authority.sql
new file mode 100644 (file)
index 0000000..ed01132
--- /dev/null
@@ -0,0 +1,182 @@
+BEGIN;
+
+-- subset of types listed in https://www.loc.gov/marc/authority/ad1xx3xx.html
+-- for now, ignoring subdivisions
+CREATE TYPE authority.heading_type AS ENUM (
+    'personal_name',
+    'corporate_name',
+    'meeting_name',
+    'uniform_title',
+    'named_event',
+    'chronological_term',
+    'topical_term',
+    'geographic_name',
+    'genre_form_term',
+    'medium_of_performance_term'
+);
+
+CREATE TYPE authority.variant_heading_type AS ENUM (
+    'abbreviation',
+    'acronym',
+    'translation',
+    'expansion',
+    'other',
+    'hidden'
+);
+
+CREATE TYPE authority.related_heading_type AS ENUM (
+    'earlier',
+    'later',
+    'parent organization',
+    'broader',
+    'narrower',
+    'equivalent'
+);
+
+CREATE TYPE authority.heading_purpose AS ENUM (
+    'main',
+    'variant',
+    'related'
+);
+
+CREATE TABLE authority.heading_field (
+    id              SERIAL                      PRIMARY KEY,
+    heading_type    authority.heading_type      NOT NULL,
+    heading_purpose authority.heading_purpose   NOT NULL,
+    label           TEXT                        NOT NULL,
+    format          TEXT                        NOT NULL REFERENCES config.xml_transform (name) DEFAULT 'mads21',
+    heading_xpath   TEXT                        NOT NULL,
+    component_xpath TEXT                        NOT NULL,
+    type_xpath      TEXT                        NULL, -- to extract related or variant type
+    thesaurus_xpath TEXT                        NULL,
+    joiner          TEXT                        NULL
+);
+
+INSERT INTO authority.heading_field(heading_type, heading_purpose, label, heading_xpath, component_xpath, type_xpath, thesaurus_xpath) VALUES
+( 'topical_term', 'main',    'Main Topical Term',    '/mads21:mads/mads21:authority', '//mads21:topic', NULL, '//mads21:topic[1]/@authority' ),
+( 'topical_term', 'variant', 'Variant Topical Term', '/mads21:mads/mads21:variant',   '//mads21:topic', '/mads21:variant/@type', '//mads21:topic[1]/@authority'),
+( 'topical_term', 'related', 'Broader Topical Term', '/mads21:mads/mads21:related',   '//mads21:topic', '/mads21:related/@type', '//mads21:topic[1]/@authority'),
+( 'personal_name', 'main', 'Main Personal name',     '/mads21:mads/mads21:authority', '//mads21:name', NULL, NULL ),
+( 'personal_name', 'variant', 'Variant Personal name',     '/mads21:mads/mads21:variant', '//mads21:name', NULL, NULL )
+;
+
+CREATE TYPE authority.heading AS (
+    type                authority.heading_type,
+    purpose             authority.heading_purpose,
+    variant_type        authority.variant_heading_type,
+    related_type        authority.related_heading_type,
+    thesaurus           TEXT,
+    heading             TEXT,
+    normalized_heading  TEXT
+);
+
+CREATE OR REPLACE FUNCTION authority.extract_headings(rid BIGINT) RETURNS SETOF authority.heading AS $func$
+DECLARE
+    auth        authority.record_entry%ROWTYPE;
+    idx         authority.heading_field%ROWTYPE;
+    xfrm        config.xml_transform%ROWTYPE;
+    prev_xfrm   TEXT;
+    transformed_xml TEXT;
+    heading_node    TEXT;
+    heading_node_list   TEXT[];
+    component_node    TEXT;
+    component_node_list   TEXT[];
+    raw_text    TEXT;
+    curr_text   TEXT;
+    joiner      TEXT;
+    type_value  TEXT;
+    output_row  authority.heading;
+BEGIN
+
+    -- Get the record
+    SELECT INTO auth * FROM authority.record_entry WHERE id = rid;
+
+    -- Loop over the indexing entries
+    FOR idx IN SELECT * FROM authority.heading_field ORDER BY format LOOP
+
+        output_row.type    := idx.heading_type;
+        output_row.purpose := idx.heading_purpose;
+
+        joiner := COALESCE(idx.joiner, ' ');
+
+        SELECT INTO xfrm * from config.xml_transform WHERE name = idx.format;
+
+        -- See if we can skip the XSLT ... it's expensive
+        IF prev_xfrm IS NULL OR prev_xfrm <> xfrm.name THEN
+            -- Can't skip the transform
+            IF xfrm.xslt <> '---' THEN
+                transformed_xml := oils_xslt_process(auth.marc, xfrm.xslt);
+            ELSE
+                transformed_xml := auth.marc;
+            END IF;
+
+            prev_xfrm := xfrm.name;
+        END IF;
+
+        heading_node_list := oils_xpath( idx.heading_xpath, transformed_xml, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] );
+
+        FOR heading_node IN SELECT x FROM unnest(heading_node_list) AS x LOOP
+
+            CONTINUE WHEN heading_node !~ E'^\\s*<';
+
+            output_row.variant_type := NULL;
+            output_row.related_type := NULL;
+            output_row.thesaurus    := NULL;
+            output_row.heading      := NULL;
+
+            IF idx.heading_purpose = 'variant' AND idx.type_xpath IS NOT NULL THEN
+                type_value := ARRAY_TO_STRING(oils_xpath(idx.type_xpath, heading_node, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]]), '');
+                BEGIN
+                    output_row.variant_type := type_value;
+                EXCEPTION WHEN invalid_text_representation THEN
+                    RAISE NOTICE 'Do not recognize variant heading type %', type_value;
+                END;
+            END IF;
+            IF idx.heading_purpose = 'related' AND idx.type_xpath IS NOT NULL THEN
+                type_value := ARRAY_TO_STRING(oils_xpath(idx.type_xpath, heading_node, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]]), '');
+                BEGIN
+                    output_row.related_type := type_value;
+                EXCEPTION WHEN invalid_text_representation THEN
+                    RAISE NOTICE 'Do not recognize related heading type %', type_value;
+                END;
+            END IF;
+            IF idx.thesaurus_xpath IS NOT NULL THEN
+                output_row.thesaurus =  ARRAY_TO_STRING(oils_xpath(idx.thesaurus_xpath, heading_node, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]]), '');
+            END IF;
+
+            raw_text := NULL;
+
+            -- now iterate over components of heading
+            component_node_list := oils_xpath( idx.component_xpath, heading_node, ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] );
+            FOR component_node IN SELECT x FROM unnest(component_node_list) AS x LOOP
+            -- XXX much of this should be moved into oils_xpath_string...
+                curr_text := ARRAY_TO_STRING(evergreen.array_remove_item_by_value(evergreen.array_remove_item_by_value(
+                    oils_xpath( '//text()', -- get the content of all the nodes within the main selected node
+                        REGEXP_REPLACE( component_node, E'\\s+', ' ', 'g' ) -- Translate adjacent whitespace to a single space
+                    ), ' '), ''),  -- throw away morally empty (bankrupt?) strings
+                    joiner
+                );
+
+                CONTINUE WHEN curr_text IS NULL OR curr_text = '';
+
+                IF raw_text IS NOT NULL THEN
+                    raw_text := raw_text || joiner;
+                END IF;
+
+                raw_text := COALESCE(raw_text,'') || curr_text;
+            END LOOP;
+
+            IF raw_text IS NOT NULL THEN
+                output_row.heading = raw_text;
+                RETURN NEXT output_row;
+            END IF;
+        END LOOP;
+
+    END LOOP;
+END;
+$func$ LANGUAGE PLPGSQL;
+
+SELECT * FROM authority.extract_headings(152);
+SELECT * FROM authority.extract_headings(164);
+SELECT * FROM authority.extract_headings(149);