From f8a36572a9d8515c3a1f594795751f95e3d13323 Mon Sep 17 00:00:00 2001 From: dbs Date: Wed, 15 Sep 2010 18:53:01 +0000 Subject: [PATCH] Create a unique index on authority records based on their heading, thesaurus, and heading text By providing a truly unique index for headings for a given thesaurus, this index sets the stage for removing the arn_value / arn_source columns from the database. Sites with loaded authority records who attempt to add this index may find that it fails due to existing duplicate entries; some suggestions for hunting down the trouble-doers (duplicate entries) is provided in Open-ILS/src/sql/Pg/0400.schema.unique_authority_index.sql git-svn-id: svn://svn.open-ils.org/ILS/trunk@17704 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- Open-ILS/src/sql/Pg/002.schema.config.sql | 2 +- Open-ILS/src/sql/Pg/020.schema.functions.sql | 62 ++++++++++++++ Open-ILS/src/sql/Pg/800.fkeys.sql | 2 + .../upgrade/0400.schema.unique_authority_index.sql | 96 ++++++++++++++++++++++ 4 files changed, 161 insertions(+), 1 deletion(-) create mode 100644 Open-ILS/src/sql/Pg/upgrade/0400.schema.unique_authority_index.sql diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql index 1ce11ca55..7a23e09cc 100644 --- a/Open-ILS/src/sql/Pg/002.schema.config.sql +++ b/Open-ILS/src/sql/Pg/002.schema.config.sql @@ -68,7 +68,7 @@ CREATE TABLE config.upgrade_log ( install_date TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() ); -INSERT INTO config.upgrade_log (version) VALUES ('0399'); -- miker +INSERT INTO config.upgrade_log (version) VALUES ('0400'); -- dbs CREATE TABLE config.bib_source ( id SERIAL PRIMARY KEY, diff --git a/Open-ILS/src/sql/Pg/020.schema.functions.sql b/Open-ILS/src/sql/Pg/020.schema.functions.sql index 6c6738561..536b57ffb 100644 --- a/Open-ILS/src/sql/Pg/020.schema.functions.sql +++ b/Open-ILS/src/sql/Pg/020.schema.functions.sql @@ -257,3 +257,65 @@ COMMENT ON FUNCTION actor.org_unit_ancestor_setting( TEXT, INT) IS $$ */ $$; +-- Intended to be used in a unique index on authority.record_entry like so: +-- CREATE UNIQUE INDEX unique_by_heading_and_thesaurus +-- ON authority.record_entry (authority.normalize_heading(marc)) +-- WHERE deleted IS FALSE or deleted = FALSE; +CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$ + use strict; + use warnings; + use MARC::Record; + use MARC::File::XML (BinaryEncoding => 'UTF8'); + + my $xml = shift(); + my $r = MARC::Record->new_from_xml( $xml ); + return undef unless ($r); + + # From http://www.loc.gov/standards/sourcelist/subject.html + my $thes_code_map = { + a => 'lcsh', + b => 'lcshac', + c => 'mesh', + d => 'nal', + k => 'cash', + n => 'notapplicable', + r => 'aat', + s => 'sears', + v => 'rvm', + }; + + # Default to "No attempt to code" if the leader is horribly broken + my $thes_char = substr($r->field('008')->data(), 11, 1) || '|'; + + my $thes_code = 'UNDEFINED'; + + if ($thes_char eq 'z') { + # Grab the 040 $f per http://www.loc.gov/marc/authority/ad040.html + $thes_code = $r->subfield('040', 'f') || 'UNDEFINED'; + } elsif ($thes_code_map->{$thes_char}) { + $thes_code = $thes_code_map->{$thes_char}; + } + + my $head = $r->field('1..'); + my $auth_txt = ''; + foreach my $sf ($head->subfields()) { + $auth_txt .= $sf->[1]; + } + + + # Perhaps better to parameterize the spi and pass as a parameter + $auth_txt =~ s/'//go; + my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text"); + my $norm_txt = $result->{rows}[0]->{norm_text}; + + return $head->tag() . "_" . $thes_code . " " . $norm_txt; +$func$ LANGUAGE 'plperlu' IMMUTABLE; + +COMMENT ON FUNCTION authority.normalize_heading( TEXT ) IS $$ +/** +* Extract the authority heading, thesaurus, and NACO-normalized values +* from an authority record. The primary purpose is to build a unique +* index to defend against duplicated authority records from the same +* thesaurus. +*/ +$$; diff --git a/Open-ILS/src/sql/Pg/800.fkeys.sql b/Open-ILS/src/sql/Pg/800.fkeys.sql index 2406e036d..e7e9bb32d 100644 --- a/Open-ILS/src/sql/Pg/800.fkeys.sql +++ b/Open-ILS/src/sql/Pg/800.fkeys.sql @@ -117,4 +117,6 @@ ALTER TABLE config.remote_account ADD CONSTRAINT config_remote_account_owner_fke ALTER TABLE config.org_unit_setting_type ADD CONSTRAINT view_perm_fkey FOREIGN KEY (view_perm) REFERENCES permission.perm_list (id) ON UPDATE CASCADE ON DELETE RESTRICT DEFERRABLE INITIALLY DEFERRED; ALTER TABLE config.org_unit_setting_type ADD CONSTRAINT update_perm_fkey FOREIGN KEY (update_perm) REFERENCES permission.perm_list (id) ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED; +CREATE UNIQUE INDEX unique_by_heading_and_thesaurus ON authority.record_entry (authority.normalize_heading(marc)) WHERE deleted IS FALSE or deleted = FALSE; + COMMIT; diff --git a/Open-ILS/src/sql/Pg/upgrade/0400.schema.unique_authority_index.sql b/Open-ILS/src/sql/Pg/upgrade/0400.schema.unique_authority_index.sql new file mode 100644 index 000000000..7a555bd90 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/0400.schema.unique_authority_index.sql @@ -0,0 +1,96 @@ +BEGIN; + +INSERT INTO config.upgrade_log (version) VALUES ('0400'); -- dbs + +CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$ + use strict; + use warnings; + use MARC::Record; + use MARC::File::XML (BinaryEncoding => 'UTF8'); + + my $xml = shift(); + my $r = MARC::Record->new_from_xml( $xml ); + return undef unless ($r); + + # From http://www.loc.gov/standards/sourcelist/subject.html + my $thes_code_map = { + a => 'lcsh', + b => 'lcshac', + c => 'mesh', + d => 'nal', + k => 'cash', + n => 'notapplicable', + r => 'aat', + s => 'sears', + v => 'rvm', + }; + + # Default to "No attempt to code" if the leader is horribly broken + my $thes_char = substr($r->field('008')->data(), 11, 1) || '|'; + + my $thes_code = 'UNDEFINED'; + + if ($thes_char eq 'z') { + # Grab the 040 $f per http://www.loc.gov/marc/authority/ad040.html + $thes_code = $r->subfield('040', 'f') || 'UNDEFINED'; + } elsif ($thes_code_map->{$thes_char}) { + $thes_code = $thes_code_map->{$thes_char}; + } + + my $head = $r->field('1..'); + my $auth_txt = ''; + foreach my $sf ($head->subfields()) { + $auth_txt .= $sf->[1]; + } + + + # Perhaps better to parameterize the spi and pass as a parameter + $auth_txt =~ s/'//go; + my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text"); + my $norm_txt = $result->{rows}[0]->{norm_text}; + + return $head->tag() . "_" . $thes_code . " " . $norm_txt; +$func$ LANGUAGE 'plperlu' IMMUTABLE; + +COMMENT ON FUNCTION authority.normalize_heading( TEXT ) IS $$ +/** +* Extract the authority heading, thesaurus, and NACO-normalized values +* from an authority record. The primary purpose is to build a unique +* index to defend against duplicated authority records from the same +* thesaurus. +*/ +$$; + +COMMIT; + +-- Do this outside of a transaction to avoid failure if duplicate +-- authority heading / thesaurus / heading text entries already +-- exist in the database: +CREATE UNIQUE INDEX unique_by_heading_and_thesaurus + ON authority.record_entry (authority.normalize_heading(marc)) + WHERE deleted IS FALSE or deleted = FALSE +; + +-- If the unique index fails, uncomment the following to create +-- a regular index that will help find the duplicates in a hurry: +--CREATE INDEX by_heading_and_thesaurus +-- ON authority.record_entry (authority.normalize_heading(marc)) +-- WHERE deleted IS FALSE or deleted = FALSE +--; + +-- Then find the duplicates like so to get an idea of how much +-- pain you're looking at to clean things up: +--SELECT id, authority.normalize_heading(marc) +-- FROM authority.record_entry +-- WHERE authority.normalize_heading(marc) IN ( +-- SELECT authority.normalize_heading(marc) +-- FROM authority.record_entry +-- GROUP BY authority.normalize_heading(marc) +-- HAVING COUNT(*) > 1 +-- ) +--; + +-- Once you have removed the duplicates and the CREATE UNIQUE INDEX +-- statement succeeds, drop the temporary index to avoid unnecessary +-- duplication: +-- DROP INDEX authority.by_heading_and_thesaurus; -- 2.11.0