From 09954064b0f76d9bd234eb186e90394aac97f9b8 Mon Sep 17 00:00:00 2001 From: dbs Date: Thu, 16 Sep 2010 05:25:51 +0000 Subject: [PATCH] Drop the arn_value / arn_source columns on authority.record_entry and create an index instead The authority record number (ARN) was problematic when trying to generate local authorities because the number had no correlation to the content of the authority heading. By creating an index on authority records based on their heading, thesaurus, and heading text, we can do a better job of controlling the actual content of the authority records. In a future release, we may convert this index to a unique index. 2.0 will give sites an opportunity to begin cleaning up their authorities by identifying duplicate and problematic records through the authority.normalize_heading(TEXT) database function. git-svn-id: svn://svn.open-ils.org/ILS/branches/rel_2_0@17722 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- Open-ILS/examples/fm_IDL.xml | 2 - Open-ILS/src/extras/import/marc2are.pl | 5 -- .../perlmods/OpenILS/Application/Cat/AuthCommon.pm | 24 ------ .../OpenILS/Application/Storage/CDBI/authority.pm | 2 +- Open-ILS/src/perlmods/OpenILS/WWW/Exporter.pm | 5 -- Open-ILS/src/sql/Pg/002.functions.config.sql | 2 - Open-ILS/src/sql/Pg/011.schema.authority.sql | 3 - Open-ILS/src/sql/Pg/020.schema.functions.sql | 88 ++++++++++++++++++++ Open-ILS/src/sql/Pg/800.fkeys.sql | 2 + .../upgrade/0400.schema.unique_authority_index.sql | 96 ++++++++++++++++++++++ ...0401.schema.authority_record_entry_drop_arn.sql | 9 ++ ...402.schema.unique_authority_index_revisited.sql | 91 ++++++++++++++++++++ Open-ILS/xul/staff_client/server/cat/marcedit.js | 7 +- 13 files changed, 290 insertions(+), 46 deletions(-) create mode 100644 Open-ILS/src/sql/Pg/upgrade/0400.schema.unique_authority_index.sql create mode 100644 Open-ILS/src/sql/Pg/upgrade/0401.schema.authority_record_entry_drop_arn.sql create mode 100644 Open-ILS/src/sql/Pg/upgrade/0402.schema.unique_authority_index_revisited.sql diff --git a/Open-ILS/examples/fm_IDL.xml b/Open-ILS/examples/fm_IDL.xml index 62ea89ca70..ad3b9bf79a 100644 --- a/Open-ILS/examples/fm_IDL.xml +++ b/Open-ILS/examples/fm_IDL.xml @@ -1401,8 +1401,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA - - diff --git a/Open-ILS/src/extras/import/marc2are.pl b/Open-ILS/src/extras/import/marc2are.pl index 5da337c460..1eb86d43cf 100755 --- a/Open-ILS/src/extras/import/marc2are.pl +++ b/Open-ILS/src/extras/import/marc2are.pl @@ -61,9 +61,6 @@ my $rec; while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) { next if ($rec == -1); my $id = $count; - my $_001 = $rec->field('001'); - my $arn = $count; - $arn = $_001->data if ($_001); (my $xml = $rec->as_xml_record()) =~ s/\n//sog; $xml =~ s/^<\?xml.+\?\s*>//go; @@ -81,8 +78,6 @@ while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) { $bib->create_date('now'); $bib->editor($user); $bib->edit_date('now'); - $bib->arn_source('LEGACY'); - $bib->arn_value($arn); $bib->last_xact_id('IMPORT-'.$starttime); print OpenSRF::Utils::JSON->perl2JSON($bib)."\n"; diff --git a/Open-ILS/src/perlmods/OpenILS/Application/Cat/AuthCommon.pm b/Open-ILS/src/perlmods/OpenILS/Application/Cat/AuthCommon.pm index a00dc5c169..a8ef1f0bcb 100644 --- a/Open-ILS/src/perlmods/OpenILS/Application/Cat/AuthCommon.pm +++ b/Open-ILS/src/perlmods/OpenILS/Application/Cat/AuthCommon.pm @@ -36,10 +36,6 @@ sub import_authority_record { $rec->edit_date('now'); $rec->marc($U->entityize($marc_doc->documentElement->toString)); - my ($arn, $evt) = find_arn($e, $marc_doc); - return $evt if $evt; - $rec->arn_value($arn); - $rec = $e->create_authority_record_entry($rec) or return $e->die_event; # we don't care about the result, just fire off the request @@ -68,24 +64,4 @@ sub overlay_authority_record { return $rec; } -sub find_arn { - my($e, $marc_doc) = @_; - - my $xpath = '//marc:controlfield[@tag="001"]'; - my ($arn) = $marc_doc->documentElement->findvalue($xpath); - - if(my $existing_rec = $e->search_authority_record_entry({arn_value => $arn, deleted => 'f'})->[0]) { - # this arn is taken - return ( - undef, - OpenILS::Event->new( - 'AUTHORITY_RECORD_NUMBER_EXISTS', - payload => {existing_record => $existing_rec, arn => $arn} - ) - ); - } - - return ($arn); -} - 1; diff --git a/Open-ILS/src/perlmods/OpenILS/Application/Storage/CDBI/authority.pm b/Open-ILS/src/perlmods/OpenILS/Application/Storage/CDBI/authority.pm index a30f789a27..64a035032c 100644 --- a/Open-ILS/src/perlmods/OpenILS/Application/Storage/CDBI/authority.pm +++ b/Open-ILS/src/perlmods/OpenILS/Application/Storage/CDBI/authority.pm @@ -10,7 +10,7 @@ use base qw/authority/; authority::record_entry->table( 'authority_record_entry' ); authority::record_entry->columns( Primary => qw/id/ ); -authority::record_entry->columns( Essential => qw/arn_source arn_value creator editor +authority::record_entry->columns( Essential => qw/creator editor create_date edit_date source active deleted marc last_xact_id/ ); diff --git a/Open-ILS/src/perlmods/OpenILS/WWW/Exporter.pm b/Open-ILS/src/perlmods/OpenILS/WWW/Exporter.pm index 56d59ba8ce..8234d1cfeb 100644 --- a/Open-ILS/src/perlmods/OpenILS/WWW/Exporter.pm +++ b/Open-ILS/src/perlmods/OpenILS/WWW/Exporter.pm @@ -114,11 +114,6 @@ sub handler { my $tcn_v = 'tcn_value'; my $tcn_s = 'tcn_source'; - if ($type eq 'authority') { - $tcn_v = 'arn_value'; - $tcn_s = 'arn_source'; - } - my $holdings = $cgi->param('holdings') if ($type eq 'biblio'); my $location = $cgi->param('location') || 'gaaagpl'; # just because... diff --git a/Open-ILS/src/sql/Pg/002.functions.config.sql b/Open-ILS/src/sql/Pg/002.functions.config.sql index 54e2cfb4e6..6b43ceb4b3 100644 --- a/Open-ILS/src/sql/Pg/002.functions.config.sql +++ b/Open-ILS/src/sql/Pg/002.functions.config.sql @@ -437,8 +437,6 @@ BEGIN NEW.marc, E'()', E'' || - '' || NEW.arn_value || E'' || - '' || NEW.arn_source || E'' || '' || NEW.id || E'' || '' || TG_TABLE_SCHEMA || E'' || E'\\1' diff --git a/Open-ILS/src/sql/Pg/011.schema.authority.sql b/Open-ILS/src/sql/Pg/011.schema.authority.sql index 0f777dc08d..480a429c2a 100644 --- a/Open-ILS/src/sql/Pg/011.schema.authority.sql +++ b/Open-ILS/src/sql/Pg/011.schema.authority.sql @@ -24,8 +24,6 @@ CREATE SCHEMA authority; CREATE TABLE authority.record_entry ( id BIGSERIAL PRIMARY KEY, - arn_source TEXT NOT NULL DEFAULT 'AUTOGEN', - arn_value TEXT NOT NULL, creator INT NOT NULL DEFAULT 1, editor INT NOT NULL DEFAULT 1, create_date TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT now(), @@ -39,7 +37,6 @@ CREATE TABLE authority.record_entry ( ); CREATE INDEX authority_record_entry_creator_idx ON authority.record_entry ( creator ); CREATE INDEX authority_record_entry_editor_idx ON authority.record_entry ( editor ); -CREATE UNIQUE INDEX authority_record_unique_tcn ON authority.record_entry (arn_source,arn_value) WHERE deleted = FALSE OR deleted IS FALSE; CREATE TRIGGER a_marcxml_is_well_formed BEFORE INSERT OR UPDATE ON authority.record_entry FOR EACH ROW EXECUTE PROCEDURE biblio.check_marcxml_well_formed(); CREATE TRIGGER b_maintain_901 BEFORE INSERT OR UPDATE ON authority.record_entry FOR EACH ROW EXECUTE PROCEDURE maintain_901(); CREATE TRIGGER c_maintain_control_numbers BEFORE INSERT OR UPDATE ON authority.record_entry FOR EACH ROW EXECUTE PROCEDURE maintain_control_numbers(); diff --git a/Open-ILS/src/sql/Pg/020.schema.functions.sql b/Open-ILS/src/sql/Pg/020.schema.functions.sql index 6c6738561c..aab35caf0c 100644 --- a/Open-ILS/src/sql/Pg/020.schema.functions.sql +++ b/Open-ILS/src/sql/Pg/020.schema.functions.sql @@ -257,3 +257,91 @@ COMMENT ON FUNCTION actor.org_unit_ancestor_setting( TEXT, INT) IS $$ */ $$; +-- Intended to be used in a unique index on authority.record_entry like so: +-- CREATE UNIQUE INDEX unique_by_heading_and_thesaurus +-- ON authority.record_entry (authority.normalize_heading(marc)) +-- WHERE deleted IS FALSE or deleted = FALSE; +CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$ + use strict; + use warnings; + + use utf8; + use MARC::Record; + use MARC::File::XML (BinaryEncoding => 'UTF8'); + use UUID::Tiny ':std'; + + my $xml = shift() or return undef; + + my $r; + + # Prevent errors in XML parsing from blowing out ungracefully + eval { + $r = MARC::Record->new_from_xml( $xml ); + 1; + } or do { + return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml); + }; + + if (!$r) { + return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml); + } + + # From http://www.loc.gov/standards/sourcelist/subject.html + my $thes_code_map = { + a => 'lcsh', + b => 'lcshac', + c => 'mesh', + d => 'nal', + k => 'cash', + n => 'notapplicable', + r => 'aat', + s => 'sears', + v => 'rvm', + }; + + # Default to "No attempt to code" if the leader is horribly broken + my $fixed_field = $r->field('008'); + my $thes_char = '|'; + if ($fixed_field) { + $thes_char = substr($fixed_field->data(), 11, 1) || '|'; + } + + my $thes_code = 'UNDEFINED'; + + if ($thes_char eq 'z') { + # Grab the 040 $f per http://www.loc.gov/marc/authority/ad040.html + $thes_code = $r->subfield('040', 'f') || 'UNDEFINED'; + } elsif ($thes_code_map->{$thes_char}) { + $thes_code = $thes_code_map->{$thes_char}; + } + + my $auth_txt = ''; + my $head = $r->field('1..'); + if ($head) { + # Concatenate all of these subfields together, prefixed by their code + # to prevent collisions along the lines of "Fiction, North Carolina" + foreach my $sf ($head->subfields()) { + $auth_txt .= '‡' . $sf->[0] . ' ' . $sf->[1]; + } + } + + # Perhaps better to parameterize the spi and pass as a parameter + $auth_txt =~ s/'//go; + + if ($auth_txt) { + my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text"); + my $norm_txt = $result->{rows}[0]->{norm_text}; + return $head->tag() . "_" . $thes_code . " " . $norm_txt; + } + + return 'NOHEADING_' . $thes_code . ' ' . create_uuid_as_string(UUID_MD5, $xml); +$func$ LANGUAGE 'plperlu' IMMUTABLE; + +COMMENT ON FUNCTION authority.normalize_heading( TEXT ) IS $$ +/** +* Extract the authority heading, thesaurus, and NACO-normalized values +* from an authority record. The primary purpose is to build a unique +* index to defend against duplicated authority records from the same +* thesaurus. +*/ +$$; diff --git a/Open-ILS/src/sql/Pg/800.fkeys.sql b/Open-ILS/src/sql/Pg/800.fkeys.sql index 2406e036d2..346ed988df 100644 --- a/Open-ILS/src/sql/Pg/800.fkeys.sql +++ b/Open-ILS/src/sql/Pg/800.fkeys.sql @@ -117,4 +117,6 @@ ALTER TABLE config.remote_account ADD CONSTRAINT config_remote_account_owner_fke ALTER TABLE config.org_unit_setting_type ADD CONSTRAINT view_perm_fkey FOREIGN KEY (view_perm) REFERENCES permission.perm_list (id) ON UPDATE CASCADE ON DELETE RESTRICT DEFERRABLE INITIALLY DEFERRED; ALTER TABLE config.org_unit_setting_type ADD CONSTRAINT update_perm_fkey FOREIGN KEY (update_perm) REFERENCES permission.perm_list (id) ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED; +CREATE INDEX by_heading_and_thesaurus ON authority.record_entry (authority.normalize_heading(marc)) WHERE deleted IS FALSE or deleted = FALSE; + COMMIT; diff --git a/Open-ILS/src/sql/Pg/upgrade/0400.schema.unique_authority_index.sql b/Open-ILS/src/sql/Pg/upgrade/0400.schema.unique_authority_index.sql new file mode 100644 index 0000000000..7a555bd903 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/0400.schema.unique_authority_index.sql @@ -0,0 +1,96 @@ +BEGIN; + +INSERT INTO config.upgrade_log (version) VALUES ('0400'); -- dbs + +CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$ + use strict; + use warnings; + use MARC::Record; + use MARC::File::XML (BinaryEncoding => 'UTF8'); + + my $xml = shift(); + my $r = MARC::Record->new_from_xml( $xml ); + return undef unless ($r); + + # From http://www.loc.gov/standards/sourcelist/subject.html + my $thes_code_map = { + a => 'lcsh', + b => 'lcshac', + c => 'mesh', + d => 'nal', + k => 'cash', + n => 'notapplicable', + r => 'aat', + s => 'sears', + v => 'rvm', + }; + + # Default to "No attempt to code" if the leader is horribly broken + my $thes_char = substr($r->field('008')->data(), 11, 1) || '|'; + + my $thes_code = 'UNDEFINED'; + + if ($thes_char eq 'z') { + # Grab the 040 $f per http://www.loc.gov/marc/authority/ad040.html + $thes_code = $r->subfield('040', 'f') || 'UNDEFINED'; + } elsif ($thes_code_map->{$thes_char}) { + $thes_code = $thes_code_map->{$thes_char}; + } + + my $head = $r->field('1..'); + my $auth_txt = ''; + foreach my $sf ($head->subfields()) { + $auth_txt .= $sf->[1]; + } + + + # Perhaps better to parameterize the spi and pass as a parameter + $auth_txt =~ s/'//go; + my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text"); + my $norm_txt = $result->{rows}[0]->{norm_text}; + + return $head->tag() . "_" . $thes_code . " " . $norm_txt; +$func$ LANGUAGE 'plperlu' IMMUTABLE; + +COMMENT ON FUNCTION authority.normalize_heading( TEXT ) IS $$ +/** +* Extract the authority heading, thesaurus, and NACO-normalized values +* from an authority record. The primary purpose is to build a unique +* index to defend against duplicated authority records from the same +* thesaurus. +*/ +$$; + +COMMIT; + +-- Do this outside of a transaction to avoid failure if duplicate +-- authority heading / thesaurus / heading text entries already +-- exist in the database: +CREATE UNIQUE INDEX unique_by_heading_and_thesaurus + ON authority.record_entry (authority.normalize_heading(marc)) + WHERE deleted IS FALSE or deleted = FALSE +; + +-- If the unique index fails, uncomment the following to create +-- a regular index that will help find the duplicates in a hurry: +--CREATE INDEX by_heading_and_thesaurus +-- ON authority.record_entry (authority.normalize_heading(marc)) +-- WHERE deleted IS FALSE or deleted = FALSE +--; + +-- Then find the duplicates like so to get an idea of how much +-- pain you're looking at to clean things up: +--SELECT id, authority.normalize_heading(marc) +-- FROM authority.record_entry +-- WHERE authority.normalize_heading(marc) IN ( +-- SELECT authority.normalize_heading(marc) +-- FROM authority.record_entry +-- GROUP BY authority.normalize_heading(marc) +-- HAVING COUNT(*) > 1 +-- ) +--; + +-- Once you have removed the duplicates and the CREATE UNIQUE INDEX +-- statement succeeds, drop the temporary index to avoid unnecessary +-- duplication: +-- DROP INDEX authority.by_heading_and_thesaurus; diff --git a/Open-ILS/src/sql/Pg/upgrade/0401.schema.authority_record_entry_drop_arn.sql b/Open-ILS/src/sql/Pg/upgrade/0401.schema.authority_record_entry_drop_arn.sql new file mode 100644 index 0000000000..61d9871879 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/0401.schema.authority_record_entry_drop_arn.sql @@ -0,0 +1,9 @@ +BEGIN; + +INSERT INTO config.upgrade_log (version) VALUES ('0401'); -- dbs + +DROP INDEX authority.authority_record_unique_tcn; +ALTER TABLE authority.record_entry DROP COLUMN arn_value; +ALTER TABLE authority.record_entry DROP COLUMN arn_source; + +COMMIT; diff --git a/Open-ILS/src/sql/Pg/upgrade/0402.schema.unique_authority_index_revisited.sql b/Open-ILS/src/sql/Pg/upgrade/0402.schema.unique_authority_index_revisited.sql new file mode 100644 index 0000000000..a16d877ff5 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/0402.schema.unique_authority_index_revisited.sql @@ -0,0 +1,91 @@ +BEGIN; + +-- Make the authority heading normalization routine more defensive +-- Also drop back to a plain index for 2.0, we will get more restrictive over time + +INSERT INTO config.upgrade_log (version) VALUES ('0402'); -- dbs + +CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$ + use strict; + use warnings; + + use utf8; + use MARC::Record; + use MARC::File::XML (BinaryEncoding => 'UTF8'); + use UUID::Tiny ':std'; + + my $xml = shift() or return undef; + + my $r; + + # Prevent errors in XML parsing from blowing out ungracefully + eval { + $r = MARC::Record->new_from_xml( $xml ); + 1; + } or do { + return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml); + }; + + if (!$r) { + return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml); + } + + # From http://www.loc.gov/standards/sourcelist/subject.html + my $thes_code_map = { + a => 'lcsh', + b => 'lcshac', + c => 'mesh', + d => 'nal', + k => 'cash', + n => 'notapplicable', + r => 'aat', + s => 'sears', + v => 'rvm', + }; + + # Default to "No attempt to code" if the leader is horribly broken + my $fixed_field = $r->field('008'); + my $thes_char = '|'; + if ($fixed_field) { + $thes_char = substr($fixed_field->data(), 11, 1) || '|'; + } + + my $thes_code = 'UNDEFINED'; + + if ($thes_char eq 'z') { + # Grab the 040 $f per http://www.loc.gov/marc/authority/ad040.html + $thes_code = $r->subfield('040', 'f') || 'UNDEFINED'; + } elsif ($thes_code_map->{$thes_char}) { + $thes_code = $thes_code_map->{$thes_char}; + } + + my $auth_txt = ''; + my $head = $r->field('1..'); + if ($head) { + # Concatenate all of these subfields together, prefixed by their code + # to prevent collisions along the lines of "Fiction, North Carolina" + foreach my $sf ($head->subfields()) { + $auth_txt .= '‡' . $sf->[0] . ' ' . $sf->[1]; + } + } + + # Perhaps better to parameterize the spi and pass as a parameter + $auth_txt =~ s/'//go; + + if ($auth_txt) { + my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text"); + my $norm_txt = $result->{rows}[0]->{norm_text}; + return $head->tag() . "_" . $thes_code . " " . $norm_txt; + } + + return 'NOHEADING_' . $thes_code . ' ' . create_uuid_as_string(UUID_MD5, $xml); +$func$ LANGUAGE 'plperlu' IMMUTABLE; + +DROP INDEX authority.unique_by_heading_and_thesaurus; + +CREATE INDEX by_heading_and_thesaurus + ON authority.record_entry (authority.normalize_heading(marc)) + WHERE deleted IS FALSE or deleted = FALSE +; + +COMMIT; diff --git a/Open-ILS/xul/staff_client/server/cat/marcedit.js b/Open-ILS/xul/staff_client/server/cat/marcedit.js index abde13bdf4..6c68fcdd92 100644 --- a/Open-ILS/xul/staff_client/server/cat/marcedit.js +++ b/Open-ILS/xul/staff_client/server/cat/marcedit.js @@ -2575,9 +2575,9 @@ function loadMarcEditor(pcrud, marcxml) { netscape.security.PrivilegeManager.enablePrivilege('UniversalXPConnect'); win = window.open('/xul/server/cat/marcedit.xul'); // XXX version? - /* Ugly hack to satisfy arn_value and last_xact_id db schema reqs */ + // Match marc2are.pl last_xact_id format, roughly var now = new Date; - var arn = 'AUTOGEN' + Date.parse(now); + var xact_id = 'IMPORT-' + Date.parse(now); win.xulG = { "record": {"marc": marcxml, "rtype": "are"}, @@ -2586,8 +2586,7 @@ function loadMarcEditor(pcrud, marcxml) { "func": function(xmlString) { var rec = new are(); rec.marc(xmlString); - rec.arn_value(arn); - rec.last_xact_id(arn); + rec.last_xact_id(xact_id); rec.isnew(true); pcrud.create(rec, { "oncomplete": function () { -- 2.11.0