From 526130d33d9ea03fe67cc37d36f0f9f5bdc8f8a7 Mon Sep 17 00:00:00 2001 From: dbs Date: Wed, 15 Sep 2010 22:06:37 +0000 Subject: [PATCH] Make authority.normalize_heading() more defensive, and drop back to a plain (non-unique) index NOTE: Database server now requires UUID::Tiny CPAN module in its Perl repertoire When faced with terrible input, authority.normalize_heading() will generate a heading based on the MD5 UUID of the input value, flagged with "BAD_MARCXML" if the MARCXML could not be parsed, or "NOHEADING" if there was no 1xx field. Previously, authority.normalize_heading() would throw raw, ugly Perl errors. Many thanks to Mike Rylander and Galen Charlton for their suggestions on how to break the original version of authority.normalize_heading(), this code should be much more robust as a result. git-svn-id: svn://svn.open-ils.org/ILS/trunk@17714 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- Open-ILS/src/sql/Pg/002.schema.config.sql | 2 +- Open-ILS/src/sql/Pg/020.schema.functions.sql | 48 +++++++++--- Open-ILS/src/sql/Pg/800.fkeys.sql | 2 +- ...402.schema.unique_authority_index_revisited.sql | 91 ++++++++++++++++++++++ 4 files changed, 130 insertions(+), 13 deletions(-) create mode 100644 Open-ILS/src/sql/Pg/upgrade/0402.schema.unique_authority_index_revisited.sql diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql index 07c44e742..c8c79a46f 100644 --- a/Open-ILS/src/sql/Pg/002.schema.config.sql +++ b/Open-ILS/src/sql/Pg/002.schema.config.sql @@ -68,7 +68,7 @@ CREATE TABLE config.upgrade_log ( install_date TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() ); -INSERT INTO config.upgrade_log (version) VALUES ('0401'); -- dbs +INSERT INTO config.upgrade_log (version) VALUES ('0402'); -- dbs CREATE TABLE config.bib_source ( id SERIAL PRIMARY KEY, diff --git a/Open-ILS/src/sql/Pg/020.schema.functions.sql b/Open-ILS/src/sql/Pg/020.schema.functions.sql index 536b57ffb..aab35caf0 100644 --- a/Open-ILS/src/sql/Pg/020.schema.functions.sql +++ b/Open-ILS/src/sql/Pg/020.schema.functions.sql @@ -264,12 +264,27 @@ $$; CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$ use strict; use warnings; + + use utf8; use MARC::Record; use MARC::File::XML (BinaryEncoding => 'UTF8'); + use UUID::Tiny ':std'; + + my $xml = shift() or return undef; + + my $r; - my $xml = shift(); - my $r = MARC::Record->new_from_xml( $xml ); - return undef unless ($r); + # Prevent errors in XML parsing from blowing out ungracefully + eval { + $r = MARC::Record->new_from_xml( $xml ); + 1; + } or do { + return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml); + }; + + if (!$r) { + return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml); + } # From http://www.loc.gov/standards/sourcelist/subject.html my $thes_code_map = { @@ -285,7 +300,11 @@ CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $ }; # Default to "No attempt to code" if the leader is horribly broken - my $thes_char = substr($r->field('008')->data(), 11, 1) || '|'; + my $fixed_field = $r->field('008'); + my $thes_char = '|'; + if ($fixed_field) { + $thes_char = substr($fixed_field->data(), 11, 1) || '|'; + } my $thes_code = 'UNDEFINED'; @@ -296,19 +315,26 @@ CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $ $thes_code = $thes_code_map->{$thes_char}; } - my $head = $r->field('1..'); my $auth_txt = ''; - foreach my $sf ($head->subfields()) { - $auth_txt .= $sf->[1]; + my $head = $r->field('1..'); + if ($head) { + # Concatenate all of these subfields together, prefixed by their code + # to prevent collisions along the lines of "Fiction, North Carolina" + foreach my $sf ($head->subfields()) { + $auth_txt .= '‡' . $sf->[0] . ' ' . $sf->[1]; + } } - # Perhaps better to parameterize the spi and pass as a parameter $auth_txt =~ s/'//go; - my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text"); - my $norm_txt = $result->{rows}[0]->{norm_text}; - return $head->tag() . "_" . $thes_code . " " . $norm_txt; + if ($auth_txt) { + my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text"); + my $norm_txt = $result->{rows}[0]->{norm_text}; + return $head->tag() . "_" . $thes_code . " " . $norm_txt; + } + + return 'NOHEADING_' . $thes_code . ' ' . create_uuid_as_string(UUID_MD5, $xml); $func$ LANGUAGE 'plperlu' IMMUTABLE; COMMENT ON FUNCTION authority.normalize_heading( TEXT ) IS $$ diff --git a/Open-ILS/src/sql/Pg/800.fkeys.sql b/Open-ILS/src/sql/Pg/800.fkeys.sql index e7e9bb32d..346ed988d 100644 --- a/Open-ILS/src/sql/Pg/800.fkeys.sql +++ b/Open-ILS/src/sql/Pg/800.fkeys.sql @@ -117,6 +117,6 @@ ALTER TABLE config.remote_account ADD CONSTRAINT config_remote_account_owner_fke ALTER TABLE config.org_unit_setting_type ADD CONSTRAINT view_perm_fkey FOREIGN KEY (view_perm) REFERENCES permission.perm_list (id) ON UPDATE CASCADE ON DELETE RESTRICT DEFERRABLE INITIALLY DEFERRED; ALTER TABLE config.org_unit_setting_type ADD CONSTRAINT update_perm_fkey FOREIGN KEY (update_perm) REFERENCES permission.perm_list (id) ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED; -CREATE UNIQUE INDEX unique_by_heading_and_thesaurus ON authority.record_entry (authority.normalize_heading(marc)) WHERE deleted IS FALSE or deleted = FALSE; +CREATE INDEX by_heading_and_thesaurus ON authority.record_entry (authority.normalize_heading(marc)) WHERE deleted IS FALSE or deleted = FALSE; COMMIT; diff --git a/Open-ILS/src/sql/Pg/upgrade/0402.schema.unique_authority_index_revisited.sql b/Open-ILS/src/sql/Pg/upgrade/0402.schema.unique_authority_index_revisited.sql new file mode 100644 index 000000000..a16d877ff --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/0402.schema.unique_authority_index_revisited.sql @@ -0,0 +1,91 @@ +BEGIN; + +-- Make the authority heading normalization routine more defensive +-- Also drop back to a plain index for 2.0, we will get more restrictive over time + +INSERT INTO config.upgrade_log (version) VALUES ('0402'); -- dbs + +CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$ + use strict; + use warnings; + + use utf8; + use MARC::Record; + use MARC::File::XML (BinaryEncoding => 'UTF8'); + use UUID::Tiny ':std'; + + my $xml = shift() or return undef; + + my $r; + + # Prevent errors in XML parsing from blowing out ungracefully + eval { + $r = MARC::Record->new_from_xml( $xml ); + 1; + } or do { + return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml); + }; + + if (!$r) { + return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml); + } + + # From http://www.loc.gov/standards/sourcelist/subject.html + my $thes_code_map = { + a => 'lcsh', + b => 'lcshac', + c => 'mesh', + d => 'nal', + k => 'cash', + n => 'notapplicable', + r => 'aat', + s => 'sears', + v => 'rvm', + }; + + # Default to "No attempt to code" if the leader is horribly broken + my $fixed_field = $r->field('008'); + my $thes_char = '|'; + if ($fixed_field) { + $thes_char = substr($fixed_field->data(), 11, 1) || '|'; + } + + my $thes_code = 'UNDEFINED'; + + if ($thes_char eq 'z') { + # Grab the 040 $f per http://www.loc.gov/marc/authority/ad040.html + $thes_code = $r->subfield('040', 'f') || 'UNDEFINED'; + } elsif ($thes_code_map->{$thes_char}) { + $thes_code = $thes_code_map->{$thes_char}; + } + + my $auth_txt = ''; + my $head = $r->field('1..'); + if ($head) { + # Concatenate all of these subfields together, prefixed by their code + # to prevent collisions along the lines of "Fiction, North Carolina" + foreach my $sf ($head->subfields()) { + $auth_txt .= '‡' . $sf->[0] . ' ' . $sf->[1]; + } + } + + # Perhaps better to parameterize the spi and pass as a parameter + $auth_txt =~ s/'//go; + + if ($auth_txt) { + my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text"); + my $norm_txt = $result->{rows}[0]->{norm_text}; + return $head->tag() . "_" . $thes_code . " " . $norm_txt; + } + + return 'NOHEADING_' . $thes_code . ' ' . create_uuid_as_string(UUID_MD5, $xml); +$func$ LANGUAGE 'plperlu' IMMUTABLE; + +DROP INDEX authority.unique_by_heading_and_thesaurus; + +CREATE INDEX by_heading_and_thesaurus + ON authority.record_entry (authority.normalize_heading(marc)) + WHERE deleted IS FALSE or deleted = FALSE +; + +COMMIT; -- 2.11.0