NOTE: Database server now requires UUID::Tiny CPAN module in its Perl repertoire
When faced with terrible input, authority.normalize_heading() will generate a
heading based on the MD5 UUID of the input value, flagged with "BAD_MARCXML"
if the MARCXML could not be parsed, or "NOHEADING" if there was no 1xx field.
Previously, authority.normalize_heading() would throw raw, ugly Perl errors.
Many thanks to Mike Rylander and Galen Charlton for their suggestions on how
to break the original version of authority.normalize_heading(), this code
should be much more robust as a result.
git-svn-id: svn://svn.open-ils.org/ILS/trunk@17714
dcc99617-32d9-48b4-a31d-
7c20da2025e4
install_date TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
);
-INSERT INTO config.upgrade_log (version) VALUES ('0401'); -- dbs
+INSERT INTO config.upgrade_log (version) VALUES ('0402'); -- dbs
CREATE TABLE config.bib_source (
id SERIAL PRIMARY KEY,
CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$
use strict;
use warnings;
+
+ use utf8;
use MARC::Record;
use MARC::File::XML (BinaryEncoding => 'UTF8');
+ use UUID::Tiny ':std';
+
+ my $xml = shift() or return undef;
+
+ my $r;
- my $xml = shift();
- my $r = MARC::Record->new_from_xml( $xml );
- return undef unless ($r);
+ # Prevent errors in XML parsing from blowing out ungracefully
+ eval {
+ $r = MARC::Record->new_from_xml( $xml );
+ 1;
+ } or do {
+ return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml);
+ };
+
+ if (!$r) {
+ return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml);
+ }
# From http://www.loc.gov/standards/sourcelist/subject.html
my $thes_code_map = {
};
# Default to "No attempt to code" if the leader is horribly broken
- my $thes_char = substr($r->field('008')->data(), 11, 1) || '|';
+ my $fixed_field = $r->field('008');
+ my $thes_char = '|';
+ if ($fixed_field) {
+ $thes_char = substr($fixed_field->data(), 11, 1) || '|';
+ }
my $thes_code = 'UNDEFINED';
$thes_code = $thes_code_map->{$thes_char};
}
- my $head = $r->field('1..');
my $auth_txt = '';
- foreach my $sf ($head->subfields()) {
- $auth_txt .= $sf->[1];
+ my $head = $r->field('1..');
+ if ($head) {
+ # Concatenate all of these subfields together, prefixed by their code
+ # to prevent collisions along the lines of "Fiction, North Carolina"
+ foreach my $sf ($head->subfields()) {
+ $auth_txt .= '‡' . $sf->[0] . ' ' . $sf->[1];
+ }
}
-
# Perhaps better to parameterize the spi and pass as a parameter
$auth_txt =~ s/'//go;
- my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text");
- my $norm_txt = $result->{rows}[0]->{norm_text};
- return $head->tag() . "_" . $thes_code . " " . $norm_txt;
+ if ($auth_txt) {
+ my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text");
+ my $norm_txt = $result->{rows}[0]->{norm_text};
+ return $head->tag() . "_" . $thes_code . " " . $norm_txt;
+ }
+
+ return 'NOHEADING_' . $thes_code . ' ' . create_uuid_as_string(UUID_MD5, $xml);
$func$ LANGUAGE 'plperlu' IMMUTABLE;
COMMENT ON FUNCTION authority.normalize_heading( TEXT ) IS $$
ALTER TABLE config.org_unit_setting_type ADD CONSTRAINT view_perm_fkey FOREIGN KEY (view_perm) REFERENCES permission.perm_list (id) ON UPDATE CASCADE ON DELETE RESTRICT DEFERRABLE INITIALLY DEFERRED;
ALTER TABLE config.org_unit_setting_type ADD CONSTRAINT update_perm_fkey FOREIGN KEY (update_perm) REFERENCES permission.perm_list (id) ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED;
-CREATE UNIQUE INDEX unique_by_heading_and_thesaurus ON authority.record_entry (authority.normalize_heading(marc)) WHERE deleted IS FALSE or deleted = FALSE;
+CREATE INDEX by_heading_and_thesaurus ON authority.record_entry (authority.normalize_heading(marc)) WHERE deleted IS FALSE or deleted = FALSE;
COMMIT;
--- /dev/null
+BEGIN;
+
+-- Make the authority heading normalization routine more defensive
+-- Also drop back to a plain index for 2.0, we will get more restrictive over time
+
+INSERT INTO config.upgrade_log (version) VALUES ('0402'); -- dbs
+
+CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$
+ use strict;
+ use warnings;
+
+ use utf8;
+ use MARC::Record;
+ use MARC::File::XML (BinaryEncoding => 'UTF8');
+ use UUID::Tiny ':std';
+
+ my $xml = shift() or return undef;
+
+ my $r;
+
+ # Prevent errors in XML parsing from blowing out ungracefully
+ eval {
+ $r = MARC::Record->new_from_xml( $xml );
+ 1;
+ } or do {
+ return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml);
+ };
+
+ if (!$r) {
+ return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml);
+ }
+
+ # From http://www.loc.gov/standards/sourcelist/subject.html
+ my $thes_code_map = {
+ a => 'lcsh',
+ b => 'lcshac',
+ c => 'mesh',
+ d => 'nal',
+ k => 'cash',
+ n => 'notapplicable',
+ r => 'aat',
+ s => 'sears',
+ v => 'rvm',
+ };
+
+ # Default to "No attempt to code" if the leader is horribly broken
+ my $fixed_field = $r->field('008');
+ my $thes_char = '|';
+ if ($fixed_field) {
+ $thes_char = substr($fixed_field->data(), 11, 1) || '|';
+ }
+
+ my $thes_code = 'UNDEFINED';
+
+ if ($thes_char eq 'z') {
+ # Grab the 040 $f per http://www.loc.gov/marc/authority/ad040.html
+ $thes_code = $r->subfield('040', 'f') || 'UNDEFINED';
+ } elsif ($thes_code_map->{$thes_char}) {
+ $thes_code = $thes_code_map->{$thes_char};
+ }
+
+ my $auth_txt = '';
+ my $head = $r->field('1..');
+ if ($head) {
+ # Concatenate all of these subfields together, prefixed by their code
+ # to prevent collisions along the lines of "Fiction, North Carolina"
+ foreach my $sf ($head->subfields()) {
+ $auth_txt .= '‡' . $sf->[0] . ' ' . $sf->[1];
+ }
+ }
+
+ # Perhaps better to parameterize the spi and pass as a parameter
+ $auth_txt =~ s/'//go;
+
+ if ($auth_txt) {
+ my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text");
+ my $norm_txt = $result->{rows}[0]->{norm_text};
+ return $head->tag() . "_" . $thes_code . " " . $norm_txt;
+ }
+
+ return 'NOHEADING_' . $thes_code . ' ' . create_uuid_as_string(UUID_MD5, $xml);
+$func$ LANGUAGE 'plperlu' IMMUTABLE;
+
+DROP INDEX authority.unique_by_heading_and_thesaurus;
+
+CREATE INDEX by_heading_and_thesaurus
+ ON authority.record_entry (authority.normalize_heading(marc))
+ WHERE deleted IS FALSE or deleted = FALSE
+;
+
+COMMIT;