*/
$$;
+-- Intended to be used in a unique index on authority.record_entry like so:
+-- CREATE UNIQUE INDEX unique_by_heading_and_thesaurus
+-- ON authority.record_entry (authority.normalize_heading(marc))
+-- WHERE deleted IS FALSE or deleted = FALSE;
+CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$
+ use strict;
+ use warnings;
+ use MARC::Record;
+ use MARC::File::XML (BinaryEncoding => 'UTF8');
+
+ my $xml = shift();
+ my $r = MARC::Record->new_from_xml( $xml );
+ return undef unless ($r);
+
+ # From http://www.loc.gov/standards/sourcelist/subject.html
+ my $thes_code_map = {
+ a => 'lcsh',
+ b => 'lcshac',
+ c => 'mesh',
+ d => 'nal',
+ k => 'cash',
+ n => 'notapplicable',
+ r => 'aat',
+ s => 'sears',
+ v => 'rvm',
+ };
+
+ # Default to "No attempt to code" if the leader is horribly broken
+ my $thes_char = substr($r->field('008')->data(), 11, 1) || '|';
+
+ my $thes_code = 'UNDEFINED';
+
+ if ($thes_char eq 'z') {
+ # Grab the 040 $f per http://www.loc.gov/marc/authority/ad040.html
+ $thes_code = $r->subfield('040', 'f') || 'UNDEFINED';
+ } elsif ($thes_code_map->{$thes_char}) {
+ $thes_code = $thes_code_map->{$thes_char};
+ }
+
+ my $head = $r->field('1..');
+ my $auth_txt = '';
+ foreach my $sf ($head->subfields()) {
+ $auth_txt .= $sf->[1];
+ }
+
+
+ # Perhaps better to parameterize the spi and pass as a parameter
+ $auth_txt =~ s/'//go;
+ my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text");
+ my $norm_txt = $result->{rows}[0]->{norm_text};
+
+ return $head->tag() . "_" . $thes_code . " " . $norm_txt;
+$func$ LANGUAGE 'plperlu' IMMUTABLE;
+
+COMMENT ON FUNCTION authority.normalize_heading( TEXT ) IS $$
+/**
+* Extract the authority heading, thesaurus, and NACO-normalized values
+* from an authority record. The primary purpose is to build a unique
+* index to defend against duplicated authority records from the same
+* thesaurus.
+*/
+$$;
--- /dev/null
+BEGIN;
+
+INSERT INTO config.upgrade_log (version) VALUES ('0400'); -- dbs
+
+CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$
+ use strict;
+ use warnings;
+ use MARC::Record;
+ use MARC::File::XML (BinaryEncoding => 'UTF8');
+
+ my $xml = shift();
+ my $r = MARC::Record->new_from_xml( $xml );
+ return undef unless ($r);
+
+ # From http://www.loc.gov/standards/sourcelist/subject.html
+ my $thes_code_map = {
+ a => 'lcsh',
+ b => 'lcshac',
+ c => 'mesh',
+ d => 'nal',
+ k => 'cash',
+ n => 'notapplicable',
+ r => 'aat',
+ s => 'sears',
+ v => 'rvm',
+ };
+
+ # Default to "No attempt to code" if the leader is horribly broken
+ my $thes_char = substr($r->field('008')->data(), 11, 1) || '|';
+
+ my $thes_code = 'UNDEFINED';
+
+ if ($thes_char eq 'z') {
+ # Grab the 040 $f per http://www.loc.gov/marc/authority/ad040.html
+ $thes_code = $r->subfield('040', 'f') || 'UNDEFINED';
+ } elsif ($thes_code_map->{$thes_char}) {
+ $thes_code = $thes_code_map->{$thes_char};
+ }
+
+ my $head = $r->field('1..');
+ my $auth_txt = '';
+ foreach my $sf ($head->subfields()) {
+ $auth_txt .= $sf->[1];
+ }
+
+
+ # Perhaps better to parameterize the spi and pass as a parameter
+ $auth_txt =~ s/'//go;
+ my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text");
+ my $norm_txt = $result->{rows}[0]->{norm_text};
+
+ return $head->tag() . "_" . $thes_code . " " . $norm_txt;
+$func$ LANGUAGE 'plperlu' IMMUTABLE;
+
+COMMENT ON FUNCTION authority.normalize_heading( TEXT ) IS $$
+/**
+* Extract the authority heading, thesaurus, and NACO-normalized values
+* from an authority record. The primary purpose is to build a unique
+* index to defend against duplicated authority records from the same
+* thesaurus.
+*/
+$$;
+
+COMMIT;
+
+-- Do this outside of a transaction to avoid failure if duplicate
+-- authority heading / thesaurus / heading text entries already
+-- exist in the database:
+CREATE UNIQUE INDEX unique_by_heading_and_thesaurus
+ ON authority.record_entry (authority.normalize_heading(marc))
+ WHERE deleted IS FALSE or deleted = FALSE
+;
+
+-- If the unique index fails, uncomment the following to create
+-- a regular index that will help find the duplicates in a hurry:
+--CREATE INDEX by_heading_and_thesaurus
+-- ON authority.record_entry (authority.normalize_heading(marc))
+-- WHERE deleted IS FALSE or deleted = FALSE
+--;
+
+-- Then find the duplicates like so to get an idea of how much
+-- pain you're looking at to clean things up:
+--SELECT id, authority.normalize_heading(marc)
+-- FROM authority.record_entry
+-- WHERE authority.normalize_heading(marc) IN (
+-- SELECT authority.normalize_heading(marc)
+-- FROM authority.record_entry
+-- GROUP BY authority.normalize_heading(marc)
+-- HAVING COUNT(*) > 1
+-- )
+--;
+
+-- Once you have removed the duplicates and the CREATE UNIQUE INDEX
+-- statement succeeds, drop the temporary index to avoid unnecessary
+-- duplication:
+-- DROP INDEX authority.by_heading_and_thesaurus;