From: Mike Rylander Date: Thu, 21 Apr 2011 16:36:05 +0000 (-0400) Subject: replace the perlish authority.normalize_heading with a nearly-equiv plpgsql version... X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=40283f335c75a75ab2829af3e202a1d33115f44d;p=evergreen%2Fmasslnc.git replace the perlish authority.normalize_heading with a nearly-equiv plpgsql version that understands control sets --- diff --git a/Open-ILS/src/sql/Pg/020.schema.functions.sql b/Open-ILS/src/sql/Pg/020.schema.functions.sql index f29239ef65..262f6cfac5 100644 --- a/Open-ILS/src/sql/Pg/020.schema.functions.sql +++ b/Open-ILS/src/sql/Pg/020.schema.functions.sql @@ -331,84 +331,47 @@ $$; -- CREATE UNIQUE INDEX unique_by_heading_and_thesaurus -- ON authority.record_entry (authority.normalize_heading(marc)) -- WHERE deleted IS FALSE or deleted = FALSE; -CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$ - use strict; - use warnings; - - use utf8; - use MARC::Record; - use MARC::File::XML (BinaryEncoding => 'UTF8'); - use MARC::Charset; - use UUID::Tiny ':std'; - - MARC::Charset->assume_unicode(1); - - my $xml = shift() or return undef; - - my $r; - - # Prevent errors in XML parsing from blowing out ungracefully - eval { - $r = MARC::Record->new_from_xml( $xml ); - 1; - } or do { - return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml); - }; - - if (!$r) { - return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml); - } - - # From http://www.loc.gov/standards/sourcelist/subject.html - my $thes_code_map = { - a => 'lcsh', - b => 'lcshac', - c => 'mesh', - d => 'nal', - k => 'cash', - n => 'notapplicable', - r => 'aat', - s => 'sears', - v => 'rvm', - }; - - # Default to "No attempt to code" if the leader is horribly broken - my $fixed_field = $r->field('008'); - my $thes_char = '|'; - if ($fixed_field) { - $thes_char = substr($fixed_field->data(), 11, 1) || '|'; - } - - my $thes_code = 'UNDEFINED'; - - if ($thes_char eq 'z') { - # Grab the 040 $f per http://www.loc.gov/marc/authority/ad040.html - $thes_code = $r->subfield('040', 'f') || 'UNDEFINED'; - } elsif ($thes_code_map->{$thes_char}) { - $thes_code = $thes_code_map->{$thes_char}; - } - - my $auth_txt = ''; - my $head = $r->field('1..'); - if ($head) { - # Concatenate all of these subfields together, prefixed by their code - # to prevent collisions along the lines of "Fiction, North Carolina" - foreach my $sf ($head->subfields()) { - $auth_txt .= '‡' . $sf->[0] . ' ' . $sf->[1]; - } - } - - if ($auth_txt) { - my $stmt = spi_prepare('SELECT public.naco_normalize($1) AS norm_text', 'TEXT'); - my $result = spi_exec_prepared($stmt, $auth_txt); - my $norm_txt = $result->{rows}[0]->{norm_text}; - spi_freeplan($stmt); - undef($stmt); - return $head->tag() . "_" . $thes_code . " " . $norm_txt; - } - - return 'NOHEADING_' . $thes_code . ' ' . create_uuid_as_string(UUID_MD5, $xml); -$func$ LANGUAGE 'plperlu' IMMUTABLE; +CREATE OR REPLACE FUNCTION authority.normalize_heading( marcxml TEXT ) RETURNS TEXT AS $func$ +DECLARE + acsaf authority.control_set_authority_field%ROWTYPE; + tag_used TEXT; + sf TEXT; + thes_code TEXT; + cset INT; + heading_text TEXT; + tmp_text TEXT; +BEGIN + thes_code := vandelay.marc21_extract_fixed_field(marcxml,'Subj'); + IF thes_code IS NULL THEN + thes_code := '|'; + END IF; + + SELECT control_set INTO cset FROM authority.thesaurus WHERE code = thes_code; + IF NOT FOUND THEN + cset = 1; + END IF; + + heading_text := ''; + FOR acsaf IN SELECT * FROM authority.control_set_authority_field WHERE control_set = cset AND main_entry IS NULL LOOP + tag_use := acsaf.tag; + FOR sf IN SELECT * FROM regexp_split_to_table(acsaf.sf_list,'') LOOP + tmp_text := oils_xpath_string('//*[@tag="'||tag_used||'"]/*[@code="'||sf||'"]', marcxml); + IF tmp_text IS NOT NULL THEN + heading_text := heading_text || E'‡' || sf || ' ' || tmp_text; + END IF; + END LOOP; + EXIT WHEN heading_text <> ''; + END LOOP; + + IF heading_text <> '' THEN + heading_text := tag_used || '_' || thes_code || ' ' || public.naco_normalize(heading_text); + ELSE + heading_text := 'NOHEADING_' || thes_code || ' ' || MD5(marcxml); + END IF; + + RETURN heading_text; +END; +$func$ LANGUAGE PLPGSQL IMMUTABLE; COMMENT ON FUNCTION authority.normalize_heading( TEXT ) IS $$ Extract the authority heading, thesaurus, and NACO-normalized values