From: dbs Date: Mon, 9 Nov 2009 01:12:25 +0000 (+0000) Subject: Backport r14825 and r14826 to address ISSN ingest and retrieval bugs X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=6981aed37b94f9276e5860fc81d24d1625e024e7;p=Evergreen.git Backport r14825 and r14826 to address ISSN ingest and retrieval bugs git-svn-id: svn://svn.open-ils.org/ILS/branches/rel_1_6_0@14831 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- diff --git a/Open-ILS/src/perlmods/OpenILS/Application/Ingest.pm b/Open-ILS/src/perlmods/OpenILS/Application/Ingest.pm index e0953919d2..6ea85c342f 100644 --- a/Open-ILS/src/perlmods/OpenILS/Application/Ingest.pm +++ b/Open-ILS/src/perlmods/OpenILS/Application/Ingest.pm @@ -1025,7 +1025,8 @@ sub xpath_to_string { } $string =~ s/(\w+)\/(\w+)/$1 $2/sgo; - $string =~ s/(\d{4})-(\d{4})/ $1 $2 /sgo; + # Split date ranges and ISSNs on the hyphen + $string =~ s/(\d{4})-(\d{3,4}x?)/ $1 $2 /goi; return NFD($string); } @@ -1181,87 +1182,20 @@ sub _marcxml_to_full_rows { for my $tagline ( @{$root->getChildrenByTagName("leader")} ) { next unless $tagline; - - my $ns = $type->new; - - $ns->tag( 'LDR' ); - my $val = $tagline->textContent; - $val = NFD($val); - $val =~ s/\pM+//sgo; - $val =~ s/\pC+//sgo; - $val =~ s/\W+$//sgo; - $ns->value( $val ); - - push @ns_list, $ns; + _special_tag_to_full_rows($type, $tagline, \@ns_list, 'LDR'); } for my $tagline ( @{$root->getChildrenByTagName("controlfield")} ) { next unless $tagline; - - my $ns = $type->new; - - $ns->tag( $tagline->getAttribute( "tag" ) ); - my $val = $tagline->textContent; - $val = NFD($val); - $val =~ s/\pM+//sgo; - $val =~ s/\pC+//sgo; - $val =~ s/\W+$//sgo; - $ns->value( $val ); - - push @ns_list, $ns; + _special_tag_to_full_rows($type, $tagline, \@ns_list, $tagline->getAttribute( "tag" )); } for my $tagline ( @{$root->getChildrenByTagName("datafield")} ) { next unless $tagline; - - my $tag = $tagline->getAttribute( "tag" ); - my $ind1 = $tagline->getAttribute( "ind1" ); - my $ind2 = $tagline->getAttribute( "ind2" ); - - for my $data ( @{$tagline->getChildrenByTagName('subfield')} ) { - next unless $data; - - my $ns = $type->new; - - $ns->tag( $tag ); - $ns->ind1( $ind1 ); - $ns->ind2( $ind2 ); - $ns->subfield( $data->getAttribute( "code" ) ); - my $val = $data->textContent; - $val = NFD($val); - $val =~ s/\pM+//sgo; - $val =~ s/\pC+//sgo; - $val =~ s/\W+$//sgo; - $val =~ s/(\d{4})-(\d{4})/ $1 $2 /sgo; - $val =~ s/(\w+)\/(\w+)/$1 $2/sgo; - $ns->value( lc($val) ); - - push @ns_list, $ns; - } + _data_tag_to_full_rows($type, $tagline, \@ns_list, $tagline->getAttribute( "tag" )); if ($xmltype eq 'metabib' and $tag eq '245') { - $tag = 'tnf'; - - for my $data ( @{$tagline->getChildrenByTagName('subfield')} ) { - next unless ($data and $data->getAttribute( "code" ) eq 'a'); - - $ns = $type->new; - - $ns->tag( $tag ); - $ns->ind1( $ind1 ); - $ns->ind2( $ind2 ); - $ns->subfield( $data->getAttribute( "code" ) ); - my $val = substr( $data->textContent, $ind2 ); - $val = NFD($val); - $val =~ s/\pM+//sgo; - $val =~ s/\pC+//sgo; - $val =~ s/\W+$//sgo; - $val =~ s/(\w+)\/(\w+)/$1 $2/sgo; - $val =~ s/(\d{4})-(\d{4})/ $1 $2 /sgo; - $ns->value( lc($val) ); - - push @ns_list, $ns; - } + _data_tag_to_full_rows($type, $tagline, \@ns_list, 'tnf'); } } @@ -1269,6 +1203,69 @@ sub _marcxml_to_full_rows { return @ns_list; } +=head2 _special_tag_to_full_rows + +Converts a leader or control field to a set of normalized values + +=cut + +sub _special_tag_to_full_rows { + my $type = shift; + my $tagline = shift; + my $ns_list = shift; + my $tagname = shift; + + my $ns = $type->new; + + $ns->tag( $tagname ); + my $val = $tagline->textContent; + $val = NFD($val); + $val =~ s/\pM+//sgo; + $val =~ s/\pC+//sgo; + $val =~ s/\W+$//sgo; + $ns->value( $val ); + + push @$ns_list, $ns; +} + +=head2 _data_tag_to_full_rows + +Converts a data field to a set of normalized values + +=cut + +sub _data_tag_to_full_rows { + my $type = shift; + my $tagline = shift; + my $ns_list = shift; + my $tag = shift; + + my $ind1 = $tagline->getAttribute( "ind1" ); + my $ind2 = $tagline->getAttribute( "ind2" ); + + for my $data ( @{$tagline->getChildrenByTagName('subfield')} ) { + next unless $data; + + my $ns = $type->new; + + $ns->tag( $tag ); + $ns->ind1( $ind1 ); + $ns->ind2( $ind2 ); + $ns->subfield( $data->getAttribute( "code" ) ); + my $val = $data->textContent; + $val = NFD($val); + $val =~ s/\pM+//sgo; + $val =~ s/\pC+//sgo; + $val =~ s/\W+$//sgo; + # Split date ranges and ISSNs on the hyphen + $val =~ s/(\d{4})-(\d{3,4}x?)/ $1 $2 /goi; + $val =~ s/(\w+)\/(\w+)/$1 $2/sgo; + $ns->value( lc($val) ); + + push @$ns_list, $ns; + } +} + sub flat_marc_xml { my $self = shift; my $client = shift; diff --git a/Open-ILS/src/sql/Pg/reporter-schema.sql b/Open-ILS/src/sql/Pg/reporter-schema.sql index c7f787a6e2..2790e02b6a 100644 --- a/Open-ILS/src/sql/Pg/reporter-schema.sql +++ b/Open-ILS/src/sql/Pg/reporter-schema.sql @@ -127,7 +127,7 @@ SELECT r.id, series_statement.value AS series_statement, summary.value AS summary, ARRAY_ACCUM( SUBSTRING(isbn.value FROM $$^\S+$$) ) AS isbn, - ARRAY_ACCUM( SUBSTRING(issn.value FROM $$^\S+$$) ) AS issn, + ARRAY_ACCUM( REGEXP_REPLACE(issn.value, E'^\\S*(\\d{4})[-\\s](\\d{3,4}x?)', E'\\1 \\2') ) AS issn, ARRAY((SELECT DISTINCT value FROM metabib.full_rec WHERE tag = '650' AND subfield = 'a' AND record = r.id)) AS topic_subject, ARRAY((SELECT DISTINCT value FROM metabib.full_rec WHERE tag = '651' AND subfield = 'a' AND record = r.id)) AS geographic_subject, ARRAY((SELECT DISTINCT value FROM metabib.full_rec WHERE tag = '655' AND subfield = 'a' AND record = r.id)) AS genre,