From: dbs <dbs@dcc99617-32d9-48b4-a31d-7c20da2025e4>
Date: Sun, 8 Nov 2009 01:29:04 +0000 (+0000)
Subject: Normalize ISSNs on ingest so that "1972-156X" gets added as " 1972 156x " to mfr...
X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=80bc127302528898961e1b32ff8e7d49f50bbf4c;p=contrib%2FConifer.git

Normalize ISSNs on ingest so that "1972-156X" gets added as " 1972 156x " to mfr / mkfe

This enables keyword searching of this subset of ISSNs to work because it
sidesteps the full text search tokenizer that would otherwise have indexed the
values as "1972", "-156", and "x".

git-svn-id: svn://svn.open-ils.org/ILS/trunk@14825 dcc99617-32d9-48b4-a31d-7c20da2025e4
---

Review note (amended copy; the diff below differs from the commit named above):
 * The datafield loop keeps its "my $tag" declaration. The original change
   removed it while the following "$tag eq '245'" test still read $tag.
 * _data_tag_to_full_rows() restores the removed 'tnf' behaviour: only
   subfield a is emitted, with the leading characters counted by the second
   indicator skipped.
 * Hunk line counts are adjusted to match. The "index" line still carries the
   blob ids of the original commit, not of this amended result.
 * Leading whitespace was lost in the copy this was rebuilt from. Indentation
   is reconstructed with tabs and may not match the target file exactly, so
   apply with whitespace differences ignored (git apply --ignore-whitespace).

diff --git a/Open-ILS/src/perlmods/OpenILS/Application/Ingest.pm b/Open-ILS/src/perlmods/OpenILS/Application/Ingest.pm
index e0953919d2..6ea85c342f 100644
--- a/Open-ILS/src/perlmods/OpenILS/Application/Ingest.pm
+++ b/Open-ILS/src/perlmods/OpenILS/Application/Ingest.pm
@@ -1025,7 +1025,8 @@ sub xpath_to_string {
 	}
 
 	$string =~ s/(\w+)\/(\w+)/$1 $2/sgo;
-	$string =~ s/(\d{4})-(\d{4})/ $1 $2 /sgo;
+	# Split date ranges and ISSNs on the hyphen
+	$string =~ s/(\d{4})-(\d{3,4}x?)/ $1 $2 /goi;
 
 	return NFD($string);
 }
@@ -1181,87 +1182,22 @@ sub _marcxml_to_full_rows {
 
 	for my $tagline ( @{$root->getChildrenByTagName("leader")} ) {
 		next unless $tagline;
-
-		my $ns = $type->new;
-
-		$ns->tag( 'LDR' );
-		my $val = $tagline->textContent;
-		$val = NFD($val);
-		$val =~ s/\pM+//sgo;
-		$val =~ s/\pC+//sgo;
-		$val =~ s/\W+$//sgo;
-		$ns->value( $val );
-
-		push @ns_list, $ns;
+		_special_tag_to_full_rows($type, $tagline, \@ns_list, 'LDR');
 	}
 
 	for my $tagline ( @{$root->getChildrenByTagName("controlfield")} ) {
 		next unless $tagline;
-
-		my $ns = $type->new;
-
-		$ns->tag( $tagline->getAttribute( "tag" ) );
-		my $val = $tagline->textContent;
-		$val = NFD($val);
-		$val =~ s/\pM+//sgo;
-		$val =~ s/\pC+//sgo;
-		$val =~ s/\W+$//sgo;
-		$ns->value( $val );
-
-		push @ns_list, $ns;
+		_special_tag_to_full_rows($type, $tagline, \@ns_list, $tagline->getAttribute( "tag" ));
 	}
 
 	for my $tagline ( @{$root->getChildrenByTagName("datafield")} ) {
 		next unless $tagline;
 
 		my $tag = $tagline->getAttribute( "tag" );
-		my $ind1 = $tagline->getAttribute( "ind1" );
-		my $ind2 = $tagline->getAttribute( "ind2" );
-
-		for my $data ( @{$tagline->getChildrenByTagName('subfield')} ) {
-			next unless $data;
-
-			my $ns = $type->new;
-
-			$ns->tag( $tag );
-			$ns->ind1( $ind1 );
-			$ns->ind2( $ind2 );
-			$ns->subfield( $data->getAttribute( "code" ) );
-			my $val = $data->textContent;
-			$val = NFD($val);
-			$val =~ s/\pM+//sgo;
-			$val =~ s/\pC+//sgo;
-			$val =~ s/\W+$//sgo;
-			$val =~ s/(\d{4})-(\d{4})/ $1 $2 /sgo;
-			$val =~ s/(\w+)\/(\w+)/$1 $2/sgo;
-			$ns->value( lc($val) );
-
-			push @ns_list, $ns;
-		}
+		_data_tag_to_full_rows($type, $tagline, \@ns_list, $tag);
 
 		if ($xmltype eq 'metabib' and $tag eq '245') {
-			$tag = 'tnf';
-
-			for my $data ( @{$tagline->getChildrenByTagName('subfield')} ) {
-				next unless ($data and $data->getAttribute( "code" ) eq 'a');
-
-				$ns = $type->new;
-
-				$ns->tag( $tag );
-				$ns->ind1( $ind1 );
-				$ns->ind2( $ind2 );
-				$ns->subfield( $data->getAttribute( "code" ) );
-				my $val = substr( $data->textContent, $ind2 );
-				$val = NFD($val);
-				$val =~ s/\pM+//sgo;
-				$val =~ s/\pC+//sgo;
-				$val =~ s/\W+$//sgo;
-				$val =~ s/(\w+)\/(\w+)/$1 $2/sgo;
-				$val =~ s/(\d{4})-(\d{4})/ $1 $2 /sgo;
-				$ns->value( lc($val) );
-
-				push @ns_list, $ns;
-			}
+			_data_tag_to_full_rows($type, $tagline, \@ns_list, 'tnf');
 		}
 	}
 
@@ -1269,6 +1205,90 @@ sub _marcxml_to_full_rows {
 	return @ns_list;
 }
 
+=head2 _special_tag_to_full_rows
+
+Converts a leader or control field to one normalized row.
+
+Takes the row class, the field's XML node, a reference to the list that
+receives the new row, and the tag name to record on it. The field text is
+NFD-decomposed, stripped of characters in the Unicode M and C categories,
+and trimmed of trailing non-word characters; case is left untouched.
+
+=cut
+
+sub _special_tag_to_full_rows {
+	my $type = shift;
+	my $tagline = shift;
+	my $ns_list = shift;
+	my $tagname = shift;
+
+	my $ns = $type->new;
+
+	$ns->tag( $tagname );
+	my $val = $tagline->textContent;
+	$val = NFD($val);
+	$val =~ s/\pM+//sgo;
+	$val =~ s/\pC+//sgo;
+	$val =~ s/\W+$//sgo;
+	$ns->value( $val );
+
+	push @$ns_list, $ns;
+}
+
+=head2 _data_tag_to_full_rows
+
+Converts a data field to a set of normalized rows, one per subfield.
+
+Takes the row class, the field's XML node, a reference to the list that
+receives the new rows, and the tag name to record on them. Values get the
+same cleanup as leader and control fields, then have "dddd-ddd[d][x]"
+runs (date ranges, ISSNs) and "word/word" pairs split on the separator,
+and are lowercased.
+
+When the tag name is 'tnf', only subfield a is emitted and the number of
+leading characters given by a numeric second indicator is dropped first.
+
+=cut
+
+sub _data_tag_to_full_rows {
+	my $type = shift;
+	my $tagline = shift;
+	my $ns_list = shift;
+	my $tag = shift;
+
+	my $ind1 = $tagline->getAttribute( "ind1" );
+	my $ind2 = $tagline->getAttribute( "ind2" );
+
+	# 'tnf' rows keep only subfield a, minus the leading characters
+	# counted by the second indicator (ignored unless it is numeric)
+	my $is_tnf = ($tag eq 'tnf');
+	my $skip = ($is_tnf and defined $ind2 and $ind2 =~ /^\d+$/) ? $ind2 : 0;
+
+	for my $data ( @{$tagline->getChildrenByTagName('subfield')} ) {
+		next unless $data;
+		next if ($is_tnf and $data->getAttribute( "code" ) ne 'a');
+
+		my $ns = $type->new;
+
+		$ns->tag( $tag );
+		$ns->ind1( $ind1 );
+		$ns->ind2( $ind2 );
+		$ns->subfield( $data->getAttribute( "code" ) );
+		my $val = $data->textContent;
+		$val = substr( $val, $skip ) if $skip;
+		$val = NFD($val);
+		$val =~ s/\pM+//sgo;
+		$val =~ s/\pC+//sgo;
+		$val =~ s/\W+$//sgo;
+		# Split date ranges and ISSNs on the hyphen
+		$val =~ s/(\d{4})-(\d{3,4}x?)/ $1 $2 /goi;
+		$val =~ s/(\w+)\/(\w+)/$1 $2/sgo;
+		$ns->value( lc($val) );
+
+		push @$ns_list, $ns;
+	}
+}
+
 sub flat_marc_xml {
 	my $self = shift;
 	my $client = shift;