From: dbs Date: Wed, 25 Mar 2009 03:30:07 +0000 (+0000) Subject: Dedupe code by moving to a common implementation of entityize() X-Git-Tag: sprint4-merge-nov22~10415 X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=2e7dac440764d6ebb47f5d90d6cc526ae6781e65;p=working%2FEvergreen.git Dedupe code by moving to a common implementation of entityize() Well, almost common. Perhaps we should make strip_ctrl_chars() and ampersize() a standard part of entityize. git-svn-id: svn://svn.open-ils.org/ILS/trunk@12664 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- diff --git a/Open-ILS/src/extras/import/marc2are.pl b/Open-ILS/src/extras/import/marc2are.pl index 401a5dbf32..e40bc47a27 100755 --- a/Open-ILS/src/extras/import/marc2are.pl +++ b/Open-ILS/src/extras/import/marc2are.pl @@ -69,7 +69,7 @@ while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) { $xml =~ s/^<\?xml.+\?\s*>//go; $xml =~ s/>\s+entityize($xml,'D'); $xml =~ s/[\x00-\x1f]//go; my $bib = new Fieldmapper::authority::record_entry; @@ -124,17 +124,3 @@ sub login { return $authtoken; } -sub entityize { - my $stuff = shift; - my $form = shift; - - if ($form and $form eq 'D') { - $stuff = NFD($stuff); - } else { - $stuff = NFC($stuff); - } - - $stuff =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/sgoe; - return $stuff; -} - diff --git a/Open-ILS/src/extras/import/marc2bre.pl b/Open-ILS/src/extras/import/marc2bre.pl index ad848e751a..3f7a2021ea 100755 --- a/Open-ILS/src/extras/import/marc2bre.pl +++ b/Open-ILS/src/extras/import/marc2bre.pl @@ -8,6 +8,7 @@ use Error qw/:try/; use OpenILS::Utils::Fieldmapper; use Digest::MD5 qw/md5_hex/; use OpenSRF::Utils::JSON; +use OpenILS::Application::AppUtils; use Data::Dumper; use Unicode::Normalize; use Encode; @@ -260,7 +261,7 @@ PROCESS: while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) { $xml =~ s/^<\?xml.+\?\s*>//go; $xml =~ s/>\s+entityize($xml,'D'); $xml =~ s/[\x00-\x1f]//go; my $bib = new Fieldmapper::biblio::record_entry; @@ -392,20 +393,6 
@@ sub preprocess { return ($field901, $tcn_value, $tcn_source); } -sub entityize { - my $stuff = shift; - my $form = shift; - - if ($form and $form eq 'D') { - $stuff = NFD($stuff); - } else { - $stuff = NFC($stuff); - } - - $stuff =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/sgoe; - return $stuff; -} - sub despace { my $value = shift; diff --git a/Open-ILS/src/perlmods/OpenILS/Application/AppUtils.pm b/Open-ILS/src/perlmods/OpenILS/Application/AppUtils.pm index dfbf7fa80e..063ed08a96 100644 --- a/Open-ILS/src/perlmods/OpenILS/Application/AppUtils.pm +++ b/Open-ILS/src/perlmods/OpenILS/Application/AppUtils.pm @@ -1399,6 +1399,22 @@ sub entityize { return $string; } +# x0000-x0008 isn't legal in XML documents +# XXX Perhaps this should just go into our standard entityize method +sub strip_ctrl_chars { + my ($self, $string) = @_; + + $string =~ s/([\x{0000}-\x{0008}])//sgoe; + return $string; +} + +# Ampersands are special, mmmkay? +# XXX Perhaps this should go into our standard entityize method +sub ampersize { + my $stuff = shift(); + $stuff =~ s/&(?!\S+;)/&amp;/gso; + return $stuff; +} sub get_copy_price { my($self, $e, $copy, $volume) = @_; diff --git a/Open-ILS/src/perlmods/OpenILS/Application/Search/Z3950.pm b/Open-ILS/src/perlmods/OpenILS/Application/Search/Z3950.pm index 5b2ce71337..98e2b297eb 100755 --- a/Open-ILS/src/perlmods/OpenILS/Application/Search/Z3950.pm +++ b/Open-ILS/src/perlmods/OpenILS/Application/Search/Z3950.pm @@ -379,9 +379,11 @@ sub process_results { die "Unsupported record transmission format $tformat" } - $marcs = entityize($marc->as_xml_record); + $marcs = $U->entityize($marc->as_xml_record); + $marcs = $U->strip_ctrl_chars($marcs); my $doc = XML::LibXML->new->parse_string($marcs); - $marcxml = entityize( $doc->documentElement->toString ); + $marcxml = $U->entityize($doc->documentElement->toString); + $marcxml = $U->strip_ctrl_chars($marcxml); my $u = OpenILS::Utils::ModsParser->new(); $u->start_mods_batch( $marcxml ); @@ -434,28 
+436,4 @@ sub compile_query { return $str; } - - -# ------------------------------------------------------------------- -# Handles the unicode -# ------------------------------------------------------------------- -sub entityize { - my $stuff = shift; - my $form = shift || ""; - - if ($form eq 'D') { - $stuff = NFD($stuff); - } else { - $stuff = NFC($stuff); - } - - $stuff =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/sgoe; - - # strip some other unfriendly chars that may leak in - $stuff =~ s/([\x{0000}-\x{0008}])//sgoe; - - return $stuff; -} - - 1; diff --git a/Open-ILS/src/perlmods/OpenILS/Application/SuperCat.pm b/Open-ILS/src/perlmods/OpenILS/Application/SuperCat.pm index 3082c1e018..34cea01bbd 100644 --- a/Open-ILS/src/perlmods/OpenILS/Application/SuperCat.pm +++ b/Open-ILS/src/perlmods/OpenILS/Application/SuperCat.pm @@ -23,6 +23,8 @@ use OpenSRF::Utils::Logger qw($logger); # ... and this is our OpenILS object (en|de)coder and psuedo-ORM package. use OpenILS::Utils::Fieldmapper; +# ... and this has some handy common methods +use OpenILS::Application::AppUtils; # We'll be working with XML, so... 
use XML::LibXML; @@ -39,6 +41,8 @@ our ( %holdings_data_cache, ); +my $U = 'OpenILS::Application::AppUtils'; + sub child_init { # we need an XML parser $_parser = new XML::LibXML; @@ -217,13 +221,6 @@ sub register_record_transforms { } } - -sub entityize { - my $stuff = NFC(shift()); - $stuff =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/sgoe; - return $stuff; -} - sub tree_walker { my $tree = shift; my $field = shift; @@ -880,7 +877,7 @@ sub retrieve_record_marcxml { my $_storage = OpenSRF::AppSession->create( 'open-ils.cstore' ); my $record = $_storage->request( 'open-ils.cstore.direct.biblio.record_entry.retrieve' => $rid )->gather(1); - return entityize( $record->marc ) if ($record); + return $U->entityize( $record->marc ) if ($record); return undef; } @@ -920,7 +917,7 @@ sub retrieve_isbn_marcxml { return undef unless (@$recs); my $record = $_storage->request( 'open-ils.cstore.direct.biblio.record_entry.retrieve' => $recs->[0]->record )->gather(1); - return entityize( $record->marc ) if ($record); + return $U->entityize( $record->marc ) if ($record); return undef; } @@ -962,7 +959,7 @@ sub retrieve_record_transform { return undef unless ($record); - return entityize($record_xslt{$transform}{xslt}->transform( $_parser->parse_string( $record->marc ) )->toString); + return $U->entityize($record_xslt{$transform}{xslt}->transform( $_parser->parse_string( $record->marc ) )->toString); } sub retrieve_isbn_transform { @@ -985,7 +982,7 @@ sub retrieve_isbn_transform { return undef unless ($record); - return entityize($record_xslt{$transform}{xslt}->transform( $_parser->parse_string( $record->marc ) )->toString); + return $U->entityize($record_xslt{$transform}{xslt}->transform( $_parser->parse_string( $record->marc ) )->toString); } sub retrieve_record_objects { @@ -1215,7 +1212,7 @@ sub retrieve_metarecord_mods { $_storage->disconnect; - return entityize($mods->toString); + return $U->entityize($mods->toString); } __PACKAGE__->register_method( diff --git 
a/Open-ILS/src/perlmods/OpenILS/WWW/SuperCat.pm b/Open-ILS/src/perlmods/OpenILS/WWW/SuperCat.pm index 0c50a9ad2d..4680fc84a8 100644 --- a/Open-ILS/src/perlmods/OpenILS/WWW/SuperCat.pm +++ b/Open-ILS/src/perlmods/OpenILS/WWW/SuperCat.pm @@ -25,11 +25,13 @@ use Unicode::Normalize; use OpenILS::Utils::Fieldmapper; use OpenILS::WWW::SuperCat::Feed; use OpenSRF::Utils::Logger qw/$logger/; +use OpenILS::Application::AppUtils; use MARC::Record; use MARC::File::XML; my $log = 'OpenSRF::Utils::Logger'; +my $U = 'OpenILS::Application::AppUtils'; # set the bootstrap config when this module is loaded my ($bootstrap, $supercat, $actor, $parser, $search, $xslt, $cn_browse_xslt, %browse_types); @@ -72,7 +74,7 @@ $browse_types{call_number}{xml} = sub { my $r_doc = $parser->parse_string($cn->record->marc); $r_doc->documentElement->setAttribute( id => $rec_tag ); - $content .= entityize($r_doc->documentElement->toString); + $content .= $U->entityize($U->ampersize($r_doc->documentElement->toString)); $content .= ""; } @@ -101,13 +103,13 @@ $browse_types{call_number}{html} = sub { return ( "Content-type: text/html\n\n", - entityize( + $U->entityize($U->ampersize( $cn_browse_xslt->transform( $parser->parse_string( $xml ), 'prev' => "'$p'", 'next' => "'$n'" )->toString(1) - ) + )) ); }; @@ -451,7 +453,7 @@ sub unapi { $feed->link( unapi => $base) if ($flesh_feed); print "Content-type: ". $feed->type ."; charset=utf-8\n\n"; - print entityize($feed->toString) . "\n"; + print $U->entityize($U->ampersize($feed->toString)) . "\n"; return Apache2::Const::OK; } @@ -697,7 +699,7 @@ sub supercat { $feed->link( unapi => $base) if ($flesh_feed); print "Content-type: ". $feed->type ."; charset=utf-8\n\n"; - print entityize($feed->toString) . "\n"; + print $U->entityize($U->ampersize($feed->toString)) . 
"\n"; return Apache2::Const::OK; } @@ -722,7 +724,7 @@ sub supercat { } print "Content-type: application/xml; charset=utf-8\n\n"; - print entityize( $parser->parse_string( $req->gather(1) )->documentElement->toString ); + print $U->entityize($U->ampersize( $parser->parse_string( $req->gather(1) )->documentElement->toString )); return Apache2::Const::OK; } @@ -798,7 +800,7 @@ sub bookbag_feed { print "Content-type: ". $feed->type ."; charset=utf-8\n\n"; - print entityize($feed->toString) . "\n"; + print $U->entityize($U->ampersize($feed->toString)) . "\n"; return Apache2::Const::OK; } @@ -867,7 +869,7 @@ sub changes_feed { print "Content-type: ". $feed->type ."; charset=utf-8\n\n"; - print entityize($feed->toString) . "\n"; + print $U->entityize($U->ampersize($feed->toString)) . "\n"; return Apache2::Const::OK; } @@ -1244,13 +1246,6 @@ sub create_record_feed { return $feed; } -sub entityize { - my $stuff = NFC(shift()); - $stuff =~ s/&(?!\S+;)/&/gso; - $stuff =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/sgoe; - return $stuff; -} - sub string_browse { my $apache = shift; return Apache2::Const::DECLINED if (-e $apache->filename); @@ -1636,7 +1631,7 @@ sub sru_search { } print $cgi->header( -type => 'application/xml' ); - print entityize($resp->asXML) . "\n"; + print $U->entityize($U->ampersize($resp->asXML)) . "\n"; return Apache2::Const::OK; }