From 2788298ec23d1caff3755f9c151d03510420651d Mon Sep 17 00:00:00 2001 From: Dan Scott Date: Sun, 4 Mar 2012 02:41:11 -0500 Subject: [PATCH] Fix Unicode mangling in clean_marc function Calling s/\p{Cc}//go; before entityize() was resulting in all xFFFD entities being returned for the upper case diacritic characters, which in turn caused the new unit test to fail (yay unit tests). I added a corresponding unit test for entityize() to ensure that the problem wasn't coming from that function. Switching the order of the \p{Cc} regex and entityize() calls resolved the corruption in the unit test. This suggests that Vandelay may be introducing significant corruption to imported records and that backporting of this commit to the inline Vandelay variants from previous releases may be warranted. Signed-off-by: Dan Scott Signed-off-by: Jason Stephenson --- .../src/perlmods/lib/OpenILS/Utils/Normalize.pm | 2 +- Open-ILS/src/perlmods/t/01-OpenILS-Application.t | 8 ++++- Open-ILS/src/perlmods/t/14-OpenILS-Utils.t | 39 +++++++++++++++++++++- 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm b/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm index 9ddca6e1df..90355768c2 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm @@ -115,8 +115,8 @@ sub clean_marc { $xml =~ s/\n//sog; $xml =~ s/^<\?xml.+\?\s*>//go; $xml =~ s/>\s+entityize($xml); + $xml =~ s/\p{Cc}//go; $xml =~ s/[\x00-\x1f]//go; return $xml; } diff --git a/Open-ILS/src/perlmods/t/01-OpenILS-Application.t b/Open-ILS/src/perlmods/t/01-OpenILS-Application.t index 06f3ad4987..a4367f6ca9 100644 --- a/Open-ILS/src/perlmods/t/01-OpenILS-Application.t +++ b/Open-ILS/src/perlmods/t/01-OpenILS-Application.t @@ -1,6 +1,6 @@ #!perl -T -use Test::More tests => 13; +use Test::More tests => 14; BEGIN { use_ok( 'OpenILS::Application' ); @@ -18,3 +18,9 @@ use_ok( 
'OpenILS::Application::ResolverResolver' ); use_ok( 'OpenILS::Application::Serial' ); use_ok( 'OpenILS::Application::SuperCat' ); use_ok( 'OpenILS::Application::Vandelay' ); + +is( + OpenILS::Application::AppUtils::entityize(0, 'èöçÇÈÀ'), + 'èöçÇÈÀ', + 'entityize: diacritics' +); diff --git a/Open-ILS/src/perlmods/t/14-OpenILS-Utils.t b/Open-ILS/src/perlmods/t/14-OpenILS-Utils.t index 924e2a3f83..98789564c0 100644 --- a/Open-ILS/src/perlmods/t/14-OpenILS-Utils.t +++ b/Open-ILS/src/perlmods/t/14-OpenILS-Utils.t @@ -1,6 +1,6 @@ #!perl -T -use Test::More tests => 22; +use Test::More tests => 24; use_ok( 'OpenILS::Utils::Configure' ); use_ok( 'OpenILS::Utils::Cronscript' ); @@ -43,3 +43,40 @@ is($apostring, "its time", "naco_normalize: strip apostrophes"); my $apos = OpenILS::Utils::Normalize::search_normalize("it's time"); is($apos, "it s time", "search_normalize: replace apostrophes with space"); + +my $raw_marcxml = < + + 01614nmm a22003975u 4500 + 978-0-387-35767-6 + Springer + 20071022150035.8 + cr nn 008mamaa + 071022s2008 xx j eng d + + 9780387685748 + + + Neteler, Markus. + + + Open Source GIS + [electronic resource] : + A GRASS GIS Approach / + edited by Markus Neteler, Helena Mitasova. + + + Third Edition. + + + Boston, MA : + Springer Science+Business Media, LLC, + 2008. + + +RAWMARC +my $exp_xml = '01614nmm a22003975u 4500978-0-387-35767-6Springer20071022150035.8cr nn 008mamaa071022s2008 xx j eng d9780387685748Neteler, Markus.Open Source GIS[electronic resource] :A GRASS GIS Approach /edited by Markus Neteler, Helena Mitasova.Third Edition.Boston, MA :Springer Science+Business Media, LLC,2008.'; +my $clean_xml = OpenILS::Utils::Normalize::clean_marc($raw_marcxml); +is($clean_xml, $exp_xml, "clean_marc: header and space normalization"); + +is(OpenILS::Utils::Normalize::clean_marc('èöçÇÈÀ'), 'èöçÇÈÀ', 'clean_marc: diacritics'); -- 2.11.0