From 2788298ec23d1caff3755f9c151d03510420651d Mon Sep 17 00:00:00 2001 From: Dan Scott <dscott@laurentian.ca> Date: Sun, 4 Mar 2012 02:41:11 -0500 Subject: [PATCH] Fix Unicode mangling in clean_marc function Calling s/\p{Cc}//go; before entityize() was resulting in all xFFFD entities being returned for the upper case diacritic characters, which in turn caused the new unit test to fail (yay unit tests). I added a corresponding unit test for entityize() to ensure that the problem wasn't coming from that function. Switching the order of the \p{Cc} regex and entityize() calls resolved the corruption in the unit test. This suggests that Vandelay may be introducing significant corruption to imported records and that backporting of this commit to the inline Vandelay variants from previous releases may be warranted. Signed-off-by: Dan Scott <dscott@laurentian.ca> Signed-off-by: Jason Stephenson <jstephenson@mvlc.org> --- .../src/perlmods/lib/OpenILS/Utils/Normalize.pm | 2 +- Open-ILS/src/perlmods/t/01-OpenILS-Application.t | 8 ++++- Open-ILS/src/perlmods/t/14-OpenILS-Utils.t | 39 +++++++++++++++++++++- 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm b/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm index 9ddca6e1df..90355768c2 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm @@ -115,8 +115,8 @@ sub clean_marc { $xml =~ s/\n//sog; $xml =~ s/^<\?xml.+\?\s*>//go; $xml =~ s/>\s+</></go; - $xml =~ s/\p{Cc}//go; $xml = OpenILS::Application::AppUtils->entityize($xml); + $xml =~ s/\p{Cc}//go; $xml =~ s/[\x00-\x1f]//go; return $xml; } diff --git a/Open-ILS/src/perlmods/t/01-OpenILS-Application.t b/Open-ILS/src/perlmods/t/01-OpenILS-Application.t index 06f3ad4987..a4367f6ca9 100644 --- a/Open-ILS/src/perlmods/t/01-OpenILS-Application.t +++ b/Open-ILS/src/perlmods/t/01-OpenILS-Application.t @@ -1,6 +1,6 @@ #!perl -T -use Test::More 
tests => 13; +use Test::More tests => 14; BEGIN { use_ok( 'OpenILS::Application' ); @@ -18,3 +18,9 @@ use_ok( 'OpenILS::Application::ResolverResolver' ); use_ok( 'OpenILS::Application::Serial' ); use_ok( 'OpenILS::Application::SuperCat' ); use_ok( 'OpenILS::Application::Vandelay' ); + +is( + OpenILS::Application::AppUtils::entityize(0, 'èöçÃÃÃ'), + 'èöçÇÈÀ', + 'entityize: diacritics' +); diff --git a/Open-ILS/src/perlmods/t/14-OpenILS-Utils.t b/Open-ILS/src/perlmods/t/14-OpenILS-Utils.t index 924e2a3f83..98789564c0 100644 --- a/Open-ILS/src/perlmods/t/14-OpenILS-Utils.t +++ b/Open-ILS/src/perlmods/t/14-OpenILS-Utils.t @@ -1,6 +1,6 @@ #!perl -T -use Test::More tests => 22; +use Test::More tests => 24; use_ok( 'OpenILS::Utils::Configure' ); use_ok( 'OpenILS::Utils::Cronscript' ); @@ -43,3 +43,40 @@ is($apostring, "its time", "naco_normalize: strip apostrophes"); my $apos = OpenILS::Utils::Normalize::search_normalize("it's time"); is($apos, "it s time", "search_normalize: replace apostrophes with space"); + +my $raw_marcxml = <<RAWMARC; +<?xml version="1.0" encoding="utf-8"?> +<record> + <leader>01614nmm a22003975u 4500</leader> + <controlfield tag="001">978-0-387-35767-6</controlfield> + <controlfield tag="003">Springer</controlfield> + <controlfield tag="005">20071022150035.8</controlfield> + <controlfield tag="007">cr nn 008mamaa</controlfield> + <controlfield tag="008">071022s2008 xx j eng d</controlfield> + <datafield tag="020" ind1=" " ind2=" "> + <subfield code="a">9780387685748</subfield> + </datafield> + <datafield tag="100" ind1="1" ind2=" "> + <subfield code="a">Neteler, Markus.</subfield> + </datafield> + <datafield tag="245" ind1="1" ind2="0"> + <subfield code="a">Open Source GIS</subfield> + <subfield code="h">[electronic resource] :</subfield> + <subfield code="b">A GRASS GIS Approach /</subfield> + <subfield code="c">edited by Markus Neteler, Helena Mitasova.</subfield> + </datafield> + <datafield tag="250" ind1=" " ind2=" "> + <subfield code="a">Third 
Edition.</subfield> + </datafield> + <datafield tag="260" ind1=" " ind2=" "> + <subfield code="a">Boston, MA :</subfield> + <subfield code="b">Springer Science+Business Media, LLC,</subfield> + <subfield code="c">2008.</subfield> + </datafield> +</record> +RAWMARC +my $exp_xml = '<record><leader>01614nmm a22003975u 4500</leader><controlfield tag="001">978-0-387-35767-6</controlfield><controlfield tag="003">Springer</controlfield><controlfield tag="005">20071022150035.8</controlfield><controlfield tag="007">cr nn 008mamaa</controlfield><controlfield tag="008">071022s2008 xx j eng d</controlfield><datafield tag="020" ind1=" " ind2=" "><subfield code="a">9780387685748</subfield></datafield><datafield tag="100" ind1="1" ind2=" "><subfield code="a">Neteler, Markus.</subfield></datafield><datafield tag="245" ind1="1" ind2="0"><subfield code="a">Open Source GIS</subfield><subfield code="h">[electronic resource] :</subfield><subfield code="b">A GRASS GIS Approach /</subfield><subfield code="c">edited by Markus Neteler, Helena Mitasova.</subfield></datafield><datafield tag="250" ind1=" " ind2=" "><subfield code="a">Third Edition.</subfield></datafield><datafield tag="260" ind1=" " ind2=" "><subfield code="a">Boston, MA :</subfield><subfield code="b">Springer Science+Business Media, LLC,</subfield><subfield code="c">2008.</subfield></datafield></record>'; +my $clean_xml = OpenILS::Utils::Normalize::clean_marc($raw_marcxml); +is($clean_xml, $exp_xml, "clean_marc: header and space normalization"); + +is(OpenILS::Utils::Normalize::clean_marc('èöçÃÃÃ'), 'èöçÇÈÀ', 'clean_marc: diacritics'); -- 2.11.0