# If we're going to convert non-ASCII characters to XML entities,
# we had better be dealing with a UTF8 string to begin with
- $string = decode_utf8($string);
-
- if ($form eq 'D') {
- $string = NFD($string);
- } else {
- $string = NFC($string);
- }
+ $string = NFC($string);
# Convert raw ampersands to entities
$string =~ s/&(?!\S+;)/&/gso;
our @EXPORT_OK = qw( clean_marc naco_normalize search_normalize );
# naco_normalize(STRING, SF)
#
# Normalizes STRING by handing it, together with SF, to _normalize_codes().
# This block is a patch hunk: "-" lines are the old code, "+" lines the new.
#   old: STRING was run through decode_utf8() and the result of
#        _normalize_codes() was returned unchanged.
#   new: STRING is decomposed with NFD() before normalization and the
#        result is recomposed with NFC() before being returned.
# SF is passed through untouched; presumably a subfield code -- TODO confirm
# against _normalize_codes().
# NOTE(review): the new code no longer decodes its input, so callers
# presumably must supply an already-decoded character string -- confirm.
sub naco_normalize {
- my $str = decode_utf8(shift);
+ my $str = NFD(shift);
my $sf = shift;
# Apply NACO normalization to input string; based on
$str = _normalize_codes($str, $sf);
- return $str;
+ return NFC($str);
}
# search_normalize(STRING, SF)
#
# Normalizes STRING in two passes, both of which also receive SF:
# first _normalize_substitutions(), then _normalize_codes().
# This block is a patch hunk: "-" lines are the old code, "+" lines the new.
#   old: STRING was run through decode_utf8() and the normalized value
#        was returned unchanged.
#   new: STRING is decomposed with NFD() up front and the normalized value
#        is recomposed with NFC() on return.
# NOTE(review): the new code no longer decodes its input, so callers
# presumably must supply an already-decoded character string -- confirm.
sub search_normalize {
- my $str = decode_utf8(shift);
+ my $str = NFD(shift);
my $sf = shift;
# Substitutions run before the code-level normalization.
$str = _normalize_substitutions($str, $sf);
$str = _normalize_codes($str, $sf);
- return $str;
+ return NFC($str);
}
sub _normalize_substitutions {
# Assumes input is already in UTF-8.
# clean_marc(INPUT)
#
# Returns a cleaned-up XML string for INPUT.  INPUT may be a MARC::Record
# object (serialized via as_xml_record()) or anything else, which is used
# as the XML text directly.
# This block is a patch hunk: "-" lines are the old code, "+" lines the new.
#   old: the XML was run through decode_utf8() and returned as-is after
#        cleaning.
#   new: the XML is decomposed with NFD() before cleaning and recomposed
#        with NFC() on return.
sub clean_marc {
my $input = shift;
- my $xml = decode_utf8((UNIVERSAL::isa($input, 'MARC::Record')) ? $input->as_xml_record() : $input);
+ my $xml = NFD((UNIVERSAL::isa($input, 'MARC::Record')) ? $input->as_xml_record() : $input);
# Drop every newline so the document becomes a single line.
$xml =~ s/\n//sog;
# Remove a leading <?xml ... ?> declaration.
# NOTE(review): ".+" is greedy, so on input containing more than one "?>"
# this would match through the last one -- confirm that cannot occur.
$xml =~ s/^<\?xml.+\?\s*>//go;
# Remove whitespace-only runs between adjacent tags.
$xml =~ s/>\s+</></go;
# Strip Unicode control characters.
$xml =~ s/\p{Cc}//go;
# Delegate entity conversion to AppUtils->entityize().
$xml = OpenILS::Application::AppUtils->entityize($xml);
# Strip any characters in the \x00-\x1f range still present after entityize.
$xml =~ s/[\x00-\x1f]//go;
- return $xml;
+ return NFC($xml);
}
1;
#!perl -T
-use Test::More tests => 30;
+use Test::More tests => 32;
use Test::Warn;
+use utf8;
# Check that the utility modules can be loaded.
use_ok( 'OpenILS::Utils::Configure' );
use_ok( 'OpenILS::Utils::Cronscript' );
# clean_marc() applied to $raw_marcxml must equal $exp_xml (both are defined
# outside this excerpt).
my $clean_xml = OpenILS::Utils::Normalize::clean_marc($raw_marcxml);
is($clean_xml, $exp_xml, "clean_marc: header and space normalization");
# Diacritics checks: entityize() with the default form, entityize() with
# form 'D', and clean_marc().  The "+" lines are the two tests added by the
# patch (matching the plan bump from 30 to 32).
# NOTE(review): every expected string below is byte-identical to its input.
# If entityize() emits numeric character references for non-ASCII
# characters, these expectations were presumably such references that got
# decoded somewhere in transit -- confirm against the upstream test file
# before relying on them.
+is(OpenILS::Application::AppUtils->entityize('èöçÇÈÀ'), 'èöçÇÈÀ', 'entityize: diacritics NFC');
+is(OpenILS::Application::AppUtils->entityize('èöçÇÈÀ', 'D'), 'èöçÇÈÀ', 'entityize: diacritics NFD');
is(OpenILS::Utils::Normalize::clean_marc('èöçÇÈÀ'), 'èöçÇÈÀ', 'clean_marc: diacritics');
# Raw EDI interchange fixture containing two INVOIC messages (UNH+11 and
# UNH+12); its consumers are outside this excerpt.
my $edi_invoice = "UNA:+.? 'UNB+UNOC:3+1556150:31B+123EVER:31B+120926:1621+4'UNH+11+INVOIC:D:96A:UN'BGM+380+5TST084026+9'DTM+137:20120924:102'RFF+ON:24'NAD+BY+123EVER 0001::91'NAD+SU+1691503::31B'CUX+2:USD:4'LIN+1++9780446360272'QTY+47:5'MOA+146:4.5:USD:10'MOA+203:14.65'PRI+AAF:2.93:DI:NTP'RFF+LI:24/102'LIN+2++9780446357197'QTY+47:8'MOA+146:6.5:USD:10'MOA+203:33.84'PRI+AAF:4.23:DI:NTP'RFF+LI:24/100'UNS+S'MOA+86:66.18'ALC+C++++DL'MOA+8:2'ALC+C++++CA'MOA+131:12.3'ALC+C++++TX'MOA+8:3.39'UNT+28+11'UNH+12+INVOIC:D:96A:UN'BGM+380+5TST084027+9'DTM+137:20120924:102'RFF+ON:26'NAD+BY+123EVER 0001::91'NAD+SU+1691503::31B'CUX+2:USD:4'LIN+1++9780446360272'QTY+47:1'MOA+146:4.5:USD:10'MOA+203:4.05'PRI+AAF:4.05:DI:NTP'RFF+LI:26/106'LIN+2++9780446350105'QTY+47:3'MOA+146:6.99:USD:10'MOA+203:14.67'PRI+AAF:4.89:DI:NTP'RFF+LI:26/105'UNS+S'MOA+86:25.03'ALC+C++++DL'MOA+8:2'ALC+C++++CA'MOA+131:3'ALC+C++++TX'MOA+8:1.31'UNT+28+12'UNZ+4+4'";