From f761fdf4a31b679b1e7760b35ad91606eeb05b17 Mon Sep 17 00:00:00 2001 From: dbs Date: Wed, 20 Jan 2010 23:15:10 +0000 Subject: [PATCH] Correct Unicode handling for in-db ingest MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Ensure MARC::File::XML treats the content as UTF-8; then follow recommended practice of explicitly decoding the UTF-8 byte string to a character string before operating on it, then encoding it back to UTF-8 when it's returned. 'Québec' now gets naco-normalized to 'quebec' as one would expect. YAY! git-svn-id: svn://svn.open-ils.org/ILS/trunk@15351 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- Open-ILS/src/sql/Pg/020.schema.functions.sql | 14 +++++++++----- Open-ILS/src/sql/Pg/030.schema.metabib.sql | 2 +- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/Open-ILS/src/sql/Pg/020.schema.functions.sql b/Open-ILS/src/sql/Pg/020.schema.functions.sql index 0894b6ce3a..24a66f8855 100644 --- a/Open-ILS/src/sql/Pg/020.schema.functions.sql +++ b/Open-ILS/src/sql/Pg/020.schema.functions.sql @@ -34,13 +34,15 @@ CREATE OR REPLACE FUNCTION public.non_filing_normalize ( TEXT, "char" ) RETURNS $$ LANGUAGE SQL STRICT IMMUTABLE; CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$ - use Unicode::Normalize; - use Encode; + use Unicode::Normalize; + use Encode; - my $txt = lc(encode_utf8(shift)); + # When working with Unicode data, the first step is to decode it to + # a character string; after that, lowercasing is safe + my $txt = lc(decode_utf8(shift)); my $sf = shift; - $txt = NFD($txt); + $txt = NFD($txt); $txt =~ s/\pM+//go; # Remove diacritics $txt =~ s/\xE6/AE/go; # Convert ae digraph @@ -73,7 +75,9 @@ CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $ $txt =~ s/^\s+//o; # Remove leading space $txt =~ s/\s+$//o; # Remove trailing space - return $txt; + # Encoding the outgoing string is good practice, but not strictly + # necessary in this case because we've 
stripped everything from it + return encode_utf8($txt); $func$ LANGUAGE 'plperlu' STRICT IMMUTABLE; CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT ) RETURNS TEXT AS $func$ diff --git a/Open-ILS/src/sql/Pg/030.schema.metabib.sql b/Open-ILS/src/sql/Pg/030.schema.metabib.sql index 61177d260c..27c876bd0b 100644 --- a/Open-ILS/src/sql/Pg/030.schema.metabib.sql +++ b/Open-ILS/src/sql/Pg/030.schema.metabib.sql @@ -375,7 +375,7 @@ $func$ LANGUAGE PLPGSQL; CREATE OR REPLACE FUNCTION biblio.flatten_marc ( TEXT ) RETURNS SETOF metabib.full_rec AS $func$ use MARC::Record; -use MARC::File::XML; +use MARC::File::XML (BinaryEncoding => 'UTF-8'); my $xml = shift; my $r = MARC::Record->new_from_xml( $xml ); -- 2.11.0