From f761fdf4a31b679b1e7760b35ad91606eeb05b17 Mon Sep 17 00:00:00 2001
From: dbs <dbs@dcc99617-32d9-48b4-a31d-7c20da2025e4>
Date: Wed, 20 Jan 2010 23:15:10 +0000
Subject: [PATCH] Correct Unicode handling for in-db ingest
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Ensure MARC::File::XML treats the content as UTF-8; then follow the
recommended practice of explicitly decoding the incoming UTF-8 byte
string to a Perl character string before operating on it, then
encoding it back to UTF-8 bytes when it's returned.

'Québec' now gets naco-normalized to 'quebec' as one would expect. YAY!


git-svn-id: svn://svn.open-ils.org/ILS/trunk@15351 dcc99617-32d9-48b4-a31d-7c20da2025e4
---
 Open-ILS/src/sql/Pg/020.schema.functions.sql | 14 +++++++++-----
 Open-ILS/src/sql/Pg/030.schema.metabib.sql   |  2 +-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/Open-ILS/src/sql/Pg/020.schema.functions.sql b/Open-ILS/src/sql/Pg/020.schema.functions.sql
index 0894b6ce3a..24a66f8855 100644
--- a/Open-ILS/src/sql/Pg/020.schema.functions.sql
+++ b/Open-ILS/src/sql/Pg/020.schema.functions.sql
@@ -34,13 +34,15 @@ CREATE OR REPLACE FUNCTION public.non_filing_normalize ( TEXT, "char" ) RETURNS
 $$ LANGUAGE SQL STRICT IMMUTABLE;
 
 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$
-    use Unicode::Normalize;
-    use Encode;
+	use Unicode::Normalize;
+	use Encode;
 
-	my $txt = lc(encode_utf8(shift));
+	# When working with Unicode data, the first step is to decode the UTF-8
+	# bytes to a character string; after that, lowercasing is safe
+	my $txt = lc(decode_utf8(shift));
 	my $sf = shift;
 
-    $txt = NFD($txt);
+	$txt = NFD($txt);
 	$txt =~ s/\pM+//go;	# Remove diacritics
 
 	$txt =~ s/\xE6/AE/go;	# Convert ae digraph
@@ -73,7 +75,9 @@ CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $
 	$txt =~ s/^\s+//o;	# Remove leading space
 	$txt =~ s/\s+$//o;	# Remove trailing space
 
-	return $txt;
+	# Encoding the outgoing string is good practice, but not strictly
+	# necessary in this case because we've stripped everything from it
+	return encode_utf8($txt);
 $func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
 
 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT ) RETURNS TEXT AS $func$
diff --git a/Open-ILS/src/sql/Pg/030.schema.metabib.sql b/Open-ILS/src/sql/Pg/030.schema.metabib.sql
index 61177d260c..27c876bd0b 100644
--- a/Open-ILS/src/sql/Pg/030.schema.metabib.sql
+++ b/Open-ILS/src/sql/Pg/030.schema.metabib.sql
@@ -375,7 +375,7 @@ $func$ LANGUAGE PLPGSQL;
 CREATE OR REPLACE FUNCTION biblio.flatten_marc ( TEXT ) RETURNS SETOF metabib.full_rec AS $func$
 
 use MARC::Record;
-use MARC::File::XML;
+use MARC::File::XML (BinaryEncoding => 'UTF-8');
 
 my $xml = shift;
 my $r = MARC::Record->new_from_xml( $xml );
-- 
2.11.0