From 3869d6ceb3685727cd1710e30efdc77af652957d Mon Sep 17 00:00:00 2001
From: dbs <dbs@dcc99617-32d9-48b4-a31d-7c20da2025e4>
Date: Thu, 21 Jan 2010 02:18:40 +0000
Subject: [PATCH] Add the schema upgrade corresponding to r15351 in-db ingest
 encoding fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Ensure MARC::File::XML treats the content as UTF-8; then follow
recommended practice of explicitly decoding the UTF-8 bytes into a
character string before operating on it, then encoding it back to UTF-8
when it's returned.

'Québec' now gets naco-normalized to 'quebec' as one would expect. YAY!


git-svn-id: svn://svn.open-ils.org/ILS/trunk@15352 dcc99617-32d9-48b4-a31d-7c20da2025e4
---
 Open-ILS/src/sql/Pg/002.schema.config.sql          |  2 +-
 .../upgrade/0138.schema.in-db-encoding-fixes.sql   | 95 ++++++++++++++++++++++
 2 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 Open-ILS/src/sql/Pg/upgrade/0138.schema.in-db-encoding-fixes.sql

diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql
index f7f29f44d3..60d97bd28b 100644
--- a/Open-ILS/src/sql/Pg/002.schema.config.sql
+++ b/Open-ILS/src/sql/Pg/002.schema.config.sql
@@ -51,7 +51,7 @@ CREATE TABLE config.upgrade_log (
     install_date    TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
 );
 
-INSERT INTO config.upgrade_log (version) VALUES ('0137'); -- miker
+INSERT INTO config.upgrade_log (version) VALUES ('0138'); -- dbs
 
 CREATE TABLE config.bib_source (
 	id		SERIAL	PRIMARY KEY,
diff --git a/Open-ILS/src/sql/Pg/upgrade/0138.schema.in-db-encoding-fixes.sql b/Open-ILS/src/sql/Pg/upgrade/0138.schema.in-db-encoding-fixes.sql
new file mode 100644
index 0000000000..a62d3b6010
--- /dev/null
+++ b/Open-ILS/src/sql/Pg/upgrade/0138.schema.in-db-encoding-fixes.sql
@@ -0,0 +1,95 @@
+
+BEGIN;
+-- Record this upgrade's version (0138) in config.upgrade_log
+INSERT INTO config.upgrade_log (version) VALUES ('0138'); -- dbs
+CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$
+	# Normalize a UTF-8 string for matching: lowercase it, strip diacritics, fold
+	# special letters and digits to ASCII, and turn punctuation into spaces.
+	#   $1: text to normalize
+	#   $2: presumably a MARC subfield code (TODO confirm with callers); when it
+	#       starts with "a", the first comma is kept unless it ends the string
+	# Returns the normalized, lowercase text encoded as UTF-8.
+	use strict;
+	use Unicode::Normalize;
+	use Encode;
+
+	# Decode the UTF-8 bytes into a character string first, so that lc(), NFD()
+	# and the code-point matches below operate on characters rather than bytes
+	my $txt = lc(decode_utf8(shift));
+	my $sf = shift;
+
+	$txt = NFD($txt);
+	$txt =~ s/\pM+//g;	# Remove diacritics
+
+	# Replacement letters are lowercase so the digraph and a typed "ae" normalize alike
+	$txt =~ s/\xE6/ae/g;	# Convert ae digraph
+	$txt =~ s/\x{153}/oe/g;	# Convert oe digraph
+	$txt =~ s/\xFE/th/g;	# Convert Icelandic thorn
+
+	$txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers
+	$txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456789+-/;# Convert subscript numbers
+
+	$txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/aabgg/;	# Convert Latin and Greek
+	$txt =~ tr/\x{2113}\xF0\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/ld /;	# Convert Misc
+	$txt =~ tr/\'\[\]\|//d;							# Remove Misc
+
+	# Only an "a" subfield keeps its first comma, and only when text follows it
+	my $commapos = index($txt, ',');
+	if ($sf && $sf =~ /^a/ && $commapos > -1 && $commapos != length($txt) - 1) {
+		my ($first, @rest) = split /,/, $txt;
+		$txt = $first . ',' . join(' ', @rest);
+	} else {
+		$txt =~ s/,/ /g;
+	}
+
+	$txt =~ s/\s+/ /g;	# Compress multiple spaces
+	$txt =~ s/^\s+|\s+$//g;	# Remove leading and trailing space
+
+	# Encode back to UTF-8 for the caller; surviving characters need not be ASCII
+	return encode_utf8($txt);
+$func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION biblio.flatten_marc ( TEXT ) RETURNS SETOF metabib.full_rec AS $func$
+# Emit one row per element of the MARCXML record passed as $1: the leader
+# (tag 'LDR'), each control field, each subfield, and a 'tnf' row per 245$a.
+use MARC::Record;
+use MARC::File::XML (BinaryEncoding => 'UTF-8');
+
+my $record = MARC::Record->new_from_xml( shift );
+
+return_next( { tag => 'LDR', value => $record->leader } );
+
+FIELD: for my $field ( $record->fields ) {
+	my $tag = $field->tag;
+
+	if ($field->is_control_field) {
+		return_next({ tag => $tag, value => $field->data });
+		next FIELD;
+	}
+
+	my $ind1 = $field->indicator(1);
+	my $ind2 = $field->indicator(2);
+
+	for my $pair ($field->subfields) {
+		my ($code, $data) = @$pair;
+		my %row = ( tag => $tag, ind1 => $ind1, ind2 => $ind2 );
+
+		return_next({ %row, subfield => $code, value => $data });
+
+		# 245$a also yields a 'tnf' row whose value skips the first ind2 characters
+		next unless $tag eq '245' and $code eq 'a';
+		return_next({
+			%row,
+			tag      => 'tnf',
+			subfield => 'a',
+			value    => substr( $data, $ind2 || 0 ),
+		});
+	}
+}
+
+return undef;
+$func$ LANGUAGE PLPERLU;
+
+-- Commit the upgrade_log entry and both function replacements together
+COMMIT;
+
-- 
2.11.0