From 4d39dd908d6c7abd202cc48e7ae472b0700c32ca Mon Sep 17 00:00:00 2001
From: gmc <gmc@dcc99617-32d9-48b4-a31d-7c20da2025e4>
Date: Wed, 19 Jan 2011 16:07:14 +0000
Subject: [PATCH] backport naco_normalize revisions to rel_2_0

This implements the latest version of the NACO
normalization specification found at

http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf

This version of the algorithm is more general -- for example,
all combining characters are removed -- so there should be
fewer fiddly edge cases to worry about for most European
languages.

Rebuilding the metabib.*_field_entry tables (e.g., by using
reingest-1.6-2.0.pl) is recommended if there are any bibs that contain
any non-ASCII characters.

Normalized text is now left in the NFKD form, so while this should
be transparent to the search system after reindexing, it does mean
that (for example) Korean text in metabib.*_field_entry may not
be in the same Unicode normalization form as that found in
biblio.record_entry.

Also includes fix for bug #684467: more bulletproofing of naco_normalize

Signed-off-by: Galen Charlton <gmc@esilibrary.com>

git-svn-id: svn://svn.open-ils.org/ILS/branches/rel_2_0@19205 dcc99617-32d9-48b4-a31d-7c20da2025e4
---
 .../perlmods/OpenILS/Application/Storage/FTS.pm    |  92 ++++++++++++-------
 Open-ILS/src/sql/Pg/002.schema.config.sql          |   2 +-
 Open-ILS/src/sql/Pg/020.schema.functions.sql       |  83 +++++++++--------
 Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql       | 101 ++++++++++++---------
 .../upgrade/0478.schema.naco_normalize_tweak.sql   |  69 ++++++++++++++
 Open-ILS/tests/naco_normalize.t                    |  91 +++++++++++++++++++
 6 files changed, 321 insertions(+), 117 deletions(-)
 create mode 100644 Open-ILS/src/sql/Pg/upgrade/0478.schema.naco_normalize_tweak.sql
 create mode 100644 Open-ILS/tests/naco_normalize.t

diff --git a/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm b/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm
index eb1a97f9af..321333c5b8 100644
--- a/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm
+++ b/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm
@@ -6,6 +6,7 @@ package OpenILS::Application::Storage::FTS;
 use OpenSRF::Utils::Logger qw/:level/;
 use Parse::RecDescent;
 use Unicode::Normalize;
+use Encode;
 
 my $_default_grammar_parser = new Parse::RecDescent ( <<'GRAMMAR' );
 
@@ -27,45 +28,66 @@ numeric_range: /\d+-\d*/
 
 GRAMMAR
 
+# FIXME - this is a copy-and-paste of the naco_normalize
+#         stored procedure
 sub naco_normalize {
 
-    my $txt = lc(shift);
+    my $str = decode_utf8(shift);
     my $sf = shift;
 
-    $txt = NFD($txt);
-    $txt =~ s/\pM+//go; # Remove diacritics
-
-    $txt =~ s/\xE6/AE/go;   # Convert ae digraph
-    $txt =~ s/\x{153}/OE/go;# Convert oe digraph
-    $txt =~ s/\xFE/TH/go;   # Convert Icelandic thorn
-
-    $txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers
-    $txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456889+-/;# Convert subscript numbers
-
-    $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/;     # Convert Latin and Greek
-    $txt =~ tr/\x{2113}\xF0\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LD /; # Convert Misc
-    $txt =~ tr/\'\[\]\|//d;                         # Remove Misc
-
-    if ($sf && $sf =~ /^a/o) {
-        my $commapos = index($txt,',');
-        if ($commapos > -1) {
-            if ($commapos != length($txt) - 1) {
-                my @list = split /,/, $txt;
-                my $first = shift @list;
-                $txt = $first . ',' . join(' ', @list);
-            } else {
-                $txt =~ s/,/ /go;
-            }
-        }
-    } else {
-        $txt =~ s/,/ /go;
-    }
-
-    $txt =~ s/\s+/ /go; # Compress multiple spaces
-    $txt =~ s/^\s+//o;  # Remove leading space
-    $txt =~ s/\s+$//o;  # Remove trailing space
-
-    return $txt;
+    # Apply NACO normalization to input string; based on
+    # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf
+    #
+    # Note that unlike a strict reading of the NACO normalization rules,
+    # output is returned as lowercase instead of uppercase for compatibility
+    # with previous versions of the Evergreen naco_normalize routine.
+
+    # Convert to upper-case first; even though final output will be lowercase, doing this will
+    # ensure that the German eszett (Ã) and certain ligatures (ï¬, ï¬, ï¬, etc.) will be handled correctly.
+    # If there are any bugs in Perl's implementation of upcasing, they will be passed through here.
+    $str = uc $str;
+
+    # remove non-filing strings
+    $str =~ s/\x{0098}.*?\x{009C}//g;
+
+    $str = NFKD($str);
+
+    # additional substitutions - 3.6.
+    $str =~ s/\x{00C6}/AE/g;
+    $str =~ s/\x{00DE}/TH/g;
+    $str =~ s/\x{0152}/OE/g;
+    $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}]['/DDOLl/d;
+
+    # transformations based on Unicode category codes
+    $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g;
+
+	if ($sf && $sf =~ /^a/o) {
+		my $commapos = index($str, ',');
+		if ($commapos > -1) {
+			if ($commapos != length($str) - 1) {
+                $str =~ s/,/\x07/; # preserve first comma
+			}
+		}
+	}
+
+    # since we've stripped out the control characters, we can now
+    # use a few as placeholders temporarily
+    $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/;
+    $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g;
+    $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/;
+
+    # decimal digits
+    $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/;
+
+    # intentionally skipping step 8 of the NACO algorithm; if the string
+    # gets normalized away, that's fine.
+
+    # leading and trailing spaces
+    $str =~ s/\s+/ /g;
+    $str =~ s/^\s+//;
+    $str =~ s/\s+$//g;
+
+    return lc $str;
 }
 
 #' stupid vim syntax highlighting ...
diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql
index a8370a0719..6644e6407f 100644
--- a/Open-ILS/src/sql/Pg/002.schema.config.sql
+++ b/Open-ILS/src/sql/Pg/002.schema.config.sql
@@ -70,7 +70,7 @@ CREATE TABLE config.upgrade_log (
     install_date    TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
 );
 
-INSERT INTO config.upgrade_log (version) VALUES ('0476'); -- dbs
+INSERT INTO config.upgrade_log (version) VALUES ('0478'); -- gmcharlt
 
 CREATE TABLE config.bib_source (
 	id		SERIAL	PRIMARY KEY,
diff --git a/Open-ILS/src/sql/Pg/020.schema.functions.sql b/Open-ILS/src/sql/Pg/020.schema.functions.sql
index af184e2dc2..1745dbb147 100644
--- a/Open-ILS/src/sql/Pg/020.schema.functions.sql
+++ b/Open-ILS/src/sql/Pg/020.schema.functions.sql
@@ -34,56 +34,67 @@ CREATE OR REPLACE FUNCTION public.non_filing_normalize ( TEXT, "char" ) RETURNS
 $$ LANGUAGE SQL STRICT IMMUTABLE;
 
 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$
-	use Unicode::Normalize;
-	use Encode;
 
-	# When working with Unicode data, the first step is to decode it to
-	# a byte string; after that, lowercasing is safe
-	my $txt = lc(decode_utf8(shift));
-	my $sf = shift;
+    use strict;
+    use Unicode::Normalize;
+    use Encode;
+
+    my $str = decode_utf8(shift);
+    my $sf = shift;
+
+    # Apply NACO normalization to input string; based on
+    # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf
+    #
+    # Note that unlike a strict reading of the NACO normalization rules,
+    # output is returned as lowercase instead of uppercase for compatibility
+    # with previous versions of the Evergreen naco_normalize routine.
 
-	$txt = NFD($txt);
-	$txt =~ s/\pM+//go;	# Remove diacritics
+    # Convert to upper-case first; even though final output will be lowercase, doing this will
+    # ensure that the German eszett (Ã) and certain ligatures (ï¬, ï¬, ï¬, etc.) will be handled correctly.
+    # If there are any bugs in Perl's implementation of upcasing, they will be passed through here.
+    $str = uc $str;
 
-	# remove non-combining diacritics
-	# this list of characters follows the NACO normalization spec,
-	# but a looser but more comprehensive version might be
-	# $txt =~ s/\pLm+//go;
-	$txt =~ tr/\x{02B9}\x{02BA}\x{02BB}\x{02BC}//d;
+    # remove non-filing strings
+    $str =~ s/\x{0098}.*?\x{009C}//g;
 
-	$txt =~ s/\xE6/AE/go;	# Convert ae digraph
-	$txt =~ s/\x{153}/OE/go;# Convert oe digraph
-	$txt =~ s/\xFE/TH/go;	# Convert Icelandic thorn
+    $str = NFKD($str);
 
-	$txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers
-	$txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456889+-/;# Convert subscript numbers
+    # additional substitutions - 3.6.
+    $str =~ s/\x{00C6}/AE/g;
+    $str =~ s/\x{00DE}/TH/g;
+    $str =~ s/\x{0152}/OE/g;
+    $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}]['/DDOLl/d;
 
-	$txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/;	 	# Convert Latin and Greek
-	$txt =~ tr/\x{2113}\xF0\x{111}\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LDD /;	# Convert Misc
-	$txt =~ tr/\'\[\]\|//d;							# Remove Misc
+    # transformations based on Unicode category codes
+    $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g;
 
 	if ($sf && $sf =~ /^a/o) {
-		my $commapos = index($txt,',');
+		my $commapos = index($str, ',');
 		if ($commapos > -1) {
-			if ($commapos != length($txt) - 1) {
-				my @list = split /,/, $txt;
-				my $first = shift @list;
-				$txt = $first . ',' . join(' ', @list);
-			} else {
-				$txt =~ s/,/ /go;
+			if ($commapos != length($str) - 1) {
+                $str =~ s/,/\x07/; # preserve first comma
 			}
 		}
-	} else {
-		$txt =~ s/,/ /go;
 	}
 
-	$txt =~ s/\s+/ /go;	# Compress multiple spaces
-	$txt =~ s/^\s+//o;	# Remove leading space
-	$txt =~ s/\s+$//o;	# Remove trailing space
+    # since we've stripped out the control characters, we can now
+    # use a few as placeholders temporarily
+    $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/;
+    $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g;
+    $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/;
+
+    # decimal digits
+    $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/;
+
+    # intentionally skipping step 8 of the NACO algorithm; if the string
+    # gets normalized away, that's fine.
+
+    # leading and trailing spaces
+    $str =~ s/\s+/ /g;
+    $str =~ s/^\s+//;
+    $str =~ s/\s+$//g;
 
-	# Encoding the outgoing string is good practice, but not strictly
-	# necessary in this case because we've stripped everything from it
-	return encode_utf8($txt);
+    return lc $str;
 $func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
 
 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT ) RETURNS TEXT AS $func$
diff --git a/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql b/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql
index 1e27959d50..aac25cc909 100644
--- a/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql
+++ b/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql
@@ -7003,56 +7003,67 @@ $$ LANGUAGE PLPGSQL;
 DROP TABLE IF EXISTS config.index_normalizer CASCADE;
 
 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$
+
+    use strict;
     use Unicode::Normalize;
     use Encode;
 
-    # When working with Unicode data, the first step is to decode it to
-    # a byte string; after that, lowercasing is safe
-    my $txt = lc(decode_utf8(shift));
+    my $str = decode_utf8(shift);
     my $sf = shift;
 
-    $txt = NFD($txt);
-    $txt =~ s/\pM+//go; # Remove diacritics
-
-    # remove non-combining diacritics
-    # this list of characters follows the NACO normalization spec,
-    # but a looser but more comprehensive version might be
-    # $txt =~ s/\pLm+//go;
-    $txt =~ tr/\x{02B9}\x{02BA}\x{02BB}\x{02BC}//d;
-
-    $txt =~ s/\xE6/AE/go;   # Convert ae digraph
-    $txt =~ s/\x{153}/OE/go;# Convert oe digraph
-    $txt =~ s/\xFE/TH/go;   # Convert Icelandic thorn
-
-    $txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers
-    $txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456889+-/;# Convert subscript numbers
-
-    $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/;     # Convert Latin and Greek
-    $txt =~ tr/\x{2113}\xF0\x{111}\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LDD /; # Convert Misc
-    $txt =~ tr/\'\[\]\|//d;                         # Remove Misc
-
-    if ($sf && $sf =~ /^a/o) {
-        my $commapos = index($txt,',');
-        if ($commapos > -1) {
-            if ($commapos != length($txt) - 1) {
-                my @list = split /,/, $txt;
-                my $first = shift @list;
-                $txt = $first . ',' . join(' ', @list);
-            } else {
-                $txt =~ s/,/ /go;
-            }
-        }
-    } else {
-        $txt =~ s/,/ /go;
-    }
-
-    $txt =~ s/\s+/ /go; # Compress multiple spaces
-    $txt =~ s/^\s+//o;  # Remove leading space
-    $txt =~ s/\s+$//o;  # Remove trailing space
-
-    # Encoding the outgoing string is good practice, but not strictly
-    # necessary in this case because we've stripped everything from it
-    return encode_utf8($txt);
+    # Apply NACO normalization to input string; based on
+    # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf
+    #
+    # Note that unlike a strict reading of the NACO normalization rules,
+    # output is returned as lowercase instead of uppercase for compatibility
+    # with previous versions of the Evergreen naco_normalize routine.
+
+    # Convert to upper-case first; even though final output will be lowercase, doing this will
+    # ensure that the German eszett (Ã) and certain ligatures (ï¬, ï¬, ï¬, etc.) will be handled correctly.
+    # If there are any bugs in Perl's implementation of upcasing, they will be passed through here.
+    $str = uc $str;
+
+    # remove non-filing strings
+    $str =~ s/\x{0098}.*?\x{009C}//g;
+
+    $str = NFKD($str);
+
+    # additional substitutions - 3.6.
+    $str =~ s/\x{00C6}/AE/g;
+    $str =~ s/\x{00DE}/TH/g;
+    $str =~ s/\x{0152}/OE/g;
+    $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}]['/DDOLl/d;
+
+    # transformations based on Unicode category codes
+    $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g;
+
+	if ($sf && $sf =~ /^a/o) {
+		my $commapos = index($str, ',');
+		if ($commapos > -1) {
+			if ($commapos != length($str) - 1) {
+                $str =~ s/,/\x07/; # preserve first comma
+			}
+		}
+	}
+
+    # since we've stripped out the control characters, we can now
+    # use a few as placeholders temporarily
+    $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/;
+    $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g;
+    $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/;
+
+    # decimal digits
+    $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/;
+
+    # intentionally skipping step 8 of the NACO algorithm; if the string
+    # gets normalized away, that's fine.
+
+    # leading and trailing spaces
+    $str =~ s/\s+/ /g;
+    $str =~ s/^\s+//;
+    $str =~ s/\s+$//g;
+
+    return lc $str;
 $func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
 
 -- Some handy functions, based on existing ones, to provide optional ingest normalization
diff --git a/Open-ILS/src/sql/Pg/upgrade/0478.schema.naco_normalize_tweak.sql b/Open-ILS/src/sql/Pg/upgrade/0478.schema.naco_normalize_tweak.sql
new file mode 100644
index 0000000000..8c480ae5b9
--- /dev/null
+++ b/Open-ILS/src/sql/Pg/upgrade/0478.schema.naco_normalize_tweak.sql
@@ -0,0 +1,69 @@
+BEGIN;
+
+INSERT INTO config.upgrade_log (version) VALUES ('0478'); -- gmcharlt
+
+CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$
+
+    use strict;
+    use Unicode::Normalize;
+    use Encode;
+
+    my $str = decode_utf8(shift);
+    my $sf = shift;
+
+    # Apply NACO normalization to input string; based on
+    # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf
+    #
+    # Note that unlike a strict reading of the NACO normalization rules,
+    # output is returned as lowercase instead of uppercase for compatibility
+    # with previous versions of the Evergreen naco_normalize routine.
+
+    # Convert to upper-case first; even though final output will be lowercase, doing this will
+    # ensure that the German eszett (Ã) and certain ligatures (ï¬, ï¬, ï¬, etc.) will be handled correctly.
+    # If there are any bugs in Perl's implementation of upcasing, they will be passed through here.
+    $str = uc $str;
+
+    # remove non-filing strings
+    $str =~ s/\x{0098}.*?\x{009C}//g;
+
+    $str = NFKD($str);
+
+    # additional substitutions - 3.6.
+    $str =~ s/\x{00C6}/AE/g;
+    $str =~ s/\x{00DE}/TH/g;
+    $str =~ s/\x{0152}/OE/g;
+    $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}]['/DDOLl/d;
+
+    # transformations based on Unicode category codes
+    $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g;
+
+	if ($sf && $sf =~ /^a/o) {
+		my $commapos = index($str, ',');
+		if ($commapos > -1) {
+			if ($commapos != length($str) - 1) {
+                $str =~ s/,/\x07/; # preserve first comma
+			}
+		}
+	}
+
+    # since we've stripped out the control characters, we can now
+    # use a few as placeholders temporarily
+    $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/;
+    $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g;
+    $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/;
+
+    # decimal digits
+    $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/;
+
+    # intentionally skipping step 8 of the NACO algorithm; if the string
+    # gets normalized away, that's fine.
+
+    # leading and trailing spaces
+    $str =~ s/\s+/ /g;
+    $str =~ s/^\s+//;
+    $str =~ s/\s+$//g;
+
+    return lc $str;
+$func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
+
+COMMIT;
diff --git a/Open-ILS/tests/naco_normalize.t b/Open-ILS/tests/naco_normalize.t
new file mode 100644
index 0000000000..34dbe03bdf
--- /dev/null
+++ b/Open-ILS/tests/naco_normalize.t
@@ -0,0 +1,91 @@
+use strict;
+use warnings;
+use utf8;
+
+use Test::More tests => 50;
+use Unicode::Normalize;
+use DBI;
+
+use OpenILS::Application::Storage::FTS;
+
+# This could be made better in at least one of two ways (or both);
+# 1. put PL/Perl code that doesn't require a database into external
+#    modules so that test frameworks can get at it more easily
+# 2. Build a test harness that knows how to find an Evergreen
+#    database to use for non-destructive testing.  Of course, there
+#    can be a chicken-and-egg problem here; also, a complete test
+#    suite would need to be able to do *destructive* testing, from
+#    which we'd presumably want to protect production databases.
+
+# Database connection parameters
+my $db_driver = 'Pg';
+my $db_host   = 'evergreen';
+my $db_port   = '5432';
+my $db_name   = 'evergreen';
+my $db_user   = 'evergreen';
+my $db_pw     = 'evergreen';
+my $dsn       = "dbi:" . $db_driver . ":dbname=" . $db_name .';host=' . $db_host . ';port=' . $db_port;
+
+binmode STDOUT, ':utf8';
+binmode STDERR, ':utf8';
+
+my @test_cases = (
+    [ 'abc', 'abc', 'regular text' ],
+    [ 'ABC', 'abc', 'regular text' ],
+    [ 'Ã¥bÃ§dÃ©Ã±ÅÃ¶Ã®Ã¸Ã¦ÃÄ±ÃÃÃÃÃÃÃÃÃÃÅÃ¨', 'abcdenoeoioaeciaaiiiooouaeoee', 'European diacritics' ],
+    [ 'âââÂ«quotesÂ»ââ', 'quotes', 'special quotes' ],
+    [ 'ÂabcÂ def', 'def', 'special non-filing characters designation' ],
+    [ 'Âabcdef', 'abcdef', 'unpaired start of string' ],
+    [ 'Ã', 'ss', 'sharp S (eszett)' ],
+    [ 'ï¬ï¬ï¬', 'flfiff', 'ligatures' ],
+    [ 'Æ Æ¡Æ¯Æ°Â²Ä²Ä³', 'oouu2ijij', 'NFKD applied correctly' ],
+    [ 'ÃÃÃÃ¦Ã°Ã¸Ã¾ÄÄÄ±ÅÅÅÅÊ»Ê¼â', 'aeothaedothddilloeoel', 'part 3.6' ],
+    [ 'Ã', 'd', 'uppercase eth (missing from 3.6?)' ],
+    [ 'Ä±Ä°', 'ii', 'Turkish I' ],
+    [ '[book\'s cover]', 'books cover', 'square brackets and apostrophe' ],
+    [ '  grue   food ', 'grue food', 'trim spaces' ],
+    # note addition of NFKD() to transform expected output
+    [ 'íêµ­ì´ ì¡°ì ë§', NFKD('íêµ­ì´ ì¡°ì ë§'), 'Korean text' ],
+    [ 'æ®éè©± / æ®éè¯', 'æ®éè©± æ®éè¯', 'Chinese text' ],
+    [ 'Ø§ÙØ¹Ø±Ø¨ÙØ©', 'Ø§ÙØ¹Ø±Ø¨ÙØ©', 'Arabic text' ],
+    [ 'á¥áá áá£áá ááá', 'á¥áá áá£áá ááá', 'Georgian text' ],
+    [ 'ÑÑÑÑÐºÐ¸Ð¹ ÑÐ·ÑÐº', 'ÑÑÑÑÐºÐ¸Ð¸ ÑÐ·ÑÐº', 'Russian text' ],
+    [ "\r\npa\tper\f", 'paper', 'other whitespace' ],
+    [ '#1: â C++, @ home & abroad', '#1 c++ @ home & abroad', 'other punctuation' ],
+    [ 'Ù Ù¡Ù¢Ù£Ù¤Ù¥', '012345', 'other decimal digits' ],
+    [ 'Â²Â³Â¹', '231', 'superscript numbers' ],
+    [ 'â­Â©Â®â¯', 'â­ â¯', 'other symbols' ],
+);
+
+# test copy of naco_normalize in OpenILS::Application::Storage::FTS
+foreach my $case (@test_cases) {
+    is(OpenILS::Application::Storage::FTS::naco_normalize($case->[0]), $case->[1], $case->[2] . ' (FTS.pm)');
+}
+is(OpenILS::Application::Storage::FTS::naco_normalize('Smith, Jane. Poet, painter, and author', 'a'),
+    'smith, jane poet painter and author',
+    'retain first comma (FTS.pm)');
+
+SKIP: {
+    my $dbh = DBI->connect($dsn, $db_user, $db_pw, {AutoCommit => 1, pg_enable_utf8 => 1, PrintError => 0});
+    skip "Failed to connect to database: $DBI::errstr", 25 if (!defined($dbh));
+
+    # test stored procedure
+    my $sth1 = $dbh->prepare_cached('SELECT public.naco_normalize(?)');
+    my $sth2 = $dbh->prepare_cached('SELECT public.naco_normalize(?, ?)');
+    sub naco_normalize_wrapper {
+        my ($str, $sf) = @_;
+        if (defined $sf) {
+            $sth2->execute($str, $sf);
+            return $sth2->fetchrow_array;
+        } else {
+            $sth1->execute($str);
+            return $sth1->fetchrow_array;
+        }
+    }
+
+    foreach my $case (@test_cases) {
+        is(naco_normalize_wrapper($case->[0]), $case->[1], $case->[2] . ' (stored procedure)');
+    }
+    is(naco_normalize_wrapper('Smith, Jane. Poet, painter, and author', 'a'), 'smith, jane poet painter and author',
+        'retain first comma (stored procedure)');
+}
-- 
2.11.0