From 4d39dd908d6c7abd202cc48e7ae472b0700c32ca Mon Sep 17 00:00:00 2001 From: gmc Date: Wed, 19 Jan 2011 16:07:14 +0000 Subject: [PATCH] backport naco_normalize revisions to rel_2_0 This implements the latest version of the NACO normalization specification found at http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf This version of the algorithm is more general -- for example, all combining characters are removed -- so there should be fewer fiddly edge cases to worry about for most European languages. Rebuilding the metabib.*_field_entry tables (e.g., by using reingest-1.6-2.0.pl) is recommended if there are any bibs that contain any non-ASCII characters. Normalized text is now left in the NFKD form, so while this should be transparent to the search system after reindexing, it does mean that (for example) Korean text in metabib.*_field_entry may not be in the same Unicode normalization form as that found in biblio.record_entry. Also includes fix for bug #684467: more bulletproofing of naco_normalize Signed-off-by: Galen Charlton git-svn-id: svn://svn.open-ils.org/ILS/branches/rel_2_0@19205 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- .../perlmods/OpenILS/Application/Storage/FTS.pm | 92 ++++++++++++------- Open-ILS/src/sql/Pg/002.schema.config.sql | 2 +- Open-ILS/src/sql/Pg/020.schema.functions.sql | 83 +++++++++-------- Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql | 101 ++++++++++++--------- .../upgrade/0478.schema.naco_normalize_tweak.sql | 69 ++++++++++++++ Open-ILS/tests/naco_normalize.t | 91 +++++++++++++++++++ 6 files changed, 321 insertions(+), 117 deletions(-) create mode 100644 Open-ILS/src/sql/Pg/upgrade/0478.schema.naco_normalize_tweak.sql create mode 100644 Open-ILS/tests/naco_normalize.t diff --git a/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm b/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm index eb1a97f9af..321333c5b8 100644 --- a/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm +++ b/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm @@ -6,6 +6,7 @@ package OpenILS::Application::Storage::FTS; use OpenSRF::Utils::Logger qw/:level/; use Parse::RecDescent; use Unicode::Normalize; +use Encode; my $_default_grammar_parser = new Parse::RecDescent ( <<'GRAMMAR' ); @@ -27,45 +28,66 @@ numeric_range: /\d+-\d*/ GRAMMAR +# FIXME - this is a copy-and-paste of the naco_normalize +# stored procedure sub naco_normalize { - my $txt = lc(shift); + my $str = decode_utf8(shift); my $sf = shift; - $txt = NFD($txt); - $txt =~ s/\pM+//go; # Remove diacritics - - $txt =~ s/\xE6/AE/go; # Convert ae digraph - $txt =~ s/\x{153}/OE/go;# Convert oe digraph - $txt =~ s/\xFE/TH/go; # Convert Icelandic thorn - - $txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers - $txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456889+-/;# Convert subscript numbers - - $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/; # Convert Latin and Greek - $txt =~ tr/\x{2113}\xF0\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LD /; # Convert Misc - $txt =~ tr/\'\[\]\|//d; # Remove Misc - - if ($sf && $sf =~ /^a/o) { - my $commapos = index($txt,','); - if ($commapos > -1) { - if ($commapos != length($txt) - 1) { - my @list = split /,/, $txt; - my $first = shift @list; - $txt = $first . ',' . join(' ', @list); - } else { - $txt =~ s/,/ /go; - } - } - } else { - $txt =~ s/,/ /go; - } - - $txt =~ s/\s+/ /go; # Compress multiple spaces - $txt =~ s/^\s+//o; # Remove leading space - $txt =~ s/\s+$//o; # Remove trailing space - - return $txt; + # Apply NACO normalization to input string; based on + # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf + # + # Note that unlike a strict reading of the NACO normalization rules, + # output is returned as lowercase instead of uppercase for compatibility + # with previous versions of the Evergreen naco_normalize routine. + + # Convert to upper-case first; even though final output will be lowercase, doing this will + # ensure that the German eszett (ß) and certain ligatures (ff, fi, ffl, etc.) will be handled correctly. + # If there are any bugs in Perl's implementation of upcasing, they will be passed through here. + $str = uc $str; + + # remove non-filing strings + $str =~ s/\x{0098}.*?\x{009C}//g; + + $str = NFKD($str); + + # additional substitutions - 3.6. + $str =~ s/\x{00C6}/AE/g; + $str =~ s/\x{00DE}/TH/g; + $str =~ s/\x{0152}/OE/g; + $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}]['/DDOLl/d; + + # transformations based on Unicode category codes + $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g; + + if ($sf && $sf =~ /^a/o) { + my $commapos = index($str, ','); + if ($commapos > -1) { + if ($commapos != length($str) - 1) { + $str =~ s/,/\x07/; # preserve first comma + } + } + } + + # since we've stripped out the control characters, we can now + # use a few as placeholders temporarily + $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/; + $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g; + $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/; + + # decimal digits + $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/; + + # intentionally skipping step 8 of the NACO algorithm; if the string + # gets normalized away, that's fine. + + # leading and trailing spaces + $str =~ s/\s+/ /g; + $str =~ s/^\s+//; + $str =~ s/\s+$//g; + + return lc $str; } #' stupid vim syntax highlighting ... diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql index a8370a0719..6644e6407f 100644 --- a/Open-ILS/src/sql/Pg/002.schema.config.sql +++ b/Open-ILS/src/sql/Pg/002.schema.config.sql @@ -70,7 +70,7 @@ CREATE TABLE config.upgrade_log ( install_date TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() ); -INSERT INTO config.upgrade_log (version) VALUES ('0476'); -- dbs +INSERT INTO config.upgrade_log (version) VALUES ('0478'); -- gmcharlt CREATE TABLE config.bib_source ( id SERIAL PRIMARY KEY, diff --git a/Open-ILS/src/sql/Pg/020.schema.functions.sql b/Open-ILS/src/sql/Pg/020.schema.functions.sql index af184e2dc2..1745dbb147 100644 --- a/Open-ILS/src/sql/Pg/020.schema.functions.sql +++ b/Open-ILS/src/sql/Pg/020.schema.functions.sql @@ -34,56 +34,67 @@ CREATE OR REPLACE FUNCTION public.non_filing_normalize ( TEXT, "char" ) RETURNS $$ LANGUAGE SQL STRICT IMMUTABLE; CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$ - use Unicode::Normalize; - use Encode; - # When working with Unicode data, the first step is to decode it to - # a byte string; after that, lowercasing is safe - my $txt = lc(decode_utf8(shift)); - my $sf = shift; + use strict; + use Unicode::Normalize; + use Encode; + + my $str = decode_utf8(shift); + my $sf = shift; + + # Apply NACO normalization to input string; based on + # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf + # + # Note that unlike a strict reading of the NACO normalization rules, + # output is returned as lowercase instead of uppercase for compatibility + # with previous versions of the Evergreen naco_normalize routine. - $txt = NFD($txt); - $txt =~ s/\pM+//go; # Remove diacritics + # Convert to upper-case first; even though final output will be lowercase, doing this will + # ensure that the German eszett (ß) and certain ligatures (ff, fi, ffl, etc.) will be handled correctly. + # If there are any bugs in Perl's implementation of upcasing, they will be passed through here. + $str = uc $str; - # remove non-combining diacritics - # this list of characters follows the NACO normalization spec, - # but a looser but more comprehensive version might be - # $txt =~ s/\pLm+//go; - $txt =~ tr/\x{02B9}\x{02BA}\x{02BB}\x{02BC}//d; + # remove non-filing strings + $str =~ s/\x{0098}.*?\x{009C}//g; - $txt =~ s/\xE6/AE/go; # Convert ae digraph - $txt =~ s/\x{153}/OE/go;# Convert oe digraph - $txt =~ s/\xFE/TH/go; # Convert Icelandic thorn + $str = NFKD($str); - $txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers - $txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456889+-/;# Convert subscript numbers + # additional substitutions - 3.6. + $str =~ s/\x{00C6}/AE/g; + $str =~ s/\x{00DE}/TH/g; + $str =~ s/\x{0152}/OE/g; + $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}]['/DDOLl/d; - $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/; # Convert Latin and Greek - $txt =~ tr/\x{2113}\xF0\x{111}\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LDD /; # Convert Misc - $txt =~ tr/\'\[\]\|//d; # Remove Misc + # transformations based on Unicode category codes + $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g; if ($sf && $sf =~ /^a/o) { - my $commapos = index($txt,','); + my $commapos = index($str, ','); if ($commapos > -1) { - if ($commapos != length($txt) - 1) { - my @list = split /,/, $txt; - my $first = shift @list; - $txt = $first . ',' . join(' ', @list); - } else { - $txt =~ s/,/ /go; + if ($commapos != length($str) - 1) { + $str =~ s/,/\x07/; # preserve first comma } } - } else { - $txt =~ s/,/ /go; } - $txt =~ s/\s+/ /go; # Compress multiple spaces - $txt =~ s/^\s+//o; # Remove leading space - $txt =~ s/\s+$//o; # Remove trailing space + # since we've stripped out the control characters, we can now + # use a few as placeholders temporarily + $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/; + $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g; + $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/; + + # decimal digits + $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/; + + # intentionally skipping step 8 of the NACO algorithm; if the string + # gets normalized away, that's fine. + + # leading and trailing spaces + $str =~ s/\s+/ /g; + $str =~ s/^\s+//; + $str =~ s/\s+$//g; - # Encoding the outgoing string is good practice, but not strictly - # necessary in this case because we've stripped everything from it - return encode_utf8($txt); + return lc $str; $func$ LANGUAGE 'plperlu' STRICT IMMUTABLE; CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT ) RETURNS TEXT AS $func$ diff --git a/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql b/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql index 1e27959d50..aac25cc909 100644 --- a/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql +++ b/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql @@ -7003,56 +7003,67 @@ $$ LANGUAGE PLPGSQL; DROP TABLE IF EXISTS config.index_normalizer CASCADE; CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$ + + use strict; use Unicode::Normalize; use Encode; - # When working with Unicode data, the first step is to decode it to - # a byte string; after that, lowercasing is safe - my $txt = lc(decode_utf8(shift)); + my $str = decode_utf8(shift); my $sf = shift; - $txt = NFD($txt); - $txt =~ s/\pM+//go; # Remove diacritics - - # remove non-combining diacritics - # this list of characters follows the NACO normalization spec, - # but a looser but more comprehensive version might be - # $txt =~ s/\pLm+//go; - $txt =~ tr/\x{02B9}\x{02BA}\x{02BB}\x{02BC}//d; - - $txt =~ s/\xE6/AE/go; # Convert ae digraph - $txt =~ s/\x{153}/OE/go;# Convert oe digraph - $txt =~ s/\xFE/TH/go; # Convert Icelandic thorn - - $txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers - $txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456889+-/;# Convert subscript numbers - - $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/; # Convert Latin and Greek - $txt =~ tr/\x{2113}\xF0\x{111}\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LDD /; # Convert Misc - $txt =~ tr/\'\[\]\|//d; # Remove Misc - - if ($sf && $sf =~ /^a/o) { - my $commapos = index($txt,','); - if ($commapos > -1) { - if ($commapos != length($txt) - 1) { - my @list = split /,/, $txt; - my $first = shift @list; - $txt = $first . ',' . join(' ', @list); - } else { - $txt =~ s/,/ /go; - } - } - } else { - $txt =~ s/,/ /go; - } - - $txt =~ s/\s+/ /go; # Compress multiple spaces - $txt =~ s/^\s+//o; # Remove leading space - $txt =~ s/\s+$//o; # Remove trailing space - - # Encoding the outgoing string is good practice, but not strictly - # necessary in this case because we've stripped everything from it - return encode_utf8($txt); + # Apply NACO normalization to input string; based on + # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf + # + # Note that unlike a strict reading of the NACO normalization rules, + # output is returned as lowercase instead of uppercase for compatibility + # with previous versions of the Evergreen naco_normalize routine. + + # Convert to upper-case first; even though final output will be lowercase, doing this will + # ensure that the German eszett (ß) and certain ligatures (ff, fi, ffl, etc.) will be handled correctly. + # If there are any bugs in Perl's implementation of upcasing, they will be passed through here. + $str = uc $str; + + # remove non-filing strings + $str =~ s/\x{0098}.*?\x{009C}//g; + + $str = NFKD($str); + + # additional substitutions - 3.6. + $str =~ s/\x{00C6}/AE/g; + $str =~ s/\x{00DE}/TH/g; + $str =~ s/\x{0152}/OE/g; + $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}]['/DDOLl/d; + + # transformations based on Unicode category codes + $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g; + + if ($sf && $sf =~ /^a/o) { + my $commapos = index($str, ','); + if ($commapos > -1) { + if ($commapos != length($str) - 1) { + $str =~ s/,/\x07/; # preserve first comma + } + } + } + + # since we've stripped out the control characters, we can now + # use a few as placeholders temporarily + $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/; + $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g; + $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/; + + # decimal digits + $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/; + + # intentionally skipping step 8 of the NACO algorithm; if the string + # gets normalized away, that's fine. + + # leading and trailing spaces + $str =~ s/\s+/ /g; + $str =~ s/^\s+//; + $str =~ s/\s+$//g; + + return lc $str; $func$ LANGUAGE 'plperlu' STRICT IMMUTABLE; -- Some handy functions, based on existing ones, to provide optional ingest normalization diff --git a/Open-ILS/src/sql/Pg/upgrade/0478.schema.naco_normalize_tweak.sql b/Open-ILS/src/sql/Pg/upgrade/0478.schema.naco_normalize_tweak.sql new file mode 100644 index 0000000000..8c480ae5b9 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/0478.schema.naco_normalize_tweak.sql @@ -0,0 +1,69 @@ +BEGIN; + +INSERT INTO config.upgrade_log (version) VALUES ('0478'); -- gmcharlt + +CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$ + + use strict; + use Unicode::Normalize; + use Encode; + + my $str = decode_utf8(shift); + my $sf = shift; + + # Apply NACO normalization to input string; based on + # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf + # + # Note that unlike a strict reading of the NACO normalization rules, + # output is returned as lowercase instead of uppercase for compatibility + # with previous versions of the Evergreen naco_normalize routine. + + # Convert to upper-case first; even though final output will be lowercase, doing this will + # ensure that the German eszett (ß) and certain ligatures (ff, fi, ffl, etc.) will be handled correctly. + # If there are any bugs in Perl's implementation of upcasing, they will be passed through here. + $str = uc $str; + + # remove non-filing strings + $str =~ s/\x{0098}.*?\x{009C}//g; + + $str = NFKD($str); + + # additional substitutions - 3.6. + $str =~ s/\x{00C6}/AE/g; + $str =~ s/\x{00DE}/TH/g; + $str =~ s/\x{0152}/OE/g; + $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}]['/DDOLl/d; + + # transformations based on Unicode category codes + $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g; + + if ($sf && $sf =~ /^a/o) { + my $commapos = index($str, ','); + if ($commapos > -1) { + if ($commapos != length($str) - 1) { + $str =~ s/,/\x07/; # preserve first comma + } + } + } + + # since we've stripped out the control characters, we can now + # use a few as placeholders temporarily + $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/; + $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g; + $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/; + + # decimal digits + $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/; + + # intentionally skipping step 8 of the NACO algorithm; if the string + # gets normalized away, that's fine. + + # leading and trailing spaces + $str =~ s/\s+/ /g; + $str =~ s/^\s+//; + $str =~ s/\s+$//g; + + return lc $str; +$func$ LANGUAGE 'plperlu' STRICT IMMUTABLE; + +COMMIT; diff --git a/Open-ILS/tests/naco_normalize.t b/Open-ILS/tests/naco_normalize.t new file mode 100644 index 0000000000..34dbe03bdf --- /dev/null +++ b/Open-ILS/tests/naco_normalize.t @@ -0,0 +1,91 @@ +use strict; +use warnings; +use utf8; + +use Test::More tests => 50; +use Unicode::Normalize; +use DBI; + +use OpenILS::Application::Storage::FTS; + +# This could be made better in at least one of two ways (or both); +# 1. put PL/Perl code that doesn't require a database into external +# modules so that test frameworks can get at it more easily +# 2. Build a test harness that knows how to find an Evergreen +# database to use for non-destructive testing. Of course, there +# can be a chicken-and-egg problem here; also, a complete test +# suite would need to be able to do *destructive* testing, from +# which we'd presumably want to protect production databases. + +# Database connection parameters +my $db_driver = 'Pg'; +my $db_host = 'evergreen'; +my $db_port = '5432'; +my $db_name = 'evergreen'; +my $db_user = 'evergreen'; +my $db_pw = 'evergreen'; +my $dsn = "dbi:" . $db_driver . ":dbname=" . $db_name .';host=' . $db_host . ';port=' . $db_port; + +binmode STDOUT, ':utf8'; +binmode STDERR, ':utf8'; + +my @test_cases = ( + [ 'abc', 'abc', 'regular text' ], + [ 'ABC', 'abc', 'regular text' ], + [ 'åbçdéñœöîøæÇıÂÅÍÎÏÔÔÒÚÆŒè', 'abcdenoeoioaeciaaiiiooouaeoee', 'European diacritics' ], + [ '“‘„«quotes»’”', 'quotes', 'special quotes' ], + [ '˜abcœ def', 'def', 'special non-filing characters designation' ], + [ 'œabcdef', 'abcdef', 'unpaired start of string' ], + [ 'ß', 'ss', 'sharp S (eszett)' ], + [ 'flfiff', 'flfiff', 'ligatures' ], + [ 'ƠơƯư²IJij', 'oouu2ijij', 'NFKD applied correctly' ], + [ 'ÆØÞæðøþĐđıŁłŒœʻʼℓ', 'aeothaedothddilloeoel', 'part 3.6' ], + [ 'Ð', 'd', 'uppercase eth (missing from 3.6?)' ], + [ 'ıİ', 'ii', 'Turkish I' ], + [ '[book\'s cover]', 'books cover', 'square brackets and apostrophe' ], + [ ' grue food ', 'grue food', 'trim spaces' ], + # note addition of NFKD() to transform expected output + [ '한국어 조선말', NFKD('한국어 조선말'), 'Korean text' ], + [ '普通話 / 普通话', '普通話 普通话', 'Chinese text' ], + [ 'العربية', 'العربية', 'Arabic text' ], + [ 'ქართული ენა', 'ქართული ენა', 'Georgian text' ], + [ 'русский язык', 'русскии язык', 'Russian text' ], + [ "\r\npa\tper\f", 'paper', 'other whitespace' ], + [ '#1: ∃ C++, @ home & abroad', '#1 c++ @ home & abroad', 'other punctuation' ], + [ '٠١٢٣٤٥', '012345', 'other decimal digits' ], + [ '²³¹', '231', 'superscript numbers' ], + [ '♭©®♯', '♭ ♯', 'other symbols' ], +); + +# test copy of naco_normalize in OpenILS::Application::Storage::FTS +foreach my $case (@test_cases) { + is(OpenILS::Application::Storage::FTS::naco_normalize($case->[0]), $case->[1], $case->[2] . ' (FTS.pm)'); +} +is(OpenILS::Application::Storage::FTS::naco_normalize('Smith, Jane. Poet, painter, and author', 'a'), + 'smith, jane poet painter and author', + 'retain first comma (FTS.pm)'); + +SKIP: { + my $dbh = DBI->connect($dsn, $db_user, $db_pw, {AutoCommit => 1, pg_enable_utf8 => 1, PrintError => 0}); + skip "Failed to connect to database: $DBI::errstr", 25 if (!defined($dbh)); + + # test stored procedure + my $sth1 = $dbh->prepare_cached('SELECT public.naco_normalize(?)'); + my $sth2 = $dbh->prepare_cached('SELECT public.naco_normalize(?, ?)'); + sub naco_normalize_wrapper { + my ($str, $sf) = @_; + if (defined $sf) { + $sth2->execute($str, $sf); + return $sth2->fetchrow_array; + } else { + $sth1->execute($str); + return $sth1->fetchrow_array; + } + } + + foreach my $case (@test_cases) { + is(naco_normalize_wrapper($case->[0]), $case->[1], $case->[2] . ' (stored procedure)'); + } + is(naco_normalize_wrapper('Smith, Jane. Poet, painter, and author', 'a'), 'smith, jane poet painter and author', + 'retain first comma (stored procedure)'); +} -- 2.11.0