From ba026002d2e3b9fe5f63c04c2ca71c9dd1af3cd5 Mon Sep 17 00:00:00 2001 From: gmc Date: Tue, 26 Oct 2010 19:35:38 +0000 Subject: [PATCH] fix NACO normalization of four letter modifier characters Signed-off-by: Galen Charlton git-svn-id: svn://svn.open-ils.org/ILS/trunk@18476 dcc99617-32d9-48b4-a31d-7c20da2025e4 --- [Reviewer notes - commentary only, not part of the commit message. Purpose: adds a tr///d step to public.naco_normalize that deletes U+02B9, U+02BA, U+02BB and U+02BC right after combining marks are stripped, in 020.schema.functions.sql, 1.6.1-2.0-upgrade-db.sql and the new upgrade script 0446, and changes the seeded config.upgrade_log version from 0445 to 0446. NOTE(review) 1: the subscript-digit table is 0123456889+- (U+2087 maps to 8) while the superscript table is 0123456789+-; this looks like a pre-existing typo that 0446 copies verbatim - confirm, and fix in a follow-up that covers every copy of the function. NOTE(review) 2: in 0446 the input is lowercased with lc() but the replacements insert upper-case AE, OE, TH, AABGG and LDD, so the result can be mixed case - confirm this is intended. NOTE(review) 3: the commented-out alternative is spelled \p{Lm} in the hunks because Perl parses \pLm as \pL followed by a literal m. NOTE(review) 4: decode_utf8() yields a character string, not the byte string the comment in 0446 describes.] Open-ILS/src/sql/Pg/002.schema.config.sql | 2 +- Open-ILS/src/sql/Pg/020.schema.functions.sql | 6 +++ Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql | 18 ++++--- .../0446.schema.naco-normalize-modifiers.sql | 58 ++++++++++++++++++++++ 4 files changed, 77 insertions(+), 7 deletions(-) create mode 100644 Open-ILS/src/sql/Pg/upgrade/0446.schema.naco-normalize-modifiers.sql diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql index 9e2a7126c1..5d339fb5fc 100644 --- a/Open-ILS/src/sql/Pg/002.schema.config.sql +++ b/Open-ILS/src/sql/Pg/002.schema.config.sql @@ -70,7 +70,7 @@ CREATE TABLE config.upgrade_log ( install_date TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() ); -INSERT INTO config.upgrade_log (version) VALUES ('0445'); -- miker +INSERT INTO config.upgrade_log (version) VALUES ('0446'); -- gmc CREATE TABLE config.bib_source ( id SERIAL PRIMARY KEY, diff --git a/Open-ILS/src/sql/Pg/020.schema.functions.sql b/Open-ILS/src/sql/Pg/020.schema.functions.sql index 3ca05db530..dc3bf2ebe9 100644 --- a/Open-ILS/src/sql/Pg/020.schema.functions.sql +++ b/Open-ILS/src/sql/Pg/020.schema.functions.sql @@ -45,6 +45,12 @@ CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $ $txt = NFD($txt); $txt =~ s/\pM+//go; # Remove diacritics + # remove non-combining diacritics + # this list of characters follows the NACO normalization spec, + # but a looser but more comprehensive version might be + # $txt =~ s/\p{Lm}+//go; + $txt =~ tr/\x{02B9}\x{02BA}\x{02BB}\x{02BC}//d; + $txt =~ s/\xE6/AE/go; # Convert ae digraph $txt =~ s/\x{153}/OE/go;# Convert oe digraph $txt =~ s/\xFE/TH/go; # Convert Icelandic thorn 
diff --git a/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql b/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql index 7cacfe640a..007ebe97b3 100644 --- a/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql +++ b/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql @@ -6911,7 +6911,13 @@ CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $ my $sf = shift; $txt = NFD($txt); - $txt =~ s/\pM+//go; # Remove diacritics + $txt =~ s/\pM+//go; # Remove diacritics + + # remove non-combining diacritics + # this list of characters follows the NACO normalization spec, + # but a looser but more comprehensive version might be + # $txt =~ s/\p{Lm}+//go; + $txt =~ tr/\x{02B9}\x{02BA}\x{02BB}\x{02BC}//d; $txt =~ s/\xE6/AE/go; # Convert ae digraph $txt =~ s/\x{153}/OE/go;# Convert oe digraph @@ -6920,8 +6926,8 @@ CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $ $txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers $txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456889+-/;# Convert subscript numbers - $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/; # Convert Latin and Greek - $txt =~ tr/\x{2113}\xF0\x{111}\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LDD /; # Convert Misc + $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/; # Convert Latin and Greek + $txt =~ tr/\x{2113}\xF0\x{111}\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LDD /; # Convert Misc $txt =~ tr/\'\[\]\|//d; # Remove Misc if ($sf && $sf =~ /^a/o) { @@ -6939,9 +6945,9 @@ CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $ $txt =~ s/,/ /go; } - $txt =~ s/\s+/ /go; # Compress multiple spaces - $txt =~ s/^\s+//o; # Remove leading space - $txt =~ s/\s+$//o; # Remove trailing space + $txt 
=~ s/\s+/ /go; # Compress multiple spaces + $txt =~ s/^\s+//o; # Remove leading space + $txt =~ s/\s+$//o; # Remove trailing space # Encoding the outgoing string is good practice, but not strictly # necessary in this case because we've stripped everything from it diff --git a/Open-ILS/src/sql/Pg/upgrade/0446.schema.naco-normalize-modifiers.sql b/Open-ILS/src/sql/Pg/upgrade/0446.schema.naco-normalize-modifiers.sql new file mode 100644 index 0000000000..2e137ffddf --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/0446.schema.naco-normalize-modifiers.sql @@ -0,0 +1,58 @@ +BEGIN; + +INSERT INTO config.upgrade_log (version) VALUES ('0446'); -- gmc + +CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$ + use Unicode::Normalize; + use Encode; + + # When working with Unicode data, the first step is to decode it to + # a byte string; after that, lowercasing is safe + my $txt = lc(decode_utf8(shift)); + my $sf = shift; + + $txt = NFD($txt); + $txt =~ s/\pM+//go; # Remove diacritics + + # remove non-combining diacritics + # this list of characters follows the NACO normalization spec, + # but a looser but more comprehensive version might be + # $txt =~ s/\p{Lm}+//go; + $txt =~ tr/\x{02B9}\x{02BA}\x{02BB}\x{02BC}//d; + + $txt =~ s/\xE6/AE/go; # Convert ae digraph + $txt =~ s/\x{153}/OE/go;# Convert oe digraph + $txt =~ s/\xFE/TH/go; # Convert Icelandic thorn + + $txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers + $txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456889+-/;# Convert subscript numbers + + $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/; # Convert Latin and Greek + $txt =~ tr/\x{2113}\xF0\x{111}\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LDD /; # Convert Misc + $txt =~ tr/\'\[\]\|//d; # Remove Misc + + if ($sf && 
$sf =~ /^a/o) { + my $commapos = index($txt,','); + if ($commapos > -1) { + if ($commapos != length($txt) - 1) { + my @list = split /,/, $txt; + my $first = shift @list; + $txt = $first . ',' . join(' ', @list); + } else { + $txt =~ s/,/ /go; + } + } + } else { + $txt =~ s/,/ /go; + } + + $txt =~ s/\s+/ /go; # Compress multiple spaces + $txt =~ s/^\s+//o; # Remove leading space + $txt =~ s/\s+$//o; # Remove trailing space + + # Encoding the outgoing string is good practice, but not strictly + # necessary in this case because we've stripped everything from it + return encode_utf8($txt); +$func$ LANGUAGE 'plperlu' STRICT IMMUTABLE; + +END; -- 2.11.0