From 062bd08b8095fa854c85de42ffe307a2c4a26a36 Mon Sep 17 00:00:00 2001
From: miker
Date: Mon, 10 Mar 2008 00:45:35 +0000
Subject: [PATCH] NACO normalization is handy to have around

git-svn-id: svn://svn.open-ils.org/ILS/trunk@8942 dcc99617-32d9-48b4-a31d-7c20da2025e4
---
 .../perlmods/OpenILS/Application/Storage/FTS.pm | 66 ++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm b/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm
index f8db6ea1ee..6dd03c0382 100644
--- a/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm
+++ b/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm
@@ -5,6 +5,7 @@ my $log = 'OpenSRF::Utils::Logger';
 package OpenILS::Application::Storage::FTS;
 use OpenSRF::Utils::Logger qw/:level/;
 use Parse::RecDescent;
+use Unicode::Normalize;
 
 my $_default_grammar_parser = new Parse::RecDescent ( <<'GRAMMAR' );
 
@@ -26,6 +27,71 @@ numeric_range: /\d+-\d*/
 
 GRAMMAR
 
+# naco_normalize( $text [, $subfield] )
+#
+# Returns a simplified, lower-case form of $text for matching, loosely
+# following the NACO authority normalization rules:
+#   - lower-case the text, decompose it (NFD) and strip combining marks
+#   - expand the ae/oe digraphs and thorn; fold script l, eth and a few
+#     Latin/Greek letters to plain ASCII letters
+#   - turn superscript and subscript digits and signs into ASCII
+#   - turn most punctuation and symbols into spaces; delete apostrophes,
+#     square brackets and vertical bars
+#   - collapse runs of whitespace and trim both ends
+#
+# $subfield is optional. When it starts with "a", the first comma is kept
+# (unless it is the final character) and later commas become spaces; in
+# every other case all commas become spaces.
+sub naco_normalize {
+
+    my $txt = lc(shift);
+    my $sf = shift;
+
+    $txt = NFD($txt);
+    $txt =~ s/\pM+//g; # Remove diacritics
+
+    # $txt is already lower case, so the expansions below are lower case
+    # too; otherwise "\xE6sop" and "aesop" would not normalize alike.
+    $txt =~ s/\xE6/ae/g; # Convert ae digraph
+    $txt =~ s/\x{153}/oe/g; # Convert oe digraph
+    $txt =~ s/\xFE/th/g; # Convert Icelandic thorn
+
+    # Superscript 1-3 are U+00B9, U+00B2 and U+00B3; only 0 and 4-9 live in
+    # the U+2070 block.
+    $txt =~ tr/\x{2070}\xB9\xB2\xB3\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/; # Convert superscript numbers
+    $txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456789+-/; # Convert subscript numbers
+
+    $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/aabgg/; # Convert Latin and Greek
+    # Script l and eth become letters; every other listed character maps to
+    # the final replacement character, a space.
+    $txt =~ tr/\x{2113}\xF0\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/ld /; # Convert Misc
+    $txt =~ tr/\'\[\]\|//d; # Remove Misc
+
+    if ($sf && $sf =~ /^a/) {
+        my $commapos = index($txt, ',');
+        if ($commapos > -1) {
+            if ($commapos != length($txt) - 1) {
+                # Keep the first comma; later ones become spaces.
+                my @list = split /,/, $txt;
+                my $first = shift @list;
+                $txt = $first . ',' . join(' ', @list);
+            } else {
+                $txt =~ s/,/ /g;
+            }
+        }
+    } else {
+        $txt =~ s/,/ /g;
+    }
+
+    $txt =~ s/\s+/ /g; # Compress multiple spaces
+    $txt =~ s/^\s+//; # Remove leading space
+    $txt =~ s/\s+$//; # Remove trailing space
+
+    return $txt;
+}
+
+#' stupid vim syntax highlighting ...
+
 sub compile {
 
     $log->debug("You must override me somewhere, or I will make searching really slow!!!!",ERROR);;
-- 
2.11.0