Test Dan Wells' theory that we don't need NFD at all

author Dan Scott <dscott@laurentian.ca>

Wed, 23 Oct 2013 02:44:12 +0000 (22:44 -0400)

committer Dan Scott <dscott@laurentian.ca>

Wed, 23 Oct 2013 04:10:51 +0000 (00:10 -0400)
author Dan Scott <dscott@laurentian.ca>
Wed, 23 Oct 2013 02:44:12 +0000 (22:44 -0400)
committer Dan Scott <dscott@laurentian.ca>
Wed, 23 Oct 2013 04:10:51 +0000 (00:10 -0400)
diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm b/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm

index 24c85bf..1102d10 100644 (file)
--- a/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm
+++ b/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm
@@ -12,7 +12,7 @@ use Exporter 'import';
  our @EXPORT_OK = qw( clean_marc naco_normalize search_normalize );
  
  sub naco_normalize {
-    my $str = NFD(shift);
+    my $str = shift;
      my $sf = shift;
  
      # Apply NACO normalization to input string; based on
@@ -28,17 +28,17 @@ sub naco_normalize {
      
      $str = _normalize_codes($str, $sf);
  
-    return NFC($str);
+    return $str;
  }
  
  sub search_normalize {
-    my $str = NFD(shift);
+    my $str = shift;
      my $sf = shift;
  
      $str = _normalize_substitutions($str, $sf);
      $str = _normalize_codes($str, $sf);
      
-    return NFC($str);
+    return $str;
  }
  
  sub _normalize_substitutions {
@@ -111,14 +111,14 @@ sub _normalize_codes {
  # Assumes input is already in UTF-8.
  sub clean_marc {
      my $input = shift;
-    my $xml = NFD((UNIVERSAL::isa($input, 'MARC::Record')) ? $input->as_xml_record() : $input);
+    my $xml = (UNIVERSAL::isa($input, 'MARC::Record')) ? $input->as_xml_record() : $input;
      $xml =~ s/\n//sog;
      $xml =~ s/^<\?xml.+\?\s*>//go;
      $xml =~ s/>\s+</></go;
      $xml =~ s/\p{Cc}//go;
      $xml = OpenILS::Application::AppUtils->entityize($xml);
      $xml =~ s/[\x00-\x1f]//go;
-    return NFC($xml);
+    return $xml;
  }
  
  1;
diff --git a/Open-ILS/src/sql/Pg/002.functions.config.sql b/Open-ILS/src/sql/Pg/002.functions.config.sql

index df93590..cd7df41 100644 (file)
--- a/Open-ILS/src/sql/Pg/002.functions.config.sql
+++ b/Open-ILS/src/sql/Pg/002.functions.config.sql
@@ -523,10 +523,6 @@ $xml =~ s/\p{Cc}//go;
  # Embed a version of OpenILS::Application::AppUtils->entityize()
  # to avoid having to set PERL5LIB for PostgreSQL as well
  
-# If we are going to convert non-ASCII characters to XML entities,
-# we had better be dealing with a UTF8 string to begin with
-$xml = NFC(NFD($xml));
-
  # Convert raw ampersands to entities
  $xml =~ s/&(?!\S+;)/&amp;/gso;
  
@@ -652,10 +648,6 @@ if ($create or $munge) {
      # Embed a version of OpenILS::Application::AppUtils->entityize()
      # to avoid having to set PERL5LIB for PostgreSQL as well
  
-    # If we are going to convert non-ASCII characters to XML entities,
-    # we had better be dealing with a UTF8 string to begin with
-    $xml = NFC(NFD($xml));
-
      # Convert raw ampersands to entities
      $xml =~ s/&(?!\S+;)/&amp;/gso;
  
@@ -696,7 +688,7 @@ CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $
      use Unicode::Normalize;
      use Encode;
  
-    my $str = NFD(shift);
+    my $str = shift;
      my $sf = shift;
  
      # Apply NACO normalization to input string; based on
@@ -751,7 +743,7 @@ CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $
      $str =~ s/^\s+//;
      $str =~ s/\s+$//g;
  
-    return lc NFC($str);
+    return lc $str;
  $func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
  
  -- Currently, the only difference from naco_normalize is that search_normalize
@@ -762,7 +754,7 @@ CREATE OR REPLACE FUNCTION public.search_normalize( TEXT, TEXT ) RETURNS TEXT AS
      use Unicode::Normalize;
      use Encode;
  
-    my $str = NFD(shift);
+    my $str = shift;
      my $sf = shift;
  
      # Apply NACO normalization to input string; based on
@@ -817,7 +809,7 @@ CREATE OR REPLACE FUNCTION public.search_normalize( TEXT, TEXT ) RETURNS TEXT AS
      $str =~ s/^\s+//;
      $str =~ s/\s+$//g;
  
-    return lc NFC($str);
+    return lc $str;
  $func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
  
  CREATE OR REPLACE FUNCTION public.naco_normalize_keep_comma( TEXT ) RETURNS TEXT AS $func$
author	Dan Scott <dscott@laurentian.ca>
	Wed, 23 Oct 2013 02:44:12 +0000 (22:44 -0400)
committer	Dan Scott <dscott@laurentian.ca>
	Wed, 23 Oct 2013 04:10:51 +0000 (00:10 -0400)
Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm		patch | blob | history
Open-ILS/src/sql/Pg/002.functions.config.sql		patch | blob | history