From: Swenyu Duan <dsy@sina.com>
Date: Mon, 4 Jun 2012 02:45:21 +0000 (-0400)
Subject: A tested version of the PostgreSQL extension.
X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=9da882b9483a4ff0678a8c3b5570318da59e9fd7;p=working%2FEvergreen.git

A tested version of the PostgreSQL extension.
The extension now contains two functions implemented in C
(naco_normalize and search_normalize). To add the extension to PostgreSQL,
an upgrade SQL script is included at
Open-ILS/src/sql/Pg/upgrage/XXXX.schema.create_extension.sql. This script
drops the plperlu versions of naco_normalize and search_normalize and creates
an extension that adds the C versions of the two functions.
The extension depends on the ICU4C library. The ICU_DATA environment variable
must be set to the correct ICU data path before the extension is used.
The extension has been tested on Debian and Windows (the 50 cases in
/Open-ILS/tests/naco_normalize.t pass). The source file in this extension
is the Debian/Linux version; the Windows version has some
differences.

Signed-off-by: Swenyu Duan <dsy@sina.com>
---

diff --git a/Open-ILS/src/sql/Pg/extensions/makefile b/Open-ILS/src/sql/Pg/extensions/makefile
index 8aa4ad3bf2..e3e0adce46 100644
--- a/Open-ILS/src/sql/Pg/extensions/makefile
+++ b/Open-ILS/src/sql/Pg/extensions/makefile
@@ -1,7 +1,7 @@
 MODULE_big = c_functions
 EXTENSION = c_functions
 SHLIB_LINK = -licutu -licuuc -licuio -licui18n -licule -liculx -licudata
-#PG_CPPFLAGS = -L/usr/lib -licuuc -licuio -licui18n -licule -liculx -licudata
+PG_CPPFLAGS = -fshort-wchar 
 DATA = c_functions--1.0.sql
 OBJS = normalize.functions_in_c.o
 
diff --git a/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c b/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c
index 26e48e0db6..a2128bba63 100755
--- a/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c
+++ b/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c
@@ -1,608 +1,718 @@
-/************************************************************************/
-/*  C Implementation: public.search_normalize public.naco_normalize
- *
- *  Descritption:
- *    This file implement public.search_normalize and public.naco_normalize.
- *  These two functions is included in the PostgreSQL extension c_functions.
- *  ICU4C and postgres lib is needed to build this file.
- *
- *  Author: Swenyu Duan <dsy88@sina.com>, (C) 2012
- *
- *  Copyright: See COPYING file that comes with this distribution.
- */
-/************************************************************************/
-#include "postgres.h"
-#include "string.h"
-#include "fmgr.h"
-#include "unicode/unorm2.h"
-#include "unicode/utypes.h"
-#include "unicode/ustring.h"
-#include "unicode/uregex.h"
-#include "unicode/umachine.h"
-
-#ifdef PG_MODULE_MAGIC
-PG_MODULE_MAGIC;
-#endif
-
-static int32_t regexp_transliterate(const UChar *search_list,
-                                    int32_t search_list_len,
-                                    const UChar *replacement_list,
-                                    int32_t replacement_list_len,
-                                    UChar *src,
-                                    int32_t src_len,
-                                    UChar *des,
-                                    int32_t des_capacity)
-{
-    int i, j;
-    int32_t des_len;
-    UChar *cur_pos;
-
-    if (search_list == NULL || 
-        replacement_list == NULL ||
-        des == NULL ||
-        src_len > des_capacity + 1)
-    {
-        return 0;
-    }
-
-    des_len = 0;
-
-    for (i = 0; i < src_len; i++)
-    {
-        if (des != NULL)
-        {
-            des[des_len] = src[i];
-        }
-        des_len++;
-
-        for (j = replacement_list_len; j < search_list_len; j++)
-        {
-            if (search_list[j] == src[i])
-            {
-                des_len--;
-                break;
-            }
-        }
-    }
-    if (des == NULL)
-    {
-        //To store the tail '\\0'.
-        return des_len + 1;
-    }
-
-    des[des_len] = '\0';
-    
-    for (i = 0; i < replacement_list_len; i++)
-    {
-        cur_pos = u_strchr(des, search_list[i]);
-
-        while (cur_pos != NULL)
-        {
-            *cur_pos = replacement_list[i];
-
-            //In case cur_pos is the last char in des.
-            if (cur_pos >= des + des_len)
-            {
-                break;
-            }
-            cur_pos = u_strchr(cur_pos + 1, search_list[i]);
-        }
-    }
-
-    return des_len;
-}
-
-static int32_t regexp_replace(const UChar *regexp,
-                              int32_t regexp_len,
-                              const UChar *replacement,
-                              int32_t replacement_len,
-                              UChar *src,
-                              int32_t src_len,
-                              UChar *des,
-                              int32_t des_capacity,
-                              int is_global)
-{
-    URegularExpression *regular_exp;
-    UErrorCode status;
-    UParseError pe;
-    int32_t len;
-
-    if (regexp == NULL || replacement == NULL || src == NULL)
-    {
-        return 0;
-    }
-
-    regular_exp = uregex_open(regexp, regexp_len, 0, &pe, &status);
-    if (regular_exp == NULL)
-    {
-        return 0;
-    }
-
-    uregex_setText(regular_exp, src, src_len, &status);
-
-    if (is_global > 0)
-    {
-        len = uregex_replaceAll(regular_exp,
-                                replacement,
-                                replacement_len,
-                                NULL,
-                                0,
-                                &status);
-        if (des == NULL || des_capacity < len)
-        {
-            uregex_close(regular_exp);
-            return len;
-        }
-
-        uregex_replaceAll(regular_exp,
-                          replacement,
-                          replacement_len,
-                          des,
-                          des_capacity,
-                          &status);
-    }
-    else
-    {
-        len = uregex_replaceFirst(regular_exp,
-                                  replacement,
-                                  replacement_len,
-                                  NULL,
-                                  0,
-                                  &status);
-        if (des == NULL || des_capacity < len)
-        {
-            uregex_close(regular_exp);
-            return len;
-        }
-
-        uregex_replaceFirst(regular_exp,
-                            replacement,
-                            replacement_len,
-                            des,
-                            des_capacity,
-                            &status);
-    }
-    
-    uregex_close(regular_exp);
-    return len;
-}
-
-static UChar *u_strtransliterate(UChar *search_list,
-                                 UChar *replacement_list,
-                                 UChar *str,
-                                 int32_t str_capacity)
-{
-    int32_t search_list_len, replacement_list_len, str_len;
-    UChar *des;
-    int32_t des_len;
-
-    if (search_list == NULL || replacement_list == NULL || str == NULL)
-    {
-        return NULL;
-    }
-
-    search_list_len = u_strlen(search_list);
-    replacement_list_len = u_strlen(replacement_list);
-    str_len = u_strlen(str);
-
-    des_len = regexp_transliterate(search_list, 
-                                   search_list_len,
-                                   replacement_list,
-                                   replacement_list_len,
-                                   str,
-                                   str_len,
-                                   NULL,
-                                   0);
-    des = palloc(des_len * sizeof(UChar));
-    des_len = regexp_transliterate(search_list, 
-                                    search_list_len,
-                                    replacement_list,
-                                    replacement_list_len,
-                                    str,
-                                    str_len,
-                                    des,
-                                    des_len);
-
-    pfree(str);
-    return des;
-}
-
-static UChar *u_strreplace(UChar *regexp,
-                           UChar *replacement,
-                           UChar *str,
-                           int32_t str_capacity,
-                           int is_global)
-{
-    int32_t regexp_len, replacement_len, str_len;
-    UChar *des;
-    int32_t des_len;
-
-    if (regexp == NULL || replacement == NULL || str == NULL)
-    {
-        return NULL;
-    }
-
-    regexp_len = u_strlen(regexp);
-    replacement_len = u_strlen(replacement);
-    str_len = u_strlen(str);
-
-    des_len = regexp_replace(regexp,
-                             regexp_len,
-                             replacement,
-                             replacement_len,
-                             str,
-                             str_len,
-                             NULL,
-                             0,
-                             is_global);
-
-    des = palloc(des_len * sizeof(UChar));
-
-    des_len = regexp_replace(regexp,
-                            regexp_len,
-                            replacement,
-                            replacement_len,
-                            str,
-                            str_len,
-                            des,
-                            des_len,
-                            is_global);
-    pfree(str);
-    return des;
-}
-
-UChar *additional_substitutions(UChar *nustr, int is_search)
-{
-    char *regexp;
-    UChar uregexp[200], replacement[200];
-
-    if (nustr == NULL)
-    {
-        return NULL;
-    }
-
-    regexp = "\\x{00C6}";
-    u_uastrncpy(uregexp, regexp, strlen(regexp));
-    u_uastrncpy(replacement, "AE", strlen("AE"));
-    
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
-
-    regexp = "\\x{00DE}";
-    u_uastrncpy(uregexp, regexp, strlen(regexp));
-    u_uastrncpy(replacement, "TH", strlen("TH"));
-    
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
-
-    regexp = "\\x{0152}";
-    u_uastrncpy(uregexp, regexp, strlen(regexp));
-    u_uastrncpy(replacement, "OE", strlen("OE"));
-
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
-
-    if (is_search)
-    {
-        regexp = "\\x{0110}\\x{00D0}\\x{00D8}\\x{0141}\\x{2113}\\x{02BB}\\x{02BC}][";
-    }
-    else
-    {
-        regexp = "\\x{0110}\\x{00D0}\\x{00D8}\\x{0141}\\x{2113}\\x{02BB}\\x{02BC}]['";
-    }
-    
-    u_uastrncpy(uregexp, regexp, strlen(regexp));
-    u_uastrncpy(replacement, "OE", strlen("OE"));
-    
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
-
-    regexp = "\\x{0110}\\x{00D0}\\x{00D8}\\x{0141}\\x{2113}\\x{02BB}\\x{02BC}]['";
-    u_uastrncpy(uregexp, regexp, strlen(regexp));
-    u_uastrncpy(replacement, "DDOLl", strlen("DDOLl"));
-    
-    nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr));
-
-    return nustr;
-}
-
-UChar *transformations_on_unicode(UChar *nustr, UChar *usf)
-{
-    char *regexp;
-    UChar uregexp[200], replacement[200];
-    UChar *comma;
-    int32_t nustr_len;
-
-    if (nustr == NULL)
-    {
-        return NULL;
-    }
-
-    nustr_len = u_strlen(nustr);
-
-    regexp = "[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cs}\\p{Lm}\\p{Mc}\\p{Me}\\p{Mn}]";
-    u_uastrncpy(uregexp, regexp, strlen(regexp));
-    u_uastrncpy(replacement, "", strlen(""));
-    
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
-
-    if (usf != NULL && usf[0] == 0x61) //0x61 == 'a' in utf16
-    {
-        comma = u_strchr(nustr, 0x2c); //0x2c == ',' in utf16
-        if (comma != NULL)
-        {
-            if (comma != nustr + nustr_len - 1)
-            {
-                regexp = ",";
-                u_uastrncpy(uregexp, regexp, strlen(regexp));
-                replacement[0] = 0x7;
-                replacement[1] = 0;
-
-                nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 0);
-            }
-        }
-    }
-
-    return nustr;
-}
-
-UChar *replace_placehoders(UChar *nustr)
-{
-    UChar uregexp[200], replacement[200];
-
-    if (nustr == NULL)
-    {
-        return NULL;
-    }
-
-    u_uastrncpy(uregexp,
-                "+&@\\x{266D}\\x{266F}#",
-                strlen( "+&@\\x{266D}\\x{266F}#"));
-    u_uastrncpy(replacement,
-                "\\x01\\x02\\x03\\x04\\x05\\x06",
-                strlen("\\x01\\x02\\x03\\x04\\x05\\x06"));
-
-    nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr));
-
-    u_uastrncpy(uregexp,
-                "[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\p{Pi}\\p{Po}\\p{Ps}\\p{Sk}\\p{Sm}\\p{So}\\p{Zl}\\p{Zp}\\p{Zs}]",
-               strlen( "[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\\\p{Pi}\\p{Po}\\p{Ps}\\p{Sk}\\p{Sm}\\p{So}\\p{Zl}\\p{Zp}\\p{Zs}]"));
-    u_uastrncpy(replacement, "", strlen(""));
-
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
-
-    u_uastrncpy(uregexp,
-                "\\x01\\x02\\x03\\x04\\x05\\x06\\x07",
-                strlen( "\\x01\\x02\\x03\\x04\\x05\\x06\\x07"));
-    u_uastrncpy(replacement,
-                "+&@\\x{266D}\\x{266F}#,",
-                strlen("+&@\\x{266D}\\x{266F}#,"));
-
-    nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr));
-
-
-    return nustr;
-}
-
-UChar *decimal_digits(UChar *nustr)
-{
-    UChar uregexp[300], replacement[300];
-
-    if (nustr == NULL)
-    {
-        return NULL;
-    }
-    
-    u_uastrncpy(uregexp,
-                "\\x{0660}-\\x{0669}\\x{06F0}-\\x{06F9}\\x{07C0}-\\x{07C9}\\x{0966}-\\x{096F}"
-                "\\x{09E6}-\\x{09EF}\\x{0A66}-\\x{0A6F}\\x{0AE6}-\\x{0AEF}\\x{0B66}-\\x{0B6F}"
-                "\\x{0BE6}-\\x{0BEF}\\x{0C66}-\\x{0C6F}\\x{0CE6}-\\x{0CEF}\\x{0D66}-\\x{0D6F}"
-                "\\x{0E50}-\\x{0E59}\\x{0ED0}-\\x{0ED9}\\x{0F20}-\\x{0F29}\\x{1040}-\\x{1049}"
-                "\\x{1090}-\\x{1099}\\x{17E0}-\\x{17E9}\\x{1810}-\\x{1819}\\x{1946}-\\x{194F}"
-                "\\x{19D0}-\\x{19D9}\\x{1A80}-\\x{1A89}\\x{1A90}-\\x{1A99}\\x{1B50}-\\x{1B59}"
-                "\\x{1BB0}-\\x{1BB9}\\x{1C40}-\\x{1C49}\\x{1C50}-\\x{1C59}\\x{A620}-\\x{A629}"
-                "\\x{A8D0}-\\x{A8D9}\\x{A900}-\\x{A909}\\x{A9D0}-\\x{A9D9}\\x{AA50}-\\x{AA59}"
-                "\\x{ABF0}-\\x{ABF9}\\x{FF10}-\\x{FF19}",
-                strlen("\\x{0660}-\\x{0669}\\x{06F0}-\\x{06F9}\\x{07C0}-\\x{07C9}\\x{0966}-\\x{096F}"
-                    "\\x{09E6}-\\x{09EF}\\x{0A66}-\\x{0A6F}\\x{0AE6}-\\x{0AEF}\\x{0B66}-\\x{0B6F}"
-                    "\\x{0BE6}-\\x{0BEF}\\x{0C66}-\\x{0C6F}\\x{0CE6}-\\x{0CEF}\\x{0D66}-\\x{0D6F}"
-                    "\\x{0E50}-\\x{0E59}\\x{0ED0}-\\x{0ED9}\\x{0F20}-\\x{0F29}\\x{1040}-\\x{1049}"
-                    "\\x{1090}-\\x{1099}\\x{17E0}-\\x{17E9}\\x{1810}-\\x{1819}\\x{1946}-\\x{194F}"
-                    "\\x{19D0}-\\x{19D9}\\x{1A80}-\\x{1A89}\\x{1A90}-\\x{1A99}\\x{1B50}-\\x{1B59}"
-                    "\\x{1BB0}-\\x{1BB9}\\x{1C40}-\\x{1C49}\\x{1C50}-\\x{1C59}\\x{A620}-\\x{A629}"
-                    "\\x{A8D0}-\\x{A8D9}\\x{A900}-\\x{A909}\\x{A9D0}-\\x{A9D9}\\x{AA50}-\\x{AA59}"
-                    "\\x{ABF0}-\\x{ABF9}\\x{FF10}-\\x{FF19}"));
-
-    u_uastrncpy(replacement,
-                "0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9",
-                strlen("0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9"));
-
-    nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr));
-
-    return nustr;
-}
-
-UChar *leading_trailing_spaces(UChar * nustr)
-{
-    UChar uregexp[200], replacement[200];
-
-    if (nustr == NULL)
-    {
-        return NULL;
-    }
-
-    u_uastrncpy(uregexp, "\\s+",	strlen( "\\s+"));
-    u_uastrncpy(replacement, " ", strlen(" "));
-
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
-    
-    u_uastrncpy(uregexp, "^\\s+",	strlen( "^\\s+"));
-    u_uastrncpy(replacement, "", strlen(""));
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 0);
-
-    u_uastrncpy(uregexp, "\\s+$",	strlen( "\\s+$"));
-
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
-
-    return nustr;
-
-}
-
-text *normalize(text *str, text *sf, int is_search)
-{
-    UChar *ustr, *nustr, *temp, *usf;
-    int32_t nustr_len, temp_len, str_len;
-    UNormalizer2 *normalizer;
-    char *regexp, *result;
-    UChar uregexp[200], replacement[200];
-    UErrorCode err;
-
-    if (str == NULL || sf == NULL)
-    {
-        return NULL;
-    }
-
-    normalizer = (UNormalizer2 *)unorm2_getNFKDInstance(&err);
-    if (U_FAILURE(err))
-    {
-        return NULL;
-    }
-
-    ustr = palloc(VARSIZE(str) * sizeof(UChar));
-    nustr = palloc(VARSIZE(str)* sizeof(UChar));
-    temp = palloc(VARSIZE(str) * sizeof(UChar));
-    usf = palloc(VARSIZE(sf) * sizeof(UChar));
-
-    temp = u_strFromUTF8(temp, VARSIZE(str), NULL, VARDATA(str), VARSIZE(str), &err);
-    if (U_FAILURE(err) || temp == NULL)
-    {
-        str = NULL;
-        goto Fail;
-    }
-
-    nustr = u_strncpy(nustr, temp, VARSIZE(str));
-    if (nustr == NULL)
-    {
-        str = NULL;
-        goto Fail;
-    }
-
-    usf = u_strFromUTF8(usf, VARSIZE(sf), NULL, VARDATA(sf), VARSIZE(sf), &err);
-    if (usf == NULL)
-    {
-        str = NULL;
-        goto Fail;
-    }
-    //Apply NACO normalization to input string; based on
-    //http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf
-    //
-    //Note that unlike a strict reading of the NACO normalization rules,
-    //output is returned as lowercase instead of uppercase for compatibility
-    //with previous versions of the Evergreen naco_normalize routine.
-    //
-    //Convert to upper-case first; even though final output will be lowercase, doing this will
-    //ensure that the German eszett (?) and certain ligatures (?, ?, ?, etc.) will be handled correctly.
-    //If there are any bugs in Perl's implementation of upcasing, they will be passed through here.
-
-    u_strToUpper(ustr, 
-                VARSIZE(str),
-                temp,
-                VARSIZE(str),
-                NULL,
-                &err);
-    pfree(temp);
-    temp = NULL;
-    if (U_FAILURE(err))
-    {
-        str = NULL;
-        goto Fail;
-    }
-
-    regexp = "\\x{0098}.*?\\x{009C}";
-    u_uastrncpy(uregexp, regexp, strlen(regexp));
-    u_uastrncpy(replacement, "", strlen(""));
-
-    ustr = u_strreplace(uregexp, replacement, ustr, u_strlen(ustr), 1);
-
-    unorm2_normalize(normalizer, ustr, VARSIZE(str), nustr, VARSIZE(str), &err);
-    if (U_FAILURE(err))
-    {
-        str = NULL;
-        goto Fail;
-    }
-
-    //additional substitutions - 3.6.
-    nustr = additional_substitutions(nustr, is_search);
-
-    //transformations based on Unicode category codes
-    nustr = transformations_on_unicode(nustr, usf);
-
-    //since we've stripped out the control characters, we can now
-    //use a few as placeholders temporarily
-    nustr = replace_placehoders(nustr);
-
-    //decimal digit
-    nustr = decimal_digits(nustr);
-
-    //intentionally skipping step 8 of the NACO algorithm; if the string
-    //gets normalized away, that's fine.
-
-    //leading and trailing spaces
-    nustr = leading_trailing_spaces(nustr);
-    nustr_len = u_strlen(nustr);
-
-    temp = palloc(nustr_len * sizeof(UChar));
-    u_strToLower(temp, nustr_len, nustr, nustr_len, NULL, &err);
-    temp_len = nustr_len;
-
-
-    u_strToUTF8(NULL, 0, &str_len, temp, temp_len, &err);
-
-    SET_VARSIZE(str, str_len + VARHDRSZ);
-    str = (text *)palloc(str_len + VARHDRSZ);
-    
-    result = u_strToUTF8(VARDATA(str), str_len, &str_len, temp, temp_len, &err); 
-
-    result[str_len] = '\0';
-
-    
-Fail:
-    pfree(temp);
-    pfree(ustr);
-    pfree(nustr);
-
-    return str;
-}
-
-PG_FUNCTION_INFO_V1(naco_normalize);
-
-Datum naco_normalize(PG_FUNCTION_ARGS)
-{
-    text *str = PG_GETARG_TEXT_P(0);
-    text *sf = PG_GETARG_TEXT_P(1);
-    
-
-    if (str == NULL || sf == NULL)
-    {
-        PG_RETURN_TEXT_P(NULL);
-    }
-
-    str = normalize(str, sf, 0);
-    
-    PG_RETURN_TEXT_P(str);
-}
-
-PG_FUNCTION_INFO_V1(search_normalize);
-
-Datum search_normalize(PG_FUNCTION_ARGS)
-{
-    text *str = PG_GETARG_TEXT_P(0);
-    text *sf = PG_GETARG_TEXT_P(1);
-
-
-    if (str == NULL || sf == NULL)
-    {
-        PG_RETURN_TEXT_P(NULL);
-    }
-
-    str = normalize(str, sf, 1);
-
-    PG_RETURN_TEXT_P(str);
-}
-
+/************************************************************************/
+/*  C Implementation: public.search_normalize public.naco_normalize
+ *
+ *  Descritption:
+ *    This file implement public.search_normalize and public.naco_normalize.
+ *  These two functions is included in the PostgreSQL extension c_functions.
+ *  ICU4C and postgres lib is needed to build this file.
+ *
+ *  Author: Swenyu Duan <dsy88@sina.com>, (C) 2012
+ *
+ *  Copyright: See COPYING file that comes with this distribution.
+ */
+/************************************************************************/
+#include "postgres.h"
+#include "fmgr.h"
+#include "unicode/unorm2.h"
+#include "unicode/utypes.h"
+#include "unicode/ustring.h"
+#include "unicode/uregex.h"
+#include "unicode/umachine.h"
+
+#ifdef PG_MODULE_MAGIC
+PG_MODULE_MAGIC;
+#endif
+
+/*
+ * Expand tr-style ranges ("a-c" -> "abc") from src into des.
+ *
+ * src is scanned up to src_len units or the first NUL, whichever comes
+ * first. A '-' with a character on each side denotes an inclusive range;
+ * a '-' that starts or ends the pattern is copied as a literal. If the
+ * range end is not greater than its start, only the end is emitted.
+ *
+ * When des is NULL nothing is written and only the expanded length is
+ * counted; units beyond des_capacity are counted but not stored. The
+ * output is not NUL-terminated.
+ * Returns the expanded length in UChars (0 when src is NULL).
+ */
+static int32_t regexp_expand_string(const UChar *src,
+                                    int32_t src_len,
+                                    UChar *des,
+                                    int32_t des_capacity)
+{
+    int32_t des_len = 0;
+    int32_t i = 0;
+    UChar c;
+
+    if (src == NULL)
+    {
+        return 0;
+    }
+
+    while (i < src_len && src[i] != 0)
+    {
+        c = src[i];
+        /* 0x2D is '-'; a range needs a non-NUL end character after it. */
+        if (i + 2 < src_len && src[i + 1] == 0x2D && src[i + 2] != 0)
+        {
+            /* Emit start .. end-1 here; the end itself is emitted below. */
+            for (; c < src[i + 2]; c++)
+            {
+                if (des != NULL && des_len < des_capacity)
+                {
+                    des[des_len] = c;
+                }
+                des_len++;
+            }
+            c = src[i + 2];
+            i += 2;
+        }
+
+        if (des != NULL && des_len < des_capacity)
+        {
+            des[des_len] = c;
+        }
+        des_len++;
+        i++;
+    }
+    return des_len;
+}
+
+/*
+ * Perl-tr style transliteration of src[0..src_len) into des.
+ *
+ * A source character found at index k of search_list is replaced by
+ * replacement_list[k] when k < replacement_list_len and is deleted when
+ * k >= replacement_list_len; characters not in search_list are copied
+ * unchanged. Every character is mapped at most once (no chained mapping),
+ * and the first occurrence in search_list wins.
+ *
+ * When des is NULL nothing is written and only the required length is
+ * counted (preflight). Units beyond des_capacity are counted but not
+ * stored. The output is NOT NUL-terminated; the caller adds the terminator.
+ *
+ * Returns the number of UChars the result needs, or 0 if search_list,
+ * replacement_list or src is NULL.
+ */
+static int32_t regexp_transliterate(const UChar *search_list,
+									int32_t search_list_len,
+									const UChar *replacement_list,
+									int32_t replacement_list_len,
+									UChar *src,
+									int32_t src_len,
+									UChar *des,
+									int32_t des_capacity)
+{
+	int32_t i, j;
+	int32_t des_len;
+	UChar out;
+
+	if (search_list == NULL ||
+		replacement_list == NULL ||
+		src == NULL)
+	{
+		return 0;
+	}
+
+	des_len = 0;
+
+	for (i = 0; i < src_len; i++)
+	{
+		/* Bounded lookup: do not rely on search_list being NUL-terminated. */
+		for (j = 0; j < search_list_len; j++)
+		{
+			if (search_list[j] == src[i])
+				break;
+		}
+
+		/* Listed past the end of replacement_list: delete the character. */
+		if (j < search_list_len && j >= replacement_list_len)
+			continue;
+
+		out = (j < search_list_len) ? replacement_list[j] : src[i];
+
+		if (des != NULL && des_len < des_capacity)
+			des[des_len] = out;
+		des_len++;
+	}
+
+	return des_len;
+
+}
+
+static int32_t regexp_replace(const UChar *regexp,
+							  int32_t regexp_len,
+							  const UChar *replacement,
+							  int32_t replacement_len,
+							  UChar *src,
+							  int32_t src_len,
+							  UChar *des,
+							  int32_t des_capacity,
+							  int is_global)
+{
+	URegularExpression *regular_exp;
+	UErrorCode status;
+	UParseError pe;
+	int32_t len;
+
+	if (regexp == NULL || replacement == NULL || src == NULL)
+	{
+		return 0;
+	}
+
+	status = 0;
+
+	
+	regular_exp = uregex_open(regexp, regexp_len, 0, &pe, &status);
+	if (regular_exp == NULL)
+	{
+		return 0;
+	}
+
+	status = 0;
+	uregex_setText(regular_exp, src, src_len, &status);
+	
+	status = 0;
+	if (is_global > 0)
+	{
+		len = uregex_replaceAll(regular_exp,
+								replacement,
+								replacement_len,
+								NULL,
+								0,
+								&status);
+		if (des == NULL || des_capacity < len)
+		{
+			uregex_close(regular_exp);
+			return len;
+		}
+		status = 0;
+
+		uregex_replaceAll(regular_exp,
+						  replacement,
+						  replacement_len,
+						  des,
+						  des_capacity,
+						  &status);
+	}
+	else
+	{
+		len = uregex_replaceFirst(regular_exp,
+								  replacement,
+								  replacement_len,
+								  NULL,
+								  0,
+								  &status);
+		if (des == NULL || des_capacity < len)
+		{
+			uregex_close(regular_exp);
+			return len;
+		}
+
+		status = 0;
+		uregex_replaceFirst(regular_exp,
+							replacement,
+							replacement_len,
+							des,
+							des_capacity,
+							&status);
+	}
+	
+	uregex_close(regular_exp);
+	return len;
+}
+
+static UChar *u_strtransliterate(UChar *search_list,
+								 UChar *replacement_list,
+								 UChar *str,
+								 int32_t str_capacity)
+{
+	int32_t search_list_len, replacement_list_len, str_len;
+	UChar *des;
+	int32_t des_len;
+
+	if (search_list == NULL || replacement_list == NULL || str == NULL)
+	{
+		return NULL;
+	}
+
+	search_list_len = u_strlen(search_list);
+	replacement_list_len = u_strlen(replacement_list);
+	str_len = u_strlen(str);
+
+	des_len = regexp_transliterate(search_list, 
+								   search_list_len,
+								   replacement_list,
+								   replacement_list_len,
+								   str,
+								   str_len,
+								   NULL,
+								   0);
+	des = palloc((des_len + 1) * sizeof(UChar));
+	des_len = regexp_transliterate(search_list, 
+									search_list_len,
+									replacement_list,
+									replacement_list_len,
+									str,
+									str_len,
+									des,
+									des_len);
+	if(des != NULL)
+		des[des_len] = '\0';
+
+	pfree(str);
+	return des;
+}
+
+static UChar *u_strreplace(UChar *regexp,
+	                       UChar *replacement,
+						   UChar *str,
+						   int32_t str_capacity,
+						   int is_global)
+{
+	int32_t regexp_len, replacement_len, str_len;
+	UChar *des;
+	int32_t des_len;
+
+	if (regexp == NULL || replacement == NULL || str == NULL)
+	{
+		return NULL;
+	}
+
+	regexp_len = u_strlen(regexp);
+	replacement_len = u_strlen(replacement);
+	str_len = u_strlen(str);
+
+	des_len = regexp_replace(regexp,
+							 regexp_len,
+							 replacement,
+							 replacement_len,
+							 str,
+							 str_len,
+							 NULL,
+							 0,
+							 is_global);
+
+	des = palloc((des_len + 1)* sizeof(UChar));
+
+	des_len = regexp_replace(regexp,
+							regexp_len,
+							replacement,
+							replacement_len,
+							str,
+							str_len,
+							des,
+							des_len,
+							is_global);
+	if(des != NULL)
+		des[des_len] = '\0';
+
+	pfree(str);
+	return des;
+}
+
+static UChar *additional_substitutions(UChar *nustr, int is_search)
+{
+	UChar uregexp[200], replacement[200];
+
+	if (nustr == NULL)
+	{
+		return NULL;
+	}
+
+	u_uastrncpy(uregexp, "\\x{00C6}", sizeof("\\x{00C6}"));
+	u_uastrncpy(replacement, "AE", sizeof("AE"));
+	
+	nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
+
+	u_uastrncpy(uregexp, "\\x{00DE}", sizeof("\\x{00DE}"));
+	u_uastrncpy(replacement, "TH", sizeof("TH"));
+	
+	nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
+
+	u_uastrncpy(uregexp, "\\x{0152}", sizeof("\\x{0152}"));
+	u_uastrncpy(replacement, "OE", sizeof("OE"));
+
+	nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
+
+	if (is_search)
+	{
+		u_strncpy(uregexp, 
+					L"\x0110\x00D0\x00D8\x0141\x2113\x02BB\x02BC][",
+					sizeof(L"\x0110\x00D0\x00D8\x0141\x2113\x02BB"
+							L"\x02BC][") / sizeof(UChar));
+
+	}
+	else
+	{
+		u_strncpy(uregexp, 
+				L"\x0110\x00D0\x00D8\x0141\x2113\x02BB\x02BC]['",
+				sizeof(L"\x0110\x00D0\x00D8\x0141\x2113"
+						L"\x02BB\x02BC]['") / sizeof(UChar));
+
+	}
+	u_uastrncpy(replacement, "DDOLl", sizeof("DDOLl"));
+	
+	nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr));
+
+	return nustr;
+}
+
+/*
+ * Strip characters in Unicode categories Cc, Cf, Co, Cs, Lm, Mc, Me and Mn
+ * from nustr. If usf starts with 'a' and the first comma of the stripped
+ * string is not its last character, that comma becomes U+0007 (placeholder).
+ * nustr is consumed by u_strreplace; the returned buffer replaces it.
+ * Returns NULL when nustr is NULL.
+ */
+static UChar *transformations_on_unicode(UChar *nustr, UChar *usf)
+{
+	UChar uregexp[200], replacement[200];
+	UChar *comma;
+
+	if (nustr == NULL)
+	{
+		return NULL;
+	}
+
+	u_uastrncpy(uregexp,
+				"[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cs}\\p{Lm}\\p{Mc}\\p{Me}\\p{Mn}]",
+				sizeof("[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cs}\\p{Lm}\\p{Mc}\\p{Me}\\p{Mn}]"));
+	u_uastrncpy(replacement, "", sizeof(""));
+	nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
+
+	if (nustr != NULL && usf != NULL && usf[0] == 0x61) //0x61 == 'a' in utf16
+	{
+		comma = u_strchr(nustr, 0x2c); //0x2c == ',' in utf16
+		/* comma[1] is read from the stripped string, so a comma that became
+		 * last only after stripping is correctly left alone. */
+		if (comma != NULL && comma[1] != 0)
+		{
+			u_uastrncpy(uregexp, ",", sizeof(","));
+			replacement[0] = 0x7;
+			replacement[1] = 0;
+			nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 0);
+		}
+	}
+	return nustr;
+}
+
+static UChar *replace_placehoders(UChar *nustr)
+{
+	UChar uregexp[200], replacement[200];
+
+	if (nustr == NULL)
+	{
+		return NULL;
+	}
+
+	u_strncpy(uregexp,
+				L"+&@\x266D\x266F#",
+				sizeof(L"+&@\x266D\x266F#") / sizeof(UChar));
+	u_strncpy(replacement,
+				L"\x01\x02\x03\x04\x05\x06",
+				sizeof(L"\x01\x02\x03\x04\x05\x06") / sizeof(UChar));
+
+	nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr));
+
+	u_uastrncpy(uregexp,
+				"[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\p{Pi}\\p{Po}\\p{Ps}\\p{Sk}\\p{Sm}\\p{So}\\p{Zl}\\p{Zp}\\p{Zs}]",
+			   sizeof("[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\p{Pi}\\p{Po}\\p{Ps}\\p{Sk}\\p{Sm}\\p{So}\\p{Zl}\\p{Zp}\\p{Zs}]"));
+	u_uastrncpy(replacement, " ", sizeof(" "));
+
+	nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
+
+	u_strncpy(uregexp,
+		        L"\x01\x02\x03\x04\x05\x06\x07",
+				sizeof(L"\x01\x02\x03\x04\x05\x06\x07") / sizeof(UChar));
+	u_strncpy(replacement,
+				L"+&@\x266D\x266F#,",
+				sizeof(L"+&@\x266D\x266F#,") / sizeof(UChar));
+
+	nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr));
+
+
+	return nustr;
+}
+
+
+/*
+ * decimal_digits - map decimal digits of other scripts onto ASCII '0'..'9'.
+ *
+ * Two parallel range lists are built: one listing ten-code-point digit blocks
+ * (U+0660-U+0669, U+06F0-U+06F9, ... U+FF10-U+FF19) and one repeating "0-9"
+ * once per block.  Both are run through regexp_expand_string() -- called
+ * first with a NULL destination to obtain the required length, then again to
+ * fill a palloc'd buffer -- and the two results are handed to
+ * u_strtransliterate() as the source and target character sets.
+ *
+ * The lengths passed to regexp_expand_string() are sizeof-based and therefore
+ * count the literal's terminator as well.  The wide literals are used as
+ * UChar strings, which relies on the -fshort-wchar build flag.
+ *
+ * Returns the pointer handed back by u_strtransliterate(), or NULL when
+ * nustr is NULL.
+ */
+static UChar *decimal_digits(UChar *nustr)
+{
+    UChar *digit_ranges, *ascii_ranges;
+    UChar *digit_set, *ascii_set;
+    int32_t digit_ranges_len, ascii_ranges_len;
+    int32_t digit_set_len, ascii_set_len;
+
+    if (nustr == NULL)
+        return NULL;
+
+    // Source side: one range per script-specific digit block.
+    digit_ranges = L"\x0660-\x0669\x06F0-\x06F9\x07C0-\x07C9\x0966-\x096F"
+        L"\x09E6-\x09EF\x0A66-\x0A6F\x0AE6-\x0AEF\x0B66-\x0B6F"
+        L"\x0BE6-\x0BEF\x0C66-\x0C6F\x0CE6-\x0CEF\x0D66-\x0D6F"
+        L"\x0E50-\x0E59\x0ED0-\x0ED9\x0F20-\x0F29\x1040-\x1049"
+        L"\x1090-\x1099\x17E0-\x17E9\x1810-\x1819\x1946-\x194F"
+        L"\x19D0-\x19D9\x1A80-\x1A89\x1A90-\x1A99\x1B50-\x1B59"
+        L"\x1BB0-\x1BB9\x1C40-\x1C49\x1C50-\x1C59\xA620-\xA629"
+        L"\xA8D0-\xA8D9\xA900-\xA909\xA9D0-\xA9D9\xAA50-\xAA59"
+        L"\xABF0-\xABF9\xFF10-\xFF19";
+    digit_ranges_len = sizeof(L"\x0660-\x0669\x06F0-\x06F9\x07C0-\x07C9\x0966-\x096F"
+        L"\x09E6-\x09EF\x0A66-\x0A6F\x0AE6-\x0AEF\x0B66-\x0B6F"
+        L"\x0BE6-\x0BEF\x0C66-\x0C6F\x0CE6-\x0CEF\x0D66-\x0D6F"
+        L"\x0E50-\x0E59\x0ED0-\x0ED9\x0F20-\x0F29\x1040-\x1049"
+        L"\x1090-\x1099\x17E0-\x17E9\x1810-\x1819\x1946-\x194F"
+        L"\x19D0-\x19D9\x1A80-\x1A89\x1A90-\x1A99\x1B50-\x1B59"
+        L"\x1BB0-\x1BB9\x1C40-\x1C49\x1C50-\x1C59\xA620-\xA629"
+        L"\xA8D0-\xA8D9\xA900-\xA909\xA9D0-\xA9D9\xAA50-\xAA59"
+        L"\xABF0-\xABF9\xFF10-\xFF19") / sizeof(UChar);
+
+    // Target side: "0-9" repeated, one copy per source range.
+    ascii_ranges = L"0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-"
+        L"90-90-90-90-90-90-90-90-90-90-90-90-90-90-9";
+    ascii_ranges_len = sizeof(L"0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-"
+        L"90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-"
+        L"90-90-9") / sizeof(UChar);
+
+    // Size each expansion first, then allocate room for it plus a terminator.
+    digit_set_len = regexp_expand_string(digit_ranges, digit_ranges_len, NULL, 0);
+    digit_set = palloc(sizeof(UChar) * (digit_set_len + 1));
+
+    ascii_set_len = regexp_expand_string(ascii_ranges, ascii_ranges_len, NULL, 0);
+    ascii_set = palloc(sizeof(UChar) * (ascii_set_len + 1));
+
+    // Fill the buffers and terminate them explicitly.
+    regexp_expand_string(digit_ranges, digit_ranges_len, digit_set, digit_set_len);
+    regexp_expand_string(ascii_ranges, ascii_ranges_len, ascii_set, ascii_set_len);
+    digit_set[digit_set_len] = '\0';
+    ascii_set[ascii_set_len] = '\0';
+
+    nustr = u_strtransliterate(digit_set, ascii_set, nustr, u_strlen(nustr));
+
+    pfree(digit_set);
+    pfree(ascii_set);
+
+    return nustr;
+}
+
+/*
+ * leading_trailing_spaces - tidy up whitespace in nustr.
+ *
+ * Applies three u_strreplace() calls in sequence:
+ *   "\s+"   -> " "   (last argument 1)
+ *   "^\s+"  -> ""    (last argument 0)
+ *   "\s+$"  -> ""    (last argument 1)
+ * i.e. whitespace runs are collapsed to one space and whitespace at either
+ * end of the string is removed.  The meaning of the last u_strreplace()
+ * argument is defined by that helper, outside this function.
+ *
+ * Returns the pointer handed back by the final u_strreplace() call, or NULL
+ * when nustr is NULL.
+ */
+static UChar *leading_trailing_spaces(UChar * nustr)
+{
+	UChar pattern[20], subst[20];
+
+	if (nustr == NULL)
+		return NULL;
+
+	// Collapse every run of whitespace into a single space.
+	u_uastrncpy(pattern, "\\s+", sizeof("\\s+"));
+	u_uastrncpy(subst, " ", sizeof(" "));
+	nustr = u_strreplace(pattern, subst, nustr, u_strlen(nustr), 1);
+
+	// Both remaining passes delete their match, so the empty replacement
+	// is set up once and reused.
+	u_uastrncpy(subst, "", sizeof(""));
+
+	// Drop whitespace at the start of the string...
+	u_uastrncpy(pattern, "^\\s+", sizeof("^\\s+"));
+	nustr = u_strreplace(pattern, subst, nustr, u_strlen(nustr), 0);
+
+	// ...and at the end.
+	u_uastrncpy(pattern, "\\s+$", sizeof("\\s+$"));
+	return u_strreplace(pattern, subst, nustr, u_strlen(nustr), 1);
+}
+
+/*
+ * normalize - NACO-style normalization shared by naco_normalize() and
+ * search_normalize().
+ *
+ * str        UTF-8 text value to normalize.
+ * sf         UTF-8 text value that is converted to UTF-16 and handed to
+ *            transformations_on_unicode().
+ * is_search  forwarded unchanged to additional_substitutions().
+ *
+ * Pipeline: UTF-8 -> UTF-16, upper-case, strip U+0098..U+009C delimited
+ * spans, NFKD, the helper steps (additional substitutions, category based
+ * transformations, placeholder handling, decimal digits, whitespace tidy-up),
+ * lower-case, UTF-16 -> UTF-8.
+ *
+ * Returns a freshly palloc'd text value, or NULL when an argument is NULL,
+ * an ICU call fails, or one of the helper steps yields NULL.
+ */
+text *normalize(text *str, text *sf, int is_search)
+{
+	UChar *ustr = NULL, *nustr = NULL, *temp = NULL, *usf = NULL;
+	int32_t nustr_len, temp_len, str_len, sf_len, ustr_len, usf_len, out_len;
+	const UNormalizer2 *normalizer;
+	text *out = NULL;
+	char *s;
+	UChar uregexp[200], replacement[200];
+	UErrorCode err = U_ZERO_ERROR;
+
+	if (str == NULL || sf == NULL)
+	{
+		return NULL;
+	}
+
+	normalizer = unorm2_getNFKDInstance(&err);
+	if (U_FAILURE(err))
+	{
+		return NULL;
+	}
+
+	s = VARDATA(str);
+
+	str_len = VARSIZE(str) - VARHDRSZ;
+	sf_len = VARSIZE(sf) - VARHDRSZ;
+
+	//A UTF-8 string never needs more UTF-16 units than it has bytes, so
+	//"byte length + 1" always leaves room for the terminator.
+	temp = palloc((str_len + 1) * sizeof(UChar));
+	usf = palloc((sf_len + 1) * sizeof(UChar));
+
+	//Keep the real UTF-16 length: it is smaller than the byte length as soon
+	//as the input holds a multi-byte sequence.  The return value is not
+	//assigned back so the buffers can still be released on failure.
+	u_strFromUTF8(temp, str_len + 1, &temp_len, s, str_len, &err);
+	if (U_FAILURE(err))
+	{
+		goto Cleanup;
+	}
+
+	u_strFromUTF8(usf, sf_len + 1, &usf_len, VARDATA(sf), sf_len, &err);
+	if (U_FAILURE(err))
+	{
+		goto Cleanup;
+	}
+	usf[usf_len] = '\0';
+
+	//Apply NACO normalization to input string; based on
+	//http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf
+	//
+	//Note that unlike a strict reading of the NACO normalization rules,
+	//output is returned as lowercase instead of uppercase for compatibility
+	//with previous versions of the Evergreen naco_normalize routine.
+	//
+	//Convert to upper-case first; even though final output will be lowercase, doing this will
+	//ensure that the German eszett (ß) and certain ligatures (ﬀ, ﬁ, ﬄ, etc.) will be handled correctly.
+	//
+	//NOTE(review): a NULL locale makes ICU use its default locale for case
+	//mapping; confirm that this is intended rather than the root locale "".
+
+	//First call only measures the result (reports U_BUFFER_OVERFLOW_ERROR).
+	ustr_len = u_strToUpper(NULL, 0, temp, temp_len, NULL, &err);
+	if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR)
+	{
+		goto Cleanup;
+	}
+
+	err = U_ZERO_ERROR;
+	ustr = palloc((ustr_len + 1) * sizeof(UChar));
+	u_strToUpper(ustr, ustr_len + 1, temp, temp_len, NULL, &err);
+	pfree(temp);
+	temp = NULL;
+	if (U_FAILURE(err))
+	{
+		goto Cleanup;
+	}
+	ustr[ustr_len] = '\0';
+
+	//Remove everything between U+0098 and U+009C, delimiters included.
+	u_uastrncpy(uregexp,
+		    "\\x{0098}.*?\\x{009C}",
+		    sizeof("\\x{0098}.*?\\x{009C}"));
+	u_uastrncpy(replacement, "", sizeof(""));
+
+	ustr = u_strreplace(uregexp, replacement, ustr, u_strlen(ustr), 1);
+	if (ustr == NULL)
+	{
+		goto Cleanup;
+	}
+	ustr_len = u_strlen(ustr);
+
+	//NFKD: measure, allocate (plus one for the '\0'), then normalize.
+	err = U_ZERO_ERROR;
+	nustr_len = unorm2_normalize(normalizer, ustr, ustr_len, NULL, 0, &err);
+	if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR)
+	{
+		goto Cleanup;
+	}
+
+	err = U_ZERO_ERROR;
+	nustr = palloc((nustr_len + 1) * sizeof(UChar));
+	unorm2_normalize(normalizer, ustr, ustr_len, nustr, nustr_len + 1, &err);
+	if (U_FAILURE(err))
+	{
+		goto Cleanup;
+	}
+	nustr[nustr_len] = '\0';
+
+	//additional substitutions - 3.6.
+	nustr = additional_substitutions(nustr, is_search);
+
+	//transformations based on Unicode category codes
+	nustr = transformations_on_unicode(nustr, usf);
+
+	//since we've stripped out the control characters, we can now
+	//use a few as placeholders temporarily
+	nustr = replace_placehoders(nustr);
+
+	//decimal digit
+	nustr = decimal_digits(nustr);
+
+	//intentionally skipping step 8 of the NACO algorithm; if the string
+	//gets normalized away, that's fine.
+
+	//leading and trailing spaces
+	nustr = leading_trailing_spaces(nustr);
+
+	//Every helper passes a NULL input straight through, so a single check
+	//here covers the whole chain before u_strlen() dereferences the result.
+	if (nustr == NULL)
+	{
+		goto Cleanup;
+	}
+	nustr_len = u_strlen(nustr);
+
+	//Lower-casing may change the number of UTF-16 units, so measure first
+	//instead of assuming the output is as long as the input.
+	err = U_ZERO_ERROR;
+	temp_len = u_strToLower(NULL, 0, nustr, nustr_len, NULL, &err);
+	if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR)
+	{
+		goto Cleanup;
+	}
+
+	err = U_ZERO_ERROR;
+	temp = palloc((temp_len + 1) * sizeof(UChar));
+	u_strToLower(temp, temp_len + 1, nustr, nustr_len, NULL, &err);
+	if (U_FAILURE(err))
+	{
+		goto Cleanup;
+	}
+
+	//Measure the UTF-8 result, then convert straight into the text payload.
+	err = U_ZERO_ERROR;
+	u_strToUTF8(NULL, 0, &out_len, temp, temp_len, &err);
+	if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR)
+	{
+		goto Cleanup;
+	}
+
+	err = U_ZERO_ERROR;
+	out = (text *) palloc(out_len + VARHDRSZ);
+	SET_VARSIZE(out, out_len + VARHDRSZ);
+
+	//The varlena header carries the length, so no '\0' is written: the
+	//payload is exactly out_len bytes and a terminator would land one byte
+	//past the allocation.
+	u_strToUTF8(VARDATA(out), out_len, NULL, temp, temp_len, &err);
+	if (U_FAILURE(err))
+	{
+		pfree(out);
+		out = NULL;
+	}
+
+Cleanup:
+	//Reached on success and on failure alike.  The NULL guards are kept
+	//because, unlike free(), pfree() must not be given a NULL pointer.
+	if (temp != NULL)
+		pfree(temp);
+	if (ustr != NULL)
+		pfree(ustr);
+	if (usf != NULL)
+		pfree(usf);
+	if (nustr != NULL)
+		pfree(nustr);
+
+	return out;
+}
+
+PG_FUNCTION_INFO_V1(naco_normalize);
+
+/*
+ * naco_normalize(text, text) - SQL-callable wrapper around normalize() with
+ * is_search = 0.
+ *
+ * Returns SQL NULL when either argument is SQL NULL or when normalize()
+ * returns NULL; otherwise returns the normalized text.
+ */
+Datum naco_normalize(PG_FUNCTION_ARGS)
+{
+	text *str;
+	text *sf;
+
+	//SQL NULL has to be detected with PG_ARGISNULL() before the argument is
+	//fetched: PG_GETARG_TEXT_P() detoasts the datum and must not be handed a
+	//NULL one (which can happen unless the function is declared STRICT).
+	if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
+	{
+		PG_RETURN_NULL();
+	}
+
+	str = PG_GETARG_TEXT_P(0);
+	sf = PG_GETARG_TEXT_P(1);
+
+	str = normalize(str, sf, 0);
+	if (str == NULL)
+	{
+		PG_RETURN_NULL();
+	}
+
+	PG_RETURN_TEXT_P(str);
+}
+
+PG_FUNCTION_INFO_V1(search_normalize);
+
+/*
+ * search_normalize(text, text) - SQL-callable wrapper around normalize() with
+ * is_search = 1.
+ *
+ * Returns SQL NULL when either argument is SQL NULL or when normalize()
+ * returns NULL; otherwise returns the normalized text.
+ */
+Datum search_normalize(PG_FUNCTION_ARGS)
+{
+	text *str;
+	text *sf;
+
+	//SQL NULL has to be detected with PG_ARGISNULL() before the argument is
+	//fetched: PG_GETARG_TEXT_P() detoasts the datum and must not be handed a
+	//NULL one (which can happen unless the function is declared STRICT).
+	if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
+	{
+		PG_RETURN_NULL();
+	}
+
+	str = PG_GETARG_TEXT_P(0);
+	sf = PG_GETARG_TEXT_P(1);
+
+	str = normalize(str, sf, 1);
+	if (str == NULL)
+	{
+		PG_RETURN_NULL();
+	}
+
+	PG_RETURN_TEXT_P(str);
+}
diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.create_extension.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.create_extension.sql
new file mode 100644
index 0000000000..b42e7904e6
--- /dev/null
+++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.create_extension.sql
@@ -0,0 +1,10 @@
+-- Swap the PL/Perl naco_normalize/search_normalize for the C versions shipped
+-- in the c_functions extension. One transaction: if CREATE EXTENSION fails, the DROPs roll back.
+BEGIN;
+
+DROP FUNCTION IF EXISTS public.naco_normalize( TEXT, TEXT);
+DROP FUNCTION IF EXISTS public.search_normalize( TEXT, TEXT);
+
+CREATE EXTENSION c_functions;
+
+COMMIT;
diff --git a/Open-ILS/tests/naco_normalize.t b/Open-ILS/tests/naco_normalize.t
index 182ebab67d..25a27c0142 100644
--- a/Open-ILS/tests/naco_normalize.t
+++ b/Open-ILS/tests/naco_normalize.t
@@ -19,7 +19,7 @@ use OpenILS::Utils::Normalize qw( naco_normalize );
 
 # Database connection parameters
 my $db_driver = 'Pg';
-my $db_host   = 'evergreen';
+my $db_host   = 'localhost';
 my $db_port   = '5432';
 my $db_name   = 'evergreen';
 my $db_user   = 'evergreen';