From: Swenyu Duan Date: Mon, 4 Jun 2012 02:45:21 +0000 (-0400) Subject: A tested version of the PostgreSQL extension. X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=9da882b9483a4ff0678a8c3b5570318da59e9fd7;p=working%2FEvergreen.git A tested version of the PostgreSQL extension. The extension now contains two functions implemented in C (naco_normalize and search_normalize). To add the extension to PostgreSQL, an upgrade SQL script is included at Open-ILS/src/sql/Pg/upgrade/XXXX.schema.create_extension.sql. This script drops the plperlu versions of naco_normalize and search_normalize and creates an extension that adds C versions of the two functions. The extension depends on the ICU4C library. The environment variable ICU_DATA must be set to the correct ICU data path before you use the extension. The extension has been tested in Debian and Windows environments (the 50 cases in /Open-ILS/tests/naco_normalize.t have passed). The source file in this extension is the Debian/Linux version of the extension; the Windows version has some differences. 
Signed-off-by: Swenyu Duan --- diff --git a/Open-ILS/src/sql/Pg/extensions/makefile b/Open-ILS/src/sql/Pg/extensions/makefile index 8aa4ad3bf2..e3e0adce46 100644 --- a/Open-ILS/src/sql/Pg/extensions/makefile +++ b/Open-ILS/src/sql/Pg/extensions/makefile @@ -1,7 +1,7 @@ MODULE_big = c_functions EXTENSION = c_functions SHLIB_LINK = -licutu -licuuc -licuio -licui18n -licule -liculx -licudata -#PG_CPPFLAGS = -L/usr/lib -licuuc -licuio -licui18n -licule -liculx -licudata +PG_CPPFLAGS = -fshort-wchar DATA = c_functions--1.0.sql OBJS = normalize.functions_in_c.o diff --git a/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c b/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c index 26e48e0db6..a2128bba63 100755 --- a/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c +++ b/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c @@ -1,608 +1,718 @@ -/************************************************************************/ -/* C Implementation: public.search_normalize public.naco_normalize - * - * Descritption: - * This file implement public.search_normalize and public.naco_normalize. - * These two functions is included in the PostgreSQL extension c_functions. - * ICU4C and postgres lib is needed to build this file. - * - * Author: Swenyu Duan , (C) 2012 - * - * Copyright: See COPYING file that comes with this distribution. 
- */ -/************************************************************************/ -#include "postgres.h" -#include "string.h" -#include "fmgr.h" -#include "unicode/unorm2.h" -#include "unicode/utypes.h" -#include "unicode/ustring.h" -#include "unicode/uregex.h" -#include "unicode/umachine.h" - -#ifdef PG_MODULE_MAGIC -PG_MODULE_MAGIC; -#endif - -static int32_t regexp_transliterate(const UChar *search_list, - int32_t search_list_len, - const UChar *replacement_list, - int32_t replacement_list_len, - UChar *src, - int32_t src_len, - UChar *des, - int32_t des_capacity) -{ - int i, j; - int32_t des_len; - UChar *cur_pos; - - if (search_list == NULL || - replacement_list == NULL || - des == NULL || - src_len > des_capacity + 1) - { - return 0; - } - - des_len = 0; - - for (i = 0; i < src_len; i++) - { - if (des != NULL) - { - des[des_len] = src[i]; - } - des_len++; - - for (j = replacement_list_len; j < search_list_len; j++) - { - if (search_list[j] == src[i]) - { - des_len--; - break; - } - } - } - if (des == NULL) - { - //To store the tail '\\0'. - return des_len + 1; - } - - des[des_len] = '\0'; - - for (i = 0; i < replacement_list_len; i++) - { - cur_pos = u_strchr(des, search_list[i]); - - while (cur_pos != NULL) - { - *cur_pos = replacement_list[i]; - - //In case cur_pos is the last char in des. 
- if (cur_pos >= des + des_len) - { - break; - } - cur_pos = u_strchr(cur_pos + 1, search_list[i]); - } - } - - return des_len; -} - -static int32_t regexp_replace(const UChar *regexp, - int32_t regexp_len, - const UChar *replacement, - int32_t replacement_len, - UChar *src, - int32_t src_len, - UChar *des, - int32_t des_capacity, - int is_global) -{ - URegularExpression *regular_exp; - UErrorCode status; - UParseError pe; - int32_t len; - - if (regexp == NULL || replacement == NULL || src == NULL) - { - return 0; - } - - regular_exp = uregex_open(regexp, regexp_len, 0, &pe, &status); - if (regular_exp == NULL) - { - return 0; - } - - uregex_setText(regular_exp, src, src_len, &status); - - if (is_global > 0) - { - len = uregex_replaceAll(regular_exp, - replacement, - replacement_len, - NULL, - 0, - &status); - if (des == NULL || des_capacity < len) - { - uregex_close(regular_exp); - return len; - } - - uregex_replaceAll(regular_exp, - replacement, - replacement_len, - des, - des_capacity, - &status); - } - else - { - len = uregex_replaceFirst(regular_exp, - replacement, - replacement_len, - NULL, - 0, - &status); - if (des == NULL || des_capacity < len) - { - uregex_close(regular_exp); - return len; - } - - uregex_replaceFirst(regular_exp, - replacement, - replacement_len, - des, - des_capacity, - &status); - } - - uregex_close(regular_exp); - return len; -} - -static UChar *u_strtransliterate(UChar *search_list, - UChar *replacement_list, - UChar *str, - int32_t str_capacity) -{ - int32_t search_list_len, replacement_list_len, str_len; - UChar *des; - int32_t des_len; - - if (search_list == NULL || replacement_list == NULL || str == NULL) - { - return NULL; - } - - search_list_len = u_strlen(search_list); - replacement_list_len = u_strlen(replacement_list); - str_len = u_strlen(str); - - des_len = regexp_transliterate(search_list, - search_list_len, - replacement_list, - replacement_list_len, - str, - str_len, - NULL, - 0); - des = palloc(des_len * sizeof(UChar)); 
- des_len = regexp_transliterate(search_list, - search_list_len, - replacement_list, - replacement_list_len, - str, - str_len, - des, - des_len); - - pfree(str); - return des; -} - -static UChar *u_strreplace(UChar *regexp, - UChar *replacement, - UChar *str, - int32_t str_capacity, - int is_global) -{ - int32_t regexp_len, replacement_len, str_len; - UChar *des; - int32_t des_len; - - if (regexp == NULL || replacement == NULL || str == NULL) - { - return NULL; - } - - regexp_len = u_strlen(regexp); - replacement_len = u_strlen(replacement); - str_len = u_strlen(str); - - des_len = regexp_replace(regexp, - regexp_len, - replacement, - replacement_len, - str, - str_len, - NULL, - 0, - is_global); - - des = palloc(des_len * sizeof(UChar)); - - des_len = regexp_replace(regexp, - regexp_len, - replacement, - replacement_len, - str, - str_len, - des, - des_len, - is_global); - pfree(str); - return des; -} - -UChar *additional_substitutions(UChar *nustr, int is_search) -{ - char *regexp; - UChar uregexp[200], replacement[200]; - - if (nustr == NULL) - { - return NULL; - } - - regexp = "\\x{00C6}"; - u_uastrncpy(uregexp, regexp, strlen(regexp)); - u_uastrncpy(replacement, "AE", strlen("AE")); - - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); - - regexp = "\\x{00DE}"; - u_uastrncpy(uregexp, regexp, strlen(regexp)); - u_uastrncpy(replacement, "TH", strlen("TH")); - - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); - - regexp = "\\x{0152}"; - u_uastrncpy(uregexp, regexp, strlen(regexp)); - u_uastrncpy(replacement, "OE", strlen("OE")); - - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); - - if (is_search) - { - regexp = "\\x{0110}\\x{00D0}\\x{00D8}\\x{0141}\\x{2113}\\x{02BB}\\x{02BC}]["; - } - else - { - regexp = "\\x{0110}\\x{00D0}\\x{00D8}\\x{0141}\\x{2113}\\x{02BB}\\x{02BC}]['"; - } - - u_uastrncpy(uregexp, regexp, strlen(regexp)); - u_uastrncpy(replacement, "OE", strlen("OE")); - - nustr = 
u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); - - regexp = "\\x{0110}\\x{00D0}\\x{00D8}\\x{0141}\\x{2113}\\x{02BB}\\x{02BC}]['"; - u_uastrncpy(uregexp, regexp, strlen(regexp)); - u_uastrncpy(replacement, "DDOLl", strlen("DDOLl")); - - nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr)); - - return nustr; -} - -UChar *transformations_on_unicode(UChar *nustr, UChar *usf) -{ - char *regexp; - UChar uregexp[200], replacement[200]; - UChar *comma; - int32_t nustr_len; - - if (nustr == NULL) - { - return NULL; - } - - nustr_len = u_strlen(nustr); - - regexp = "[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cs}\\p{Lm}\\p{Mc}\\p{Me}\\p{Mn}]"; - u_uastrncpy(uregexp, regexp, strlen(regexp)); - u_uastrncpy(replacement, "", strlen("")); - - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); - - if (usf != NULL && usf[0] == 0x61) //0x61 == 'a' in utf16 - { - comma = u_strchr(nustr, 0x2c); //0x2c == ',' in utf16 - if (comma != NULL) - { - if (comma != nustr + nustr_len - 1) - { - regexp = ","; - u_uastrncpy(uregexp, regexp, strlen(regexp)); - replacement[0] = 0x7; - replacement[1] = 0; - - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 0); - } - } - } - - return nustr; -} - -UChar *replace_placehoders(UChar *nustr) -{ - UChar uregexp[200], replacement[200]; - - if (nustr == NULL) - { - return NULL; - } - - u_uastrncpy(uregexp, - "+&@\\x{266D}\\x{266F}#", - strlen( "+&@\\x{266D}\\x{266F}#")); - u_uastrncpy(replacement, - "\\x01\\x02\\x03\\x04\\x05\\x06", - strlen("\\x01\\x02\\x03\\x04\\x05\\x06")); - - nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr)); - - u_uastrncpy(uregexp, - "[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\p{Pi}\\p{Po}\\p{Ps}\\p{Sk}\\p{Sm}\\p{So}\\p{Zl}\\p{Zp}\\p{Zs}]", - strlen( "[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\\\p{Pi}\\p{Po}\\p{Ps}\\p{Sk}\\p{Sm}\\p{So}\\p{Zl}\\p{Zp}\\p{Zs}]")); - u_uastrncpy(replacement, "", strlen("")); - - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); - 
- u_uastrncpy(uregexp, - "\\x01\\x02\\x03\\x04\\x05\\x06\\x07", - strlen( "\\x01\\x02\\x03\\x04\\x05\\x06\\x07")); - u_uastrncpy(replacement, - "+&@\\x{266D}\\x{266F}#,", - strlen("+&@\\x{266D}\\x{266F}#,")); - - nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr)); - - - return nustr; -} - -UChar *decimal_digits(UChar *nustr) -{ - UChar uregexp[300], replacement[300]; - - if (nustr == NULL) - { - return NULL; - } - - u_uastrncpy(uregexp, - "\\x{0660}-\\x{0669}\\x{06F0}-\\x{06F9}\\x{07C0}-\\x{07C9}\\x{0966}-\\x{096F}" - "\\x{09E6}-\\x{09EF}\\x{0A66}-\\x{0A6F}\\x{0AE6}-\\x{0AEF}\\x{0B66}-\\x{0B6F}" - "\\x{0BE6}-\\x{0BEF}\\x{0C66}-\\x{0C6F}\\x{0CE6}-\\x{0CEF}\\x{0D66}-\\x{0D6F}" - "\\x{0E50}-\\x{0E59}\\x{0ED0}-\\x{0ED9}\\x{0F20}-\\x{0F29}\\x{1040}-\\x{1049}" - "\\x{1090}-\\x{1099}\\x{17E0}-\\x{17E9}\\x{1810}-\\x{1819}\\x{1946}-\\x{194F}" - "\\x{19D0}-\\x{19D9}\\x{1A80}-\\x{1A89}\\x{1A90}-\\x{1A99}\\x{1B50}-\\x{1B59}" - "\\x{1BB0}-\\x{1BB9}\\x{1C40}-\\x{1C49}\\x{1C50}-\\x{1C59}\\x{A620}-\\x{A629}" - "\\x{A8D0}-\\x{A8D9}\\x{A900}-\\x{A909}\\x{A9D0}-\\x{A9D9}\\x{AA50}-\\x{AA59}" - "\\x{ABF0}-\\x{ABF9}\\x{FF10}-\\x{FF19}", - strlen("\\x{0660}-\\x{0669}\\x{06F0}-\\x{06F9}\\x{07C0}-\\x{07C9}\\x{0966}-\\x{096F}" - "\\x{09E6}-\\x{09EF}\\x{0A66}-\\x{0A6F}\\x{0AE6}-\\x{0AEF}\\x{0B66}-\\x{0B6F}" - "\\x{0BE6}-\\x{0BEF}\\x{0C66}-\\x{0C6F}\\x{0CE6}-\\x{0CEF}\\x{0D66}-\\x{0D6F}" - "\\x{0E50}-\\x{0E59}\\x{0ED0}-\\x{0ED9}\\x{0F20}-\\x{0F29}\\x{1040}-\\x{1049}" - "\\x{1090}-\\x{1099}\\x{17E0}-\\x{17E9}\\x{1810}-\\x{1819}\\x{1946}-\\x{194F}" - "\\x{19D0}-\\x{19D9}\\x{1A80}-\\x{1A89}\\x{1A90}-\\x{1A99}\\x{1B50}-\\x{1B59}" - "\\x{1BB0}-\\x{1BB9}\\x{1C40}-\\x{1C49}\\x{1C50}-\\x{1C59}\\x{A620}-\\x{A629}" - "\\x{A8D0}-\\x{A8D9}\\x{A900}-\\x{A909}\\x{A9D0}-\\x{A9D9}\\x{AA50}-\\x{AA59}" - "\\x{ABF0}-\\x{ABF9}\\x{FF10}-\\x{FF19}")); - - u_uastrncpy(replacement, - "0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9", - 
strlen("0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9")); - - nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr)); - - return nustr; -} - -UChar *leading_trailing_spaces(UChar * nustr) -{ - UChar uregexp[200], replacement[200]; - - if (nustr == NULL) - { - return NULL; - } - - u_uastrncpy(uregexp, "\\s+", strlen( "\\s+")); - u_uastrncpy(replacement, " ", strlen(" ")); - - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); - - u_uastrncpy(uregexp, "^\\s+", strlen( "^\\s+")); - u_uastrncpy(replacement, "", strlen("")); - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 0); - - u_uastrncpy(uregexp, "\\s+$", strlen( "\\s+$")); - - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); - - return nustr; - -} - -text *normalize(text *str, text *sf, int is_search) -{ - UChar *ustr, *nustr, *temp, *usf; - int32_t nustr_len, temp_len, str_len; - UNormalizer2 *normalizer; - char *regexp, *result; - UChar uregexp[200], replacement[200]; - UErrorCode err; - - if (str == NULL || sf == NULL) - { - return NULL; - } - - normalizer = (UNormalizer2 *)unorm2_getNFKDInstance(&err); - if (U_FAILURE(err)) - { - return NULL; - } - - ustr = palloc(VARSIZE(str) * sizeof(UChar)); - nustr = palloc(VARSIZE(str)* sizeof(UChar)); - temp = palloc(VARSIZE(str) * sizeof(UChar)); - usf = palloc(VARSIZE(sf) * sizeof(UChar)); - - temp = u_strFromUTF8(temp, VARSIZE(str), NULL, VARDATA(str), VARSIZE(str), &err); - if (U_FAILURE(err) || temp == NULL) - { - str = NULL; - goto Fail; - } - - nustr = u_strncpy(nustr, temp, VARSIZE(str)); - if (nustr == NULL) - { - str = NULL; - goto Fail; - } - - usf = u_strFromUTF8(usf, VARSIZE(sf), NULL, VARDATA(sf), VARSIZE(sf), &err); - if (usf == NULL) - { - str = NULL; - goto Fail; - } - //Apply NACO normalization to input string; based on - //http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf - // - //Note that 
unlike a strict reading of the NACO normalization rules, - //output is returned as lowercase instead of uppercase for compatibility - //with previous versions of the Evergreen naco_normalize routine. - // - //Convert to upper-case first; even though final output will be lowercase, doing this will - //ensure that the German eszett (?) and certain ligatures (?, ?, ?, etc.) will be handled correctly. - //If there are any bugs in Perl's implementation of upcasing, they will be passed through here. - - u_strToUpper(ustr, - VARSIZE(str), - temp, - VARSIZE(str), - NULL, - &err); - pfree(temp); - temp = NULL; - if (U_FAILURE(err)) - { - str = NULL; - goto Fail; - } - - regexp = "\\x{0098}.*?\\x{009C}"; - u_uastrncpy(uregexp, regexp, strlen(regexp)); - u_uastrncpy(replacement, "", strlen("")); - - ustr = u_strreplace(uregexp, replacement, ustr, u_strlen(ustr), 1); - - unorm2_normalize(normalizer, ustr, VARSIZE(str), nustr, VARSIZE(str), &err); - if (U_FAILURE(err)) - { - str = NULL; - goto Fail; - } - - //additional substitutions - 3.6. - nustr = additional_substitutions(nustr, is_search); - - //transformations based on Unicode category codes - nustr = transformations_on_unicode(nustr, usf); - - //since we've stripped out the control characters, we can now - //use a few as placeholders temporarily - nustr = replace_placehoders(nustr); - - //decimal digit - nustr = decimal_digits(nustr); - - //intentionally skipping step 8 of the NACO algorithm; if the string - //gets normalized away, that's fine. 
- - //leading and trailing spaces - nustr = leading_trailing_spaces(nustr); - nustr_len = u_strlen(nustr); - - temp = palloc(nustr_len * sizeof(UChar)); - u_strToLower(temp, nustr_len, nustr, nustr_len, NULL, &err); - temp_len = nustr_len; - - - u_strToUTF8(NULL, 0, &str_len, temp, temp_len, &err); - - SET_VARSIZE(str, str_len + VARHDRSZ); - str = (text *)palloc(str_len + VARHDRSZ); - - result = u_strToUTF8(VARDATA(str), str_len, &str_len, temp, temp_len, &err); - - result[str_len] = '\0'; - - -Fail: - pfree(temp); - pfree(ustr); - pfree(nustr); - - return str; -} - -PG_FUNCTION_INFO_V1(naco_normalize); - -Datum naco_normalize(PG_FUNCTION_ARGS) -{ - text *str = PG_GETARG_TEXT_P(0); - text *sf = PG_GETARG_TEXT_P(1); - - - if (str == NULL || sf == NULL) - { - PG_RETURN_TEXT_P(NULL); - } - - str = normalize(str, sf, 0); - - PG_RETURN_TEXT_P(str); -} - -PG_FUNCTION_INFO_V1(search_normalize); - -Datum search_normalize(PG_FUNCTION_ARGS) -{ - text *str = PG_GETARG_TEXT_P(0); - text *sf = PG_GETARG_TEXT_P(1); - - - if (str == NULL || sf == NULL) - { - PG_RETURN_TEXT_P(NULL); - } - - str = normalize(str, sf, 1); - - PG_RETURN_TEXT_P(str); -} - +/************************************************************************/ +/* C Implementation: public.search_normalize public.naco_normalize + * + * Descritption: + * This file implement public.search_normalize and public.naco_normalize. + * These two functions is included in the PostgreSQL extension c_functions. + * ICU4C and postgres lib is needed to build this file. + * + * Author: Swenyu Duan , (C) 2012 + * + * Copyright: See COPYING file that comes with this distribution. 
+ */ +/************************************************************************/ +#include "postgres.h" +#include "fmgr.h" +#include "unicode/unorm2.h" +#include "unicode/utypes.h" +#include "unicode/ustring.h" +#include "unicode/uregex.h" +#include "unicode/umachine.h" + +#ifdef PG_MODULE_MAGIC +PG_MODULE_MAGIC; +#endif + +static int32_t regexp_expand_string(const UChar *src, + int32_t src_len, + UChar *des, + int32_t des_capacity) +{ + int des_len; + UChar s, e; + const UChar *cur_pos; + const UChar *pre_pos; + + if (src == NULL) + { + return 0; + } + pre_pos = src; + cur_pos = u_strchr(src, L'-'); + des_len = 0; + + while (cur_pos != NULL) + { + if (cur_pos + 1 > src + src_len) + { + //Error! + //The pattern is end with a '-'. + return -1; + } + + while (pre_pos < cur_pos - 1) + { + if (des != NULL) + { + des[des_len] = *pre_pos; + } + des_len++; + pre_pos++; + } + + cur_pos ++; + + for(s = *pre_pos; s < *cur_pos; s++) + { + if (des != NULL) + { + des[des_len] = s; + } + des_len++; + } + + pre_pos = cur_pos; + + cur_pos = u_strchr(cur_pos, L'-'); + } + + return des_len; +} + +static int32_t regexp_transliterate(const UChar *search_list, + int32_t search_list_len, + const UChar *replacement_list, + int32_t replacement_list_len, + UChar *src, + int32_t src_len, + UChar *des, + int32_t des_capacity) +{ + int i, j; + int32_t des_len; + UChar *cur_pos; + + if (search_list == NULL || + replacement_list == NULL) + { + return 0; + } + + des_len = 0; + + for (i = 0; i < src_len; i++) + { + if (des != NULL) + { + des[des_len] = src[i]; + } + des_len++; + + if(cur_pos = u_strchr(search_list, src[i])) + { + if(cur_pos - search_list > replacement_list_len) + des_len--; + } + } + if (des == NULL || des_len == 0) + { + return des_len; + } + + + for (i = 0; i < replacement_list_len; i++) + { + cur_pos = u_strchr(des, search_list[i]); + + while (cur_pos != NULL) + { + *cur_pos = replacement_list[i]; + + //In case cur_pos is the last char in des. 
+ if (cur_pos >= des + des_len) + { + break; + } + cur_pos = u_strchr(cur_pos + 1, search_list[i]); + } + } + + return des_len; + +} + +static int32_t regexp_replace(const UChar *regexp, + int32_t regexp_len, + const UChar *replacement, + int32_t replacement_len, + UChar *src, + int32_t src_len, + UChar *des, + int32_t des_capacity, + int is_global) +{ + URegularExpression *regular_exp; + UErrorCode status; + UParseError pe; + int32_t len; + + if (regexp == NULL || replacement == NULL || src == NULL) + { + return 0; + } + + status = 0; + + + regular_exp = uregex_open(regexp, regexp_len, 0, &pe, &status); + if (regular_exp == NULL) + { + return 0; + } + + status = 0; + uregex_setText(regular_exp, src, src_len, &status); + + status = 0; + if (is_global > 0) + { + len = uregex_replaceAll(regular_exp, + replacement, + replacement_len, + NULL, + 0, + &status); + if (des == NULL || des_capacity < len) + { + uregex_close(regular_exp); + return len; + } + status = 0; + + uregex_replaceAll(regular_exp, + replacement, + replacement_len, + des, + des_capacity, + &status); + } + else + { + len = uregex_replaceFirst(regular_exp, + replacement, + replacement_len, + NULL, + 0, + &status); + if (des == NULL || des_capacity < len) + { + uregex_close(regular_exp); + return len; + } + + status = 0; + uregex_replaceFirst(regular_exp, + replacement, + replacement_len, + des, + des_capacity, + &status); + } + + uregex_close(regular_exp); + return len; +} + +static UChar *u_strtransliterate(UChar *search_list, + UChar *replacement_list, + UChar *str, + int32_t str_capacity) +{ + int32_t search_list_len, replacement_list_len, str_len; + UChar *des; + int32_t des_len; + + if (search_list == NULL || replacement_list == NULL || str == NULL) + { + return NULL; + } + + search_list_len = u_strlen(search_list); + replacement_list_len = u_strlen(replacement_list); + str_len = u_strlen(str); + + des_len = regexp_transliterate(search_list, + search_list_len, + replacement_list, + 
replacement_list_len, + str, + str_len, + NULL, + 0); + des = palloc((des_len + 1) * sizeof(UChar)); + des_len = regexp_transliterate(search_list, + search_list_len, + replacement_list, + replacement_list_len, + str, + str_len, + des, + des_len); + if(des != NULL) + des[des_len] = '\0'; + + pfree(str); + return des; +} + +static UChar *u_strreplace(UChar *regexp, + UChar *replacement, + UChar *str, + int32_t str_capacity, + int is_global) +{ + int32_t regexp_len, replacement_len, str_len; + UChar *des; + int32_t des_len; + + if (regexp == NULL || replacement == NULL || str == NULL) + { + return NULL; + } + + regexp_len = u_strlen(regexp); + replacement_len = u_strlen(replacement); + str_len = u_strlen(str); + + des_len = regexp_replace(regexp, + regexp_len, + replacement, + replacement_len, + str, + str_len, + NULL, + 0, + is_global); + + des = palloc((des_len + 1)* sizeof(UChar)); + + des_len = regexp_replace(regexp, + regexp_len, + replacement, + replacement_len, + str, + str_len, + des, + des_len, + is_global); + if(des != NULL) + des[des_len] = '\0'; + + pfree(str); + return des; +} + +static UChar *additional_substitutions(UChar *nustr, int is_search) +{ + UChar uregexp[200], replacement[200]; + + if (nustr == NULL) + { + return NULL; + } + + u_uastrncpy(uregexp, "\\x{00C6}", sizeof("\\x{00C6}")); + u_uastrncpy(replacement, "AE", sizeof("AE")); + + nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); + + u_uastrncpy(uregexp, "\\x{00DE}", sizeof("\\x{00DE}")); + u_uastrncpy(replacement, "TH", sizeof("TH")); + + nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); + + u_uastrncpy(uregexp, "\\x{0152}", sizeof("\\x{0152}")); + u_uastrncpy(replacement, "OE", sizeof("OE")); + + nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); + + if (is_search) + { + u_strncpy(uregexp, + L"\x0110\x00D0\x00D8\x0141\x2113\x02BB\x02BC][", + sizeof(L"\x0110\x00D0\x00D8\x0141\x2113\x02BB" + L"\x02BC][") / sizeof(UChar)); + + } + 
else + { + u_strncpy(uregexp, + L"\x0110\x00D0\x00D8\x0141\x2113\x02BB\x02BC]['", + sizeof(L"\x0110\x00D0\x00D8\x0141\x2113" + L"\x02BB\x02BC]['") / sizeof(UChar)); + + } + u_uastrncpy(replacement, "DDOLl", sizeof("DDOLl")); + + nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr)); + + return nustr; +} + +static UChar *transformations_on_unicode(UChar *nustr, UChar *usf) +{ + UChar uregexp[200], replacement[200]; + UChar *comma; + int32_t nustr_len; + + if (nustr == NULL) + { + return NULL; + } + + nustr_len = u_strlen(nustr); + + u_uastrncpy(uregexp, + "[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cs}\\p{Lm}\\p{Mc}\\p{Me}\\p{Mn}]", + sizeof("[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cs}\\p{Lm}\\p{Mc}\\p{Me}\\p{Mn}]")); + u_uastrncpy(replacement, "", sizeof("")); + + nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); + + if (usf != NULL && usf[0] == 0x61) //0x61 == 'a' in utf16 + { + comma = u_strchr(nustr, 0x2c); //0x2c == ',' in utf16 + if (comma != NULL) + { + if (comma != nustr + nustr_len - 1) + { + u_uastrncpy(uregexp, ",", sizeof(",")); + replacement[0] = 0x7; + replacement[1] = 0; + + nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 0); + } + } + } + + return nustr; +} + +static UChar *replace_placehoders(UChar *nustr) +{ + UChar uregexp[200], replacement[200]; + + if (nustr == NULL) + { + return NULL; + } + + u_strncpy(uregexp, + L"+&@\x266D\x266F#", + sizeof(L"+&@\x266D\x266F#") / sizeof(UChar)); + u_strncpy(replacement, + L"\x01\x02\x03\x04\x05\x06", + sizeof(L"\x01\x02\x03\x04\x05\x06") / sizeof(UChar)); + + nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr)); + + u_uastrncpy(uregexp, + "[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\p{Pi}\\p{Po}\\p{Ps}\\p{Sk}\\p{Sm}\\p{So}\\p{Zl}\\p{Zp}\\p{Zs}]", + sizeof("[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\p{Pi}\\p{Po}\\p{Ps}\\p{Sk}\\p{Sm}\\p{So}\\p{Zl}\\p{Zp}\\p{Zs}]")); + u_uastrncpy(replacement, " ", sizeof(" ")); + + nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); + + 
u_strncpy(uregexp, + L"\x01\x02\x03\x04\x05\x06\x07", + sizeof(L"\x01\x02\x03\x04\x05\x06\x07") / sizeof(UChar)); + u_strncpy(replacement, + L"+&@\x266D\x266F#,", + sizeof(L"+&@\x266D\x266F#,") / sizeof(UChar)); + + nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr)); + + + return nustr; +} + + +static UChar *decimal_digits(UChar *nustr) +{ + UChar *uregexp, *replacement; + UChar *expand_exp, *expand_replacement; + int32_t uregexp_len, replacement_len; + int32_t expand_exp_len, expand_replacement_len; + + if (nustr == NULL) + { + return NULL; + } + + uregexp = L"\x0660-\x0669\x06F0-\x06F9\x07C0-\x07C9\x0966-\x096F" + L"\x09E6-\x09EF\x0A66-\x0A6F\x0AE6-\x0AEF\x0B66-\x0B6F" + L"\x0BE6-\x0BEF\x0C66-\x0C6F\x0CE6-\x0CEF\x0D66-\x0D6F" + L"\x0E50-\x0E59\x0ED0-\x0ED9\x0F20-\x0F29\x1040-\x1049" + L"\x1090-\x1099\x17E0-\x17E9\x1810-\x1819\x1946-\x194F" + L"\x19D0-\x19D9\x1A80-\x1A89\x1A90-\x1A99\x1B50-\x1B59" + L"\x1BB0-\x1BB9\x1C40-\x1C49\x1C50-\x1C59\xA620-\xA629" + L"\xA8D0-\xA8D9\xA900-\xA909\xA9D0-\xA9D9\xAA50-\xAA59" + L"\xABF0-\xABF9\xFF10-\xFF19"; + + uregexp_len = sizeof(L"\x0660-\x0669\x06F0-\x06F9\x07C0-\x07C9\x0966-\x096F" + L"\x09E6-\x09EF\x0A66-\x0A6F\x0AE6-\x0AEF\x0B66-\x0B6F" + L"\x0BE6-\x0BEF\x0C66-\x0C6F\x0CE6-\x0CEF\x0D66-\x0D6F" + L"\x0E50-\x0E59\x0ED0-\x0ED9\x0F20-\x0F29\x1040-\x1049" + L"\x1090-\x1099\x17E0-\x17E9\x1810-\x1819\x1946-\x194F" + L"\x19D0-\x19D9\x1A80-\x1A89\x1A90-\x1A99\x1B50-\x1B59" + L"\x1BB0-\x1BB9\x1C40-\x1C49\x1C50-\x1C59\xA620-\xA629" + L"\xA8D0-\xA8D9\xA900-\xA909\xA9D0-\xA9D9\xAA50-\xAA59" + L"\xABF0-\xABF9\xFF10-\xFF19") / sizeof(UChar); + replacement = L"0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-" + L"90-90-90-90-90-90-90-90-90-90-90-90-90-90-9"; + + replacement_len = sizeof(L"0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-" + L"90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-" + L"90-90-9") / sizeof(UChar); + + expand_exp_len = regexp_expand_string(uregexp, uregexp_len, NULL, 0); + + 
expand_exp = palloc(sizeof(UChar) * (expand_exp_len + 1)); + + expand_replacement_len = regexp_expand_string(replacement, replacement_len, NULL, 0); + + expand_replacement = palloc(sizeof(UChar) * (expand_replacement_len + 1)); + + regexp_expand_string(uregexp, uregexp_len, expand_exp, expand_exp_len); + regexp_expand_string(replacement, replacement_len, expand_replacement, expand_replacement_len); + + expand_exp[expand_exp_len] = '\0'; + expand_replacement[expand_replacement_len] = '\0'; + + nustr = u_strtransliterate(expand_exp, expand_replacement, nustr, u_strlen(nustr)); + + pfree(expand_exp); + pfree(expand_replacement); + return nustr; +} + +static UChar *leading_trailing_spaces(UChar * nustr) +{ + UChar uregexp[20], replacement[20]; + + if (nustr == NULL) + { + return NULL; + } + + u_uastrncpy(uregexp, "\\s+", sizeof("\\s+")); + u_uastrncpy(replacement, " ", sizeof(" ")); + + nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); + + u_uastrncpy(uregexp, "^\\s+", sizeof("^\\s+")); + u_uastrncpy(replacement, "", sizeof("")); + nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 0); + + u_uastrncpy(uregexp, "\\s+$", sizeof("\\s+$")); + + nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); + + return nustr; + +} + +text *normalize(text *str, text *sf, int is_search) +{ + UChar *ustr = NULL, *nustr = NULL, *temp = NULL, *usf = NULL; + int32_t nustr_len, temp_len, str_len, sf_len, ustr_len; + UNormalizer2 *normalizer; + char *regexp, *result; + char *s; + UChar uregexp[200], replacement[200]; + UErrorCode err = 0; + + if (str == NULL || sf == NULL) + { + return NULL; + } + + normalizer = unorm2_getNFKDInstance(&err); + if (U_FAILURE(err)) + { + return NULL; + } + + s = VARDATA(str); + + str_len = VARSIZE(str) - VARHDRSZ; + sf_len = VARSIZE(sf) - VARHDRSZ; + temp = palloc((str_len + 1)* sizeof(UChar)); + usf = palloc((sf_len + 1) * sizeof(UChar)); + + + temp = u_strFromUTF8(temp, str_len, NULL, s, str_len, &err); + 
if (U_FAILURE(err) || temp == NULL) + { + str = NULL; + goto Fail; + } + + usf = u_strFromUTF8(usf, sf_len, NULL, VARDATA(sf), sf_len, &err); + if (usf == NULL) + { + str = NULL; + goto Fail; + } + + usf[sf_len] = '\0'; + //Apply NACO normalization to input string; based on + //http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf + // + //Note that unlike a strict reading of the NACO normalization rules, + //output is returned as lowercase instead of uppercase for compatibility + //with previous versions of the Evergreen naco_normalize routine. + // + //Convert to upper-case first; even though final output will be lowercase, doing this will + //ensure that the German eszett (?) and certain ligatures (?, ?, ?, etc.) will be handled correctly. + //If there are any bugs in Perl's implementation of upcasing, they will be passed through here. + + ustr_len = u_strToUpper(NULL, + 0, + temp, + str_len, + NULL, + &err); + + err = 0; + ustr = palloc((ustr_len + 1) * sizeof(UChar)); + u_strToUpper(ustr, + ustr_len, + temp, + str_len, + NULL, + &err); + pfree(temp); + temp = NULL; + ustr[ustr_len] = '\0'; + if (U_FAILURE(err)) + { + str = NULL; + goto Fail; + } + + u_uastrncpy(uregexp, + "\\x{0098}.*?\\x{009C}", + sizeof("\\x{0098}.*?\\x{009C}")); + u_uastrncpy(replacement, "", sizeof("")); + + ustr = u_strreplace(uregexp, replacement, ustr, u_strlen(ustr), 1); + ustr_len = u_strlen(ustr); + + nustr_len = unorm2_normalize(normalizer, ustr, ustr_len, NULL, 0, &err); + + //To store the '\0'; + + nustr = palloc((nustr_len + 1)* sizeof(UChar)); + err = 0; + + unorm2_normalize(normalizer, ustr, ustr_len, nustr, nustr_len, &err); + if (U_FAILURE(err)) + { + str = NULL; + goto Fail; + } + nustr[nustr_len] = '\0'; + + //additional substitutions - 3.6. 
+ nustr = additional_substitutions(nustr, is_search); + + //transformations based on Unicode category codes + nustr = transformations_on_unicode(nustr, usf); + + //since we've stripped out the control characters, we can now + //use a few as placeholders temporarily + nustr = replace_placehoders(nustr); + + //decimal digit + nustr = decimal_digits(nustr); + + //intentionally skipping step 8 of the NACO algorithm; if the string + //gets normalized away, that's fine. + + //leading and trailing spaces + nustr = leading_trailing_spaces(nustr); + nustr_len = u_strlen(nustr); + + temp = palloc(nustr_len * sizeof(UChar)); + u_strToLower(temp, nustr_len, nustr, nustr_len, NULL, &err); + temp_len = nustr_len; + err = 0; + + u_strToUTF8(NULL, 0, &str_len, temp, temp_len, &err); + + err = 0; + str = (text *)palloc(str_len + VARHDRSZ); + SET_VARSIZE(str, str_len + VARHDRSZ); + + result = u_strToUTF8(VARDATA(str), str_len, &str_len, temp, temp_len, &err); + + if(result != NULL) + result[str_len] = '\0'; + + +Fail: + if(temp != NULL) + pfree(temp); + if(ustr != NULL) + pfree(ustr); + if(usf != NULL) + pfree(usf); + if(nustr != NULL) + pfree(nustr); + + return str; +} + +PG_FUNCTION_INFO_V1(naco_normalize); + +Datum naco_normalize(PG_FUNCTION_ARGS) +{ + text *str = PG_GETARG_TEXT_P(0); + text *sf = PG_GETARG_TEXT_P(1); + + + if (str == NULL || sf == NULL) + { + PG_RETURN_TEXT_P(NULL); + } + + str = normalize(str, sf, 0); + + if(str != NULL) + PG_RETURN_TEXT_P(str); + else + PG_RETURN_NULL(); +} + +PG_FUNCTION_INFO_V1(search_normalize); + +Datum search_normalize(PG_FUNCTION_ARGS) +{ + text *str = PG_GETARG_TEXT_P(0); + text *sf = PG_GETARG_TEXT_P(1); + + + if (str == NULL || sf == NULL) + { + PG_RETURN_TEXT_P(NULL); + } + + str = normalize(str, sf, 1); + + + if(str != NULL) + PG_RETURN_TEXT_P(str); + else + PG_RETURN_NULL(); +} diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.create_extension.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.create_extension.sql new file mode 
100644 index 0000000000..b42e7904e6 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.create_extension.sql @@ -0,0 +1,4 @@ +DROP FUNCTION public.naco_normalize( TEXT, TEXT); +DROP FUNCTION public.search_normalize( TEXT, TEXT); + +CREATE EXTENSION c_functions; diff --git a/Open-ILS/tests/naco_normalize.t b/Open-ILS/tests/naco_normalize.t index 182ebab67d..25a27c0142 100644 --- a/Open-ILS/tests/naco_normalize.t +++ b/Open-ILS/tests/naco_normalize.t @@ -19,7 +19,7 @@ use OpenILS::Utils::Normalize qw( naco_normalize ); # Database connection parameters my $db_driver = 'Pg'; -my $db_host = 'evergreen'; +my $db_host = 'localhost'; my $db_port = '5432'; my $db_name = 'evergreen'; my $db_user = 'evergreen';