-/************************************************************************/\r
-/* C Implementation: public.search_normalize public.naco_normalize\r
- *\r
- * Descritption:\r
- * This file implement public.search_normalize and public.naco_normalize.\r
- * These two functions is included in the PostgreSQL extension c_functions.\r
- * ICU4C and postgres lib is needed to build this file.\r
- *\r
- * Author: Swenyu Duan <dsy88@sina.com>, (C) 2012\r
- *\r
- * Copyright: See COPYING file that comes with this distribution.\r
- */\r
-/************************************************************************/\r
-#include "postgres.h"\r
-#include "string.h"\r
-#include "fmgr.h"\r
-#include "unicode/unorm2.h"\r
-#include "unicode/utypes.h"\r
-#include "unicode/ustring.h"\r
-#include "unicode/uregex.h"\r
-#include "unicode/umachine.h"\r
-\r
-#ifdef PG_MODULE_MAGIC\r
-PG_MODULE_MAGIC;\r
-#endif\r
-\r
-static int32_t regexp_transliterate(const UChar *search_list,\r
- int32_t search_list_len,\r
- const UChar *replacement_list,\r
- int32_t replacement_list_len,\r
- UChar *src,\r
- int32_t src_len,\r
- UChar *des,\r
- int32_t des_capacity)\r
-{\r
- int i, j;\r
- int32_t des_len;\r
- UChar *cur_pos;\r
-\r
- if (search_list == NULL || \r
- replacement_list == NULL ||\r
- des == NULL ||\r
- src_len > des_capacity + 1)\r
- {\r
- return 0;\r
- }\r
-\r
- des_len = 0;\r
-\r
- for (i = 0; i < src_len; i++)\r
- {\r
- if (des != NULL)\r
- {\r
- des[des_len] = src[i];\r
- }\r
- des_len++;\r
-\r
- for (j = replacement_list_len; j < search_list_len; j++)\r
- {\r
- if (search_list[j] == src[i])\r
- {\r
- des_len--;\r
- break;\r
- }\r
- }\r
- }\r
- if (des == NULL)\r
- {\r
- //To store the tail '\\0'.\r
- return des_len + 1;\r
- }\r
-\r
- des[des_len] = '\0';\r
- \r
- for (i = 0; i < replacement_list_len; i++)\r
- {\r
- cur_pos = u_strchr(des, search_list[i]);\r
-\r
- while (cur_pos != NULL)\r
- {\r
- *cur_pos = replacement_list[i];\r
-\r
- //In case cur_pos is the last char in des.\r
- if (cur_pos >= des + des_len)\r
- {\r
- break;\r
- }\r
- cur_pos = u_strchr(cur_pos + 1, search_list[i]);\r
- }\r
- }\r
-\r
- return des_len;\r
-}\r
-\r
-static int32_t regexp_replace(const UChar *regexp,\r
- int32_t regexp_len,\r
- const UChar *replacement,\r
- int32_t replacement_len,\r
- UChar *src,\r
- int32_t src_len,\r
- UChar *des,\r
- int32_t des_capacity,\r
- int is_global)\r
-{\r
- URegularExpression *regular_exp;\r
- UErrorCode status;\r
- UParseError pe;\r
- int32_t len;\r
-\r
- if (regexp == NULL || replacement == NULL || src == NULL)\r
- {\r
- return 0;\r
- }\r
-\r
- regular_exp = uregex_open(regexp, regexp_len, 0, &pe, &status);\r
- if (regular_exp == NULL)\r
- {\r
- return 0;\r
- }\r
-\r
- uregex_setText(regular_exp, src, src_len, &status);\r
-\r
- if (is_global > 0)\r
- {\r
- len = uregex_replaceAll(regular_exp,\r
- replacement,\r
- replacement_len,\r
- NULL,\r
- 0,\r
- &status);\r
- if (des == NULL || des_capacity < len)\r
- {\r
- uregex_close(regular_exp);\r
- return len;\r
- }\r
-\r
- uregex_replaceAll(regular_exp,\r
- replacement,\r
- replacement_len,\r
- des,\r
- des_capacity,\r
- &status);\r
- }\r
- else\r
- {\r
- len = uregex_replaceFirst(regular_exp,\r
- replacement,\r
- replacement_len,\r
- NULL,\r
- 0,\r
- &status);\r
- if (des == NULL || des_capacity < len)\r
- {\r
- uregex_close(regular_exp);\r
- return len;\r
- }\r
-\r
- uregex_replaceFirst(regular_exp,\r
- replacement,\r
- replacement_len,\r
- des,\r
- des_capacity,\r
- &status);\r
- }\r
- \r
- uregex_close(regular_exp);\r
- return len;\r
-}\r
-\r
-static UChar *u_strtransliterate(UChar *search_list,\r
- UChar *replacement_list,\r
- UChar *str,\r
- int32_t str_capacity)\r
-{\r
- int32_t search_list_len, replacement_list_len, str_len;\r
- UChar *des;\r
- int32_t des_len;\r
-\r
- if (search_list == NULL || replacement_list == NULL || str == NULL)\r
- {\r
- return NULL;\r
- }\r
-\r
- search_list_len = u_strlen(search_list);\r
- replacement_list_len = u_strlen(replacement_list);\r
- str_len = u_strlen(str);\r
-\r
- des_len = regexp_transliterate(search_list, \r
- search_list_len,\r
- replacement_list,\r
- replacement_list_len,\r
- str,\r
- str_len,\r
- NULL,\r
- 0);\r
- des = palloc(des_len * sizeof(UChar));\r
- des_len = regexp_transliterate(search_list, \r
- search_list_len,\r
- replacement_list,\r
- replacement_list_len,\r
- str,\r
- str_len,\r
- des,\r
- des_len);\r
-\r
- pfree(str);\r
- return des;\r
-}\r
-\r
-static UChar *u_strreplace(UChar *regexp,\r
- UChar *replacement,\r
- UChar *str,\r
- int32_t str_capacity,\r
- int is_global)\r
-{\r
- int32_t regexp_len, replacement_len, str_len;\r
- UChar *des;\r
- int32_t des_len;\r
-\r
- if (regexp == NULL || replacement == NULL || str == NULL)\r
- {\r
- return NULL;\r
- }\r
-\r
- regexp_len = u_strlen(regexp);\r
- replacement_len = u_strlen(replacement);\r
- str_len = u_strlen(str);\r
-\r
- des_len = regexp_replace(regexp,\r
- regexp_len,\r
- replacement,\r
- replacement_len,\r
- str,\r
- str_len,\r
- NULL,\r
- 0,\r
- is_global);\r
-\r
- des = palloc(des_len * sizeof(UChar));\r
-\r
- des_len = regexp_replace(regexp,\r
- regexp_len,\r
- replacement,\r
- replacement_len,\r
- str,\r
- str_len,\r
- des,\r
- des_len,\r
- is_global);\r
- pfree(str);\r
- return des;\r
-}\r
-\r
-UChar *additional_substitutions(UChar *nustr, int is_search)\r
-{\r
- char *regexp;\r
- UChar uregexp[200], replacement[200];\r
-\r
- if (nustr == NULL)\r
- {\r
- return NULL;\r
- }\r
-\r
- regexp = "\\x{00C6}";\r
- u_uastrncpy(uregexp, regexp, strlen(regexp));\r
- u_uastrncpy(replacement, "AE", strlen("AE"));\r
- \r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
-\r
- regexp = "\\x{00DE}";\r
- u_uastrncpy(uregexp, regexp, strlen(regexp));\r
- u_uastrncpy(replacement, "TH", strlen("TH"));\r
- \r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
-\r
- regexp = "\\x{0152}";\r
- u_uastrncpy(uregexp, regexp, strlen(regexp));\r
- u_uastrncpy(replacement, "OE", strlen("OE"));\r
-\r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
-\r
- if (is_search)\r
- {\r
- regexp = "\\x{0110}\\x{00D0}\\x{00D8}\\x{0141}\\x{2113}\\x{02BB}\\x{02BC}][";\r
- }\r
- else\r
- {\r
- regexp = "\\x{0110}\\x{00D0}\\x{00D8}\\x{0141}\\x{2113}\\x{02BB}\\x{02BC}]['";\r
- }\r
- \r
- u_uastrncpy(uregexp, regexp, strlen(regexp));\r
- u_uastrncpy(replacement, "OE", strlen("OE"));\r
- \r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
-\r
- regexp = "\\x{0110}\\x{00D0}\\x{00D8}\\x{0141}\\x{2113}\\x{02BB}\\x{02BC}]['";\r
- u_uastrncpy(uregexp, regexp, strlen(regexp));\r
- u_uastrncpy(replacement, "DDOLl", strlen("DDOLl"));\r
- \r
- nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr));\r
-\r
- return nustr;\r
-}\r
-\r
-UChar *transformations_on_unicode(UChar *nustr, UChar *usf)\r
-{\r
- char *regexp;\r
- UChar uregexp[200], replacement[200];\r
- UChar *comma;\r
- int32_t nustr_len;\r
-\r
- if (nustr == NULL)\r
- {\r
- return NULL;\r
- }\r
-\r
- nustr_len = u_strlen(nustr);\r
-\r
- regexp = "[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cs}\\p{Lm}\\p{Mc}\\p{Me}\\p{Mn}]";\r
- u_uastrncpy(uregexp, regexp, strlen(regexp));\r
- u_uastrncpy(replacement, "", strlen(""));\r
- \r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
-\r
- if (usf != NULL && usf[0] == 0x61) //0x61 == 'a' in utf16\r
- {\r
- comma = u_strchr(nustr, 0x2c); //0x2c == ',' in utf16\r
- if (comma != NULL)\r
- {\r
- if (comma != nustr + nustr_len - 1)\r
- {\r
- regexp = ",";\r
- u_uastrncpy(uregexp, regexp, strlen(regexp));\r
- replacement[0] = 0x7;\r
- replacement[1] = 0;\r
-\r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 0);\r
- }\r
- }\r
- }\r
-\r
- return nustr;\r
-}\r
-\r
-UChar *replace_placehoders(UChar *nustr)\r
-{\r
- UChar uregexp[200], replacement[200];\r
-\r
- if (nustr == NULL)\r
- {\r
- return NULL;\r
- }\r
-\r
- u_uastrncpy(uregexp,\r
- "+&@\\x{266D}\\x{266F}#",\r
- strlen( "+&@\\x{266D}\\x{266F}#"));\r
- u_uastrncpy(replacement,\r
- "\\x01\\x02\\x03\\x04\\x05\\x06",\r
- strlen("\\x01\\x02\\x03\\x04\\x05\\x06"));\r
-\r
- nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr));\r
-\r
- u_uastrncpy(uregexp,\r
- "[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\p{Pi}\\p{Po}\\p{Ps}\\p{Sk}\\p{Sm}\\p{So}\\p{Zl}\\p{Zp}\\p{Zs}]",\r
- strlen( "[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\\\p{Pi}\\p{Po}\\p{Ps}\\p{Sk}\\p{Sm}\\p{So}\\p{Zl}\\p{Zp}\\p{Zs}]"));\r
- u_uastrncpy(replacement, "", strlen(""));\r
-\r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
-\r
- u_uastrncpy(uregexp,\r
- "\\x01\\x02\\x03\\x04\\x05\\x06\\x07",\r
- strlen( "\\x01\\x02\\x03\\x04\\x05\\x06\\x07"));\r
- u_uastrncpy(replacement,\r
- "+&@\\x{266D}\\x{266F}#,",\r
- strlen("+&@\\x{266D}\\x{266F}#,"));\r
-\r
- nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr));\r
-\r
-\r
- return nustr;\r
-}\r
-\r
-UChar *decimal_digits(UChar *nustr)\r
-{\r
- UChar uregexp[300], replacement[300];\r
-\r
- if (nustr == NULL)\r
- {\r
- return NULL;\r
- }\r
- \r
- u_uastrncpy(uregexp,\r
- "\\x{0660}-\\x{0669}\\x{06F0}-\\x{06F9}\\x{07C0}-\\x{07C9}\\x{0966}-\\x{096F}"\r
- "\\x{09E6}-\\x{09EF}\\x{0A66}-\\x{0A6F}\\x{0AE6}-\\x{0AEF}\\x{0B66}-\\x{0B6F}"\r
- "\\x{0BE6}-\\x{0BEF}\\x{0C66}-\\x{0C6F}\\x{0CE6}-\\x{0CEF}\\x{0D66}-\\x{0D6F}"\r
- "\\x{0E50}-\\x{0E59}\\x{0ED0}-\\x{0ED9}\\x{0F20}-\\x{0F29}\\x{1040}-\\x{1049}"\r
- "\\x{1090}-\\x{1099}\\x{17E0}-\\x{17E9}\\x{1810}-\\x{1819}\\x{1946}-\\x{194F}"\r
- "\\x{19D0}-\\x{19D9}\\x{1A80}-\\x{1A89}\\x{1A90}-\\x{1A99}\\x{1B50}-\\x{1B59}"\r
- "\\x{1BB0}-\\x{1BB9}\\x{1C40}-\\x{1C49}\\x{1C50}-\\x{1C59}\\x{A620}-\\x{A629}"\r
- "\\x{A8D0}-\\x{A8D9}\\x{A900}-\\x{A909}\\x{A9D0}-\\x{A9D9}\\x{AA50}-\\x{AA59}"\r
- "\\x{ABF0}-\\x{ABF9}\\x{FF10}-\\x{FF19}",\r
- strlen("\\x{0660}-\\x{0669}\\x{06F0}-\\x{06F9}\\x{07C0}-\\x{07C9}\\x{0966}-\\x{096F}"\r
- "\\x{09E6}-\\x{09EF}\\x{0A66}-\\x{0A6F}\\x{0AE6}-\\x{0AEF}\\x{0B66}-\\x{0B6F}"\r
- "\\x{0BE6}-\\x{0BEF}\\x{0C66}-\\x{0C6F}\\x{0CE6}-\\x{0CEF}\\x{0D66}-\\x{0D6F}"\r
- "\\x{0E50}-\\x{0E59}\\x{0ED0}-\\x{0ED9}\\x{0F20}-\\x{0F29}\\x{1040}-\\x{1049}"\r
- "\\x{1090}-\\x{1099}\\x{17E0}-\\x{17E9}\\x{1810}-\\x{1819}\\x{1946}-\\x{194F}"\r
- "\\x{19D0}-\\x{19D9}\\x{1A80}-\\x{1A89}\\x{1A90}-\\x{1A99}\\x{1B50}-\\x{1B59}"\r
- "\\x{1BB0}-\\x{1BB9}\\x{1C40}-\\x{1C49}\\x{1C50}-\\x{1C59}\\x{A620}-\\x{A629}"\r
- "\\x{A8D0}-\\x{A8D9}\\x{A900}-\\x{A909}\\x{A9D0}-\\x{A9D9}\\x{AA50}-\\x{AA59}"\r
- "\\x{ABF0}-\\x{ABF9}\\x{FF10}-\\x{FF19}"));\r
-\r
- u_uastrncpy(replacement,\r
- "0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9",\r
- strlen("0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9"));\r
-\r
- nustr = u_strtransliterate(uregexp, replacement, nustr, u_strlen(nustr));\r
-\r
- return nustr;\r
-}\r
-\r
-UChar *leading_trailing_spaces(UChar * nustr)\r
-{\r
- UChar uregexp[200], replacement[200];\r
-\r
- if (nustr == NULL)\r
- {\r
- return NULL;\r
- }\r
-\r
- u_uastrncpy(uregexp, "\\s+", strlen( "\\s+"));\r
- u_uastrncpy(replacement, " ", strlen(" "));\r
-\r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
- \r
- u_uastrncpy(uregexp, "^\\s+", strlen( "^\\s+"));\r
- u_uastrncpy(replacement, "", strlen(""));\r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 0);\r
-\r
- u_uastrncpy(uregexp, "\\s+$", strlen( "\\s+$"));\r
-\r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
-\r
- return nustr;\r
-\r
-}\r
-\r
-text *normalize(text *str, text *sf, int is_search)\r
-{\r
- UChar *ustr, *nustr, *temp, *usf;\r
- int32_t nustr_len, temp_len, str_len;\r
- UNormalizer2 *normalizer;\r
- char *regexp, *result;\r
- UChar uregexp[200], replacement[200];\r
- UErrorCode err;\r
-\r
- if (str == NULL || sf == NULL)\r
- {\r
- return NULL;\r
- }\r
-\r
- normalizer = (UNormalizer2 *)unorm2_getNFKDInstance(&err);\r
- if (U_FAILURE(err))\r
- {\r
- return NULL;\r
- }\r
-\r
- ustr = palloc(VARSIZE(str) * sizeof(UChar));\r
- nustr = palloc(VARSIZE(str)* sizeof(UChar));\r
- temp = palloc(VARSIZE(str) * sizeof(UChar));\r
- usf = palloc(VARSIZE(sf) * sizeof(UChar));\r
-\r
- temp = u_strFromUTF8(temp, VARSIZE(str), NULL, VARDATA(str), VARSIZE(str), &err);\r
- if (U_FAILURE(err) || temp == NULL)\r
- {\r
- str = NULL;\r
- goto Fail;\r
- }\r
-\r
- nustr = u_strncpy(nustr, temp, VARSIZE(str));\r
- if (nustr == NULL)\r
- {\r
- str = NULL;\r
- goto Fail;\r
- }\r
-\r
- usf = u_strFromUTF8(usf, VARSIZE(sf), NULL, VARDATA(sf), VARSIZE(sf), &err);\r
- if (usf == NULL)\r
- {\r
- str = NULL;\r
- goto Fail;\r
- }\r
- //Apply NACO normalization to input string; based on\r
- //http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf\r
- //\r
- //Note that unlike a strict reading of the NACO normalization rules,\r
- //output is returned as lowercase instead of uppercase for compatibility\r
- //with previous versions of the Evergreen naco_normalize routine.\r
- //\r
- //Convert to upper-case first; even though final output will be lowercase, doing this will\r
- //ensure that the German eszett (?) and certain ligatures (?, ?, ?, etc.) will be handled correctly.\r
- //If there are any bugs in Perl's implementation of upcasing, they will be passed through here.\r
-\r
- u_strToUpper(ustr, \r
- VARSIZE(str),\r
- temp,\r
- VARSIZE(str),\r
- NULL,\r
- &err);\r
- pfree(temp);\r
- temp = NULL;\r
- if (U_FAILURE(err))\r
- {\r
- str = NULL;\r
- goto Fail;\r
- }\r
-\r
- regexp = "\\x{0098}.*?\\x{009C}";\r
- u_uastrncpy(uregexp, regexp, strlen(regexp));\r
- u_uastrncpy(replacement, "", strlen(""));\r
-\r
- ustr = u_strreplace(uregexp, replacement, ustr, u_strlen(ustr), 1);\r
-\r
- unorm2_normalize(normalizer, ustr, VARSIZE(str), nustr, VARSIZE(str), &err);\r
- if (U_FAILURE(err))\r
- {\r
- str = NULL;\r
- goto Fail;\r
- }\r
-\r
- //additional substitutions - 3.6.\r
- nustr = additional_substitutions(nustr, is_search);\r
-\r
- //transformations based on Unicode category codes\r
- nustr = transformations_on_unicode(nustr, usf);\r
-\r
- //since we've stripped out the control characters, we can now\r
- //use a few as placeholders temporarily\r
- nustr = replace_placehoders(nustr);\r
-\r
- //decimal digit\r
- nustr = decimal_digits(nustr);\r
-\r
- //intentionally skipping step 8 of the NACO algorithm; if the string\r
- //gets normalized away, that's fine.\r
-\r
- //leading and trailing spaces\r
- nustr = leading_trailing_spaces(nustr);\r
- nustr_len = u_strlen(nustr);\r
-\r
- temp = palloc(nustr_len * sizeof(UChar));\r
- u_strToLower(temp, nustr_len, nustr, nustr_len, NULL, &err);\r
- temp_len = nustr_len;\r
-\r
-\r
- u_strToUTF8(NULL, 0, &str_len, temp, temp_len, &err);\r
-\r
- SET_VARSIZE(str, str_len + VARHDRSZ);\r
- str = (text *)palloc(str_len + VARHDRSZ);\r
- \r
- result = u_strToUTF8(VARDATA(str), str_len, &str_len, temp, temp_len, &err); \r
-\r
- result[str_len] = '\0';\r
-\r
- \r
-Fail:\r
- pfree(temp);\r
- pfree(ustr);\r
- pfree(nustr);\r
-\r
- return str;\r
-}\r
-\r
-PG_FUNCTION_INFO_V1(naco_normalize);\r
-\r
-Datum naco_normalize(PG_FUNCTION_ARGS)\r
-{\r
- text *str = PG_GETARG_TEXT_P(0);\r
- text *sf = PG_GETARG_TEXT_P(1);\r
- \r
-\r
- if (str == NULL || sf == NULL)\r
- {\r
- PG_RETURN_TEXT_P(NULL);\r
- }\r
-\r
- str = normalize(str, sf, 0);\r
- \r
- PG_RETURN_TEXT_P(str);\r
-}\r
-\r
-PG_FUNCTION_INFO_V1(search_normalize);\r
-\r
-Datum search_normalize(PG_FUNCTION_ARGS)\r
-{\r
- text *str = PG_GETARG_TEXT_P(0);\r
- text *sf = PG_GETARG_TEXT_P(1);\r
-\r
-\r
- if (str == NULL || sf == NULL)\r
- {\r
- PG_RETURN_TEXT_P(NULL);\r
- }\r
-\r
- str = normalize(str, sf, 1);\r
-\r
- PG_RETURN_TEXT_P(str);\r
-}\r
-\r
+/************************************************************************/
+/* C Implementation: public.search_normalize public.naco_normalize
+ *
+ * Description:
+ * This file implements public.search_normalize and public.naco_normalize.
+ * These two functions are included in the PostgreSQL extension c_functions.
+ * ICU4C and the PostgreSQL libraries are needed to build this file.
+ *
+ * Author: Swenyu Duan <dsy88@sina.com>, (C) 2012
+ *
+ * Copyright: See COPYING file that comes with this distribution.
+ */
+/************************************************************************/
+#include "postgres.h"
+#include "fmgr.h"
+#include "unicode/unorm2.h"
+#include "unicode/utypes.h"
+#include "unicode/ustring.h"
+#include "unicode/uregex.h"
+#include "unicode/umachine.h"
+
+#ifdef PG_MODULE_MAGIC
+PG_MODULE_MAGIC;
+#endif
+
+/*
+ * Expand "a-e" style ranges in src into the explicit character list
+ * ("abcde"); every other code unit is copied through unchanged.
+ *
+ * src          - list to expand; reading stops at the first NUL or after
+ *                src_len code units, whichever comes first, so src_len may
+ *                or may not count a terminator.
+ * des          - destination, or NULL to only measure.
+ * des_capacity - number of code units des can hold; nothing is written
+ *                past it.
+ *
+ * Returns the number of code units the full expansion needs (no terminator
+ * is written or counted), 0 when src is NULL, or -1 when a range has no end
+ * character or its end sorts before its start.
+ */
+static int32_t regexp_expand_string(const UChar *src,
+        int32_t src_len,
+        UChar *des,
+        int32_t des_capacity)
+{
+    int32_t len = 0;
+    int32_t des_len = 0;
+    int32_t i = 0;
+    int32_t c;
+
+    if (src == NULL)
+    {
+        return 0;
+    }
+
+    while (len < src_len && src[len] != 0)
+    {
+        len++;
+    }
+
+    while (i < len)
+    {
+        if (i + 1 < len && src[i + 1] == 0x002D)    /* '-' */
+        {
+            if (i + 2 >= len || src[i + 2] < src[i])
+            {
+                return -1;
+            }
+
+            /*
+             * Both ends of the range are emitted here.  The counter is
+             * wider than UChar so a range ending at 0xFFFF terminates.
+             */
+            for (c = src[i]; c <= src[i + 2]; c++)
+            {
+                if (des != NULL && des_len < des_capacity)
+                {
+                    des[des_len] = (UChar) c;
+                }
+                des_len++;
+            }
+            i += 3;
+        }
+        else
+        {
+            if (des != NULL && des_len < des_capacity)
+            {
+                des[des_len] = src[i];
+            }
+            des_len++;
+            i++;
+        }
+    }
+
+    return des_len;
+}
+
+/*
+ * Map the characters of src through a pair of parallel lists.
+ *
+ * A character found at index j of search_list is replaced by
+ * replacement_list[j] when j < replacement_list_len, and dropped from the
+ * output when search_list has no counterpart at that index.  Characters not
+ * in search_list are copied unchanged.  When a character occurs more than
+ * once in search_list the first occurrence decides.
+ *
+ * des may be NULL to only measure; otherwise at most des_capacity code
+ * units are written.  No terminator is written; the caller adds it.
+ *
+ * Returns the number of code units the full result needs, or 0 when
+ * search_list, replacement_list or src is NULL.
+ */
+static int32_t regexp_transliterate(const UChar *search_list,
+        int32_t search_list_len,
+        const UChar *replacement_list,
+        int32_t replacement_list_len,
+        UChar *src,
+        int32_t src_len,
+        UChar *des,
+        int32_t des_capacity)
+{
+    int32_t i, j;
+    int32_t des_len = 0;
+
+    if (search_list == NULL || replacement_list == NULL || src == NULL)
+    {
+        return 0;
+    }
+
+    for (i = 0; i < src_len; i++)
+    {
+        UChar out = src[i];
+
+        for (j = 0; j < search_list_len; j++)
+        {
+            if (search_list[j] == src[i])
+            {
+                break;
+            }
+        }
+
+        if (j < search_list_len)
+        {
+            if (j >= replacement_list_len)
+            {
+                /* Listed but without a replacement: delete it. */
+                continue;
+            }
+            out = replacement_list[j];
+        }
+
+        /*
+         * Each input character is mapped exactly once, so a replacement
+         * that also appears in search_list is never mapped a second time.
+         */
+        if (des != NULL && des_len < des_capacity)
+        {
+            des[des_len] = out;
+        }
+        des_len++;
+    }
+
+    return des_len;
+}
+
+/*
+ * Run an ICU regular-expression replacement of regexp by replacement over
+ * src.
+ *
+ * is_global > 0 replaces every match (uregex_replaceAll); otherwise only
+ * the first match is replaced (uregex_replaceFirst).
+ *
+ * The result length is always determined first with a NULL destination.
+ * des is filled only when it is non-NULL and des_capacity is at least that
+ * length; ICU adds a terminator only if des_capacity leaves room for one.
+ *
+ * Returns the length of the replaced text in code units, or 0 when an
+ * argument is NULL, the pattern cannot be compiled, or ICU reports an
+ * error.
+ */
+static int32_t regexp_replace(const UChar *regexp,
+        int32_t regexp_len,
+        const UChar *replacement,
+        int32_t replacement_len,
+        UChar *src,
+        int32_t src_len,
+        UChar *des,
+        int32_t des_capacity,
+        int is_global)
+{
+    URegularExpression *regular_exp;
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError pe;
+    int32_t len;
+
+    if (regexp == NULL || replacement == NULL || src == NULL)
+    {
+        return 0;
+    }
+
+    regular_exp = uregex_open(regexp, regexp_len, 0, &pe, &status);
+    if (regular_exp == NULL)
+    {
+        return 0;
+    }
+    if (U_FAILURE(status))
+    {
+        uregex_close(regular_exp);
+        return 0;
+    }
+
+    uregex_setText(regular_exp, src, src_len, &status);
+    if (U_FAILURE(status))
+    {
+        uregex_close(regular_exp);
+        return 0;
+    }
+
+    /* Measuring pass: a buffer-overflow status is the expected outcome. */
+    if (is_global > 0)
+    {
+        len = uregex_replaceAll(regular_exp, replacement, replacement_len,
+                                NULL, 0, &status);
+    }
+    else
+    {
+        len = uregex_replaceFirst(regular_exp, replacement, replacement_len,
+                                  NULL, 0, &status);
+    }
+    if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
+    {
+        uregex_close(regular_exp);
+        return 0;
+    }
+
+    if (des != NULL && des_capacity >= len)
+    {
+        /* ICU functions do nothing when entered with a failure status. */
+        status = U_ZERO_ERROR;
+        if (is_global > 0)
+        {
+            uregex_replaceAll(regular_exp, replacement, replacement_len,
+                              des, des_capacity, &status);
+        }
+        else
+        {
+            uregex_replaceFirst(regular_exp, replacement, replacement_len,
+                                des, des_capacity, &status);
+        }
+        if (U_FAILURE(status))
+        {
+            len = 0;
+        }
+    }
+
+    uregex_close(regular_exp);
+    return len;
+}
+
+/*
+ * Run regexp_transliterate() over a NUL-terminated string and hand back the
+ * result in a newly palloc'd, NUL-terminated buffer.
+ *
+ * Returns NULL when search_list, replacement_list or str is NULL.
+ * Otherwise str is pfree'd before returning, so the caller must continue
+ * with the returned pointer.  str_capacity is not used.
+ */
+static UChar *u_strtransliterate(UChar *search_list,
+        UChar *replacement_list,
+        UChar *str,
+        int32_t str_capacity)
+{
+    int32_t from_len;
+    int32_t to_len;
+    int32_t in_len;
+    int32_t out_len;
+    UChar *out;
+
+    if (search_list == NULL || replacement_list == NULL || str == NULL)
+    {
+        return NULL;
+    }
+
+    from_len = u_strlen(search_list);
+    to_len = u_strlen(replacement_list);
+    in_len = u_strlen(str);
+
+    /* First call has no destination: it only reports the needed length. */
+    out_len = regexp_transliterate(search_list, from_len,
+                                   replacement_list, to_len,
+                                   str, in_len,
+                                   NULL, 0);
+
+    /* One extra code unit for the terminator written below. */
+    out = palloc((out_len + 1) * sizeof(UChar));
+
+    out_len = regexp_transliterate(search_list, from_len,
+                                   replacement_list, to_len,
+                                   str, in_len,
+                                   out, out_len);
+    if (out != NULL)
+    {
+        out[out_len] = '\0';
+    }
+
+    pfree(str);
+    return out;
+}
+
+/*
+ * Run regexp_replace() over a NUL-terminated string and hand back the
+ * result in a newly palloc'd, NUL-terminated buffer.
+ *
+ * is_global is passed through to regexp_replace().  Returns NULL when
+ * regexp, replacement or str is NULL.  Otherwise str is pfree'd before
+ * returning, so the caller must continue with the returned pointer.
+ * str_capacity is not used.
+ */
+static UChar *u_strreplace(UChar *regexp,
+        UChar *replacement,
+        UChar *str,
+        int32_t str_capacity,
+        int is_global)
+{
+    int32_t pattern_len;
+    int32_t subst_len;
+    int32_t in_len;
+    int32_t out_len;
+    UChar *out;
+
+    if (regexp == NULL || replacement == NULL || str == NULL)
+    {
+        return NULL;
+    }
+
+    pattern_len = u_strlen(regexp);
+    subst_len = u_strlen(replacement);
+    in_len = u_strlen(str);
+
+    /* First call has no destination: it only reports the needed length. */
+    out_len = regexp_replace(regexp, pattern_len,
+                             replacement, subst_len,
+                             str, in_len,
+                             NULL, 0,
+                             is_global);
+
+    /* One extra code unit for the terminator written below. */
+    out = palloc((out_len + 1) * sizeof(UChar));
+
+    out_len = regexp_replace(regexp, pattern_len,
+                             replacement, subst_len,
+                             str, in_len,
+                             out, out_len,
+                             is_global);
+    if (out != NULL)
+    {
+        out[out_len] = '\0';
+    }
+
+    pfree(str);
+    return out;
+}
+
+/*
+ * Apply the fixed letter substitutions: U+00C6 -> "AE", U+00DE -> "TH",
+ * U+0152 -> "OE", followed by a per-character mapping of
+ * U+0110 U+00D0 U+00D8 U+0141 U+2113 onto "DDOLl".
+ *
+ * The remaining entries of the search list (U+02BB, U+02BC, ']', '[' and,
+ * unless is_search is set, the apostrophe) have no replacement character
+ * and are meant to be dropped by u_strtransliterate().
+ *
+ * nustr is consumed (each step frees its input); the returned buffer
+ * replaces it.  Returns NULL when nustr is NULL.
+ */
+static UChar *additional_substitutions(UChar *nustr, int is_search)
+{
+    enum { APOSTROPHE_INDEX = 9 };
+    UChar uregexp[200], replacement[200];
+    /*
+     * Spelled out as UChar values: a wide (L"...") literal only has the
+     * same element size as UChar on platforms with a 16-bit wchar_t.
+     */
+    UChar search_list[] = {
+        0x0110, 0x00D0, 0x00D8, 0x0141, 0x2113,    /* paired with D D O L l */
+        0x02BB, 0x02BC, 0x005D, 0x005B,            /* no replacement: ']' '[' and two modifier letters */
+        0x0027,                                    /* apostrophe */
+        0
+    };
+
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    u_uastrncpy(uregexp, "\\x{00C6}", sizeof("\\x{00C6}"));
+    u_uastrncpy(replacement, "AE", sizeof("AE"));
+
+    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
+
+    u_uastrncpy(uregexp, "\\x{00DE}", sizeof("\\x{00DE}"));
+    u_uastrncpy(replacement, "TH", sizeof("TH"));
+
+    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
+
+    u_uastrncpy(uregexp, "\\x{0152}", sizeof("\\x{0152}"));
+    u_uastrncpy(replacement, "OE", sizeof("OE"));
+
+    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
+
+    if (is_search)
+    {
+        /* The search variant leaves apostrophes in place. */
+        search_list[APOSTROPHE_INDEX] = 0;
+    }
+    u_uastrncpy(replacement, "DDOLl", sizeof("DDOLl"));
+
+    nustr = u_strtransliterate(search_list, replacement, nustr, u_strlen(nustr));
+
+    return nustr;
+}
+
+/*
+ * Strip every character in the Unicode general categories Cc, Cf, Co, Cs,
+ * Lm, Mc, Me and Mn.  Then, when usf starts with 'a', replace the first
+ * comma by the placeholder 0x07 unless that comma is the last character of
+ * the stripped string.
+ *
+ * nustr is consumed (each step frees its input); the returned buffer
+ * replaces it.  Returns NULL when nustr is NULL.
+ */
+static UChar *transformations_on_unicode(UChar *nustr, UChar *usf)
+{
+    UChar uregexp[200], replacement[200];
+    UChar *comma;
+    int32_t nustr_len;
+
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    u_uastrncpy(uregexp,
+            "[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cs}\\p{Lm}\\p{Mc}\\p{Me}\\p{Mn}]",
+            sizeof("[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cs}\\p{Lm}\\p{Mc}\\p{Me}\\p{Mn}]"));
+    u_uastrncpy(replacement, "", sizeof(""));
+
+    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
+
+    if (usf != NULL && usf[0] == 0x61)    /* 0x61 == 'a' in UTF-16 */
+    {
+        /*
+         * Measure only now: the strip above may have shortened the string,
+         * and the "last character" test must refer to what is left.
+         */
+        nustr_len = u_strlen(nustr);
+
+        comma = u_strchr(nustr, 0x2c);    /* 0x2c == ',' in UTF-16 */
+        if (comma != NULL && comma != nustr + nustr_len - 1)
+        {
+            u_uastrncpy(uregexp, ",", sizeof(","));
+            replacement[0] = 0x7;
+            replacement[1] = 0;
+
+            nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 0);
+        }
+    }
+
+    return nustr;
+}
+
+/*
+ * Replace punctuation, symbols and separators by spaces while keeping
+ * '+', '&', '@', U+266D, U+266F and '#'.
+ *
+ * The kept characters are first swapped for the placeholders 0x01..0x06,
+ * then every character in the categories named in the pattern below
+ * becomes a space, and finally the placeholders 0x01..0x07 are swapped
+ * back, 0x07 becoming ','.
+ *
+ * nustr is consumed (each step frees its input); the returned buffer
+ * replaces it.  Returns NULL when nustr is NULL.
+ */
+static UChar *replace_placehoders(UChar *nustr)
+{
+    UChar uregexp[200], replacement[200];
+    /*
+     * Spelled out as UChar values: a wide (L"...") literal only has the
+     * same element size as UChar on platforms with a 16-bit wchar_t.
+     */
+    UChar kept_symbols[] = {
+        0x002B, 0x0026, 0x0040, 0x266D, 0x266F, 0x0023, 0    /* + & @ U+266D U+266F # */
+    };
+    UChar kept_placeholders[] = {
+        0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0
+    };
+    UChar restore_from[] = {
+        0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0
+    };
+    UChar restore_to[] = {
+        0x002B, 0x0026, 0x0040, 0x266D, 0x266F, 0x0023, 0x002C, 0    /* ... and ',' */
+    };
+
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    nustr = u_strtransliterate(kept_symbols, kept_placeholders, nustr, u_strlen(nustr));
+
+    u_uastrncpy(uregexp,
+            "[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\p{Pi}\\p{Po}\\p{Ps}\\p{Sk}\\p{Sm}\\p{So}\\p{Zl}\\p{Zp}\\p{Zs}]",
+            sizeof("[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\p{Pi}\\p{Po}\\p{Ps}\\p{Sk}\\p{Sm}\\p{So}\\p{Zl}\\p{Zp}\\p{Zs}]"));
+    u_uastrncpy(replacement, " ", sizeof(" "));
+
+    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
+
+    nustr = u_strtransliterate(restore_from, restore_to, nustr, u_strlen(nustr));
+
+    return nustr;
+}
+
+
+/*
+ * Map the decimal digits of the scripts listed in zero_digits onto the
+ * ASCII digits '0'..'9'.
+ *
+ * The search and replacement lists are first built in range form
+ * ("X-Y" against "0-9", once per script), expanded with
+ * regexp_expand_string(), and then applied with u_strtransliterate().
+ *
+ * nustr is consumed by u_strtransliterate(); the returned buffer replaces
+ * it.  Returns NULL when nustr is NULL, and nustr unchanged when the range
+ * lists cannot be expanded.
+ */
+static UChar *decimal_digits(UChar *nustr)
+{
+    /* Code point of the digit zero of each script; its digits are contiguous. */
+    static const UChar zero_digits[] = {
+        0x0660, 0x06F0, 0x07C0, 0x0966, 0x09E6, 0x0A66, 0x0AE6, 0x0B66,
+        0x0BE6, 0x0C66, 0x0CE6, 0x0D66, 0x0E50, 0x0ED0, 0x0F20, 0x1040,
+        0x1090, 0x17E0, 0x1810, 0x1946, 0x19D0, 0x1A80, 0x1A90, 0x1B50,
+        0x1BB0, 0x1C40, 0x1C50, 0xA620, 0xA8D0, 0xA900, 0xA9D0, 0xAA50,
+        0xABF0, 0xFF10
+    };
+    enum
+    {
+        SCRIPT_COUNT = sizeof(zero_digits) / sizeof(zero_digits[0]),
+        RANGE_UNITS = 3,    /* first, '-', last */
+        RANGE_LIST_LEN = SCRIPT_COUNT * RANGE_UNITS
+    };
+    /*
+     * Built as UChar values rather than from wide (L"...") literals, which
+     * only match UChar on platforms with a 16-bit wchar_t.
+     */
+    UChar range_exp[RANGE_LIST_LEN + 1];
+    UChar range_replacement[RANGE_LIST_LEN + 1];
+    UChar *expand_exp, *expand_replacement;
+    int32_t expand_exp_len, expand_replacement_len;
+    int32_t i;
+
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    for (i = 0; i < SCRIPT_COUNT; i++)
+    {
+        range_exp[RANGE_UNITS * i] = zero_digits[i];
+        range_exp[RANGE_UNITS * i + 1] = 0x002D;                /* '-' */
+        range_exp[RANGE_UNITS * i + 2] = (UChar) (zero_digits[i] + 9);
+
+        range_replacement[RANGE_UNITS * i] = 0x0030;            /* '0' */
+        range_replacement[RANGE_UNITS * i + 1] = 0x002D;        /* '-' */
+        range_replacement[RANGE_UNITS * i + 2] = 0x0039;        /* '9' */
+    }
+    range_exp[RANGE_LIST_LEN] = 0;
+    range_replacement[RANGE_LIST_LEN] = 0;
+
+    /* Measuring calls: no destination, only the expanded lengths. */
+    expand_exp_len = regexp_expand_string(range_exp, RANGE_LIST_LEN, NULL, 0);
+    expand_replacement_len = regexp_expand_string(range_replacement, RANGE_LIST_LEN, NULL, 0);
+    if (expand_exp_len < 0 || expand_replacement_len < 0)
+    {
+        return nustr;
+    }
+
+    expand_exp = palloc(sizeof(UChar) * (expand_exp_len + 1));
+    expand_replacement = palloc(sizeof(UChar) * (expand_replacement_len + 1));
+
+    regexp_expand_string(range_exp, RANGE_LIST_LEN, expand_exp, expand_exp_len);
+    regexp_expand_string(range_replacement, RANGE_LIST_LEN, expand_replacement, expand_replacement_len);
+
+    expand_exp[expand_exp_len] = '\0';
+    expand_replacement[expand_replacement_len] = '\0';
+
+    nustr = u_strtransliterate(expand_exp, expand_replacement, nustr, u_strlen(nustr));
+
+    pfree(expand_exp);
+    pfree(expand_replacement);
+    return nustr;
+}
+
+/*
+ * Tidy up whitespace with three regular-expression passes, in order:
+ * every run of whitespace becomes a single space, whitespace at the start
+ * is removed, and whitespace at the end is removed.
+ *
+ * nustr is consumed (each pass frees its input); the returned buffer
+ * replaces it.  Returns NULL when nustr is NULL.
+ */
+static UChar *leading_trailing_spaces(UChar * nustr)
+{
+    static const struct
+    {
+        const char *pattern;
+        int32_t pattern_size;       /* sizeof the literal, terminator included */
+        const char *substitute;
+        int32_t substitute_size;    /* sizeof the literal, terminator included */
+        int is_global;
+    } passes[] = {
+        { "\\s+",  sizeof("\\s+"),  " ", sizeof(" "), 1 },
+        { "^\\s+", sizeof("^\\s+"), "",  sizeof(""),  0 },
+        { "\\s+$", sizeof("\\s+$"), "",  sizeof(""),  1 }
+    };
+    UChar pattern[20], substitute[20];
+    int pass;
+
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    for (pass = 0; pass < (int) (sizeof(passes) / sizeof(passes[0])); pass++)
+    {
+        u_uastrncpy(pattern, passes[pass].pattern, passes[pass].pattern_size);
+        u_uastrncpy(substitute, passes[pass].substitute, passes[pass].substitute_size);
+
+        nustr = u_strreplace(pattern, substitute, nustr, u_strlen(nustr),
+                             passes[pass].is_global);
+    }
+
+    return nustr;
+}
+
+/*
+ * Apply NACO normalization to str; based on
+ * http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf
+ *
+ * Unlike a strict reading of the NACO rules, the output is lower case.
+ *
+ * str       - UTF-8 text to normalize.
+ * sf        - UTF-8 text handed to transformations_on_unicode(), which
+ *             looks at its first character.
+ * is_search - passed to additional_substitutions() to select the
+ *             search variant.
+ *
+ * Returns a newly palloc'd text value, or NULL when an argument is NULL or
+ * an ICU call fails.
+ */
+text *normalize(text *str, text *sf, int is_search)
+{
+    UChar *ustr = NULL, *nustr = NULL, *temp = NULL, *usf = NULL;
+    int32_t nustr_len, temp_len, str_len, sf_len, ustr_len, out_len;
+    const UNormalizer2 *normalizer;
+    UChar uregexp[200], replacement[200];
+    UErrorCode err = U_ZERO_ERROR;
+    text *result = NULL;
+
+    if (str == NULL || sf == NULL)
+    {
+        return NULL;
+    }
+
+    normalizer = unorm2_getNFKDInstance(&err);
+    if (U_FAILURE(err))
+    {
+        return NULL;
+    }
+
+    str_len = VARSIZE(str) - VARHDRSZ;
+    sf_len = VARSIZE(sf) - VARHDRSZ;
+
+    /*
+     * A UTF-16 string never has more code units than its UTF-8 form has
+     * bytes, so byte length + 1 always leaves room for the terminator.
+     */
+    temp = palloc((str_len + 1) * sizeof(UChar));
+    usf = palloc((sf_len + 1) * sizeof(UChar));
+
+    /* Keep the converted length: it differs from str_len for non-ASCII input. */
+    u_strFromUTF8(temp, str_len + 1, &temp_len, VARDATA(str), str_len, &err);
+    if (U_FAILURE(err))
+    {
+        goto Fail;
+    }
+
+    u_strFromUTF8(usf, sf_len + 1, NULL, VARDATA(sf), sf_len, &err);
+    if (U_FAILURE(err))
+    {
+        goto Fail;
+    }
+
+    /*
+     * Convert to upper case first; even though the final output is lower
+     * case, this lets the case mapping expand characters such as the
+     * German eszett and certain ligatures before the later steps run.
+     * The first call only measures, so a buffer-overflow status is expected.
+     */
+    err = U_ZERO_ERROR;
+    ustr_len = u_strToUpper(NULL, 0, temp, temp_len, NULL, &err);
+    if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR)
+    {
+        goto Fail;
+    }
+
+    err = U_ZERO_ERROR;
+    ustr = palloc((ustr_len + 1) * sizeof(UChar));
+    u_strToUpper(ustr, ustr_len + 1, temp, temp_len, NULL, &err);
+    if (U_FAILURE(err))
+    {
+        goto Fail;
+    }
+    pfree(temp);
+    temp = NULL;
+
+    /* Remove everything between U+0098 and U+009C, markers included. */
+    u_uastrncpy(uregexp,
+            "\\x{0098}.*?\\x{009C}",
+            sizeof("\\x{0098}.*?\\x{009C}"));
+    u_uastrncpy(replacement, "", sizeof(""));
+
+    ustr = u_strreplace(uregexp, replacement, ustr, u_strlen(ustr), 1);
+    ustr_len = u_strlen(ustr);
+
+    /* NFKD: measure, then normalize into a buffer with room for the terminator. */
+    err = U_ZERO_ERROR;
+    nustr_len = unorm2_normalize(normalizer, ustr, ustr_len, NULL, 0, &err);
+    if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR)
+    {
+        goto Fail;
+    }
+
+    err = U_ZERO_ERROR;
+    nustr = palloc((nustr_len + 1) * sizeof(UChar));
+    unorm2_normalize(normalizer, ustr, ustr_len, nustr, nustr_len + 1, &err);
+    if (U_FAILURE(err))
+    {
+        goto Fail;
+    }
+    nustr[nustr_len] = '\0';
+
+    /* Each step below frees the buffer it is given and returns a new one. */
+
+    /* additional substitutions - 3.6. */
+    nustr = additional_substitutions(nustr, is_search);
+
+    /* transformations based on Unicode category codes */
+    nustr = transformations_on_unicode(nustr, usf);
+
+    /*
+     * since the control characters have been stripped, a few of them can
+     * now serve as temporary placeholders
+     */
+    nustr = replace_placehoders(nustr);
+
+    /* decimal digits */
+    nustr = decimal_digits(nustr);
+
+    /*
+     * intentionally skipping step 8 of the NACO algorithm; if the string
+     * gets normalized away, that's fine.
+     */
+
+    /* leading and trailing spaces */
+    nustr = leading_trailing_spaces(nustr);
+    nustr_len = u_strlen(nustr);
+
+    /* Lower-casing may change the length, so measure before allocating. */
+    err = U_ZERO_ERROR;
+    temp_len = u_strToLower(NULL, 0, nustr, nustr_len, NULL, &err);
+    if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR)
+    {
+        goto Fail;
+    }
+
+    err = U_ZERO_ERROR;
+    temp = palloc((temp_len + 1) * sizeof(UChar));
+    u_strToLower(temp, temp_len + 1, nustr, nustr_len, NULL, &err);
+    if (U_FAILURE(err))
+    {
+        goto Fail;
+    }
+
+    err = U_ZERO_ERROR;
+    u_strToUTF8(NULL, 0, &out_len, temp, temp_len, &err);
+    if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR)
+    {
+        goto Fail;
+    }
+
+    /*
+     * The text payload is exactly out_len bytes and carries no terminator;
+     * nothing may be written past it.
+     */
+    err = U_ZERO_ERROR;
+    result = (text *) palloc(out_len + VARHDRSZ);
+    SET_VARSIZE(result, out_len + VARHDRSZ);
+
+    u_strToUTF8(VARDATA(result), out_len, NULL, temp, temp_len, &err);
+    if (U_FAILURE(err))
+    {
+        pfree(result);
+        result = NULL;
+    }
+
+Fail:
+    /* The guards are needed: pfree() must not be given a NULL pointer. */
+    if (temp != NULL)
+    {
+        pfree(temp);
+    }
+    if (ustr != NULL)
+    {
+        pfree(ustr);
+    }
+    if (usf != NULL)
+    {
+        pfree(usf);
+    }
+    if (nustr != NULL)
+    {
+        pfree(nustr);
+    }
+
+    return result;
+}
+
+PG_FUNCTION_INFO_V1(naco_normalize);
+
+/*
+ * SQL-callable entry point: normalize argument 0 with normalize() in its
+ * non-search variant (is_search = 0); argument 1 is passed as sf.
+ *
+ * Returns SQL NULL when either argument is NULL or normalize() fails.
+ */
+Datum naco_normalize(PG_FUNCTION_ARGS)
+{
+    text *str;
+    text *sf;
+
+    /*
+     * Test for SQL NULL before fetching: the argument macros do not
+     * themselves report a NULL argument.
+     */
+    if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
+    {
+        PG_RETURN_NULL();
+    }
+
+    str = PG_GETARG_TEXT_P(0);
+    sf = PG_GETARG_TEXT_P(1);
+
+    str = normalize(str, sf, 0);
+
+    if (str != NULL)
+    {
+        PG_RETURN_TEXT_P(str);
+    }
+
+    PG_RETURN_NULL();
+}
+
+PG_FUNCTION_INFO_V1(search_normalize);
+
+/*
+ * SQL-callable entry point: normalize argument 0 with normalize() in its
+ * search variant (is_search = 1); argument 1 is passed as sf.
+ *
+ * Returns SQL NULL when either argument is NULL or normalize() fails.
+ */
+Datum search_normalize(PG_FUNCTION_ARGS)
+{
+    text *str;
+    text *sf;
+
+    /*
+     * Test for SQL NULL before fetching: the argument macros do not
+     * themselves report a NULL argument.
+     */
+    if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
+    {
+        PG_RETURN_NULL();
+    }
+
+    str = PG_GETARG_TEXT_P(0);
+    sf = PG_GETARG_TEXT_P(1);
+
+    str = normalize(str, sf, 1);
+
+    if (str != NULL)
+    {
+        PG_RETURN_TEXT_P(str);
+    }
+
+    PG_RETURN_NULL();
+}