--- /dev/null
+/************************************************************************/\r
+/* C Implementation: public.search_normalize public.naco_normalize\r
+ *\r
+ * Description:
+ * This file implements public.search_normalize and public.naco_normalize.
+ * These two functions are included in the PostgreSQL extension c_functions.
+ * The ICU4C and PostgreSQL libraries are needed to build this file.
+ *\r
+ * Author: Swenyu Duan <dsy88@sina.com>, (C) 2012\r
+ *\r
+ * Copyright: See COPYING file that comes with this distribution.\r
+ */\r
+/************************************************************************/\r
+#include "postgres.h"\r
+#include "string.h"\r
+#include "fmgr.h"\r
+#include "unicode/unorm2.h"\r
+#include "unicode/utypes.h"\r
+#include "unicode/ustring.h"\r
+#include "unicode/uregex.h"\r
+#include "unicode/umachine.h"\r
+\r
+#ifdef PG_MODULE_MAGIC\r
+PG_MODULE_MAGIC;\r
+#endif\r
+\r
+/*
+ * regexp_transliterate
+ *
+ * Character-for-character translation of a UTF-16 string.
+ *
+ * Each code unit of src is looked up in search_list (first match wins):
+ *   - not found: copied unchanged;
+ *   - found at index j < replacement_list_len: replaced by
+ *     replacement_list[j];
+ *   - found at index j >= replacement_list_len: dropped.
+ * Every code unit is translated exactly once, so a replacement character
+ * that also occurs in search_list is never translated a second time.
+ *
+ * search_list / search_list_len            characters to look for
+ * replacement_list / replacement_list_len  their replacements
+ * src / src_len                            input (src_len code units are read)
+ * des / des_capacity                       output buffer and its size in
+ *                                          UChars, or NULL to only measure
+ *
+ * Returns:
+ *   - des == NULL: the capacity (in UChars, including the terminating NUL)
+ *     needed to hold the result;
+ *   - des != NULL: the length of the NUL-terminated result written to des,
+ *     excluding the NUL;
+ *   - 0 if a list or src is NULL, or if des is too small (des is then set
+ *     to the empty string when des_capacity > 0).
+ */
+static int32_t regexp_transliterate(const UChar *search_list,
+                                    int32_t search_list_len,
+                                    const UChar *replacement_list,
+                                    int32_t replacement_list_len,
+                                    UChar *src,
+                                    int32_t src_len,
+                                    UChar *des,
+                                    int32_t des_capacity)
+{
+    int32_t i, j;
+    int32_t des_len = 0;
+
+    if (search_list == NULL || replacement_list == NULL || src == NULL)
+    {
+        return 0;
+    }
+
+    for (i = 0; i < src_len; i++)
+    {
+        UChar c = src[i];
+
+        //Locate the first occurrence of c in the search list.
+        for (j = 0; j < search_list_len; j++)
+        {
+            if (search_list[j] == c)
+            {
+                break;
+            }
+        }
+
+        if (j < search_list_len)
+        {
+            if (j >= replacement_list_len)
+            {
+                //Listed without a replacement: drop the character.
+                continue;
+            }
+            c = replacement_list[j];
+        }
+
+        if (des != NULL)
+        {
+            //Need room for this character and for the terminating NUL.
+            if (des_len + 1 >= des_capacity)
+            {
+                if (des_capacity > 0)
+                {
+                    des[0] = 0;
+                }
+                return 0;
+            }
+            des[des_len] = c;
+        }
+        des_len++;
+    }
+
+    if (des == NULL)
+    {
+        //Measuring only: one extra unit to store the terminating NUL.
+        return des_len + 1;
+    }
+
+    //Only reachable with des_len == 0 and des_capacity <= 0.
+    if (des_len >= des_capacity)
+    {
+        return 0;
+    }
+
+    des[des_len] = 0;
+
+    return des_len;
+}
+\r
+/*
+ * Run uregex_replaceAll (is_global > 0) or uregex_replaceFirst (otherwise)
+ * with the given destination buffer and return the ICU result length.
+ */
+static int32_t regexp_run_replace(URegularExpression *regular_exp,
+                                  const UChar *replacement,
+                                  int32_t replacement_len,
+                                  UChar *des,
+                                  int32_t des_capacity,
+                                  int is_global,
+                                  UErrorCode *status)
+{
+    if (is_global > 0)
+    {
+        return uregex_replaceAll(regular_exp,
+                                 replacement,
+                                 replacement_len,
+                                 des,
+                                 des_capacity,
+                                 status);
+    }
+
+    return uregex_replaceFirst(regular_exp,
+                               replacement,
+                               replacement_len,
+                               des,
+                               des_capacity,
+                               status);
+}
+
+/*
+ * regexp_replace
+ *
+ * Replace matches of an ICU regular expression in a UTF-16 string.
+ *
+ * regexp / regexp_len            pattern text
+ * replacement / replacement_len  replacement text
+ * src / src_len                  input text
+ * des / des_capacity             output buffer and its size in UChars, or
+ *                                NULL to only measure
+ * is_global                      > 0 replaces every match, otherwise only
+ *                                the first one
+ *
+ * Returns:
+ *   - des == NULL: the capacity (in UChars, including the terminating NUL)
+ *     needed to hold the result, the same convention as
+ *     regexp_transliterate;
+ *   - des != NULL: the length of the NUL-terminated result written to des,
+ *     excluding the NUL;
+ *   - 0 on NULL input, on any ICU error, or if des is too small (des is
+ *     then set to the empty string when des_capacity > 0).
+ */
+static int32_t regexp_replace(const UChar *regexp,
+                              int32_t regexp_len,
+                              const UChar *replacement,
+                              int32_t replacement_len,
+                              UChar *src,
+                              int32_t src_len,
+                              UChar *des,
+                              int32_t des_capacity,
+                              int is_global)
+{
+    URegularExpression *regular_exp;
+    //ICU functions return immediately when entered with a failure code,
+    //so the status must start out as U_ZERO_ERROR.
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError pe;
+    int32_t len;
+
+    if (regexp == NULL || replacement == NULL || src == NULL)
+    {
+        return 0;
+    }
+
+    regular_exp = uregex_open(regexp, regexp_len, 0, &pe, &status);
+    if (regular_exp == NULL || U_FAILURE(status))
+    {
+        if (regular_exp != NULL)
+        {
+            uregex_close(regular_exp);
+        }
+        return 0;
+    }
+
+    uregex_setText(regular_exp, src, src_len, &status);
+    if (U_FAILURE(status))
+    {
+        uregex_close(regular_exp);
+        return 0;
+    }
+
+    //Preflight: measure the result without writing anything.
+    //U_BUFFER_OVERFLOW_ERROR is the expected outcome here.
+    len = regexp_run_replace(regular_exp,
+                             replacement,
+                             replacement_len,
+                             NULL,
+                             0,
+                             is_global,
+                             &status);
+    if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status))
+    {
+        uregex_close(regular_exp);
+        return 0;
+    }
+
+    if (des == NULL)
+    {
+        uregex_close(regular_exp);
+        //One extra unit to store the terminating NUL.
+        return len + 1;
+    }
+
+    if (des_capacity < len + 1)
+    {
+        if (des_capacity > 0)
+        {
+            des[0] = 0;
+        }
+        uregex_close(regular_exp);
+        return 0;
+    }
+
+    //The preflight left an overflow error behind; clear it, otherwise the
+    //second call would do nothing.
+    status = U_ZERO_ERROR;
+    len = regexp_run_replace(regular_exp,
+                             replacement,
+                             replacement_len,
+                             des,
+                             des_capacity,
+                             is_global,
+                             &status);
+    if (U_FAILURE(status))
+    {
+        des[0] = 0;
+        len = 0;
+    }
+
+    uregex_close(regular_exp);
+    return len;
+}
+\r
+/*
+ * u_strtransliterate
+ *
+ * Translate the NUL-terminated UTF-16 string str with
+ * regexp_transliterate(), using the NUL-terminated lists search_list and
+ * replacement_list.
+ *
+ * str_capacity is accepted for interface compatibility and is not used.
+ *
+ * Ownership: str is released with pfree(); the result is a newly
+ * palloc'd, NUL-terminated buffer that the caller owns.
+ *
+ * Returns NULL (and frees nothing) if any pointer argument is NULL.
+ */
+static UChar *u_strtransliterate(UChar *search_list,
+                                 UChar *replacement_list,
+                                 UChar *str,
+                                 int32_t str_capacity)
+{
+    int32_t search_list_len, replacement_list_len, str_len;
+    int32_t des_capacity;
+    UChar *des;
+
+    (void) str_capacity;
+
+    if (search_list == NULL || replacement_list == NULL || str == NULL)
+    {
+        return NULL;
+    }
+
+    search_list_len = u_strlen(search_list);
+    replacement_list_len = u_strlen(replacement_list);
+    str_len = u_strlen(str);
+
+    //Translation maps each input unit to at most one output unit, so the
+    //result never exceeds the input length; add one unit for the NUL.
+    //The buffer is zero-filled so it is a valid (empty) string even if
+    //the translation reports failure.
+    des_capacity = str_len + 1;
+    des = palloc0(des_capacity * sizeof(UChar));
+
+    regexp_transliterate(search_list,
+                         search_list_len,
+                         replacement_list,
+                         replacement_list_len,
+                         str,
+                         str_len,
+                         des,
+                         des_capacity);
+
+    pfree(str);
+    return des;
+}
+\r
+/*
+ * u_strreplace
+ *
+ * Apply regexp_replace() to the NUL-terminated UTF-16 string str, using
+ * the NUL-terminated pattern regexp and replacement text.
+ *
+ * is_global > 0 replaces every match, otherwise only the first one.
+ * str_capacity is accepted for interface compatibility and is not used.
+ *
+ * Ownership: str is released with pfree(); the result is a newly
+ * palloc'd, NUL-terminated buffer that the caller owns. If the replace
+ * step reports failure the result is the empty string.
+ *
+ * Returns NULL (and frees nothing) if any pointer argument is NULL.
+ */
+static UChar *u_strreplace(UChar *regexp,
+                           UChar *replacement,
+                           UChar *str,
+                           int32_t str_capacity,
+                           int is_global)
+{
+    int32_t regexp_len, replacement_len, str_len;
+    int32_t needed, des_capacity;
+    UChar *des;
+
+    (void) str_capacity;
+
+    if (regexp == NULL || replacement == NULL || str == NULL)
+    {
+        return NULL;
+    }
+
+    regexp_len = u_strlen(regexp);
+    replacement_len = u_strlen(replacement);
+    str_len = u_strlen(str);
+
+    //First pass only measures the result.
+    needed = regexp_replace(regexp,
+                            regexp_len,
+                            replacement,
+                            replacement_len,
+                            str,
+                            str_len,
+                            NULL,
+                            0,
+                            is_global);
+    if (needed < 0)
+    {
+        needed = 0;
+    }
+
+    //Always reserve one unit beyond the reported size so the buffer can
+    //be NUL-terminated, and zero-fill it so it is a valid string even if
+    //the second pass writes nothing.
+    des_capacity = needed + 1;
+    des = palloc0(des_capacity * sizeof(UChar));
+
+    regexp_replace(regexp,
+                   regexp_len,
+                   replacement,
+                   replacement_len,
+                   str,
+                   str_len,
+                   des,
+                   des_capacity,
+                   is_global);
+    des[des_capacity - 1] = 0;
+
+    pfree(str);
+    return des;
+}
+\r
+/*
+ * additional_substitutions
+ *
+ * Expand U+00C6 to "AE", U+00DE to "TH" and U+0152 to "OE", then
+ * transliterate U+0110, U+00D0, U+00D8, U+0141, U+2113 to D, D, O, L, l
+ * and delete U+02BB, U+02BC, ']' and '['. The apostrophe is deleted too
+ * unless is_search is non-zero.
+ *
+ * Ownership: nustr is consumed (freed by the helpers); the returned
+ * buffer is newly palloc'd. Returns NULL if nustr is NULL.
+ */
+UChar *additional_substitutions(UChar *nustr, int is_search)
+{
+    UChar uregexp[200], replacement[200];
+    //Transliteration lists hold real code points: u_strtransliterate
+    //compares characters literally and does not interpret \x{...} text.
+    UChar search_list[] = {
+        0x0110, 0x00D0, 0x00D8, 0x0141, 0x2113, //mapped to D D O L l
+        0x02BB, 0x02BC, 0x005D, 0x005B,         //deleted: U+02BB U+02BC ] [
+        0x0027,                                 //deleted: ' (not for search)
+        0
+    };
+    UChar replacement_list[] = {
+        0x0044, 0x0044, 0x004F, 0x004C, 0x006C, //"DDOLl"
+        0
+    };
+
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    //u_uastrcpy always NUL-terminates, which u_strreplace relies on.
+    u_uastrcpy(uregexp, "\\x{00C6}");
+    u_uastrcpy(replacement, "AE");
+    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    u_uastrcpy(uregexp, "\\x{00DE}");
+    u_uastrcpy(replacement, "TH");
+    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    u_uastrcpy(uregexp, "\\x{0152}");
+    u_uastrcpy(replacement, "OE");
+    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    if (is_search)
+    {
+        //Search normalization keeps apostrophes: end the list before it.
+        search_list[9] = 0;
+    }
+
+    nustr = u_strtransliterate(search_list,
+                               replacement_list,
+                               nustr,
+                               u_strlen(nustr));
+
+    return nustr;
+}
+\r
+/*
+ * transformations_on_unicode
+ *
+ * Remove every character in the Unicode general categories Cc, Cf, Co,
+ * Cs, Lm, Mc, Me and Mn. Afterwards, if usf starts with 'a' and the first
+ * comma of the string is not its last character, that comma is replaced
+ * by the placeholder U+0007.
+ *
+ * Ownership: nustr is consumed (freed by u_strreplace); the returned
+ * buffer is newly palloc'd. Returns NULL if nustr is NULL.
+ */
+UChar *transformations_on_unicode(UChar *nustr, UChar *usf)
+{
+    UChar uregexp[200], replacement[200];
+    UChar *comma;
+
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    //u_uastrcpy always NUL-terminates, which u_strreplace relies on.
+    u_uastrcpy(uregexp,
+               "[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cs}\\p{Lm}\\p{Mc}\\p{Me}\\p{Mn}]");
+    replacement[0] = 0; //empty replacement: matches are removed
+
+    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    if (usf != NULL && usf[0] == 0x61) //0x61 == 'a' in utf16
+    {
+        comma = u_strchr(nustr, 0x2c); //0x2c == ',' in utf16
+
+        //The position test is made on the string as it is now, after the
+        //removal above; a comma followed by the NUL is the last character.
+        if (comma != NULL && comma[1] != 0)
+        {
+            *comma = 0x7; //preserve the first comma as a placeholder
+        }
+    }
+
+    return nustr;
+}
+\r
+/*
+ * replace_placehoders
+ *
+ * Protect '+', '&', '@', U+266D, U+266F and '#' by swapping them for the
+ * control characters U+0001..U+0006, remove every character in the
+ * categories Pc, Pd, Pe, Pf, Pi, Po, Ps, Sk, Sm, So, Zl, Zp and Zs, then
+ * swap U+0001..U+0007 back to the protected characters, U+0007 becoming
+ * ','.
+ *
+ * Ownership: nustr is consumed (freed by the helpers); the returned
+ * buffer is newly palloc'd. Returns NULL if nustr is NULL.
+ */
+UChar *replace_placehoders(UChar *nustr)
+{
+    UChar uregexp[200], replacement[200];
+    //Transliteration lists hold real code points: u_strtransliterate
+    //compares characters literally and does not interpret \x.. text.
+    UChar symbols[] = {
+        0x002B, 0x0026, 0x0040, 0x266D, 0x266F, 0x0023, //+ & @ flat sharp #
+        0x002C,                                         //,
+        0
+    };
+    UChar placeholders[] = {
+        0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006,
+        0x0007,
+        0
+    };
+
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    //Forward pass: only the first six symbols are protected here. U+0007
+    //(for ',') is not produced in this function, so the comma is left out
+    //by temporarily ending both lists after six entries.
+    symbols[6] = 0;
+    placeholders[6] = 0;
+    nustr = u_strtransliterate(symbols, placeholders, nustr, u_strlen(nustr));
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+    symbols[6] = 0x002C;
+    placeholders[6] = 0x0007;
+
+    u_uastrcpy(uregexp,
+               "[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\p{Pi}\\p{Po}\\p{Ps}\\p{Sk}\\p{Sm}\\p{So}\\p{Zl}\\p{Zp}\\p{Zs}]");
+    //NOTE(review): matches are removed outright here; confirm against the
+    //reference normalization whether a space is expected instead.
+    replacement[0] = 0;
+
+    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    //Reverse pass: all seven placeholders go back to their symbols.
+    nustr = u_strtransliterate(placeholders, symbols, nustr, u_strlen(nustr));
+
+    return nustr;
+}
+\r
+/*
+ * decimal_digits
+ *
+ * Convert the decimal digits of the listed non-ASCII scripts to the
+ * ASCII digits '0'..'9'. Each table entry is the code point of a script's
+ * digit zero; the nine following code points are its digits one to nine.
+ *
+ * The string is modified in place (the mapping never changes its length)
+ * and the same pointer is returned. Returns NULL if nustr is NULL.
+ */
+UChar *decimal_digits(UChar *nustr)
+{
+    static const UChar zero_digits[] = {
+        0x0660, 0x06F0, 0x07C0, 0x0966, 0x09E6, 0x0A66, 0x0AE6, 0x0B66,
+        0x0BE6, 0x0C66, 0x0CE6, 0x0D66, 0x0E50, 0x0ED0, 0x0F20, 0x1040,
+        0x1090, 0x17E0, 0x1810, 0x1946, 0x19D0, 0x1A80, 0x1A90, 0x1B50,
+        0x1BB0, 0x1C40, 0x1C50, 0xA620, 0xA8D0, 0xA900, 0xA9D0, 0xAA50,
+        0xABF0, 0xFF10
+    };
+    const int32_t zero_count =
+        (int32_t) (sizeof(zero_digits) / sizeof(zero_digits[0]));
+    UChar *cur;
+    int32_t i;
+
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    for (cur = nustr; *cur != 0; cur++)
+    {
+        //Nothing below the first table entry can be a mapped digit.
+        if (*cur < zero_digits[0])
+        {
+            continue;
+        }
+
+        for (i = 0; i < zero_count; i++)
+        {
+            if (*cur >= zero_digits[i] && *cur <= zero_digits[i] + 9)
+            {
+                *cur = (UChar) (0x0030 + (*cur - zero_digits[i]));
+                break;
+            }
+        }
+    }
+
+    return nustr;
+}
+\r
+/*
+ * leading_trailing_spaces
+ *
+ * Collapse every run of whitespace into a single space, then remove
+ * whitespace at the start and at the end of the string.
+ *
+ * Ownership: nustr is consumed (freed by u_strreplace); the returned
+ * buffer is newly palloc'd. Returns NULL if nustr is NULL.
+ */
+UChar *leading_trailing_spaces(UChar * nustr)
+{
+    UChar uregexp[200], replacement[200];
+
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    //u_uastrcpy always NUL-terminates, which u_strreplace relies on.
+    u_uastrcpy(uregexp, "\\s+");
+    u_uastrcpy(replacement, " ");
+
+    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    u_uastrcpy(uregexp, "^\\s+");
+    replacement[0] = 0; //empty replacement: matches are removed
+
+    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 0);
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    u_uastrcpy(uregexp, "\\s+$");
+
+    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);
+
+    return nustr;
+
+}
+\r
+/*
+ * normalize
+ *
+ * Shared implementation of naco_normalize and search_normalize.
+ *
+ * str        text value to normalize
+ * sf         subfield code; when it starts with 'a' the first comma of
+ *            the value is preserved (see transformations_on_unicode)
+ * is_search  non-zero selects the search variant (see
+ *            additional_substitutions)
+ *
+ * Returns a newly palloc'd text value, or NULL if an argument is NULL or
+ * an ICU call fails. The input values are not modified.
+ *
+ * Assumes the text payload is UTF-8 (u_strFromUTF8 rejects anything
+ * else) -- TODO confirm the database encoding is checked by the caller.
+ */
+text *normalize(text *str, text *sf, int is_search)
+{
+    const UNormalizer2 *normalizer;
+    //ICU functions return immediately when entered with a failure code,
+    //so the status must start out as U_ZERO_ERROR.
+    UErrorCode err = U_ZERO_ERROR;
+    UChar uregexp[200], replacement[200];
+    UChar *ustr = NULL;   //input converted to UTF-16
+    UChar *usf = NULL;    //subfield code converted to UTF-16
+    UChar *upper = NULL;  //upper-cased input
+    UChar *nustr = NULL;  //NFKD form, then the working string
+    UChar *lower = NULL;  //lower-cased final string
+    text *result = NULL;
+    int32_t str_bytes, sf_bytes, len;
+    int32_t out_bytes = 0;
+
+    if (str == NULL || sf == NULL)
+    {
+        return NULL;
+    }
+
+    normalizer = unorm2_getNFKDInstance(&err);
+    if (U_FAILURE(err))
+    {
+        return NULL;
+    }
+
+    //Payload sizes exclude the varlena header.
+    str_bytes = (int32_t) VARSIZE_ANY_EXHDR(str);
+    sf_bytes = (int32_t) VARSIZE_ANY_EXHDR(sf);
+
+    //UTF-8 to UTF-16 never yields more code units than input bytes; the
+    //extra unit holds the terminating NUL.
+    ustr = palloc((str_bytes + 1) * sizeof(UChar));
+    u_strFromUTF8(ustr, str_bytes + 1, NULL, VARDATA_ANY(str), str_bytes, &err);
+    if (U_FAILURE(err))
+    {
+        goto Done;
+    }
+
+    usf = palloc((sf_bytes + 1) * sizeof(UChar));
+    u_strFromUTF8(usf, sf_bytes + 1, NULL, VARDATA_ANY(sf), sf_bytes, &err);
+    if (U_FAILURE(err))
+    {
+        goto Done;
+    }
+
+    //Apply NACO normalization to input string; based on
+    //http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf
+    //
+    //Note that unlike a strict reading of the NACO normalization rules,
+    //output is returned as lowercase instead of uppercase for compatibility
+    //with previous versions of the Evergreen naco_normalize routine.
+    //
+    //Convert to upper-case first; even though final output will be
+    //lowercase, doing this ensures that the German eszett (U+00DF) and
+    //certain ligatures are handled correctly.
+    //
+    //Upper-casing can lengthen the string, so measure first. The
+    //measuring call leaves an overflow status behind that must be cleared.
+    len = u_strToUpper(NULL, 0, ustr, -1, NULL, &err);
+    err = U_ZERO_ERROR;
+    upper = palloc((len + 1) * sizeof(UChar));
+    u_strToUpper(upper, len + 1, ustr, -1, NULL, &err);
+    if (U_FAILURE(err))
+    {
+        goto Done;
+    }
+
+    //Remove everything between U+0098 and U+009C, inclusive.
+    u_uastrcpy(uregexp, "\\x{0098}.*?\\x{009C}");
+    replacement[0] = 0;
+
+    upper = u_strreplace(uregexp, replacement, upper, u_strlen(upper), 1);
+    if (upper == NULL)
+    {
+        goto Done;
+    }
+
+    //NFKD decomposition can lengthen the string as well: measure first.
+    len = unorm2_normalize(normalizer, upper, -1, NULL, 0, &err);
+    err = U_ZERO_ERROR;
+    nustr = palloc((len + 1) * sizeof(UChar));
+    unorm2_normalize(normalizer, upper, -1, nustr, len + 1, &err);
+    if (U_FAILURE(err))
+    {
+        goto Done;
+    }
+
+    //Each step below consumes its input buffer and returns a new one
+    //(or NULL when handed NULL).
+
+    //additional substitutions - 3.6.
+    nustr = additional_substitutions(nustr, is_search);
+
+    //transformations based on Unicode category codes
+    nustr = transformations_on_unicode(nustr, usf);
+
+    //since we've stripped out the control characters, we can now
+    //use a few as placeholders temporarily
+    nustr = replace_placehoders(nustr);
+
+    //decimal digit
+    nustr = decimal_digits(nustr);
+
+    //intentionally skipping step 8 of the NACO algorithm; if the string
+    //gets normalized away, that's fine.
+
+    //leading and trailing spaces
+    nustr = leading_trailing_spaces(nustr);
+    if (nustr == NULL)
+    {
+        goto Done;
+    }
+
+    len = u_strToLower(NULL, 0, nustr, -1, NULL, &err);
+    err = U_ZERO_ERROR;
+    lower = palloc((len + 1) * sizeof(UChar));
+    u_strToLower(lower, len + 1, nustr, -1, NULL, &err);
+    if (U_FAILURE(err))
+    {
+        goto Done;
+    }
+
+    //Measure the UTF-8 size, then build the result varlena. A text value
+    //carries its length in the header and is not NUL-terminated.
+    u_strToUTF8(NULL, 0, &out_bytes, lower, -1, &err);
+    err = U_ZERO_ERROR;
+
+    result = (text *) palloc(out_bytes + VARHDRSZ);
+    SET_VARSIZE(result, out_bytes + VARHDRSZ);
+
+    u_strToUTF8(VARDATA(result), out_bytes, NULL, lower, -1, &err);
+    if (U_FAILURE(err))
+    {
+        pfree(result);
+        result = NULL;
+    }
+
+Done:
+    //pfree() must not be called with NULL.
+    if (ustr != NULL)
+    {
+        pfree(ustr);
+    }
+    if (usf != NULL)
+    {
+        pfree(usf);
+    }
+    if (upper != NULL)
+    {
+        pfree(upper);
+    }
+    if (nustr != NULL)
+    {
+        pfree(nustr);
+    }
+    if (lower != NULL)
+    {
+        pfree(lower);
+    }
+
+    return result;
+}
+\r
+PG_FUNCTION_INFO_V1(naco_normalize);
+
+/*
+ * naco_normalize(text, text) returns text
+ *
+ * SQL entry point: normalizes argument 0 with normalize(), passing
+ * argument 1 as the subfield code and is_search = 0.
+ *
+ * Returns SQL NULL if either argument is NULL or if normalize() fails.
+ */
+Datum naco_normalize(PG_FUNCTION_ARGS)
+{
+    text *str;
+    text *sf;
+    text *result;
+
+    //A SQL NULL has to be detected before the argument is fetched.
+    if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
+    {
+        PG_RETURN_NULL();
+    }
+
+    str = PG_GETARG_TEXT_P(0);
+    sf = PG_GETARG_TEXT_P(1);
+
+    result = normalize(str, sf, 0);
+    if (result == NULL)
+    {
+        //A NULL pointer must be returned as SQL NULL, not as a datum.
+        PG_RETURN_NULL();
+    }
+
+    PG_RETURN_TEXT_P(result);
+}
+\r
+PG_FUNCTION_INFO_V1(search_normalize);
+
+/*
+ * search_normalize(text, text) returns text
+ *
+ * SQL entry point: normalizes argument 0 with normalize(), passing
+ * argument 1 as the subfield code and is_search = 1.
+ *
+ * Returns SQL NULL if either argument is NULL or if normalize() fails.
+ */
+Datum search_normalize(PG_FUNCTION_ARGS)
+{
+    text *str;
+    text *sf;
+    text *result;
+
+    //A SQL NULL has to be detected before the argument is fetched.
+    if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
+    {
+        PG_RETURN_NULL();
+    }
+
+    str = PG_GETARG_TEXT_P(0);
+    sf = PG_GETARG_TEXT_P(1);
+
+    result = normalize(str, sf, 1);
+    if (result == NULL)
+    {
+        //A NULL pointer must be returned as SQL NULL, not as a datum.
+        PG_RETURN_NULL();
+    }
+
+    PG_RETURN_TEXT_P(result);
+}
+\r