From 20614d25425ea9defb1678456e8ba46a744a75dc Mon Sep 17 00:00:00 2001
From: Swenyu Duan
Date: Thu, 31 May 2012 05:02:45 -0400
Subject: [PATCH] Implement search_normalize and naco_normalize in C.

This is the first extension for PostgreSQL. The lib has not been tested yet.
normalize.functions_in_c.c relies on the ICU4C library.
The makefile links normalize.functions_in_c.o against the ICU4C libraries
(/usr/lib) and the PostgreSQL library.
c_functions--1.0.sql replaces the original plperlu versions of
search_normalize and naco_normalize with C versions.
---
 .../src/sql/Pg/extensions/c_functions--1.0.sql     |   9 +
 Open-ILS/src/sql/Pg/extensions/c_functions.control |   3 +
 Open-ILS/src/sql/Pg/extensions/makefile            |   9 +
 .../sql/Pg/extensions/normalize.functions_in_c.c   | 721 +++++++++++++++++++++
 4 files changed, 742 insertions(+)
 create mode 100755 Open-ILS/src/sql/Pg/extensions/c_functions--1.0.sql
 create mode 100755 Open-ILS/src/sql/Pg/extensions/c_functions.control
 create mode 100644 Open-ILS/src/sql/Pg/extensions/makefile
 create mode 100755 Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c

diff --git a/Open-ILS/src/sql/Pg/extensions/c_functions--1.0.sql b/Open-ILS/src/sql/Pg/extensions/c_functions--1.0.sql
new file mode 100755
index 0000000000..5f3049d642
--- /dev/null
+++ b/Open-ILS/src/sql/Pg/extensions/c_functions--1.0.sql
@@ -0,0 +1,9 @@
+CREATE OR REPLACE FUNCTION public.search_normalize( TEXT, TEXT ) RETURNS TEXT
+    AS 'c_functions.so', 'search_normalize'
+    LANGUAGE C STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT
+    AS 'c_functions.so', 'naco_normalize'
+    LANGUAGE C STRICT IMMUTABLE;
+
+
diff --git a/Open-ILS/src/sql/Pg/extensions/c_functions.control b/Open-ILS/src/sql/Pg/extensions/c_functions.control
new file mode 100755
index 0000000000..3c25035b27
--- /dev/null
+++ b/Open-ILS/src/sql/Pg/extensions/c_functions.control
@@ -0,0 +1,3 @@
+comment = 'Extensions to convert all the plperlu functions into C functions'
+default_version = '1.0'
+relocatable = true
diff --git a/Open-ILS/src/sql/Pg/extensions/makefile b/Open-ILS/src/sql/Pg/extensions/makefile
new file mode 100644
--- /dev/null
+++ b/Open-ILS/src/sql/Pg/extensions/makefile
@@ -0,0 +1,9 @@
+MODULE_big = c_functions
+EXTENSION = c_functions
+SHLIB_LINK = -licui18n -licuuc -licudata
+DATA = c_functions--1.0.sql
+OBJS = normalize.functions_in_c.o
+
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
diff --git a/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c b/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c
new file mode 100755
--- /dev/null
+++ b/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c
@@ -0,0 +1,721 @@
+/************************************************************************/
+/* C Implementation: public.search_normalize public.naco_normalize
+ *
+ * Description:
+ * This file implements public.search_normalize and public.naco_normalize.
+ * These two functions are included in the PostgreSQL extension c_functions.
+ * The ICU4C and PostgreSQL libraries are needed to build this file.
+ *
+ * Author: Swenyu Duan , (C) 2012
+ *
+ * Copyright: See COPYING file that comes with this distribution.
+ */
+/************************************************************************/
+#include "postgres.h"
+#include "fmgr.h"
+
+#include <string.h>
+
+#include "unicode/umachine.h"
+#include "unicode/unorm2.h"
+#include "unicode/uregex.h"
+#include "unicode/ustring.h"
+#include "unicode/utypes.h"
+
+#ifdef PG_MODULE_MAGIC
+PG_MODULE_MAGIC;
+#endif
+
+//Exported helpers; declared up front so every definition has a prototype.
+UChar *additional_substitutions(UChar *nustr, int is_search);
+UChar *transformations_on_unicode(UChar *nustr, UChar *usf);
+UChar *replace_placehoders(UChar *nustr);
+UChar *decimal_digits(UChar *nustr);
+UChar *leading_trailing_spaces(UChar *nustr);
+text *normalize(text *str, text *sf, int is_search);
+
+/*
+ * Raise a PostgreSQL error if an ICU call failed.
+ *
+ * U_BUFFER_OVERFLOW_ERROR is not treated as a failure here: ICU sets it
+ * whenever the destination is too small, and every caller in this file
+ * handles that case itself by looking at the returned length.
+ */
+static void icu_check(UErrorCode status, const char *operation)
+{
+    if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
+    {
+        ereport(ERROR,
+                (errcode(ERRCODE_INTERNAL_ERROR),
+                 errmsg("%s failed: %s", operation, u_errorName(status))));
+    }
+}
+
+/* Allocate room for capacity UTF-16 code units with palloc. */
+static UChar *ualloc(int32_t capacity)
+{
+    return (UChar *) palloc((Size) capacity * sizeof(UChar));
+}
+
+/*
+ * Transliterate src the way Perl's tr/SEARCH/REPLACEMENT/d does.
+ *
+ * A code unit found at index j of search_list is replaced by
+ * replacement_list[j]; when replacement_list is shorter than search_list,
+ * code units matching the unpaired tail of search_list are deleted.  All
+ * other code units are copied unchanged.  Each code unit is mapped once,
+ * so a replacement is never transliterated again.
+ *
+ * The result and its terminator are written to des as far as des_capacity
+ * allows; pass des as NULL to only measure.  Returns the length of the
+ * complete result, excluding the terminator (never more than src_len).
+ */
+static int32_t regexp_transliterate(const UChar *search_list,
+                                    int32_t search_list_len,
+                                    const UChar *replacement_list,
+                                    int32_t replacement_list_len,
+                                    UChar *src,
+                                    int32_t src_len,
+                                    UChar *des,
+                                    int32_t des_capacity)
+{
+    int32_t i, j;
+    int32_t des_len = 0;
+
+    if (search_list == NULL || replacement_list == NULL || src == NULL)
+    {
+        return 0;
+    }
+    if (des == NULL)
+    {
+        des_capacity = 0;
+    }
+
+    for (i = 0; i < src_len; i++)
+    {
+        UChar c = src[i];
+        int keep = 1;
+
+        for (j = 0; j < search_list_len; j++)
+        {
+            if (search_list[j] == c)
+            {
+                if (j < replacement_list_len)
+                {
+                    c = replacement_list[j];
+                }
+                else
+                {
+                    keep = 0;
+                }
+                break;
+            }
+        }
+
+        if (keep)
+        {
+            if (des_len < des_capacity)
+            {
+                des[des_len] = c;
+            }
+            des_len++;
+        }
+    }
+
+    if (des_len < des_capacity)
+    {
+        des[des_len] = 0;
+    }
+
+    return des_len;
+}
+
+/*
+ * Replace the first (is_global == 0) or every (is_global > 0) match of
+ * regexp in src with replacement.
+ *
+ * The result is written to des as far as des_capacity allows; pass des as
+ * NULL to only measure.  Returns the length of the complete result,
+ * excluding the terminator, so a return value >= des_capacity tells the
+ * caller to retry with a larger buffer.  Any other ICU failure is raised
+ * as a PostgreSQL error.
+ */
+static int32_t regexp_replace(const UChar *regexp,
+                              int32_t regexp_len,
+                              const UChar *replacement,
+                              int32_t replacement_len,
+                              UChar *src,
+                              int32_t src_len,
+                              UChar *des,
+                              int32_t des_capacity,
+                              int is_global)
+{
+    URegularExpression *regular_exp;
+    //ICU functions do nothing when handed a status that signals failure,
+    //so the status must start out as U_ZERO_ERROR.
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError pe;
+    int32_t len;
+
+    if (regexp == NULL || replacement == NULL || src == NULL)
+    {
+        return 0;
+    }
+    if (des == NULL)
+    {
+        des_capacity = 0;
+    }
+
+    regular_exp = uregex_open(regexp, regexp_len, 0, &pe, &status);
+    icu_check(status, "uregex_open");
+
+    uregex_setText(regular_exp, src, src_len, &status);
+
+    if (is_global > 0)
+    {
+        len = uregex_replaceAll(regular_exp,
+                                replacement,
+                                replacement_len,
+                                des,
+                                des_capacity,
+                                &status);
+    }
+    else
+    {
+        len = uregex_replaceFirst(regular_exp,
+                                  replacement,
+                                  replacement_len,
+                                  des,
+                                  des_capacity,
+                                  &status);
+    }
+
+    //Close before checking so the compiled regex is not leaked on error.
+    uregex_close(regular_exp);
+    icu_check(status, "uregex_replace");
+
+    return len;
+}
+
+/*
+ * Transliterate str (see regexp_transliterate) into a newly palloc'd,
+ * NUL-terminated string.  str is pfree'd and must not be used afterwards.
+ * search_list and replacement_list must be NUL-terminated.  str_capacity
+ * is unused.  Returns NULL, leaving str untouched, if any pointer is NULL.
+ */
+static UChar *u_strtransliterate(UChar *search_list,
+                                 UChar *replacement_list,
+                                 UChar *str,
+                                 int32_t str_capacity)
+{
+    int32_t str_len;
+    UChar *des;
+
+    if (search_list == NULL || replacement_list == NULL || str == NULL)
+    {
+        return NULL;
+    }
+
+    str_len = u_strlen(str);
+
+    //Transliteration never lengthens the string.
+    des = ualloc(str_len + 1);
+    regexp_transliterate(search_list,
+                         u_strlen(search_list),
+                         replacement_list,
+                         u_strlen(replacement_list),
+                         str,
+                         str_len,
+                         des,
+                         str_len + 1);
+
+    pfree(str);
+    return des;
+}
+
+/*
+ * Apply regexp_replace to str and return the result as a newly palloc'd,
+ * NUL-terminated string.  str is pfree'd and must not be used afterwards.
+ * regexp and replacement must be NUL-terminated.  str_capacity is unused.
+ * Returns NULL, leaving str untouched, if any pointer is NULL.
+ */
+static UChar *u_strreplace(UChar *regexp,
+                           UChar *replacement,
+                           UChar *str,
+                           int32_t str_capacity,
+                           int is_global)
+{
+    int32_t regexp_len, replacement_len, str_len;
+    UChar *des;
+    int32_t des_len, des_capacity;
+
+    if (regexp == NULL || replacement == NULL || str == NULL)
+    {
+        return NULL;
+    }
+
+    regexp_len = u_strlen(regexp);
+    replacement_len = u_strlen(replacement);
+    str_len = u_strlen(str);
+
+    //Most replacements in this file shrink the string, so try a buffer of
+    //the input's size first and grow it only when ICU asks for more room.
+    des_capacity = str_len + 1;
+    for (;;)
+    {
+        des = ualloc(des_capacity);
+        des_len = regexp_replace(regexp,
+                                 regexp_len,
+                                 replacement,
+                                 replacement_len,
+                                 str,
+                                 str_len,
+                                 des,
+                                 des_capacity,
+                                 is_global);
+        if (des_len < des_capacity)
+        {
+            break;
+        }
+        pfree(des);
+        des_capacity = des_len + 1;
+    }
+
+    des[des_len] = 0;
+    pfree(str);
+    return des;
+}
+
+/* Convert a NUL-terminated ASCII string to a palloc'd UChar string. */
+static UChar *ascii_to_ustr(const char *ascii)
+{
+    UChar *des = ualloc((int32_t) strlen(ascii) + 1);
+
+    //Unlike u_uastrncpy, u_uastrcpy always writes the terminator.
+    u_uastrcpy(des, ascii);
+    return des;
+}
+
+/*
+ * Expand the escapes understood by u_unescape (such as \xhh and \x{h...})
+ * in an ASCII string and return the result as a palloc'd UChar string.
+ */
+static UChar *unescape_to_ustr(const char *spec)
+{
+    //An unescaped string is never longer than its escaped form.
+    int32_t capacity = (int32_t) strlen(spec) + 1;
+    UChar *des = ualloc(capacity);
+
+    //u_unescape returns 0 for a malformed escape as well as for "".
+    if (u_unescape(spec, des, capacity) == 0 && spec[0] != '\0')
+    {
+        ereport(ERROR,
+                (errcode(ERRCODE_INTERNAL_ERROR),
+                 errmsg("malformed escape sequence in \"%s\"", spec)));
+    }
+    return des;
+}
+
+/* u_strreplace with the pattern and replacement given as ASCII strings. */
+static UChar *replace_ascii(const char *regexp,
+                            const char *replacement,
+                            UChar *str,
+                            int is_global)
+{
+    UChar *uregexp = ascii_to_ustr(regexp);
+    UChar *ureplacement = ascii_to_ustr(replacement);
+
+    str = u_strreplace(uregexp, ureplacement, str, u_strlen(str), is_global);
+
+    pfree(ureplacement);
+    pfree(uregexp);
+    return str;
+}
+
+/* u_strtransliterate with both lists given as escaped ASCII strings. */
+static UChar *transliterate_escaped(const char *search_list,
+                                    const char *replacement_list,
+                                    UChar *str)
+{
+    UChar *usearch = unescape_to_ustr(search_list);
+    UChar *ureplacement = unescape_to_ustr(replacement_list);
+
+    str = u_strtransliterate(usearch, ureplacement, str, u_strlen(str));
+
+    pfree(ureplacement);
+    pfree(usearch);
+    return str;
+}
+
+/*
+ * Return a palloc'd, NUL-terminated upper-cased (to_upper != 0) or
+ * lower-cased copy of src.  The root locale is used so that the result
+ * does not depend on the server's environment.
+ */
+static UChar *u_strcase(const UChar *src, int32_t src_len, int to_upper)
+{
+    int32_t des_capacity = src_len + 1;
+    int32_t des_len;
+    UChar *des;
+    UErrorCode status;
+
+    //Case mapping can lengthen a string, so grow and retry when needed.
+    for (;;)
+    {
+        status = U_ZERO_ERROR;
+        des = ualloc(des_capacity);
+        if (to_upper)
+        {
+            des_len = u_strToUpper(des, des_capacity, src, src_len, "", &status);
+        }
+        else
+        {
+            des_len = u_strToLower(des, des_capacity, src, src_len, "", &status);
+        }
+        icu_check(status, "case mapping");
+        if (des_len < des_capacity)
+        {
+            break;
+        }
+        pfree(des);
+        des_capacity = des_len + 1;
+    }
+
+    des[des_len] = 0;
+    return des;
+}
+
+/* Return a palloc'd, NUL-terminated NFKD-normalized copy of src. */
+static UChar *u_strnfkd(const UChar *src, int32_t src_len)
+{
+    const UNormalizer2 *normalizer;
+    int32_t des_capacity = src_len + 1;
+    int32_t des_len;
+    UChar *des;
+    UErrorCode status = U_ZERO_ERROR;
+
+    normalizer = unorm2_getNFKDInstance(&status);
+    icu_check(status, "unorm2_getNFKDInstance");
+
+    //Decomposition can lengthen a string, so grow and retry when needed.
+    for (;;)
+    {
+        status = U_ZERO_ERROR;
+        des = ualloc(des_capacity);
+        des_len = unorm2_normalize(normalizer, src, src_len,
+                                   des, des_capacity, &status);
+        icu_check(status, "unorm2_normalize");
+        if (des_len < des_capacity)
+        {
+            break;
+        }
+        pfree(des);
+        des_capacity = des_len + 1;
+    }
+
+    des[des_len] = 0;
+    return des;
+}
+
+/* Decode nbytes of UTF-8 into a palloc'd, NUL-terminated UChar string. */
+static UChar *utf8_to_ustr(const char *bytes, int32_t nbytes)
+{
+    //UTF-16 never needs more code units than UTF-8 needs bytes.
+    int32_t des_capacity = nbytes + 1;
+    int32_t des_len = 0;
+    UChar *des = ualloc(des_capacity);
+    UErrorCode status = U_ZERO_ERROR;
+
+    u_strFromUTF8(des, des_capacity, &des_len, bytes, nbytes, &status);
+    icu_check(status, "u_strFromUTF8");
+
+    des[des_len] = 0;
+    return des;
+}
+
+/* Encode src as UTF-8 in a newly palloc'd text value. */
+static text *ustr_to_text(const UChar *src, int32_t src_len)
+{
+    int32_t nbytes = 0;
+    text *result;
+    UErrorCode status = U_ZERO_ERROR;
+
+    //Measure first; the buffer overflow status of this call is expected.
+    u_strToUTF8(NULL, 0, &nbytes, src, src_len, &status);
+    icu_check(status, "u_strToUTF8");
+
+    result = (text *) palloc(nbytes + VARHDRSZ);
+    SET_VARSIZE(result, nbytes + VARHDRSZ);
+
+    //A text value carries its length, so no terminator is written.
+    status = U_ZERO_ERROR;
+    u_strToUTF8(VARDATA(result), nbytes, NULL, src, src_len, &status);
+    icu_check(status, "u_strToUTF8");
+
+    return result;
+}
+
+/*
+ * Additional substitutions (NACO 3.6): expand U+00C6, U+00DE and U+0152
+ * to AE, TH and OE, map U+0110, U+00D0, U+00D8, U+0141 and U+2113 to
+ * D, D, O, L and l, and delete U+02BB, U+02BC and square brackets.
+ * Apostrophes are deleted as well unless is_search is set.  nustr is
+ * consumed; the returned string replaces it.
+ */
+UChar *additional_substitutions(UChar *nustr, int is_search)
+{
+    const char *search_list;
+
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    nustr = replace_ascii("\\x{00C6}", "AE", nustr, 1);
+    nustr = replace_ascii("\\x{00DE}", "TH", nustr, 1);
+    nustr = replace_ascii("\\x{0152}", "OE", nustr, 1);
+
+    if (is_search)
+    {
+        search_list = "\\x{0110}\\x{00D0}\\x{00D8}\\x{0141}\\x{2113}\\x{02BB}\\x{02BC}][";
+    }
+    else
+    {
+        search_list = "\\x{0110}\\x{00D0}\\x{00D8}\\x{0141}\\x{2113}\\x{02BB}\\x{02BC}]['";
+    }
+
+    //Only the first five entries have a replacement; the rest are deleted.
+    return transliterate_escaped(search_list, "DDOLl", nustr);
+}
+
+/*
+ * Strip control, format, private-use, surrogate, modifier-letter and mark
+ * characters.  When usf starts with 'a', the first comma is then turned
+ * into the placeholder U+0007, which replace_placehoders turns back into
+ * a comma, unless that comma is the last character of the string.  nustr
+ * is consumed; the returned string replaces it.
+ */
+UChar *transformations_on_unicode(UChar *nustr, UChar *usf)
+{
+    UChar *comma;
+
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    nustr = replace_ascii("[\\p{Cc}\\p{Cf}\\p{Co}\\p{Cs}\\p{Lm}\\p{Mc}\\p{Me}\\p{Mn}]",
+                          "",
+                          nustr,
+                          1);
+
+    if (usf != NULL && usf[0] == 0x61) //0x61 == 'a' in utf16
+    {
+        comma = u_strchr(nustr, 0x2c); //0x2c == ',' in utf16
+
+        //Test against the string as it is now; stripping may have shortened it.
+        if (comma != NULL && comma[1] != 0)
+        {
+            *comma = 0x07;
+        }
+    }
+
+    return nustr;
+}
+
+/*
+ * Replace punctuation, symbols and separators with spaces while keeping
+ * + & @ U+266D U+266F # and a comma marked by transformations_on_unicode.
+ * The kept characters are parked on the control characters U+0001..U+0007
+ * (normalize strips those from the input beforehand) and restored
+ * afterwards.  nustr is consumed; the returned string replaces it.
+ */
+UChar *replace_placehoders(UChar *nustr)
+{
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    nustr = transliterate_escaped("+&@\\x{266D}\\x{266F}#",
+                                  "\\x01\\x02\\x03\\x04\\x05\\x06",
+                                  nustr);
+
+    //Substitute a space rather than nothing: \p{Zs} also matches the blanks
+    //between words, so deleting every match would run the words together.
+    nustr = replace_ascii("[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\p{Pi}\\p{Po}\\p{Ps}\\p{Sk}\\p{Sm}\\p{So}\\p{Zl}\\p{Zp}\\p{Zs}]",
+                          " ",
+                          nustr,
+                          1);
+
+    nustr = transliterate_escaped("\\x01\\x02\\x03\\x04\\x05\\x06\\x07",
+                                  "+&@\\x{266D}\\x{266F}#,",
+                                  nustr);
+
+    return nustr;
+}
+
+/*
+ * Map the decimal digits of the scripts listed below to ASCII 0-9.  The
+ * string is modified in place and returned.
+ */
+UChar *decimal_digits(UChar *nustr)
+{
+    //The digit zero of each range; the digits 1-9 follow it directly.
+    static const UChar zero_digits[] = {
+        0x0660, 0x06F0, 0x07C0, 0x0966, 0x09E6, 0x0A66, 0x0AE6, 0x0B66,
+        0x0BE6, 0x0C66, 0x0CE6, 0x0D66, 0x0E50, 0x0ED0, 0x0F20, 0x1040,
+        0x1090, 0x17E0, 0x1810, 0x1946, 0x19D0, 0x1A80, 0x1A90, 0x1B50,
+        0x1BB0, 0x1C40, 0x1C50, 0xA620, 0xA8D0, 0xA900, 0xA9D0, 0xAA50,
+        0xABF0, 0xFF10
+    };
+    const int32_t zero_count = (int32_t) (sizeof(zero_digits) / sizeof(zero_digits[0]));
+    UChar *cur_pos;
+    int32_t i;
+
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    for (cur_pos = nustr; *cur_pos != 0; cur_pos++)
+    {
+        for (i = 0; i < zero_count; i++)
+        {
+            if (*cur_pos >= zero_digits[i] && *cur_pos <= zero_digits[i] + 9)
+            {
+                *cur_pos = (UChar) (0x30 + (*cur_pos - zero_digits[i]));
+                break;
+            }
+        }
+    }
+
+    return nustr;
+}
+
+/*
+ * Collapse every run of whitespace into one space, then drop leading and
+ * trailing whitespace.  nustr is consumed; the returned string replaces it.
+ */
+UChar *leading_trailing_spaces(UChar *nustr)
+{
+    if (nustr == NULL)
+    {
+        return NULL;
+    }
+
+    nustr = replace_ascii("\\s+", " ", nustr, 1);
+    nustr = replace_ascii("^\\s+", "", nustr, 0);
+    nustr = replace_ascii("\\s+$", "", nustr, 1);
+
+    return nustr;
+}
+
+/*
+ * Normalize str for naco_normalize (is_search == 0) or search_normalize
+ * (is_search != 0).  Of sf only the first character is examined (see
+ * transformations_on_unicode).  Both arguments are read as UTF-8 and left
+ * untouched.  Returns a newly palloc'd text value, or NULL if either
+ * argument is NULL; an ICU failure is raised as a PostgreSQL error.
+ *
+ * Apply NACO normalization to input string; based on
+ * http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf
+ *
+ * Note that unlike a strict reading of the NACO normalization rules,
+ * output is returned as lowercase instead of uppercase for compatibility
+ * with previous versions of the Evergreen naco_normalize routine.
+ */
+text *normalize(text *str, text *sf, int is_search)
+{
+    UChar *ustr, *nustr, *usf, *lower;
+    text *result;
+
+    if (str == NULL || sf == NULL)
+    {
+        return NULL;
+    }
+
+    ustr = utf8_to_ustr(VARDATA_ANY(str), (int32_t) VARSIZE_ANY_EXHDR(str));
+    usf = utf8_to_ustr(VARDATA_ANY(sf), (int32_t) VARSIZE_ANY_EXHDR(sf));
+
+    //Convert to upper-case first; even though final output will be lowercase,
+    //doing this will ensure that the German eszett (U+00DF) and certain
+    //ligatures (such as U+FB00 and U+FB01) will be handled correctly.
+    nustr = u_strcase(ustr, u_strlen(ustr), 1);
+    pfree(ustr);
+
+    //Drop everything between U+0098 and U+009C, markers included.
+    nustr = replace_ascii("\\x{0098}.*?\\x{009C}", "", nustr, 1);
+
+    ustr = u_strnfkd(nustr, u_strlen(nustr));
+    pfree(nustr);
+    nustr = ustr;
+
+    //additional substitutions - 3.6.
+    nustr = additional_substitutions(nustr, is_search);
+
+    //transformations based on Unicode category codes
+    nustr = transformations_on_unicode(nustr, usf);
+
+    //since we've stripped out the control characters, we can now
+    //use a few as placeholders temporarily
+    nustr = replace_placehoders(nustr);
+
+    //decimal digit
+    nustr = decimal_digits(nustr);
+
+    //intentionally skipping step 8 of the NACO algorithm; if the string
+    //gets normalized away, that's fine.
+
+    //leading and trailing spaces
+    nustr = leading_trailing_spaces(nustr);
+
+    lower = u_strcase(nustr, u_strlen(nustr), 0);
+    result = ustr_to_text(lower, u_strlen(lower));
+
+    pfree(lower);
+    pfree(nustr);
+    pfree(usf);
+
+    return result;
+}
+
+PG_FUNCTION_INFO_V1(naco_normalize);
+
+/* SQL function naco_normalize(text, text): normalize with is_search = 0. */
+Datum naco_normalize(PG_FUNCTION_ARGS)
+{
+    text *str;
+    text *sf;
+
+    //Declared STRICT in SQL, but guard anyway before detoasting.
+    if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
+    {
+        PG_RETURN_NULL();
+    }
+
+    str = PG_GETARG_TEXT_P(0);
+    sf = PG_GETARG_TEXT_P(1);
+
+    PG_RETURN_TEXT_P(normalize(str, sf, 0));
+}
+
+PG_FUNCTION_INFO_V1(search_normalize);
+
+/* SQL function search_normalize(text, text): normalize with is_search = 1. */
+Datum search_normalize(PG_FUNCTION_ARGS)
+{
+    text *str;
+    text *sf;
+
+    //Declared STRICT in SQL, but guard anyway before detoasting.
+    if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
+    {
+        PG_RETURN_NULL();
+    }
+
+    str = PG_GETARG_TEXT_P(0);
+    sf = PG_GETARG_TEXT_P(1);
+
+    PG_RETURN_TEXT_P(normalize(str, sf, 1));
+}
-- 
2.11.0