From 70b7c16010afef16a75eceb5ba1aa51143ba4057 Mon Sep 17 00:00:00 2001 From: Swenyu Duan Date: Thu, 5 Jul 2012 09:25:48 -0400 Subject: [PATCH] An optimized version of the normalize function in C. In the previous version of the normalize function, the ICU normalizer instance was fetched (unorm2_getNFKDInstance) every time normalize was called. That lookup reads some really large files into memory, which consumes a lot of time. In the new implementation, the lookup (now unorm2_getInstance) is moved into the _PG_fini function, which is meant to be called when the extension's shared library is loaded into memory. This way, the initialized normalizer can be shared as a global variable instead of being loaded in each normalize operation. In addition, some unnecessary memory allocations are also removed. Signed-off-by: Swenyu Duan --- .../sql/Pg/extensions/normalize.functions_in_c.c | 72 +++++++++++++--------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c b/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c index a2128bba63..82422be685 100755 --- a/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c +++ b/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c @@ -23,13 +23,15 @@ PG_MODULE_MAGIC; #endif +UNormalizer2 *normalizer; + static int32_t regexp_expand_string(const UChar *src, int32_t src_len, UChar *des, int32_t des_capacity) { int des_len; - UChar s, e; + UChar s; const UChar *cur_pos; const UChar *pre_pos; @@ -88,7 +90,7 @@ static int32_t regexp_transliterate(const UChar *search_list, UChar *des, int32_t des_capacity) { - int i, j; + int i; int32_t des_len; UChar *cur_pos; @@ -108,7 +110,7 @@ static int32_t regexp_transliterate(const UChar *search_list, } des_len++; - if(cur_pos = u_strchr(search_list, src[i])) + if((cur_pos = u_strchr(search_list, src[i]))) { if(cur_pos - search_list > replacement_list_len) des_len--; @@ -290,26 +292,32 @@ static UChar *u_strreplace(UChar *regexp, 
replacement_len, str, str_len, - NULL, - 0, + str, + str_len, is_global); - des = palloc((des_len + 1)* sizeof(UChar)); - - des_len = regexp_replace(regexp, - regexp_len, - replacement, - replacement_len, - str, - str_len, - des, - des_len, - is_global); - if(des != NULL) - des[des_len] = '\0'; - - pfree(str); - return des; + if (des_len > str_capacity) + { + pfree(str); + + des = palloc((des_len + 1)* sizeof(UChar)); + + des_len = regexp_replace(regexp, + regexp_len, + replacement, + replacement_len, + str, + str_len, + des, + des_len, + is_global); + if(des != NULL) + des[des_len] = '\0'; + + return des; + } + + return str; } static UChar *additional_substitutions(UChar *nustr, int is_search) @@ -522,26 +530,32 @@ static UChar *leading_trailing_spaces(UChar * nustr) } +void _PG_fini() +{ + UErrorCode err = 0; + normalizer = unorm2_getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, &err); + if (U_FAILURE(err)) + { + normalizer = NULL; + } + return; +} + text *normalize(text *str, text *sf, int is_search) { UChar *ustr = NULL, *nustr = NULL, *temp = NULL, *usf = NULL; int32_t nustr_len, temp_len, str_len, sf_len, ustr_len; - UNormalizer2 *normalizer; - char *regexp, *result; + char *result; char *s; UChar uregexp[200], replacement[200]; UErrorCode err = 0; - + if (str == NULL || sf == NULL) { return NULL; } - normalizer = unorm2_getNFKDInstance(&err); - if (U_FAILURE(err)) - { - return NULL; - } + s = VARDATA(str); -- 2.11.0