An optimized version of the normalize function in C. user/dsy/normalize_in_c
author Swenyu Duan <dsy@sina.com>
Thu, 5 Jul 2012 13:25:48 +0000 (09:25 -0400)
committer Swenyu Duan <dsy@sina.com>
Thu, 5 Jul 2012 13:43:47 +0000 (09:43 -0400)
In the previous version of the normalize function, unorm2_getInstance was
called every time normalize was called. That function reads a very large
data file into memory, which costs a great deal of time.
In the new implementation, the call is moved into a _PG_fini
function, which will be called when the extension's shared library is loaded
into memory. This way, the initialized normalizer can be shared
as a global variable instead of being loaded on each normalize operation.
In addition, some unnecessary memory allocations have been removed.
Signed-off-by: Swenyu Duan <dsy@sina.com>
Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c

index a2128bb..82422be 100755 (executable)
 PG_MODULE_MAGIC;
 #endif
 
+UNormalizer2 *normalizer;
+
 static int32_t regexp_expand_string(const UChar *src,
                                     int32_t src_len,
                                     UChar *des,
                                     int32_t des_capacity)
 {
     int des_len;
-    UChar s, e;
+    UChar s;
     const UChar *cur_pos;
     const UChar *pre_pos;
 
@@ -88,7 +90,7 @@ static int32_t regexp_transliterate(const UChar *search_list,
                                                                        UChar *des,
                                                                        int32_t des_capacity)
 {
-       int i, j;
+       int i;
        int32_t des_len;
        UChar *cur_pos;
 
@@ -108,7 +110,7 @@ static int32_t regexp_transliterate(const UChar *search_list,
                }
                des_len++;
 
-               if(cur_pos = u_strchr(search_list, src[i]))
+               if((cur_pos = u_strchr(search_list, src[i])))
                {
                        if(cur_pos - search_list > replacement_list_len)
                                des_len--;
@@ -290,26 +292,32 @@ static UChar *u_strreplace(UChar *regexp,
                                                         replacement_len,
                                                         str,
                                                         str_len,
-                                                        NULL,
-                                                        0,
+                                                        str,
+                                                        str_len,
                                                         is_global);
 
-       des = palloc((des_len + 1)* sizeof(UChar));
-
-       des_len = regexp_replace(regexp,
-                                                       regexp_len,
-                                                       replacement,
-                                                       replacement_len,
-                                                       str,
-                                                       str_len,
-                                                       des,
-                                                       des_len,
-                                                       is_global);
-       if(des != NULL)
-               des[des_len] = '\0';
-
-       pfree(str);
-       return des;
+       if (des_len > str_capacity)
+       {
+               pfree(str);
+
+               des = palloc((des_len + 1)* sizeof(UChar));
+
+               des_len = regexp_replace(regexp,
+                       regexp_len,
+                       replacement,
+                       replacement_len,
+                       str,
+                       str_len,
+                       des,
+                       des_len,
+                       is_global);
+               if(des != NULL)
+                       des[des_len] = '\0';
+               
+               return des;
+       }
+       
+       return str;
 }
 
 static UChar *additional_substitutions(UChar *nustr, int is_search)
@@ -522,26 +530,32 @@ static UChar *leading_trailing_spaces(UChar * nustr)
 
 }
 
+void _PG_fini() /* Fetches an ICU normalizer once and stores it in the file-level `normalizer` global. NOTE(review): the commit message wants this to run when the library is loaded; PostgreSQL's load-time hook is conventionally _PG_init, while _PG_fini is the unload hook -- confirm the intended name. */
+{
+       UErrorCode err = 0;
+       normalizer = unorm2_getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, &err); /* "nfkc" data in UNORM2_DECOMPOSE mode; stands in for the unorm2_getNFKDInstance() call this diff removes from normalize() */
+       if (U_FAILURE(err))
+       {
+               normalizer = NULL; /* on ICU failure, leave the global NULL rather than keep the returned pointer */
+       }
+       return;
+}
+
 text *normalize(text *str, text *sf, int is_search)
 {
        UChar *ustr = NULL, *nustr = NULL, *temp = NULL, *usf = NULL;
        int32_t nustr_len, temp_len, str_len, sf_len, ustr_len;
-       UNormalizer2 *normalizer;
-       char *regexp, *result;
+       char *result;
        char *s;
        UChar uregexp[200], replacement[200];
        UErrorCode err = 0;
-
+       
        if (str == NULL || sf == NULL)
        {
                return NULL;
        }
 
-       normalizer = unorm2_getNFKDInstance(&err);
-       if (U_FAILURE(err))
-       {
-               return NULL;
-       }
+       
 
        s = VARDATA(str);