Add some new functions to extension c_functions. user/dsy/extension_in_c
authorSwenyu Duan <dsy@sina.com>
Mon, 30 Jul 2012 02:42:47 +0000 (22:42 -0400)
committerSwenyu Duan <dsy@sina.com>
Mon, 30 Jul 2012 02:48:20 +0000 (22:48 -0400)
The new functions are oils_xslt_process, vandelay.add_field and vandelay.strip_field. These functions were previously implemented in plperlu.
The C implementation of these functions relies on ICU4C, libxml2 and libxslt. To install the new functions, just create the extension
c_functions as before.
None of these functions has been tested yet, so they may contain errors. I will test them soon.

Signed-off-by: Swenyu Duan <dsy@sina.com>
Open-ILS/src/sql/Pg/extensions/c_functions--1.0.sql
Open-ILS/src/sql/Pg/extensions/makefile
Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c
Open-ILS/src/sql/Pg/extensions/vandelay.functions_in_c.c [new file with mode: 0755]
Open-ILS/src/sql/Pg/extensions/xml.functions_in_c.c [new file with mode: 0755]

index 5f3049d..b02b6c8 100755 (executable)
@@ -1,3 +1,15 @@
+CREATE OR REPLACE FUNCTION vandelay.add_field( TEXT, TEXT, TEXT, INT) RETURNS TEXT
+       AS 'c_functions.so', 'vandelay_add_field'
+       LANGUAGE C STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION vandelay.strip_field( TEXT, TEXT) RETURNS TEXT
+       AS 'c_functions.so', 'vandelay_strip_field'
+       LANGUAGE C STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION oils_xslt_process( TEXT, TEXT) RETURNS TEXT
+       AS 'c_functions.so', 'oils_xslt_process'
+       LANGUAGE C STRICT IMMUTABLE;
+
 CREATE OR REPLACE FUNCTION public.search_normalize( TEXT, TEXT ) RETURNS TEXT 
        AS 'c_functions.so', 'search_normalize'
        LANGUAGE C STRICT IMMUTABLE;    
index 8aa4ad3..61af73e 100644 (file)
@@ -3,7 +3,7 @@ EXTENSION = c_functions
 SHLIB_LINK = -licutu -licuuc -licuio -licui18n -licule -liculx -licudata
 #PG_CPPFLAGS = -L/usr/lib -licuuc -licuio -licui18n -licule -liculx -licudata
 DATA = c_functions--1.0.sql
-OBJS = normalize.functions_in_c.o
+OBJS = normalize.functions_in_c.o xml.functions_in_c.o vandelay.functions_in_c.o
 
 PG_CONFIG = pg_config
 PGXS := $(shell $(PG_CONFIG) --pgxs)
index 26e48e0..682502a 100755 (executable)
@@ -94,7 +94,8 @@ static int32_t regexp_transliterate(const UChar *search_list,
 \r
 static int32_t regexp_replace(const UChar *regexp,\r
                               int32_t regexp_len,\r
-                              const UChar *replacement,\r
+                             int flag,\r
+                             const UChar *replacement,\r
                               int32_t replacement_len,\r
                               UChar *src,\r
                               int32_t src_len,\r
@@ -112,7 +113,7 @@ static int32_t regexp_replace(const UChar *regexp,
         return 0;\r
     }\r
 \r
-    regular_exp = uregex_open(regexp, regexp_len, 0, &pe, &status);\r
+    regular_exp = uregex_open(regexp, regexp_len, flag, &pe, &status);\r
     if (regular_exp == NULL)\r
     {\r
         return 0;\r
@@ -167,10 +168,10 @@ static int32_t regexp_replace(const UChar *regexp,
     return len;\r
 }\r
 \r
-static UChar *u_strtransliterate(UChar *search_list,\r
-                                 UChar *replacement_list,\r
-                                 UChar *str,\r
-                                 int32_t str_capacity)\r
+ UChar *u_strtransliterate(UChar *search_list,\r
+                           UChar *replacement_list,\r
+                           UChar *str,\r
+                           int32_t str_capacity)\r
 {\r
     int32_t search_list_len, replacement_list_len, str_len;\r
     UChar *des;\r
@@ -207,11 +208,12 @@ static UChar *u_strtransliterate(UChar *search_list,
     return des;\r
 }\r
 \r
-static UChar *u_strreplace(UChar *regexp,\r
-                           UChar *replacement,\r
-                           UChar *str,\r
-                           int32_t str_capacity,\r
-                           int is_global)\r
+UChar *u_strreplace(UChar *regexp,\r
+                                                               int flag,\r
+                    UChar *replacement,\r
+                    UChar *str,\r
+                    int32_t str_capacity,\r
+                    int is_global)\r
 {\r
     int32_t regexp_len, replacement_len, str_len;\r
     UChar *des;\r
@@ -228,7 +230,8 @@ static UChar *u_strreplace(UChar *regexp,
 \r
     des_len = regexp_replace(regexp,\r
                              regexp_len,\r
-                             replacement,\r
+                             flag,\r
+                            replacement,\r
                              replacement_len,\r
                              str,\r
                              str_len,\r
@@ -240,6 +243,7 @@ static UChar *u_strreplace(UChar *regexp,
 \r
     des_len = regexp_replace(regexp,\r
                             regexp_len,\r
+                           flag,\r
                             replacement,\r
                             replacement_len,\r
                             str,\r
@@ -265,19 +269,19 @@ UChar *additional_substitutions(UChar *nustr, int is_search)
     u_uastrncpy(uregexp, regexp, strlen(regexp));\r
     u_uastrncpy(replacement, "AE", strlen("AE"));\r
     \r
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
+    nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1);\r
 \r
     regexp = "\\x{00DE}";\r
     u_uastrncpy(uregexp, regexp, strlen(regexp));\r
     u_uastrncpy(replacement, "TH", strlen("TH"));\r
     \r
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
+    nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1);\r
 \r
     regexp = "\\x{0152}";\r
     u_uastrncpy(uregexp, regexp, strlen(regexp));\r
     u_uastrncpy(replacement, "OE", strlen("OE"));\r
 \r
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
+    nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1);\r
 \r
     if (is_search)\r
     {\r
@@ -291,7 +295,7 @@ UChar *additional_substitutions(UChar *nustr, int is_search)
     u_uastrncpy(uregexp, regexp, strlen(regexp));\r
     u_uastrncpy(replacement, "OE", strlen("OE"));\r
     \r
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
+    nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1);\r
 \r
     regexp = "\\x{0110}\\x{00D0}\\x{00D8}\\x{0141}\\x{2113}\\x{02BB}\\x{02BC}]['";\r
     u_uastrncpy(uregexp, regexp, strlen(regexp));\r
@@ -320,7 +324,7 @@ UChar *transformations_on_unicode(UChar *nustr, UChar *usf)
     u_uastrncpy(uregexp, regexp, strlen(regexp));\r
     u_uastrncpy(replacement, "", strlen(""));\r
     \r
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
+    nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1);\r
 \r
     if (usf != NULL && usf[0] == 0x61) //0x61 == 'a' in utf16\r
     {\r
@@ -334,7 +338,7 @@ UChar *transformations_on_unicode(UChar *nustr, UChar *usf)
                 replacement[0] = 0x7;\r
                 replacement[1] = 0;\r
 \r
-                nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 0);\r
+                nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 0);\r
             }\r
         }\r
     }\r
@@ -365,7 +369,7 @@ UChar *replace_placehoders(UChar *nustr)
                strlen( "[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\\\p{Pi}\\p{Po}\\p{Ps}\\p{Sk}\\p{Sm}\\p{So}\\p{Zl}\\p{Zp}\\p{Zs}]"));\r
     u_uastrncpy(replacement, "", strlen(""));\r
 \r
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
+    nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1);\r
 \r
     u_uastrncpy(uregexp,\r
                 "\\x01\\x02\\x03\\x04\\x05\\x06\\x07",\r
@@ -430,15 +434,15 @@ UChar *leading_trailing_spaces(UChar * nustr)
     u_uastrncpy(uregexp, "\\s+",       strlen( "\\s+"));\r
     u_uastrncpy(replacement, " ", strlen(" "));\r
 \r
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
+    nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1);\r
     \r
     u_uastrncpy(uregexp, "^\\s+",      strlen( "^\\s+"));\r
     u_uastrncpy(replacement, "", strlen(""));\r
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 0);\r
+    nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 0);\r
 \r
     u_uastrncpy(uregexp, "\\s+$",      strlen( "\\s+$"));\r
 \r
-    nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
+    nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1);\r
 \r
     return nustr;\r
 \r
@@ -518,7 +522,7 @@ text *normalize(text *str, text *sf, int is_search)
     u_uastrncpy(uregexp, regexp, strlen(regexp));\r
     u_uastrncpy(replacement, "", strlen(""));\r
 \r
-    ustr = u_strreplace(uregexp, replacement, ustr, u_strlen(ustr), 1);\r
+    ustr = u_strreplace(uregexp, 0, replacement, ustr, u_strlen(ustr), 1);\r
 \r
     unorm2_normalize(normalizer, ustr, VARSIZE(str), nustr, VARSIZE(str), &err);\r
     if (U_FAILURE(err))\r
diff --git a/Open-ILS/src/sql/Pg/extensions/vandelay.functions_in_c.c b/Open-ILS/src/sql/Pg/extensions/vandelay.functions_in_c.c
new file mode 100755 (executable)
index 0000000..4491741
--- /dev/null
@@ -0,0 +1,783 @@
+#include "postgres.h"
+#include "fmgr.h"
+
+#include "unicode/unorm2.h"
+#include "unicode/utypes.h"
+#include "unicode/ustring.h"
+#include "unicode/uregex.h"
+#include "unicode/umachine.h"
+#include "libxml2/libxml/parser.h"
+#include "libxslt/xslt.h"
+#include "libxslt/xsltInternals.h"
+#include "libxslt/transform.h"
+
+
+/************************************************************************
+*  C Implementation: vandelay.add_field vandelay.strip_field
+ *
+ *  Description:
+ *    This file implements vandelay.add_field and vandelay.strip_field.
+ *  These two functions are included in the PostgreSQL extension c_functions.
+ *  ICU4C, libxml2 and the PostgreSQL libraries are needed to build this file.
+ *    The replace functions in normalize.functions_in_c.c are also used in this
+ *  module, so the two files have to be linked together.
+ *    This file builds up simple procedures for handling MARC in C.
+ *
+ *  Author: Swenyu Duan <dsy88@sina.com>, (C) 2012
+ *
+ *  Copyright: See COPYING file that comes with this distribution.
+ *
+************************************************************************/
+UChar *u_strtransliterate(UChar *search_list,
+                        UChar *replacement_list,
+                        UChar *str,
+                        int32_t str_capacity);
+
+UChar *u_strreplace(UChar *regexp,
+                  int flag,
+                  UChar *replacement,
+                  UChar *str,
+                  int32_t str_capacity,
+                  int is_global);
+
+/*
+ * Convert a NUL-terminated UTF-16 string into a freshly palloc'd,
+ * NUL-terminated UTF-8 string.  Returns NULL when the conversion fails.
+ */
+static xmlChar *marc_add_subfield_to_utf8(const UChar *src)
+{
+    UErrorCode err = U_ZERO_ERROR;
+    int32_t src_len = u_strlen(src);
+    int32_t utf8_len = 0;
+    char *utf8;
+
+    /* Preflight: only the required length is wanted here. */
+    u_strToUTF8(NULL, 0, &utf8_len, src, src_len, &err);
+    if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR)
+    {
+        return NULL;
+    }
+
+    /*
+     * The preflight call reports U_BUFFER_OVERFLOW_ERROR.  ICU functions
+     * return immediately when handed a failure code, so it has to be reset
+     * before the real conversion.
+     */
+    err = U_ZERO_ERROR;
+    utf8 = palloc((utf8_len + 1) * sizeof(char));
+    u_strToUTF8(utf8, utf8_len + 1, NULL, src, src_len, &err);
+    if (U_FAILURE(err))
+    {
+        pfree(utf8);
+        return NULL;
+    }
+
+    return (xmlChar *)utf8;
+}
+
+/*
+ * marc_add_subfield
+ *
+ * Converts subfield_name and content from UTF-16 to UTF-8, builds a new node
+ * from them and appends it to cur_node.
+ *
+ *   cur_node       node that receives the new child
+ *   subfield_name  NUL-terminated UTF-16 name of the new node
+ *   content        NUL-terminated UTF-16 content of the new node
+ *
+ * Always returns 0.  Nothing is added when an argument is NULL, when a
+ * conversion fails or when the node cannot be created.
+ *
+ * NOTE(review): the node is created with xmlNewPI(), i.e. as a processing
+ * instruction, whereas marc_get_field() only inspects element nodes.
+ * Confirm whether an element node was intended here.
+ */
+int marc_add_subfield(xmlNode *cur_node, UChar *subfield_name, UChar *content)
+{
+    xmlNode *new_node;
+    xmlChar *xml_subfield_name, *xml_content;
+
+    if (cur_node == NULL || subfield_name == NULL || content == NULL)
+    {
+        return 0;
+    }
+
+    xml_subfield_name = marc_add_subfield_to_utf8(subfield_name);
+    if (xml_subfield_name == NULL)
+    {
+        return 0;
+    }
+
+    xml_content = marc_add_subfield_to_utf8(content);
+    if (xml_content == NULL)
+    {
+        pfree(xml_subfield_name);
+        return 0;
+    }
+
+    new_node = xmlNewPI(xml_subfield_name, xml_content);
+    if (new_node != NULL && xmlAddChild(cur_node, new_node) == NULL)
+    {
+        /* The node was not linked into the tree, so it is still ours. */
+        xmlFreeNode(new_node);
+    }
+
+    /* xmlNewPI() keeps its own copies of both strings. */
+    pfree(xml_subfield_name);
+    pfree(xml_content);
+
+    return 0;
+}
+
+/*
+ * compare_field_name
+ *
+ * Compares a NUL-terminated UTF-16 name with a NUL-terminated UTF-8 name.
+ *
+ *   field_name      UTF-16 name
+ *   xml_field_name  UTF-8 name, as stored in an xmlNode
+ *
+ * Returns the u_strcmp() result of field_name against the converted
+ * xml_field_name, so 0 means the names are equal.  Returns 1 (not equal)
+ * when either argument is NULL or when xml_field_name cannot be converted,
+ * so that a missing name is never reported as a match.
+ */
+int compare_field_name(const UChar *field_name, const xmlChar *xml_field_name)
+{
+    UChar *temp_field_name;
+    int32_t temp_field_name_len = 0, xml_field_name_len;
+    UErrorCode err = U_ZERO_ERROR;
+    int ret;
+
+    if (field_name == NULL || xml_field_name == NULL)
+    {
+        return 1;
+    }
+
+    xml_field_name_len = strlen((const char *)xml_field_name);
+
+    /*
+     * A UTF-8 string never needs more UTF-16 code units than it has bytes,
+     * so byte length + 1 is always enough room including the terminator.
+     */
+    temp_field_name = palloc((xml_field_name_len + 1) * sizeof(UChar));
+
+    u_strFromUTF8(temp_field_name,
+                  xml_field_name_len + 1,
+                  &temp_field_name_len,
+                  (const char *)xml_field_name,
+                  xml_field_name_len,
+                  &err);
+    if (U_FAILURE(err))
+    {
+        pfree(temp_field_name);
+        return 1;
+    }
+
+    ret = u_strcmp(field_name, temp_field_name);
+    pfree(temp_field_name);
+
+    return ret;
+}
+
+/*
+ * marc_get_field
+ *
+ * Walks the sibling list starting at head (head itself included) and returns
+ * the first element node for which compare_field_name() returns 0 against
+ * field_name.  Returns NULL when there is no such node.
+ */
+xmlNode *marc_get_field(xmlNode *head, UChar *field_name)
+{
+    xmlNode *node = head;
+
+    while (node != NULL)
+    {
+        /* Only element nodes are candidates; everything else is skipped. */
+        if (node->type == XML_ELEMENT_NODE &&
+            compare_field_name(field_name, node->name) == 0)
+        {
+            return node;
+        }
+        node = node->next;
+    }
+
+    return NULL;
+}
+
+/*
+ * marc_delete_subfield
+ *
+ * Looks up the first node named field_name in the sibling list starting at
+ * head (via marc_get_field), unlinks it from its tree and frees it.
+ * Always returns 1.
+ */
+int marc_delete_subfield(xmlNode *head, UChar *field_name)
+{
+    xmlNode *victim = marc_get_field(head, field_name);
+
+    xmlUnlinkNode(victim);
+    xmlFreeNode(victim);
+
+    return 1;
+}
+
+/*
+ * marc_delete_field
+ *
+ * Unlinks cur_node from its tree and frees it.  Always returns 1.
+ */
+int marc_delete_field(xmlNode *cur_node)
+{
+    xmlNode *doomed = cur_node;
+
+    /* Detach first, then release the detached node. */
+    xmlUnlinkNode(doomed);
+    xmlFreeNode(doomed);
+
+    return 1;
+}
+
+/*
+ * insert_fields_ordered
+ *
+ * Inserts a deep copy of source_node among the children of record's root
+ * element, keeping the element children ordered by name: the copy goes in
+ * front of the first element whose name sorts after source_node's name, or
+ * at the end when there is no such element.
+ *
+ *   record       document that receives the copy
+ *   source_node  node to copy; it is not modified
+ *
+ * Returns 1 when the copy was inserted and 0 otherwise (NULL argument,
+ * document without a root element, or a failed copy/insert).
+ */
+int insert_fields_ordered(xmlDoc *record, xmlNode *source_node)
+{
+    xmlNode *root;
+    xmlNode *cur_node;
+    xmlNode *new_node;
+    xmlNode *linked;
+
+    if (record == NULL || source_node == NULL || source_node->name == NULL)
+    {
+        return 0;
+    }
+
+    root = xmlDocGetRootElement(record);
+    if (root == NULL)
+    {
+        return 0;
+    }
+
+    /* Copy straight into the target document so the copy belongs to it. */
+    new_node = xmlDocCopyNode(source_node, record, 1);
+    if (new_node == NULL)
+    {
+        return 0;
+    }
+
+    /*
+     * Fields are children of the root element.  Adding a sibling to the
+     * root itself would create a second top-level element.
+     */
+    linked = NULL;
+    for (cur_node = root->children; cur_node != NULL; cur_node = cur_node->next)
+    {
+        if (cur_node->type == XML_ELEMENT_NODE &&
+            strcmp((const char *)source_node->name, (const char *)cur_node->name) < 0)
+        {
+            linked = xmlAddPrevSibling(cur_node, new_node);
+            break;
+        }
+    }
+
+    /* No later field found: the new one belongs at the end. */
+    if (cur_node == NULL)
+    {
+        linked = xmlAddChild(root, new_node);
+    }
+
+    if (linked == NULL)
+    {
+        xmlFreeNode(new_node);
+        return 0;
+    }
+
+    return 1;
+}
+
+/*
+ * handle_match
+ *
+ * Splits a match specification of the form "<subfield> ~ <regexp>" at its
+ * first '~' and returns both halves with surrounding white space removed.
+ *
+ *   match    NUL-terminated UTF-16 match specification, may be NULL
+ *   msf      receives the palloc'd, NUL-terminated subfield part, or NULL
+ *   msf_len  receives the length of *msf in UTF-16 code units
+ *   mre      receives the palloc'd, NUL-terminated regexp part, or NULL
+ *   mre_len  receives the length of *mre in UTF-16 code units
+ *
+ * *msf and *mre are both non-NULL only when match contains a '~' with
+ * non-empty text on each side; otherwise both are NULL with length 0.
+ * Always returns 1.
+ */
+int handle_match(UChar *match, UChar **msf, int32_t *msf_len, UChar **mre, int32_t *mre_len)
+{
+    static const char split_pattern[] = "^\\s*(.*?)\\s*~\\s*(.*?)\\s*$";
+    UErrorCode err = U_ZERO_ERROR;
+    URegularExpression *regexp;
+    UParseError pe;
+    UChar uregexp[200];
+    UChar *msf_t = NULL, *mre_t = NULL;
+    int32_t msf_t_len = 0, mre_t_len = 0;
+    int32_t match_len;
+
+    if (match != NULL && match[0] != 0)
+    {
+        match_len = u_strlen(match);
+
+        u_uastrncpy(uregexp, split_pattern, sizeof(split_pattern));
+        regexp = uregex_open(uregexp, u_strlen(uregexp), 0, &pe, &err);
+        if (regexp != NULL)
+        {
+            uregex_setText(regexp, match, match_len, &err);
+
+            if (uregex_matches(regexp, 0, &err))
+            {
+                /* A group can never be longer than the whole input. */
+                msf_t = palloc((match_len + 1) * sizeof(UChar));
+                mre_t = palloc((match_len + 1) * sizeof(UChar));
+
+                msf_t_len = uregex_group(regexp, 1, msf_t, match_len + 1, &err);
+                mre_t_len = uregex_group(regexp, 2, mre_t, match_len + 1, &err);
+
+                if (U_FAILURE(err) || msf_t_len <= 0 || mre_t_len <= 0)
+                {
+                    pfree(msf_t);
+                    pfree(mre_t);
+                    msf_t = NULL;
+                    mre_t = NULL;
+                    msf_t_len = 0;
+                    mre_t_len = 0;
+                }
+            }
+
+            /* Close only after every group has been copied out. */
+            uregex_close(regexp);
+        }
+    }
+
+    *msf = msf_t;
+    *mre = mre_t;
+    *msf_len = msf_t_len;
+    *mre_len = mre_t_len;
+
+    return 1;
+}
+
+/*
+ * Copy capture group group_num of the current match into a freshly
+ * palloc'd, NUL-terminated buffer.  A group that did not take part in the
+ * match, or that cannot be copied, yields an empty string.
+ */
+static UChar *handle_field_group(URegularExpression *regexp, int32_t group_num)
+{
+    UErrorCode err = U_ZERO_ERROR;
+    int32_t len;
+    UChar *buf;
+
+    /* Preflight for the length. */
+    len = uregex_group(regexp, group_num, NULL, 0, &err);
+    if ((U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR) || len < 0)
+    {
+        len = 0;
+    }
+
+    buf = palloc((len + 1) * sizeof(UChar));
+
+    /*
+     * The preflight call leaves U_BUFFER_OVERFLOW_ERROR behind.  ICU returns
+     * immediately when handed a failure code, so reset it before copying.
+     */
+    err = U_ZERO_ERROR;
+    uregex_group(regexp, group_num, buf, len + 1, &err);
+    if (U_FAILURE(err))
+    {
+        len = 0;
+    }
+    buf[len] = 0;
+
+    return buf;
+}
+
+/*
+ * Remove every match of pattern from str with u_strreplace().  Empty input
+ * is returned untouched.
+ * NOTE(review): a NULL result from u_strreplace() is treated as "nothing
+ * left" and turned into an empty string; confirm against u_strreplace().
+ */
+static UChar *handle_field_strip(const char *pattern, UChar *str)
+{
+    UChar uregexp[200], replacement[200];
+    UChar *stripped;
+
+    if (str == NULL || str[0] == 0)
+    {
+        return str;
+    }
+
+    u_uastrncpy(uregexp, pattern, strlen(pattern) + 1);
+    u_uastrncpy(replacement, "", sizeof(""));
+
+    stripped = u_strreplace(uregexp, 0, replacement, str, u_strlen(str), 1);
+    if (stripped == NULL)
+    {
+        stripped = palloc(sizeof(UChar));
+        stripped[0] = 0;
+    }
+
+    return stripped;
+}
+
+/*
+ * handle_field
+ *
+ * Extracts the three capture groups of an already matched field
+ * specification (field name, subfield list, match expression) and cleans
+ * them up: all white space is removed from the field name and the subfield
+ * list, leading and trailing white space from the match expression.
+ *
+ *   regexp                 regular expression holding the current match
+ *   f_len                  length of the matched text (not used)
+ *   field_name_result/len  receive group 1 and its length
+ *   sf_result/len          receive group 2 and its length
+ *   match_result/len       receive group 3 and its length
+ *
+ * Every returned string is NUL-terminated, and every length is the length
+ * after cleaning, in UTF-16 code units.  Always returns 1.
+ */
+int handle_field(URegularExpression *regexp,
+                 int32_t f_len,
+                 UChar **field_name_result,
+                 int32_t *field_name_result_len,
+                 UChar **sf_result,
+                 int32_t *sf_result_len,
+                 UChar **match_result,
+                 int32_t *match_result_len)
+{
+    UChar *field_name;
+    UChar *sf;
+    UChar *match;
+
+    (void)f_len;
+
+    field_name = handle_field_group(regexp, 1);
+    field_name = handle_field_strip("\\s+", field_name);
+
+    sf = handle_field_group(regexp, 2);
+    sf = handle_field_strip("\\s+", sf);
+
+    match = handle_field_group(regexp, 3);
+    match = handle_field_strip("^\\s*", match);
+    match = handle_field_strip("\\s*$", match);
+
+    *field_name_result = field_name;
+    *field_name_result_len = u_strlen(field_name);
+    *sf_result = sf;
+    *sf_result_len = u_strlen(sf);
+    *match_result = match;
+    *match_result_len = u_strlen(match);
+
+    return 1;
+}
+
+/*
+ * xmlchar_to_uchar
+ *
+ * Converts src_len bytes of UTF-8 text into a freshly palloc'd,
+ * NUL-terminated UTF-16 string.
+ *
+ *   src      UTF-8 text, may be NULL
+ *   src_len  number of bytes of src to convert
+ *
+ * Returns an empty string when src is NULL or cannot be converted, so the
+ * result can always be handed to u_strlen().
+ */
+UChar *xmlchar_to_uchar(const xmlChar *src, int32_t src_len)
+{
+    UChar *result;
+    int32_t result_len = 0;
+    UErrorCode err = U_ZERO_ERROR;
+
+    if (src != NULL)
+    {
+        /* Preflight for the length. */
+        u_strFromUTF8(NULL, 0, &result_len,
+                      (const char *)src, src_len, &err);
+        if ((U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR) || result_len < 0)
+        {
+            result_len = 0;
+        }
+    }
+
+    result = palloc((result_len + 1) * sizeof(UChar));
+
+    if (result_len > 0)
+    {
+        /*
+         * The preflight call leaves U_BUFFER_OVERFLOW_ERROR behind.  ICU
+         * returns immediately when handed a failure code, so reset it.
+         */
+        err = U_ZERO_ERROR;
+        u_strFromUTF8(result, result_len + 1, NULL,
+                      (const char *)src, src_len, &err);
+        if (U_FAILURE(err))
+        {
+            result_len = 0;
+        }
+    }
+    result[result_len] = 0;
+
+    return result;
+}
+
+
+PG_FUNCTION_INFO_V1(vandelay_add_field);
+
+/*
+ * Tell whether the first child of field named msf has content in which the
+ * regular expression mre (mre_len UTF-16 code units long) finds a match.
+ * Returns FALSE when there is no such child or mre cannot be compiled.
+ */
+static UBool add_field_match_hits(xmlNode *field, UChar *msf, UChar *mre, int32_t mre_len)
+{
+    UErrorCode err = U_ZERO_ERROR;
+    UParseError pe;
+    URegularExpression *regexp;
+    xmlNode *sub;
+    xmlChar *raw;
+    UChar *content;
+    UBool found = FALSE;
+
+    sub = marc_get_field(field->children, msf);
+    if (sub == NULL)
+    {
+        return FALSE;
+    }
+
+    /* The content pointer of an element node is NULL; ask for its text. */
+    raw = xmlNodeGetContent(sub);
+    if (raw == NULL)
+    {
+        return FALSE;
+    }
+
+    regexp = uregex_open(mre, mre_len, 0, &pe, &err);
+    if (regexp != NULL)
+    {
+        content = xmlchar_to_uchar(raw, strlen((char *)raw));
+        uregex_setText(regexp, content, u_strlen(content), &err);
+        found = uregex_find(regexp, 0, &err);
+        uregex_close(regexp);
+    }
+    xmlFree(raw);
+
+    return found;
+}
+
+/*
+ * Serialize doc, drop the XML declaration line, remove every newline and
+ * collapse white space between tags, and return the result as a new text
+ * value.  Returns NULL when any step fails.
+ */
+static text *add_field_serialize(xmlDoc *doc)
+{
+    static const char decl_pattern[] = "^<\\?.+?\\?>$";
+    static const char gap_pattern[] = ">\\s+<";
+    xmlChar *dump = NULL;
+    int dump_len = 0;
+    UChar *uxml;
+    UChar uregexp[200], replacement[200];
+    int32_t uxml_len = 0, out_len = 0;
+    UErrorCode err = U_ZERO_ERROR;
+    text *out;
+
+    xmlDocDumpMemory(doc, &dump, &dump_len);
+    if (dump == NULL)
+    {
+        return NULL;
+    }
+    if (dump_len <= 0)
+    {
+        xmlFree(dump);
+        return NULL;
+    }
+
+    /* UTF-16 never needs more code units than the UTF-8 text has bytes. */
+    uxml = palloc((dump_len + 1) * sizeof(UChar));
+    u_strFromUTF8(uxml, dump_len + 1, &uxml_len, (char *)dump, dump_len, &err);
+    xmlFree(dump);
+    if (U_FAILURE(err))
+    {
+        return NULL;
+    }
+
+    u_uastrncpy(uregexp, decl_pattern, sizeof(decl_pattern));
+    u_uastrncpy(replacement, "", sizeof(""));
+    uxml = u_strreplace(uregexp, UREGEX_MULTILINE, replacement, uxml, u_strlen(uxml), 0);
+    if (uxml == NULL)
+    {
+        return NULL;
+    }
+
+    u_uastrncpy(uregexp, "\n", sizeof("\n"));
+    u_uastrncpy(replacement, "", sizeof(""));
+    uxml = u_strreplace(uregexp, UREGEX_DOTALL, replacement, uxml, u_strlen(uxml), 1);
+    if (uxml == NULL)
+    {
+        return NULL;
+    }
+
+    u_uastrncpy(uregexp, gap_pattern, sizeof(gap_pattern));
+    u_uastrncpy(replacement, "><", sizeof("><"));
+    uxml = u_strreplace(uregexp, UREGEX_DOTALL, replacement, uxml, u_strlen(uxml), 1);
+    if (uxml == NULL)
+    {
+        return NULL;
+    }
+    uxml_len = u_strlen(uxml);
+
+    /* Preflight for the UTF-8 size, then clear the overflow it reports. */
+    err = U_ZERO_ERROR;
+    u_strToUTF8(NULL, 0, &out_len, uxml, uxml_len, &err);
+    if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR)
+    {
+        return NULL;
+    }
+    err = U_ZERO_ERROR;
+
+    out = (text *)palloc(out_len + VARHDRSZ);
+    SET_VARSIZE(out, out_len + VARHDRSZ);
+    u_strToUTF8(VARDATA(out), out_len, NULL, uxml, uxml_len, &err);
+    if (U_FAILURE(err))
+    {
+        return NULL;
+    }
+
+    return out;
+}
+
+/*
+ * vandelay_add_field(target_xml TEXT, source_xml TEXT, field_spec TEXT, force_add INT)
+ *
+ * Copies fields or subfields from source_xml into target_xml as directed by
+ * field_spec and returns the resulting document as text.
+ *
+ * field_spec is a comma separated list of entries.  Each entry is a three
+ * character field name, optionally followed by subfield characters and by
+ * "[<subfield> ~ <regexp>]".
+ *   - Without subfield characters, the first source field with that name is
+ *     inserted into the target.
+ *   - With subfield characters, those subfields of every matching source
+ *     field are added to every matching target field.  When a match
+ *     expression is given, only target fields whose <subfield> content
+ *     matches <regexp> are touched.  When the target has no such field, the
+ *     source field is inserted if force_add is non-zero or a match
+ *     expression was given.
+ *
+ * target_xml is returned unchanged when either document cannot be parsed or
+ * field_spec cannot be processed.
+ *
+ * NOTE(review): fields and subfields are located by XML element name (see
+ * marc_get_field), searching the children of the root element.  MARCXML
+ * keeps tags and codes in attributes; confirm the intended input format.
+ */
+Datum vandelay_add_field(PG_FUNCTION_ARGS)
+{
+    static const char spec_pattern[] = "^(.{3})(\\w*)(?:\\[([^]]*)\\])?$";
+    text *target_xml, *source_xml, *field_spec, *result;
+    int force_add;
+    xmlDoc *source_r = NULL, *target_r = NULL;
+    xmlNode *source_root, *target_root;
+    xmlNode *source_field, *target_field, *cur_node;
+    UChar *field, *field_list, **f_list;
+    UChar uregexp[200], replacement[200];
+    int32_t spec_bytes, field_len = 0, field_list_len = 0, split_cap;
+    int32_t f_num, f_idx, s_idx;
+    UErrorCode err = U_ZERO_ERROR;
+    URegularExpression *regexp;
+    UParseError pe;
+
+    if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) || PG_ARGISNULL(3))
+    {
+        PG_RETURN_NULL();
+    }
+
+    target_xml = PG_GETARG_TEXT_P(0);
+    source_xml = PG_GETARG_TEXT_P(1);
+    field_spec = PG_GETARG_TEXT_P(2);
+    force_add = PG_GETARG_INT32(3);
+    result = target_xml;
+
+    /* VARSIZE() counts the varlena header; the payload is shorter. */
+    source_r = xmlParseMemory(VARDATA(source_xml), VARSIZE(source_xml) - VARHDRSZ);
+    target_r = xmlParseMemory(VARDATA(target_xml), VARSIZE(target_xml) - VARHDRSZ);
+    if (source_r == NULL || target_r == NULL)
+    {
+        goto Fail;
+    }
+
+    source_root = xmlDocGetRootElement(source_r);
+    target_root = xmlDocGetRootElement(target_r);
+    if (source_root == NULL || target_root == NULL)
+    {
+        goto Fail;
+    }
+
+    spec_bytes = VARSIZE(field_spec) - VARHDRSZ;
+    field = palloc((spec_bytes + 1) * sizeof(UChar));
+    u_strFromUTF8(field, spec_bytes + 1, &field_len, VARDATA(field_spec), spec_bytes, &err);
+    if (U_FAILURE(err))
+    {
+        goto Fail;
+    }
+
+    /*
+     * Split the field_spec.  A string of field_len units holds at most
+     * field_len + 1 comma separated entries, and the entries plus their
+     * terminators need at most field_len + 1 units, so one pass with
+     * buffers of field_len + 2 is always enough.
+     */
+    u_uastrncpy(uregexp, ",", sizeof(","));
+    regexp = uregex_open(uregexp, u_strlen(uregexp), 0, &pe, &err);
+    if (regexp == NULL)
+    {
+        goto Fail;
+    }
+    uregex_setText(regexp, field, field_len, &err);
+
+    split_cap = field_len + 2;
+    field_list = palloc(split_cap * sizeof(UChar));
+    f_list = palloc(split_cap * sizeof(UChar *));
+    f_num = uregex_split(regexp, field_list, split_cap,
+                         &field_list_len, f_list, split_cap, &err);
+    uregex_close(regexp);
+    if (U_FAILURE(err))
+    {
+        goto Fail;
+    }
+
+    for (f_idx = 0; f_idx < f_num; f_idx++)
+    {
+        UChar *f = f_list[f_idx];
+        UChar *field_name = NULL, *sf = NULL, *match = NULL;
+        UChar *msf = NULL, *mre = NULL;
+        UChar *sf_list = NULL, **s_list = NULL;
+        int32_t field_name_len = 0, sf_len = 0, match_len = 0;
+        int32_t msf_len = 0, mre_len = 0, s_num = 0;
+        UBool matched, has_match;
+
+        /* A failure code left behind would turn later ICU calls into no-ops. */
+        err = U_ZERO_ERROR;
+
+        if (f == NULL || f[0] == 0)
+        {
+            continue;
+        }
+
+        u_uastrncpy(uregexp, "^\\s*", sizeof("^\\s*"));
+        u_uastrncpy(replacement, "", sizeof(""));
+        f = u_strreplace(uregexp, 0, replacement, f, u_strlen(f), 1);
+        if (f == NULL || f[0] == 0)
+        {
+            continue;
+        }
+
+        u_uastrncpy(uregexp, "\\s*$", sizeof("\\s*$"));
+        u_uastrncpy(replacement, "", sizeof(""));
+        f = u_strreplace(uregexp, 0, replacement, f, u_strlen(f), 1);
+        if (f == NULL || f[0] == 0)
+        {
+            continue;
+        }
+
+        u_uastrncpy(uregexp, spec_pattern, sizeof(spec_pattern));
+        regexp = uregex_open(uregexp, u_strlen(uregexp), 0, &pe, &err);
+        if (regexp == NULL)
+        {
+            continue;
+        }
+        uregex_setText(regexp, f, u_strlen(f), &err);
+
+        matched = uregex_matches(regexp, 0, &err);
+        if (matched)
+        {
+            handle_field(regexp,
+                         u_strlen(f),
+                         &field_name,
+                         &field_name_len,
+                         &sf,
+                         &sf_len,
+                         &match,
+                         &match_len);
+        }
+        uregex_close(regexp);
+
+        if (!matched || field_name == NULL)
+        {
+            continue;
+        }
+
+        /*
+         * Split sf into one string per code unit.  This is done by hand
+         * because uregex_open() rejects an empty pattern.
+         */
+        s_num = (sf != NULL) ? u_strlen(sf) : 0;
+        if (s_num > 0)
+        {
+            sf_list = palloc(2 * s_num * sizeof(UChar));
+            s_list = palloc(s_num * sizeof(UChar *));
+            for (s_idx = 0; s_idx < s_num; s_idx++)
+            {
+                sf_list[2 * s_idx] = sf[s_idx];
+                sf_list[2 * s_idx + 1] = 0;
+                s_list[s_idx] = &sf_list[2 * s_idx];
+            }
+        }
+
+        if (match != NULL && match[0] != 0)
+        {
+            handle_match(match, &msf, &msf_len, &mre, &mre_len);
+        }
+        has_match = (msf != NULL && mre != NULL);
+
+        if (s_num == 0)
+        {
+            /* No subfields named: copy the whole field. */
+            source_field = marc_get_field(source_root->children, field_name);
+            if (source_field != NULL)
+            {
+                insert_fields_ordered(target_r, source_field);
+            }
+            continue;
+        }
+
+        /* Always continue the search after the current node. */
+        for (source_field = marc_get_field(source_root->children, field_name);
+             source_field != NULL;
+             source_field = marc_get_field(source_field->next, field_name))
+        {
+            target_field = marc_get_field(target_root->children, field_name);
+            if (target_field == NULL)
+            {
+                if (force_add || has_match)
+                {
+                    insert_fields_ordered(target_r, source_field);
+                }
+                continue;
+            }
+
+            for (; target_field != NULL;
+                 target_field = marc_get_field(target_field->next, field_name))
+            {
+                if (has_match &&
+                    !add_field_match_hits(target_field, msf, mre, mre_len))
+                {
+                    continue;
+                }
+
+                for (s_idx = 0; s_idx < s_num; s_idx++)
+                {
+                    for (cur_node = marc_get_field(source_field->children, s_list[s_idx]);
+                         cur_node != NULL;
+                         cur_node = marc_get_field(cur_node->next, s_list[s_idx]))
+                    {
+                        xmlChar *raw = xmlNodeGetContent(cur_node);
+                        UChar *name, *content;
+
+                        name = xmlchar_to_uchar(cur_node->name,
+                                                strlen((char *)cur_node->name));
+                        content = xmlchar_to_uchar(raw != NULL ? raw : (const xmlChar *)"",
+                                                   raw != NULL ? strlen((char *)raw) : 0);
+                        marc_add_subfield(target_field, name, content);
+
+                        if (raw != NULL)
+                        {
+                            xmlFree(raw);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    result = add_field_serialize(target_r);
+    if (result == NULL)
+    {
+        result = target_xml;
+    }
+
+Fail:
+    /*
+     * Only the documents are released.  xmlCleanupParser() is deliberately
+     * not called: it tears down library-wide libxml2 state that other code
+     * in the same backend may still be using.
+     */
+    xmlFreeDoc(source_r);
+    xmlFreeDoc(target_r);
+
+    PG_RETURN_TEXT_P(result);
+}
+
+
+PG_FUNCTION_INFO_V1(vandelay_strip_field);
+
+/*
+ * Tell whether the first child of field named msf has content in which the
+ * regular expression mre (mre_len UTF-16 code units long) finds a match.
+ * Returns FALSE when there is no such child or mre cannot be compiled.
+ */
+static UBool strip_field_match_hits(xmlNode *field, UChar *msf, UChar *mre, int32_t mre_len)
+{
+    UErrorCode err = U_ZERO_ERROR;
+    UParseError pe;
+    URegularExpression *regexp;
+    xmlNode *sub;
+    xmlChar *raw;
+    UChar *content;
+    UBool found = FALSE;
+
+    sub = marc_get_field(field->children, msf);
+    if (sub == NULL)
+    {
+        return FALSE;
+    }
+
+    /* The content pointer of an element node is NULL; ask for its text. */
+    raw = xmlNodeGetContent(sub);
+    if (raw == NULL)
+    {
+        return FALSE;
+    }
+
+    regexp = uregex_open(mre, mre_len, 0, &pe, &err);
+    if (regexp != NULL)
+    {
+        content = xmlchar_to_uchar(raw, strlen((char *)raw));
+        uregex_setText(regexp, content, u_strlen(content), &err);
+        found = uregex_find(regexp, 0, &err);
+        uregex_close(regexp);
+    }
+    xmlFree(raw);
+
+    return found;
+}
+
+/*
+ * Serialize doc, drop the XML declaration line, remove every newline and
+ * collapse white space between tags, and return the result as a new text
+ * value.  Returns NULL when any step fails.
+ */
+static text *strip_field_serialize(xmlDoc *doc)
+{
+    static const char decl_pattern[] = "^<\\?.+?\\?>$";
+    static const char gap_pattern[] = ">\\s+<";
+    xmlChar *dump = NULL;
+    int dump_len = 0;
+    UChar *uxml;
+    UChar uregexp[200], replacement[200];
+    int32_t uxml_len = 0, out_len = 0;
+    UErrorCode err = U_ZERO_ERROR;
+    text *out;
+
+    xmlDocDumpMemory(doc, &dump, &dump_len);
+    if (dump == NULL)
+    {
+        return NULL;
+    }
+    if (dump_len <= 0)
+    {
+        xmlFree(dump);
+        return NULL;
+    }
+
+    /* UTF-16 never needs more code units than the UTF-8 text has bytes. */
+    uxml = palloc((dump_len + 1) * sizeof(UChar));
+    u_strFromUTF8(uxml, dump_len + 1, &uxml_len, (char *)dump, dump_len, &err);
+    xmlFree(dump);
+    if (U_FAILURE(err))
+    {
+        return NULL;
+    }
+
+    u_uastrncpy(uregexp, decl_pattern, sizeof(decl_pattern));
+    u_uastrncpy(replacement, "", sizeof(""));
+    uxml = u_strreplace(uregexp, UREGEX_MULTILINE, replacement, uxml, u_strlen(uxml), 0);
+    if (uxml == NULL)
+    {
+        return NULL;
+    }
+
+    u_uastrncpy(uregexp, "\n", sizeof("\n"));
+    u_uastrncpy(replacement, "", sizeof(""));
+    uxml = u_strreplace(uregexp, UREGEX_DOTALL, replacement, uxml, u_strlen(uxml), 1);
+    if (uxml == NULL)
+    {
+        return NULL;
+    }
+
+    u_uastrncpy(uregexp, gap_pattern, sizeof(gap_pattern));
+    u_uastrncpy(replacement, "><", sizeof("><"));
+    uxml = u_strreplace(uregexp, UREGEX_DOTALL, replacement, uxml, u_strlen(uxml), 1);
+    if (uxml == NULL)
+    {
+        return NULL;
+    }
+    uxml_len = u_strlen(uxml);
+
+    /* Preflight for the UTF-8 size, then clear the overflow it reports. */
+    err = U_ZERO_ERROR;
+    u_strToUTF8(NULL, 0, &out_len, uxml, uxml_len, &err);
+    if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR)
+    {
+        return NULL;
+    }
+    err = U_ZERO_ERROR;
+
+    out = (text *)palloc(out_len + VARHDRSZ);
+    SET_VARSIZE(out, out_len + VARHDRSZ);
+    u_strToUTF8(VARDATA(out), out_len, NULL, uxml, uxml_len, &err);
+    if (U_FAILURE(err))
+    {
+        return NULL;
+    }
+
+    return out;
+}
+
+/*
+ * vandelay_strip_field(xml TEXT, field_spec TEXT)
+ *
+ * Removes fields or subfields from xml as directed by field_spec and
+ * returns the resulting document as text.
+ *
+ * field_spec is a comma separated list of entries.  Each entry is a three
+ * character field name, optionally followed by subfield characters and by
+ * "[<subfield> ~ <regexp>]".
+ *   - Without subfield characters, every field with that name is deleted.
+ *   - With subfield characters, the first subfield of each given name is
+ *     deleted from every field with that name.
+ *   - When a match expression is given, only fields whose <subfield>
+ *     content matches <regexp> are touched.
+ *
+ * xml is returned unchanged when it cannot be parsed or field_spec cannot
+ * be processed.
+ *
+ * NOTE(review): fields and subfields are located by XML element name (see
+ * marc_get_field), searching the children of the root element.  MARCXML
+ * keeps tags and codes in attributes; confirm the intended input format.
+ */
+Datum vandelay_strip_field(PG_FUNCTION_ARGS)
+{
+    static const char spec_pattern[] = "^(.{3})(\\w*)(?:\\[([^]]*)\\])?$";
+    text *xml, *field_spec, *result;
+    xmlDoc *r = NULL;
+    xmlNode *root, *to_field, *next_field;
+    UChar *field, *field_list, **f_list;
+    UChar uregexp[200], replacement[200];
+    int32_t spec_bytes, field_len = 0, field_list_len = 0, split_cap;
+    int32_t f_num, f_idx, s_idx;
+    UErrorCode err = U_ZERO_ERROR;
+    URegularExpression *regexp;
+    UParseError pe;
+
+    if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
+    {
+        PG_RETURN_NULL();
+    }
+
+    xml = PG_GETARG_TEXT_P(0);
+    field_spec = PG_GETARG_TEXT_P(1);
+    result = xml;
+
+    /* VARSIZE() counts the varlena header; the payload is shorter. */
+    r = xmlParseMemory(VARDATA(xml), VARSIZE(xml) - VARHDRSZ);
+    if (r == NULL)
+    {
+        PG_RETURN_TEXT_P(xml);
+    }
+
+    root = xmlDocGetRootElement(r);
+    if (root == NULL)
+    {
+        goto Fail;
+    }
+
+    spec_bytes = VARSIZE(field_spec) - VARHDRSZ;
+    field = palloc((spec_bytes + 1) * sizeof(UChar));
+    u_strFromUTF8(field, spec_bytes + 1, &field_len, VARDATA(field_spec), spec_bytes, &err);
+    if (U_FAILURE(err))
+    {
+        goto Fail;
+    }
+
+    /*
+     * Split the field_spec.  A string of field_len units holds at most
+     * field_len + 1 comma separated entries, and the entries plus their
+     * terminators need at most field_len + 1 units, so one pass with
+     * buffers of field_len + 2 is always enough.
+     */
+    u_uastrncpy(uregexp, ",", sizeof(","));
+    regexp = uregex_open(uregexp, u_strlen(uregexp), 0, &pe, &err);
+    if (regexp == NULL)
+    {
+        goto Fail;
+    }
+    uregex_setText(regexp, field, field_len, &err);
+
+    split_cap = field_len + 2;
+    field_list = palloc(split_cap * sizeof(UChar));
+    f_list = palloc(split_cap * sizeof(UChar *));
+    f_num = uregex_split(regexp, field_list, split_cap,
+                         &field_list_len, f_list, split_cap, &err);
+    uregex_close(regexp);
+    if (U_FAILURE(err))
+    {
+        goto Fail;
+    }
+
+    for (f_idx = 0; f_idx < f_num; f_idx++)
+    {
+        UChar *f = f_list[f_idx];
+        UChar *field_name = NULL, *sf = NULL, *match = NULL;
+        UChar *msf = NULL, *mre = NULL;
+        UChar *sf_list = NULL, **s_list = NULL;
+        int32_t field_name_len = 0, sf_len = 0, match_len = 0;
+        int32_t msf_len = 0, mre_len = 0, s_num = 0;
+        UBool matched, has_match;
+
+        /* A failure code left behind would turn later ICU calls into no-ops. */
+        err = U_ZERO_ERROR;
+
+        if (f == NULL || f[0] == 0)
+        {
+            continue;
+        }
+
+        u_uastrncpy(uregexp, "^\\s*", sizeof("^\\s*"));
+        u_uastrncpy(replacement, "", sizeof(""));
+        f = u_strreplace(uregexp, 0, replacement, f, u_strlen(f), 1);
+        if (f == NULL || f[0] == 0)
+        {
+            continue;
+        }
+
+        u_uastrncpy(uregexp, "\\s*$", sizeof("\\s*$"));
+        u_uastrncpy(replacement, "", sizeof(""));
+        f = u_strreplace(uregexp, 0, replacement, f, u_strlen(f), 1);
+        if (f == NULL || f[0] == 0)
+        {
+            continue;
+        }
+
+        u_uastrncpy(uregexp, spec_pattern, sizeof(spec_pattern));
+        regexp = uregex_open(uregexp, u_strlen(uregexp), 0, &pe, &err);
+        if (regexp == NULL)
+        {
+            continue;
+        }
+        uregex_setText(regexp, f, u_strlen(f), &err);
+
+        matched = uregex_matches(regexp, 0, &err);
+        if (matched)
+        {
+            handle_field(regexp,
+                         u_strlen(f),
+                         &field_name,
+                         &field_name_len,
+                         &sf,
+                         &sf_len,
+                         &match,
+                         &match_len);
+        }
+        uregex_close(regexp);
+
+        if (!matched || field_name == NULL)
+        {
+            continue;
+        }
+
+        /*
+         * Split sf into one string per code unit.  This is done by hand
+         * because uregex_open() rejects an empty pattern.
+         */
+        s_num = (sf != NULL) ? u_strlen(sf) : 0;
+        if (s_num > 0)
+        {
+            sf_list = palloc(2 * s_num * sizeof(UChar));
+            s_list = palloc(s_num * sizeof(UChar *));
+            for (s_idx = 0; s_idx < s_num; s_idx++)
+            {
+                sf_list[2 * s_idx] = sf[s_idx];
+                sf_list[2 * s_idx + 1] = 0;
+                s_list[s_idx] = &sf_list[2 * s_idx];
+            }
+        }
+
+        if (match != NULL && match[0] != 0)
+        {
+            handle_match(match, &msf, &msf_len, &mre, &mre_len);
+        }
+        has_match = (msf != NULL && mre != NULL);
+
+        for (to_field = marc_get_field(root->children, field_name);
+             to_field != NULL;
+             to_field = next_field)
+        {
+            /* Find the successor first: to_field may be freed below. */
+            next_field = marc_get_field(to_field->next, field_name);
+
+            if (has_match &&
+                !strip_field_match_hits(to_field, msf, mre, mre_len))
+            {
+                continue;
+            }
+
+            if (s_num != 0)
+            {
+                for (s_idx = 0; s_idx < s_num; s_idx++)
+                {
+                    marc_delete_subfield(to_field->children, s_list[s_idx]);
+                }
+            }
+            else
+            {
+                marc_delete_field(to_field);
+            }
+        }
+    }
+
+    result = strip_field_serialize(r);
+    if (result == NULL)
+    {
+        result = xml;
+    }
+
+Fail:
+    /*
+     * Only the document is released.  xmlCleanupParser() is deliberately
+     * not called: it tears down library-wide libxml2 state that other code
+     * in the same backend may still be using.
+     */
+    xmlFreeDoc(r);
+
+    PG_RETURN_TEXT_P(result);
+}
diff --git a/Open-ILS/src/sql/Pg/extensions/xml.functions_in_c.c b/Open-ILS/src/sql/Pg/extensions/xml.functions_in_c.c
new file mode 100755 (executable)
index 0000000..5ab1dc2
--- /dev/null
@@ -0,0 +1,92 @@
+/************************************************************************/
+/*  C Implementation: oils_xslt_process
+ *
+ *  Description:
+ *    This file implements oils_xslt_process.
+ *    The function is included in the PostgreSQL extension c_functions.
+ *    libxml2, libxslt and the PostgreSQL libraries are needed to build this file.
+ *
+ *  Author: Swenyu Duan <dsy88@sina.com>, (C) 2012
+ *
+ *  Copyright: See COPYING file that comes with this distribution.
+ */
+/************************************************************************/
+#include "postgres.h"
+#include "fmgr.h"
+#include "stdio.h"
+#include "libxml/tree.h"
+#include "libxml/parser.h"
+#include "libxslt/xslt.h"
+#include "libxslt/xsltInternals.h"
+#include "libxslt/transform.h"
+#include "libxslt/xsltutils.h"
+
+PG_FUNCTION_INFO_V1(oils_xslt_process);
+
+/*
+ * oils_xslt_process(doc TEXT, xslt TEXT) RETURNS TEXT
+ *
+ * Parses the first argument as an XML document and the second as an XSLT
+ * stylesheet, applies the stylesheet to the document and returns the
+ * serialized transformation result as text.
+ *
+ * Returns SQL NULL when either argument is empty, when either input fails
+ * to parse, when the stylesheet cannot be compiled, or when the
+ * transformation or its serialization fails.
+ *
+ * Every libxml2/libxslt object created here is released before returning,
+ * on both the success and the failure paths.
+ */
+Datum oils_xslt_process(PG_FUNCTION_ARGS)
+{
+    text *doc = PG_GETARG_TEXT_P(0);
+    text *xslt = PG_GETARG_TEXT_P(1);
+    text *processed_doc = NULL;
+    xmlDocPtr parsed_doc = NULL;
+    xmlDocPtr parsed_xslt = NULL;
+    xmlDocPtr transformed_doc = NULL;
+    xsltStylesheetPtr style_sheet = NULL;
+    xmlChar *result = NULL;
+    int result_len = 0;
+    //VARSIZE() counts the varlena header too; the payload is VARHDRSZ shorter.
+    int doc_len = VARSIZE(doc) - VARHDRSZ;
+    int xslt_len = VARSIZE(xslt) - VARHDRSZ;
+
+    if (doc_len <= 0 || xslt_len <= 0)
+    {
+        goto Cleanup;
+    }
+
+    //Parse the xml.
+    parsed_doc = xmlParseMemory(VARDATA(doc), doc_len);
+    if (parsed_doc == NULL)
+    {
+        goto Cleanup;
+    }
+
+    //Parse the style sheet.
+    parsed_xslt = xmlParseMemory(VARDATA(xslt), xslt_len);
+    if (parsed_xslt == NULL)
+    {
+        goto Cleanup;
+    }
+
+    //Create the style sheet.
+    style_sheet = xsltParseStylesheetDoc(parsed_xslt);
+    if (style_sheet == NULL)
+    {
+        //Compilation failed, so parsed_xslt is still ours to free in Cleanup.
+        goto Cleanup;
+    }
+    //The style sheet now owns parsed_xslt; xsltFreeStylesheet() releases it.
+    parsed_xslt = NULL;
+
+    //Apply the style sheet to the xml doc. Keep the source document in its
+    //own variable so that both documents can be freed afterwards.
+    transformed_doc = xsltApplyStylesheet(style_sheet, parsed_doc, NULL);
+    if (transformed_doc == NULL)
+    {
+        goto Cleanup;
+    }
+
+    //Output the transformed doc, honoring the style sheet's xsl:output settings.
+    if (xsltSaveResultToString(&result, &result_len, transformed_doc, style_sheet) < 0)
+    {
+        goto Cleanup;
+    }
+    if (result == NULL)
+    {
+        //An empty transformation result is returned as an empty string.
+        result_len = 0;
+    }
+
+    processed_doc = (text *)palloc(result_len + VARHDRSZ);
+
+    //The varlena size must include the header as well as the payload.
+    SET_VARSIZE(processed_doc, result_len + VARHDRSZ);
+    //Copy the result.
+    if (result_len > 0)
+    {
+        memcpy(VARDATA(processed_doc), result, result_len);
+    }
+
+Cleanup:
+    //Memory handed out by libxml must go back through xmlFree(), not free().
+    if (result != NULL)
+    {
+        xmlFree(result);
+    }
+    if (transformed_doc != NULL)
+    {
+        xmlFreeDoc(transformed_doc);
+    }
+    if (style_sheet != NULL)
+    {
+        xsltFreeStylesheet(style_sheet);
+    }
+    if (parsed_xslt != NULL)
+    {
+        xmlFreeDoc(parsed_xslt);
+    }
+    if (parsed_doc != NULL)
+    {
+        xmlFreeDoc(parsed_doc);
+    }
+
+    if (processed_doc == NULL)
+    {
+        PG_RETURN_NULL();
+    }
+
+    PG_RETURN_TEXT_P(processed_doc);
+}