\r
static int32_t regexp_replace(const UChar *regexp,\r
int32_t regexp_len,\r
- const UChar *replacement,\r
+ int flag,\r
+ const UChar *replacement,\r
int32_t replacement_len,\r
UChar *src,\r
int32_t src_len,\r
return 0;\r
}\r
\r
- regular_exp = uregex_open(regexp, regexp_len, 0, &pe, &status);\r
+ regular_exp = uregex_open(regexp, regexp_len, flag, &pe, &status);\r
if (regular_exp == NULL)\r
{\r
return 0;\r
return len;\r
}\r
\r
-static UChar *u_strtransliterate(UChar *search_list,\r
- UChar *replacement_list,\r
- UChar *str,\r
- int32_t str_capacity)\r
+ UChar *u_strtransliterate(UChar *search_list,\r
+ UChar *replacement_list,\r
+ UChar *str,\r
+ int32_t str_capacity)\r
{\r
int32_t search_list_len, replacement_list_len, str_len;\r
UChar *des;\r
return des;\r
}\r
\r
-static UChar *u_strreplace(UChar *regexp,\r
- UChar *replacement,\r
- UChar *str,\r
- int32_t str_capacity,\r
- int is_global)\r
+UChar *u_strreplace(UChar *regexp,\r
+ int flag,\r
+ UChar *replacement,\r
+ UChar *str,\r
+ int32_t str_capacity,\r
+ int is_global)\r
{\r
int32_t regexp_len, replacement_len, str_len;\r
UChar *des;\r
\r
des_len = regexp_replace(regexp,\r
regexp_len,\r
- replacement,\r
+ flag,\r
+ replacement,\r
replacement_len,\r
str,\r
str_len,\r
\r
des_len = regexp_replace(regexp,\r
regexp_len,\r
+ flag,\r
replacement,\r
replacement_len,\r
str,\r
u_uastrncpy(uregexp, regexp, strlen(regexp));\r
u_uastrncpy(replacement, "AE", strlen("AE"));\r
\r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
+ nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1);\r
\r
regexp = "\\x{00DE}";\r
u_uastrncpy(uregexp, regexp, strlen(regexp));\r
u_uastrncpy(replacement, "TH", strlen("TH"));\r
\r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
+ nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1);\r
\r
regexp = "\\x{0152}";\r
u_uastrncpy(uregexp, regexp, strlen(regexp));\r
u_uastrncpy(replacement, "OE", strlen("OE"));\r
\r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
+ nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1);\r
\r
if (is_search)\r
{\r
u_uastrncpy(uregexp, regexp, strlen(regexp));\r
u_uastrncpy(replacement, "OE", strlen("OE"));\r
\r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
+ nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1);\r
\r
regexp = "\\x{0110}\\x{00D0}\\x{00D8}\\x{0141}\\x{2113}\\x{02BB}\\x{02BC}]['";\r
u_uastrncpy(uregexp, regexp, strlen(regexp));\r
u_uastrncpy(uregexp, regexp, strlen(regexp));\r
u_uastrncpy(replacement, "", strlen(""));\r
\r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
+ nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1);\r
\r
if (usf != NULL && usf[0] == 0x61) //0x61 == 'a' in utf16\r
{\r
replacement[0] = 0x7;\r
replacement[1] = 0;\r
\r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 0);\r
+ nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 0);\r
}\r
}\r
}\r
strlen( "[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\\\p{Pi}\\p{Po}\\p{Ps}\\p{Sk}\\p{Sm}\\p{So}\\p{Zl}\\p{Zp}\\p{Zs}]"));\r
u_uastrncpy(replacement, "", strlen(""));\r
\r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
+ nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1);\r
\r
u_uastrncpy(uregexp,\r
"\\x01\\x02\\x03\\x04\\x05\\x06\\x07",\r
u_uastrncpy(uregexp, "\\s+", strlen( "\\s+"));\r
u_uastrncpy(replacement, " ", strlen(" "));\r
\r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
+ nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1);\r
\r
u_uastrncpy(uregexp, "^\\s+", strlen( "^\\s+"));\r
u_uastrncpy(replacement, "", strlen(""));\r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 0);\r
+ nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 0);\r
\r
u_uastrncpy(uregexp, "\\s+$", strlen( "\\s+$"));\r
\r
- nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1);\r
+ nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1);\r
\r
return nustr;\r
\r
u_uastrncpy(uregexp, regexp, strlen(regexp));\r
u_uastrncpy(replacement, "", strlen(""));\r
\r
- ustr = u_strreplace(uregexp, replacement, ustr, u_strlen(ustr), 1);\r
+ ustr = u_strreplace(uregexp, 0, replacement, ustr, u_strlen(ustr), 1);\r
\r
unorm2_normalize(normalizer, ustr, VARSIZE(str), nustr, VARSIZE(str), &err);\r
if (U_FAILURE(err))\r
--- /dev/null
+#include "postgres.h"
+#include "fmgr.h"
+
+#include "unicode/unorm2.h"
+#include "unicode/utypes.h"
+#include "unicode/ustring.h"
+#include "unicode/uregex.h"
+#include "unicode/umachine.h"
+#include "libxml2/libxml/parser.h"
+#include "libxslt/xslt.h"
+#include "libxslt/xsltInternals.h"
+#include "libxslt/transform.h"
+
+
+/************************************************************************
+* C Implementation: vandelay.add_field vandelay.strip_field
+ *
+ * Description:
+ * This file implements vandelay.add_field and vandelay.strip_field.
+ * These two functions are included in the PostgreSQL extension c_functions.
+ * ICU4C, libxml and the postgres lib are needed to build this file.
+ * The replace functions in normalize.functions_in_c.c are also used in this
+ * module, so the two files have to be linked together.
+ * This file builds up the simple processing procedures to handle MARC in C.
+ *
+ * Author: Swenyu Duan <dsy88@sina.com>, (C) 2012
+ *
+ * Copyright: See COPYING file that comes with this distribution.
+ *
+************************************************************************/
+/*
+ * Helpers that are only declared here. Per the file header they live in
+ * normalize.functions_in_c.c and must be linked in with this file.
+ * Both return a UChar string; who owns the input and the returned buffer
+ * is not visible from this file -- TODO confirm against the definitions.
+ */
+UChar *u_strtransliterate(UChar *search_list,
+ UChar *replacement_list,
+ UChar *str,
+ int32_t str_capacity);
+
+/*
+ * Regular-expression replace on str. `flag` is forwarded to
+ * regexp_replace(), which hands it to uregex_open() as the flags argument
+ * (0, UREGEX_MULTILINE and UREGEX_DOTALL are used in this file).
+ * NOTE(review): is_global presumably selects replace-all vs replace-first;
+ * confirm against the definition.
+ */
+UChar *u_strreplace(UChar *regexp,
+ int flag,
+ UChar *replacement,
+ UChar *str,
+ int32_t str_capacity,
+ int is_global);
+
+/*
+ * Convert a NUL-terminated UTF-16 string into a freshly palloc'd,
+ * NUL-terminated UTF-8 string. Returns NULL when ICU reports a conversion
+ * error.
+ */
+static xmlChar *marc_subfield_to_utf8(const UChar *src)
+{
+    UErrorCode err = U_ZERO_ERROR;
+    int32_t src_len = u_strlen(src);
+    int32_t utf8_len = 0;
+    char *utf8;
+
+    /* Preflight: a zero-capacity call only reports the required length. */
+    u_strToUTF8(NULL, 0, &utf8_len, src, src_len, &err);
+    if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR)
+    {
+        return NULL;
+    }
+
+    utf8 = palloc(utf8_len + 1);
+
+    /*
+     * The preflight leaves U_BUFFER_OVERFLOW_ERROR behind; ICU functions
+     * return immediately when entered with a failure code, so it has to be
+     * cleared before the real conversion.
+     */
+    err = U_ZERO_ERROR;
+    u_strToUTF8(utf8, utf8_len + 1, NULL, src, src_len, &err);
+    if (U_FAILURE(err))
+    {
+        pfree(utf8);
+        return NULL;
+    }
+
+    return (xmlChar *) utf8;
+}
+
+/*
+ * marc_add_subfield
+ *
+ * Converts subfield_name and content from UTF-16 to UTF-8, builds a node
+ * from them with xmlNewPI() and appends it to cur_node's children.
+ *
+ * cur_node      - parent node that receives the new child
+ * subfield_name - NUL-terminated UTF-16 name of the new node
+ * content       - NUL-terminated UTF-16 content; NULL is treated as empty
+ *
+ * Returns 0 in every case, as before. Nothing is added when cur_node or
+ * subfield_name is NULL, when a conversion fails, or when the node cannot
+ * be created.
+ *
+ * NOTE(review): xmlNewPI() creates a processing-instruction node, not an
+ * element -- confirm this is really what a MARC subfield should become.
+ */
+int marc_add_subfield(xmlNode *cur_node, UChar *subfield_name, UChar *content)
+{
+    static const UChar empty_content[1] = { 0 };
+    xmlChar *xml_subfield_name;
+    xmlChar *xml_content;
+    xmlNode *new_node;
+
+    if (cur_node == NULL || subfield_name == NULL)
+    {
+        return 0;
+    }
+
+    xml_subfield_name = marc_subfield_to_utf8(subfield_name);
+    if (xml_subfield_name == NULL)
+    {
+        return 0;
+    }
+
+    xml_content = marc_subfield_to_utf8(content != NULL ? content : empty_content);
+    if (xml_content == NULL)
+    {
+        pfree(xml_subfield_name);
+        return 0;
+    }
+
+    new_node = xmlNewPI(xml_subfield_name, xml_content);
+    if (new_node != NULL)
+    {
+        xmlAddChild(cur_node, new_node);
+    }
+
+    /* libxml2 keeps its own copies of the name and content strings. */
+    pfree(xml_content);
+    pfree(xml_subfield_name);
+
+    return 0;
+}
+
+/*
+ * compare_field_name
+ *
+ * Compares a UTF-16 field name with a UTF-8 (libxml2) name by converting
+ * the latter to UTF-16 and calling u_strcmp().
+ *
+ * Returns the u_strcmp() result (0 when equal). Also returns 0 when either
+ * argument is NULL, exactly as before, so callers must not rely on 0
+ * meaning "equal" for NULL input. Returns a non-zero value when the UTF-8
+ * name cannot be converted.
+ */
+int compare_field_name(const UChar *field_name, const xmlChar *xml_field_name)
+{
+    UChar *temp_field_name;
+    int32_t temp_field_name_capacity;
+    int32_t temp_field_name_len = 0;
+    int32_t xml_field_name_len;
+    UErrorCode err = U_ZERO_ERROR;
+    int ret;
+
+    if (field_name == NULL || xml_field_name == NULL)
+    {
+        return 0;
+    }
+
+    xml_field_name_len = (int32_t) strlen((const char *) xml_field_name);
+
+    /*
+     * n bytes of UTF-8 never expand to more than n UTF-16 code units, so
+     * n + 1 units always hold the converted text plus its terminator.
+     */
+    temp_field_name_capacity = xml_field_name_len + 1;
+    temp_field_name = palloc(temp_field_name_capacity * sizeof(UChar));
+
+    u_strFromUTF8(temp_field_name,
+                  temp_field_name_capacity,
+                  &temp_field_name_len,
+                  (const char *) xml_field_name,
+                  xml_field_name_len,
+                  &err);
+
+    if (U_FAILURE(err))
+    {
+        /* Unconvertible names can never match. */
+        ret = 1;
+    }
+    else
+    {
+        ret = u_strcmp(field_name, temp_field_name);
+    }
+
+    pfree(temp_field_name);
+
+    return ret;
+}
+
+/*
+ * marc_get_field
+ *
+ * Walks the sibling chain that starts at head (head itself included) and
+ * returns the first element node whose name compares equal to field_name
+ * according to compare_field_name(). Returns NULL when no such node exists
+ * or when head is NULL.
+ */
+xmlNode *marc_get_field(xmlNode *head, UChar *field_name)
+{
+    xmlNode *node = head;
+
+    while (node != NULL)
+    {
+        if (node->type == XML_ELEMENT_NODE &&
+            compare_field_name(field_name, node->name) == 0)
+        {
+            return node;
+        }
+
+        node = node->next;
+    }
+
+    return NULL;
+}
+
+/*
+ * marc_delete_subfield
+ *
+ * Looks up the first node named field_name in the sibling chain starting
+ * at head (via marc_get_field), then unlinks and frees it. The lookup
+ * result is handed to libxml2 without a NULL check. Always returns 1.
+ */
+int marc_delete_subfield(xmlNode *head, UChar *field_name)
+{
+    xmlNode *victim = marc_get_field(head, field_name);
+
+    xmlUnlinkNode(victim);
+    xmlFreeNode(victim);
+
+    return 1;
+}
+
+/*
+ * marc_delete_field
+ *
+ * Detaches cur_node from its tree and frees it. The pointer is invalid
+ * once this returns. Always returns 1.
+ */
+int marc_delete_field(xmlNode *cur_node)
+{
+    xmlNode *doomed = cur_node;
+
+    xmlUnlinkNode(doomed);  /* take the node out of its tree first */
+    xmlFreeNode(doomed);    /* then release it */
+
+    return 1;
+}
+
+/*
+ * insert_fields_ordered
+ *
+ * Copies source_node (deep copy) and inserts the copy into record in front
+ * of the first element node -- starting at the root element and following
+ * its siblings -- whose name sorts before source_node's name.
+ *
+ * Returns 1 when a copy was inserted. Returns 0 when record or source_node
+ * is NULL, when the copy could not be made, or when no insertion point was
+ * found (nothing is inserted in that case).
+ *
+ * NOTE(review): the walk covers the root element and its siblings, not the
+ * root's children, and inserts before a node with a SMALLER name; both
+ * look unusual for an ordered insert and should be confirmed.
+ */
+int insert_fields_ordered(xmlDoc *record, xmlNode *source_node)
+{
+    xmlNode *cur_node;
+    xmlNode *new_node;
+
+    /* Callers pass the result of marc_get_field(), which may be NULL. */
+    if (record == NULL || source_node == NULL || source_node->name == NULL)
+    {
+        return 0;
+    }
+
+    for (cur_node = xmlDocGetRootElement(record); cur_node != NULL; cur_node = cur_node->next)
+    {
+        if (cur_node->type != XML_ELEMENT_NODE)
+        {
+            continue;
+        }
+
+        if (strcmp((const char *) source_node->name, (const char *) cur_node->name) > 0)
+        {
+            new_node = xmlCopyNode(source_node, 1);
+            if (new_node == NULL)
+            {
+                return 0;
+            }
+
+            xmlAddPrevSibling(cur_node, new_node);
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Strip leading and trailing whitespace from str with the same two
+ * u_strreplace() patterns the rest of this file uses. str must be a
+ * private palloc'd buffer; it must not be used after the call. Returns
+ * whatever u_strreplace() returns (checked for NULL between the passes).
+ */
+static UChar *handle_match_trim(UChar *str)
+{
+    UChar uregexp[8], replacement[1];
+
+    u_uastrncpy(replacement, "", sizeof(""));
+
+    u_uastrncpy(uregexp, "^\\s*", sizeof("^\\s*"));
+    str = u_strreplace(uregexp, 0, replacement, str, u_strlen(str), 1);
+    if (str == NULL)
+    {
+        return NULL;
+    }
+
+    u_uastrncpy(uregexp, "\\s*$", sizeof("\\s*$"));
+    return u_strreplace(uregexp, 0, replacement, str, u_strlen(str), 1);
+}
+
+/*
+ * handle_match
+ *
+ * Splits a match specification of the form "<subfield>~<regexp>" at the
+ * first two '~'-separated parts and returns both parts with surrounding
+ * whitespace removed.
+ *
+ * match   - NUL-terminated UTF-16 match specification, may be NULL
+ * msf     - out: subfield name, or NULL
+ * msf_len - out: length of *msf in UChars, 0 when *msf is NULL
+ * mre     - out: regular expression text, or NULL
+ * mre_len - out: length of *mre in UChars, 0 when *mre is NULL
+ *
+ * *msf and *mre are either both set or both NULL. They are NULL when
+ * match is NULL or empty, when it does not contain two non-empty parts,
+ * or when ICU / u_strreplace() fails. The out pointers themselves must
+ * not be NULL. Always returns 1.
+ */
+int handle_match(UChar *match, UChar **msf, int32_t *msf_len, UChar **mre, int32_t *mre_len)
+{
+    UErrorCode err = U_ZERO_ERROR;
+    URegularExpression *regexp;
+    UParseError pe;
+    UChar uregexp[8];
+    UChar *parts[3];    /* msf, mre, and whatever follows a second '~' */
+    UChar *split_buf;
+    UChar *msf_t = NULL, *mre_t = NULL;
+    int32_t match_len, split_len = 0, part_count;
+
+    *msf = NULL;
+    *mre = NULL;
+    *msf_len = 0;
+    *mre_len = 0;
+
+    if (match == NULL)
+    {
+        return 1;
+    }
+
+    match_len = u_strlen(match);
+    if (match_len == 0)
+    {
+        return 1;
+    }
+
+    u_uastrncpy(uregexp, "~", sizeof("~"));
+    regexp = uregex_open(uregexp, -1, 0, &pe, &err);
+    if (U_FAILURE(err) || regexp == NULL)
+    {
+        return 1;
+    }
+    uregex_setText(regexp, match, match_len, &err);
+
+    /*
+     * Every removed delimiter is replaced by a terminator and one more
+     * terminator ends the last part, so match_len + 1 units always suffice.
+     */
+    split_buf = palloc((match_len + 1) * sizeof(UChar));
+    part_count = uregex_split(regexp, split_buf, match_len + 1,
+                              &split_len, parts, 3, &err);
+
+    /* The parts live in split_buf, so the regex is no longer needed. */
+    uregex_close(regexp);
+
+    if (U_SUCCESS(err) && part_count >= 2 && parts[0][0] != 0 && parts[1][0] != 0)
+    {
+        /* Private copies: u_strreplace() is never given a pointer into split_buf. */
+        msf_t = palloc((u_strlen(parts[0]) + 1) * sizeof(UChar));
+        u_strcpy(msf_t, parts[0]);
+        mre_t = palloc((u_strlen(parts[1]) + 1) * sizeof(UChar));
+        u_strcpy(mre_t, parts[1]);
+
+        msf_t = handle_match_trim(msf_t);
+        mre_t = handle_match_trim(mre_t);
+    }
+
+    pfree(split_buf);
+
+    if (msf_t != NULL && mre_t != NULL)
+    {
+        *msf = msf_t;
+        *mre = mre_t;
+        /* Report the lengths of the trimmed strings, not of the raw parts. */
+        *msf_len = u_strlen(msf_t);
+        *mre_len = u_strlen(mre_t);
+    }
+
+    return 1;
+}
+
+/*
+ * Return a palloc'd, NUL-terminated copy of capture group group_num of the
+ * current match of regexp. An empty string is returned when the group is
+ * empty or ICU reports an error.
+ */
+static UChar *handle_field_copy_group(URegularExpression *regexp, int32_t group_num)
+{
+    UErrorCode err = U_ZERO_ERROR;
+    int32_t len;
+    UChar *buf;
+
+    /* Preflight call: only the required length is wanted. */
+    len = uregex_group(regexp, group_num, NULL, 0, &err);
+    if (len < 0 || (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR))
+    {
+        len = 0;
+    }
+
+    buf = palloc((len + 1) * sizeof(UChar));
+    buf[0] = 0;
+
+    /*
+     * The preflight leaves U_BUFFER_OVERFLOW_ERROR in err; without a reset
+     * the copying call would return without writing anything.
+     */
+    err = U_ZERO_ERROR;
+    uregex_group(regexp, group_num, buf, len + 1, &err);
+    if (U_FAILURE(err))
+    {
+        buf[0] = 0;
+    }
+
+    return buf;
+}
+
+/*
+ * handle_field
+ *
+ * Extracts the three capture groups of a field specification that regexp
+ * has just matched:
+ *   group 1 -> field name, with all whitespace runs removed
+ *   group 2 -> subfield list, with all whitespace runs removed
+ *   group 3 -> match specification, with leading/trailing whitespace removed
+ *
+ * regexp - regular expression holding a successful match
+ * f_len  - unused; kept for interface compatibility
+ * *_result / *_result_len - out: the extracted strings and their lengths
+ *   in UChars. A string is NULL (length 0) only if u_strreplace() returned
+ *   NULL.
+ *
+ * Always returns 1.
+ */
+int handle_field(URegularExpression *regexp,
+                 int32_t f_len,
+                 UChar **field_name_result,
+                 int32_t *field_name_result_len,
+                 UChar **sf_result,
+                 int32_t *sf_result_len,
+                 UChar **match_result,
+                 int32_t *match_result_len)
+{
+    UChar uregexp[200], replacement[200];
+    UChar *field_name;
+    UChar *sf;
+    UChar *match;
+
+    (void) f_len;
+
+    u_uastrncpy(uregexp, "\\s+", sizeof("\\s+"));
+    u_uastrncpy(replacement, "", sizeof(""));
+
+    field_name = handle_field_copy_group(regexp, 1);
+    field_name = u_strreplace(uregexp, 0, replacement, field_name, u_strlen(field_name), 1);
+
+    sf = handle_field_copy_group(regexp, 2);
+    sf = u_strreplace(uregexp, 0, replacement, sf, u_strlen(sf), 1);
+
+    match = handle_field_copy_group(regexp, 3);
+
+    u_uastrncpy(uregexp, "^\\s*", sizeof("^\\s*"));
+    match = u_strreplace(uregexp, 0, replacement, match, u_strlen(match), 1);
+
+    if (match != NULL)
+    {
+        u_uastrncpy(uregexp, "\\s*$", sizeof("\\s*$"));
+        match = u_strreplace(uregexp, 0, replacement, match, u_strlen(match), 1);
+    }
+
+    /* Lengths describe the strings actually returned, after replacement. */
+    *field_name_result = field_name;
+    *field_name_result_len = (field_name != NULL) ? u_strlen(field_name) : 0;
+    *sf_result = sf;
+    *sf_result_len = (sf != NULL) ? u_strlen(sf) : 0;
+    *match_result = match;
+    *match_result_len = (match != NULL) ? u_strlen(match) : 0;
+
+    return 1;
+}
+
+/*
+ * xmlchar_to_uchar
+ *
+ * Converts src_len bytes of UTF-8 text at src into a freshly palloc'd,
+ * NUL-terminated UTF-16 string.
+ *
+ * src     - UTF-8 input; NULL is treated as an empty string
+ * src_len - number of bytes to convert (passed straight to u_strFromUTF8)
+ *
+ * Returns the converted string. An empty string is returned when src is
+ * NULL or ICU reports a conversion error; the result is never NULL.
+ */
+UChar *xmlchar_to_uchar(const xmlChar *src, int32_t src_len)
+{
+    UChar *result;
+    int32_t result_len = 0;
+    UErrorCode err = U_ZERO_ERROR;
+
+    if (src == NULL)
+    {
+        result = palloc(sizeof(UChar));
+        result[0] = 0;
+        return result;
+    }
+
+    /* Preflight call: reports the required length only. */
+    u_strFromUTF8(NULL, 0, &result_len,
+                  (const char *) src, src_len, &err);
+    if (result_len < 0 || (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR))
+    {
+        result_len = 0;
+    }
+
+    /* One extra unit so the result is always NUL-terminated. */
+    result = palloc((result_len + 1) * sizeof(UChar));
+    result[0] = 0;
+
+    /*
+     * Clear the overflow status left by the preflight, otherwise the real
+     * conversion returns without writing anything.
+     */
+    err = U_ZERO_ERROR;
+    u_strFromUTF8(result, result_len + 1, &result_len,
+                  (const char *) src, src_len, &err);
+    if (U_FAILURE(err))
+    {
+        result[0] = 0;
+    }
+
+    return result;
+}
+
+
+PG_FUNCTION_INFO_V1(vandelay_add_field);
+
+/*
+ * vandelay_add_field(target_xml text, source_xml text, field_spec text,
+ *                    force_add int) RETURNS text
+ *
+ * Copies fields or subfields named in field_spec from the source record
+ * into the target record and returns the serialised target.
+ *
+ * field_spec is a comma separated list of entries matching
+ *     ^(.{3})(\w*)(?:\[([^]]*)\])?$
+ * i.e. a three character field name, optional one-character subfield
+ * names, and an optional "[subfield~regexp]" match restriction.
+ *
+ * For every entry:
+ *   - without subfields, the first matching source field is inserted into
+ *     the target with insert_fields_ordered();
+ *   - with subfields, each matching source field is either inserted (when
+ *     the target has no such field and force_add is set or a match
+ *     restriction exists) or its listed subfields are added to every
+ *     target field that satisfies the match restriction.
+ *
+ * The serialised result has its XML declaration line and all newlines
+ * removed and whitespace between tags collapsed.
+ *
+ * On any parse or conversion failure the original target_xml is returned
+ * unchanged.
+ *
+ * NOTE(review): fields are located by XML element name among the root
+ * element and its siblings (see marc_get_field); confirm this matches the
+ * document layout the callers supply.
+ */
+Datum vandelay_add_field(PG_FUNCTION_ARGS)
+{
+    text *target_xml = PG_GETARG_TEXT_P(0);
+    text *source_xml = PG_GETARG_TEXT_P(1);
+    text *field_spec = PG_GETARG_TEXT_P(2);
+    int force_add = PG_GETARG_UINT32(3);
+    text *result = target_xml;      /* returned as-is on every failure path */
+    xmlDoc *source_r = NULL, *target_r = NULL;
+    xmlNode *source_field, *target_field, *cur_node;
+    xmlChar *temp_result = NULL;
+    int temp_result_len = 0;
+    UChar uregexp[200], replacement[200];
+    UChar empty_content[1] = { 0 };
+    UChar *field, *field_list, **f_list;
+    UChar *result_xml;
+    int32_t spec_bytes, field_len = 0, field_list_len = 0, f_list_len;
+    int32_t f_num, fi, si;
+    int32_t result_xml_len = 0, out_len = 0;
+    UErrorCode err = U_ZERO_ERROR;
+    URegularExpression *regexp;
+    UParseError pe;
+
+    if (target_xml == NULL || source_xml == NULL ||
+        field_spec == NULL )
+    {
+        PG_RETURN_NULL();
+    }
+
+    /* VARSIZE() includes the varlena header; only the payload is XML. */
+    source_r = xmlParseMemory(VARDATA(source_xml), VARSIZE(source_xml) - VARHDRSZ);
+    target_r = xmlParseMemory(VARDATA(target_xml), VARSIZE(target_xml) - VARHDRSZ);
+
+    /* Both documents are dereferenced below, so either failure is fatal. */
+    if (source_r == NULL || target_r == NULL)
+    {
+        goto Fail;
+    }
+
+    /* n bytes of UTF-8 never need more than n UTF-16 units (+1 terminator). */
+    spec_bytes = VARSIZE(field_spec) - VARHDRSZ;
+    field = palloc((spec_bytes + 1) * sizeof(UChar));
+    u_strFromUTF8(field, spec_bytes + 1, &field_len,
+                  VARDATA(field_spec), spec_bytes, &err);
+    if (U_FAILURE(err))
+    {
+        goto Fail;
+    }
+
+    u_uastrncpy(uregexp, ",", sizeof(","));
+    regexp = uregex_open(uregexp, -1, 0, &pe, &err);
+    uregex_setText(regexp, field, field_len, &err);
+    if (U_FAILURE(err))
+    {
+        if (regexp != NULL)
+        {
+            uregex_close(regexp);
+        }
+        goto Fail;
+    }
+
+    /* Split the field_spec, growing the pointer array until everything fits. */
+    field_list = palloc((field_len + 1) * sizeof(UChar));
+    f_list_len = 10;
+    f_list = palloc(f_list_len * sizeof(UChar *));
+    for (;;)
+    {
+        err = U_ZERO_ERROR;
+        f_num = uregex_split(regexp, field_list, field_len + 1,
+                             &field_list_len, f_list, f_list_len, &err);
+        if (U_FAILURE(err) || f_num < f_list_len)
+        {
+            break;
+        }
+        /* A full array may hide unsplit input in its last slot: retry bigger. */
+        pfree(f_list);
+        f_list_len *= 2;
+        f_list = palloc(f_list_len * sizeof(UChar *));
+    }
+    uregex_close(regexp);
+    if (U_FAILURE(err))
+    {
+        goto Fail;
+    }
+
+    for (fi = 0; fi < f_num; fi++)
+    {
+        UChar *f, *field_name, *sf, *match, *msf, *mre;
+        UChar *sf_list, **s_list;
+        int32_t f_len, field_name_len, sf_len, match_len, msf_len, mre_len;
+        int32_t s_num;
+        UBool matched;
+
+        /* Work on a private copy so u_strreplace never sees a pointer into field_list. */
+        f_len = u_strlen(f_list[fi]);
+        f = palloc((f_len + 1) * sizeof(UChar));
+        u_strcpy(f, f_list[fi]);
+
+        u_uastrncpy(uregexp, "^\\s*", sizeof("^\\s*"));
+        u_uastrncpy(replacement, "", sizeof(""));
+        f = u_strreplace(uregexp, 0, replacement, f, u_strlen(f), 1);
+        if (f == NULL)
+        {
+            continue;
+        }
+
+        u_uastrncpy(uregexp, "\\s*$", sizeof("\\s*$"));
+        u_uastrncpy(replacement, "", sizeof(""));
+        f = u_strreplace(uregexp, 0, replacement, f, u_strlen(f), 1);
+        if (f == NULL)
+        {
+            continue;
+        }
+        f_len = u_strlen(f);    /* length after trimming */
+
+        u_uastrncpy(uregexp, "^(.{3})(\\w*)(?:\\[([^]]*)\\])?$", sizeof("^(.{3})(\\w*)(?:\\[([^]]*)\\])?$"));
+
+        err = U_ZERO_ERROR;
+        regexp = uregex_open(uregexp, -1, 0, &pe, &err);
+        uregex_setText(regexp, f, f_len, &err);
+        matched = (U_SUCCESS(err) && uregex_matches(regexp, 0, &err)) ? TRUE : FALSE;
+
+        if (matched)
+        {
+            handle_field(regexp,
+                         f_len,
+                         &field_name,
+                         &field_name_len,
+                         &sf,
+                         &sf_len,
+                         &match,
+                         &match_len);
+        }
+
+        /* The capture groups have been copied out; release the handle exactly once. */
+        if (regexp != NULL)
+        {
+            uregex_close(regexp);
+        }
+
+        if (!matched || field_name == NULL)
+        {
+            continue;
+        }
+
+        /* Split sf into one single-character, NUL-terminated string per subfield. */
+        s_num = (sf != NULL) ? u_strlen(sf) : 0;
+        sf_list = palloc((2 * s_num + 1) * sizeof(UChar));
+        s_list = palloc((s_num + 1) * sizeof(UChar *));
+        for (si = 0; si < s_num; si++)
+        {
+            sf_list[2 * si] = sf[si];
+            sf_list[2 * si + 1] = 0;
+            s_list[si] = &sf_list[2 * si];
+        }
+
+        handle_match(match, &msf, &msf_len, &mre, &mre_len);
+
+        if (s_num != 0)
+        {
+            source_field = marc_get_field(xmlDocGetRootElement(source_r), field_name);
+
+            while (source_field != NULL)
+            {
+                target_field = marc_get_field(xmlDocGetRootElement(target_r), field_name);
+                if (target_field == NULL)
+                {
+                    if (force_add || (msf != NULL && mre != NULL))
+                    {
+                        insert_fields_ordered(target_r, source_field);
+                    }
+                }
+                else
+                {
+                    /* Resume after the current node, otherwise it is found again forever. */
+                    for (; target_field != NULL;
+                         target_field = marc_get_field(target_field->next, field_name))
+                    {
+                        if (msf != NULL && mre != NULL)
+                        {
+                            UBool if_find = FALSE;
+
+                            cur_node = marc_get_field(target_field->children, msf);
+                            if (cur_node != NULL)
+                            {
+                                /* Element nodes keep their text in children, not in ->content. */
+                                xmlChar *raw = xmlNodeGetContent(cur_node);
+
+                                if (raw != NULL)
+                                {
+                                    UChar *content = xmlchar_to_uchar(raw, strlen((char *) raw));
+
+                                    err = U_ZERO_ERROR;
+                                    regexp = uregex_open(mre, -1, 0, &pe, &err);
+                                    uregex_setText(regexp, content, -1, &err);
+                                    if_find = uregex_find(regexp, 0, &err);
+                                    if (regexp != NULL)
+                                    {
+                                        uregex_close(regexp);
+                                    }
+                                    xmlFree(raw);
+                                }
+                            }
+
+                            if (if_find == FALSE)
+                            {
+                                continue;
+                            }
+                        }
+
+                        /* Own index: the outer loop counter must not be clobbered. */
+                        for (si = 0; si < s_num; si++)
+                        {
+                            for (cur_node = marc_get_field(source_field->children, s_list[si]);
+                                 cur_node != NULL;
+                                 cur_node = marc_get_field(cur_node->next, s_list[si]))
+                            {
+                                xmlChar *raw = xmlNodeGetContent(cur_node);
+                                UChar *name = xmlchar_to_uchar(cur_node->name, strlen((char *) cur_node->name));
+                                UChar *content = empty_content;
+
+                                if (raw != NULL)
+                                {
+                                    content = xmlchar_to_uchar(raw, strlen((char *) raw));
+                                }
+
+                                marc_add_subfield(target_field, name, content);
+
+                                if (raw != NULL)
+                                {
+                                    xmlFree(raw);
+                                }
+                            }
+                        }
+                    }
+                }
+
+                source_field = marc_get_field(source_field->next, field_name);
+            }
+        }
+        else
+        {
+            source_field = marc_get_field(xmlDocGetRootElement(source_r), field_name);
+            if (source_field != NULL)
+            {
+                insert_fields_ordered(target_r, source_field);
+            }
+        }
+    }
+
+    xmlDocDumpMemory(target_r, &temp_result, &temp_result_len);
+    if (temp_result == NULL)
+    {
+        goto Fail;
+    }
+
+    err = U_ZERO_ERROR;
+    u_strFromUTF8(NULL, 0, &result_xml_len, (char *)temp_result, temp_result_len, &err);
+    if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR)
+    {
+        xmlFree(temp_result);
+        goto Fail;
+    }
+
+    /* +1 so the terminator written by ICU stays inside the buffer. */
+    err = U_ZERO_ERROR;
+    result_xml = (UChar *)palloc((result_xml_len + 1) * sizeof(UChar));
+    u_strFromUTF8(result_xml,
+                  result_xml_len + 1,
+                  &result_xml_len,
+                  (char *)temp_result,
+                  temp_result_len,
+                  &err);
+
+    /* temp_result came from libxml2, so it is released with xmlFree(). */
+    xmlFree(temp_result);
+    if (U_FAILURE(err))
+    {
+        goto Fail;
+    }
+
+    /* Drop the XML declaration line. */
+    u_uastrncpy(uregexp, "^<\\?.+?\\?>$", sizeof("^<\\?.+?\\?>$"));
+    u_uastrncpy(replacement, "", sizeof(""));
+
+    result_xml = u_strreplace(uregexp,
+                              UREGEX_MULTILINE,
+                              replacement,
+                              result_xml,
+                              u_strlen(result_xml),
+                              0);
+    if (result_xml == NULL)
+    {
+        goto Fail;
+    }
+
+    /* Remove newlines. */
+    u_uastrncpy(uregexp, "\n", sizeof("\n"));
+    u_uastrncpy(replacement, "", sizeof(""));
+
+    result_xml = u_strreplace(uregexp,
+                              UREGEX_DOTALL,
+                              replacement,
+                              result_xml,
+                              u_strlen(result_xml),
+                              1);
+    if (result_xml == NULL)
+    {
+        goto Fail;
+    }
+
+    /* Collapse whitespace between tags. */
+    u_uastrncpy(uregexp, ">\\s+<", sizeof(">\\s+<"));
+    u_uastrncpy(replacement, "><", sizeof("><"));
+
+    result_xml = u_strreplace(uregexp,
+                              UREGEX_DOTALL,
+                              replacement,
+                              result_xml,
+                              u_strlen(result_xml),
+                              1);
+    if (result_xml == NULL)
+    {
+        goto Fail;
+    }
+
+    result_xml_len = u_strlen(result_xml);
+
+    err = U_ZERO_ERROR;
+    u_strToUTF8(NULL, 0, &out_len, result_xml, result_xml_len, &err);
+    if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR)
+    {
+        goto Fail;
+    }
+
+    {
+        text *converted = (text *)palloc(out_len + VARHDRSZ);
+
+        SET_VARSIZE(converted, out_len + VARHDRSZ);
+
+        /* A text datum is not NUL-terminated, so capacity == length is enough. */
+        err = U_ZERO_ERROR;
+        u_strToUTF8(VARDATA(converted), out_len, &out_len, result_xml, result_xml_len, &err);
+        if (U_SUCCESS(err))
+        {
+            result = converted;
+        }
+    }
+
+    /* palloc'd scratch buffers are left to the calling memory context. */
+
+Fail:
+    /* xmlFreeDoc() accepts NULL. */
+    xmlFreeDoc(source_r);
+    xmlFreeDoc(target_r);
+
+    /*
+     * xmlCleanupParser() is intentionally not called: it tears down
+     * library-wide libxml2 state and is meant for process exit only, which
+     * is unsafe inside a long-lived backend.
+     */
+
+    PG_RETURN_TEXT_P(result);
+}
+
+
+PG_FUNCTION_INFO_V1(vandelay_strip_field);
+
+/*
+ * vandelay_strip_field(xml text, field_spec text) RETURNS text
+ *
+ * Removes fields or subfields named in field_spec from the record and
+ * returns the serialised result.
+ *
+ * field_spec is a comma separated list of entries matching
+ *     ^(.{3})(\w*)(?:\[([^]]*)\])?$
+ * i.e. a three character field name, optional one-character subfield
+ * names, and an optional "[subfield~regexp]" match restriction.
+ *
+ * For every matching field that satisfies the match restriction (if any):
+ *   - with subfields listed, marc_delete_subfield() is called once per
+ *     listed subfield;
+ *   - otherwise the whole field is deleted.
+ *
+ * The serialised result has its XML declaration line and all newlines
+ * removed and whitespace between tags collapsed.
+ *
+ * When the input cannot be parsed, or on any conversion failure, the
+ * original xml is returned unchanged.
+ *
+ * NOTE(review): fields are located by XML element name among the root
+ * element and its siblings (see marc_get_field); confirm this matches the
+ * document layout the callers supply.
+ */
+Datum vandelay_strip_field(PG_FUNCTION_ARGS)
+{
+    text *xml = PG_GETARG_TEXT_P(0);
+    text *field_spec = PG_GETARG_TEXT_P(1);
+    text *result = xml;             /* returned as-is on every failure path */
+    xmlDoc *r = NULL;
+    xmlNode *to_field, *cur_node;
+    xmlChar *temp_result = NULL;
+    int temp_result_len = 0;
+    UChar uregexp[200], replacement[200];
+    UChar *field, *field_list, **f_list;
+    UChar *result_xml;
+    int32_t spec_bytes, field_len = 0, field_list_len = 0, f_list_len;
+    int32_t f_num, fi, si;
+    int32_t result_xml_len = 0, out_len = 0;
+    UErrorCode err = U_ZERO_ERROR;
+    URegularExpression *regexp;
+    UParseError pe;
+
+    if (xml == NULL || field_spec == NULL)
+    {
+        PG_RETURN_NULL();
+    }
+
+    /* VARSIZE() includes the varlena header; only the payload is XML. */
+    r = xmlParseMemory(VARDATA(xml), VARSIZE(xml) - VARHDRSZ);
+    if (r == NULL)
+    {
+        PG_RETURN_TEXT_P(xml);
+    }
+
+    /* n bytes of UTF-8 never need more than n UTF-16 units (+1 terminator). */
+    spec_bytes = VARSIZE(field_spec) - VARHDRSZ;
+    field = palloc((spec_bytes + 1) * sizeof(UChar));
+    u_strFromUTF8(field, spec_bytes + 1, &field_len,
+                  VARDATA(field_spec), spec_bytes, &err);
+    if (U_FAILURE(err))
+    {
+        goto Fail;
+    }
+
+    u_uastrncpy(uregexp, ",", sizeof(","));
+    regexp = uregex_open(uregexp, -1, 0, &pe, &err);
+    uregex_setText(regexp, field, field_len, &err);
+    if (U_FAILURE(err))
+    {
+        if (regexp != NULL)
+        {
+            uregex_close(regexp);
+        }
+        goto Fail;
+    }
+
+    /* Split the field_spec, growing the pointer array until everything fits. */
+    field_list = palloc((field_len + 1) * sizeof(UChar));
+    f_list_len = 10;
+    f_list = palloc(f_list_len * sizeof(UChar *));
+    for (;;)
+    {
+        err = U_ZERO_ERROR;
+        f_num = uregex_split(regexp, field_list, field_len + 1,
+                             &field_list_len, f_list, f_list_len, &err);
+        if (U_FAILURE(err) || f_num < f_list_len)
+        {
+            break;
+        }
+        /* A full array may hide unsplit input in its last slot: retry bigger. */
+        pfree(f_list);
+        f_list_len *= 2;
+        f_list = palloc(f_list_len * sizeof(UChar *));
+    }
+    uregex_close(regexp);
+    if (U_FAILURE(err))
+    {
+        goto Fail;
+    }
+
+    for (fi = 0; fi < f_num; fi++)
+    {
+        UChar *f, *field_name, *sf, *match, *msf, *mre;
+        UChar *sf_list, **s_list;
+        int32_t f_len, field_name_len, sf_len, match_len, msf_len, mre_len;
+        int32_t s_num;
+        UBool matched;
+
+        /* Work on a private copy so u_strreplace never sees a pointer into field_list. */
+        f_len = u_strlen(f_list[fi]);
+        f = palloc((f_len + 1) * sizeof(UChar));
+        u_strcpy(f, f_list[fi]);
+
+        u_uastrncpy(uregexp, "^\\s*", sizeof("^\\s*"));
+        u_uastrncpy(replacement, "", sizeof(""));
+        f = u_strreplace(uregexp, 0, replacement, f, u_strlen(f), 1);
+        if (f == NULL)
+        {
+            continue;
+        }
+
+        u_uastrncpy(uregexp, "\\s*$", sizeof("\\s*$"));
+        u_uastrncpy(replacement, "", sizeof(""));
+        f = u_strreplace(uregexp, 0, replacement, f, u_strlen(f), 1);
+        if (f == NULL)
+        {
+            continue;
+        }
+        f_len = u_strlen(f);    /* length after trimming */
+
+        u_uastrncpy(uregexp, "^(.{3})(\\w*)(?:\\[([^]]*)\\])?$", sizeof("^(.{3})(\\w*)(?:\\[([^]]*)\\])?$"));
+
+        err = U_ZERO_ERROR;
+        regexp = uregex_open(uregexp, -1, 0, &pe, &err);
+        uregex_setText(regexp, f, f_len, &err);
+        matched = (U_SUCCESS(err) && uregex_matches(regexp, 0, &err)) ? TRUE : FALSE;
+
+        if (matched)
+        {
+            handle_field(regexp,
+                         f_len,
+                         &field_name,
+                         &field_name_len,
+                         &sf,
+                         &sf_len,
+                         &match,
+                         &match_len);
+        }
+
+        /* The capture groups have been copied out; release the handle exactly once. */
+        if (regexp != NULL)
+        {
+            uregex_close(regexp);
+        }
+
+        if (!matched || field_name == NULL)
+        {
+            continue;
+        }
+
+        /* Split sf into one single-character, NUL-terminated string per subfield. */
+        s_num = (sf != NULL) ? u_strlen(sf) : 0;
+        sf_list = palloc((2 * s_num + 1) * sizeof(UChar));
+        s_list = palloc((s_num + 1) * sizeof(UChar *));
+        for (si = 0; si < s_num; si++)
+        {
+            sf_list[2 * si] = sf[si];
+            sf_list[2 * si + 1] = 0;
+            s_list[si] = &sf_list[2 * si];
+        }
+
+        handle_match(match, &msf, &msf_len, &mre, &mre_len);
+
+        to_field = marc_get_field(xmlDocGetRootElement(r), field_name);
+        while (to_field != NULL)
+        {
+            /*
+             * Find the successor first: to_field may be freed below, and
+             * searching from to_field itself would just find it again.
+             */
+            xmlNode *next_field = marc_get_field(to_field->next, field_name);
+
+            if (msf != NULL && mre != NULL)
+            {
+                UBool if_find = FALSE;
+
+                cur_node = marc_get_field(to_field->children, msf);
+                if (cur_node != NULL)
+                {
+                    /* Element nodes keep their text in children, not in ->content. */
+                    xmlChar *raw = xmlNodeGetContent(cur_node);
+
+                    if (raw != NULL)
+                    {
+                        UChar *content = xmlchar_to_uchar(raw, strlen((char *) raw));
+
+                        err = U_ZERO_ERROR;
+                        regexp = uregex_open(mre, -1, 0, &pe, &err);
+                        uregex_setText(regexp, content, -1, &err);
+                        if_find = uregex_find(regexp, 0, &err);
+                        if (regexp != NULL)
+                        {
+                            uregex_close(regexp);
+                        }
+                        xmlFree(raw);
+                    }
+                }
+
+                if (if_find == FALSE)
+                {
+                    to_field = next_field;
+                    continue;
+                }
+            }
+
+            if (s_num != 0)
+            {
+                /* Own index: the outer loop counter must not be clobbered. */
+                for (si = 0; si < s_num; si++)
+                {
+                    marc_delete_subfield(to_field->children, s_list[si]);
+                }
+            }
+            else
+            {
+                marc_delete_field(to_field);
+            }
+
+            to_field = next_field;
+        }
+    }
+
+    xmlDocDumpMemory(r, &temp_result, &temp_result_len);
+    if (temp_result == NULL)
+    {
+        goto Fail;
+    }
+
+    err = U_ZERO_ERROR;
+    u_strFromUTF8(NULL, 0, &result_xml_len, (char *)temp_result, temp_result_len, &err);
+    if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR)
+    {
+        xmlFree(temp_result);
+        goto Fail;
+    }
+
+    /* Sized in UChars, +1 so the terminator written by ICU stays in bounds. */
+    err = U_ZERO_ERROR;
+    result_xml = (UChar *)palloc((result_xml_len + 1) * sizeof(UChar));
+
+    u_strFromUTF8(result_xml,
+                  result_xml_len + 1,
+                  &result_xml_len,
+                  (char *)temp_result,
+                  temp_result_len,
+                  &err);
+
+    /* temp_result came from libxml2, so it is released with xmlFree(). */
+    xmlFree(temp_result);
+    if (U_FAILURE(err))
+    {
+        goto Fail;
+    }
+
+    /* Drop the XML declaration line. */
+    u_uastrncpy(uregexp, "^<\\?.+?\\?>$", sizeof("^<\\?.+?\\?>$"));
+    u_uastrncpy(replacement, "", sizeof(""));
+
+    result_xml = u_strreplace(uregexp,
+                              UREGEX_MULTILINE,
+                              replacement,
+                              result_xml,
+                              u_strlen(result_xml),
+                              0);
+    if (result_xml == NULL)
+    {
+        goto Fail;
+    }
+
+    /* Remove newlines. */
+    u_uastrncpy(uregexp, "\n", sizeof("\n"));
+    u_uastrncpy(replacement, "", sizeof(""));
+
+    result_xml = u_strreplace(uregexp,
+                              UREGEX_DOTALL,
+                              replacement,
+                              result_xml,
+                              u_strlen(result_xml),
+                              1);
+    if (result_xml == NULL)
+    {
+        goto Fail;
+    }
+
+    /* Collapse whitespace between tags. */
+    u_uastrncpy(uregexp, ">\\s+<", sizeof(">\\s+<"));
+    u_uastrncpy(replacement, "><", sizeof("><"));
+
+    result_xml = u_strreplace(uregexp,
+                              UREGEX_DOTALL,
+                              replacement,
+                              result_xml,
+                              u_strlen(result_xml),
+                              1);
+    if (result_xml == NULL)
+    {
+        goto Fail;
+    }
+
+    result_xml_len = u_strlen(result_xml);
+
+    err = U_ZERO_ERROR;
+    u_strToUTF8(NULL, 0, &out_len, result_xml, result_xml_len, &err);
+    if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR)
+    {
+        goto Fail;
+    }
+
+    {
+        text *converted = (text *)palloc(out_len + VARHDRSZ);
+
+        SET_VARSIZE(converted, out_len + VARHDRSZ);
+
+        /* A text datum is not NUL-terminated, so capacity == length is enough. */
+        err = U_ZERO_ERROR;
+        u_strToUTF8(VARDATA(converted), out_len, &out_len, result_xml, result_xml_len, &err);
+        if (U_SUCCESS(err))
+        {
+            result = converted;
+        }
+    }
+
+    /* palloc'd scratch buffers are left to the calling memory context. */
+
+Fail:
+    /* xmlFreeDoc() accepts NULL. */
+    xmlFreeDoc(r);
+
+    /*
+     * xmlCleanupParser() is intentionally not called: it tears down
+     * library-wide libxml2 state and is meant for process exit only, which
+     * is unsafe inside a long-lived backend.
+     */
+
+    PG_RETURN_TEXT_P(result);
+}