From 536b54013d03b29a97ff9faebcc75798965e29a2 Mon Sep 17 00:00:00 2001 From: Swenyu Duan Date: Sun, 29 Jul 2012 22:42:47 -0400 Subject: [PATCH] Add some new functions to extension c_functions. New functions include oils_xslt_process, vandelay.add_field and vandelay.strip_field. These functions were implemented in plperlu before. The C implementation of these functions relies on ICU4C and libxml2, libxslt. To install the new functions just create the extension c_functions as before. None of the functions have been tested yet. There may be some errors in them. I will test them soon. Signed-off-by: Swenyu Duan --- .../src/sql/Pg/extensions/c_functions--1.0.sql | 12 + Open-ILS/src/sql/Pg/extensions/makefile | 2 +- .../sql/Pg/extensions/normalize.functions_in_c.c | 50 +- .../sql/Pg/extensions/vandelay.functions_in_c.c | 783 +++++++++++++++++++++ .../src/sql/Pg/extensions/xml.functions_in_c.c | 92 +++ 5 files changed, 915 insertions(+), 24 deletions(-) create mode 100755 Open-ILS/src/sql/Pg/extensions/vandelay.functions_in_c.c create mode 100755 Open-ILS/src/sql/Pg/extensions/xml.functions_in_c.c diff --git a/Open-ILS/src/sql/Pg/extensions/c_functions--1.0.sql b/Open-ILS/src/sql/Pg/extensions/c_functions--1.0.sql index 5f3049d642..b02b6c8e0c 100755 --- a/Open-ILS/src/sql/Pg/extensions/c_functions--1.0.sql +++ b/Open-ILS/src/sql/Pg/extensions/c_functions--1.0.sql @@ -1,3 +1,15 @@ +CREATE OR REPLACE FUNCTION vandelay.add_field( TEXT, TEXT, TEXT, INT) RETURNS TEXT + AS 'c_functions.so', 'vandelay_add_field' + LANGUAGE C STRICT IMMUTABLE; + +CREATE OR REPLACE FUNCTION vandelay.strip_field( TEXT, TEXT) RETURNS TEXT + AS 'c_functions.so', 'vandelay_strip_field' + LANGUAGE C STRICT IMMUTABLE; + +CREATE OR REPLACE FUNCTION oils_xslt_process( TEXT, TEXT) RETURNS TEXT + AS 'c_functions.so', 'oils_xslt_process' + LANGUAGE C STRICT IMMUTABLE; + CREATE OR REPLACE FUNCTION public.search_normalize( TEXT, TEXT ) RETURNS TEXT AS 'c_functions.so', 'search_normalize' LANGUAGE C STRICT IMMUTABLE; diff
--git a/Open-ILS/src/sql/Pg/extensions/makefile b/Open-ILS/src/sql/Pg/extensions/makefile index 8aa4ad3bf2..61af73e1c7 100644 --- a/Open-ILS/src/sql/Pg/extensions/makefile +++ b/Open-ILS/src/sql/Pg/extensions/makefile @@ -3,7 +3,7 @@ EXTENSION = c_functions SHLIB_LINK = -licutu -licuuc -licuio -licui18n -licule -liculx -licudata #PG_CPPFLAGS = -L/usr/lib -licuuc -licuio -licui18n -licule -liculx -licudata DATA = c_functions--1.0.sql -OBJS = normalize.functions_in_c.o +OBJS = normalize.functions_in_c.o xml.functions_in_c.o vandelay.functions_in_c.o PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) diff --git a/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c b/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c index 26e48e0db6..682502abf8 100755 --- a/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c +++ b/Open-ILS/src/sql/Pg/extensions/normalize.functions_in_c.c @@ -94,7 +94,8 @@ static int32_t regexp_transliterate(const UChar *search_list, static int32_t regexp_replace(const UChar *regexp, int32_t regexp_len, - const UChar *replacement, + int flag, + const UChar *replacement, int32_t replacement_len, UChar *src, int32_t src_len, @@ -112,7 +113,7 @@ static int32_t regexp_replace(const UChar *regexp, return 0; } - regular_exp = uregex_open(regexp, regexp_len, 0, &pe, &status); + regular_exp = uregex_open(regexp, regexp_len, flag, &pe, &status); if (regular_exp == NULL) { return 0; @@ -167,10 +168,10 @@ static int32_t regexp_replace(const UChar *regexp, return len; } -static UChar *u_strtransliterate(UChar *search_list, - UChar *replacement_list, - UChar *str, - int32_t str_capacity) + UChar *u_strtransliterate(UChar *search_list, + UChar *replacement_list, + UChar *str, + int32_t str_capacity) { int32_t search_list_len, replacement_list_len, str_len; UChar *des; @@ -207,11 +208,12 @@ static UChar *u_strtransliterate(UChar *search_list, return des; } -static UChar *u_strreplace(UChar *regexp, - UChar *replacement, - UChar *str, - 
int32_t str_capacity, - int is_global) +UChar *u_strreplace(UChar *regexp, + int flag, + UChar *replacement, + UChar *str, + int32_t str_capacity, + int is_global) { int32_t regexp_len, replacement_len, str_len; UChar *des; @@ -228,7 +230,8 @@ static UChar *u_strreplace(UChar *regexp, des_len = regexp_replace(regexp, regexp_len, - replacement, + flag, + replacement, replacement_len, str, str_len, @@ -240,6 +243,7 @@ static UChar *u_strreplace(UChar *regexp, des_len = regexp_replace(regexp, regexp_len, + flag, replacement, replacement_len, str, @@ -265,19 +269,19 @@ UChar *additional_substitutions(UChar *nustr, int is_search) u_uastrncpy(uregexp, regexp, strlen(regexp)); u_uastrncpy(replacement, "AE", strlen("AE")); - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); + nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1); regexp = "\\x{00DE}"; u_uastrncpy(uregexp, regexp, strlen(regexp)); u_uastrncpy(replacement, "TH", strlen("TH")); - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); + nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1); regexp = "\\x{0152}"; u_uastrncpy(uregexp, regexp, strlen(regexp)); u_uastrncpy(replacement, "OE", strlen("OE")); - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); + nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1); if (is_search) { @@ -291,7 +295,7 @@ UChar *additional_substitutions(UChar *nustr, int is_search) u_uastrncpy(uregexp, regexp, strlen(regexp)); u_uastrncpy(replacement, "OE", strlen("OE")); - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); + nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1); regexp = "\\x{0110}\\x{00D0}\\x{00D8}\\x{0141}\\x{2113}\\x{02BB}\\x{02BC}]['"; u_uastrncpy(uregexp, regexp, strlen(regexp)); @@ -320,7 +324,7 @@ UChar *transformations_on_unicode(UChar *nustr, UChar *usf) u_uastrncpy(uregexp, regexp, strlen(regexp)); 
u_uastrncpy(replacement, "", strlen("")); - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); + nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1); if (usf != NULL && usf[0] == 0x61) //0x61 == 'a' in utf16 { @@ -334,7 +338,7 @@ UChar *transformations_on_unicode(UChar *nustr, UChar *usf) replacement[0] = 0x7; replacement[1] = 0; - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 0); + nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 0); } } } @@ -365,7 +369,7 @@ UChar *replace_placehoders(UChar *nustr) strlen( "[\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\\\p{Pi}\\p{Po}\\p{Ps}\\p{Sk}\\p{Sm}\\p{So}\\p{Zl}\\p{Zp}\\p{Zs}]")); u_uastrncpy(replacement, "", strlen("")); - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); + nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1); u_uastrncpy(uregexp, "\\x01\\x02\\x03\\x04\\x05\\x06\\x07", @@ -430,15 +434,15 @@ UChar *leading_trailing_spaces(UChar * nustr) u_uastrncpy(uregexp, "\\s+", strlen( "\\s+")); u_uastrncpy(replacement, " ", strlen(" ")); - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); + nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1); u_uastrncpy(uregexp, "^\\s+", strlen( "^\\s+")); u_uastrncpy(replacement, "", strlen("")); - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 0); + nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 0); u_uastrncpy(uregexp, "\\s+$", strlen( "\\s+$")); - nustr = u_strreplace(uregexp, replacement, nustr, u_strlen(nustr), 1); + nustr = u_strreplace(uregexp, 0, replacement, nustr, u_strlen(nustr), 1); return nustr; @@ -518,7 +522,7 @@ text *normalize(text *str, text *sf, int is_search) u_uastrncpy(uregexp, regexp, strlen(regexp)); u_uastrncpy(replacement, "", strlen("")); - ustr = u_strreplace(uregexp, replacement, ustr, u_strlen(ustr), 1); + ustr = u_strreplace(uregexp, 0, replacement, ustr, 
u_strlen(ustr), 1); unorm2_normalize(normalizer, ustr, VARSIZE(str), nustr, VARSIZE(str), &err); if (U_FAILURE(err)) diff --git a/Open-ILS/src/sql/Pg/extensions/vandelay.functions_in_c.c b/Open-ILS/src/sql/Pg/extensions/vandelay.functions_in_c.c new file mode 100755 index 0000000000..449174148c --- /dev/null +++ b/Open-ILS/src/sql/Pg/extensions/vandelay.functions_in_c.c @@ -0,0 +1,783 @@ +#include "postgres.h" +#include "fmgr.h" + +#include "unicode/unorm2.h" +#include "unicode/utypes.h" +#include "unicode/ustring.h" +#include "unicode/uregex.h" +#include "unicode/umachine.h" +#include "libxml2/libxml/parser.h" +#include "libxslt/xslt.h" +#include "libxslt/xsltInternals.h" +#include "libxslt/transform.h" + + +/************************************************************************ +* C Implementation: vandelay.add_field vandelay.strip_field + * + * Description: + * This file implements vandelay.add_field and vandelay.strip_field. + * These two functions are included in the PostgreSQL extension c_functions. + * ICU4C, libxml and the postgres libs are needed to build this file. + * The replace function in normalize.functions_in_c.c is also used in this + * module. They have to be linked together. + * This file builds up the simple processing procedures to handle MARC in C. + * + * Author: Swenyu Duan , (C) 2012 + * + * Copyright: See COPYING file that comes with this distribution.
+ * +************************************************************************/ +UChar *u_strtransliterate(UChar *search_list, + UChar *replacement_list, + UChar *str, + int32_t str_capacity); + +UChar *u_strreplace(UChar *regexp, + int flag, + UChar *replacement, + UChar *str, + int32_t str_capacity, + int is_global); + +int marc_add_subfield(xmlNode *cur_node, UChar *subfield_name, UChar *content) +{ + xmlNode *new_node; + int32_t xml_subfield_name_len, xml_content_len, subfield_name_len, content_len; + xmlChar *xml_subfield_name, *xml_content; + UErrorCode err = 0; + + if (cur_node == NULL || subfield_name == NULL) + { + return 0; + } + + subfield_name_len = u_strlen(subfield_name); + content_len = u_strlen(content); + + u_strToUTF8(NULL, 0, &xml_subfield_name_len, + subfield_name, subfield_name_len,&err); + xml_subfield_name = palloc((xml_subfield_name_len + 1)* sizeof(char)); + u_strToUTF8((char *)xml_subfield_name, + xml_subfield_name_len, + &xml_subfield_name_len, + subfield_name, + subfield_name_len, + &err); + + u_strToUTF8(NULL, 0, &xml_content_len, + content, content_len, &err); + xml_content = palloc((xml_content_len + 1)* sizeof(char)); + u_strToUTF8((char *)xml_content, + xml_content_len, + &xml_content_len, + content, + content_len, + &err); + + new_node = xmlNewPI(xml_subfield_name, xml_content); + + xmlAddChild(cur_node, new_node); + + return 0; +} + +int compare_field_name(const UChar *field_name, const xmlChar *xml_field_name) +{ + UChar *temp_field_name; + int32_t temp_field_name_len, xml_field_name_len; + UErrorCode err = 0; + int ret; + + if (field_name == NULL || xml_field_name == NULL) + { + return 0; + } + + xml_field_name_len = strlen((char *)xml_field_name); + temp_field_name = palloc((xml_field_name_len + 1) * sizeof(UChar)); + + u_strFromUTF8(temp_field_name, + temp_field_name_len, + &temp_field_name_len, + (char *)xml_field_name, + xml_field_name_len, + &err); + ret = u_strcmp(field_name, temp_field_name); + pfree(temp_field_name); + + 
return ret; +} + +xmlNode *marc_get_field(xmlNode *head, UChar *field_name) +{ + xmlNode *cur_node = NULL; + + cur_node = head; + + for(; cur_node; cur_node = cur_node->next) + { + if (cur_node->type == XML_ELEMENT_NODE) + { + if (compare_field_name(field_name, cur_node->name) == 0) + { + return cur_node; + } + } + } + + return NULL; +} + +int marc_delete_subfield(xmlNode *head, UChar *field_name) +{ + xmlNode *cur_node = NULL; + + cur_node = marc_get_field(head, field_name); + + xmlUnlinkNode(cur_node); + xmlFreeNode(cur_node); + + return 1; +} + +int marc_delete_field(xmlNode *cur_node) +{ + xmlUnlinkNode(cur_node); + xmlFreeNode(cur_node); + + return 1; +} + +int insert_fields_ordered(xmlDoc *record, xmlNode *source_node) +{ + xmlNode *cur_node = NULL; + xmlNode *new_node; + cur_node = xmlDocGetRootElement(record); + + for (; cur_node; cur_node = cur_node->next) + { + if (cur_node->type == XML_ELEMENT_NODE) + { + if(strcmp((char *)source_node->name, (char *)cur_node->name) > 0) + { + new_node = xmlCopyNode(source_node, 1); + xmlAddPrevSibling(cur_node, new_node); + return 1; + } + } + } + return 0; +} + +int handle_match(UChar *match, UChar **msf, int32_t *msf_len, UChar **mre, int32_t *mre_len) +{ + UErrorCode err = 0; + URegularExpression *regexp; + UParseError pe; + UChar uregexp[200], replacement[200]; + int32_t match_list_len, match_len, m_list_len, m_num; + UChar **m_list, *match_list; + UChar *msf_t = NULL, *mre_t = NULL; + int32_t msf_t_len = 0, mre_t_len = 0; + + if (match != NULL) + { + match_len = u_strlen(match); + u_uastrncpy(uregexp, "~", sizeof("~")); + regexp = uregex_open(uregexp, u_strlen(uregexp), 0, &pe, &err); + uregex_setText(regexp, match, match_len, &err); + + match_list_len = match_len; + match_list = palloc((match_len + 1) * sizeof(UChar)); + m_list_len = 5; + m_list = palloc(m_list_len * sizeof(UChar *)); + m_num = uregex_split(regexp, match_list, match_list_len, + &match_list_len, m_list, m_list_len, &err); + uregex_close(regexp); + + 
msf_t_len = uregex_group(regexp, 1, NULL, 0, &err); + msf_t = palloc((msf_t_len + 1) * sizeof(UChar)); + uregex_group(regexp, 1, msf_t, msf_t_len, &err); + + mre_t_len = uregex_group(regexp, 2, NULL, 0, &err); + mre_t = palloc((mre_t_len + 1) * sizeof(UChar)); + uregex_group(regexp, 1, mre_t, mre_t_len, &err); + + if (msf_t_len > 0 && mre_t_len > 0) + { + u_uastrncpy(uregexp, "^\\s*", sizeof("^\\s*")); + u_uastrncpy(replacement, "", sizeof("")); + msf_t = u_strreplace(uregexp, 0, replacement, msf_t, msf_t_len, 1); + + u_uastrncpy(uregexp, "\\s*$", sizeof("\\s*$")); + u_uastrncpy(replacement, "", sizeof("")); + msf_t = u_strreplace(uregexp, 0, replacement, msf_t, msf_t_len, 1); + + u_uastrncpy(uregexp, "^\\s*", sizeof("^\\s*")); + u_uastrncpy(replacement, "", sizeof("")); + mre_t = u_strreplace(uregexp, 0, replacement, mre_t, mre_t_len, 1); + + u_uastrncpy(uregexp, "\\s*$", sizeof("\\s*$")); + u_uastrncpy(replacement, "", sizeof("")); + mre_t = u_strreplace(uregexp, 0, replacement, mre_t, mre_t_len, 1); + } + } + + *msf = msf_t; + *mre = mre_t; + *msf_len = msf_t_len; + *mre_len = mre_t_len; + + return 1; +} + +int handle_field(URegularExpression *regexp, + int32_t f_len, + UChar **field_name_result, + int32_t *field_name_result_len, + UChar **sf_result, + int32_t *sf_result_len, + UChar **match_result, + int32_t *match_result_len) +{ + UErrorCode err = 0; + UChar uregexp[200], replacement[200]; + UChar *field_name; + int32_t field_name_len; + UChar *sf; + int32_t sf_len; + UChar *match; + int32_t match_len; + + + field_name_len = uregex_group(regexp, 1, NULL, 0, &err); + field_name = palloc((field_name_len + 1) * sizeof(UChar)); + uregex_group(regexp, 1, field_name, field_name_len, &err); + + u_uastrncpy(uregexp, "\\s+", sizeof("\\s+")); + u_uastrncpy(replacement, "", sizeof("")); + + field_name = u_strreplace(uregexp, 0, replacement, field_name, field_name_len, 1); + + sf_len = uregex_group(regexp, 2, NULL, 0, &err); + sf = palloc((sf_len + 1) * sizeof(UChar)); + 
uregex_group(regexp, 2, sf, sf_len, &err); + + sf = u_strreplace(uregexp, 0, replacement, sf, sf_len, 1); + + match_len = uregex_group(regexp, 3, NULL, 0, &err); + match = palloc((match_len + 1) * sizeof(UChar)); + uregex_group(regexp, 3, match, match_len, &err); + + u_uastrncpy(uregexp, "^\\s*", sizeof("^\\s*")); + u_uastrncpy(replacement, "", sizeof("")); + match = u_strreplace(uregexp, 0, replacement, match, match_len, 1); + + u_uastrncpy(uregexp, "\\s*$", sizeof("\\s*$")); + u_uastrncpy(replacement, "", sizeof("")); + match = u_strreplace(uregexp, 0, replacement, match, match_len, 1); + + *field_name_result = field_name; + *field_name_result_len = field_name_len; + *sf_result = sf; + *sf_result_len = sf_len; + *match_result = match; + *match_result_len = match_len; + + return 1; +} + +UChar *xmlchar_to_uchar(const xmlChar *src, int32_t src_len) +{ + UChar *result; + int32_t result_len; + UErrorCode err = 0; + + u_strFromUTF8(NULL, 0, &result_len, + (char *)src, src_len, &err); + + result = palloc(result_len * sizeof(UChar)); + + u_strFromUTF8(result, result_len, &result_len, + (char *)src, src_len, &err); + + return result; +} + + +PG_FUNCTION_INFO_V1(vandelay_add_field); + +Datum vandelay_add_field(PG_FUNCTION_ARGS) +{ + text *target_xml = PG_GETARG_TEXT_P(0); + text *source_xml = PG_GETARG_TEXT_P(1); + text *field_spec = PG_GETARG_TEXT_P(2); + int force_add = PG_GETARG_UINT32(3); + UChar *result_xml; + int32_t result_xml_len; + UChar *field, *field_list, *sf_list; + UChar *f, *msf, *mre; + int i; + int32_t field_len, f_len, field_list_len, f_list_len; + int32_t msf_len, mre_len, match_len; + int32_t f_num, s_num; + UChar **f_list; + UChar **s_list; + UChar uregexp[200], replacement[200]; + UChar *field_name, *sf, *match; + int32_t field_name_len, sf_len, sf_list_len, s_list_len; + UErrorCode err = 0; + URegularExpression *regexp; + UParseError pe; + xmlDoc *source_r, *target_r; + xmlNode *source_field, *target_field, *cur_node; + xmlChar *temp_result; + 
int32_t temp_result_len; + UBool if_find; + + if (target_xml == NULL || source_xml == NULL || + field_spec == NULL ) + { + PG_RETURN_NULL(); + } + + source_r = xmlParseMemory(VARDATA(source_xml), VARSIZE(source_xml)); + target_r = xmlParseMemory(VARDATA(target_xml), VARSIZE(target_xml)); + + if (source_r == NULL && target_r == NULL) + { + PG_RETURN_TEXT_P(target_xml); + } + + field_len = VARSIZE(field_spec) - VARHDRSZ; + field = palloc((field_len + 1) * sizeof(UChar)); + field_list = palloc((field_len + 1) * sizeof(UChar)); + + field = u_strFromUTF8(field, field_len, NULL, VARDATA(field_spec), field_len, &err); + if (U_FAILURE(err) || field == NULL) + { + field = NULL; + goto Fail; + } + u_uastrncpy(uregexp, ",", sizeof(",")); + regexp = uregex_open(uregexp, u_strlen(uregexp), 0, &pe, &err); + uregex_setText(regexp, field, field_len, &err); + + //Split the field_spec. + f_list_len = 10; + f_list = palloc(f_list_len * sizeof(UChar *)); + while(1) + { + f_num = uregex_split(regexp, field_list, field_len, + &field_list_len, f_list, f_list_len, &err); + if(f_num >= f_list_len) + { + pfree(f_list); + f_list_len *= 2; + f_list = palloc(f_list_len * sizeof(UChar *)); + } + } + uregex_close(regexp); + + for(i = 0; i < f_num; i++) + { + f = f_list[i]; + f_len = u_strlen(f_list[i]); + + u_uastrncpy(uregexp, "^\\s*", sizeof("^\\s*")); + u_uastrncpy(replacement, "", sizeof("")); + + f = u_strreplace(uregexp, 0, replacement, f, u_strlen(f), 1); + + u_uastrncpy(uregexp, "\\s*$", sizeof("\\s*$")); + u_uastrncpy(replacement, "", sizeof("")); + + f = u_strreplace(uregexp, 0, replacement, f, u_strlen(f), 1); + + u_uastrncpy(uregexp, "^(.{3})(\\w*)(?:\\[([^]]*)\\])?$", sizeof("^(.{3})(\\w*)(?:\\[([^]]*)\\])?$")); + + regexp = uregex_open(uregexp, u_strlen(uregexp), 0, &pe, &err); + uregex_setText(regexp, f, f_len, &err); + + if(uregex_matches(regexp, 0, &err)) + { + + handle_field(regexp, + f_len, + &field_name, + &field_name_len, + &sf, + &sf_len, + &match, + &match_len); + //Split 
sf. + u_uastrncpy(uregexp, "", sizeof("")); + regexp = uregex_open(uregexp, u_strlen(uregexp), 0, &pe, &err); + uregex_setText(regexp, sf, sf_len, &err); + + sf_list_len = sf_len; + sf_list = palloc((sf_len + 1) * sizeof(UChar)); + s_list_len = 10; + s_list = palloc(s_list_len * sizeof(UChar *)); + s_num = uregex_split(regexp, sf_list, sf_list_len, + &sf_list_len, s_list, s_list_len, &err); + uregex_close(regexp); + + handle_match(match, &msf, &msf_len, &mre, &mre_len); + + if (s_num != 0) + { + source_field = xmlDocGetRootElement(source_r); + source_field = marc_get_field(source_field, field_name); + + while (source_field != NULL) + { + target_field = xmlDocGetRootElement(target_r); + target_field = marc_get_field(target_field, field_name); + if (target_field == NULL) + { + if (force_add || (msf != NULL && mre != NULL)) + { + insert_fields_ordered(target_r, source_field); + } + } + else + { + for (; target_field; target_field = marc_get_field(target_field, field_name)) + { + if (msf != NULL && mre != NULL) + { + UChar *content; + regexp = uregex_open(mre, mre_len, 0, &pe, &err); + cur_node = marc_get_field(target_field->children, msf); + content = xmlchar_to_uchar(cur_node->content, strlen((char *)cur_node->content)); + uregex_setText(regexp, content, u_strlen(content), &err); + + if_find = uregex_find(regexp, 0, &err); + + uregex_close(regexp); + if (if_find == FALSE) + { + continue; + } + } + + for (i = 0; i < s_num; i++) + { + UChar *name, *content; + cur_node = marc_get_field(source_field->children, s_list[i]); + while (cur_node != NULL) + { + name = xmlchar_to_uchar(cur_node->name, strlen((char *)cur_node->name)); + content = xmlchar_to_uchar(cur_node->content, strlen((char *)cur_node->content)); + marc_add_subfield(target_field, name, content); + cur_node = marc_get_field(source_field->children, s_list[i]); + } + } + } + } + source_field = marc_get_field(source_field, field_name); + } + } + else + { + source_field = xmlDocGetRootElement(source_r); + 
source_field = marc_get_field(source_field, field_name); + insert_fields_ordered(target_r, source_field); + } + + } + + + uregex_close(regexp); + } + + xmlDocDumpMemory(target_r, &temp_result, &temp_result_len); + + u_uastrncpy(uregexp, "^<\\?.+?\\?>$", sizeof("^<\\?.+?\\?>$")); + u_uastrncpy(replacement, "", sizeof("")); + + u_strFromUTF8(NULL, 0, &result_xml_len, (char *)temp_result, temp_result_len, &err); + + err = 0; + result_xml = (UChar *)palloc(result_xml_len * sizeof(UChar)); + result_xml = u_strFromUTF8(result_xml, + result_xml_len, + &result_xml_len, + (char *)temp_result, + temp_result_len, + &err); + if (result_xml != NULL) + { + result_xml[result_xml_len] = '\0'; + } + + u_uastrncpy(uregexp, "^<\\?.+?\\?>$", sizeof("^<\\?.+?\\?>$")); + u_uastrncpy(replacement, "", sizeof("")); + + result_xml = u_strreplace(uregexp, + UREGEX_MULTILINE, + replacement, + result_xml, + result_xml_len, + 0); + + u_uastrncpy(uregexp, "\n", sizeof("\n")); + u_uastrncpy(replacement, "", sizeof("")); + + result_xml = u_strreplace(uregexp, + UREGEX_DOTALL, + replacement, + result_xml, + result_xml_len, + 1); + + + u_uastrncpy(uregexp, ">\\s+<", sizeof(">\\s+<")); + u_uastrncpy(replacement, "><", sizeof("><")); + + result_xml = u_strreplace(uregexp, + UREGEX_DOTALL, + replacement, + result_xml, + result_xml_len, + 1); + + u_strToUTF8(NULL, 0, &temp_result_len, result_xml, result_xml_len, &err); + + target_xml = (text *)palloc(temp_result_len + VARHDRSZ); + SET_VARSIZE(target_xml, temp_result_len + VARHDRSZ); + + u_strToUTF8(VARDATA(target_xml), 0, &temp_result_len, result_xml, result_xml_len, &err); + + xmlFree(result_xml); + + +Fail: + xmlFreeDoc(source_r); + xmlFreeDoc(target_r); + + xmlCleanupParser(); + + PG_RETURN_TEXT_P(target_xml); +} + + +PG_FUNCTION_INFO_V1(vandelay_strip_field); + +Datum vandelay_strip_field(PG_FUNCTION_ARGS) +{ + text *xml = PG_GETARG_TEXT_P(0); + text *field_spec = PG_GETARG_TEXT_P(1); + xmlDoc *r; + UChar *field, *field_list, **f_list, *sf_list, 
**s_list, *msf, *mre; + UChar *field_name, *f, *sf, *match; + int32_t sf_list_len, s_list_len, msf_len, mre_len; + int32_t sf_len, match_len; + UErrorCode err = 0; + URegularExpression *regexp; + UParseError pe; + UChar uregexp[200], replacement[200]; + int32_t field_len, f_len, field_list_len, f_list_len, field_name_len; + int32_t f_num, s_num; + xmlNode *to_field, *cur_node; + UBool if_find; + xmlChar *temp_result; + UChar *result_xml; + int32_t temp_result_len, result_xml_len; + text *result = NULL; + int i; + + if (xml == NULL || field_spec == NULL) + { + PG_RETURN_NULL(); + } + + r = xmlParseMemory(VARDATA(xml), VARSIZE(xml)); + if (r == NULL) + { + PG_RETURN_TEXT_P(xml); + } + + field_len = VARSIZE(field_spec) - VARHDRSZ; + field = palloc((field_len + 1) * sizeof(UChar)); + field_list = palloc((field_len + 1) * sizeof(UChar)); + + field = u_strFromUTF8(field, field_len, NULL, VARDATA(field_spec), field_len, &err); + if (U_FAILURE(err) || field == NULL) + { + field = NULL; + goto Fail; + } + + u_uastrncpy(uregexp, ",", sizeof(",")); + regexp = uregex_open(uregexp, u_strlen(uregexp), 0, &pe, &err); + uregex_setText(regexp, field, field_len, &err); + + //Split the field_spec. 
+ f_list_len = 10; + f_list = palloc(f_list_len * sizeof(UChar *)); + while(1) + { + f_num = uregex_split(regexp, field_list, field_len, + &field_list_len, f_list, f_list_len, &err); + if(f_num >= f_list_len) + { + pfree(f_list); + f_list_len *= 2; + f_list = palloc(f_list_len * sizeof(UChar *)); + } + } + uregex_close(regexp); + + + for(i = 0; i < f_num; i++) + { + f = f_list[i]; + f_len = u_strlen(f_list[i]); + + u_uastrncpy(uregexp, "^\\s*", sizeof("^\\s*")); + u_uastrncpy(replacement, "", sizeof("")); + + f = u_strreplace(uregexp, 0, replacement, f, u_strlen(f), 1); + + u_uastrncpy(uregexp, "\\s*$", sizeof("\\s*$")); + u_uastrncpy(replacement, "", sizeof("")); + + f = u_strreplace(uregexp, 0, replacement, f, u_strlen(f), 1); + + u_uastrncpy(uregexp, "^(.{3})(\\w*)(?:\\[([^]]*)\\])?$", sizeof("^(.{3})(\\w*)(?:\\[([^]]*)\\])?$")); + + regexp = uregex_open(uregexp, u_strlen(uregexp), 0, &pe, &err); + uregex_setText(regexp, f, f_len, &err); + + if(uregex_matches(regexp, 0, &err)) + { + handle_field(regexp, + f_len, + &field_name, + &field_name_len, + &sf, + &sf_len, + &match, + &match_len); + + //Split sf. 
+ u_uastrncpy(uregexp, "", sizeof("")); + regexp = uregex_open(uregexp, u_strlen(uregexp), 0, &pe, &err); + uregex_setText(regexp, sf, sf_len, &err); + + sf_list_len = sf_len; + sf_list = palloc((sf_len + 1) * sizeof(UChar)); + s_list_len = 10; + s_list = palloc(s_list_len * sizeof(UChar *)); + s_num = uregex_split(regexp, sf_list, sf_list_len, + &sf_list_len, s_list, s_list_len, &err); + uregex_close(regexp); + + + handle_match(match, &msf, &msf_len, &mre, &mre_len); + to_field = xmlDocGetRootElement(r); + to_field = marc_get_field(to_field, field_name); + for (; to_field; to_field = marc_get_field(to_field, field_name)) + { + if (msf != NULL && mre != NULL) + { + UChar *content; + regexp = uregex_open(mre, mre_len, 0, &pe, &err); + cur_node = marc_get_field(to_field->children, msf); + content = xmlchar_to_uchar(cur_node->content, strlen((char *)cur_node->content)); + uregex_setText(regexp, content, u_strlen(content), &err); + + if_find = uregex_find(regexp, 0, &err); + + uregex_close(regexp); + if (if_find == FALSE) + { + continue; + } + } + + if (s_num != 0) + { + for (i = 0; i < s_num; i++) + { + marc_delete_subfield(to_field->children, s_list[i]); + } + } + else + { + marc_delete_field(to_field); + } + } + + } + } + xmlDocDumpMemory(r, &temp_result, &temp_result_len); + + u_strFromUTF8(NULL, 0, &result_xml_len, (char *)temp_result, temp_result_len, &err); + + + err = 0; + result_xml = (UChar *)palloc(result_xml_len); + + result_xml = u_strFromUTF8(result_xml, + result_xml_len, + &result_xml_len, + (char *)temp_result, + temp_result_len, + &err); + if (result_xml != NULL) + { + result_xml[result_xml_len] = '\0'; + } + + + u_uastrncpy(uregexp, "^<\\?.+?\\?>$", sizeof("^<\\?.+?\\?>$")); + u_uastrncpy(replacement, "", sizeof("")); + + result_xml = u_strreplace(uregexp, + UREGEX_MULTILINE, + replacement, + result_xml, + result_xml_len, + 0); + + u_uastrncpy(uregexp, "\n", sizeof("\n")); + u_uastrncpy(replacement, "", sizeof("")); + + result_xml = 
u_strreplace(uregexp, + UREGEX_DOTALL, + replacement, + result_xml, + result_xml_len, + 1); + + + u_uastrncpy(uregexp, ">\\s+<", sizeof(">\\s+<")); + u_uastrncpy(replacement, "><", sizeof("><")); + + result_xml = u_strreplace(uregexp, + UREGEX_DOTALL, + replacement, + result_xml, + result_xml_len, + 1); + + u_strToUTF8(NULL, 0, &temp_result_len, result_xml, result_xml_len, &err); + + result = (text *)palloc(temp_result_len + VARHDRSZ); + SET_VARSIZE(result, temp_result_len + VARHDRSZ); + + u_strToUTF8(VARDATA(result), temp_result_len, &temp_result_len, result_xml, result_xml_len, &err); + + xmlFree(temp_result); + pfree(result_xml); + +Fail: + xmlFreeDoc(r); + xmlCleanupParser(); + + PG_RETURN_TEXT_P(result); +} diff --git a/Open-ILS/src/sql/Pg/extensions/xml.functions_in_c.c b/Open-ILS/src/sql/Pg/extensions/xml.functions_in_c.c new file mode 100755 index 0000000000..5ab1dc2c3f --- /dev/null +++ b/Open-ILS/src/sql/Pg/extensions/xml.functions_in_c.c @@ -0,0 +1,92 @@ +/************************************************************************/ +/* C Implementation: oils.xslt.process + * + * Descritption: + * This file implement oils.xslt.process. + * The function is included in the PostgreSQL extension c_functions. + * libxml2, libxslt and postgres lib is needed to build this file. + * + * Author: Swenyu Duan , (C) 2012 + * + * Copyright: See COPYING file that comes with this distribution. 
+ */
+/************************************************************************/
+#include "postgres.h"
+#include "fmgr.h"
+#include "stdio.h"
+#include "libxml/tree.h"
+#include "libxml/parser.h"
+#include "libxslt/xslt.h"
+#include "libxslt/xsltInternals.h"
+#include "libxslt/transform.h"
+
+PG_FUNCTION_INFO_V1(oils_xslt_process);
+
+/*
+ * oils_xslt_process(doc TEXT, xslt TEXT) RETURNS TEXT
+ *
+ * Parses both arguments as XML, compiles the second one as an XSLT style
+ * sheet, applies it to the first one and returns the serialized result.
+ *
+ * Returns SQL NULL when either argument is empty, or when parsing the
+ * document, parsing/compiling the style sheet, applying it, or serializing
+ * the result fails.
+ *
+ * Every libxml2/libxslt object created here is released before returning,
+ * on the success path as well as on every failure path.
+ */
+Datum oils_xslt_process(PG_FUNCTION_ARGS)
+{
+    text *doc = PG_GETARG_TEXT_P(0);
+    text *xslt = PG_GETARG_TEXT_P(1);
+    text *processed_doc = NULL;
+    xmlDocPtr parsed_doc = NULL;
+    xmlDocPtr parsed_xslt = NULL;
+    xmlDocPtr transformed_doc = NULL;
+    xsltStylesheetPtr style_sheet = NULL;
+    xmlChar *result = NULL;
+    int doc_len;
+    int xslt_len;
+    int result_len = 0;
+
+    //VARSIZE counts the varlena header too; only the payload is XML text.
+    doc_len = VARSIZE(doc) - VARHDRSZ;
+    xslt_len = VARSIZE(xslt) - VARHDRSZ;
+    if (doc_len <= 0 || xslt_len <= 0)
+    {
+        goto Done;
+    }
+
+    //Parse the xml.
+    parsed_doc = xmlParseMemory(VARDATA(doc), doc_len);
+    if (parsed_doc == NULL)
+    {
+        goto Done;
+    }
+
+    //Parse the style sheet.
+    parsed_xslt = xmlParseMemory(VARDATA(xslt), xslt_len);
+    if (parsed_xslt == NULL)
+    {
+        goto Done;
+    }
+
+    //Create the style sheet.
+    style_sheet = xsltParseStylesheetDoc(parsed_xslt);
+    if (style_sheet == NULL)
+    {
+        //Compilation failed, so parsed_xslt is still ours to free below.
+        goto Done;
+    }
+    //The compiled style sheet now owns parsed_xslt and frees it itself;
+    //forget our pointer so it is not freed twice.
+    parsed_xslt = NULL;
+
+    //Apply the style sheet to the xml doc. The result goes into its own
+    //variable so the source document can still be freed afterwards.
+    transformed_doc = xsltApplyStylesheet(style_sheet, parsed_doc, NULL);
+    if (transformed_doc == NULL)
+    {
+        goto Done;
+    }
+
+    //Output the transformed doc.
+    //NOTE(review): xmlDocDumpMemory always serializes as XML; if the style
+    //sheet's xsl:output settings must be honoured, xsltSaveResultToString is
+    //probably the call wanted here -- confirm against the plperlu version.
+    xmlDocDumpMemory(transformed_doc, &result, &result_len);
+    if (result == NULL || result_len < 0)
+    {
+        goto Done;
+    }
+
+    //Copy the result. The stored varlena size has to include the header,
+    //otherwise the returned text loses its last VARHDRSZ bytes.
+    processed_doc = palloc(result_len + VARHDRSZ);
+    SET_VARSIZE(processed_doc, result_len + VARHDRSZ);
+    memcpy(VARDATA(processed_doc), result, result_len);
+
+Done:
+    //Buffers handed out by libxml are released with xmlFree, not free().
+    if (result != NULL)
+    {
+        xmlFree(result);
+    }
+    if (transformed_doc != NULL)
+    {
+        xmlFreeDoc(transformed_doc);
+    }
+    if (parsed_doc != NULL)
+    {
+        xmlFreeDoc(parsed_doc);
+    }
+    if (style_sheet != NULL)
+    {
+        xsltFreeStylesheet(style_sheet);
+    }
+    if (parsed_xslt != NULL)
+    {
+        xmlFreeDoc(parsed_xslt);
+    }
+
+    if (processed_doc == NULL)
+    {
+        PG_RETURN_NULL();
+    }
+
+    PG_RETURN_TEXT_P(processed_doc);
+}
-- 
2.11.0