From 2d8f5c4faeedfdec4ec41888c15755871ca4d54f Mon Sep 17 00:00:00 2001 From: Mike Rylander Date: Mon, 10 Jan 2022 09:29:01 -0500 Subject: [PATCH] Utility functions to find "bad" XPath Evergreen community testing has identified two specific changes to how XPath is processed in PostgreSQL 10 and later. Both of these changes came about as a result of improved PostgreSQL standards conformance with regard to how XML data should be handled in SQL. The first change in PostgreSQL is related to how XML is handled by the XPath processor and requires that Evergreen use DOCUMENT oriented, rather than CONTENT oriented XPath expressions. This means that XPath expressions cannot begin with "./" or "*" because in DOCUMENT mode there is no context node that can be assumed. These expressions must be adjusted, and this commit provides a utitily function that attempts to detect user-supplied XPath expressions that should be considered. The second change is a side effect of the first, related to how the results of global XPath expressions are assembled when processed in DOCUMENT mode rather than CONTENT mode. In some situations, expressions will match a node and its parent, and they will both show up in the result set as separate values. The effect for Evergreen is duplicated content in many cases. These expressions can take many forms, but a common pattern is a whole-document search with one level of tests, no element name test, and an attribute value test. This commit provides a function that attempts to find expressions matching this pattern. Finally, these functions are built on top of a general function that takes two regexps, one for the field name and one for the field content, and returns the schema and table name, column data, and primary key value of matching rows, which may be useful for other content pattern matching needs in the future. Signed-off-by: Mike Rylander --- .../sql/Pg/upgrade/XXXX.data.find-bad-xpath.sql | 71 ++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 Open-ILS/src/sql/Pg/upgrade/XXXX.data.find-bad-xpath.sql diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.data.find-bad-xpath.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.data.find-bad-xpath.sql new file mode 100644 index 0000000000..6723a4bef0 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.data.find-bad-xpath.sql @@ -0,0 +1,71 @@ +BEGIN; + +CREATE OR REPLACE FUNCTION evergreen.find_matching_fields (re TEXT, f_re TEXT) RETURNS TABLE (find_matching_fields TEXT) LANGUAGE PLPGSQL AS $F$ +DECLARE + S TEXT; + T TEXT; + F TEXT; + P TEXT; +BEGIN + + FOR S,T,F IN SELECT table_schema, table_name, column_name FROM information_schema.columns WHERE column_name LIKE f_re LOOP + -- RETURN QUERY EXECUTE $$ select 'Checking table $$ || S || '.' || T || $$...'::text$$; + SELECT column_name INTO P FROM information_schema.key_column_usage WHERE table_schema = S AND table_name = T AND constraint_name like '%_pkey'; + IF P IS NOT NULL THEN + RETURN QUERY EXECUTE $$ select '!!! Field $$ || F || $$ on $$ || S || '.' || T || $$ with $$ || P || $$ = ' || $$ || P || $$ || ': ' || $$ || F || $$ from $$ || S || '.' || T || $$ where $$ || F || $$ ~ '$$ || re || $$' $$; + ELSE + RETURN QUERY EXECUTE $$ select '!!! Field $$ || F || $$ on $$ || S || '.' || T || $$: ' || $$ || F || $$ from $$ || S || '.' || T || $$ where $$ || F || $$ ~ '$$ || re || $$' $$; + END IF; + END LOOP; + +END; +$F$; + +CREATE OR REPLACE FUNCTION evergreen.find_matching_xpath (re TEXT) RETURNS TABLE (find_matching_xpath TEXT) LANGUAGE SQL AS $F$ + SELECT evergreen.find_matching_fields(re,'%xpath%'); +$F$; + +CREATE OR REPLACE FUNCTION evergreen.find_content_mode_xpath () RETURNS TABLE (find_content_mode_xpath TEXT) LANGUAGE SQL AS $F$ + SELECT evergreen.find_matching_xpath('(?:^\(?|\|)[.*]'); +$F$; + +COMMENT ON FUNCTION evergreen.find_content_mode_xpath () IS $$ +/** + +This function will look at all fields in an Evergreen database with +"xpath" in their name and identify those that use an idiom valid +in CONTENT mode, used internally in XSLT, but which is invalid in +DOCUMENT mode, the mode used in SQL/XML. These are typically +expressions that begin with "." or "*", which assumes there is no +difference between the root node and the document node. Any such +XPath expressions will need to be adjusted, often simply by adding +"//" before "*", or replacing "./" with "//". + +**/ +$$; + +CREATE OR REPLACE FUNCTION evergreen.find_simple_xpath () RETURNS TABLE (find_content_mode_xpath TEXT) LANGUAGE SQL AS $F$ + SELECT evergreen.find_matching_xpath('(^|//)[^/]*\*\[\s*(?!local-name)[^/]+@[^/]+$'); +$F$; + +COMMENT ON FUNCTION evergreen.find_simple_xpath () IS $$ +/** + +This function will look at all fields in an Evergreen database with +"xpath" in their name and identify those that seemt to be overly +simple, which can lead to duplicated content. It looks for expressions +that have just one node level, start with an "all nodes" test, use at +least one attribute test, and do not use the local-name() function as +the first node test. + +This heuristic is far from perfect, but should identify cases where +more specificity would be beneficial. + +**/ +$$; + +COMMIT; + +SELECT evergreen.find_content_mode_xpath(); +SELECT evergreen.find_simple_xpath(); + -- 2.11.0