From f8708ca97eaf2e1e41a8cd3ed8715d23da7e5f31 Mon Sep 17 00:00:00 2001
From: Galen Charlton <gmc@equinoxOLI.org>
Date: Wed, 30 Nov 2022 11:10:36 -0500
Subject: [PATCH] LP#1998355: reduce growth of DYM dictionary

This patch reduces the number of updates to search.sympell_dictionary
rows that would not change the contents of those rows, thereby
reducing the potential for certain record maintenance operations to
significantlly bloat that table.

In particular, it adjust the upsert to update the row for an existing
prefix only if there would be a net change in at least one of the *_count
columns or the list of suggestions. (Note that if a row is the target of
an UPDATE statement, PostgreSQL will _always_ create a row version, even
if there is no change to the contents of the row.)

It should be noted that while this patch is useful in and of itself, there
is a longer-term fix that would have additional benefits: adjust the
overall reingest logic so that it minimizes changes to all large tables
derived from the bib record when a bib gets reingested. A row that never
gets touched because it doesn't have to be can never become bloat.

To test
-------
[1] In a Concerto database, ensure that idempotent updates of the MARC
    in biblio.record_entry will nonetheless force a reingest by running:

    update config.internal_flag set enabled = true where name = 'ingest.reingest.force_on_same_marc';

[2] Note the size of search.symspell_dictionary by running:

    select pg_size_pretty(pg_total_relation_size('search.symspell_dictionary'));

[3] Run a few rounds of the following update that forces a reingest of the bibs:

    update biblio.record_entry set id = id;

[4] For the sake of fairness, run a vacuum on the table:

    VACUUM ANALYZE search.sympsell_dictionary

[5] Run the size measuremeant again and not that it's significantly larger.

[6] Run the following the reset the table size:

    VACUUM FULL search.symspell_dictionary;

[7] Note the size, apply the patch, and repeat step 3.

[8] This time, the table size should be the same (or close to the same) as it
    was at the beginning of step 7.

Signed-off-by: Galen Charlton <gmc@equinoxOLI.org>
Signed-off-by: Jason Boyer <JBoyer@equinoxOLI.org>
---
 Open-ILS/src/sql/Pg/300.schema.staged_search.sql   | 15 +++++
 ...XXXX.schema.lp1998355-reduce-symspell-churn.sql | 64 ++++++++++++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100644 Open-ILS/src/sql/Pg/upgrade/XXXX.schema.lp1998355-reduce-symspell-churn.sql

diff --git a/Open-ILS/src/sql/Pg/300.schema.staged_search.sql b/Open-ILS/src/sql/Pg/300.schema.staged_search.sql
index 5abbbef2f0..2455aadd76 100644
--- a/Open-ILS/src/sql/Pg/300.schema.staged_search.sql
+++ b/Open-ILS/src/sql/Pg/300.schema.staged_search.sql
@@ -1202,6 +1202,21 @@ CREATE OR REPLACE FUNCTION search.symspell_dictionary_reify () RETURNS SETOF sea
 
     identifier_count = GREATEST(0, d.identifier_count + EXCLUDED.identifier_count),
     identifier_suggestions = evergreen.text_array_merge_unique(EXCLUDED.identifier_suggestions,d.identifier_suggestions)
+
+    WHERE (
+        EXCLUDED.keyword_count <> 0 OR
+        EXCLUDED.title_count <> 0 OR
+        EXCLUDED.author_count <> 0 OR
+        EXCLUDED.subject_count <> 0 OR
+        EXCLUDED.series_count <> 0 OR
+        EXCLUDED.identifier_count <> 0 OR
+        NOT (EXCLUDED.keyword_suggestions <@ d.keyword_suggestions) OR
+        NOT (EXCLUDED.title_suggestions <@ d.title_suggestions) OR
+        NOT (EXCLUDED.author_suggestions <@ d.author_suggestions) OR
+        NOT (EXCLUDED.subject_suggestions <@ d.subject_suggestions) OR
+        NOT (EXCLUDED.series_suggestions <@ d.series_suggestions) OR
+        NOT (EXCLUDED.identifier_suggestions <@ d.identifier_suggestions)
+    )
  RETURNING *;
 $f$ LANGUAGE SQL;
 
diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.lp1998355-reduce-symspell-churn.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.lp1998355-reduce-symspell-churn.sql
new file mode 100644
index 0000000000..996e967408
--- /dev/null
+++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.lp1998355-reduce-symspell-churn.sql
@@ -0,0 +1,64 @@
+BEGIN;
+
+-- SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version);
+
+CREATE OR REPLACE FUNCTION search.symspell_dictionary_reify () RETURNS SETOF search.symspell_dictionary AS $f$
+ WITH new_rows AS (
+    DELETE FROM search.symspell_dictionary_updates WHERE transaction_id = txid_current() RETURNING *
+ ), computed_rows AS ( -- this collapses the rows deleted into the format we need for UPSERT
+    SELECT  SUM(keyword_count)    AS keyword_count,
+            SUM(title_count)      AS title_count,
+            SUM(author_count)     AS author_count,
+            SUM(subject_count)    AS subject_count,
+            SUM(series_count)     AS series_count,
+            SUM(identifier_count) AS identifier_count,
+
+            prefix_key,
+
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT keyword_suggestions[1]), NULL)    AS keyword_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT title_suggestions[1]), NULL)      AS title_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT author_suggestions[1]), NULL)     AS author_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT subject_suggestions[1]), NULL)    AS subject_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT series_suggestions[1]), NULL)     AS series_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT identifier_suggestions[1]), NULL) AS identifier_suggestions
+      FROM  new_rows
+      GROUP BY prefix_key
+ )
+ INSERT INTO search.symspell_dictionary AS d SELECT * FROM computed_rows
+ ON CONFLICT (prefix_key) DO UPDATE SET
+    keyword_count = GREATEST(0, d.keyword_count + EXCLUDED.keyword_count),
+    keyword_suggestions = evergreen.text_array_merge_unique(EXCLUDED.keyword_suggestions,d.keyword_suggestions),
+
+    title_count = GREATEST(0, d.title_count + EXCLUDED.title_count),
+    title_suggestions = evergreen.text_array_merge_unique(EXCLUDED.title_suggestions,d.title_suggestions),
+
+    author_count = GREATEST(0, d.author_count + EXCLUDED.author_count),
+    author_suggestions = evergreen.text_array_merge_unique(EXCLUDED.author_suggestions,d.author_suggestions),
+
+    subject_count = GREATEST(0, d.subject_count + EXCLUDED.subject_count),
+    subject_suggestions = evergreen.text_array_merge_unique(EXCLUDED.subject_suggestions,d.subject_suggestions),
+
+    series_count = GREATEST(0, d.series_count + EXCLUDED.series_count),
+    series_suggestions = evergreen.text_array_merge_unique(EXCLUDED.series_suggestions,d.series_suggestions),
+
+    identifier_count = GREATEST(0, d.identifier_count + EXCLUDED.identifier_count),
+    identifier_suggestions = evergreen.text_array_merge_unique(EXCLUDED.identifier_suggestions,d.identifier_suggestions)
+
+    WHERE (
+        EXCLUDED.keyword_count <> 0 OR
+        EXCLUDED.title_count <> 0 OR
+        EXCLUDED.author_count <> 0 OR
+        EXCLUDED.subject_count <> 0 OR
+        EXCLUDED.series_count <> 0 OR
+        EXCLUDED.identifier_count <> 0 OR
+        NOT (EXCLUDED.keyword_suggestions <@ d.keyword_suggestions) OR
+        NOT (EXCLUDED.title_suggestions <@ d.title_suggestions) OR
+        NOT (EXCLUDED.author_suggestions <@ d.author_suggestions) OR
+        NOT (EXCLUDED.subject_suggestions <@ d.subject_suggestions) OR
+        NOT (EXCLUDED.series_suggestions <@ d.series_suggestions) OR
+        NOT (EXCLUDED.identifier_suggestions <@ d.identifier_suggestions)
+    )
+ RETURNING *;
+$f$ LANGUAGE SQL;
+
+COMMIT;
-- 
2.11.0