From f04004d3e26176f05aed04b5c1f17f9f15465591 Mon Sep 17 00:00:00 2001 From: Galen Charlton Date: Wed, 30 Nov 2022 11:10:36 -0500 Subject: [PATCH] LP#1998355: reduce growth of DYM dictionary This patch reduces the number of updates to search.sympell_dictionary rows that would not change the contents of those rows, thereby reducing the potential for certain record maintenance operations to significantlly bloat that table. In particular, it adjust the upsert to update the row for an existing prefix only if there would be a net change in at least one of the *_count columns or the list of suggestions. (Note that if a row is the target of an UPDATE statement, PostgreSQL will _always_ create a row version, even if there is no change to the contents of the row.) It should be noted that while this patch is useful in and of itself, there is a longer-term fix that would have additional benefits: adjust the overall reingest logic so that it minimizes changes to all large tables derived from the bib record when a bib gets reingested. A row that never gets touched because it doesn't have to be can never become bloat. To test ------- [1] In a Concerto database, ensure that idempotent updates of the MARC in biblio.record_entry will nonetheless force a reingest by running: update config.internal_flag set enabled = true where name = 'ingest.reingest.force_on_same_marc'; [2] Note the size of search.symspell_dictionary by running: select pg_size_pretty(pg_total_relation_size('search.symspell_dictionary')); [3] Run a few rounds of the following update that forces a reingest of the bibs: update biblio.record_entry set id = id; [4] For the sake of fairness, run a vacuum on the table: VACUUM ANALYZE search.sympsell_dictionary [5] Run the size measuremeant again and not that it's significantly larger. [6] Run the following the reset the table size: VACUUM FULL search.symspell_dictionary; [7] Note the size, apply the patch, and repeat step 3. [8] This time, the table size should be the same (or close to the same) as it was at the beginning of step 7. Signed-off-by: Galen Charlton Signed-off-by: Jason Boyer --- Open-ILS/src/sql/Pg/300.schema.staged_search.sql | 15 +++++ ...XXXX.schema.lp1998355-reduce-symspell-churn.sql | 64 ++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 Open-ILS/src/sql/Pg/upgrade/XXXX.schema.lp1998355-reduce-symspell-churn.sql diff --git a/Open-ILS/src/sql/Pg/300.schema.staged_search.sql b/Open-ILS/src/sql/Pg/300.schema.staged_search.sql index 5abbbef2f0..2455aadd76 100644 --- a/Open-ILS/src/sql/Pg/300.schema.staged_search.sql +++ b/Open-ILS/src/sql/Pg/300.schema.staged_search.sql @@ -1202,6 +1202,21 @@ CREATE OR REPLACE FUNCTION search.symspell_dictionary_reify () RETURNS SETOF sea identifier_count = GREATEST(0, d.identifier_count + EXCLUDED.identifier_count), identifier_suggestions = evergreen.text_array_merge_unique(EXCLUDED.identifier_suggestions,d.identifier_suggestions) + + WHERE ( + EXCLUDED.keyword_count <> 0 OR + EXCLUDED.title_count <> 0 OR + EXCLUDED.author_count <> 0 OR + EXCLUDED.subject_count <> 0 OR + EXCLUDED.series_count <> 0 OR + EXCLUDED.identifier_count <> 0 OR + NOT (EXCLUDED.keyword_suggestions <@ d.keyword_suggestions) OR + NOT (EXCLUDED.title_suggestions <@ d.title_suggestions) OR + NOT (EXCLUDED.author_suggestions <@ d.author_suggestions) OR + NOT (EXCLUDED.subject_suggestions <@ d.subject_suggestions) OR + NOT (EXCLUDED.series_suggestions <@ d.series_suggestions) OR + NOT (EXCLUDED.identifier_suggestions <@ d.identifier_suggestions) + ) RETURNING *; $f$ LANGUAGE SQL; diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.lp1998355-reduce-symspell-churn.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.lp1998355-reduce-symspell-churn.sql new file mode 100644 index 0000000000..996e967408 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.lp1998355-reduce-symspell-churn.sql @@ -0,0 +1,64 @@ +BEGIN; + +-- SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version); + +CREATE OR REPLACE FUNCTION search.symspell_dictionary_reify () RETURNS SETOF search.symspell_dictionary AS $f$ + WITH new_rows AS ( + DELETE FROM search.symspell_dictionary_updates WHERE transaction_id = txid_current() RETURNING * + ), computed_rows AS ( -- this collapses the rows deleted into the format we need for UPSERT + SELECT SUM(keyword_count) AS keyword_count, + SUM(title_count) AS title_count, + SUM(author_count) AS author_count, + SUM(subject_count) AS subject_count, + SUM(series_count) AS series_count, + SUM(identifier_count) AS identifier_count, + + prefix_key, + + ARRAY_REMOVE(ARRAY_AGG(DISTINCT keyword_suggestions[1]), NULL) AS keyword_suggestions, + ARRAY_REMOVE(ARRAY_AGG(DISTINCT title_suggestions[1]), NULL) AS title_suggestions, + ARRAY_REMOVE(ARRAY_AGG(DISTINCT author_suggestions[1]), NULL) AS author_suggestions, + ARRAY_REMOVE(ARRAY_AGG(DISTINCT subject_suggestions[1]), NULL) AS subject_suggestions, + ARRAY_REMOVE(ARRAY_AGG(DISTINCT series_suggestions[1]), NULL) AS series_suggestions, + ARRAY_REMOVE(ARRAY_AGG(DISTINCT identifier_suggestions[1]), NULL) AS identifier_suggestions + FROM new_rows + GROUP BY prefix_key + ) + INSERT INTO search.symspell_dictionary AS d SELECT * FROM computed_rows + ON CONFLICT (prefix_key) DO UPDATE SET + keyword_count = GREATEST(0, d.keyword_count + EXCLUDED.keyword_count), + keyword_suggestions = evergreen.text_array_merge_unique(EXCLUDED.keyword_suggestions,d.keyword_suggestions), + + title_count = GREATEST(0, d.title_count + EXCLUDED.title_count), + title_suggestions = evergreen.text_array_merge_unique(EXCLUDED.title_suggestions,d.title_suggestions), + + author_count = GREATEST(0, d.author_count + EXCLUDED.author_count), + author_suggestions = evergreen.text_array_merge_unique(EXCLUDED.author_suggestions,d.author_suggestions), + + subject_count = GREATEST(0, d.subject_count + EXCLUDED.subject_count), + subject_suggestions = evergreen.text_array_merge_unique(EXCLUDED.subject_suggestions,d.subject_suggestions), + + series_count = GREATEST(0, d.series_count + EXCLUDED.series_count), + series_suggestions = evergreen.text_array_merge_unique(EXCLUDED.series_suggestions,d.series_suggestions), + + identifier_count = GREATEST(0, d.identifier_count + EXCLUDED.identifier_count), + identifier_suggestions = evergreen.text_array_merge_unique(EXCLUDED.identifier_suggestions,d.identifier_suggestions) + + WHERE ( + EXCLUDED.keyword_count <> 0 OR + EXCLUDED.title_count <> 0 OR + EXCLUDED.author_count <> 0 OR + EXCLUDED.subject_count <> 0 OR + EXCLUDED.series_count <> 0 OR + EXCLUDED.identifier_count <> 0 OR + NOT (EXCLUDED.keyword_suggestions <@ d.keyword_suggestions) OR + NOT (EXCLUDED.title_suggestions <@ d.title_suggestions) OR + NOT (EXCLUDED.author_suggestions <@ d.author_suggestions) OR + NOT (EXCLUDED.subject_suggestions <@ d.subject_suggestions) OR + NOT (EXCLUDED.series_suggestions <@ d.series_suggestions) OR + NOT (EXCLUDED.identifier_suggestions <@ d.identifier_suggestions) + ) + RETURNING *; +$f$ LANGUAGE SQL; + +COMMIT; -- 2.11.0