LP#1931737: Allow the delay of symspell updates
author Mike Rylander <mrylander@gmail.com>
Fri, 13 May 2022 16:35:24 +0000 (12:35 -0400)
committer Chris Sharp <csharp@georgialibraries.org>
Mon, 15 Aug 2022 18:27:18 +0000 (14:27 -0400)
This commit adds a new internal flag, auto-created at the time of need,
to control whether record ingest will cause immediate updates to the
symspell dictionary, or if those updates will simply be recorded for
later incorporation.  Inline symspell dictionary updates can cause
record updates to be logically serialized, impacting the performance of
other tools used for batch reingest.

pingest.pl is changed to allow an administrator to make use of this
feature via the --delay-symspell command line flag.

Signed-off-by: Mike Rylander <mrylander@gmail.com>
Open-ILS/src/sql/Pg/030.schema.metabib.sql
Open-ILS/src/sql/Pg/300.schema.staged_search.sql
Open-ILS/src/sql/Pg/upgrade/XXXX.schema.dym_delayed_reify.sql

index ac35fc7..86416ca 100644 (file)
@@ -1168,7 +1168,10 @@ BEGIN
 
     IF NOT b_skip_search THEN
         PERFORM metabib.update_combined_index_vectors(bib_id);
-        PERFORM search.symspell_dictionary_reify(); -- NOTE: we only use search data for symspell today
+        PERFORM * FROM config.internal_flag WHERE name = 'ingest.disable_symspell_reification' AND enabled;
+        IF NOT FOUND THEN
+            PERFORM search.symspell_dictionary_reify();
+        END IF;
     END IF;
 
     RETURN;
index 868ce5f..5abbbef 100644 (file)
@@ -1205,6 +1205,59 @@ CREATE OR REPLACE FUNCTION search.symspell_dictionary_reify () RETURNS SETOF sea
  RETURNING *;
 $f$ LANGUAGE SQL;
 
+CREATE OR REPLACE FUNCTION search.disable_symspell_reification () RETURNS VOID AS $f$ -- switches the 'ingest.disable_symspell_reification' internal flag on
+    INSERT INTO config.internal_flag (name,enabled)
+      VALUES ('ingest.disable_symspell_reification',TRUE) -- same flag name the ingest code looks up before calling search.symspell_dictionary_reify()
+    ON CONFLICT (name) DO UPDATE SET enabled = TRUE; -- upsert: creates the flag row on first use, otherwise just sets enabled back to TRUE
+$f$ LANGUAGE SQL;
+
+CREATE OR REPLACE FUNCTION search.enable_symspell_reification () RETURNS VOID AS $f$ -- switches the 'ingest.disable_symspell_reification' internal flag off
+    UPDATE config.internal_flag SET enabled = FALSE WHERE name = 'ingest.disable_symspell_reification'; -- plain UPDATE: matches nothing (and changes nothing) if the flag row was never created
+$f$ LANGUAGE SQL; -- NOTE: only the flag is changed here; this function does not itself apply any rows waiting in search.symspell_dictionary_updates
+
+CREATE OR REPLACE FUNCTION search.symspell_dictionary_full_reify () RETURNS SETOF search.symspell_dictionary AS $f$ -- folds every row of search.symspell_dictionary_updates into search.symspell_dictionary
+ WITH new_rows AS (
+    DELETE FROM search.symspell_dictionary_updates RETURNING * -- no WHERE clause: every queued row is consumed and handed to the next CTE
+ ), computed_rows AS ( -- this collapses the rows deleted into the format we need for UPSERT
+    SELECT  SUM(keyword_count)    AS keyword_count, -- per-class counts are summed across all queued rows sharing a prefix_key
+            SUM(title_count)      AS title_count,
+            SUM(author_count)     AS author_count,
+            SUM(subject_count)    AS subject_count,
+            SUM(series_count)     AS series_count,
+            SUM(identifier_count) AS identifier_count,
+
+            prefix_key,
+
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT keyword_suggestions[1]), NULL)    AS keyword_suggestions, -- only element [1] of each queued array is collected; duplicates and NULLs are dropped
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT title_suggestions[1]), NULL)      AS title_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT author_suggestions[1]), NULL)     AS author_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT subject_suggestions[1]), NULL)    AS subject_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT series_suggestions[1]), NULL)     AS series_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT identifier_suggestions[1]), NULL) AS identifier_suggestions
+      FROM  new_rows
+      GROUP BY prefix_key -- one output row per prefix_key, the conflict target of the upsert below
+ )
+ INSERT INTO search.symspell_dictionary AS d SELECT * FROM computed_rows -- positional insert (no column list): relies on computed_rows' column order matching the table's
+ ON CONFLICT (prefix_key) DO UPDATE SET
+    keyword_count = GREATEST(0, d.keyword_count + EXCLUDED.keyword_count), -- existing count plus the summed change, clamped so it never goes below zero
+    keyword_suggestions = evergreen.text_array_merge_unique(EXCLUDED.keyword_suggestions,d.keyword_suggestions), -- new suggestions are passed first, existing ones second
+
+    title_count = GREATEST(0, d.title_count + EXCLUDED.title_count),
+    title_suggestions = evergreen.text_array_merge_unique(EXCLUDED.title_suggestions,d.title_suggestions),
+
+    author_count = GREATEST(0, d.author_count + EXCLUDED.author_count),
+    author_suggestions = evergreen.text_array_merge_unique(EXCLUDED.author_suggestions,d.author_suggestions),
+
+    subject_count = GREATEST(0, d.subject_count + EXCLUDED.subject_count),
+    subject_suggestions = evergreen.text_array_merge_unique(EXCLUDED.subject_suggestions,d.subject_suggestions),
+
+    series_count = GREATEST(0, d.series_count + EXCLUDED.series_count),
+    series_suggestions = evergreen.text_array_merge_unique(EXCLUDED.series_suggestions,d.series_suggestions),
+
+    identifier_count = GREATEST(0, d.identifier_count + EXCLUDED.identifier_count),
+    identifier_suggestions = evergreen.text_array_merge_unique(EXCLUDED.identifier_suggestions,d.identifier_suggestions)
+ RETURNING *; -- yields the dictionary rows that were inserted or updated by this call
+$f$ LANGUAGE SQL;
 
 CREATE OR REPLACE FUNCTION search.symspell_parse_words ( phrase TEXT )
 RETURNS SETOF TEXT AS $F$
index 9dc4b0c..5401162 100644 (file)
@@ -1,26 +1,5 @@
 BEGIN;
 
--- INSERT-only table that catches updates to be reconciled
-CREATE UNLOGGED TABLE search.symspell_dictionary_updates (
-    transaction_id          BIGINT,
-    keyword_count           INT     NOT NULL DEFAULT 0,
-    title_count             INT     NOT NULL DEFAULT 0,
-    author_count            INT     NOT NULL DEFAULT 0,
-    subject_count           INT     NOT NULL DEFAULT 0,
-    series_count            INT     NOT NULL DEFAULT 0,
-    identifier_count        INT     NOT NULL DEFAULT 0,
-
-    prefix_key              TEXT    NOT NULL,
-
-    keyword_suggestions     TEXT[],
-    title_suggestions       TEXT[],
-    author_suggestions      TEXT[],
-    subject_suggestions     TEXT[],
-    series_suggestions      TEXT[],
-    identifier_suggestions  TEXT[]
-);
-CREATE INDEX symspell_dictionary_updates_tid_idx ON search.symspell_dictionary_updates (transaction_id);
-
 CREATE OR REPLACE FUNCTION search.disable_symspell_reification () RETURNS VOID AS $f$
     INSERT INTO config.internal_flag (name,enabled)
       VALUES ('ingest.disable_symspell_reification',TRUE)