LP#1931737: Allow the delay of symspell updates

author Mike Rylander <mrylander@gmail.com>

Fri, 13 May 2022 16:35:24 +0000 (12:35 -0400)

committer Mike Rylander <mrylander@gmail.com>

Tue, 25 Oct 2022 15:41:31 +0000 (11:41 -0400)
author Mike Rylander <mrylander@gmail.com>
Fri, 13 May 2022 16:35:24 +0000 (12:35 -0400)
committer Mike Rylander <mrylander@gmail.com>
Tue, 25 Oct 2022 15:41:31 +0000 (11:41 -0400)
diff --git a/Open-ILS/src/sql/Pg/030.schema.metabib.sql b/Open-ILS/src/sql/Pg/030.schema.metabib.sql

index ac35fc7..86416ca 100644 (file)
--- a/Open-ILS/src/sql/Pg/030.schema.metabib.sql
+++ b/Open-ILS/src/sql/Pg/030.schema.metabib.sql
@@ -1168,7 +1168,10 @@ BEGIN
  
      IF NOT b_skip_search THEN
          PERFORM metabib.update_combined_index_vectors(bib_id);
-        PERFORM search.symspell_dictionary_reify(); -- NOTE: we only use search data for symspell today
+        PERFORM * FROM config.internal_flag WHERE name = 'ingest.disable_symspell_reification' AND enabled;
+        IF NOT FOUND THEN
+            PERFORM search.symspell_dictionary_reify();
+        END IF;
      END IF;
  
      RETURN;
diff --git a/Open-ILS/src/sql/Pg/300.schema.staged_search.sql b/Open-ILS/src/sql/Pg/300.schema.staged_search.sql

index 868ce5f..5abbbef 100644 (file)
--- a/Open-ILS/src/sql/Pg/300.schema.staged_search.sql
+++ b/Open-ILS/src/sql/Pg/300.schema.staged_search.sql
@@ -1205,6 +1205,59 @@ CREATE OR REPLACE FUNCTION search.symspell_dictionary_reify () RETURNS SETOF sea
   RETURNING *;
  $f$ LANGUAGE SQL;
  
+CREATE OR REPLACE FUNCTION search.disable_symspell_reification () RETURNS VOID AS $f$
+    INSERT INTO config.internal_flag (name,enabled)
+      VALUES ('ingest.disable_symspell_reification',TRUE)
+    ON CONFLICT (name) DO UPDATE SET enabled = TRUE;
+$f$ LANGUAGE SQL;
+
+CREATE OR REPLACE FUNCTION search.enable_symspell_reification () RETURNS VOID AS $f$
+    UPDATE config.internal_flag SET enabled = FALSE WHERE name = 'ingest.disable_symspell_reification';
+$f$ LANGUAGE SQL;
+
+CREATE OR REPLACE FUNCTION search.symspell_dictionary_full_reify () RETURNS SETOF search.symspell_dictionary AS $f$
+ WITH new_rows AS (
+    DELETE FROM search.symspell_dictionary_updates RETURNING *
+ ), computed_rows AS ( -- this collapses the rows deleted into the format we need for UPSERT
+    SELECT  SUM(keyword_count)    AS keyword_count,
+            SUM(title_count)      AS title_count,
+            SUM(author_count)     AS author_count,
+            SUM(subject_count)    AS subject_count,
+            SUM(series_count)     AS series_count,
+            SUM(identifier_count) AS identifier_count,
+
+            prefix_key,
+
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT keyword_suggestions[1]), NULL)    AS keyword_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT title_suggestions[1]), NULL)      AS title_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT author_suggestions[1]), NULL)     AS author_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT subject_suggestions[1]), NULL)    AS subject_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT series_suggestions[1]), NULL)     AS series_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT identifier_suggestions[1]), NULL) AS identifier_suggestions
+      FROM  new_rows
+      GROUP BY prefix_key
+ )
+ INSERT INTO search.symspell_dictionary AS d SELECT * FROM computed_rows
+ ON CONFLICT (prefix_key) DO UPDATE SET
+    keyword_count = GREATEST(0, d.keyword_count + EXCLUDED.keyword_count),
+    keyword_suggestions = evergreen.text_array_merge_unique(EXCLUDED.keyword_suggestions,d.keyword_suggestions),
+
+    title_count = GREATEST(0, d.title_count + EXCLUDED.title_count),
+    title_suggestions = evergreen.text_array_merge_unique(EXCLUDED.title_suggestions,d.title_suggestions),
+
+    author_count = GREATEST(0, d.author_count + EXCLUDED.author_count),
+    author_suggestions = evergreen.text_array_merge_unique(EXCLUDED.author_suggestions,d.author_suggestions),
+
+    subject_count = GREATEST(0, d.subject_count + EXCLUDED.subject_count),
+    subject_suggestions = evergreen.text_array_merge_unique(EXCLUDED.subject_suggestions,d.subject_suggestions),
+
+    series_count = GREATEST(0, d.series_count + EXCLUDED.series_count),
+    series_suggestions = evergreen.text_array_merge_unique(EXCLUDED.series_suggestions,d.series_suggestions),
+
+    identifier_count = GREATEST(0, d.identifier_count + EXCLUDED.identifier_count),
+    identifier_suggestions = evergreen.text_array_merge_unique(EXCLUDED.identifier_suggestions,d.identifier_suggestions)
+ RETURNING *;
+$f$ LANGUAGE SQL;
  
  CREATE OR REPLACE FUNCTION search.symspell_parse_words ( phrase TEXT )
  RETURNS SETOF TEXT AS $F$
diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.dym_delayed_reify.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.dym_delayed_reify.sql

new file mode 100644 (file)

index 0000000..5401162
--- /dev/null
+++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.dym_delayed_reify.sql
@@ -0,0 +1,199 @@
+BEGIN;
+
+CREATE OR REPLACE FUNCTION search.disable_symspell_reification () RETURNS VOID AS $f$
+    INSERT INTO config.internal_flag (name,enabled)
+      VALUES ('ingest.disable_symspell_reification',TRUE)
+    ON CONFLICT (name) DO UPDATE SET enabled = TRUE;
+$f$ LANGUAGE SQL;
+
+CREATE OR REPLACE FUNCTION search.enable_symspell_reification () RETURNS VOID AS $f$
+    UPDATE config.internal_flag SET enabled = FALSE WHERE name = 'ingest.disable_symspell_reification';
+$f$ LANGUAGE SQL;
+
+CREATE OR REPLACE FUNCTION search.symspell_dictionary_full_reify () RETURNS SETOF search.symspell_dictionary AS $f$
+ WITH new_rows AS (
+    DELETE FROM search.symspell_dictionary_updates RETURNING *
+ ), computed_rows AS ( -- this collapses the rows deleted into the format we need for UPSERT
+    SELECT  SUM(keyword_count)    AS keyword_count,
+            SUM(title_count)      AS title_count,
+            SUM(author_count)     AS author_count,
+            SUM(subject_count)    AS subject_count,
+            SUM(series_count)     AS series_count,
+            SUM(identifier_count) AS identifier_count,
+
+            prefix_key,
+
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT keyword_suggestions[1]), NULL)    AS keyword_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT title_suggestions[1]), NULL)      AS title_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT author_suggestions[1]), NULL)     AS author_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT subject_suggestions[1]), NULL)    AS subject_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT series_suggestions[1]), NULL)     AS series_suggestions,
+            ARRAY_REMOVE(ARRAY_AGG(DISTINCT identifier_suggestions[1]), NULL) AS identifier_suggestions
+      FROM  new_rows
+      GROUP BY prefix_key
+ )
+ INSERT INTO search.symspell_dictionary AS d SELECT * FROM computed_rows
+ ON CONFLICT (prefix_key) DO UPDATE SET
+    keyword_count = GREATEST(0, d.keyword_count + EXCLUDED.keyword_count),
+    keyword_suggestions = evergreen.text_array_merge_unique(EXCLUDED.keyword_suggestions,d.keyword_suggestions),
+
+    title_count = GREATEST(0, d.title_count + EXCLUDED.title_count),
+    title_suggestions = evergreen.text_array_merge_unique(EXCLUDED.title_suggestions,d.title_suggestions),
+
+    author_count = GREATEST(0, d.author_count + EXCLUDED.author_count),
+    author_suggestions = evergreen.text_array_merge_unique(EXCLUDED.author_suggestions,d.author_suggestions),
+
+    subject_count = GREATEST(0, d.subject_count + EXCLUDED.subject_count),
+    subject_suggestions = evergreen.text_array_merge_unique(EXCLUDED.subject_suggestions,d.subject_suggestions),
+
+    series_count = GREATEST(0, d.series_count + EXCLUDED.series_count),
+    series_suggestions = evergreen.text_array_merge_unique(EXCLUDED.series_suggestions,d.series_suggestions),
+
+    identifier_count = GREATEST(0, d.identifier_count + EXCLUDED.identifier_count),
+    identifier_suggestions = evergreen.text_array_merge_unique(EXCLUDED.identifier_suggestions,d.identifier_suggestions)
+ RETURNING *;
+$f$ LANGUAGE SQL;
+
+-- Updated again to check for delayed symspell reification
+CREATE OR REPLACE FUNCTION metabib.reingest_metabib_field_entries(
+    bib_id BIGINT,
+    skip_facet BOOL DEFAULT FALSE,
+    skip_display BOOL DEFAULT FALSE,
+    skip_browse BOOL DEFAULT FALSE,
+    skip_search BOOL DEFAULT FALSE,
+    only_fields INT[] DEFAULT '{}'::INT[]
+) RETURNS VOID AS $func$
+DECLARE
+    fclass          RECORD;
+    ind_data        metabib.field_entry_template%ROWTYPE;
+    mbe_row         metabib.browse_entry%ROWTYPE;
+    mbe_id          BIGINT;
+    b_skip_facet    BOOL;
+    b_skip_display    BOOL;
+    b_skip_browse   BOOL;
+    b_skip_search   BOOL;
+    value_prepped   TEXT;
+    field_list      INT[] := only_fields;
+    field_types     TEXT[] := '{}'::TEXT[];
+BEGIN
+
+    IF field_list = '{}'::INT[] THEN
+        SELECT ARRAY_AGG(id) INTO field_list FROM config.metabib_field;
+    END IF;
+
+    SELECT COALESCE(NULLIF(skip_facet, FALSE), EXISTS (SELECT enabled FROM config.internal_flag WHERE name =  'ingest.skip_facet_indexing' AND enabled)) INTO b_skip_facet;
+    SELECT COALESCE(NULLIF(skip_display, FALSE), EXISTS (SELECT enabled FROM config.internal_flag WHERE name =  'ingest.skip_display_indexing' AND enabled)) INTO b_skip_display;
+    SELECT COALESCE(NULLIF(skip_browse, FALSE), EXISTS (SELECT enabled FROM config.internal_flag WHERE name =  'ingest.skip_browse_indexing' AND enabled)) INTO b_skip_browse;
+    SELECT COALESCE(NULLIF(skip_search, FALSE), EXISTS (SELECT enabled FROM config.internal_flag WHERE name =  'ingest.skip_search_indexing' AND enabled)) INTO b_skip_search;
+
+    IF NOT b_skip_facet THEN field_types := field_types || '{facet}'; END IF;
+    IF NOT b_skip_display THEN field_types := field_types || '{display}'; END IF;
+    IF NOT b_skip_browse THEN field_types := field_types || '{browse}'; END IF;
+    IF NOT b_skip_search THEN field_types := field_types || '{search}'; END IF;
+
+    PERFORM * FROM config.internal_flag WHERE name = 'ingest.assume_inserts_only' AND enabled;
+    IF NOT FOUND THEN
+        IF NOT b_skip_search THEN
+            FOR fclass IN SELECT * FROM config.metabib_class LOOP
+                -- RAISE NOTICE 'Emptying out %', fclass.name;
+                EXECUTE $$DELETE FROM metabib.$$ || fclass.name || $$_field_entry WHERE source = $$ || bib_id;
+            END LOOP;
+        END IF;
+        IF NOT b_skip_facet THEN
+            DELETE FROM metabib.facet_entry WHERE source = bib_id;
+        END IF;
+        IF NOT b_skip_display THEN
+            DELETE FROM metabib.display_entry WHERE source = bib_id;
+        END IF;
+        IF NOT b_skip_browse THEN
+            DELETE FROM metabib.browse_entry_def_map WHERE source = bib_id;
+        END IF;
+    END IF;
+
+    FOR ind_data IN SELECT * FROM biblio.extract_metabib_field_entry( bib_id, ' ', field_types, field_list ) LOOP
+
+    -- don't store what has been normalized away
+        CONTINUE WHEN ind_data.value IS NULL;
+
+        IF ind_data.field < 0 THEN
+            ind_data.field = -1 * ind_data.field;
+        END IF;
+
+        IF ind_data.facet_field AND NOT b_skip_facet THEN
+            INSERT INTO metabib.facet_entry (field, source, value)
+                VALUES (ind_data.field, ind_data.source, ind_data.value);
+        END IF;
+
+        IF ind_data.display_field AND NOT b_skip_display THEN
+            INSERT INTO metabib.display_entry (field, source, value)
+                VALUES (ind_data.field, ind_data.source, ind_data.value);
+        END IF;
+
+
+        IF ind_data.browse_field AND NOT b_skip_browse THEN
+            -- A caveat about this SELECT: this should take care of replacing
+            -- old mbe rows when data changes, but not if normalization (by
+            -- which I mean specifically the output of
+            -- evergreen.oils_tsearch2()) changes.  It may or may not be
+            -- expensive to add a comparison of index_vector to index_vector
+            -- to the WHERE clause below.
+
+            CONTINUE WHEN ind_data.sort_value IS NULL;
+
+            value_prepped := metabib.browse_normalize(ind_data.value, ind_data.field);
+            IF ind_data.browse_nocase THEN
+                SELECT INTO mbe_row * FROM metabib.browse_entry
+                    WHERE evergreen.lowercase(value) = evergreen.lowercase(value_prepped) AND sort_value = ind_data.sort_value
+                    ORDER BY sort_value, value LIMIT 1; -- gotta pick something, I guess
+            ELSE
+                SELECT INTO mbe_row * FROM metabib.browse_entry
+                    WHERE value = value_prepped AND sort_value = ind_data.sort_value;
+            END IF;
+
+            IF FOUND THEN
+                mbe_id := mbe_row.id;
+            ELSE
+                INSERT INTO metabib.browse_entry
+                    ( value, sort_value ) VALUES
+                    ( value_prepped, ind_data.sort_value );
+
+                mbe_id := CURRVAL('metabib.browse_entry_id_seq'::REGCLASS);
+            END IF;
+
+            INSERT INTO metabib.browse_entry_def_map (entry, def, source, authority)
+                VALUES (mbe_id, ind_data.field, ind_data.source, ind_data.authority);
+        END IF;
+
+        IF ind_data.search_field AND NOT b_skip_search THEN
+            -- Avoid inserting duplicate rows
+            EXECUTE 'SELECT 1 FROM metabib.' || ind_data.field_class ||
+                '_field_entry WHERE field = $1 AND source = $2 AND value = $3'
+                INTO mbe_id USING ind_data.field, ind_data.source, ind_data.value;
+                -- RAISE NOTICE 'Search for an already matching row returned %', mbe_id;
+            IF mbe_id IS NULL THEN
+                EXECUTE $$
+                INSERT INTO metabib.$$ || ind_data.field_class || $$_field_entry (field, source, value)
+                    VALUES ($$ ||
+                        quote_literal(ind_data.field) || $$, $$ ||
+                        quote_literal(ind_data.source) || $$, $$ ||
+                        quote_literal(ind_data.value) ||
+                    $$);$$;
+            END IF;
+        END IF;
+
+    END LOOP;
+
+    IF NOT b_skip_search THEN
+        PERFORM metabib.update_combined_index_vectors(bib_id);
+        PERFORM * FROM config.internal_flag WHERE name = 'ingest.disable_symspell_reification' AND enabled;
+        IF NOT FOUND THEN
+            PERFORM search.symspell_dictionary_reify();
+        END IF;
+    END IF;
+
+    RETURN;
+END;
+$func$ LANGUAGE PLPGSQL;
+
+COMMIT;
+
diff --git a/Open-ILS/src/support-scripts/pingest.pl b/Open-ILS/src/support-scripts/pingest.pl

index 8fd2cc9..2453fc9 100755 (executable)
--- a/Open-ILS/src/support-scripts/pingest.pl
+++ b/Open-ILS/src/support-scripts/pingest.pl
@@ -29,6 +29,7 @@ use Getopt::Long;
  my $batch_size = 10000; # records processed per batch
  my $max_child  = 8;     # max number of parallel worker processes
  
+my $delay_dym;    # Delay DYM symspell dictionary reification.
  my $skip_browse;  # Skip the browse reingest.
  my $skip_attrs;   # Skip the record attributes reingest.
  my $skip_search;  # Skip the search reingest.
@@ -57,6 +58,7 @@ GetOptions(
      'port=i'         => \$db_port,
      'batch-size=i'   => \$batch_size,
      'max-child=i'    => \$max_child,
+    'delay-symspell' => \$delay_dym,
      'skip-browse'    => \$skip_browse,
      'skip-attrs'     => \$skip_attrs,
      'skip-search'    => \$skip_search,
@@ -83,6 +85,15 @@ sub help {
      --max-child
          Max number of worker processes
  
+    --delay-symspell
+        Delay reification of symspell dictionary entries
+        This can provide a significant speedup for large ingests.
+        NOTE:  This will cause concurrent, unrelated symspell
+        updates to be delayed as well.  This is usually not a
+        concern in an existing database as the dictionary is
+        generally complete and only the details of use counts
+        will change due to reingests and record inserts/updates.
+
      --skip-browse
      --skip-attrs
      --skip-search
@@ -200,6 +211,14 @@ $lol[$lists++] = $records if ($count); # Last batch is likely to be
  # batches processed.
  $count = 0;
  
+# Disable inline reification of symspell data during the main ingest process
+if ($delay_dym) {
+    my $dbh = DBI->connect("DBI:Pg:database=$db_db;host=$db_host;port=$db_port;application_name=pingest",
+                           $db_user, $db_password);
+    $dbh->do('SELECT search.disable_symspell_reification()');
+    $dbh->disconnect();
+}
+
  # @running keeps track of the running child processes.
  my @running = ();
  
@@ -229,14 +248,39 @@ while ($count < $lists) {
      }
  
      if ($duration_expired && scalar(@running) == 0) {
+        symspell_reification() if ($delay_dym);
          warn "Exiting on max_duration ($max_duration)\n";
          exit(0);
      }
  }
  
+# Incorporate symspell updates if they were delayed
+symspell_reification() if ($delay_dym);
+
  # Rebuild reporter.materialized_simple_record after the ingests.
  rmsr_rebuild() if ($rebuild_rmsr);
  
+# This sub should be called at the end of the run if symspell updates
+# were delayed using the --delay-dym command line flag.
+sub symspell_reification {
+    my $dbh = DBI->connect("DBI:Pg:database=$db_db;host=$db_host;port=$db_port;application_name=pingest",
+                           $db_user, $db_password);
+    $dbh->do('SELECT search.enable_symspell_reification()');
+    $dbh->do('SELECT search.symspell_dictionary_full_reify()');
+
+    # There might be a race condition above if non-pingest record updates
+    # were started before the first of the two statements above, but ended
+    # after the second one, so we'll wait a few seconds and then look again.
+    sleep(5);
+
+    # This count will always be 0 when symspell reification is done inline
+    # rather than delayed, because it is handled by a trigger that runs
+    # inside the transaction that causes inline reification.
+    my ($recheck) = $dbh->selectrow_array('SELECT COUNT(*) FROM search.symspell_dictionary_updates');
+    $dbh->do('SELECT search.symspell_dictionary_full_reify()') if ($recheck);
+    $dbh->disconnect();
+}
+
  # This subroutine forks a process to do the browse-only ingest on the
  # @blist above.  It cannot be parallelized, but can run in parrallel
  # to the other ingests.
author	Mike Rylander <mrylander@gmail.com>
	Fri, 13 May 2022 16:35:24 +0000 (12:35 -0400)
committer	Mike Rylander <mrylander@gmail.com>
	Tue, 25 Oct 2022 15:41:31 +0000 (11:41 -0400)
Open-ILS/src/sql/Pg/030.schema.metabib.sql		patch \| blob \| history
Open-ILS/src/sql/Pg/300.schema.staged_search.sql		patch \| blob \| history
Open-ILS/src/sql/Pg/upgrade/XXXX.schema.dym_delayed_reify.sql	[new file with mode: 0644]	patch \| blob
Open-ILS/src/support-scripts/pingest.pl		patch \| blob \| history