From: Mike Rylander Date: Fri, 13 May 2022 16:35:24 +0000 (-0400) Subject: LP#1931737: Allow the delay of symspell updates X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=139222bd850c45172e7c6acd5040b23076c1de67;p=evergreen%2Fequinox.git LP#1931737: Allow the delay of symspell updates This commit adds a new internal flag, auto-created at the time of need, to control whether record ingest will cause immediate updates to the symspell dictionary, or if those updates will simply be recorded for later incorporation. Inline symspell dictionary updates can cause record updates to be logically serialized, impacting the preformance of other tools used for batch reingest. pingest.pl is changed to allow an administrator to make use of this feature via the --delay-symspell command line flag. NOTE: includes a minor fixup from blake@mobiusconsortium.org for a syntax error. Signed-off-by: Mike Rylander Signed-off-by: Jason Stephenson Signed-off-by: blake --- diff --git a/Open-ILS/src/sql/Pg/030.schema.metabib.sql b/Open-ILS/src/sql/Pg/030.schema.metabib.sql index ac35fc73d3..86416cac97 100644 --- a/Open-ILS/src/sql/Pg/030.schema.metabib.sql +++ b/Open-ILS/src/sql/Pg/030.schema.metabib.sql @@ -1168,7 +1168,10 @@ BEGIN IF NOT b_skip_search THEN PERFORM metabib.update_combined_index_vectors(bib_id); - PERFORM search.symspell_dictionary_reify(); -- NOTE: we only use search data for symspell today + PERFORM * FROM config.internal_flag WHERE name = 'ingest.disable_symspell_reification' AND enabled; + IF NOT FOUND THEN + PERFORM search.symspell_dictionary_reify(); + END IF; END IF; RETURN; diff --git a/Open-ILS/src/sql/Pg/300.schema.staged_search.sql b/Open-ILS/src/sql/Pg/300.schema.staged_search.sql index 868ce5fb3a..5abbbef2f0 100644 --- a/Open-ILS/src/sql/Pg/300.schema.staged_search.sql +++ b/Open-ILS/src/sql/Pg/300.schema.staged_search.sql @@ -1205,6 +1205,59 @@ CREATE OR REPLACE FUNCTION search.symspell_dictionary_reify () RETURNS SETOF sea RETURNING *; $f$ LANGUAGE SQL; +CREATE OR REPLACE FUNCTION search.disable_symspell_reification () RETURNS VOID AS $f$ + INSERT INTO config.internal_flag (name,enabled) + VALUES ('ingest.disable_symspell_reification',TRUE) + ON CONFLICT (name) DO UPDATE SET enabled = TRUE; +$f$ LANGUAGE SQL; + +CREATE OR REPLACE FUNCTION search.enable_symspell_reification () RETURNS VOID AS $f$ + UPDATE config.internal_flag SET enabled = FALSE WHERE name = 'ingest.disable_symspell_reification'; +$f$ LANGUAGE SQL; + +CREATE OR REPLACE FUNCTION search.symspell_dictionary_full_reify () RETURNS SETOF search.symspell_dictionary AS $f$ + WITH new_rows AS ( + DELETE FROM search.symspell_dictionary_updates RETURNING * + ), computed_rows AS ( -- this collapses the rows deleted into the format we need for UPSERT + SELECT SUM(keyword_count) AS keyword_count, + SUM(title_count) AS title_count, + SUM(author_count) AS author_count, + SUM(subject_count) AS subject_count, + SUM(series_count) AS series_count, + SUM(identifier_count) AS identifier_count, + + prefix_key, + + ARRAY_REMOVE(ARRAY_AGG(DISTINCT keyword_suggestions[1]), NULL) AS keyword_suggestions, + ARRAY_REMOVE(ARRAY_AGG(DISTINCT title_suggestions[1]), NULL) AS title_suggestions, + ARRAY_REMOVE(ARRAY_AGG(DISTINCT author_suggestions[1]), NULL) AS author_suggestions, + ARRAY_REMOVE(ARRAY_AGG(DISTINCT subject_suggestions[1]), NULL) AS subject_suggestions, + ARRAY_REMOVE(ARRAY_AGG(DISTINCT series_suggestions[1]), NULL) AS series_suggestions, + ARRAY_REMOVE(ARRAY_AGG(DISTINCT identifier_suggestions[1]), NULL) AS identifier_suggestions + FROM new_rows + GROUP BY prefix_key + ) + INSERT INTO search.symspell_dictionary AS d SELECT * FROM computed_rows + ON CONFLICT (prefix_key) DO UPDATE SET + keyword_count = GREATEST(0, d.keyword_count + EXCLUDED.keyword_count), + keyword_suggestions = evergreen.text_array_merge_unique(EXCLUDED.keyword_suggestions,d.keyword_suggestions), + + title_count = GREATEST(0, d.title_count + EXCLUDED.title_count), + title_suggestions = evergreen.text_array_merge_unique(EXCLUDED.title_suggestions,d.title_suggestions), + + author_count = GREATEST(0, d.author_count + EXCLUDED.author_count), + author_suggestions = evergreen.text_array_merge_unique(EXCLUDED.author_suggestions,d.author_suggestions), + + subject_count = GREATEST(0, d.subject_count + EXCLUDED.subject_count), + subject_suggestions = evergreen.text_array_merge_unique(EXCLUDED.subject_suggestions,d.subject_suggestions), + + series_count = GREATEST(0, d.series_count + EXCLUDED.series_count), + series_suggestions = evergreen.text_array_merge_unique(EXCLUDED.series_suggestions,d.series_suggestions), + + identifier_count = GREATEST(0, d.identifier_count + EXCLUDED.identifier_count), + identifier_suggestions = evergreen.text_array_merge_unique(EXCLUDED.identifier_suggestions,d.identifier_suggestions) + RETURNING *; +$f$ LANGUAGE SQL; CREATE OR REPLACE FUNCTION search.symspell_parse_words ( phrase TEXT ) RETURNS SETOF TEXT AS $F$ diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.dym_delayed_reify.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.dym_delayed_reify.sql new file mode 100644 index 0000000000..5401162322 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.dym_delayed_reify.sql @@ -0,0 +1,199 @@ +BEGIN; + +CREATE OR REPLACE FUNCTION search.disable_symspell_reification () RETURNS VOID AS $f$ + INSERT INTO config.internal_flag (name,enabled) + VALUES ('ingest.disable_symspell_reification',TRUE) + ON CONFLICT (name) DO UPDATE SET enabled = TRUE; +$f$ LANGUAGE SQL; + +CREATE OR REPLACE FUNCTION search.enable_symspell_reification () RETURNS VOID AS $f$ + UPDATE config.internal_flag SET enabled = FALSE WHERE name = 'ingest.disable_symspell_reification'; +$f$ LANGUAGE SQL; + +CREATE OR REPLACE FUNCTION search.symspell_dictionary_full_reify () RETURNS SETOF search.symspell_dictionary AS $f$ + WITH new_rows AS ( + DELETE FROM search.symspell_dictionary_updates RETURNING * + ), computed_rows AS ( -- this collapses the rows deleted into the format we need for UPSERT + SELECT SUM(keyword_count) AS keyword_count, + SUM(title_count) AS title_count, + SUM(author_count) AS author_count, + SUM(subject_count) AS subject_count, + SUM(series_count) AS series_count, + SUM(identifier_count) AS identifier_count, + + prefix_key, + + ARRAY_REMOVE(ARRAY_AGG(DISTINCT keyword_suggestions[1]), NULL) AS keyword_suggestions, + ARRAY_REMOVE(ARRAY_AGG(DISTINCT title_suggestions[1]), NULL) AS title_suggestions, + ARRAY_REMOVE(ARRAY_AGG(DISTINCT author_suggestions[1]), NULL) AS author_suggestions, + ARRAY_REMOVE(ARRAY_AGG(DISTINCT subject_suggestions[1]), NULL) AS subject_suggestions, + ARRAY_REMOVE(ARRAY_AGG(DISTINCT series_suggestions[1]), NULL) AS series_suggestions, + ARRAY_REMOVE(ARRAY_AGG(DISTINCT identifier_suggestions[1]), NULL) AS identifier_suggestions + FROM new_rows + GROUP BY prefix_key + ) + INSERT INTO search.symspell_dictionary AS d SELECT * FROM computed_rows + ON CONFLICT (prefix_key) DO UPDATE SET + keyword_count = GREATEST(0, d.keyword_count + EXCLUDED.keyword_count), + keyword_suggestions = evergreen.text_array_merge_unique(EXCLUDED.keyword_suggestions,d.keyword_suggestions), + + title_count = GREATEST(0, d.title_count + EXCLUDED.title_count), + title_suggestions = evergreen.text_array_merge_unique(EXCLUDED.title_suggestions,d.title_suggestions), + + author_count = GREATEST(0, d.author_count + EXCLUDED.author_count), + author_suggestions = evergreen.text_array_merge_unique(EXCLUDED.author_suggestions,d.author_suggestions), + + subject_count = GREATEST(0, d.subject_count + EXCLUDED.subject_count), + subject_suggestions = evergreen.text_array_merge_unique(EXCLUDED.subject_suggestions,d.subject_suggestions), + + series_count = GREATEST(0, d.series_count + EXCLUDED.series_count), + series_suggestions = evergreen.text_array_merge_unique(EXCLUDED.series_suggestions,d.series_suggestions), + + identifier_count = GREATEST(0, d.identifier_count + EXCLUDED.identifier_count), + identifier_suggestions = evergreen.text_array_merge_unique(EXCLUDED.identifier_suggestions,d.identifier_suggestions) + RETURNING *; +$f$ LANGUAGE SQL; + +-- Updated again to check for delayed symspell reification +CREATE OR REPLACE FUNCTION metabib.reingest_metabib_field_entries( + bib_id BIGINT, + skip_facet BOOL DEFAULT FALSE, + skip_display BOOL DEFAULT FALSE, + skip_browse BOOL DEFAULT FALSE, + skip_search BOOL DEFAULT FALSE, + only_fields INT[] DEFAULT '{}'::INT[] +) RETURNS VOID AS $func$ +DECLARE + fclass RECORD; + ind_data metabib.field_entry_template%ROWTYPE; + mbe_row metabib.browse_entry%ROWTYPE; + mbe_id BIGINT; + b_skip_facet BOOL; + b_skip_display BOOL; + b_skip_browse BOOL; + b_skip_search BOOL; + value_prepped TEXT; + field_list INT[] := only_fields; + field_types TEXT[] := '{}'::TEXT[]; +BEGIN + + IF field_list = '{}'::INT[] THEN + SELECT ARRAY_AGG(id) INTO field_list FROM config.metabib_field; + END IF; + + SELECT COALESCE(NULLIF(skip_facet, FALSE), EXISTS (SELECT enabled FROM config.internal_flag WHERE name = 'ingest.skip_facet_indexing' AND enabled)) INTO b_skip_facet; + SELECT COALESCE(NULLIF(skip_display, FALSE), EXISTS (SELECT enabled FROM config.internal_flag WHERE name = 'ingest.skip_display_indexing' AND enabled)) INTO b_skip_display; + SELECT COALESCE(NULLIF(skip_browse, FALSE), EXISTS (SELECT enabled FROM config.internal_flag WHERE name = 'ingest.skip_browse_indexing' AND enabled)) INTO b_skip_browse; + SELECT COALESCE(NULLIF(skip_search, FALSE), EXISTS (SELECT enabled FROM config.internal_flag WHERE name = 'ingest.skip_search_indexing' AND enabled)) INTO b_skip_search; + + IF NOT b_skip_facet THEN field_types := field_types || '{facet}'; END IF; + IF NOT b_skip_display THEN field_types := field_types || '{display}'; END IF; + IF NOT b_skip_browse THEN field_types := field_types || '{browse}'; END IF; + IF NOT b_skip_search THEN field_types := field_types || '{search}'; END IF; + + PERFORM * FROM config.internal_flag WHERE name = 'ingest.assume_inserts_only' AND enabled; + IF NOT FOUND THEN + IF NOT b_skip_search THEN + FOR fclass IN SELECT * FROM config.metabib_class LOOP + -- RAISE NOTICE 'Emptying out %', fclass.name; + EXECUTE $$DELETE FROM metabib.$$ || fclass.name || $$_field_entry WHERE source = $$ || bib_id; + END LOOP; + END IF; + IF NOT b_skip_facet THEN + DELETE FROM metabib.facet_entry WHERE source = bib_id; + END IF; + IF NOT b_skip_display THEN + DELETE FROM metabib.display_entry WHERE source = bib_id; + END IF; + IF NOT b_skip_browse THEN + DELETE FROM metabib.browse_entry_def_map WHERE source = bib_id; + END IF; + END IF; + + FOR ind_data IN SELECT * FROM biblio.extract_metabib_field_entry( bib_id, ' ', field_types, field_list ) LOOP + + -- don't store what has been normalized away + CONTINUE WHEN ind_data.value IS NULL; + + IF ind_data.field < 0 THEN + ind_data.field = -1 * ind_data.field; + END IF; + + IF ind_data.facet_field AND NOT b_skip_facet THEN + INSERT INTO metabib.facet_entry (field, source, value) + VALUES (ind_data.field, ind_data.source, ind_data.value); + END IF; + + IF ind_data.display_field AND NOT b_skip_display THEN + INSERT INTO metabib.display_entry (field, source, value) + VALUES (ind_data.field, ind_data.source, ind_data.value); + END IF; + + + IF ind_data.browse_field AND NOT b_skip_browse THEN + -- A caveat about this SELECT: this should take care of replacing + -- old mbe rows when data changes, but not if normalization (by + -- which I mean specifically the output of + -- evergreen.oils_tsearch2()) changes. It may or may not be + -- expensive to add a comparison of index_vector to index_vector + -- to the WHERE clause below. + + CONTINUE WHEN ind_data.sort_value IS NULL; + + value_prepped := metabib.browse_normalize(ind_data.value, ind_data.field); + IF ind_data.browse_nocase THEN + SELECT INTO mbe_row * FROM metabib.browse_entry + WHERE evergreen.lowercase(value) = evergreen.lowercase(value_prepped) AND sort_value = ind_data.sort_value + ORDER BY sort_value, value LIMIT 1; -- gotta pick something, I guess + ELSE + SELECT INTO mbe_row * FROM metabib.browse_entry + WHERE value = value_prepped AND sort_value = ind_data.sort_value; + END IF; + + IF FOUND THEN + mbe_id := mbe_row.id; + ELSE + INSERT INTO metabib.browse_entry + ( value, sort_value ) VALUES + ( value_prepped, ind_data.sort_value ); + + mbe_id := CURRVAL('metabib.browse_entry_id_seq'::REGCLASS); + END IF; + + INSERT INTO metabib.browse_entry_def_map (entry, def, source, authority) + VALUES (mbe_id, ind_data.field, ind_data.source, ind_data.authority); + END IF; + + IF ind_data.search_field AND NOT b_skip_search THEN + -- Avoid inserting duplicate rows + EXECUTE 'SELECT 1 FROM metabib.' || ind_data.field_class || + '_field_entry WHERE field = $1 AND source = $2 AND value = $3' + INTO mbe_id USING ind_data.field, ind_data.source, ind_data.value; + -- RAISE NOTICE 'Search for an already matching row returned %', mbe_id; + IF mbe_id IS NULL THEN + EXECUTE $$ + INSERT INTO metabib.$$ || ind_data.field_class || $$_field_entry (field, source, value) + VALUES ($$ || + quote_literal(ind_data.field) || $$, $$ || + quote_literal(ind_data.source) || $$, $$ || + quote_literal(ind_data.value) || + $$);$$; + END IF; + END IF; + + END LOOP; + + IF NOT b_skip_search THEN + PERFORM metabib.update_combined_index_vectors(bib_id); + PERFORM * FROM config.internal_flag WHERE name = 'ingest.disable_symspell_reification' AND enabled; + IF NOT FOUND THEN + PERFORM search.symspell_dictionary_reify(); + END IF; + END IF; + + RETURN; +END; +$func$ LANGUAGE PLPGSQL; + +COMMIT; + diff --git a/Open-ILS/src/support-scripts/pingest.pl b/Open-ILS/src/support-scripts/pingest.pl index 8fd2cc914b..2453fc935e 100755 --- a/Open-ILS/src/support-scripts/pingest.pl +++ b/Open-ILS/src/support-scripts/pingest.pl @@ -29,6 +29,7 @@ use Getopt::Long; my $batch_size = 10000; # records processed per batch my $max_child = 8; # max number of parallel worker processes +my $delay_dym; # Delay DYM symspell dictionary reification. my $skip_browse; # Skip the browse reingest. my $skip_attrs; # Skip the record attributes reingest. my $skip_search; # Skip the search reingest. @@ -57,6 +58,7 @@ GetOptions( 'port=i' => \$db_port, 'batch-size=i' => \$batch_size, 'max-child=i' => \$max_child, + 'delay-symspell' => \$delay_dym, 'skip-browse' => \$skip_browse, 'skip-attrs' => \$skip_attrs, 'skip-search' => \$skip_search, @@ -83,6 +85,15 @@ sub help { --max-child Max number of worker processes + --delay-symspell + Delay reification of symspell dictionary entries + This can provide a significant speedup for large ingests. + NOTE: This will cause concurrent, unrelated symspell + updates to be delayed as well. This is usually not a + concern in an existing database as the dictionary is + generally complete and only the details of use counts + will change due to reingests and record inserts/updates. + --skip-browse --skip-attrs --skip-search @@ -200,6 +211,14 @@ $lol[$lists++] = $records if ($count); # Last batch is likely to be # batches processed. $count = 0; +# Disable inline reification of symspell data during the main ingest process +if ($delay_dym) { + my $dbh = DBI->connect("DBI:Pg:database=$db_db;host=$db_host;port=$db_port;application_name=pingest", + $db_user, $db_password); + $dbh->do('SELECT search.disable_symspell_reification()'); + $dbh->disconnect(); +} + # @running keeps track of the running child processes. my @running = (); @@ -229,14 +248,39 @@ while ($count < $lists) { } if ($duration_expired && scalar(@running) == 0) { + symspell_reification() if ($delay_dym); warn "Exiting on max_duration ($max_duration)\n"; exit(0); } } +# Incorporate symspell updates if they were delayed +symspell_reification() if ($delay_dym); + # Rebuild reporter.materialized_simple_record after the ingests. rmsr_rebuild() if ($rebuild_rmsr); +# This sub should be called at the end of the run if symspell updates +# were delayed using the --delay-dym command line flag. +sub symspell_reification { + my $dbh = DBI->connect("DBI:Pg:database=$db_db;host=$db_host;port=$db_port;application_name=pingest", + $db_user, $db_password); + $dbh->do('SELECT search.enable_symspell_reification()'); + $dbh->do('SELECT search.symspell_dictionary_full_reify()'); + + # There might be a race condition above if non-pingest record updates + # were started before the first of the two statements above, but ended + # after the second one, so we'll wait a few seconds and then look again. + sleep(5); + + # This count will always be 0 when symspell reification is done inline + # rather than delayed, because it is handled by a trigger that runs + # inside the transaction that causes inline reification. + my ($recheck) = $dbh->selectrow_array('SELECT COUNT(*) FROM search.symspell_dictionary_updates'); + $dbh->do('SELECT search.symspell_dictionary_full_reify()') if ($recheck); + $dbh->disconnect(); +} + # This subroutine forks a process to do the browse-only ingest on the # @blist above. It cannot be parallelized, but can run in parrallel # to the other ingests.