RETURNING *;
$f$ LANGUAGE SQL;
+CREATE OR REPLACE FUNCTION search.disable_symspell_reification () RETURNS VOID AS $f$
+ INSERT INTO config.internal_flag (name,enabled)
+ VALUES ('ingest.disable_symspell_reification',TRUE)
+ ON CONFLICT (name) DO UPDATE SET enabled = TRUE;
+$f$ LANGUAGE SQL;
+
+CREATE OR REPLACE FUNCTION search.enable_symspell_reification () RETURNS VOID AS $f$
+ UPDATE config.internal_flag SET enabled = FALSE WHERE name = 'ingest.disable_symspell_reification';
+$f$ LANGUAGE SQL;
+
+CREATE OR REPLACE FUNCTION search.symspell_dictionary_full_reify () RETURNS SETOF search.symspell_dictionary AS $f$
+ WITH new_rows AS (
+ DELETE FROM search.symspell_dictionary_updates RETURNING *
+ ), computed_rows AS ( -- this collapses the rows deleted into the format we need for UPSERT
+ SELECT SUM(keyword_count) AS keyword_count,
+ SUM(title_count) AS title_count,
+ SUM(author_count) AS author_count,
+ SUM(subject_count) AS subject_count,
+ SUM(series_count) AS series_count,
+ SUM(identifier_count) AS identifier_count,
+
+ prefix_key,
+
+ ARRAY_REMOVE(ARRAY_AGG(DISTINCT keyword_suggestions[1]), NULL) AS keyword_suggestions,
+ ARRAY_REMOVE(ARRAY_AGG(DISTINCT title_suggestions[1]), NULL) AS title_suggestions,
+ ARRAY_REMOVE(ARRAY_AGG(DISTINCT author_suggestions[1]), NULL) AS author_suggestions,
+ ARRAY_REMOVE(ARRAY_AGG(DISTINCT subject_suggestions[1]), NULL) AS subject_suggestions,
+ ARRAY_REMOVE(ARRAY_AGG(DISTINCT series_suggestions[1]), NULL) AS series_suggestions,
+ ARRAY_REMOVE(ARRAY_AGG(DISTINCT identifier_suggestions[1]), NULL) AS identifier_suggestions
+ FROM new_rows
+ GROUP BY prefix_key
+ )
+ INSERT INTO search.symspell_dictionary AS d SELECT * FROM computed_rows
+ ON CONFLICT (prefix_key) DO UPDATE SET
+ keyword_count = GREATEST(0, d.keyword_count + EXCLUDED.keyword_count),
+ keyword_suggestions = evergreen.text_array_merge_unique(EXCLUDED.keyword_suggestions,d.keyword_suggestions),
+
+ title_count = GREATEST(0, d.title_count + EXCLUDED.title_count),
+ title_suggestions = evergreen.text_array_merge_unique(EXCLUDED.title_suggestions,d.title_suggestions),
+
+ author_count = GREATEST(0, d.author_count + EXCLUDED.author_count),
+ author_suggestions = evergreen.text_array_merge_unique(EXCLUDED.author_suggestions,d.author_suggestions),
+
+ subject_count = GREATEST(0, d.subject_count + EXCLUDED.subject_count),
+ subject_suggestions = evergreen.text_array_merge_unique(EXCLUDED.subject_suggestions,d.subject_suggestions),
+
+ series_count = GREATEST(0, d.series_count + EXCLUDED.series_count),
+ series_suggestions = evergreen.text_array_merge_unique(EXCLUDED.series_suggestions,d.series_suggestions),
+
+ identifier_count = GREATEST(0, d.identifier_count + EXCLUDED.identifier_count),
+ identifier_suggestions = evergreen.text_array_merge_unique(EXCLUDED.identifier_suggestions,d.identifier_suggestions)
+ RETURNING *;
+$f$ LANGUAGE SQL;
CREATE OR REPLACE FUNCTION search.symspell_parse_words ( phrase TEXT )
RETURNS SETOF TEXT AS $F$
--- /dev/null
+BEGIN;
+
+CREATE OR REPLACE FUNCTION search.disable_symspell_reification () RETURNS VOID AS $f$
+ INSERT INTO config.internal_flag (name,enabled)
+ VALUES ('ingest.disable_symspell_reification',TRUE)
+ ON CONFLICT (name) DO UPDATE SET enabled = TRUE;
+$f$ LANGUAGE SQL;
+
+CREATE OR REPLACE FUNCTION search.enable_symspell_reification () RETURNS VOID AS $f$
+ UPDATE config.internal_flag SET enabled = FALSE WHERE name = 'ingest.disable_symspell_reification';
+$f$ LANGUAGE SQL;
+
+CREATE OR REPLACE FUNCTION search.symspell_dictionary_full_reify () RETURNS SETOF search.symspell_dictionary AS $f$
+ WITH new_rows AS (
+ DELETE FROM search.symspell_dictionary_updates RETURNING *
+ ), computed_rows AS ( -- this collapses the rows deleted into the format we need for UPSERT
+ SELECT SUM(keyword_count) AS keyword_count,
+ SUM(title_count) AS title_count,
+ SUM(author_count) AS author_count,
+ SUM(subject_count) AS subject_count,
+ SUM(series_count) AS series_count,
+ SUM(identifier_count) AS identifier_count,
+
+ prefix_key,
+
+ ARRAY_REMOVE(ARRAY_AGG(DISTINCT keyword_suggestions[1]), NULL) AS keyword_suggestions,
+ ARRAY_REMOVE(ARRAY_AGG(DISTINCT title_suggestions[1]), NULL) AS title_suggestions,
+ ARRAY_REMOVE(ARRAY_AGG(DISTINCT author_suggestions[1]), NULL) AS author_suggestions,
+ ARRAY_REMOVE(ARRAY_AGG(DISTINCT subject_suggestions[1]), NULL) AS subject_suggestions,
+ ARRAY_REMOVE(ARRAY_AGG(DISTINCT series_suggestions[1]), NULL) AS series_suggestions,
+ ARRAY_REMOVE(ARRAY_AGG(DISTINCT identifier_suggestions[1]), NULL) AS identifier_suggestions
+ FROM new_rows
+ GROUP BY prefix_key
+ )
+ INSERT INTO search.symspell_dictionary AS d SELECT * FROM computed_rows
+ ON CONFLICT (prefix_key) DO UPDATE SET
+ keyword_count = GREATEST(0, d.keyword_count + EXCLUDED.keyword_count),
+ keyword_suggestions = evergreen.text_array_merge_unique(EXCLUDED.keyword_suggestions,d.keyword_suggestions),
+
+ title_count = GREATEST(0, d.title_count + EXCLUDED.title_count),
+ title_suggestions = evergreen.text_array_merge_unique(EXCLUDED.title_suggestions,d.title_suggestions),
+
+ author_count = GREATEST(0, d.author_count + EXCLUDED.author_count),
+ author_suggestions = evergreen.text_array_merge_unique(EXCLUDED.author_suggestions,d.author_suggestions),
+
+ subject_count = GREATEST(0, d.subject_count + EXCLUDED.subject_count),
+ subject_suggestions = evergreen.text_array_merge_unique(EXCLUDED.subject_suggestions,d.subject_suggestions),
+
+ series_count = GREATEST(0, d.series_count + EXCLUDED.series_count),
+ series_suggestions = evergreen.text_array_merge_unique(EXCLUDED.series_suggestions,d.series_suggestions),
+
+ identifier_count = GREATEST(0, d.identifier_count + EXCLUDED.identifier_count),
+ identifier_suggestions = evergreen.text_array_merge_unique(EXCLUDED.identifier_suggestions,d.identifier_suggestions)
+ RETURNING *;
+$f$ LANGUAGE SQL;
+
+-- Updated again to check for delayed symspell reification
+CREATE OR REPLACE FUNCTION metabib.reingest_metabib_field_entries(
+ bib_id BIGINT,
+ skip_facet BOOL DEFAULT FALSE,
+ skip_display BOOL DEFAULT FALSE,
+ skip_browse BOOL DEFAULT FALSE,
+ skip_search BOOL DEFAULT FALSE,
+ only_fields INT[] DEFAULT '{}'::INT[]
+) RETURNS VOID AS $func$
+DECLARE
+ fclass RECORD;
+ ind_data metabib.field_entry_template%ROWTYPE;
+ mbe_row metabib.browse_entry%ROWTYPE;
+ mbe_id BIGINT;
+ b_skip_facet BOOL;
+ b_skip_display BOOL;
+ b_skip_browse BOOL;
+ b_skip_search BOOL;
+ value_prepped TEXT;
+ field_list INT[] := only_fields;
+ field_types TEXT[] := '{}'::TEXT[];
+BEGIN
+
+ IF field_list = '{}'::INT[] THEN
+ SELECT ARRAY_AGG(id) INTO field_list FROM config.metabib_field;
+ END IF;
+
+ SELECT COALESCE(NULLIF(skip_facet, FALSE), EXISTS (SELECT enabled FROM config.internal_flag WHERE name = 'ingest.skip_facet_indexing' AND enabled)) INTO b_skip_facet;
+ SELECT COALESCE(NULLIF(skip_display, FALSE), EXISTS (SELECT enabled FROM config.internal_flag WHERE name = 'ingest.skip_display_indexing' AND enabled)) INTO b_skip_display;
+ SELECT COALESCE(NULLIF(skip_browse, FALSE), EXISTS (SELECT enabled FROM config.internal_flag WHERE name = 'ingest.skip_browse_indexing' AND enabled)) INTO b_skip_browse;
+ SELECT COALESCE(NULLIF(skip_search, FALSE), EXISTS (SELECT enabled FROM config.internal_flag WHERE name = 'ingest.skip_search_indexing' AND enabled)) INTO b_skip_search;
+
+ IF NOT b_skip_facet THEN field_types := field_types || '{facet}'; END IF;
+ IF NOT b_skip_display THEN field_types := field_types || '{display}'; END IF;
+ IF NOT b_skip_browse THEN field_types := field_types || '{browse}'; END IF;
+ IF NOT b_skip_search THEN field_types := field_types || '{search}'; END IF;
+
+ PERFORM * FROM config.internal_flag WHERE name = 'ingest.assume_inserts_only' AND enabled;
+ IF NOT FOUND THEN
+ IF NOT b_skip_search THEN
+ FOR fclass IN SELECT * FROM config.metabib_class LOOP
+ -- RAISE NOTICE 'Emptying out %', fclass.name;
+ EXECUTE $$DELETE FROM metabib.$$ || fclass.name || $$_field_entry WHERE source = $$ || bib_id;
+ END LOOP;
+ END IF;
+ IF NOT b_skip_facet THEN
+ DELETE FROM metabib.facet_entry WHERE source = bib_id;
+ END IF;
+ IF NOT b_skip_display THEN
+ DELETE FROM metabib.display_entry WHERE source = bib_id;
+ END IF;
+ IF NOT b_skip_browse THEN
+ DELETE FROM metabib.browse_entry_def_map WHERE source = bib_id;
+ END IF;
+ END IF;
+
+ FOR ind_data IN SELECT * FROM biblio.extract_metabib_field_entry( bib_id, ' ', field_types, field_list ) LOOP
+
+ -- don't store what has been normalized away
+ CONTINUE WHEN ind_data.value IS NULL;
+
+ IF ind_data.field < 0 THEN
+ ind_data.field = -1 * ind_data.field;
+ END IF;
+
+ IF ind_data.facet_field AND NOT b_skip_facet THEN
+ INSERT INTO metabib.facet_entry (field, source, value)
+ VALUES (ind_data.field, ind_data.source, ind_data.value);
+ END IF;
+
+ IF ind_data.display_field AND NOT b_skip_display THEN
+ INSERT INTO metabib.display_entry (field, source, value)
+ VALUES (ind_data.field, ind_data.source, ind_data.value);
+ END IF;
+
+
+ IF ind_data.browse_field AND NOT b_skip_browse THEN
+ -- A caveat about this SELECT: this should take care of replacing
+ -- old mbe rows when data changes, but not if normalization (by
+ -- which I mean specifically the output of
+ -- evergreen.oils_tsearch2()) changes. It may or may not be
+ -- expensive to add a comparison of index_vector to index_vector
+ -- to the WHERE clause below.
+
+ CONTINUE WHEN ind_data.sort_value IS NULL;
+
+ value_prepped := metabib.browse_normalize(ind_data.value, ind_data.field);
+ IF ind_data.browse_nocase THEN
+ SELECT INTO mbe_row * FROM metabib.browse_entry
+ WHERE evergreen.lowercase(value) = evergreen.lowercase(value_prepped) AND sort_value = ind_data.sort_value
+ ORDER BY sort_value, value LIMIT 1; -- gotta pick something, I guess
+ ELSE
+ SELECT INTO mbe_row * FROM metabib.browse_entry
+ WHERE value = value_prepped AND sort_value = ind_data.sort_value;
+ END IF;
+
+ IF FOUND THEN
+ mbe_id := mbe_row.id;
+ ELSE
+ INSERT INTO metabib.browse_entry
+ ( value, sort_value ) VALUES
+ ( value_prepped, ind_data.sort_value );
+
+ mbe_id := CURRVAL('metabib.browse_entry_id_seq'::REGCLASS);
+ END IF;
+
+ INSERT INTO metabib.browse_entry_def_map (entry, def, source, authority)
+ VALUES (mbe_id, ind_data.field, ind_data.source, ind_data.authority);
+ END IF;
+
+ IF ind_data.search_field AND NOT b_skip_search THEN
+ -- Avoid inserting duplicate rows
+ EXECUTE 'SELECT 1 FROM metabib.' || ind_data.field_class ||
+ '_field_entry WHERE field = $1 AND source = $2 AND value = $3'
+ INTO mbe_id USING ind_data.field, ind_data.source, ind_data.value;
+ -- RAISE NOTICE 'Search for an already matching row returned %', mbe_id;
+ IF mbe_id IS NULL THEN
+ EXECUTE $$
+ INSERT INTO metabib.$$ || ind_data.field_class || $$_field_entry (field, source, value)
+ VALUES ($$ ||
+ quote_literal(ind_data.field) || $$, $$ ||
+ quote_literal(ind_data.source) || $$, $$ ||
+ quote_literal(ind_data.value) ||
+ $$);$$;
+ END IF;
+ END IF;
+
+ END LOOP;
+
+ IF NOT b_skip_search THEN
+ PERFORM metabib.update_combined_index_vectors(bib_id);
+ PERFORM * FROM config.internal_flag WHERE name = 'ingest.disable_symspell_reification' AND enabled;
+ IF NOT FOUND THEN
+ PERFORM search.symspell_dictionary_reify();
+ END IF;
+ END IF;
+
+ RETURN;
+END;
+$func$ LANGUAGE PLPGSQL;
+
+COMMIT;
+
my $batch_size = 10000; # records processed per batch
my $max_child = 8; # max number of parallel worker processes
+my $delay_dym; # Delay DYM symspell dictionary reification.
my $skip_browse; # Skip the browse reingest.
my $skip_attrs; # Skip the record attributes reingest.
my $skip_search; # Skip the search reingest.
'port=i' => \$db_port,
'batch-size=i' => \$batch_size,
'max-child=i' => \$max_child,
+ 'delay-symspell' => \$delay_dym,
'skip-browse' => \$skip_browse,
'skip-attrs' => \$skip_attrs,
'skip-search' => \$skip_search,
--max-child
Max number of worker processes
+ --delay-symspell
+ Delay reification of symspell dictionary entries
+ This can provide a significant speedup for large ingests.
+ NOTE: This will cause concurrent, unrelated symspell
+ updates to be delayed as well. This is usually not a
+ concern in an existing database as the dictionary is
+ generally complete and only the details of use counts
+ will change due to reingests and record inserts/updates.
+
--skip-browse
--skip-attrs
--skip-search
# batches processed.
$count = 0;
+# Disable inline reification of symspell data during the main ingest process
+if ($delay_dym) {
+ my $dbh = DBI->connect("DBI:Pg:database=$db_db;host=$db_host;port=$db_port;application_name=pingest",
+ $db_user, $db_password);
+ $dbh->do('SELECT search.disable_symspell_reification()');
+ $dbh->disconnect();
+}
+
# @running keeps track of the running child processes.
my @running = ();
}
if ($duration_expired && scalar(@running) == 0) {
+ symspell_reification() if ($delay_dym);
warn "Exiting on max_duration ($max_duration)\n";
exit(0);
}
}
+# Incorporate symspell updates if they were delayed
+symspell_reification() if ($delay_dym);
+
# Rebuild reporter.materialized_simple_record after the ingests.
rmsr_rebuild() if ($rebuild_rmsr);
+# This sub should be called at the end of the run if symspell updates
+# were delayed using the --delay-dym command line flag.
+sub symspell_reification {
+ my $dbh = DBI->connect("DBI:Pg:database=$db_db;host=$db_host;port=$db_port;application_name=pingest",
+ $db_user, $db_password);
+ $dbh->do('SELECT search.enable_symspell_reification()');
+ $dbh->do('SELECT search.symspell_dictionary_full_reify()');
+
+ # There might be a race condition above if non-pingest record updates
+ # were started before the first of the two statements above, but ended
+ # after the second one, so we'll wait a few seconds and then look again.
+ sleep(5);
+
+ # This count will always be 0 when symspell reification is done inline
+ # rather than delayed, because it is handled by a trigger that runs
+ # inside the transaction that causes inline reification.
+ my ($recheck) = $dbh->selectrow_array('SELECT COUNT(*) FROM search.symspell_dictionary_updates');
+ $dbh->do('SELECT search.symspell_dictionary_full_reify()') if ($recheck);
+ $dbh->disconnect();
+}
+
# This subroutine forks a process to do the browse-only ingest on the
# @blist above. It cannot be parallelized, but can run in parrallel
# to the other ingests.