From: Galen Charlton Date: Fri, 5 Nov 2021 16:07:52 +0000 (-0400) Subject: LP#1947173: stamp schema update X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=d6d9388a47a4a91e85829221ab4c23e180abe5e9;p=working%2FEvergreen.git LP#1947173: stamp schema update Signed-off-by: Galen Charlton --- diff --git a/Open-ILS/src/sql/Pg/002.schema.config.sql b/Open-ILS/src/sql/Pg/002.schema.config.sql index 080f8e85cd..78798f0826 100644 --- a/Open-ILS/src/sql/Pg/002.schema.config.sql +++ b/Open-ILS/src/sql/Pg/002.schema.config.sql @@ -92,7 +92,7 @@ CREATE TRIGGER no_overlapping_deps BEFORE INSERT OR UPDATE ON config.db_patch_dependencies FOR EACH ROW EXECUTE PROCEDURE evergreen.array_overlap_check ('deprecates'); -INSERT INTO config.upgrade_log (version, applied_to) VALUES ('1305', :eg_version); -- phasefx/csharp/gmcharlt +INSERT INTO config.upgrade_log (version, applied_to) VALUES ('1306', :eg_version); -- miker/gmcharlt CREATE TABLE config.bib_source ( id SERIAL PRIMARY KEY, diff --git a/Open-ILS/src/sql/Pg/upgrade/1306.schema.symspell-speed-ingest.sql b/Open-ILS/src/sql/Pg/upgrade/1306.schema.symspell-speed-ingest.sql new file mode 100644 index 0000000000..7015a8cef2 --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/1306.schema.symspell-speed-ingest.sql @@ -0,0 +1,423 @@ +BEGIN; + +SELECT evergreen.upgrade_deps_block_check('1306', :eg_version); + +-- We don't pass this function arrays with nulls, so we save 5% not testing for that +CREATE OR REPLACE FUNCTION evergreen.text_array_merge_unique ( + TEXT[], TEXT[] +) RETURNS TEXT[] AS $F$ + SELECT NULLIF(ARRAY( + SELECT * FROM UNNEST($1) x + UNION + SELECT * FROM UNNEST($2) y + ),'{}'); +$F$ LANGUAGE SQL; + +CREATE OR REPLACE FUNCTION search.symspell_build_raw_entry ( + raw_input TEXT, + source_class TEXT, + no_limit BOOL DEFAULT FALSE, + prefix_length INT DEFAULT 6, + maxED INT DEFAULT 3 +) RETURNS SETOF search.symspell_dictionary AS $F$ +DECLARE + key TEXT; + del_key TEXT; + key_list TEXT[]; + entry search.symspell_dictionary%ROWTYPE; +BEGIN + key := raw_input; + + IF NOT no_limit AND CHARACTER_LENGTH(raw_input) > prefix_length THEN + key := SUBSTRING(key FROM 1 FOR prefix_length); + key_list := ARRAY[raw_input, key]; + ELSE + key_list := ARRAY[key]; + END IF; + + FOREACH del_key IN ARRAY key_list LOOP + -- skip empty keys + CONTINUE WHEN del_key IS NULL OR CHARACTER_LENGTH(del_key) = 0; + + entry.prefix_key := del_key; + + entry.keyword_count := 0; + entry.title_count := 0; + entry.author_count := 0; + entry.subject_count := 0; + entry.series_count := 0; + entry.identifier_count := 0; + + entry.keyword_suggestions := '{}'; + entry.title_suggestions := '{}'; + entry.author_suggestions := '{}'; + entry.subject_suggestions := '{}'; + entry.series_suggestions := '{}'; + entry.identifier_suggestions := '{}'; + + IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'title' THEN entry.title_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'author' THEN entry.author_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'subject' THEN entry.subject_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'series' THEN entry.series_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'identifier' THEN entry.identifier_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF; + + IF del_key = raw_input THEN + IF source_class = 'keyword' THEN entry.keyword_count := 1; END IF; + IF source_class = 'title' THEN entry.title_count := 1; END IF; + IF source_class = 'author' THEN entry.author_count := 1; END IF; + IF source_class = 'subject' THEN entry.subject_count := 1; END IF; + IF source_class = 'series' THEN entry.series_count := 1; END IF; + IF source_class = 'identifier' THEN entry.identifier_count := 1; END IF; + END IF; + + RETURN NEXT entry; + END LOOP; + + FOR del_key IN SELECT x FROM UNNEST(search.symspell_generate_edits(key, 1, maxED)) x LOOP + + -- skip empty keys + CONTINUE WHEN del_key IS NULL OR CHARACTER_LENGTH(del_key) = 0; + -- skip suggestions that are already too long for the prefix key + CONTINUE WHEN CHARACTER_LENGTH(del_key) <= (prefix_length - maxED) AND CHARACTER_LENGTH(raw_input) > prefix_length; + + entry.keyword_suggestions := '{}'; + entry.title_suggestions := '{}'; + entry.author_suggestions := '{}'; + entry.subject_suggestions := '{}'; + entry.series_suggestions := '{}'; + entry.identifier_suggestions := '{}'; + + IF source_class = 'keyword' THEN entry.keyword_count := 0; END IF; + IF source_class = 'title' THEN entry.title_count := 0; END IF; + IF source_class = 'author' THEN entry.author_count := 0; END IF; + IF source_class = 'subject' THEN entry.subject_count := 0; END IF; + IF source_class = 'series' THEN entry.series_count := 0; END IF; + IF source_class = 'identifier' THEN entry.identifier_count := 0; END IF; + + entry.prefix_key := del_key; + + IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'title' THEN entry.title_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'author' THEN entry.author_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'subject' THEN entry.subject_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'series' THEN entry.series_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'identifier' THEN entry.identifier_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF; + + RETURN NEXT entry; + END LOOP; + +END; +$F$ LANGUAGE PLPGSQL STRICT IMMUTABLE; + +CREATE OR REPLACE FUNCTION search.symspell_build_entries ( + full_input TEXT, + source_class TEXT, + old_input TEXT DEFAULT NULL, + include_phrases BOOL DEFAULT FALSE +) RETURNS SETOF search.symspell_dictionary AS $F$ +DECLARE + prefix_length INT; + maxED INT; + word_list TEXT[]; + input TEXT; + word TEXT; + entry search.symspell_dictionary; +BEGIN + IF full_input IS NOT NULL THEN + SELECT value::INT INTO prefix_length FROM config.internal_flag WHERE name = 'symspell.prefix_length' AND enabled; + prefix_length := COALESCE(prefix_length, 6); + + SELECT value::INT INTO maxED FROM config.internal_flag WHERE name = 'symspell.max_edit_distance' AND enabled; + maxED := COALESCE(maxED, 3); + + input := evergreen.lowercase(full_input); + word_list := ARRAY_AGG(x) FROM search.symspell_parse_words_distinct(input) x; + IF word_list IS NULL THEN + RETURN; + END IF; + + IF CARDINALITY(word_list) > 1 AND include_phrases THEN + RETURN QUERY SELECT * FROM search.symspell_build_raw_entry(input, source_class, TRUE, prefix_length, maxED); + END IF; + + FOREACH word IN ARRAY word_list LOOP + -- Skip words that have runs of 5 or more digits (I'm looking at you, ISxNs) + CONTINUE WHEN CHARACTER_LENGTH(word) > 4 AND word ~ '\d{5,}'; + RETURN QUERY SELECT * FROM search.symspell_build_raw_entry(word, source_class, FALSE, prefix_length, maxED); + END LOOP; + END IF; + + IF old_input IS NOT NULL THEN + input := evergreen.lowercase(old_input); + + FOR word IN SELECT x FROM search.symspell_parse_words_distinct(input) x LOOP + -- similarly skip words that have 5 or more digits here to + -- avoid adding erroneous prefix deletion entries to the dictionary + CONTINUE WHEN CHARACTER_LENGTH(word) > 4 AND word ~ '\d{5,}'; + entry.prefix_key := word; + + entry.keyword_count := 0; + entry.title_count := 0; + entry.author_count := 0; + entry.subject_count := 0; + entry.series_count := 0; + entry.identifier_count := 0; + + entry.keyword_suggestions := '{}'; + entry.title_suggestions := '{}'; + entry.author_suggestions := '{}'; + entry.subject_suggestions := '{}'; + entry.series_suggestions := '{}'; + entry.identifier_suggestions := '{}'; + + IF source_class = 'keyword' THEN entry.keyword_count := -1; END IF; + IF source_class = 'title' THEN entry.title_count := -1; END IF; + IF source_class = 'author' THEN entry.author_count := -1; END IF; + IF source_class = 'subject' THEN entry.subject_count := -1; END IF; + IF source_class = 'series' THEN entry.series_count := -1; END IF; + IF source_class = 'identifier' THEN entry.identifier_count := -1; END IF; + + RETURN NEXT entry; + END LOOP; + END IF; +END; +$F$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION search.symspell_build_and_merge_entries ( + full_input TEXT, + source_class TEXT, + old_input TEXT DEFAULT NULL, + include_phrases BOOL DEFAULT FALSE +) RETURNS SETOF search.symspell_dictionary AS $F$ +DECLARE + new_entry RECORD; + conflict_entry RECORD; +BEGIN + + IF full_input = old_input THEN -- neither NULL, and are the same + RETURN; + END IF; + + FOR new_entry IN EXECUTE $q$ + SELECT count, + prefix_key, + s AS suggestions + FROM (SELECT prefix_key, + ARRAY_AGG(DISTINCT $q$ || source_class || $q$_suggestions[1]) s, + SUM($q$ || source_class || $q$_count) count + FROM search.symspell_build_entries($1, $2, $3, $4) + GROUP BY 1) x + $q$ USING full_input, source_class, old_input, include_phrases + LOOP + EXECUTE $q$ + SELECT prefix_key, + $q$ || source_class || $q$_suggestions suggestions, + $q$ || source_class || $q$_count count + FROM search.symspell_dictionary + WHERE prefix_key = $1 $q$ + INTO conflict_entry + USING new_entry.prefix_key; + + IF new_entry.count <> 0 THEN -- Real word, and count changed + IF conflict_entry.prefix_key IS NOT NULL THEN -- we'll be updating + IF conflict_entry.count > 0 THEN -- it's a real word + RETURN QUERY EXECUTE $q$ + UPDATE search.symspell_dictionary + SET $q$ || source_class || $q$_count = $2 + WHERE prefix_key = $1 + RETURNING * $q$ + USING new_entry.prefix_key, GREATEST(0, new_entry.count + conflict_entry.count); + ELSE -- it was a prefix key or delete-emptied word before + IF conflict_entry.suggestions @> new_entry.suggestions THEN -- already have all suggestions here... + RETURN QUERY EXECUTE $q$ + UPDATE search.symspell_dictionary + SET $q$ || source_class || $q$_count = $2 + WHERE prefix_key = $1 + RETURNING * $q$ + USING new_entry.prefix_key, GREATEST(0, new_entry.count); + ELSE -- new suggestion! + RETURN QUERY EXECUTE $q$ + UPDATE search.symspell_dictionary + SET $q$ || source_class || $q$_count = $2, + $q$ || source_class || $q$_suggestions = $3 + WHERE prefix_key = $1 + RETURNING * $q$ + USING new_entry.prefix_key, GREATEST(0, new_entry.count), evergreen.text_array_merge_unique(conflict_entry.suggestions,new_entry.suggestions); + END IF; + END IF; + ELSE + -- We keep the on-conflict clause just in case... + RETURN QUERY EXECUTE $q$ + INSERT INTO search.symspell_dictionary AS d ( + $q$ || source_class || $q$_count, + prefix_key, + $q$ || source_class || $q$_suggestions + ) VALUES ( $1, $2, $3 ) ON CONFLICT (prefix_key) DO + UPDATE SET $q$ || source_class || $q$_count = d.$q$ || source_class || $q$_count + EXCLUDED.$q$ || source_class || $q$_count, + $q$ || source_class || $q$_suggestions = evergreen.text_array_merge_unique(d.$q$ || source_class || $q$_suggestions, EXCLUDED.$q$ || source_class || $q$_suggestions) + RETURNING * $q$ + USING new_entry.count, new_entry.prefix_key, new_entry.suggestions; + END IF; + ELSE -- key only, or no change + IF conflict_entry.prefix_key IS NOT NULL THEN -- we'll be updating + IF NOT conflict_entry.suggestions @> new_entry.suggestions THEN -- There are new suggestions + RETURN QUERY EXECUTE $q$ + UPDATE search.symspell_dictionary + SET $q$ || source_class || $q$_suggestions = $2 + WHERE prefix_key = $1 + RETURNING * $q$ + USING new_entry.prefix_key, evergreen.text_array_merge_unique(conflict_entry.suggestions,new_entry.suggestions); + END IF; + ELSE + RETURN QUERY EXECUTE $q$ + INSERT INTO search.symspell_dictionary AS d ( + $q$ || source_class || $q$_count, + prefix_key, + $q$ || source_class || $q$_suggestions + ) VALUES ( $1, $2, $3 ) ON CONFLICT (prefix_key) DO -- key exists, suggestions may be added due to this entry + UPDATE SET $q$ || source_class || $q$_suggestions = evergreen.text_array_merge_unique(d.$q$ || source_class || $q$_suggestions, EXCLUDED.$q$ || source_class || $q$_suggestions) + RETURNING * $q$ + USING new_entry.count, new_entry.prefix_key, new_entry.suggestions; + END IF; + END IF; + END LOOP; +END; +$F$ LANGUAGE PLPGSQL; + +COMMIT; + +\qecho '' +\qecho 'The following should be run at the end of the upgrade before any' +\qecho 'reingest occurs. Because new triggers are installed already,' +\qecho 'updates to indexed strings will cause zero-count dictionary entries' +\qecho 'to be recorded which will require updating every row again (or' +\qecho 'starting from scratch) so best to do this before other batch' +\qecho 'changes. A later reingest that does not significantly change' +\qecho 'indexed strings will /not/ cause table bloat here, and will be' +\qecho 'as fast as normal. A copy of the SQL in a ready-to-use, non-escaped' +\qecho 'form is available inside a comment at the end of this upgrade sub-' +\qecho 'script so you do not need to copy this comment from the psql ouptut.' +\qecho '' +\qecho '\\a' +\qecho '\\t' +\qecho '' +\qecho '\\o title' +\qecho 'select value from metabib.title_field_entry where source in (select id from biblio.record_entry where not deleted);' +\qecho '\\o author' +\qecho 'select value from metabib.author_field_entry where source in (select id from biblio.record_entry where not deleted);' +\qecho '\\o subject' +\qecho 'select value from metabib.subject_field_entry where source in (select id from biblio.record_entry where not deleted);' +\qecho '\\o series' +\qecho 'select value from metabib.series_field_entry where source in (select id from biblio.record_entry where not deleted);' +\qecho '\\o identifier' +\qecho 'select value from metabib.identifier_field_entry where source in (select id from biblio.record_entry where not deleted);' +\qecho '\\o keyword' +\qecho 'select value from metabib.keyword_field_entry where source in (select id from biblio.record_entry where not deleted);' +\qecho '' +\qecho '\\o' +\qecho '\\a' +\qecho '\\t' +\qecho '' +\qecho '// Then, at the command line:' +\qecho '' +\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl title > title.sql' +\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl author > author.sql' +\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl subject > subject.sql' +\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl series > series.sql' +\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl identifier > identifier.sql' +\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl keyword > keyword.sql' +\qecho '' +\qecho '// And, back in psql' +\qecho '' +\qecho 'ALTER TABLE search.symspell_dictionary SET UNLOGGED;' +\qecho 'TRUNCATE search.symspell_dictionary;' +\qecho '' +\qecho '\\i identifier.sql' +\qecho '\\i author.sql' +\qecho '\\i title.sql' +\qecho '\\i subject.sql' +\qecho '\\i series.sql' +\qecho '\\i keyword.sql' +\qecho '' +\qecho 'CLUSTER search.symspell_dictionary USING symspell_dictionary_pkey;' +\qecho 'REINDEX TABLE search.symspell_dictionary;' +\qecho 'ALTER TABLE search.symspell_dictionary SET LOGGED;' +\qecho 'VACUUM ANALYZE search.symspell_dictionary;' +\qecho '' +\qecho 'DROP TABLE search.symspell_dictionary_partial_title;' +\qecho 'DROP TABLE search.symspell_dictionary_partial_author;' +\qecho 'DROP TABLE search.symspell_dictionary_partial_subject;' +\qecho 'DROP TABLE search.symspell_dictionary_partial_series;' +\qecho 'DROP TABLE search.symspell_dictionary_partial_identifier;' +\qecho 'DROP TABLE search.symspell_dictionary_partial_keyword;' + +/* To run by hand: + +\a +\t + +\o title +select value from metabib.title_field_entry where source in (select id from biblio.record_entry where not deleted); + +\o author +select value from metabib.author_field_entry where source in (select id from biblio.record_entry where not deleted); + +\o subject +select value from metabib.subject_field_entry where source in (select id from biblio.record_entry where not deleted); + +\o series +select value from metabib.series_field_entry where source in (select id from biblio.record_entry where not deleted); + +\o identifier +select value from metabib.identifier_field_entry where source in (select id from biblio.record_entry where not deleted); + +\o keyword +select value from metabib.keyword_field_entry where source in (select id from biblio.record_entry where not deleted); + +\o +\a +\t + +// Then, at the command line: + +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl title > title.sql +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl author > author.sql +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl subject > subject.sql +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl series > series.sql +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl identifier > identifier.sql +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl keyword > keyword.sql + +// To the extent your hardware allows, the above commands can be run in +// in parallel, in different shells. Each will use a full CPU, and RAM +// may be a limiting resource, so keep an eye on that with `top`. + + +// And, back in psql + +ALTER TABLE search.symspell_dictionary SET UNLOGGED; +TRUNCATE search.symspell_dictionary; + +\i identifier.sql +\i author.sql +\i title.sql +\i subject.sql +\i series.sql +\i keyword.sql + +CLUSTER search.symspell_dictionary USING symspell_dictionary_pkey; +REINDEX TABLE search.symspell_dictionary; +ALTER TABLE search.symspell_dictionary SET LOGGED; +VACUUM ANALYZE search.symspell_dictionary; + +DROP TABLE search.symspell_dictionary_partial_title; +DROP TABLE search.symspell_dictionary_partial_author; +DROP TABLE search.symspell_dictionary_partial_subject; +DROP TABLE search.symspell_dictionary_partial_series; +DROP TABLE search.symspell_dictionary_partial_identifier; +DROP TABLE search.symspell_dictionary_partial_keyword; + +*/ + diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.symspell-speed-ingest.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.symspell-speed-ingest.sql deleted file mode 100644 index e0e660fea7..0000000000 --- a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.symspell-speed-ingest.sql +++ /dev/null @@ -1,423 +0,0 @@ -BEGIN; - -SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version); - --- We don't pass this function arrays with nulls, so we save 5% not testing for that -CREATE OR REPLACE FUNCTION evergreen.text_array_merge_unique ( - TEXT[], TEXT[] -) RETURNS TEXT[] AS $F$ - SELECT NULLIF(ARRAY( - SELECT * FROM UNNEST($1) x - UNION - SELECT * FROM UNNEST($2) y - ),'{}'); -$F$ LANGUAGE SQL; - -CREATE OR REPLACE FUNCTION search.symspell_build_raw_entry ( - raw_input TEXT, - source_class TEXT, - no_limit BOOL DEFAULT FALSE, - prefix_length INT DEFAULT 6, - maxED INT DEFAULT 3 -) RETURNS SETOF search.symspell_dictionary AS $F$ -DECLARE - key TEXT; - del_key TEXT; - key_list TEXT[]; - entry search.symspell_dictionary%ROWTYPE; -BEGIN - key := raw_input; - - IF NOT no_limit AND CHARACTER_LENGTH(raw_input) > prefix_length THEN - key := SUBSTRING(key FROM 1 FOR prefix_length); - key_list := ARRAY[raw_input, key]; - ELSE - key_list := ARRAY[key]; - END IF; - - FOREACH del_key IN ARRAY key_list LOOP - -- skip empty keys - CONTINUE WHEN del_key IS NULL OR CHARACTER_LENGTH(del_key) = 0; - - entry.prefix_key := del_key; - - entry.keyword_count := 0; - entry.title_count := 0; - entry.author_count := 0; - entry.subject_count := 0; - entry.series_count := 0; - entry.identifier_count := 0; - - entry.keyword_suggestions := '{}'; - entry.title_suggestions := '{}'; - entry.author_suggestions := '{}'; - entry.subject_suggestions := '{}'; - entry.series_suggestions := '{}'; - entry.identifier_suggestions := '{}'; - - IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF; - IF source_class = 'title' THEN entry.title_suggestions := ARRAY[raw_input]; END IF; - IF source_class = 'author' THEN entry.author_suggestions := ARRAY[raw_input]; END IF; - IF source_class = 'subject' THEN entry.subject_suggestions := ARRAY[raw_input]; END IF; - IF source_class = 'series' THEN entry.series_suggestions := ARRAY[raw_input]; END IF; - IF source_class = 'identifier' THEN entry.identifier_suggestions := ARRAY[raw_input]; END IF; - IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF; - - IF del_key = raw_input THEN - IF source_class = 'keyword' THEN entry.keyword_count := 1; END IF; - IF source_class = 'title' THEN entry.title_count := 1; END IF; - IF source_class = 'author' THEN entry.author_count := 1; END IF; - IF source_class = 'subject' THEN entry.subject_count := 1; END IF; - IF source_class = 'series' THEN entry.series_count := 1; END IF; - IF source_class = 'identifier' THEN entry.identifier_count := 1; END IF; - END IF; - - RETURN NEXT entry; - END LOOP; - - FOR del_key IN SELECT x FROM UNNEST(search.symspell_generate_edits(key, 1, maxED)) x LOOP - - -- skip empty keys - CONTINUE WHEN del_key IS NULL OR CHARACTER_LENGTH(del_key) = 0; - -- skip suggestions that are already too long for the prefix key - CONTINUE WHEN CHARACTER_LENGTH(del_key) <= (prefix_length - maxED) AND CHARACTER_LENGTH(raw_input) > prefix_length; - - entry.keyword_suggestions := '{}'; - entry.title_suggestions := '{}'; - entry.author_suggestions := '{}'; - entry.subject_suggestions := '{}'; - entry.series_suggestions := '{}'; - entry.identifier_suggestions := '{}'; - - IF source_class = 'keyword' THEN entry.keyword_count := 0; END IF; - IF source_class = 'title' THEN entry.title_count := 0; END IF; - IF source_class = 'author' THEN entry.author_count := 0; END IF; - IF source_class = 'subject' THEN entry.subject_count := 0; END IF; - IF source_class = 'series' THEN entry.series_count := 0; END IF; - IF source_class = 'identifier' THEN entry.identifier_count := 0; END IF; - - entry.prefix_key := del_key; - - IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF; - IF source_class = 'title' THEN entry.title_suggestions := ARRAY[raw_input]; END IF; - IF source_class = 'author' THEN entry.author_suggestions := ARRAY[raw_input]; END IF; - IF source_class = 'subject' THEN entry.subject_suggestions := ARRAY[raw_input]; END IF; - IF source_class = 'series' THEN entry.series_suggestions := ARRAY[raw_input]; END IF; - IF source_class = 'identifier' THEN entry.identifier_suggestions := ARRAY[raw_input]; END IF; - IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF; - - RETURN NEXT entry; - END LOOP; - -END; -$F$ LANGUAGE PLPGSQL STRICT IMMUTABLE; - -CREATE OR REPLACE FUNCTION search.symspell_build_entries ( - full_input TEXT, - source_class TEXT, - old_input TEXT DEFAULT NULL, - include_phrases BOOL DEFAULT FALSE -) RETURNS SETOF search.symspell_dictionary AS $F$ -DECLARE - prefix_length INT; - maxED INT; - word_list TEXT[]; - input TEXT; - word TEXT; - entry search.symspell_dictionary; -BEGIN - IF full_input IS NOT NULL THEN - SELECT value::INT INTO prefix_length FROM config.internal_flag WHERE name = 'symspell.prefix_length' AND enabled; - prefix_length := COALESCE(prefix_length, 6); - - SELECT value::INT INTO maxED FROM config.internal_flag WHERE name = 'symspell.max_edit_distance' AND enabled; - maxED := COALESCE(maxED, 3); - - input := evergreen.lowercase(full_input); - word_list := ARRAY_AGG(x) FROM search.symspell_parse_words_distinct(input) x; - IF word_list IS NULL THEN - RETURN; - END IF; - - IF CARDINALITY(word_list) > 1 AND include_phrases THEN - RETURN QUERY SELECT * FROM search.symspell_build_raw_entry(input, source_class, TRUE, prefix_length, maxED); - END IF; - - FOREACH word IN ARRAY word_list LOOP - -- Skip words that have runs of 5 or more digits (I'm looking at you, ISxNs) - CONTINUE WHEN CHARACTER_LENGTH(word) > 4 AND word ~ '\d{5,}'; - RETURN QUERY SELECT * FROM search.symspell_build_raw_entry(word, source_class, FALSE, prefix_length, maxED); - END LOOP; - END IF; - - IF old_input IS NOT NULL THEN - input := evergreen.lowercase(old_input); - - FOR word IN SELECT x FROM search.symspell_parse_words_distinct(input) x LOOP - -- similarly skip words that have 5 or more digits here to - -- avoid adding erroneous prefix deletion entries to the dictionary - CONTINUE WHEN CHARACTER_LENGTH(word) > 4 AND word ~ '\d{5,}'; - entry.prefix_key := word; - - entry.keyword_count := 0; - entry.title_count := 0; - entry.author_count := 0; - entry.subject_count := 0; - entry.series_count := 0; - entry.identifier_count := 0; - - entry.keyword_suggestions := '{}'; - entry.title_suggestions := '{}'; - entry.author_suggestions := '{}'; - entry.subject_suggestions := '{}'; - entry.series_suggestions := '{}'; - entry.identifier_suggestions := '{}'; - - IF source_class = 'keyword' THEN entry.keyword_count := -1; END IF; - IF source_class = 'title' THEN entry.title_count := -1; END IF; - IF source_class = 'author' THEN entry.author_count := -1; END IF; - IF source_class = 'subject' THEN entry.subject_count := -1; END IF; - IF source_class = 'series' THEN entry.series_count := -1; END IF; - IF source_class = 'identifier' THEN entry.identifier_count := -1; END IF; - - RETURN NEXT entry; - END LOOP; - END IF; -END; -$F$ LANGUAGE PLPGSQL; - -CREATE OR REPLACE FUNCTION search.symspell_build_and_merge_entries ( - full_input TEXT, - source_class TEXT, - old_input TEXT DEFAULT NULL, - include_phrases BOOL DEFAULT FALSE -) RETURNS SETOF search.symspell_dictionary AS $F$ -DECLARE - new_entry RECORD; - conflict_entry RECORD; -BEGIN - - IF full_input = old_input THEN -- neither NULL, and are the same - RETURN; - END IF; - - FOR new_entry IN EXECUTE $q$ - SELECT count, - prefix_key, - s AS suggestions - FROM (SELECT prefix_key, - ARRAY_AGG(DISTINCT $q$ || source_class || $q$_suggestions[1]) s, - SUM($q$ || source_class || $q$_count) count - FROM search.symspell_build_entries($1, $2, $3, $4) - GROUP BY 1) x - $q$ USING full_input, source_class, old_input, include_phrases - LOOP - EXECUTE $q$ - SELECT prefix_key, - $q$ || source_class || $q$_suggestions suggestions, - $q$ || source_class || $q$_count count - FROM search.symspell_dictionary - WHERE prefix_key = $1 $q$ - INTO conflict_entry - USING new_entry.prefix_key; - - IF new_entry.count <> 0 THEN -- Real word, and count changed - IF conflict_entry.prefix_key IS NOT NULL THEN -- we'll be updating - IF conflict_entry.count > 0 THEN -- it's a real word - RETURN QUERY EXECUTE $q$ - UPDATE search.symspell_dictionary - SET $q$ || source_class || $q$_count = $2 - WHERE prefix_key = $1 - RETURNING * $q$ - USING new_entry.prefix_key, GREATEST(0, new_entry.count + conflict_entry.count); - ELSE -- it was a prefix key or delete-emptied word before - IF conflict_entry.suggestions @> new_entry.suggestions THEN -- already have all suggestions here... - RETURN QUERY EXECUTE $q$ - UPDATE search.symspell_dictionary - SET $q$ || source_class || $q$_count = $2 - WHERE prefix_key = $1 - RETURNING * $q$ - USING new_entry.prefix_key, GREATEST(0, new_entry.count); - ELSE -- new suggestion! - RETURN QUERY EXECUTE $q$ - UPDATE search.symspell_dictionary - SET $q$ || source_class || $q$_count = $2, - $q$ || source_class || $q$_suggestions = $3 - WHERE prefix_key = $1 - RETURNING * $q$ - USING new_entry.prefix_key, GREATEST(0, new_entry.count), evergreen.text_array_merge_unique(conflict_entry.suggestions,new_entry.suggestions); - END IF; - END IF; - ELSE - -- We keep the on-conflict clause just in case... - RETURN QUERY EXECUTE $q$ - INSERT INTO search.symspell_dictionary AS d ( - $q$ || source_class || $q$_count, - prefix_key, - $q$ || source_class || $q$_suggestions - ) VALUES ( $1, $2, $3 ) ON CONFLICT (prefix_key) DO - UPDATE SET $q$ || source_class || $q$_count = d.$q$ || source_class || $q$_count + EXCLUDED.$q$ || source_class || $q$_count, - $q$ || source_class || $q$_suggestions = evergreen.text_array_merge_unique(d.$q$ || source_class || $q$_suggestions, EXCLUDED.$q$ || source_class || $q$_suggestions) - RETURNING * $q$ - USING new_entry.count, new_entry.prefix_key, new_entry.suggestions; - END IF; - ELSE -- key only, or no change - IF conflict_entry.prefix_key IS NOT NULL THEN -- we'll be updating - IF NOT conflict_entry.suggestions @> new_entry.suggestions THEN -- There are new suggestions - RETURN QUERY EXECUTE $q$ - UPDATE search.symspell_dictionary - SET $q$ || source_class || $q$_suggestions = $2 - WHERE prefix_key = $1 - RETURNING * $q$ - USING new_entry.prefix_key, evergreen.text_array_merge_unique(conflict_entry.suggestions,new_entry.suggestions); - END IF; - ELSE - RETURN QUERY EXECUTE $q$ - INSERT INTO search.symspell_dictionary AS d ( - $q$ || source_class || $q$_count, - prefix_key, - $q$ || source_class || $q$_suggestions - ) VALUES ( $1, $2, $3 ) ON CONFLICT (prefix_key) DO -- key exists, suggestions may be added due to this entry - UPDATE SET $q$ || source_class || $q$_suggestions = evergreen.text_array_merge_unique(d.$q$ || source_class || $q$_suggestions, EXCLUDED.$q$ || source_class || $q$_suggestions) - RETURNING * $q$ - USING new_entry.count, new_entry.prefix_key, new_entry.suggestions; - END IF; - END IF; - END LOOP; -END; -$F$ LANGUAGE PLPGSQL; - -COMMIT; - -\qecho '' -\qecho 'The following should be run at the end of the upgrade before any' -\qecho 'reingest occurs. Because new triggers are installed already,' -\qecho 'updates to indexed strings will cause zero-count dictionary entries' -\qecho 'to be recorded which will require updating every row again (or' -\qecho 'starting from scratch) so best to do this before other batch' -\qecho 'changes. A later reingest that does not significantly change' -\qecho 'indexed strings will /not/ cause table bloat here, and will be' -\qecho 'as fast as normal. A copy of the SQL in a ready-to-use, non-escaped' -\qecho 'form is available inside a comment at the end of this upgrade sub-' -\qecho 'script so you do not need to copy this comment from the psql ouptut.' -\qecho '' -\qecho '\\a' -\qecho '\\t' -\qecho '' -\qecho '\\o title' -\qecho 'select value from metabib.title_field_entry where source in (select id from biblio.record_entry where not deleted);' -\qecho '\\o author' -\qecho 'select value from metabib.author_field_entry where source in (select id from biblio.record_entry where not deleted);' -\qecho '\\o subject' -\qecho 'select value from metabib.subject_field_entry where source in (select id from biblio.record_entry where not deleted);' -\qecho '\\o series' -\qecho 'select value from metabib.series_field_entry where source in (select id from biblio.record_entry where not deleted);' -\qecho '\\o identifier' -\qecho 'select value from metabib.identifier_field_entry where source in (select id from biblio.record_entry where not deleted);' -\qecho '\\o keyword' -\qecho 'select value from metabib.keyword_field_entry where source in (select id from biblio.record_entry where not deleted);' -\qecho '' -\qecho '\\o' -\qecho '\\a' -\qecho '\\t' -\qecho '' -\qecho '// Then, at the command line:' -\qecho '' -\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl title > title.sql' -\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl author > author.sql' -\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl subject > subject.sql' -\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl series > series.sql' -\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl identifier > identifier.sql' -\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl keyword > keyword.sql' -\qecho '' -\qecho '// And, back in psql' -\qecho '' -\qecho 'ALTER TABLE search.symspell_dictionary SET UNLOGGED;' -\qecho 'TRUNCATE search.symspell_dictionary;' -\qecho '' -\qecho '\\i identifier.sql' -\qecho '\\i author.sql' -\qecho '\\i title.sql' -\qecho '\\i subject.sql' -\qecho '\\i series.sql' -\qecho '\\i keyword.sql' -\qecho '' -\qecho 'CLUSTER search.symspell_dictionary USING symspell_dictionary_pkey;' -\qecho 'REINDEX TABLE search.symspell_dictionary;' -\qecho 'ALTER TABLE search.symspell_dictionary SET LOGGED;' -\qecho 'VACUUM ANALYZE search.symspell_dictionary;' -\qecho '' -\qecho 'DROP TABLE search.symspell_dictionary_partial_title;' -\qecho 'DROP TABLE search.symspell_dictionary_partial_author;' -\qecho 'DROP TABLE search.symspell_dictionary_partial_subject;' -\qecho 'DROP TABLE search.symspell_dictionary_partial_series;' -\qecho 'DROP TABLE search.symspell_dictionary_partial_identifier;' -\qecho 'DROP TABLE search.symspell_dictionary_partial_keyword;' - -/* To run by hand: - -\a -\t - -\o title -select value from metabib.title_field_entry where source in (select id from biblio.record_entry where not deleted); - -\o author -select value from metabib.author_field_entry where source in (select id from biblio.record_entry where not deleted); - -\o subject -select value from metabib.subject_field_entry where source in (select id from biblio.record_entry where not deleted); - -\o series -select value from metabib.series_field_entry where source in (select id from biblio.record_entry where not deleted); - -\o identifier -select value from metabib.identifier_field_entry where source in (select id from biblio.record_entry where not deleted); - -\o keyword -select value from metabib.keyword_field_entry where source in (select id from biblio.record_entry where not deleted); - -\o -\a -\t - -// Then, at the command line: - -$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl title > title.sql -$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl author > author.sql -$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl subject > subject.sql -$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl series > series.sql -$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl identifier > identifier.sql -$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl keyword > keyword.sql - -// To the extent your hardware allows, the above commands can be run in -// in parallel, in different shells. Each will use a full CPU, and RAM -// may be a limiting resource, so keep an eye on that with `top`. - - -// And, back in psql - -ALTER TABLE search.symspell_dictionary SET UNLOGGED; -TRUNCATE search.symspell_dictionary; - -\i identifier.sql -\i author.sql -\i title.sql -\i subject.sql -\i series.sql -\i keyword.sql - -CLUSTER search.symspell_dictionary USING symspell_dictionary_pkey; -REINDEX TABLE search.symspell_dictionary; -ALTER TABLE search.symspell_dictionary SET LOGGED; -VACUUM ANALYZE search.symspell_dictionary; - -DROP TABLE search.symspell_dictionary_partial_title; -DROP TABLE search.symspell_dictionary_partial_author; -DROP TABLE search.symspell_dictionary_partial_subject; -DROP TABLE search.symspell_dictionary_partial_series; -DROP TABLE search.symspell_dictionary_partial_identifier; -DROP TABLE search.symspell_dictionary_partial_keyword; - -*/ -