From 2623d2c0ce8fb6c492b3bb0a4cc4aa45b24c85f8 Mon Sep 17 00:00:00 2001 From: Mike Rylander Date: Thu, 14 Oct 2021 09:57:24 -0400 Subject: [PATCH] LP#1947173: Speed up the symspell part of ingest For certain data, and certain data set sizes, merging the suggestion arrays used by the symspell algorithm is noticably expensive. This is the case for suggestion arrays containing many thousands of entries. These suggestion sets are not only slow, but generally not useful. We avoid the creation of such overly long suggestion sets using several word filters that take advantage of our knowledge of the incoming data to optimize for what is useful in a bibliographic context. The mechanisms employed by this patch are: - Omit suggestions whose length is longer than the max prefix key length when the prefix key length is less than or equal to the maximum prefix key length minus the maximum edit distance. - Omit words that contain a run of 5 or more digits. This will drop most identifiers from the dictionary while still allowing suggestions to happen for year values. - Omit empty keys from the dictionary. This should have been the case already but is now enforced directly. - Add a small speedup to evergreen.text_array_merge_unique() by making it assume that arrays passed to it do not have null values, which we intentionally avoid, and against which we protect in other ways in the commit. Besides improving reingest speed, the patches will also make the search.symspell_dictionary table significantly smaller. Signed-off-by: Mike Rylander Signed-off-by: Galen Charlton --- Open-ILS/src/sql/Pg/300.schema.staged_search.sql | 22 +- .../upgrade/XXXX.schema.symspell-speed-ingest.sql | 420 +++++++++++++++++++++ Open-ILS/src/support-scripts/symspell-sideload.pl | 7 +- 3 files changed, 444 insertions(+), 5 deletions(-) create mode 100644 Open-ILS/src/sql/Pg/upgrade/XXXX.schema.symspell-speed-ingest.sql diff --git a/Open-ILS/src/sql/Pg/300.schema.staged_search.sql b/Open-ILS/src/sql/Pg/300.schema.staged_search.sql index ece24083ba..d06c56b9d6 100644 --- a/Open-ILS/src/sql/Pg/300.schema.staged_search.sql +++ b/Open-ILS/src/sql/Pg/300.schema.staged_search.sql @@ -1472,13 +1472,14 @@ $f$ LANGUAGE PLPGSQL ROWS 10; -- SymSpell implementation follows +-- We don't pass this function arrays with nulls, so we save 5% not testing for that CREATE OR REPLACE FUNCTION evergreen.text_array_merge_unique ( TEXT[], TEXT[] ) RETURNS TEXT[] AS $F$ SELECT NULLIF(ARRAY( - SELECT * FROM UNNEST($1) x WHERE x IS NOT NULL + SELECT * FROM UNNEST($1) x UNION - SELECT * FROM UNNEST($2) y WHERE y IS NOT NULL + SELECT * FROM UNNEST($2) y ),'{}'); $F$ LANGUAGE SQL; @@ -1871,6 +1872,9 @@ BEGIN END IF; FOREACH del_key IN ARRAY key_list LOOP + -- skip empty keys + CONTINUE WHEN del_key IS NULL OR CHARACTER_LENGTH(del_key) = 0; + entry.prefix_key := del_key; entry.keyword_count := 0; @@ -1909,6 +1913,11 @@ BEGIN FOR del_key IN SELECT x FROM UNNEST(search.symspell_generate_edits(key, 1, maxED)) x LOOP + -- skip empty keys + CONTINUE WHEN del_key IS NULL OR CHARACTER_LENGTH(del_key) = 0; + -- skip suggestions that are already too long for the prefix key + CONTINUE WHEN CHARACTER_LENGTH(del_key) <= (prefix_length - maxED) AND CHARACTER_LENGTH(raw_input) > prefix_length; + entry.keyword_suggestions := '{}'; entry.title_suggestions := '{}'; entry.author_suggestions := '{}'; @@ -1968,6 +1977,8 @@ BEGIN END IF; FOREACH word IN ARRAY word_list LOOP + -- Skip words that have runs of 5 or more digits (I'm looking at you, ISxNs) + CONTINUE WHEN CHARACTER_LENGTH(word) > 4 AND word ~ '\d{5,}'; RETURN QUERY SELECT * FROM search.symspell_build_raw_entry(word, source_class, FALSE, prefix_length, maxED); END LOOP; END IF; @@ -1976,6 +1987,9 @@ BEGIN input := evergreen.lowercase(old_input); FOR word IN SELECT x FROM search.symspell_parse_words_distinct(input) x LOOP + -- similarly skip words that have 5 or more digits here to + -- avoid adding erroneous prefix deletion entries to the dictionary + CONTINUE WHEN CHARACTER_LENGTH(word) > 4 AND word ~ '\d{5,}'; entry.prefix_key := word; entry.keyword_count := 0; @@ -2023,9 +2037,9 @@ BEGIN FOR new_entry IN EXECUTE $q$ SELECT count, prefix_key, - evergreen.text_array_merge_unique(s,'{}') suggestions + s AS suggestions FROM (SELECT prefix_key, - ARRAY_AGG($q$ || source_class || $q$_suggestions[1]) s, + ARRAY_AGG(DISTINCT $q$ || source_class || $q$_suggestions[1]) s, SUM($q$ || source_class || $q$_count) count FROM search.symspell_build_entries($1, $2, $3, $4) GROUP BY 1) x diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.symspell-speed-ingest.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.symspell-speed-ingest.sql new file mode 100644 index 0000000000..150135794e --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.symspell-speed-ingest.sql @@ -0,0 +1,420 @@ +BEGIN; + +SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version); + +-- We don't pass this function arrays with nulls, so we save 5% not testing for that +CREATE OR REPLACE FUNCTION evergreen.text_array_merge_unique ( + TEXT[], TEXT[] +) RETURNS TEXT[] AS $F$ + SELECT NULLIF(ARRAY( + SELECT * FROM UNNEST($1) x + UNION + SELECT * FROM UNNEST($2) y + ),'{}'); +$F$ LANGUAGE SQL; + +CREATE OR REPLACE FUNCTION search.symspell_build_raw_entry ( + raw_input TEXT, + source_class TEXT, + no_limit BOOL DEFAULT FALSE, + prefix_length INT DEFAULT 6, + maxED INT DEFAULT 3 +) RETURNS SETOF search.symspell_dictionary AS $F$ +DECLARE + key TEXT; + del_key TEXT; + key_list TEXT[]; + entry search.symspell_dictionary%ROWTYPE; +BEGIN + key := raw_input; + + IF NOT no_limit AND CHARACTER_LENGTH(raw_input) > prefix_length THEN + key := SUBSTRING(key FROM 1 FOR prefix_length); + key_list := ARRAY[raw_input, key]; + ELSE + key_list := ARRAY[key]; + END IF; + + FOREACH del_key IN ARRAY key_list LOOP + -- skip empty keys + CONTINUE WHEN del_key IS NULL OR CHARACTER_LENGTH(del_key) = 0; + + entry.prefix_key := del_key; + + entry.keyword_count := 0; + entry.title_count := 0; + entry.author_count := 0; + entry.subject_count := 0; + entry.series_count := 0; + entry.identifier_count := 0; + + entry.keyword_suggestions := '{}'; + entry.title_suggestions := '{}'; + entry.author_suggestions := '{}'; + entry.subject_suggestions := '{}'; + entry.series_suggestions := '{}'; + entry.identifier_suggestions := '{}'; + + IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'title' THEN entry.title_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'author' THEN entry.author_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'subject' THEN entry.subject_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'series' THEN entry.series_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'identifier' THEN entry.identifier_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF; + + IF del_key = raw_input THEN + IF source_class = 'keyword' THEN entry.keyword_count := 1; END IF; + IF source_class = 'title' THEN entry.title_count := 1; END IF; + IF source_class = 'author' THEN entry.author_count := 1; END IF; + IF source_class = 'subject' THEN entry.subject_count := 1; END IF; + IF source_class = 'series' THEN entry.series_count := 1; END IF; + IF source_class = 'identifier' THEN entry.identifier_count := 1; END IF; + END IF; + + RETURN NEXT entry; + END LOOP; + + FOR del_key IN SELECT x FROM UNNEST(search.symspell_generate_edits(key, 1, maxED)) x LOOP + + -- skip empty keys + CONTINUE WHEN del_key IS NULL OR CHARACTER_LENGTH(del_key) = 0; + -- skip suggestions that are already too long for the prefix key + CONTINUE WHEN CHARACTER_LENGTH(del_key) <= (prefix_length - maxED) AND CHARACTER_LENGTH(raw_input) > prefix_length; + + entry.keyword_suggestions := '{}'; + entry.title_suggestions := '{}'; + entry.author_suggestions := '{}'; + entry.subject_suggestions := '{}'; + entry.series_suggestions := '{}'; + entry.identifier_suggestions := '{}'; + + IF source_class = 'keyword' THEN entry.keyword_count := 0; END IF; + IF source_class = 'title' THEN entry.title_count := 0; END IF; + IF source_class = 'author' THEN entry.author_count := 0; END IF; + IF source_class = 'subject' THEN entry.subject_count := 0; END IF; + IF source_class = 'series' THEN entry.series_count := 0; END IF; + IF source_class = 'identifier' THEN entry.identifier_count := 0; END IF; + + entry.prefix_key := del_key; + + IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'title' THEN entry.title_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'author' THEN entry.author_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'subject' THEN entry.subject_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'series' THEN entry.series_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'identifier' THEN entry.identifier_suggestions := ARRAY[raw_input]; END IF; + IF source_class = 'keyword' THEN entry.keyword_suggestions := ARRAY[raw_input]; END IF; + + RETURN NEXT entry; + END LOOP; + +END; +$F$ LANGUAGE PLPGSQL STRICT IMMUTABLE; + +CREATE OR REPLACE FUNCTION search.symspell_build_entries ( + full_input TEXT, + source_class TEXT, + old_input TEXT DEFAULT NULL, + include_phrases BOOL DEFAULT FALSE +) RETURNS SETOF search.symspell_dictionary AS $F$ +DECLARE + prefix_length INT; + maxED INT; + word_list TEXT[]; + input TEXT; + word TEXT; + entry search.symspell_dictionary; +BEGIN + IF full_input IS NOT NULL THEN + SELECT value::INT INTO prefix_length FROM config.internal_flag WHERE name = 'symspell.prefix_length' AND enabled; + prefix_length := COALESCE(prefix_length, 6); + + SELECT value::INT INTO maxED FROM config.internal_flag WHERE name = 'symspell.max_edit_distance' AND enabled; + maxED := COALESCE(maxED, 3); + + input := evergreen.lowercase(full_input); + word_list := ARRAY_AGG(x) FROM search.symspell_parse_words_distinct(input) x; + + IF CARDINALITY(word_list) > 1 AND include_phrases THEN + RETURN QUERY SELECT * FROM search.symspell_build_raw_entry(input, source_class, TRUE, prefix_length, maxED); + END IF; + + FOREACH word IN ARRAY word_list LOOP + -- Skip words that have runs of 5 or more digits (I'm looking at you, ISxNs) + CONTINUE WHEN CHARACTER_LENGTH(word) > 4 AND word ~ '\d{5,}'; + RETURN QUERY SELECT * FROM search.symspell_build_raw_entry(word, source_class, FALSE, prefix_length, maxED); + END LOOP; + END IF; + + IF old_input IS NOT NULL THEN + input := evergreen.lowercase(old_input); + + FOR word IN SELECT x FROM search.symspell_parse_words_distinct(input) x LOOP + -- similarly skip words that have 5 or more digits here to + -- avoid adding erroneous prefix deletion entries to the dictionary + CONTINUE WHEN CHARACTER_LENGTH(word) > 4 AND word ~ '\d{5,}'; + entry.prefix_key := word; + + entry.keyword_count := 0; + entry.title_count := 0; + entry.author_count := 0; + entry.subject_count := 0; + entry.series_count := 0; + entry.identifier_count := 0; + + entry.keyword_suggestions := '{}'; + entry.title_suggestions := '{}'; + entry.author_suggestions := '{}'; + entry.subject_suggestions := '{}'; + entry.series_suggestions := '{}'; + entry.identifier_suggestions := '{}'; + + IF source_class = 'keyword' THEN entry.keyword_count := -1; END IF; + IF source_class = 'title' THEN entry.title_count := -1; END IF; + IF source_class = 'author' THEN entry.author_count := -1; END IF; + IF source_class = 'subject' THEN entry.subject_count := -1; END IF; + IF source_class = 'series' THEN entry.series_count := -1; END IF; + IF source_class = 'identifier' THEN entry.identifier_count := -1; END IF; + + RETURN NEXT entry; + END LOOP; + END IF; +END; +$F$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION search.symspell_build_and_merge_entries ( + full_input TEXT, + source_class TEXT, + old_input TEXT DEFAULT NULL, + include_phrases BOOL DEFAULT FALSE +) RETURNS SETOF search.symspell_dictionary AS $F$ +DECLARE + new_entry RECORD; + conflict_entry RECORD; +BEGIN + + IF full_input = old_input THEN -- neither NULL, and are the same + RETURN; + END IF; + + FOR new_entry IN EXECUTE $q$ + SELECT count, + prefix_key, + s AS suggestions + FROM (SELECT prefix_key, + ARRAY_AGG(DISTINCT $q$ || source_class || $q$_suggestions[1]) s, + SUM($q$ || source_class || $q$_count) count + FROM search.symspell_build_entries($1, $2, $3, $4) + GROUP BY 1) x + $q$ USING full_input, source_class, old_input, include_phrases + LOOP + EXECUTE $q$ + SELECT prefix_key, + $q$ || source_class || $q$_suggestions suggestions, + $q$ || source_class || $q$_count count + FROM search.symspell_dictionary + WHERE prefix_key = $1 $q$ + INTO conflict_entry + USING new_entry.prefix_key; + + IF new_entry.count <> 0 THEN -- Real word, and count changed + IF conflict_entry.prefix_key IS NOT NULL THEN -- we'll be updating + IF conflict_entry.count > 0 THEN -- it's a real word + RETURN QUERY EXECUTE $q$ + UPDATE search.symspell_dictionary + SET $q$ || source_class || $q$_count = $2 + WHERE prefix_key = $1 + RETURNING * $q$ + USING new_entry.prefix_key, GREATEST(0, new_entry.count + conflict_entry.count); + ELSE -- it was a prefix key or delete-emptied word before + IF conflict_entry.suggestions @> new_entry.suggestions THEN -- already have all suggestions here... + RETURN QUERY EXECUTE $q$ + UPDATE search.symspell_dictionary + SET $q$ || source_class || $q$_count = $2 + WHERE prefix_key = $1 + RETURNING * $q$ + USING new_entry.prefix_key, GREATEST(0, new_entry.count); + ELSE -- new suggestion! + RETURN QUERY EXECUTE $q$ + UPDATE search.symspell_dictionary + SET $q$ || source_class || $q$_count = $2, + $q$ || source_class || $q$_suggestions = $3 + WHERE prefix_key = $1 + RETURNING * $q$ + USING new_entry.prefix_key, GREATEST(0, new_entry.count), evergreen.text_array_merge_unique(conflict_entry.suggestions,new_entry.suggestions); + END IF; + END IF; + ELSE + -- We keep the on-conflict clause just in case... + RETURN QUERY EXECUTE $q$ + INSERT INTO search.symspell_dictionary AS d ( + $q$ || source_class || $q$_count, + prefix_key, + $q$ || source_class || $q$_suggestions + ) VALUES ( $1, $2, $3 ) ON CONFLICT (prefix_key) DO + UPDATE SET $q$ || source_class || $q$_count = d.$q$ || source_class || $q$_count + EXCLUDED.$q$ || source_class || $q$_count, + $q$ || source_class || $q$_suggestions = evergreen.text_array_merge_unique(d.$q$ || source_class || $q$_suggestions, EXCLUDED.$q$ || source_class || $q$_suggestions) + RETURNING * $q$ + USING new_entry.count, new_entry.prefix_key, new_entry.suggestions; + END IF; + ELSE -- key only, or no change + IF conflict_entry.prefix_key IS NOT NULL THEN -- we'll be updating + IF NOT conflict_entry.suggestions @> new_entry.suggestions THEN -- There are new suggestions + RETURN QUERY EXECUTE $q$ + UPDATE search.symspell_dictionary + SET $q$ || source_class || $q$_suggestions = $2 + WHERE prefix_key = $1 + RETURNING * $q$ + USING new_entry.prefix_key, evergreen.text_array_merge_unique(conflict_entry.suggestions,new_entry.suggestions); + END IF; + ELSE + RETURN QUERY EXECUTE $q$ + INSERT INTO search.symspell_dictionary AS d ( + $q$ || source_class || $q$_count, + prefix_key, + $q$ || source_class || $q$_suggestions + ) VALUES ( $1, $2, $3 ) ON CONFLICT (prefix_key) DO -- key exists, suggestions may be added due to this entry + UPDATE SET $q$ || source_class || $q$_suggestions = evergreen.text_array_merge_unique(d.$q$ || source_class || $q$_suggestions, EXCLUDED.$q$ || source_class || $q$_suggestions) + RETURNING * $q$ + USING new_entry.count, new_entry.prefix_key, new_entry.suggestions; + END IF; + END IF; + END LOOP; +END; +$F$ LANGUAGE PLPGSQL; + +COMMIT; + +\qecho '' +\qecho 'The following should be run at the end of the upgrade before any' +\qecho 'reingest occurs. Because new triggers are installed already,' +\qecho 'updates to indexed strings will cause zero-count dictionary entries' +\qecho 'to be recorded which will require updating every row again (or' +\qecho 'starting from scratch) so best to do this before other batch' +\qecho 'changes. A later reingest that does not significantly change' +\qecho 'indexed strings will /not/ cause table bloat here, and will be' +\qecho 'as fast as normal. A copy of the SQL in a ready-to-use, non-escaped' +\qecho 'form is available inside a comment at the end of this upgrade sub-' +\qecho 'script so you do not need to copy this comment from the psql ouptut.' +\qecho '' +\qecho '\\a' +\qecho '\\t' +\qecho '' +\qecho '\\o title' +\qecho 'select value from metabib.title_field_entry where source in (select id from biblio.record_entry where not deleted);' +\qecho '\\o author' +\qecho 'select value from metabib.author_field_entry where source in (select id from biblio.record_entry where not deleted);' +\qecho '\\o subject' +\qecho 'select value from metabib.subject_field_entry where source in (select id from biblio.record_entry where not deleted);' +\qecho '\\o series' +\qecho 'select value from metabib.series_field_entry where source in (select id from biblio.record_entry where not deleted);' +\qecho '\\o identifier' +\qecho 'select value from metabib.identifier_field_entry where source in (select id from biblio.record_entry where not deleted);' +\qecho '\\o keyword' +\qecho 'select value from metabib.keyword_field_entry where source in (select id from biblio.record_entry where not deleted);' +\qecho '' +\qecho '\\o' +\qecho '\\a' +\qecho '\\t' +\qecho '' +\qecho '// Then, at the command line:' +\qecho '' +\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl title > title.sql' +\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl author > author.sql' +\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl subject > subject.sql' +\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl series > series.sql' +\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl identifier > identifier.sql' +\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl keyword > keyword.sql' +\qecho '' +\qecho '// And, back in psql' +\qecho '' +\qecho 'ALTER TABLE search.symspell_dictionary SET UNLOGGED;' +\qecho 'TRUNCATE search.symspell_dictionary;' +\qecho '' +\qecho '\\i identifier.sql' +\qecho '\\i author.sql' +\qecho '\\i title.sql' +\qecho '\\i subject.sql' +\qecho '\\i series.sql' +\qecho '\\i keyword.sql' +\qecho '' +\qecho 'CLUSTER search.symspell_dictionary USING symspell_dictionary_pkey;' +\qecho 'REINDEX TABLE search.symspell_dictionary;' +\qecho 'ALTER TABLE search.symspell_dictionary SET LOGGED;' +\qecho 'VACUUM ANALYZE search.symspell_dictionary;' +\qecho '' +\qecho 'DROP TABLE search.symspell_dictionary_partial_title;' +\qecho 'DROP TABLE search.symspell_dictionary_partial_author;' +\qecho 'DROP TABLE search.symspell_dictionary_partial_subject;' +\qecho 'DROP TABLE search.symspell_dictionary_partial_series;' +\qecho 'DROP TABLE search.symspell_dictionary_partial_identifier;' +\qecho 'DROP TABLE search.symspell_dictionary_partial_keyword;' + +/* To run by hand: + +\a +\t + +\o title +select value from metabib.title_field_entry where source in (select id from biblio.record_entry where not deleted); + +\o author +select value from metabib.author_field_entry where source in (select id from biblio.record_entry where not deleted); + +\o subject +select value from metabib.subject_field_entry where source in (select id from biblio.record_entry where not deleted); + +\o series +select value from metabib.series_field_entry where source in (select id from biblio.record_entry where not deleted); + +\o identifier +select value from metabib.identifier_field_entry where source in (select id from biblio.record_entry where not deleted); + +\o keyword +select value from metabib.keyword_field_entry where source in (select id from biblio.record_entry where not deleted); + +\o +\a +\t + +// Then, at the command line: + +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl title > title.sql +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl author > author.sql +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl subject > subject.sql +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl series > series.sql +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl identifier > identifier.sql +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl keyword > keyword.sql + +// To the extent your hardware allows, the above commands can be run in +// in parallel, in different shells. Each will use a full CPU, and RAM +// may be a limiting resource, so keep an eye on that with `top`. + + +// And, back in psql + +ALTER TABLE search.symspell_dictionary SET UNLOGGED; +TRUNCATE search.symspell_dictionary; + +\i identifier.sql +\i author.sql +\i title.sql +\i subject.sql +\i series.sql +\i keyword.sql + +CLUSTER search.symspell_dictionary USING symspell_dictionary_pkey; +REINDEX TABLE search.symspell_dictionary; +ALTER TABLE search.symspell_dictionary SET LOGGED; +VACUUM ANALYZE search.symspell_dictionary; + +DROP TABLE search.symspell_dictionary_partial_title; +DROP TABLE search.symspell_dictionary_partial_author; +DROP TABLE search.symspell_dictionary_partial_subject; +DROP TABLE search.symspell_dictionary_partial_series; +DROP TABLE search.symspell_dictionary_partial_identifier; +DROP TABLE search.symspell_dictionary_partial_keyword; + +*/ + diff --git a/Open-ILS/src/support-scripts/symspell-sideload.pl b/Open-ILS/src/support-scripts/symspell-sideload.pl index 257f28b05c..e74120eb93 100755 --- a/Open-ILS/src/support-scripts/symspell-sideload.pl +++ b/Open-ILS/src/support-scripts/symspell-sideload.pl @@ -27,19 +27,24 @@ while (my $data = <>) { for my $raw (uniq @words) { my $key = $raw; + my $rlen = length($raw); + next if ($raw =~ /\d{5,}/ and $rlen > 4); + $dict{$key} //= [0,[]]; $dict{$key}[0]++; if ($dict{$key}[0] == 1) { # first time we've seen it, need to generate prefix keys push @{$dict{$key}[1]}, $raw; - if (length($raw) > $plen) { + if ($rlen > $plen) { $key = substr($raw,0,$plen); $dict{$key} //= [0,[]]; push @{$dict{$key}[1]}, $raw; } for my $edit (symspell_generate_edits($key, 1)) { + next unless length($edit); + next if (length($edit) <= ($plen - $maxed) and $rlen > $plen); $dict{$edit} //= [0,[]]; push @{$dict{$edit}[1]}, $raw; } -- 2.11.0