-- Evergreen upgrade script 1282: "did you mean" (symspell) lookup optimization.
--
-- Provenance (recovered from the whitespace-collapsed patch this file held):
--   From:          Jason Boyer
--   Date:          Fri, 27 Aug 2021 18:45:54 +0000 (-0400)
--   Subject:       LP1931162: Stamp Upgrade Script
--   X-Git-Url:     https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=f50b5e9c1b3a8e3ee163d10294e363db3bc5f3e4;p=Evergreen.git
--   Signed-off-by: Jason Boyer
--
-- The patch contained three file changes:
--   1. Open-ILS/src/sql/Pg/002.schema.config.sql (index 4e6cee2173..177f27944f),
--      hunk @@ -92,7 +92,7 @@, replacing the stamp line
--          INSERT INTO config.upgrade_log (version, applied_to) VALUES ('1281', :eg_version); -- berick/rfrasur/gmcharlt
--      with
--          INSERT INTO config.upgrade_log (version, applied_to) VALUES ('1282', :eg_version); -- miker/slink/jboyer
--   2. New file Open-ILS/src/sql/Pg/upgrade/1282.function.did_you_mean_optimization.sql
--      (index 0000000000..9a3170c520) -- the script below.
--   3. Deleted file Open-ILS/src/sql/Pg/upgrade/XXXX.function.did_you_mean_optimization.sql
--      (index a72c76f709..0000000000), whose text matched the added script
--      except that it passed 'XXXX' instead of '1282' to
--      evergreen.upgrade_deps_block_check().
--
-- Review changes relative to the script text carried by the patch:
--   * restored the loop labels (they had been reduced to "<>");
--   * return an empty set instead of raising when no words are parsed;
--   * integer weights are compared with 0 instead of being used as booleans;
--   * dynamically built column names are passed through quote_ident();
--   * comment typos fixed.

BEGIN;

-- NOTE(review): presumably verifies dependencies for, and records, upgrade
-- 1282; the helper is defined elsewhere. :eg_version is a psql variable that
-- must be set by the caller.
SELECT evergreen.upgrade_deps_block_check('1282', :eg_version);

-- search.symspell_lookup
--
-- Produces spelling suggestions for each word of raw_input by looking up the
-- word (or its prefix) and generated edits of it in
-- search.symspell_dictionary, then ranking the candidates by edit distance
-- and the optional weighted similarity measures.
--
-- Parameters:
--   raw_input       Text to check; split into words by
--                   search.symspell_parse_words().
--   search_class    Selects the dictionary columns
--                   <search_class>_suggestions and <search_class>_count.
--   verbosity       0 = single best suggestion per word;
--                   1 = all suggestions in the smallest edit distance group;
--                   2 = everything found (default);
--                   3 = the two smallest edit distance groups;
--                   4 = the two smallest non-zero edit distance groups.
--   xfer_case       When true, suggestions are passed through
--                   search.symspell_transfer_casing() with the input word.
--   count_threshold Minimum <search_class>_count for a suggestion
--                   (NULL is treated as 1).
--   soundex_weight, pg_trgm_weight, kbdist_weight
--                   Integer multipliers applied to soundex_sim, pg_trgm_sim
--                   and qwerty_kb_match when ordering results. A measure is
--                   only computed when its weight is non-zero; NULL counts
--                   as 0.
--
-- Returns: SETOF search.symspell_lookup_output, ordered per word as
-- described for each verbosity level. Returns no rows when raw_input yields
-- no words.
CREATE OR REPLACE FUNCTION search.symspell_lookup(
    raw_input       text,
    search_class    text,
    verbosity       integer DEFAULT 2,
    xfer_case       boolean DEFAULT false,
    count_threshold integer DEFAULT 1,
    soundex_weight  integer DEFAULT 0,
    pg_trgm_weight  integer DEFAULT 0,
    kbdist_weight   integer DEFAULT 0
) RETURNS SETOF search.symspell_lookup_output
 LANGUAGE plpgsql
AS $function$
DECLARE
    prefix_length INT;
    maxED         INT;
    good_suggs    HSTORE;
    word_list     TEXT[];
    edit_list     TEXT[] := '{}';
    seen_list     TEXT[] := '{}';
    output        search.symspell_lookup_output;
    output_list   search.symspell_lookup_output[];
    entry         RECORD;
    entry_key     TEXT;
    prefix_key    TEXT;
    sugg          TEXT;
    input         TEXT;
    word          TEXT;
    w_pos         INT := -1;
    smallest_ed   INT := -1;
    global_ed     INT;
    i_len         INT;
    l_maxED       INT;
    sugg_col      TEXT; -- quoted name of the <class>_suggestions column
    count_col     TEXT; -- quoted name of the <class>_count column
BEGIN
    -- Tunables come from enabled internal flags, with built-in fallbacks.
    SELECT value::INT INTO prefix_length FROM config.internal_flag WHERE name = 'symspell.prefix_length' AND enabled;
    prefix_length := COALESCE(prefix_length, 6);

    SELECT value::INT INTO maxED FROM config.internal_flag WHERE name = 'symspell.max_edit_distance' AND enabled;
    maxED := COALESCE(maxED, 3);

    -- Column names cannot be bound as parameters, so quote them once here
    -- rather than splicing the raw argument into each dynamic statement.
    -- lower() keeps the case folding that an unquoted identifier received.
    sugg_col  := quote_ident(lower(search_class) || '_suggestions');
    count_col := quote_ident(lower(search_class) || '_count');

    word_list := ARRAY_AGG(x) FROM search.symspell_parse_words(raw_input) x;

    -- ARRAY_AGG over zero rows yields NULL, and FOREACH over a NULL array
    -- raises an error; with no words there is nothing to suggest.
    IF word_list IS NULL THEN
        RETURN;
    END IF;

    -- Common case exact match test for performance
    -- NOTE(review): this path leaves output.word_pos unset, while the main
    -- path below numbers words from 0 -- confirm callers do not depend on it.
    IF verbosity = 0 AND CARDINALITY(word_list) = 1 AND CHARACTER_LENGTH(word_list[1]) <= prefix_length THEN
        EXECUTE
          'SELECT  ' || sugg_col || ' AS suggestions,
                   ' || count_col || ' AS count,
                   prefix_key
             FROM  search.symspell_dictionary
             WHERE prefix_key = $1
                   AND ' || count_col || ' >= $2
                   AND ' || sugg_col || ' @> ARRAY[$1]'
          INTO entry USING evergreen.lowercase(word_list[1]), COALESCE(count_threshold,1);
        IF entry.prefix_key IS NOT NULL THEN
            output.lev_distance := 0; -- definitionally
            output.prefix_key := entry.prefix_key;
            output.prefix_key_count := entry.count;
            output.suggestion_count := entry.count;
            output.input := word_list[1];
            IF xfer_case THEN
                output.suggestion := search.symspell_transfer_casing(output.input, entry.prefix_key);
            ELSE
                output.suggestion := entry.prefix_key;
            END IF;
            output.norm_input := entry.prefix_key;
            output.qwerty_kb_match := 1;
            output.pg_trgm_sim := 1;
            output.soundex_sim := 1;
            RETURN NEXT output;
            RETURN;
        END IF;
    END IF;

    <<word_loop>>
    FOREACH word IN ARRAY word_list LOOP
        w_pos := w_pos + 1;
        input := evergreen.lowercase(word);
        i_len := CHARACTER_LENGTH(input);
        l_maxED := maxED;

        -- Words longer than the prefix length are looked up by the whole
        -- word, by their prefix, and by edits of that prefix; shorter words
        -- by themselves and their edits.
        IF CHARACTER_LENGTH(input) > prefix_length THEN
            prefix_key := SUBSTRING(input FROM 1 FOR prefix_length);
            edit_list := ARRAY[input,prefix_key] || search.symspell_generate_edits(prefix_key, 1, l_maxED);
        ELSE
            edit_list := input || search.symspell_generate_edits(input, 1, l_maxED);
        END IF;

        -- Try the longest keys first.
        SELECT ARRAY_AGG(x ORDER BY CHARACTER_LENGTH(x) DESC) INTO edit_list FROM UNNEST(edit_list) x;

        -- Per-word state.
        output_list := '{}';
        seen_list := '{}';
        global_ed := NULL;

        <<entry_key_loop>>
        FOREACH entry_key IN ARRAY edit_list LOOP
            -- Start each key from the best distance seen so far for this
            -- word, or -1 when nothing has been found yet.
            smallest_ed := -1;
            IF global_ed IS NOT NULL THEN
                smallest_ed := global_ed;
            END IF;

            FOR entry IN EXECUTE
                'SELECT  ' || sugg_col || ' AS suggestions,
                         ' || count_col || ' AS count,
                         prefix_key
                   FROM  search.symspell_dictionary
                   WHERE prefix_key = $1
                         AND ' || sugg_col || ' IS NOT NULL'
                USING entry_key
            LOOP

                -- Map each not-yet-seen suggestion of plausible length to
                -- its edit distance from the input.
                -- NOTE(review): the BETWEEN 0 AND l_maxED test suggests the
                -- distance function returns a negative value when the limit
                -- is exceeded -- confirm against its definition.
                SELECT  HSTORE(
                            ARRAY_AGG(
                                ARRAY[s, evergreen.levenshtein_damerau_edistance(input,s,l_maxED)::TEXT]
                                    ORDER BY evergreen.levenshtein_damerau_edistance(input,s,l_maxED) DESC
                            )
                        )
                  INTO  good_suggs
                  FROM  UNNEST(entry.suggestions) s
                  WHERE (ABS(CHARACTER_LENGTH(s) - i_len) <= maxED AND evergreen.levenshtein_damerau_edistance(input,s,l_maxED) BETWEEN 0 AND l_maxED)
                        AND NOT seen_list @> ARRAY[s];

                CONTINUE WHEN good_suggs IS NULL;

                FOR sugg, output.suggestion_count IN EXECUTE
                    'SELECT  prefix_key, ' || count_col || '
                       FROM  search.symspell_dictionary
                       WHERE prefix_key = ANY ($1)
                             AND ' || count_col || ' >= $2'
                    USING AKEYS(good_suggs), COALESCE(count_threshold,1)
                LOOP

                    output.lev_distance := good_suggs->sugg;
                    seen_list := seen_list || sugg;

                    -- Track the smallest edit distance among suggestions from this prefix key.
                    IF smallest_ed = -1 OR output.lev_distance < smallest_ed THEN
                        smallest_ed := output.lev_distance;
                    END IF;

                    -- Track the smallest edit distance for all prefix keys for this word.
                    IF global_ed IS NULL OR smallest_ed < global_ed THEN
                        global_ed := smallest_ed;
                        -- And if low verbosity, ignore suggs with a larger distance from here on.
                        IF verbosity <= 1 THEN
                            l_maxED := global_ed;
                        END IF;
                    END IF;

                    -- Lev distance is our main similarity measure. While
                    -- trgm or soundex similarity could be the main filter,
                    -- Lev is both language agnostic and faster.
                    --
                    -- Here we will skip suggestions that have a longer edit distance
                    -- than the shortest we've already found. This is simply an
                    -- optimization that allows us to avoid further processing
                    -- of this entry. It would be filtered out later.
                    CONTINUE WHEN output.lev_distance > global_ed AND verbosity <= 1;

                    -- If we have an exact match on the suggestion key we can also avoid
                    -- some function calls.
                    IF output.lev_distance = 0 THEN
                        output.qwerty_kb_match := 1;
                        output.pg_trgm_sim := 1;
                        output.soundex_sim := 1;
                    ELSE
                        -- The weights are integers, not booleans: compare
                        -- them with 0 explicitly so that any non-zero weight
                        -- enables its measure.
                        IF COALESCE(kbdist_weight, 0) <> 0 THEN
                            output.qwerty_kb_match := evergreen.qwerty_keyboard_distance_match(input, sugg);
                        ELSE
                            output.qwerty_kb_match := 0;
                        END IF;
                        IF COALESCE(pg_trgm_weight, 0) <> 0 THEN
                            output.pg_trgm_sim := similarity(input, sugg);
                        ELSE
                            output.pg_trgm_sim := 0;
                        END IF;
                        IF COALESCE(soundex_weight, 0) <> 0 THEN
                            -- NOTE(review): presumably fuzzystrmatch's
                            -- difference(), which ranges 0..4 -- confirm.
                            output.soundex_sim := difference(input, sugg) / 4.0;
                        ELSE
                            output.soundex_sim := 0;
                        END IF;
                    END IF;

                    -- Fill in some fields
                    IF xfer_case AND input <> word THEN
                        output.suggestion := search.symspell_transfer_casing(word, sugg);
                    ELSE
                        output.suggestion := sugg;
                    END IF;
                    output.prefix_key := entry.prefix_key;
                    output.prefix_key_count := entry.count;
                    output.input := word;
                    output.norm_input := input;
                    output.word_pos := w_pos;

                    -- We can't "cache" a set of generated records directly, so
                    -- here we build up an array of search.symspell_lookup_output
                    -- records that we can revivify later as a table using UNNEST().
                    output_list := output_list || output;

                    EXIT entry_key_loop WHEN smallest_ed = 0 AND verbosity = 0; -- exact match early exit
                    CONTINUE entry_key_loop WHEN smallest_ed = 0 AND verbosity = 1; -- exact match early jump to the next key

                END LOOP; -- loop over suggestions
            END LOOP; -- loop over entries
        END LOOP; -- loop over entry_keys

        -- Now we're done examining this word
        IF verbosity = 0 THEN
            -- Return the "best" suggestion from the smallest edit
            -- distance group.  We define best based on the weighting
            -- of the non-lev similarity measures and use the suggestion
            -- use count to break ties.
            RETURN QUERY
                SELECT * FROM UNNEST(output_list)
                    ORDER BY lev_distance,
                        (soundex_sim * COALESCE(soundex_weight,0))
                            + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
                            + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
                        suggestion_count DESC
                        LIMIT 1;
        ELSIF verbosity = 1 THEN
            -- Return all suggestions from the smallest
            -- edit distance group.  (smallest_ed equals the best distance
            -- found for this word whenever output_list is non-empty.)
            RETURN QUERY
                SELECT * FROM UNNEST(output_list) WHERE lev_distance = smallest_ed
                    ORDER BY (soundex_sim * COALESCE(soundex_weight,0))
                            + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
                            + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
                        suggestion_count DESC;
        ELSIF verbosity = 2 THEN
            -- Return everything we find, along with relevant stats
            RETURN QUERY
                SELECT * FROM UNNEST(output_list)
                    ORDER BY lev_distance,
                        (soundex_sim * COALESCE(soundex_weight,0))
                            + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
                            + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
                        suggestion_count DESC;
        ELSIF verbosity = 3 THEN
            -- Return everything we find from the two smallest edit distance groups
            RETURN QUERY
                SELECT * FROM UNNEST(output_list)
                    WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) ORDER BY lev_distance LIMIT 2)
                    ORDER BY lev_distance,
                        (soundex_sim * COALESCE(soundex_weight,0))
                            + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
                            + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
                        suggestion_count DESC;
        ELSIF verbosity = 4 THEN
            -- Return everything we find from the two smallest edit distance groups that are NOT 0 distance
            RETURN QUERY
                SELECT * FROM UNNEST(output_list)
                    WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) WHERE lev_distance > 0 ORDER BY lev_distance LIMIT 2)
                    ORDER BY lev_distance,
                        (soundex_sim * COALESCE(soundex_weight,0))
                            + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
                            + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
                        suggestion_count DESC;
        END IF;
    END LOOP; -- loop over words
END;
$function$;

COMMIT;