--- /dev/null
+BEGIN; -- open the transaction wrapping this whole script; it is closed by the COMMIT at the end
+
+SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version); -- NOTE(review): 'XXXX' looks like a placeholder for the upgrade number and :eg_version is a psql variable the runner must supply; confirm both before release
+
+-- search.symspell_lookup: spelling-suggestion lookup against
+-- search.symspell_dictionary for every word parsed out of raw_input.
+--
+-- Parameters:
+--   raw_input        text to check; split into words by search.symspell_parse_words()
+--   search_class     selects the <search_class>_suggestions and <search_class>_count
+--                    columns of search.symspell_dictionary
+--   verbosity        which suggestions are returned for each word:
+--                      0  the single best suggestion
+--                      1  every suggestion in the smallest edit-distance group
+--                      2  every suggestion found
+--                      3  the two smallest edit-distance groups
+--                      4  the two smallest non-zero edit-distance groups
+--                    any other value returns no rows
+--   xfer_case        when true, suggestions are passed through
+--                    search.symspell_transfer_casing() together with the input word
+--   count_threshold  minimum <search_class>_count for a suggestion (NULL means 1)
+--   soundex_weight, pg_trgm_weight, kbdist_weight
+--                    multipliers for the secondary similarity measures that order
+--                    suggestions with equal edit distance; a measure whose weight
+--                    is 0 or NULL is not computed and is reported as 0
+--
+-- Returns one search.symspell_lookup_output row per suggestion. word_pos is the
+-- zero-based position of the input word within raw_input.
+CREATE OR REPLACE FUNCTION search.symspell_lookup(
+    raw_input text,
+    search_class text,
+    verbosity integer DEFAULT 2,
+    xfer_case boolean DEFAULT false,
+    count_threshold integer DEFAULT 1,
+    soundex_weight integer DEFAULT 0,
+    pg_trgm_weight integer DEFAULT 0,
+    kbdist_weight integer DEFAULT 0
+) RETURNS SETOF search.symspell_lookup_output
+ LANGUAGE plpgsql
+AS $function$
+DECLARE
+    prefix_length   INT;
+    maxED           INT;
+    good_suggs      HSTORE;
+    word_list       TEXT[];
+    edit_list       TEXT[] := '{}';
+    seen_list       TEXT[] := '{}';
+    output          search.symspell_lookup_output;
+    output_list     search.symspell_lookup_output[];
+    entry           RECORD;
+    entry_key       TEXT;
+    prefix_key      TEXT;
+    sugg            TEXT;
+    input           TEXT;
+    word            TEXT;
+    w_pos           INT := -1;
+    smallest_ed     INT := -1;
+    global_ed       INT;
+    i_len           INT;
+    l_maxED         INT;
+    suggestions_col TEXT;
+    count_col       TEXT;
+BEGIN
+    -- Tunables come from config.internal_flag; fall back to built-in defaults
+    -- when a flag is missing or disabled.
+    SELECT value::INT INTO prefix_length FROM config.internal_flag WHERE name = 'symspell.prefix_length' AND enabled;
+    prefix_length := COALESCE(prefix_length, 6);
+
+    SELECT value::INT INTO maxED FROM config.internal_flag WHERE name = 'symspell.max_edit_distance' AND enabled;
+    maxED := COALESCE(maxED, 3);
+
+    -- search_class becomes part of two column names in dynamic SQL below, so
+    -- quote the resulting identifiers instead of splicing raw text into the
+    -- query. LOWER() keeps the case folding an unquoted identifier would get.
+    suggestions_col := quote_ident(LOWER(search_class) || '_suggestions');
+    count_col := quote_ident(LOWER(search_class) || '_count');
+
+    SELECT ARRAY_AGG(x) INTO word_list FROM search.symspell_parse_words(raw_input) x;
+
+    -- ARRAY_AGG over zero rows is NULL, and FOREACH raises an error on a NULL
+    -- array; with no words there is simply nothing to suggest.
+    IF word_list IS NULL THEN
+        RETURN;
+    END IF;
+
+    -- Common case exact match test for performance
+    IF verbosity = 0 AND CARDINALITY(word_list) = 1 AND CHARACTER_LENGTH(word_list[1]) <= prefix_length THEN
+        EXECUTE
+            'SELECT ' || suggestions_col || ' AS suggestions,
+                    ' || count_col || ' AS count,
+                    prefix_key
+               FROM search.symspell_dictionary
+              WHERE prefix_key = $1
+                    AND ' || count_col || ' >= $2
+                    AND ' || suggestions_col || ' @> ARRAY[$1]'
+            INTO entry USING evergreen.lowercase(word_list[1]), COALESCE(count_threshold, 1);
+        IF entry.prefix_key IS NOT NULL THEN
+            output.lev_distance := 0; -- definitionally
+            output.prefix_key := entry.prefix_key;
+            output.prefix_key_count := entry.count;
+            output.suggestion_count := entry.count;
+            output.input := word_list[1];
+            IF xfer_case THEN
+                output.suggestion := search.symspell_transfer_casing(output.input, entry.prefix_key);
+            ELSE
+                output.suggestion := entry.prefix_key;
+            END IF;
+            output.norm_input := entry.prefix_key;
+            -- Single-word input: report the same zero-based position the
+            -- general path below would assign to the first word.
+            output.word_pos := 0;
+            output.qwerty_kb_match := 1;
+            output.pg_trgm_sim := 1;
+            output.soundex_sim := 1;
+            RETURN NEXT output;
+            RETURN;
+        END IF;
+    END IF;
+
+    <<word_loop>>
+    FOREACH word IN ARRAY word_list LOOP
+        w_pos := w_pos + 1;
+        input := evergreen.lowercase(word);
+        i_len := CHARACTER_LENGTH(input);
+        l_maxED := maxED;
+
+        -- Candidate dictionary keys: the word itself, its prefix when it is
+        -- longer than prefix_length, and the generated edits of that key.
+        IF CHARACTER_LENGTH(input) > prefix_length THEN
+            prefix_key := SUBSTRING(input FROM 1 FOR prefix_length);
+            edit_list := ARRAY[input,prefix_key] || search.symspell_generate_edits(prefix_key, 1, l_maxED);
+        ELSE
+            edit_list := input || search.symspell_generate_edits(input, 1, l_maxED);
+        END IF;
+
+        -- Visit the longest candidate keys first.
+        SELECT ARRAY_AGG(x ORDER BY CHARACTER_LENGTH(x) DESC) INTO edit_list FROM UNNEST(edit_list) x;
+
+        -- Per-word state
+        output_list := '{}';
+        seen_list := '{}';
+        global_ed := NULL;
+
+        <<entry_key_loop>>
+        FOREACH entry_key IN ARRAY edit_list LOOP
+            smallest_ed := -1;
+            IF global_ed IS NOT NULL THEN
+                smallest_ed := global_ed;
+            END IF;
+
+            FOR entry IN EXECUTE
+                'SELECT ' || suggestions_col || ' AS suggestions,
+                        ' || count_col || ' AS count,
+                        prefix_key
+                   FROM search.symspell_dictionary
+                  WHERE prefix_key = $1
+                        AND ' || suggestions_col || ' IS NOT NULL'
+                USING entry_key
+            LOOP
+
+                -- Map each not-yet-seen suggestion of this entry to its edit
+                -- distance from the input; distances outside 0..l_maxED and
+                -- suggestions whose length differs by more than maxED are dropped.
+                SELECT HSTORE(
+                           ARRAY_AGG(
+                               ARRAY[s, evergreen.levenshtein_damerau_edistance(input,s,l_maxED)::TEXT]
+                               ORDER BY evergreen.levenshtein_damerau_edistance(input,s,l_maxED) DESC
+                           )
+                       )
+                  INTO good_suggs
+                  FROM UNNEST(entry.suggestions) s
+                 WHERE (ABS(CHARACTER_LENGTH(s) - i_len) <= maxED AND evergreen.levenshtein_damerau_edistance(input,s,l_maxED) BETWEEN 0 AND l_maxED)
+                       AND NOT seen_list @> ARRAY[s];
+
+                CONTINUE WHEN good_suggs IS NULL;
+
+                FOR sugg, output.suggestion_count IN EXECUTE
+                    'SELECT prefix_key, ' || count_col || '
+                       FROM search.symspell_dictionary
+                      WHERE prefix_key = ANY ($1)
+                            AND ' || count_col || ' >= $2'
+                    USING AKEYS(good_suggs), COALESCE(count_threshold, 1)
+                LOOP
+
+                    output.lev_distance := good_suggs->sugg;
+                    seen_list := seen_list || sugg;
+
+                    -- Track the smallest edit distance among suggestions from this prefix key.
+                    IF smallest_ed = -1 OR output.lev_distance < smallest_ed THEN
+                        smallest_ed := output.lev_distance;
+                    END IF;
+
+                    -- Track the smallest edit distance for all prefix keys for this word.
+                    IF global_ed IS NULL OR smallest_ed < global_ed THEN
+                        global_ed := smallest_ed;
+                        -- And if low verbosity, ignore suggs with a larger distance from here on.
+                        IF verbosity <= 1 THEN
+                            l_maxED := global_ed;
+                        END IF;
+                    END IF;
+
+                    -- Lev distance is our main similarity measure. While
+                    -- trgm or soundex similarity could be the main filter,
+                    -- Lev is both language agnostic and faster.
+                    --
+                    -- Here we will skip suggestions that have a longer edit distance
+                    -- than the shortest we've already found. This is simply an
+                    -- optimization that allows us to avoid further processing
+                    -- of this entry. It would be filtered out later.
+                    CONTINUE WHEN output.lev_distance > global_ed AND verbosity <= 1;
+
+                    -- If we have an exact match on the suggestion key we can also avoid
+                    -- some function calls.
+                    IF output.lev_distance = 0 THEN
+                        output.qwerty_kb_match := 1;
+                        output.pg_trgm_sim := 1;
+                        output.soundex_sim := 1;
+                    ELSE
+                        -- The weights are integers, not booleans: compare with 0
+                        -- explicitly. Using the integer itself as the condition
+                        -- only converts for 0 and 1 and raises for any other weight.
+                        IF COALESCE(kbdist_weight, 0) <> 0 THEN
+                            output.qwerty_kb_match := evergreen.qwerty_keyboard_distance_match(input, sugg);
+                        ELSE
+                            output.qwerty_kb_match := 0;
+                        END IF;
+                        IF COALESCE(pg_trgm_weight, 0) <> 0 THEN
+                            output.pg_trgm_sim := similarity(input, sugg);
+                        ELSE
+                            output.pg_trgm_sim := 0;
+                        END IF;
+                        IF COALESCE(soundex_weight, 0) <> 0 THEN
+                            output.soundex_sim := difference(input, sugg) / 4.0;
+                        ELSE
+                            output.soundex_sim := 0;
+                        END IF;
+                    END IF;
+
+                    -- Fill in some fields
+                    IF xfer_case AND input <> word THEN
+                        output.suggestion := search.symspell_transfer_casing(word, sugg);
+                    ELSE
+                        output.suggestion := sugg;
+                    END IF;
+                    output.prefix_key := entry.prefix_key;
+                    output.prefix_key_count := entry.count;
+                    output.input := word;
+                    output.norm_input := input;
+                    output.word_pos := w_pos;
+
+                    -- We can't "cache" a set of generated records directly, so
+                    -- here we build up an array of search.symspell_lookup_output
+                    -- records that we can reconstitute later as a table using UNNEST().
+                    output_list := output_list || output;
+
+                    EXIT entry_key_loop WHEN smallest_ed = 0 AND verbosity = 0; -- exact match early exit
+                    CONTINUE entry_key_loop WHEN smallest_ed = 0 AND verbosity = 1; -- exact match early jump to the next key
+
+                END LOOP; -- loop over suggestions
+            END LOOP; -- loop over entries
+        END LOOP; -- loop over entry_keys
+
+        -- Now we're done examining this word
+        IF verbosity = 0 THEN
+            -- Return the "best" suggestion from the smallest edit
+            -- distance group.  We define best based on the weighting
+            -- of the non-lev similarity measures and use the suggestion
+            -- use count to break ties.
+            RETURN QUERY
+                SELECT * FROM UNNEST(output_list)
+                    ORDER BY lev_distance,
+                        (soundex_sim * COALESCE(soundex_weight,0))
+                            + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
+                            + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
+                        suggestion_count DESC
+                        LIMIT 1;
+        ELSIF verbosity = 1 THEN
+            -- Return all suggestions from the smallest edit distance group.
+            -- Whenever anything was found for this word, smallest_ed leaves
+            -- the key loop equal to global_ed, the word's overall minimum.
+            RETURN QUERY
+                SELECT * FROM UNNEST(output_list) WHERE lev_distance = smallest_ed
+                    ORDER BY (soundex_sim * COALESCE(soundex_weight,0))
+                            + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
+                            + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
+                        suggestion_count DESC;
+        ELSIF verbosity = 2 THEN
+            -- Return everything we find, along with relevant stats
+            RETURN QUERY
+                SELECT * FROM UNNEST(output_list)
+                    ORDER BY lev_distance,
+                        (soundex_sim * COALESCE(soundex_weight,0))
+                            + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
+                            + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
+                        suggestion_count DESC;
+        ELSIF verbosity = 3 THEN
+            -- Return everything we find from the two smallest edit distance groups
+            RETURN QUERY
+                SELECT * FROM UNNEST(output_list)
+                    WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) ORDER BY 1 LIMIT 2)
+                    ORDER BY lev_distance,
+                        (soundex_sim * COALESCE(soundex_weight,0))
+                            + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
+                            + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
+                        suggestion_count DESC;
+        ELSIF verbosity = 4 THEN
+            -- Return everything we find from the two smallest edit distance groups that are NOT 0 distance
+            RETURN QUERY
+                SELECT * FROM UNNEST(output_list)
+                    WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) WHERE lev_distance > 0 ORDER BY 1 LIMIT 2)
+                    ORDER BY lev_distance,
+                        (soundex_sim * COALESCE(soundex_weight,0))
+                            + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
+                            + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
+                        suggestion_count DESC;
+        END IF;
+    END LOOP; -- loop over words
+END;
+$function$;
+
+COMMIT; -- end of the upgrade transaction opened by BEGIN at the top of the script
+