word_pos INT
);
-CREATE OR REPLACE FUNCTION search.symspell_lookup (
- raw_input TEXT,
- search_class TEXT,
- verbosity INT DEFAULT 2,
- xfer_case BOOL DEFAULT FALSE,
- count_threshold INT DEFAULT 1,
- soundex_weight INT DEFAULT 0,
- pg_trgm_weight INT DEFAULT 0,
- kbdist_weight INT DEFAULT 0
-) RETURNS SETOF search.symspell_lookup_output AS $F$
+
+CREATE OR REPLACE FUNCTION search.symspell_lookup(
+ raw_input text,
+ search_class text,
+ verbosity integer DEFAULT 2,
+ xfer_case boolean DEFAULT false,
+ count_threshold integer DEFAULT 1,
+ soundex_weight integer DEFAULT 0,
+ pg_trgm_weight integer DEFAULT 0,
+ kbdist_weight integer DEFAULT 0
+) RETURNS SETOF search.symspell_lookup_output LANGUAGE plpgsql AS $function$
DECLARE
prefix_length INT;
maxED INT;
+ good_suggs HSTORE;
word_list TEXT[];
edit_list TEXT[] := '{}';
seen_list TEXT[] := '{}';
w_pos INT := -1;
smallest_ed INT := -1;
global_ed INT;
+ i_len INT;
+ l_maxED INT;
BEGIN
SELECT value::INT INTO prefix_length FROM config.internal_flag WHERE name = 'symspell.prefix_length' AND enabled;
prefix_length := COALESCE(prefix_length, 6);
prefix_key
FROM search.symspell_dictionary
WHERE prefix_key = $1
- AND '||search_class||'_count >= $2
- AND '||search_class||'_suggestions @> ARRAY[$1]'
+ AND '||search_class||'_count >= $2
+ AND '||search_class||'_suggestions @> ARRAY[$1]'
INTO entry USING evergreen.lowercase(word_list[1]), COALESCE(count_threshold,1);
IF entry.prefix_key IS NOT NULL THEN
output.lev_distance := 0; -- definitionally
FOREACH word IN ARRAY word_list LOOP
w_pos := w_pos + 1;
input := evergreen.lowercase(word);
+ i_len := CHARACTER_LENGTH(input);
+ l_maxED := maxED;
IF CHARACTER_LENGTH(input) > prefix_length THEN
prefix_key := SUBSTRING(input FROM 1 FOR prefix_length);
- edit_list := ARRAY[input,prefix_key] || search.symspell_generate_edits(prefix_key, 1, maxED);
+ edit_list := ARRAY[input,prefix_key] || search.symspell_generate_edits(prefix_key, 1, l_maxED);
ELSE
- edit_list := input || search.symspell_generate_edits(input, 1, maxED);
+ edit_list := input || search.symspell_generate_edits(input, 1, l_maxED);
END IF;
SELECT ARRAY_AGG(x ORDER BY CHARACTER_LENGTH(x) DESC) INTO edit_list FROM UNNEST(edit_list) x;
IF global_ed IS NOT NULL THEN
smallest_ed := global_ed;
END IF;
+
FOR entry IN EXECUTE
'SELECT '||search_class||'_suggestions AS suggestions,
'||search_class||'_count AS count,
prefix_key
FROM search.symspell_dictionary
WHERE prefix_key = $1
- AND '||search_class||'_suggestions IS NOT NULL'
+ AND '||search_class||'_suggestions IS NOT NULL'
USING entry_key
LOOP
- FOREACH sugg IN ARRAY entry.suggestions LOOP
- IF NOT seen_list @> ARRAY[sugg] THEN
- seen_list := seen_list || sugg;
- IF input = sugg THEN -- exact match, no need to spend time on a call
- output.lev_distance := 0;
- output.suggestion_count = entry.count;
- ELSIF ABS(CHARACTER_LENGTH(input) - CHARACTER_LENGTH(sugg)) > maxED THEN
- -- They are definitionally too different to consider, just move on.
- CONTINUE;
- ELSE
- --output.lev_distance := levenshtein_less_equal(
- output.lev_distance := evergreen.levenshtein_damerau_edistance(
- input,
- sugg,
- maxED
- );
- IF output.lev_distance < 0 THEN
- -- The Perl module returns -1 for "more distant than max".
- output.lev_distance := maxED + 1;
- -- This short-circuit's the count test below for speed, bypassing
- -- a couple useless tests.
- output.suggestion_count := -1;
- ELSE
- EXECUTE 'SELECT '||search_class||'_count FROM search.symspell_dictionary WHERE prefix_key = $1'
- INTO output.suggestion_count USING sugg;
- END IF;
- END IF;
- -- The caller passes a minimum suggestion count threshold (or uses
- -- the default of 0) and if the suggestion has that many or less uses
- -- then we move on to the next suggestion, since this one is too rare.
- CONTINUE WHEN output.suggestion_count < COALESCE(count_threshold,1);
+ SELECT HSTORE(
+ ARRAY_AGG(
+ ARRAY[s, evergreen.levenshtein_damerau_edistance(input,s,l_maxED)::TEXT]
+ ORDER BY evergreen.levenshtein_damerau_edistance(input,s,l_maxED) DESC
+ )
+ )
+ INTO good_suggs
+ FROM UNNEST(entry.suggestions) s
+ WHERE (ABS(CHARACTER_LENGTH(s) - i_len) <= maxEd AND evergreen.levenshtein_damerau_edistance(input,s,l_maxED) BETWEEN 0 AND l_maxED)
+ AND NOT seen_list @> ARRAY[s];
+
+ CONTINUE WHEN good_suggs IS NULL;
+
+ FOR sugg, output.suggestion_count IN EXECUTE
+ 'SELECT prefix_key, '||search_class||'_count
+ FROM search.symspell_dictionary
+ WHERE prefix_key = ANY ($1)
+ AND '||search_class||'_count >= $2'
+ USING AKEYS(good_suggs), COALESCE(count_threshold,1)
+ LOOP
+
+ output.lev_distance := good_suggs->sugg;
+ seen_list := seen_list || sugg;
+
+ -- Track the smallest edit distance among suggestions from this prefix key.
+ IF smallest_ed = -1 OR output.lev_distance < smallest_ed THEN
+ smallest_ed := output.lev_distance;
+ END IF;
- -- Track the smallest edit distance among suggestions from this prefix key.
- IF smallest_ed = -1 OR output.lev_distance < smallest_ed THEN
- smallest_ed := output.lev_distance;
+ -- Track the smallest edit distance for all prefix keys for this word.
+ IF global_ed IS NULL OR smallest_ed < global_ed THEN
+ global_ed = smallest_ed;
+ -- And if low verbosity, ignore suggs with a larger distance from here on.
+ IF verbosity <= 1 THEN
+ l_maxED := global_ed;
END IF;
+ END IF;
- -- Track the smallest edit distance for all prefix keys for this word.
- IF global_ed IS NULL OR smallest_ed < global_ed THEN
- global_ed = smallest_ed;
+ -- Lev distance is our main similarity measure. While
+ -- trgm or soundex similarity could be the main filter,
+ -- Lev is both language agnostic and faster.
+ --
+ -- Here we will skip suggestions that have a longer edit distance
+ -- than the shortest we've already found. This is simply an
+ -- optimization that allows us to avoid further processing
+ -- of this entry. It would be filtered out later.
+ CONTINUE WHEN output.lev_distance > global_ed AND verbosity <= 1;
+
+ -- If we have an exact match on the suggestion key we can also avoid
+ -- some function calls.
+ IF output.lev_distance = 0 THEN
+ output.qwerty_kb_match := 1;
+ output.pg_trgm_sim := 1;
+ output.soundex_sim := 1;
+ ELSE
+ IF kbdist_weight THEN
+ output.qwerty_kb_match := evergreen.qwerty_keyboard_distance_match(input, sugg);
+ ELSE
+ output.qwerty_kb_match := 0;
END IF;
+ IF pg_trgm_weight THEN
+ output.pg_trgm_sim := similarity(input, sugg);
+ ELSE
+ output.pg_trgm_sim := 0;
+ END IF;
+ IF soundex_weight THEN
+ output.soundex_sim := difference(input, sugg) / 4.0;
+ ELSE
+ output.soundex_sim := 0;
+ END IF;
+ END IF;
- -- Only proceed if the edit distance is <= the max for the dictionary.
- IF output.lev_distance <= maxED THEN
- IF output.lev_distance > global_ed AND verbosity <= 1 THEN
- -- Lev distance is our main similarity measure. While
- -- trgm or soundex similarity could be the main filter,
- -- Lev is both language agnostic and faster.
- --
- -- Here we will skip suggestions that have a longer edit distance
- -- than the shortest we've already found. This is simply an
- -- optimization that allows us to avoid further processing
- -- of this entry. It would be filtered out later.
+ -- Fill in some fields
+ IF xfer_case AND input <> word THEN
+ output.suggestion := search.symspell_transfer_casing(word, sugg);
+ ELSE
+ output.suggestion := sugg;
+ END IF;
+ output.prefix_key := entry.prefix_key;
+ output.prefix_key_count := entry.count;
+ output.input := word;
+ output.norm_input := input;
+ output.word_pos := w_pos;
- CONTINUE;
- END IF;
+ -- We can't "cache" a set of generated records directly, so
+ -- here we build up an array of search.symspell_lookup_output
+ -- records that we can revivicate later as a table using UNNEST().
+ output_list := output_list || output;
- -- If we have an exact match on the suggestion key we can also avoid
- -- some function calls.
- IF output.lev_distance = 0 THEN
- output.qwerty_kb_match := 1;
- output.pg_trgm_sim := 1;
- output.soundex_sim := 1;
- ELSE
- output.qwerty_kb_match := evergreen.qwerty_keyboard_distance_match(input, sugg);
- output.pg_trgm_sim := similarity(input, sugg);
- output.soundex_sim := difference(input, sugg) / 4.0;
- END IF;
+ EXIT entry_key_loop WHEN smallest_ed = 0 AND verbosity = 0; -- exact match early exit
+ CONTINUE entry_key_loop WHEN smallest_ed = 0 AND verbosity = 1; -- exact match early jump to the next key
- -- Fill in some fields
- IF xfer_case THEN
- output.suggestion := search.symspell_transfer_casing(word, sugg);
- ELSE
- output.suggestion := sugg;
- END IF;
- output.prefix_key := entry.prefix_key;
- output.prefix_key_count := entry.count;
- output.input := word;
- output.norm_input := input;
- output.word_pos := w_pos;
-
- -- We can't "cache" a set of generated records directly, so
- -- here we build up an array of search.symspell_lookup_output
- -- records that we can revivicate later as a table using UNNEST().
- output_list := output_list || output;
-
- EXIT entry_key_loop WHEN smallest_ed = 0 AND verbosity = 0; -- exact match early exit
- CONTINUE entry_key_loop WHEN smallest_ed = 0 AND verbosity = 1; -- exact match early jump to the next key
- END IF; -- maxED test
- END IF; -- suggestion not seen test
END LOOP; -- loop over suggestions
END LOOP; -- loop over entries
END LOOP; -- loop over entry_keys
END IF;
END LOOP; -- loop over words
END;
-$F$ LANGUAGE PLPGSQL;
+$function$;
CREATE OR REPLACE FUNCTION search.symspell_build_raw_entry (
raw_input TEXT,
--- /dev/null
+BEGIN;
+
+SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version);
+
+CREATE OR REPLACE FUNCTION search.symspell_lookup(
+ raw_input text,
+ search_class text,
+ verbosity integer DEFAULT 2,
+ xfer_case boolean DEFAULT false,
+ count_threshold integer DEFAULT 1,
+ soundex_weight integer DEFAULT 0,
+ pg_trgm_weight integer DEFAULT 0,
+ kbdist_weight integer DEFAULT 0
+) RETURNS SETOF search.symspell_lookup_output
+ LANGUAGE plpgsql
+AS $function$
+DECLARE
+ prefix_length INT;
+ maxED INT;
+ good_suggs HSTORE;
+ word_list TEXT[];
+ edit_list TEXT[] := '{}';
+ seen_list TEXT[] := '{}';
+ output search.symspell_lookup_output;
+ output_list search.symspell_lookup_output[];
+ entry RECORD;
+ entry_key TEXT;
+ prefix_key TEXT;
+ sugg TEXT;
+ input TEXT;
+ word TEXT;
+ w_pos INT := -1;
+ smallest_ed INT := -1;
+ global_ed INT;
+ i_len INT;
+ l_maxED INT;
+BEGIN
+ SELECT value::INT INTO prefix_length FROM config.internal_flag WHERE name = 'symspell.prefix_length' AND enabled;
+ prefix_length := COALESCE(prefix_length, 6);
+
+ SELECT value::INT INTO maxED FROM config.internal_flag WHERE name = 'symspell.max_edit_distance' AND enabled;
+ maxED := COALESCE(maxED, 3);
+
+ word_list := ARRAY_AGG(x) FROM search.symspell_parse_words(raw_input) x;
+
+ -- Common case exact match test for preformance
+ IF verbosity = 0 AND CARDINALITY(word_list) = 1 AND CHARACTER_LENGTH(word_list[1]) <= prefix_length THEN
+ EXECUTE
+ 'SELECT '||search_class||'_suggestions AS suggestions,
+ '||search_class||'_count AS count,
+ prefix_key
+ FROM search.symspell_dictionary
+ WHERE prefix_key = $1
+ AND '||search_class||'_count >= $2
+ AND '||search_class||'_suggestions @> ARRAY[$1]'
+ INTO entry USING evergreen.lowercase(word_list[1]), COALESCE(count_threshold,1);
+ IF entry.prefix_key IS NOT NULL THEN
+ output.lev_distance := 0; -- definitionally
+ output.prefix_key := entry.prefix_key;
+ output.prefix_key_count := entry.count;
+ output.suggestion_count := entry.count;
+ output.input := word_list[1];
+ IF xfer_case THEN
+ output.suggestion := search.symspell_transfer_casing(output.input, entry.prefix_key);
+ ELSE
+ output.suggestion := entry.prefix_key;
+ END IF;
+ output.norm_input := entry.prefix_key;
+ output.qwerty_kb_match := 1;
+ output.pg_trgm_sim := 1;
+ output.soundex_sim := 1;
+ RETURN NEXT output;
+ RETURN;
+ END IF;
+ END IF;
+
+ <<word_loop>>
+ FOREACH word IN ARRAY word_list LOOP
+ w_pos := w_pos + 1;
+ input := evergreen.lowercase(word);
+ i_len := CHARACTER_LENGTH(input);
+ l_maxED := maxED;
+
+ IF CHARACTER_LENGTH(input) > prefix_length THEN
+ prefix_key := SUBSTRING(input FROM 1 FOR prefix_length);
+ edit_list := ARRAY[input,prefix_key] || search.symspell_generate_edits(prefix_key, 1, l_maxED);
+ ELSE
+ edit_list := input || search.symspell_generate_edits(input, 1, l_maxED);
+ END IF;
+
+ SELECT ARRAY_AGG(x ORDER BY CHARACTER_LENGTH(x) DESC) INTO edit_list FROM UNNEST(edit_list) x;
+
+ output_list := '{}';
+ seen_list := '{}';
+ global_ed := NULL;
+
+ <<entry_key_loop>>
+ FOREACH entry_key IN ARRAY edit_list LOOP
+ smallest_ed := -1;
+ IF global_ed IS NOT NULL THEN
+ smallest_ed := global_ed;
+ END IF;
+
+ FOR entry IN EXECUTE
+ 'SELECT '||search_class||'_suggestions AS suggestions,
+ '||search_class||'_count AS count,
+ prefix_key
+ FROM search.symspell_dictionary
+ WHERE prefix_key = $1
+ AND '||search_class||'_suggestions IS NOT NULL'
+ USING entry_key
+ LOOP
+
+ SELECT HSTORE(
+ ARRAY_AGG(
+ ARRAY[s, evergreen.levenshtein_damerau_edistance(input,s,l_maxED)::TEXT]
+ ORDER BY evergreen.levenshtein_damerau_edistance(input,s,l_maxED) DESC
+ )
+ )
+ INTO good_suggs
+ FROM UNNEST(entry.suggestions) s
+ WHERE (ABS(CHARACTER_LENGTH(s) - i_len) <= maxEd AND evergreen.levenshtein_damerau_edistance(input,s,l_maxED) BETWEEN 0 AND l_maxED)
+ AND NOT seen_list @> ARRAY[s];
+
+ CONTINUE WHEN good_suggs IS NULL;
+
+ FOR sugg, output.suggestion_count IN EXECUTE
+ 'SELECT prefix_key, '||search_class||'_count
+ FROM search.symspell_dictionary
+ WHERE prefix_key = ANY ($1)
+ AND '||search_class||'_count >= $2'
+ USING AKEYS(good_suggs), COALESCE(count_threshold,1)
+ LOOP
+
+ output.lev_distance := good_suggs->sugg;
+ seen_list := seen_list || sugg;
+
+ -- Track the smallest edit distance among suggestions from this prefix key.
+ IF smallest_ed = -1 OR output.lev_distance < smallest_ed THEN
+ smallest_ed := output.lev_distance;
+ END IF;
+
+ -- Track the smallest edit distance for all prefix keys for this word.
+ IF global_ed IS NULL OR smallest_ed < global_ed THEN
+ global_ed = smallest_ed;
+ -- And if low verbosity, ignore suggs with a larger distance from here on.
+ IF verbosity <= 1 THEN
+ l_maxED := global_ed;
+ END IF;
+ END IF;
+
+ -- Lev distance is our main similarity measure. While
+ -- trgm or soundex similarity could be the main filter,
+ -- Lev is both language agnostic and faster.
+ --
+ -- Here we will skip suggestions that have a longer edit distance
+ -- than the shortest we've already found. This is simply an
+ -- optimization that allows us to avoid further processing
+ -- of this entry. It would be filtered out later.
+ CONTINUE WHEN output.lev_distance > global_ed AND verbosity <= 1;
+
+ -- If we have an exact match on the suggestion key we can also avoid
+ -- some function calls.
+ IF output.lev_distance = 0 THEN
+ output.qwerty_kb_match := 1;
+ output.pg_trgm_sim := 1;
+ output.soundex_sim := 1;
+ ELSE
+ IF kbdist_weight THEN
+ output.qwerty_kb_match := evergreen.qwerty_keyboard_distance_match(input, sugg);
+ ELSE
+ output.qwerty_kb_match := 0;
+ END IF;
+ IF pg_trgm_weight THEN
+ output.pg_trgm_sim := similarity(input, sugg);
+ ELSE
+ output.pg_trgm_sim := 0;
+ END IF;
+ IF soundex_weight THEN
+ output.soundex_sim := difference(input, sugg) / 4.0;
+ ELSE
+ output.soundex_sim := 0;
+ END IF;
+ END IF;
+
+ -- Fill in some fields
+ IF xfer_case AND input <> word THEN
+ output.suggestion := search.symspell_transfer_casing(word, sugg);
+ ELSE
+ output.suggestion := sugg;
+ END IF;
+ output.prefix_key := entry.prefix_key;
+ output.prefix_key_count := entry.count;
+ output.input := word;
+ output.norm_input := input;
+ output.word_pos := w_pos;
+
+ -- We can't "cache" a set of generated records directly, so
+ -- here we build up an array of search.symspell_lookup_output
+ -- records that we can revivicate later as a table using UNNEST().
+ output_list := output_list || output;
+
+ EXIT entry_key_loop WHEN smallest_ed = 0 AND verbosity = 0; -- exact match early exit
+ CONTINUE entry_key_loop WHEN smallest_ed = 0 AND verbosity = 1; -- exact match early jump to the next key
+
+ END LOOP; -- loop over suggestions
+ END LOOP; -- loop over entries
+ END LOOP; -- loop over entry_keys
+
+ -- Now we're done examining this word
+ IF verbosity = 0 THEN
+ -- Return the "best" suggestion from the smallest edit
+ -- distance group. We define best based on the weighting
+ -- of the non-lev similarity measures and use the suggestion
+ -- use count to break ties.
+ RETURN QUERY
+ SELECT * FROM UNNEST(output_list)
+ ORDER BY lev_distance,
+ (soundex_sim * COALESCE(soundex_weight,0))
+ + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
+ + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
+ suggestion_count DESC
+ LIMIT 1;
+ ELSIF verbosity = 1 THEN
+ -- Return all suggestions from the smallest
+ -- edit distance group.
+ RETURN QUERY
+ SELECT * FROM UNNEST(output_list) WHERE lev_distance = smallest_ed
+ ORDER BY (soundex_sim * COALESCE(soundex_weight,0))
+ + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
+ + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
+ suggestion_count DESC;
+ ELSIF verbosity = 2 THEN
+ -- Return everything we find, along with relevant stats
+ RETURN QUERY
+ SELECT * FROM UNNEST(output_list)
+ ORDER BY lev_distance,
+ (soundex_sim * COALESCE(soundex_weight,0))
+ + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
+ + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
+ suggestion_count DESC;
+ ELSIF verbosity = 3 THEN
+ -- Return everything we find from the two smallest edit distance groups
+ RETURN QUERY
+ SELECT * FROM UNNEST(output_list)
+ WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) ORDER BY 1 LIMIT 2)
+ ORDER BY lev_distance,
+ (soundex_sim * COALESCE(soundex_weight,0))
+ + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
+ + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
+ suggestion_count DESC;
+ ELSIF verbosity = 4 THEN
+ -- Return everything we find from the two smallest edit distance groups that are NOT 0 distance
+ RETURN QUERY
+ SELECT * FROM UNNEST(output_list)
+ WHERE lev_distance IN (SELECT DISTINCT lev_distance FROM UNNEST(output_list) WHERE lev_distance > 0 ORDER BY 1 LIMIT 2)
+ ORDER BY lev_distance,
+ (soundex_sim * COALESCE(soundex_weight,0))
+ + (pg_trgm_sim * COALESCE(pg_trgm_weight,0))
+ + (qwerty_kb_match * COALESCE(kbdist_weight,0)) DESC,
+ suggestion_count DESC;
+ END IF;
+ END LOOP; -- loop over words
+END;
+$function$;
+
+COMMIT;
+