\qecho 'form is available inside a comment at the end of this upgrade sub-'
\qecho 'script so you do not need to copy this comment from the psql ouptut.'
\qecho ''
-\qecho 'ALTER TABLE search.symspell_dictionary SET UNLOGGED;'
-\qecho 'TRUNCATE search.symspell_dictionary;'
-\qecho ''
\qecho '\\a'
\qecho '\\t'
-\qecho '\\o dym.prime.sql'
-\qecho select $y$select $y$||x.id||$y$, '''title''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''title''');$y$ from metabib.title_field_entry x;
-\qecho select $y$select $y$||x.id||$y$, '''author''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''author''');$y$ from metabib.author_field_entry x;
-\qecho select $y$select $y$||x.id||$y$, '''series''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''series''');$y$ from metabib.series_field_entry x;
-\qecho select $y$select $y$||x.id||$y$, '''identifier''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''identifier''');$y$ from metabib.identifier_field_entry x;
-\qecho select $y$select $y$||x.id||$y$, '''keyword''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''keyword''');$y$ from metabib.keyword_field_entry x;
-\qecho select $y$select $y$||x.id||$y$, '''subject''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''subject''');$y$ from metabib.subject_field_entry x;
+-- The lines below print the psql commands that dump each metabib field
+-- entry table to its own file, one "class:value" row per line.  A literal
+-- single quote inside a single-quoted \qecho argument has to be doubled;
+-- otherwise psql ends the quoted string there and the printed SELECT loses
+-- the quotes around the class prefix.
+\qecho '\\o symspell-title.txt'
+\qecho 'select ''title:''||value from metabib.title_field_entry;'
+\qecho '\\o symspell-author.txt'
+\qecho 'select ''author:''||value from metabib.author_field_entry;'
+\qecho '\\o symspell-subject.txt'
+\qecho 'select ''subject:''||value from metabib.subject_field_entry;'
+\qecho '\\o symspell-series.txt'
+\qecho 'select ''series:''||value from metabib.series_field_entry;'
+\qecho '\\o symspell-identifier.txt'
+\qecho 'select ''identifier:''||value from metabib.identifier_field_entry;'
+\qecho '\\o symspell-keyword.txt'
+\qecho 'select ''keyword:''||value from metabib.keyword_field_entry;'
\qecho '\\o'
+\qecho '\\a'
+\qecho '\\t'
+\qecho ''
+\qecho 'Then, at the command line:'
\qecho ''
-\qecho '\\i dym.prime.sql'
+-- Prints the shell command that turns the dumped files into sideload.sql.
+-- Inside a single-quoted psql argument a backslash escapes the following
+-- character, so the trailing shell line-continuation backslash must be
+-- written as \\ to keep it from swallowing the closing quote.
+\qecho '$ ~/Evergreen-tarball-path/Open-ILS/src/support-scripts/symspell-sideload.pl \\'
+\qecho ' symspell-title.txt \\'
+\qecho ' symspell-author.txt \\'
+\qecho ' symspell-subject.txt \\'
+\qecho ' symspell-series.txt \\'
+\qecho ' symspell-identifier.txt \\'
+\qecho ' symspell-keyword.txt > sideload.sql'
+\qecho ''
+\qecho 'And, back in psql'
+\qecho ''
+\qecho 'ALTER TABLE search.symspell_dictionary SET UNLOGGED;'
+\qecho 'TRUNCATE search.symspell_dictionary;'
+\qecho ''
+\qecho '\\i sideload.sql'
\qecho ''
\qecho 'CLUSTER search.symspell_dictionary USING symspell_dictionary_pkey;'
\qecho 'REINDEX TABLE search.symspell_dictionary;'
/* To run by hand:
-ALTER TABLE search.symspell_dictionary SET UNLOGGED;
-TRUNCATE search.symspell_dictionary;
-
\a
\t
-\o dym.prime.sql
-select $y$select $y$||x.id||$y$, 'title', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'title');$y$ from metabib.title_field_entry x;
-select $y$select $y$||x.id||$y$, 'author', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'author');$y$ from metabib.author_field_entry x;
-select $y$select $y$||x.id||$y$, 'series', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'series');$y$ from metabib.series_field_entry x;
-select $y$select $y$||x.id||$y$, 'identifier', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'identifier');$y$ from metabib.identifier_field_entry x;
-select $y$select $y$||x.id||$y$, 'keyword', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'keyword');$y$ from metabib.keyword_field_entry x;
-select $y$select $y$||x.id||$y$, 'subject', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'subject');$y$ from metabib.subject_field_entry x;
+
+\o symspell-title.txt
+select 'title:'||value from metabib.title_field_entry;
+
+\o symspell-author.txt
+select 'author:'||value from metabib.author_field_entry;
+
+\o symspell-subject.txt
+select 'subject:'||value from metabib.subject_field_entry;
+
+\o symspell-series.txt
+select 'series:'||value from metabib.series_field_entry;
+
+\o symspell-identifier.txt
+select 'identifier:'||value from metabib.identifier_field_entry;
+
+\o symspell-keyword.txt
+select 'keyword:'||value from metabib.keyword_field_entry;
+
\o
+\a
+\t
+
+// Then, at the command line:
+
+$ ~/Evergreen-tarball-path/Open-ILS/src/support-scripts/symspell-sideload.pl \
+ symspell-title.txt \
+ symspell-author.txt \
+ symspell-subject.txt \
+ symspell-series.txt \
+ symspell-identifier.txt \
+ symspell-keyword.txt > sideload.sql
+
+// And, back in psql
+
+ALTER TABLE search.symspell_dictionary SET UNLOGGED;
+TRUNCATE search.symspell_dictionary;
-\i dym.prime.sql
+\i sideload.sql
CLUSTER search.symspell_dictionary USING symspell_dictionary_pkey;
REINDEX TABLE search.symspell_dictionary;
--- /dev/null
+#!/usr/bin/perl
+use warnings;
+use strict;
+use List::MoreUtils qw/uniq/;
+
+# Turn on autoflush for the currently selected output handle (STDOUT unless
+# changed), so generated SQL is not held back in a buffer.
+$| = 1;
+
+# Words longer than $plen characters are additionally indexed under their
+# first $plen characters (the "prefix key").
+my $plen = 6;
+# symspell_generate_edits() stops recursing once its depth argument reaches
+# this value, which bounds how many characters are deleted from a key.
+my $maxed = 3;
+
+# prefix key => { "<class>_count" => N, "<class>_suggestions" => { word => 1 } }
+my %dict;
+# Scratch variables for the elapsed-time progress messages sent to STDERR.
+my $etime;
+my $secs;
+
+# Search classes handled by this script.  Only the keys are consulted in
+# this file (to name the *_count / *_suggestions slots and output columns);
+# the numeric values are not read anywhere here.
+my %classes = (
+ title => 0,
+ author => 1,
+ subject => 2,
+ series => 3,
+ keyword => 4,
+ identifier => 5
+);
+
+# Build the in-memory dictionary from the files named on the command line
+# (or STDIN).  Every input line is expected to look like "class:value",
+# where class is one of the keys of %classes.  For each distinct word on a
+# line the per-class count of that word is incremented; the first time a
+# word is seen in a class it is also recorded as a suggestion under itself,
+# under its $plen-character prefix (when it is longer than $plen) and under
+# every deletion edit returned by symspell_generate_edits().
+my $stime   = time;
+my $skipped = 0;    # input lines that carried no recognised class prefix
+
+# Returns a fresh dictionary entry holding a zero count and an empty
+# suggestion set for every search class.
+my $new_entry = sub {
+    my %entry = map {$_.'_count' => 0, $_.'_suggestions' => {}} keys %classes;
+    return \%entry;
+};
+
+while (<>) {
+    chomp();
+    $_ = lc($_);
+    my ($class, $data) = m/^(\w+):(.*$)/;
+
+    # A line without a known "class:" prefix (for instance the remainder of
+    # a value that contained an embedded newline) cannot be attributed to a
+    # search class.  Processing it anyway would raise uninitialized-value
+    # warnings and leave junk all-zero entries in %dict, so skip it and
+    # report the total once reading is finished.
+    unless (defined($class) && exists $classes{$class}) {
+        $skipped++;
+        next;
+    }
+
+    my $ckey = $class.'_count';
+    my $skey = $class.'_suggestions';
+
+    my @words;
+    while ( $data =~ m/([\w\d]+'*[\w\d]*)/g ) {
+        push @words, $1;
+    }
+
+    # uniq: a word repeated within one line is counted once for that line.
+    for my $raw (uniq @words) {
+        my $key = $raw;
+        $dict{$key} //= $new_entry->();
+        $dict{$key}{$ckey}++;
+
+        # Prefix and edit keys only need generating the first time the word
+        # shows up in this class.
+        next unless $dict{$key}{$ckey} == 1;
+
+        $dict{$key}{$skey}{$raw} = 1;
+
+        # Long words are reachable through their prefix key; from here on
+        # $key is that prefix, so the edits below are edits of the prefix.
+        if (length($raw) > $plen) {
+            $key = substr($raw, 0, $plen);
+            $dict{$key} //= $new_entry->();
+            $dict{$key}{$skey}{$raw} = 1;
+        }
+
+        for my $edit (symspell_generate_edits($key, 1)) {
+            $dict{$edit} //= $new_entry->();
+            $dict{$edit}{$skey}{$raw} = 1;
+        }
+    }
+}
+continue {
+    # Runs after every line, including skipped ones, so progress is still
+    # reported on each 10000th input line.
+    my $line = $.;
+    unless ($line % 10000) {
+        $etime = time;
+        $secs = $etime - $stime;
+        warn "$line lines consumed from input in $secs seconds...\n";
+    }
+}
+
+warn "$skipped input lines without a known search class prefix were skipped.\n"
+    if $skipped;
+
+# Report how long the build phase took, then stream the whole dictionary to
+# STDOUT as a single COPY ... FROM STDIN statement in tab-separated form.
+$etime = time;
+$secs = $etime - $stime;
+warn "Dictionary built in $secs seconds, writing...\n";
+
+$stime = time;
+my $counter = 0;
+
+# One [count column, suggestions column] pair per class, ordered by class name.
+my @keymap = map { [ $_.'_count', $_.'_suggestions' ] } sort keys %classes;
+
+my $column_list = join(', ', map { @$_ } @keymap);
+print 'COPY search.symspell_dictionary (prefix_key, ' . $column_list . ") FROM STDIN;\n";
+
+while ( my ($key, $cl_dict) = each %dict ) {
+    $counter++;
+
+    my @fields = ($key);
+    for my $pair (@keymap) {
+        my ($count_col, $sugg_col) = @$pair;
+        my @suggestions = keys %{ $cl_dict->{$sugg_col} };
+
+        # An empty suggestion set is emitted as \N; otherwise the words are
+        # wrapped in braces and separated by commas.
+        my $sugg_field = @suggestions ? '{' . join(',', @suggestions) . '}' : '\N';
+
+        push @fields, $cl_dict->{$count_col}, $sugg_field;
+    }
+    print join("\t", @fields) . "\n";
+
+    # Drop each entry once it has been printed.  Deleting the key most
+    # recently returned by each() is safe during iteration.
+    delete $dict{$key};
+}
+
+# End-of-data marker for COPY.
+print '\.'."\n\n";
+
+
+$etime = time;
+$secs = $etime - $stime;
+warn "$counter dictionary prefix key entries written in $secs seconds.\n";
+
+# Returns the strings obtained from $word by deleting characters.
+#
+#   $word - string to derive deletion edits from
+#   $dist - current depth; callers in this file start at 1
+#
+# Each call removes one character at every position; while $dist is below
+# $maxed it recurses on each shortened string with $dist + 1.  Single
+# deletions of this level come first in the result, followed by the results
+# of the deeper levels.  Only the outermost call ($dist == 1) removes
+# duplicates.
+sub symspell_generate_edits {
+    my ($word, $dist) = @_;
+
+    my @deletions;
+    my @deeper;
+
+    for my $pos (0 .. length($word) - 1) {
+        # Copy the word and cut out the character at $pos.
+        my $shorter = $word;
+        substr($shorter, $pos, 1, '');
+
+        push @deletions, $shorter;
+        push @deeper, symspell_generate_edits($shorter, $dist + 1)
+            if $dist < $maxed;
+    }
+
+    my @edits = (@deletions, @deeper);
+
+    return $dist == 1 ? uniq(@edits) : @edits;
+}
+