From: Mike Rylander Date: Wed, 10 Mar 2021 20:01:29 +0000 (-0500) Subject: symspell sideloader for upgrade speed X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=cc86116ab460635e56b28a558b64e29c0159cf6b;p=working%2FEvergreen.git symspell sideloader for upgrade speed Signed-off-by: Mike Rylander --- diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.symspell.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.symspell.sql index 09cb828186..7a3427ab26 100644 --- a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.symspell.sql +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.symspell.sql @@ -786,21 +786,40 @@ select $z$select $y$select $y$||x.id||$y$, '$z$||x.x||$z$', count(*) from search \qecho 'form is available inside a comment at the end of this upgrade sub-' \qecho 'script so you do not need to copy this comment from the psql ouptut.' \qecho '' -\qecho 'ALTER TABLE search.symspell_dictionary SET UNLOGGED;' -\qecho 'TRUNCATE search.symspell_dictionary;' -\qecho '' \qecho '\\a' \qecho '\\t' -\qecho '\\o dym.prime.sql' -\qecho select $y$select $y$||x.id||$y$, '''title''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''title''');$y$ from metabib.title_field_entry x; -\qecho select $y$select $y$||x.id||$y$, '''author''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''author''');$y$ from metabib.author_field_entry x; -\qecho select $y$select $y$||x.id||$y$, '''series''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''series''');$y$ from metabib.series_field_entry x; -\qecho select $y$select $y$||x.id||$y$, '''identifier''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''identifier''');$y$ from metabib.identifier_field_entry x; -\qecho select $y$select $y$||x.id||$y$, '''keyword''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''keyword''');$y$ from metabib.keyword_field_entry x; -\qecho 
select $y$select $y$||x.id||$y$, '''subject''', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, '''subject''');$y$ from metabib.subject_field_entry x; +\qecho '\\o symspell-title.txt' +\qecho 'select 'title:'||value from metabib.title_field_entry;' +\qecho '\\o symspell-author.txt' +\qecho 'select 'author:'||value from metabib.author_field_entry;' +\qecho '\\o symspell-subject.txt' +\qecho 'select 'subject:'||value from metabib.subject_field_entry;' +\qecho '\\o symspell-series.txt' +\qecho 'select 'series:'||value from metabib.series_field_entry;' +\qecho '\\o symspell-identifier.txt' +\qecho 'select 'identifier:'||value from metabib.identifier_field_entry;' +\qecho '\\o symspell-keyword.txt' +\qecho 'select 'keyword:'||value from metabib.keyword_field_entry;' \qecho '\\o' +\qecho '\\a' +\qecho '\\t' +\qecho '' +\qecho 'Then, at the command line:' \qecho '' -\qecho '\\i dym.prime.sql' +\qecho '$ ~/Evergreen-tarball-path/Open-ILS/src/support-scripts/symspell-sideload.pl \' +\qecho ' symspell-title.txt \' +\qecho ' symspell-author.txt \' +\qecho ' symspell-subject.txt \' +\qecho ' symspell-series.txt \' +\qecho ' symspell-identifier.txt \' +\qecho ' symspell-keyword.txt > sideload.sql' +\qecho '' +\qecho 'And, back in psql' +\qecho '' +\qecho 'ALTER TABLE search.symspell_dictionary SET UNLOGGED;' +\qecho 'TRUNCATE search.symspell_dictionary;' +\qecho '' +\qecho '\\i sideload.sql' \qecho '' \qecho 'CLUSTER search.symspell_dictionary USING symspell_dictionary_pkey;' \qecho 'REINDEX TABLE search.symspell_dictionary;' @@ -810,21 +829,47 @@ select $z$select $y$select $y$||x.id||$y$, '$z$||x.x||$z$', count(*) from search /* To run by hand: -ALTER TABLE search.symspell_dictionary SET UNLOGGED; -TRUNCATE search.symspell_dictionary; - \a \t -\o dym.prime.sql -select $y$select $y$||x.id||$y$, 'title', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'title');$y$ from metabib.title_field_entry x; -select 
$y$select $y$||x.id||$y$, 'author', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'author');$y$ from metabib.author_field_entry x; -select $y$select $y$||x.id||$y$, 'series', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'series');$y$ from metabib.series_field_entry x; -select $y$select $y$||x.id||$y$, 'identifier', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'identifier');$y$ from metabib.identifier_field_entry x; -select $y$select $y$||x.id||$y$, 'keyword', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'keyword');$y$ from metabib.keyword_field_entry x; -select $y$select $y$||x.id||$y$, 'subject', count(*) from search.symspell_build_and_merge_entries($x$$y$ || x.value||$y$$x$, 'subject');$y$ from metabib.subject_field_entry x; + +\o symspell-title.txt +select 'title:'||value from metabib.title_field_entry; + +\o symspell-author.txt +select 'author:'||value from metabib.author_field_entry; + +\o symspell-subject.txt +select 'subject:'||value from metabib.subject_field_entry; + +\o symspell-series.txt +select 'series:'||value from metabib.series_field_entry; + +\o symspell-identifier.txt +select 'identifier:'||value from metabib.identifier_field_entry; + +\o symspell-keyword.txt +select 'keyword:'||value from metabib.keyword_field_entry; + \o +\a +\t + +// Then, at the command line: + +$ ~/Evergreen-tarball-path/Open-ILS/src/support-scripts/symspell-sideload.pl \ + symspell-title.txt \ + symspell-author.txt \ + symspell-subject.txt \ + symspell-series.txt \ + symspell-identifier.txt \ + symspell-keyword.txt > sideload.sql + +// And, back in psql + +ALTER TABLE search.symspell_dictionary SET UNLOGGED; +TRUNCATE search.symspell_dictionary; -\i dym.prime.sql +\i sideload.sql CLUSTER search.symspell_dictionary USING symspell_dictionary_pkey; REINDEX TABLE search.symspell_dictionary; diff --git 
#!/usr/bin/perl
# symspell-sideload.pl (Open-ILS/src/support-scripts/symspell-sideload.pl)
#
# Builds the contents of search.symspell_dictionary outside the database and
# emits them as a single COPY statement on STDOUT.
#
# Input (files named on the command line, or STDIN): one record per line in
# the form "<class>:<text>", where <class> is one of the keys of %classes
# below.  Output is meant to be redirected to a file and run from psql.
#
# NOTE(review): input is decoded as UTF-8 and output is encoded as UTF-8;
# confirm this matches the client_encoding used to produce the input files.
use warnings;
use strict;
use Encode qw/decode/;
use List::MoreUtils qw/uniq/;

$| = 1;

# Words longer than $plen characters are indexed under their first $plen
# characters; deletion edits are generated from that (possibly shortened) key.
my $plen = 6;

# Maximum number of characters removed when generating deletion edits.
my $maxed = 3;

# Search classes recognised in the "<class>:" input prefix.  Only the keys
# are used; each class contributes a <class>_count and a
# <class>_suggestions column to the output.
my %classes = (
    title      => 0,
    author     => 1,
    subject    => 2,
    series     => 3,
    keyword    => 4,
    identifier => 5
);

# Run as a program only when executed directly, so the subs below can be
# loaded (require/do) without consuming input.
main() unless caller;

# Read all input, build the dictionary in memory, then write the COPY data.
sub main {
    binmode STDOUT, ':encoding(UTF-8)';

    my %dict;
    my $skipped = 0;
    my $stime   = time;

    while (my $line = <>) {
        my ($class, @words) = _parse_line($line);

        if (defined $class) {
            for my $word (@words) {
                _add_word(\%dict, $class, $word);
            }
        }
        else {
            # No recognised "<class>:" prefix (e.g. a continuation line of a
            # multi-line value): ignore it rather than pollute the dictionary.
            $skipped++;
        }

        unless ($. % 10000) {
            my $secs = time - $stime;
            warn "$. lines consumed from input in $secs seconds...\n";
        }
    }

    my $secs = time - $stime;
    warn "$skipped input lines skipped (no recognized class prefix)\n" if $skipped;
    warn "Dictionary built in $secs seconds, writing...\n";

    $stime = time;
    my $counter = 0;

    # Column pairs in a stable (alphabetical by class) order.
    my @keymap = map { [ $_ . '_count', $_ . '_suggestions' ] } sort keys %classes;

    print 'COPY search.symspell_dictionary (prefix_key, '
        . join(', ', map { @$_ } @keymap)
        . ") FROM STDIN;\n";

    # Deleting the key most recently returned by each() is safe, and lets the
    # memory for finished entries be released while writing.
    while (my ($key, $entry) = each %dict) {
        $counter++;
        print _format_row($key, $entry, \@keymap);
        delete $dict{$key};
    }

    print '\.' . "\n\n";

    $secs = time - $stime;
    warn "$counter dictionary prefix key entries written in $secs seconds.\n";

    return;
}

# Split one raw input line into its class and its distinct lower-cased words.
# Returns ($class, @words), or an empty list when the line has no prefix
# naming a class from %classes.
sub _parse_line {
    my ($line) = @_;

    chomp $line;

    # Decode before lc/\w so that non-ASCII letters are case-folded and kept
    # inside words instead of acting as word separators.
    $line = lc decode('UTF-8', $line);

    my ($class, $data) = $line =~ m/^(\w+):(.*)$/;
    return unless defined $class && exists $classes{$class};

    my @words = $data =~ m/(\w+'*\w*)/g;
    return ($class, uniq @words);
}

# Create an empty dictionary record: a zero count and an empty suggestion
# set for every class.
sub _new_entry {
    return { map { ($_ . '_count' => 0, $_ . '_suggestions' => {}) } keys %classes };
}

# Record one occurrence of $raw in $class.  The first time a word is seen in
# a class it is also registered as a suggestion under itself, under its
# $plen-character prefix (when longer than $plen) and under every deletion
# edit of that key.
sub _add_word {
    my ($dict, $class, $raw) = @_;

    my $ckey = $class . '_count';
    my $skey = $class . '_suggestions';

    my $entry = $dict->{$raw} //= _new_entry();
    return if ++$entry->{$ckey} > 1;    # keys already generated for this class

    $entry->{$skey}{$raw} = 1;

    my $key = $raw;
    if (length($raw) > $plen) {
        $key = substr($raw, 0, $plen);
        ($dict->{$key} //= _new_entry())->{$skey}{$raw} = 1;
    }

    for my $edit (symspell_generate_edits($key, 1)) {
        ($dict->{$edit} //= _new_entry())->{$skey}{$raw} = 1;
    }

    return;
}

# Render one dictionary record as a line of COPY text-format data:
# the key, then (count, suggestions) for each pair of column names in
# $keymap.  An empty suggestion set is written as \N (NULL).
sub _format_row {
    my ($key, $entry, $keymap) = @_;

    my @fields = ($key);
    for my $pair (@$keymap) {
        my ($ckey, $skey) = @$pair;
        my @suggestions = keys %{ $entry->{$skey} };

        # Every element is double-quoted: an unquoted element spelled "null"
        # would be read by PostgreSQL as a NULL array element.  Words consist
        # only of \w characters and apostrophes, so no further escaping is
        # needed inside the quotes.
        push @fields, $entry->{$ckey},
            @suggestions
                ? '{' . join(',', map { sprintf('"%s"', $_) } @suggestions) . '}'
                : '\N';
    }

    return join("\t", @fields) . "\n";
}

# Generate the strings obtained from $word by deleting characters.
#
#   $word - string to generate edits for
#   $dist - current deletion depth; external callers pass 1 (the default)
#
# Deletions are applied recursively until $maxed characters have been
# removed.  The top-level call ($dist == 1) returns the de-duplicated list;
# nested calls return their raw (possibly repeating) list.
sub symspell_generate_edits {
    my ($word, $dist) = @_;
    $dist //= 1;

    my @list;
    my @sublist;

    for my $pos (0 .. length($word) - 1) {
        my $item = substr($word, 0, $pos) . substr($word, $pos + 1);
        push @list, $item;
        push @sublist, symspell_generate_edits($item, $dist + 1) if $dist < $maxed;
    }

    push @list, @sublist;

    return $dist == 1 ? uniq(@list) : @list;
}

1;