From 53f0009992bcc931ae06fac5fc00ad27f38efe1d Mon Sep 17 00:00:00 2001 From: Mike Rylander Date: Thu, 11 Mar 2021 15:08:55 -0500 Subject: [PATCH] Improve side-loader for RAM use and time. Signed-off-by: Mike Rylander --- .../src/sql/Pg/upgrade/XXXX.schema.symspell.sql | 107 +++++++++++++-------- Open-ILS/src/support-scripts/symspell-sideload.pl | 59 ++++++------ 2 files changed, 95 insertions(+), 71 deletions(-) diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.symspell.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.symspell.sql index 7a3427ab26..5de9eafeea 100644 --- a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.symspell.sql +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.symspell.sql @@ -788,67 +788,74 @@ select $z$select $y$select $y$||x.id||$y$, '$z$||x.x||$z$', count(*) from search \qecho '' \qecho '\\a' \qecho '\\t' -\qecho '\\o symspell-title.txt' -\qecho 'select 'title:'||value from metabib.title_field_entry;' -\qecho '\\o symspell-author.txt' -\qecho 'select 'author:'||value from metabib.author_field_entry;' -\qecho '\\o symspell-subject.txt' -\qecho 'select 'subject:'||value from metabib.subject_field_entry;' -\qecho '\\o symspell-series.txt' -\qecho 'select 'series:'||value from metabib.series_field_entry;' -\qecho '\\o symspell-identifier.txt' -\qecho 'select 'identifier:'||value from metabib.identifier_field_entry;' -\qecho '\\o symspell-keyword.txt' -\qecho 'select 'keyword:'||value from metabib.keyword_field_entry;' +\qecho '' +\qecho '\\o title' +\qecho 'select value from metabib.title_field_entry;' +\qecho '\\o author' +\qecho 'select value from metabib.author_field_entry;' +\qecho '\\o subject' +\qecho 'select value from metabib.subject_field_entry;' +\qecho '\\o series' +\qecho 'select value from metabib.series_field_entry;' +\qecho '\\o identifier' +\qecho 'select value from metabib.identifier_field_entry;' +\qecho '\\o keyword' +\qecho 'select value from metabib.keyword_field_entry;' +\qecho '' \qecho '\\o' \qecho '\\a' \qecho '\\t' \qecho '' -\qecho 'Then, at the command line:' +\qecho '// Then, at the command line:' \qecho '' -\qecho '$ ~/Evergreen-tarball-path/Open-ILS/src/support-scripts/symspell-sideload.pl \' -\qecho ' symspell-title.txt \' -\qecho ' symspell-author.txt \' -\qecho ' symspell-subject.txt \' -\qecho ' symspell-series.txt \' -\qecho ' symspell-identifier.txt \' -\qecho ' symspell-keyword.txt > sideload.sql' +\qecho '$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl title > title.sql' \qecho '' -\qecho 'And, back in psql' +\qecho '// And, back in psql' \qecho '' \qecho 'ALTER TABLE search.symspell_dictionary SET UNLOGGED;' \qecho 'TRUNCATE search.symspell_dictionary;' \qecho '' -\qecho '\\i sideload.sql' +\qecho '\i identifier.sql' +\qecho '\i author.sql' +\qecho '\i title.sql' +\qecho '\i subject.sql' +\qecho '\i series.sql' +\qecho '\i keyword.sql' \qecho '' \qecho 'CLUSTER search.symspell_dictionary USING symspell_dictionary_pkey;' \qecho 'REINDEX TABLE search.symspell_dictionary;' \qecho 'ALTER TABLE search.symspell_dictionary SET LOGGED;' \qecho 'VACUUM ANALYZE search.symspell_dictionary;' \qecho '' +\qecho 'DROP TABLE search.search.symspell_dictionary_partial_title;' +\qecho 'DROP TABLE search.search.symspell_dictionary_partial_author;' +\qecho 'DROP TABLE search.search.symspell_dictionary_partial_subject;' +\qecho 'DROP TABLE search.search.symspell_dictionary_partial_series;' +\qecho 'DROP TABLE search.search.symspell_dictionary_partial_identifier;' +\qecho 'DROP TABLE search.search.symspell_dictionary_partial_keyword;' /* To run by hand: \a \t -\o symspell-title.txt -select 'title:'||value from metabib.title_field_entry; +\o title +select value from metabib.title_field_entry; -\o symspell-author.txt -select 'author:'||value from metabib.author_field_entry; +\o author +select value from metabib.author_field_entry; -\o symspell-subject.txt -select 'subject:'||value from metabib.subject_field_entry; +\o subject +select value from metabib.subject_field_entry; -\o symspell-series.txt -select 'series:'||value from metabib.series_field_entry; +\o series +select value from metabib.series_field_entry; -\o symspell-identifier.txt -select 'identifier:'||value from metabib.identifier_field_entry; +\o identifier +select value from metabib.identifier_field_entry; -\o symspell-keyword.txt -select 'keyword:'||value from metabib.keyword_field_entry; +\o keyword +select value from metabib.keyword_field_entry; \o \a @@ -856,25 +863,41 @@ select 'keyword:'||value from metabib.keyword_field_entry; // Then, at the command line: -$ ~/Evergreen-tarball-path/Open-ILS/src/support-scripts/symspell-sideload.pl \ - symspell-title.txt \ - symspell-author.txt \ - symspell-subject.txt \ - symspell-series.txt \ - symspell-identifier.txt \ - symspell-keyword.txt > sideload.sql +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl title > title.sql +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl author > author.sql +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl subject > subject.sql +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl series > series.sql +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl identifier > identifier.sql +$ ~/EG-src-path/Open-ILS/src/support-scripts/symspell-sideload.pl keyword > keyword.sql + +// To the extent your hardware allows, the above commands can be run in +// in parallel, in different shells. Each will use a full CPU, and RAM +// may be a limiting resource, so keep an eye on that with `top`. + // And, back in psql ALTER TABLE search.symspell_dictionary SET UNLOGGED; TRUNCATE search.symspell_dictionary; -\i sideload.sql +\i identifier.sql +\i author.sql +\i title.sql +\i subject.sql +\i series.sql +\i keyword.sql CLUSTER search.symspell_dictionary USING symspell_dictionary_pkey; REINDEX TABLE search.symspell_dictionary; ALTER TABLE search.symspell_dictionary SET LOGGED; VACUUM ANALYZE search.symspell_dictionary; +DROP TABLE search.search.symspell_dictionary_partial_title; +DROP TABLE search.search.symspell_dictionary_partial_author; +DROP TABLE search.search.symspell_dictionary_partial_subject; +DROP TABLE search.search.symspell_dictionary_partial_series; +DROP TABLE search.search.symspell_dictionary_partial_identifier; +DROP TABLE search.search.symspell_dictionary_partial_keyword; + */ diff --git a/Open-ILS/src/support-scripts/symspell-sideload.pl b/Open-ILS/src/support-scripts/symspell-sideload.pl index 120ff77a74..b92b22e38d 100755 --- a/Open-ILS/src/support-scripts/symspell-sideload.pl +++ b/Open-ILS/src/support-scripts/symspell-sideload.pl @@ -12,24 +12,13 @@ my %dict; my $etime; my $secs; -my %classes = ( - title => 0, - author => 1, - subject => 2, - series => 3, - keyword => 4, - identifier => 5 -); +my $class = $ARGV[0]; my $stime = time; -while (<>) { +while (my $data = <>) { my $line = $.; - chomp(); $_=lc($_); - my ($class,$data) = m/^(\w+):(.*$)/; - - my $ckey = $class.'_count'; - my $skey = $class.'_suggestions'; + chomp($data); $data=lc($data); my @words; while( $data =~ m/([\w\d]+'*[\w\d]*)/g ) { @@ -38,20 +27,21 @@ while (<>) { for my $raw (uniq @words) { my $key = $raw; - $dict{$key} //= { map {$_.'_count' => 0, $_.'_suggestions' => {}} keys %classes }; - $dict{$key}{$ckey}++; - if ($dict{$key}{$ckey} == 1) { # first time we've seen it in this class, need to generate prefix keys - $dict{$key}{$skey}{$raw} = 1; + $dict{$key} //= [0,[]]; + $dict{$key}[0]++; + + if ($dict{$key}[0] == 1) { # first time we've seen it, need to generate prefix keys + push @{$dict{$key}[1]}, $raw; if (length($raw) > $plen) { $key = substr($raw,0,$plen); - $dict{$key} //= { map {$_.'_count' => 0, $_.'_suggestions' => {}} keys %classes }; - $dict{$key}{$skey}{$raw} = 1; + $dict{$key} //= [0,[]]; + push @{$dict{$key}[1]}, $raw; } for my $edit (symspell_generate_edits($key, 1)) { - $dict{$edit} //= { map {$_.'_count' => 0, $_.'_suggestions' => {}} keys %classes }; - $dict{$edit}{$skey}{$raw} = 1; + $dict{$edit} //= [0,[]]; + push @{$dict{$edit}[1]}, $raw; } } } @@ -70,21 +60,32 @@ warn "Dictionary built in $secs seconds, writing...\n"; $stime = time; my $counter = 0; -my @keymap = (map { [$_.'_count', $_.'_suggestions'] } sort keys %classes); +print <<"SQL"; +CREATE UNLOGGED TABLE search.symspell_dictionary_partial_$class ( + prefix_key TEXT, + ${class}_count INT, + ${class}_suggestions TEXT[] +) FROM STDIN; -print 'COPY search.symspell_dictionary (prefix_key, '. join(', ', map { ($$_[0], $$_[1]) } @keymap) . ") FROM STDIN;\n"; +COPY search.search.symspell_dictionary_partial_$class FROM STDIN; +SQL while ( my ($key, $cl_dict) = each %dict ) { $counter++; - print join( "\t", $key, map { - ($$cl_dict{$$_[0]}, (keys %{$$cl_dict{$$_[1]}} ? '{'.join(',', uniq(keys %{$$cl_dict{$$_[1]}})).'}' : '\N')) - } @keymap) . "\n"; - + print join( "\t", $key, $$cl_dict[0], (scalar(@{$$cl_dict[1]}) ? '{'.join(',', uniq @{$$cl_dict[1]}).'}' : '\N')) . "\n"; delete $dict{$key}; } -print '\.'."\n\n"; +print <<"SQL"; +\\. + +INSERT INTO search.symspell_dictionary (prefix_key, ${class}_count, ${class}_suggestions) + SELECT * FROM search.search.symspell_dictionary_partial_$class + ON CONFILICT (prefix_key) DO UPDATE + SET ${class}_count = EXCLUDED.${class}_count, + ${class}_suggestions = EXCLUDED.${class}_suggestions; +SQL $etime = time; $secs = $etime - $stime; -- 2.11.0