From: Lebbeous Fogle-Weekley Date: Fri, 19 Aug 2011 20:11:11 +0000 (-0400) Subject: abstract query representations from QueryParser X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=17a299f8821bdc5c2df0e9fc34e49f557a0d45f7;p=evergreen%2Fequinox.git abstract query representations from QueryParser When you perform a search, you now also get back an abstract representation of your search query. fts-replacement.pl is a test script that shows you things that QueryParser does, including the new abstract query feature, and it also reconstitutes a string from the abstract query to demonstrate the correctness of that query. Signed-off-by: Lebbeous Fogle-Weekley --- diff --git a/Open-ILS/src/extras/fts-replacement.pl b/Open-ILS/src/extras/fts-replacement.pl new file mode 100755 index 0000000000..0fca66b08b --- /dev/null +++ b/Open-ILS/src/extras/fts-replacement.pl @@ -0,0 +1,137 @@ +#!/usr/bin/perl +use warnings; +use strict; +use lib '/home/opensrf/svn/OpenSRF/trunk/src/perl/lib/'; +use lib '/home/opensrf/svn/ILS/trunk/Open-ILS/src/perlmods/'; +use OpenILS::Application::Storage::Driver::Pg::QueryParser; +use JSON::XS; +use Getopt::Long; +use Data::Dumper; +$Data::Dumper::Indent = 1; +use Time::HiRes qw/time/; + +my $qpconfig; + +sub _abstract_query2str_filter { + my $f = shift; + + return sprintf( + "%s%s(%s)", + $f->{negate} ? $qpconfig->{operators}{disallowed} : "", + $f->{name}, + join(",", @{$f->{args}}) + ); +} + +sub _abstract_query2str_modifier { + my $f = shift; + + return $qpconfig->{operators}{modifier} . $f; +} + +# This should produce an equivalent query to the original, given an +# abstract_query with a qp config. 
+sub abstract_query2str { + my $abstract_query = shift; + my $depth = shift || 0; + + $qpconfig ||= $abstract_query->{config}; + + my $gs = $qpconfig->{operators}{group_start}; + my $ge = $qpconfig->{operators}{group_end}; + my $and = $qpconfig->{operators}{and}; + my $or = $qpconfig->{operators}{or}; + + my $q = ""; + $q .= $gs if $abstract_query->{type} and $abstract_query->{type} eq "query_plan" and $depth; + + if (exists $abstract_query->{type}) { + if ($abstract_query->{type} eq 'query_plan') { + $q .= join(" ", map { _abstract_query2str_filter($_) } @{$abstract_query->{filters}}) if + exists $abstract_query->{filters}; + $q .= " "; + + $q .= join(" ", map { _abstract_query2str_modifier($_) } @{$abstract_query->{modifiers}}) if + exists $abstract_query->{modifiers}; + } elsif ($abstract_query->{type} eq 'node') { + $q .= " " . $abstract_query->{class}; + $q .= "|$_" foreach @{$abstract_query->{fields}}; + $q .= ":"; + } elsif ($abstract_query->{type} eq 'atom') { + my $prefix = $abstract_query->{prefix} || ''; + $prefix = $qpconfig->{operators}{disallowed} if $prefix eq '!'; + $q .= $prefix . + ($abstract_query->{content} || '') . + ($abstract_query->{suffix} || ''); + } + } + + if (exists $abstract_query->{children}) { + my $op = (keys(%{$abstract_query->{children}}))[0]; + $q .= join( + " " . ($op eq '&' ? $and : $or) . " ", + map { abstract_query2str($_, $depth + 1) } @{$abstract_query->{children}{$op}} + ); + } elsif ($abstract_query->{'&'} or $abstract_query->{'|'}) { + my $op = (keys(%{$abstract_query}))[0]; + $q .= join( + " " . ($op eq '&' ? $and : $or) . " ", + map { abstract_query2str($_, $depth + 1) } @{$abstract_query->{$op}} + ); + } + $q .= " "; + + + $q .= $ge if $abstract_query->{type} and $abstract_query->{type} eq "query_plan" and $depth; + + return $q; +} + +OpenILS::Application::Storage::Driver::Pg::QueryParser->TEST_SETUP; + +my $query = '#available title: foo bar* || (-baz || (subject:"1900'. 
+ '-1910 junk" "and another thing" se:stuff #available '. + 'statuses(0,7,12))) && && && au:malarky || au|'. + 'corporate|personal:gonzo && dc.identifier:+123456789X'. + ' dc.contributor=rowling #metarecord estimation_'. + 'strategy(exclusion) item_type(a, t) item_form(d) '. + 'bib.subjectTitle=potter bib.subjectName=harry '. + 'keyword|mapscale:1:250000'; +my $superpage = 1; +my $superpage_size = 1000; +my $core_limit = 25000; +my $debug; +my $quiet; +my $runs = 100; + +GetOptions( + 'superpage=i' => \$superpage, + 'superpage-size=i' => \$superpage_size, + 'core-limit=i' => \$core_limit, + 'query=s' => \$query, + 'debug' => \$debug, + 'quiet' => \$quiet, + 'runs=i' => \$runs +); + +print "Original query: $query\n"; + +my $start = time(); +OpenILS::Application::Storage::Driver::Pg::QueryParser->new( superpage_size => $superpage_size, superpage => $superpage, core_limit => $core_limit, debug => $debug, query => $query )->parse->parse_tree for (1 .. $runs); +my $end = time(); + +my $plan = OpenILS::Application::Storage::Driver::Pg::QueryParser->new( superpage_size => $superpage_size, superpage => $superpage, core_limit => $core_limit, query => $query, debug => $debug ); +$plan->parse; +print "Parsed query tree:\n" . Dumper( $plan->parse_tree) if (!$quiet); +#print "Parsed query tree:\n" . Dumper( QueryParser->new( superpage_size => $superpage_size, superpage => $superpage, core_limit => $core_limit, query => $query, debug => $debug )->parse->parse_tree); +my $sql = $plan->toSQL; +$sql =~ s/^\s*$//gm; +print "SQL:\n$sql\n\n" if (!$quiet); + +my $abstract_query = $plan->parse_tree->to_abstract_query(with_config => 1); +print "abstract_query: " . Dumper($abstract_query) . "\n"; +print "abstract_query back to string: " . abstract_query2str($abstract_query) . "\n"; +print "Simple plan: " . ($plan->simple_plan ? 'yes' : 'no') . "\n"; +print "Total parse time, $runs runs: " . ($end - $start) . "s\n"; +print "Average parse time, $runs runs: " . 
sprintf('%0.3f',(($end - $start) / $runs) * 1000) . "ms\n"; + diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Biblio.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Biblio.pm index ac8046b3bb..28b27f45c8 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Biblio.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Biblio.pm @@ -1245,6 +1245,7 @@ sub staged_search { my $page; # current superpage my $est_hit_count = 0; my $current_page_summary = {}; + my $current_abstract = {}; my $global_summary = {checked => 0, visible => 0, excluded => 0, deleted => 0, total => 0}; my $is_real_hit_count = 0; my $new_ids = []; @@ -1253,7 +1254,7 @@ sub staged_search { my $data = $cache_data->{$page}; my $results; - my $summary; + my ($summary, $abstract); $logger->debug("staged search: analyzing superpage $page"); @@ -1262,13 +1263,14 @@ sub staged_search { $logger->debug("staged search: found cached results"); $summary = $data->{summary}; $results = $data->{results}; - + $abstract = pop @$results; } else { # retrieve the window of results from the database $logger->debug("staged search: fetching results from the database"); $search_hash->{skip_check} = $page * $superpage_size; my $start = time; $results = $U->storagereq($method, %$search_hash); + $abstract = pop @$results; $search_duration = time - $start; $logger->info("staged search: DB call took $search_duration seconds and returned ".scalar(@$results)." 
rows, including summary"); $summary = shift(@$results) if $results; @@ -1301,6 +1303,7 @@ sub staged_search { } $current_page_summary = $summary; + $current_abstract = $abstract; # add the new set of results to the set under construction push(@$all_results, @$results); @@ -1362,7 +1365,8 @@ sub staged_search { superpage_size => $search_hash->{check_limit}, superpage_summary => $current_page_summary, facet_key => $facet_key, - ids => \@results + ids => \@results, + abstract_query => $current_abstract } ); diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Publisher/metabib.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Publisher/metabib.pm index acdc0c23a7..b7fe27a46b 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Publisher/metabib.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/Publisher/metabib.pm @@ -8,6 +8,7 @@ use OpenSRF::Utils::Logger qw/:level/; use OpenSRF::Utils::Cache; use OpenSRF::Utils::JSON; use Data::Dumper; +$Data::Dumper::Indent = 0; use Digest::MD5 qw/md5_hex/; @@ -3069,6 +3070,7 @@ sub query_parser_fts { $client->respond( $rec ); } + $client->respond($query->parse_tree->to_abstract_query(with_config => 1)); return undef; } __PACKAGE__->register_method( diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/QueryParser.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/QueryParser.pm index dc4cfef896..6ce70c7494 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/QueryParser.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Storage/QueryParser.pm @@ -763,6 +763,64 @@ sub superpage_size { #------------------------------- +package _util; + +# At this level, joiners are always & or |. This is not +# the external, configurable representation of joiners that +# defaults to && and ||. +sub is_joiner { + my $str = shift; + + return (not ref $str and ($str eq '&' or $str eq '|')); +} + +sub default_joiner { '&' } + +# 0 for different, 1 for the same. 
+sub compare_abstract_atoms { + my ($left, $right) = @_; + + foreach (qw/prefix suffix content/) { + no warnings; # undef can stand in for '' here + return 0 unless $left->{$_} eq $right->{$_}; + } + + return 1; +} + +sub fake_abstract_atom_from_phrase { + my ($phrase, $neg) = @_; + + my $prefix = '"'; + if ($neg) { + $prefix = + $QueryParser::parser_config{QueryParser}{operators}{disallowed} . + $prefix; + } + + return { + "type" => "atom", "prefix" => $prefix, "suffix" => '"', + "content" => $phrase + } +} + +sub find_arrays_in_abstract { + my ($hash) = @_; + + my @arrays; + foreach my $key (keys %$hash) { + if (ref $hash->{$key} eq "ARRAY") { + push @arrays, $hash->{$key}; + foreach (@{$hash->{$key}}) { + push @arrays, find_arrays_in_abstract($_); + } + } + } + + return @arrays; +} + +#------------------------------- package QueryParser::query_plan; sub QueryParser { @@ -958,9 +1016,58 @@ sub add_filter { return $self; } +# %opts supports two options at this time: +# no_phrases : +# If true, do not do anything to the phrases and unphrases +# fields on any discovered nodes. +# with_config : +# If true, also return the query parser config as part of the blob. +# This will get set back to 0 before recursion to avoid repetition. 
+sub to_abstract_query { + my $self = shift; + my %opts = @_; + + my $abstract_query = { + type => "query_plan", + filters => [map { $_->to_abstract_query } @{$self->filters}], + modifiers => [map { $_->to_abstract_query } @{$self->modifiers}] + }; + + if ($opts{with_config}) { + $opts{with_config} = 0; + $abstract_query->{config} = $QueryParser::parser_config{QueryParser}; + } + + my $kids = []; + + for my $qnode (@{$self->query_nodes}) { + # Remember: qnode can be a joiner string, a node, or another query_plan + + if (_util::is_joiner($qnode)) { + if ($abstract_query->{children}) { + my $open_joiner = (keys(%{$abstract_query->{children}}))[0]; + next if $open_joiner eq $qnode; + + my $oldroot = $abstract_query->{children}; + $kids = [$oldroot]; + $abstract_query->{children} = {$qnode => $kids}; + } else { + $abstract_query->{children} = {$qnode => $kids}; + } + } else { + push @$kids, $qnode->to_abstract_query(%opts); + } + } + + $abstract_query->{children} ||= { _util::default_joiner() => $kids }; + return $abstract_query; +} + #------------------------------- package QueryParser::query_plan::node; +use Data::Dumper; +$Data::Dumper::Indent = 0; sub new { my $pkg = shift; @@ -1091,6 +1198,140 @@ sub add_dummy_atom { return $self; } +# This will find up to one occurrence of @$short_list within @$long_list, and +# replace it with the single atom $replacement. 
+sub replace_phrase_in_abstract_query { + my ($self, $short_list, $long_list, $replacement) = @_; + + my $success = 0; + my @already = (); + my $goal = scalar @$short_list; + + for (my $i = 0; $i < scalar (@$long_list); $i++) { + my $right = $long_list->[$i]; + + if (_util::compare_abstract_atoms( + $short_list->[scalar @already], $right + )) { + push @already, $i; + } elsif (scalar @already) { + @already = (); + next; + } + + if (scalar @already == $goal) { + splice @$long_list, $already[0], scalar(@already), $replacement; + $success = 1; + last; + } + } + + return $success; +} + +sub to_abstract_query { + my $self = shift; + my %opts = @_; + + my $abstract_query = { + "type" => "node", + "class" => $self->classname, + "fields" => $self->fields + }; + + my $kids = []; + + for my $qatom (@{$self->query_atoms}) { + if (_util::is_joiner($qatom)) { + if ($abstract_query->{children}) { + my $open_joiner = (keys(%{$abstract_query->{children}}))[0]; + next if $open_joiner eq $qatom; + + my $oldroot = $abstract_query->{children}; + $kids = [$oldroot]; + $abstract_query->{children} = {$qatom => $kids}; + } else { + $abstract_query->{children} = {$qatom => $kids}; + } + } else { + push @$kids, $qatom->to_abstract_query; + } + } + + if ($self->{phrases} and not $opts{no_phrases}) { + for my $phrase (@{$self->{phrases}}) { + # Phrases appear duplicated in a real QP tree, and we don't want + # that duplication in our abstract query. So for all our phrases, + # break them into atoms as QP would, and remove any matching + # sequences of atoms from our abstract query. + + my $tmptree = $self->{plan}->{QueryParser}->new(query => '"'.$phrase.'"')->parse->parse_tree; + if ($tmptree) { + # For a well-behaved phrase, we should now have only one node + # in the $tmptree query plan, and that node should have an + # orderly list of atoms and joiners. 
+ + if ($tmptree->{query} and scalar(@{$tmptree->{query}}) == 1) { + my $tmplist; + + eval { + $tmplist = $tmptree->{query}->[0]->to_abstract_query( + no_phrases => 1 + )->{children}->{'&'}->[0]->{children}->{'&'}; + }; + next if $@; + + foreach ( + _util::find_arrays_in_abstract($abstract_query->{children}) + ) { + last if $self->replace_phrase_in_abstract_query( + $tmplist, + $_, + _util::fake_abstract_atom_from_phrase($phrase) + ); + } + } + } + } + } + + # Do the same as the preceding block for unphrases (negated phrases). + if ($self->{unphrases} and not $opts{no_phrases}) { + for my $phrase (@{$self->{unphrases}}) { + my $tmptree = $self->{plan}->{QueryParser}->new( + query => $QueryParser::parser_config{QueryParser}{operators}{disallowed}. + '"' . $phrase . '"' + )->parse->parse_tree; + + if ($tmptree) { + if ($tmptree->{query} and scalar(@{$tmptree->{query}}) == 1) { + my $tmplist; + + eval { + $tmplist = $tmptree->{query}->[0]->to_abstract_query( + no_phrases => 1 + )->{children}->{'&'}->[0]->{children}->{'&'}; + }; + next if $@; + + foreach ( + _util::find_arrays_in_abstract($abstract_query->{children}) + ) { + last if $self->replace_phrase_in_abstract_query( + $tmplist, + $_, + _util::fake_abstract_atom_from_phrase($phrase, 1) + ); + } + } + } + } + } + + $abstract_query->{children} ||= { _util::default_joiner() => $kids }; + return $abstract_query; +} + #------------------------------- package QueryParser::query_plan::node::atom; @@ -1126,6 +1367,14 @@ sub suffix { return $self->{suffix}; } +sub to_abstract_query { + my ($self) = @_; + + return { + (map { $_ => $self->$_ } qw/prefix suffix content/), + "type" => "atom" + }; +} #------------------------------- package QueryParser::query_plan::filter; @@ -1157,6 +1406,14 @@ sub args { return $self->{args}; } +sub to_abstract_query { + my ($self) = @_; + + return { + map { $_ => $self->$_ } qw/name negate args/ + }; +} + #------------------------------- package QueryParser::query_plan::facet; @@ -1204,5 
+1461,10 @@ sub name { return $$self; } +sub to_abstract_query { + my ($self) = @_; + + return $self->name; +} 1;