From: Bill Erickson Date: Wed, 28 Aug 2019 16:16:17 +0000 (-0400) Subject: bib marc record index / searching X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=93f371d62493274eb835fed8c410e31db10cb4ed;p=working%2FEvergreen.git bib marc record index / searching Signed-off-by: Bill Erickson --- diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search.pm index bcd3fbfad5..78d4a4e2db 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search.pm @@ -36,7 +36,6 @@ sub initialize { sub child_init { OpenILS::Application::Search::Z3950->child_init; OpenILS::Application::Search::Browse->child_init; - OpenILS::Application::Search::Elastic->child_init; } diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Biblio.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Biblio.pm index a0ac123bfd..883dfd80cc 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Biblio.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Biblio.pm @@ -10,7 +10,7 @@ use OpenSRF::Utils::SettingsClient; use OpenILS::Utils::CStoreEditor q/:funcs/; use OpenSRF::Utils::Cache; use Encode; -use OpenILS::Application::Search::Elastic; +use OpenILS::Application::Search::ElasticMapper; use OpenSRF::Utils::Logger qw/:logger/; @@ -1157,11 +1157,11 @@ sub staged_search { $user_offset = ($user_offset >= 0) ? $user_offset : 0; $user_limit = ($user_limit >= 0) ? $user_limit : 10; - return OpenILS::Application::Search::Elastic->bib_search( + return OpenILS::Application::Search::ElasticMapper->bib_search( $search_hash->{query}, # query string ($method =~ /staff/ ? 
1 : 0), $user_offset, $user_limit - ) if OpenILS::Application::Search::Elastic->is_enabled; + ) if OpenILS::Application::Search::ElasticMapper->is_enabled('bib-search'); # we're grabbing results on a per-superpage basis, which means the # limit and offset should coincide with superpage boundaries @@ -2133,6 +2133,10 @@ sub marc_search { my $limit = $args->{limit} || 10; my $offset = $args->{offset} || 0; + return OpenILS::Application::Search::ElasticMapper->marc_search( + $args, ($method =~ /staff/ ? 1 : 0), $limit, $offset + ) if OpenILS::Application::Search::ElasticMapper->is_enabled('bib-marc'); + # allow caller to pass in a call timeout since MARC searches # can take longer than the default 60-second timeout. # Default to 2 mins. Arbitrarily cap at 5 mins. diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Elastic.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Elastic.pm deleted file mode 100644 index bb6caa14a5..0000000000 --- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Elastic.pm +++ /dev/null @@ -1,549 +0,0 @@ -package OpenILS::Application::Search::Elastic; -# --------------------------------------------------------------- -# Copyright (C) 2018 King County Library System -# Author: Bill Erickson -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. 
-# --------------------------------------------------------------- -use strict; -use warnings; -use OpenSRF::Utils::JSON; -use OpenSRF::Utils::Logger qw/:logger/; -use OpenILS::Utils::Fieldmapper; -use OpenSRF::Utils::SettingsClient; -use OpenILS::Utils::CStoreEditor q/:funcs/; -use OpenILS::Elastic::BibSearch; -use List::Util qw/min/; -use Digest::MD5 qw(md5_hex); - -use OpenILS::Application::AppUtils; -my $U = "OpenILS::Application::AppUtils"; - -# Use the QueryParser module to make sense of the inbound search query. -use OpenILS::Application::Storage::Driver::Pg::QueryParser; - - -# avoid repetitive calls to DB for org info. -my %org_data_cache = (by_shortname => {}, ancestors_at => {}); - -# bib fields defined in the elastic bib-search index -my $bib_fields; -my $hidden_copy_statuses; -my $hidden_copy_locations; -my $avail_copy_statuses; -our $enabled = undef; - -# Returns true if the Elasticsearch 'bib-search' index is active. -sub is_enabled { - - return $enabled if defined $enabled; - - # Elastic bib search is enabled if a "bib-search" index is enabled. - my $index = new_editor()->search_elastic_index( - {active => 't', code => 'bib-search'})->[0]; - - if ($index) { - - $logger->info("ES bib-search index is enabled"); - $enabled = 1; - } else { - $enabled = 0; - } - - return $enabled; -} - -sub child_init { - my $class = shift; - return unless $class->is_enabled(); - - my $e = new_editor(); - - $bib_fields = $e->retrieve_all_elastic_bib_field; - - my $stats = $e->json_query({ - select => {ccs => ['id', 'opac_visible', 'is_available']}, - from => 'ccs', - where => {'-or' => [ - {opac_visible => 'f'}, - {is_available => 't'} - ]} - }); - - $hidden_copy_statuses = - [map {$_->{id}} grep {$_->{opac_visible} eq 'f'} @$stats]; - - $avail_copy_statuses = - [map {$_->{id}} grep {$_->{is_available} eq 't'} @$stats]; - - # Include deleted copy locations since this is an exclusion set. 
- my $locs = $e->json_query({ - select => {acpl => ['id']}, - from => 'acpl', - where => {opac_visible => 'f'} - }); - - $hidden_copy_locations = [map {$_->{id}} @$locs]; - - return 1; -} - -# Translate a bib search API call into something consumable by Elasticsearch -# Translate search results into a structure consistent with a bib search -# API response. -sub bib_search { - my ($class, $query, $staff, $offset, $limit) = @_; - - $logger->info("ES parsing API query $query staff=$staff"); - - my ($elastic_query, $cache_key) = - compile_elastic_query($query, $staff, $offset, $limit); - - my $es = OpenILS::Elastic::BibSearch->new('main'); - - $es->connect; - my $results = $es->search($elastic_query); - - $logger->debug("ES elasticsearch returned: ". - OpenSRF::Utils::JSON->perl2JSON($results)); - - return {count => 0} unless $results; - - return { - count => $results->{hits}->{total}, - ids => [ - map { [$_->{_id}, undef, $_->{_score}] } - grep {defined $_} @{$results->{hits}->{hits}} - ], - facets => format_facets($results->{aggregations}), - # Elastic has its own search cacheing, so external caching is - # performed, but providing cache keys allows the caller to - # know if this search matches another search. - cache_key => $cache_key, - facet_key => $cache_key.'_facets' - }; -} - -sub compile_elastic_query { - my ($query, $staff, $offset, $limit) = @_; - - my $parser = init_query_parser($query); - - $parser->parse; - my $query_struct = $parser->parse_tree->to_abstract_query; - - my $elastic = { - _source => ['id'], # Fetch bib ID only - size => $limit, - from => $offset, - sort => [], - query => { - bool => { - must => [], - filter => [] - } - } - }; - - $elastic->{query}->{bool}->{must} = - [translate_query_node($elastic, $query_struct)]; - - add_elastic_holdings_filter($elastic, $staff, - $elastic->{__site}, $elastic->{__depth}, $elastic->{__available}); - - add_elastic_facet_aggregations($elastic); - - # delete __-prefixed state maintenance keys. 
- delete $elastic->{$_} for (grep {$_ =~ /^__/} keys %$elastic); - - $elastic->{sort} = ['_score'] unless @{$elastic->{sort}}; - - return $elastic; -} - -sub translate_query_node { - my ($elastic, $node) = @_; - - if ($node->{type} eq 'query_plan') { - - my ($joiner) = keys %{$node->{children}}; - my $children = $node->{children}->{$joiner}; - my $filters = $node->{filters}; - my $modifiers = $node->{modifiers}; - - if (grep {$_ eq 'descending'} @$modifiers) { - $elastic->{__sort_dir} = 'desc'; - } - - if (grep {$_ eq 'available'} @$modifiers) { - $elastic->{__available} = 1; - } - - return unless @$children || @$filters; - - my $bool_op = $joiner eq '&' ? 'must' : 'should'; - my $bool_nodes = []; - my $filter_nodes = []; - my $query = { - bool => { - $bool_op => $bool_nodes, - filter => $filter_nodes - } - }; - - for my $child (@$children) { - my $type = $child->{type}; - - if ($type eq 'node' || $type eq 'query_plan') { - my $subq = translate_query_node($elastic, $child); - push(@$bool_nodes, $subq) if defined $subq; - - } elsif ($type eq 'facet') { - - for my $value (@{$child->{values}}) { - push(@$filter_nodes, {term => {$child->{name} => $value}}); - } - } - } - - for my $filter (@$filters) { - my $name = $filter->{name}; - my @values = @{$filter->{args}}; - - # Sorting is managed at the root of the ES search structure. - # QP assumes all sorts are ascending or descending -- possible - # only one sort filter per struct is supported? - if ($name eq 'sort') { - my $dir = $elastic->{__sort_dir} || 'asc'; - push(@{$elastic->{sort}}, {$_ => $dir}) for @values; - - } elsif ($name =~ /site|depth/) { - # site and depth are copy-level filters. - # Apply those after the main structure is built. 
- $elastic->{"__$name"} = $values[0]; - - } else { - if (@values > 1) { - push(@$filter_nodes, {terms => {$name => \@values}}); - } else { - push(@$filter_nodes, {term => {$name => $values[0]}}); - } - } - } - - # trim and compress branches - if (!@$filter_nodes) { - delete $query->{bool}{filter}; - return $bool_nodes->[0] if scalar(@$bool_nodes) == 1; - - } elsif (!@$bool_nodes) { - # If this is a filter-only node, add a match-all - # query for the filter to have something to match on. - $query->{bool}{must} = {match_all => {}}; - } - - return $query; - - } elsif ($node->{type} eq 'node') { - - my $field_class = $node->{class}; # e.g. subject - my @fields = @{$node->{fields}}; # e.g. temporal (optional) - - # class-level searches are OR/should searches across all - # fields in the selected class. - @fields = map {$_->name} - grep {$_->search_group eq $field_class} @$bib_fields - unless @fields; - - # note: $joiner is always '&' for type=node - my ($joiner) = keys %{$node->{children}}; - my $children = $node->{children}->{$joiner}; - - # Content is only split across children when multiple words - # are part of the same query structure, e.g. kw:piano music - # This equates to a match search with multiple words in ES. - my $content = join(' ', map {$_->{content}} @$children); - - # not sure how/why this happens sometimes. - return undef unless $content; - - my $first_char = substr($content, 0, 1); - my $last_char = substr($content, -1, 1); - my $prefix = $children->[0]->{prefix}; - - my $match_op = 'match'; - - # "Contains Phrase" - $match_op = 'match_phrase' if $prefix eq '"'; - - # Should we use the .raw keyword field? - my $text_search = 1; - - # Matchiness specificiers embedded in the content override - # the query node prefix. 
- if ($first_char eq '^') { - $text_search = 0; - $content = substr($content, 1); - - if ($last_char eq '$') { # "Matches Exactly" - - $match_op = 'term'; - $content = substr($content, 0, -1); - - } else { # "Starts With" - - $match_op = 'match_phrase_prefix'; - } - } - - # for match queries, treat multi-word search as AND searches - # instead of the default ES OR searches. - $content = {query => $content, operator => 'and'} - if $match_op eq 'match'; - - my $field_nodes = []; - for my $field (@fields) { - my $key = "$field_class|$field"; - - - if ($text_search) { - # use the full-text indices - - push(@$field_nodes, - {$match_op => {"$key.text" => $content}}); - - push(@$field_nodes, - {$match_op => {"$key.text_folded" => $content}}); - - } else { - - # Use the lowercase normalized keyword index for non-text searches. - push(@$field_nodes, {$match_op => {"$key.lower" => $content}}); - } - } - - $logger->info("ES content = $content / bools = ". - OpenSRF::Utils::JSON->perl2JSON($field_nodes)); - - my $query; - if (scalar(@$field_nodes) == 1) { - $query = {bool => {must => $field_nodes}}; - } else { - # Query multiple fields within a search class via OR query. - $query = {bool => {should => $field_nodes}}; - } - - if ($prefix eq '-"') { - # Negation query. Wrap the whole shebang in a must_not - $query = {bool => {must_not => $query}}; - } - - return $query; - } -} - -sub init_query_parser { - my $query = shift; - - my $query_parser = - OpenILS::Application::Storage::Driver::Pg::QueryParser->new( - query => $query - ); - - my %attrs = get_qp_attrs(); - $query_parser->initialize(%attrs); - - return $query_parser; -} - -my %qp_attrs; -sub get_qp_attrs { - return %qp_attrs if %qp_attrs; - - # Fetch and cache the QP configuration attributes - # TODO: call this in service child_init()? 
- - $logger->debug("ES initializing query parser attributes"); - my $e = new_editor(); - - %qp_attrs = ( - config_record_attr_index_norm_map => - $e->search_config_record_attr_index_norm_map([ - { id => { "!=" => undef } }, - { flesh => 1, flesh_fields => { crainm => [qw/norm/] }, - order_by => [{ class => "crainm", field => "pos" }] } - ]), - search_relevance_adjustment => - $e->retrieve_all_search_relevance_adjustment, - config_metabib_field => - $e->retrieve_all_config_metabib_field, - config_metabib_field_virtual_map => - $e->retrieve_all_config_metabib_field_virtual_map, - config_metabib_search_alias => - $e->retrieve_all_config_metabib_search_alias, - config_metabib_field_index_norm_map => - $e->search_config_metabib_field_index_norm_map([ - { id => { "!=" => undef } }, - { flesh => 1, flesh_fields => { cmfinm => [qw/norm/] }, - order_by => [{ class => "cmfinm", field => "pos" }] } - ]), - config_record_attr_definition => - $e->retrieve_all_config_record_attr_definition - ); - - return %qp_attrs; -} - - -# Format ES search aggregations to match the API response facet structure -# {$cmf_id => {"Value" => $count}, $cmf_id2 => {"Value Two" => $count2}, ...} -sub format_facets { - my $aggregations = shift; - my $facets = {}; - - for my $fname (keys %$aggregations) { - - my ($field_class, $name) = split(/\|/, $fname); - - my ($bib_field) = grep { - $_->name eq $name && $_->search_group eq $field_class - } @$bib_fields; - - my $hash = $facets->{$bib_field->metabib_field} = {}; - - my $values = $aggregations->{$fname}->{buckets}; - for my $bucket (@$values) { - $hash->{$bucket->{key}} = $bucket->{doc_count}; - } - } - - return $facets; -} - -sub add_elastic_facet_aggregations { - my ($elastic_query) = @_; - - my @facet_fields = grep {$_->facet_field eq 't'} @$bib_fields; - return unless @facet_fields; - - $elastic_query->{aggs} = {}; - - for my $facet (@facet_fields) { - my $fname = $facet->name; - my $fgrp = $facet->search_group; - $fname = "$fgrp|$fname" if $fgrp; 
- - $elastic_query->{aggs}{$fname} = {terms => {field => $fname}}; - } -} - -sub add_elastic_holdings_filter { - my ($elastic_query, $staff, $shortname, $depth, $available) = @_; - - # in non-staff mode, ensure at least on copy in scope is visible - my $visible = !$staff; - - my $org; - if ($shortname) { - - if (!$org_data_cache{by_shortname}{$shortname}) { - $org_data_cache{by_shortname}{$shortname} = - $U->find_org_by_shortname($U->get_org_tree, $shortname); - } - - $org = $org_data_cache{by_shortname}{$shortname}; - - my $types = $U->get_org_types; # pulls from cache - my ($type) = grep {$_->id == $org->ou_type} @$types; - $depth = defined $depth ? min($depth, $type->depth) : $type->depth; - } - - my $visible_filters = { - query => { - bool => { - must_not => [ - {terms => {'holdings.status' => $hidden_copy_statuses}}, - {terms => {'holdings.location' => $hidden_copy_locations}} - ] - } - } - }; - - my $filter = {nested => {path => 'holdings', query => {bool => {}}}}; - - if ($depth > 0) { - - if (!$org_data_cache{ancestors_at}{$shortname}) { - $org_data_cache{ancestors_at}{$shortname} = {}; - } - - if (!$org_data_cache{ancestors_at}{$shortname}{$depth}) { - $org_data_cache{ancestors_at}{$shortname}{$depth} = - $U->get_org_descendants($org->id, $depth); - } - - my $org_ids = $org_data_cache{ancestors_at}{$shortname}{$depth}; - - # Add a boolean OR-filter on holdings circ lib and optionally - # add a boolean AND-filter on copy status for availability - # checking. - - my $should = []; - $filter->{nested}->{query}->{bool}->{should} = $should; - - for my $org_id (@$org_ids) { - - # Ensure at least one copy exists at the selected org unit - my $and = { - bool => { - must => [ - {term => {'holdings.circ_lib' => $org_id}} - ] - } - }; - - # When limiting to visible/available, ensure at least one of the - # copies from the above org-limited set is visible/available. 
- if ($available) { - push( - @{$and->{bool}{must}}, - {terms => {'holdings.status' => $avail_copy_statuses}} - ); - - } elsif ($visible) { - push(@{$and->{bool}{must}}, $visible_filters); - } - - push(@$should, $and); - } - - } elsif ($available) { - # Limit to results that have an available copy, but don't worry - # about where the copy lives, since we're searching globally. - - $filter->{nested}->{query}->{bool}->{must} = - [{terms => {'holdings.status' => $avail_copy_statuses}}]; - - } elsif ($visible) { - - $filter->{nested}->{query} = $visible_filters->{query}; - } - - $logger->info("ES holdings filter is " . - OpenSRF::Utils::JSON->perl2JSON($filter)); - - # array of filters in progress - push(@{$elastic_query->{query}->{bool}->{filter}}, $filter); - -} - -1; - diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/ElasticMapper.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/ElasticMapper.pm new file mode 100644 index 0000000000..566d57a9d6 --- /dev/null +++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/ElasticMapper.pm @@ -0,0 +1,653 @@ +package OpenILS::Application::Search::ElasticMapper; +# --------------------------------------------------------------- +# Copyright (C) 2018 King County Library System +# Author: Bill Erickson +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# ---------------------------------------------------------------
+use strict;
+use warnings;
+use OpenSRF::Utils::JSON;
+use OpenSRF::Utils::Logger qw/:logger/;
+use OpenILS::Utils::Fieldmapper;
+use OpenSRF::Utils::SettingsClient;
+use OpenILS::Utils::CStoreEditor q/:funcs/;
+use OpenILS::Elastic::BibSearch;
+use OpenILS::Elastic::BibMarc;
+use List::Util qw/min/;
+use Digest::MD5 qw(md5_hex);
+
+use OpenILS::Application::AppUtils;
+my $U = "OpenILS::Application::AppUtils";
+
+# Use the QueryParser module to make sense of the inbound search query.
+use OpenILS::Application::Storage::Driver::Pg::QueryParser;
+
+
+# avoid repetitive calls to DB for org info.
+my %org_data_cache = (by_shortname => {}, ancestors_at => {});
+
+# bib fields defined in the elastic bib-search index
+my $bib_fields;
+my $hidden_copy_statuses;
+my $hidden_copy_locations;
+my $avail_copy_statuses;
+our $enabled = {};
+
+# Returns true if the named Elasticsearch index (e.g. 'bib-search') is active.
+sub is_enabled {
+    my ($class, $index) = @_;
+
+    $class->init;
+
+    return $enabled->{$index} if exists $enabled->{$index};
+
+    # Elastic search is enabled for this feature if the named index is active. 
+ my $config = new_editor()->search_elastic_index( + {active => 't', code => $index})->[0]; + + if ($config) { + $logger->info("ES '$index' index is enabled"); + $enabled->{$index} = 1; + } else { + $enabled->{$index} = 0; + } + + return $enabled->{$index}; +} + +my $init_complete = 0; +sub init { + my $class = shift; + return if $init_complete; + + my $e = new_editor(); + + $bib_fields = $e->retrieve_all_elastic_bib_field; + + my $stats = $e->json_query({ + select => {ccs => ['id', 'opac_visible', 'is_available']}, + from => 'ccs', + where => {'-or' => [ + {opac_visible => 'f'}, + {is_available => 't'} + ]} + }); + + $hidden_copy_statuses = + [map {$_->{id}} grep {$_->{opac_visible} eq 'f'} @$stats]; + + $avail_copy_statuses = + [map {$_->{id}} grep {$_->{is_available} eq 't'} @$stats]; + + # Include deleted copy locations since this is an exclusion set. + my $locs = $e->json_query({ + select => {acpl => ['id']}, + from => 'acpl', + where => {opac_visible => 'f'} + }); + + $hidden_copy_locations = [map {$_->{id}} @$locs]; + + $init_complete = 1; + return 1; +} + +# Translate a bib search API call into something consumable by Elasticsearch +# Translate search results into a structure consistent with a bib search +# API response. +sub bib_search { + my ($class, $query, $staff, $offset, $limit) = @_; + + $logger->info("ES parsing API query $query staff=$staff"); + + my ($elastic_query, $cache_key) = + compile_elastic_query($query, $staff, $offset, $limit); + + my $es = OpenILS::Elastic::BibSearch->new('main'); + + $es->connect; + my $results = $es->search($elastic_query); + + $logger->debug("ES elasticsearch returned: ". 
+        OpenSRF::Utils::JSON->perl2JSON($results));
+
+    return {count => 0} unless $results;
+
+    return {
+        count => $results->{hits}->{total},
+        ids => [
+            map { [$_->{_id}, undef, $_->{_score}] }
+            grep {defined $_} @{$results->{hits}->{hits}}
+        ],
+        facets => format_facets($results->{aggregations}),
+        # Elastic has its own search caching, so no external caching is
+        # performed, but providing cache keys allows the caller to
+        # know if this search matches another search.
+        cache_key => $cache_key,
+        facet_key => $cache_key.'_facets'
+    };
+}
+
+sub compile_elastic_query {
+    my ($query, $staff, $offset, $limit) = @_;
+
+    my $parser = init_query_parser($query);
+
+    $parser->parse;
+    my $query_struct = $parser->parse_tree->to_abstract_query;
+
+    my $elastic = {
+        _source => ['id'], # Fetch bib ID only
+        size => $limit,
+        from => $offset,
+        sort => [],
+        query => {
+            bool => {
+                must => [],
+                filter => []
+            }
+        }
+    };
+
+    $elastic->{query}->{bool}->{must} =
+        [translate_query_node($elastic, $query_struct)];
+
+    add_elastic_holdings_filter($elastic, $staff,
+        $elastic->{__site}, $elastic->{__depth}, $elastic->{__available});
+
+    add_elastic_facet_aggregations($elastic);
+
+    # delete __-prefixed state maintenance keys.
+    delete $elastic->{$_} for (grep {$_ =~ /^__/} keys %$elastic);
+
+    $elastic->{sort} = ['_score'] unless @{$elastic->{sort}};
+
+    return $elastic;
+}
+
+sub translate_query_node {
+    my ($elastic, $node) = @_;
+
+    if ($node->{type} eq 'query_plan') {
+
+        my ($joiner) = keys %{$node->{children}};
+        my $children = $node->{children}->{$joiner};
+        my $filters = $node->{filters};
+        my $modifiers = $node->{modifiers};
+
+        if (grep {$_ eq 'descending'} @$modifiers) {
+            $elastic->{__sort_dir} = 'desc';
+        }
+
+        if (grep {$_ eq 'available'} @$modifiers) {
+            $elastic->{__available} = 1;
+        }
+
+        return unless @$children || @$filters;
+
+        my $bool_op = $joiner eq '&' ? 
'must' : 'should'; + my $bool_nodes = []; + my $filter_nodes = []; + my $query = { + bool => { + $bool_op => $bool_nodes, + filter => $filter_nodes + } + }; + + for my $child (@$children) { + my $type = $child->{type}; + + if ($type eq 'node' || $type eq 'query_plan') { + my $subq = translate_query_node($elastic, $child); + push(@$bool_nodes, $subq) if defined $subq; + + } elsif ($type eq 'facet') { + + for my $value (@{$child->{values}}) { + push(@$filter_nodes, {term => {$child->{name} => $value}}); + } + } + } + + for my $filter (@$filters) { + my $name = $filter->{name}; + my @values = @{$filter->{args}}; + + # Sorting is managed at the root of the ES search structure. + # QP assumes all sorts are ascending or descending -- possible + # only one sort filter per struct is supported? + if ($name eq 'sort') { + my $dir = $elastic->{__sort_dir} || 'asc'; + push(@{$elastic->{sort}}, {$_ => $dir}) for @values; + + } elsif ($name =~ /site|depth/) { + # site and depth are copy-level filters. + # Apply those after the main structure is built. + $elastic->{"__$name"} = $values[0]; + + } else { + if (@values > 1) { + push(@$filter_nodes, {terms => {$name => \@values}}); + } else { + push(@$filter_nodes, {term => {$name => $values[0]}}); + } + } + } + + # trim and compress branches + if (!@$filter_nodes) { + delete $query->{bool}{filter}; + return $bool_nodes->[0] if scalar(@$bool_nodes) == 1; + + } elsif (!@$bool_nodes) { + # If this is a filter-only node, add a match-all + # query for the filter to have something to match on. + $query->{bool}{must} = {match_all => {}}; + } + + return $query; + + } elsif ($node->{type} eq 'node') { + + my $field_class = $node->{class}; # e.g. subject + my @fields = @{$node->{fields}}; # e.g. temporal (optional) + + $logger->info("ES query node field_class=$field_class fields=@fields"); + + # class-level searches are OR ("should") searches across all + # fields in the selected class. 
+ @fields = map {$_->name} + grep {$_->search_group eq $field_class} @$bib_fields + unless @fields; + + # note: $joiner is always '&' for type=node + my ($joiner) = keys %{$node->{children}}; + my $children = $node->{children}->{$joiner}; + + # Content is only split across children when multiple words + # are part of the same query structure, e.g. kw:piano music + # This equates to a match search with multiple words in ES. + my $content = join(' ', map {$_->{content}} @$children); + + # not sure how/why this happens sometimes. + return undef unless $content; + + my $first_char = substr($content, 0, 1); + my $last_char = substr($content, -1, 1); + my $prefix = $children->[0]->{prefix}; + + my $match_op = 'match'; + + # "Contains Phrase" + $match_op = 'match_phrase' if $prefix eq '"'; + + # Should we use the .raw keyword field? + my $text_search = 1; + + # Matchiness specificiers embedded in the content override + # the query node prefix. + if ($first_char eq '^') { + $text_search = 0; + $content = substr($content, 1); + + if ($last_char eq '$') { # "Matches Exactly" + + $match_op = 'term'; + $content = substr($content, 0, -1); + + } else { # "Starts With" + + $match_op = 'match_phrase_prefix'; + } + } + + # for match queries, treat multi-word search as AND searches + # instead of the default ES OR searches. + $content = {query => $content, operator => 'and'} + if $match_op eq 'match'; + + my $field_nodes = []; + for my $field (@fields) { + my $key = "$field_class|$field"; + + if ($text_search) { + # use the full-text indices + + push(@$field_nodes, + {$match_op => {"$key.text" => $content}}); + + push(@$field_nodes, + {$match_op => {"$key.text_folded" => $content}}); + + } else { + + # Use the lowercase normalized keyword index for non-text searches. + push(@$field_nodes, {$match_op => {"$key.lower" => $content}}); + } + } + + $logger->info( + "ES content = ". OpenSRF::Utils::JSON->perl2JSON($content) . + "; bools = ". 
OpenSRF::Utils::JSON->perl2JSON($field_nodes) + ); + + my $query; + if (scalar(@$field_nodes) == 1) { + $query = {bool => {must => $field_nodes}}; + } else { + # Query multiple fields within a search class via OR query. + $query = {bool => {should => $field_nodes}}; + } + + if ($prefix eq '-"') { + # Negation query. Wrap the whole shebang in a must_not + $query = {bool => {must_not => $query}}; + } + + $logger->info("ES sub-query = ". OpenSRF::Utils::JSON->perl2JSON($query)); + + return $query; + } +} + +sub init_query_parser { + my $query = shift; + + my $query_parser = + OpenILS::Application::Storage::Driver::Pg::QueryParser->new( + query => $query + ); + + my %attrs = get_qp_attrs(); + $query_parser->initialize(%attrs); + + return $query_parser; +} + +my %qp_attrs; +sub get_qp_attrs { + return %qp_attrs if %qp_attrs; + + # Fetch and cache the QP configuration attributes + # TODO: call this in service child_init()? + + $logger->debug("ES initializing query parser attributes"); + my $e = new_editor(); + + %qp_attrs = ( + config_record_attr_index_norm_map => + $e->search_config_record_attr_index_norm_map([ + { id => { "!=" => undef } }, + { flesh => 1, flesh_fields => { crainm => [qw/norm/] }, + order_by => [{ class => "crainm", field => "pos" }] } + ]), + search_relevance_adjustment => + $e->retrieve_all_search_relevance_adjustment, + config_metabib_field => + $e->retrieve_all_config_metabib_field, + config_metabib_field_virtual_map => + $e->retrieve_all_config_metabib_field_virtual_map, + config_metabib_search_alias => + $e->retrieve_all_config_metabib_search_alias, + config_metabib_field_index_norm_map => + $e->search_config_metabib_field_index_norm_map([ + { id => { "!=" => undef } }, + { flesh => 1, flesh_fields => { cmfinm => [qw/norm/] }, + order_by => [{ class => "cmfinm", field => "pos" }] } + ]), + config_record_attr_definition => + $e->retrieve_all_config_record_attr_definition + ); + + return %qp_attrs; +} + + +# Format ES search aggregations to match 
the API response facet structure
+# {$cmf_id => {"Value" => $count}, $cmf_id2 => {"Value Two" => $count2}, ...}
+sub format_facets {
+    my $aggregations = shift;
+    my $facets = {};
+
+    for my $fname (keys %$aggregations) {
+
+        my ($field_class, $name) = split(/\|/, $fname);
+
+        my ($bib_field) = grep {
+            $_->name eq $name && $_->search_group eq $field_class
+        } @$bib_fields;
+
+        my $hash = $facets->{$bib_field->metabib_field} = {};
+
+        my $values = $aggregations->{$fname}->{buckets};
+        for my $bucket (@$values) {
+            $hash->{$bucket->{key}} = $bucket->{doc_count};
+        }
+    }
+
+    return $facets;
+}
+
+sub add_elastic_facet_aggregations {
+    my ($elastic_query) = @_;
+
+    my @facet_fields = grep {$_->facet_field eq 't'} @$bib_fields;
+    return unless @facet_fields;
+
+    $elastic_query->{aggs} = {};
+
+    for my $facet (@facet_fields) {
+        my $fname = $facet->name;
+        my $fgrp = $facet->search_group;
+        $fname = "$fgrp|$fname" if $fgrp;
+
+        $elastic_query->{aggs}{$fname} = {terms => {field => $fname}};
+    }
+}
+
+sub add_elastic_holdings_filter {
+    my ($elastic_query, $staff, $shortname, $depth, $available) = @_;
+
+    # in non-staff mode, ensure at least one copy in scope is visible
+    my $visible = !$staff;
+
+    my $org;
+    if ($shortname) {
+
+        if (!$org_data_cache{by_shortname}{$shortname}) {
+            $org_data_cache{by_shortname}{$shortname} =
+                $U->find_org_by_shortname($U->get_org_tree, $shortname);
+        }
+
+        $org = $org_data_cache{by_shortname}{$shortname};
+
+        my $types = $U->get_org_types; # pulls from cache
+        my ($type) = grep {$_->id == $org->ou_type} @$types;
+        $depth = defined $depth ? 
min($depth, $type->depth) : $type->depth; + } + + my $visible_filters = { + query => { + bool => { + must_not => [ + {terms => {'holdings.status' => $hidden_copy_statuses}}, + {terms => {'holdings.location' => $hidden_copy_locations}} + ] + } + } + }; + + my $filter = {nested => {path => 'holdings', query => {bool => {}}}}; + + if ($depth > 0) { + + if (!$org_data_cache{ancestors_at}{$shortname}) { + $org_data_cache{ancestors_at}{$shortname} = {}; + } + + if (!$org_data_cache{ancestors_at}{$shortname}{$depth}) { + $org_data_cache{ancestors_at}{$shortname}{$depth} = + $U->get_org_descendants($org->id, $depth); + } + + my $org_ids = $org_data_cache{ancestors_at}{$shortname}{$depth}; + + # Add a boolean OR-filter on holdings circ lib and optionally + # add a boolean AND-filter on copy status for availability + # checking. + + my $should = []; + $filter->{nested}->{query}->{bool}->{should} = $should; + + for my $org_id (@$org_ids) { + + # Ensure at least one copy exists at the selected org unit + my $and = { + bool => { + must => [ + {term => {'holdings.circ_lib' => $org_id}} + ] + } + }; + + # When limiting to visible/available, ensure at least one of the + # copies from the above org-limited set is visible/available. + if ($available) { + push( + @{$and->{bool}{must}}, + {terms => {'holdings.status' => $avail_copy_statuses}} + ); + + } elsif ($visible) { + push(@{$and->{bool}{must}}, $visible_filters); + } + + push(@$should, $and); + } + + } elsif ($available) { + # Limit to results that have an available copy, but don't worry + # about where the copy lives, since we're searching globally. + + $filter->{nested}->{query}->{bool}->{must} = + [{terms => {'holdings.status' => $avail_copy_statuses}}]; + + } elsif ($visible) { + + $filter->{nested}->{query} = $visible_filters->{query}; + } + + $logger->info("ES holdings filter is " . 
+ OpenSRF::Utils::JSON->perl2JSON($filter)); + + # array of filters in progress + push(@{$elastic_query->{query}->{bool}->{filter}}, $filter); + +} + + + +sub compile_elastic_marc_query { + my ($args, $staff, $offset, $limit) = @_; + + # args->{searches} = + # [{term => "harry", restrict => [{tag => 245, subfield => "a"}]}] + + my $root_and = []; + for my $search (@{$args->{searches}}) { + + # NOTE Assume only one tag/subfield will be queried per search term. + my $tag = $search->{restrict}->[0]->{tag}; + my $sf = $search->{restrict}->[0]->{subfield}; + my $value = $search->{term}; + + # Use text searching on the value field + my $value_query = { + bool => { + should => [ + {match => {'marc.value.text' => + {query => $value, operator => 'and'}}}, + {match => {'marc.value.text_folded' => + {query => $value, operator => 'and'}}} + ] + } + }; + + my $sub_query = { + bool => { + must => [ + {term => {'marc.tag' => $tag}}, + {term => {'marc.subfield.lower' => $sf}}, + $value_query + ] + } + }; + + push (@$root_and, { + nested => { + path => 'marc', + query => {bool => {must => $sub_query}} + } + }); + } + + return { + _source => ['id'], # Fetch bib ID only + size => $limit, + from => $offset, + sort => [], + query => { + bool => { + must => $root_and, + filter => [] + } + } + }; +} + + + +# Translate a MARC search API call into something consumable by Elasticsearch +# Translate search results into a structure consistent with a bib search +# API response. +sub marc_search { + my ($class, $args, $staff, $limit, $offset) = @_; + + return {count => 0} unless $args->{searches} && @{$args->{searches}}; + + my $elastic_query = + compile_elastic_marc_query($args, $staff, $offset, $limit); + + my $es = OpenILS::Elastic::BibMarc->new('main'); + + $es->connect; + my $results = $es->search($elastic_query); + + $logger->debug("ES elasticsearch returned: ". 
+ OpenSRF::Utils::JSON->perl2JSON($results)); + + return {count => 0} unless $results; + + my @bib_ids = map {$_->{_id}} + grep {defined $_} @{$results->{hits}->{hits}}; + + return { + ids => \@bib_ids, + count => $results->{hits}->{total} + }; +} + + + +1; + diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibMarc.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibMarc.pm index bf02b66d92..86bfc9f425 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibMarc.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibMarc.pm @@ -60,9 +60,35 @@ my $BASE_PROPERTIES = { marc => { type => 'nested', properties => { + # tag is assumed to be composed of numbers, so no lowercase. tag => {type => 'keyword'}, - subfield => {type => 'keyword'}, - value => {type => 'text'} + subfield => { + type => 'keyword', + fields => { + lower => { + type => 'keyword', + normalizer => 'custom_lowercase' + } + } + }, + value => { + type => 'keyword', + fields => { + lower => { + type => 'keyword', + normalizer => 'custom_lowercase' + }, + text => { + type => 'text', + analyzer => $LANG_ANALYZER + }, + text_folded => { + type => 'text', + analyzer => 'folding' + } + } + } + } } };