From 73dc4cd71361481c70030242bba63e09d8696ac7 Mon Sep 17 00:00:00 2001 From: Bill Erickson Date: Thu, 5 Sep 2019 12:42:18 -0400 Subject: [PATCH] initial support multiple lang analyzers; multi_match queries Signed-off-by: Bill Erickson --- .../OpenILS/Application/Search/ElasticMapper.pm | 70 ++++++++++------------ Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm | 5 ++ .../src/perlmods/lib/OpenILS/Elastic/Bib/Search.pm | 54 +++++++++-------- .../support-scripts/test-scripts/elastic-search.pl | 4 +- 4 files changed, 65 insertions(+), 68 deletions(-) diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/ElasticMapper.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/ElasticMapper.pm index 1bfedebc79..1d1dcdc3de 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/ElasticMapper.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/ElasticMapper.pm @@ -286,67 +286,59 @@ sub translate_query_node { my $last_char = substr($content, -1, 1); my $prefix = $children->[0]->{prefix}; - my $match_op = 'match'; + my $match_type = 'most_fields'; # "Contains Phrase" - $match_op = 'match_phrase' if $prefix eq '"'; + $match_type = 'phrase' if $prefix eq '"'; - # Should we use the .raw keyword field? - my $text_search = 1; + my @field_nodes; # Matchiness specificiers embedded in the content override # the query node prefix. if ($first_char eq '^') { - $text_search = 0; $content = substr($content, 1); if ($last_char eq '$') { # "Matches Exactly" - $match_op = 'term'; + $match_type = undef; $content = substr($content, 0, -1); + for my $field (@fields) { + my $key = "$field_class|$field"; + # Use the lowercase normalized keyword index for + # exact match searches. + push(@field_nodes, {term => {"$key.lower" => $content}}); + } + } else { # "Starts With" - $match_op = 'match_phrase_prefix'; + $match_type = 'phrase_prefix'; } } - # for match queries, treat multi-word search as AND searches - # instead of the default ES OR searches. - $content = {query => $content, operator => 'and'} - if $match_op eq 'match'; - - my $field_nodes = []; - for my $field (@fields) { - my $key = "$field_class|$field"; + if ($match_type) { - if ($text_search) { - # use the full-text indices - - push(@$field_nodes, - {$match_op => {"$key.text" => $content}}); - - push(@$field_nodes, - {$match_op => {"$key.text_folded" => $content}}); - - } else { - - # Use the lowercase normalized keyword index for non-text searches. - push(@$field_nodes, {$match_op => {"$key.lower" => $content}}); - } + push(@field_nodes, { + multi_match => { + query => $content, + operator => 'and', + fields => ["$field_class|*.text*"], + type => $match_type + } + }); } $logger->info( "ES content = ". OpenSRF::Utils::JSON->perl2JSON($content) . - "; bools = ". OpenSRF::Utils::JSON->perl2JSON($field_nodes) + "; bools = ". OpenSRF::Utils::JSON->perl2JSON(\@field_nodes) ); my $query; - if (scalar(@$field_nodes) == 1) { - $query = {bool => {must => $field_nodes}}; + if (scalar(@field_nodes) == 1) { + $query = {bool => {must => \@field_nodes}}; } else { # Query multiple fields within a search class via OR query. - $query = {bool => {should => $field_nodes}}; + $query = {bool => {should => \@field_nodes}}; } if ($prefix eq '-"') { @@ -573,13 +565,11 @@ sub compile_elastic_marc_query { # Use text searching on the value field my $value_query = { - bool => { - should => [ - {match => {'marc.value.text' => - {query => $value, operator => 'and'}}}, - {match => {'marc.value.text_folded' => - {query => $value, operator => 'and'}}} - ] + multi_match => { + query => $value, + fields => ['marc.value*'], + type => 'most_fields', + operator => 'and' } }; diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm index 181d6c0212..b4d8f732e7 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm @@ -60,6 +60,11 @@ sub index_name { die "Index name must be provided by sub-class\n"; } +sub language_analyzers { + # Override in subclass as needed + return ("english"); +} + # Provide a direct DB connection so some high-volume activities, # like indexing bib records, can take advantage of a direct connection. # Returns database connection object -- connects if necessary. diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib/Search.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib/Search.pm index 55da7d620e..7c5d92c48b 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib/Search.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib/Search.pm @@ -30,9 +30,6 @@ my $INDEX_NAME = 'bib-search'; # number of bibs to index per batch. my $BIB_BATCH_SIZE = 500; -# TODO: it's possible to apply multiple language analyzers. -my $LANG_ANALYZER = 'english'; - my $BASE_INDEX_SETTINGS = { analysis => { analyzer => { @@ -71,44 +68,36 @@ my $BASE_PROPERTIES = { marc => { type => 'nested', properties => { - # tag is assumed to be composed of numbers, so no lowercase. - tag => {type => 'keyword'}, + tag => { + type => 'keyword', + normalizer => 'custom_lowercase' + }, subfield => { type => 'keyword', - fields => { - lower => { - type => 'keyword', - normalizer => 'custom_lowercase' - } - } + normalizer => 'custom_lowercase' }, value => { - type => 'keyword', + type => 'text', fields => { - lower => { - type => 'keyword', - normalizer => 'custom_lowercase' - }, - text => { - type => 'text', - analyzer => $LANG_ANALYZER - }, text_folded => { type => 'text', analyzer => 'folding' } } } - } } - }; sub index_name { return $INDEX_NAME; } +# TODO: add index-specific language analyzers to DB config +sub language_analyzers { + return ("english"); +} + sub create_index { my ($self) = @_; @@ -122,6 +111,14 @@ sub create_index { my $mappings = $BASE_PROPERTIES; + # Add the language analyzers to the MARC mappings + for my $lang_analyzer ($self->language_analyzers) { + $mappings->{marc}->{properties}->{value}->{fields}->{"text_$lang_analyzer"} = { + type => 'text', + analyzer => $lang_analyzer + }; + } + my $fields = new_editor()->retrieve_all_elastic_bib_field(); for my $field (@$fields) { @@ -147,15 +144,20 @@ sub create_index { # Search fields also get full text indexing and analysis # plus a "folded" variation for ascii folded searches. - $def->{fields}->{text} = { - type => 'text', - analyzer => $LANG_ANALYZER - }; + $def->{fields}->{text} = {type => 'text'}; $def->{fields}->{text_folded} = { type => 'text', analyzer => 'folding' }; + + # Add the language analyzers + for my $lang_analyzer ($self->language_analyzers) { + $def->{fields}->{"text_$lang_analyzer"} = { + type => 'text', + analyzer => $lang_analyzer + }; + } } # Apply field boost. diff --git a/Open-ILS/src/support-scripts/test-scripts/elastic-search.pl b/Open-ILS/src/support-scripts/test-scripts/elastic-search.pl index 6562051d7b..9cfdcf9dc8 100755 --- a/Open-ILS/src/support-scripts/test-scripts/elastic-search.pl +++ b/Open-ILS/src/support-scripts/test-scripts/elastic-search.pl @@ -41,7 +41,7 @@ Fieldmapper->import( IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); OpenILS::Utils::CStoreEditor::init(); -# Title search AND author search AND MARC tag=100 search +# Title search AND subject search AND MARC tag=100 search my $query = { _source => ['id', 'title|proper'] , # return only the ID field from => 0, @@ -74,7 +74,7 @@ my $query = { must => [{ multi_match => { query => 'cline', - fields => ['marc.value.text*'], + fields => ['marc.value*'], operator => 'and', type => 'most_fields' } -- 2.11.0