From c13779398e9d283574f7397968fc71a14d68fae5 Mon Sep 17 00:00:00 2001 From: Bill Erickson Date: Tue, 18 Feb 2020 14:16:24 -0500 Subject: [PATCH] LP1844418 Direct indexing experiment WIP Signed-off-by: Bill Erickson --- .../lib/OpenILS/Application/Search/Elastic.pm | 2 +- .../src/perlmods/lib/OpenILS/Elastic/BibSearch.pm | 60 ++++++++++++++-------- .../perlmods/lib/OpenILS/Elastic/BibSearch/XSLT.pm | 50 +++++++++++++----- 3 files changed, 78 insertions(+), 34 deletions(-) diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Elastic.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Elastic.pm index 25799443c1..2f725e6006 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Elastic.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Elastic.pm @@ -315,7 +315,7 @@ sub add_elastic_facet_aggregations { my $fgrp = $facet->search_group; $fname = "$fgrp|$fname" if $fgrp; - $elastic_query->{aggs}{$fname} = {terms => {field => "$fname.facet"}}; + $elastic_query->{aggs}{$fname} = {terms => {field => "$fname|facet"}}; } } diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm index 8ae28ffbd0..bb8067a1d4 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm @@ -31,6 +31,12 @@ use base qw/OpenILS::Elastic/; my $DEFAULT_BIB_BATCH_SIZE = 500; my $INDEX_CLASS = 'bib-search'; +# https://www.elastic.co/guide/en/elasticsearch/reference/current/ignore-above.html +# Useful for ignoring excessively long filters, sorters, and facets. +# Only applied to the keyword variation of each index. Does not affect +# the 'text' varieties. +my $IGNORE_ABOVE = 256; + my $BASE_INDEX_SETTINGS = { analysis => { analyzer => { @@ -121,7 +127,7 @@ my $BASE_PROPERTIES = { # searched via 'text' indexes. title => { type => 'keyword', - ignore_above => 256, + ignore_above => $IGNORE_ABOVE, normalizer => 'custom_lowercase', fields => { text => {type => 'text'}, @@ -132,7 +138,7 @@ my $BASE_PROPERTIES = { }, author => { type => 'keyword', - ignore_above => 256, + ignore_above => $IGNORE_ABOVE, normalizer => 'custom_lowercase', fields => { text => {type => 'text'}, @@ -143,7 +149,7 @@ my $BASE_PROPERTIES = { }, subject => { type => 'keyword', - ignore_above => 256, + ignore_above => $IGNORE_ABOVE, normalizer => 'custom_lowercase', fields => { text => {type => 'text'}, @@ -154,7 +160,7 @@ my $BASE_PROPERTIES = { }, series => { type => 'keyword', - ignore_above => 256, + ignore_above => $IGNORE_ABOVE, normalizer => 'custom_lowercase', fields => { text => {type => 'text'}, @@ -168,7 +174,7 @@ my $BASE_PROPERTIES = { # keyword field, but we index it just the same (sans lowercase) # for structural consistency with other group fields. type => 'keyword', - ignore_above => 256, + ignore_above => $IGNORE_ABOVE, fields => { text => {type => 'text'}, text_folded => {type => 'text', analyzer => 'folding'}, @@ -179,7 +185,7 @@ my $BASE_PROPERTIES = { identifier => { # Avoid full-text indexing on identifier fields. type => 'keyword', - ignore_above => 256, + ignore_above => $IGNORE_ABOVE, normalizer => 'custom_lowercase', }, @@ -191,7 +197,7 @@ my $BASE_PROPERTIES = { kw => {type => 'text'}, id => { type => 'keyword', - ignore_above => 256 + ignore_above => $IGNORE_ABOVE } }; @@ -269,31 +275,41 @@ sub create_index_properties { } } else { - - # Non-grouped fields are used for filtering and sorting, so - # they don't need as much processing. + # Filters and sorters $def = { type => 'keyword', - ignore_above => 256, + ignore_above => $IGNORE_ABOVE, normalizer => 'custom_lowercase' }; } + if ($def) { + $logger->debug("ES adding field $field_name: ". + OpenSRF::Utils::JSON->perl2JSON($def)); + + $properties->{$field_name} = $def; + } + + # Search and facet fields can have the same name/group pair, + # but are stored as separate fields in ES since the content + # may vary between the two. if ($field->facet_field eq 't') { - $def->{fields} = {} unless $def->{fields}; # facet only? - # Facet fields are used for aggregation which requires - # an additional unaltered keyword field. - $def->{fields}->{facet} = { + + # Facet fields are stored as separate fields, because their + # content may differ from the matching search field. + $field_name = "$field_name|facet"; + + $def = { type => 'keyword', - ignore_above => 256 + ignore_above => $IGNORE_ABOVE }; - } - $logger->debug("ES adding field $field_name: ". - OpenSRF::Utils::JSON->perl2JSON($def)); + $logger->debug("ES adding field $field_name: ". + OpenSRF::Utils::JSON->perl2JSON($def)); - $properties->{$field_name} = $def; + $properties->{$field_name} = $def; + } } return $properties; @@ -365,6 +381,9 @@ sub create_index { return 1; } +# TODO: elastic.bib_record_properties needs to also pull values +# from metabib.facet_entry +# TODO: stamp each field with a 'purpose' (search, facet, filter, sorter) sub get_bib_data { my ($self, $record_ids) = @_; @@ -452,6 +471,7 @@ sub populate_bib_index_batch { next unless defined $value && $value ne ''; $fname = "$fclass|$fname" if $fclass; + $fname = "$fname|facet" if $field->{purpose} eq 'facet'; $value = $self->truncate_value($value); if ($fname eq 'identifier|isbn') { diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch/XSLT.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch/XSLT.pm index fda4682257..ea941304d3 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch/XSLT.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch/XSLT.pm @@ -31,6 +31,10 @@ sub sorter { return $self->{sorter} ? 't' : 'f'; } +sub weight { + return $self->{weight} || 1; +} + package OpenILS::Elastic::BibSearch::XSLT; use strict; use warnings; @@ -53,7 +57,7 @@ sub xsl_doc { my ($self) = @_; $self->{xsl_doc} = XML::LibXML->load_xml(location => $self->xsl_file); - unless ($self->{xsl_doc}); + unless $self->{xsl_doc}; return $self->{xsl_doc}; } @@ -83,10 +87,6 @@ sub add_dynamic_field { push(@$fields, $field); } -# TODO: what to do about fields that have the same class/name -# and are both search and facet fields, but the facet values -# are different than the searched value? - sub get_dynamic_fields { my $self = shift; my $fields = []; @@ -136,18 +136,42 @@ sub get_bib_data { my $result = $self->xsl_sheet->transform($marc_doc); my $output = $stylesheet->output_as_chars($result); - my @fields = split(/\n/, $output); - for my $field (@fields) { - my @parts = split(/ /, $field); - my $field_type = $parts[0]; + my @rows = split(/\n/, $output); + my $first = 1; + for my $row (@rows) { + my @parts = split(/ /, $row); + my $purpose = $parts[0]; + + my $field = {purpose => $purpose}; + + if ($first) { + # Stamp the first field with the additional bib metadata. + $field->{$_} = $db_rec->{$_} for + qw/id bib_source metarecord create_date edit_date/; + $first = 0; + } + + if ($purpose eq 'search') { + $field->{search_group} = @parts[1]; + $field->{name} = @parts[2]; + $field->{weight} = @parts[3]; + $field->{value} = join(' ', @parts[4..$#parts]); + + } elsif ($purpose eq 'facet') { + $field->{search_group} = @parts[1]; + $field->{name} = @parts[2]; + $field->{value} = join(' ', @parts[3..$#parts]); - if ($field_type eq 'search') { - } elsif ($field_type eq 'facet') { - } elsif ($field_type eq 'filter') { - } elsif ($field_type eq 'sorter') { + } elsif ($purpose eq 'filter' || $purpose eq 'sorter') { + $field->{name} = @parts[1]; + $field->{value} = join(' ', @parts[2..$#parts]); } } + + push(@$bib_data, $field); } + + return $bib_data; } sub get_bib_db_data { -- 2.11.0