LP1844418 Direct indexing experiment WIP

author Bill Erickson <berickxx@gmail.com>

Tue, 18 Feb 2020 19:16:24 +0000 (14:16 -0500)

committer Bill Erickson <berickxx@gmail.com>

Fri, 21 Feb 2020 21:20:33 +0000 (16:20 -0500)
author Bill Erickson <berickxx@gmail.com>
Tue, 18 Feb 2020 19:16:24 +0000 (14:16 -0500)
committer Bill Erickson <berickxx@gmail.com>
Fri, 21 Feb 2020 21:20:33 +0000 (16:20 -0500)
diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Elastic.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Elastic.pm

index 2579944..2f725e6 100644 (file)
--- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Elastic.pm
+++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Elastic.pm
@@ -315,7 +315,7 @@ sub add_elastic_facet_aggregations {
          my $fgrp = $facet->search_group;
          $fname = "$fgrp|$fname" if $fgrp;
  
-        $elastic_query->{aggs}{$fname} = {terms => {field => "$fname.facet"}};
+        $elastic_query->{aggs}{$fname} = {terms => {field => "$fname|facet"}};
      }
  }
  
diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm

index 8ae28ff..bb8067a 100644 (file)
--- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm
+++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm
@@ -31,6 +31,12 @@ use base qw/OpenILS::Elastic/;
  my $DEFAULT_BIB_BATCH_SIZE = 500;
  my $INDEX_CLASS = 'bib-search';
  
+# https://www.elastic.co/guide/en/elasticsearch/reference/current/ignore-above.html
+# Useful for ignoring excessively long filters, sorters, and facets.
+# Only applied to the keyword variation of each index.  Does not affect
+# the 'text' varieties.
+my $IGNORE_ABOVE = 256;
+
  my $BASE_INDEX_SETTINGS = {
      analysis => {
          analyzer => {
@@ -121,7 +127,7 @@ my $BASE_PROPERTIES = {
      # searched via 'text' indexes.
      title => {
          type => 'keyword',
-        ignore_above => 256,
+        ignore_above => $IGNORE_ABOVE,
          normalizer => 'custom_lowercase',
          fields => {
              text => {type => 'text'},
@@ -132,7 +138,7 @@ my $BASE_PROPERTIES = {
      },
      author => {
          type => 'keyword',
-        ignore_above => 256,
+        ignore_above => $IGNORE_ABOVE,
          normalizer => 'custom_lowercase',
          fields => {
              text => {type => 'text'},
@@ -143,7 +149,7 @@ my $BASE_PROPERTIES = {
      },
      subject => {
          type => 'keyword',
-        ignore_above => 256,
+        ignore_above => $IGNORE_ABOVE,
          normalizer => 'custom_lowercase',
          fields => {
              text => {type => 'text'},
@@ -154,7 +160,7 @@ my $BASE_PROPERTIES = {
      },
      series => {
          type => 'keyword',
-        ignore_above => 256,
+        ignore_above => $IGNORE_ABOVE,
          normalizer => 'custom_lowercase',
          fields => {
              text => {type => 'text'},
@@ -168,7 +174,7 @@ my $BASE_PROPERTIES = {
          # keyword field, but we index it just the same (sans lowercase) 
          # for structural consistency with other group fields.
          type => 'keyword',
-        ignore_above => 256,
+        ignore_above => $IGNORE_ABOVE,
          fields => {
              text => {type => 'text'},
              text_folded => {type => 'text', analyzer => 'folding'},
@@ -179,7 +185,7 @@ my $BASE_PROPERTIES = {
      identifier => {
          # Avoid full-text indexing on identifier fields.
          type => 'keyword',
-        ignore_above => 256,
+        ignore_above => $IGNORE_ABOVE,
          normalizer => 'custom_lowercase',
      },
  
@@ -191,7 +197,7 @@ my $BASE_PROPERTIES = {
      kw => {type => 'text'},
      id => {
          type => 'keyword',
-        ignore_above => 256
+        ignore_above => $IGNORE_ABOVE
      }
  };
  
@@ -269,31 +275,41 @@ sub create_index_properties {
              }
  
          } else {
-
-            # Non-grouped fields are used for filtering and sorting, so
-            # they don't need as much processing.
+            # Filters and sorters
  
              $def = {
                  type => 'keyword',
-                ignore_above => 256,
+                ignore_above => $IGNORE_ABOVE,
                  normalizer => 'custom_lowercase'
              };
          }
  
+        if ($def) {
+            $logger->debug("ES adding field $field_name: ". 
+                OpenSRF::Utils::JSON->perl2JSON($def));
+    
+            $properties->{$field_name} = $def;
+        }
+
+        # Search and facet fields can have the same name/group pair,
+        # but are stored as separate fields in ES since the content
+        # may vary between the two.
          if ($field->facet_field eq 't') {
-            $def->{fields} = {} unless $def->{fields}; # facet only?
-            # Facet fields are used for aggregation which requires
-            # an additional unaltered keyword field.
-            $def->{fields}->{facet} = {
+
+            # Facet fields are stored as separate fields, because their
+            # content may differ from the matching search field.
+            $field_name = "$field_name|facet";
+
+            $def = {
                  type => 'keyword',
-                ignore_above => 256
+                ignore_above => $IGNORE_ABOVE
              };
-        }
  
-        $logger->debug("ES adding field $field_name: ". 
-            OpenSRF::Utils::JSON->perl2JSON($def));
+            $logger->debug("ES adding field $field_name: ". 
+                OpenSRF::Utils::JSON->perl2JSON($def));
  
-        $properties->{$field_name} = $def;
+            $properties->{$field_name} = $def;
+        }
      }
  
      return $properties;
@@ -365,6 +381,9 @@ sub create_index {
      return 1;
  }
  
+# TODO: elastic.bib_record_properties needs to also pull values
+# from metabib.facet_entry
+# TODO: stamp each field with a 'purpose' (search, facet, filter, sorter)
  sub get_bib_data {
      my ($self, $record_ids) = @_;
  
@@ -452,6 +471,7 @@ sub populate_bib_index_batch {
              next unless defined $value && $value ne '';
  
              $fname = "$fclass|$fname" if $fclass;
+            $fname = "$fname|facet" if $field->{purpose} eq 'facet';
              $value = $self->truncate_value($value);
  
              if ($fname eq 'identifier|isbn') {
diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch/XSLT.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch/XSLT.pm

index fda4682..ea94130 100644 (file)
--- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch/XSLT.pm
+++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch/XSLT.pm
@@ -31,6 +31,10 @@ sub sorter {
      return $self->{sorter} ? 't' : 'f';
  }
  
+sub weight {    # Accessor: this field's weight, or 1 when the weight is unset or false.
+    return $_[0]->{weight} || 1;    # $_[0] is the invocant; no $self is declared in this sub.
+}
+
  package OpenILS::Elastic::BibSearch::XSLT;
  use strict;
  use warnings;
@@ -53,7 +57,7 @@ sub xsl_doc {
      my ($self) = @_;
  
      $self->{xsl_doc} = XML::LibXML->load_xml(location => $self->xsl_file);
-        unless ($self->{xsl_doc});
+        unless $self->{xsl_doc};
  
      return $self->{xsl_doc};
  }
@@ -83,10 +87,6 @@ sub add_dynamic_field {
      push(@$fields, $field);
  }
  
-# TODO: what to do about fields that have the same class/name
-# and are both search and facet fields, but the facet values
-# are different than the searched value?
-
  sub get_dynamic_fields {
      my $self = shift;
      my $fields = [];
@@ -136,18 +136,42 @@ sub get_bib_data {
          my $result = $self->xsl_sheet->transform($marc_doc);
          my $output = $stylesheet->output_as_chars($result);
  
-        my @fields = split(/\n/, $output);
-        for my $field (@fields) {
-            my @parts = split(/ /, $field);
-            my $field_type = $parts[0];
+        my @rows = split(/\n/, $output);
+        my $first = 1;
+        for my $row (@rows) {
+            my @parts = split(/ /, $row);
+            my $purpose = $parts[0];
+
+            my $field = {purpose => $purpose};
+
+            if ($first) {
+                # Stamp the first field with the additional bib metadata.
+                $field->{$_} = $db_rec->{$_} for 
+                    qw/id bib_source metarecord create_date edit_date/;
+                $first = 0;
+            }
+
+            if ($purpose eq 'search') {
+                $field->{search_group} = @parts[1];
+                $field->{name} = @parts[2];
+                $field->{weight} = @parts[3];
+                $field->{value} = join(' ', @parts[4..$#parts]);
+
+            } elsif ($purpose eq 'facet') {
+                $field->{search_group} = @parts[1];
+                $field->{name} = @parts[2];
+                $field->{value} = join(' ', @parts[3..$#parts]);
  
-            if ($field_type eq 'search') {
-            } elsif ($field_type eq 'facet') {
-            } elsif ($field_type eq 'filter') {
-            } elsif ($field_type eq 'sorter') {
+            } elsif ($purpose eq 'filter' || $purpose eq 'sorter') {
+                $field->{name} = @parts[1];
+                $field->{value} = join(' ', @parts[2..$#parts]);
              }
          }
+
+        push(@$bib_data, $field);
      }
+
+    return $bib_data;
  }
  
  sub get_bib_db_data {
author	Bill Erickson <berickxx@gmail.com>
	Tue, 18 Feb 2020 19:16:24 +0000 (14:16 -0500)
committer	Bill Erickson <berickxx@gmail.com>
	Fri, 21 Feb 2020 21:20:33 +0000 (16:20 -0500)
Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Elastic.pm		patch \| blob \| history
Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm		patch \| blob \| history
Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch/XSLT.pm		patch \| blob \| history