From 811f5024e6c7579fcaa502400fc6dd1ab52252d0 Mon Sep 17 00:00:00 2001 From: Bill Erickson Date: Wed, 27 Oct 2021 12:28:16 -0400 Subject: [PATCH] LP1844418 ES rebase circ 3.8 plus cross-porting Signed-off-by: Bill Erickson --- Open-ILS/examples/elastic/README.adoc | 14 + Open-ILS/examples/elastic/bib-233-marc.xml | 443 +++++++ Open-ILS/examples/elastic/bib-248-marc.xml | 87 ++ .../examples/elastic/elastic-config.example.xml | 63 + .../lib/OpenILS/Application/Search/Elastic.pm | 33 +- Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm | 384 +++++- .../src/perlmods/lib/OpenILS/Elastic/BibSearch.pm | 736 +++++++---- .../sql/Pg/upgrade/XXXX.schema.elastic-search.sql | 281 +--- Open-ILS/src/support-scripts/elastic-index.pl | 178 ++- .../test-scripts/elastic-search-samples.pl | 18 +- .../support-scripts/test-scripts/elastic-search.pl | 47 +- Open-ILS/xsl/elastic-bib-transform.xsl | 1339 ++++++++++++++++++++ 12 files changed, 3054 insertions(+), 569 deletions(-) create mode 100644 Open-ILS/examples/elastic/README.adoc create mode 100644 Open-ILS/examples/elastic/bib-233-marc.xml create mode 100644 Open-ILS/examples/elastic/bib-248-marc.xml create mode 100644 Open-ILS/examples/elastic/elastic-config.example.xml create mode 100644 Open-ILS/xsl/elastic-bib-transform.xsl diff --git a/Open-ILS/examples/elastic/README.adoc b/Open-ILS/examples/elastic/README.adoc new file mode 100644 index 0000000000..1f46c9318e --- /dev/null +++ b/Open-ILS/examples/elastic/README.adoc @@ -0,0 +1,14 @@ += Elasticsearch Examples + +== Bib Transform Testing + +[source,sh] +---------------------------------------------------------------------------- +sudo apt install xsltproc + +xsltproc ../../xsl/elastic-bib-transform.xsl bib-248-marc.xml + +xsltproc ../../xsl/elastic-bib-transform.xsl bib-233-marc.xml +---------------------------------------------------------------------------- + + diff --git a/Open-ILS/examples/elastic/bib-233-marc.xml b/Open-ILS/examples/elastic/bib-233-marc.xml new file mode 100644 
index 0000000000..0aad4bd77e --- /dev/null +++ b/Open-ILS/examples/elastic/bib-233-marc.xml @@ -0,0 +1,443 @@ + + + 07649cim a2200913 i 4500 + 233 + CONS + 20140128084328.0 + 140128s2013 nyuopk|zqdefhi n | ita d + + 2013565186 + + + 9781480328532 + + + 1480328537 + + + 884088883249 + + + HL50498721 + Hal Leonard + (bk.) + + + HL50490487 + Hal Leonard + (cd.) + + + HL50486260 + Hal Leonard + (cd.) + + + 63011108 + Hal Leonard + (diction coach 1) + + + 63011109 + Hal Leonard + (diction coach 2) + + + 63014792 + Hal Leonard + (CD 1) + + + 63014793 + Hal Leonard + (CD 2) + + + (OCoLC)ocn826076986 + + + (OCoLC)826076986 + + + YDXCP + eng + rda + YDXCP + CLE + NUI + MYG + DLC + + + ita + ger + fre + eng + ita + ger + fre + eng + eng + + + lccopycat + + + vf01 + ka01 + + + M1507.A+ + + + Pickins, Slim + More Stuff + + + The Arias for bass : + complete package : with diction coach and accompaniment CDs / + compiled and edited by Robert L. Larsen. + + + New York, NY : + G. Schirmer, Inc., + 2013. + + + Milwaukee, WI : + Distributed by Hal Leonard Corporation + + + 1 score (263 pages) ; + 31 cm + + 4 sound discs (digital ; 4 3/4 in.) + + + notated music + ntm + rdacontent + + + performed music + prm + rdacontent + + + unmediated + n + rdamedia + + + audio + s + rdamedia + + + volume + nc + rdacarrier + + + audio disc + sd + rdacarrier + + + G. Schirmer opera anthology + + + staff notation + + + Italian, French, German, and English words; non-English texts also printed with English translations. + + + Opera arias; acc. arr. for piano. + + + William Billingham, pianist on CDs. + + + disc 1-2 diction coach -- disc 3-4 accompaniment CDs. + + + Il barbiere di Siviglia. La calunnia / Gioachino Rossini -- La Bohè̀me. Vecchia zimarra, senti / Giacomo Puccini -- La Cenerentola. Miei rampolli femminini / Gioachino Rossini -- Don Giovanni. Madamina! Il catalogo è questo / Wolfgang Amadeus Mozart -- Don Pasquale. Ah! Un foco insolito / Gaetano Donizetti -- Die Entführung aus dem Serail. 
O, wie will ich triumphiren / Wolfgang Amadeus Mozart -- Ernani. Infelice! E tuo credevi / Giuseppe Verdi -- Eugene Onegin. Gremin's aria / Pyotr Il'yich Tchaikovsky -- Faust. Le veau d'or ; Vous qui faites l'endormie / Charles Gounod -- Der Freischütz. Schweig'! Schweig'! Damit dich niemand warnt / Carl Maria von Weber -- Les huguenots. Pour le couvents c'est fini (Piff, paff) / Giacomo Meyerbeer -- La jolie fille de Perth. Quand la flamme de l'amour / Georges Bizet -- Lucia di Lammermoor. Dalle stanze ove Lucia / Gaetano Donizetti -- Die lustigen Weiber von Windsor. Als Büblein klein / Otto Nicolai -- Macbeth. Come dal ciel precipita / Giuseppe Verdi -- Manon. Épouse quelque brave fille / Jules Massenet -- The mother of us all. What what is it / Virgil Thomson -- Le nozze di Figaro. La vendetta ; Se vuol ballare ; Non più andrai ; Aprite un po' quegl'occhi / Wolfgang Amadeus Mozart -- Simon Boccanegra. Il lacerato spirito / Giuseppe Verdi -- La sonnambula. Vi ravviso / Vincenzo Bellini -- Street scene. Let things be like they always was / Kurt Weill -- I vespri siciliani. O tu, Palermo / Giuseppe Verdi -- Die Zauberflöte. O Isis und Osiris ; In diesen heil'gen Hallen / Wolfgang Amadeus Mozart. + + + Operas + Excerpts + Vocal scores with piano. + + + Recorded accompaniments (Low voice) + + + Larsen, Robert L., + 1934- + editor, + compiler. + + + Billingham, William, + performer. + + + Contains (expression): + Rossini, Gioacchino, + 1792-1868. + Barbiere di Siviglia. + Calunnia è un venticello. + Vocal score. + + + Contains (expression): + Puccini, Giacomo, + 1858-1924. + Bohème. + Vecchia zimarra. + Vocal score. + + + Contains (expression): + Rossini, Gioacchino, + 1792-1868. + Cenerentola. + Miei rampolli femminini. + Vocal score. + + + Contains (expression): + Mozart, Wolfgang Amadeus, + 1756-1791. + Don Giovanni. + Madamina, il catalogo è questo. + Vocal score. + + + Contains (expression): + Donizetti, Gaetano, + 1797-1848. + Don Pasquale. + Foco insolito. 
+ Vocal score. + + + Contains (expression): + Mozart, Wolfgang Amadeus, + 1756-1791. + Entführung aus dem Serail. + Ha! wie will ich triumphieren. + Vocal score. + + + Contains (expression): + Verdi, Giuseppe, + 1813-1901. + Ernani. + Infelice! e tu credevi. + Vocal score. + + + Contains (expression): + Tchaikovsky, Peter Ilich, + 1840-1893. + Evgeniĭ Onegin. + Arii︠a︡ kni︠a︡zi︠a︡. + Vocal score. + + + Contains (expression): + Gounod, Charles, + 1818-1893. + Faust. + Veau d'or est toujours debout. + Vocal score. + + + Contains (expression): + Gounod, Charles, + 1818-1893. + Faust. + Vous qui faites l'endormie. + Vocal score. + + + Contains (expression): + Weber, Carl Maria von, + 1786-1826. + Freischütz. + Schweig', schweig'! damit dich niemand warnt. + Vocal score. + + + Contains (expression): + Meyerbeer, Giacomo, + 1791-1864. + Huguenots. + Piff, paff. + Vocal score. + + + Contains (expression): + Bizet, Georges, + 1838-1875. + Jolie fille de Perth. + Quand la flamme de l'amour. + Vocal score. + + + Contains (expression): + Donizetti, Gaetano, + 1797-1848. + Lucia di Lammermoor. + Dalle stanze ove Lucia. + Vocal score. + + + Contains (expression): + Nicolai, Otto, + 1810-1849. + Lustigen Weiber von Windsor. + Als Büblein klein. + Vocal score. + + + Contains (expression): + Verdi, Giuseppe, + 1813-1901. + Macbeth. + Come dal ciel precipita. + Vocal score. + + + Contains (expression): + Massenet, Jules, + 1842-1912. + Manon. + Épouse quelque brave fille. + Vocal score. + + + Contains (expression): + Thomson, Virgil, + 1896-1989. + Mother of us all. + What what is it. + Vocal score. + + + Contains (expression): + Mozart, Wolfgang Amadeus, + 1756-1791. + Nozze di Figaro. + Vendetta. + Vocal score. + + + Contains (expression): + Mozart, Wolfgang Amadeus, + 1756-1791. + Nozze di Figaro. + Se vuol ballare, signor contino. + Vocal score. + + + Contains (expression): + Mozart, Wolfgang Amadeus, + 1756-1791. + Nozze di Figaro. + Non più andrai farfallone. + Vocal score. 
+ + + Contains (expression): + Mozart, Wolfgang Amadeus, + 1756-1791. + Nozze di Figaro. + Aprite un po' quegl' occhi. + Vocal score. + + + Contains (expression): + Verdi, Giuseppe, + 1813-1901. + Simon Boccanegra. + Lacerato spirito. + Vocal score. + + + Contains (expression): + Bellini, Vincenzo, + 1801-1835. + Sonnambula. + Vi ravviso, o luoghi ameni. + Vocal score. + + + Contains (expression): + Weill, Kurt, + 1900-1950. + Street scene. + Let things be like they always was. + Vocal score. + + + Contains (expression): + Verdi, Giuseppe, + 1813-1901. + Vêpres siciliennes. + Et toi Palerme. + Italian. + Vocal score. + + + Contains (expression): + Mozart, Wolfgang Amadeus, + 1756-1791. + Zauberflöte. + O Isis und Osiris (Aria and chorus) + O Isis und Osiris. + Vocal score. + + + Contains (expression): + Mozart, Wolfgang Amadeus, + 1756-1791. + Zauberflöte. + In diesen heil'gen Hallen. + Vocal score. + + + G. Schirmer opera anthology. + + + 0 + par + copycat + 2 + ncip + 20 + y-genmusic + + + acquire + 2 shelf copies + policy default + + + vl34 2014-01-28 z-client + vl34 2014-01-28 + vl34 2014-01-28 4 sound disc to MBRS for shelf label + vl34 2014-01-28 copy 2, 4 sound disc to MBRS for shelf label + + + 233 + AUTOGEN + 233 + biblio + + + v + + + diff --git a/Open-ILS/examples/elastic/bib-248-marc.xml b/Open-ILS/examples/elastic/bib-248-marc.xml new file mode 100644 index 0000000000..a0efb653b3 --- /dev/null +++ b/Open-ILS/examples/elastic/bib-248-marc.xml @@ -0,0 +1,87 @@ + + + 00975pam a2200337 a 4500 + 248 + CONS + 20110823130500.0 + 110422s2011 nyu 000 1 eng + + 2011015247 + + + 9780307887436 : + $24.00 + + + 030788743X : + $24.00 + + + (DLC) 2011015247 + + + DLC + DLC + NjBwBT + GCmBT + + + pcc + + + PS3603.L548 + R43 2011 + + + 813/.6 + 22 + + + Cline, Ernest. + + + Ready player one / + Ernest Cline. + + + 1st ed. + + + New York : + Crown Publishers, + c2011. + + + 374 p. ; + 25 cm. + + + Regression (Civilization) + Fiction. + + + Virtual reality + Fiction. 
+ + + Utopias + Fiction. + + + Puzzles + Fiction. + + + Fantasy fiction. + gsafd + + + 1 + + + 248 + AUTOGEN + 248 + biblio + + diff --git a/Open-ILS/examples/elastic/elastic-config.example.xml b/Open-ILS/examples/elastic/elastic-config.example.xml new file mode 100644 index 0000000000..84fbe985b2 --- /dev/null +++ b/Open-ILS/examples/elastic/elastic-config.example.xml @@ -0,0 +1,63 @@ + + + + + + + /openils/var/xsl/elastic-bib-transform.xsl + 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Elastic.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Elastic.pm index 3fde569d09..a137db96b3 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Elastic.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Elastic.pm @@ -19,7 +19,6 @@ use warnings; use OpenSRF::Utils::JSON; use OpenSRF::Utils::Logger qw/:logger/; use OpenILS::Utils::Fieldmapper; -use OpenSRF::Utils::SettingsClient; use OpenILS::Utils::CStoreEditor q/:funcs/; use OpenILS::Elastic::BibSearch; use Digest::MD5 qw(md5_hex); @@ -47,11 +46,15 @@ sub init { return if $init_done; $init_done = 1; - my $e = new_editor(); + # NOTE: after things stabilize and maybe load balancing, etc. is + # tested and working, we could maintain a global $es so the + # connection is cached instead of reconnecting on every search call. + my $es = OpenILS::Elastic::BibSearch->new; + $es->connect; - # no pkey - $bib_fields = $e->search_elastic_bib_field({name => {'!=' => undef}}); + $bib_fields = $es->bib_fields; + my $e = new_editor(); my $stats = $e->json_query({ select => {ccs => ['id', 'opac_visible', 'is_available']}, from => 'ccs', @@ -99,9 +102,6 @@ __PACKAGE__->register_method( Org unit based item presence and availability filtering may optionally be added to the query. See search options below. 
- - See [ select * from elastic.bib_field where search_field; ] - for full-text search fields and classes. /, params => [ { type => 'object', @@ -208,9 +208,12 @@ sub bib_search { $elastic_query->{size} = 1000; } - my $es = OpenILS::Elastic::BibSearch->new('main'); - + # NOTE: after things stabilize and maybe load balancing, etc. is + # tested and working, we could maintain a global $es so the + # connection is cached instead of reconnecting on every search call. + my $es = OpenILS::Elastic::BibSearch->new; $es->connect; + my $results = $es->search($elastic_query); $logger->debug("ES elasticsearch returned: ". @@ -278,20 +281,20 @@ sub compile_elastic_query { } # Format ES search aggregations to match the API response facet structure -# {$cmf_id => {"Value" => $count}, $cmf_id2 => {"Value Two" => $count2}, ...} +# {$field_id => {"Value" => $count}, $field_id2 => {"Value Two" => $count2}, ...} sub format_facets { my $aggregations = shift; my $facets = {}; for my $fname (keys %$aggregations) { - my ($field_class, $name) = split(/\|/, $fname); + my ($search_group, $name) = split(/\|/, $fname); my ($bib_field) = grep { - $_->name eq $name && $_->search_group eq $field_class + $_->name eq $name && $_->search_group eq $search_group } @$bib_fields; - my $hash = $facets->{$bib_field->metabib_field} = {}; + my $hash = $facets->{$bib_field->id} = {}; my $values = $aggregations->{$fname}->{buckets}; for my $bucket (@$values) { @@ -305,7 +308,7 @@ sub format_facets { sub add_elastic_facet_aggregations { my ($elastic_query) = @_; - my @facet_fields = grep {$_->facet_field eq 't'} @$bib_fields; + my @facet_fields = grep {$_->facet_field} @$bib_fields; return unless @facet_fields; $elastic_query->{aggs} = {}; @@ -315,7 +318,7 @@ sub add_elastic_facet_aggregations { my $fgrp = $facet->search_group; $fname = "$fgrp|$fname" if $fgrp; - $elastic_query->{aggs}{$fname} = {terms => {field => "$fname.facet"}}; + $elastic_query->{aggs}{$fname} = {terms => {field => "$fname|facet"}}; } } 
diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm index 6ad35962ca..381a85d7e8 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm @@ -17,21 +17,25 @@ use strict; use warnings; use DBI; use Time::HiRes qw/time/; -use OpenSRF::Utils::Logger qw/:logger/; -use OpenILS::Utils::CStoreEditor qw/:funcs/; +use XML::LibXML; +use XML::LibXML::XPathContext; use Search::Elasticsearch; use OpenSRF::Utils::JSON; +use OpenSRF::Utils::Logger qw/:logger/; +use OpenILS::Utils::CStoreEditor qw/:funcs/; +use OpenILS::Utils::Fieldmapper; use Data::Dumper; $Data::Dumper::Indent = 0; +# For parsing the Elasticsearch configuration file +my $ES_NAMESPACE = 'http://evergreen-ils.org/spec/elasticsearch/v1'; + sub new { - my ($class, $cluster) = @_; + my ($class, %args) = @_; - my $self = { - cluster => $cluster, - indices => [], - marc_fields => [] - }; + my $self = {%args}; + + $self->{cluster} = 'main' unless $args{cluster}; return bless($self, $class); } @@ -48,7 +52,23 @@ sub nodes { sub indices { my $self = shift; - return $self->{indices}; + return $self->{indices} if $self->{indices}; + + my $def; + eval { + # All open indices + $def = $self->es->indices->get( + index => $self->index_class . '-*', + expand_wildcards => 'open' + ); + }; + + if ($@) { + $logger->error("ES index lookup failed: $@"); + return {}; + } + + return $self->{indices} = $def; } sub es { @@ -57,7 +77,34 @@ sub es { } sub index_name { - die "Index name must be provided by sub-class\n"; + my ($self) = @_; + return $self->{index_name}; +} + +my $xpc; +sub xpath_context { + if (!$xpc) { + $xpc = XML::LibXML::XPathContext->new; + $xpc->registerNs('es', $ES_NAMESPACE); + } + return $xpc; +} + +# In maintenance mode we are working with specific indexes. +# Otherwise all actions target the index alias which is index_class. +sub index_target { + my ($self) = @_; + return $self->maintenance_mode ? 
$self->index_name : $self->index_class; +} + +sub index_class { + die "index_class() should be implemented by sub-classes\n"; +} + +# Are we modifying indexes or just read/writing indexed data? +sub maintenance_mode { + my $self = shift; + return $self->{maintenance_mode}; } sub language_analyzers { @@ -80,9 +127,9 @@ sub db { my $db_pass = $self->{db_pass}; my $db_appn = $self->{db_appn} || 'Elastic Indexer'; - # TODO Add application_name to dsn + my $dsn = + "dbi:Pg:db=$db_name;host=$db_host;port=$db_port;application_name='$db_appn';"; - my $dsn = "dbi:Pg:db=$db_name;host=$db_host;port=$db_port"; $logger->debug("ES connecting to DB $dsn"); $self->{db} = DBI->connect( @@ -106,47 +153,212 @@ sub get_db_rows { # load the config via cstore. sub load_config { - my $self = shift; + my ($self) = @_; + my $e = new_editor(); my $cluster = $self->cluster; - $self->{nodes} = $e->search_elastic_node({cluster => $cluster, active => 't'}); + my @nodes = $self->{nodes} ? @{$self->{nodes}} : (); + + if (@nodes) { + + $logger->info("ES overriding nodes with @nodes"); + $self->{nodes} = \@nodes; + + } else { + + my %active = $self->maintenance_mode ? 
() : (active => 't'); + my $nodes = $e->search_elastic_node({cluster => $cluster, %active}); + + $self->{nodes} = [ + map { + sprintf("%s://%s:%d%s", $_->proto, $_->host, $_->port, $_->path) + } @$nodes + ]; + } unless (@{$self->nodes}) { $logger->error("ES no nodes defined for cluster $cluster"); return; } - $self->{indices} = $e->search_elastic_index({cluster => $cluster}); - - unless (@{$self->indices}) { - $logger->error("ES no indices defined for cluster $cluster"); + if (!$self->index_class) { + $logger->error("ES index_class required to initialize"); return; } } -sub connect { +sub load_es_config { my ($self) = @_; - $self->load_config; - my @nodes; - for my $server (@{$self->nodes}) { - push(@nodes, { - scheme => $server->proto, - host => $server->host, - port => $server->port, - path => $server->path - }); + my $cluster = $self->cluster; + + if (!$self->indices || !keys(%{$self->indices})) { + $logger->info("ES no usable indices defined for cluster $cluster"); + return unless $self->maintenance_mode; } - $logger->debug("ES connecting to ".scalar(@nodes)." 
nodes");
+    if (!$self->index_name) {
+        # Default to the index that has an alias matching our index_class
+
+        for my $name (keys %{$self->indices}) {
+            if ($self->index_is_active($name)) {
+                $logger->info("ES defaulting to active index $name");
+                $self->{index_name} = $name;
+            }
+        }
+    }
+
+    # Load the main ES config file
+
+    # TODO: 'dirs' option for 'conf'
+    #my $client = OpenSRF::Utils::SettingsClient->new;
+    #my $dir = $client->config_value("dirs", "conf");
 
-    eval { $self->{es} = Search::Elasticsearch->new(nodes => \@nodes) };
+    my $doc;
+    my $filename = $self->{es_config_file}
+        || '/openils/conf/elastic-config.xml';
+
+    eval { $doc = XML::LibXML->load_xml(location => $filename) };
+
+    if ($@ || !$doc) {
+        my $msg = "ES could not parse elastic config file: $filename $@";
+        $logger->error($msg);
+        die "$msg\n";
+    }
+
+    $self->{es_config} = $doc->documentElement;
+}
+
+sub es_config {
+    my $self = shift;
+    return $self->{es_config};
+}
+
+sub active_index {
+    my $self = shift;
+    my $indices = $self->indices;
+    for my $name (keys %{$indices}) {
+        return $name if $self->index_is_active($name);
+    }
+    return undef;
+}
+
+# True if the named index has an alias matching our index class
+sub index_is_active {
+    my ($self, $name) = @_;
+
+    my $conf = $self->indices->{$name};
+    return 0 unless $conf;
+
+    my @aliases = keys %{$conf->{aliases}};
+    return 1 if grep {$_ eq $self->index_class} @aliases;
+
+    return 0;
+}
+
+# Returns the first es:index config node whose class matches index_class(); undef if no config is loaded, dies if none match.
+sub index_config {
+    my $self = shift;
+    my $class = $self->index_class;
+
+    if (!$self->es_config) {
+        $logger->error("ES cannot load index config without a config file");
+        return undef;
+    }
+
+    my @conf;
+    eval {
+        @conf = $self->xpath_context->findnodes( # accessor creates $xpc on first use; bare $xpc may still be undef
+            "//es:elasticsearch/es:index[\@class='$class']",
+            $self->es_config
+        );
+    };
+
+    if ($@ || !@conf) {
+        my $msg = "ES failed to locate config for index class '$class' $@";
+        $logger->error($msg);
+        die "$msg\n";
+    }
+
+    return $conf[0];
+}
+
+sub connect {
+    my ($self) = @_;
+
+    $self->load_config;
+
+    my @nodes =
@{$self->nodes}; + $logger->info("ES connecting to nodes: @nodes"); + + eval { + $self->{es} = Search::Elasticsearch->new( + client => '6_0::Direct', + nodes => \@nodes + ); + }; if ($@) { $logger->error("ES failed to connect to @nodes: $@"); return; } + + $self->load_es_config; +} + +# Activates the currently loaded index while deactivating any active +# index with the same cluster and index_class. +# Applies an alias to the activated index equal to the index class. +sub activate_index { + my ($self) = @_; + + my $index = $self->index_name; + + if (!$self->es->indices->exists(index => $index)) { + $logger->warn("ES cannot activate index '$index' which does not exist"); + return; + } + + my $from_index = $self->active_index; + + # When activating an index, point the main alias toward the + # newly active index. + return $self->migrate_alias($self->index_class, $from_index, $index); +} + + +# Migrate an alias from one index to another. +# If either from_index or to_index are not defined, then only half +# of the migration (i.e. remove or add) is performed. +sub migrate_alias { + my ($self, $alias, $from_index, $to_index) = @_; + return undef unless $alias && ($from_index || $to_index); + + my @actions; + + $from_index ||= ''; + $to_index ||= ''; + $logger->info("ES migrating alias [$alias] from $from_index to $to_index"); + + if ($from_index) { + push(@actions, {remove => {alias => $alias, index => $from_index}}); + } + + if ($to_index) { + push(@actions, {add => {alias => $alias, index => $to_index}}); + } + + eval { + $self->es->indices->update_aliases({body => {actions => \@actions}}); + }; + + if ($@) { + $logger->error("ES alias migration [$alias] failed $@"); + return undef; + } + + return 1; } sub delete_index { @@ -163,6 +375,10 @@ sub delete_index { $logger->warn("ES index '$index' ". "does not exist in cluster '".$self->cluster."'"); } + + delete $self->indices->{$index}; + + return 1; } # Remove multiple documents from the index by ID. 
@@ -176,7 +392,7 @@ sub delete_documents {
 
     eval {
         $result = $self->es->delete_by_query(
-            index => $self->index_name,
+            index => $self->index_target,
             type => 'record',
             body => {query => {terms => {_id => $ids}}}
         );
@@ -191,6 +407,30 @@ sub delete_documents {
     return $result;
 }
 
+# Returns 1 if a document with the requested ID exists, 0 if not, undef if the lookup failed.
+sub document_exists {
+    my ($self, $id) = @_;
+
+    my $result;
+
+    eval {
+        $result = $self->es->exists(
+            index => $self->index_target,
+            type => 'record',
+            id => $id,
+        );
+    };
+
+    # A thrown error means the lookup itself failed, which is distinct from "not found".
+    if ($@) {
+        $logger->error("ES document_exists failed with $@");
+        return undef;
+    }
+
+    return $result ? 1 : 0;
+}
+
+# Create or replace a document.
 sub index_document {
     my ($self, $id, $body) = @_;
 
@@ -198,7 +438,7 @@ sub index_document {
 
     eval {
         $result = $self->es->index(
-            index => $self->index_name,
+            index => $self->index_target,
             type => 'record',
             id => $id,
             body => $body
@@ -219,6 +459,67 @@ sub index_document {
     return $result;
 }
 
+# Index a new document
+# This will fail if the document already exists.
+sub create_document {
+    my ($self, $id, $body) = @_;
+
+    my $result;
+
+    eval {
+        $result = $self->es->create(
+            index => $self->index_target,
+            type => 'record',
+            id => $id,
+            body => $body
+        );
+    };
+
+    if ($@) {
+        $logger->error("ES create_document failed with $@");
+        return undef;
+    }
+
+    if ($result->{failed}) {
+        $logger->error("ES create document $id failed " . Dumper($result));
+        return undef;
+    }
+
+    $logger->debug("ES create => $id succeeded");
+    return $result;
+}
+
+
+# Partial document update
+# This will fail if the document does not exist.
+sub update_document {
+    my ($self, $id, $body) = @_;
+
+    my $result;
+
+    eval {
+        $result = $self->es->update(
+            index => $self->index_target,
+            type => 'record',
+            id => $id,
+            body => {doc => $body}
+        );
+    };
+
+    if ($@) {
+        $logger->error("ES update_document failed with $@");
+        return undef;
+    }
+
+    if ($result->{failed}) {
+        $logger->error("ES update document $id failed " .
Dumper($result));
+        return undef;
+    }
+
+    $logger->debug("ES update => $id succeeded");
+    return $result;
+}
+
 sub search {
     my ($self, $query) = @_;
 
@@ -230,7 +531,7 @@ sub search {
     eval {
         my $start_time = time;
         $result = $self->es->search(
-            index => $self->index_name,
+            index => $self->index_target,
             body => $query
         );
         $duration = time - $start_time;
@@ -254,13 +555,25 @@ sub search {
 # Avoid trying to index such data by lazily chopping it off
 # at 1/4 the limit to accomodate all UTF-8 chars.
+# $length caps the number of characters kept; 8190 is used when it is omitted (or false).
 sub truncate_value {
-    my ($self, $value) = @_;
-    return substr($value, 0, 8190);
+    my ($self, $value, $length) = @_;
+    $length = 8190 unless $length;
+    return substr($value, 0, $length);
 }
 
 sub get_index_def {
-    my ($self) = @_;
-    return $self->es->indices->get(index => $self->index_name);
+    my ($self, $name) = @_;
+    $name ||= $self->index_name;
+
+    my $def;
+    eval { $def = $self->es->indices->get(index => $name) };
+
+    if ($@) {
+        $logger->error("ES cannot find index def for $name");
+        return undef;
+    }
+
+    return $def;
 }
 
 
diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm
index 1d5340d343..d412e75ef8 100644
--- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm
+++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm
@@ -1,6 +1,5 @@
-package OpenILS::Elastic::BibSearch;
 # ---------------------------------------------------------------
-# Copyright (C) 2019 King County Library System
+# Copyright (C) 2019-2020 King County Library System
 # Author: Bill Erickson
 #
 # This program is free software; you can redistribute it and/or
@@ -13,31 +12,97 @@ package OpenILS::Elastic::BibSearch;
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR code. See the
 # GNU General Public License for more details.
# --------------------------------------------------------------- +package OpenILS::Elastic::BibField; +use strict; +use warnings; + +sub new { + my ($class, %args) = @_; + my $self = {%args}; + return bless($self, $class); +} +sub id { + my $self = shift; + return $self->search_group ? + $self->search_group . '|' . $self->name : $self->name; +} +sub search_group { + my $self = shift; + return $self->{search_group}; +} +sub name { + my $self = shift; + return $self->{name}; +} +sub search_field { + my $self = shift; + return $self->{search_field}; +} +sub facet_field { + my $self = shift; + return $self->{facet_field}; +} +sub weight { + my $self = shift; + return $self->{weight}; +} +sub filter { + my $self = shift; + return $self->{filter}; +} +sub sorter { + my $self = shift; + return $self->{sorter}; +} + +package OpenILS::Elastic::BibSearch; use strict; use warnings; -use Encode; use DateTime; use Clone 'clone'; -use Business::ISBN; -use Business::ISSN; use Time::HiRes qw/time/; +use XML::LibXML; +use XML::LibXML::XPathContext; use OpenSRF::Utils::Logger qw/:logger/; use OpenSRF::Utils::JSON; +use OpenSRF::Utils::SettingsClient; use OpenILS::Utils::CStoreEditor qw/:funcs/; use OpenILS::Utils::DateTime qw/interval_to_seconds/; use OpenILS::Elastic; +use OpenILS::Utils::Normalize; use base qw/OpenILS::Elastic/; # default number of bibs to index per batch. my $DEFAULT_BIB_BATCH_SIZE = 500; +my $INDEX_CLASS = 'bib-search'; -my $INDEX_NAME = 'bib-search'; +# https://www.elastic.co/guide/en/elasticsearch/reference/current/ignore-above.html +# Useful for ignoring excessively long filters and facets. +# Only applied to the keyword variation of each index. Does not affect +# the 'text' varieties. The selected limit is arbitrary. +my $IGNORE_ABOVE = 256; + +# Individual characters of some values like sorters provide less and less +# value as the length of the text gets longer and longer. 
Unlike +# $IGNORE_ABOVE, this only trims the string, it does not prevent it from +# getting indexed in the first place. The selected limit is arbitrary. +my $TRIM_ABOVE = 512; my $BASE_INDEX_SETTINGS = { analysis => { analyzer => { folding => { - filter => ['lowercase', 'asciifolding'], + filter => ['asciifolding', 'lowercase'], + tokenizer => 'standard' + }, + icu_folding => { + filter => ['icu_folding', 'lowercase'], + tokenizer => 'icu_tokenizer' + }, + stripapos => { + # "It's A Wonderful Live" => "Its A ..." + char_filter => ['stripapos'], + filter => ['lowercase'], tokenizer => 'standard' }, stripdots => { @@ -60,6 +125,10 @@ my $BASE_INDEX_SETTINGS = { } }, char_filter => { + stripapos => { + type => 'mapping', + mappings => ['\' =>'] + }, stripdots => { type => 'mapping', mappings => ['. =>'] @@ -72,7 +141,7 @@ my $BASE_INDEX_SETTINGS = { } }; -# Well-known bib-search index properties +# Well-known bib-search index properties my $BASE_PROPERTIES = { bib_source => {type => 'integer'}, create_date => {type => 'date'}, @@ -103,12 +172,13 @@ my $BASE_PROPERTIES = { normalizer => 'custom_lowercase' }, value => { - type => 'text', + type => 'keyword', + ignore_above => $IGNORE_ABOVE, + normalizer => 'custom_lowercase', fields => { - text_folded => { - type => 'text', - analyzer => 'folding' - } + text => {type => 'text'}, + text_folded => {type => 'text', analyzer => 'folding'}, + text_icu_folded => {type => 'text', analyzer => 'icu_folding'} } } } @@ -118,103 +188,232 @@ my $BASE_PROPERTIES = { # Values from grouped fields are copied into the group field. # Here we make some assumptions about the general purpose of # each group. + # The 'keyword' variation of each is used for exact matches, + # starts with, and similar searches. # Note the ignore_above only affects the 'keyword' version of the # field, the assumption being text that large would solely be # searched via 'text' indexes. 
title => { type => 'keyword', - ignore_above => 256, + ignore_above => $IGNORE_ABOVE, normalizer => 'custom_lowercase', fields => { text => {type => 'text'}, text_folded => {type => 'text', analyzer => 'folding'}, + text_icu_folded => {type => 'text', analyzer => 'icu_folding'}, text_spacedots => {type => 'text', analyzer => 'spacedots'}, - text_stripdots => {type => 'text', analyzer => 'stripdots'} + text_stripdots => {type => 'text', analyzer => 'stripdots'}, + text_stripapos => {type => 'text', analyzer => 'stripapos'} } }, author => { type => 'keyword', - ignore_above => 256, + ignore_above => $IGNORE_ABOVE, normalizer => 'custom_lowercase', fields => { text => {type => 'text'}, text_folded => {type => 'text', analyzer => 'folding'}, + text_icu_folded => {type => 'text', analyzer => 'icu_folding'}, text_spacedots => {type => 'text', analyzer => 'spacedots'}, - text_stripdots => {type => 'text', analyzer => 'stripdots'} + text_stripdots => {type => 'text', analyzer => 'stripdots'}, + text_stripapos => {type => 'text', analyzer => 'stripapos'} } }, subject => { type => 'keyword', - ignore_above => 256, + ignore_above => $IGNORE_ABOVE, normalizer => 'custom_lowercase', fields => { text => {type => 'text'}, text_folded => {type => 'text', analyzer => 'folding'}, + text_icu_folded => {type => 'text', analyzer => 'icu_folding'}, text_spacedots => {type => 'text', analyzer => 'spacedots'}, text_stripdots => {type => 'text', analyzer => 'stripdots'} } }, series => { type => 'keyword', - ignore_above => 256, + ignore_above => $IGNORE_ABOVE, normalizer => 'custom_lowercase', fields => { text => {type => 'text'}, text_folded => {type => 'text', analyzer => 'folding'}, + text_icu_folded => {type => 'text', analyzer => 'icu_folding'}, text_spacedots => {type => 'text', analyzer => 'spacedots'}, - text_stripdots => {type => 'text', analyzer => 'stripdots'} + text_stripdots => {type => 'text', analyzer => 'stripdots'}, + text_stripapos => {type => 'text', analyzer => 
'stripapos'} } }, keyword => { # term (aka "keyword") searches are not used on the - # keyword field, but we index it just the same (sans lowercase) - # for structural consistency with other group fields. + # keyword field, but we structure the index just the same + # for consistency with other group fields. type => 'keyword', - ignore_above => 256, + ignore_above => 1, # essentially a no-op. fields => { text => {type => 'text'}, text_folded => {type => 'text', analyzer => 'folding'}, + text_icu_folded => {type => 'text', analyzer => 'icu_folding'}, text_spacedots => {type => 'text', analyzer => 'spacedots'}, - text_stripdots => {type => 'text', analyzer => 'stripdots'} + text_stripdots => {type => 'text', analyzer => 'stripdots'}, + text_stripapos => {type => 'text', analyzer => 'stripapos'} } }, + # Identifier fields only support 'keyword' indexes, no full-text. identifier => { - # Avoid full-text indexing on identifier fields. type => 'keyword', - ignore_above => 256, + ignore_above => $IGNORE_ABOVE, normalizer => 'custom_lowercase', - }, - - # Create some shortcut indexes for streamlining query_string searches. - ti => {type => 'text'}, - au => {type => 'text'}, - se => {type => 'text'}, - su => {type => 'text'}, - kw => {type => 'text'}, - id => { - type => 'keyword', - ignore_above => 256 } }; -my %SHORT_GROUP_MAP = ( - title => 'ti', - author => 'au', - subject => 'su', - series => 'se', - keyword => 'kw', - identifier => 'id' +# Map 'au' to 'author', etc. +my %SEARCH_CLASS_ALIAS_MAP = ( + ti => 'title.text', + au => 'author.text', + su => 'subject.text', + se => 'series.text', + kw => 'keyword.text', + pb => 'keyword|publisher.text', + id => 'identifier' ); -sub index_name { - return $INDEX_NAME; +sub index_class { + return $INDEX_CLASS; } -# TODO: add index-specific language analyzers to DB config +# TODO: determine when/how to apply language analyzers. +# e.g. create lang-specific index fields? 
sub language_analyzers { return ("english"); } +sub skip_holdings { + my $self = shift; + return $self->{skip_holdings}; +} + +sub bib_fields { + my $self = shift; + return $self->{bib_fields} if $self->{bib_fields}; + + my @bib_fields = $self->xpath_context->findnodes( + '//es:fields/es:field', $self->index_config); + + my @fields; + for my $field (@bib_fields) { + + my %struct; + + for my $key (qw/search_group name/) { + $struct{$key} = $field->getAttribute($key) || ''; + } + + for my $key (qw/search_field facet_field filter sorter/) { + $struct{$key} = ($field->getAttribute($key) || '') eq 'true'; + } + + push (@fields, OpenILS::Elastic::BibField->new(%struct)); + } + + return $self->{bib_fields} = \@fields; +} + +sub xsl_file { + my ($self) = @_; + + if (!$self->{xsl_file}) { + my @nodes = $self->xpath_context->findnodes( + '//es:transform/text()', $self->index_config); + $self->{xsl_file} = $nodes[0]; + } + + return $self->{xsl_file}; +} + +sub xsl_doc { + my ($self) = @_; + + $self->{xsl_doc} = XML::LibXML->load_xml(location => $self->xsl_file) + unless $self->{xsl_doc}; + + return $self->{xsl_doc}; +} + +sub xsl_sheet { + my $self = shift; + + $self->{xsl_sheet} = XML::LibXSLT->new->parse_stylesheet($self->xsl_doc) + unless $self->{xsl_sheet}; + + return $self->{xsl_sheet}; +} + +sub get_bib_data { + my ($self, $record_ids) = @_; + + my $records = []; + my $db_data = $self->get_bib_db_data($record_ids); + + for my $db_rec (@$db_data) { + + my $rec = {fields => []}; + push(@$records, $rec); + + # Copy DB data into our record object. 
+ $rec->{$_} = $db_rec->{$_} for + qw/id bib_source metarecord create_date edit_date deleted/; + + # No need to extract index values for delete records; + next if $rec->{deleted} == 1; + + my $marc_doc = XML::LibXML->load_xml(string => $db_rec->{marc}); + my $result = $self->xsl_sheet->transform($marc_doc); + my $output = $self->xsl_sheet->output_as_chars($result); + + my @rows = split(/\n/, $output); + for my $row (@rows) { + my ($purpose, $search_group, $name, @tokens) = split(/ /, $row); + + $search_group = '' if ($search_group || '') eq '_'; + + my $value = join(' ', @tokens); + + my $field = { + purpose => $purpose, + search_group => $search_group, + name => $name, + value => $value + }; + + push(@{$rec->{fields}}, $field); + } + } + + return $records; +} + +sub get_bib_db_data { + my ($self, $record_ids) = @_; + + my $ids_str = join(',', @$record_ids); + + my $sql = <get_db_rows($sql); +} + sub create_index_properties { my ($self) = @_; @@ -236,10 +435,8 @@ sub create_index_properties { } foreach qw/title subject series keyword/; } - # elastic.bib_field has no primary key field, so retrieve_all won't work. - # Note the name value may be repeated across search group depending - # on local configuration. - my $fields = new_editor()->search_elastic_bib_field({name => {'!=' => undef}}); + # field_group will be undef for main/active fields + my $fields = $self->bib_fields; for my $field (@$fields) { @@ -250,42 +447,60 @@ sub create_index_properties { my $def; if ($search_group) { + if ($field->search_field) { - # Use the same fields and analysis as the 'grouped' field. - $def = clone($properties->{$search_group}); - $def->{copy_to} = [$search_group, $SHORT_GROUP_MAP{$search_group}]; + # Use the same fields and analysis as the 'grouped' field. + $def = clone($properties->{$search_group}); - # Apply ranking boost to each analysis variation. 
- my $flds = $def->{fields}; - if ($flds && (my $boost = ($field->weight || 1)) > 1) { - $flds->{$_}->{boost} = $boost foreach keys %$flds; + # Copy grouped fields into their group parent field. + $def->{copy_to} = $search_group; + + # Apply ranking boost to each analysis variation. + my $flds = $def->{fields}; + if ($flds && (my $boost = ($field->weight || 1)) > 1) { + $flds->{$_}->{boost} = $boost foreach keys %$flds; + } } } else { - - # Non-grouped fields are used for filtering and sorting, so - # they don't need as much processing. + # Filters and sorters $def = { type => 'keyword', - ignore_above => 256, normalizer => 'custom_lowercase' }; + + # Long sorter values are not necessarily unexpected, + # e.g. long titles. + $def->{ignore_above} = $IGNORE_ABOVE unless $field->sorter; + } + + if ($def) { + $logger->debug("ES adding field $field_name: ". + OpenSRF::Utils::JSON->perl2JSON($def)); + + $properties->{$field_name} = $def; } - if ($field->facet_field eq 't' && $def->{fields}) { - # Facet fields are used for aggregation which requires - # an additional unaltered keyword field. - $def->{fields}->{facet} = { + # Search and facet fields can have the same name/group pair, + # but are stored as separate fields in ES since the content + # may vary between the two. + if ($field->facet_field) { + + # Facet fields are stored as separate fields, because their + # content may differ from the matching search field. + $field_name = "$field_name|facet"; + + $def = { type => 'keyword', - ignore_above => 256 + ignore_above => $IGNORE_ABOVE }; - } - $logger->debug("ES adding field $field_name: ". - OpenSRF::Utils::JSON->perl2JSON($def)); + $logger->debug("ES adding field $field_name: ". 
+ OpenSRF::Utils::JSON->perl2JSON($def)); - $properties->{$field_name} = $def; + $properties->{$field_name} = $def; + } } return $properties; @@ -293,86 +508,114 @@ sub create_index_properties { sub create_index { my ($self) = @_; + my $index_name = $self->index_name; - if ($self->es->indices->exists(index => $INDEX_NAME)) { - $logger->warn("ES index '$INDEX_NAME' already exists"); + if ($self->es->indices->exists(index => $index_name)) { + $logger->warn("ES index '$index_name' already exists in ES"); return; } $logger->info( - "ES creating index '$INDEX_NAME' on cluster '".$self->cluster."'"); + "ES creating index '$index_name' on cluster '".$self->cluster."'"); my $properties = $self->create_index_properties; my $settings = $BASE_INDEX_SETTINGS; - $settings->{number_of_replicas} = scalar(@{$self->nodes}); - $settings->{number_of_shards} = $self->index->num_shards; + $settings->{number_of_shards} = 1; # TODO $index_config->num_shards; my $conf = { - index => $INDEX_NAME, + index => $index_name, body => {settings => $settings} }; - $logger->info("ES creating index '$INDEX_NAME'"); + $logger->info("ES creating index '$index_name'"); # Create the base index with settings eval { $self->es->indices->create($conf) }; if ($@) { - $logger->error("ES failed to create index cluster=". - $self->cluster. "index=$INDEX_NAME error=$@"); - warn "$@\n\n"; - return 0; + my $msg = "ES failed to create index cluster=". + $self->cluster. "index=$index_name error=$@"; + + $logger->error($msg); + die "$msg\n"; } # Create each mapping one at a time instead of en masse so we # can more easily report when mapping creation fails. 
- for my $field (keys %$properties) { - $logger->info("ES Creating index mapping for field $field"); - - eval { - $self->es->indices->put_mapping({ - index => $INDEX_NAME, - type => 'record', - body => {dynamic => 'strict', properties => {$field => $properties->{$field}}} - }); - }; + return 0 unless + $self->create_one_field_index($field, $properties->{$field}); + } + + # Now that we've added the configured fields, + # add the shortened search_group aliases. + while (my ($alias, $field) = each %SEARCH_CLASS_ALIAS_MAP) { - if ($@) { - my $mapjson = OpenSRF::Utils::JSON->perl2JSON($properties->{$field}); + return 0 unless $self->create_one_field_index( + $alias, {type => 'alias', path => $field}); + } - $logger->error("ES failed to create index mapping: " . - "index=$INDEX_NAME field=$field error=$@ mapping=$mapjson"); + return 1; +} - warn "$@\n\n"; - return 0; - } +sub create_one_field_index { + my ($self, $field, $properties) = @_; + + my $index_name = $self->index_name; + + $logger->info("ES Creating index mapping for field $field"); + + eval { + $self->es->indices->put_mapping({ + index => $index_name, + type => 'record', + body => { + dynamic => 'strict', + properties => {$field => $properties} + } + }); + }; + + if ($@) { + my $mapjson = OpenSRF::Utils::JSON->perl2JSON($properties); + + $logger->error("ES failed to create index mapping: " . 
+ "index=$index_name field=$field error=$@ mapping=$mapjson"); + + warn "$@\n\n"; + return 0; } return 1; } -sub get_bib_data { - my ($self, $record_ids) = @_; - my $ids_str = join(',', @$record_ids); +sub get_bib_field_for_data { + my ($self, $field) = @_; - my $sql = <name eq $field->{name}} @{$self->bib_fields}; - return $self->get_db_rows($sql); + @matches = grep { + (($_->search_group || '') eq ($field->{search_group} || '')) + } @matches; + + my ($match) = grep { + ($_->search_field && $field->{purpose} eq 'search') || + ($_->facet_field && $field->{purpose} eq 'facet') || + ($_->filter && $field->{purpose} eq 'filter') || + ($_->sorter && $field->{purpose} eq 'sorter') + } @matches; + + if (!$match) { + # Warning on mismatched fields can lead to a lot of logs + # while trying different field configs. Consider a + # 'warn-on-field-mismatch' flag. + $logger->debug("ES No bib field matches extracted data ". + OpenSRF::Utils::JSON->perl2JSON($field)); + } + + return $match; } sub populate_bib_index_batch { @@ -385,7 +628,7 @@ sub populate_bib_index_batch { $logger->info("ES indexing ".scalar(@$bib_ids)." records"); - my $bib_data = $self->get_bib_data($bib_ids); + my $records = $self->get_bib_data($bib_ids); # Remove records that are marked deleted. # This should only happen when running in refresh mode. @@ -394,9 +637,9 @@ sub populate_bib_index_batch { for my $bib_id (@$bib_ids) { # Every row in the result data contains the 'deleted' value. 
- my ($field) = grep {$_->{id} == $bib_id} @$bib_data; + my ($rec) = grep {$_->{id} == $bib_id} @$records; - if ($field->{deleted} == 1) { # not 't' / 'f' + if ($rec->{deleted} == 1) { # not 't' / 'f' $self->delete_documents($bib_id); } else { push(@active_ids, $bib_id); @@ -405,64 +648,123 @@ sub populate_bib_index_batch { $bib_ids = [@active_ids]; - my $holdings = $self->load_holdings($bib_ids); - my $marc = $self->load_marc($bib_ids); + return 0 unless @$bib_ids; + + my $holdings = $self->load_holdings($bib_ids) unless $self->skip_holdings; for my $bib_id (@$bib_ids) { + my ($rec) = grep {$_->{id} == $bib_id} @$records; my $body = { - marc => $marc->{$bib_id} || [], - holdings => $holdings->{$bib_id} || [] + bib_source => $rec->{bib_source}, + metarecord => $rec->{metarecord}, + marc => [] }; - # there are multiple rows per bib in the data list. - my @fields = grep {$_->{id} == $bib_id} @$bib_data; + $body->{holdings} = $holdings->{$bib_id} || [] unless $self->skip_holdings; - my $first = 1; - for my $field (@fields) { - - if ($first) { - $first = 0; - # some values are repeated per field. - # extract them from the first entry. - $body->{bib_source} = $field->{bib_source}; - $body->{metarecord} = $field->{metarecord}; - - # ES ikes the "T" separator for ISO dates - ($body->{create_date} = $field->{create_date}) =~ s/ /T/g; - ($body->{edit_date} = $field->{edit_date}) =~ s/ /T/g; - } + # ES likes the "T" separator for ISO dates + ($body->{create_date} = $rec->{create_date}) =~ s/ /T/g; + ($body->{edit_date} = $rec->{edit_date}) =~ s/ /T/g; + for my $field (@{$rec->{fields}}) { + my $purpose = $field->{purpose}; my $fclass = $field->{search_group}; my $fname = $field->{name}; my $value = $field->{value}; next unless defined $value && $value ne ''; + my $trim = $purpose eq 'sorter' ? 
$TRIM_ABOVE : undef; + $value = $self->truncate_value($value, $trim); + + if ($purpose eq 'marc') { + # NOTE: we could create/require elastic.bib_field entries for + # MARC values as well if we wanted to control the exact + # MARC data that's indexed. + $self->add_marc_value($body, $fclass, $fname, $value); + next; + } + + # Ignore any data provided by the transform we have + # no configuration for. + next unless $self->get_bib_field_for_data($field); + $fname = "$fclass|$fname" if $fclass; - $value = $self->truncate_value($value); + $fname = "$fname|facet" if $purpose eq 'facet'; if ($fname eq 'identifier|isbn') { index_isbns($body, $value); + } elsif ($fname eq 'identifier|issn') { index_issns($body, $value); + + } elsif ($fname eq 'pubdate') { + index_pubdate($body, $value); + + } elsif ($fname =~ /sort/) { + index_sorter($body, $fname, $value); + } else { append_field_value($body, $fname, $value); } } - return 0 unless $self->index_document($bib_id, $body); + if ($self->skip_holdings) { + # Skip-Holdings mode performs an update for existing + # documents, so the attached holdings will remain, but + # performs a create for documents that don't yet exist. + if ($self->document_exists($bib_id)) { + return 0 unless $self->update_document($bib_id, $body); + } else { + return 0 unless $self->create_document($bib_id, $body); + } + } else { + return 0 unless $self->index_document($bib_id, $body); + } $state->{start_record} = $bib_id + 1; $index_count++; } - $logger->info("ES indexing completed for records " . - $bib_ids->[0] . '...' . $bib_ids->[-1]); + $self->{total_indexed} += $index_count; + + $logger->info(sprintf( + "ES indexed %d records in this batch across records %d ... %d ". 
+ "with a session total of %d", + $index_count, $bib_ids->[0], $bib_ids->[-1], $self->{total_indexed})); + + my $batch_size = $state->{batch_size} || $DEFAULT_BIB_BATCH_SIZE; return $index_count; } +sub index_sorter { + my ($body, $fname, $value) = @_; + + $value = OpenILS::Utils::Normalize::search_normalize($value); + + $value =~ s/^ +//g; + + append_field_value($body, $fname, $value) if $value; +} + +# Normalize the pubdate (used for sorting) to a single 4-digit year. +# Pad with zeroes where the year fall short of 4 digits. +sub index_pubdate { + my ($body, $value) = @_; + + $value =~ s/\D//g; + + return unless $value; # no numbers + + $value = substr($value . '0' x 4, 0, 4); + + return if $value eq '0000'; # treat as no date. + + append_field_value($body, 'pubdate', $value) if $value; +} + # Indexes ISBN10, ISBN13, and formatted values of both (with hyphens) sub index_isbns { @@ -470,25 +772,24 @@ sub index_isbns { return unless $value; my %seen; # deduplicate values - - # Chop up the collected raw values into parts and let - # Business::* tell us which parts looks like ISBNs. 
- for my $token (split(/ /, $value)) { - if (length($token) > 8) { - my $isbn = Business::ISBN->new($token); - if ($isbn && $isbn->is_valid) { - if ($isbn->as_isbn10) { - $seen{$isbn->as_isbn10->isbn} = 1; - $seen{$isbn->as_isbn10->as_string} = 1; - } - if ($isbn->as_isbn13) { - $seen{$isbn->as_isbn13->isbn} = 1; - $seen{$isbn->as_isbn13->as_string} = 1; - } - } + my @values = OpenILS::Utils::Normalize::clean_isbns($value); + my $isbns = $values[0]; + my $strings = $values[1]; + + for my $isbn (@$isbns) { + if ($isbn->as_isbn10) { + $seen{$isbn->as_isbn10->isbn} = 1; # compact + $seen{$isbn->as_isbn10->as_string} = 1; # with hyphens + } + if ($isbn->as_isbn13) { + $seen{$isbn->as_isbn13->isbn} = 1; + $seen{$isbn->as_isbn13->as_string} = 1; } } + # Add the unvalidated ISBNs + $seen{$_} = 1 for @$strings; + append_field_value($body, 'identifier|isbn', $_) foreach keys %seen; } @@ -498,17 +799,13 @@ sub index_issns { return unless $value; my %seen; # deduplicate values + my @issns = OpenILS::Utils::Normalize::clean_issns($value); - # Chop up the collected raw values into parts and let - # Business::* tell us which parts looks valid. - for my $token (split(/ /, $value)) { - my $issn = Business::ISSN->new($token); - if ($issn && $issn->is_valid) { - # no option in business::issn to get the unformatted value. - (my $unformatted = $issn->as_string) =~ s/-//g; - $seen{$unformatted} = 1; - $seen{$issn->as_string} = 1; - } + for my $issn (@issns) { + # no option in business::issn to get the unformatted value. + (my $unformatted = $issn->as_string) =~ s/-//g; + $seen{$unformatted} = 1; + $seen{$issn->as_string} = 1; } append_field_value($body, 'identifier|issn', $_) foreach keys %seen; @@ -521,10 +818,12 @@ sub append_field_value { if (ref $body->{$fname}) { # Three or more values encountered for field. # Add to the list. + return if grep {$_ eq $value} @{$body->{$fname}}; # dupe push(@{$body->{$fname}}, $value); } else { # Second value encountered for field. 
# Upgrade to array storage. + return if $body->{$fname} eq $value; # dupe $body->{$fname} = [$body->{$fname}, $value]; } } else { @@ -579,89 +878,52 @@ SQL return $holdings; } -sub load_marc { - my ($self, $bib_ids) = @_; - - my $bib_ids_str = join(',', @$bib_ids); - - my $marc_data = $self->get_db_rows(<info("ES found ".scalar(@$marc_data). - " MARC rows for current record batch"); - - my $marc = {}; - for my $row (@$marc_data) { +sub add_marc_value { + my ($self, $rec, $tag, $subfield, $value) = @_; - my $value = $row->{value}; - next unless defined $value && $value ne ''; + # XSL uses '_' when no subfield is present (e.g. controlfields) + $subfield = undef if $subfield eq '_'; - my $subfield = $row->{subfield}; - my $rec_id = $row->{record}; - delete $row->{record}; # avoid adding this to the index + my ($match) = grep { + $_->{tag} eq $tag && + ($_->{subfield} || '') eq ($subfield || '') + } @{$rec->{marc}}; - $row->{value} = $value = $self->truncate_value($value); + if ($match) { + if (ref $match->{value}) { + # 3rd or more instance of tag/subfield for this record. - $marc->{$rec_id} = [] unless $marc->{$rec_id}; - delete $row->{subfield} unless defined $subfield; - - # Add values to existing record/tag/subfield rows. - - my $existing; - for my $entry (@{$marc->{$rec_id}}) { - next unless $entry->{tag} eq $row->{tag}; - - if (defined $subfield) { - if (defined $entry->{subfield}) { - if ($subfield eq $entry->{subfield}) { - $existing = $entry; - last; - } - } - } elsif (!defined $entry->{subfield}) { - # Neither has a subfield value / not all tags have subfields - $existing = $entry; - last; - } - } + # avoid dupes + return if grep {$_ eq $value} @{$match->{value}}; - if ($existing) { - - $existing->{value} = [$existing->{value}] unless ref $existing->{value}; - push(@{$existing->{value}}, $value); + push(@{$match->{value}}, $value); } else { + # 2nd instance of tag/subfield for this record. 
+ + # avoid dupes + return if $match->{value} eq $value; - push(@{$marc->{$rec_id}}, $row); + $match->{value} = [$match->{value}, $value]; } - } - - return $marc; -} + } else { + # first instance of tag/subfield for this record. + $match = {tag => $tag, value => $value}; + $match->{subfield} = $subfield if defined $subfield; -sub index { - my $self = shift; - return $self->{index} if $self->{index}; - ($self->{index}) = grep {$_->code eq $self->index_name} @{$self->indices}; - - $logger->error("No ndex configured named ".$self->index_name) unless $self->{index}; - - return $self->{index}; + push(@{$rec->{marc}}, $match); + } } - # Add data to the bib-search index sub populate_index { my ($self, $settings) = @_; $settings ||= {}; my $index_count = 0; - my $total_indexed = 0; + $self->{total_indexed} = 0; # extract the database settings. for my $db_key (grep {$_ =~ /^db_/} keys %$settings) { @@ -679,9 +941,6 @@ sub populate_index { while (1) { $index_count = $self->populate_bib_index_batch($settings); - $total_indexed += $index_count; - - $logger->info("ES indexed $total_indexed bib records"); # exit if we're only indexing a single record or if the # batch indexer says there are no more records to index. @@ -694,7 +953,7 @@ sub populate_index { } } - $logger->info("ES bib indexing complete with $total_indexed records"); + $logger->info("ES bib indexing complete with " . $self->{total_indexed} . 
" records"); } sub get_bib_ids { @@ -710,9 +969,10 @@ sub get_bib_ids { my ($select, $from, $where); if ($modified_since) { + $logger->info("ES bib indexing records modified since $modified_since"); $select = "SELECT id"; - $from = "FROM elastic.bib_last_mod_date"; - $where = "WHERE last_mod_date > '$modified_since'"; + $from = "FROM elastic.bib_mod_since(QUOTE_LITERAL('$modified_since')::TIMESTAMPTZ)"; + $where = "WHERE TRUE"; } else { $select = "SELECT id"; $from = "FROM biblio.record_entry"; diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.elastic-search.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.elastic-search.sql index b492659c1c..32c715e03e 100644 --- a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.elastic-search.sql +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.elastic-search.sql @@ -3,32 +3,6 @@ DROP SCHEMA IF EXISTS elastic CASCADE; BEGIN; -ALTER TABLE config.record_attr_definition - ADD COLUMN elastic_field BOOLEAN NOT NULL DEFAULT FALSE; - -ALTER TABLE config.metabib_field - ADD COLUMN elastic_field BOOLEAN NOT NULL DEFAULT FALSE; - --- Provide a sweeping set of default elastic fields. --- Likely this set of fields can be trimmed significantly for most sites, --- since many of these fields will never be searched from the catalog. --- Reducing the number of elastic_field's will improve indexing time, --- search time, and reduce Elastic disk space requirements. 
-UPDATE config.record_attr_definition - SET elastic_field = TRUE WHERE name NOT LIKE 'marc21_%'; - -UPDATE config.metabib_field - SET elastic_field = TRUE WHERE search_field OR facet_field; - -INSERT INTO config.global_flag (name, enabled, label) -VALUES ( - 'elastic.bib_search.enabled', FALSE, - 'Elasticsearch Enable Bib Searching' -), ( - 'elastic.bib_search.dynamic_properties', FALSE, - 'Elasticsearch Dynamic Bib Record Properties' -); - CREATE SCHEMA elastic; CREATE TABLE elastic.cluster ( @@ -44,252 +18,55 @@ CREATE TABLE elastic.node ( port INTEGER NOT NULL, path TEXT NOT NULL DEFAULT '/', active BOOLEAN NOT NULL DEFAULT FALSE, - cluster TEXT NOT NULL + cluster TEXT NOT NULL REFERENCES elastic.cluster (code) ON DELETE CASCADE, CONSTRAINT node_once UNIQUE (host, port, path, cluster) ); -CREATE TABLE elastic.index ( - id SERIAL PRIMARY KEY, - code TEXT NOT NULL, -- e.g. 'bib-search' - cluster TEXT NOT NULL - REFERENCES elastic.cluster (code) ON DELETE CASCADE, - active BOOLEAN NOT NULL DEFAULT FALSE, - num_shards INTEGER NOT NULL DEFAULT 1, - CONSTRAINT index_type_once_per_cluster UNIQUE (code, cluster) -); - -CREATE OR REPLACE VIEW elastic.bib_field AS - SELECT fields.* FROM ( - SELECT - NULL::INT AS metabib_field, - crad.name, - crad.label, - NULL AS search_group, - crad.sorter, - FALSE AS search_field, - FALSE AS facet_field, - 1 AS weight - FROM config.record_attr_definition crad - WHERE crad.elastic_field - UNION - SELECT - cmf.id AS metabib_field, - cmf.name, - cmf.label, - cmf.field_class AS search_group, - FALSE AS sorter, - -- always treat identifier fields as non-search fields. 
- (cmf.field_class <> 'identifier' AND cmf.search_field) AS search_field, - cmf.facet_field, - cmf.weight - FROM config.metabib_field cmf - WHERE cmf.elastic_field - ) fields; - - -CREATE OR REPLACE FUNCTION elastic.bib_record_attrs(bre_id BIGINT) -RETURNS TABLE ( - search_group TEXT, - name TEXT, - source BIGINT, - value TEXT -) -AS $FUNK$ - SELECT DISTINCT record.* FROM ( - SELECT - NULL::TEXT AS search_group, - crad.name, - mrs.source, - mrs.value - FROM metabib.record_sorter mrs - JOIN config.record_attr_definition crad ON (crad.name = mrs.attr) - WHERE mrs.source = $1 AND crad.elastic_field - UNION - - -- record attributes - SELECT - NULL::TEXT AS search_group, - crad.name, - mraf.id AS source, - mraf.value - FROM metabib.record_attr_flat mraf - JOIN config.record_attr_definition crad ON (crad.name = mraf.attr) - WHERE mraf.id = $1 AND crad.elastic_field - ) record -$FUNK$ LANGUAGE SQL STABLE; - -CREATE OR REPLACE FUNCTION elastic.bib_record_static_props(bre_id BIGINT) -RETURNS TABLE ( - search_group TEXT, - name TEXT, - source BIGINT, - value TEXT -) -AS $FUNK$ - SELECT DISTINCT record.* FROM ( - SELECT - cmf.field_class AS search_group, - cmf.name, - props.source, - CASE WHEN cmf.joiner IS NOT NULL THEN - REGEXP_SPLIT_TO_TABLE(props.value, cmf.joiner) - ELSE - props.value - END AS value - FROM ( - SELECT * FROM metabib.title_field_entry mtfe WHERE mtfe.source = $1 - UNION - SELECT * FROM metabib.author_field_entry mafe WHERE mafe.source = $1 - UNION - SELECT * FROM metabib.subject_field_entry msfe WHERE msfe.source = $1 - UNION - SELECT * FROM metabib.series_field_entry msrfe WHERE msrfe.source = $1 - UNION - SELECT * FROM metabib.keyword_field_entry mkfe WHERE mkfe.source = $1 - UNION - SELECT * FROM metabib.identifier_field_entry mife WHERE mife.source = $1 - ) props - JOIN config.metabib_field cmf ON (cmf.id = props.field) - WHERE cmf.elastic_field - ) record -$FUNK$ LANGUAGE SQL STABLE; - -CREATE OR REPLACE FUNCTION 
elastic.bib_record_dynamic_props(bre_id BIGINT) -RETURNS TABLE ( - search_group TEXT, - name TEXT, - source BIGINT, - value TEXT -) -AS $FUNK$ - SELECT DISTINCT record.* FROM ( - SELECT - cmf.field_class AS search_group, - cmf.name, - props.source, - CASE WHEN cmf.joiner IS NOT NULL THEN - REGEXP_SPLIT_TO_TABLE(props.value, cmf.joiner) - ELSE - props.value - END AS value - FROM biblio.extract_metabib_field_entry( - $1, ' ', '{facet,search}', - (SELECT ARRAY_AGG(id) FROM config.metabib_field WHERE elastic_field) - ) props - JOIN config.metabib_field cmf ON (cmf.id = props.field) - ) record -$FUNK$ LANGUAGE SQL STABLE; - - -CREATE OR REPLACE FUNCTION elastic.bib_record_properties(bre_id BIGINT) - RETURNS TABLE ( - search_group TEXT, - name TEXT, - source BIGINT, - value TEXT - ) - AS $FUNK$ -DECLARE - props_func TEXT; -BEGIN - - PERFORM 1 FROM config.internal_flag cif WHERE - cif.name = 'elastic.bib_search.dynamic_properties' AND cif.enabled; - - IF FOUND THEN - props_func := 'elastic.bib_record_dynamic_props'; - ELSE - props_func := 'elastic.bib_record_static_props'; - END IF; - - RETURN QUERY EXECUTE $$ - SELECT DISTINCT record.* FROM ( - SELECT * FROM elastic.bib_record_attrs($$ || QUOTE_LITERAL(bre_id) || $$) - UNION - SELECT * FROM $$ || props_func || '(' || QUOTE_LITERAL(bre_id) || $$) - ) record - $$; -END $FUNK$ LANGUAGE PLPGSQL; - -/* give me bibs I should upate */ - -CREATE OR REPLACE VIEW elastic.bib_last_mod_date AS +CREATE OR REPLACE FUNCTION elastic.bib_mod_since(since TIMESTAMPTZ) + RETURNS TABLE (id BIGINT) AS $FUNK$ /** * Last update date for each bib, which is taken from most recent * edit for either the bib, a linked call number, or a linked copy. * If no call numbers are linked, uses the bib edit date only. * Includes deleted data since it can impact indexing. 
*/ - WITH mod_dates AS ( - SELECT bre.id, - bre.edit_date, - MAX(COALESCE(acn.edit_date, '1901-01-01')) AS max_call_number_edit_date, - MAX(COALESCE(acp.edit_date, '1901-01-01')) AS max_copy_edit_date - FROM biblio.record_entry bre - LEFT JOIN asset.call_number acn ON (acn.record = bre.id) - LEFT JOIN asset.copy acp ON (acp.call_number = acn.id) - GROUP BY 1, 2 - ) SELECT dates.id, - GREATEST(dates.edit_date, - GREATEST(dates.max_call_number_edit_date, dates.max_copy_edit_date) - ) AS last_mod_date - FROM mod_dates dates; + SELECT bre.id FROM biblio.record_entry bre WHERE bre.edit_date > since + UNION -/* SEED DATA ------------------------------------------------------------ */ - -INSERT INTO elastic.cluster (code, label) - VALUES ('main', 'Main Cluster'); - -INSERT INTO elastic.node (label, host, proto, port, active, cluster) - VALUES ('Localhost', 'localhost', 'http', 9200, TRUE, 'main'); + SELECT bre.id + FROM biblio.record_entry bre + JOIN asset.call_number acn ON acn.record = bre.id + WHERE acn.edit_date > since + UNION -INSERT INTO elastic.index (code, active, cluster) - VALUES ('bib-search', TRUE, 'main'); + SELECT bre.id + FROM biblio.record_entry bre + JOIN asset.call_number acn ON acn.record = bre.id + JOIN asset.copy acp ON acp.call_number = acn.id + WHERE acp.edit_date > since -COMMIT; +$FUNK$ LANGUAGE SQL; -/* UNDO - -DROP SCHEMA IF EXISTS elastic CASCADE; +/* SEED DATA ------------------------------------------------------------ */ -ALTER TABLE config.record_attr_definition DROP COLUMN elastic_field; -ALTER TABLE config.metabib_field DROP COLUMN elastic_field; +INSERT INTO config.global_flag (name, enabled, label, value) +VALUES ( + 'elastic.bib_search.enabled', FALSE, + 'Elasticsearch Enable Bib Searching', NULL +); -DELETE FROM config.global_flag WHERE name ~ 'elastic.*'; +INSERT INTO elastic.cluster (code, label) VALUES ('main', 'Main Cluster'); -*/ +INSERT INTO elastic.node (label, host, proto, port, active, cluster) + VALUES ('Localhost', 
'localhost', 'http', 9200, TRUE, 'main'); /* - --- Sample narrower set of elastic fields to avoid duplication and --- indexing data that will presumably never be searched in the catalog. - -UPDATE config.metabib_field SET elastic_field = FALSE -WHERE - (field_class = 'keyword' AND name <> 'keyword') OR - (field_class = 'subject' AND name = 'complete') OR - (field_class = 'author' AND name = 'first_author') -; - -UPDATE config.record_attr_definition SET elastic_field = FALSE -WHERE name NOT IN ( - 'authorsort', - 'date1', - 'date2', - 'bib_level', - 'icon_format', - 'item_form', - 'item_lang', - 'item_type', - 'lit_form', - 'pubdate', - 'search_format', - 'titlesort', - 'sr_format', - 'vr_format' -); +-- turn it on for the UI +UPDATE config.global_flag SET enabled = TRUE WHERE name = 'elastic.bib_search.enabled'; */ + +COMMIT; diff --git a/Open-ILS/src/support-scripts/elastic-index.pl b/Open-ILS/src/support-scripts/elastic-index.pl index 05ebc6b9dc..5abe39d9dc 100755 --- a/Open-ILS/src/support-scripts/elastic-index.pl +++ b/Open-ILS/src/support-scripts/elastic-index.pl @@ -3,23 +3,34 @@ use strict; use warnings; use Getopt::Long; use OpenSRF::Utils::JSON; +use OpenSRF::Utils::Logger qw/:logger/; use OpenILS::Utils::Fieldmapper; use OpenILS::Utils::CStoreEditor; use OpenILS::Elastic::BibSearch; -my $help; +my $lockfile; +my $batch_size = 500; my $osrf_config = '/openils/conf/opensrf_core.xml'; my $cluster = 'main'; +my @nodes; +my $index_class = 'bib-search'; +my $bib_transform; +my $es_config_file; my $create_index; my $delete_index; -my $index_name = 'bib-search'; # only supported index at time of writing +my $index_name; +my $activate_index; my $populate; my $index_record; my $start_record; my $stop_record; my $modified_since; my $max_duration; -my $batch_size = 500; +my $skip_holdings; +my $list_indices; +my $no_opensrf; +my $force; +my $help; # Database settings read from ENV by default. 
my $db_host = $ENV{PGHOST} || 'localhost'; @@ -32,16 +43,26 @@ my $db_appn = 'Elastic Indexer'; GetOptions( 'help' => \$help, 'osrf-config=s' => \$osrf_config, + 'lockfile=s' => \$lockfile, 'cluster=s' => \$cluster, + 'node=s' => \@nodes, 'create-index' => \$create_index, 'delete-index' => \$delete_index, - 'index=s' => \$index_name, + 'index-name=s' => \$index_name, + 'index-class=s' => \$index_class, 'index-record=s' => \$index_record, + 'activate-index' => \$activate_index, 'start-record=s' => \$start_record, 'stop-record=s' => \$stop_record, 'modified-since=s' => \$modified_since, 'max-duration=s' => \$max_duration, 'batch-size=s' => \$batch_size, + 'bib-transform=s' => \$bib_transform, + 'es-config-file=s' => \$es_config_file, + 'skip-holdings' => \$skip_holdings, + 'list-indices' => \$list_indices, + 'no-opensrf' => \$no_opensrf, + 'force' => \$force, 'db-name=s' => \$db_name, 'db-host=s' => \$db_host, 'db-port=s' => \$db_port, @@ -51,16 +72,23 @@ GetOptions( 'populate' => \$populate ) || die "\nSee --help for more\n"; +$index_name = "$index_class-$index_name" if $index_name; + sub help { print < + --lockfile + Enables lock file controls over the process. If unset, + no lock file is created. + --db-name <$db_name> --db-host <$db_host> --db-port <$db_port> @@ -70,13 +98,38 @@ sub help { Database connection values. This is the Evergreen database where values should be extracted for elastic search indexing. + Beware that data loaded through Evergreen, e.g. elasticsearch + configuration data, will be loaded from the DB used by the + running Evergreen instance, regardless of these --db-* + settings. + Values default to their PG* environment variable equivalent. + --no-opensrf + Avoid connecting to OpenSRF. Requires passing at least + one --node. + + --bib-transform + Override the configured global config value for + 'elastic.bib_search.transform_file' + + --es-config-file + Override the default ES configuration XML file. + --cluster Specify a cluster name. 
Defaults to 'main'. - --index - Specify an index name. Defaults to 'bib-search'. + --node [repeatable] + Override the configured ES nodes. + + --index-class + Specifies which data set the current index manages (e.g. bib-search) + Must match a well-known index class with backing code. + + --index-name + The index name will be automatically prepended with the + index class. e.g. "my-index" becomes "bib-search-my-index" + on the backend for the "bib-search" index class. --delete-index Delete the specified index and all of its data. @@ -84,6 +137,10 @@ sub help { --create-index Create an index whose name equals --index-name. + --activate-index + Activate the selected index while deactivating all other + indexes of the same index_class and cluster. + --batch-size Index at most this many records per batch. Default is 500. @@ -103,6 +160,11 @@ sub help { at regular intervals to keep the ES-indexed data in sync with the EG data. + --skip-holdings + Bypass indexing the holdings data. This is useful + when reindexing for configuration changes, where the + underlying holdings data has not changed. + --max-duration Stop indexing once the process has been running for this amount of time. @@ -112,47 +174,94 @@ sub help { are provided (e.g. --index-start-record) then all applicable values will be indexed. + --list-indices + List all Elasticsearch indices represented in the + Evergreen database. + + --force + Force various actions. + HELP exit(0); } help() if $help; -# connect to osrf... -OpenSRF::System->bootstrap_client(config_file => $osrf_config); -Fieldmapper->import( - IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); -OpenILS::Utils::CStoreEditor::init(); +if ($lockfile) { + + die "I seem to be running already. 
If not remove $lockfile, try again\n" + if -e $lockfile; + + open(LOCK, ">$lockfile") or die "Cannot open lock file: $lockfile : $@\n"; + print LOCK $$ or die "Cannot write to lock file: $lockfile : $@\n"; + close LOCK; +} + +# We only need to connect to opensrf to look up the nodes in the database. +# If the nodes are provided and --no-opensrf is set, avoid the connection +# and log to stdout. +if (@nodes && $no_opensrf) { + $logger->set_log_stdout(1); + $logger->set_log_level($logger->INFO); + +} else { + OpenSRF::System->bootstrap_client(config_file => $osrf_config); + Fieldmapper->import( + IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); + OpenILS::Utils::CStoreEditor::init(); +} my $es; -if ($index_name eq 'bib-search') { - $es = OpenILS::Elastic::BibSearch->new($cluster); +if ($index_class eq 'bib-search') { + $es = OpenILS::Elastic::BibSearch->new( + db_name => $db_name, + db_host => $db_host, + db_port => $db_port, + db_user => $db_user, + db_pass => 'REDACTED', + db_appn => $db_appn, + cluster => $cluster, + nodes => \@nodes, + xsl_file => $bib_transform, + index_name => $index_name, + maintenance_mode => 1, + es_config_file => $es_config_file, + skip_holdings => $skip_holdings + ); } if (!$es) { - die "Unknown index type: $index_name\n"; + die "Unknown index class: $index_class\n"; } $es->connect; if ($delete_index) { + + if (!$force) { + my $active = $es->active_index; + if ($active && $active eq $index_name) { + die "Index '$index_name' is active! " . 
+ "Use --force to delete an active index.\n"; + } + } + $es->delete_index or die "Index delete failed.\n"; } if ($create_index) { + + if ($index_name eq $index_class) { + die "An index name cannot match its index_class [$index_class]\n"; + } + $es->create_index or die "Index create failed.\n"; } if ($populate) { my $settings = { - db_name => $db_name, - db_host => $db_host, - db_port => $db_port, - db_user => $db_user, - db_pass => 'REDACTED', - db_appn => $db_appn, index_record => $index_record, start_record => $start_record, stop_record => $stop_record, @@ -170,4 +279,31 @@ if ($populate) { $es->populate_index($settings) or die "Index populate failed.\n"; } +if ($activate_index) { + $es->activate_index or die "Index activation failed.\n"; +} + +if ($list_indices) { + my $indices = $es->indices; + + for my $name (keys %{$indices}) { + my $index_def = $indices->{$name}; + + my @aliases; + if ($index_def) { + @aliases = keys(%{$index_def->{$name}->{aliases}}); + } else { + warn "ES has no index named $name\n"; + } + + print sprintf( + "index_class=%s index_name=%s active=%s aliases=@aliases\n", + $es->index_class, $name, + $es->index_is_active($name) ? 
'yes' : 'no'); + } +} + +unlink $lockfile if $lockfile; + + diff --git a/Open-ILS/src/support-scripts/test-scripts/elastic-search-samples.pl b/Open-ILS/src/support-scripts/test-scripts/elastic-search-samples.pl index c45e7c4d07..1202ce4d5b 100755 --- a/Open-ILS/src/support-scripts/test-scripts/elastic-search-samples.pl +++ b/Open-ILS/src/support-scripts/test-scripts/elastic-search-samples.pl @@ -14,7 +14,8 @@ binmode(STDOUT, ':utf8'); my $help; my $osrf_config = '/openils/conf/opensrf_core.xml'; my $cluster = 'main'; -my $index = 'bib-search'; +my $index_class = 'bib-search'; +my $index_name; my $quiet = 0; my $query_string; @@ -22,6 +23,7 @@ GetOptions( 'help' => \$help, 'osrf-config=s' => \$osrf_config, 'cluster=s' => \$cluster, + 'index-name=s' => \$index_name, 'quiet' => \$quiet, ) || die "\nSee --help for more\n"; @@ -29,10 +31,13 @@ sub help { print < Performs a series of canned bib record searches + Note if --index-name is omitted, the currently active index on + the 'bib-search' index class will be used. + HELP exit(0); } @@ -117,9 +122,16 @@ Fieldmapper->import( IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); OpenILS::Utils::CStoreEditor::init(); -my $es = OpenILS::Elastic::BibSearch->new($cluster); +my $es = OpenILS::Elastic::BibSearch->new(index_name => $index_name); $es->connect; +if ($es->index_name) { + print "Using bib-search index '" . $es->index_name . "'\n"; +} else { + die "No active 'bib-search' index found. ". 
+ "Use --index-name or activate an index in the database.\n"; +} + print "Searching...\n"; for my $query_part (@$queries) { diff --git a/Open-ILS/src/support-scripts/test-scripts/elastic-search.pl b/Open-ILS/src/support-scripts/test-scripts/elastic-search.pl index 297b92fa34..47c72da914 100755 --- a/Open-ILS/src/support-scripts/test-scripts/elastic-search.pl +++ b/Open-ILS/src/support-scripts/test-scripts/elastic-search.pl @@ -14,7 +14,10 @@ binmode(STDOUT, ':utf8'); my $help; my $osrf_config = '/openils/conf/opensrf_core.xml'; my $cluster = 'main'; -my $index = 'bib-search'; +my @nodes; +my $index_class = 'bib-search'; +my $index_name; +my $field_group; my $quiet = 0; my $query_string; @@ -22,6 +25,9 @@ GetOptions( 'help' => \$help, 'osrf-config=s' => \$osrf_config, 'cluster=s' => \$cluster, + 'node=s' => \@nodes, + 'index-class=s' => \$index_class, + 'index-name=s' => \$index_name, 'quiet' => \$quiet, ) || die "\nSee --help for more\n"; @@ -29,10 +35,13 @@ sub help { print < Performs query string searches. + Note if --index-name is omitted, the currently active index on + the 'bib-search' index class will be used. + HELP exit(0); } @@ -46,9 +55,22 @@ Fieldmapper->import( IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); OpenILS::Utils::CStoreEditor::init(); -my $es = OpenILS::Elastic::BibSearch->new($cluster); +my $es = OpenILS::Elastic::BibSearch->new( + maintenance_mode => 1, # allows access to inactive indexes + cluster => $cluster, + nodes => \@nodes, + field_group => $field_group, + index_name => $index_name +); $es->connect; +if ($es->index_name) { + print "Using bib-search index '" . $es->index_name . "'\n"; +} else { + die "No active 'bib-search' index found. ". 
+ "Use --index-name or activate an index in the database.\n"; +} + print < ['id', 'title|maintitle'] , # return only a few fields + _source => ['id', 'title|maintitle', 'author|personal'] , # return only a few fields from => 0, size => 10, sort => [{'_score' => 'desc'}], @@ -86,6 +108,17 @@ while (1) { # Search the base keyword text index by default. default_field => 'keyword.text' } + }, + # Request highlight data for title/author text fields. + # See below for logging highlight response data. + highlight => { + # Pre/Post tags modified to match stock Evergreen. + pre_tags => '', + post_tags => '', + fields => { + 'title*.text' => {}, + 'author*.text' => {} + } } }; @@ -107,6 +140,12 @@ while (1) { $hit->{_id}, $hit->{_score}, ($hit->{_source}->{'title|maintitle'} || '') ); + +# Uncomment to log highlighted field data. +# for my $hl (keys %{$hit->{highlight}}) { +# my @values = @{$hit->{highlight}->{$hl}}; +# print "\tHighlight: $hl => @values\n"; +# } } } diff --git a/Open-ILS/xsl/elastic-bib-transform.xsl b/Open-ILS/xsl/elastic-bib-transform.xsl new file mode 100644 index 0000000000..a6d974c79d --- /dev/null +++ b/Open-ILS/xsl/elastic-bib-transform.xsl @@ -0,0 +1,1339 @@ + + + + + + + + + + + + + + + + 650 + subject + + combined + abcdvxyz + + + 651 + subject + + combined + avxyz + + + 655 + subject + + combined + abcvxyz + + + 630 + subject + + combined + adfgklmnoprstvxyz + + + 600 + subject + + combined + abcdfgjklmnopqrstuvxyz + + + 610 + subject + + combined + abcdfgklmnoprstuvxyz + + + 611 + subject + + combined + acdefgjklnpqstuvxyz + + + 490 + series + seriestitle + a + + + 800 + series + seriestitle + tflmnoprs + + + 810 + series + seriestitle + tflmnoprs + + + 830 + series + seriestitle + adfgklmnoprst + + + 100 + author + personal + abcdq + + + 100 + author + + combined + abcdq + + + 110 + author + + combined + abcdn + + + 111 + author + + combined + acdegng + + + 700 + author + + combined + abcdq + + + 710 + author + + combined + ab + + + 711 + author 
+ + combined + acde + + + 400 + author + + combined + abcd + + + 410 + author + + combined + abcd + + + 411 + author + + combined + acdegq + + + 010 + identifier + lccn + a + + + 010 + identifier + lccn + z + + + 020 + identifier + isbn + a + + + 020 + identifier + isbn + z + + + 022 + identifier + issn + a + + + 022 + identifier + issn + y + + + 022 + identifier + issn + z + + + 024 + identifier + upc + a + + + 024 + identifier + upc + z + + + 027 + identifier + tech_number + a + + + 027 + identifier + tech_number + z + + + 028 + identifier + tech_number + ab + + + 074 + identifier + sudoc + a + + + 074 + identifier + sudoc + z + + + 086 + identifier + sudoc + a + + + 086 + identifier + sudoc + z + + + + 092 + identifier + bibcn + ab + + + 099 + identifier + bibcn + ab + + + 086 + identifier + bibcn + ab + + + 092 + keyword + bibcn + ab + + + 099 + keyword + bibcn + ab + + + 086 + keyword + bibcn + ab + + + 901 + identifier + bibid + c + + + 901 + identifier + tcn + c + + + 130 + title + + combined + abcefgijklmnopqrstuvwxyz + + + 210 + title + + combined + abcefghijklmnopqrstuvwxyz + + + 222 + title + + combined + a + + + 240 + title + + combined + abcefgijklmnopqrstuvwxyz + + + 245 + title + maintitle + a + + + 245 + title + + combined + a + + + 245 + title + + combined + abefgijklmnopqrstuvwxyz + + + 245 + author + + combined + c + + + 246 + title + + combined + abcefgjklmnopqrstuvwxyz + + + 247 + title + + combined + abcefgijklmnopqrstuvwxyz + + + 260 + keyword + publisher + b + + + 264 + keyword + publisher + b + + + 245 + keyword + title + a + + + 100 + keyword + author + abcdq + + + 400 + series + seriestitle + ptv + + + 410 + author + + combined + abcde + + + 410 + series + seriestitle + ptv + + + 411 + author + + combined + acdegq + + + 411 + title + + combined + ptv + + + 440 + series + seriestitle + abcefghijklmnopqrstuvwyz + + + 490 + series + seriestitle + abcefghijklmnopqrstuvwyz + + + 490 + title + + combined + abcefghijklmnopqrstuvwyz + + + 694 + 
series + seriestitle + a + + + 700 + title + + combined + fgklmnoprst + + + 710 + title + + combined + fgklmnoprst + + + 711 + title + + combined + fklnpst + + + 730 + title + + combined + abcefgjklmnopqrstuvwyz + + + 740 + title + + combined + abcefgijklmnopqrstuvwyz + + + + + 780 + title + + combined + st + + + 785 + title + + combined + st + + + 800 + author + + combined + abcdq + + + 800 + series + seriestitle + fgklmnoprst + + + 810 + author + + combined + abcdn + + + 810 + series + seriestitle + abcdn + + + 811 + author + + combined + acdegnq + + + 811 + series + seriestitle + fklnpstv + + + 830 + series + seriestitle + abcefgijklmnopqrstuvwxyz + + + + + + + 650 + subject + + combined + abcdvxyz + + + 651 + subject + + combined + avxyz + + + 600 + subject + + combined + abcdfgjklmnopqrstuvxyz + + + 490 + series + seriestitle + a + + + 800 + series + seriestitle + tflmnoprs + + + 810 + series + seriestitle + tflmnoprs + + + 830 + series + seriestitle + adfgklmnoprst + + + 100 + author + + combined + abcdq + + + 110 + author + + combined + ab + + + 710 + author + + combined + ab + + + 410 + author + + combined + abcd + + + 400 + series + seriestitle + + + + 410 + author + + combined + + + + 410 + series + seriestitle + + + + 411 + author + + combined + + + + 440 + series + seriestitle + abcefghijklmnopqrstuvwyz + + + 490 + series + seriestitle + + + + 694 + series + seriestitle + + + + 800 + series + seriestitle + fgklmnoprst + + + 810 + series + seriestitle + abcdn + + + 811 + series + seriestitle + fklnpstv + + + 830 + series + seriestitle + abcefgijklmnopqrstuvwxyz + + + + + + + + + + + authorsort + + + + + + + + + + + + + + a + + + + + + + + + 0 + + + + + titlesort + + + + + + + + pubdate + + + 008 + 7 + 4 + + + + + + + + + + + date1 + + + 008 + 7 + 4 + + + 0000 + + + + date2 + + + 008 + 11 + 4 + + + 9999 + + + + lit_form + + + 008 + 33 + 1 + + + + + + item_lang + + + 008 + 35 + 3 + + + + + + audience + + + 008 + 22 + 1 + + + + + + + + + 6 + 1 + + + + + 
item_type + + + + + + 7 + 1 + + + + + bib_level + + + + + + 008 + 23 + 1 + + + + + item_form + + + + + + 007 + 0 + 1 + + + + + + + 007 + 4 + 1 + + + + + + vr_format + + + + + + + 007 + 3 + 1 + + + + + + sr_format + + + + + + + search_format + blu-ray + + s + + + + search_format + book + + at + + abcfoqrs + + acdm + + + + search_format + braille + + a + + f + + + + search_format + casaudiobook + + i + + l + + + + search_format + casmusic + + j + + l + + + + search_format + cdaudiobook + + i + + f + + + + search_format + cdaudiobook + + j + + f + + + + search_format + dvd + + v + + + + search_format + eaudio + + i + + oqs + + + + search_format + ebook + + at + + oqs + + acdm + + + + search_format + electronic + + os + + + + search_format + equip + + r + + + + search_format + evideo + + g + + oqs + + + + search_format + kit + + op + + + + search_format + lpbook + + at + + d + + acdm + + + + search_format + map + + ef + + + + search_format + microform + + abc + + + + search_format + music + + j + + + + search_format + phonomusic + + j + + abcde + + + + search_format + phonospoken + + i + + abcde + + + + search_format + picture + + k + + + + search_format + serial + + bs + + + + search_format + score + + cd + + + + search_format + software + + m + + + + search_format + vhs + + b + + + + + + + + sorter _ + + + + + + + + + + + filter _ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + abcdefghijklmnopqrstuvwxyz + + + + + + + + + + + + + + + + + + + + + search + + + + + + + + + + + + + + + + + + facet + + + + + + + + + + + + + + search keyword keyword + + + + + + + + + + + marc LDR _ + + + + marc + + _ + + + + + + + marc + + + + + + + + + + + + + + -- 2.11.0