From ef720630eacc56b5605ec06057c03dda111a5a5d Mon Sep 17 00:00:00 2001 From: Bill Erickson Date: Fri, 30 Aug 2019 17:08:31 -0400 Subject: [PATCH] Consolidate some indexing code Signed-off-by: Bill Erickson --- .../OpenILS/Application/Search/ElasticMapper.pm | 8 +- Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib.pm | 122 ++++++++++++++++++ .../OpenILS/Elastic/{BibMarc.pm => Bib/Marc.pm} | 102 +-------------- .../Elastic/{BibSearch.pm => Bib/Search.pm} | 143 +-------------------- Open-ILS/src/support-scripts/elastic-index.pl | 8 +- 5 files changed, 137 insertions(+), 246 deletions(-) create mode 100644 Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib.pm rename Open-ILS/src/perlmods/lib/OpenILS/Elastic/{BibMarc.pm => Bib/Marc.pm} (73%) rename Open-ILS/src/perlmods/lib/OpenILS/Elastic/{BibSearch.pm => Bib/Search.pm} (70%) diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/ElasticMapper.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/ElasticMapper.pm index d737e2ea8b..c218b868b0 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/ElasticMapper.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/ElasticMapper.pm @@ -20,8 +20,8 @@ use OpenSRF::Utils::Logger qw/:logger/; use OpenILS::Utils::Fieldmapper; use OpenSRF::Utils::SettingsClient; use OpenILS::Utils::CStoreEditor q/:funcs/; -use OpenILS::Elastic::BibSearch; -use OpenILS::Elastic::BibMarc; +use OpenILS::Elastic::Bib::Search; +use OpenILS::Elastic::Bib::Marc; use List::Util qw/min/; use Digest::MD5 qw(md5_hex); @@ -112,7 +112,7 @@ sub bib_search { my ($elastic_query, $cache_key) = compile_elastic_query($query, $staff, $offset, $limit); - my $es = OpenILS::Elastic::BibSearch->new('main'); + my $es = OpenILS::Elastic::Bib::Search->new('main'); $es->connect; my $results = $es->search($elastic_query); @@ -629,7 +629,7 @@ sub marc_search { my $elastic_query = compile_elastic_marc_query($args, $staff, $offset, $limit); - my $es = OpenILS::Elastic::BibMarc->new('main'); + my $es = 
OpenILS::Elastic::Bib::Marc->new('main'); $es->connect; my $results = $es->search($elastic_query); diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib.pm new file mode 100644 index 0000000000..7d05749903 --- /dev/null +++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib.pm @@ -0,0 +1,122 @@ +package OpenILS::Elastic::Bib; +# --------------------------------------------------------------- +# Copyright (C) 2019 King County Library System +# Author: Bill Erickson +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# --------------------------------------------------------------- +use strict; +use warnings; +use Encode; +use DateTime; +use Time::HiRes qw/time/; +use OpenSRF::Utils::Logger qw/:logger/; +use OpenSRF::Utils::JSON; +use OpenILS::Utils::CStoreEditor qw/:funcs/; +use OpenILS::Utils::DateTime qw/interval_to_seconds/; +use OpenILS::Elastic; +use base qw/OpenILS::Elastic/; + +# number of bibs to index per batch. +my $BIB_BATCH_SIZE = 500; + +sub index { + my $self = shift; + return $self->{index} if $self->{index}; + ($self->{index}) = grep {$_->code eq $self->index_name} @{$self->indices}; + + $logger->error("No ndex configured named ".$self->index_name) unless $self->{index}; + + return $self->{index}; +} + + +# Add data to the bib-search index +sub populate_index { + my ($self, $settings) = @_; + $settings ||= {}; + + my $index_count = 0; + my $total_indexed = 0; + + # extract the database settings. 
+ for my $db_key (grep {$_ =~ /^db_/} keys %$settings) { + $self->{$db_key} = $settings->{$db_key}; + } + + my $end_time; + my $duration = $settings->{max_duration}; + if ($duration) { + my $seconds = interval_to_seconds($duration); + $end_time = DateTime->now; + $end_time->add(seconds => $seconds); + } + + while (1) { + + $index_count = $self->populate_bib_index_batch($settings); + $total_indexed += $index_count; + + $logger->info("ES indexed $total_indexed bib records"); + + # exit if we're only indexing a single record or if the + # batch indexer says there are no more records to index. + last if !$index_count || $settings->{index_record}; + + if ($end_time && DateTime->now > $end_time) { + $logger->info( + "ES index populate exiting early on max_duration $duration"); + last; + } + } + + $logger->info("ES bib indexing complete with $total_indexed records"); +} + +sub get_bib_ids { + my ($self, $state) = @_; + + # A specific record is selected for indexing. + return [$state->{index_record}] if $state->{index_record}; + + my $start_id = $state->{start_record} || 0; + my $stop_id = $state->{stop_record}; + my $modified_since = $state->{modified_since}; + + my ($select, $from, $where); + if ($modified_since) { + $select = "SELECT id"; + $from = "FROM elastic.bib_last_mod_date"; + $where = "WHERE last_mod_date > '$modified_since'"; + } else { + $select = "SELECT id"; + $from = "FROM biblio.record_entry"; + $where = "WHERE NOT deleted AND active"; + } + + $where .= " AND id >= $start_id" if $start_id; + $where .= " AND id <= $stop_id" if $stop_id; + + # Ordering by ID is the simplest way to guarantee all requested + # records are processed, given that edit dates may not be unique + # and that we're using start_id/stop_id instead of OFFSET to + # define the batches. 
+ my $order = "ORDER BY id"; + + my $sql = "$select $from $where $order LIMIT $BIB_BATCH_SIZE"; + + my $ids = $self->get_db_rows($sql); + return [ map {$_->{id}} @$ids ]; +} + +1; + + diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibMarc.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib/Marc.pm similarity index 73% rename from Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibMarc.pm rename to Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib/Marc.pm index 86bfc9f425..dae48fa5ac 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibMarc.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib/Marc.pm @@ -1,4 +1,5 @@ -package OpenILS::Elastic::BibMarc; +package OpenILS::Elastic::Bib::Marc; +use base 'OpenILS::Elastic::Bib'; # --------------------------------------------------------------- # Copyright (C) 2019 King County Library System # Author: Bill Erickson @@ -22,16 +23,11 @@ use OpenSRF::Utils::Logger qw/:logger/; use OpenSRF::Utils::JSON; use OpenILS::Utils::CStoreEditor qw/:funcs/; use OpenILS::Utils::DateTime qw/interval_to_seconds/; -use OpenILS::Elastic; -use base qw/OpenILS::Elastic/; -use Data::Dumper; -$Data::Dumper::Indent = 0; +use OpenILS::Elastic::Bib; +use base qw/OpenILS::Elastic::Bib/; my $INDEX_NAME = 'bib-marc'; -# number of bibs to index per batch. -my $BIB_BATCH_SIZE = 500; - # TODO: it's possible to apply multiple language analyzers. 
my $LANG_ANALYZER = 'english'; @@ -97,19 +93,6 @@ sub index_name { return $INDEX_NAME; } -sub index { - my $self = shift; - return $self->{index} if $self->{index}; - ($self->{index}) = grep {$_->code eq $INDEX_NAME} @{$self->indices}; - - if (!$self->{index}) { - $logger->error("No index configuration exists for '$INDEX_NAME'"); - return undef; - } - - return $self->{index}; -} - sub create_index { my ($self) = @_; @@ -171,83 +154,6 @@ sub create_index { return 1; } -sub populate_index { - my ($self, $settings) = @_; - $settings ||= {}; - - my $index_count = 0; - my $total_indexed = 0; - - # extract the database settings. - for my $db_key (grep {$_ =~ /^db_/} keys %$settings) { - $self->{$db_key} = $settings->{$db_key}; - } - - my $end_time; - my $duration = $settings->{max_duration}; - if ($duration) { - my $seconds = interval_to_seconds($duration); - $end_time = DateTime->now; - $end_time->add(seconds => $seconds); - } - - while (1) { - - $index_count = $self->populate_bib_index_batch($settings); - $total_indexed += $index_count; - - $logger->info("ES indexed $total_indexed bib records"); - - # exit if we're only indexing a single record or if the - # batch indexer says there are no more records to index. - last if !$index_count || $settings->{index_record}; - - if ($end_time && DateTime->now > $end_time) { - $logger->info( - "ES index populate exiting early on max_duration $duration"); - last; - } - } - - $logger->info("ES bib indexing complete with $total_indexed records"); -} - -sub get_bib_ids { - my ($self, $state) = @_; - - # A specific record is selected for indexing. 
- return [$state->{index_record}] if $state->{index_record}; - - my $start_id = $state->{start_record} || 0; - my $stop_id = $state->{stop_record}; - my $modified_since = $state->{modified_since}; - - my ($select, $from, $where); - if ($modified_since) { - $select = "SELECT id"; - $from = "FROM elastic.bib_last_mod_date"; - $where = "WHERE last_mod_date > '$modified_since'"; - } else { - $select = "SELECT id"; - $from = "FROM biblio.record_entry"; - $where = "WHERE NOT deleted AND active"; - } - - $where .= " AND id >= $start_id" if $start_id; - $where .= " AND id <= $stop_id" if $stop_id; - - # Ordering by ID is the simplest way to guarantee all requested - # records are processed, given that edit dates may not be unique - # and that we're using start_id/stop_id instead of OFFSET to - # define the batches. - my $order = "ORDER BY id"; - - my $sql = "$select $from $where $order LIMIT $BIB_BATCH_SIZE"; - - my $ids = $self->get_db_rows($sql); - return [ map {$_->{id}} @$ids ]; -} - sub get_bib_data { my ($self, $record_ids) = @_; diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib/Search.pm similarity index 70% rename from Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm rename to Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib/Search.pm index 9f2d9bbc89..37e75bcee1 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib/Search.pm @@ -1,4 +1,4 @@ -package OpenILS::Elastic::BibSearch; +package OpenILS::Elastic::Bib::Search; # --------------------------------------------------------------- # Copyright (C) 2018 King County Library System # Author: Bill Erickson @@ -22,8 +22,8 @@ use OpenSRF::Utils::Logger qw/:logger/; use OpenSRF::Utils::JSON; use OpenILS::Utils::CStoreEditor qw/:funcs/; use OpenILS::Utils::DateTime qw/interval_to_seconds/; -use OpenILS::Elastic; -use base qw/OpenILS::Elastic/; +use OpenILS::Elastic::Bib; +use base 
qw/OpenILS::Elastic::Bib/; my $INDEX_NAME = 'bib-search'; @@ -74,13 +74,6 @@ sub index_name { return $INDEX_NAME; } -sub index { - my $self = shift; - return $self->{index} if $self->{index}; - ($self->{index}) = grep {$_->code eq $INDEX_NAME} @{$self->indices}; - return $self->{index}; -} - sub create_index { my ($self) = @_; @@ -189,84 +182,6 @@ sub create_index { return 1; } -# Add data to the bib-search index -sub populate_index { - my ($self, $settings) = @_; - $settings ||= {}; - - my $index_count = 0; - my $total_indexed = 0; - - # extract the database settings. - for my $db_key (grep {$_ =~ /^db_/} keys %$settings) { - $self->{$db_key} = $settings->{$db_key}; - } - - my $end_time; - my $duration = $settings->{max_duration}; - if ($duration) { - my $seconds = interval_to_seconds($duration); - $end_time = DateTime->now; - $end_time->add(seconds => $seconds); - } - - while (1) { - - $index_count = $self->populate_bib_index_batch($settings); - $total_indexed += $index_count; - - $logger->info("ES indexed $total_indexed bib records"); - - # exit if we're only indexing a single record or if the - # batch indexer says there are no more records to index. - last if !$index_count || $settings->{index_record}; - - if ($end_time && DateTime->now > $end_time) { - $logger->info( - "ES index populate exiting early on max_duration $duration"); - last; - } - } - - $logger->info("ES bib indexing complete with $total_indexed records"); -} - -sub get_bib_ids { - my ($self, $state) = @_; - - # A specific record is selected for indexing. 
- return [$state->{index_record}] if $state->{index_record}; - - my $start_id = $state->{start_record} || 0; - my $stop_id = $state->{stop_record}; - my $modified_since = $state->{modified_since}; - - my ($select, $from, $where); - if ($modified_since) { - $select = "SELECT id"; - $from = "FROM elastic.bib_last_mod_date"; - $where = "WHERE last_mod_date > '$modified_since'"; - } else { - $select = "SELECT id"; - $from = "FROM biblio.record_entry"; - $where = "WHERE NOT deleted AND active"; - } - - $where .= " AND id >= $start_id" if $start_id; - $where .= " AND id <= $stop_id" if $stop_id; - - # Ordering by ID is the simplest way to guarantee all requested - # records are processed, given that edit dates may not be unique - # and that we're using start_id/stop_id instead of OFFSET to - # define the batches. - my $order = "ORDER BY id"; - - my $sql = "$select $from $where $order LIMIT $BIB_BATCH_SIZE"; - - my $ids = $self->get_db_rows($sql); - return [ map {$_->{id}} @$ids ]; -} - sub get_bib_data { my ($self, $record_ids) = @_; @@ -423,58 +338,6 @@ SQL return $holdings; } -# Example pulling marc tag/subfield data. -# TODO: Create a separate bib-marc index if needed. -sub load_marc { - my ($self, $bib_ids) = @_; - - my $bib_ids_str = join(',', @$bib_ids); - - my $marc_data = $self->get_db_rows(<info("ES found ".scalar(@$marc_data). - " full record rows for current record batch"); - - my $marc = {}; - for my $row (@$marc_data) { - - my $rec_id = $row->{record}; - next unless defined $row->{value} && $row->{value} ne ''; - - $marc->{$rec_id} = [] unless $marc->{$rec_id}; - delete $row->{subfield} unless defined $row->{subfield}; - - # Add values to existing record/tag/subfield rows. 
- - my ($existing) = grep { - $_->{record} == $row->{record} && - $_->{tag} eq $row->{tag} && ( - (not defined $_->{subfield} && not defined $row->{subfield}) || - ($_->{subfield} eq $row->{subfield}) - ) - } @{$marc->{$rec_id}}; - - if ($existing) { - - $existing->{subfield} = [$existing->{subfield}] - unless ref $existing->{subfield}; - push(@{$existing->{subfield}}, $row->{value}); - - } else { - - push(@{$marc->{$rec_id}}, $row); - } - } - - return $marc; -} - - - 1; diff --git a/Open-ILS/src/support-scripts/elastic-index.pl b/Open-ILS/src/support-scripts/elastic-index.pl index c24aa38e20..80d3ef46d7 100755 --- a/Open-ILS/src/support-scripts/elastic-index.pl +++ b/Open-ILS/src/support-scripts/elastic-index.pl @@ -5,8 +5,8 @@ use Getopt::Long; use OpenSRF::Utils::JSON; use OpenILS::Utils::Fieldmapper; use OpenILS::Utils::CStoreEditor; -use OpenILS::Elastic::BibSearch; -use OpenILS::Elastic::BibMarc; +use OpenILS::Elastic::Bib::Search; +use OpenILS::Elastic::Bib::Marc; my $help; my $osrf_config = '/openils/conf/opensrf_core.xml'; @@ -128,9 +128,9 @@ OpenILS::Utils::CStoreEditor::init(); my $es; if ($index_name eq 'bib-search') { - $es = OpenILS::Elastic::BibSearch->new($cluster); + $es = OpenILS::Elastic::Bib::Search->new($cluster); } elsif ($index_name eq 'bib-marc') { - $es = OpenILS::Elastic::BibMarc->new($cluster); + $es = OpenILS::Elastic::Bib::Marc->new($cluster); } if (!$es) { -- 2.11.0