From 429c0eef0fe5534e8ede36068657f51ad3d3c9bc Mon Sep 17 00:00:00 2001 From: Bill Erickson Date: Mon, 8 Jul 2019 18:12:16 -0400 Subject: [PATCH] working on data sync Signed-off-by: Bill Erickson ---
# Reviewer notes. This text sits between the "---" marker and the diffstat,
# the free-form notes area of a format-patch mail; it is not part of the
# commit message and not part of any hunk.
#
# What the hunks below change:
#   * OpenILS/Elastic.pm
#       - Loads Data::Dumper and sets $Data::Dumper::Indent = 0.
#       - Adds delete_documents($ids): wraps a scalar ID in an array ref,
#         calls es->delete_by_query with a terms query on "id" inside an
#         eval block, logs and returns undef if $@ is set, otherwise logs
#         $result->{deleted} and returns $result.
#       - index_document: drops the "deprecated in v6" remark on the type
#         argument, returns undef (after logging a Dumper of the result)
#         when $result->{failed} is true, and logs "ES index => $id
#         succeeded" otherwise.
#   * OpenILS/Elastic/BibSearch.pm
#       - Removes the commented-out "marc" mapping from $BASE_PROPERTIES and
#         the commented-out load_marc usage in populate_bib_index_batch.
#       - get_bib_ids reads $state->{modified_since} instead of
#         $state->{start_date}.
#       - The bib data query additionally selects bre.deleted.
#       - populate_bib_index_batch calls delete_documents for each bib whose
#         matching row has deleted == 1 and continues with the remaining IDs.
#   * XXXX.schema.elastic-search.sql
#       - elastic.bib_last_mod_date uses LEFT JOINs, drops the active /
#         NOT deleted filters, and COALESCEs missing call number and copy
#         edit dates to '1901-01-01' before taking MAX.
#   * elastic-index.pl
#       - Renames the --start-date option, its variable, its help text and
#         its settings key to --modified-since / modified_since.
#
# Points to confirm before merging:
#   * NOTE(review): get_bib_ids builds "WHERE last_mod_date > '$modified_since'"
#     by string interpolation. The value presumably originates from the
#     --modified-since command-line option; confirm how the statement is
#     executed and whether a bind parameter or quoting can be used instead.
#   * NOTE(review): in populate_bib_index_batch, grep can return an empty
#     list, leaving $field undefined before ->{deleted} is read. Confirm
#     that get_bib_data always yields at least one row per requested ID.
#   * NOTE(review): the same grep rescans @$bib_data once per bib ID in the
#     batch; a one-pass id => deleted lookup would avoid that. Batch size
#     defaults to 500 in elastic-index.pl.
#   * NOTE(review): index_document checks a top-level $result->{failed};
#     confirm that key exists at that level in the es->index response
#     (it cannot be verified from this patch).
#   * NOTE(review): the "== 1" comparison carries the remark "not 't' / 'f'";
#     confirm the driver returns a numeric value for bre.deleted.
#   * NOTE(review): delete_documents always logs the word "document" in the
#     singular, whatever count $result->{deleted} holds.
#   * NOTE(review): the '1901-01-01' sentinel only behaves as "no date" if
#     every real edit_date is later than that; confirm for migrated data.
#   * NOTE(review): --start-date is removed without an alias, so existing
#     invocations of elastic-index.pl using it will need updating.
#   * NOTE(review): the context line "give me bibs I should upate" contains
#     a typo; fix it in the target SQL file rather than in this patch, so
#     the hunk context keeps matching.
Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm | 37 ++++++++++++++++++- .../src/perlmods/lib/OpenILS/Elastic/BibSearch.pm | 43 +++++++++++++--------- .../sql/Pg/upgrade/XXXX.schema.elastic-search.sql | 19 +++++----- Open-ILS/src/support-scripts/elastic-index.pl | 24 ++++++------ 4 files changed, 83 insertions(+), 40 deletions(-) diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm index b30a1c2cb8..372d31acaa 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm @@ -21,6 +21,8 @@ use OpenSRF::Utils::Logger qw/:logger/; use OpenILS::Utils::CStoreEditor qw/:funcs/; use Search::Elasticsearch; use OpenSRF::Utils::JSON; +use Data::Dumper; +$Data::Dumper::Indent = 0; sub new { my ($class, $cluster) = @_; @@ -154,6 +156,32 @@ sub delete_index { } } +# Remove multiple documents from the index by ID. +# $ids can be a single ID or an array ref of IDs. +sub delete_documents { + my ($self, $ids) = @_; + $ids = [$ids] unless ref $ids; + + my $result; + + eval { + + $result = $self->es->delete_by_query( + index => $self->index_name, + type => 'record', + body => {query => {terms => {id => $ids}}} + ); + }; + + if ($@) { + $logger->error("ES delete document failed with $@"); + return undef; + } + + $logger->debug("ES delete removed " . $result->{deleted} . 
" document"); + return $result; +} + sub index_document { my ($self, $id, $body) = @_; @@ -162,7 +190,7 @@ sub index_document { eval { $result = $self->es->index( index => $self->index_name, - type => 'record', # deprecated in v6 + type => 'record', id => $id, body => $body ); @@ -173,7 +201,12 @@ sub index_document { return undef; } - $logger->debug("ES index command returned $result"); + if ($result->{failed}) { + $logger->error("ES index document $id failed " . Dumper($result)); + return undef; + } + + $logger->debug("ES index => $id succeeded"); return $result; } diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm index 894b15c97f..8f8cabe3e1 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm @@ -67,15 +67,7 @@ my $BASE_PROPERTIES = { circulate => {type => 'boolean'}, opac_visible => {type => 'boolean'} } - }#, - -# marc => { -# type => 'nested', -# properties => { -# tag => {type => 'text'} -# # subfields are added dynamically -# } -# } + } }; sub index_name { @@ -222,13 +214,13 @@ sub get_bib_ids { my $start_id = $state->{start_record} || 0; my $stop_id = $state->{stop_record}; - my $start_date = $state->{start_date}; + my $modified_since = $state->{modified_since}; my ($select, $from, $where); - if ($start_date) { + if ($modified_since) { $select = "SELECT id"; $from = "FROM elastic.bib_last_mod_date"; - $where = "WHERE last_mod_date > '$start_date'"; + $where = "WHERE last_mod_date > '$modified_since'"; } else { $select = "SELECT id"; $from = "FROM biblio.record_entry"; @@ -261,6 +253,7 @@ SELECT bre.create_date, bre.edit_date, bre.source AS bib_source, + bre.deleted, (elastic.bib_record_properties(bre.id)).* FROM biblio.record_entry bre WHERE id IN ($ids_str) @@ -281,14 +274,30 @@ sub populate_bib_index_batch { my $bib_data = $self->get_bib_data($bib_ids); + # Remove records that are marked deleted. 
+ # This should only happen when running in refresh mode. + + my @active_ids; + for my $bib_id (@$bib_ids) { + + # Every row in the result data contains the 'deleted' value. + my ($field) = grep {$_->{id} == $bib_id} @$bib_data; + + if ($field->{deleted} == 1) { # not 't' / 'f' + $self->delete_documents($bib_id); + } else { + push(@active_ids, $bib_id); + } + } + + $bib_ids = [@active_ids]; + my $holdings = $self->load_holdings($bib_ids); - #my $marc = $self->load_marc($bib_ids); for my $bib_id (@$bib_ids) { my $body = { holdings => $holdings->{$bib_id} || [] - #marc => $marc || [] }; # there are multiple rows per bib in the data list. @@ -395,10 +404,8 @@ SQL return $holdings; } -# TODO: Disabled for now because this approach leads to exceeding the -# maximum number of indexed fields (1k). Create a separate index -# for MARC searches. -# Load MARC record tags and subfields +# Example pulling marc tag/subfield data. +# TODO: Create a separate bib-marc index if needed. sub load_marc { my ($self, $bib_ids) = @_; diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.elastic-search.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.elastic-search.sql index 743f5f160b..c26c487d2b 100644 --- a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.elastic-search.sql +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.elastic-search.sql @@ -137,19 +137,20 @@ END $FUNK$ LANGUAGE PLPGSQL; /* give me bibs I should upate */ CREATE OR REPLACE VIEW elastic.bib_last_mod_date AS + /** + * Last update date for each bib, which is taken from most recent + * edit for either the bib, a linked call number, or a linked copy. + * If no call numbers are linked, uses the bib edit date only. + * Includes deleted data since it can impact indexing. 
+ */ WITH mod_dates AS ( SELECT bre.id, bre.edit_date, - MAX(acn.edit_date) AS max_call_number_edit_date, - MAX(acp.edit_date) AS max_copy_edit_date + MAX(COALESCE(acn.edit_date, '1901-01-01')) AS max_call_number_edit_date, + MAX(COALESCE(acp.edit_date, '1901-01-01')) AS max_copy_edit_date FROM biblio.record_entry bre - JOIN asset.call_number acn ON (acn.record = bre.id) - JOIN asset.copy acp ON (acp.call_number = acn.id) - WHERE - bre.active - AND NOT bre.deleted - AND NOT acn.deleted - AND NOT acp.deleted + LEFT JOIN asset.call_number acn ON (acn.record = bre.id) + LEFT JOIN asset.copy acp ON (acp.call_number = acn.id) GROUP BY 1, 2 ) SELECT dates.id, GREATEST(dates.edit_date, diff --git a/Open-ILS/src/support-scripts/elastic-index.pl b/Open-ILS/src/support-scripts/elastic-index.pl index 6544716a29..05ebc6b9dc 100755 --- a/Open-ILS/src/support-scripts/elastic-index.pl +++ b/Open-ILS/src/support-scripts/elastic-index.pl @@ -17,7 +17,7 @@ my $populate; my $index_record; my $start_record; my $stop_record; -my $start_date; +my $modified_since; my $max_duration; my $batch_size = 500; @@ -39,7 +39,7 @@ GetOptions( 'index-record=s' => \$index_record, 'start-record=s' => \$start_record, 'stop-record=s' => \$stop_record, - 'start-date=s' => \$start_date, + 'modified-since=s' => \$modified_since, 'max-duration=s' => \$max_duration, 'batch-size=s' => \$batch_size, 'db-name=s' => \$db_name, @@ -97,9 +97,11 @@ sub help { --stop-record Stop indexing after the record with this ID has been indexed. - --start-date - Start indexing records whose last edit date falls after - the provided date; + --modified-since + Index new records and reindex existing records whose last + modification date falls after the date provided. Use this + at regular intervals to keep the ES-indexed data in sync + with the EG data. 
 --max-duration Stop indexing once the process has been running for this @@ -151,12 +153,12 @@ if ($populate) { db_user => $db_user, db_pass => 'REDACTED', db_appn => $db_appn, - index_record => $index_record, - start_record => $start_record, - stop_record => $stop_record, - start_date => $start_date, - max_duration => $max_duration, - batch_size => $batch_size + index_record => $index_record, + start_record => $start_record, + stop_record => $stop_record, + modified_since => $modified_since, + max_duration => $max_duration, + batch_size => $batch_size }; print "Commencing index populate with settings: " . -- 2.11.0