use OpenILS::Utils::CStoreEditor qw/:funcs/;
use Search::Elasticsearch;
use OpenSRF::Utils::JSON;
+use Data::Dumper;
+$Data::Dumper::Indent = 0;
sub new {
my ($class, $cluster) = @_;
}
}
+# Remove one or more documents from the index by ID.
+# $ids may be a single record ID or an array ref of IDs.
+# Returns the delete_by_query response hash on success; on error,
+# logs the failure and returns undef.
+sub delete_documents {
+ my ($self, $ids) = @_;
+ $ids = [$ids] unless ref $ids;
+
+ my $result;
+
+ eval {
+
+ # terms query matches every document whose 'id' field
+ # appears in @$ids, so one call removes the whole batch.
+ $result = $self->es->delete_by_query(
+ index => $self->index_name,
+ type => 'record',
+ body => {query => {terms => {id => $ids}}}
+ );
+ };
+
+ if ($@) {
+ $logger->error("ES delete document failed with $@");
+ return undef;
+ }
+
+ # 'deleted' is the document count reported by delete_by_query.
+ $logger->debug("ES delete removed " . $result->{deleted} . " document");
+ return $result;
+}
+
sub index_document {
my ($self, $id, $body) = @_;
eval {
$result = $self->es->index(
index => $self->index_name,
- type => 'record', # deprecated in v6
+ type => 'record',
id => $id,
body => $body
);
return undef;
}
- $logger->debug("ES index command returned $result");
+ if ($result->{failed}) {
+ $logger->error("ES index document $id failed " . Dumper($result));
+ return undef;
+ }
+
+ $logger->debug("ES index => $id succeeded");
return $result;
}
circulate => {type => 'boolean'},
opac_visible => {type => 'boolean'}
}
- }#,
-
-# marc => {
-# type => 'nested',
-# properties => {
-# tag => {type => 'text'}
-# # subfields are added dynamically
-# }
-# }
+ }
};
sub index_name {
my $start_id = $state->{start_record} || 0;
my $stop_id = $state->{stop_record};
- my $start_date = $state->{start_date};
+ my $modified_since = $state->{modified_since};
my ($select, $from, $where);
- if ($start_date) {
+ if ($modified_since) {
$select = "SELECT id";
$from = "FROM elastic.bib_last_mod_date";
- $where = "WHERE last_mod_date > '$start_date'";
+ $where = "WHERE last_mod_date > '$modified_since'";
} else {
$select = "SELECT id";
$from = "FROM biblio.record_entry";
bre.create_date,
bre.edit_date,
bre.source AS bib_source,
+ bre.deleted,
(elastic.bib_record_properties(bre.id)).*
FROM biblio.record_entry bre
WHERE id IN ($ids_str)
my $bib_data = $self->get_bib_data($bib_ids);
+ # Remove records that are marked deleted from the index and drop
+ # them from the set of IDs to (re)index. In practice this only
+ # happens when running in refresh mode, since newly indexed
+ # records are not yet deleted.
+
+ my @active_ids;
+ for my $bib_id (@$bib_ids) {
+
+ # Every row in the result data contains the 'deleted' value,
+ # so the first matching row is sufficient.
+ # NOTE(review): assumes every $bib_id has at least one row in
+ # @$bib_data; if not, $field is undef here — confirm with
+ # get_bib_data() before relying on this.
+ my ($field) = grep {$_->{id} == $bib_id} @$bib_data;
+
+ if ($field->{deleted} == 1) { # not 't' / 'f'
+ $self->delete_documents($bib_id);
+ } else {
+ push(@active_ids, $bib_id);
+ }
+ }
+
+ # Continue with only the non-deleted records.
+ $bib_ids = [@active_ids];
+
my $holdings = $self->load_holdings($bib_ids);
- #my $marc = $self->load_marc($bib_ids);
for my $bib_id (@$bib_ids) {
my $body = {
holdings => $holdings->{$bib_id} || []
- #marc => $marc || []
};
# there are multiple rows per bib in the data list.
return $holdings;
}
-# TODO: Disabled for now because this approach leads to exceeding the
-# maximum number of indexed fields (1k). Create a separate index
-# for MARC searches.
-# Load MARC record tags and subfields
+# Example pulling marc tag/subfield data.
+# TODO: Create a separate bib-marc index if needed.
sub load_marc {
my ($self, $bib_ids) = @_;
/* give me bibs I should upate */
CREATE OR REPLACE VIEW elastic.bib_last_mod_date AS
+ /**
+ * Last update date for each bib, which is taken from most recent
+ * edit for either the bib, a linked call number, or a linked copy.
+ * If no call numbers are linked, uses the bib edit date only.
+ * Includes deleted data since it can impact indexing.
+ */
WITH mod_dates AS (
SELECT bre.id,
bre.edit_date,
- MAX(acn.edit_date) AS max_call_number_edit_date,
- MAX(acp.edit_date) AS max_copy_edit_date
+ MAX(COALESCE(acn.edit_date, '1901-01-01')) AS max_call_number_edit_date,
+ MAX(COALESCE(acp.edit_date, '1901-01-01')) AS max_copy_edit_date
FROM biblio.record_entry bre
- JOIN asset.call_number acn ON (acn.record = bre.id)
- JOIN asset.copy acp ON (acp.call_number = acn.id)
- WHERE
- bre.active
- AND NOT bre.deleted
- AND NOT acn.deleted
- AND NOT acp.deleted
+ LEFT JOIN asset.call_number acn ON (acn.record = bre.id)
+ LEFT JOIN asset.copy acp ON (acp.call_number = acn.id)
GROUP BY 1, 2
) SELECT dates.id,
GREATEST(dates.edit_date,
my $index_record;
my $start_record;
my $stop_record;
-my $start_date;
+my $modified_since;
my $max_duration;
my $batch_size = 500;
'index-record=s' => \$index_record,
'start-record=s' => \$start_record,
'stop-record=s' => \$stop_record,
- 'start-date=s' => \$start_date,
+ 'modified-since=s' => \$modified_since,
'max-duration=s' => \$max_duration,
'batch-size=s' => \$batch_size,
'db-name=s' => \$db_name,
--stop-record <id>
Stop indexing after the record with this ID has been indexed.
- --start-date <YYYY-MM-DD[Thh::mm:ss]>
- Start indexing records whose last edit date falls after
- the provided date;
+ --modified-since <YYYY-MM-DD[Thh:mm:ss]>
+ Index new records and reindex existing records whose last
+ modification date falls after the date provided. Use this
+ at regular intervals to keep the ES-indexed data in sync
+ with the EG data.
--max-duration <duration>
Stop indexing once the process has been running for this
db_user => $db_user,
db_pass => 'REDACTED',
db_appn => $db_appn,
- index_record => $index_record,
- start_record => $start_record,
- stop_record => $stop_record,
- start_date => $start_date,
- max_duration => $max_duration,
- batch_size => $batch_size
+ index_record => $index_record,
+ start_record => $start_record,
+ stop_record => $stop_record,
+ modified_since => $modified_since,
+ max_duration => $max_duration,
+ batch_size => $batch_size
};
print "Commencing index populate with settings: " .