use OpenILS::Utils::CStoreEditor qw/:funcs/;
use Search::Elasticsearch;
use OpenSRF::Utils::JSON;
+use Data::Dumper;
+$Data::Dumper::Indent = 0;
sub new {
my ($class, $cluster) = @_;
}
}
+# Remove one or more documents from the index by ID.
+# $ids may be a single record ID or an array ref of IDs.
+# Returns the delete_by_query response hash on success; on error,
+# logs the failure and returns undef.
+sub delete_documents {
+ my ($self, $ids) = @_;
+ $ids = [$ids] unless ref $ids;
+
+ my $result;
+
+ eval {
+
+ # terms query matches every document whose 'id' field
+ # appears in @$ids, so one call removes the whole batch.
+ $result = $self->es->delete_by_query(
+ index => $self->index_name,
+ type => 'record',
+ body => {query => {terms => {id => $ids}}}
+ );
+ };
+
+ if ($@) {
+ $logger->error("ES delete document failed with $@");
+ return undef;
+ }
+
+ # 'deleted' is the document count reported by delete_by_query.
+ $logger->debug("ES delete removed " . $result->{deleted} . " document");
+ return $result;
+}
+
sub index_document {
my ($self, $id, $body) = @_;
eval {
$result = $self->es->index(
index => $self->index_name,
- type => 'record', # deprecated in v6
+ type => 'record',
id => $id,
body => $body
);
return undef;
}
- $logger->debug("ES index command returned $result");
+ if ($result->{failed}) {
+ $logger->error("ES index document $id failed " . Dumper($result));
+ return undef;
+ }
+
+ $logger->debug("ES index => $id succeeded");
return $result;
}
circulate => {type => 'boolean'},
opac_visible => {type => 'boolean'}
}
- }#,
-
-# marc => {
-# type => 'nested',
-# properties => {
-# tag => {type => 'text'}
-# # subfields are added dynamically
-# }
-# }
+ }
};
sub index_name {
my $start_id = $state->{start_record} || 0;
my $stop_id = $state->{stop_record};
- my $start_date = $state->{start_date};
+ my $modified_since = $state->{modified_since};
my ($select, $from, $where);
- if ($start_date) {
+ if ($modified_since) {
$select = "SELECT id";
$from = "FROM elastic.bib_last_mod_date";
- $where = "WHERE last_mod_date > '$start_date'";
+ $where = "WHERE last_mod_date > '$modified_since'";
} else {
$select = "SELECT id";
$from = "FROM biblio.record_entry";
bre.create_date,
bre.edit_date,
bre.source AS bib_source,
+ bre.deleted,
(elastic.bib_record_properties(bre.id)).*
FROM biblio.record_entry bre
WHERE id IN ($ids_str)
my $bib_data = $self->get_bib_data($bib_ids);
+ # Remove records that are marked deleted from the index and drop
+ # them from the set of IDs to (re)index. In practice this only
+ # happens when running in refresh mode, since newly indexed
+ # records are not yet deleted.
+
+ my @active_ids;
+ for my $bib_id (@$bib_ids) {
+
+ # Every row in the result data contains the 'deleted' value,
+ # so the first matching row is sufficient.
+ # NOTE(review): assumes every $bib_id has at least one row in
+ # @$bib_data; if not, $field is undef here — confirm with
+ # get_bib_data() before relying on this.
+ my ($field) = grep {$_->{id} == $bib_id} @$bib_data;
+
+ if ($field->{deleted} == 1) { # not 't' / 'f'
+ $self->delete_documents($bib_id);
+ } else {
+ push(@active_ids, $bib_id);
+ }
+ }
+
+ # Continue with only the non-deleted records.
+ $bib_ids = [@active_ids];
+
my $holdings = $self->load_holdings($bib_ids);
- #my $marc = $self->load_marc($bib_ids);
for my $bib_id (@$bib_ids) {
my $body = {
holdings => $holdings->{$bib_id} || []
- #marc => $marc || []
};
# there are multiple rows per bib in the data list.
return $holdings;
}
-# TODO: Disabled for now because this approach leads to exceeding the
-# maximum number of indexed fields (1k). Create a separate index
-# for MARC searches.
-# Load MARC record tags and subfields
+# Example pulling marc tag/subfield data.
+# TODO: Create a separate bib-marc index if needed.
sub load_marc {
my ($self, $bib_ids) = @_;
/* give me bibs I should upate */
CREATE OR REPLACE VIEW elastic.bib_last_mod_date AS
+ /**
+ * Last update date for each bib, which is taken from most recent
+ * edit for either the bib, a linked call number, or a linked copy.
+ * If no call numbers are linked, uses the bib edit date only.
+ * Includes deleted data since it can impact indexing.
+ */
WITH mod_dates AS (
SELECT bre.id,
bre.edit_date,
- MAX(acn.edit_date) AS max_call_number_edit_date,
- MAX(acp.edit_date) AS max_copy_edit_date
+ MAX(COALESCE(acn.edit_date, '1901-01-01')) AS max_call_number_edit_date,
+ MAX(COALESCE(acp.edit_date, '1901-01-01')) AS max_copy_edit_date
FROM biblio.record_entry bre
- JOIN asset.call_number acn ON (acn.record = bre.id)
- JOIN asset.copy acp ON (acp.call_number = acn.id)
- WHERE
- bre.active
- AND NOT bre.deleted
- AND NOT acn.deleted
- AND NOT acp.deleted
+ LEFT JOIN asset.call_number acn ON (acn.record = bre.id)
+ LEFT JOIN asset.copy acp ON (acp.call_number = acn.id)
GROUP BY 1, 2
) SELECT dates.id,
GREATEST(dates.edit_date,
my $index_record;
my $start_record;
my $stop_record;
-my $start_date;
+my $modified_since;
my $max_duration;
my $batch_size = 500;
'index-record=s' => \$index_record,
'start-record=s' => \$start_record,
'stop-record=s' => \$stop_record,
- 'start-date=s' => \$start_date,
+ 'modified-since=s' => \$modified_since,
'max-duration=s' => \$max_duration,
'batch-size=s' => \$batch_size,
'db-name=s' => \$db_name,
--stop-record <id>
Stop indexing after the record with this ID has been indexed.
- --start-date <YYYY-MM-DD[Thh::mm:ss]>
- Start indexing records whose last edit date falls after
- the provided date;
+ --modified-since <YYYY-MM-DD[Thh:mm:ss]>
+ Index new records and reindex existing records whose last
+ modification date falls after the date provided. Use this
+ at regular intervals to keep the ES-indexed data in sync
+ with the EG data.
--max-duration <duration>
Stop indexing once the process has been running for this
db_user => $db_user,
db_pass => 'REDACTED',
db_appn => $db_appn,
- index_record => $index_record,
- start_record => $start_record,
- stop_record => $stop_record,
- start_date => $start_date,
- max_duration => $max_duration,
- batch_size => $batch_size
+ index_record => $index_record,
+ start_record => $start_record,
+ stop_record => $stop_record,
+ modified_since => $modified_since,
+ max_duration => $max_duration,
+ batch_size => $batch_size
};
print "Commencing index populate with settings: " .