From 28d87c24928344307fc68338aa01ca2c1d97173a Mon Sep 17 00:00:00 2001
From: Bill Erickson <>
Date: Tue, 16 Feb 2016 12:59:55 -0500
Subject: [PATCH] LP 1768715: pingest supports max/min ID, duration, more ops

From the new help text:

    --batch-size
        Number of records to process per batch

    --max-child
        Max number of worker processes

    --skip-browse
    --skip-attrs
    --skip-search
    --skip-facets
        Skip the selected reingest component

    --start-id
        Start processing at this record ID.

    --end-id
        Stop processing when this record ID is reached

    --max-duration
        Stop processing after this many total seconds have passed.

    --help
        Show this help text.

Signed-off-by: Bill Erickson <>
Signed-off-by: Jason Stephenson <>
 Open-ILS/src/support-scripts/ | 147 ++++++++++++++++++++++----------
 1 file changed, 104 insertions(+), 43 deletions(-)

diff --git a/Open-ILS/src/support-scripts/ b/Open-ILS/src/support-scripts/
index 28cb031b23..ed9f45e950 100755
--- a/Open-ILS/src/support-scripts/
+++ b/Open-ILS/src/support-scripts/
@@ -18,49 +18,89 @@
 use strict;
 use warnings;
 use DBI;
+use Getopt::Long;
+# Globals for the command line options:
 # You will want to adjust the next two based on your database size,
 # i.e. number of bib records as well as the number of cores on your
 # database server.  Using roughly number of cores/2 doesn't seem to
 # have much impact in off peak times.
-use constant {
-    BATCHSIZE => 10000,
-    MAXCHILD => 8
-# Globals for the command line options:
-my $do_browse = 1; # Do the browse reingest.
-my $do_attrs = 1; # Do the record attributes reingest.
-my $do_search = 1; # Do the search reingest.
-my $do_facets = 1; # Do the facets reingest.
-# Command line options to skip different reingests. In this case, we
-# use the '-' to indicate a minus or a no, so to
-# skip browse reingest: -browse or -b
-# skip attribute reingest: -attributes or -a
-# skip search reingest: -search or -s
-# skip facet reingest: -facets or -f
-foreach (@ARGV) {
-    if (/^-b(?:rowse)?$/) {
-        $do_browse = 0;
-    } elsif (/^-a(?:ttr(?:ibute)?s?)?$/) {
-        $do_attrs = 0;
-    } elsif (/^-s(?:earch)?$/) {
-        $do_search = 0;
-    } elsif (/^-f(?:acets?)?$/) {
-        $do_facets = 0;
-    } else {
-        # TODO: Add usage() function to report allowed options.
-        die ("Unrecognized option: $_");
-    }
+my $batch_size = 10000; # records processed per batch
+my $max_child  = 8;     # max number of parallel worker processes
+my $skip_browse;  # Skip the browse reingest.
+my $skip_attrs;   # Skip the record attributes reingest.
+my $skip_search;  # Skip the search reingest.
+my $skip_facets;  # Skip the facets reingest.
+my $start_id;     # start processing at this bib ID.
+my $end_id;       # stop processing when this bib ID is reached.
+my $max_duration; # max processing duration in seconds
+my $help;         # show help text
+    'batch-size=i'   => \$batch_size,
+    'max-child=i'    => \$max_child,
+    'skip-browse'    => \$skip_browse,
+    'skip-attrs'     => \$skip_attrs,
+    'skip-search'    => \$skip_search,
+    'skip-facets'    => \$skip_facets,
+    'start-id=i'     => \$start_id,
+    'end-id=i'       => \$end_id,
+    'max-duration=i' => \$max_duration,
+    'help'           => \$help
+sub help {
+    print <<HELP;
+    $0 --batch-size $batch_size --max-child $max_child \
+        --start-id 1 --end-id 500000 --max-duration 14400
+    --batch-size
+        Number of records to process per batch
+    --max-child
+        Max number of worker processes
+    --skip-browse
+    --skip-attrs
+    --skip-search
+    --skip-facets
+        Skip the selected reingest component
+    --start-id
+        Start processing at this record ID.
+    --end-id
+        Stop processing when this record ID is reached
+    --max-duration
+        Stop processing after this many total seconds have passed.
+    --help
+        Show this help text.
+    exit;
+help() if $help;
+my $where = "WHERE deleted = 'f'";
+if ($start_id && $end_id) {
+    $where .= " AND id BETWEEN $start_id AND $end_id";
+} elsif ($start_id) {
+    $where .= " AND id >= $start_id";
+} elsif ($end_id) {
+    $where .= " AND id <= $end_id";
 # "Gimme the keys!  I'll drive!"
 my $q = <<END_OF_Q;
 FROM biblio.record_entry
-WHERE deleted = 'f'
-AND id > 0
@@ -72,6 +112,13 @@ my @lol = ();
 # To do the browse-only ingest:
 my @blist = ();
+my $start_epoch = time;
+sub duration_expired {
+    return 1 if $max_duration && (time - $start_epoch) >= $max_duration;
+    return 0;
 # All of the DBI->connect() calls in this file assume that you have
 # variables in your execution environment.  If you have not, you have
@@ -88,7 +135,7 @@ foreach my $r (@$results) {
     my $record = $r->[0];
     push(@blist, $record); # separate list of browse-only ingest
     push(@$records, $record);
-    if (++$count == BATCHSIZE) {
+    if (++$count == $batch_size) {
         $lol[$lists++] = $records;
         $count = 0;
         $records = [];
@@ -106,17 +153,20 @@ $count = 0;
 my @running = ();
 # We start the browse-only ingest before starting the other ingests.
-browse_ingest(@blist) if ($do_browse);
+browse_ingest(@blist) unless ($skip_browse);
-# We loop until we have processed all of the batches stored in @lol:
+# We loop until we have processed all of the batches stored in @lol
+# or the maximum processing duration has been reached.
 while ($count < $lists) {
-    if (scalar(@lol) && scalar(@running) < MAXCHILD) {
+    my $duration_expired = duration_expired();
+    if (scalar(@lol) && scalar(@running) < $max_child && !$duration_expired) {
         # Reuse $records for the lulz.
         $records = shift(@lol);
-        if ($do_search || $do_facets || $do_attrs) {
-            reingest($records);
-        } else {
+        if ($skip_search && $skip_facets && $skip_attrs) {
+        } else {
+            reingest($records);
     } else {
         my $pid = wait();
@@ -126,6 +176,11 @@ while ($count < $lists) {
             print "$count of $lists processed\n";
+    if ($duration_expired && scalar(@running) == 0) {
+        warn "Exiting on max_duration ($max_duration)\n";
+        exit(0);
+    }
 # This subroutine forks a process to do the browse-only ingest on the
@@ -151,6 +206,11 @@ sub browse_ingest {
             } else {
                 warn ("Browse ingest failed for record $_");
+            if (duration_expired()) {
+                warn "browse_ingest() stopping on record $_ ".
+                    "after max duration reached\n";
+                last;
+            }
@@ -168,8 +228,9 @@ sub reingest {
         push(@running, $pid);
     } elsif ($pid == 0) {
         my $dbh = DBI->connect('DBI:Pg:');
-        reingest_attributes($dbh, $list) if ($do_attrs);
-        reingest_field_entries($dbh, $list) if ($do_facets || $do_search);
+        reingest_attributes($dbh, $list) unless ($skip_attrs);
+        reingest_field_entries($dbh, $list)
+            unless ($skip_facets && $skip_search);
@@ -181,8 +242,8 @@ sub reingest_field_entries {
     my $list = shift;
     my $sth = $dbh->prepare("SELECT metabib.reingest_metabib_field_entries(?, ?, TRUE, ?)");
     # Because reingest uses "skip" options we invert the logic of do variables.
-    $sth->bind_param(2, ($do_facets) ? 0 : 1);
-    $sth->bind_param(3, ($do_search) ? 0 : 1);
+    $sth->bind_param(2, ($skip_facets) ? 1 : 0);
+    $sth->bind_param(3, ($skip_search) ? 1 : 0);
     foreach (@$list) {
         $sth->bind_param(1, $_);
         if ($sth->execute()) {