use strict;
use warnings;
use DBI;
+use Getopt::Long;
+
+# Globals for the command line options:
# You will want to adjust the next two based on your database size,
# i.e. number of bib records as well as the number of cores on your
# database server. Using roughly number of cores/2 doesn't seem to
# have much impact in off peak times.
-use constant {
- BATCHSIZE => 10000,
- MAXCHILD => 8
-};
-
-# Globals for the command line options:
-my $do_browse = 1; # Do the browse reingest.
-my $do_attrs = 1; # Do the record attributes reingest.
-my $do_search = 1; # Do the search reingest.
-my $do_facets = 1; # Do the facets reingest.
-
-# Command line options to skip different reingests. In this case, we
-# use the '-' to indicate a minus or a no, so to
-# skip browse reingest: -browse or -b
-# skip attribute reingest: -attributes or -a
-# skip search reingest: -search or -s
-# skip facet reingest: -facets or -f
-foreach (@ARGV) {
- if (/^-b(?:rowse)?$/) {
- $do_browse = 0;
- } elsif (/^-a(?:ttr(?:ibute)?s?)?$/) {
- $do_attrs = 0;
- } elsif (/^-s(?:earch)?$/) {
- $do_search = 0;
- } elsif (/^-f(?:acets?)?$/) {
- $do_facets = 0;
- } else {
- # TODO: Add usage() function to report allowed options.
- die ("Unrecognized option: $_");
- }
+my $batch_size = 10000; # records processed per batch
+my $max_child = 8; # max number of parallel worker processes
+
+my $skip_browse; # Skip the browse reingest.
+my $skip_attrs; # Skip the record attributes reingest.
+my $skip_search; # Skip the search reingest.
+my $skip_facets; # Skip the facets reingest.
+my $start_id; # start processing at this bib ID.
+my $end_id; # stop processing when this bib ID is reached.
+my $max_duration; # max processing duration in seconds
+my $help; # show help text
+
+# Parse the command line. GetOptions() returns false (after printing its
+# own warning) when it sees an unknown option or a non-integer value for
+# an "=i" option, so abort instead of reingesting with settings the user
+# did not intend.
+GetOptions(
+    'batch-size=i' => \$batch_size,
+    'max-child=i' => \$max_child,
+    'skip-browse' => \$skip_browse,
+    'skip-attrs' => \$skip_attrs,
+    'skip-search' => \$skip_search,
+    'skip-facets' => \$skip_facets,
+    'start-id=i' => \$start_id,
+    'end-id=i' => \$end_id,
+    'max-duration=i' => \$max_duration,
+    'help' => \$help
+) or die "Error in command line arguments; run with --help for usage.\n";
+
+# The batch counter is compared for equality with $batch_size and the
+# number of running workers must be below $max_child, so neither value
+# is usable unless it is at least 1.
+die "--batch-size must be a positive integer\n" if $batch_size < 1;
+die "--max-child must be a positive integer\n" if $max_child < 1;
+
+# Print the usage summary to STDOUT and exit with status 0.
+# The example command line interpolates the script name and the current
+# --batch-size and --max-child values.
+sub help {
+    # The backslash in the example is doubled because this heredoc
+    # interpolates; a single one before the newline would be dropped.
+    print <<HELP;
+
+ $0 --batch-size $batch_size --max-child $max_child \\
+ --start-id 1 --end-id 500000 --max-duration 14400
+
+ --batch-size
+ Number of records to process per batch
+
+ --max-child
+ Max number of worker processes
+
+ --skip-browse
+ --skip-attrs
+ --skip-search
+ --skip-facets
+ Skip the selected reingest component
+
+ --start-id
+ Start processing at this record ID.
+
+ --end-id
+ Stop processing when this record ID is reached
+
+ --max-duration
+ Stop processing after this many total seconds have passed.
+
+ --help
+ Show this help text.
+
+HELP
+    exit;
+}
+
+help() if $help;
+
+# Build the WHERE clause for the record query. Only non-deleted records
+# with a positive ID are selected -- the same "id > 0" filter the query
+# applied before --start-id / --end-id existed -- and the optional bounds
+# narrow that range further. Both bounds are parsed as integers by
+# Getopt::Long ("=i"), so interpolating them into the SQL text is safe.
+my $where = "WHERE deleted = 'f' AND id > 0";
+if ($start_id && $end_id) {
+    $where .= " AND id BETWEEN $start_id AND $end_id";
+} elsif ($start_id) {
+    $where .= " AND id >= $start_id";
+} elsif ($end_id) {
+    $where .= " AND id <= $end_id";
}
# "Gimme the keys! I'll drive!"
my $q = <<END_OF_Q;
SELECT id
FROM biblio.record_entry
-WHERE deleted = 'f'
-AND id > 0
+$where
ORDER BY id ASC
END_OF_Q
# To do the browse-only ingest:
my @blist = ();
+my $start_epoch = time;
+
+# Report whether the optional --max-duration limit has been used up.
+# Returns 1 once at least $max_duration seconds have elapsed since
+# $start_epoch, and 0 otherwise (including when no limit was given).
+sub duration_expired {
+    my $elapsed = time - $start_epoch;
+    return ($max_duration && $elapsed >= $max_duration) ? 1 : 0;
+}
+
# All of the DBI->connect() calls in this file assume that you have
# configured the PGHOST, PGPORT, PGDATABASE, PGUSER, and PGPASSWORD
# variables in your execution environment. If you have not, you have
my $record = $r->[0];
push(@blist, $record); # separate list of browse-only ingest
push(@$records, $record);
- if (++$count == BATCHSIZE) {
+ if (++$count == $batch_size) {
$lol[$lists++] = $records;
$count = 0;
$records = [];
my @running = ();
# We start the browse-only ingest before starting the other ingests.
-browse_ingest(@blist) if ($do_browse);
+browse_ingest(@blist) unless ($skip_browse);
-# We loop until we have processed all of the batches stored in @lol:
+# We loop until we have processed all of the batches stored in @lol
+# or the maximum processing duration has been reached.
while ($count < $lists) {
- if (scalar(@lol) && scalar(@running) < MAXCHILD) {
+ my $duration_expired = duration_expired();
+
+ if (scalar(@lol) && scalar(@running) < $max_child && !$duration_expired) {
# Reuse $records for the lulz.
$records = shift(@lol);
- if ($do_search || $do_facets || $do_attrs) {
- reingest($records);
- } else {
+ if ($skip_search && $skip_facets && $skip_attrs) {
$count++;
+ } else {
+ reingest($records);
}
} else {
my $pid = wait();
print "$count of $lists processed\n";
}
}
+
+ if ($duration_expired && scalar(@running) == 0) {
+ warn "Exiting on max_duration ($max_duration)\n";
+ exit(0);
+ }
}
# This subroutine forks a process to do the browse-only ingest on the
} else {
warn ("Browse ingest failed for record $_");
}
+ if (duration_expired()) {
+ warn "browse_ingest() stopping on record $_ ".
+ "after max duration reached\n";
+ last;
+ }
}
$dbh->disconnect();
exit(0);
push(@running, $pid);
} elsif ($pid == 0) {
my $dbh = DBI->connect('DBI:Pg:');
- reingest_attributes($dbh, $list) if ($do_attrs);
- reingest_field_entries($dbh, $list) if ($do_facets || $do_search);
+ reingest_attributes($dbh, $list) unless ($skip_attrs);
+ reingest_field_entries($dbh, $list)
+ unless ($skip_facets && $skip_search);
$dbh->disconnect();
exit(0);
}
my $list = shift;
my $sth = $dbh->prepare("SELECT metabib.reingest_metabib_field_entries(?, ?, TRUE, ?)");
- # Because reingest uses "skip" options we invert the logic of do variables.
+ # The skip flags are passed straight through as the function's "skip" arguments.
- $sth->bind_param(2, ($do_facets) ? 0 : 1);
- $sth->bind_param(3, ($do_search) ? 0 : 1);
+ $sth->bind_param(2, ($skip_facets) ? 1 : 0);
+ $sth->bind_param(3, ($skip_search) ? 1 : 0);
foreach (@$list) {
$sth->bind_param(1, $_);
if ($sth->execute()) {