#!/usr/bin/perl
# ----------------------------------------------------------------------
# Find bib records matching the requested criteria for linking.
# Bib IDs are exported to one or more batch files for future processing.
#
# Provenance: JBAS-1437 "Bibs-to-link finder and batch generator",
# Bill Erickson, 2016-12-02, Evergreen working repository commit
# 261b41348622a9108e6ee21b5c257501c9571b55
# (KCLS/linking/find-bibs-to-link.pl).
# ----------------------------------------------------------------------
use strict;
use warnings;
use DBI;
use Getopt::Long;
use DateTime;

my $db_handle;

# options
my $help;
my $modified_since;
my $exported_since;
my $batch_size = 10000;
my $start_id;
my $end_id;
my $count_only;
my $out_dir = '/tmp';
my $db_host = $ENV{PGHOST} || 'localhost';
my $db_port = $ENV{PGPORT} || '5432';
my $db_user = $ENV{PGUSER} || 'evergreen';
my $db_name = $ENV{PGDATABASE} || 'evergreen';
my $db_pass = $ENV{PGPASSWORD};

my $opt_result = GetOptions(
    'modified-since=s' => \$modified_since,
    'exported-since=s' => \$exported_since,
    'start-id=i'       => \$start_id,
    'end-id=i'         => \$end_id,
    'batch-size=i'     => \$batch_size,
    'count-only'       => \$count_only,
    'out-dir=s'        => \$out_dir,
    'db-host=s'        => \$db_host,
    'db-user=s'        => \$db_user,
    'db-pass=s'        => \$db_pass,
    'db-port=s'        => \$db_port,
    # The database name previously could only be set via PGDATABASE;
    # every other connection parameter already had a switch.
    'db-name=s'        => \$db_name,
    'help'             => \$help
);

# Print a message to STDOUT prefixed with the current local date and time.
sub announce {
    my $msg = shift;
    print DateTime->now(time_zone => 'local')->strftime('%F %T')." $msg\n";
}

# Print the usage text and exit.
# Takes an optional exit status (default 0) so that a failed option
# parse can be reported to the caller as an error.
# NOTE(review): the text above the first option description is new
# wording, and the value placeholders (DATE, ID, ...) are restored by
# hand -- confirm against the committed file.
sub help {
    my ($exit_code) = @_;
    $exit_code = 0 unless defined $exit_code;

    print <<HELP;

Find bib records to link and write their IDs to batch files.

Usage:

    $0 [options]

Options:

    --modified-since DATE
        Limit bibs to those modified since the specified date.

    --exported-since DATE
        Limit bibs to those exported since the specified date.
        Export date is based on data found in the
        metabib.bib_export_data table.

    --start-id ID
        Limit bibs to those whose ID is no less than ID

    --end-id ID
        Limit bibs to those whose ID is no greater than ID

    --out-dir DIR [/tmp]
        Output directory.

    --batch-size COUNT [10000]
        Number of bib IDs to write to each batch file.

    --count-only
        Print the total number of records that would be added
        to batch files without adding to any batch files.

    --db-host
    --db-user
    --db-pass
    --db-port
    --db-name
        Database connection params.  PG environment variables are
        also inspected for values.  When all else fails, try to
        connect to database evergreen\@localhost
HELP
    exit $exit_code;
}

help(1) if !$opt_result;
help(0) if $help;

# A batch size of zero would otherwise surface as "Illegal modulus zero"
# part-way through the run, after the query has already been executed.
die "--batch-size must be a positive integer\n" if $batch_size < 1;

# Open the database connection and store it in $db_handle.
# RaiseError is enabled, so later DBI failures die instead of
# returning false.
sub connect_db {
    $db_handle = DBI->connect(
        "dbi:Pg:db=$db_name;host=$db_host;port=$db_port;options='--statement-timeout=0'",
        $db_user, $db_pass, {
            RaiseError => 1,
            PrintError => 0,
            AutoCommit => 1,
            pg_expand_array => 0,
            pg_enable_utf8 => 1
        }
    ) or die "Connection to database failed: $DBI::err : $DBI::errstr";
}

connect_db();

# ----------------------------------------------------------------------
# Build the query.  Every user-supplied value is passed as a bind
# parameter rather than interpolated into the SQL text.  All
# placeholders live in the WHERE clause, so @binds is kept in the same
# order as @conditions.
# ----------------------------------------------------------------------
my @joins;
my @conditions = ('NOT bre.deleted');
my @binds;

# defined() rather than truthiness so an explicit 0 is honoured.
if (defined $start_id) {
    push @conditions, 'bre.id >= ?';
    push @binds, $start_id;
}

if (defined $end_id) {
    push @conditions, 'bre.id <= ?';
    push @binds, $end_id;
}

# --modified-since was accepted and documented but never applied.
# NOTE(review): edit_date is the modification timestamp column in the
# stock Evergreen biblio.record_entry table -- confirm for this schema.
if (defined $modified_since && length $modified_since) {
    push @conditions, 'bre.edit_date > ?';
    push @binds, $modified_since;
}

if (defined $exported_since && length $exported_since) {
    push @joins, 'JOIN metabib.bib_export_data bed ON (bed.bib = bre.id)';
    push @conditions, 'bed.export_date > ?';
    push @binds, $exported_since;
}

# NOTE(review): the original query text was not recoverable.  The
# select list is reconstructed from the only column the loop below
# reads (id); ORDER BY is added so batch contents are reproducible
# from run to run -- confirm against the committed file.
my $sql = join(' ',
    'SELECT bre.id',
    'FROM biblio.record_entry bre',
    @joins,
    'WHERE', join(' AND ', @conditions),
    'ORDER BY bre.id'
);

my $sth = $db_handle->prepare($sql);
$sth->execute(@binds);

my $batch_file;

# Close the current batch file, if any, and open $path for writing as
# the new one.  Dies if either step fails.
sub open_batch_file {
    my $path = shift;
    announce("Starting new batch file: $path");

    if ($batch_file) {
        # Buffered write errors only surface at close time.
        close $batch_file
            or die "Cannot close batch file: $!\n";
    }

    open $batch_file, '>', $path
        or die "Cannot open batch file '$path' for writing: $!\n";
}

my $ctr = 0;
my $batch = 0;
while (my ($bib_id) = $sth->fetchrow_array) {
    $ctr++;
    next if $count_only;

    # Start a new file on the first row and after every $batch_size rows.
    if (($ctr - 1) % $batch_size == 0) {
        # $out_dir is passed as an argument, not embedded in the format,
        # so a '%' in the directory name cannot corrupt the file name.
        open_batch_file(sprintf('%s/bib-ids.%03d', $out_dir, $batch));
        $batch++;
    }

    print {$batch_file} "$bib_id\n"
        or die "Cannot write to batch file: $!\n";
}

if ($batch_file) {
    close $batch_file
        or die "Cannot close batch file: $!\n";
}

$sth->finish;
$db_handle->disconnect;

announce("Found $ctr bib records");