JBAS-1437 Bibs-to-link finder and batch generator
author Bill Erickson <berickxx@gmail.com>
Fri, 2 Dec 2016 17:04:17 +0000 (12:04 -0500)
committer Bill Erickson <berickxx@gmail.com>
Thu, 21 Mar 2019 19:46:23 +0000 (15:46 -0400)
Signed-off-by: Bill Erickson <berickxx@gmail.com>
KCLS/linking/find-bibs-to-link.pl [new file with mode: 0755]

diff --git a/KCLS/linking/find-bibs-to-link.pl b/KCLS/linking/find-bibs-to-link.pl
new file mode 100755 (executable)
index 0000000..46a20bf
--- /dev/null
@@ -0,0 +1,169 @@
+#!/usr/bin/perl
+# ----------------------------------------------------------------------
+# Find bib records matching the requested criteria for linking.
+# Bib IDs are exported to one or more batch files for future processing.
+# ----------------------------------------------------------------------
+use strict;
+use warnings;
+use DBI;
+use Getopt::Long;
+use DateTime;
+
+# Database connection handle; assigned by connect_db().
+my $db_handle;
+
+# ----------------------------------------------------------------------
+# Command-line options and their defaults.
+# ----------------------------------------------------------------------
+my $help;
+my $modified_since;         # --modified-since <YYYY-MM-DD>
+my $exported_since;         # --exported-since <YYYY-MM-DD>
+my $batch_size = 10000;     # number of bib IDs per batch file
+my $start_id;               # lowest bib ID to include
+my $end_id;                 # highest bib ID to include
+my $count_only;             # count matching bibs, write no files
+my $out_dir = '/tmp';       # directory that receives the batch files
+
+# Database settings fall back to the standard PG* environment variables,
+# then to hard-coded defaults.  There is no default password.
+my $db_host = $ENV{PGHOST}     || 'localhost';
+my $db_port = $ENV{PGPORT}     || '5432';
+my $db_user = $ENV{PGUSER}     || 'evergreen';
+my $db_name = $ENV{PGDATABASE} || 'evergreen';
+my $db_pass = $ENV{PGPASSWORD};
+
+# $opt_result is false when the command line could not be parsed.
+my $opt_result = GetOptions(
+    'modified-since=s'  => \$modified_since,
+    'exported-since=s'  => \$exported_since,
+    'start-id=i'        => \$start_id,
+    'end-id=i'          => \$end_id,
+    'batch-size=i'      => \$batch_size,
+    'count-only'        => \$count_only,
+    'out-dir=s'         => \$out_dir,
+    'db-host=s'         => \$db_host,
+    'db-user=s'         => \$db_user,
+    'db-pass=s'         => \$db_pass,
+    'db-port=s'         => \$db_port,
+    # The database name previously could only be set through PGDATABASE.
+    'db-name=s'         => \$db_name,
+    'help'              => \$help
+);
+
+# Print a message to STDOUT on its own line, prefixed with the current
+# local date and time.
+sub announce {
+    my ($text) = @_;
+    my $stamp = DateTime->now(time_zone => 'local')->strftime('%F %T');
+    print "$stamp $text\n";
+}
+
+# Print the usage message to STDOUT and exit.
+# Takes an optional exit status; when omitted the script exits with 0.
+sub help {
+    my $exit_status = shift;
+    $exit_status = 0 unless defined $exit_status;
+
+    # Note: the heredoc interpolates, so a literal backslash must be
+    # written as "\\" and a literal at-sign as "\@".
+    print <<HELP;
+        Find IDs for bib records based on various criteria.  Write bib
+        IDs to batch files.  Batch files are placed into --out-dir and
+        named bib-ids.000, bib-ids.001, etc.
+
+        Usage:
+
+            Find bibs modified since a given date, 100 IDs per batch file:
+
+            $0 --modified-since 2016-11-01 --batch-size 100 \\
+                --out-dir /openils/var/data/linkbibs/2016-12-01
+
+        Options:
+
+            --modified-since <YYYY-MM-DD>
+                Limit bibs to those modified since the specified date.
+
+            --exported-since <YYYY-MM-DD>
+                Limit bibs to those exported since the specified date.
+                Export date is based on data found in the
+                metabib.bib_export_data table.
+
+            --start-id <id>
+                Limit bibs to those whose ID is no less than <id>
+
+            --end-id <id>
+                Limit bibs to those whose ID is no greater than <id>
+
+            --out-dir [/tmp]
+                Output directory.
+
+            --batch-size [10000]
+                Number of bib IDs to write to each batch file.
+
+            --count-only
+                Print the total number of records that would be added
+                to batch files without adding to any batch files.
+
+            --db-host
+            --db-user
+            --db-pass
+            --db-port
+                Database connection params. PG environment variables
+                (PGHOST, PGPORT, PGUSER, PGDATABASE, PGPASSWORD) are
+                also inspected for values.  When all else fails, try to
+                connect to database evergreen\@localhost
+
+            --help
+                Print this message and exit.
+HELP
+    exit $exit_status;
+}
+
+# An unparseable command line is a usage error: show the help text but
+# exit non-zero so calling scripts can detect the failure.
+help(1) unless $opt_result;
+help(0) if $help;
+
+# Open the database connection and store the handle in $db_handle.
+# Connection settings are read from $db_name, $db_host, $db_port,
+# $db_user and $db_pass.  Dies when no handle is obtained.
+sub connect_db {
+    # NOTE(review): the options string presumably disables the server-side
+    # statement timeout so the full-table query is not cancelled -- confirm.
+    my $dsn = "dbi:Pg:db=$db_name;host=$db_host;port=$db_port;options='--statement-timeout=0'";
+
+    my %attrs = (
+        RaiseError      => 1,
+        PrintError      => 0,
+        AutoCommit      => 1,
+        pg_expand_array => 0,
+        pg_enable_utf8  => 1,
+    );
+
+    $db_handle = DBI->connect($dsn, $db_user, $db_pass, \%attrs);
+    die "Connection to database failed: $DBI::err : $DBI::errstr"
+        unless $db_handle;
+
+    return $db_handle;
+}
+
+connect_db();
+
+# ----------------------------------------------------------------------
+# Build and run the query that selects the IDs of the bibs to link.
+# Deleted bibs are always excluded.  Values supplied on the command line
+# are passed as bind parameters instead of being interpolated into the
+# SQL text, so a malformed value cannot alter the statement.
+# ----------------------------------------------------------------------
+my $from  = 'FROM biblio.record_entry bre';
+my $where = 'WHERE NOT bre.deleted';
+my @binds;
+
+# Test with defined() so that an ID limit of 0 is still honored.
+if (defined $start_id) {
+    $where .= ' AND bre.id >= ?';
+    push @binds, $start_id;
+}
+
+if (defined $end_id) {
+    $where .= ' AND bre.id <= ?';
+    push @binds, $end_id;
+}
+
+# --modified-since was accepted on the command line but never applied.
+# NOTE(review): assumes biblio.record_entry.edit_date holds the time of
+# the last modification -- confirm against the schema.
+if ($modified_since) {
+    $where .= ' AND bre.edit_date > ?';
+    push @binds, $modified_since;
+}
+
+# The export date lives in a separate table, joined only when needed.
+if ($exported_since) {
+    $from  .= ' JOIN metabib.bib_export_data bed ON (bed.bib = bre.id)';
+    $where .= ' AND bed.export_date > ?';
+    push @binds, $exported_since;
+}
+
+# All placeholders live in $where, in the same order as @binds.
+my $sql = <<SQL;
+    SELECT bre.id
+    $from
+    $where
+    ORDER BY bre.id DESC
+SQL
+
+my $sth = $db_handle->prepare($sql);
+$sth->execute(@binds);
+
+# Handle of the batch file currently being written, if any.
+my $batch_file;
+
+# Close the current batch file (when one is open) and open $path for
+# writing as the new current batch file.  Dies if either step fails.
+sub open_batch_file {
+    my $path = shift;
+    announce("Starting new batch file: $path");
+
+    # Buffered write errors (a full disk, for example) may only surface
+    # when the handle is closed, so a failed close must not be ignored.
+    if ($batch_file) {
+        close $batch_file or
+            die "Cannot close previous batch file: $!\n";
+    }
+
+    open $batch_file, '>', $path or
+        die "Cannot open batch file '$path' for writing: $!\n";
+}
+
+# ----------------------------------------------------------------------
+# Write each bib ID to a batch file, one ID per line, starting a new
+# file every $batch_size records.  Batch files are numbered from 000.
+# With --count-only the rows are only counted and no file is written.
+# ----------------------------------------------------------------------
+
+# A batch size below 1 would make the modulus below fail (or never
+# start a new file), so reject it before any file is created.
+if (!$count_only && $batch_size < 1) {
+    die "--batch-size must be a positive integer\n";
+}
+
+my $ctr = 0;
+my $batch = 0;
+while (my ($bib_id) = $sth->fetchrow_array) {
+    $ctr++;
+    next if $count_only;
+
+    # The first record of every batch triggers a new file.
+    if (($ctr - 1) % $batch_size == 0) {
+        # Pass $out_dir as an argument so that a '%' in the directory
+        # name is not interpreted as a format directive.
+        my $path = sprintf('%s/bib-ids.%03d', $out_dir, $batch);
+        open_batch_file($path);
+        $batch++;
+    }
+
+    print {$batch_file} "$bib_id\n"
+        or die "Cannot write to batch file: $!\n";
+}
+
+# Check the final close as well: it flushes the last buffered IDs.
+if ($batch_file) {
+    close $batch_file or die "Cannot close batch file: $!\n";
+}
+$sth->finish;
+
+announce("Found $ctr bib records");
+