# Export bib records for Backstage processing.
#
# The UTF-8 encoded USMARC string for each record is printed to STDOUT.
-# Each exported bib has its export_date value updated to NOW().
#
# Exported bibs meet the following criteria:
#
use Getopt::Long;
use MARC::Record;
use MARC::File::XML (BinaryEncoding => 'UTF-8');
-
-my $db_handle;
+use OpenILS::Utils::KCLSScriptUtil;
+my $KU = 'OpenILS::Utils::KCLSScriptUtil';
my $start_date;
my $end_date;
-my $export_date;
my $ids_only;
my $count_only;
my $out_file;
-my $limit;
-my $db_user = $ENV{PGUSER} || 'evergreen';
-my $db_name = $ENV{PGDATABASE} || 'evergreen';
-my $db_host = $ENV{PGHOST} || 'localhost';
-my $db_port = $ENV{PGPORT} || '5432';
-my $db_pass = $ENV{PGPASSWORD};
my $help;
GetOptions(
'start-date=s' => \$start_date,
'end-date=s' => \$end_date,
- 'export-date=s' => \$export_date,
'ids-only' => \$ids_only,
'count-only' => \$count_only,
'out-file=s' => \$out_file,
- 'limit=f' => \$limit,
- 'db-user=s' => \$db_user,
- 'db-host=s' => \$db_host,
- 'db-name=s' => \$db_name,
- 'db-port=i' => \$db_port,
- 'db-pass=s' => \$db_pass,
'help' => \$help
-);
+) || help();
sub help {
print <<HELP;
Export bib records for uploading to Backstage for processing.
MARC data is sent to STDOUT. Redirect as needed.
-$0 --start-date 2015-06-01 \
- --end-date 2016-06-01 \
- --export-date 2016-06-01
+$0 --start-date 2015-06-01 --end-date 2016-06-01 --out-file /tmp/foo.marc
Options
Export bib records whose cataloging_date (for physical records) or
create_date (for electronic records) value is between the provided
start and end dates.
-
- --export-date <YYYY-MM-DD>
- Sets the export date to the provided value. If no --export-date
- value is set, no export date value will be applied in the database.
--out-file </path/to/file>
Write MARC records (or IDs) to this file.
--count-only
Only print the number of bibs that would be exported to STDOUT.
- --limit
- Export at most this many records.
-
HELP
exit;
}
-die "--start-date and --end-date required\n"
+help() if $help;
+
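+# A true third argument to announce() makes it die() after logging.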
+$KU->announce('ERR', "--start-date and --end-date required", 1)
unless $start_date && $end_date;
-die "Invalid date format\n" unless
+$KU->announce('ERR', "Invalid date format", 1) unless
$start_date =~ /^\d{4}-\d{2}-\d{2}$/ &&
- $end_date =~ /^\d{4}-\d{2}-\d{2}$/ &&
- (!$export_date || $export_date =~ /^\d{4}-\d{2}-\d{2}$/);
+ $end_date =~ /^\d{4}-\d{2}-\d{2}$/;
-die "--out-file <filename> required\n" unless $out_file || $count_only;
+$KU->announce('ERR', "--out-file <filename> required", 1)
+ unless $out_file || $count_only;
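
# Dates are regex-validated above, so interpolating them into the
# SQL below is safe.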
sub bib_query {
my $sql = <<SQL;
WHERE cr.filter_date BETWEEN '$start_date' AND '$end_date'
SQL
- $sql .= " LIMIT $limit" if $limit;
return $sql;
}
binmode(MARCFILE, ':utf8');
}
- my $sth = $db_handle->prepare(bib_query());
- my $edate_sth = $db_handle->prepare(
- 'SELECT * FROM metabib.set_export_date(?, ?)');
-
+ my $sth = $KU->prepare_statement(bib_query());
$sth->execute;
+
my $count = 0;
my @skipped;
while (my $bib = $sth->fetchrow_hashref) {
if ($ids_only) {
print MARCFILE "$bib_id\n";
- print "$count records written...\n" if ($count % 1000) == 0;
+ $KU->announce('INFO', "$count records written...")
+ if ($count % 1000) == 0;
next;
}
- my $rec = $db_handle->selectall_arrayref(
+ my $rec = $KU->db_handle->selectall_arrayref(
"SELECT marc FROM biblio.record_entry WHERE id = $bib_id");
my $marc = $rec->[0]->[0];
my $marcdoc = MARC::Record->new_from_xml($marc, 'UTF-8', 'USMARC');
if (my @warnings = $marcdoc->warnings) {
- print "Skipping record $bib_id on warnings: @warnings\n";
+ $KU->announce('WARNING',
+ "Skipping record $bib_id on warnings: @warnings");
push(@skipped, $bib_id);
} else {
print MARCFILE $marcdoc->as_usmarc;
}
- print "$count records processed...\n" if ($count % 1000) == 0;
-
- next unless $export_date;
-
- # Update the bib record's metabib.bib_export_data entry.
- eval { $edate_sth->execute($bib_id, $export_date) };
- die "Error setting export date for bib ".
- "$bib_id to $export_date : $@\n" if $@;
+ $KU->announce('INFO', "$count records processed...")
+ if ($count % 1000) == 0;
}
close(MARCFILE) if $out_file;
my $skip_count = scalar(@skipped);
- print "total bib count = $count\n";
- print "skipped bib count = $skip_count\n";
- print "export bib count = ".($count - $skip_count)."\n";
- print "skipped bibs: @skipped\n";
- $sth->finish;
- $edate_sth->finish;
+ $KU->announce('INFO', "total bibs = $count");
+ $KU->announce('INFO', "skipped bibs = $skip_count");
+ $KU->announce('INFO', "exported bibs = ".($count - $skip_count));
+ $KU->announce('INFO', "skipped bibs: @skipped") if @skipped;
}
-sub connect_db {
- $db_handle = DBI->connect(
- "dbi:Pg:db=$db_name;host=$db_host;port=$db_port;options='--statement-timeout=0'",
- $db_user, $db_pass, {
- RaiseError => 1,
- PrintError => 0,
- AutoCommit => 1,
- pg_expand_array => 0,
- pg_enable_utf8 => 1
- }
- ) or die "Connection to database failed: $DBI::err : $DBI::errstr";
-}
-
-connect_db();
+$KU->connect_db;
export_marc();
-
-$db_handle->disconnect;
+$KU->disconnect_db;
use File::Basename;
use Sys::Syslog qw(syslog openlog);
use OpenILS::Utils::Normalize qw(clean_marc);
+use OpenILS::Utils::KCLSScriptUtil;
binmode(STDOUT, ':utf8');
-my $db_handle;
-my $db_handle_ops = 0;
+my $KU = 'OpenILS::Utils::KCLSScriptUtil';
+
# Reset the DB handle after this many operations to avoid memory leaks.
my $db_handle_reset = 500;
my $log_mod = 500; # log every 500th of each type of event (see verbose)
my $zip_file;
my $export_date;
my $working_dir = '.';
-my $bib_collision_file = 'bib-collisions.mrc'; # in --working-dir
my $verbose;
-my $db_user = $ENV{PGUSER} || 'evergreen';
-my $db_name = $ENV{PGDATABASE} || 'evergreen';
-my $db_host = $ENV{PGHOST} || 'localhost';
-my $db_port = $ENV{PGPORT} || '5432';
-my $db_pass = $ENV{PGPASSWORD};
-
-my $syslog_facility = 'LOCAL6'; # matches Evergreen gateway
-my $syslog_ops = 'pid';
-my $syslog_ident = 'BACKSTAGE';
+my $bib_collision_file = 'bib-collisions.mrc'; # in --working-dir
my $new_auth_sth;
my $mod_auth_sth;
my $del_auth_ctr = 0;
my $mod_bibs_ctr = 0;
my $col_bibs_ctr = 0;
-
my $help;
GetOptions(
- 'marc-file=s' => \$marc_file,
- 'zip-file=s' => \$zip_file,
- 'export-date=s' => \$export_date,
- 'working-dir=s' => \$working_dir,
- 'bib-collision-file=s' => \$bib_collision_file,
- 'verbose' => \$verbose,
- 'db-user=s' => \$db_user,
- 'db-host=s' => \$db_host,
- 'db-name=s' => \$db_name,
- 'db-port=i' => \$db_port,
- 'db-pass=s' => \$db_pass,
+ 'marc-file=s' => \$marc_file,
+ 'zip-file=s' => \$zip_file,
+ 'export-date=s' => \$export_date,
+ 'working-dir=s' => \$working_dir,
+ 'verbose' => \$verbose,
'help' => \$help
);
sub help {
print <<HELP;
- TODO
-
-$0
-
+ export WORKING_DIR=/openils/var/data/authority-control/backstage/quarterly/2016-Q4
+ export PGHOST=testing-db01
+ $0 \
+ --export-date 2016-12-09 \
+ --working-dir \$WORKING_DIR \
+ --zip-file \$WORKING_DIR/BACKSTAGE-ZIP-FILE.zip
Options
    --export-date <YYYY-MM-DD>
        Date the records were exported to Backstage (required). Used
        to detect bibs modified by staff since the export.
--working-dir
Directory where constituent files are extracted.
Defaults to the CWD of this script.
-
- --bib-collision-file
- File created in the working directory containing MARC data of
- bib records that were modified by staff after export and
- modified by Backstage as part of the export. These are
- re-imported via external Vandelay process.
-
- --db-host
- --db-name
- --db-port
- --db-pass
- Database connections parameters.
-
HELP
exit;
}
-die "required: --export-date YYYY-MM-DD\n" unless
- $export_date && $export_date =~ /^\d{4}-\d{2}-\d{2}$/;
+$KU->verbose($verbose);
+$KU->syslog_ident('BACKSTAGE');
-die "--marc-file or --zip-file required\n" unless ($marc_file || $zip_file);
+$KU->announce('ERR', "required: --export-date YYYY-MM-DD", 1)
+ unless $export_date && $export_date =~ /^\d{4}-\d{2}-\d{2}$/;
+
+$KU->announce('ERR', "--marc-file or --zip-file required", 1)
+ unless ($marc_file || $zip_file);
# Log every occurrence of each event type.
$log_mod = 1 if $verbose;
-sub announce {
- my ($level, $msg, $die) = @_;
- syslog("LOG_$level", $msg);
-
- my $date_str = DateTime->now(time_zone => 'local')->strftime('%F %T');
- my $msg_str = "$date_str [$$] $level $msg\n";
-
- if ($die) {
- die $msg_str;
-
- } else {
- if ($level eq 'ERR' or $level eq 'WARNING') {
- # always copy problem messages to stdout
- warn $msg_str; # avoid dupe messages
- } else {
- print $msg_str;
- }
- }
-}
-
sub check_db_handle {
- return if $db_handle_ops < $db_handle_reset;
- reset_db_handle();
-}
-
-sub reset_db_handle {
- announce('INFO', 'Refreshing DB connection on max ops.')
- if $verbose && $db_handle;
- $db_handle_ops = 0;
- disconnect_db();
- connect_db();
+ return if $KU->db_handle_ops < $db_handle_reset;
+ $KU->reset_db_handle;
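+ # Statement handles are tied to the old connection; re-prepare them.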
prepare_statements();
}
-sub disconnect_db {
- return unless $db_handle;
-
- $new_auth_sth->finish;
- $mod_auth_sth->finish;
- $del_auth_sth->finish;
- $delmod_auth_sth->finish;
- $match_auth_sth->finish;
- $match_auth_001_sth->finish;
- $mod_bibs_sth->finish;
- $db_handle->disconnect;
-}
-
-
-sub connect_db {
- $db_handle = DBI->connect(
- "dbi:Pg:db=$db_name;host=$db_host;port=$db_port;options='--statement-timeout=0'",
- $db_user, $db_pass, {
- RaiseError => 1,
- PrintError => 0,
- AutoCommit => 1,
- pg_expand_array => 0,
- pg_enable_utf8 => 1,
- Callbacks => {
- # Track the number of execute() operations
- ChildCallbacks => {
- execute => sub { $db_handle_ops++; return; }
- }
- }
- }
- ) or die "Connection to database failed: $DBI::err : $DBI::errstr";
-}
-
sub process_zip_file {
my $zip = Archive::Zip->new();
- announce('ERR', "Failed to read $zip_file", 1)
+ $KU->announce('ERR', "Failed to read $zip_file", 1)
unless $zip->read($zip_file) == AZ_OK;
my %marc_files = (bib => [], auth => []);
my $basename = basename($member->fileName());
- announce('INFO', "Extracting file $basename");
+ $KU->announce('INFO', "Extracting file $basename");
my $local_file = "$working_dir/$basename";
- announce('ERR', "Unable to extract to file: $local_file", 1)
+ $KU->announce('ERR', "Unable to extract to file: $local_file", 1)
unless $member->extractToFileNamed($local_file) == AZ_OK;
my $key = ($basename =~ /BIB/) ? 'bib' : 'auth';
sub process_marc_file {
my $local_file = shift;
my $basename = basename($local_file);
- announce('INFO', "Processing file $basename");
+ $KU->announce('INFO', "Processing file $basename");
my $marc_batch = MARC::File::USMARC->in($local_file, 'UTF8')
- or announce('ERR', "Unable to read $local_file as MARC", 1);
+ or $KU->announce('ERR', "Unable to read $local_file as MARC", 1);
if ($basename =~ /BIB/) {
} else {
- announce('WARNING', "Unknown file type: $basename");
+ $KU->announce('WARNING', "Unknown file type: $basename");
}
}
return if $mod_searched;
$mod_searched = 1;
- my $id_arrays = $db_handle->selectall_arrayref(<<" SQL");
+ my $id_arrays = $KU->db_handle->selectall_arrayref(<<" SQL");
SELECT id
FROM biblio.record_entry
WHERE NOT deleted AND edit_date >= '$export_date'
@modified_bibs = map {$_->[0]} @$id_arrays;
- announce('INFO', scalar(@modified_bibs)." bibs modified since export");
+ $KU->announce('INFO', scalar(@modified_bibs)." bibs modified since export");
}
check_db_handle();
if (!$bib_id) {
- announce('ERR', "Bib record has no 901c (ID) value. Skipping");
+ $KU->announce('ERR', "Bib record has no 901c (ID) value. Skipping");
next;
}
eval { $mod_bibs_sth->execute($marcxml, $bib_id) };
if ($@) {
- announce('ERR', "Error updating biblio record: $@ : $marcxml");
+ $KU->announce('ERR', "Error updating biblio record: $@ : $marcxml");
return;
}
$mod_bibs_ctr++;
- announce('INFO', "Updated $mod_bibs_ctr bib records")
+ $KU->announce('INFO', "Updated $mod_bibs_ctr bib records")
if $mod_bibs_ctr % $log_mod == 0;
}
my $filename = "$working_dir/$bib_collision_file";
open(BIBS_FILE, ">>$filename") or
- announce('ERR', "Cannot open bib collision file: $filename : $!", 1);
+ $KU->announce('ERR', "Cannot open bib collision file: $filename : $!", 1);
binmode(BIBS_FILE, ":utf8");
print BIBS_FILE $record->as_usmarc();
close BIBS_FILE or
- announce('WARNING', "Error closing bib collision file: $filename : $!");
+ $KU->announce('WARNING', "Error closing bib collision file: $filename : $!");
$col_bibs_ctr++;
- announce('INFO', "Dumped $col_bibs_ctr bib collisions to file")
+ $KU->announce('INFO', "Dumped $col_bibs_ctr bib collisions to file")
if $col_bibs_ctr % $log_mod == 0;
}
};
if ($@) {
- announce(
+ $KU->announce(
'ERR', "Error deleting authority record: $@ : $auth_id");
next;
}
$del_auth_ctr++;
- announce('INFO', "Deleted $del_auth_ctr authority records")
+ $KU->announce('INFO', "Deleted $del_auth_ctr authority records")
if $del_auth_ctr % $log_mod == 0;
}
}
eval { $mod_auth_sth->execute($marcxml, $auth_id) };
if ($@) {
- announce('ERR', "Error updating authority record: $@ : $marcxml");
+ $KU->announce('ERR', "Error updating authority record: $@ : $marcxml");
return;
}
$mod_auth_ctr++;
- announce('INFO', "Updated $mod_auth_ctr authority records")
+ $KU->announce('INFO', "Updated $mod_auth_ctr authority records")
if $mod_auth_ctr % $log_mod == 0;
}
eval { $new_auth_sth->execute($marcxml, "IMPORT-" . time) };
if ($@) {
- announce('ERR',
+ $KU->announce('ERR',
"Error creating new authority record: $@ : $marcxml");
return;
}
$new_auth_ctr++;
- announce('INFO', "Created $new_auth_ctr authority records")
+ $KU->announce('INFO', "Created $new_auth_ctr authority records")
if $new_auth_ctr % $log_mod == 0;
}
my $matches = $match_auth_sth->fetchall_arrayref;
my @ids = map {$_->[0]} @$matches;
- announce('INFO', "Auth 010z=$subfield matched records: @ids")
+ $KU->announce('INFO', "Auth 010z=$subfield matched records: @ids")
if $verbose && @ids;
return @ids;
$match_auth_001_sth->execute($subfield);
my $matches = $match_auth_001_sth->fetchall_arrayref;
my @ids = map {$_->[0]} @$matches;
- announce('INFO', "Auth 001=$subfield matched records: @ids")
+ $KU->announce('INFO', "Auth 001=$subfield matched records: @ids")
if $verbose && @ids;
return @ids;
}
my $matches = $match_auth_sth->fetchall_arrayref;
my @ids = map {$_->[0]} @$matches;
- announce('INFO', "Auth ${tag}a=$subfield matched records: @ids")
+ $KU->announce('INFO', "Auth ${tag}a=$subfield matched records: @ids")
if $verbose && @ids;
return @ids;
sub prepare_statements {
- $del_auth_sth = $db_handle->prepare(<<" SQL");
+ $del_auth_sth = $KU->prepare_statement(<<" SQL");
DELETE FROM authority.record_entry WHERE id = ?
SQL
- $delmod_auth_sth = $db_handle->prepare(<<" SQL");
+ $delmod_auth_sth = $KU->prepare_statement(<<" SQL");
UPDATE authority.record_entry
SET edit_date = NOW() WHERE id = ?
SQL
- $mod_bibs_sth = $db_handle->prepare(<<" SQL");
+ $mod_bibs_sth = $KU->prepare_statement(<<" SQL");
UPDATE biblio.record_entry
SET marc = ?, edit_date = NOW()
WHERE id = ?
SQL
- $mod_auth_sth = $db_handle->prepare(<<" SQL");
+ $mod_auth_sth = $KU->prepare_statement(<<" SQL");
UPDATE authority.record_entry
SET marc = ?, edit_date = NOW()
WHERE id = ?
SQL
- $new_auth_sth = $db_handle->prepare(<<" SQL");
+ $new_auth_sth = $KU->prepare_statement(<<" SQL");
INSERT INTO authority.record_entry (marc, last_xact_id)
VALUES (?, ?)
SQL
- $match_auth_sth = $db_handle->prepare(<<" SQL");
+ $match_auth_sth = $KU->prepare_statement(<<" SQL");
SELECT DISTINCT(rec.id)
FROM authority.record_entry rec
JOIN authority.full_rec frec ON (frec.record = rec.id)
AND frec.value = NACO_NORMALIZE(?, 'a')
SQL
- $match_auth_001_sth = $db_handle->prepare(<<" SQL");
+ $match_auth_001_sth = $KU->prepare_statement(<<" SQL");
SELECT DISTINCT(rec.id)
FROM authority.record_entry rec
JOIN authority.full_rec frec ON (frec.record = rec.id)
SQL
}
-openlog($syslog_ident, $syslog_ops, $syslog_facility);
-reset_db_handle();
+$KU->connect_db;
+prepare_statements();
process_zip_file() if $zip_file;
process_marc_file($marc_file) if $marc_file;
-disconnect_db();
+$KU->disconnect_db;
== Re-Link All Bibs Over 1 Week ==

The daily linker run divides all non-deleted bib records into slots by
record ID and processes one day's share of slots per run, so every bib
is re-linked over the course of a week.
-
[source,sh]
--------------------------------------------------------------------------
sudo -u opensrf -s
-LINK_DIR=/openils/var/data/authority-control/link-bibs
-
-mkdir -p $LINK_DIR
-PGPASSWORD=<PASSWORD> ./find-bibs-to-link.pl --db-host <DB-HOST> \
- --batch-size 5000 --out-dir $LINK_DIR
+export PGPASSWORD=<PASSWORD>
+export PGHOST=<PGHOST>
-./link-bib-batches.sh # e.g. via CRON
+./link-all-bibs-daily.sh # CRON
--------------------------------------------------------------------------
use Encode;
use Unicode::Normalize;
use OpenILS::Utils::Normalize;
-use Data::Dumper;
-use Pod::Usage qw/ pod2usage /;
+use OpenILS::Utils::KCLSScriptUtil;
$ENV{OSRF_LOG_CLIENT} = 1;
+my $KU = 'OpenILS::Utils::KCLSScriptUtil';
+$KU->syslog_ident('LINKING');
-$Data::Dumper::Indent = 0;
MARC::Charset->assume_unicode(1);
my $acsaf_cache = {};
my $bootstrap = '/openils/conf/opensrf_core.xml';
my @records;
my $verbose;
+my $auth_mod_since;
my $input_file = '';
-my $db_host = $ENV{PGHOST} || 'localhost';
-my $db_port = $ENV{PGPORT} || '5432';
-my $db_user = $ENV{PGDATABASE} || 'evergreen';
-my $db_pass = $ENV{PGPASSWORD};
+my $count_only;
+my $all;
+my $help;
+
my $links_removed = 0;
my $links_added = 0;
my $CNI = 'KCLS';
-my %options;
my $result = GetOptions(
- \%options,
- 'configuration=s' => \$bootstrap,
- 'record=i' => \@records,
- 'all', 'help',
+ 'record=i' => \@records,
+ 'all' => \$all,
+ 'help' => \$help,
+ 'auth-mod-since=s' => \$auth_mod_since,
'start_id=i' => \$start_id,
- 'end_id=i' => \$end_id,
- 'file=s' => \$input_file,
- 'verbose' => \$verbose,
- "db-host=s" => \$db_host,
- "db-user=s" => \$db_user,
- "db-pass=s" => \$db_pass,
- "db-port=s" => \$db_port
+ 'end_id=i' => \$end_id,
+ 'file=s' => \$input_file,
+ 'count-only' => \$count_only,
+ 'verbose' => \$verbose,
);
-sub announce {
- my $msg = shift;
- return unless $verbose;
- print DateTime->now(time_zone => 'local')->strftime('%F %T') . " $msg\n";
-}
-
-pod2usage(0) if not $result or $options{help};
+$KU->verbose($verbose);
OpenSRF::System->bootstrap_client(config_file => $bootstrap);
-Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL"));
+Fieldmapper->import(
+ IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL"));
# must be loaded and initialized after the IDL is parsed
my $e = OpenILS::Utils::CStoreEditor->new;
+# returns list of authority.record_entry IDs
+sub find_modified_auths {
+
+ $KU->announce('INFO',
+ "Searching for authority records modified since '$auth_mod_since'");
+
+ my $sth = $KU->prepare_statement(<<" SQL");
+ SELECT id FROM authority.record_entry
+ WHERE DATE(edit_date) >= '$auth_mod_since'::TIMESTAMPTZ AND NOT deleted
+ SQL
+ $sth->execute;
+
+ my @ids;
+ while (my $ref = $sth->fetchrow_hashref()) {
+ push(@ids, $ref->{id});
+ }
+
+ $KU->announce('INFO', "Found ".scalar(@ids)." authority records");
+ return @ids;
+}
+
my $query = q{
SELECT
source,
) x GROUP BY 1 ORDER BY source
};
+$KU->connect_db;
+
my @bind_params;
if (@records) {
$query = sprintf($query, "WHERE sh2.record = ?");
@bind_params = @records; # should be just one scalar in this array.
-} elsif ($options{all}) {
+
+} elsif ($auth_mod_since) {
+
+ my @are_ids = find_modified_auths();
+ my $id_str = join(',', @are_ids);
+ $query = sprintf($query, "WHERE sh2.record IN ($id_str)");
+
+} elsif ($all) {
$query = sprintf($query, ""); # no where clause
+
} elsif ($start_id and $end_id) {
$query = sprintf($query, "WHERE sh2.record BETWEEN ? AND ?");
@bind_params = ($start_id, $end_id);
} elsif ($input_file) {
# Load authority record IDs from a file.
- announce("Reading authority record IDs from $input_file");
+ $KU->announce('INFO', "Reading authority record IDs from $input_file");
open FILE, "<", $input_file or die "Can't open file $input_file\n";
while(<FILE>) {
}
close FILE;
- announce("Read ".scalar(@records)." from $input_file");
+ $KU->announce('INFO', "Read ".scalar(@records)." from $input_file");
# No bind_params needed.
my $recstr = join(',', @records);
$query = sprintf($query, "WHERE sh2.record IN ($recstr)");
-
-} else {
- pod2usage(0);
}
-announce("SQL, params: ", Dumper($query, \@bind_params));
+my $sth = $KU->prepare_statement($query);
+$sth->execute(@bind_params);
-my $dsn = "dbi:Pg:database=evergreen;host=$db_host;port=$db_port";
-my $dbh = DBI->connect($dsn, $db_user, $db_pass);
-$dbh->do('SET statement_timeout = 0');
+my @src_links;
+while (my ($src, $links) = $sth->fetchrow_array) {
+ push(@src_links, [$src, $links]);
+}
+$KU->disconnect_db;
-my $sth = $dbh->prepare($query);
+my $total_records = scalar(@src_links);
+$KU->announce('INFO', "Processing $total_records authority records");
-announce("Executing query ...");
-$sth->execute(@bind_params);
+exit if $count_only;
my $problems = 0;
+for my $src_link (@src_links) {
+ my $src = $src_link->[0];
+ my $links = $src_link->[1];
-if (@records) {
- announce("Processing ".scalar(@records)." records");
-} elsif ($start_id) {
- announce("Processing records $start_id => $end_id");
-} else {
- announce("Processing all records");
-}
-
-my $total_records = 0;
-while (my ($src, $links) = $sth->fetchrow_array) {
- announce("--------------------");
- announce("Processing authority source record $src");
- $total_records++;
+ $KU->announce('INFO', "--------------------");
+ $KU->announce('INFO', "Processing authority source record $src");
- try {
+ eval {
my $src_rec = $e->retrieve_authority_record_entry($src);
if (!$src_rec) {
my $src_marc = MARC::Record->new_from_xml($src_rec->marc);
my $auth_src_thesaurus = substr($src_marc->field('008')->data(), 11, 1);
- announce("Source record thesaurus value=$auth_src_thesaurus");
+ $KU->announce('INFO', "Source record thesaurus value=$auth_src_thesaurus");
my $changed = 0;
my %tags_seen;
next if $target eq $src_rec->id;
- announce("Target: $target, field_id: $field_id");
+ $KU->announce('INFO', "Target: $target, field_id: $field_id");
my $target_rec = $e->retrieve_authority_record_entry($target);
if (!$target_rec) {
my $auth_target_thesaurus =
substr($target_marc->field('008')->data(), 11, 1);
- announce("Target record thesaurus value=$auth_target_thesaurus");
+ $KU->announce('INFO', "Target record thesaurus value=$auth_target_thesaurus");
if ($auth_src_thesaurus ne $auth_target_thesaurus) {
- announce("Thesauri for source/target records do not match. Skipping..");
+ $KU->announce('INFO', "Thesauri for source/target records do not match. Skipping...");
next;
}
# so they can be completely rebuilt.
for my $field ($src_marc->field($acsaf->tag)) {
if (my $val = $field->subfield('0')) {
- announce("Removing existing subfield 0 : $val");
+ $KU->announce('INFO', "Removing existing subfield 0 : $val");
$field->delete_subfield(code => '0');
$changed = 1;
$links_removed++;
$acsaf->main_entry->joiner
);
- announce(sprintf(
+ $KU->announce('INFO', sprintf(
"At field id=%s (tag=%s) / trying to match '%s'",
$acsaf->id, $acsaf->tag, $src_string));
);
if ($target_string eq $src_string) {
- announce("Got a match");
+ $KU->announce('INFO', "Got a match");
$field->update('0' => "($CNI)$target");
$changed = 1;
$links_added++;
}
if ($changed) {
- announce("Updating authority record ".$src_rec->id);
+ $KU->announce('INFO', "Updating authority record ".$src_rec->id);
$src_rec->marc(OpenILS::Utils::Normalize::clean_marc($src_marc));
$e->xact_begin;
$e->update_authority_record_entry($src_rec) or die $e->die_event;
$e->xact_commit;
}
-
- } otherwise {
- my $err = shift;
- print STDERR "\nRecord # $src : ",
- (ref $err eq "HASH" ? Dumper($err) : $err), "\n";
+ };
+
+ if ($@) {
+ $KU->announce('WARNING', "Error processing record $src : $@");
# Reset SAX parser so that one bad record doesn't
# kill the entire process.
-
import MARC::File::XML;
$problems++;
}
}
+
if (@records) {
- announce("Processed records ". scalar(@records).
+ $KU->announce('INFO', "Processed ".scalar(@records).
" records; processed=$total_records problems=$problems");
} elsif ($start_id) {
- announce("Processed records $start_id => $end_id; ".
+ $KU->announce('INFO', "Processed records $start_id => $end_id; ".
"processed=$total_records; problems=$problems");
} else {
- announce("Processed all records; processed=$total_records; problems=$problems");
+ $KU->announce('INFO',
+ "Processed all records; processed=$total_records; problems=$problems");
}
-announce("links removed: $links_removed");
-announce("links added: $links_added");
-announce("delta added: ".($links_added - $links_removed));
-
-exit ($problems > 0);
-
-__END__
-
-=head1 NAME
-
-authority_authority_linker.pl - Link reference headings in authority records to main entry headings in other authority records
-
-=head1 SYNOPSIS
-
-authority_authority_linker.pl [B<--configuration>=I<opensrf_core.conf>]
-[[B<--record>=I<record>[ B<--record>=I<record>]]] | [B<--all>] | [B<--start_id>=I<start-ID> B<--end_id>=I<end-ID>]
-
-=head1 DESCRIPTION
-
-For a given set of records, find authority reference headings that also
-appear as main entry headings in any other authority record. In the
-specific MARC field of the authority record (source) containing the reference
-heading with such a match in another authority record (target), add a subfield
-0 (zero) referring to the target record by ID.
-
-=head1 OPTIONS
-
-=over
-
-=item * B<-c> I<config-file>, B<--configuration>=I<config-file>
-
-Specifies the OpenSRF configuration file used to connect to the OpenSRF router.
-Defaults to F</openils/conf/opensrf_core.xml>
-
-=item * B<-r> I<record-ID>, B<--record>=I<record-ID>
-
-Specifies the authority record ID (found in the C<authority.record_entry.id>
-column) of the B<source> record to process. This option may be specified more
-than once to process multiple records in a single run.
-
-=item * B<-a>, B<--all>
-
-Specifies that all authority records should be processed. For large
-databases, this may take an extraordinarily long amount of time.
-
-=item * B<-s> I<start-ID>, B<--start_id>=I<start-ID>
-
-Specifies the starting ID of the range of authority records to process.
-This option is ignored unless it is accompanied by the B<-e> or B<--end_id>
-option.
-
-=item * B<-e> I<end-ID>, B<--end_id>=I<end-ID>
-
-Specifies the ending ID of the range of authority records to process.
-This option is ignored unless it is accompanied by the B<-s> or B<--start>
-option.
-
-=back
-
-=head1 EXAMPLES
-
- authority_authority_linker.pl --start_id 1 --end_id 50000
-
-Processes the authority records with IDs between 1 and 50,000 using the
-default OpenSRF configuration file for connection information.
-
-=head1 AUTHOR
-
-Lebbeous Fogle-Weekley <lebbeous@esilibrary.com>
-
-=head1 COPYRIGHT AND LICENSE
-
-Copyright (C) 2013 Equinox Software, Inc.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-02110-1301, USA.
+$KU->announce('INFO', "links removed: $links_removed");
+$KU->announce('INFO', "links added: $links_added");
+$KU->announce('INFO', "delta added: ".($links_added - $links_removed));
-=cut
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# ---------------------------------------------------------------
-
use strict;
use warnings;
-use DBI;
use Getopt::Long;
+use DateTime;
use MARC::Record;
use MARC::File::XML (BinaryEncoding => 'UTF-8');
use MARC::Charset;
+use Encode;
+use Unicode::Normalize;
use OpenSRF::System;
use OpenILS::Utils::Fieldmapper;
use OpenSRF::Utils::SettingsClient;
-use Encode;
-use Unicode::Normalize;
-use OpenILS::Application::AppUtils;
use Data::Dumper;
-use Pod::Usage qw/ pod2usage /;
-use DateTime;
+use OpenILS::Application::AppUtils;
+use OpenILS::Utils::KCLSScriptUtil;
+my $KU = 'OpenILS::Utils::KCLSScriptUtil';
$ENV{OSRF_LOG_CLIENT} = 1;
+$KU->syslog_ident('LINKING');
$Data::Dumper::Indent = 0;
MARC::Charset->assume_unicode(1);
-my $start_id;
-my $end_id;
-my $refresh;
-my $days_back; # days; 0 means today only
-my $modified_since; # days; 0 means today only
-my $input_file ='';
my $bootstrap = '/openils/conf/opensrf_core.xml';
my @records;
+my $input_file = '';
+my $auth_mod_since;
+my $slot_count; # starts at 1
+my $slot; # starts at 0
+my $count_only;
+my $print_ids;
my $verbose = 0;
-my $sort_desc = 0;
-my $db_host = $ENV{PGHOST} || 'localhost';
-my $db_port = $ENV{PGPORT} || '5432';
-my $db_user = $ENV{PGDATABASE} || 'evergreen';
-my $db_pass = $ENV{PGPASSWORD};
-my $CNI = 'KCLS';
-
-my %options;
-my $result = GetOptions(
- \%options,
- 'configuration=s' => \$bootstrap,
- 'record=i' => \@records,
- 'refresh' => \$refresh,
- 'start-id=i' => \$start_id,
- 'end-id=i' => \$end_id,
- 'days-back=i' => \$days_back,
- 'modified-since=i' => \$modified_since,
- 'sort-desc' => \$sort_desc,
- 'file=s' => \$input_file,
- 'verbose' => \$verbose,
- "db-host=s" => \$db_host,
- "db-user=s" => \$db_user,
- "db-pass=s" => \$db_pass,
- "db-port=s" => \$db_port,
- 'all', # now assumed to be true when --file is unset
- 'help'
-);
+my $CNI = 'KCLS';
+my $help;
-sub announce {
- my $msg = shift;
- my $force = shift;
- return unless $force || $verbose;
- print DateTime->now->strftime('%F %T') . " [$$] $msg\n";
-}
+GetOptions(
+ 'record=i' => \@records,
+ 'slot-count=i' => \$slot_count,
+ 'slot=i' => \$slot,
+ 'auth-mod-since=s' => \$auth_mod_since,
+ 'file=s' => \$input_file,
+ 'count-only' => \$count_only,
+ 'print-ids' => \$print_ids,
+ 'verbose' => \$verbose,
+ 'help' => \$help
+) || help();
+
+sub help {
+ print <<HELP;
-if (!$result or $options{help}) {
- pod2usage(0);
+ Update bib-to-auth links on all non-deleted bib records:
+
+ $0
+
+ Options:
+
+ --record <id>
+ Process a single bib record by ID.
+
+ --slot-count <number>
+ Specifies the number of buckets the bib records will be put into
+ for batch processing. The higher the slot-count number, the
+ smaller each bucket.
+
+ --slot <0 .. (--slot-count - 1)>
+ Specifies which bucket of records to process in the current
+ iteration of the script. Valid values are 0 up to the
+ --slot-count value minus 1.
+
+ E.g. Process the first of 10 batches:
+ --slot-count 10 --slot 0
+
+ Under the covers: WHERE MOD(id, 10) = 0
+
+ E.g. Process the 10th of 25 batches (slot values are zero-based):
+ --slot-count 25 --slot 9
+
+ --auth-mod-since <YYYY-MM-DD>
+ Only process bibs that may need to link to authority
+ records added or edited on or after the specified date.
+
+ --count-only
+ Log the number of records that would be processed and exit.
+
+ --print-ids
+ Print the IDs of all bib records that would be processed.
+HELP
+ exit;
}
+help() if $help;
+$KU->verbose($verbose);
+
OpenSRF::System->bootstrap_client(config_file => $bootstrap);
-Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL"));
+Fieldmapper->import(
+ IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL"));
# must be loaded and initialized after the IDL is parsed
use OpenILS::Utils::CStoreEditor;
my $e = OpenILS::Utils::CStoreEditor->new;
-if ($input_file) {
- open FILE, "<", $input_file or die "Can't open file " . $input_file;
- while(<FILE>) {
- chomp;
- if($_) {
- push(@records, $_);
- }
- }
- close FILE;
+sub load_ids_from_file {
+ open FILE, "<", $input_file
+ or die "Can't open file " . $input_file . "\n";
-} else {
+ while(<FILE>) {
+ chomp;
+ push(@records, $_) if $_;
+ }
+ close FILE;
+}
+# returns list of authority.record_entry IDs
+sub find_modified_auths {
- my $where = "WHERE not deleted";
- $where .= " AND id >= $start_id" if $start_id;
- $where .= " AND id <= $end_id" if $end_id;
+ $KU->announce('INFO',
+ "Searching for authority records modified since '$auth_mod_since'");
+
+ my $sth = $KU->prepare_statement(<<" SQL");
+ SELECT id FROM authority.record_entry
+ WHERE DATE(edit_date) >= '$auth_mod_since'::TIMESTAMPTZ AND NOT deleted
+ SQL
+ $sth->execute;
+
+ my @ids;
+ while (my $ref = $sth->fetchrow_hashref()) {
+ push(@ids, $ref->{id});
+ }
- my $order = "ORDER BY id";
- $order .= " DESC" if $sort_desc;
+ $KU->announce('INFO', "Found ".scalar(@ids)." authority records");
+ return @ids;
+}
- if (defined $days_back or defined $modified_since) {
- my $date_field = defined $days_back ? 'create_date' : 'edit_date';
- my $duration = defined $days_back ? $days_back : $modified_since;
- $where .= " AND DATE($date_field) >= DATE(NOW() - '$duration day'::INTERVAL)";
+# ----------------------------------------------------------------------
+# Returns IDs for bib records that we might want to link to the new or
+# modified authority records.
+#
+# Query: give me bib records that link to browse entries that also
+# link to exactly one authority record, specifically the new authority
+# records we are processing via this script. Only include bib records
+# that are not already linked via bib_linking to said authority record.
+# This represents the set of bib records that might need to be linked
+# to our new authority records.
+# ----------------------------------------------------------------------
+sub find_bibs_from_modified_auths {
+ my %bib_ids; # de-dupe by record ID.
+
+ my @auth_ids = find_modified_auths();
+ return () unless @auth_ids;
+
+ my $auth_ids_param = join(',', @auth_ids);
+
+ for my $axis (qw/author subject series title/) {
+ $KU->announce('INFO',
+ "Scanning browse axis '$axis' for linkable bibs...");
+
+ my $query = <<SQL;
+SELECT
+ entry.id,
+ are.id AS auth_record,
+ def.source AS bib_record
+FROM metabib.browse_${axis}_entry entry
+ JOIN metabib.browse_${axis}_entry_simple_heading_map map
+ ON (map.entry = entry.id)
+ JOIN authority.simple_heading ash ON (ash.id = map.simple_heading)
+ JOIN authority.record_entry are ON (are.id = ash.record)
+ JOIN metabib.browse_${axis}_entry_def_map def ON (def.entry = entry.id)
+ JOIN biblio.record_entry bre ON (bre.id = def.source)
+ JOIN (
+ -- we only care about browse entries that link to
+ -- exactly one auth record, the auth record in question.
+ SELECT entry.id, COUNT(are.id)
+ FROM metabib.browse_${axis}_entry entry
+ JOIN metabib.browse_${axis}_entry_simple_heading_map map
+ ON (map.entry = entry.id)
+ JOIN authority.simple_heading ash
+ ON (ash.id = map.simple_heading)
+ JOIN authority.record_entry are
+ ON (are.id = ash.record)
+ WHERE NOT are.deleted
+ GROUP BY 1
+ HAVING COUNT(are.id) = 1
+ ) singles ON (singles.id = entry.id)
+ LEFT JOIN authority.bib_linking link
+ ON (link.bib = def.source AND link.authority = are.id)
+WHERE
+ NOT bre.deleted
+ AND link.authority IS NULL -- unlinked records
+ AND are.id IN ($auth_ids_param)
+SQL
+ my $sth = $KU->prepare_statement($query);
+ $sth->execute;
+
+ while (my $ref = $sth->fetchrow_hashref()) {
+ $bib_ids{$ref->{bib_record}} = 1; # de-dupe
+ }
}
- my $sql =
- "SELECT DISTINCT(id) AS id FROM biblio.record_entry $where $order";
+ return keys %bib_ids;
+}
+
+sub load_ids_with_sql_filters {
+ my $where = "WHERE not deleted";
+ $where .= " AND MOD(id, $slot_count) = $slot"
+ if $slot_count && defined $slot;
- announce("Loading record ID's with query:\n$sql");
+ my $sql = <<" SQL";
+ SELECT DISTINCT(id) AS id FROM biblio.record_entry
+ $where
+ ORDER BY id
+ SQL
- my $dsn = "dbi:Pg:database=evergreen;host=$db_host;port=$db_port";
- my $dbh = DBI->connect($dsn, $db_user, $db_pass);
+ (my $sql_str = $sql) =~ s/\n//g;
+ $KU->announce('INFO', "Loading record IDs with query [$sql_str]");
- my $sth = $dbh->prepare($sql);
+ my $sth = $KU->prepare_statement($sql);
$sth->execute;
while (my $ref = $sth->fetchrow_hashref()) {
push(@records, $ref->{id});
}
+}
+
+if (@records) {
+ $KU->announce('INFO', "Loading records from command line");
+
+} elsif ($input_file) {
+ $KU->announce('INFO', "Loading records from file: $input_file");
+ load_ids_from_file();
+
+} else {
+ $KU->announce('INFO', "Loading records via SQL...");
+ $KU->connect_db;
+
+ if ($auth_mod_since) {
+ @records = find_bibs_from_modified_auths();
+
+ # Chop the link-able records into batches if requested.
+ @records = grep { ($_ % $slot_count) == $slot } @records
+ if $slot_count && defined $slot;
- $sth->finish();
- $dbh->disconnect();
+ } else {
+ load_ids_with_sql_filters();
+ }
+
+ $KU->disconnect_db;
}
-announce("Processing ".scalar(@records)." records");
+@records = sort { $a <=> $b } @records;
+$KU->announce('INFO', "Processing ".scalar(@records)." records");
+$KU->announce('INFO', "@records") if $print_ids;
+
+exit if $count_only || $print_ids;
# Hash of controlled fields & subfields in bibliographic records, and their
# corresponding controlling fields & subfields in the authority record
rvm => '6'
);
-my $start_time = localtime();
-
-if($input_file) {
- announce("Start $start_time for ".scalar(@records)." records");
-} elsif($start_id) {
- announce("Start $start_time for record range: $start_id => $end_id");
-} else {
- announce("Start $start_time for all records");
-}
# Fetch leader/008 values for authority records. Filter out any whose
# 008/14 or 008/15 values are not appropriate for the requested bib tag.
if (substr($value, $index, 1) eq 'a') {
push(@keepers, $leader);
} else {
- announce("Skipping authority record ".$leader->{record}.
+ $KU->announce('INFO',
+ "Skipping authority record ".$leader->{record}.
" on bib $bib_tag match; 008/#14|#15 not appropriate");
}
}
# bib field thesaurus spec
my $cfield_ind2 = $bib_field->indicator(2);
- announce("6XX indicator 2 value = $cfield_ind2");
+ $KU->announce('DEBUG', "6XX indicator 2 value = $cfield_ind2");
my $is_local = 0;
if ($cfield_ind2 eq '7') {
$is_local = 1;
my $thesaurus = $bib_field->subfield('2') || '';
- announce("Found local thesaurus value $thesaurus");
+ $KU->announce('DEBUG', "Found local thesaurus value $thesaurus");
# if we have no special remapping value for the found thesaurus,
# fall back to ind2 => 7=Other.
$cfield_ind2 = $REMAP_BIB_SF2_TO_IND2{$thesaurus} || '7';
- announce("Local thesaurus '$thesaurus' ".
+ $KU->announce('DEBUG', "Local thesaurus '$thesaurus' ".
"remapped to ind2 value '$cfield_ind2'");
}
$authz_found = $leader->{record} if $thesaurus eq 'z';
if ($AUTH_TO_BIB_IND2{$thesaurus} eq $cfield_ind2) {
- announce("Found a match on thesaurus ".
+ $KU->announce('DEBUG', "Found a match on thesaurus ".
"'$thesaurus' for " . $leader->{record});
return $leader->{record};
}
my $count = 0;
my $total = scalar(@records);
-announce("processing $total bib records", 1);
+$KU->announce('INFO', "processing $total bib records");
+$slot = 'N/A' unless defined $slot; # slot 0 is valid
+my $start_time = time();
foreach my $rec_id (@records) {
$count++;
- announce("processing $count of $total", 1) if ($count % 1000) == 0;
+ $KU->announce('INFO', "processed $count of $total [slot=$slot]")
+ if ($count % 1000) == 0;
- announce("processing bib record $rec_id [$count of $total]");
+ $KU->announce('DEBUG',
+ "processing bib record $rec_id [$count of $total; slot=$slot]");
# State variable; was the record changed?
my $changed = 0;
if ($is_fast_heading && $sf0 =~ /\)fst/) {
# fast heading looks OK. ignore it.
- announce("Ignoring FAST heading field on ".
+ $KU->announce('DEBUG', "Ignoring FAST heading field on ".
"rec=$rec_id and tag=$c_tag \$0 $sf0");
next;
}
- if ($sf0 && $refresh) {
- announce("Removing \$0 $sf0 for rec=$rec_id and tag=$c_tag");
+ if ($sf0) {
+ $KU->announce('DEBUG',
+ "Removing \$0 $sf0 for rec=$rec_id and tag=$c_tag");
$bib_field->delete_subfield(code => '0');
$changed = 1;
}
# We don't control fast headings, so there's nothing
# left to do. Move on to the next field...
- announce("No linking performed on FAST heading ".
+ $KU->announce('DEBUG',
+ "No linking performed on FAST heading ".
"field on rec=$rec_id and tag=$c_tag");
next;
}
}
next if !$match_tag;
- announce("Searching for matches on controlled field $c_tag ".
+ $KU->announce('INFO',
+ "Searching for matches on controlled field $c_tag ".
"(auth tag=$match_tag): \n - ".Dumper(\@searches));
my @tags = ($match_tag);
# Protect against failed (error condition) search request
if (!$validates) {
- print STDERR "Search for matching authority failed; record # $rec_id\n";
- next if (!$changed);
+ $KU->announce('WARNING',
+ "Search for matching authority failed; record $rec_id");
+ next unless $changed;
}
- announce("Match query returned @$validates");
+ $KU->announce('INFO', "Match query returned @$validates");
# No matches found. Nothing left to do for this field.
next if scalar(@$validates) == 0;
# matching authority record to the controlled bib field.
$changed = 1;
$bib_field->add_subfields('0' => "($CNI)$auth_id");
- announce("auth=$auth_id cni=$CNI. It's a match!");
+ $KU->announce('INFO',
+ "auth=$auth_id cni=$CNI. It's a match!");
}
}
}
update_record($record, $marc) if $changed;
-
};
if ($@) {
- print STDERR "\nRecord # $rec_id : $@\n";
- import MARC::File::XML; # reset SAX parser so that one bad record doesn't kill the entire process
+ $KU->announce('WARNING', "Error linking record $rec_id : $@");
+ # Reset SAX parser so that one bad record doesn't kill the entire process
+ import MARC::File::XML;
}
}
-my $end_time = localtime();
-
-if($input_file) {
- announce("Stop $end_time for ".scalar(@records)." records");
-} elsif($start_id) {
- announce("Start $end_time for record range: $start_id => $end_id");
-} else {
- announce("Start $end_time for all records");
-}
-
-__END__
-
-=head1 NAME
-
-authority_control_fields.pl - Controls fields in bibliographic records with authorities in Evergreen
-
-=head1 SYNOPSIS
-
-C<authority_control_fields.pl> [B<--configuration>=I<opensrf_core.conf>] [B<--refresh>]
-[[B<--record>=I<record>[ B<--record>=I<record>]]] | [B<--all>] | [B<--start_id>=I<start-ID> B<--end_id>=I<end-ID>]
-
-=head1 DESCRIPTION
-
-For a given set of records:
-
-=over
-
-=item * Iterate through the list of fields that are controlled fields
-
-=item * Iterate through the list of subfields that are controlled for
-that given field
-
-=item * Search for a matching authority record for that combination of
-field + subfield(s)
-
-=over
-
-=item * If we find a match, then add a $0 subfield to that field identifying
-the controlling authority record
-
-=item * If we do not find a match, then insert a row into an "uncontrolled"
-table identifying the record ID, field, and subfield(s) that were not controlled
-
-=back
-
-=item * Iterate through the list of floating subdivisions
-
-=over
-
-=item * If we find a match, then add a $0 subfield to that field identifying
-the controlling authority record
-
-=item * If we do not find a match, then insert a row into an "uncontrolled"
-table identifying the record ID, field, and subfield(s) that were not controlled
-
-=back
-
-=item * If we changed the record, update it in the database
-
-=back
-
-=head1 OPTIONS
-
-=over
-
-=item * B<-f>, B<--file>
-
-Specifies a file of bibs ids to link.
-
-=item * B<-c> I<config-file>, B<--configuration>=I<config-file>
-
-Specifies the OpenSRF configuration file used to connect to the OpenSRF router.
-Defaults to F</openils/conf/opensrf_core.xml>
-
-=item * B<-r> I<record-ID>, B<--record>=I<record-ID>
-
-Specifies the bibliographic record ID (found in the C<biblio.record_entry.id>
-column) of the record to process. This option may be specified more than once
-to process multiple records in a single run.
-
-=item * B<-a>, B<--all>
-
-Specifies that all bibliographic records should be processed. For large
-databases, this may take an extraordinarily long amount of time.
-
-=item * B<-r>, B<--refresh>
-
-Specifies that all authority links should be removed from the target
-bibliographic record(s). This will effectively rewrite all authority
-linking anew.
-
-=item * B<-s> I<start-ID>, B<--start_id>=I<start-ID>
-
-Specifies the starting ID of the range of bibliographic records to process.
-This option is ignored unless it is accompanied by the B<-e> or B<--end_id>
-option.
-
-=item * B<-e> I<end-ID>, B<--end_id>=I<end-ID>
-
-Specifies the ending ID of the range of bibliographic records to process.
-This option is ignored unless it is accompanied by the B<-s> or B<--start>
-option.
-
-=back
-
-=head1 EXAMPLES
-
- authority_control_fields.pl --start_id 1 --end_id 50000
-
-Processes the bibliographic records with IDs between 1 and 50,000 using the
-default OpenSRF configuration file for connection information.
-
-=head1 AUTHOR
-
-Dan Scott <dscott@laurentian.ca>
-
-=head1 COPYRIGHT AND LICENSE
-
-Copyright 2010-2011 by Dan Scott
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-
-=cut
+my $end_time = time();
+my $duration = $end_time - $start_time;
+$KU->announce('INFO', "Bib linking script duration $duration seconds");
+++ /dev/null
-#!/usr/bin/perl
-# ----------------------------------------------------------------------
-# Find bib records matching the requested criteria for linking.
-# Bib IDs are exported to one or more batch files for future processing.
-# ----------------------------------------------------------------------
-use strict;
-use warnings;
-use DBI;
-use Getopt::Long;
-use DateTime;
-
-my $db_handle;
-my $counter = 0;
-
-# options
-my $help;
-my $modified_since;
-my $exported_since;
-my $batch_size = 10000;
-my $start_id;
-my $end_id;
-my $count_only;
-my $out_dir = '/tmp';
-my $db_host = $ENV{PGHOST} || 'localhost';
-my $db_port = $ENV{PGPORT} || '5432';
-my $db_user = $ENV{PGUSER} || 'evergreen';
-my $db_name = $ENV{PGDATABASE} || 'evergreen';
-my $db_pass = $ENV{PGPASSWORD};
-
-my $opt_result = GetOptions(
- 'modified-since=s' => \$modified_since,
- 'exported-since=s' => \$exported_since,
- 'start-id=i' => \$start_id,
- 'end-id=i' => \$end_id,
- 'batch-size=i' => \$batch_size,
- 'count-only' => \$count_only,
- 'out-dir=s' => \$out_dir,
- "db-host=s" => \$db_host,
- "db-user=s" => \$db_user,
- "db-pass=s" => \$db_pass,
- "db-port=s" => \$db_port,
- 'help' => \$help
-);
-
-sub announce {
- my $msg = shift;
- print DateTime->now(time_zone => 'local')->strftime('%F %T')." $msg\n";
-}
-
-sub help {
- print <<HELP;
- Find IDs for bib records based on various criteria. Write bib
- IDs to batch files. Batch files are placed into --out-dir and
- named bib-ids.001, bib-ids.002, etc.
-
- Usage:
-
- Find
-
- $0 --modified-since 1 --batch-size 100 \
- --out-dir /openils/var/data/linkbibs/2016-12-01
-
- Options:
-
- --modified-since <YYYY-MM-DD>
- Limit bibs to those modifed since the specified date.
-
- --exported-since <YYYY-MM-DD>
- Limit bibs to those exported since the specified date.
- Export date is based on data found in the
- metabib.bib_export_data table.
-
- --start-id <id>
- Limit bibs to those whose ID is no less than <id>
-
- --end-id <id>
- Limit bibs to those whose ID is no greater than <id>
-
- --out-dir [/tmp]
- Output directory.
-
- --batch-size
- Number of bib IDs to write to each batch file.
-
- --count-only
- Print the total number of records that would be added
- to batch files without adding to any batch files.
-
- --db-host
- --db-user
- --db-pass
- --db-port
- Database connection params. PG environment variables are
- also inspected for values. When all else fails, try to
- connect to database evergreen\@localhost
-HELP
- exit 0;
-}
-
-help() if $help || !$opt_result;
-
-sub connect_db {
- $db_handle = DBI->connect(
- "dbi:Pg:db=$db_name;host=$db_host;port=$db_port;options='--statement-timeout=0'",
- $db_user, $db_pass, {
- RaiseError => 1,
- PrintError => 0,
- AutoCommit => 1,
- pg_expand_array => 0,
- pg_enable_utf8 => 1
- }
- ) or die "Connection to database failed: $DBI::err : $DBI::errstr";
-}
-
-connect_db();
-
-# ----------------------------------------------------------------------
-my $from = 'FROM biblio.record_entry bre';
-
-my $where = 'WHERE NOT bre.deleted';
-$where .= " AND bre.id >= $start_id" if $start_id;
-$where .= " AND bre.id <= $end_id" if $end_id;
-
-if ($exported_since) {
- $where .= " AND bed.export_date > '$exported_since'";
- $from .= " JOIN metabib.bib_export_data bed ON (bed.bib = bre.id)";
-}
-
-my $sql = <<SQL;
- SELECT bre.id
- $from
- $where
- ORDER BY bre.id DESC;
-SQL
-
-my $sth = $db_handle->prepare($sql);
-$sth->execute;
-
-my $batch_file;
-sub open_batch_file {
- my $path = shift;
- announce("Starting new batch file: $path");
-
- close $batch_file if $batch_file;
-
- open $batch_file, '>', $path or
- die "Cannot open batch file for writing: $!\n";
-}
-
-my $ctr = 0;
-my $batch = 0;
-while (my $ref = $sth->fetchrow_hashref()) {
- $ctr++;
- next if $count_only;
-
- if (( ($ctr - 1) % $batch_size) == 0) {
- my $path = sprintf("$out_dir/bib-ids.%0.3d", $batch);
- open_batch_file($path);
- $batch++;
- }
-
- print $batch_file $ref->{id} . "\n";
-}
-
-close $batch_file if $batch_file;
-$sth->finish;
-
-announce("Found $ctr bib records");
-
--- /dev/null
+#!/bin/bash
+set -u
+NUM_PROCS=3
+SLOTS_PER_PROC=10
+
+DOW=$(date +%u);
+[ $DOW == 7 ] && DOW=0; # set Sunday = 0
+
+let SLOTS_PER_SCRIPT=$NUM_PROCS*$SLOTS_PER_PROC;
+let TOTAL_SLOTS=$SLOTS_PER_SCRIPT*7; # run daily
+let PROC_START_SLOT=$SLOTS_PER_SCRIPT*$DOW;
+
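+# With the defaults above, each daily run covers 3 procs x 10 slots
+# = 30 of the 210 weekly slots.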
+echo "Starting at slot $PROC_START_SLOT";
+
+function execute_background_proc {
+ START_SLOT=$1
+ let MAX_SLOT=$SLOTS_PER_PROC-1;
+
+ for SLOT_OFFSET in $(seq 0 $MAX_SLOT); do
+ let SLOT=$START_SLOT+$SLOT_OFFSET
+ echo "Processing slot $SLOT of $TOTAL_SLOTS"
+ perl ./authority_control_fields.pl \
+ --slot-count $TOTAL_SLOTS --slot $SLOT
+ done;
+}
+
+for PROC in $(seq 1 $NUM_PROCS); do
+ (execute_background_proc $PROC_START_SLOT) &
+ let PROC_START_SLOT=$PROC_START_SLOT+$SLOTS_PER_PROC;
+done
+
+# Wait for all child processes to complete. This makes it easier
+# to track and/or kill all of the child processes if needed.
+wait;
+
+echo "Done processing all batches"
+
+++ /dev/null
-#!/bin/bash
-set -eu
-PROC_COUNT=3
-BATCHES_PER_PROC=9
-BATCH_DIR="/openils/var/data/authority-control/link-bibs"
-
-DOW=$(date +%u);
-[ $DOW == 7 ] && DOW=0; # make Sunday = 0
-
-BATCH_SIZE=$((($PROC_COUNT * $BATCHES_PER_PROC)));
-BATCH=$((($BATCH_SIZE * $DOW)));
-
-echo "Starting at batch number $BATCH";
-
-function execute_background_proc {
- START=$1
- for batch in $(seq 1 $BATCHES_PER_PROC); do
- FILE=$(printf "$BATCH_DIR/bib-ids.%0.3d" $((($START + $batch))));
- if [ -f $FILE ]; then
- echo "Linking bib batch file $FILE"
- else
- echo "Skipping nonexistent bib link batch file $FILE"
- continue
- fi
- perl ./authority_control_fields.pl --refresh --file $FILE
- done;
-}
-
-for PROC in $(seq 1 $PROC_COUNT); do
- (execute_background_proc $BATCH) &
- BATCH=$((($BATCH + BATCHES_PER_PROC)));
-done
-
-wait;
-
-echo "Done processing all batches"
-
--- /dev/null
+package OpenILS::Utils::KCLSScriptUtil;
+# ---------------------------------------------------------------------
+# Script utility functions.
+#
+# 1. Assumes a single DB connection per Perl process.
+#
+# 2. Keep imports at a minimum so utility scripts can be run from
+# non-Evergreen servers (e.g. beefy DB servers).
+# ---------------------------------------------------------------------
+use strict;
+use warnings;
+use DBI;
+use DateTime;
+use Sys::Syslog qw(syslog openlog);
+my $KU = 'OpenILS::Utils::KCLSScriptUtil';
+
+# ---------------------------------------------------------------------
+# LOGGING
+# ---------------------------------------------------------------------
+our $syslog_facility = 'LOCAL6'; # matches Evergreen gateway
+our $syslog_ops = 'pid';
+our $syslog_ident = 'KCLSScriptUtil';
+my $syslog_opened = 0;
+my $verbose = 0;
+
+sub syslog_facility {
+ my $cls = shift;
+ my $val = shift;
+ $syslog_facility = $val if defined $val;
+ return $syslog_facility;
+}
+
+sub syslog_ops {
+ my $cls = shift;
+ my $val = shift;
+ $syslog_ops = $val if defined $val;
+ return $syslog_ops;
+}
+
+sub syslog_ident {
+ my $cls = shift;
+ my $val = shift;
+ $syslog_ident = $val if defined $val;
+ return $syslog_ident;
+}
+
+sub syslog_opened {
+ my $cls = shift;
+ my $val = shift;
+ $syslog_opened = $val if defined $val;
+ return $syslog_opened;
+}
+
+sub verbose {
+ my $cls = shift;
+ my $val = shift;
+ $verbose = $val if defined $val;
+ return $verbose;
+}
+
+
+
+# Send messages to syslog
+# Send INFO, DEBUG messages to STDOUT if $verbose is true
+# Send ERR, WARNING messages to STDERR
+# Finishes with die() if $die is true
+sub announce {
+ my $cls = shift; # class ref; unused
+ my $lvl = shift; # syslog level (minus the LOG_ part)
+ my $msg = shift; # message string to log
+ my $die = shift; # true if this should cause the script to die
+
+ if (!$syslog_opened) {
+ openlog($syslog_ident, $syslog_ops, $syslog_facility);
+ $syslog_opened = 1;
+ }
+
+ syslog("LOG_$lvl", $msg);
+
+ my $date_str = DateTime->now(time_zone => 'local')->strftime('%F %T');
+ my $msg_str = "$date_str [$$] $lvl $msg\n";
+
+ if ($die) {
+ die $msg_str;
+ }
+
+ if ($lvl eq 'ERR' or $lvl eq 'WARNING') {
+ # Always copy problem messages to STDERR, regardless of $verbose.
+ warn $msg_str;
+ } else {
+ print $msg_str if $verbose;
+ }
+}
+
+# ---------------------------------------------------------------------
+# DATABASE
+# ---------------------------------------------------------------------
+our $db_user = $ENV{PGUSER} || 'evergreen';
+our $db_name = $ENV{PGDATABASE} || 'evergreen';
+our $db_host = $ENV{PGHOST} || 'localhost';
+our $db_port = $ENV{PGPORT} || '5432';
+our $db_pass = $ENV{PGPASSWORD};
+our $db_handle;
+our $db_handle_ops = 0;
+our @db_statements;
+
+sub connect_db {
+ my $dsn = "dbi:Pg:db=$db_name;host=$db_host;port=$db_port";
+ $KU->announce('DEBUG', "Connecting to DB $dsn");
+
+ $db_handle = DBI->connect(
+ "$dsn;options='--statement-timeout=0'",
+ $db_user, $db_pass, {
+ RaiseError => 1,
+ PrintError => 0,
+ AutoCommit => 1,
+ pg_expand_array => 0,
+ pg_enable_utf8 => 1,
+ Callbacks => {
+ ChildCallbacks => {
+ # Track the number of execute() operations
+ execute => sub { $db_handle_ops++; return; }
+ }
+ }
+ }
+ ) or $KU->announce('ERR',
+ "Connection to database failed: $DBI::err : $DBI::errstr", 1);
+
+ return $db_handle;
+}
+
+sub db_handle {
+ return $db_handle;
+}
+
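+# Exposes the execute() counter so long-running scripts can
+# periodically reset the connection (see reset_db_handle below)
+# to cap memory growth.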
+sub db_handle_ops {
+ return $db_handle_ops;
+}
+
+sub disconnect_db {
+ return unless $db_handle;
+ $KU->announce('DEBUG', 'Disconnecting DB handle and cleaning up statements');
+ $_->finish for @db_statements;
+ $db_handle->disconnect;
+ @db_statements = ();
+}
+
+sub reset_db_handle {
+ $KU->announce('DEBUG', 'Resetting DB connection') if $db_handle;
+ $db_handle_ops = 0;
+ disconnect_db();
+ connect_db();
+}
+
+sub prepare_statement {
+ my ($cls, $sql) = @_;
+ my $sth = $db_handle->prepare($sql);
+ push(@db_statements, $sth);
+ return $sth;
+}
+
+
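+# ---------------------------------------------------------------------
+# EXAMPLE
+# ---------------------------------------------------------------------
+# A minimal usage sketch; the syslog tag and query below are
+# illustrative only, not taken from any real script:
+#
+#   use OpenILS::Utils::KCLSScriptUtil;
+#   my $KU = 'OpenILS::Utils::KCLSScriptUtil';
+#
+#   $KU->syslog_ident('MYSCRIPT'); # hypothetical ident
+#   $KU->verbose(1);               # echo INFO messages to STDOUT
+#
+#   $KU->connect_db;               # honors PGUSER/PGHOST/PGPASSWORD/etc.
+#   my $sth = $KU->prepare_statement(
+#       'SELECT id FROM biblio.record_entry WHERE NOT deleted LIMIT 5');
+#   $sth->execute;
+#   while (my $row = $sth->fetchrow_hashref) {
+#       $KU->announce('INFO', 'Found bib '.$row->{id});
+#   }
+#   $KU->disconnect_db;            # finish()es tracked statements
+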
+1;