From e217c6fbbc591b2d7674c9e5102e1715ae163054 Mon Sep 17 00:00:00 2001
From: Bill Erickson
Date: Thu, 15 Dec 2016 12:18:20 -0500
Subject: [PATCH] JBAS-1437 Linking script additions and repairs

Teach the bib linker how to find the records to link instead of
requiring external batch files.

Remove aging Perl docs.

Rename link-bib-batches.sh to link-all-bibs-daily.sh and update it to
leverage the bib linker's new bib-finding parameters.

Remove the no-longer-needed find-bibs-to-link script, which is
superseded by the above.

Update README to reflect changes.

==

Adds a collection of common logging and DB utility functions to a new
module, OpenILS::Utils::KCLSScriptUtil.  Updates various Backstage /
linking scripts to use this new module.  More migrations to follow.

Auth2Auth gets a new --auth-mod-since option.

Signed-off-by: Bill Erickson
---
 KCLS/authority-control/backstage/export-bibs.pl    |  97 ++---
 .../backstage/process-backstage-files.pl           | 197 +++------
 KCLS/authority-control/linking/README.adoc         |  10 +-
 .../linking/authority_authority_linker.pl          | 249 ++++-------
 .../linking/authority_control_fields.pl            | 480 ++++++++++-----------
 .../authority-control/linking/find-bibs-to-link.pl | 169 --------
 .../linking/link-all-bibs-daily.sh                 |  37 ++
 KCLS/authority-control/linking/link-bib-batches.sh |  37 --
 .../perlmods/lib/OpenILS/Utils/KCLSScriptUtil.pm   | 164 +++++++
 9 files changed, 615 insertions(+), 825 deletions(-)
 delete mode 100755 KCLS/authority-control/linking/find-bibs-to-link.pl
 create mode 100755 KCLS/authority-control/linking/link-all-bibs-daily.sh
 delete mode 100755 KCLS/authority-control/linking/link-bib-batches.sh
 create mode 100644 Open-ILS/src/perlmods/lib/OpenILS/Utils/KCLSScriptUtil.pm

diff --git a/KCLS/authority-control/backstage/export-bibs.pl b/KCLS/authority-control/backstage/export-bibs.pl
index a82a0b0c8d..dee53a9a46 100755
--- a/KCLS/authority-control/backstage/export-bibs.pl
+++ b/KCLS/authority-control/backstage/export-bibs.pl
@@ -3,7 +3,6 @@
 # Export bib records for Backstage processing.
 #
 # The UTF-8 encoded USMARC string for each record is printed to STDOUT.
-# Each exported bib has its export_date value updated to NOW().
 #
 # Exported bibs meet the following criteria:
 #
@@ -20,38 +19,24 @@ use DBI;
 use Getopt::Long;
 use MARC::Record;
 use MARC::File::XML (BinaryEncoding => 'UTF-8');
-
-my $db_handle;
+use OpenILS::Utils::KCLSScriptUtil;
+my $KU = 'OpenILS::Utils::KCLSScriptUtil';
 my $start_date;
 my $end_date;
-my $export_date;
 my $ids_only;
 my $count_only;
 my $out_file;
-my $limit;
-my $db_user = $ENV{PGUSER} || 'evergreen';
-my $db_name = $ENV{PGDATABASE} || 'evergreen';
-my $db_host = $ENV{PGHOST} || 'localhost';
-my $db_port = $ENV{PGPORT} || '5432';
-my $db_pass = $ENV{PGPASSWORD};
 my $help;

 GetOptions(
     'start-date=s' => \$start_date,
     'end-date=s' => \$end_date,
-    'export-date=s' => \$export_date,
     'ids-only' => \$ids_only,
     'count-only' => \$count_only,
     'out-file=s' => \$out_file,
-    'limit=f' => \$limit,
-    'db-user=s' => \$db_user,
-    'db-host=s' => \$db_host,
-    'db-name=s' => \$db_name,
-    'db-port=i' => \$db_port,
-    'db-pass=s' => \$db_pass,
     'help' => \$help
-);
+) || help();

 sub help {
     print <<HELP;
-    --export-date
-        Sets the export date to the provided value.  If no --export-date
-        value is set, no export date value will be applied in the database.

     --out-file
         Write MARC records (or IDs) to this file.
@@ -85,22 +64,21 @@ Options
     --count-only
         Only print the number of bibs that would be exported to STDOUT.

-    --limit
-        Export at most this many records.
- HELP exit; } -die "--start-date and --end-date required\n" +help() if $help; + +$KU->announce('ERR', "--start-date and --end-date required", 1) unless $start_date && $end_date; -die "Invalid date format\n" unless +$KU->announce('ERR', "Invalid date format", 1) unless $start_date =~ /^\d{4}-\d{2}-\d{2}$/ && - $end_date =~ /^\d{4}-\d{2}-\d{2}$/ && - (!$export_date || $export_date =~ /^\d{4}-\d{2}-\d{2}$/); + $end_date =~ /^\d{4}-\d{2}-\d{2}$/; -die "--out-file required\n" unless $out_file || $count_only; +$KU->announce('ERR', "--out-file required", 1) + unless $out_file || $count_only; sub bib_query { my $sql = <prepare(bib_query()); - my $edate_sth = $db_handle->prepare( - 'SELECT * FROM metabib.set_export_date(?, ?)'); - + my $sth = $KU->prepare_statement(bib_query()); $sth->execute; + my $count = 0; my @skipped; while (my $bib = $sth->fetchrow_hashref) { @@ -207,60 +182,40 @@ sub export_marc { if ($ids_only) { print MARCFILE "$bib_id\n"; - print "$count records written...\n" if ($count % 1000) == 0; + $KU->announce('INFO', "$count records written...") + if ($count % 1000) == 0; next; } - my $rec = $db_handle->selectall_arrayref( + my $rec = $KU->db_handle->selectall_arrayref( "SELECT marc FROM biblio.record_entry WHERE id = $bib_id"); my $marc = $rec->[0]->[0]; my $marcdoc = MARC::Record->new_from_xml($marc, 'UTF-8', 'USMARC'); if (my @warnings = $marcdoc->warnings) { - print "Skipping record $bib_id on warnings: @warnings\n"; + $KU->announce('WARNING', + "Skipping record $bib_id on warnings: @warnings"); push(@skipped, $bib_id); } else { print MARCFILE $marcdoc->as_usmarc; } - print "$count records processed...\n" if ($count % 1000) == 0; - - next unless $export_date; - - # Update the bib record's metabib.bib_export_data entry. - eval { $edate_sth->execute($bib_id, $export_date) }; - die "Error setting export date for bib ". 
- "$bib_id to $export_date : $@\n" if $@; + $KU->announce('INFO', "$count records processed...") + if ($count % 1000) == 0; } close(MARCFILE) if $out_file; my $skip_count = scalar(@skipped); - print "total bib count = $count\n"; - print "skipped bib count = $skip_count\n"; - print "export bib count = ".($count - $skip_count)."\n"; - print "skipped bibs: @skipped\n"; - $sth->finish; - $edate_sth->finish; + $KU->announce('INFO', "total bibs = $count"); + $KU->announce('INFO', "skipped bibs = $skip_count"); + $KU->announce('INFO', "exported bibs = ".($count - $skip_count)); + $KU->announce('INFO', "skipped bibs: @skipped") if @skipped; } -sub connect_db { - $db_handle = DBI->connect( - "dbi:Pg:db=$db_name;host=$db_host;port=$db_port;options='--statement-timeout=0'", - $db_user, $db_pass, { - RaiseError => 1, - PrintError => 0, - AutoCommit => 1, - pg_expand_array => 0, - pg_enable_utf8 => 1 - } - ) or die "Connection to database failed: $DBI::err : $DBI::errstr"; -} - -connect_db(); +$KU->connect_db; export_marc(); - -$db_handle->disconnect; +$KU->disconnect_db; diff --git a/KCLS/authority-control/backstage/process-backstage-files.pl b/KCLS/authority-control/backstage/process-backstage-files.pl index 8f571f3d14..b29c6d8082 100755 --- a/KCLS/authority-control/backstage/process-backstage-files.pl +++ b/KCLS/authority-control/backstage/process-backstage-files.pl @@ -17,10 +17,11 @@ use Archive::Zip qw(:ERROR_CODES :CONSTANTS); use File::Basename; use Sys::Syslog qw(syslog openlog); use OpenILS::Utils::Normalize qw(clean_marc); +use OpenILS::Utils::KCLSScriptUtil; binmode(STDOUT, ':utf8'); -my $db_handle; -my $db_handle_ops = 0; +my $KU = 'OpenILS::Utils::KCLSScriptUtil'; + # Reset the DB handle have this many operations to avoid memory leaks. my $db_handle_reset = 500; my $log_mod = 500; # log every 500th of each type of event (see verbose) @@ -29,17 +30,8 @@ my $marc_file; my $zip_file; my $export_date; my $working_dir = '.', -my $bib_collision_file = 'bib-collisions.mrc'; # in --working-dir my $verbose; -my $db_user = $ENV{PGUSER} || 'evergreen'; -my $db_name = $ENV{PGDATABASE} || 'evergreen'; -my $db_host = $ENV{PGHOST} || 'localhost'; -my $db_port = $ENV{PGPORT} || '5432'; -my $db_pass = $ENV{PGPASSWORD}; - -my $syslog_facility = 'LOCAL6'; # matches Evergreen gateway -my $syslog_ops = 'pid'; -my $syslog_ident = 'BACKSTAGE'; +my $bib_collision_file = 'bib-collisions.mrc'; # in --working-dir my $new_auth_sth; my $mod_auth_sth; @@ -53,31 +45,26 @@ my $mod_auth_ctr = 0; my $del_auth_ctr = 0; my $mod_bibs_ctr = 0; my $col_bibs_ctr = 0; - my $help; GetOptions( - 'marc-file=s' => \$marc_file, - 'zip-file=s' => \$zip_file, - 'export-date=s' => \$export_date, - 'working-dir=s' => \$working_dir, - 'bib-collision-file=s' => \$bib_collision_file, - 'verbose' => \$verbose, - 'db-user=s' => \$db_user, - 'db-host=s' => \$db_host, - 'db-name=s' => \$db_name, - 'db-port=i' => \$db_port, - 'db-pass=s' => \$db_pass, + 'marc-file=s' => \$marc_file, + 'zip-file=s' => \$zip_file, + 'export-date=s' => \$export_date, + 'working-dir=s' => \$working_dir, + 'verbose' => \$verbose, 'help' => \$help ); sub help { print <verbose($verbose); +$KU->syslog_ident('BACKSTAGE'); -die "--marc-file or --zip-file required\n" unless ($marc_file || $zip_file); +$KU->announce('ERR', "required: --export-date YYYY-MM-DD", 1) + unless $export_date && $export_date =~ /^\d{4}-\d{2}-\d{2}$/; + +$KU->announce('ERR', "--marc-file or --zip-file required", 1) + unless ($marc_file || $zip_file); # Log every occurrence of each event type. 
$log_mod = 1 if $verbose; -sub announce { - my ($level, $msg, $die) = @_; - syslog("LOG_$level", $msg); - - my $date_str = DateTime->now(time_zone => 'local')->strftime('%F %T'); - my $msg_str = "$date_str [$$] $level $msg\n"; - - if ($die) { - die $msg_str; - - } else { - if ($level eq 'ERR' or $level eq 'WARNING') { - # always copy problem messages to stdout - warn $msg_str; # avoid dupe messages - } else { - print $msg_str; - } - } -} - sub check_db_handle { - return if $db_handle_ops < $db_handle_reset; - reset_db_handle(); -} - -sub reset_db_handle { - announce('INFO', 'Refreshing DB connection on max ops.') - if $verbose && $db_handle; - $db_handle_ops = 0; - disconnect_db(); - connect_db(); + return if $KU->db_handle_ops < $db_handle_reset; + $KU->reset_db_handle; prepare_statements(); } -sub disconnect_db { - return unless $db_handle; - - $new_auth_sth->finish; - $mod_auth_sth->finish; - $del_auth_sth->finish; - $delmod_auth_sth->finish; - $match_auth_sth->finish; - $match_auth_001_sth->finish; - $mod_bibs_sth->finish; - $db_handle->disconnect; -} - - -sub connect_db { - $db_handle = DBI->connect( - "dbi:Pg:db=$db_name;host=$db_host;port=$db_port;options='--statement-timeout=0'", - $db_user, $db_pass, { - RaiseError => 1, - PrintError => 0, - AutoCommit => 1, - pg_expand_array => 0, - pg_enable_utf8 => 1, - Callbacks => { - # Track the number of execute() operations - ChildCallbacks => { - execute => sub { $db_handle_ops++; return; } - } - } - } - ) or die "Connection to database failed: $DBI::err : $DBI::errstr"; -} - sub process_zip_file { my $zip = Archive::Zip->new(); - announce('ERR', "Failed to read $zip_file", 1) + $KU->announce('ERR', "Failed to read $zip_file", 1) unless $zip->read($zip_file) == AZ_OK; my %marc_files = (bib => [], auth => []); @@ -202,11 +119,11 @@ sub process_zip_file { my $basename = basename($member->fileName()); - announce('INFO', "Extracting file $basename"); + $KU->announce('INFO', "Extracting file $basename"); my $local_file = "$working_dir/$basename"; - announce('ERR', "Unable to extract to file: $local_file", 1) + $KU->announce('ERR', "Unable to extract to file: $local_file", 1) unless $member->extractToFileNamed($local_file) == AZ_OK; my $key = ($basename =~ /BIB/) ? 'bib' : 'auth'; @@ -220,10 +137,10 @@ sub process_zip_file { sub process_marc_file { my $local_file = shift; my $basename = basename($local_file); - announce('INFO', "Processing file $basename"); + $KU->announce('INFO', "Processing file $basename"); my $marc_batch = MARC::File::USMARC->in($local_file, 'UTF8') - or announce('ERR', "Unable to read $local_file as MARC", 1); + or $KU->announce('ERR', "Unable to read $local_file as MARC", 1); if ($basename =~ /BIB/) { @@ -239,7 +156,7 @@ sub process_marc_file { } else { - announce('WARNING', "Unknown file type: $basename"); + $KU->announce('WARNING', "Unknown file type: $basename"); } } @@ -251,7 +168,7 @@ sub find_modified_bibs { return if $mod_searched; $mod_searched = 1; - my $id_arrays = $db_handle->selectall_arrayref(<<" SQL"); + my $id_arrays = $KU->db_handle->selectall_arrayref(<<" SQL"); SELECT id FROM biblio.record_entry WHERE NOT deleted AND edit_date >= '$export_date' @@ -259,7 +176,7 @@ sub find_modified_bibs { @modified_bibs = map {$_->[0]} @$id_arrays; - announce('INFO', scalar(@modified_bibs)." bibs modified since export"); + $KU->announce('INFO', scalar(@modified_bibs)." 
bibs modified since export"); } @@ -278,7 +195,7 @@ sub handle_modified_bibs { check_db_handle(); if (!$bib_id) { - announce('ERR', "Bib record has no 901c (ID) value. Skipping"); + $KU->announce('ERR', "Bib record has no 901c (ID) value. Skipping"); next; } @@ -304,13 +221,13 @@ sub update_bib { eval { $mod_bibs_sth->execute($marcxml, $bib_id) }; if ($@) { - announce('ERR', "Error updating biblio record: $@ : $marcxml"); + $KU->announce('ERR', "Error updating biblio record: $@ : $marcxml"); return; } $mod_bibs_ctr++; - announce('INFO', "Updated $mod_bibs_ctr bib records") + $KU->announce('INFO', "Updated $mod_bibs_ctr bib records") if $mod_bibs_ctr % $log_mod == 0; } @@ -320,18 +237,18 @@ sub write_bib_collision { my $filename = "$working_dir/$bib_collision_file"; open(BIBS_FILE, ">>$filename") or - announce('ERR', "Cannot open bib collision file: $filename : $!", 1); + $KU->announce('ERR', "Cannot open bib collision file: $filename : $!", 1); binmode(BIBS_FILE, ":utf8"); print BIBS_FILE $record->as_usmarc(); close BIBS_FILE or - announce('WARNING', "Error closing bib collision file: $filename : $!"); + $KU->announce('WARNING', "Error closing bib collision file: $filename : $!"); $col_bibs_ctr++; - announce('INFO', "Dumped $col_bibs_ctr bib collisions to file") + $KU->announce('INFO', "Dumped $col_bibs_ctr bib collisions to file") if $col_bibs_ctr % $log_mod == 0; } @@ -351,14 +268,14 @@ sub handle_deleted_auths { }; if ($@) { - announce( + $KU->announce( 'ERR', "Error deleting authority record: $@ : $auth_id"); next; } $del_auth_ctr++; - announce('INFO', "Deleted $del_auth_ctr authority records") + $KU->announce('INFO', "Deleted $del_auth_ctr authority records") if $del_auth_ctr % $log_mod == 0; } } @@ -411,13 +328,13 @@ sub update_auth { eval { $mod_auth_sth->execute($marcxml, $auth_id) }; if ($@) { - announce('ERR', "Error updating authority record: $@ : $marcxml"); + $KU->announce('ERR', "Error updating authority record: $@ : $marcxml"); return; } $mod_auth_ctr++; - announce('INFO', "Updated $mod_auth_ctr authority records") + $KU->announce('INFO', "Updated $mod_auth_ctr authority records") if $mod_auth_ctr % $log_mod == 0; } @@ -427,14 +344,14 @@ sub insert_auth { eval { $new_auth_sth->execute($marcxml, "IMPORT-" . 
time) }; if ($@) { - announce('ERR', + $KU->announce('ERR', "Error creating new authority record: $@ : $marcxml"); return; } $new_auth_ctr++; - announce('INFO', "Created $new_auth_ctr authority records") + $KU->announce('INFO', "Created $new_auth_ctr authority records") if $new_auth_ctr % $log_mod == 0; } @@ -452,7 +369,7 @@ sub find_replaced_auths { my $matches = $match_auth_sth->fetchall_arrayref; my @ids = map {$_->[0]} @$matches; - announce('INFO', "Auth 010z=$subfield matched records: @ids") + $KU->announce('INFO', "Auth 010z=$subfield matched records: @ids") if $verbose && @ids; return @ids; @@ -473,7 +390,7 @@ sub find_matching_auths { $match_auth_001_sth->execute($subfield); my $matches = $match_auth_001_sth->fetchall_arrayref; my @ids = map {$_->[0]} @$matches; - announce('INFO', "Auth 001=$subfield matched records: @ids") + $KU->announce('INFO', "Auth 001=$subfield matched records: @ids") if $verbose && @ids; return @ids; } @@ -493,7 +410,7 @@ sub find_matching_auths { my $matches = $match_auth_sth->fetchall_arrayref; my @ids = map {$_->[0]} @$matches; - announce('INFO', "Auth ${tag}a=$subfield matched records: @ids") + $KU->announce('INFO', "Auth ${tag}a=$subfield matched records: @ids") if $verbose && @ids; return @ids; @@ -501,33 +418,33 @@ sub find_matching_auths { sub prepare_statements { - $del_auth_sth = $db_handle->prepare(<<" SQL"); + $del_auth_sth = $KU->prepare_statement(<<" SQL"); DELETE FROM authority.record_entry WHERE id = ? SQL - $delmod_auth_sth = $db_handle->prepare(<<" SQL"); + $delmod_auth_sth = $KU->prepare_statement(<<" SQL"); UPDATE authority.record_entry SET edit_date = NOW() WHERE id = ? SQL - $mod_bibs_sth = $db_handle->prepare(<<" SQL"); + $mod_bibs_sth = $KU->prepare_statement(<<" SQL"); UPDATE biblio.record_entry SET marc = ?, edit_date = NOW() WHERE id = ? SQL - $mod_auth_sth = $db_handle->prepare(<<" SQL"); + $mod_auth_sth = $KU->prepare_statement(<<" SQL"); UPDATE authority.record_entry SET marc = ?, edit_date = NOW() WHERE id = ? SQL - $new_auth_sth = $db_handle->prepare(<<" SQL"); + $new_auth_sth = $KU->prepare_statement(<<" SQL"); INSERT INTO authority.record_entry (marc, last_xact_id) VALUES (?, ?) 
SQL - $match_auth_sth = $db_handle->prepare(<<" SQL"); + $match_auth_sth = $KU->prepare_statement(<<" SQL"); SELECT DISTINCT(rec.id) FROM authority.record_entry rec JOIN authority.full_rec frec ON (frec.record = rec.id) @@ -538,7 +455,7 @@ sub prepare_statements { AND frec.value = NACO_NORMALIZE(?, 'a') SQL - $match_auth_001_sth = $db_handle->prepare(<<" SQL"); + $match_auth_001_sth = $KU->prepare_statement(<<" SQL"); SELECT DISTINCT(rec.id) FROM authority.record_entry rec JOIN authority.full_rec frec ON (frec.record = rec.id) @@ -549,9 +466,9 @@ sub prepare_statements { SQL } -openlog($syslog_ident, $syslog_ops, $syslog_facility); -reset_db_handle(); +$KU->connect_db; +prepare_statements(); process_zip_file() if $zip_file; process_marc_file($marc_file) if $marc_file; -disconnect_db(); +$KU->disconnect_db; diff --git a/KCLS/authority-control/linking/README.adoc b/KCLS/authority-control/linking/README.adoc index 27685f4f12..b19687d641 100644 --- a/KCLS/authority-control/linking/README.adoc +++ b/KCLS/authority-control/linking/README.adoc @@ -2,17 +2,13 @@ == Re-Link All Bibs Over 1 Week == - [source,sh] -------------------------------------------------------------------------- sudo -u opensrf -LINK_DIR=/openils/var/data/authority-control/link-bibs - -mkdir -p $LINK_DIR -PGPASSWORD= ./find-bibs-to-link.pl --db-host \ - --batch-size 5000 --out-dir $LINK_DIR +export PGPASSWORD= +export PGHOST= -./link-bib-batches.sh # e.g. via CRON +./link-all-bibs-daily.sh # CRON -------------------------------------------------------------------------- diff --git a/KCLS/authority-control/linking/authority_authority_linker.pl b/KCLS/authority-control/linking/authority_authority_linker.pl index 2c134e74a4..699a7909b9 100755 --- a/KCLS/authority-control/linking/authority_authority_linker.pl +++ b/KCLS/authority-control/linking/authority_authority_linker.pl @@ -14,12 +14,12 @@ use OpenSRF::EX qw/:try/; use Encode; use Unicode::Normalize; use OpenILS::Utils::Normalize; -use Data::Dumper; -use Pod::Usage qw/ pod2usage /; +use OpenILS::Utils::KCLSScriptUtil; $ENV{OSRF_LOG_CLIENT} = 1; +my $KU = 'OpenILS::Utils::KCLSScriptUtil'; +$KU->syslog_ident('LINKING'); -$Data::Dumper::Indent = 0; MARC::Charset->assume_unicode(1); my $acsaf_cache = {}; @@ -47,41 +47,33 @@ my ($start_id, $end_id); my $bootstrap = '/openils/conf/opensrf_core.xml'; my @records; my $verbose; +my $auth_mod_since; my $input_file =''; -my $db_host = $ENV{PGHOST} || 'localhost'; -my $db_port = $ENV{PGPORT} || '5432'; -my $db_user = $ENV{PGDATABASE} || 'evergreen'; -my $db_pass = $ENV{PGPASSWORD}; +my $count_only; +my $all; +my $help; + my $links_removed = 0; my $links_added = 0; my $CNI = 'KCLS'; -my %options; my $result = GetOptions( - \%options, - 'configuration=s' => \$bootstrap, - 'record=i' => \@records, - 'all', 'help', + 'record=i' => \@records, + 'all' => \$all, + 'help' => \$help, + 'auth-mod-since=s' => \$auth_mod_since, 'start_id=i' => \$start_id, - 'end_id=i' => \$end_id, - 'file=s' => \$input_file, - 'verbose' => \$verbose, - "db-host=s" => \$db_host, - "db-user=s" => \$db_user, - "db-pass=s" => \$db_pass, - "db-port=s" => \$db_port + 'end_id=i' => \$end_id, + 'file=s' => \$input_file, + 'count-only' => \$count_only, + 'verbose' => \$verbose, ); -sub announce { - my $msg = shift; - return unless $verbose; - print DateTime->now(time_zone => 'local')->strftime('%F %T') . 
" $msg\n"; -} - -pod2usage(0) if not $result or $options{help}; +$KU->verbose($verbose); OpenSRF::System->bootstrap_client(config_file => $bootstrap); -Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); +Fieldmapper->import( + IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); # must be loaded and initialized after the IDL is parsed @@ -90,6 +82,27 @@ OpenILS::Utils::CStoreEditor::init(); my $e = OpenILS::Utils::CStoreEditor->new; +# returns list of authority.record_entry IDs +sub find_modified_auths { + + $KU->announce('INFO', + "Searching for authority records modified since '$auth_mod_since'"); + + my $sth = $KU->prepare_statement(<<" SQL"); + SELECT id FROM authority.record_entry + WHERE DATE(edit_date) >= '$auth_mod_since'::TIMESTAMPTZ AND NOT deleted + SQL + $sth->execute; + + my @ids; + while (my $ref = $sth->fetchrow_hashref()) { + push(@ids, $ref->{id}); + } + + $KU->announce('INFO', "Found ".scalar(@ids)." authority records"); + return @ids; +} + my $query = q{ SELECT source, @@ -113,19 +126,29 @@ my $query = q{ ) x GROUP BY 1 ORDER BY source }; +$KU->connect_db; + my @bind_params; if (@records) { $query = sprintf($query, "WHERE sh2.record = ?"); @bind_params = @records; # should be just one scalar in this array. -} elsif ($options{all}) { + +} elsif ($auth_mod_since) { + + my @are_ids = find_modified_auths(); + my $id_str = join(',', @are_ids); + $query = sprintf($query, "WHERE sh2.record IN ($id_str)"); + +} elsif ($all) { $query = sprintf($query, ""); # no where clause + } elsif ($start_id and $end_id) { $query = sprintf($query, "WHERE sh2.record BETWEEN ? AND ?"); @bind_params = ($start_id, $end_id); } elsif ($input_file) { # Load authority record IDs from a file. - announce("Reading authority record IDs from $input_file"); + $KU->announce('INFO', "Reading authority record IDs from $input_file"); open FILE, "<", $input_file or die "Can't open file $input_file\n"; while() { @@ -134,44 +157,36 @@ if (@records) { } close FILE; - announce("Read ".scalar(@records)." from $input_file"); + $KU->announce('INFO', "Read ".scalar(@records)." from $input_file"); # No bind_params needed. my $recstr = join(',', @records); $query = sprintf($query, "WHERE sh2.record IN ($recstr)"); - -} else { - pod2usage(0); } -announce("SQL, params: ", Dumper($query, \@bind_params)); +my $sth = $KU->prepare_statement($query); +$sth->execute(@bind_params); -my $dsn = "dbi:Pg:database=evergreen;host=$db_host;port=$db_port"; -my $dbh = DBI->connect($dsn, $db_user, $db_pass); -$dbh->do('SET statement_timeout = 0'); +my @src_links; +while (my ($src, $links) = $sth->fetchrow_array) { + push(@src_links, [$src, $links]); +} +$KU->disconnect_db; -my $sth = $dbh->prepare($query); +my $total_records = scalar(@src_links); +$KU->announce('INFO', "Processing $total_records authority records"); -announce("Executing query ..."); -$sth->execute(@bind_params); +exit if $count_only; my $problems = 0; +for my $src_link (@src_links) { + my $src = $src_link->[0]; + my $links = $src_link->[1]; -if (@records) { - announce("Processing ".scalar(@records)." 
records"); -} elsif ($start_id) { - announce("Processing records $start_id => $end_id"); -} else { - announce("Processing all records"); -} - -my $total_records = 0; -while (my ($src, $links) = $sth->fetchrow_array) { - announce("--------------------"); - announce("Processing authority source record $src"); - $total_records++; + $KU->announce('INFO', "--------------------"); + $KU->announce('INFO', "Processing authority source record $src"); - try { + eval { my $src_rec = $e->retrieve_authority_record_entry($src); if (!$src_rec) { @@ -181,7 +196,7 @@ while (my ($src, $links) = $sth->fetchrow_array) { my $src_marc = MARC::Record->new_from_xml($src_rec->marc); my $auth_src_thesaurus = substr($src_marc->field('008')->data(), 11, 1); - announce("Source record thesaurus value=$auth_src_thesaurus"); + $KU->announce('INFO', "Source record thesaurus value=$auth_src_thesaurus"); my $changed = 0; my %tags_seen; @@ -190,7 +205,7 @@ while (my ($src, $links) = $sth->fetchrow_array) { next if $target eq $src_rec->id; - announce("Target: $target, field_id: $field_id"); + $KU->announce('INFO', "Target: $target, field_id: $field_id"); my $target_rec = $e->retrieve_authority_record_entry($target); if (!$target_rec) { @@ -203,10 +218,10 @@ while (my ($src, $links) = $sth->fetchrow_array) { my $auth_target_thesaurus = substr($target_marc->field('008')->data(), 11, 1); - announce("Target record thesaurus value=$auth_target_thesaurus"); + $KU->announce('INFO', "Target record thesaurus value=$auth_target_thesaurus"); if ($auth_src_thesaurus ne $auth_target_thesaurus) { - announce("Thesauri for source/target records do not match. Skipping.."); + $KU->announce('INFO', "Thesauri for source/target records do not match. Skipping.."); next; } @@ -222,7 +237,7 @@ while (my ($src, $links) = $sth->fetchrow_array) { # so they can be completely rebuilt. for my $field ($src_marc->field($acsaf->tag)) { if (my $val = $field->subfield('0')) { - announce("Removing existing subfield 0 : $val"); + $KU->announce('INFO', "Removing existing subfield 0 : $val"); $field->delete_subfield(code => '0'); $changed = 1; $links_removed++; @@ -239,7 +254,7 @@ while (my ($src, $links) = $sth->fetchrow_array) { $acsaf->main_entry->joiner ); - announce(sprintf( + $KU->announce('INFO', sprintf( "At field id=%s (tag=%s) / trying to match '%s'", $acsaf->id, $acsaf->tag, $src_string)); @@ -253,7 +268,7 @@ while (my ($src, $links) = $sth->fetchrow_array) { ); if ($target_string eq $src_string) { - announce("Got a match"); + $KU->announce('INFO', "Got a match"); $field->update('0' => "($CNI)$target"); $changed = 1; $links_added++; @@ -263,123 +278,37 @@ while (my ($src, $links) = $sth->fetchrow_array) { } if ($changed) { - announce("Updating authority record ".$src_rec->id); + $KU->announce('INFO', "Updating authority record ".$src_rec->id); $src_rec->marc(OpenILS::Utils::Normalize::clean_marc($src_marc)); $e->xact_begin; $e->update_authority_record_entry($src_rec) or die $e->die_event; $e->xact_commit; } - - } otherwise { - my $err = shift; - print STDERR "\nRecord # $src : ", - (ref $err eq "HASH" ? Dumper($err) : $err), "\n"; + }; + + if ($@) { + $KU->announce('WARNING', "Error processing record $src : $@"); # Reset SAX parser so that one bad record doesn't # kill the entire process. - import MARC::File::XML; $problems++; } } + if (@records) { - announce("Processed records ". scalar(@records). + $KU->announce('INFO', "Processed records ". scalar(@records). 
" records; processed=$total_records problems=$problems"); } elsif ($start_id) { - announce("Processed records $start_id => $end_id; ". + $KU->announce('INFO', "Processed records $start_id => $end_id; ". "processed=$total_records; problems=$problems"); } else { - announce("Processed all records; processed=$total_records; problems=$problems"); + $KU->announce('INFO', + "Processed all records; processed=$total_records; problems=$problems"); } -announce("links removed: $links_removed"); -announce("links added: $links_added"); -announce("delta added: ".($links_added - $links_removed)); - -exit ($problems > 0); - -__END__ - -=head1 NAME - -authority_authority_linker.pl - Link reference headings in authority records to main entry headings in other authority records - -=head1 SYNOPSIS - -authority_authority_linker.pl [B<--configuration>=I] -[[B<--record>=I[ B<--record>=I]]] | [B<--all>] | [B<--start_id>=I B<--end_id>=I] - -=head1 DESCRIPTION - -For a given set of records, find authority reference headings that also -appear as main entry headings in any other authority record. In the -specific MARC field of the authority record (source) containing the reference -heading with such a match in another authority record (target), add a subfield -0 (zero) referring to the target record by ID. - -=head1 OPTIONS - -=over - -=item * B<-c> I, B<--configuration>=I - -Specifies the OpenSRF configuration file used to connect to the OpenSRF router. -Defaults to F - -=item * B<-r> I, B<--record>=I - -Specifies the authority record ID (found in the C -column) of the B record to process. This option may be specified more -than once to process multiple records in a single run. - -=item * B<-a>, B<--all> - -Specifies that all authority records should be processed. For large -databases, this may take an extraordinarily long amount of time. - -=item * B<-s> I, B<--start_id>=I - -Specifies the starting ID of the range of authority records to process. -This option is ignored unless it is accompanied by the B<-e> or B<--end_id> -option. - -=item * B<-e> I, B<--end_id>=I - -Specifies the ending ID of the range of authority records to process. -This option is ignored unless it is accompanied by the B<-s> or B<--start> -option. - -=back - -=head1 EXAMPLES - - authority_authority_linker.pl --start_id 1 --end_id 50000 - -Processes the authority records with IDs between 1 and 50,000 using the -default OpenSRF configuration file for connection information. - -=head1 AUTHOR - -Lebbeous Fogle-Weekley - -=head1 COPYRIGHT AND LICENSE - -Copyright (C) 2013 Equinox Software, Inc. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA -02110-1301, USA. 
+$KU->announce('INFO', "links removed: $links_removed");
+$KU->announce('INFO', "links added: $links_added");
+$KU->announce('INFO', "delta added: ".($links_added - $links_removed));
 
-=cut
diff --git a/KCLS/authority-control/linking/authority_control_fields.pl b/KCLS/authority-control/linking/authority_control_fields.pl
index a560384336..7e36820b0c 100755
--- a/KCLS/authority-control/linking/authority_control_fields.pl
+++ b/KCLS/authority-control/linking/authority_control_fields.pl
@@ -12,79 +12,102 @@
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 # ---------------------------------------------------------------
-
 use strict;
 use warnings;
-use DBI;
 use Getopt::Long;
+use DateTime;
 use MARC::Record;
 use MARC::File::XML (BinaryEncoding => 'UTF-8');
 use MARC::Charset;
+use Encode;
+use Unicode::Normalize;
 use OpenSRF::System;
 use OpenILS::Utils::Fieldmapper;
 use OpenSRF::Utils::SettingsClient;
-use Encode;
-use Unicode::Normalize;
-use OpenILS::Application::AppUtils;
 use Data::Dumper;
-use Pod::Usage qw/ pod2usage /;
-use DateTime;
+use OpenILS::Application::AppUtils;
+use OpenILS::Utils::KCLSScriptUtil;
+my $KU = 'OpenILS::Utils::KCLSScriptUtil';
 
 $ENV{OSRF_LOG_CLIENT} = 1;
+$KU->syslog_ident('LINKING');
 
 $Data::Dumper::Indent = 0;
 MARC::Charset->assume_unicode(1);
 
-my $start_id;
-my $end_id;
-my $refresh;
-my $days_back; # days; 0 means today only
-my $modified_since; # days; 0 means today only
-my $input_file ='';
 my $bootstrap = '/openils/conf/opensrf_core.xml';
 my @records;
+my $input_file ='';
+my $auth_mod_since;
+my $slot_count; # starts at 1
+my $slot; # starts at 0
+my $count_only;
+my $print_ids;
 my $verbose = 0;
-my $sort_desc = 0;
-my $db_host = $ENV{PGHOST} || 'localhost';
-my $db_port = $ENV{PGPORT} || '5432';
-my $db_user = $ENV{PGDATABASE} || 'evergreen';
-my $db_pass = $ENV{PGPASSWORD};
-my $CNI = 'KCLS';
-
-my %options;
-my $result = GetOptions(
-    \%options,
-    'configuration=s' => \$bootstrap,
-    'record=i' => \@records,
-    'refresh' => \$refresh,
-    'start-id=i' => \$start_id,
-    'end-id=i' => \$end_id,
-    'days-back=i' => \$days_back,
-    'modified-since=i' => \$modified_since,
-    'sort-desc' => \$sort_desc,
-    'file=s' => \$input_file,
-    'verbose' => \$verbose,
-    "db-host=s" => \$db_host,
-    "db-user=s" => \$db_user,
-    "db-pass=s" => \$db_pass,
-    "db-port=s" => \$db_port,
-    'all', # now assumed to be true when --file is unset
-    'help'
-);
+my $CNI = 'KCLS';
+my $help;
 
-sub announce {
-    my $msg = shift;
-    my $force = shift;
-    return unless $force || $verbose;
-    print DateTime->now->strftime('%F %T') . " [$$] $msg\n";
-}
+GetOptions(
    'record=i' => \@records,
+    'slot-count=i' => \$slot_count,
+    'slot=i' => \$slot,
+    'auth-mod-since=s' => \$auth_mod_since,
+    'file=s' => \$input_file,
+    'count-only' => \$count_only,
+    'print-ids' => \$print_ids,
+    'verbose' => \$verbose,
+    'help' => \$help
+) || help();
+
+sub help {
+    print <<HELP;
+
+    --record
+        Process a single bib record by ID.
+
+    --slot-count
+        Specifies the number of buckets the bib records will be put into
+        for batch processing.  The higher the slot-count number, the
+        smaller each bucket.
+
+    --slot <0 .. (--slot-count - 1)>
+        Specifies which bucket of records to process in the current
+        iteration of the script.  Valid values are 0 up to the
+        --slot-count value minus 1.
+
+        E.g. Process the first of 10 batches:
+        --slot-count 10 --slot 0
+
+        Under the covers: WHERE MOD(id, 10) = 0
+
+        E.g. Process the 10th of 25 batches (slot numbering is zero-based):
+        --slot-count 25 --slot 9
+
+    --auth-mod-since
+        Only process bibs that may need to link to authority
+        records added or edited on or after the specified date.
+
+    --count-only
+        Log the number of records that would be processed and exit.
+
+    --print-ids
+        Print the IDs of all bib records that would be processed.
+HELP
+    exit;
+}
 
+help() if $help;
+$KU->verbose($verbose);
+
 OpenSRF::System->bootstrap_client(config_file => $bootstrap);
-Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL"));
+Fieldmapper->import(
+    IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL"));
 
 # must be loaded and initialized after the IDL is parsed
@@ -92,52 +115,158 @@ OpenILS::Utils::CStoreEditor::init();
 
 my $e = OpenILS::Utils::CStoreEditor->new;
 
-if ($input_file) {
-    open FILE, "<", $input_file or die "Can't open file " . $input_file;
-    while(<FILE>) {
-        chomp;
-        if($_) {
-            push(@records, $_);
-        }
-    }
-    close FILE;
+sub load_ids_from_file {
+    open FILE, "<", $input_file
+        or die "Can't open file " . $input_file . "\n";
 
-} else {
+    while(<FILE>) {
+        chomp;
+        push(@records, $_) if $_;
+    }
+    close FILE;
+}
 
+# returns list of authority.record_entry IDs
+sub find_modified_auths {
 
-    my $where = "WHERE not deleted";
-    $where .= " AND id >= $start_id" if $start_id;
-    $where .= " AND id <= $end_id" if $end_id;
+    $KU->announce('INFO',
+        "Searching for authority records modified since '$auth_mod_since'");
+
+    my $sth = $KU->prepare_statement(<<"    SQL");
+        SELECT id FROM authority.record_entry
+        WHERE DATE(edit_date) >= '$auth_mod_since'::TIMESTAMPTZ AND NOT deleted
+    SQL
+    $sth->execute;
+
+    my @ids;
+    while (my $ref = $sth->fetchrow_hashref()) {
+        push(@ids, $ref->{id});
+    }
 
-    my $order = "ORDER BY id";
-    $order .= " DESC" if $sort_desc;
+    $KU->announce('INFO', "Found ".scalar(@ids)." authority records");
+    return @ids;
+}
 
-    if (defined $days_back or defined $modified_since) {
-        my $date_field = defined $days_back ? 'create_date' : 'edit_date';
-        my $duration = defined $days_back ? $days_back : $modified_since;
-        $where .= " AND DATE($date_field) >= DATE(NOW() - '$duration day'::INTERVAL)";
+# ----------------------------------------------------------------------
+# Returns IDs for bib records that we might want to link to the new or
+# modified authority records.
+#
+# Query: give me bib records that link to browse entries that also
+# link to exactly one authority record, specifically the new authority
+# records we are processing via this script.  Only include bib records
+# that are not already linked via bib_linking to said authority record.
+# This represents the set of bib records that might need to be linked
+# to our new authority records.
+# ----------------------------------------------------------------------
+sub find_bibs_from_modified_auths {
+    my %bib_ids; # de-dupe by record ID.
+ + my @auth_ids = find_modified_auths(); + return () unless @auth_ids; + + my $auth_ids_param = join(',', @auth_ids); + + for my $axis (qw/author subject series title/) { + $KU->announce('INFO', + "Scanning browse axis '$axis' for linkable bibs..."); + + my $query = <prepare_statement($query); + $sth->execute; + + while (my $ref = $sth->fetchrow_hashref()) { + $bib_ids{$ref->{bib_record}} = 1; # de-dupe + } } - my $sql = - "SELECT DISTINCT(id) AS id FROM biblio.record_entry $where $order"; + return keys %bib_ids; +} + +sub load_ids_with_sql_filters { + my $where = "WHERE not deleted"; + $where .= " AND MOD(id, $slot_count) = $slot" + if $slot_count && defined $slot; - announce("Loading record ID's with query:\n$sql"); + my $sql = <<" SQL"; + SELECT DISTINCT(id) AS id FROM biblio.record_entry + $where + ORDER BY id + SQL - my $dsn = "dbi:Pg:database=evergreen;host=$db_host;port=$db_port"; - my $dbh = DBI->connect($dsn, $db_user, $db_pass); + (my $sql_str = $sql) =~ s/\n//g; + $KU->announce('INFO', "Loading record ID's with query [$sql_str]"); - my $sth = $dbh->prepare($sql); + my $sth = $KU->prepare_statement($sql); $sth->execute; while (my $ref = $sth->fetchrow_hashref()) { push(@records, $ref->{id}); } +} + +if (@records) { + $KU->announce('INFO', "Loading records from command line"); + +} elsif ($input_file) { + $KU->announce('INFO', "Loading records from file: $input_file"); + load_ids_from_file(); + +} else { + $KU->announce('INFO', "Loading records via SQL..."); + $KU->connect_db; + + if ($auth_mod_since) { + @records = find_bibs_from_modified_auths(); + + # Chop the link-able records into batches if requested. + @records = grep { ($_ % $slot_count) == $slot } @records + if $slot_count && defined $slot; - $sth->finish(); - $dbh->disconnect(); + } else { + load_ids_with_sql_filters(); + } + + $KU->disconnect_db; } -announce("Processing ".scalar(@records)." records"); +@records = sort { $a <=> $b } @records; +$KU->announce('INFO', "Processing ".scalar(@records)." records"); +$KU->announce('INFO', "@records") if $print_ids; + +exit if $count_only || $print_ids; # Hash of controlled fields & subfields in bibliographic records, and their # corresponding controlling fields & subfields in the authority record @@ -576,15 +705,6 @@ my %REMAP_BIB_SF2_TO_IND2 = ( rvm => '6' ); -my $start_time = localtime(); - -if($input_file) { - announce("Start $start_time for ".scalar(@records)." records"); -} elsif($start_id) { - announce("Start $start_time for record range: $start_id => $end_id"); -} else { - announce("Start $start_time for all records"); -} # Fetch leader/008 values for authority records. Filter out any whose # 008 14 or 15 field are not appropriate for the requested bib tag. @@ -611,7 +731,8 @@ sub authority_leaders_008_14_15 { if (substr($value, $index, 1) eq 'a') { push(@keepers, $leader); } else { - announce("Skipping authority record ".$leader->{record}. + $KU->announce('INFO', + "Skipping authority record ".$leader->{record}. 
" on bib $bib_tag match; 008/#14|#15 not appropriate"); } } @@ -628,7 +749,7 @@ sub find_matching_auth_for_thesaurus { # bib field thesaurus spec my $cfield_ind2 = $bib_field->indicator(2); - announce("6XX indicator 2 value = $cfield_ind2"); + $KU->announce('DEBUG', "6XX indicator 2 value = $cfield_ind2"); my $is_local = 0; if ($cfield_ind2 eq '7') { @@ -637,13 +758,13 @@ sub find_matching_auth_for_thesaurus { $is_local = 1; my $thesaurus = $bib_field->subfield('2') || ''; - announce("Found local thesaurus value $thesaurus"); + $KU->announce('DEBUG', "Found local thesaurus value $thesaurus"); # if we have no special remapping value for the found thesaurus, # fall back to ind2 => 7=Other. $cfield_ind2 = $REMAP_BIB_SF2_TO_IND2{$thesaurus} || '7'; - announce("Local thesaurus '$thesaurus' ". + $KU->announce('DEBUG', "Local thesaurus '$thesaurus' ". "remapped to ind2 value '$cfield_ind2'"); } @@ -659,7 +780,7 @@ sub find_matching_auth_for_thesaurus { $authz_found = $leader->{record} if $thesaurus eq 'z'; if ($AUTH_TO_BIB_IND2{$thesaurus} eq $cfield_ind2) { - announce("Found a match on thesaurus ". + $KU->announce('DEBUG', "Found a match on thesaurus ". "'$thesaurus' for " . $leader->{record}); return $leader->{record}; } @@ -711,14 +832,18 @@ sub update_record { my $count = 0; my $total = scalar(@records); -announce("processing $total bib records", 1); +$KU->announce('INFO', "processing $total bib records"); +$slot ||= 'N/A'; +my $start_time = time(); foreach my $rec_id (@records) { $count++; - announce("processing $count of $total", 1) if ($count % 1000) == 0; + $KU->announce('INFO', "processed $count of $total [slot=$slot]") + if ($count % 1000) == 0; - announce("processing bib record $rec_id [$count of $total]"); + $KU->announce('DEBUG', + "processing bib record $rec_id [$count of $total; slot=$slot]"); # State variable; was the record changed? my $changed = 0; @@ -745,13 +870,14 @@ foreach my $rec_id (@records) { if ($is_fast_heading && $sf0 =~ /\)fst/) { # fast heading looks OK. ignore it. - announce("Ignoring FAST heading field on ". + $KU->announce('DEBUG', "Ignoring FAST heading field on ". "rec=$rec_id and tag=$c_tag \$0 $sf0"); next; } - if ($sf0 && $refresh) { - announce("Removing \$0 $sf0 for rec=$rec_id and tag=$c_tag"); + if ($sf0) { + $KU->announce('DEBUG', + "Removing \$0 $sf0 for rec=$rec_id and tag=$c_tag"); $bib_field->delete_subfield(code => '0'); $changed = 1; } @@ -763,7 +889,8 @@ foreach my $rec_id (@records) { # We don't control fast headings, so there's nothing # left to do. Move on to the next field... - announce("No linking performed on FAST heading ". + $KU->announce('DEBUG', + "No linking performed on FAST heading ". "field on rec=$rec_id and tag=$c_tag"); next; } @@ -784,7 +911,8 @@ foreach my $rec_id (@records) { } next if !$match_tag; - announce("Searching for matches on controlled field $c_tag ". + $KU->announce('INFO', + "Searching for matches on controlled field $c_tag ". "(auth tag=$match_tag): \n - ".Dumper(\@searches)); my @tags = ($match_tag); @@ -801,11 +929,12 @@ foreach my $rec_id (@records) { # Protect against failed (error condition) search request if (!$validates) { - print STDERR "Search for matching authority failed; record # $rec_id\n"; - next if (!$changed); + $KU->announce('WARNING', + "Search for matching authority failed; record $rec_id"); + next unless $changed; } - announce("Match query returned @$validates"); + $KU->announce('INFO', "Match query returned @$validates"); # No matches found. Nothing left to do for this field. 
next if scalar(@$validates) == 0; @@ -860,154 +989,23 @@ foreach my $rec_id (@records) { # matching authority record to the controlled bib field. $changed = 1; $bib_field->add_subfields('0' => "($CNI)$auth_id"); - announce("auth=$auth_id cni=$CNI. It's a match!"); + $KU->announce('INFO', + "auth=$auth_id cni=$CNI. It's a match!"); } } } update_record($record, $marc) if $changed; - }; if ($@) { - print STDERR "\nRecord # $rec_id : $@\n"; - import MARC::File::XML; # reset SAX parser so that one bad record doesn't kill the entire process + $KU->announce('WARNING', "Error linking record $rec_id : $@"); + # Reset SAX parser so that one bad record doesn't kill the entire process + import MARC::File::XML; } } -my $end_time = localtime(); - -if($input_file) { - announce("Stop $end_time for ".scalar(@records)." records"); -} elsif($start_id) { - announce("Start $end_time for record range: $start_id => $end_id"); -} else { - announce("Start $end_time for all records"); -} - -__END__ - -=head1 NAME - -authority_control_fields.pl - Controls fields in bibliographic records with authorities in Evergreen - -=head1 SYNOPSIS - -C [B<--configuration>=I] [B<--refresh>] -[[B<--record>=I[ B<--record>=I]]] | [B<--all>] | [B<--start_id>=I B<--end_id>=I] - -=head1 DESCRIPTION - -For a given set of records: - -=over - -=item * Iterate through the list of fields that are controlled fields - -=item * Iterate through the list of subfields that are controlled for -that given field - -=item * Search for a matching authority record for that combination of -field + subfield(s) - -=over - -=item * If we find a match, then add a $0 subfield to that field identifying -the controlling authority record - -=item * If we do not find a match, then insert a row into an "uncontrolled" -table identifying the record ID, field, and subfield(s) that were not controlled - -=back - -=item * Iterate through the list of floating subdivisions - -=over - -=item * If we find a match, then add a $0 subfield to that field identifying -the controlling authority record - -=item * If we do not find a match, then insert a row into an "uncontrolled" -table identifying the record ID, field, and subfield(s) that were not controlled - -=back - -=item * If we changed the record, update it in the database - -=back - -=head1 OPTIONS - -=over - -=item * B<-f>, B<--file> - -Specifies a file of bibs ids to link. - -=item * B<-c> I, B<--configuration>=I - -Specifies the OpenSRF configuration file used to connect to the OpenSRF router. -Defaults to F - -=item * B<-r> I, B<--record>=I - -Specifies the bibliographic record ID (found in the C -column) of the record to process. This option may be specified more than once -to process multiple records in a single run. - -=item * B<-a>, B<--all> - -Specifies that all bibliographic records should be processed. For large -databases, this may take an extraordinarily long amount of time. - -=item * B<-r>, B<--refresh> - -Specifies that all authority links should be removed from the target -bibliographic record(s). This will effectively rewrite all authority -linking anew. - -=item * B<-s> I, B<--start_id>=I - -Specifies the starting ID of the range of bibliographic records to process. -This option is ignored unless it is accompanied by the B<-e> or B<--end_id> -option. - -=item * B<-e> I, B<--end_id>=I - -Specifies the ending ID of the range of bibliographic records to process. -This option is ignored unless it is accompanied by the B<-s> or B<--start> -option. 
- -=back - -=head1 EXAMPLES - - authority_control_fields.pl --start_id 1 --end_id 50000 - -Processes the bibliographic records with IDs between 1 and 50,000 using the -default OpenSRF configuration file for connection information. - -=head1 AUTHOR - -Dan Scott - -=head1 COPYRIGHT AND LICENSE - -Copyright 2010-2011 by Dan Scott - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - -=cut +my $end_time = time(); +my $duration = $end_time - $start_time; +$KU->announce('INFO', "Bib linking script duration $duration seconds"); diff --git a/KCLS/authority-control/linking/find-bibs-to-link.pl b/KCLS/authority-control/linking/find-bibs-to-link.pl deleted file mode 100755 index 46a20bf298..0000000000 --- a/KCLS/authority-control/linking/find-bibs-to-link.pl +++ /dev/null @@ -1,169 +0,0 @@ -#!/usr/bin/perl -# ---------------------------------------------------------------------- -# Find bib records matching the requested criteria for linking. -# Bib IDs are exported to one or more batch files for future processing. -# ---------------------------------------------------------------------- -use strict; -use warnings; -use DBI; -use Getopt::Long; -use DateTime; - -my $db_handle; -my $counter = 0; - -# options -my $help; -my $modified_since; -my $exported_since; -my $batch_size = 10000; -my $start_id; -my $end_id; -my $count_only; -my $out_dir = '/tmp'; -my $db_host = $ENV{PGHOST} || 'localhost'; -my $db_port = $ENV{PGPORT} || '5432'; -my $db_user = $ENV{PGUSER} || 'evergreen'; -my $db_name = $ENV{PGDATABASE} || 'evergreen'; -my $db_pass = $ENV{PGPASSWORD}; - -my $opt_result = GetOptions( - 'modified-since=s' => \$modified_since, - 'exported-since=s' => \$exported_since, - 'start-id=i' => \$start_id, - 'end-id=i' => \$end_id, - 'batch-size=i' => \$batch_size, - 'count-only' => \$count_only, - 'out-dir=s' => \$out_dir, - "db-host=s" => \$db_host, - "db-user=s" => \$db_user, - "db-pass=s" => \$db_pass, - "db-port=s" => \$db_port, - 'help' => \$help -); - -sub announce { - my $msg = shift; - print DateTime->now(time_zone => 'local')->strftime('%F %T')." $msg\n"; -} - -sub help { - print < - Limit bibs to those modifed since the specified date. - - --exported-since - Limit bibs to those exported since the specified date. - Export date is based on data found in the - metabib.bib_export_data table. - - --start-id - Limit bibs to those whose ID is no less than - - --end-id - Limit bibs to those whose ID is no greater than - - --out-dir [/tmp] - Output directory. - - --batch-size - Number of bib IDs to write to each batch file. - - --count-only - Print the total number of records that would be added - to batch files without adding to any batch files. - - --db-host - --db-user - --db-pass - --db-port - Database connection params. PG environment variables are - also inspected for values. 
When all else fails, try to - connect to database evergreen\@localhost -HELP - exit 0; -} - -help() if $help || !$opt_result; - -sub connect_db { - $db_handle = DBI->connect( - "dbi:Pg:db=$db_name;host=$db_host;port=$db_port;options='--statement-timeout=0'", - $db_user, $db_pass, { - RaiseError => 1, - PrintError => 0, - AutoCommit => 1, - pg_expand_array => 0, - pg_enable_utf8 => 1 - } - ) or die "Connection to database failed: $DBI::err : $DBI::errstr"; -} - -connect_db(); - -# ---------------------------------------------------------------------- -my $from = 'FROM biblio.record_entry bre'; - -my $where = 'WHERE NOT bre.deleted'; -$where .= " AND bre.id >= $start_id" if $start_id; -$where .= " AND bre.id <= $end_id" if $end_id; - -if ($exported_since) { - $where .= " AND bed.export_date > '$exported_since'"; - $from .= " JOIN metabib.bib_export_data bed ON (bed.bib = bre.id)"; -} - -my $sql = <prepare($sql); -$sth->execute; - -my $batch_file; -sub open_batch_file { - my $path = shift; - announce("Starting new batch file: $path"); - - close $batch_file if $batch_file; - - open $batch_file, '>', $path or - die "Cannot open batch file for writing: $!\n"; -} - -my $ctr = 0; -my $batch = 0; -while (my $ref = $sth->fetchrow_hashref()) { - $ctr++; - next if $count_only; - - if (( ($ctr - 1) % $batch_size) == 0) { - my $path = sprintf("$out_dir/bib-ids.%0.3d", $batch); - open_batch_file($path); - $batch++; - } - - print $batch_file $ref->{id} . "\n"; -} - -close $batch_file if $batch_file; -$sth->finish; - -announce("Found $ctr bib records"); - diff --git a/KCLS/authority-control/linking/link-all-bibs-daily.sh b/KCLS/authority-control/linking/link-all-bibs-daily.sh new file mode 100755 index 0000000000..73b2eabbaa --- /dev/null +++ b/KCLS/authority-control/linking/link-all-bibs-daily.sh @@ -0,0 +1,37 @@ +#!/bin/bash +set -u +NUM_PROCS=3 +SLOTS_PER_PROC=10 + +DOW=$(date +%u); +[ $DOW == 7 ] && DOW=0; # set Sunday = 0 + +let SLOTS_PER_SCRIPT=$NUM_PROCS*$SLOTS_PER_PROC; +let TOTAL_SLOTS=$SLOTS_PER_SCRIPT*7; # run daily +let PROC_START_SLOT=$SLOTS_PER_SCRIPT*$DOW; + +echo "Starting at slot $PROC_START_SLOT"; + +function execute_background_proc { + START_SLOT=$1 + let MAX_SLOT=$SLOTS_PER_PROC-1; + + for SLOT_OFFSET in $(seq 0 $MAX_SLOT); do + let SLOT=$START_SLOT+$SLOT_OFFSET + echo "Processing slot $SLOT of $TOTAL_SLOTS" + perl ./authority_control_fields.pl \ + --slot-count $TOTAL_SLOTS --slot $SLOT + done; +} + +for PROC in $(seq 1 $NUM_PROCS); do + (execute_background_proc $PROC_START_SLOT) & + let PROC_START_SLOT=$PROC_START_SLOT+$SLOTS_PER_PROC; +done + +# Wait for all child processes to complete. This makes it easier +# to track and/or kill all of the child processes if needed. 
+wait; + +echo "Done processing all batches" + diff --git a/KCLS/authority-control/linking/link-bib-batches.sh b/KCLS/authority-control/linking/link-bib-batches.sh deleted file mode 100755 index c7494fefd6..0000000000 --- a/KCLS/authority-control/linking/link-bib-batches.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -set -eu -PROC_COUNT=3 -BATCHES_PER_PROC=9 -BATCH_DIR="/openils/var/data/authority-control/link-bibs" - -DOW=$(date +%u); -[ $DOW == 7 ] && DOW=0; # make Sunday = 0 - -BATCH_SIZE=$((($PROC_COUNT * $BATCHES_PER_PROC))); -BATCH=$((($BATCH_SIZE * $DOW))); - -echo "Starting at batch number $BATCH"; - -function execute_background_proc { - START=$1 - for batch in $(seq 1 $BATCHES_PER_PROC); do - FILE=$(printf "$BATCH_DIR/bib-ids.%0.3d" $((($START + $batch)))); - if [ -f $FILE ]; then - echo "Linking bib batch file $FILE" - else - echo "Skipping nonexistent bib link batch file $FILE" - continue - fi - perl ./authority_control_fields.pl --refresh --file $FILE - done; -} - -for PROC in $(seq 1 $PROC_COUNT); do - (execute_background_proc $BATCH) & - BATCH=$((($BATCH + BATCHES_PER_PROC))); -done - -wait; - -echo "Done processing all batches" - diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Utils/KCLSScriptUtil.pm b/Open-ILS/src/perlmods/lib/OpenILS/Utils/KCLSScriptUtil.pm new file mode 100644 index 0000000000..db47f9ab32 --- /dev/null +++ b/Open-ILS/src/perlmods/lib/OpenILS/Utils/KCLSScriptUtil.pm @@ -0,0 +1,164 @@ +package OpenILS::Utils::KCLSScriptUtil; +# --------------------------------------------------------------------- +# Script utility functions. +# +# 1. Assumes a single DB connection per Perl process. +# +# 2. Keep imports at a minimum so utility scripts can be run from +# non-Evergreen servers (e.g. beefy DB servers). +# --------------------------------------------------------------------- +use strict; +use warnings; +use DBI; +use DateTime; +use Sys::Syslog qw(syslog openlog); +my $KU = 'OpenILS::Utils::KCLSScriptUtil'; + +# --------------------------------------------------------------------- +# LOGGING +# --------------------------------------------------------------------- +our $syslog_facility = 'LOCAL6'; # matches Evergreen gateway +our $syslog_ops = 'pid'; +our $syslog_ident = 'KCLSScriptUtil'; +my $syslog_opened = 0; +my $verbose = 0; + +sub syslog_facility { + my $cls = shift; + my $val = shift; + $syslog_facility = $val if defined $val; + return $syslog_facility; +} + +sub syslog_ops { + my $cls = shift; + my $val = shift; + $syslog_ops = $val if defined $val; + return $syslog_ops; +} + +sub syslog_ident { + my $cls = shift; + my $val = shift; + $syslog_ident = $val if defined $val; + return $syslog_ident; +} + +sub syslog_opened { + my $cls = shift; + my $val = shift; + $syslog_opened = $val if defined $val; + return $syslog_opened; +} + +sub verbose { + my $cls = shift; + my $val = shift; + $verbose = $val if defined $val; + return $verbose; +} + + + +# Send messages to syslog +# Send INFO, DEBUG messages to STDOUT if $verbose is true +# Send ERR, WARNING messages to STDERR +# Finishes with die() if $die is true +sub announce { + my $cls = shift; # class ref; unused + my $lvl = shift; # syslog level (minus the LOG_ part) + my $msg = shift; # message string to log + my $die = shift; # true if this should cause the script to die + + if (!$syslog_opened) { + openlog($syslog_ident, $syslog_ops, $syslog_facility); + $syslog_opened = 1; + } + + syslog("LOG_$lvl", $msg); + + my $date_str = DateTime->now(time_zone => 'local')->strftime('%F %T'); + my $msg_str = "$date_str 
[$$] $lvl $msg\n"; + + if ($die) { + die $msg_str; + return; + } + + if ($lvl eq 'ERR' or $lvl eq 'WARNING') { + # always copy problem messages to stdout + warn $msg_str; # avoid dupe messages + } else { + print $msg_str if $verbose; + } +} + +# --------------------------------------------------------------------- +# DATABASE +# --------------------------------------------------------------------- +our $db_user = $ENV{PGUSER} || 'evergreen'; +our $db_name = $ENV{PGDATABASE} || 'evergreen'; +our $db_host = $ENV{PGHOST} || 'localhost'; +our $db_port = $ENV{PGPORT} || '5432'; +our $db_pass = $ENV{PGPASSWORD}; +our $db_handle; +our $db_handle_ops = 0; +our @db_statements; + +sub connect_db { + my $dsn = "dbi:Pg:db=$db_name;host=$db_host;port=$db_port"; + $KU->announce('DEBUG', "Connecting to DB $dsn"); + + $db_handle = DBI->connect( + "$dsn;options='--statement-timeout=0'", + $db_user, $db_pass, { + RaiseError => 1, + PrintError => 0, + AutoCommit => 1, + pg_expand_array => 0, + pg_enable_utf8 => 1, + Callbacks => { + ChildCallbacks => { + # Track the number of execute() operations + execute => sub { $db_handle_ops++; return; } + } + } + } + ) or $KU->announce('ERR', + "Connection to database failed: $DBI::err : $DBI::errstr", 1); + + return $db_handle; +} + +sub db_handle { + return $db_handle; +} + +sub db_handle_ops { + return $db_handle_ops; +} + +sub disconnect_db { + return unless $db_handle; + $KU->announce('DEBUG', 'Disconnecting DB handle and cleaning up statements'); + $_->finish for @db_statements; + $db_handle->disconnect; + @db_statements = (); +} + +sub reset_db_handle { + $KU->announce('DEBUG', 'Resetting DB connection') if $db_handle; + $db_handle_ops = 0; + disconnect_db(); + connect_db(); +} + +sub prepare_statement { + my ($cls, $sql) = @_; + my $sth = $db_handle->prepare($sql); + push(@db_statements, $sth); + return $sth; +} + + +1; -- 2.11.0
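Usage sketch for the new OpenILS::Utils::KCLSScriptUtil module, assembled from the calls the migrated scripts above make (syslog_ident, verbose, connect_db, prepare_statement, announce, disconnect_db). The script body, syslog ident, and query below are illustrative assumptions only; they are not part of the patch.

#!/usr/bin/perl
# Hypothetical example of a KCLS maintenance script driving
# OpenILS::Utils::KCLSScriptUtil; not part of this patch.
use strict;
use warnings;
use OpenILS::Utils::KCLSScriptUtil;

my $KU = 'OpenILS::Utils::KCLSScriptUtil';

$KU->syslog_ident('EXAMPLE'); # assumed ident; each real script sets its own
$KU->verbose(1);              # also echo INFO/DEBUG messages to STDOUT

# Connects using the PGUSER/PGDATABASE/PGHOST/PGPORT/PGPASSWORD
# environment variables (defaults: evergreen @ localhost:5432).
$KU->connect_db;

# Statements prepared through the module are finish()ed automatically
# by disconnect_db().
my $sth = $KU->prepare_statement(
    'SELECT COUNT(*) FROM biblio.record_entry WHERE NOT deleted');
$sth->execute;
my ($bib_count) = $sth->fetchrow_array;

$KU->announce('INFO', "active bib count = $bib_count");
# A true third argument makes announce() die after logging:
# $KU->announce('ERR', "something fatal happened", 1);

$KU->disconnect_db;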