From 50d573cf2ee4f98c5da8036368a1162245c9c53b Mon Sep 17 00:00:00 2001 From: Bill Erickson Date: Mon, 20 Jun 2016 15:33:56 -0400 Subject: [PATCH] JBAS-1437 Backstage record export/import scripts * export-bibs.pl : export bib records to send to Backstage * process-backstage-files.pl : process bib/auth records returned from BS * README.adoc : howto docs Signed-off-by: Bill Erickson --- KCLS/backstage/README.adoc | 23 ++ KCLS/backstage/export-bibs.pl | 256 +++++++++++++++++ KCLS/backstage/process-backstage-files.pl | 451 ++++++++++++++++++++++++++++++ 3 files changed, 730 insertions(+) create mode 100644 KCLS/backstage/README.adoc create mode 100755 KCLS/backstage/export-bibs.pl create mode 100755 KCLS/backstage/process-backstage-files.pl diff --git a/KCLS/backstage/README.adoc b/KCLS/backstage/README.adoc new file mode 100644 index 0000000000..360f086dd3 --- /dev/null +++ b/KCLS/backstage/README.adoc @@ -0,0 +1,23 @@ += Backstage Process = + +== Setup == + +[source,sh] +-------------------------------------------------------------------- +mkdir -p /openils/var/data/backstage +-------------------------------------------------------------------- + +== Exporting Bib Records == + +Bibs are exported as MARC and uploaded to Backstage + +=== Generate Export MARC File === + +[source,sh] +-------------------------------------------------------------------- +./export-bibs.pl \ + --cat-date-start 2010-01-01 \ + --cat-date-end 2016-06-01 \ + > /openils/var/data/backstage/bib-export-2016-06-01 +-------------------------------------------------------------------- + diff --git a/KCLS/backstage/export-bibs.pl b/KCLS/backstage/export-bibs.pl new file mode 100755 index 0000000000..839dc1c339 --- /dev/null +++ b/KCLS/backstage/export-bibs.pl @@ -0,0 +1,256 @@ +#!/usr/bin/env perl +# ----------------------------------------------------------------------- +# Export bib records for Backstage processing. +# +# The UTF-8 encoded USMARC string for each record is printed to STDOUT. +# Each exported bib has its export_date value updated to NOW(). +# +# Exported bibs meet the following criteria: +# +# 1. Delete flag must be false. +# 2. Record cannot contain any 086, 092, or 099 tags containing the phrase 'on order' +# 3. Boolean filter: +# [ (001_test OR 035_test) AND has_holdings AND cat_date_in_range ] +# OR +# [ 998_test AND create_date_in_range ] +# ----------------------------------------------------------------------- +use strict; +use warnings; +use DBI; +use Getopt::Long; +use MARC::Record; +use MARC::File::XML (BinaryEncoding => 'UTF-8'); + +my $db_handle; + +my $start_date; +my $end_date; +my $export_date; +my $ids_only; +my $count_only; +my $out_file; +my $limit; +my $db_user = 'evergreen'; +my $db_host = 'localhost'; +my $db_name = 'evergreen'; +my $db_port = 5432; +my $db_pass; +my $help; + +GetOptions( + 'start-date=s' => \$start_date, + 'end-date=s' => \$end_date, + 'export-date=s' => \$export_date, + 'ids-only' => \$ids_only, + 'count-only' => \$count_only, + 'out-file=s' => \$out_file, + 'limit=f' => \$limit, + 'db-user=s' => \$db_user, + 'db-host=s' => \$db_host, + 'db-name=s' => \$db_name, + 'db-port=i' => \$db_port, + 'db-pass=s' => \$db_pass, + 'help' => \$help +); + +sub help { + print < + --end-date + Export bib records whose cataloging_date (for physical records) or + create_date (for electronic records) value is between the provided + start and end dates. + + --export-date + Sets the export date to the provided value. If no --export-date + value is set, no export date value will be applied in the database. + + --out-file + Write MARC records (or IDs) to this file. + + --ids-only + Write bib record IDs to the output file instead of the full MARC + record. + + --count-only + Only print the number of bibs that would be exported to STDOUT. + + --limit + Export at most this many records. + +HELP + exit; +} + +die "--start-date and --end-date required\n" + unless $start_date && $end_date; + +die "Invalid date format\n" unless + $start_date =~ /^\d{4}-\d{2}-\d{2}$/ && + $end_date =~ /^\d{4}-\d{2}-\d{2}$/ && + (!$export_date || $export_date =~ /^\d{4}-\d{2}-\d{2}$/); + +die "--out-file required\n" unless $out_file || $count_only; + +sub bib_query { + my $sql = <$out_file") + or die "Cannot open file for writing: $out_file\n"; + binmode(MARCFILE, ':utf8'); + } + + my $sth = $db_handle->prepare(bib_query()); + my $edate_sth = $db_handle->prepare( + 'SELECT * FROM metabib.set_export_date(?, ?)'); + + $sth->execute; + my $count = 0; + while (my $bib = $sth->fetchrow_hashref) { + $count++; + next if $count_only; + + my $bib_id = $bib->{id}; + + if ($ids_only) { + print MARCFILE "$bib_id\n"; + print "$count records written...\n" if ($count % 1000) == 0; + next; + } + + my $rec = $db_handle->selectall_arrayref( + "SELECT marc FROM biblio.record_entry WHERE id = $bib_id"); + + my $marc = $rec->[0]->[0]; + my $marcdoc = MARC::Record->new_from_xml($marc, 'UTF-8', 'USMARC'); + + print MARCFILE $marcdoc->as_usmarc; + + print "$count records written...\n" if ($count % 1000) == 0; + + next unless $export_date; + + # Update the bib record's metabib.bib_export_data entry. + eval { $edate_sth->execute($bib_id, $export_date) }; + die "Error setting export date for bib ". + "$bib_id to $export_date : $@\n" if $@; + } + + close(MARCFILE) if $out_file; + + print "$count total bib records\n"; + + $sth->finish; + $edate_sth->finish; +} + +sub connect_db { + $db_handle = DBI->connect( + "dbi:Pg:db=$db_name;host=$db_host;port=$db_port;options='--statement-timeout=0'", + $db_user, $db_pass, { + RaiseError => 1, + PrintError => 0, + AutoCommit => 1, + pg_expand_array => 0, + pg_enable_utf8 => 1 + } + ) or die "Connection to database failed: $DBI::err : $DBI::errstr"; +} + +connect_db(); +export_marc(); + +$db_handle->disconnect; + diff --git a/KCLS/backstage/process-backstage-files.pl b/KCLS/backstage/process-backstage-files.pl new file mode 100755 index 0000000000..7fb0aee709 --- /dev/null +++ b/KCLS/backstage/process-backstage-files.pl @@ -0,0 +1,451 @@ +#!/usr/bin/env perl +# ----------------------------------------------------------------------- +# TODO: summary +# +# TODO: +# Disable auth record change propagation during auth record updates. +# ----------------------------------------------------------------------- +use strict; +use warnings; +use DBI; +use DateTime; +use Getopt::Long; +use MARC::Record; +use MARC::File::XML (BinaryEncoding => 'UTF-8'); +use MARC::File::USMARC; +use Archive::Zip qw(:ERROR_CODES :CONSTANTS); +use File::Basename; +use Sys::Syslog qw(syslog openlog); +use OpenILS::Utils::Normalize qw(clean_marc); +binmode(STDOUT, ':utf8'); + +my $db_handle; +my $log_mod = 500; # log every 500th of each type of event (see verbose) + +my $file; +my $export_date; +my $working_dir = '.', +my $bib_collision_file; +my $verbose; +my $db_user = 'evergreen'; +my $db_host = 'localhost'; +my $db_name = 'evergreen'; +my $db_port = 5432; +my $db_pass; +my $syslog_facility = 'LOCAL6'; # matches Evergreen gateway +my $syslog_ops = 'pid'; +my $syslog_ident = 'BACKSTAGE'; + +my $new_auth_sth; +my $mod_auth_sth; +my $del_auth_sth; +my $delmod_auth_sth; +my $mod_bibs_sth; +my $match_auth_sth; +my $match_auth_001_sth; +my $new_auth_ctr = 0; +my $mod_auth_ctr = 0; +my $del_auth_ctr = 0; +my $mod_bibs_ctr = 0; +my $col_bibs_ctr = 0; + +my $help; + +GetOptions( + 'file=s' => \$file, + 'export-date=s' => \$export_date, + 'working-dir=s' => \$working_dir, + 'bib-collision-file=s' => \$bib_collision_file, + 'verbose' => \$verbose, + 'db-user=s' => \$db_user, + 'db-host=s' => \$db_host, + 'db-name=s' => \$db_name, + 'db-port=i' => \$db_port, + 'db-pass=s' => \$db_pass, + 'help' => \$help +); + +sub help { + print <now(time_zone => 'local')->strftime('%F %T'); + my $msg_str = "$date_str [$$] $level $msg\n"; + + if ($die) { + die $msg_str; + + } else { + if ($level eq 'ERR' or $level eq 'WARNING') { + # always copy problem messages to stdout + warn $msg_str; # avoid dupe messages + } else { + print $msg_str; + } + } +} + + +sub connect_db { + $db_handle = DBI->connect( + "dbi:Pg:db=$db_name;host=$db_host;port=$db_port;options='--statement-timeout=0'", + $db_user, $db_pass, { + RaiseError => 1, + PrintError => 0, + AutoCommit => 1, + pg_expand_array => 0, + pg_enable_utf8 => 1 + } + ) or die "Connection to database failed: $DBI::err : $DBI::errstr"; +} + +sub process_zip_file { + + my $zip = Archive::Zip->new(); + + announce('ERR', "Failed to read $file", 1) + unless $zip->read($file) == AZ_OK; + + # Avoid processing XLS and HTM files. + # All of the MARC files end in .UTF8. + for my $member ($zip->membersMatching('.*\.UTF8')) { + + my $basename = basename($member->fileName()); + + announce('INFO', "Extracting file $basename"); + + my $local_file = "$working_dir/$basename"; + + announce('ERR', "Unable to extract to file: $local_file", 1) + unless $member->extractToFileNamed($local_file) == AZ_OK; + + my $marc_batch = MARC::File::USMARC->in($local_file, 'UTF8') + or announce('ERR', "Unable to read $local_file as MARC", 1); + + if ($basename =~ /BIB/) { + + handle_modified_bibs($marc_batch); + + } elsif ($basename =~ /DEL/) { + + handle_deleted_auths($marc_batch); + + } elsif ($basename =~ /CHG|NEW/) { + + handle_modified_auths($marc_batch); + + } else { + announce('WARNING', "Un-handled file type: $basename"); + } + } +} + +# Returns ID's of bib records that have been modified since the export date. +my @modified_bibs; +sub find_modified_bibs { + + my $id_arrays = $db_handle->selectall_arrayref(<<" SQL"); + SELECT id + FROM biblio.record_entry + WHERE NOT deleted AND edit_date >= '$export_date' + SQL + + @modified_bibs = map {$_->[0]} @$id_arrays; + + announce('INFO', scalar(@modified_bibs)." bibs modified since export"); +} + + + +# 1. Bibs that have been modified by Backstage and locally are written +# to the --bib-collision-file as MARC for later processing. +# 2. Bibs that have only been modified by Backstage are updated +# directly in the database. +sub handle_modified_bibs { + my $marc_batch = shift; + + find_modified_bibs() unless @modified_bibs; + + while (my $record = $marc_batch->next()) { + my $bib_id = $record->subfield('901', 'c'); + + if (!$bib_id) { + announce('ERR', "Bib record has no 901c (ID) value. Skipping"); + next; + } + + if (grep {$bib_id eq $_} @modified_bibs) { + # Bib was edited by both parties. Save to external file + # for later processing. + + write_bib_collision($record); + + } else { + # Update our copy of the record. + + my $marcxml = clean_marc($record->as_xml_record()); + update_bib($marcxml, $bib_id); + } + } +} + +sub update_bib { + my $marcxml = shift; + my $bib_id = shift; + + eval { $mod_bibs_sth->execute($marcxml, $bib_id) }; + + if ($@) { + announce('ERR', "Error updating biblio record: $@ : $marcxml"); + return; + } + + $mod_bibs_ctr++; + + announce('INFO', "Updated $mod_bibs_ctr bib records") + if $mod_bibs_ctr % $log_mod == 0; +} + +sub write_bib_collision { + my $record = shift; + + my $filename = "$working_dir/$bib_collision_file"; + + open(BIBS_FILE, ">>$filename") or + announce('ERR', "Cannot open bib collision file: $filename : $!", 1); + + binmode(BIBS_FILE, ":utf8"); + + print BIBS_FILE $record->as_usmarc(); + + close BIBS_FILE or + announce('WARNING', "Error closing bib collision file: $filename : $!"); + + $col_bibs_ctr++; + + announce('INFO', "Dumped $col_bibs_ctr bib collisions to file") + if $col_bibs_ctr % $log_mod == 0; +} + +sub handle_deleted_auths { + my $marc_batch = shift; + + while (my $record = $marc_batch->next()) { + my @matches = find_matching_auths($record); + + for my $auth_id (@matches) { + + eval { + # 2 mods.. wrap in transaction? (see autocommit) + $del_auth_sth->execute($auth_id); + $delmod_auth_sth->execute($auth_id); + }; + + if ($@) { + announce( + 'ERR', "Error deleting authority record: $@ : $auth_id"); + next; + } + + $del_auth_ctr++; + + announce('INFO', "Deleted $del_auth_ctr authority records") + if $del_auth_ctr % $log_mod == 0; + } + } +} + +sub handle_modified_auths { + my $marc_batch = shift; + + while (my $record = $marc_batch->next()) { + + my @matches = find_matching_auths($record); + my $marcxml = clean_marc($record->as_xml_record()); + + if (@matches) { + update_auth($marcxml, $_) for @matches; + } else { + insert_auth($marcxml); + } + } +} + + +sub update_auth { + my $marcxml = shift; + my $auth_id = shift; + + eval { $mod_auth_sth->execute($marcxml, $auth_id) }; + + if ($@) { + announce('ERR', "Error updating authority record: $@ : $marcxml"); + return; + } + + $mod_auth_ctr++; + + announce('INFO', "Updated $mod_auth_ctr authority records") + if $mod_auth_ctr % $log_mod == 0; +} + +sub insert_auth { + my $marcxml = shift; + + eval { $new_auth_sth->execute($marcxml, "IMPORT-" . time) }; + + if ($@) { + announce('ERR', + "Error creating new authority record: $@ : $marcxml"); + return; + } + + $new_auth_ctr++; + + announce('INFO', "Created $new_auth_ctr authority records") + if $new_auth_ctr % $log_mod == 0; +} + +# Return ID's of matching authority records. Matching tries: +# 001 -> 010a -> 035a. +sub find_matching_auths { + my $record = shift; + + my $tag = '001'; + my $subfield; + + # 001 test requires its own SQL query + if (my $field = $record->field($tag)) { + if ($subfield = $field->data) { + + $match_auth_001_sth->execute($subfield); + my $matches = $match_auth_001_sth->fetchall_arrayref; + + if (@$matches) { # DEBUGGING... + my @ids = map {$_->[0]} @$matches; + announce('INFO', "Auth 001=$subfield matched records: @ids"); + } + return map {$_->[0]} @$matches if @$matches; + } + } + + $tag = '010'; + $subfield = $record->subfield($tag, 'a'); + + if (!$subfield) { + $tag = '035'; + $subfield = $record->subfield($tag, 'a'); + } + + return () unless $subfield; + + $match_auth_sth->execute($tag, $subfield); + my $matches = $match_auth_sth->fetchall_arrayref; + + return map {$_->[0]} @$matches; +} + +sub prepare_statements { + + $del_auth_sth = $db_handle->prepare(<<" SQL"); + DELETE FROM authority.record_entry WHERE id = ? + SQL + + $delmod_auth_sth = $db_handle->prepare(<<" SQL"); + UPDATE authority.record_entry + SET edit_date = NOW() WHERE id = ? + SQL + + $mod_bibs_sth = $db_handle->prepare(<<" SQL"); + UPDATE biblio.record_entry + SET marc = ?, edit_date = NOW() + WHERE id = ? + SQL + + $mod_auth_sth = $db_handle->prepare(<<" SQL"); + UPDATE authority.record_entry + SET marc = ?, edit_date = NOW() + WHERE id = ? + SQL + + $new_auth_sth = $db_handle->prepare(<<" SQL"); + INSERT INTO authority.record_entry (marc, last_xact_id) + VALUES (?, ?) + SQL + + $match_auth_sth = $db_handle->prepare(<<" SQL"); + SELECT DISTINCT(record) + FROM authority.full_rec + WHERE + tag = ? + AND subfield = 'a' + AND value = NACO_NORMALIZE(?, 'a') + SQL + + $match_auth_001_sth = $db_handle->prepare(<<" SQL"); + SELECT DISTINCT(record) + FROM authority.full_rec + WHERE tag = '001' AND value = ? + SQL +} + +openlog($syslog_ident, $syslog_ops, $syslog_facility); +connect_db(); +prepare_statements(); +process_zip_file(); + +$new_auth_sth->finish; +$mod_auth_sth->finish; +$del_auth_sth->finish; +$delmod_auth_sth->finish; +$match_auth_sth->finish; +$match_auth_001_sth->finish; +$mod_bibs_sth->finish; + +$db_handle->disconnect; + -- 2.11.0