From: Bill Erickson
Date: Fri, 2 Dec 2016 19:40:21 +0000 (-0500)
Subject: JBAS-1437 Move authority/BS scripts to authority-control
X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=6a2e85da640aabcd9e054c90e6ed0fe4abaa8655;p=working%2FEvergreen.git

JBAS-1437 Move authority/BS scripts to authority-control

Signed-off-by: Bill Erickson
---

diff --git a/KCLS/authority-control/backstage/README.adoc b/KCLS/authority-control/backstage/README.adoc
new file mode 100644
index 0000000000..0f834eaada
--- /dev/null
+++ b/KCLS/authority-control/backstage/README.adoc
@@ -0,0 +1,127 @@
= Backstage Processes =

Perform steps as 'opensrf'

== Quarterly Export + Import ==

=== Setup ===

[source,sh]
--------------------------------------------------------------------
export EXPORT_DATE=2016-10-01 # for example
export WORKING_DIR=/openils/var/data/authority-control/backstage/quarterly/$EXPORT_DATE
export PGHOST=foo
export PGPASSWORD=foo
export PGUSER=evergreen
mkdir -p $WORKING_DIR
--------------------------------------------------------------------

=== Exporting Bib Records ===

Bibs are exported as MARC and uploaded to Backstage

==== Generate Export MARC File ====

[source,sh]
--------------------------------------------------------------------
./export-bibs.pl \
    --start-date 2010-01-01 \
    --end-date 2016-06-01 \
    --export-date $EXPORT_DATE \
    --out-file $WORKING_DIR/exported-bibs.$EXPORT_DATE.mrc

# Send file to BS FTP server in(bound) directory.
--------------------------------------------------------------------

=== Process Results ===

==== Fetch Results ====

[source,sh]
--------------------------------------------------------------------
cd $WORKING_DIR
# Fetch the results ZIP using the download URL supplied by Backstage.
wget <backstage-results-file-URL>
--------------------------------------------------------------------

==== Process Results Files ====

* Import new and modified authority records.
* Import modified bib records.

[source,sh]
--------------------------------------------------------------------
cd /home/opensrf/Evergreen/KCLS/authority-control/backstage/

./process-backstage-files.pl \
    --verbose \
    --export-date $EXPORT_DATE \
    --file $WORKING_DIR/<backstage-results-zip-file> \
    --working-dir $WORKING_DIR \
    --bib-collision-file bib-collisions.mrc \
    > $WORKING_DIR/process.log
--------------------------------------------------------------------

==== Process Bib Collisions ====

Bib records that were locally modified during Backstage processing are
re-imported without clobbering the modifications.

1. Create a new queue for this batch (rename to suit).

[source,sql]
--------------------------------------------------------------------
INSERT INTO vandelay.bib_queue (owner, name)
    VALUES (1, 'Backstage Q3 2016');
SELECT id FROM vandelay.bib_queue WHERE name = 'Backstage Q3 2016';
--------------------------------------------------------------------

2. Import bib collisions via stream importer.

[source,sh]
--------------------------------------------------------------------
# Make a copy of the collisions file for safe keeping,
# since open-ils.vandelay deletes spool files.
cp $WORKING_DIR/bib-collisions.mrc $WORKING_DIR/bib-collisions.bak.mrc

cd /openils/bin
./marc_stream_importer.pl \
    --spoolfile $WORKING_DIR/bib-collisions.mrc \
    --user admin \
    --password XXX \
    --bib-auto-overlay-exact \
    --queue <queue-id> \
    --merge-profile 104 # "Backstage Field Protection"
--------------------------------------------------------------------

==== Get auth/bib IDs that require re-linking ====

[source,sh]
--------------------------------------------------------------------
cd /home/opensrf/Evergreen/KCLS/authority-control/linking/

./link-new-auth-records.pl --modified-since <days> \
    --progress --print-auth-ids \
    > $WORKING_DIR/auths-to-link.txt

./link-new-auth-records.pl --modified-since <days> \
    --progress --print-bib-ids \
    > $WORKING_DIR/bibs-to-link.txt
--------------------------------------------------------------------

==== Re-Link Modified Auths and Bibs ====

[source,sh]
--------------------------------------------------------------------
cd /home/opensrf/Evergreen/KCLS/authority-control/linking/

./authority_authority_linker.pl --verbose \
    --file $WORKING_DIR/auths-to-link.txt \
    | tee -a $WORKING_DIR/auth2auth-linking.log

# Bib linking takes many hours, sometimes days.

./authority_control_fields.pl --verbose --refresh \
    --file $WORKING_DIR/bibs-to-link.txt \
    | tee -a $WORKING_DIR/bib-linking.log
--------------------------------------------------------------------

diff --git a/KCLS/authority-control/backstage/export-bibs.pl b/KCLS/authority-control/backstage/export-bibs.pl
new file mode 100755
index 0000000000..3858a31d6f
--- /dev/null
+++ b/KCLS/authority-control/backstage/export-bibs.pl
@@ -0,0 +1,256 @@
#!/usr/bin/env perl
# -----------------------------------------------------------------------
# Export bib records for Backstage processing.
#
# The UTF-8 encoded USMARC string for each record is printed to STDOUT.
# Each exported bib has its export_date value updated to NOW().
+# +# Exported bibs meet the following criteria: +# +# 1. Delete flag must be false. +# 2. Record cannot contain any 086, 092, or 099 tags containing the phrase 'on order' +# 3. Boolean filter: +# [ (001_test OR 035_test) AND has_holdings AND cat_date_in_range ] +# OR +# [ 998_test AND create_date_in_range ] +# ----------------------------------------------------------------------- +use strict; +use warnings; +use DBI; +use Getopt::Long; +use MARC::Record; +use MARC::File::XML (BinaryEncoding => 'UTF-8'); + +my $db_handle; + +my $start_date; +my $end_date; +my $export_date; +my $ids_only; +my $count_only; +my $out_file; +my $limit; +my $db_user = $ENV{PGUSER} || 'evergreen'; +my $db_name = $ENV{PGDATABASE} || 'evergreen'; +my $db_host = $ENV{PGHOST} || 'localhost'; +my $db_port = $ENV{PGPORT} || '5432'; +my $db_pass = $ENV{PGPASSWORD}; +my $help; + +GetOptions( + 'start-date=s' => \$start_date, + 'end-date=s' => \$end_date, + 'export-date=s' => \$export_date, + 'ids-only' => \$ids_only, + 'count-only' => \$count_only, + 'out-file=s' => \$out_file, + 'limit=f' => \$limit, + 'db-user=s' => \$db_user, + 'db-host=s' => \$db_host, + 'db-name=s' => \$db_name, + 'db-port=i' => \$db_port, + 'db-pass=s' => \$db_pass, + 'help' => \$help +); + +sub help { + print < + --end-date + Export bib records whose cataloging_date (for physical records) or + create_date (for electronic records) value is between the provided + start and end dates. + + --export-date + Sets the export date to the provided value. If no --export-date + value is set, no export date value will be applied in the database. + + --out-file + Write MARC records (or IDs) to this file. + + --ids-only + Write bib record IDs to the output file instead of the full MARC + record. + + --count-only + Only print the number of bibs that would be exported to STDOUT. + + --limit + Export at most this many records. 
+ +HELP + exit; +} + +die "--start-date and --end-date required\n" + unless $start_date && $end_date; + +die "Invalid date format\n" unless + $start_date =~ /^\d{4}-\d{2}-\d{2}$/ && + $end_date =~ /^\d{4}-\d{2}-\d{2}$/ && + (!$export_date || $export_date =~ /^\d{4}-\d{2}-\d{2}$/); + +die "--out-file required\n" unless $out_file || $count_only; + +sub bib_query { + my $sql = <$out_file") + or die "Cannot open file for writing: $out_file\n"; + binmode(MARCFILE, ':utf8'); + } + + my $sth = $db_handle->prepare(bib_query()); + my $edate_sth = $db_handle->prepare( + 'SELECT * FROM metabib.set_export_date(?, ?)'); + + $sth->execute; + my $count = 0; + while (my $bib = $sth->fetchrow_hashref) { + $count++; + next if $count_only; + + my $bib_id = $bib->{id}; + + if ($ids_only) { + print MARCFILE "$bib_id\n"; + print "$count records written...\n" if ($count % 1000) == 0; + next; + } + + my $rec = $db_handle->selectall_arrayref( + "SELECT marc FROM biblio.record_entry WHERE id = $bib_id"); + + my $marc = $rec->[0]->[0]; + my $marcdoc = MARC::Record->new_from_xml($marc, 'UTF-8', 'USMARC'); + + print MARCFILE $marcdoc->as_usmarc; + + print "$count records written...\n" if ($count % 1000) == 0; + + next unless $export_date; + + # Update the bib record's metabib.bib_export_data entry. + eval { $edate_sth->execute($bib_id, $export_date) }; + die "Error setting export date for bib ". 
+ "$bib_id to $export_date : $@\n" if $@; + } + + close(MARCFILE) if $out_file; + + print "$count total bib records\n"; + + $sth->finish; + $edate_sth->finish; +} + +sub connect_db { + $db_handle = DBI->connect( + "dbi:Pg:db=$db_name;host=$db_host;port=$db_port;options='--statement-timeout=0'", + $db_user, $db_pass, { + RaiseError => 1, + PrintError => 0, + AutoCommit => 1, + pg_expand_array => 0, + pg_enable_utf8 => 1 + } + ) or die "Connection to database failed: $DBI::err : $DBI::errstr"; +} + +connect_db(); +export_marc(); + +$db_handle->disconnect; + diff --git a/KCLS/authority-control/backstage/process-backstage-files.pl b/KCLS/authority-control/backstage/process-backstage-files.pl new file mode 100755 index 0000000000..364461e281 --- /dev/null +++ b/KCLS/authority-control/backstage/process-backstage-files.pl @@ -0,0 +1,504 @@ +#!/usr/bin/env perl +# ----------------------------------------------------------------------- +# TODO: summary +# +# TODO: +# Disable auth record change propagation during auth record updates. 
+# ----------------------------------------------------------------------- +use strict; +use warnings; +use DBI; +use DateTime; +use Getopt::Long; +use MARC::Record; +use MARC::File::XML (BinaryEncoding => 'UTF-8'); +use MARC::File::USMARC; +use Archive::Zip qw(:ERROR_CODES :CONSTANTS); +use File::Basename; +use Sys::Syslog qw(syslog openlog); +use OpenILS::Utils::Normalize qw(clean_marc); +binmode(STDOUT, ':utf8'); + +my $db_handle; +my $log_mod = 500; # log every 500th of each type of event (see verbose) + +my $file; +my $export_date; +my $working_dir = '.', +my $bib_collision_file; +my $verbose; +my $db_user = $ENV{PGUSER} || 'evergreen'; +my $db_name = $ENV{PGDATABASE} || 'evergreen'; +my $db_host = $ENV{PGHOST} || 'localhost'; +my $db_port = $ENV{PGPORT} || '5432'; +my $db_pass = $ENV{PGPASSWORD}; + +my $syslog_facility = 'LOCAL6'; # matches Evergreen gateway +my $syslog_ops = 'pid'; +my $syslog_ident = 'BACKSTAGE'; + +my $new_auth_sth; +my $mod_auth_sth; +my $del_auth_sth; +my $delmod_auth_sth; +my $mod_bibs_sth; +my $match_auth_sth; +my $match_auth_001_sth; +my $new_auth_ctr = 0; +my $mod_auth_ctr = 0; +my $del_auth_ctr = 0; +my $mod_bibs_ctr = 0; +my $col_bibs_ctr = 0; + +my $help; + +GetOptions( + 'file=s' => \$file, + 'export-date=s' => \$export_date, + 'working-dir=s' => \$working_dir, + 'bib-collision-file=s' => \$bib_collision_file, + 'verbose' => \$verbose, + 'db-user=s' => \$db_user, + 'db-host=s' => \$db_host, + 'db-name=s' => \$db_name, + 'db-port=i' => \$db_port, + 'db-pass=s' => \$db_pass, + 'help' => \$help +); + +sub help { + print <now(time_zone => 'local')->strftime('%F %T'); + my $msg_str = "$date_str [$$] $level $msg\n"; + + if ($die) { + die $msg_str; + + } else { + if ($level eq 'ERR' or $level eq 'WARNING') { + # always copy problem messages to stdout + warn $msg_str; # avoid dupe messages + } else { + print $msg_str; + } + } +} + + +sub connect_db { + $db_handle = DBI->connect( + 
"dbi:Pg:db=$db_name;host=$db_host;port=$db_port;options='--statement-timeout=0'", + $db_user, $db_pass, { + RaiseError => 1, + PrintError => 0, + AutoCommit => 1, + pg_expand_array => 0, + pg_enable_utf8 => 1 + } + ) or die "Connection to database failed: $DBI::err : $DBI::errstr"; +} + +sub process_zip_file { + + my $zip = Archive::Zip->new(); + + announce('ERR', "Failed to read $file", 1) + unless $zip->read($file) == AZ_OK; + + # Avoid processing XLS and HTM files. + # All of the MARC files end in .UTF8. + for my $member ($zip->membersMatching('.*(\.UTF8|\.MRC)')) { + + my $basename = basename($member->fileName()); + + announce('INFO', "Extracting file $basename"); + + my $local_file = "$working_dir/$basename"; + + announce('ERR', "Unable to extract to file: $local_file", 1) + unless $member->extractToFileNamed($local_file) == AZ_OK; + + my $marc_batch = MARC::File::USMARC->in($local_file, 'UTF8') + or announce('ERR', "Unable to read $local_file as MARC", 1); + + if ($basename =~ /BIB/) { + + handle_modified_bibs($marc_batch); + + } elsif ($basename =~ /DEL/) { + + handle_deleted_auths($marc_batch); + + } elsif ($basename =~ /CHG|NEW|AUTH/) { + + handle_modified_auths($marc_batch); + + } else { + announce('WARNING', "Un-handled file type: $basename"); + } + } +} + +# Returns ID's of bib records that have been modified since the export date. +my @modified_bibs; +my $mod_searched = 0; +sub find_modified_bibs { + + return if $mod_searched; + $mod_searched = 1; + + my $id_arrays = $db_handle->selectall_arrayref(<<" SQL"); + SELECT id + FROM biblio.record_entry + WHERE NOT deleted AND edit_date >= '$export_date' + SQL + + @modified_bibs = map {$_->[0]} @$id_arrays; + + announce('INFO', scalar(@modified_bibs)." bibs modified since export"); +} + + + +# 1. Bibs that have been modified by Backstage and locally are written +# to the --bib-collision-file as MARC for later processing. +# 2. 
Bibs that have only been modified by Backstage are updated +# directly in the database. +sub handle_modified_bibs { + my $marc_batch = shift; + + find_modified_bibs(); + + while (my $record = $marc_batch->next()) { + my $bib_id = $record->subfield('901', 'c'); + + if (!$bib_id) { + announce('ERR', "Bib record has no 901c (ID) value. Skipping"); + next; + } + + if (grep {$bib_id eq $_} @modified_bibs) { + # Bib was edited by both parties. Save to external file + # for later processing. + + write_bib_collision($record); + + } else { + # Update our copy of the record. + + my $marcxml = clean_marc($record->as_xml_record()); + update_bib($marcxml, $bib_id); + } + } +} + +sub update_bib { + my $marcxml = shift; + my $bib_id = shift; + + eval { $mod_bibs_sth->execute($marcxml, $bib_id) }; + + if ($@) { + announce('ERR', "Error updating biblio record: $@ : $marcxml"); + return; + } + + $mod_bibs_ctr++; + + announce('INFO', "Updated $mod_bibs_ctr bib records") + if $mod_bibs_ctr % $log_mod == 0; +} + +sub write_bib_collision { + my $record = shift; + + my $filename = "$working_dir/$bib_collision_file"; + + open(BIBS_FILE, ">>$filename") or + announce('ERR', "Cannot open bib collision file: $filename : $!", 1); + + binmode(BIBS_FILE, ":utf8"); + + print BIBS_FILE $record->as_usmarc(); + + close BIBS_FILE or + announce('WARNING', "Error closing bib collision file: $filename : $!"); + + $col_bibs_ctr++; + + announce('INFO', "Dumped $col_bibs_ctr bib collisions to file") + if $col_bibs_ctr % $log_mod == 0; +} + +sub handle_deleted_auths { + my $marc_batch = shift; + + while (my $record = $marc_batch->next()) { + my @matches = find_matching_auths($record); + + for my $auth_id (@matches) { + + eval { + # 2 mods.. wrap in transaction? 
(see autocommit) + $del_auth_sth->execute($auth_id); + $delmod_auth_sth->execute($auth_id); + }; + + if ($@) { + announce( + 'ERR', "Error deleting authority record: $@ : $auth_id"); + next; + } + + $del_auth_ctr++; + + announce('INFO', "Deleted $del_auth_ctr authority records") + if $del_auth_ctr % $log_mod == 0; + } + } +} + +sub handle_modified_auths { + my $marc_batch = shift; + + while (my $record = $marc_batch->next()) { + + modify_auth_005($record); + + my @matches = find_matching_auths($record); + push(@matches, find_replaced_auths($record)); + + my $marcxml = clean_marc($record->as_xml_record()); + + if (@matches) { + update_auth($marcxml, $_) for @matches; + } else { + insert_auth($marcxml); + } + } +} + +# Update the 005 field to the current date +sub modify_auth_005 { + my $record = shift; + my $field_005 = $record->field('005'); + + # MARC 005-formatted date value + my $now_date = DateTime->now( + time_zone => 'local')->strftime('%Y%m%d%H%M%S.0'); + + if ($field_005) { + $field_005->update($now_date); + + } else { + $field_005 = MARC::Field->new('005', $now_date); + $record->insert_fields_ordered($field_005); + } +} + + +sub update_auth { + my $marcxml = shift; + my $auth_id = shift; + + eval { $mod_auth_sth->execute($marcxml, $auth_id) }; + + if ($@) { + announce('ERR', "Error updating authority record: $@ : $marcxml"); + return; + } + + $mod_auth_ctr++; + + announce('INFO', "Updated $mod_auth_ctr authority records") + if $mod_auth_ctr % $log_mod == 0; +} + +sub insert_auth { + my $marcxml = shift; + + eval { $new_auth_sth->execute($marcxml, "IMPORT-" . time) }; + + if ($@) { + announce('ERR', + "Error creating new authority record: $@ : $marcxml"); + return; + } + + $new_auth_ctr++; + + announce('INFO', "Created $new_auth_ctr authority records") + if $new_auth_ctr % $log_mod == 0; +} + +# Return ID's of authority records that should be replaced by the +# current record. Checks for records whose 010$a equals the 010$z of +# the current record. 
+# 010$z == Canceled/invalid LC control number +sub find_replaced_auths { + my $record = shift; + + my $subfield = $record->subfield('010', 'z'); + return () unless $subfield; + + $match_auth_sth->execute('010', $subfield); + my $matches = $match_auth_sth->fetchall_arrayref; + my @ids = map {$_->[0]} @$matches; + + announce('INFO', "Auth 010z=$subfield matched records: @ids") if @ids; + + return @ids; +} + +# Return ID's of matching authority records. Matching tries: +# 001 -> 010a -> 035a. +sub find_matching_auths { + my $record = shift; + + my $tag = '001'; + my $subfield; + + # 001 test requires its own SQL query + if (my $field = $record->field($tag)) { + if ($subfield = $field->data) { + + $match_auth_001_sth->execute($subfield); + my $matches = $match_auth_001_sth->fetchall_arrayref; + my @ids = map {$_->[0]} @$matches; + announce('INFO', + "Auth 001=$subfield matched records: @ids") if @ids; + return @ids; + } + } + + $tag = '010'; + $subfield = $record->subfield($tag, 'a'); + + if (!$subfield) { + $tag = '035'; + $subfield = $record->subfield($tag, 'a'); + } + + return () unless $subfield; + + $match_auth_sth->execute($tag, $subfield); + my $matches = $match_auth_sth->fetchall_arrayref; + + my @ids = map {$_->[0]} @$matches; + announce('INFO', "Auth ${tag}a=$subfield matched records: @ids") if @ids; + + return @ids; +} + +sub prepare_statements { + + $del_auth_sth = $db_handle->prepare(<<" SQL"); + DELETE FROM authority.record_entry WHERE id = ? + SQL + + $delmod_auth_sth = $db_handle->prepare(<<" SQL"); + UPDATE authority.record_entry + SET edit_date = NOW() WHERE id = ? + SQL + + $mod_bibs_sth = $db_handle->prepare(<<" SQL"); + UPDATE biblio.record_entry + SET marc = ?, edit_date = NOW() + WHERE id = ? + SQL + + $mod_auth_sth = $db_handle->prepare(<<" SQL"); + UPDATE authority.record_entry + SET marc = ?, edit_date = NOW() + WHERE id = ? 
+ SQL + + $new_auth_sth = $db_handle->prepare(<<" SQL"); + INSERT INTO authority.record_entry (marc, last_xact_id) + VALUES (?, ?) + SQL + + $match_auth_sth = $db_handle->prepare(<<" SQL"); + SELECT DISTINCT(rec.id) + FROM authority.record_entry rec + JOIN authority.full_rec frec ON (frec.record = rec.id) + WHERE + NOT rec.deleted + AND frec.tag = ? + AND frec.subfield = 'a' + AND frec.value = NACO_NORMALIZE(?, 'a') + SQL + + $match_auth_001_sth = $db_handle->prepare(<<" SQL"); + SELECT DISTINCT(rec.id) + FROM authority.record_entry rec + JOIN authority.full_rec frec ON (frec.record = rec.id) + WHERE + NOT rec.deleted + AND frec.tag = '001' + AND frec.value = ? + SQL +} + +openlog($syslog_ident, $syslog_ops, $syslog_facility); +connect_db(); +prepare_statements(); +process_zip_file(); + +$new_auth_sth->finish; +$mod_auth_sth->finish; +$del_auth_sth->finish; +$delmod_auth_sth->finish; +$match_auth_sth->finish; +$match_auth_001_sth->finish; +$mod_bibs_sth->finish; + +$db_handle->disconnect; + diff --git a/KCLS/authority-control/linking/authority_authority_linker.pl b/KCLS/authority-control/linking/authority_authority_linker.pl new file mode 100755 index 0000000000..2c134e74a4 --- /dev/null +++ b/KCLS/authority-control/linking/authority_authority_linker.pl @@ -0,0 +1,385 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use DBI; +use Getopt::Long; +use MARC::Record; +use MARC::File::XML (BinaryEncoding => 'UTF-8'); +use MARC::Charset; +use OpenSRF::System; +use OpenILS::Utils::Fieldmapper; +use OpenSRF::Utils::SettingsClient; +use OpenSRF::EX qw/:try/; +use Encode; +use Unicode::Normalize; +use OpenILS::Utils::Normalize; +use Data::Dumper; +use Pod::Usage qw/ pod2usage /; + +$ENV{OSRF_LOG_CLIENT} = 1; + +$Data::Dumper::Indent = 0; +MARC::Charset->assume_unicode(1); + +my $acsaf_cache = {}; + +sub get_acsaf { + my ($e, $id) = @_; + + $acsaf_cache->{$id} ||= + $e->retrieve_authority_control_set_authority_field([ + $id, + {flesh => 1, flesh_fields => {acsaf => 
["main_entry"]}} + ]); + return $acsaf_cache->{$id}; +} + +sub matchable_string { + my ($field, $sf_list, $joiner) = @_; + $joiner ||= ' '; + + return join($joiner, map { $field->subfield($_) } split "", $sf_list); +} + +# ########### main +my ($start_id, $end_id); +my $bootstrap = '/openils/conf/opensrf_core.xml'; +my @records; +my $verbose; +my $input_file =''; +my $db_host = $ENV{PGHOST} || 'localhost'; +my $db_port = $ENV{PGPORT} || '5432'; +my $db_user = $ENV{PGDATABASE} || 'evergreen'; +my $db_pass = $ENV{PGPASSWORD}; +my $links_removed = 0; +my $links_added = 0; +my $CNI = 'KCLS'; + +my %options; +my $result = GetOptions( + \%options, + 'configuration=s' => \$bootstrap, + 'record=i' => \@records, + 'all', 'help', + 'start_id=i' => \$start_id, + 'end_id=i' => \$end_id, + 'file=s' => \$input_file, + 'verbose' => \$verbose, + "db-host=s" => \$db_host, + "db-user=s" => \$db_user, + "db-pass=s" => \$db_pass, + "db-port=s" => \$db_port +); + +sub announce { + my $msg = shift; + return unless $verbose; + print DateTime->now(time_zone => 'local')->strftime('%F %T') . 
" $msg\n"; +} + +pod2usage(0) if not $result or $options{help}; + +OpenSRF::System->bootstrap_client(config_file => $bootstrap); +Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); + +# must be loaded and initialized after the IDL is parsed + +use OpenILS::Utils::CStoreEditor; +OpenILS::Utils::CStoreEditor::init(); + +my $e = OpenILS::Utils::CStoreEditor->new; + +my $query = q{ + SELECT + source, + ARRAY_TO_STRING(ARRAY_AGG(target || ',' || field), ';') AS links + FROM ( + SELECT sh1.record AS target, + sh2.record AS source, + sh2.atag AS field + FROM authority.simple_heading sh1 + JOIN authority.simple_heading sh2 USING (sort_value) + JOIN authority.control_set_authority_field af1 ON + (sh1.atag = af1.id AND af1.main_entry IS NULL) + JOIN authority.control_set_authority_field af2 ON + (sh2.atag = af2.id AND af2.main_entry IS NOT NULL + AND af2.linking_subfield IS NOT NULL) + %s -- where clause here + -- Ignore authority.authority_linking rows since we want to + -- rebuild all links, which may mean deleting bogus links. + -- EXCEPT SELECT target, source, field FROM authority.authority_linking + -- order by source for consistent testing + ) x GROUP BY 1 ORDER BY source +}; + +my @bind_params; +if (@records) { + $query = sprintf($query, "WHERE sh2.record = ?"); + @bind_params = @records; # should be just one scalar in this array. +} elsif ($options{all}) { + $query = sprintf($query, ""); # no where clause +} elsif ($start_id and $end_id) { + $query = sprintf($query, "WHERE sh2.record BETWEEN ? AND ?"); + @bind_params = ($start_id, $end_id); + +} elsif ($input_file) { + # Load authority record IDs from a file. + announce("Reading authority record IDs from $input_file"); + + open FILE, "<", $input_file or die "Can't open file $input_file\n"; + while() { + chomp; + push(@records, $_) if $_; + } + close FILE; + + announce("Read ".scalar(@records)." from $input_file"); + + # No bind_params needed. 
+ my $recstr = join(',', @records); + $query = sprintf($query, "WHERE sh2.record IN ($recstr)"); + +} else { + pod2usage(0); +} + +announce("SQL, params: ", Dumper($query, \@bind_params)); + +my $dsn = "dbi:Pg:database=evergreen;host=$db_host;port=$db_port"; +my $dbh = DBI->connect($dsn, $db_user, $db_pass); +$dbh->do('SET statement_timeout = 0'); + +my $sth = $dbh->prepare($query); + +announce("Executing query ..."); +$sth->execute(@bind_params); + +my $problems = 0; + +if (@records) { + announce("Processing ".scalar(@records)." records"); +} elsif ($start_id) { + announce("Processing records $start_id => $end_id"); +} else { + announce("Processing all records"); +} + +my $total_records = 0; +while (my ($src, $links) = $sth->fetchrow_array) { + announce("--------------------"); + announce("Processing authority source record $src"); + $total_records++; + + try { + + my $src_rec = $e->retrieve_authority_record_entry($src); + if (!$src_rec) { + warn "Unable to load source authority record $src\n"; + next; + } + + my $src_marc = MARC::Record->new_from_xml($src_rec->marc); + my $auth_src_thesaurus = substr($src_marc->field('008')->data(), 11, 1); + announce("Source record thesaurus value=$auth_src_thesaurus"); + + my $changed = 0; + my %tags_seen; + for my $link (split ';', $links) { + my ($target, $field_id) = split ',', $link; + + next if $target eq $src_rec->id; + + announce("Target: $target, field_id: $field_id"); + + my $target_rec = $e->retrieve_authority_record_entry($target); + if (!$target_rec) { + warn "Unable to load authority record $target. Skipping\n"; + next; + } + + my $target_marc = MARC::Record->new_from_xml($target_rec->marc); + + my $auth_target_thesaurus = + substr($target_marc->field('008')->data(), 11, 1); + + announce("Target record thesaurus value=$auth_target_thesaurus"); + + if ($auth_src_thesaurus ne $auth_target_thesaurus) { + announce("Thesauri for source/target records do not match. 
Skipping.."); + next; + } + + my $acsaf = get_acsaf($e, $field_id); + if (!$acsaf) { + warn "No authority control set field found for $field_id. Skipping\n"; + next; + } + + if (!$tags_seen{$acsaf->tag}) { + # the first time we process each tag for a given record, + # remove all existing auth-to-auth link subfields + # so they can be completely rebuilt. + for my $field ($src_marc->field($acsaf->tag)) { + if (my $val = $field->subfield('0')) { + announce("Removing existing subfield 0 : $val"); + $field->delete_subfield(code => '0'); + $changed = 1; + $links_removed++; + } + } + $tags_seen{$acsaf->tag} = 1; + } + + # rebuild the links for the current tag + for my $field ($src_marc->field($acsaf->tag)) { + + my $src_string = matchable_string( + $field, $acsaf->main_entry->display_sf_list, + $acsaf->main_entry->joiner + ); + + announce(sprintf( + "At field id=%s (tag=%s) / trying to match '%s'", + $acsaf->id, $acsaf->tag, $src_string)); + + my ($tfield) = $target_marc->field($acsaf->main_entry->tag); + + if(defined $tfield) { + + my $target_string = matchable_string( + $tfield, $acsaf->main_entry->display_sf_list, + $acsaf->main_entry->joiner + ); + + if ($target_string eq $src_string) { + announce("Got a match"); + $field->update('0' => "($CNI)$target"); + $changed = 1; + $links_added++; + } + } + } + } + + if ($changed) { + announce("Updating authority record ".$src_rec->id); + $src_rec->marc(OpenILS::Utils::Normalize::clean_marc($src_marc)); + $e->xact_begin; + $e->update_authority_record_entry($src_rec) or die $e->die_event; + $e->xact_commit; + } + + } otherwise { + my $err = shift; + print STDERR "\nRecord # $src : ", + (ref $err eq "HASH" ? Dumper($err) : $err), "\n"; + + # Reset SAX parser so that one bad record doesn't + # kill the entire process. + + import MARC::File::XML; + $problems++; + } +} + +if (@records) { + announce("Processed records ". scalar(@records). 
+ " records; processed=$total_records problems=$problems"); +} elsif ($start_id) { + announce("Processed records $start_id => $end_id; ". + "processed=$total_records; problems=$problems"); +} else { + announce("Processed all records; processed=$total_records; problems=$problems"); +} + +announce("links removed: $links_removed"); +announce("links added: $links_added"); +announce("delta added: ".($links_added - $links_removed)); + +exit ($problems > 0); + +__END__ + +=head1 NAME + +authority_authority_linker.pl - Link reference headings in authority records to main entry headings in other authority records + +=head1 SYNOPSIS + +authority_authority_linker.pl [B<--configuration>=I] +[[B<--record>=I[ B<--record>=I]]] | [B<--all>] | [B<--start_id>=I B<--end_id>=I] + +=head1 DESCRIPTION + +For a given set of records, find authority reference headings that also +appear as main entry headings in any other authority record. In the +specific MARC field of the authority record (source) containing the reference +heading with such a match in another authority record (target), add a subfield +0 (zero) referring to the target record by ID. + +=head1 OPTIONS + +=over + +=item * B<-c> I, B<--configuration>=I + +Specifies the OpenSRF configuration file used to connect to the OpenSRF router. +Defaults to F + +=item * B<-r> I, B<--record>=I + +Specifies the authority record ID (found in the C +column) of the B record to process. This option may be specified more +than once to process multiple records in a single run. + +=item * B<-a>, B<--all> + +Specifies that all authority records should be processed. For large +databases, this may take an extraordinarily long amount of time. + +=item * B<-s> I, B<--start_id>=I + +Specifies the starting ID of the range of authority records to process. +This option is ignored unless it is accompanied by the B<-e> or B<--end_id> +option. + +=item * B<-e> I, B<--end_id>=I + +Specifies the ending ID of the range of authority records to process. 
+This option is ignored unless it is accompanied by the B<-s> or B<--start> +option. + +=back + +=head1 EXAMPLES + + authority_authority_linker.pl --start_id 1 --end_id 50000 + +Processes the authority records with IDs between 1 and 50,000 using the +default OpenSRF configuration file for connection information. + +=head1 AUTHOR + +Lebbeous Fogle-Weekley + +=head1 COPYRIGHT AND LICENSE + +Copyright (C) 2013 Equinox Software, Inc. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301, USA. + +=cut diff --git a/KCLS/authority-control/linking/authority_control_fields.pl b/KCLS/authority-control/linking/authority_control_fields.pl new file mode 100755 index 0000000000..a560384336 --- /dev/null +++ b/KCLS/authority-control/linking/authority_control_fields.pl @@ -0,0 +1,1013 @@ +#!/usr/bin/perl +# Copyright (C) 2010-2011 Laurentian University +# Author: Dan Scott +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU General Public License for more details. +# --------------------------------------------------------------- + +use strict; +use warnings; +use DBI; +use Getopt::Long; +use MARC::Record; +use MARC::File::XML (BinaryEncoding => 'UTF-8'); +use MARC::Charset; +use OpenSRF::System; +use OpenILS::Utils::Fieldmapper; +use OpenSRF::Utils::SettingsClient; +use Encode; +use Unicode::Normalize; +use OpenILS::Application::AppUtils; +use Data::Dumper; +use Pod::Usage qw/ pod2usage /; +use DateTime; + +$ENV{OSRF_LOG_CLIENT} = 1; + +$Data::Dumper::Indent = 0; +MARC::Charset->assume_unicode(1); + +my $start_id; +my $end_id; +my $refresh; +my $days_back; # days; 0 means today only +my $modified_since; # days; 0 means today only +my $input_file =''; +my $bootstrap = '/openils/conf/opensrf_core.xml'; +my @records; +my $verbose = 0; +my $sort_desc = 0; +my $db_host = $ENV{PGHOST} || 'localhost'; +my $db_port = $ENV{PGPORT} || '5432'; +my $db_user = $ENV{PGDATABASE} || 'evergreen'; +my $db_pass = $ENV{PGPASSWORD}; +my $CNI = 'KCLS'; + +my %options; +my $result = GetOptions( + \%options, + 'configuration=s' => \$bootstrap, + 'record=i' => \@records, + 'refresh' => \$refresh, + 'start-id=i' => \$start_id, + 'end-id=i' => \$end_id, + 'days-back=i' => \$days_back, + 'modified-since=i' => \$modified_since, + 'sort-desc' => \$sort_desc, + 'file=s' => \$input_file, + 'verbose' => \$verbose, + "db-host=s" => \$db_host, + "db-user=s" => \$db_user, + "db-pass=s" => \$db_pass, + "db-port=s" => \$db_port, + 'all', # now assumed to be true when --file is unset + 'help' +); + +sub announce { + my $msg = shift; + my $force = shift; + return unless $force || $verbose; + print DateTime->now->strftime('%F %T') . 
" [$$] $msg\n"; +} + +if (!$result or $options{help}) { + pod2usage(0); +} + +OpenSRF::System->bootstrap_client(config_file => $bootstrap); +Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); + +# must be loaded and initialized after the IDL is parsed +use OpenILS::Utils::CStoreEditor; +OpenILS::Utils::CStoreEditor::init(); + +my $e = OpenILS::Utils::CStoreEditor->new; + +if ($input_file) { + open FILE, "<", $input_file or die "Can't open file " . $input_file; + while() { + chomp; + if($_) { + push(@records, $_); + } + } + close FILE; + +} else { + + + my $where = "WHERE not deleted"; + $where .= " AND id >= $start_id" if $start_id; + $where .= " AND id <= $end_id" if $end_id; + + my $order = "ORDER BY id"; + $order .= " DESC" if $sort_desc; + + if (defined $days_back or defined $modified_since) { + my $date_field = defined $days_back ? 'create_date' : 'edit_date'; + my $duration = defined $days_back ? $days_back : $modified_since; + $where .= " AND DATE($date_field) >= DATE(NOW() - '$duration day'::INTERVAL)"; + } + + my $sql = + "SELECT DISTINCT(id) AS id FROM biblio.record_entry $where $order"; + + announce("Loading record ID's with query:\n$sql"); + + my $dsn = "dbi:Pg:database=evergreen;host=$db_host;port=$db_port"; + my $dbh = DBI->connect($dsn, $db_user, $db_pass); + + my $sth = $dbh->prepare($sql); + $sth->execute; + + while (my $ref = $sth->fetchrow_hashref()) { + push(@records, $ref->{id}); + } + + $sth->finish(); + $dbh->disconnect(); +} + +announce("Processing ".scalar(@records)." 
records"); + +# Hash of controlled fields & subfields in bibliographic records, and their +# corresponding controlling fields & subfields in the authority record +# +# So, if the bib 650$a can be controlled by an auth 150$a, that maps to: +# 650 => { a => { 150 => 'a'}} +my %controllees = ( + 100 => { a => { 100 => 'a' }, + b => { 100 => 'b' }, + c => { 100 => 'c' }, + d => { 100 => 'd' }, + #f => { 100 => 'f' }, + g => { 100 => 'g' }, + h => { 100 => 'h' }, + k => { 100 => 'k' }, + l => { 100 => 'l' }, + m => { 100 => 'm' }, + n => { 100 => 'n' }, + o => { 100 => 'o' }, + p => { 100 => 'p' }, + q => { 100 => 'q' }, + r => { 100 => 'r' }, + s => { 100 => 's' }, + t => { 100 => 't' }, + y => { 100 => 'y' }, + z => { 100 => 'z' }, + }, + 110 => { a => { 110 => 'a' }, + b => { 110 => 'b' }, + c => { 110 => 'c' }, + d => { 110 => 'd' }, + #f => { 110 => 'f' }, + g => { 110 => 'g' }, + h => { 110 => 'h' }, + k => { 110 => 'k' }, + l => { 110 => 'l' }, + m => { 110 => 'm' }, + n => { 110 => 'n' }, + o => { 110 => 'o' }, + p => { 110 => 'p' }, + r => { 110 => 'r' }, + s => { 110 => 's' }, + t => { 110 => 't' }, + y => { 110 => 'y' }, + z => { 110 => 'z' }, + }, + 111 => { a => { 111 => 'a' }, + b => { 111 => 'b' }, + c => { 111 => 'c' }, + d => { 111 => 'd' }, + e => { 111 => 'e' }, + #f => { 111 => 'f' }, + g => { 111 => 'g' }, + h => { 111 => 'h' }, + k => { 111 => 'k' }, + l => { 111 => 'l' }, + m => { 111 => 'm' }, + n => { 111 => 'n' }, + o => { 111 => 'o' }, + p => { 111 => 'p' }, + q => { 111 => 'q' }, + r => { 111 => 'r' }, + s => { 111 => 's' }, + t => { 111 => 't' }, + u => { 111 => 'u' }, + y => { 111 => 'y' }, + z => { 111 => 'z' }, + }, + 130 => { a => { 130 => 'a' }, + d => { 130 => 'd' }, + #f => { 130 => 'f' }, + g => { 130 => 'g' }, + h => { 130 => 'h' }, + k => { 130 => 'k' }, + l => { 130 => 'l' }, + m => { 130 => 'm' }, + n => { 130 => 'n' }, + o => { 130 => 'o' }, + p => { 130 => 'p' }, + r => { 130 => 'r' }, + s => { 130 => 's' }, + t => { 130 => 't' 
}, + x => { 130 => 'x' }, + y => { 130 => 'y' }, + z => { 130 => 'z' }, + }, + 400 => { a => { 100 => 'a' }, + b => { 100 => 'b' }, + c => { 100 => 'c' }, + d => { 100 => 'd' }, + #f => { 100 => 'f' }, + g => { 100 => 'g' }, + h => { 100 => 'h' }, + k => { 100 => 'k' }, + l => { 100 => 'l' }, + m => { 100 => 'm' }, + n => { 100 => 'n' }, + o => { 100 => 'o' }, + p => { 100 => 'p' }, + q => { 100 => 'q' }, + r => { 100 => 'r' }, + s => { 100 => 's' }, + t => { 100 => 't' }, + y => { 100 => 'y' }, + z => { 100 => 'z' }, + }, + 410 => { a => { 110 => 'a' }, + b => { 110 => 'b' }, + c => { 110 => 'c' }, + d => { 110 => 'd' }, + #f => { 110 => 'f' }, + g => { 110 => 'g' }, + h => { 110 => 'h' }, + k => { 110 => 'k' }, + l => { 110 => 'l' }, + m => { 110 => 'm' }, + n => { 110 => 'n' }, + o => { 110 => 'o' }, + p => { 110 => 'p' }, + r => { 110 => 'r' }, + s => { 110 => 's' }, + t => { 110 => 't' }, + y => { 110 => 'y' }, + z => { 110 => 'z' }, + }, + 411 => { a => { 111 => 'a' }, + b => { 111 => 'b' }, + c => { 111 => 'c' }, + d => { 111 => 'd' }, + e => { 111 => 'e' }, + #f => { 111 => 'f' }, + g => { 111 => 'g' }, + h => { 111 => 'h' }, + k => { 111 => 'k' }, + l => { 111 => 'l' }, + m => { 111 => 'm' }, + n => { 111 => 'n' }, + o => { 111 => 'o' }, + p => { 111 => 'p' }, + q => { 111 => 'q' }, + r => { 111 => 'r' }, + s => { 111 => 's' }, + t => { 111 => 't' }, + u => { 111 => 'u' }, + y => { 111 => 'y' }, + z => { 111 => 'z' }, + }, + 600 => { a => { 100 => 'a' }, + b => { 100 => 'b' }, + c => { 100 => 'c' }, + d => { 100 => 'd' }, + #f => { 100 => 'f' }, + g => { 100 => 'g' }, + h => { 100 => 'h' }, + k => { 100 => 'k' }, + l => { 100 => 'l' }, + m => { 100 => 'm' }, + n => { 100 => 'n' }, + o => { 100 => 'o' }, + p => { 100 => 'p' }, + q => { 100 => 'q' }, + r => { 100 => 'r' }, + s => { 100 => 's' }, + t => { 100 => 't' }, + v => { 100 => 'v' }, + x => { 100 => 'x' }, + y => { 100 => 'y' }, + z => { 100 => 'z' }, + }, + 610 => { a => { 110 => 'a' }, + b => { 110 
=> 'b' }, + c => { 110 => 'c' }, + d => { 110 => 'd' }, + #f => { 110 => 'f' }, + g => { 110 => 'g' }, + h => { 110 => 'h' }, + k => { 110 => 'k' }, + l => { 110 => 'l' }, + m => { 110 => 'm' }, + n => { 110 => 'n' }, + o => { 110 => 'o' }, + p => { 110 => 'p' }, + r => { 110 => 'r' }, + s => { 110 => 's' }, + t => { 110 => 't' }, + v => { 110 => 'v' }, + x => { 110 => 'x' }, + y => { 110 => 'y' }, + z => { 110 => 'z' }, + }, + 611 => { a => { 111 => 'a' }, + b => { 111 => 'b' }, + c => { 111 => 'c' }, + d => { 111 => 'd' }, + e => { 111 => 'e' }, + #f => { 111 => 'f' }, + g => { 111 => 'g' }, + h => { 111 => 'h' }, + k => { 111 => 'k' }, + l => { 111 => 'l' }, + m => { 111 => 'm' }, + n => { 111 => 'n' }, + o => { 111 => 'o' }, + p => { 111 => 'p' }, + q => { 111 => 'q' }, + r => { 111 => 'r' }, + s => { 111 => 's' }, + t => { 111 => 't' }, + u => { 111 => 'u' }, + v => { 111 => 'v' }, + x => { 111 => 'x' }, + y => { 111 => 'y' }, + z => { 111 => 'z' }, + }, + 630 => { a => { 130 => 'a' }, + d => { 130 => 'd' }, + #f => { 130 => 'f' }, + g => { 130 => 'g' }, + h => { 130 => 'h' }, + k => { 130 => 'k' }, + l => { 130 => 'l' }, + m => { 130 => 'm' }, + n => { 130 => 'n' }, + o => { 130 => 'o' }, + p => { 130 => 'p' }, + r => { 130 => 'r' }, + s => { 130 => 's' }, + t => { 130 => 't' }, + v => { 130 => 'v' }, + x => { 130 => 'x' }, + y => { 130 => 'y' }, + z => { 130 => 'z' }, + }, + 650 => { a => { 150 => 'a' }, + b => { 150 => 'b' }, + c => { 150 => 'c' }, + d => { 150 => 'd' }, + v => { 150 => 'v' }, + x => { 150 => 'x' }, + y => { 150 => 'y' }, + z => { 150 => 'z' }, + }, + 651 => { a => { 151 => 'a' }, + b => { 151 => 'b' }, + v => { 151 => 'v' }, + x => { 151 => 'x' }, + y => { 151 => 'y' }, + z => { 151 => 'z' }, + }, + 655 => { a => { 155 => 'a' }, + b => { 155 => 'b' }, + c => { 155 => 'c' }, + v => { 155 => 'v' }, + x => { 155 => 'x' }, + y => { 155 => 'y' }, + z => { 155 => 'z' }, + }, + 700 => { a => { 100 => 'a' }, + b => { 100 => 'b' }, + c => { 100 => 
'c' }, + d => { 100 => 'd' }, + #f => { 100 => 'f' }, + g => { 100 => 'g' }, + h => { 100 => 'h' }, + k => { 100 => 'k' }, + l => { 100 => 'l' }, + m => { 100 => 'm' }, + n => { 100 => 'n' }, + o => { 100 => 'o' }, + p => { 100 => 'p' }, + q => { 100 => 'q' }, + r => { 100 => 'r' }, + s => { 100 => 's' }, + t => { 100 => 't' }, + y => { 100 => 'y' }, + z => { 100 => 'z' }, + }, + 710 => { a => { 110 => 'a' }, + b => { 110 => 'b' }, + c => { 110 => 'c' }, + d => { 110 => 'd' }, + #f => { 110 => 'f' }, + g => { 110 => 'g' }, + h => { 110 => 'h' }, + k => { 110 => 'k' }, + l => { 110 => 'l' }, + m => { 110 => 'm' }, + n => { 110 => 'n' }, + o => { 110 => 'o' }, + p => { 110 => 'p' }, + r => { 110 => 'r' }, + s => { 110 => 's' }, + t => { 110 => 't' }, + y => { 110 => 'y' }, + z => { 110 => 'z' }, + }, + 711 => { a => { 111 => 'a' }, + b => { 111 => 'b' }, + c => { 111 => 'c' }, + d => { 111 => 'd' }, + e => { 111 => 'e' }, + #f => { 111 => 'f' }, + g => { 111 => 'g' }, + h => { 111 => 'h' }, + k => { 111 => 'k' }, + l => { 111 => 'l' }, + m => { 111 => 'm' }, + n => { 111 => 'n' }, + o => { 111 => 'o' }, + p => { 111 => 'p' }, + q => { 111 => 'q' }, + r => { 111 => 'r' }, + s => { 111 => 's' }, + t => { 111 => 't' }, + u => { 111 => 'u' }, + y => { 111 => 'y' }, + z => { 111 => 'z' }, + }, + 730 => { a => { 130 => 'a' }, + d => { 130 => 'd' }, + #f => { 130 => 'f' }, + g => { 130 => 'g' }, + h => { 130 => 'h' }, + k => { 130 => 'k' }, + l => { 130 => 'l' }, + m => { 130 => 'm' }, + n => { 130 => 'n' }, + o => { 130 => 'o' }, + p => { 130 => 'p' }, + r => { 130 => 'r' }, + s => { 130 => 's' }, + t => { 130 => 't' }, + y => { 130 => 'y' }, + z => { 130 => 'z' }, + }, + 800 => { a => { 100 => 'a' }, + b => { 100 => 'b' }, + c => { 100 => 'c' }, + d => { 100 => 'd' }, + #f => { 100 => 'f' }, + g => { 100 => 'g' }, + h => { 100 => 'h' }, + k => { 100 => 'k' }, + l => { 100 => 'l' }, + m => { 100 => 'm' }, + n => { 100 => 'n' }, + o => { 100 => 'o' }, + p => { 100 => 'p' }, 
+ q => { 100 => 'q' }, + r => { 100 => 'r' }, + s => { 100 => 's' }, + t => { 100 => 't' }, + y => { 100 => 'y' }, + z => { 100 => 'z' }, + }, + 810 => { a => { 110 => 'a' }, + b => { 110 => 'b' }, + c => { 110 => 'c' }, + d => { 110 => 'd' }, + #f => { 110 => 'f' }, + g => { 110 => 'g' }, + h => { 110 => 'h' }, + k => { 110 => 'k' }, + l => { 110 => 'l' }, + m => { 110 => 'm' }, + n => { 110 => 'n' }, + o => { 110 => 'o' }, + p => { 110 => 'p' }, + r => { 110 => 'r' }, + s => { 110 => 's' }, + t => { 110 => 't' }, + y => { 110 => 'y' }, + z => { 110 => 'z' }, + }, + 811 => { a => { 111 => 'a' }, + b => { 111 => 'b' }, + c => { 111 => 'c' }, + d => { 111 => 'd' }, + e => { 111 => 'e' }, + #f => { 111 => 'f' }, + g => { 111 => 'g' }, + h => { 111 => 'h' }, + k => { 111 => 'k' }, + l => { 111 => 'l' }, + m => { 111 => 'm' }, + n => { 111 => 'n' }, + o => { 111 => 'o' }, + p => { 111 => 'p' }, + q => { 111 => 'q' }, + r => { 111 => 'r' }, + s => { 111 => 's' }, + t => { 111 => 't' }, + u => { 111 => 'u' }, + y => { 111 => 'y' }, + z => { 111 => 'z' }, + }, + 830 => { a => { 130 => 'a' }, + d => { 130 => 'd' }, + #f => { 130 => 'f' }, + g => { 130 => 'g' }, + h => { 130 => 'h' }, + k => { 130 => 'k' }, + l => { 130 => 'l' }, + m => { 130 => 'm' }, + n => { 130 => 'n' }, + o => { 130 => 'o' }, + p => { 130 => 'p' }, + r => { 130 => 'r' }, + s => { 130 => 's' }, + t => { 130 => 't' }, + x => { 130 => 'x' }, + y => { 130 => 'y' }, + z => { 130 => 'z' }, + }, +); + +# mapping of authority leader/11 "Subject heading system/thesaurus" +# to the matching bib record indicator +my %AUTH_TO_BIB_IND2 = ( + 'a' => '0', # Library of Congress Subject Headings (ADULT) + 'b' => '1', # Library of Congress Subject Headings (JUVENILE) + 'c' => '2', # Medical Subject Headings + 'd' => '3', # National Agricultural Library Subject Authority File + 'n' => '4', # Source not specified + 'k' => '5', # Canadian Subject Headings + 'v' => '6', # Répertoire de vedettes-matière + 'z' => '7' # Source 
specified in subfield $2 / Other +); + +# Produces a new 6XX ind2 value for values found in subfield $2 when the +# original ind2 value is 7 ("Source specified in subfield $2"). +my %REMAP_BIB_SF2_TO_IND2 = ( + lcsh => '0', + mesh => '2', + nal => '3', + rvm => '6' +); + +my $start_time = localtime(); + +if($input_file) { + announce("Start $start_time for ".scalar(@records)." records"); +} elsif($start_id) { + announce("Start $start_time for record range: $start_id => $end_id"); +} else { + announce("Start $start_time for all records"); +} + +# Fetch leader/008 values for authority records. Filter out any whose +# 008 14 or 15 field are not appropriate for the requested bib tag. +# https://www.loc.gov/marc/authority/ad008.html +sub authority_leaders_008_14_15 { + my ($e, $bib_tag, $auth_ids) = @_; + + my $auth_leaders = $e->json_query({ + select => {afr => ['record', 'value']}, + from => 'afr', + where => {'+afr' => {tag => '008', record => $auth_ids}} + }); + + my $index; + $index = 14 if $bib_tag =~ /^[17]/; # author/name record + $index = 15 if $bib_tag =~ /^6/; # subject record + + # avoid checking any other types of authority records. + return $auth_leaders unless $index; + + my @keepers; + for my $leader (@$auth_leaders) { + my $value = $leader->{value} || ''; + if (substr($value, $index, 1) eq 'a') { + push(@keepers, $leader); + } else { + announce("Skipping authority record ".$leader->{record}. + " on bib $bib_tag match; 008/#14|#15 not appropriate"); + } + } + + return \@keepers; +} + +# given a set of authority record ID's and a controlled bib field, +# returns the ID of the first authority record in the set that +# matches the thesaurus spec of the bib record. 
+sub find_matching_auth_for_thesaurus { + my ($e, $bib_field, $auth_leaders) = @_; + + # bib field thesaurus spec + my $cfield_ind2 = $bib_field->indicator(2); + + announce("6XX indicator 2 value = $cfield_ind2"); + + my $is_local = 0; + if ($cfield_ind2 eq '7') { + # subject thesaurus code is embedded in the bib field subfield 2 + + $is_local = 1; + + my $thesaurus = $bib_field->subfield('2') || ''; + announce("Found local thesaurus value $thesaurus"); + + # if we have no special remapping value for the found thesaurus, + # fall back to ind2 => 7=Other. + $cfield_ind2 = $REMAP_BIB_SF2_TO_IND2{$thesaurus} || '7'; + + announce("Local thesaurus '$thesaurus' ". + "remapped to ind2 value '$cfield_ind2'"); + } + + my $authz_found = undef; + for my $leader (@$auth_leaders) { + my $value = $leader->{value}; + next unless $value; + + my $thesaurus = substr($value, 11, 1); # leader/11 -- zero based. + + # Note for later that we encountered an authority record + # whose thesaurus values is z=Other. + $authz_found = $leader->{record} if $thesaurus eq 'z'; + + if ($AUTH_TO_BIB_IND2{$thesaurus} eq $cfield_ind2) { + announce("Found a match on thesaurus ". + "'$thesaurus' for " . $leader->{record}); + return $leader->{record}; + } + } + + # If the bib field in question has a locally encoded thesaurus + # (ind2=7) and no auth record was found above via remapped + # thesaurus value, use the authority record with thesaurus z=Other. + return $authz_found if $is_local; + + return undef; +} + +# Returns true if the thesaurus controlling the bib field is "fast". 
+sub is_fast_heading { + my $bib_field = shift; + + if ($bib_field->tag() =~ /^65[015]/) { + my $ind2 = $bib_field->indicator(2) || ''; + + if ($ind2 eq '7') { # field controlled by "other" + my $thesaurus = $bib_field->subfield('2') || ''; + return $thesaurus eq 'fast'; + } + } + + return 0; +} + +sub update_record { + my ($record, $marc) = @_; + + my $xml = $marc->as_xml_record(); + $xml =~ s/\n//sgo; + $xml =~ s/^<\?xml.+\?\s*>//go; + $xml =~ s/>\s+entityize($xml); + + $record->marc($xml); + + my $editor = OpenILS::Utils::CStoreEditor->new(xact=>1); + if ($editor->update_biblio_record_entry($record)) { + $editor->commit(); + } else { + $editor->rollback(); + } +} + +my $count = 0; +my $total = scalar(@records); +announce("processing $total bib records", 1); + +foreach my $rec_id (@records) { + $count++; + + announce("processing $count of $total", 1) if ($count % 1000) == 0; + + announce("processing bib record $rec_id [$count of $total]"); + + # State variable; was the record changed? + my $changed = 0; + + # get the record + my $record = $e->retrieve_biblio_record_entry($rec_id); + next unless $record && $record->deleted eq 'f'; + + eval { + my $marc = MARC::Record->new_from_xml($record->marc()); + + # get the list of controlled fields + my @c_fields = keys %controllees; + + foreach my $c_tag (@c_fields) { + my @c_subfields = keys %{$controllees{"$c_tag"}}; + + # Get the MARCXML from the record and check for controlled fields/subfields + my @bib_fields = ($marc->field($c_tag)); + foreach my $bib_field (@bib_fields) { + + my $sf0 = $bib_field->subfield('0') || ''; + my $is_fast_heading = is_fast_heading($bib_field); + + if ($is_fast_heading && $sf0 =~ /\)fst/) { + # fast heading looks OK. ignore it. + announce("Ignoring FAST heading field on ". 
+ "rec=$rec_id and tag=$c_tag \$0 $sf0"); + next; + } + + if ($sf0 && $refresh) { + announce("Removing \$0 $sf0 for rec=$rec_id and tag=$c_tag"); + $bib_field->delete_subfield(code => '0'); + $changed = 1; + } + + if ($is_fast_heading) { + # Update record after potentially removing a bogus + # fast heading above. + update_record($record, $marc) if $changed; + + # We don't control fast headings, so there's nothing + # left to do. Move on to the next field... + announce("No linking performed on FAST heading ". + "field on rec=$rec_id and tag=$c_tag"); + next; + } + + my %match_subfields; + my $match_tag; + my @searches; + foreach my $c_subfield (@c_subfields) { + my @sf_values = $bib_field->subfield($c_subfield); + if (@sf_values) { + # Give me the first element of the list of authority controlling tags for this subfield + # XXX Will we need to support more than one controlling tag per subfield? Probably. That + # will suck. Oh well, leave that up to Ole to implement. + $match_subfields{$c_subfield} = (keys %{$controllees{$c_tag}{$c_subfield}})[0]; + $match_tag = $match_subfields{$c_subfield}; + push @searches, map {{term => $_, subfield => $c_subfield}} @sf_values; + } + } + next if !$match_tag; + + announce("Searching for matches on controlled field $c_tag ". 
+ "(auth tag=$match_tag): \n - ".Dumper(\@searches)); + + my @tags = ($match_tag); + + # Now we've built up a complete set of matching controlled + # subfields for this particular field; let's check to see if + # we have a matching authority record + my $session = OpenSRF::AppSession->create("open-ils.search"); + my $validates = $session->request("open-ils.search.authority.validate.tag.id_list", + "tags", \@tags, "searches", \@searches + )->gather(); + $session->disconnect(); + + + # Protect against failed (error condition) search request + if (!$validates) { + print STDERR "Search for matching authority failed; record # $rec_id\n"; + next if (!$changed); + } + + announce("Match query returned @$validates"); + + # No matches found. Nothing left to do for this field. + next if scalar(@$validates) == 0; + + # Iterate through the returned authority record IDs to delete any + # matching $0 subfields already in the bib record + foreach my $auth_zero (@$validates) { + $bib_field->delete_subfield(code => '0', match => qr/\)$auth_zero$/); + } + + # Find the best authority record to use for linking. + + my $auth_leaders; + if ($bib_field->tag =~ /^[167]/) { + # For 1XX, 6XX, and 7XX bib fields, only link to + # authority records whose leader/008 positions + # 14 and 15 are coded to allow use as a name/author + # or subject record, depending. + + $auth_leaders = authority_leaders_008_14_15( + $e, $bib_field->tag, $validates); + + $validates = [map {$_->{record}} @$auth_leaders]; + } + + my $auth_id; + + if ($bib_field->tag() =~ /^65[015]/) { + # Using the indicator-2 value from the controlled bib + # field, find the first authority in the list of matches + # that uses the same thesaurus. If no such authority + # is found, no matching occurs. + # TODO: perhaps this step should be part of the + # validation API search call above. 
+ + $auth_id = find_matching_auth_for_thesaurus( + $e, $bib_field, $auth_leaders) || ''; + + } else { + + # For all other controlled fields, use the first + # authority record in the result set. + $auth_id = $validates->[0]; + } + + # Don't exit here just because we have no $auth_id. The + # bib field could have been changed above in the cleanup / + # delete phase. + + if ($auth_id) { + # Add the auth ID and control number agency info from the + # matching authority record to the controlled bib field. + $changed = 1; + $bib_field->add_subfields('0' => "($CNI)$auth_id"); + announce("auth=$auth_id cni=$CNI. It's a match!"); + } + } + } + + update_record($record, $marc) if $changed; + + }; + + if ($@) { + print STDERR "\nRecord # $rec_id : $@\n"; + import MARC::File::XML; # reset SAX parser so that one bad record doesn't kill the entire process + } +} + +my $end_time = localtime(); + +if($input_file) { + announce("Stop $end_time for ".scalar(@records)." records"); +} elsif($start_id) { + announce("Start $end_time for record range: $start_id => $end_id"); +} else { + announce("Start $end_time for all records"); +} + +__END__ + +=head1 NAME + +authority_control_fields.pl - Controls fields in bibliographic records with authorities in Evergreen + +=head1 SYNOPSIS + +C [B<--configuration>=I] [B<--refresh>] +[[B<--record>=I[ B<--record>=I]]] | [B<--all>] | [B<--start_id>=I B<--end_id>=I] + +=head1 DESCRIPTION + +For a given set of records: + +=over + +=item * Iterate through the list of fields that are controlled fields + +=item * Iterate through the list of subfields that are controlled for +that given field + +=item * Search for a matching authority record for that combination of +field + subfield(s) + +=over + +=item * If we find a match, then add a $0 subfield to that field identifying +the controlling authority record + +=item * If we do not find a match, then insert a row into an "uncontrolled" +table identifying the record ID, field, and subfield(s) that were not 
controlled + +=back + +=item * Iterate through the list of floating subdivisions + +=over + +=item * If we find a match, then add a $0 subfield to that field identifying +the controlling authority record + +=item * If we do not find a match, then insert a row into an "uncontrolled" +table identifying the record ID, field, and subfield(s) that were not controlled + +=back + +=item * If we changed the record, update it in the database + +=back + +=head1 OPTIONS + +=over + +=item * B<-f>, B<--file> + +Specifies a file of bibs ids to link. + +=item * B<-c> I, B<--configuration>=I + +Specifies the OpenSRF configuration file used to connect to the OpenSRF router. +Defaults to F + +=item * B<-r> I, B<--record>=I + +Specifies the bibliographic record ID (found in the C +column) of the record to process. This option may be specified more than once +to process multiple records in a single run. + +=item * B<-a>, B<--all> + +Specifies that all bibliographic records should be processed. For large +databases, this may take an extraordinarily long amount of time. + +=item * B<-r>, B<--refresh> + +Specifies that all authority links should be removed from the target +bibliographic record(s). This will effectively rewrite all authority +linking anew. + +=item * B<-s> I, B<--start_id>=I + +Specifies the starting ID of the range of bibliographic records to process. +This option is ignored unless it is accompanied by the B<-e> or B<--end_id> +option. + +=item * B<-e> I, B<--end_id>=I + +Specifies the ending ID of the range of bibliographic records to process. +This option is ignored unless it is accompanied by the B<-s> or B<--start> +option. + +=back + +=head1 EXAMPLES + + authority_control_fields.pl --start_id 1 --end_id 50000 + +Processes the bibliographic records with IDs between 1 and 50,000 using the +default OpenSRF configuration file for connection information. 
+ +=head1 AUTHOR + +Dan Scott + +=head1 COPYRIGHT AND LICENSE + +Copyright 2010-2011 by Dan Scott + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +=cut + diff --git a/KCLS/authority-control/linking/find-bibs-to-link.pl b/KCLS/authority-control/linking/find-bibs-to-link.pl new file mode 100755 index 0000000000..46a20bf298 --- /dev/null +++ b/KCLS/authority-control/linking/find-bibs-to-link.pl @@ -0,0 +1,169 @@ +#!/usr/bin/perl +# ---------------------------------------------------------------------- +# Find bib records matching the requested criteria for linking. +# Bib IDs are exported to one or more batch files for future processing. 
+# ---------------------------------------------------------------------- +use strict; +use warnings; +use DBI; +use Getopt::Long; +use DateTime; + +my $db_handle; +my $counter = 0; + +# options +my $help; +my $modified_since; +my $exported_since; +my $batch_size = 10000; +my $start_id; +my $end_id; +my $count_only; +my $out_dir = '/tmp'; +my $db_host = $ENV{PGHOST} || 'localhost'; +my $db_port = $ENV{PGPORT} || '5432'; +my $db_user = $ENV{PGUSER} || 'evergreen'; +my $db_name = $ENV{PGDATABASE} || 'evergreen'; +my $db_pass = $ENV{PGPASSWORD}; + +my $opt_result = GetOptions( + 'modified-since=s' => \$modified_since, + 'exported-since=s' => \$exported_since, + 'start-id=i' => \$start_id, + 'end-id=i' => \$end_id, + 'batch-size=i' => \$batch_size, + 'count-only' => \$count_only, + 'out-dir=s' => \$out_dir, + "db-host=s" => \$db_host, + "db-user=s" => \$db_user, + "db-pass=s" => \$db_pass, + "db-port=s" => \$db_port, + 'help' => \$help +); + +sub announce { + my $msg = shift; + print DateTime->now(time_zone => 'local')->strftime('%F %T')." $msg\n"; +} + +sub help { + print < + Limit bibs to those modified since the specified date. + + --exported-since + Limit bibs to those exported since the specified date. + Export date is based on data found in the + metabib.bib_export_data table. + + --start-id + Limit bibs to those whose ID is no less than + + --end-id + Limit bibs to those whose ID is no greater than + + --out-dir [/tmp] + Output directory. + + --batch-size + Number of bib IDs to write to each batch file. + + --count-only + Print the total number of records that would be added + to batch files without adding to any batch files. + + --db-host + --db-user + --db-pass + --db-port + Database connection params. PG environment variables are + also inspected for values. 
When all else fails, try to + connect to database evergreen\@localhost +HELP + exit 0; +} + +help() if $help || !$opt_result; + +sub connect_db { + $db_handle = DBI->connect( + "dbi:Pg:db=$db_name;host=$db_host;port=$db_port;options='--statement-timeout=0'", + $db_user, $db_pass, { + RaiseError => 1, + PrintError => 0, + AutoCommit => 1, + pg_expand_array => 0, + pg_enable_utf8 => 1 + } + ) or die "Connection to database failed: $DBI::err : $DBI::errstr"; +} + +connect_db(); + +# ---------------------------------------------------------------------- +my $from = 'FROM biblio.record_entry bre'; + +my $where = 'WHERE NOT bre.deleted'; +$where .= " AND bre.id >= $start_id" if $start_id; +$where .= " AND bre.id <= $end_id" if $end_id; + +if ($exported_since) { + $where .= " AND bed.export_date > '$exported_since'"; + $from .= " JOIN metabib.bib_export_data bed ON (bed.bib = bre.id)"; +} + +my $sql = <prepare($sql); +$sth->execute; + +my $batch_file; +sub open_batch_file { + my $path = shift; + announce("Starting new batch file: $path"); + + close $batch_file if $batch_file; + + open $batch_file, '>', $path or + die "Cannot open batch file for writing: $!\n"; +} + +my $ctr = 0; +my $batch = 0; +while (my $ref = $sth->fetchrow_hashref()) { + $ctr++; + next if $count_only; + + if (( ($ctr - 1) % $batch_size) == 0) { + my $path = sprintf("$out_dir/bib-ids.%0.3d", $batch); + open_batch_file($path); + $batch++; + } + + print $batch_file $ref->{id} . 
"\n"; +} + +close $batch_file if $batch_file; +$sth->finish; + +announce("Found $ctr bib records"); + diff --git a/KCLS/authority-control/linking/link-bib-batches.sh b/KCLS/authority-control/linking/link-bib-batches.sh new file mode 100755 index 0000000000..5be60ffa53 --- /dev/null +++ b/KCLS/authority-control/linking/link-bib-batches.sh @@ -0,0 +1,31 @@ +#!/bin/bash +set -eu +PROC_COUNT=3 +BATCHES_PER_PROC=3 +BATCH_DIR="/openils/var/data/linkbibs" + +DOW=$(date +%u); +[ $DOW == 7 ] && DOW=0; # make Sunday = 0 + +BATCH=$(echo "$PROC_COUNT * $BATCHES_PER_PROC * $DOW" | bc); + +echo "Starting at batch number $BATCH"; + +function execute_background_proc { + START=$1 + for batch in $(seq 1 $BATCHES_PER_PROC); do + FILE=$(printf "$BATCH_DIR/bib-ids.%0.3d" $((($START + $batch)))); + echo "Linking bib file $FILE" + perl ./authority_control_fields.pl --refresh --file $FILE + done; +} + +for PROC in $(seq 1 $PROC_COUNT); do + (execute_background_proc $BATCH) & + BATCH=$((($BATCH + BATCHES_PER_PROC))); +done + +wait; + +echo "Done processing all batches" + diff --git a/KCLS/authority-control/linking/link-new-auth-records.pl b/KCLS/authority-control/linking/link-new-auth-records.pl new file mode 100755 index 0000000000..c48939e30a --- /dev/null +++ b/KCLS/authority-control/linking/link-new-auth-records.pl @@ -0,0 +1,253 @@ +#!/usr/bin/perl +# ---------------------------------------------------------------------- +# Find authority records newer than a specified age. Once found, +# run each through the auth-to-auth linking process. Then locate +# bib records that we might want to link to the new records and +# pass them off to the bib-to-auth linker. 
+# ---------------------------------------------------------------------- +use strict; +use warnings; +use DBI; +use Getopt::Long; +use DateTime; +use Pod::Usage qw/pod2usage/; +use Time::HiRes qw/usleep/; + +my @auth_ids; +my @bib_ids; +my $counter = 0; + +# options +my $help; +my $modified_since; +my $max_auth_count; +my $start_auth_id; +my $print_auth_ids; +my $print_bib_ids; +my $link_auths; +my $link_bibs; +my $progress; +my $db_host = $ENV{PGHOST} || 'localhost'; +my $db_port = $ENV{PGPORT} || '5432'; +my $db_user = $ENV{PGDATABASE} || 'evergreen'; +my $db_pass = $ENV{PGPASSWORD}; + +my $opt_result = GetOptions( + 'modified-since=i' => \$modified_since, + 'max-auth-count=i' => \$max_auth_count, + 'start-auth-id=i' => \$start_auth_id, + 'print-bib-ids' => \$print_bib_ids, + 'print-auth-ids' => \$print_auth_ids, + 'link-bibs' => \$link_bibs, + 'link-auths' => \$link_auths, + 'progress' => \$progress, + "db-host=s" => \$db_host, + "db-user=s" => \$db_user, + "db-pass=s" => \$db_pass, + "db-port=s" => \$db_port, + 'help' => \$help +); + +sub announce { + my $msg = shift; + print DateTime->now(time_zone => 'local')->strftime('%F %T')." $msg\n"; +} + +sub help { + print < + Process authority records created or modified within the + last days. + + --max-auth-count + Process authority records in total. Use with + --start-auth-id to process batches of records across + multiple instances of the script. + + --start-auth-id + Process authority records whose ID is equal to or greater + than . Use with --max-auth-count to process batches + of records across multiple runs of the script. 
+ + --print-auth-ids + Print authority record IDs to process to STDOUT + + --print-bib-ids + Print bib record IDs to process to STDOUT + + --link-auths + Run identified authority records through authority_authority_linker.pl + + --link-bibs + Run identified bib records through authority_control_fields.pl + + --progress + Log linking progress to STDOUT + + --db-host + --db-user + --db-pass + --db-port + + Database connection params. PG environment variables are + also inspected for values. When all else fails, try to + connect to database evergreen\@localhost +HELP + exit 0; +} + +help() if $help || !$opt_result; + +my $dsn = "dbi:Pg:database=evergreen;host=$db_host;port=$db_port"; +my $dbh = DBI->connect($dsn, $db_user, $db_pass) + or die "Cannot connect to database: $dsn\n"; + +$dbh->do('SET statement_timeout = 0'); + +# ---------------------------------------------------------------------- +# Load the authority record IDs +my $where2 = $start_auth_id ? "AND id >= $start_auth_id" : ''; +my $limit = $max_auth_count ? "LIMIT $max_auth_count" : ''; + +my $sth = $dbh->prepare(<= DATE(NOW() - '$modified_since day'::INTERVAL) + $where2 + ORDER BY id + $limit +SQL + +$sth->execute; + +while (my $ref = $sth->fetchrow_hashref()) { + push(@auth_ids, $ref->{id}); +} +$sth->finish; + +my $auth_rec_count = scalar(@auth_ids); +print join("\n", @auth_ids) if $print_auth_ids; + +# Let the caller know what the last record processed will be, +# so the next iteration of the script can start there. +announce("Final auth ID will be: " . 
$auth_ids[-1]) if $max_auth_count; + +if (!@auth_ids) { + announce("No authority records edited in the last $modified_since days"); + exit 0; +} + +# ---------------------------------------------------------------------- +# Auth-to-Auth linking + +if ($link_auths) { + # Pass all new authority records to the auth-to-auth linker + for my $rec_id (@auth_ids) { + + system( + './authority_authority_linker.pl', + '--db-host', $db_host, + '--db-user', $db_user, + '--db-pass', ($db_pass || ''), + '--record', $rec_id + ); + + usleep(250000); # 1/4 second; allow ctrl-c to penetrate + announce("Auth records processed: $counter/$auth_rec_count") + if $progress && ++$counter % 10 == 0; + } +} +$counter = 0; + +# Exit if there is nothing left to do. +exit unless $print_bib_ids || $link_bibs; + +# ---------------------------------------------------------------------- +# Find bib records that we might want to link to the new authority +# record. +# +# Query: give me bib records that link to browse entries that also +# link to exactly one authority record, specifically the new authority +# records we are processing via this script. Only include bib records +# that are not already linked via bib_linking to said authority record. +# This represents the set of bib records that might need to be linked +# to our new authority records. +# ---------------------------------------------------------------------- +my %bib_ids; # de-dupe by record ID. 
+my $auth_ids_param = join(',', @auth_ids); + +for my $axis (qw/author subject series title/) { + my $query = <prepare($query); + $sth->execute; + while (my $ref = $sth->fetchrow_hashref()) { + $bib_ids{$ref->{bib_record}} = 1; # de-dupe + } + $sth->finish; +} + +@bib_ids = sort(keys(%bib_ids)); +my $bib_rec_count = scalar(@bib_ids); + +if ($link_bibs) { + for my $rec_id (@bib_ids) { + # fire off the linker for each of the records identied + system('./authority_control_fields.pl', + '--db-host', $db_host, + '--db-user', $db_user, + '--db-pass', ($db_pass || ''), + '--record', $rec_id, + '--refresh' + ); + + usleep(250000); # 1/4 second; allow ctrl-c to penetrate + announce("Bib records processed: $counter/$bib_rec_count") + if $progress && ++$counter % 10 == 0; + } +} + +print join("\n", @bib_ids) if $print_bib_ids; + diff --git a/KCLS/backstage/README.adoc b/KCLS/backstage/README.adoc deleted file mode 100644 index 0f834eaada..0000000000 --- a/KCLS/backstage/README.adoc +++ /dev/null @@ -1,127 +0,0 @@ -= Backstage Processes = - -Perform steps as 'opensrf' - -== Quarterly Export + Import == - -=== Setup === - -[source,sh] --------------------------------------------------------------------- -export EXPORT_DATE=2016-10-01 # for example -export WORKING_DIR=/openils/var/data/authority-control/backstage/quarterly/$EXPORT_DATE -export PGHOST=foo -export PGPASSWORD=foo -export PGUSER=evergreen -mkdir -p $WORKING_DIR --------------------------------------------------------------------- - -=== Exporting Bib Records === - -Bibs are exported as MARC and uploaded to Backstage - -==== Generate Export MARC File ==== - -[source,sh] --------------------------------------------------------------------- -./export-bibs.pl \ - --start-date 2010-01-01 \ - --end-date 2016-06-01 \ - --export-date $EXPORT_DATE \ - --out-file $WORKING_DIR/exported-bibs.$EXPORT_DATE.mrc - -# Send file to BS FTP server in(bound) directory. 
--------------------------------------------------------------------- - -=== Process Results === - -==== Fetch Results ==== - -[source,sh] --------------------------------------------------------------------- -cd $WORKING_DIR -wget --------------------------------------------------------------------- - -==== Process Results Files ==== - -* Import new and modified authority records. -* Import modified bib records. - -[source,sh] --------------------------------------------------------------------- -cd /home/opensrf/Evergreen/KCLS/backstage/ - -./process-backstage-files.pl \ - --verbose \ - --export-date $EXPORT_DATE \ - --file $WORKING_DIR/ \ - --working-dir $WORKING_DIR \ - --bib-collision-file bib-collisions.mrc \ - > $WORKING_DIR/process.log --------------------------------------------------------------------- - -==== Process Bib Collisions ==== - -Bib records that were locally modified during Backstage processing are -re-imported without clobbering the modifications. - -1. Create a new queue for this batch (rename to suit). - -[source,sh] --------------------------------------------------------------------- -INSERT INTO vandelay.bib_queue (owner, name) - VALUES (1, 'Backstage Q3 2016'); -SELECT id FROM vandelay.bib_queue WHERE name = 'Backstage Q3 2016'; --------------------------------------------------------------------- - -2. Import bib collisions via stream importer. - -[source,sh] --------------------------------------------------------------------- -# Make a copy of the collisions file for safe keeping, -# since open-ils.vandelay deletes spool files. 
-cp $WORKING_DIR/bib-collisions.mrc $WORKING_DIR/bib-collisions.bak.mrc - -cd /openils/bin -./marc_stream_importer.pl \ - --spoolfile $WORKING_DIR/bib-collisions.mrc \ - --user admin \ - --password XXX \ - --bib-auto-overlay-exact \ - --queue \ - --merge-profile 104 # "Backstage Field Protection" --------------------------------------------------------------------- - -==== Get auth/bib IDs that require re-linking ==== - -[source,sh] --------------------------------------------------------------------- -cd /home/opensrf/Evergreen/KCLS/linking/ - -./link-new-auth-records.pl --modified-since \ - --progress --print-auth-ids \ - > $WORKING_DIR/auths-to-link.txt - -./link-new-auth-records.pl --modified-since \ - --progress --print-bib-ids \ - > $WORKING_DIR/bibs-to-link.txt --------------------------------------------------------------------- - -==== Re-Link Modified Auths and Bibs ==== - -[source,sh] --------------------------------------------------------------------- -cd /home/opensrf/Evergreen/KCLS/authority-control/linking/ - -./authority_authority_linker.pl --verbose \ - --file $WORKING_DIR/auths-to-link.txt \ - | tee -a $WORKING_DIR/auth2auth-linking.log - -# Bib linking takes many hours, sometimes days. - -./authority_control_fields.pl --verbose --refresh \ - --file $WORKING_DIR/bibs-to-link.txt \ - | tee -a $WORKING_DIR/bib-linking.log --------------------------------------------------------------------- - diff --git a/KCLS/backstage/export-bibs.pl b/KCLS/backstage/export-bibs.pl deleted file mode 100755 index 3858a31d6f..0000000000 --- a/KCLS/backstage/export-bibs.pl +++ /dev/null @@ -1,256 +0,0 @@ -#!/usr/bin/env perl -# ----------------------------------------------------------------------- -# Export bib records for Backstage processing. -# -# The UTF-8 encoded USMARC string for each record is printed to STDOUT. -# Each exported bib has its export_date value updated to NOW(). -# -# Exported bibs meet the following criteria: -# -# 1. Delete flag must be false. 
-# 2. Record cannot contain any 086, 092, or 099 tags containing the phrase 'on order' -# 3. Boolean filter: -# [ (001_test OR 035_test) AND has_holdings AND cat_date_in_range ] -# OR -# [ 998_test AND create_date_in_range ] -# ----------------------------------------------------------------------- -use strict; -use warnings; -use DBI; -use Getopt::Long; -use MARC::Record; -use MARC::File::XML (BinaryEncoding => 'UTF-8'); - -my $db_handle; - -my $start_date; -my $end_date; -my $export_date; -my $ids_only; -my $count_only; -my $out_file; -my $limit; -my $db_user = $ENV{PGUSER} || 'evergreen'; -my $db_name = $ENV{PGDATABASE} || 'evergreen'; -my $db_host = $ENV{PGHOST} || 'localhost'; -my $db_port = $ENV{PGPORT} || '5432'; -my $db_pass = $ENV{PGPASSWORD}; -my $help; - -GetOptions( - 'start-date=s' => \$start_date, - 'end-date=s' => \$end_date, - 'export-date=s' => \$export_date, - 'ids-only' => \$ids_only, - 'count-only' => \$count_only, - 'out-file=s' => \$out_file, - 'limit=f' => \$limit, - 'db-user=s' => \$db_user, - 'db-host=s' => \$db_host, - 'db-name=s' => \$db_name, - 'db-port=i' => \$db_port, - 'db-pass=s' => \$db_pass, - 'help' => \$help -); - -sub help { - print < - --end-date - Export bib records whose cataloging_date (for physical records) or - create_date (for electronic records) value is between the provided - start and end dates. - - --export-date - Sets the export date to the provided value. If no --export-date - value is set, no export date value will be applied in the database. - - --out-file - Write MARC records (or IDs) to this file. - - --ids-only - Write bib record IDs to the output file instead of the full MARC - record. - - --count-only - Only print the number of bibs that would be exported to STDOUT. - - --limit - Export at most this many records. 
- -HELP - exit; -} - -die "--start-date and --end-date required\n" - unless $start_date && $end_date; - -die "Invalid date format\n" unless - $start_date =~ /^\d{4}-\d{2}-\d{2}$/ && - $end_date =~ /^\d{4}-\d{2}-\d{2}$/ && - (!$export_date || $export_date =~ /^\d{4}-\d{2}-\d{2}$/); - -die "--out-file required\n" unless $out_file || $count_only; - -sub bib_query { - my $sql = <$out_file") - or die "Cannot open file for writing: $out_file\n"; - binmode(MARCFILE, ':utf8'); - } - - my $sth = $db_handle->prepare(bib_query()); - my $edate_sth = $db_handle->prepare( - 'SELECT * FROM metabib.set_export_date(?, ?)'); - - $sth->execute; - my $count = 0; - while (my $bib = $sth->fetchrow_hashref) { - $count++; - next if $count_only; - - my $bib_id = $bib->{id}; - - if ($ids_only) { - print MARCFILE "$bib_id\n"; - print "$count records written...\n" if ($count % 1000) == 0; - next; - } - - my $rec = $db_handle->selectall_arrayref( - "SELECT marc FROM biblio.record_entry WHERE id = $bib_id"); - - my $marc = $rec->[0]->[0]; - my $marcdoc = MARC::Record->new_from_xml($marc, 'UTF-8', 'USMARC'); - - print MARCFILE $marcdoc->as_usmarc; - - print "$count records written...\n" if ($count % 1000) == 0; - - next unless $export_date; - - # Update the bib record's metabib.bib_export_data entry. - eval { $edate_sth->execute($bib_id, $export_date) }; - die "Error setting export date for bib ". 
- "$bib_id to $export_date : $@\n" if $@; - } - - close(MARCFILE) if $out_file; - - print "$count total bib records\n"; - - $sth->finish; - $edate_sth->finish; -} - -sub connect_db { - $db_handle = DBI->connect( - "dbi:Pg:db=$db_name;host=$db_host;port=$db_port;options='--statement-timeout=0'", - $db_user, $db_pass, { - RaiseError => 1, - PrintError => 0, - AutoCommit => 1, - pg_expand_array => 0, - pg_enable_utf8 => 1 - } - ) or die "Connection to database failed: $DBI::err : $DBI::errstr"; -} - -connect_db(); -export_marc(); - -$db_handle->disconnect; - diff --git a/KCLS/backstage/process-backstage-files.pl b/KCLS/backstage/process-backstage-files.pl deleted file mode 100755 index 364461e281..0000000000 --- a/KCLS/backstage/process-backstage-files.pl +++ /dev/null @@ -1,504 +0,0 @@ -#!/usr/bin/env perl -# ----------------------------------------------------------------------- -# TODO: summary -# -# TODO: -# Disable auth record change propagation during auth record updates. -# ----------------------------------------------------------------------- -use strict; -use warnings; -use DBI; -use DateTime; -use Getopt::Long; -use MARC::Record; -use MARC::File::XML (BinaryEncoding => 'UTF-8'); -use MARC::File::USMARC; -use Archive::Zip qw(:ERROR_CODES :CONSTANTS); -use File::Basename; -use Sys::Syslog qw(syslog openlog); -use OpenILS::Utils::Normalize qw(clean_marc); -binmode(STDOUT, ':utf8'); - -my $db_handle; -my $log_mod = 500; # log every 500th of each type of event (see verbose) - -my $file; -my $export_date; -my $working_dir = '.', -my $bib_collision_file; -my $verbose; -my $db_user = $ENV{PGUSER} || 'evergreen'; -my $db_name = $ENV{PGDATABASE} || 'evergreen'; -my $db_host = $ENV{PGHOST} || 'localhost'; -my $db_port = $ENV{PGPORT} || '5432'; -my $db_pass = $ENV{PGPASSWORD}; - -my $syslog_facility = 'LOCAL6'; # matches Evergreen gateway -my $syslog_ops = 'pid'; -my $syslog_ident = 'BACKSTAGE'; - -my $new_auth_sth; -my $mod_auth_sth; -my $del_auth_sth; -my 
$delmod_auth_sth; -my $mod_bibs_sth; -my $match_auth_sth; -my $match_auth_001_sth; -my $new_auth_ctr = 0; -my $mod_auth_ctr = 0; -my $del_auth_ctr = 0; -my $mod_bibs_ctr = 0; -my $col_bibs_ctr = 0; - -my $help; - -GetOptions( - 'file=s' => \$file, - 'export-date=s' => \$export_date, - 'working-dir=s' => \$working_dir, - 'bib-collision-file=s' => \$bib_collision_file, - 'verbose' => \$verbose, - 'db-user=s' => \$db_user, - 'db-host=s' => \$db_host, - 'db-name=s' => \$db_name, - 'db-port=i' => \$db_port, - 'db-pass=s' => \$db_pass, - 'help' => \$help -); - -sub help { - print <now(time_zone => 'local')->strftime('%F %T'); - my $msg_str = "$date_str [$$] $level $msg\n"; - - if ($die) { - die $msg_str; - - } else { - if ($level eq 'ERR' or $level eq 'WARNING') { - # always copy problem messages to stdout - warn $msg_str; # avoid dupe messages - } else { - print $msg_str; - } - } -} - - -sub connect_db { - $db_handle = DBI->connect( - "dbi:Pg:db=$db_name;host=$db_host;port=$db_port;options='--statement-timeout=0'", - $db_user, $db_pass, { - RaiseError => 1, - PrintError => 0, - AutoCommit => 1, - pg_expand_array => 0, - pg_enable_utf8 => 1 - } - ) or die "Connection to database failed: $DBI::err : $DBI::errstr"; -} - -sub process_zip_file { - - my $zip = Archive::Zip->new(); - - announce('ERR', "Failed to read $file", 1) - unless $zip->read($file) == AZ_OK; - - # Avoid processing XLS and HTM files. - # All of the MARC files end in .UTF8. 
- for my $member ($zip->membersMatching('.*(\.UTF8|\.MRC)')) { - - my $basename = basename($member->fileName()); - - announce('INFO', "Extracting file $basename"); - - my $local_file = "$working_dir/$basename"; - - announce('ERR', "Unable to extract to file: $local_file", 1) - unless $member->extractToFileNamed($local_file) == AZ_OK; - - my $marc_batch = MARC::File::USMARC->in($local_file, 'UTF8') - or announce('ERR', "Unable to read $local_file as MARC", 1); - - if ($basename =~ /BIB/) { - - handle_modified_bibs($marc_batch); - - } elsif ($basename =~ /DEL/) { - - handle_deleted_auths($marc_batch); - - } elsif ($basename =~ /CHG|NEW|AUTH/) { - - handle_modified_auths($marc_batch); - - } else { - announce('WARNING', "Un-handled file type: $basename"); - } - } -} - -# Returns ID's of bib records that have been modified since the export date. -my @modified_bibs; -my $mod_searched = 0; -sub find_modified_bibs { - - return if $mod_searched; - $mod_searched = 1; - - my $id_arrays = $db_handle->selectall_arrayref(<<" SQL"); - SELECT id - FROM biblio.record_entry - WHERE NOT deleted AND edit_date >= '$export_date' - SQL - - @modified_bibs = map {$_->[0]} @$id_arrays; - - announce('INFO', scalar(@modified_bibs)." bibs modified since export"); -} - - - -# 1. Bibs that have been modified by Backstage and locally are written -# to the --bib-collision-file as MARC for later processing. -# 2. Bibs that have only been modified by Backstage are updated -# directly in the database. -sub handle_modified_bibs { - my $marc_batch = shift; - - find_modified_bibs(); - - while (my $record = $marc_batch->next()) { - my $bib_id = $record->subfield('901', 'c'); - - if (!$bib_id) { - announce('ERR', "Bib record has no 901c (ID) value. Skipping"); - next; - } - - if (grep {$bib_id eq $_} @modified_bibs) { - # Bib was edited by both parties. Save to external file - # for later processing. - - write_bib_collision($record); - - } else { - # Update our copy of the record. 
- - my $marcxml = clean_marc($record->as_xml_record()); - update_bib($marcxml, $bib_id); - } - } -} - -sub update_bib { - my $marcxml = shift; - my $bib_id = shift; - - eval { $mod_bibs_sth->execute($marcxml, $bib_id) }; - - if ($@) { - announce('ERR', "Error updating biblio record: $@ : $marcxml"); - return; - } - - $mod_bibs_ctr++; - - announce('INFO', "Updated $mod_bibs_ctr bib records") - if $mod_bibs_ctr % $log_mod == 0; -} - -sub write_bib_collision { - my $record = shift; - - my $filename = "$working_dir/$bib_collision_file"; - - open(BIBS_FILE, ">>$filename") or - announce('ERR', "Cannot open bib collision file: $filename : $!", 1); - - binmode(BIBS_FILE, ":utf8"); - - print BIBS_FILE $record->as_usmarc(); - - close BIBS_FILE or - announce('WARNING', "Error closing bib collision file: $filename : $!"); - - $col_bibs_ctr++; - - announce('INFO', "Dumped $col_bibs_ctr bib collisions to file") - if $col_bibs_ctr % $log_mod == 0; -} - -sub handle_deleted_auths { - my $marc_batch = shift; - - while (my $record = $marc_batch->next()) { - my @matches = find_matching_auths($record); - - for my $auth_id (@matches) { - - eval { - # 2 mods.. wrap in transaction? 
(see autocommit) - $del_auth_sth->execute($auth_id); - $delmod_auth_sth->execute($auth_id); - }; - - if ($@) { - announce( - 'ERR', "Error deleting authority record: $@ : $auth_id"); - next; - } - - $del_auth_ctr++; - - announce('INFO', "Deleted $del_auth_ctr authority records") - if $del_auth_ctr % $log_mod == 0; - } - } -} - -sub handle_modified_auths { - my $marc_batch = shift; - - while (my $record = $marc_batch->next()) { - - modify_auth_005($record); - - my @matches = find_matching_auths($record); - push(@matches, find_replaced_auths($record)); - - my $marcxml = clean_marc($record->as_xml_record()); - - if (@matches) { - update_auth($marcxml, $_) for @matches; - } else { - insert_auth($marcxml); - } - } -} - -# Update the 005 field to the current date -sub modify_auth_005 { - my $record = shift; - my $field_005 = $record->field('005'); - - # MARC 005-formatted date value - my $now_date = DateTime->now( - time_zone => 'local')->strftime('%Y%m%d%H%M%S.0'); - - if ($field_005) { - $field_005->update($now_date); - - } else { - $field_005 = MARC::Field->new('005', $now_date); - $record->insert_fields_ordered($field_005); - } -} - - -sub update_auth { - my $marcxml = shift; - my $auth_id = shift; - - eval { $mod_auth_sth->execute($marcxml, $auth_id) }; - - if ($@) { - announce('ERR', "Error updating authority record: $@ : $marcxml"); - return; - } - - $mod_auth_ctr++; - - announce('INFO', "Updated $mod_auth_ctr authority records") - if $mod_auth_ctr % $log_mod == 0; -} - -sub insert_auth { - my $marcxml = shift; - - eval { $new_auth_sth->execute($marcxml, "IMPORT-" . time) }; - - if ($@) { - announce('ERR', - "Error creating new authority record: $@ : $marcxml"); - return; - } - - $new_auth_ctr++; - - announce('INFO', "Created $new_auth_ctr authority records") - if $new_auth_ctr % $log_mod == 0; -} - -# Return ID's of authority records that should be replaced by the -# current record. Checks for records whose 010$a equals the 010$z of -# the current record. 
-# 010$z == Canceled/invalid LC control number -sub find_replaced_auths { - my $record = shift; - - my $subfield = $record->subfield('010', 'z'); - return () unless $subfield; - - $match_auth_sth->execute('010', $subfield); - my $matches = $match_auth_sth->fetchall_arrayref; - my @ids = map {$_->[0]} @$matches; - - announce('INFO', "Auth 010z=$subfield matched records: @ids") if @ids; - - return @ids; -} - -# Return ID's of matching authority records. Matching tries: -# 001 -> 010a -> 035a. -sub find_matching_auths { - my $record = shift; - - my $tag = '001'; - my $subfield; - - # 001 test requires its own SQL query - if (my $field = $record->field($tag)) { - if ($subfield = $field->data) { - - $match_auth_001_sth->execute($subfield); - my $matches = $match_auth_001_sth->fetchall_arrayref; - my @ids = map {$_->[0]} @$matches; - announce('INFO', - "Auth 001=$subfield matched records: @ids") if @ids; - return @ids; - } - } - - $tag = '010'; - $subfield = $record->subfield($tag, 'a'); - - if (!$subfield) { - $tag = '035'; - $subfield = $record->subfield($tag, 'a'); - } - - return () unless $subfield; - - $match_auth_sth->execute($tag, $subfield); - my $matches = $match_auth_sth->fetchall_arrayref; - - my @ids = map {$_->[0]} @$matches; - announce('INFO', "Auth ${tag}a=$subfield matched records: @ids") if @ids; - - return @ids; -} - -sub prepare_statements { - - $del_auth_sth = $db_handle->prepare(<<" SQL"); - DELETE FROM authority.record_entry WHERE id = ? - SQL - - $delmod_auth_sth = $db_handle->prepare(<<" SQL"); - UPDATE authority.record_entry - SET edit_date = NOW() WHERE id = ? - SQL - - $mod_bibs_sth = $db_handle->prepare(<<" SQL"); - UPDATE biblio.record_entry - SET marc = ?, edit_date = NOW() - WHERE id = ? - SQL - - $mod_auth_sth = $db_handle->prepare(<<" SQL"); - UPDATE authority.record_entry - SET marc = ?, edit_date = NOW() - WHERE id = ? 
- SQL - - $new_auth_sth = $db_handle->prepare(<<" SQL"); - INSERT INTO authority.record_entry (marc, last_xact_id) - VALUES (?, ?) - SQL - - $match_auth_sth = $db_handle->prepare(<<" SQL"); - SELECT DISTINCT(rec.id) - FROM authority.record_entry rec - JOIN authority.full_rec frec ON (frec.record = rec.id) - WHERE - NOT rec.deleted - AND frec.tag = ? - AND frec.subfield = 'a' - AND frec.value = NACO_NORMALIZE(?, 'a') - SQL - - $match_auth_001_sth = $db_handle->prepare(<<" SQL"); - SELECT DISTINCT(rec.id) - FROM authority.record_entry rec - JOIN authority.full_rec frec ON (frec.record = rec.id) - WHERE - NOT rec.deleted - AND frec.tag = '001' - AND frec.value = ? - SQL -} - -openlog($syslog_ident, $syslog_ops, $syslog_facility); -connect_db(); -prepare_statements(); -process_zip_file(); - -$new_auth_sth->finish; -$mod_auth_sth->finish; -$del_auth_sth->finish; -$delmod_auth_sth->finish; -$match_auth_sth->finish; -$match_auth_001_sth->finish; -$mod_bibs_sth->finish; - -$db_handle->disconnect; - diff --git a/KCLS/linking/authority_authority_linker.pl b/KCLS/linking/authority_authority_linker.pl deleted file mode 100755 index 2c134e74a4..0000000000 --- a/KCLS/linking/authority_authority_linker.pl +++ /dev/null @@ -1,385 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use DBI; -use Getopt::Long; -use MARC::Record; -use MARC::File::XML (BinaryEncoding => 'UTF-8'); -use MARC::Charset; -use OpenSRF::System; -use OpenILS::Utils::Fieldmapper; -use OpenSRF::Utils::SettingsClient; -use OpenSRF::EX qw/:try/; -use Encode; -use Unicode::Normalize; -use OpenILS::Utils::Normalize; -use Data::Dumper; -use Pod::Usage qw/ pod2usage /; - -$ENV{OSRF_LOG_CLIENT} = 1; - -$Data::Dumper::Indent = 0; -MARC::Charset->assume_unicode(1); - -my $acsaf_cache = {}; - -sub get_acsaf { - my ($e, $id) = @_; - - $acsaf_cache->{$id} ||= - $e->retrieve_authority_control_set_authority_field([ - $id, - {flesh => 1, flesh_fields => {acsaf => ["main_entry"]}} - ]); - return $acsaf_cache->{$id}; -} - 
-sub matchable_string { - my ($field, $sf_list, $joiner) = @_; - $joiner ||= ' '; - - return join($joiner, map { $field->subfield($_) } split "", $sf_list); -} - -# ########### main -my ($start_id, $end_id); -my $bootstrap = '/openils/conf/opensrf_core.xml'; -my @records; -my $verbose; -my $input_file =''; -my $db_host = $ENV{PGHOST} || 'localhost'; -my $db_port = $ENV{PGPORT} || '5432'; -my $db_user = $ENV{PGDATABASE} || 'evergreen'; -my $db_pass = $ENV{PGPASSWORD}; -my $links_removed = 0; -my $links_added = 0; -my $CNI = 'KCLS'; - -my %options; -my $result = GetOptions( - \%options, - 'configuration=s' => \$bootstrap, - 'record=i' => \@records, - 'all', 'help', - 'start_id=i' => \$start_id, - 'end_id=i' => \$end_id, - 'file=s' => \$input_file, - 'verbose' => \$verbose, - "db-host=s" => \$db_host, - "db-user=s" => \$db_user, - "db-pass=s" => \$db_pass, - "db-port=s" => \$db_port -); - -sub announce { - my $msg = shift; - return unless $verbose; - print DateTime->now(time_zone => 'local')->strftime('%F %T') . 
" $msg\n"; -} - -pod2usage(0) if not $result or $options{help}; - -OpenSRF::System->bootstrap_client(config_file => $bootstrap); -Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); - -# must be loaded and initialized after the IDL is parsed - -use OpenILS::Utils::CStoreEditor; -OpenILS::Utils::CStoreEditor::init(); - -my $e = OpenILS::Utils::CStoreEditor->new; - -my $query = q{ - SELECT - source, - ARRAY_TO_STRING(ARRAY_AGG(target || ',' || field), ';') AS links - FROM ( - SELECT sh1.record AS target, - sh2.record AS source, - sh2.atag AS field - FROM authority.simple_heading sh1 - JOIN authority.simple_heading sh2 USING (sort_value) - JOIN authority.control_set_authority_field af1 ON - (sh1.atag = af1.id AND af1.main_entry IS NULL) - JOIN authority.control_set_authority_field af2 ON - (sh2.atag = af2.id AND af2.main_entry IS NOT NULL - AND af2.linking_subfield IS NOT NULL) - %s -- where clause here - -- Ignore authority.authority_linking rows since we want to - -- rebuild all links, which may mean deleting bogus links. - -- EXCEPT SELECT target, source, field FROM authority.authority_linking - -- order by source for consistent testing - ) x GROUP BY 1 ORDER BY source -}; - -my @bind_params; -if (@records) { - $query = sprintf($query, "WHERE sh2.record = ?"); - @bind_params = @records; # should be just one scalar in this array. -} elsif ($options{all}) { - $query = sprintf($query, ""); # no where clause -} elsif ($start_id and $end_id) { - $query = sprintf($query, "WHERE sh2.record BETWEEN ? AND ?"); - @bind_params = ($start_id, $end_id); - -} elsif ($input_file) { - # Load authority record IDs from a file. - announce("Reading authority record IDs from $input_file"); - - open FILE, "<", $input_file or die "Can't open file $input_file\n"; - while() { - chomp; - push(@records, $_) if $_; - } - close FILE; - - announce("Read ".scalar(@records)." from $input_file"); - - # No bind_params needed. 
- my $recstr = join(',', @records); - $query = sprintf($query, "WHERE sh2.record IN ($recstr)"); - -} else { - pod2usage(0); -} - -announce("SQL, params: ", Dumper($query, \@bind_params)); - -my $dsn = "dbi:Pg:database=evergreen;host=$db_host;port=$db_port"; -my $dbh = DBI->connect($dsn, $db_user, $db_pass); -$dbh->do('SET statement_timeout = 0'); - -my $sth = $dbh->prepare($query); - -announce("Executing query ..."); -$sth->execute(@bind_params); - -my $problems = 0; - -if (@records) { - announce("Processing ".scalar(@records)." records"); -} elsif ($start_id) { - announce("Processing records $start_id => $end_id"); -} else { - announce("Processing all records"); -} - -my $total_records = 0; -while (my ($src, $links) = $sth->fetchrow_array) { - announce("--------------------"); - announce("Processing authority source record $src"); - $total_records++; - - try { - - my $src_rec = $e->retrieve_authority_record_entry($src); - if (!$src_rec) { - warn "Unable to load source authority record $src\n"; - next; - } - - my $src_marc = MARC::Record->new_from_xml($src_rec->marc); - my $auth_src_thesaurus = substr($src_marc->field('008')->data(), 11, 1); - announce("Source record thesaurus value=$auth_src_thesaurus"); - - my $changed = 0; - my %tags_seen; - for my $link (split ';', $links) { - my ($target, $field_id) = split ',', $link; - - next if $target eq $src_rec->id; - - announce("Target: $target, field_id: $field_id"); - - my $target_rec = $e->retrieve_authority_record_entry($target); - if (!$target_rec) { - warn "Unable to load authority record $target. Skipping\n"; - next; - } - - my $target_marc = MARC::Record->new_from_xml($target_rec->marc); - - my $auth_target_thesaurus = - substr($target_marc->field('008')->data(), 11, 1); - - announce("Target record thesaurus value=$auth_target_thesaurus"); - - if ($auth_src_thesaurus ne $auth_target_thesaurus) { - announce("Thesauri for source/target records do not match. 
Skipping.."); - next; - } - - my $acsaf = get_acsaf($e, $field_id); - if (!$acsaf) { - warn "No authority control set field found for $field_id. Skipping\n"; - next; - } - - if (!$tags_seen{$acsaf->tag}) { - # the first time we process each tag for a given record, - # remove all existing auth-to-auth link subfields - # so they can be completely rebuilt. - for my $field ($src_marc->field($acsaf->tag)) { - if (my $val = $field->subfield('0')) { - announce("Removing existing subfield 0 : $val"); - $field->delete_subfield(code => '0'); - $changed = 1; - $links_removed++; - } - } - $tags_seen{$acsaf->tag} = 1; - } - - # rebuild the links for the current tag - for my $field ($src_marc->field($acsaf->tag)) { - - my $src_string = matchable_string( - $field, $acsaf->main_entry->display_sf_list, - $acsaf->main_entry->joiner - ); - - announce(sprintf( - "At field id=%s (tag=%s) / trying to match '%s'", - $acsaf->id, $acsaf->tag, $src_string)); - - my ($tfield) = $target_marc->field($acsaf->main_entry->tag); - - if(defined $tfield) { - - my $target_string = matchable_string( - $tfield, $acsaf->main_entry->display_sf_list, - $acsaf->main_entry->joiner - ); - - if ($target_string eq $src_string) { - announce("Got a match"); - $field->update('0' => "($CNI)$target"); - $changed = 1; - $links_added++; - } - } - } - } - - if ($changed) { - announce("Updating authority record ".$src_rec->id); - $src_rec->marc(OpenILS::Utils::Normalize::clean_marc($src_marc)); - $e->xact_begin; - $e->update_authority_record_entry($src_rec) or die $e->die_event; - $e->xact_commit; - } - - } otherwise { - my $err = shift; - print STDERR "\nRecord # $src : ", - (ref $err eq "HASH" ? Dumper($err) : $err), "\n"; - - # Reset SAX parser so that one bad record doesn't - # kill the entire process. - - import MARC::File::XML; - $problems++; - } -} - -if (@records) { - announce("Processed records ". scalar(@records). 
- " records; processed=$total_records problems=$problems"); -} elsif ($start_id) { - announce("Processed records $start_id => $end_id; ". - "processed=$total_records; problems=$problems"); -} else { - announce("Processed all records; processed=$total_records; problems=$problems"); -} - -announce("links removed: $links_removed"); -announce("links added: $links_added"); -announce("delta added: ".($links_added - $links_removed)); - -exit ($problems > 0); - -__END__ - -=head1 NAME - -authority_authority_linker.pl - Link reference headings in authority records to main entry headings in other authority records - -=head1 SYNOPSIS - -authority_authority_linker.pl [B<--configuration>=I] -[[B<--record>=I[ B<--record>=I]]] | [B<--all>] | [B<--start_id>=I B<--end_id>=I] - -=head1 DESCRIPTION - -For a given set of records, find authority reference headings that also -appear as main entry headings in any other authority record. In the -specific MARC field of the authority record (source) containing the reference -heading with such a match in another authority record (target), add a subfield -0 (zero) referring to the target record by ID. - -=head1 OPTIONS - -=over - -=item * B<-c> I, B<--configuration>=I - -Specifies the OpenSRF configuration file used to connect to the OpenSRF router. -Defaults to F - -=item * B<-r> I, B<--record>=I - -Specifies the authority record ID (found in the C -column) of the B record to process. This option may be specified more -than once to process multiple records in a single run. - -=item * B<-a>, B<--all> - -Specifies that all authority records should be processed. For large -databases, this may take an extraordinarily long amount of time. - -=item * B<-s> I, B<--start_id>=I - -Specifies the starting ID of the range of authority records to process. -This option is ignored unless it is accompanied by the B<-e> or B<--end_id> -option. - -=item * B<-e> I, B<--end_id>=I - -Specifies the ending ID of the range of authority records to process. 
-This option is ignored unless it is accompanied by the B<-s> or B<--start> -option. - -=back - -=head1 EXAMPLES - - authority_authority_linker.pl --start_id 1 --end_id 50000 - -Processes the authority records with IDs between 1 and 50,000 using the -default OpenSRF configuration file for connection information. - -=head1 AUTHOR - -Lebbeous Fogle-Weekley - -=head1 COPYRIGHT AND LICENSE - -Copyright (C) 2013 Equinox Software, Inc. - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA -02110-1301, USA. - -=cut diff --git a/KCLS/linking/authority_control_fields.pl b/KCLS/linking/authority_control_fields.pl deleted file mode 100755 index a560384336..0000000000 --- a/KCLS/linking/authority_control_fields.pl +++ /dev/null @@ -1,1013 +0,0 @@ -#!/usr/bin/perl -# Copyright (C) 2010-2011 Laurentian University -# Author: Dan Scott -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. 
-# --------------------------------------------------------------- - -use strict; -use warnings; -use DBI; -use Getopt::Long; -use MARC::Record; -use MARC::File::XML (BinaryEncoding => 'UTF-8'); -use MARC::Charset; -use OpenSRF::System; -use OpenILS::Utils::Fieldmapper; -use OpenSRF::Utils::SettingsClient; -use Encode; -use Unicode::Normalize; -use OpenILS::Application::AppUtils; -use Data::Dumper; -use Pod::Usage qw/ pod2usage /; -use DateTime; - -$ENV{OSRF_LOG_CLIENT} = 1; - -$Data::Dumper::Indent = 0; -MARC::Charset->assume_unicode(1); - -my $start_id; -my $end_id; -my $refresh; -my $days_back; # days; 0 means today only -my $modified_since; # days; 0 means today only -my $input_file =''; -my $bootstrap = '/openils/conf/opensrf_core.xml'; -my @records; -my $verbose = 0; -my $sort_desc = 0; -my $db_host = $ENV{PGHOST} || 'localhost'; -my $db_port = $ENV{PGPORT} || '5432'; -my $db_user = $ENV{PGDATABASE} || 'evergreen'; -my $db_pass = $ENV{PGPASSWORD}; -my $CNI = 'KCLS'; - -my %options; -my $result = GetOptions( - \%options, - 'configuration=s' => \$bootstrap, - 'record=i' => \@records, - 'refresh' => \$refresh, - 'start-id=i' => \$start_id, - 'end-id=i' => \$end_id, - 'days-back=i' => \$days_back, - 'modified-since=i' => \$modified_since, - 'sort-desc' => \$sort_desc, - 'file=s' => \$input_file, - 'verbose' => \$verbose, - "db-host=s" => \$db_host, - "db-user=s" => \$db_user, - "db-pass=s" => \$db_pass, - "db-port=s" => \$db_port, - 'all', # now assumed to be true when --file is unset - 'help' -); - -sub announce { - my $msg = shift; - my $force = shift; - return unless $force || $verbose; - print DateTime->now->strftime('%F %T') . 
" [$$] $msg\n"; -} - -if (!$result or $options{help}) { - pod2usage(0); -} - -OpenSRF::System->bootstrap_client(config_file => $bootstrap); -Fieldmapper->import(IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL")); - -# must be loaded and initialized after the IDL is parsed -use OpenILS::Utils::CStoreEditor; -OpenILS::Utils::CStoreEditor::init(); - -my $e = OpenILS::Utils::CStoreEditor->new; - -if ($input_file) { - open FILE, "<", $input_file or die "Can't open file " . $input_file; - while() { - chomp; - if($_) { - push(@records, $_); - } - } - close FILE; - -} else { - - - my $where = "WHERE not deleted"; - $where .= " AND id >= $start_id" if $start_id; - $where .= " AND id <= $end_id" if $end_id; - - my $order = "ORDER BY id"; - $order .= " DESC" if $sort_desc; - - if (defined $days_back or defined $modified_since) { - my $date_field = defined $days_back ? 'create_date' : 'edit_date'; - my $duration = defined $days_back ? $days_back : $modified_since; - $where .= " AND DATE($date_field) >= DATE(NOW() - '$duration day'::INTERVAL)"; - } - - my $sql = - "SELECT DISTINCT(id) AS id FROM biblio.record_entry $where $order"; - - announce("Loading record ID's with query:\n$sql"); - - my $dsn = "dbi:Pg:database=evergreen;host=$db_host;port=$db_port"; - my $dbh = DBI->connect($dsn, $db_user, $db_pass); - - my $sth = $dbh->prepare($sql); - $sth->execute; - - while (my $ref = $sth->fetchrow_hashref()) { - push(@records, $ref->{id}); - } - - $sth->finish(); - $dbh->disconnect(); -} - -announce("Processing ".scalar(@records)." 
records"); - -# Hash of controlled fields & subfields in bibliographic records, and their -# corresponding controlling fields & subfields in the authority record -# -# So, if the bib 650$a can be controlled by an auth 150$a, that maps to: -# 650 => { a => { 150 => 'a'}} -my %controllees = ( - 100 => { a => { 100 => 'a' }, - b => { 100 => 'b' }, - c => { 100 => 'c' }, - d => { 100 => 'd' }, - #f => { 100 => 'f' }, - g => { 100 => 'g' }, - h => { 100 => 'h' }, - k => { 100 => 'k' }, - l => { 100 => 'l' }, - m => { 100 => 'm' }, - n => { 100 => 'n' }, - o => { 100 => 'o' }, - p => { 100 => 'p' }, - q => { 100 => 'q' }, - r => { 100 => 'r' }, - s => { 100 => 's' }, - t => { 100 => 't' }, - y => { 100 => 'y' }, - z => { 100 => 'z' }, - }, - 110 => { a => { 110 => 'a' }, - b => { 110 => 'b' }, - c => { 110 => 'c' }, - d => { 110 => 'd' }, - #f => { 110 => 'f' }, - g => { 110 => 'g' }, - h => { 110 => 'h' }, - k => { 110 => 'k' }, - l => { 110 => 'l' }, - m => { 110 => 'm' }, - n => { 110 => 'n' }, - o => { 110 => 'o' }, - p => { 110 => 'p' }, - r => { 110 => 'r' }, - s => { 110 => 's' }, - t => { 110 => 't' }, - y => { 110 => 'y' }, - z => { 110 => 'z' }, - }, - 111 => { a => { 111 => 'a' }, - b => { 111 => 'b' }, - c => { 111 => 'c' }, - d => { 111 => 'd' }, - e => { 111 => 'e' }, - #f => { 111 => 'f' }, - g => { 111 => 'g' }, - h => { 111 => 'h' }, - k => { 111 => 'k' }, - l => { 111 => 'l' }, - m => { 111 => 'm' }, - n => { 111 => 'n' }, - o => { 111 => 'o' }, - p => { 111 => 'p' }, - q => { 111 => 'q' }, - r => { 111 => 'r' }, - s => { 111 => 's' }, - t => { 111 => 't' }, - u => { 111 => 'u' }, - y => { 111 => 'y' }, - z => { 111 => 'z' }, - }, - 130 => { a => { 130 => 'a' }, - d => { 130 => 'd' }, - #f => { 130 => 'f' }, - g => { 130 => 'g' }, - h => { 130 => 'h' }, - k => { 130 => 'k' }, - l => { 130 => 'l' }, - m => { 130 => 'm' }, - n => { 130 => 'n' }, - o => { 130 => 'o' }, - p => { 130 => 'p' }, - r => { 130 => 'r' }, - s => { 130 => 's' }, - t => { 130 => 't' 
}, - x => { 130 => 'x' }, - y => { 130 => 'y' }, - z => { 130 => 'z' }, - }, - 400 => { a => { 100 => 'a' }, - b => { 100 => 'b' }, - c => { 100 => 'c' }, - d => { 100 => 'd' }, - #f => { 100 => 'f' }, - g => { 100 => 'g' }, - h => { 100 => 'h' }, - k => { 100 => 'k' }, - l => { 100 => 'l' }, - m => { 100 => 'm' }, - n => { 100 => 'n' }, - o => { 100 => 'o' }, - p => { 100 => 'p' }, - q => { 100 => 'q' }, - r => { 100 => 'r' }, - s => { 100 => 's' }, - t => { 100 => 't' }, - y => { 100 => 'y' }, - z => { 100 => 'z' }, - }, - 410 => { a => { 110 => 'a' }, - b => { 110 => 'b' }, - c => { 110 => 'c' }, - d => { 110 => 'd' }, - #f => { 110 => 'f' }, - g => { 110 => 'g' }, - h => { 110 => 'h' }, - k => { 110 => 'k' }, - l => { 110 => 'l' }, - m => { 110 => 'm' }, - n => { 110 => 'n' }, - o => { 110 => 'o' }, - p => { 110 => 'p' }, - r => { 110 => 'r' }, - s => { 110 => 's' }, - t => { 110 => 't' }, - y => { 110 => 'y' }, - z => { 110 => 'z' }, - }, - 411 => { a => { 111 => 'a' }, - b => { 111 => 'b' }, - c => { 111 => 'c' }, - d => { 111 => 'd' }, - e => { 111 => 'e' }, - #f => { 111 => 'f' }, - g => { 111 => 'g' }, - h => { 111 => 'h' }, - k => { 111 => 'k' }, - l => { 111 => 'l' }, - m => { 111 => 'm' }, - n => { 111 => 'n' }, - o => { 111 => 'o' }, - p => { 111 => 'p' }, - q => { 111 => 'q' }, - r => { 111 => 'r' }, - s => { 111 => 's' }, - t => { 111 => 't' }, - u => { 111 => 'u' }, - y => { 111 => 'y' }, - z => { 111 => 'z' }, - }, - 600 => { a => { 100 => 'a' }, - b => { 100 => 'b' }, - c => { 100 => 'c' }, - d => { 100 => 'd' }, - #f => { 100 => 'f' }, - g => { 100 => 'g' }, - h => { 100 => 'h' }, - k => { 100 => 'k' }, - l => { 100 => 'l' }, - m => { 100 => 'm' }, - n => { 100 => 'n' }, - o => { 100 => 'o' }, - p => { 100 => 'p' }, - q => { 100 => 'q' }, - r => { 100 => 'r' }, - s => { 100 => 's' }, - t => { 100 => 't' }, - v => { 100 => 'v' }, - x => { 100 => 'x' }, - y => { 100 => 'y' }, - z => { 100 => 'z' }, - }, - 610 => { a => { 110 => 'a' }, - b => { 110 
=> 'b' }, - c => { 110 => 'c' }, - d => { 110 => 'd' }, - #f => { 110 => 'f' }, - g => { 110 => 'g' }, - h => { 110 => 'h' }, - k => { 110 => 'k' }, - l => { 110 => 'l' }, - m => { 110 => 'm' }, - n => { 110 => 'n' }, - o => { 110 => 'o' }, - p => { 110 => 'p' }, - r => { 110 => 'r' }, - s => { 110 => 's' }, - t => { 110 => 't' }, - v => { 110 => 'v' }, - x => { 110 => 'x' }, - y => { 110 => 'y' }, - z => { 110 => 'z' }, - }, - 611 => { a => { 111 => 'a' }, - b => { 111 => 'b' }, - c => { 111 => 'c' }, - d => { 111 => 'd' }, - e => { 111 => 'e' }, - #f => { 111 => 'f' }, - g => { 111 => 'g' }, - h => { 111 => 'h' }, - k => { 111 => 'k' }, - l => { 111 => 'l' }, - m => { 111 => 'm' }, - n => { 111 => 'n' }, - o => { 111 => 'o' }, - p => { 111 => 'p' }, - q => { 111 => 'q' }, - r => { 111 => 'r' }, - s => { 111 => 's' }, - t => { 111 => 't' }, - u => { 111 => 'u' }, - v => { 111 => 'v' }, - x => { 111 => 'x' }, - y => { 111 => 'y' }, - z => { 111 => 'z' }, - }, - 630 => { a => { 130 => 'a' }, - d => { 130 => 'd' }, - #f => { 130 => 'f' }, - g => { 130 => 'g' }, - h => { 130 => 'h' }, - k => { 130 => 'k' }, - l => { 130 => 'l' }, - m => { 130 => 'm' }, - n => { 130 => 'n' }, - o => { 130 => 'o' }, - p => { 130 => 'p' }, - r => { 130 => 'r' }, - s => { 130 => 's' }, - t => { 130 => 't' }, - v => { 130 => 'v' }, - x => { 130 => 'x' }, - y => { 130 => 'y' }, - z => { 130 => 'z' }, - }, - 650 => { a => { 150 => 'a' }, - b => { 150 => 'b' }, - c => { 150 => 'c' }, - d => { 150 => 'd' }, - v => { 150 => 'v' }, - x => { 150 => 'x' }, - y => { 150 => 'y' }, - z => { 150 => 'z' }, - }, - 651 => { a => { 151 => 'a' }, - b => { 151 => 'b' }, - v => { 151 => 'v' }, - x => { 151 => 'x' }, - y => { 151 => 'y' }, - z => { 151 => 'z' }, - }, - 655 => { a => { 155 => 'a' }, - b => { 155 => 'b' }, - c => { 155 => 'c' }, - v => { 155 => 'v' }, - x => { 155 => 'x' }, - y => { 155 => 'y' }, - z => { 155 => 'z' }, - }, - 700 => { a => { 100 => 'a' }, - b => { 100 => 'b' }, - c => { 100 => 
'c' }, - d => { 100 => 'd' }, - #f => { 100 => 'f' }, - g => { 100 => 'g' }, - h => { 100 => 'h' }, - k => { 100 => 'k' }, - l => { 100 => 'l' }, - m => { 100 => 'm' }, - n => { 100 => 'n' }, - o => { 100 => 'o' }, - p => { 100 => 'p' }, - q => { 100 => 'q' }, - r => { 100 => 'r' }, - s => { 100 => 's' }, - t => { 100 => 't' }, - y => { 100 => 'y' }, - z => { 100 => 'z' }, - }, - 710 => { a => { 110 => 'a' }, - b => { 110 => 'b' }, - c => { 110 => 'c' }, - d => { 110 => 'd' }, - #f => { 110 => 'f' }, - g => { 110 => 'g' }, - h => { 110 => 'h' }, - k => { 110 => 'k' }, - l => { 110 => 'l' }, - m => { 110 => 'm' }, - n => { 110 => 'n' }, - o => { 110 => 'o' }, - p => { 110 => 'p' }, - r => { 110 => 'r' }, - s => { 110 => 's' }, - t => { 110 => 't' }, - y => { 110 => 'y' }, - z => { 110 => 'z' }, - }, - 711 => { a => { 111 => 'a' }, - b => { 111 => 'b' }, - c => { 111 => 'c' }, - d => { 111 => 'd' }, - e => { 111 => 'e' }, - #f => { 111 => 'f' }, - g => { 111 => 'g' }, - h => { 111 => 'h' }, - k => { 111 => 'k' }, - l => { 111 => 'l' }, - m => { 111 => 'm' }, - n => { 111 => 'n' }, - o => { 111 => 'o' }, - p => { 111 => 'p' }, - q => { 111 => 'q' }, - r => { 111 => 'r' }, - s => { 111 => 's' }, - t => { 111 => 't' }, - u => { 111 => 'u' }, - y => { 111 => 'y' }, - z => { 111 => 'z' }, - }, - 730 => { a => { 130 => 'a' }, - d => { 130 => 'd' }, - #f => { 130 => 'f' }, - g => { 130 => 'g' }, - h => { 130 => 'h' }, - k => { 130 => 'k' }, - l => { 130 => 'l' }, - m => { 130 => 'm' }, - n => { 130 => 'n' }, - o => { 130 => 'o' }, - p => { 130 => 'p' }, - r => { 130 => 'r' }, - s => { 130 => 's' }, - t => { 130 => 't' }, - y => { 130 => 'y' }, - z => { 130 => 'z' }, - }, - 800 => { a => { 100 => 'a' }, - b => { 100 => 'b' }, - c => { 100 => 'c' }, - d => { 100 => 'd' }, - #f => { 100 => 'f' }, - g => { 100 => 'g' }, - h => { 100 => 'h' }, - k => { 100 => 'k' }, - l => { 100 => 'l' }, - m => { 100 => 'm' }, - n => { 100 => 'n' }, - o => { 100 => 'o' }, - p => { 100 => 'p' }, 
- q => { 100 => 'q' }, - r => { 100 => 'r' }, - s => { 100 => 's' }, - t => { 100 => 't' }, - y => { 100 => 'y' }, - z => { 100 => 'z' }, - }, - 810 => { a => { 110 => 'a' }, - b => { 110 => 'b' }, - c => { 110 => 'c' }, - d => { 110 => 'd' }, - #f => { 110 => 'f' }, - g => { 110 => 'g' }, - h => { 110 => 'h' }, - k => { 110 => 'k' }, - l => { 110 => 'l' }, - m => { 110 => 'm' }, - n => { 110 => 'n' }, - o => { 110 => 'o' }, - p => { 110 => 'p' }, - r => { 110 => 'r' }, - s => { 110 => 's' }, - t => { 110 => 't' }, - y => { 110 => 'y' }, - z => { 110 => 'z' }, - }, - 811 => { a => { 111 => 'a' }, - b => { 111 => 'b' }, - c => { 111 => 'c' }, - d => { 111 => 'd' }, - e => { 111 => 'e' }, - #f => { 111 => 'f' }, - g => { 111 => 'g' }, - h => { 111 => 'h' }, - k => { 111 => 'k' }, - l => { 111 => 'l' }, - m => { 111 => 'm' }, - n => { 111 => 'n' }, - o => { 111 => 'o' }, - p => { 111 => 'p' }, - q => { 111 => 'q' }, - r => { 111 => 'r' }, - s => { 111 => 's' }, - t => { 111 => 't' }, - u => { 111 => 'u' }, - y => { 111 => 'y' }, - z => { 111 => 'z' }, - }, - 830 => { a => { 130 => 'a' }, - d => { 130 => 'd' }, - #f => { 130 => 'f' }, - g => { 130 => 'g' }, - h => { 130 => 'h' }, - k => { 130 => 'k' }, - l => { 130 => 'l' }, - m => { 130 => 'm' }, - n => { 130 => 'n' }, - o => { 130 => 'o' }, - p => { 130 => 'p' }, - r => { 130 => 'r' }, - s => { 130 => 's' }, - t => { 130 => 't' }, - x => { 130 => 'x' }, - y => { 130 => 'y' }, - z => { 130 => 'z' }, - }, -); - -# mapping of authority leader/11 "Subject heading system/thesaurus" -# to the matching bib record indicator -my %AUTH_TO_BIB_IND2 = ( - 'a' => '0', # Library of Congress Subject Headings (ADULT) - 'b' => '1', # Library of Congress Subject Headings (JUVENILE) - 'c' => '2', # Medical Subject Headings - 'd' => '3', # National Agricultural Library Subject Authority File - 'n' => '4', # Source not specified - 'k' => '5', # Canadian Subject Headings - 'v' => '6', # Répertoire de vedettes-matière - 'z' => '7' # Source 
specified in subfield $2 / Other -); - -# Produces a new 6XX ind2 value for values found in subfield $2 when the -# original ind2 value is 7 ("Source specified in subfield $2"). -my %REMAP_BIB_SF2_TO_IND2 = ( - lcsh => '0', - mesh => '2', - nal => '3', - rvm => '6' -); - -my $start_time = localtime(); - -if($input_file) { - announce("Start $start_time for ".scalar(@records)." records"); -} elsif($start_id) { - announce("Start $start_time for record range: $start_id => $end_id"); -} else { - announce("Start $start_time for all records"); -} - -# Fetch leader/008 values for authority records. Filter out any whose -# 008 14 or 15 field are not appropriate for the requested bib tag. -# https://www.loc.gov/marc/authority/ad008.html -sub authority_leaders_008_14_15 { - my ($e, $bib_tag, $auth_ids) = @_; - - my $auth_leaders = $e->json_query({ - select => {afr => ['record', 'value']}, - from => 'afr', - where => {'+afr' => {tag => '008', record => $auth_ids}} - }); - - my $index; - $index = 14 if $bib_tag =~ /^[17]/; # author/name record - $index = 15 if $bib_tag =~ /^6/; # subject record - - # avoid checking any other types of authority records. - return $auth_leaders unless $index; - - my @keepers; - for my $leader (@$auth_leaders) { - my $value = $leader->{value} || ''; - if (substr($value, $index, 1) eq 'a') { - push(@keepers, $leader); - } else { - announce("Skipping authority record ".$leader->{record}. - " on bib $bib_tag match; 008/#14|#15 not appropriate"); - } - } - - return \@keepers; -} - -# given a set of authority record ID's and a controlled bib field, -# returns the ID of the first authority record in the set that -# matches the thesaurus spec of the bib record. 
-sub find_matching_auth_for_thesaurus { - my ($e, $bib_field, $auth_leaders) = @_; - - # bib field thesaurus spec - my $cfield_ind2 = $bib_field->indicator(2); - - announce("6XX indicator 2 value = $cfield_ind2"); - - my $is_local = 0; - if ($cfield_ind2 eq '7') { - # subject thesaurus code is embedded in the bib field subfield 2 - - $is_local = 1; - - my $thesaurus = $bib_field->subfield('2') || ''; - announce("Found local thesaurus value $thesaurus"); - - # if we have no special remapping value for the found thesaurus, - # fall back to ind2 => 7=Other. - $cfield_ind2 = $REMAP_BIB_SF2_TO_IND2{$thesaurus} || '7'; - - announce("Local thesaurus '$thesaurus' ". - "remapped to ind2 value '$cfield_ind2'"); - } - - my $authz_found = undef; - for my $leader (@$auth_leaders) { - my $value = $leader->{value}; - next unless $value; - - my $thesaurus = substr($value, 11, 1); # leader/11 -- zero based. - - # Note for later that we encountered an authority record - # whose thesaurus values is z=Other. - $authz_found = $leader->{record} if $thesaurus eq 'z'; - - if ($AUTH_TO_BIB_IND2{$thesaurus} eq $cfield_ind2) { - announce("Found a match on thesaurus ". - "'$thesaurus' for " . $leader->{record}); - return $leader->{record}; - } - } - - # If the bib field in question has a locally encoded thesaurus - # (ind2=7) and no auth record was found above via remapped - # thesaurus value, use the authority record with thesaurus z=Other. - return $authz_found if $is_local; - - return undef; -} - -# Returns true if the thesaurus controlling the bib field is "fast". 
-sub is_fast_heading { - my $bib_field = shift; - - if ($bib_field->tag() =~ /^65[015]/) { - my $ind2 = $bib_field->indicator(2) || ''; - - if ($ind2 eq '7') { # field controlled by "other" - my $thesaurus = $bib_field->subfield('2') || ''; - return $thesaurus eq 'fast'; - } - } - - return 0; -} - -sub update_record { - my ($record, $marc) = @_; - - my $xml = $marc->as_xml_record(); - $xml =~ s/\n//sgo; - $xml =~ s/^<\?xml.+\?\s*>//go; - $xml =~ s/>\s+entityize($xml); - - $record->marc($xml); - - my $editor = OpenILS::Utils::CStoreEditor->new(xact=>1); - if ($editor->update_biblio_record_entry($record)) { - $editor->commit(); - } else { - $editor->rollback(); - } -} - -my $count = 0; -my $total = scalar(@records); -announce("processing $total bib records", 1); - -foreach my $rec_id (@records) { - $count++; - - announce("processing $count of $total", 1) if ($count % 1000) == 0; - - announce("processing bib record $rec_id [$count of $total]"); - - # State variable; was the record changed? - my $changed = 0; - - # get the record - my $record = $e->retrieve_biblio_record_entry($rec_id); - next unless $record && $record->deleted eq 'f'; - - eval { - my $marc = MARC::Record->new_from_xml($record->marc()); - - # get the list of controlled fields - my @c_fields = keys %controllees; - - foreach my $c_tag (@c_fields) { - my @c_subfields = keys %{$controllees{"$c_tag"}}; - - # Get the MARCXML from the record and check for controlled fields/subfields - my @bib_fields = ($marc->field($c_tag)); - foreach my $bib_field (@bib_fields) { - - my $sf0 = $bib_field->subfield('0') || ''; - my $is_fast_heading = is_fast_heading($bib_field); - - if ($is_fast_heading && $sf0 =~ /\)fst/) { - # fast heading looks OK. ignore it. - announce("Ignoring FAST heading field on ". 
- "rec=$rec_id and tag=$c_tag \$0 $sf0"); - next; - } - - if ($sf0 && $refresh) { - announce("Removing \$0 $sf0 for rec=$rec_id and tag=$c_tag"); - $bib_field->delete_subfield(code => '0'); - $changed = 1; - } - - if ($is_fast_heading) { - # Update record after potentially removing a bogus - # fast heading above. - update_record($record, $marc) if $changed; - - # We don't control fast headings, so there's nothing - # left to do. Move on to the next field... - announce("No linking performed on FAST heading ". - "field on rec=$rec_id and tag=$c_tag"); - next; - } - - my %match_subfields; - my $match_tag; - my @searches; - foreach my $c_subfield (@c_subfields) { - my @sf_values = $bib_field->subfield($c_subfield); - if (@sf_values) { - # Give me the first element of the list of authority controlling tags for this subfield - # XXX Will we need to support more than one controlling tag per subfield? Probably. That - # will suck. Oh well, leave that up to Ole to implement. - $match_subfields{$c_subfield} = (keys %{$controllees{$c_tag}{$c_subfield}})[0]; - $match_tag = $match_subfields{$c_subfield}; - push @searches, map {{term => $_, subfield => $c_subfield}} @sf_values; - } - } - next if !$match_tag; - - announce("Searching for matches on controlled field $c_tag ". 
- "(auth tag=$match_tag): \n - ".Dumper(\@searches)); - - my @tags = ($match_tag); - - # Now we've built up a complete set of matching controlled - # subfields for this particular field; let's check to see if - # we have a matching authority record - my $session = OpenSRF::AppSession->create("open-ils.search"); - my $validates = $session->request("open-ils.search.authority.validate.tag.id_list", - "tags", \@tags, "searches", \@searches - )->gather(); - $session->disconnect(); - - - # Protect against failed (error condition) search request - if (!$validates) { - print STDERR "Search for matching authority failed; record # $rec_id\n"; - next if (!$changed); - } - - announce("Match query returned @$validates"); - - # No matches found. Nothing left to do for this field. - next if scalar(@$validates) == 0; - - # Iterate through the returned authority record IDs to delete any - # matching $0 subfields already in the bib record - foreach my $auth_zero (@$validates) { - $bib_field->delete_subfield(code => '0', match => qr/\)$auth_zero$/); - } - - # Find the best authority record to use for linking. - - my $auth_leaders; - if ($bib_field->tag =~ /^[167]/) { - # For 1XX, 6XX, and 7XX bib fields, only link to - # authority records whose leader/008 positions - # 14 and 15 are coded to allow use as a name/author - # or subject record, depending. - - $auth_leaders = authority_leaders_008_14_15( - $e, $bib_field->tag, $validates); - - $validates = [map {$_->{record}} @$auth_leaders]; - } - - my $auth_id; - - if ($bib_field->tag() =~ /^65[015]/) { - # Using the indicator-2 value from the controlled bib - # field, find the first authority in the list of matches - # that uses the same thesaurus. If no such authority - # is found, no matching occurs. - # TODO: perhaps this step should be part of the - # validation API search call above. 
- - $auth_id = find_matching_auth_for_thesaurus( - $e, $bib_field, $auth_leaders) || ''; - - } else { - - # For all other controlled fields, use the first - # authority record in the result set. - $auth_id = $validates->[0]; - } - - # Don't exit here just because we have no $auth_id. The - # bib field could have been changed above in the cleanup / - # delete phase. - - if ($auth_id) { - # Add the auth ID and control number agency info from the - # matching authority record to the controlled bib field. - $changed = 1; - $bib_field->add_subfields('0' => "($CNI)$auth_id"); - announce("auth=$auth_id cni=$CNI. It's a match!"); - } - } - } - - update_record($record, $marc) if $changed; - - }; - - if ($@) { - print STDERR "\nRecord # $rec_id : $@\n"; - import MARC::File::XML; # reset SAX parser so that one bad record doesn't kill the entire process - } -} - -my $end_time = localtime(); - -if($input_file) { - announce("Stop $end_time for ".scalar(@records)." records"); -} elsif($start_id) { - announce("Start $end_time for record range: $start_id => $end_id"); -} else { - announce("Start $end_time for all records"); -} - -__END__ - -=head1 NAME - -authority_control_fields.pl - Controls fields in bibliographic records with authorities in Evergreen - -=head1 SYNOPSIS - -C [B<--configuration>=I] [B<--refresh>] -[[B<--record>=I[ B<--record>=I]]] | [B<--all>] | [B<--start_id>=I B<--end_id>=I] - -=head1 DESCRIPTION - -For a given set of records: - -=over - -=item * Iterate through the list of fields that are controlled fields - -=item * Iterate through the list of subfields that are controlled for -that given field - -=item * Search for a matching authority record for that combination of -field + subfield(s) - -=over - -=item * If we find a match, then add a $0 subfield to that field identifying -the controlling authority record - -=item * If we do not find a match, then insert a row into an "uncontrolled" -table identifying the record ID, field, and subfield(s) that were not 
controlled - -=back - -=item * Iterate through the list of floating subdivisions - -=over - -=item * If we find a match, then add a $0 subfield to that field identifying -the controlling authority record - -=item * If we do not find a match, then insert a row into an "uncontrolled" -table identifying the record ID, field, and subfield(s) that were not controlled - -=back - -=item * If we changed the record, update it in the database - -=back - -=head1 OPTIONS - -=over - -=item * B<-f>, B<--file> - -Specifies a file of bibs ids to link. - -=item * B<-c> I, B<--configuration>=I - -Specifies the OpenSRF configuration file used to connect to the OpenSRF router. -Defaults to F - -=item * B<-r> I, B<--record>=I - -Specifies the bibliographic record ID (found in the C -column) of the record to process. This option may be specified more than once -to process multiple records in a single run. - -=item * B<-a>, B<--all> - -Specifies that all bibliographic records should be processed. For large -databases, this may take an extraordinarily long amount of time. - -=item * B<-r>, B<--refresh> - -Specifies that all authority links should be removed from the target -bibliographic record(s). This will effectively rewrite all authority -linking anew. - -=item * B<-s> I, B<--start_id>=I - -Specifies the starting ID of the range of bibliographic records to process. -This option is ignored unless it is accompanied by the B<-e> or B<--end_id> -option. - -=item * B<-e> I, B<--end_id>=I - -Specifies the ending ID of the range of bibliographic records to process. -This option is ignored unless it is accompanied by the B<-s> or B<--start> -option. - -=back - -=head1 EXAMPLES - - authority_control_fields.pl --start_id 1 --end_id 50000 - -Processes the bibliographic records with IDs between 1 and 50,000 using the -default OpenSRF configuration file for connection information. 
- -=head1 AUTHOR - -Dan Scott - -=head1 COPYRIGHT AND LICENSE - -Copyright 2010-2011 by Dan Scott - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; either version 2 -of the License, or (at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - -=cut - diff --git a/KCLS/linking/find-bibs-to-link.pl b/KCLS/linking/find-bibs-to-link.pl deleted file mode 100755 index 46a20bf298..0000000000 --- a/KCLS/linking/find-bibs-to-link.pl +++ /dev/null @@ -1,169 +0,0 @@ -#!/usr/bin/perl -# ---------------------------------------------------------------------- -# Find bib records matching the requested criteria for linking. -# Bib IDs are exported to one or more batch files for future processing. 
-# ---------------------------------------------------------------------- -use strict; -use warnings; -use DBI; -use Getopt::Long; -use DateTime; - -my $db_handle; -my $counter = 0; - -# options -my $help; -my $modified_since; -my $exported_since; -my $batch_size = 10000; -my $start_id; -my $end_id; -my $count_only; -my $out_dir = '/tmp'; -my $db_host = $ENV{PGHOST} || 'localhost'; -my $db_port = $ENV{PGPORT} || '5432'; -my $db_user = $ENV{PGUSER} || 'evergreen'; -my $db_name = $ENV{PGDATABASE} || 'evergreen'; -my $db_pass = $ENV{PGPASSWORD}; - -my $opt_result = GetOptions( - 'modified-since=s' => \$modified_since, - 'exported-since=s' => \$exported_since, - 'start-id=i' => \$start_id, - 'end-id=i' => \$end_id, - 'batch-size=i' => \$batch_size, - 'count-only' => \$count_only, - 'out-dir=s' => \$out_dir, - "db-host=s" => \$db_host, - "db-user=s" => \$db_user, - "db-pass=s" => \$db_pass, - "db-port=s" => \$db_port, - 'help' => \$help -); - -sub announce { - my $msg = shift; - print DateTime->now(time_zone => 'local')->strftime('%F %T')." $msg\n"; -} - -sub help { - print < - Limit bibs to those modifed since the specified date. - - --exported-since - Limit bibs to those exported since the specified date. - Export date is based on data found in the - metabib.bib_export_data table. - - --start-id - Limit bibs to those whose ID is no less than - - --end-id - Limit bibs to those whose ID is no greater than - - --out-dir [/tmp] - Output directory. - - --batch-size - Number of bib IDs to write to each batch file. - - --count-only - Print the total number of records that would be added - to batch files without adding to any batch files. - - --db-host - --db-user - --db-pass - --db-port - Database connection params. PG environment variables are - also inspected for values. 
When all else fails, try to - connect to database evergreen\@localhost -HELP - exit 0; -} - -help() if $help || !$opt_result; - -sub connect_db { - $db_handle = DBI->connect( - "dbi:Pg:db=$db_name;host=$db_host;port=$db_port;options='--statement-timeout=0'", - $db_user, $db_pass, { - RaiseError => 1, - PrintError => 0, - AutoCommit => 1, - pg_expand_array => 0, - pg_enable_utf8 => 1 - } - ) or die "Connection to database failed: $DBI::err : $DBI::errstr"; -} - -connect_db(); - -# ---------------------------------------------------------------------- -my $from = 'FROM biblio.record_entry bre'; - -my $where = 'WHERE NOT bre.deleted'; -$where .= " AND bre.id >= $start_id" if $start_id; -$where .= " AND bre.id <= $end_id" if $end_id; - -if ($exported_since) { - $where .= " AND bed.export_date > '$exported_since'"; - $from .= " JOIN metabib.bib_export_data bed ON (bed.bib = bre.id)"; -} - -my $sql = <prepare($sql); -$sth->execute; - -my $batch_file; -sub open_batch_file { - my $path = shift; - announce("Starting new batch file: $path"); - - close $batch_file if $batch_file; - - open $batch_file, '>', $path or - die "Cannot open batch file for writing: $!\n"; -} - -my $ctr = 0; -my $batch = 0; -while (my $ref = $sth->fetchrow_hashref()) { - $ctr++; - next if $count_only; - - if (( ($ctr - 1) % $batch_size) == 0) { - my $path = sprintf("$out_dir/bib-ids.%0.3d", $batch); - open_batch_file($path); - $batch++; - } - - print $batch_file $ref->{id} . 
"\n"; -} - -close $batch_file if $batch_file; -$sth->finish; - -announce("Found $ctr bib records"); - diff --git a/KCLS/linking/link-bib-batches.sh b/KCLS/linking/link-bib-batches.sh deleted file mode 100755 index 5be60ffa53..0000000000 --- a/KCLS/linking/link-bib-batches.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -set -eu -PROC_COUNT=3 -BATCHES_PER_PROC=3 -BATCH_DIR="/openils/var/data/linkbibs" - -DOW=$(date +%u); -[ $DOW == 7 ] && DOW=0; # make Sunday = 0 - -BATCH=$(echo "$PROC_COUNT * $BATCHES_PER_PROC * $DOW" | bc); - -echo "Starting at batch number $BATCH"; - -function execute_background_proc { - START=$1 - for batch in $(seq 1 $BATCHES_PER_PROC); do - FILE=$(printf "$BATCH_DIR/bib-ids.%0.3d" $((($START + $batch)))); - echo "Linking bib file $FILE" - perl ./authority_control_fields.pl --refresh --file $FILE - done; -} - -for PROC in $(seq 1 $PROC_COUNT); do - (execute_background_proc $BATCH) & - BATCH=$((($BATCH + BATCHES_PER_PROC))); -done - -wait; - -echo "Done processing all batches" - diff --git a/KCLS/linking/link-new-auth-records.pl b/KCLS/linking/link-new-auth-records.pl deleted file mode 100755 index c48939e30a..0000000000 --- a/KCLS/linking/link-new-auth-records.pl +++ /dev/null @@ -1,253 +0,0 @@ -#!/usr/bin/perl -# ---------------------------------------------------------------------- -# Find authority records newer than a specified age. Once found, -# run each through the auth-to-auth linking process. Then locate -# bib records that we might want to link to the new records and -# pass them off to the bib-to-auth linker. 
-# ---------------------------------------------------------------------- -use strict; -use warnings; -use DBI; -use Getopt::Long; -use DateTime; -use Pod::Usage qw/pod2usage/; -use Time::HiRes qw/usleep/; - -my @auth_ids; -my @bib_ids; -my $counter = 0; - -# options -my $help; -my $modified_since; -my $max_auth_count; -my $start_auth_id; -my $print_auth_ids; -my $print_bib_ids; -my $link_auths; -my $link_bibs; -my $progress; -my $db_host = $ENV{PGHOST} || 'localhost'; -my $db_port = $ENV{PGPORT} || '5432'; -my $db_user = $ENV{PGDATABASE} || 'evergreen'; -my $db_pass = $ENV{PGPASSWORD}; - -my $opt_result = GetOptions( - 'modified-since=i' => \$modified_since, - 'max-auth-count=i' => \$max_auth_count, - 'start-auth-id=i' => \$start_auth_id, - 'print-bib-ids' => \$print_bib_ids, - 'print-auth-ids' => \$print_auth_ids, - 'link-bibs' => \$link_bibs, - 'link-auths' => \$link_auths, - 'progress' => \$progress, - "db-host=s" => \$db_host, - "db-user=s" => \$db_user, - "db-pass=s" => \$db_pass, - "db-port=s" => \$db_port, - 'help' => \$help -); - -sub announce { - my $msg = shift; - print DateTime->now(time_zone => 'local')->strftime('%F %T')." $msg\n"; -} - -sub help { - print < - Process authority records created or modified within the - last days. - - --max-auth-count - Process authority records in total. Use with - --start-auth-id to process batches of records across - multiple instances of the script. - - --start-auth-id - Process authority records whose ID is equal to or greater - than . Use with --max-auth-count to process batches - of records accross multiple runs of the script. 
- - --print-auth-ids - Print authority record IDs to process to STDOUT - - --print-bib-ids - Print bib record IDs to process to STDOUT - - --link-auths - Run idenditifed authority records through authority_authority_linker.pl - - --link-bibs - Run idenditifed bib records through authority_control_fields.pl - - --progress - Log linking progess to STDOUT - - --db-host - --db-user - --db-pass - --db-port - - Database connection params. PG environment variables are - also inspected for values. When all else fails, try to - connect to database evergreen\@localhost -HELP - exit 0; -} - -help() if $help || !$opt_result; - -my $dsn = "dbi:Pg:database=evergreen;host=$db_host;port=$db_port"; -my $dbh = DBI->connect($dsn, $db_user, $db_pass) - or die "Cannot connect to database: $dsn\n"; - -$dbh->do('SET statement_timeout = 0'); - -# ---------------------------------------------------------------------- -# Load the authority record IDs -my $where2 = $start_auth_id ? "AND id >= $start_auth_id" : ''; -my $limit = $max_auth_count ? "LIMIT $max_auth_count" : ''; - -my $sth = $dbh->prepare(<= DATE(NOW() - '$modified_since day'::INTERVAL) - $where2 - ORDER BY id - $limit -SQL - -$sth->execute; - -while (my $ref = $sth->fetchrow_hashref()) { - push(@auth_ids, $ref->{id}); -} -$sth->finish; - -my $auth_rec_count = scalar(@auth_ids); -print join("\n", @auth_ids) if $print_auth_ids; - -# Let the caller know what the last record processed will be, -# so the next iteration of the script can start there. -announce("Final auth ID will be: " . 
$auth_ids[-1]) if $max_auth_count; - -if (!@auth_ids) { - announce("No authority records edited in the last $modified_since days"); - exit 0; -} - -# ---------------------------------------------------------------------- -# Auth-to-Auth linking - -if ($link_auths) { - # Pass all new authority records to the auth-to-auth linker - for my $rec_id (@auth_ids) { - - system( - './authority_authority_linker.pl', - '--db-host', $db_host, - '--db-user', $db_user, - '--db-pass', ($db_pass || ''), - '--record', $rec_id - ); - - usleep(250000); # 1/4 second; allow ctrl-c to penetrate - announce("Auth records processed: $counter/$auth_rec_count") - if $progress && ++$counter % 10 == 0; - } -} -$counter = 0; - -# Exit if there is nothing left to do. -exit unless $print_bib_ids || $link_bibs; - -# ---------------------------------------------------------------------- -# Find bib records that we might want to link to the new authority -# record. -# -# Query: give me bib records that link to browse entries that also -# link to exactly one authority record, specifically the new authority -# records we are processing via this script. Only include bib records -# that are not already linked via bib_linking to said authority record. -# This represents the set of bib records that might need to be linked -# to our new authority records. -# ---------------------------------------------------------------------- -my %bib_ids; # de-dupe by record ID. 
-my $auth_ids_param = join(',', @auth_ids); - -for my $axis (qw/author subject series title/) { - my $query = <prepare($query); - $sth->execute; - while (my $ref = $sth->fetchrow_hashref()) { - $bib_ids{$ref->{bib_record}} = 1; # de-dupe - } - $sth->finish; -} - -@bib_ids = sort(keys(%bib_ids)); -my $bib_rec_count = scalar(@bib_ids); - -if ($link_bibs) { - for my $rec_id (@bib_ids) { - # fire off the linker for each of the records identied - system('./authority_control_fields.pl', - '--db-host', $db_host, - '--db-user', $db_user, - '--db-pass', ($db_pass || ''), - '--record', $rec_id, - '--refresh' - ); - - usleep(250000); # 1/4 second; allow ctrl-c to penetrate - announce("Bib records processed: $counter/$bib_rec_count") - if $progress && ++$counter % 10 == 0; - } -} - -print join("\n", @bib_ids) if $print_bib_ids; -