From 199ce7ab47ede1fa4a104e5d5755839ebb0adfdd Mon Sep 17 00:00:00 2001
From: Bill Erickson
Date: Tue, 17 Nov 2015 14:49:01 -0500
Subject: [PATCH] JBAS-980 auth-to-bib linking repairs continued

Signed-off-by: Bill Erickson
---
Reviewer notes (text between "---" and the diff is ignored by git-am):
 - Line breaks and indentation were reconstructed from a copy of this
   patch whose whitespace had been collapsed; apply with
   --ignore-whitespace.  The commit and blob ids in this file predate
   the fixes listed below.
 - %AUTH_TO_BIB_IND2 no longer lists the key 'n' twice; 'z' maps to '7'.
 - find_matching_auth_for_thesaurus() tolerates a missing query result
   and short or unmapped 008 values.
 - A failed validation search no longer dereferences undef.
 - The closing progress messages say "Stop" instead of "Start".

 KCLS/linking/authority_control_fields.pl | 209 ++++++++++++++++++++-----------
 1 file changed, 134 insertions(+), 75 deletions(-)

diff --git a/KCLS/linking/authority_control_fields.pl b/KCLS/linking/authority_control_fields.pl
index 176cb95265..61d737e5f6 100755
--- a/KCLS/linking/authority_control_fields.pl
+++ b/KCLS/linking/authority_control_fields.pl
@@ -29,7 +29,9 @@ use Unicode::Normalize;
 use OpenILS::Application::AppUtils;
 use Data::Dumper;
 use Pod::Usage qw/ pod2usage /;
+use DateTime;
 
+$Data::Dumper::Indent = 0;
 MARC::Charset->assume_unicode(1);
 
 my ($start_id, $end_id, $refresh);
@@ -37,6 +39,7 @@ my ($days_back);
 my $input_file ='';
 my $bootstrap = '/openils/conf/opensrf_core.xml';
 my @records;
+my $verbose = 0;
 
 my %options;
 my $result = GetOptions(
@@ -49,8 +52,16 @@ my $result = GetOptions(
     'end_id=i' => \$end_id,
     'days_back=i' => \$days_back,
     'file=s' => \$input_file,
+    'verbose' => \$verbose
 );
 
+# Prints a timestamped progress message when --verbose is set; otherwise silent.
+sub announce {
+    my $msg = shift;
+    return unless $verbose;
+    print DateTime->now->strftime('%F %T') . " $msg\n";
+}
+
 if (!$result or $options{help}) {
     pod2usage(0);
 }
@@ -542,14 +553,61 @@ my %controllees = (
         z => { 130 => 'z' },
     },
 );
+
+# Mapping of authority 008/11 "Subject heading system/thesaurus"
+# codes to the matching bib 6XX second indicator values.
+my %AUTH_TO_BIB_IND2 = (
+    'a' => '0', # Library of Congress Subject Headings (ADULT)
+    'b' => '1', # Library of Congress Subject Headings (JUVENILE)
+    'c' => '2', # Medical Subject Headings
+    'd' => '3', # National Agricultural Library Subject Authority File
+    'n' => '4', # Source not specified
+    'k' => '5', # Canadian Subject Headings
+    'v' => '6', # Répertoire de vedettes-matière
+    'z' => '7'  # Source specified in subfield $2 / Other
+);
+
 my $start_time = localtime();
+
 if($input_file) {
-    print "Start " . $start_time . " for " . scalar(@records) . " records.\n";
+    announce("Start $start_time for ".scalar(@records)." records");
+} elsif($start_id) {
+    announce("Start $start_time for record range: $start_id => $end_id");
 } else {
-    print "Start " . $start_time . " for records " . $start_id . " to " . $end_id . "\n";
+    announce("Start $start_time for all records");
+}
+
+# Given a set of authority record IDs and a controlled bib field
+# indicator 2 (thesaurus) value, returns the ID of the first
+# authority record returned by the 008 lookup whose 008/11
+# thesaurus code maps to that indicator.  Returns nothing (undef
+# in scalar context) when no authority record matches.
+sub find_matching_auth_for_thesaurus {
+    my ($e, $cfield_ind2, $auth_ids) = @_;
+
+    my $auth_leaders = $e->json_query({
+        select => {afr => ['record', 'value']},
+        from => 'afr',
+        where => {'+afr' => {tag => '008', record => $auth_ids}}
+    });
+
+    # Guard against json_query handing back undef instead of an array ref.
+    for my $leader (@{$auth_leaders || []}) {
+        my $value = $leader->{value};
+
+        # Skip 008 values too short to carry a thesaurus code.
+        next unless defined $value and length($value) > 11;
+
+        # Codes missing from the map yield undef, which never matches.
+        my $ind2 = $AUTH_TO_BIB_IND2{substr($value, 11, 1)}; # 008/11 -- zero based.
+        return $leader->{record} if defined $ind2 and $ind2 eq $cfield_ind2;
+    }
+
+    return;
 }
+
 foreach my $rec_id (@records) {
-    # print "$rec_id\n";
+    announce("processing bib record $rec_id");
 
     # State variable; was the record changed?
     my $changed = 0;
@@ -557,7 +615,6 @@ foreach my $rec_id (@records) {
     # get the record
     my $record = $e->retrieve_biblio_record_entry($rec_id);
     next unless $record;
-    # print Dumper($record);
 
     try {
         my $marc = MARC::Record->new_from_xml($record->marc());
@@ -567,8 +624,7 @@ foreach my $rec_id (@records) {
 
         foreach my $c_tag (@c_fields) {
             my @c_subfields = keys %{$controllees{"$c_tag"}};
-            # print "Field: $field subfields: ";
-            # foreach (@subfields) { print "$_ "; }
+            announce("Inspecting controlled field $c_tag");
 
             # Get the MARCXML from the record and check for controlled fields/subfields
             my @bib_fields = ($marc->field($c_tag));
@@ -594,13 +650,12 @@ foreach my $rec_id (@records) {
                         push @searches, map {{term => $_, subfield => $c_subfield}} @sf_values;
                     }
                 }
-                # print Dumper(\%match_subfields);
                 next if !$match_tag;
 
-                my @tags = ($match_tag);
+                announce("Searching for matches (auth tag=$match_tag): ".
+                    Dumper(\@searches));
 
-                # print "Controlling tag: $c_tag and match tag $match_tag\n";
-                # print Dumper(\@tags, \@searches);
+                my @tags = ($match_tag);
 
                 # Now we've built up a complete set of matching controlled
                 # subfields for this particular field; let's check to see if
@@ -611,78 +666,80 @@ foreach my $rec_id (@records) {
                 )->gather();
                 $session->disconnect();
 
-                # print Dumper($validates);
 
                 # Protect against failed (error condition) search request
                 if (!$validates) {
                     print STDERR "Search for matching authority failed; record # $rec_id\n";
                     next if (!$changed);
                 }
-
-                my $num_records = scalar(@$validates);
-
-                # Only add linking if one or more was found, but we may have changed
-                # the record already if in --refresh mode.
-                if (scalar(@$validates) > 0) {
-
-                    # Iterate through the returned authority record IDs to delete any
-                    # matching $0 subfields already in the bib record
-                    foreach my $auth_zero (@$validates) {
-                        $bib_field->delete_subfield(code => '0', match => qr/\)$auth_zero$/);
-                    }
-
-                    for(my $i = 0; $i < $num_records; $i++)
-                    {
-                        # Okay, we have a matching authority control; time to
-                        # add the magical subfield 0. Use the first returned auth
-                        # record as a match.
-                        my $auth_id = @$validates[$i];
-                        my $auth_rec = $e->retrieve_authority_record_entry($auth_id);
-                        my $auth_marc = MARC::Record->new_from_xml($auth_rec->marc());
-
-                        my %Auth2BibIndicatorTwo = (
-                            'a' => '0', # Library of Congress Subject Headings (ADULT)
-                            'b' => '1', # Library of Congress Subject Headings (JUVENILE)
-                            'c' => '2', # Medical Subject Headings
-                            'd' => '3', # National Agricultural Library Subject Authority File
-                            'n' => '4', # Source not specified
-                            'k' => '5', # Canadian Subject Headings
-                            'v' => '6', # Répertoire de vedettes-matière
-                            'n' => '7', # Source specified in subfield $2
-                            'z' => '8' # Other
-                        );
-
-                        my @marc_fieldz = $marc->fields();
-                        my $Auth_Indic = substr($auth_marc->field('008')->data(), 10, 1);
-                        while ((my $key, my $value) = each (%Auth2BibIndicatorTwo))
-                        {
-                            if($Auth_Indic eq $key)
-                            {
-                                foreach my $auth_field (@marc_fieldz)
-                                {
-                                    if($auth_field->tag() ge 650 && $auth_field->tag() le 659) # 650-659
-                                    {
-                                        #print "Trying To Match Against: " . $Auth2BibIndicatorTwo{$key} . "\n";
-                                        if($auth_field->indicator(2) eq $Auth2BibIndicatorTwo{$key})
-                                        {
-                                            #print "Found Match Between Bib Record(" . $rec_id . ") and Auth Record(" . $auth_marc->field('901')->subfield('c') . ")\n";
-                                            if ($auth_marc->field('003')) {
-                                                my $cni = $auth_marc->field('003')->data();
-                                                $bib_field->add_subfields('0' => "($cni)$auth_id");
-                                                $changed = 1;
-                                            } else {
-                                                print "Authority # $auth_id missing field '003'\n";
-                                                next if (!$changed);
-                                            }
-                                        }
-                                    }
-                                }
-                            }
-                        }
+
+                announce("Match query returned @{$validates || []}");
+
+                # No matches found (or the search failed and we fell through).
+                # Nothing left to do for this field.
+                next unless $validates and @$validates;
+
+                # Iterate through the returned authority record IDs to delete any
+                # matching $0 subfields already in the bib record
+                foreach my $auth_zero (@$validates) {
+                    $bib_field->delete_subfield(code => '0', match => qr/\)$auth_zero$/);
+                }
+
+                # Find the best authority record to use for linking.
+
+                my $auth_id;
+                my $stop_looking = 0;
+
+                if ($bib_field->tag() ge 650 and $bib_field->tag() le 659) {
+                    # If the controlled bib field has an indicator 2 value
+                    # indicating the thesaurus, use the first authority
+                    # record in the set with the same thesaurus.
+                    # TODO: perhaps this step should be part of the
+                    # validation API search call above.
+
+                    my $cfield_ind2 = $bib_field->indicator(2);
+
+                    if (defined $cfield_ind2) {
+
+                        $auth_id = find_matching_auth_for_thesaurus(
+                            $e, $cfield_ind2, $validates) || '';
+
+                        announce("Thesaurus match for ind2=$cfield_ind2 returned $auth_id");
+
+                        # If we don't find a matching authority record using
+                        # the same thesaurus, we have nothing to link to.
+                        $stop_looking = 1;
                     }
+                }
+
+                # No best record found above. Use the first in the list.
+                $auth_id = $validates->[0] unless $auth_id or $stop_looking;
+
+                # Don't exit here just because we have no $auth_id. The
+                # bib field could have been changed above in the cleanup /
+                # delete phase.
+
+                if ($auth_id) {
+                    # Add the control number agency info from the matching
+                    # authority record to the controlled bib field.
+
+                    my $auth_003 = ($e->json_query({
+                        select => {afr => ['value']},
+                        from => 'afr',
+                        where => {'+afr' =>
+                            {tag => '003', record => $auth_id}}
+                    }) || [])->[0];
+
+                    # An authority record with no 003 row yields an empty agency prefix.
+                    my $cni = ($auth_003 && $auth_003->{value}) || '';
+                    $bib_field->add_subfields('0' => "($cni)$auth_id");
+                    $changed = 1;
+
+                    announce("auth=$auth_id cni=$cni. It's a match!");
                 }
             }
         }
+
         if ($changed) {
             # print $marc->as_formatted();
 
@@ -709,13 +766,15 @@ foreach my $rec_id (@records) {
     }
 }
 my $end_time = localtime();
+
 if($input_file) {
-    print "----- Stop " . $end_time . " for " . scalar(@records) . " records.\n";
+    announce("Stop $end_time for ".scalar(@records)." records");
+} elsif($start_id) {
+    announce("Stop $end_time for record range: $start_id => $end_id");
 } else {
-    print "----- Stop " . $end_time . " for records " . $start_id . " to " . $end_id . "\n";
+    announce("Stop $end_time for all records");
 }
 
-
 __END__
 
 =head1 NAME
-- 
2.11.0