From 3fe4d9e528989dc60b59c9e67d0a50259db806f0 Mon Sep 17 00:00:00 2001 From: Bill Erickson Date: Wed, 21 Dec 2016 15:31:49 -0500 Subject: [PATCH] JBAS-1699 Linking config loaded from database Load the bib-to-auth record maps from the authority.control_set_bib_field data instead of hard-coding the maps in the script. This sychronizes the linking configs and the authority data propagation configs to avoid cases where a mismatch can lead to bib field deletions. Signed-off-by: Bill Erickson --- .../linking/authority_control_fields.pl | 345 +++++++++++++-------- 1 file changed, 212 insertions(+), 133 deletions(-) diff --git a/KCLS/authority-control/linking/authority_control_fields.pl b/KCLS/authority-control/linking/authority_control_fields.pl index 7e36820b0c..932303562f 100755 --- a/KCLS/authority-control/linking/authority_control_fields.pl +++ b/KCLS/authority-control/linking/authority_control_fields.pl @@ -273,7 +273,7 @@ exit if $count_only || $print_ids; # # So, if the bib 650$a can be controlled by an auth 150$a, that maps to: # 650 => { a => { 150 => 'a'}} -my %controllees = ( +my %_controllees = ( 100 => { a => { 100 => 'a' }, b => { 100 => 'b' }, c => { 100 => 'c' }, @@ -683,6 +683,72 @@ my %controllees = ( }, ); + +my %controllees; +sub get_controlled_fields { + + # Fetch bib field configs for fields we want to link. + my $bib_fields = $e->search_authority_control_set_bib_field([ + {id => {'<>' => undef}}, + {flesh => 1, flesh_fields => {acsbf => ['authority_field']}} + ]); + + for my $bib_field (@$bib_fields) { + + # We only link bib 1XX, 6XX, 7XX, and 8XX fields + next unless $bib_field->tag =~ /^[1678]/; + + # Ignore authority 18X fields + next if $bib_field->authority_field->tag =~ /^18/; + + my @subfields = split(//, $bib_field->authority_field->sf_list); + + # Remove some nonsensical subfield mappings... + + @subfields = grep { $_ !~ /[vxyz]/ } @subfields + unless ($bib_field->tag =~ /^6/); + + @subfields = grep { $_ !~ /[mors]/ } @subfields + unless ($bib_field->tag =~ /(130|600|610|630|730|830)/); + + $controllees{$bib_field->tag} ||= {}; + $controllees{$bib_field->tag}->{$_} = + {$bib_field->authority_field->tag => $_} for @subfields; + } +} + +get_controlled_fields(); + +for my $tag (sort keys %controllees) { +# if (!exists $_controllees{$tag}) { +# print "adding mapping for bib tag $tag\n"; +# next; +# } + + for my $subfield (sort keys %{$controllees{$tag}}) { + for my $authtag (sort keys %{$controllees{$tag}->{$subfield}}) { + next if + $_controllees{$tag} && + $_controllees{$tag}{$subfield} && + $_controllees{$tag}{$subfield}{$authtag}; + + print "add mapping for bib $tag \$$subfield ". + "=> authority $authtag \$$subfield\n"; + } + } +} + +for my $tag (sort keys %_controllees) { + if (!exists $controllees{$tag}) { + print "removing mapping for bib tag $tag\n"; + next; + } + for my $subfield (sort keys %{$_controllees{$tag}}) { + print "removing mapping for bib tag $tag \$$subfield\n" + unless exists $controllees{$tag}{$subfield}; + } +} + # mapping of authority leader/11 "Subject heading system/thesaurus" # to the matching bib record indicator my %AUTH_TO_BIB_IND2 = ( @@ -820,6 +886,14 @@ sub update_record { $xml =~ s/\p{Cc}//go; $xml = OpenILS::Application::AppUtils->entityize($xml); + if ($record->marc eq $xml) { + $KU->announce('DEBUG', "Skipping update on bib ".$record->id. + "... no changes were made"); + return; + } + + $KU->announce('DEBUG', "Applying updates to bib ".$record->id); + $record->marc($xml); my $editor = OpenILS::Utils::CStoreEditor->new(xact=>1); @@ -834,16 +908,20 @@ my $count = 0; my $total = scalar(@records); $KU->announce('INFO', "processing $total bib records"); -$slot ||= 'N/A'; +# for logging +if ($slot_count && defined $slot) { + $slot++ +} else { + $slot = 'N'; + $slot_count = 'A'; +} + my $start_time = time(); foreach my $rec_id (@records) { $count++; - $KU->announce('INFO', "processed $count of $total [slot=$slot]") - if ($count % 1000) == 0; - - $KU->announce('DEBUG', - "processing bib record $rec_id [$count of $total; slot=$slot]"); + $KU->announce('DEBUG', "processing bib record $rec_id ". + "[$count of $total; slot=$slot/$slot_count]"); # State variable; was the record changed? my $changed = 0; @@ -852,157 +930,158 @@ foreach my $rec_id (@records) { my $record = $e->retrieve_biblio_record_entry($rec_id); next unless $record && $record->deleted eq 'f'; - eval { - my $marc = MARC::Record->new_from_xml($record->marc()); + my $marc; - # get the list of controlled fields - my @c_fields = keys %controllees; + eval { $marc = MARC::Record->new_from_xml($record->marc); }; - foreach my $c_tag (@c_fields) { - my @c_subfields = keys %{$controllees{"$c_tag"}}; + if ($@) { + $KU->announce('WARNING', "Error parsing record $rec_id : $@"); + import MARC::File::XML; # Reset SAX parser + next; + } - # Get the MARCXML from the record and check for controlled fields/subfields - my @bib_fields = ($marc->field($c_tag)); - foreach my $bib_field (@bib_fields) { + # get the list of controlled fields + my @c_fields = keys %controllees; - my $sf0 = $bib_field->subfield('0') || ''; - my $is_fast_heading = is_fast_heading($bib_field); + foreach my $c_tag (@c_fields) { + my @c_subfields = keys %{$controllees{"$c_tag"}}; - if ($is_fast_heading && $sf0 =~ /\)fst/) { - # fast heading looks OK. ignore it. - $KU->announce('DEBUG', "Ignoring FAST heading field on ". - "rec=$rec_id and tag=$c_tag \$0 $sf0"); - next; - } + # Get the MARCXML from the record and check for controlled fields/subfields + my @bib_fields = ($marc->field($c_tag)); + foreach my $bib_field (@bib_fields) { - if ($sf0) { - $KU->announce('DEBUG', - "Removing \$0 $sf0 for rec=$rec_id and tag=$c_tag"); - $bib_field->delete_subfield(code => '0'); - $changed = 1; - } + my $sf0 = $bib_field->subfield('0') || ''; + my $is_fast_heading = is_fast_heading($bib_field); - if ($is_fast_heading) { - # Update record after potentially removing a bogus - # fast heading above. - update_record($record, $marc) if $changed; - - # We don't control fast headings, so there's nothing - # left to do. Move on to the next field... - $KU->announce('DEBUG', - "No linking performed on FAST heading ". - "field on rec=$rec_id and tag=$c_tag"); - next; - } + if ($is_fast_heading && $sf0 =~ /\)fst/) { + # fast heading looks OK. ignore it. + $KU->announce('DEBUG', "Ignoring FAST heading field on ". + "rec=$rec_id and tag=$c_tag \$0 $sf0"); + next; + } - my %match_subfields; - my $match_tag; - my @searches; - foreach my $c_subfield (@c_subfields) { - my @sf_values = $bib_field->subfield($c_subfield); - if (@sf_values) { - # Give me the first element of the list of authority controlling tags for this subfield - # XXX Will we need to support more than one controlling tag per subfield? Probably. That - # will suck. Oh well, leave that up to Ole to implement. - $match_subfields{$c_subfield} = (keys %{$controllees{$c_tag}{$c_subfield}})[0]; - $match_tag = $match_subfields{$c_subfield}; - push @searches, map {{term => $_, subfield => $c_subfield}} @sf_values; - } - } - next if !$match_tag; + if ($sf0) { + $KU->announce('DEBUG', + "Removing \$0 $sf0 for rec=$rec_id and tag=$c_tag"); + $bib_field->delete_subfield(code => '0'); + $changed = 1; + } - $KU->announce('INFO', - "Searching for matches on controlled field $c_tag ". - "(auth tag=$match_tag): \n - ".Dumper(\@searches)); - - my @tags = ($match_tag); - - # Now we've built up a complete set of matching controlled - # subfields for this particular field; let's check to see if - # we have a matching authority record - my $session = OpenSRF::AppSession->create("open-ils.search"); - my $validates = $session->request("open-ils.search.authority.validate.tag.id_list", - "tags", \@tags, "searches", \@searches - )->gather(); - $session->disconnect(); - - - # Protect against failed (error condition) search request - if (!$validates) { - $KU->announce('WARNING', - "Search for matching authority failed; record $rec_id"); - next unless $changed; + if ($is_fast_heading) { + # Update record after potentially removing a bogus + # fast heading above. + update_record($record, $marc) if $changed; + + # We don't control fast headings, so there's nothing + # left to do. Move on to the next field... + $KU->announce('DEBUG', + "No linking performed on FAST heading ". + "field on rec=$rec_id and tag=$c_tag"); + next; + } + + my %match_subfields; + my $match_tag; + my @searches; + foreach my $c_subfield (@c_subfields) { + my @sf_values = $bib_field->subfield($c_subfield); + if (@sf_values) { + # Give me the first element of the list of authority controlling tags for this subfield + # XXX Will we need to support more than one controlling tag per subfield? Probably. That + # will suck. Oh well, leave that up to Ole to implement. + $match_subfields{$c_subfield} = (keys %{$controllees{$c_tag}{$c_subfield}})[0]; + $match_tag = $match_subfields{$c_subfield}; + push @searches, map {{term => $_, subfield => $c_subfield}} @sf_values; } + } + next if !$match_tag; - $KU->announce('INFO', "Match query returned @$validates"); + $KU->announce('INFO', + "Searching for matches on controlled field $c_tag ". + "(auth tag=$match_tag): \n - ".Dumper(\@searches)); + + my @tags = ($match_tag); + + # Now we've built up a complete set of matching controlled + # subfields for this particular field; let's check to see if + # we have a matching authority record + my $session = OpenSRF::AppSession->create("open-ils.search"); + my $validates = $session->request( + "open-ils.search.authority.validate.tag.id_list", + "tags", \@tags, "searches", \@searches + )->gather(); + + # Protect against failed (error condition) search request + if (!$validates) { + $KU->announce('WARNING', + "Search for matching authority failed; record $rec_id"); + next; + } - # No matches found. Nothing left to do for this field. - next if scalar(@$validates) == 0; + $KU->announce('INFO', "Match query returned @$validates"); - # Iterate through the returned authority record IDs to delete any - # matching $0 subfields already in the bib record - foreach my $auth_zero (@$validates) { - $bib_field->delete_subfield(code => '0', match => qr/\)$auth_zero$/); - } + # No matches found. Nothing left to do for this field. + next if scalar(@$validates) == 0; - # Find the best authority record to use for linking. + # Iterate through the returned authority record IDs to delete any + # matching $0 subfields already in the bib record + foreach my $auth_zero (@$validates) { + $bib_field->delete_subfield(code => '0', match => qr/\)$auth_zero$/); + } - my $auth_leaders; - if ($bib_field->tag =~ /^[167]/) { - # For 1XX, 6XX, and 7XX bib fields, only link to - # authority records whose leader/008 positions - # 14 and 15 are coded to allow use as a name/author - # or subject record, depending. + # Find the best authority record to use for linking. - $auth_leaders = authority_leaders_008_14_15( - $e, $bib_field->tag, $validates); + my $auth_leaders; + if ($bib_field->tag =~ /^[167]/) { + # For 1XX, 6XX, and 7XX bib fields, only link to + # authority records whose leader/008 positions + # 14 and 15 are coded to allow use as a name/author + # or subject record, depending. - $validates = [map {$_->{record}} @$auth_leaders]; - } + $auth_leaders = authority_leaders_008_14_15( + $e, $bib_field->tag, $validates); - my $auth_id; + $validates = [map {$_->{record}} @$auth_leaders]; + } - if ($bib_field->tag() =~ /^65[015]/) { - # Using the indicator-2 value from the controlled bib - # field, find the first authority in the list of matches - # that uses the same thesaurus. If no such authority - # is found, no matching occurs. - # TODO: perhaps this step should be part of the - # validation API search call above. - - $auth_id = find_matching_auth_for_thesaurus( - $e, $bib_field, $auth_leaders) || ''; + my $auth_id; - } else { + if ($bib_field->tag() =~ /^65[015]/) { + # Using the indicator-2 value from the controlled bib + # field, find the first authority in the list of matches + # that uses the same thesaurus. If no such authority + # is found, no matching occurs. + # TODO: perhaps this step should be part of the + # validation API search call above. + + $auth_id = find_matching_auth_for_thesaurus( + $e, $bib_field, $auth_leaders) || ''; - # For all other controlled fields, use the first - # authority record in the result set. - $auth_id = $validates->[0]; - } + } else { - # Don't exit here just because we have no $auth_id. The - # bib field could have been changed above in the cleanup / - # delete phase. - - if ($auth_id) { - # Add the auth ID and control number agency info from the - # matching authority record to the controlled bib field. - $changed = 1; - $bib_field->add_subfields('0' => "($CNI)$auth_id"); - $KU->announce('INFO', - "auth=$auth_id cni=$CNI. It's a match!"); - } + # For all other controlled fields, use the first + # authority record in the result set. + $auth_id = $validates->[0]; } - } - update_record($record, $marc) if $changed; - }; + # Don't exit here just because we have no $auth_id. The + # bib field could have been changed above in the cleanup / + # delete phase. + + if ($auth_id) { + # Add the auth ID and control number agency info from the + # matching authority record to the controlled bib field. + $changed = 1; + $bib_field->add_subfields('0' => "($CNI)$auth_id"); + $KU->announce('INFO', + "auth=$auth_id cni=$CNI. It's a match!"); + } - if ($@) { - $KU->announce('WARNING', "Error linking record $rec_id : $@"); - # Reset SAX parser so that one bad record doesn't kill the entire process - import MARC::File::XML; - } + } # for each bib field with selected tag + + } # for each controlled bib tag + + update_record($record, $marc) if $changed; } my $end_time = time(); -- 2.11.0