From: Bill Erickson Date: Fri, 29 Sep 2017 20:44:23 +0000 (-0400) Subject: JBAS-1470 Bib2Auth link to fullest form of heading X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=3e1d160a8ec92a03a33957942c7dc7eacc7589ce;p=working%2FEvergreen.git JBAS-1470 Bib2Auth link to fullest form of heading Allow a bib field to link to an authority field even when the bib field has more controlled subfields than the authority field. Matching is based on finding the authority record with the closest ("left-anchored") heading match. Signed-off-by: Bill Erickson --- diff --git a/KCLS/authority-control/linking/authority_control_fields.pl b/KCLS/authority-control/linking/authority_control_fields.pl index 800915325d..727ba748d6 100755 --- a/KCLS/authority-control/linking/authority_control_fields.pl +++ b/KCLS/authority-control/linking/authority_control_fields.pl @@ -24,9 +24,11 @@ use Unicode::Normalize; use OpenSRF::System; use OpenILS::Utils::Fieldmapper; use OpenSRF::Utils::SettingsClient; +use OpenILS::Utils::Normalize qw(naco_normalize); use Data::Dumper; use OpenILS::Application::AppUtils; use OpenILS::Utils::KCLSScriptUtil; +my $U = 'OpenILS::Application::AppUtils'; my $KU = 'OpenILS::Utils::KCLSScriptUtil'; $ENV{OSRF_LOG_CLIENT} = 1; @@ -343,6 +345,15 @@ sub authority_leaders_008_14_15 { where => {'+afr' => {tag => '008', record => $auth_ids}} }); + # Sort the auth_leaders list to match the order of the origin + # auth_ids, since they are prioritized by heading matchy-ness. + my @tmp_leaders = @$auth_leaders; + $auth_leaders = []; + for my $auth_id (@$auth_ids) { + my ($leader) = grep {$_->{record} eq $auth_id} @tmp_leaders; + push(@$auth_leaders, $leader) if $leader; + } + my $index; $index = 14 if $bib_tag =~ /^[17]/; # author/name record $index = 15 if $bib_tag =~ /^6/; # subject record @@ -463,9 +474,92 @@ sub update_record { } } +sub find_potential_auth_matches { + my ($bib_field) = @_; + + my $bib_tag = $bib_field->tag; + my @c_subfields = keys %{$controllees{$bib_tag}}; + + # Iterate over the subfields within the record, instead + # of the known controlled subfields, to retain field order. + my @searches; + my $match_auth_tag; + for my $r_subfields ($bib_field->subfields) { + my $c_subfield = $r_subfields->[0]; + + # skip uncontrolled subfields. + next unless grep {$_ eq $c_subfield} @c_subfields; + + # Assume each bib field is controlled by one authority field. + $match_auth_tag = (keys %{$controllees{$bib_tag}{$c_subfield}})[0]; + + for my $sf_val ($bib_field->subfield($c_subfield)) { + push (@searches, {term => $sf_val, subfield => $c_subfield}); + } + } + + return [] unless $match_auth_tag; + + # KCLS JBAS-1470 + # Find all authority records whose simple_heading is (essentially) + # a left-anchored substring match of the normalized bib heading. + # Sort by longest to shortest match. Include the shorter matches + # because a longer match may later be discarded, e.g. because it + # uses a different thesaurus. + # + # We don't exactly want a substring match, more like a sub-tag + # match. A straight substring match on the heading is both slow + # (at the DB level) and could result in partial value matches, like + # 'smith' vs. 'smithsonian', which we don't want. + + if (1) { + + my @auth_ids; + while (@searches) { + my $heading = $match_auth_tag; + + $heading .= " " . $_->{subfield} . " " . + naco_normalize($_->{term}) for @searches; + + $KU->announce('DEBUG', + "Authority sub-heading search for: $heading"); + + my $ids = $e->search_authority_record_entry( + {simple_heading => $heading, deleted => 'f'}, + {idlist => 1} + ); + + # Don't let a single cstore query failure kill the whole process + return [] unless $ids; + + $KU->announce('DEBUG', + "Authority heading search returned @$ids") if @$ids; + + push(@auth_ids, @$ids); + pop(@searches); + } + + return \@auth_ids; + } + + # Legacy bib-to-auth lookup routine + # SHOULD NOT GET HERE. KEEPING FOR TESTS. + + $KU->announce('INFO', + "Searching for matches on controlled field $bib_tag ". + "(auth tag=$match_auth_tag): \n - ".Dumper(\@searches)); + + my $auth_ids = $U->simplereq( + 'open-ils.search', + "open-ils.search.authority.validate.tag.id_list", + "tags", [$match_auth_tag], "searches", \@searches + ); + + return $auth_ids ? $auth_ids : []; +} + my $count = 0; my $total = scalar(@records); -$KU->announce('INFO', "processing $total bib records"); # for logging if ($slot_count && defined $slot) { @@ -503,9 +597,7 @@ foreach my $rec_id (@records) { my @c_fields = keys %controllees; foreach my $c_tag (@c_fields) { - my @c_subfields = keys %{$controllees{"$c_tag"}}; - # Get the MARCXML from the record and check for controlled fields/subfields my @bib_fields = ($marc->field($c_tag)); foreach my $bib_field (@bib_fields) { @@ -535,43 +627,7 @@ foreach my $rec_id (@records) { next; } - my %match_subfields; - my $match_tag; - my @searches; - foreach my $c_subfield (@c_subfields) { - my @sf_values = $bib_field->subfield($c_subfield); - if (@sf_values) { - # Give me the first element of the list of authority controlling tags for this subfield - # XXX Will we need to support more than one controlling tag per subfield? Probably. That - # will suck. Oh well, leave that up to Ole to implement. - $match_subfields{$c_subfield} = (keys %{$controllees{$c_tag}{$c_subfield}})[0]; - $match_tag = $match_subfields{$c_subfield}; - push @searches, map {{term => $_, subfield => $c_subfield}} @sf_values; - } - } - next if !$match_tag; - - $KU->announce('INFO', - "Searching for matches on controlled field $c_tag ". - "(auth tag=$match_tag): \n - ".Dumper(\@searches)); - - my @tags = ($match_tag); - - # Now we've built up a complete set of matching controlled - # subfields for this particular field; let's check to see if - # we have a matching authority record - my $session = OpenSRF::AppSession->create("open-ils.search"); - my $validates = $session->request( - "open-ils.search.authority.validate.tag.id_list", - "tags", \@tags, "searches", \@searches - )->gather(); - - # Protect against failed (error condition) search request - if (!$validates) { - $KU->announce('WARNING', - "Search for matching authority failed; record $rec_id"); - next; - } + my $validates = find_potential_auth_matches($bib_field); $KU->announce('INFO', "Match query returned auth rec(s): @$validates");