From: Bill Erickson Date: Tue, 3 Sep 2019 21:33:19 +0000 (-0400) Subject: add marc data to bib search index X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=a47838ca372392159f2e65870e436037eb5ff2bc;p=working%2FEvergreen.git add marc data to bib search index Signed-off-by: Bill Erickson --- diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Biblio.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Biblio.pm index 883dfd80cc..af73b793ca 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Biblio.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/Biblio.pm @@ -2135,7 +2135,7 @@ sub marc_search { return OpenILS::Application::Search::ElasticMapper->marc_search( $args, ($method =~ /staff/ ? 1 : 0), $limit, $offset - ) if OpenILS::Application::Search::ElasticMapper->is_enabled('bib-marc'); + ) if OpenILS::Application::Search::ElasticMapper->is_enabled('bib-search'); # allow caller to pass in a call timeout since MARC searches # can take longer than the default 60-second timeout. diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/ElasticMapper.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/ElasticMapper.pm index c218b868b0..1bfedebc79 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/ElasticMapper.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Search/ElasticMapper.pm @@ -619,7 +619,8 @@ sub compile_elastic_marc_query { # Translate search results into a structure consistent with a bib search # API response. # TODO: This version is not currently holdings-aware, meaning it will return -# results for all non-deleted bib records that match the query. +# results for all non-deleted bib records that match the query. However, +# the data does exist in the EL index. Just need to integrate. sub marc_search { my ($class, $args, $staff, $limit, $offset) = @_; @@ -629,7 +630,7 @@ sub marc_search { my $elastic_query = compile_elastic_marc_query($args, $staff, $offset, $limit); - my $es = OpenILS::Elastic::Bib::Marc->new('main'); + my $es = OpenILS::Elastic::Bib::Search->new('main'); $es->connect; my $results = $es->search($elastic_query); diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib/Marc.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib/Marc.pm deleted file mode 100644 index dae48fa5ac..0000000000 --- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib/Marc.pm +++ /dev/null @@ -1,298 +0,0 @@ -package OpenILS::Elastic::Bib::Marc; -use base 'OpenILS::Elastic::Bib'; -# --------------------------------------------------------------- -# Copyright (C) 2019 King County Library System -# Author: Bill Erickson -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR code. See the -# GNU General Public License for more details. -# --------------------------------------------------------------- -use strict; -use warnings; -use Encode; -use DateTime; -use Time::HiRes qw/time/; -use OpenSRF::Utils::Logger qw/:logger/; -use OpenSRF::Utils::JSON; -use OpenILS::Utils::CStoreEditor qw/:funcs/; -use OpenILS::Utils::DateTime qw/interval_to_seconds/; -use OpenILS::Elastic::Bib; -use base qw/OpenILS::Elastic::Bib/; - -my $INDEX_NAME = 'bib-marc'; - -# TODO: it's possible to apply multiple language analyzers. -my $LANG_ANALYZER = 'english'; - -my $BASE_INDEX_SETTINGS = { - analysis => { - analyzer => { - folding => { - filter => ['lowercase', 'asciifolding'], - tokenizer => 'standard' - } - }, - normalizer => { - custom_lowercase => { - type => 'custom', - filter => ['lowercase'] - } - } - } -}; - -my $BASE_PROPERTIES = { - source => {type => 'integer', index => 'false'}, - create_date => {type => 'date'}, - edit_date => {type => 'date'}, - bib_source => {type => 'integer'}, - marc => { - type => 'nested', - properties => { - # tag is assumed to be composed of numbers, so no lowercase. - tag => {type => 'keyword'}, - subfield => { - type => 'keyword', - fields => { - lower => { - type => 'keyword', - normalizer => 'custom_lowercase' - } - } - }, - value => { - type => 'keyword', - fields => { - lower => { - type => 'keyword', - normalizer => 'custom_lowercase' - }, - text => { - type => 'text', - analyzer => $LANG_ANALYZER - }, - text_folded => { - type => 'text', - analyzer => 'folding' - } - } - } - - } - } -}; - -sub index_name { - return $INDEX_NAME; -} - -sub create_index { - my ($self) = @_; - - if ($self->es->indices->exists(index => $INDEX_NAME)) { - $logger->warn("ES index '$INDEX_NAME' already exists"); - return; - } - - $logger->info( - "ES creating index '$INDEX_NAME' on cluster '".$self->cluster."'"); - - my $mappings = $BASE_PROPERTIES; - my $settings = $BASE_INDEX_SETTINGS; - $settings->{number_of_replicas} = scalar(@{$self->nodes}); - $settings->{number_of_shards} = $self->index->num_shards; - - my $conf = { - index => $INDEX_NAME, - body => {settings => $settings} - }; - - $logger->info("ES creating index '$INDEX_NAME'"); - - # Create the base index with settings - eval { $self->es->indices->create($conf) }; - - if ($@) { - $logger->error("ES failed to create index cluster=". - $self->cluster. "index=$INDEX_NAME error=$@"); - warn "$@\n\n"; - return 0; - } - - # Create each mapping one at a time instead of en masse so we - # can more easily report when mapping creation fails. - - for my $field (keys %$mappings) { - $logger->info("ES Creating index mapping for field $field"); - - eval { - $self->es->indices->put_mapping({ - index => $INDEX_NAME, - type => 'record', - body => {properties => {$field => $mappings->{$field}}} - }); - }; - - if ($@) { - my $mapjson = OpenSRF::Utils::JSON->perl2JSON($mappings->{$field}); - - $logger->error("ES failed to create index mapping: " . - "index=$INDEX_NAME field=$field error=$@ mapping=$mapjson"); - - warn "$@\n\n"; - return 0; - } - } - - return 1; -} - -sub get_bib_data { - my ($self, $record_ids) = @_; - - my $ids_str = join(',', @$record_ids); - - my $sql = <get_db_rows($sql); -} - -sub populate_bib_index_batch { - my ($self, $state) = @_; - - my $index_count = 0; - - my $bib_ids = $self->get_bib_ids($state); - return 0 unless @$bib_ids; - - $logger->info("ES indexing ".scalar(@$bib_ids)." records"); - - my $bib_data = $self->get_bib_data($bib_ids); - - # Remove records that are marked deleted. - # This should only happen when running in refresh mode. - - my @active_ids; - for my $bib_id (@$bib_ids) { - - # Every row in the result data contains the 'deleted' value. - my ($field) = grep {$_->{id} == $bib_id} @$bib_data; - - if ($field->{deleted} == 1) { # not 't' / 'f' - $self->delete_documents($bib_id); - } else { - push(@active_ids, $bib_id); - } - } - - $bib_ids = [@active_ids]; - - my $marc = $self->load_marc($bib_ids); - - for my $bib_id (@$bib_ids) { - - my ($record) = grep {$_->{id} == $bib_id} @$bib_data; - - my $body = { - marc => $marc->{$bib_id} || [], - bib_source => $record->{bib_source}, - }; - - ($body->{create_date} = $record->{create_date}) =~ s/ /T/g; - ($body->{edit_date} = $record->{edit_date}) =~ s/ /T/g; - - return 0 unless $self->index_document($bib_id, $body); - - $state->{start_record} = $bib_id + 1; - $index_count++; - } - - $logger->info("ES indexing completed for records " . - $bib_ids->[0] . '...' . $bib_ids->[-1]); - - return $index_count; -} - -sub load_marc { - my ($self, $bib_ids) = @_; - - my $bib_ids_str = join(',', @$bib_ids); - - my $marc_data = $self->get_db_rows(<info("ES found ".scalar(@$marc_data). - " full record rows for current record batch"); - - my $marc = {}; - for my $row (@$marc_data) { - - my $value = $row->{value}; - next unless defined $value && $value ne ''; - - my $subfield = $row->{subfield}; - my $rec_id = $row->{record}; - delete $row->{record}; # avoid adding this to the index - - $row->{value} = $value = $self->truncate_value($value); - - $marc->{$rec_id} = [] unless $marc->{$rec_id}; - delete $row->{subfield} unless defined $subfield; - - # Add values to existing record/tag/subfield rows. - - my $existing; - for my $entry (@{$marc->{$rec_id}}) { - next unless $entry->{tag} eq $row->{tag}; - - if (defined $subfield) { - if (defined $entry->{subfield}) { - if ($subfield eq $entry->{subfield}) { - $existing = $entry; - last; - } - } - } elsif (!defined $entry->{subfield}) { - # Neither has a subfield value / not all tags have subfields - $existing = $entry; - last; - } - } - - if ($existing) { - - $existing->{value} = [$existing->{value}] unless ref $existing->{value}; - push(@{$existing->{value}}, $value); - - } else { - - push(@{$marc->{$rec_id}}, $row); - } - } - - return $marc; -} - - -1; - - diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib/Search.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib/Search.pm index 37e75bcee1..55da7d620e 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib/Search.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/Bib/Search.pm @@ -67,7 +67,42 @@ my $BASE_PROPERTIES = { circulate => {type => 'boolean'}, opac_visible => {type => 'boolean'} } + }, + marc => { + type => 'nested', + properties => { + # tag is assumed to be composed of numbers, so no lowercase. + tag => {type => 'keyword'}, + subfield => { + type => 'keyword', + fields => { + lower => { + type => 'keyword', + normalizer => 'custom_lowercase' + } + } + }, + value => { + type => 'keyword', + fields => { + lower => { + type => 'keyword', + normalizer => 'custom_lowercase' + }, + text => { + type => 'text', + analyzer => $LANG_ANALYZER + }, + text_folded => { + type => 'text', + analyzer => 'folding' + } + } + } + + } } + }; sub index_name { @@ -233,10 +268,12 @@ sub populate_bib_index_batch { $bib_ids = [@active_ids]; my $holdings = $self->load_holdings($bib_ids); + my $marc = $self->load_marc($bib_ids); for my $bib_id (@$bib_ids) { my $body = { + marc => $marc->{$bib_id} || [], holdings => $holdings->{$bib_id} || [] }; @@ -338,6 +375,69 @@ SQL return $holdings; } +sub load_marc { + my ($self, $bib_ids) = @_; + + my $bib_ids_str = join(',', @$bib_ids); + + my $marc_data = $self->get_db_rows(<info("ES found ".scalar(@$marc_data). + " MARC rows for current record batch"); + + my $marc = {}; + for my $row (@$marc_data) { + + my $value = $row->{value}; + next unless defined $value && $value ne ''; + + my $subfield = $row->{subfield}; + my $rec_id = $row->{record}; + delete $row->{record}; # avoid adding this to the index + + $row->{value} = $value = $self->truncate_value($value); + + $marc->{$rec_id} = [] unless $marc->{$rec_id}; + delete $row->{subfield} unless defined $subfield; + + # Add values to existing record/tag/subfield rows. + + my $existing; + for my $entry (@{$marc->{$rec_id}}) { + next unless $entry->{tag} eq $row->{tag}; + + if (defined $subfield) { + if (defined $entry->{subfield}) { + if ($subfield eq $entry->{subfield}) { + $existing = $entry; + last; + } + } + } elsif (!defined $entry->{subfield}) { + # Neither has a subfield value / not all tags have subfields + $existing = $entry; + last; + } + } + + if ($existing) { + + $existing->{value} = [$existing->{value}] unless ref $existing->{value}; + push(@{$existing->{value}}, $value); + + } else { + + push(@{$marc->{$rec_id}}, $row); + } + } + + return $marc; +} + 1;