From d456674cf5e97e597aeb0b4fd68cb5187c35b7c1 Mon Sep 17 00:00:00 2001 From: Bill Erickson Date: Thu, 20 Feb 2020 12:39:50 -0500 Subject: [PATCH] LP1844418 Direct indexing experiment WIP Signed-off-by: Bill Erickson --- Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm | 3 +- .../src/perlmods/lib/OpenILS/Elastic/BibSearch.pm | 294 +++-- .../perlmods/lib/OpenILS/Elastic/BibSearch/XSLT.pm | 199 --- .../sql/Pg/upgrade/XXXX.schema.elastic-search.sql | 215 ---- Open-ILS/src/support-scripts/elastic-index.pl | 24 +- Open-ILS/xsl/elastic-bib-transform.xsl | 1278 ++++++++++++++++++++ 6 files changed, 1508 insertions(+), 505 deletions(-) delete mode 100644 Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch/XSLT.pm create mode 100644 Open-ILS/xsl/elastic-bib-transform.xsl diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm index 9462500f88..5f67427938 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic.pm @@ -367,7 +367,8 @@ sub search { # Avoid trying to index such data by lazily chopping it off # at 1/4 the limit to accomodate all UTF-8 chars. 
sub truncate_value { - my ($self, $value) = @_; + my ($self, $value, $length) = @_; + $length = 8190 unless $length; - return substr($value, 0, 8190); + return substr($value, 0, $length); } diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm index 5bcb5aed4d..7143bda4da 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch.pm @@ -1,6 +1,5 @@ -package OpenILS::Elastic::BibSearch; # --------------------------------------------------------------- -# Copyright (C) 2019 King County Library System +# Copyright (C) 2019-2020 King County Library System # Author: Bill Erickson # # This program is free software; you can redistribute it and/or @@ -13,9 +12,48 @@ package OpenILS::Elastic::BibSearch; # MERCHANTABILITY or FITNESS FOR A PARTICULAR code. See the # GNU General Public License for more details. # --------------------------------------------------------------- +package OpenILS::Elastic::BibSearch::BibField; +# Models a single indexable field. 
+use strict; +use warnings; + +sub new { + my ($class, %args) = @_; + return bless(\%args, $class); +} +sub name { + my $self = shift; + return $self->{name}; +} +sub field_class { + my $self = shift; + return $self->{field_class}; +} +sub search_field { + my $self = shift; + return $self->{purpose} eq 'search'; +} +sub facet_field { + my $self = shift; + return $self->{purpose} eq 'facet'; +} +sub sorter { + my $self = shift; + return $self->{purpose} eq 'sorter'; +} +sub filter { + my $self = shift; + return $self->{purpose} eq 'filter'; +} +sub weight { + my $self = shift; + return $self->{weight} || 1; +} + +# --------------------------------------------------------------- +package OpenILS::Elastic::BibSearch; use strict; use warnings; -use Encode; use DateTime; use Clone 'clone'; use Time::HiRes qw/time/; @@ -32,11 +70,17 @@ my $DEFAULT_BIB_BATCH_SIZE = 500; my $INDEX_CLASS = 'bib-search'; # https://www.elastic.co/guide/en/elasticsearch/reference/current/ignore-above.html -# Useful for ignoring excessively long filters, sorters, and facets. +# Useful for ignoring excessively long filters and facets. # Only applied to the keyword variation of each index. Does not affect -# the 'text' varieties. +# the 'text' varieties. The selected limit is arbitrary. my $IGNORE_ABOVE = 256; +# Individual characters of some values like sorters provide less and less +# value as the length of the text gets longer and longer. Unlike +# $IGNORE_ABOVE, this only trims the string, it does not prevent it from +# getting indexed in the first place. The selected limit is arbitrary. +my $TRIM_ABOVE = 512; + my $BASE_INDEX_SETTINGS = { analysis => { analyzer => { @@ -188,17 +232,6 @@ my $BASE_PROPERTIES = { ignore_above => $IGNORE_ABOVE, normalizer => 'custom_lowercase', }, - - # Create some shortcut indexes for streamlining query_string searches. 
- ti => {type => 'text'}, - au => {type => 'text'}, - se => {type => 'text'}, - su => {type => 'text'}, - kw => {type => 'text'}, - id => { - type => 'keyword', - ignore_above => $IGNORE_ABOVE - } }; my %SHORT_GROUP_MAP = ( @@ -214,20 +247,144 @@ sub index_class { return $INDEX_CLASS; } -# TODO: add index-specific language analyzers to DB config +# TODO: determine when/how to apply language analyzers. +# e.g. create lang-specific index fields? sub language_analyzers { return ("english"); } +sub xsl_file { + my ($self, $filename) = @_; + $self->{xsl_file} = $filename if $filename; + return $self->{xsl_file}; +} + +sub xsl_doc { + my ($self) = @_; + + $self->{xsl_doc} = XML::LibXML->load_xml(location => $self->xsl_file) + unless $self->{xsl_doc}; + + return $self->{xsl_doc}; +} + +sub xsl_sheet { + my $self = shift; + + $self->{xsl_sheet} = XML::LibXSLT->new->parse_stylesheet($self->xsl_doc) + unless $self->{xsl_sheet}; + + return $self->{xsl_sheet}; +} + +my @seen_fields; +sub add_dynamic_field { + my ($self, $fields, $purpose, $field_class, $name, $weight) = @_; + return unless $name; + + $weight = '' if !$weight || $weight eq '_'; + $field_class = '' if !$field_class || $field_class eq '_'; + + my $tag = $purpose . $field_class . $name; + return if grep {$_ eq $tag} @seen_fields; + push(@seen_fields, $tag); + + $logger->info("ES adding dynamic field purpose=$purpose ". + "field_class=$field_class name=$name weight=$weight"); + + my $field = OpenILS::Elastic::BibSearch::BibField->new( + purpose => $purpose, + field_class => $field_class, + name => $name, + weight => $weight + ); + + push(@$fields, $field); +} + sub get_dynamic_fields { my $self = shift; + my $fields = []; + + @seen_fields = (); # reset with each run - # elastic.bib_field has no primary key field, so retrieve_all won't work. - # Note the name value may be repeated across search group depending - # on local configuration. 
- return new_editor()->search_elastic_bib_field({name => {'!=' => undef}}); + my $null_doc = XML::LibXML->load_xml(string => ''); + my $result = $self->xsl_sheet->transform($null_doc, target => '"index-fields"'); + my $output = $self->xsl_sheet->output_as_chars($result); + + my @rows = split(/\n/, $output); + for my $row (@rows) { + my @parts = split(/ /, $row); + $self->add_dynamic_field($fields, @parts); + } + + return $fields; } +sub get_bib_data { + my ($self, $record_ids) = @_; + + my $bib_data = []; + my $db_data = $self->get_bib_db_data($record_ids); + + for my $db_rec (@$db_data) { + + if ($db_rec->{deleted} == 1) { + # No need to extract index values. + push(@$bib_data, {deleted => 1}); + next; + } + + my $marc_doc = XML::LibXML->load_xml(string => $db_rec->{marc}); + my $result = $self->xsl_sheet->transform($marc_doc, target => '"index-values"'); + my $output = $self->xsl_sheet->output_as_chars($result); + + my @rows = split(/\n/, $output); + for my $row (@rows) { + my ($purpose, $field_class, $name, @tokens) = split(/ /, $row); + + $field_class = '' if ($field_class || '') eq '_'; + + my $value = join(' ', @tokens); + + my $field = { + purpose => $purpose, + field_class => $field_class, + name => $name, + value => $value + }; + + # Stamp each field with the additional bib metadata. + $field->{$_} = $db_rec->{$_} for + qw/id bib_source metarecord create_date edit_date deleted/; + + push(@$bib_data, $field); + } + } + + return $bib_data; +} + +sub get_bib_db_data { + my ($self, $record_ids) = @_; + + my $ids_str = join(',', @$record_ids); + + my $sql = <get_db_rows($sql); +} sub create_index_properties { my ($self) = @_; @@ -252,25 +409,21 @@ sub create_index_properties { my $fields = $self->get_dynamic_fields; - $logger->info('ES ' . 
OpenSRF::Utils::JSON->perl2JSON($fields)); - for my $field (@$fields) { - my $field_name = $field->name; my $field_class = $field->field_class; $field_name = "$field_class|$field_name" if $field_class; - $logger->info("ES ONE FIELD name=$field_name: " . OpenSRF::Utils::JSON->perl2JSON($field)); - my $def; if ($field_class) { - if ($field->search_field eq 't') { + if ($field->search_field) { # Use the same fields and analysis as the 'grouped' field. $def = clone($properties->{$field_class}); - $def->{copy_to} = [$field_class, $SHORT_GROUP_MAP{$field_class}]; + # Copy grouped fields into their group parent field. + $def->{copy_to} = $field_class; # Apply ranking boost to each analysis variation. my $flds = $def->{fields}; @@ -284,9 +437,12 @@ sub create_index_properties { $def = { type => 'keyword', - ignore_above => $IGNORE_ABOVE, normalizer => 'custom_lowercase' }; + + # Long sorter values are not necessarily unexpected, + # e.g. long titles. + $def->{ignore_above} = $IGNORE_ABOVE unless $field->sorter; } if ($def) { @@ -299,7 +455,7 @@ sub create_index_properties { # Search and facet fields can have the same name/group pair, # but are stored as separate fields in ES since the content # may vary between the two. - if ($field->facet_field eq 't') { + if ($field->facet_field) { # Facet fields are stored as separate fields, because their # content may differ from the matching search field. @@ -360,58 +516,48 @@ sub create_index { # Create each mapping one at a time instead of en masse so we # can more easily report when mapping creation fails. 
- for my $field (keys %$properties) { - $logger->info("ES Creating index mapping for field $field"); - - eval { - $self->es->indices->put_mapping({ - index => $index_name, - type => 'record', - body => { - dynamic => 'strict', - properties => {$field => $properties->{$field}} - } - }); - }; - - if ($@) { - my $mapjson = OpenSRF::Utils::JSON->perl2JSON($properties->{$field}); - - $logger->error("ES failed to create index mapping: " . - "index=$index_name field=$field error=$@ mapping=$mapjson"); + return 0 unless + $self->create_one_field_index($field, $properties->{$field}); + } - warn "$@\n\n"; - return 0; - } + # Now that we've added the static (and dynamic) fields, + # add the shortened field_class aliases. + while (my ($field, $alias) = each %SHORT_GROUP_MAP) { + return 0 unless $self->create_one_field_index( + $alias, {type => 'alias', path => $field}); } return 1; } -# TODO: elastic.bib_record_properties needs to also pull values -# from metabib.facet_entry -# TODO: stamp each field with a 'purpose' (search, facet, filter, sorter) -sub get_bib_data { - my ($self, $record_ids) = @_; +sub create_one_field_index { + my ($self, $field, $properties) = @_; + my $index_name = $self->index_name; + $logger->info("ES Creating index mapping for field $field"); + + eval { + $self->es->indices->put_mapping({ + index => $index_name, + type => 'record', + body => { + dynamic => 'strict', + properties => {$field => $properties} + } + }); + }; - my $ids_str = join(',', @$record_ids); + if ($@) { + my $mapjson = OpenSRF::Utils::JSON->perl2JSON($properties); - my $sql = <error("ES failed to create index mapping: " . 
+ "index=$index_name field=$field error=$@ mapping=$mapjson"); - return $self->get_db_rows($sql); + warn "$@\n\n"; + return 0; + } + + return 1; } sub populate_bib_index_batch { @@ -480,7 +626,9 @@ sub populate_bib_index_batch { $fname = "$fclass|$fname" if $fclass; $fname = "$fname|facet" if $field->{purpose} eq 'facet'; - $value = $self->truncate_value($value); + + my $trim = $field->{purpose} eq 'sorter' ? $TRIM_ABOVE : undef; + $value = $self->truncate_value($value, $trim); if ($fname eq 'identifier|isbn') { index_isbns($body, $value); diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch/XSLT.pm b/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch/XSLT.pm deleted file mode 100644 index 6178936dbc..0000000000 --- a/Open-ILS/src/perlmods/lib/OpenILS/Elastic/BibSearch/XSLT.pm +++ /dev/null @@ -1,199 +0,0 @@ -# --------------------------------------------------------------- -# Copyright (C) 2020 King County Library System -# Author: Bill Erickson -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR code. See the -# GNU General Public License for more details. -# --------------------------------------------------------------- -package OpenILS::Elastic::BibSearch::BibField; -# Helper class for modeling an elastic bib field. -# This is what OpenILS::Elastic::BibSearch expects. - -sub new { - my ($class, %args) = @_; - return bless(\%args, $class); -} -sub name { - my $self = shift; - return $self->{name}; -} -sub field_class { - my $self = shift; - return $self->{field_class}; -} -sub search_field { - my $self = shift; - return $self->{purpose} eq 'search' ? 
't' : 'f'; -} -sub facet_field { - my $self = shift; - return $self->{purpose} eq 'facet' ? 't' : 'f'; -} -sub sorter { - my $self = shift; - return $self->{purpose} eq 'sorter' ? 't' : 'f'; -} -sub filter { - my $self = shift; - return $self->{purpose} eq 'filter' ? 't' : 'f'; -} -sub weight { - my $self = shift; - return $self->{weight} || 1; -} - -package OpenILS::Elastic::BibSearch::XSLT; -use strict; -use warnings; -use XML::LibXML; -use XML::LibXSLT; -use OpenSRF::Utils::Logger qw/:logger/; -use OpenILS::Utils::CStoreEditor qw/:funcs/; -use OpenILS::Elastic::BibSearch; -use base qw/OpenILS::Elastic::BibSearch/; - - -sub xsl_file { - my ($self, $filename) = @_; - $self->{xsl_file} = $filename if $filename; - return $self->{xsl_file}; -} - -sub xsl_doc { - my ($self) = @_; - - $self->{xsl_doc} = XML::LibXML->load_xml(location => $self->xsl_file) - unless $self->{xsl_doc}; - - return $self->{xsl_doc}; -} - -sub xsl_sheet { - my $self = shift; - - $self->{xsl_sheet} = XML::LibXSLT->new->parse_stylesheet($self->xsl_doc) - unless $self->{xsl_sheet}; - - return $self->{xsl_sheet}; -} - - -my @seen_fields; -sub add_dynamic_field { - my ($self, $fields, $purpose, $field_class, $name, $weight) = @_; - return unless $name; - - $weight = '' if !$weight || $weight eq '_'; - $field_class = '' if !$field_class || $field_class eq '_'; - - my $tag = $purpose . $field_class . $name; - return if grep {$_ eq $tag} @seen_fields; - push(@seen_fields, $tag); - - $logger->info("ES adding dynamic field purpose=$purpose ". 
- "field_class=$field_class name=$name weight=$weight"); - - my $field = OpenILS::Elastic::BibSearch::BibField->new( - purpose => $purpose, - field_class => $field_class, - name => $name, - weight => $weight - ); - - push(@$fields, $field); -} - -sub get_dynamic_fields { - my $self = shift; - my $fields = []; - - @seen_fields = (); # reset with each run - - my $null_doc = XML::LibXML->load_xml(string => ''); - my $result = $self->xsl_sheet->transform($null_doc, target => '"index-fields"'); - my $output = $self->xsl_sheet->output_as_chars($result); - - my @rows = split(/\n/, $output); - for my $row (@rows) { - my @parts = split(/ /, $row); - $self->add_dynamic_field($fields, @parts); - } - - return $fields; -} - -sub get_bib_data { - my ($self, $record_ids) = @_; - - my $bib_data = []; - my $db_data = $self->get_bib_db_data($record_ids); - - for my $db_rec (@$db_data) { - - if ($db_rec->{deleted} == 1) { - # No need to extract index values. - push(@$bib_data, {deleted => 1}); - next; - } - - my $marc_doc = XML::LibXML->load_xml(string => $db_rec->{marc}); - my $result = $self->xsl_sheet->transform($marc_doc, target => '"index-values"'); - my $output = $self->xsl_sheet->output_as_chars($result); - - my @rows = split(/\n/, $output); - for my $row (@rows) { - my ($purpose, $field_class, $name, @tokens) = split(/ /, $row); - - $field_class = '' if ($field_class || '') eq '_'; - - my $value = join(' ', @tokens); - - my $field = { - purpose => $purpose, - field_class => $field_class, - name => $name, - value => $value - }; - - # Stamp each field with the additional bib metadata. 
- $field->{$_} = $db_rec->{$_} for - qw/id bib_source metarecord create_date edit_date deleted/; - - push(@$bib_data, $field); - } - } - - return $bib_data; -} - -sub get_bib_db_data { - my ($self, $record_ids) = @_; - - my $ids_str = join(',', @$record_ids); - - my $sql = <get_db_rows($sql); -} - - -1; - diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.elastic-search.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.elastic-search.sql index 193002adff..077101f769 100644 --- a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.elastic-search.sql +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.elastic-search.sql @@ -3,30 +3,10 @@ DROP SCHEMA IF EXISTS elastic CASCADE; BEGIN; -ALTER TABLE config.record_attr_definition - ADD COLUMN elastic_field BOOLEAN NOT NULL DEFAULT FALSE; - -ALTER TABLE config.metabib_field - ADD COLUMN elastic_field BOOLEAN NOT NULL DEFAULT FALSE; - --- Provide a sweeping set of default elastic fields. --- Likely this set of fields can be trimmed significantly for most sites, --- since many of these fields will never be searched from the catalog. --- Reducing the number of elastic_field's will improve indexing time, --- search time, and reduce Elastic disk space requirements. 
-UPDATE config.record_attr_definition - SET elastic_field = TRUE WHERE name NOT LIKE 'marc21_%'; - -UPDATE config.metabib_field - SET elastic_field = TRUE WHERE search_field OR facet_field; - INSERT INTO config.global_flag (name, enabled, label) VALUES ( 'elastic.bib_search.enabled', FALSE, 'Elasticsearch Enable Bib Searching' -), ( - 'elastic.bib_search.dynamic_properties', FALSE, - 'Elasticsearch Dynamic Bib Record Properties' ); CREATE SCHEMA elastic; @@ -61,170 +41,6 @@ CREATE TABLE elastic.index ( CONSTRAINT valid_index_class CHECK (index_class IN ('bib-search')) ); -CREATE OR REPLACE VIEW elastic.bib_field AS - SELECT fields.* FROM ( - SELECT - NULL::INT AS metabib_field, - crad.name, - crad.label, - NULL AS field_class, - crad.sorter, - FALSE AS search_field, - FALSE AS facet_field, - 1 AS weight - FROM config.record_attr_definition crad - WHERE crad.elastic_field - UNION - SELECT - cmf.id AS metabib_field, - cmf.name, - cmf.label, - cmf.field_class, - FALSE AS sorter, - -- always treat identifier fields as non-search fields. 
- (cmf.field_class <> 'identifier' AND cmf.search_field) AS search_field, - cmf.facet_field, - cmf.weight - FROM config.metabib_field cmf - WHERE cmf.elastic_field - ) fields; - - -CREATE OR REPLACE FUNCTION elastic.bib_record_attrs(bre_id BIGINT) -RETURNS TABLE ( - field_class TEXT, - name TEXT, - source BIGINT, - value TEXT -) -AS $FUNK$ - SELECT DISTINCT record.* FROM ( - SELECT - NULL::TEXT AS field_class, - crad.name, - mrs.source, - mrs.value - FROM metabib.record_sorter mrs - JOIN config.record_attr_definition crad ON (crad.name = mrs.attr) - WHERE mrs.source = $1 AND crad.elastic_field - UNION - - -- record attributes - SELECT - NULL::TEXT AS field_class, - crad.name, - mraf.id AS source, - mraf.value - FROM metabib.record_attr_flat mraf - JOIN config.record_attr_definition crad ON (crad.name = mraf.attr) - WHERE mraf.id = $1 AND crad.elastic_field - ) record -$FUNK$ LANGUAGE SQL STABLE; - -CREATE OR REPLACE FUNCTION elastic.bib_record_static_props(bre_id BIGINT) -RETURNS TABLE ( - field_class TEXT, - name TEXT, - source BIGINT, - value TEXT -) -AS $FUNK$ - SELECT DISTINCT record.* FROM ( - SELECT - cmf.field_class, - cmf.name, - props.source, - CASE WHEN cmf.joiner IS NOT NULL THEN - REGEXP_SPLIT_TO_TABLE(props.value, cmf.joiner) - ELSE - props.value - END AS value - FROM ( - SELECT field, source, value - FROM metabib.title_field_entry mtfe WHERE mtfe.source = $1 - UNION - SELECT field, source, value - FROM metabib.author_field_entry mafe WHERE mafe.source = $1 - UNION - SELECT field, source, value - FROM metabib.subject_field_entry msfe WHERE msfe.source = $1 - UNION - SELECT field, source, value - FROM metabib.series_field_entry msrfe WHERE msrfe.source = $1 - UNION - SELECT field, source, value - FROM metabib.keyword_field_entry mkfe WHERE mkfe.source = $1 - UNION - SELECT field, source, value - FROM metabib.identifier_field_entry mife WHERE mife.source = $1 - UNION - SELECT field, source, value - FROM metabib.facet_entry mfe WHERE mfe.source = $1 - ) 
props - JOIN config.metabib_field cmf ON (cmf.id = props.field) - WHERE cmf.elastic_field - ) record -$FUNK$ LANGUAGE SQL STABLE; - -CREATE OR REPLACE FUNCTION elastic.bib_record_dynamic_props(bre_id BIGINT) -RETURNS TABLE ( - field_class TEXT, - name TEXT, - source BIGINT, - value TEXT -) -AS $FUNK$ - SELECT DISTINCT record.* FROM ( - SELECT - cmf.field_class, - cmf.name, - props.source, - CASE WHEN cmf.joiner IS NOT NULL THEN - REGEXP_SPLIT_TO_TABLE(props.value, cmf.joiner) - ELSE - props.value - END AS value - FROM biblio.extract_metabib_field_entry( - $1, ' ', '{facet,search}', - (SELECT ARRAY_AGG(id) FROM config.metabib_field WHERE elastic_field) - ) props - JOIN config.metabib_field cmf ON (cmf.id = props.field) - ) record -$FUNK$ LANGUAGE SQL STABLE; - - -CREATE OR REPLACE FUNCTION elastic.bib_record_properties(bre_id BIGINT) - RETURNS TABLE ( - field_class TEXT, - name TEXT, - source BIGINT, - value TEXT - ) - AS $FUNK$ -DECLARE - props_func TEXT; -BEGIN - - PERFORM 1 FROM config.internal_flag cif WHERE - cif.name = 'elastic.bib_search.dynamic_properties' AND cif.enabled; - - IF FOUND THEN - props_func := 'elastic.bib_record_dynamic_props'; - ELSE - props_func := 'elastic.bib_record_static_props'; - END IF; - - RETURN QUERY EXECUTE $$ - SELECT DISTINCT record.* FROM ( - SELECT * FROM elastic.bib_record_attrs($$ || QUOTE_LITERAL(bre_id) || $$) - UNION - SELECT * FROM $$ || props_func || '(' || QUOTE_LITERAL(bre_id) || $$) - ) record - $$; -END $FUNK$ LANGUAGE PLPGSQL; - -/* give me bibs I should upate */ - CREATE OR REPLACE VIEW elastic.bib_last_mod_date AS /** * Last update date for each bib, which is taken from most recent @@ -262,43 +78,12 @@ COMMIT; DROP SCHEMA IF EXISTS elastic CASCADE; -ALTER TABLE config.record_attr_definition DROP COLUMN elastic_field; - -ALTER TABLE config.metabib_field DROP COLUMN elastic_field; - DELETE FROM config.global_flag WHERE name ~ 'elastic.*'; */ /* --- Sample narrower set of elastic fields to avoid duplication and --- 
indexing data that will presumably never be searched in the catalog. - -UPDATE config.metabib_field SET elastic_field = FALSE -WHERE - (field_class = 'keyword' AND name <> 'keyword') OR - (field_class = 'subject' AND name = 'complete') OR - (field_class = 'author' AND name = 'first_author') -; - -UPDATE config.record_attr_definition SET elastic_field = FALSE -WHERE name NOT IN ( - 'authorsort', - 'date1', - 'date2', - 'bib_level', - 'item_form', - 'item_lang', - 'item_type', - 'lit_form', - 'pubdate', - 'search_format', - 'titlesort', - 'sr_format', - 'vr_format' -); - -- Bill's elastic VM for testing. UPDATE elastic.node SET host = 'elastic.gamma', port = 80, path = '/elastic/node1' diff --git a/Open-ILS/src/support-scripts/elastic-index.pl b/Open-ILS/src/support-scripts/elastic-index.pl index a03ea9b9b7..a728fe47d7 100755 --- a/Open-ILS/src/support-scripts/elastic-index.pl +++ b/Open-ILS/src/support-scripts/elastic-index.pl @@ -10,6 +10,7 @@ use OpenILS::Elastic::BibSearch::XSLT; my $help; my $osrf_config = '/openils/conf/opensrf_core.xml'; +my $bib_transform = '/openils/var/xsl/elastic-bib-transform.xsl'; my $cluster; my $create_index; my $delete_index; @@ -22,7 +23,6 @@ my $start_record; my $stop_record; my $modified_since; my $max_duration; -my $bib_transform; my $batch_size = 500; # Database settings read from ENV by default. 
@@ -144,22 +144,12 @@ OpenILS::Utils::CStoreEditor::init(); my $es; if ($index_class eq 'bib-search') { - - if ($bib_transform) { - $es = OpenILS::Elastic::BibSearch::XSLT->new( - cluster => $cluster, - index_name => $index_name, - write_mode => 1, - xsl_file => $bib_transform - ); - } else { - - $es = OpenILS::Elastic::BibSearch->new( - cluster => $cluster, - index_name => $index_name, - write_mode => 1 - ); - } + $es = OpenILS::Elastic::BibSearch->new( + cluster => $cluster, + index_name => $index_name, + xsl_file => $bib_transform, + write_mode => 1 + ); } if (!$es) { diff --git a/Open-ILS/xsl/elastic-bib-transform.xsl b/Open-ILS/xsl/elastic-bib-transform.xsl new file mode 100644 index 0000000000..8e6b4b335e --- /dev/null +++ b/Open-ILS/xsl/elastic-bib-transform.xsl @@ -0,0 +1,1278 @@ + + + + + + + + + + + + + + + 650 + subject + topic + abcdvxyz + + + 651 + subject + geographic + avxyz + + + 655 + subject + genre + abcvxyz + + + 630 + subject + uniftitle + adfgklmnoprstvxyz + + + 600 + subject + name + abcdfgjklmnopqrstuvxyz + + + 610 + subject + corpname + abcdfgklmnoprstuvxyz + + + 611 + subject + meeting + acdefgjklnpqstuvxyz + + + 490 + series + seriestitle + a + + + 800 + series + seriestitle + tflmnoprs + + + 810 + series + seriestitle + tflmnoprs + + + 830 + series + seriestitle + adfgklmnoprst + + + 100 + author + personal + abcdq + + + 110 + author + corporate + abcdn + + + 111 + author + meeting + acdegng + + + 700 + author + added_personal + abcdq + + + 710 + author + corporate + ab + + + 711 + author + meeting + acde + + + 400 + author + added_personal + abcd + + + 410 + author + corporate + abcd + + + 411 + author + meeting + acdegq + + + 010 + identifier + lccn + a + + + 010 + identifier + lccn + z + + + 020 + identifier + isbn + a + + + 020 + identifier + isbn + z + + + 022 + identifier + issn + a + + + 022 + identifier + issn + y + + + 022 + identifier + issn + z + + + 024 + identifier + upc + a + + + 024 + identifier + upc + z + + + 027 + 
identifier + tech_number + a + + + 027 + identifier + tech_number + z + + + 028 + identifier + tech_number + ab + + + 074 + identifier + sudoc + a + + + 074 + identifier + sudoc + z + + + 086 + identifier + sudoc + a + + + 086 + identifier + sudoc + z + + + 092 + identifier + bibcn + ab + + + 099 + identifier + bibcn + a + + + 099 + identifier + bibcn + a + + + 130 + title + uniform + abcefgijklmnopqrstuvwxyz + + + 210 + title + abbreviated + abcefghijklmnopqrstuvwxyz + + + 222 + title + magazine + a + + + 240 + title + uniform + abcefgijklmnopqrstuvwxyz + + + 245 + title + maintitle + a + 10 + + + 245 + title + proper + abefgijklmnopqrstuvwxyz + + + 245 + author + responsibility + c + + + 246 + title + alternative + abcefgjklmnopqrstuvwxyz + + + 247 + title + former + abcefgijklmnopqrstuvwxyz + + + 260 + keyword + publisher + b + + + 264 + keyword + publisher + b + + + 400 + series + seriestitle + ptv + + + 410 + author + corporate + abcde + + + 410 + series + seriestitle + ptv + + + 411 + author + conference + acdegq + + + 411 + title + seriestitle + ptv + + + 440 + series + seriestitle + abcefghijklmnopqrstuvwyz + + + 490 + series + seriestitle + abcefghijklmnopqrstuvwyz + + + 490 + title + uniform + abcefghijklmnopqrstuvwyz + + + 694 + series + seriestitle + a + + + 700 + title + added + fgklmnoprst + + + 710 + title + added + fgklmnoprst + + + 711 + title + added + fklnpst + + + 730 + title + added + abcefgijklmnopqrstuvwyz + + + 740 + title + added + abcefgijklmnopqrstuvwyz + + + 780 + title + previous + st + + + 785 + title + succeeding + st + + + 800 + author + personal_series + abcdq + + + 800 + series + seriestitle + fgklmnoprst + + + 810 + author + corporate_series + abcdn + + + 810 + series + seriestitle + abcdn + + + 811 + author + conference_series + acdegnq + + + 811 + series + seriestitle + fklnpstv + + + 830 + series + seriestitle + abcefgijklmnopqrstuvwxyz + + + 938 + identifier + match_isbn + a + + + 938 + identifier + match_isbn + a + + + + + + 
+ 650 + subject + topic + abcdvxyz + + + 651 + subject + geographic + avxyz + + + 600 + subject + name + abcdfgjklmnopqrstuvxyz + + + 490 + series + seriestitle + a + + + 800 + series + seriestitle + tflmnoprs + + + 810 + series + seriestitle + tflmnoprs + + + 830 + series + seriestitle + adfgklmnoprst + + + 100 + author + personal + abcdq + + + 110 + author + corporate + ab + + + 710 + author + corporate + ab + + + 410 + author + corporate + abcd + + + 400 + series + seriestitle + + + + 410 + author + corporate + + + + 410 + series + seriestitle + + + + 411 + author + conference + + + + 411 + title + seriestitle + + + + 440 + series + seriestitle + abcefghijklmnopqrstuvwyz + + + 490 + series + seriestitle + + + + 694 + series + seriestitle + + + + 800 + series + seriestitle + fgklmnoprst + + + 810 + series + seriestitle + abcdn + + + 811 + series + seriestitle + fklnpstv + + + 830 + series + seriestitle + abcefgijklmnopqrstuvwxyz + + + + + + + + + + + author + + + + + + + + + + + + a + + + + + + + + + 0 + + + + + title + + + + + + + pubdate + + + 008 + 7 + 4 + + + + + + + + + + + date1 + + + 008 + 7 + 4 + + + 0000 + + + + date2 + + + 008 + 11 + 4 + + + 9999 + + + + lit_form + + + 008 + 33 + 1 + + + + + + item_lang + + + 008 + 35 + 3 + + + + + + audience + + + 008 + 22 + 1 + + + + + + + + + 6 + 1 + + + + + item_type + + + + + + 7 + 1 + + + + + bib_level + + + + + + 008 + 23 + 1 + + + + + item_form + + + + + + 007 + 0 + 1 + + + + + + + 007 + 4 + 1 + + + + + + vr_format + + + + + + + 007 + 3 + 1 + + + + + + sr_format + + + + + + + search_format + blu-ray + + s + + + + search_format + book + + at + + abcfoqrs + + acdm + + + + search_format + braille + + a + + f + + + + search_format + casaudiobook + + i + + l + + + + search_format + casmusic + + j + + l + + + + search_format + cdaudiobook + + i + + f + + + + search_format + cdaudiobook + + j + + f + + + + search_format + dvd + + v + + + + search_format + eaudio + + i + + oqs + + + + search_format + ebook + + at + + 
oqs + + acdm + + + + search_format + electronic + + os + + + + search_format + equip + + r + + + + search_format + evideo + + g + + oqs + + + + search_format + kit + + op + + + + search_format + lpbook + + at + + d + + acdm + + + + search_format + map + + ef + + + + search_format + microform + + abc + + + + search_format + music + + j + + + + search_format + phonomusic + + j + + abcde + + + + search_format + phonospoken + + i + + abcde + + + + search_format + picture + + k + + + + search_format + serial + + bs + + + + search_format + score + + cd + + + + search_format + software + + m + + + + search_format + vhs + + b + + + + + + + + sorter _ + + + _ + + + + + + + + + + + + + filter _ + + + _ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + abcdefghijklmnopqrstuvwxyz + + + + + + + + + + + + + + + + + + + + + + search + + + + + + + + search + + + + + + + + + + + + + + + + + + + facet + + + + _ + + + + + facet + + + + + + + + + + + + + + + search keyword keyword + + _ + + + + + + + + + + + + + -- 2.11.0