"type": "nested",
"properties": {
"status": {
- "type": "integer",
- "index": "false"
+ "type": "integer"
},
"circ_lib": {
- "type": "integer",
- "index": "false"
+ "type": "integer"
},
"location": {
- "type": "integer",
- "index": "false"
+ "type": "integer"
},
"circulate": {
- "type": "boolean",
- "index": "false"
+ "type": "boolean"
},
"opac_visible": {
- "type": "boolean",
- "index": "false"
+ "type": "boolean"
}
}
}
<!-- number of parallel processes to use during fine generation -->
<parallel>1</parallel>
</fine_generator>
+
+ <elastic_search>
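+    <!-- Evergreen database connection used directly by the ES indexing scripts -->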
+ <database>
+ <driver>Pg</driver>
+ <host>localhost</host>
+ <port>5432</port>
+ <db>evergreen</db>
+ <user>evergreen</user>
+ <pw>evergreen</pw>
+ <application_name>Elastic Search Indexer</application_name>
+ </database>
+ </elastic_search>
<reporter>
<!--
--- /dev/null
+package OpenILS::Elastic;
+# ---------------------------------------------------------------
+# Copyright (C) 2018 King County Library System
+# Author: Bill Erickson <berickxx@gmail.com>
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+# ---------------------------------------------------------------
+use strict;
+use warnings;
+use DBI;
+use OpenSRF::Utils::Logger qw/:logger/;
+use OpenSRF::Utils::SettingsClient;
+use OpenILS::Utils::CStoreEditor qw/:funcs/;
+use Search::Elasticsearch;
+
+sub new {
+ my ($class, $cluster) = @_;
+
+ my $self = {
+ cluster => $cluster,
+ indices => [],
+ marc_fields => []
+ };
+
+ return bless($self, $class);
+}
+
+sub cluster {
+ my $self = shift;
+ return $self->{cluster};
+}
+
+sub es {
+ my ($self) = @_;
+ return $self->{es};
+}
+
+sub index_name {
+ die "Index name must be provided by sub-class\n";
+}
+
+# Returns database connection object.
+# Connects if necessary.
+sub db {
+ my ($self) = @_;
+
+ return $self->{db} if $self->{db};
+
+ my $client = OpenSRF::Utils::SettingsClient->new;
+ my $settings = $client->config_value('elastic_search');
+ my $db_name = $settings->{database}->{db};
+ my $db_host = $settings->{database}->{host};
+ my $db_port = $settings->{database}->{port};
+ my $db_user = $settings->{database}->{user};
+ my $db_pass = $settings->{database}->{pw};
+ my $db_appn = $settings->{database}->{application_name};
+
+ # TODO Add application_name to dsn
+
+ my $dsn = "dbi:Pg:db=$db_name;host=$db_host;port=$db_port";
+ $logger->debug("ES connecting to DB $dsn");
+
+ $self->{db} = DBI->connect(
+ $dsn, $db_user, $db_pass, {
+ RaiseError => 1,
+ PrintError => 0,
+ pg_expand_array => 0,
+ pg_enable_utf8 => 1
+ }
+ ) or $logger->error(
+ "ES Connection to database failed: $DBI::err : $DBI::errstr", 1);
+
+ return $self->{db};
+}
+
+# Return selected rows as an arrayref of hashrefs.
+sub get_db_rows {
+ my ($self, $sql) = @_;
+ return $self->db->selectall_arrayref($sql, {Slice => {}});
+}
+
+# TODO: Add IDL entries for config.elastic* so we can
+# load the config via cstore.
+sub load_config {
+ my $self = shift;
+ my $cluster = $self->cluster;
+
+ my $clusters = $self->get_db_rows(
+ "SELECT * FROM config.elastic_cluster WHERE name = '$cluster'");
+
+ my $cluster_id = $clusters->[0]->{id};
+
+ unless ($cluster_id) {
+ $logger->error("ES no such cluster: $cluster");
+ return;
+ }
+
+ $self->{servers} = $self->get_db_rows(
+ "SELECT * FROM config.elastic_server WHERE cluster = $cluster_id AND active");
+
+ unless (@{$self->{servers}}) {
+ $logger->error("ES no servers defined for cluster $cluster");
+ return;
+ }
+
+ $self->{indices} = $self->get_db_rows(
+ "SELECT * FROM config.elastic_index WHERE cluster = $cluster_id AND active");
+
+ unless (@{$self->{indices}}) {
+ $logger->error("ES no indices defined for cluster $cluster");
+ return;
+ }
+
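+    # Fetch the MARC field definitions for all active indices in one query.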
+ my $index_ids = join(',', map {$_->{id}} @{$self->{indices}});
+
+ $self->{marc_fields} = $self->get_db_rows(
+ "SELECT * FROM config.elastic_marc_field WHERE index IN ($index_ids) AND active");
+}
+
+sub connect {
+ my ($self) = @_;
+ $self->load_config;
+
+ my @nodes;
+ for my $server (@{$self->{servers}}) {
+ push(@nodes, sprintf("%s://%s:%d",
+ $server->{proto}, $server->{host}, $server->{port}));
+ }
+
+ $logger->info("ES connecting to nodes @nodes");
+
+    $self->{es} = Search::Elasticsearch->new(nodes => \@nodes);
+}
+
+sub delete_index {
+ my ($self) = @_;
+
+ my $index = $self->index_name;
+
+ if ($self->es->indices->exists(index => $index)) {
+ $logger->info(
+ "ES deleting index '$index' on cluster '".$self->cluster."'");
+ $self->es->indices->delete(index => $index);
+ } else {
+ $logger->warn("ES index '$index' does not exist");
+ }
+}
+
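+# Add or replace a single document in this module's index.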
+sub index_document {
+ my ($self, $id, $body) = @_;
+
+ my $result = $self->es->index(
+ index => $self->index_name,
+ id => $id,
+ body => $body
+ );
+
+ $logger->debug("ES index command returned $result");
+}
+
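+# Run a query against this module's index and return the raw ES response.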
+sub search {
+ my ($self, $query) = @_;
+
+ return $self->es->search(
+ index => $self->index_name,
+ body => $query
+ );
+}
+
+
+1;
+
+
--- /dev/null
+package OpenILS::Elastic::BibSearch;
+# ---------------------------------------------------------------
+# Copyright (C) 2018 King County Library System
+# Author: Bill Erickson <berickxx@gmail.com>
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+# ---------------------------------------------------------------
+use strict;
+use warnings;
+use Clone qw/clone/;
+use DBI;
+use XML::LibXML;
+use XML::LibXSLT;
+use OpenSRF::Utils::Logger qw/:logger/;
+use OpenILS::Elastic;
+use base qw/OpenILS::Elastic/;
+
+my $INDEX_NAME = 'bib-search';
+
+# number of bibs to index per batch.
+my $BIB_BATCH_SIZE = 1000;
+
+# TODO: it's possible to apply multiple language analyzers.
+my $LANG_ANALYZER = 'english';
+
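+# The custom "folding" analyzer lowercases and ASCII-folds tokens so
+# accented and unaccented forms of the same text match.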
+my $BASE_INDEX_SETTINGS = {
+ analysis => {
+ analyzer => {
+ folding => {
+ filter => ['lowercase', 'asciifolding'],
+ tokenizer => 'standard'
+ }
+ }
+ }
+};
+
+# Well-known bib-search index properties
+my $BASE_PROPERTIES = {
+ source => {type => 'integer', index => 'false'},
+ create_date => {type => 'date', index => 'false'},
+ edit_date => {type => 'date', index => 'false'},
+
+ # Holdings summaries. For bib-search purposes, we don't need
+ # copy-specific details, only aggregate visibility information.
+ holdings => {
+ type => 'nested',
+ properties => {
+ status => {type => 'integer'},
+ circ_lib => {type => 'integer'},
+ location => {type => 'integer'},
+ circulate => {type => 'boolean'},
+ opac_visible => {type => 'boolean'}
+ }
+ },
+
+    # Combo fields for field-class level searches.
+    # The value of every (for example) title|* field is copied
+    # to the "title" field for searching across all title entries.
+ title => {
+ type => 'text',
+ analyzer => $LANG_ANALYZER,
+ fields => {
+ folded => {type => 'text', analyzer => 'folding'},
+ raw => {type => 'keyword'}
+ }
+ },
+ author => {
+ type => 'text',
+ analyzer => $LANG_ANALYZER,
+ fields => {
+ folded => {type => 'text', analyzer => 'folding'},
+ raw => {type => 'keyword'}
+ }
+ },
+ subject => {
+ type => 'text',
+ analyzer => $LANG_ANALYZER,
+ fields => {
+ folded => {type => 'text', analyzer => 'folding'},
+ raw => {type => 'keyword'}
+ }
+ },
+ series => {
+ type => 'text',
+ analyzer => $LANG_ANALYZER,
+ fields => {
+ folded => {type => 'text', analyzer => 'folding'},
+ raw => {type => 'keyword'}
+ }
+ },
+
+ # No .raw field for keyword based on the assumption
+ # keyword values are not used for sorting or aggregation.
+ keyword => {
+ type => 'text',
+ analyzer => $LANG_ANALYZER,
+ fields => {
+ folded => {type => 'text', analyzer => 'folding'}
+ }
+ },
+
+ # Index identifier fields as keywords to avoid unnecessary
+ # ES analysis.
+ identifier => {type => 'keyword'}
+};
+
+sub index_name {
+ return $INDEX_NAME;
+}
+
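+# Returns the config.elastic_index row for this module's index.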
+sub index {
+ my $self = shift;
+ return $self->{index} if $self->{index};
+ ($self->{index}) = grep {$_->{name} eq $INDEX_NAME} @{$self->{indices}};
+ return $self->{index};
+}
+
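+# Returns the MARC field definitions configured for this index.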
+sub get_marc_fields {
+ my $self = shift;
+ return grep {
+ $_->{index} == $self->index->{id}
+ } @{$self->{marc_fields}};
+}
+
+# Load the XSLT transforms from the DB.
+sub load_transforms {
+ my $self = shift;
+
+ $self->{xsl_transforms} = {} unless $self->{xsl_transforms};
+
+ for my $field ($self->get_marc_fields) {
+ my $format = $field->{format};
+ next if $self->{xsl_transforms}{$format};
+
+ $logger->info("ES loading info for document type $format");
+
+ my $xform = $self->get_db_rows(
+ "SELECT * FROM config.xml_transform WHERE name = '$format'")->[0];
+
+ $self->{xml_namespaces}{$format} = {
+ prefix => $xform->{prefix},
+ uri => $xform->{namespace_uri}
+ };
+
+ if ($format eq 'marcxml') {
+ # No transform needed for MARCXML.
+ # Indicate we've seen it and move on.
+ $self->{xsl_transforms}{$format} = {};
+ next;
+ }
+
+ $logger->info("ES parsing stylesheet for $format");
+
+ my $xsl_doc = XML::LibXML->new->parse_string($xform->{xslt});
+
+ $self->{xsl_transforms}{$format} =
+ XML::LibXSLT->new->parse_stylesheet($xsl_doc);
+ }
+}
+
+sub create_index {
+ my ($self) = @_;
+
+ if ($self->es->indices->exists(index => $INDEX_NAME)) {
+ $logger->warn("ES index '$INDEX_NAME' already exists");
+ return;
+ }
+
+ $logger->info(
+ "ES creating index '$INDEX_NAME' on cluster '".$self->cluster."'");
+
+ my $mappings = $BASE_PROPERTIES;
+
+ # Add an index definition for each dynamic field.
+ # Add copy_to for field_class-level combined searches.
+ for my $field ($self->get_marc_fields) {
+
+ my $field_class = $field->{field_class};
+ my $field_name = "$field_class|" . $field->{name};
+
+ # Clone the class-level index definition (e.g. title) to
+ # use as the source of the field-specific index.
+ my $def = clone($BASE_PROPERTIES->{$field_class});
+
+ # Copy data for all fields to their parent class to
+ # support group-level searches (e.g. title search)
+ $def->{copy_to} = $field_class;
+ $mappings->{$field_name} = $def;
+ }
+
+ my $settings = $BASE_INDEX_SETTINGS;
+    $settings->{number_of_replicas} = scalar(@{$self->{servers}});
+ $settings->{number_of_shards} = $self->index->{num_shards};
+
+ my $conf = {
+ index => $INDEX_NAME,
+ body => {
+ settings => $settings,
+ mappings => {properties => $mappings}
+ }
+ };
+
+ # Send the index definition to Elastic
+ eval { $self->es->indices->create($conf) };
+
+ if ($@) {
+ $logger->error("ES failed to create index cluster=".
+            $self->cluster." index=$INDEX_NAME error=$@");
+ return 0;
+ }
+
+ return 1;
+}
+
+# Add data to the bib-search index
+sub populate_index {
+ my ($self) = @_;
+
+ $self->load_transforms;
+
+ my $index_count = 0;
+ my $total_indexed = 0;
+ my $state = {last_bib_id => 0};
+
+ do {
+ $index_count =
+ $self->populate_bib_search_index_page($state);
+ $total_indexed += $index_count;
+
+ $logger->info("ES indexed $total_indexed bib records");
+
+ } while ($index_count > 0);
+
+ $logger->info("ES bib indexing complete with $total_indexed records");
+}
+
+# TODO add support for last_edit_date for partial re-indexing
+sub get_bib_records {
+ my ($self, $state, $record_id) = @_;
+
+ my $sql = <<SQL;
+SELECT bre.id, bre.marc, bre.create_date, bre.edit_date, bre.source
+FROM biblio.record_entry bre
+SQL
+
+ if ($record_id) {
+ $sql .= " WHERE bre.id = $record_id"
+ } else {
+ my $last_id = $state->{last_bib_id};
+ $sql .= <<SQL;
+WHERE NOT bre.deleted AND bre.active AND bre.id > $last_id
+ORDER BY bre.edit_date , bre.id LIMIT $BIB_BATCH_SIZE
+SQL
+ }
+
+ return $self->get_db_rows($sql);
+}
+
+# TODO partial re-index
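+# Index one batch of up to $BIB_BATCH_SIZE bib records; returns the number
+# of records indexed (0 tells the caller no records remain).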
+sub populate_bib_search_index_page {
+ my ($self, $state) = @_;
+
+ my $index_count = 0;
+ my $last_id = $state->{last_bib_id};
+
+ my $bib_data = $self->get_bib_records($state);
+ return 0 unless @$bib_data;
+
+ my $bib_ids = [ map {$_->{id}} @$bib_data ];
+
+ my $holdings = $self->load_holdings($bib_ids);
+
+ for my $bib (@$bib_data) {
+ my $bib_id = $bib->{id};
+
+ my $marc_doc = XML::LibXML->new->parse_string($bib->{marc});
+ my $body = $self->extract_bib_values($marc_doc);
+
+ $body->{holdings} = $holdings->{$bib_id} || [];
+ $body->{source} = $bib->{source};
+
+        for my $field (qw/create_date edit_date/) {
+ next unless $bib->{$field};
+ # ES wants ISO dates with the 'T' separator
+ (my $val = $bib->{$field}) =~ s/ /T/g;
+ $body->{$field} = $val;
+ }
+
+        $self->index_document($bib_id, $body);
+
+ $state->{last_bib_id} = $bib_id;
+ $index_count++;
+ }
+
+ return $index_count;
+}
+
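+# Return the bib record in the requested format: marcxml passes through
+# unchanged, other formats are produced via the loaded XSLT stylesheets.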
+sub get_bib_as {
+ my ($self, $marc_doc, $format) = @_;
+ return $marc_doc if $format eq 'marcxml';
+ return $self->{xsl_transforms}{$format}->transform($marc_doc);
+}
+
+# Returns either a string value or an array of string values.
+sub extract_xpath {
+ my ($self, $xml_doc, $format, $xpath) = @_;
+
+ my $ns = $self->{xml_namespaces}{$format};
+ my $root = $xml_doc->documentElement;
+
+ $root->setNamespace($ns->{uri}, $ns->{prefix}, 1);
+
+ my @nodes = $root->findnodes($xpath);
+
+ if (@nodes) {
+ if (@nodes == 1) {
+ return $nodes[0]->textContent;
+ } else {
+ return [ map { $_->textContent } @nodes ];
+ }
+ } else {
+        # Some XPath expressions return nodes; others (e.g. substring())
+        # return string values instead of nodes.
+ return $root->findvalue($xpath) || undef;
+ }
+}
+
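+# Apply each configured field's XPath to the (transformed as needed) record
+# and collect the extracted values keyed by "class|name".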
+sub extract_bib_values {
+ my ($self, $marc_doc) = @_;
+
+ # various formats of the current MARC record (mods, etc.)
+ my %xform_docs;
+ my $values = {};
+ for my $field ($self->get_marc_fields) {
+
+ my $format = $field->{format};
+ my $field_name = $field->{field_class} .'|' . $field->{name};
+
+ $xform_docs{$format} = $self->get_bib_as($marc_doc, $format)
+ unless $xform_docs{$format};
+
+ my $xform_doc = $xform_docs{$format};
+
+ $values->{$field_name} =
+ $self->extract_xpath($xform_doc, $format, $field->{xpath});
+ }
+
+ return $values;
+}
+
+# Load holdings summary blobs for requested bibs
+sub load_holdings {
+ my ($self, $bib_ids) = @_;
+
+ my $bib_ids_str = join(',', @$bib_ids);
+
+ my $copy_data = $self->get_db_rows(<<SQL);
+SELECT
+ COUNT(*) AS count,
+ acn.record,
+ acp.status AS status,
+ acp.circ_lib AS circ_lib,
+ acp.location AS location,
+ acp.circulate AS circulate,
+ acp.opac_visible AS opac_visible
+FROM asset.copy acp
+JOIN asset.call_number acn ON acp.call_number = acn.id
+WHERE
+ NOT acp.deleted AND
+ NOT acn.deleted AND
+ acn.record IN ($bib_ids_str)
+GROUP BY 2, 3, 4, 5, 6, 7
+SQL
+
+ $logger->info("ES found ".scalar(@$copy_data).
+ " holdings summaries for current record batch");
+
+ my $holdings = {};
+ for my $copy (@$copy_data) {
+
+ $holdings->{$copy->{record}} = []
+ unless $holdings->{$copy->{record}};
+
+ push(@{$holdings->{$copy->{record}}}, {
+ count => $copy->{count},
+ status => $copy->{status},
+ circ_lib => $copy->{circ_lib},
+ location => $copy->{location},
+ circulate => $copy->{circulate} eq 't' ? 'true' : 'false',
+            opac_visible => $copy->{opac_visible} eq 't' ? 'true' : 'false'
+ });
+ }
+
+ return $holdings;
+}
+
+1;
+
+
+++ /dev/null
-package OpenILS::Utils::ElasticSearch;
-# ---------------------------------------------------------------
-# Copyright (C) 2018 King County Library System
-# Author: Bill Erickson <berickxx@gmail.com>
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License
-# as published by the Free Software Foundation; either version 2
-# of the License, or (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-# ---------------------------------------------------------------
-use strict;
-use warnings;
-use Clone qw/clone/;
-use DateTime;
-use DBI;
-use XML::LibXML;
-use XML::LibXSLT;
-use Data::Dumper;
-use OpenSRF::Utils::JSON;
-use OpenSRF::Utils::Logger qw/:logger/;
-use OpenILS::Utils::DateTime qw/:datetime/;
-#use OpenILS::Utils::CStoreEditor qw/:funcs/;
-use Search::Elasticsearch;
-
-my $date_parser = DateTime::Format::ISO8601->new;
-my $db_handle;
-
-sub new {
- my ($class, %args) = @_;
- my $self = bless(
- { clusters => {},
- config_file => $args{config_file},
- xsl_transforms => {},
- xml_namespaces => {}
- }, $class
- );
- $self->read_config;
- return $self;
-}
-
-sub get_db_conn {
- my ($self) = @_;
- return $db_handle if $db_handle;
-
- my $settings = $self->{config}->{'evergreen-database'};
- my $db_name = $settings->{name};
- my $db_host = $settings->{host};
- my $db_port = $settings->{port};
- my $db_user = $settings->{user};
- my $db_pass = $settings->{pass};
-
- my $dsn = "dbi:Pg:db=$db_name;host=$db_host;port=$db_port";
- $logger->debug("ES connecting to DB $dsn");
-
- $db_handle = DBI->connect(
- "$dsn;options='--statement-timeout=0'",
- $db_user, $db_pass, {
- RaiseError => 1,
- PrintError => 0,
- AutoCommit => 1,
- pg_expand_array => 0,
- pg_enable_utf8 => 1
- }
- ) or $logger->error(
- "ES Connection to database failed: $DBI::err : $DBI::errstr", 1);
-
- return $db_handle;
-}
-
-sub read_config {
- my $self = shift;
-
- open(CONFIG_FILE, $self->{config_file})
- or die "Cannot open elastic config " .$self->{config_file}. ": $!\n";
-
- my $json = join('', <CONFIG_FILE>);
-
- $self->{config} = OpenSRF::Utils::JSON->JSON2perl($json);
-
- close(CONFIG_FILE);
-}
-
-sub connect {
- my ($self, $cluster) = @_;
-
- my $nodes = $self->{config}{clusters}{$cluster}{nodes};
-
- $logger->info("ES connecting to nodes @$nodes");
-
- $self->{clusters}{$cluster} = {
- es => Search::Elasticsearch->new(nodes => $nodes)
- };
-}
-
-sub es {
- my ($self, $cluster) = @_;
- return $self->{clusters}{$cluster}{es};
-}
-
-sub delete_index {
- my ($self, $cluster, $index) = @_;
-
- if ($self->es($cluster)->indices->exists(index => $index)) {
- $logger->info("ES deleting index '$index' on cluster '$cluster'");
- $self->es($cluster)->indices->delete(index => $index);
- } else {
- $logger->warn("ES index '$index' does not exist");
- }
-}
-
-sub create_index {
- my ($self, $cluster, $index) = @_;
-
- if ($self->es($cluster)->indices->exists(index => $index)) {
- $logger->warn("ES index '$index' already exists");
- return;
- }
-
- $logger->info("ES creating index '$index' on cluster '$cluster'");
-
- my $config = $self->{config};
-
- my $es = $self->es($cluster);
- my $mappings = $config->{indexes}{$index}{'base-properties'};
-
- # TODO: a dynamic property may live in multiple indexes
- my @dynamics = grep {$_->{index} eq $index}
- @{$config->{'dynamic-properties'}};
-
- # Add an index definition for each dynamic field.
- # Add copy_to for field_class-level combined searches.
- for my $prop (@dynamics) {
- my $field_class = $prop->{field_class};
- my $field_name = "$field_class|" . $prop->{name};
-
- # Clone the class-level index definition (e.g. title) to
- # use as the source of the field-specific index.
- my $def = clone($config->{indexes}{$index}{'base-properties'}{$field_class});
-
- # Copy data for all fields to their parent class to
- # support group-level searches (e.g. title search)
- $def->{copy_to} = $field_class;
- $mappings->{$field_name} = $def;
- }
-
- my $settings = $config->{indexes}{$index}{settings};
- $settings->{number_of_replicas} =
- scalar(@{$config->{clusters}{$cluster}{nodes}});
-
- my $doc_type = $config->{indexes}{$index}{'document-type'};
-
- my $conf = {
- index => $index,
- body => {
- settings => $settings,
- mappings => {$doc_type => {properties => $mappings}}
- }
- };
-
- # Send the index definition to Elastic
- eval { $self->es($cluster)->indices->create($conf) };
-
- if ($@) {
- my $msg =
- "ES failed to create index cluster=$cluster index=$index error=$@";
- $logger->error($msg);
- die "$msg\n";
- }
-}
-
-sub populate_index {
- my ($self, $cluster, $index) = @_;
-
- if ($index eq 'bib-search') {
- return $self->populate_bib_search_index($cluster, $index);
- }
-}
-
-sub populate_bib_search_index {
- my ($self, $cluster, $index) = @_;
-
- $self->load_transforms($index);
-
- my $index_count = 0;
- my $total_indexed = 0;
- my $state = {last_bib_id => 0};
-
- do {
- $index_count =
- $self->populate_bib_search_index_page($cluster, $index, $state);
- $total_indexed += $index_count;
- $logger->info("ES indexed $total_indexed bib records");
-
- } while ($index_count > 0);
-
- $logger->info("ES bib indexing complete with $total_indexed records");
-}
-
-# TODO holdings
-# TODO partial re-index
-sub populate_bib_search_index_page {
- my ($self, $cluster, $index, $state) = @_;
-
- my $index_count = 0;
- my $last_id = $state->{last_bib_id};
- my $doc_type = $self->{config}->{indexes}{$index}{'document-type'};
- my $bib_data = $self->get_db_conn()->selectall_arrayref(<<SQL, {Slice => {}});
-SELECT bre.id, bre.marc, bre.create_date, bre.edit_date, bre.source
-FROM biblio.record_entry bre
-WHERE (
- NOT bre.deleted
- AND bre.active
- AND bre.id > $last_id
-)
-ORDER BY bre.edit_date ASC, bre.id ASC
-LIMIT 1000
-SQL
-
- my $bib_ids = [ map {$_->{id}} @$bib_data ];
-
- my $holdings = $self->load_holdings($index, $bib_ids);
-
- for my $bib (@$bib_data) {
- my $bib_id = $bib->{id};
-
- my $marc_doc = XML::LibXML->new->parse_string($bib->{marc});
- my $body = $self->extract_bib_values($index, $marc_doc);
-
- $body->{holdings} = $holdings->{$bib_id} || [];
- $body->{source} = $bib->{source};
-
- for my $field (q/create_date edit_date/) {
- next unless $bib->{$field};
- # ES wants ISO dates with the 'T' separator
- (my $val = $bib->{$field}) =~ s/ /T/g;
- $body->{$field} = $val;
- }
-
- $self->add_to_elastic($cluster, $index, $doc_type, $bib_id, $body);
-
- $state->{last_bib_id} = $bib_id;
- $index_count++;
- }
-
- return $index_count;
-}
-
-sub load_transforms {
- my ($self, $index) = @_;
-
- my @dynamics = grep {$_->{index} eq $index}
- @{$self->{config}{'dynamic-properties'}};
-
- for my $prop (@dynamics) {
- my $format = $prop->{format};
- next if $self->{xsl_transforms}{$format};
-
- $logger->info("ES loading info for document type $format");
-
- my $xform = $self->get_db_conn()->selectrow_hashref(
- "SELECT * FROM config.xml_transform WHERE name = '$format'");
-
- $self->{xml_namespaces}{$format} = {
- prefix => $xform->{prefix},
- uri => $xform->{namespace_uri}
- };
-
- if ($format eq 'marcxml') {
- # No transform needed for MARCXML. Indicate we've seen
- # it and move on.
- $self->{xsl_transforms}{$format} = {};
- next;
- }
-
- $logger->info("ES parsing stylesheet for $format");
- my $xsl_doc = XML::LibXML->new->parse_string($xform->{xslt});
- my $stylesheet = XML::LibXSLT->new->parse_stylesheet($xsl_doc);
- $self->{xsl_transforms}{$format} = $stylesheet;
- }
-}
-
-sub extract_bib_values {
- my ($self, $index, $marc_doc) = @_;
- my $values = {};
-
- my @dynamics = grep {$_->{index} eq $index}
- @{$self->{config}{'dynamic-properties'}};
-
- # various formats of the current MARC record (mods, etc.)
- my %xform_docs;
-
- for my $prop (@dynamics) {
-
- my $format = $prop->{format};
- my $xform_doc = $marc_doc;
- my $field_name = $prop->{field_class} .'|' . $prop->{name};
-
- if ($format ne 'marcxml') { # no transform required for MARCXML
-
- if (!$xform_docs{$format}) {
- # No document exists for the current format.
- # Perform the transform here.
- $xform_docs{$format} =
- $self->{xsl_transforms}{$format}->transform($marc_doc);
- }
-
- $xform_doc = $xform_docs{$format};
- }
-
- # Apply the field-specific xpath to our transformed document
-
- my $ns = $self->{xml_namespaces}{$format};
- my $root = $xform_doc->documentElement;
- $root->setNamespace($ns->{uri}, $ns->{prefix}, 1);
-
- my @nodes = $root->findnodes($prop->{xpath});
-
- if (@nodes) {
- if (@nodes == 1) {
- $values->{$field_name} = $nodes[0]->textContent;
- } else {
- $values->{$field_name} = [ map { $_->textContent } @nodes ];
- }
- } else {
- # Some XPATH returns nodes, some (e.g. substring()) returns
- # string values instead of nodes.
- $values->{$field_name} = $root->findvalue($prop->{xpath}) || undef;
- }
-
- $logger->internal(
- "ES $field_name = " . Dumper($values->{$field_name}));
- }
-
- $logger->debug("ES extracted record values: " . Dumper($values));
-
- return $values;
-}
-
-# Load holdings summary blobs for requested bibs
-sub load_holdings {
- my ($self, $index, $bib_ids) = @_;
-
- my $bib_ids_str = join(',', @$bib_ids);
-
- my $copy_data = $self->get_db_conn()->selectall_arrayref(<<SQL, {Slice => {}});
-SELECT
- COUNT(*) AS count,
- acn.record,
- acp.status AS status,
- acp.circ_lib AS circ_lib,
- acp.location AS location,
- acp.circulate AS circulate,
- acp.opac_visible AS opac_visible
-FROM asset.copy acp
-JOIN asset.call_number acn ON acp.call_number = acn.id
-WHERE
- NOT acp.deleted AND
- NOT acn.deleted AND
- acn.record IN ($bib_ids_str)
-GROUP BY 2, 3, 4, 5, 6, 7
-SQL
-
- $logger->info("ES found ".scalar(@$copy_data).
- " holdings summaries for current record batch");
-
- my $holdings = {};
- for my $copy (@$copy_data) {
- $holdings->{$copy->{record}} = []
- unless $holdings->{$copy->{record}};
-
- push(@{$holdings->{$copy->{record}}}, {
- count => $copy->{count},
- status => $copy->{status},
- circ_lib => $copy->{circ_lib},
- location => $copy->{location},
- circulate => $copy->{circulate} eq 't' ? 'true' : 'false',
- opac_visbile => $copy->{opac_visible} eq 't' ? 'true' : 'false'
- });
- }
-
- return $holdings;
-}
-
-sub add_to_elastic {
- my ($self, $cluster, $index, $doc_type, $id, $body) = @_;
-
- my $result = $self->es($cluster)->index(
- index => $index,
- type => $doc_type,
- id => $id,
- body => $body
- );
-
- $logger->debug("ES index command returned $result");
-}
-
-
-1;
-
-
--- /dev/null
+
+BEGIN;
+
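+-- One row per Elasticsearch cluster; servers and indices reference a cluster.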
+CREATE TABLE config.elastic_cluster (
+ id SERIAL PRIMARY KEY,
+    name TEXT NOT NULL UNIQUE
+);
+
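+-- Individual Elasticsearch nodes within a cluster.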
+CREATE TABLE config.elastic_server (
+ id SERIAL PRIMARY KEY,
+ label TEXT NOT NULL UNIQUE,
+ host TEXT NOT NULL,
+ proto TEXT NOT NULL,
+ port INTEGER NOT NULL,
+ active BOOLEAN NOT NULL DEFAULT FALSE,
+ cluster INTEGER NOT NULL
+ REFERENCES config.elastic_cluster (id) ON DELETE CASCADE
+);
+
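+-- Index definitions; 'bib-search' is the only supported purpose for now.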
+CREATE TABLE config.elastic_index (
+ id SERIAL PRIMARY KEY,
+ name TEXT NOT NULL UNIQUE,
+ purpose TEXT NOT NULL DEFAULT 'bib-search',
+ num_shards INTEGER NOT NULL DEFAULT 5,
+ active BOOLEAN NOT NULL DEFAULT FALSE,
+ cluster INTEGER NOT NULL
+ REFERENCES config.elastic_cluster (id) ON DELETE CASCADE,
+ CONSTRAINT valid_index_purpose CHECK (purpose IN ('bib-search'))
+);
+
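+-- Per-index MARC field definitions: which XPath to apply to which
+-- transform (format) and how the extracted value should be indexed.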
+CREATE TABLE config.elastic_marc_field (
+ id SERIAL PRIMARY KEY,
+ index INTEGER NOT NULL
+ REFERENCES config.elastic_index (id) ON DELETE CASCADE,
+ active BOOLEAN NOT NULL DEFAULT FALSE,
+ field_class TEXT NOT NULL REFERENCES config.metabib_class (name),
+ label TEXT NOT NULL,
+ name TEXT NOT NULL,
+ datatype TEXT NOT NULL DEFAULT 'text',
+ weight INTEGER NOT NULL DEFAULT 1,
+ format TEXT NOT NULL REFERENCES config.xml_transform (name),
+ xpath TEXT NOT NULL,
+ search_field BOOLEAN NOT NULL DEFAULT FALSE,
+ facet_field BOOLEAN NOT NULL DEFAULT FALSE,
+ sort_field BOOLEAN NOT NULL DEFAULT FALSE,
+ multi_value BOOLEAN NOT NULL DEFAULT FALSE,
+ CONSTRAINT valid_datatype CHECK (datatype IN
+ ('text', 'keyword', 'date', 'long', 'double', 'boolean', 'ip'))
+);
+
+/* SEED DATA ------------------------------------------------------------ */
+
+
+INSERT INTO config.elastic_cluster (name) VALUES ('main');
+
+INSERT INTO config.elastic_server
+ (label, host, proto, port, active, cluster)
+VALUES ('localhost', 'localhost', 'http', 9200, TRUE,
+ (SELECT id FROM config.elastic_cluster WHERE name = 'main'));
+
+INSERT INTO config.elastic_index (name, purpose, active, cluster)
+VALUES ('Bib Search', 'bib-search', TRUE,
+ (SELECT id FROM config.elastic_cluster WHERE name = 'main'));
+
+-- Start with indexes that match search/facet fields in config.metabib_field
+
+INSERT INTO config.elastic_marc_field
+ (index, active, field_class, label, name, weight, format,
+ xpath, search_field, facet_field, datatype)
+SELECT
+ (SELECT id FROM config.elastic_index WHERE purpose = 'bib-search'),
+ TRUE,
+ cmf.field_class,
+ cmf.label,
+ cmf.name,
+ cmf.weight,
+ cmf.format,
+    cmf.xpath || COALESCE(cmf.facet_xpath, cmf.display_xpath, ''),
+ cmf.search_field,
+ cmf.facet_field,
+ 'text'
+FROM config.metabib_field cmf
+WHERE cmf.xpath IS NOT NULL AND (cmf.search_field OR cmf.facet_field);
+
+-- Add additional indexes for other search-y / filter-y stuff
+
+INSERT INTO config.elastic_marc_field
+ (index, active, field_class, label, name, format,
+ datatype, search_field, facet_field, xpath)
+VALUES (
+ (SELECT id FROM config.elastic_index WHERE purpose = 'bib-search'),
+ TRUE,
+ 'identifier', 'Language', 'item_lang', 'marcxml', 'keyword', TRUE, TRUE,
+ $$substring(//marc:controlfield[@tag='008']/text(), '36', '3')$$
+), (
+ (SELECT id FROM config.elastic_index WHERE purpose = 'bib-search'),
+ TRUE,
+ 'identifier', 'Item Form', 'item_form', 'marcxml', 'keyword', TRUE, TRUE,
+ $$substring(//marc:controlfield[@tag='008']/text(), '24', '1')$$
+), (
+ (SELECT id FROM config.elastic_index WHERE purpose = 'bib-search'),
+ TRUE,
+ 'identifier', 'Audience', 'audience', 'marcxml', 'keyword', TRUE, TRUE,
+ $$substring(//marc:controlfield[@tag='008']/text(), '23', '1')$$
+), (
+ (SELECT id FROM config.elastic_index WHERE purpose = 'bib-search'),
+ TRUE,
+ 'identifier', 'Literary Form', 'lit_form', 'marcxml', 'keyword', TRUE, TRUE,
+ $$substring(//marc:controlfield[@tag='008']/text(), '34', '1')$$
+), (
+ (SELECT id FROM config.elastic_index WHERE purpose = 'bib-search'),
+ TRUE,
+ 'identifier', 'Publication Date', 'pub_date', 'mods32', 'long', TRUE, TRUE,
+ $$//mods32:mods/mods32:originInfo/mods32:dateIssued[@encoding='marc']$$
+);
+
+-- TODO ADD MORE FIELDS
+
+-- avoid full-text indexing on identifier fields
+UPDATE config.elastic_marc_field SET datatype = 'keyword'
+WHERE field_class = 'identifier' AND datatype = 'text';
+
+COMMIT;
+
+/* UNDO
+
+DROP TABLE config.elastic_marc_field;
+DROP TABLE config.elastic_index;
+DROP TABLE config.elastic_server;
+DROP TABLE config.elastic_cluster;
+
+*/
+
use strict;
use warnings;
use Getopt::Long;
-use OpenILS::Utils::ElasticSearch;
use OpenILS::Utils::Fieldmapper;
+use OpenILS::Elastic::BibSearch;
my $help;
my $elastic_config;
Fieldmapper->import(
IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL"));
-my $es = OpenILS::Utils::ElasticSearch->new(
- cluster => $cluster,
- config_file => $elastic_config
-);
+my $es = OpenILS::Elastic::BibSearch->new($cluster);
-$es->connect($cluster);
+$es->connect;
-$es->delete_index($cluster, $index_name) if $delete_index;
+$es->delete_index if $delete_index;
-$es->create_index($cluster, $index_name) if $create_index;
+$es->create_index if $create_index;
-$es->populate_index($cluster, $index_name) if $populate;
+$es->populate_index if $populate;
--- /dev/null
+#!/usr/bin/perl
+use strict;
+use warnings;
+use Getopt::Long;
+use Time::HiRes qw/time/;
+use OpenSRF::Utils::JSON;
+use OpenILS::Utils::Fieldmapper;
+use OpenILS::Elastic::BibSearch;
+
+my $help;
+my $elastic_config;
+my $osrf_config = '/openils/conf/opensrf_core.xml';
+my $cluster = 'main';
+my $index = 'bib-search';
+my $query_string;
+
+GetOptions(
+ 'help' => \$help,
+ 'elastic-config=s' => \$elastic_config,
+ 'osrf-config=s' => \$osrf_config,
+ 'cluster=s' => \$cluster,
+ 'index=s' => \$index,
+ 'query-string=s' => \$query_string
+) || die "\nSee --help for more\n";
+
+sub help {
+ print <<HELP;
+ Synopsis:
+
+ $0 --query-string "author:mozart || title:piano"
+
+HELP
+ exit(0);
+}
+
+help() if $help;
+
+# connect to osrf...
+OpenSRF::System->bootstrap_client(config_file => $osrf_config);
+Fieldmapper->import(
+ IDL => OpenSRF::Utils::SettingsClient->new->config_value("IDL"));
+
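+# Example query: a query_string search against the combined keyword field,
+# filtered via the nested holdings documents to copies in selected statuses
+# and circ libs, with two terms aggregations for faceting.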
+my $query = {
+ _source => ['id'] , # return only the ID field
+ sort => [
+ {'title.raw' => 'asc'},
+ {'author.raw' => 'asc'},
+ '_score'
+ ],
+ query => {
+ bool => {
+ must => {
+ query_string => {
+ default_field => 'keyword',
+ query => $query_string
+ }
+ },
+ filter => {
+ nested => {
+ path => 'holdings',
+ query => {
+ bool => {
+ must => [
+ {
+ bool => {
+ should => [
+ {term => {'holdings.status' => '0'}},
+ {term => {'holdings.status' => '7'}}
+ ]
+ }
+ },
+ {
+ bool => {
+ should => [
+ {term => {'holdings.circ_lib' => '4'}},
+ {term => {'holdings.circ_lib' => '5'}}
+ ]
+ }
+ }
+ ]
+ }
+ }
+ }
+ }
+ }
+ },
+ aggs => {
+ genres => {
+ terms => {
+ field => 'identifier|genre.raw'
+ }
+ },
+ 'subject|topic' => {
+ terms => {
+ field => 'subject|topic.raw'
+ }
+ }
+ }
+};
+
+my $es = OpenILS::Elastic::BibSearch->new($cluster);
+
+$es->connect;
+
+my $start = time();
+my $results = $es->search($query);
+my $duration = substr(time() - $start, 0, 6);
+
+print OpenSRF::Utils::JSON->perl2JSON($results) . "\n\n";
+
+print "Search returned ".$results->{hits}->{total}.
+ " hits with a reported duration of ".$results->{took}."ms.\n";
+print "Full round-trip time was $duration seconds.\n";
+
+