# TODO TODO check settings/db to see if elasticsearch is
# enabled for bib-search.
- return elastic_search($search_hash->{query}, $user_offset, $user_limit);
+ #return elastic_search($search_hash->{query}, $user_offset, $user_limit);
# we're grabbing results on a per-superpage basis, which means the
# limit and offset should coincide with superpage boundaries
if ($key =~ /title/) {
$elastic_query->{sort} = [
{'title|sort' => $dir},
- {'title|proper.raw' => $dir}
{'title|maintitle.raw' => $dir}
];
my $self = shift;
my $cluster = $self->cluster;
- my $clusters = $self->get_db_rows(
- "SELECT * FROM config.elastic_cluster WHERE name = '$cluster'");
-
- my $cluster_id = $clusters->[0]->{id};
-
- unless ($cluster_id) {
- $logger->error("ES no such cluster: $cluster");
- return;
- }
-
$self->{servers} = $self->get_db_rows(
- "SELECT * FROM config.elastic_server WHERE cluster = $cluster_id AND active");
+ "SELECT * FROM elastic.node WHERE cluster = '$cluster' AND active");
unless (@{$self->{servers}}) {
$logger->error("ES no servers defined for cluster $cluster");
}
$self->{indices} = $self->get_db_rows(
- "SELECT * FROM config.elastic_index WHERE cluster = $cluster_id AND active");
+ "SELECT * FROM elastic.index WHERE cluster = '$cluster' AND active");
unless (@{$self->{indices}}) {
$logger->error("ES no indices defined for cluster $cluster");
return;
}
-
- my $index_ids = join(',', map {$_->{id}} @{$self->{indices}});
-
- $self->{marc_fields} = $self->get_db_rows(
- "SELECT * FROM config.elastic_marc_field WHERE index IN ($index_ids) AND active");
}
sub connect {
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# ---------------------------------------------------------------
use strict;
create_date => {type => 'date', index => 'false'},
edit_date => {type => 'date', index => 'false'},
- # Holdings summaries. For bib-search purposes, we don't need
+ # Holdings summaries. For bib-search purposes, we don't need
# copy-specific details, only aggregate visibility information.
holdings => {
type => 'nested',
type => 'text',
analyzer => $LANG_ANALYZER,
fields => {
- folded => {type => 'text', analyzer => 'folding'},
- raw => {type => 'keyword'}
+ folded => {type => 'text', analyzer => 'folding'}
}
},
author => {
type => 'text',
analyzer => $LANG_ANALYZER,
fields => {
- folded => {type => 'text', analyzer => 'folding'},
- raw => {type => 'keyword'}
+ folded => {type => 'text', analyzer => 'folding'}
}
},
subject => {
type => 'text',
analyzer => $LANG_ANALYZER,
fields => {
- folded => {type => 'text', analyzer => 'folding'},
- raw => {type => 'keyword'}
+ folded => {type => 'text', analyzer => 'folding'}
}
},
series => {
type => 'text',
analyzer => $LANG_ANALYZER,
fields => {
- folded => {type => 'text', analyzer => 'folding'},
- raw => {type => 'keyword'}
+ folded => {type => 'text', analyzer => 'folding'}
}
},
- # No .raw field for keyword based on the assumption
- # keyword values are not used for sorting or aggregation.
keyword => {
type => 'text',
analyzer => $LANG_ANALYZER,
}
},
- # Index identifier fields as keywords to avoid unnecessary
- # ES analysis.
+ # Avoid full-text analysis on identifier fields.
identifier => {type => 'keyword'}
};
+# Returns the entry of $self->{indices} whose 'code' equals $INDEX_NAME.
+# The match is cached in $self->{index}, so later calls return the cached
+# value without re-scanning. Returns undef when no entry matches.
sub index {
my $self = shift;
return $self->{index} if $self->{index};
- ($self->{index}) = grep {$_->{purpose} eq $INDEX_NAME} @{$self->{indices}};
+ ($self->{index}) = grep {$_->{code} eq $INDEX_NAME} @{$self->{indices}};
return $self->{index};
}
-sub get_marc_fields {
- my $self = shift;
- return grep {
- $_->{index} == $self->index->{id}
- } @{$self->{marc_fields}};
-}
-
-# Load the XSLT transforms from the DB.
-sub load_transforms {
- my $self = shift;
-
- $self->{xsl_transforms} = {} unless $self->{xsl_transforms};
-
- for my $field ($self->get_marc_fields) {
- my $format = $field->{format};
- next if $self->{xsl_transforms}{$format};
-
- $logger->info("ES loading info for document type $format");
-
- my $xform = $self->get_db_rows(
- "SELECT * FROM config.xml_transform WHERE name = '$format'")->[0];
-
- $self->{xml_namespaces}{$format} = {
- prefix => $xform->{prefix},
- uri => $xform->{namespace_uri}
- };
-
- if ($format eq 'marcxml') {
- # No transform needed for MARCXML.
- # Indicate we've seen it and move on.
- $self->{xsl_transforms}{$format} = {};
- next;
- }
-
- $logger->info("ES parsing stylesheet for $format");
-
- my $xsl_doc = XML::LibXML->new->parse_string($xform->{xslt});
-
- $self->{xsl_transforms}{$format} =
- XML::LibXSLT->new->parse_stylesheet($xsl_doc);
- }
-}
-
sub create_index {
my ($self) = @_;
my $mappings = $BASE_PROPERTIES;
- # Add an index definition for each dynamic field.
- # Add copy_to for field_class-level combined searches.
- for my $field ($self->get_marc_fields) {
-
- my $field_class = $field->{field_class};
- my $field_name = "$field_class|" . $field->{name};
- my $datatype = $field->{datatype};
- my $def;
+ my $fields = $self->get_db_rows(
+ 'SELECT * FROM elastic.bib_index_properties');
- if ($datatype eq 'text') {
+ for my $field (@$fields) {
- # Clone the class-level index definition (e.g. title) to
- # use as the source of the field-specific index.
- $def = clone($BASE_PROPERTIES->{$field_class});
+ my $field_name = $field->{name};
+ my $search_group = $field->{search_group};
+ $field_name = "$search_group|$field_name" if $search_group;
- # Copy data for all search fields to their parent class to
- # support group-level searches (e.g. title search)
- $def->{copy_to} = $field_class;
+ my $def;
+ if ($field->{search_field}) {
+ # Search fields get full text indexing and analysis
+
+ $def = {
+ type => 'text',
+ analyzer => $LANG_ANALYZER,
+ fields => {
+ folded => {type => 'text', analyzer => 'folding'}
+ }
+ };
+
+ if ($field->{facet_field} || $field->{sorter}) {
+ # Facet and sort fields also get an un-analyzed keyword
+ # sub-field (.raw) to use for aggregation and sorting.
+ $def->{fields}{raw} = {type => 'keyword'};
+ }
+
+ if ($search_group) {
+ # Every searchable field in a search group is "copy_to"'ed
+ # the group-level field, whether or not it is also a facet
+ # or sorter, so group-wide searches (e.g. title) see it.
+ $def->{copy_to} = $search_group;
+ }
} else {
- # non-text (keyword, etc.) fields are indexed as-is, no extra text field
- # index analysis is necessary.
- $def = {type => $datatype};
+ # Fields that are only used for aggregation and sorting
+ # and filtering get no full-text treatment.
+ $def = {type => 'keyword'};
}
+ $logger->info("ES adding field $field_name: ".
+ OpenSRF::Utils::JSON->perl2JSON($def));
+
$mappings->{$field_name} = $def;
}
sub populate_index {
my ($self) = @_;
- $self->load_transforms;
-
my $index_count = 0;
my $total_indexed = 0;
my $state = {last_bib_id => 0};
my ($self, $state, $record_id) = @_;
my $sql = <<SQL;
-SELECT bre.id, bre.marc, bre.create_date, bre.edit_date, bre.source
+SELECT bre.id, bre.create_date, bre.edit_date, bre.source AS bib_source
FROM biblio.record_entry bre
SQL
my $last_id = $state->{last_bib_id};
$sql .= <<SQL;
WHERE NOT bre.deleted AND bre.active AND bre.id > $last_id
-ORDER BY bre.edit_date , bre.id LIMIT $BIB_BATCH_SIZE
+ORDER BY bre.edit_date, bre.id LIMIT $BIB_BATCH_SIZE
SQL
}
my $holdings = $self->load_holdings($bib_ids);
+ my $fields = $self->get_db_rows(
+ 'SELECT * FROM elastic.bib_index_properties');
+
for my $bib (@$bib_data) {
my $bib_id = $bib->{id};
- my $marc_doc = XML::LibXML->new->parse_string($bib->{marc});
- my $body = $self->extract_bib_values($marc_doc);
-
- $body->{holdings} = $holdings->{$bib_id} || [];
- $body->{source} = $bib->{source};
+ my $body = {
+ bib_source => $bib->{bib_source},
+ holdings => $holdings->{$bib_id} || []
+ };
- for my $field (q/create_date edit_date/) {
- next unless $bib->{$field};
+ # qw// (not q//) so the loop visits each date column separately;
+ # q// yields the single string "create_date edit_date", which is
+ # never a key of $bib, so no date would be copied at all.
+ for my $df (qw/create_date edit_date/) {
+ next unless $bib->{$df};
# ES wants ISO dates with the 'T' separator
- (my $val = $bib->{$field}) =~ s/ /T/g;
- $body->{$field} = $val;
+ (my $val = $bib->{$df}) =~ s/ /T/g;
+ $body->{$df} = $val;
+ }
+
+ # One row per indexed value for this bib. Named $props so it does
+ # not shadow the $fields declared before the loop.
+ my $props = $self->get_db_rows(
+ "SELECT * FROM elastic.bib_record_properties($bib_id)");
+
+ for my $prop (@$props) {
+ my $fclass = $prop->{search_group};
+ my $fname = $prop->{name};
+ $fname = "$fclass|$fname" if $fclass;
+ my $value = $prop->{value};
+
+ # A record can have several rows for the same field (multi-valued
+ # fields). Collect repeats into an array instead of letting the
+ # last row overwrite the earlier ones.
+ if (!exists $body->{$fname}) {
+ $body->{$fname} = $value;
+ } elsif (ref $body->{$fname} eq 'ARRAY') {
+ push @{$body->{$fname}}, $value;
+ } else {
+ $body->{$fname} = [$body->{$fname}, $value];
+ }
}
return 0 unless
return $index_count;
}
-sub get_bib_as {
- my ($self, $marc_doc, $format) = @_;
- return $marc_doc if $format eq 'marcxml';
- return $self->{xsl_transforms}{$format}->transform($marc_doc);
-}
-
-# Returns either a string value or an array of string values.
-sub extract_xpath {
- my ($self, $xml_doc, $format, $xpath) = @_;
-
- my $ns = $self->{xml_namespaces}{$format};
- my $root = $xml_doc->documentElement;
-
- $root->setNamespace($ns->{uri}, $ns->{prefix}, 1);
-
- my @nodes = $root->findnodes($xpath);
-
- if (@nodes) {
- if (@nodes == 1) {
- return $nodes[0]->textContent;
- } else {
- return [ map { $_->textContent } @nodes ];
- }
- } else {
- # Some XPATH returns nodes, some (e.g. substring()) returns
- # string values instead of nodes.
- return $root->findvalue($xpath) || undef;
- }
-}
-
-sub extract_bib_values {
- my ($self, $marc_doc) = @_;
-
- # various formats of the current MARC record (mods, etc.)
- my %xform_docs;
- my $values = {};
- for my $field ($self->get_marc_fields) {
-
- my $format = $field->{format};
- my $field_name = $field->{field_class} .'|' . $field->{name};
-
- $xform_docs{$format} = $self->get_bib_as($marc_doc, $format)
- unless $xform_docs{$format};
-
- my $xform_doc = $xform_docs{$format};
-
- $values->{$field_name} =
- $self->extract_xpath($xform_doc, $format, $field->{xpath});
- }
-
- return $values;
-}
-
# Load holdings summary blobs for requested bibs
sub load_holdings {
my ($self, $bib_ids) = @_;
status => $copy->{status},
circ_lib => $copy->{circ_lib},
location => $copy->{location},
- circulate => $copy->{circulate} eq 't' ? 'true' : 'false',
- opac_visbile => $copy->{opac_visible} eq 't' ? 'true' : 'false'
+ #circulate => $copy->{circulate} eq 't' ? 'true' : 'false',
+ #opac_visbile => $copy->{opac_visible} eq 't' ? 'true' : 'false'
+ circulate => $copy->{circulate} ? 'true' : 'false',
+ opac_visbile => $copy->{opac_visible} ? 'true' : 'false'
});
}
+DROP SCHEMA IF EXISTS elastic CASCADE;
+
BEGIN;
-CREATE TABLE config.elastic_cluster (
- id SERIAL PRIMARY KEY,
- name TEXT NOT NULL
+CREATE SCHEMA elastic;
+
+CREATE TABLE elastic.cluster (
+ code TEXT NOT NULL DEFAULT 'main' PRIMARY KEY,
+ label TEXT NOT NULL
);
-CREATE TABLE config.elastic_server (
+CREATE TABLE elastic.node (
id SERIAL PRIMARY KEY,
label TEXT NOT NULL UNIQUE,
host TEXT NOT NULL,
proto TEXT NOT NULL,
port INTEGER NOT NULL,
active BOOLEAN NOT NULL DEFAULT FALSE,
- cluster INTEGER NOT NULL
- REFERENCES config.elastic_cluster (id) ON DELETE CASCADE
+ cluster TEXT NOT NULL
+ REFERENCES elastic.cluster (code) ON DELETE CASCADE,
+ CONSTRAINT node_once UNIQUE (host, port)
);
-CREATE TABLE config.elastic_index (
+CREATE TABLE elastic.index (
id SERIAL PRIMARY KEY,
- name TEXT NOT NULL UNIQUE,
- purpose TEXT NOT NULL DEFAULT 'bib-search',
- num_shards INTEGER NOT NULL DEFAULT 5,
+ code TEXT NOT NULL DEFAULT 'bib-search',
+ cluster TEXT NOT NULL
+ REFERENCES elastic.cluster (code) ON DELETE CASCADE,
active BOOLEAN NOT NULL DEFAULT FALSE,
- cluster INTEGER NOT NULL
- REFERENCES config.elastic_cluster (id) ON DELETE CASCADE,
- CONSTRAINT valid_index_purpose CHECK (purpose IN ('bib-search'))
+ num_shards INTEGER NOT NULL DEFAULT 1,
+ CONSTRAINT valid_index_code CHECK (code IN ('bib-search')),
+ CONSTRAINT index_type_once_per_cluster UNIQUE (code, cluster)
);
-CREATE TABLE config.elastic_marc_field (
- id SERIAL PRIMARY KEY,
- index INTEGER NOT NULL
- REFERENCES config.elastic_index (id) ON DELETE CASCADE,
- active BOOLEAN NOT NULL DEFAULT FALSE,
- field_class TEXT NOT NULL REFERENCES config.metabib_class (name),
- label TEXT NOT NULL,
- name TEXT NOT NULL,
- datatype TEXT NOT NULL DEFAULT 'text',
- weight INTEGER NOT NULL DEFAULT 1,
- format TEXT NOT NULL REFERENCES config.xml_transform (name),
- xpath TEXT NOT NULL,
- search_field BOOLEAN NOT NULL DEFAULT FALSE,
- facet_field BOOLEAN NOT NULL DEFAULT FALSE,
- sort_field BOOLEAN NOT NULL DEFAULT FALSE,
- multi_value BOOLEAN NOT NULL DEFAULT FALSE,
- CONSTRAINT valid_datatype CHECK (datatype IN
- ('text', 'keyword', 'date', 'long', 'double', 'boolean', 'ip'))
-);
+-- Lists the properties (fields) that make up the bib index definition.
+-- Two sources are combined with UNION:
+--   1. config.record_attr_definition rows whose name is NOT LIKE '%_ind_%':
+--      no metabib field, no search group, never a search or facet field,
+--      weight fixed at 1; sorter/multi come straight from the definition.
+--   2. config.metabib_field rows that are search or facet fields: the
+--      field class is the search group, always multi, never a sorter.
+--      Fields in the 'identifier' class are never reported as search fields.
+CREATE OR REPLACE VIEW elastic.bib_index_properties AS
+ SELECT fields.* FROM (
+ SELECT
+ NULL::INT AS metabib_field,
+ crad.name,
+ NULL AS search_group,
+ crad.sorter,
+ crad.multi,
+ FALSE AS search_field,
+ FALSE AS facet_field,
+ 1 AS weight
+ FROM config.record_attr_definition crad
+ WHERE crad.name NOT LIKE '%_ind_%'
+ UNION
+ SELECT
+ cmf.id AS metabib_field,
+ cmf.name,
+ cmf.field_class AS search_group,
+ FALSE AS sorter,
+ TRUE AS multi,
+ (cmf.field_class <> 'identifier' AND cmf.search_field) AS search_field,
+ cmf.facet_field,
+ cmf.weight
+ FROM config.metabib_field cmf
+ WHERE cmf.search_field OR cmf.facet_field
+ ) fields;
+
+-- Returns the indexable values for one bib record (bre_id) as rows of
+-- (search_group, name, source, value). Three sources are combined with
+-- UNION, which also removes exact duplicate rows:
+--   1. metabib.record_sorter values (search_group is NULL)
+--   2. metabib.record_attr_flat values (search_group is NULL; the row's
+--      id column is returned as "source")
+--   3. entries from the title/author/subject/series/keyword/identifier
+--      *_field_entry tables whose metabib field is a search or facet
+--      field (search_group is the field class)
+-- The query is built as a string and run with EXECUTE; bre_id is
+-- embedded via QUOTE_LITERAL.
+CREATE OR REPLACE FUNCTION elastic.bib_record_properties(bre_id BIGINT)
+ RETURNS TABLE (
+ search_group TEXT,
+ name TEXT,
+ source BIGINT,
+ value TEXT
+ )
+ AS $FUNK$
+DECLARE
+BEGIN
+ RETURN QUERY EXECUTE $$
+ SELECT record.* FROM (
+ SELECT NULL::TEXT AS search_group, crad.name, mrs.source, mrs.value
+ FROM metabib.record_sorter mrs
+ JOIN config.record_attr_definition crad ON (crad.name = mrs.attr)
+ WHERE mrs.source = $$ || QUOTE_LITERAL(bre_id) || $$
+ UNION
+ SELECT NULL::TEXT AS search_group, crad.name, mraf.id AS source, mraf.value
+ FROM metabib.record_attr_flat mraf
+ JOIN config.record_attr_definition crad ON (crad.name = mraf.attr)
+ WHERE mraf.id = $$ || QUOTE_LITERAL(bre_id) || $$
+ UNION
+ SELECT cmf.field_class AS search_group, cmf.name, mfe.source, mfe.value
+ FROM (
+ SELECT * FROM metabib.title_field_entry UNION
+ SELECT * FROM metabib.author_field_entry UNION
+ SELECT * FROM metabib.subject_field_entry UNION
+ SELECT * FROM metabib.series_field_entry UNION
+ SELECT * FROM metabib.keyword_field_entry UNION
+ SELECT * FROM metabib.identifier_field_entry
+ ) mfe
+ JOIN config.metabib_field cmf ON (cmf.id = mfe.field)
+ WHERE mfe.source = $$ || QUOTE_LITERAL(bre_id) || $$
+ AND (cmf.search_field OR cmf.facet_field)
+ ) record
+ $$;
+END $FUNK$ LANGUAGE PLPGSQL;
-/* SEED DATA ------------------------------------------------------------ */
-INSERT INTO config.elastic_cluster (name) VALUES ('main');
+/* SEED DATA ------------------------------------------------------------ */
-INSERT INTO config.elastic_server
- (label, host, proto, port, active, cluster)
-VALUES ('localhost', 'localhost', 'http', 9200, TRUE,
- (SELECT id FROM config.elastic_cluster WHERE name = 'main'));
-
-INSERT INTO config.elastic_index (name, purpose, active, cluster)
-VALUES ('Bib Search', 'bib-search', TRUE,
- (SELECT id FROM config.elastic_cluster WHERE name = 'main'));
-
--- Start with indexes that match search/facet fields in config.metabib_field
-
-INSERT INTO config.elastic_marc_field
- (index, active, field_class, label, name, weight, format,
- xpath, search_field, facet_field, datatype)
-SELECT
- (SELECT id FROM config.elastic_index WHERE purpose = 'bib-search'),
- TRUE,
- cmf.field_class,
- cmf.label,
- cmf.name,
- cmf.weight,
- cmf.format,
- cmf.xpath || COALESCE(cmf.facet_xpath, COALESCE(cmf.display_xpath, '')),
- cmf.search_field,
- cmf.facet_field,
- 'text'
-FROM config.metabib_field cmf
-WHERE cmf.xpath IS NOT NULL AND (cmf.search_field OR cmf.facet_field);
-
--- Add additional indexes for other search-y / filter-y stuff
-
-INSERT INTO config.elastic_marc_field
- (index, active, search_field, facet_field,
- field_class, label, name, format, datatype, xpath)
-VALUES (
- (SELECT id FROM config.elastic_index WHERE purpose = 'bib-search'),
- TRUE, TRUE, TRUE,
- 'identifier', 'Language', 'item_lang', 'marcxml', 'keyword',
- $$substring(//marc:controlfield[@tag='008']/text(), '36', '3')$$
-), (
- (SELECT id FROM config.elastic_index WHERE purpose = 'bib-search'),
- TRUE, TRUE, TRUE,
- 'identifier', 'Item Form', 'item_form', 'marcxml', 'keyword',
- $$substring(//marc:controlfield[@tag='008']/text(), '24', '1')$$
-), (
- (SELECT id FROM config.elastic_index WHERE purpose = 'bib-search'),
- TRUE, TRUE, TRUE,
- 'identifier', 'Audience', 'audience', 'marcxml', 'keyword',
- $$substring(//marc:controlfield[@tag='008']/text(), '23', '1')$$
-), (
- (SELECT id FROM config.elastic_index WHERE purpose = 'bib-search'),
- TRUE, TRUE, TRUE,
- 'identifier', 'Literary Form', 'lit_form', 'marcxml', 'keyword',
- $$substring(//marc:controlfield[@tag='008']/text(), '34', '1')$$
-), (
- (SELECT id FROM config.elastic_index WHERE purpose = 'bib-search'),
- TRUE, TRUE, TRUE,
- 'identifier', 'Publication Date', 'pub_date', 'mods32', 'long',
- $$//mods32:mods/mods32:originInfo/mods32:dateIssued[@encoding='marc']$$
-), (
- (SELECT id FROM config.elastic_index WHERE purpose = 'bib-search'),
- TRUE, FALSE, TRUE,
- 'title', 'Title Sort', 'sort', 'mods32', 'keyword',
- $$(//mods32:mods/mods32:titleInfo[mods32:nonSort]/mods32:title|//mods32:mods/mods32:titleNonfiling[mods32:title and not (@type)])[1]$$
-), (
- (SELECT id FROM config.elastic_index WHERE purpose = 'bib-search'),
- TRUE, FALSE, TRUE,
- 'author', 'Author Sort', 'sort', 'mods32', 'keyword',
- $$//mods32:mods/mods32:name[mods32:role/mods32:roleTerm[text()='creator']][1]$$
-);
+INSERT INTO elastic.cluster (code, label) VALUES ('main', 'Main Cluster');
--- TODO ADD MORE FIELDS
+INSERT INTO elastic.node
+ (label, host, proto, port, active, cluster)
+VALUES ('Localhost', 'localhost', 'http', 9200, TRUE, 'main');
--- avoid full-text indexing on identifier fields
-UPDATE config.elastic_marc_field SET datatype = 'keyword'
-WHERE field_class = 'identifier';
+INSERT INTO elastic.index (code, active, cluster)
+VALUES ('bib-search', TRUE, 'main');
COMMIT;
/* UNDO
-DROP TABLE config.elastic_marc_field;
-DROP TABLE config.elastic_index;
-DROP TABLE config.elastic_server;
-DROP TABLE config.elastic_cluster;
+DROP SCHEMA IF EXISTS elastic CASCADE;
*/
my $osrf_config = '/openils/conf/opensrf_core.xml';
my $cluster = 'main';
my $index = 'bib-search';
+my $quiet = 0;
my $query_string;
GetOptions(
'osrf-config=s' => \$osrf_config,
'cluster=s' => \$cluster,
'index=s' => \$index,
+ 'quiet' => \$quiet,
'query-string=s' => \$query_string
) || die "\nSee --help for more\n";
my $results = $es->search($query);
my $duration = substr(time() - $start, 0, 6);
-print OpenSRF::Utils::JSON->perl2JSON($results) . "\n\n";
+print OpenSRF::Utils::JSON->perl2JSON($results) . "\n";
-print "Search returned ".$results->{hits}->{total}.
- " hits with a reported duration of ".$results->{took}."ms.\n";
-print "Full round-trip time was $duration seconds.\n";
+unless ($quiet) {
+ print "\nSearch returned ".$results->{hits}->{total}.
+ " hits with a reported duration of ".$results->{took}."ms.\n";
+ print "Full round-trip time was $duration seconds.\n";
+}