From 7da2227f1ee3c98b98b518fa2ca4f857b9173a88 Mon Sep 17 00:00:00 2001 From: Thomas Berezansky Date: Tue, 18 Dec 2012 14:35:28 -0500 Subject: [PATCH] Upgrade script for ts config use And the rest of that work Signed-off-by: Thomas Berezansky Signed-off-by: Lebbeous Fogle-Weekley --- .../src/sql/Pg/upgrade/XXXX.schema.ts_configs.sql | 432 +++++++++++++++++++++ 1 file changed, 432 insertions(+) create mode 100644 Open-ILS/src/sql/Pg/upgrade/XXXX.schema.ts_configs.sql diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.ts_configs.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.ts_configs.sql new file mode 100644 index 0000000000..f36d4478ec --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.ts_configs.sql @@ -0,0 +1,432 @@ +BEGIN; + +SET search_path = public, pg_catalog; + +DO $$ +DECLARE +lang TEXT; +BEGIN +FOR lang IN SELECT substring(pptsd.dictname from '(.*)_stem$') AS lang FROM pg_catalog.pg_ts_dict pptsd JOIN pg_catalog.pg_namespace ppn ON ppn.oid = pptsd.dictnamespace +WHERE ppn.nspname = 'pg_catalog' AND pptsd.dictname LIKE '%_stem' LOOP +RAISE NOTICE 'FOUND LANGUAGE %', lang; + +EXECUTE 'DROP TEXT SEARCH DICTIONARY IF EXISTS ' || lang || '_nostop CASCADE; +CREATE TEXT SEARCH DICTIONARY ' || lang || '_nostop (TEMPLATE=pg_catalog.snowball, language=''' || lang || '''); +COMMENT ON TEXT SEARCH DICTIONARY ' || lang || '_nostop IS ''' ||lang || ' snowball stemmer with no stopwords for ASCII words only.''; +CREATE TEXT SEARCH CONFIGURATION ' || lang || '_nostop ( COPY = pg_catalog.' || lang || ' ); +ALTER TEXT SEARCH CONFIGURATION ' || lang || '_nostop ALTER MAPPING FOR word, hword, hword_part WITH pg_catalog.simple; +ALTER TEXT SEARCH CONFIGURATION ' || lang || '_nostop ALTER MAPPING FOR asciiword, asciihword, hword_asciipart WITH ' || lang || '_nostop;'; + +END LOOP; +END; +$$; +CREATE TEXT SEARCH CONFIGURATION keyword ( COPY = english_nostop ); +CREATE TEXT SEARCH CONFIGURATION "default" ( COPY = english_nostop ); + +SET search_path = evergreen, public, pg_catalog; + +ALTER TABLE config.metabib_class + ADD COLUMN a_weight NUMERIC DEFAULT 1.0 NOT NULL, + ADD COLUMN b_weight NUMERIC DEFAULT 0.4 NOT NULL, + ADD COLUMN c_weight NUMERIC DEFAULT 0.2 NOT NULL, + ADD COLUMN d_weight NUMERIC DEFAULT 0.1 NOT NULL; + +CREATE TABLE config.ts_config_list ( + id TEXT PRIMARY KEY, + name TEXT NOT NULL +); +COMMENT ON TABLE config.ts_config_list IS $$ +Full Text Configs + +A list of full text configs with names and descriptions. +$$; + +CREATE TABLE config.metabib_class_ts_map ( + id SERIAL PRIMARY KEY, + field_class TEXT NOT NULL REFERENCES config.metabib_class (name), + ts_config TEXT NOT NULL REFERENCES config.ts_config_list (id), + active BOOL NOT NULL DEFAULT TRUE, + index_weight CHAR(1) NOT NULL DEFAULT 'C' CHECK (index_weight IN ('A','B','C','D')), + index_lang TEXT NULL, + search_lang TEXT NULL, + always BOOL NOT NULL DEFAULT true +); +COMMENT ON TABLE config.metabib_class_ts_map IS $$ +Text Search Configs for metabib class indexing + +This table contains text search config definitions for +storing index_vector values. +$$; + +CREATE TABLE config.metabib_field_ts_map ( + id SERIAL PRIMARY KEY, + metabib_field INT NOT NULL REFERENCES config.metabib_field (id), + ts_config TEXT NOT NULL REFERENCES config.ts_config_list (id), + active BOOL NOT NULL DEFAULT TRUE, + index_weight CHAR(1) NOT NULL DEFAULT 'C' CHECK (index_weight IN ('A','B','C','D')), + index_lang TEXT NULL, + search_lang TEXT NULL +); +COMMENT ON TABLE config.metabib_field_ts_map IS $$ +Text Search Configs for metabib field indexing + +This table contains text search config definitions for +storing index_vector values. +$$; + +CREATE TABLE metabib.combined_identifier_field_entry ( + record BIGINT NOT NULL, + metabib_field INT NULL, + index_vector tsvector NOT NULL +); +CREATE UNIQUE INDEX metabib_combined_identifier_field_entry_fakepk_idx ON metabib.combined_identifier_field_entry (record, COALESCE(metabib_field::TEXT,'')); +CREATE INDEX metabib_combined_identifier_field_entry_index_vector_idx ON metabib.combined_identifier_field_entry USING GIST (index_vector); +CREATE INDEX metabib_combined_identifier_field_source_idx ON metabib.combined_identifier_field_entry (metabib_field); + +CREATE TABLE metabib.combined_title_field_entry ( + record BIGINT NOT NULL, + metabib_field INT NULL, + index_vector tsvector NOT NULL +); +CREATE UNIQUE INDEX metabib_combined_title_field_entry_fakepk_idx ON metabib.combined_title_field_entry (record, COALESCE(metabib_field::TEXT,'')); +CREATE INDEX metabib_combined_title_field_entry_index_vector_idx ON metabib.combined_title_field_entry USING GIST (index_vector); +CREATE INDEX metabib_combined_title_field_source_idx ON metabib.combined_title_field_entry (metabib_field); + +CREATE TABLE metabib.combined_author_field_entry ( + record BIGINT NOT NULL, + metabib_field INT NULL, + index_vector tsvector NOT NULL +); +CREATE UNIQUE INDEX metabib_combined_author_field_entry_fakepk_idx ON metabib.combined_author_field_entry (record, COALESCE(metabib_field::TEXT,'')); +CREATE INDEX metabib_combined_author_field_entry_index_vector_idx ON metabib.combined_author_field_entry USING GIST (index_vector); +CREATE INDEX metabib_combined_author_field_source_idx ON metabib.combined_author_field_entry (metabib_field); + +CREATE TABLE metabib.combined_subject_field_entry ( + record BIGINT NOT NULL, + metabib_field INT NULL, + index_vector tsvector NOT NULL +); +CREATE UNIQUE INDEX metabib_combined_subject_field_entry_fakepk_idx ON metabib.combined_subject_field_entry (record, COALESCE(metabib_field::TEXT,'')); +CREATE INDEX metabib_combined_subject_field_entry_index_vector_idx ON metabib.combined_subject_field_entry USING GIST (index_vector); +CREATE INDEX metabib_combined_subject_field_source_idx ON metabib.combined_subject_field_entry (metabib_field); + +CREATE TABLE metabib.combined_keyword_field_entry ( + record BIGINT NOT NULL, + metabib_field INT NULL, + index_vector tsvector NOT NULL +); +CREATE UNIQUE INDEX metabib_combined_keyword_field_entry_fakepk_idx ON metabib.combined_keyword_field_entry (record, COALESCE(metabib_field::TEXT,'')); +CREATE INDEX metabib_combined_keyword_field_entry_index_vector_idx ON metabib.combined_keyword_field_entry USING GIST (index_vector); +CREATE INDEX metabib_combined_keyword_field_source_idx ON metabib.combined_keyword_field_entry (metabib_field); + +CREATE TABLE metabib.combined_series_field_entry ( + record BIGINT NOT NULL, + metabib_field INT NULL, + index_vector tsvector NOT NULL +); +CREATE UNIQUE INDEX metabib_combined_series_field_entry_fakepk_idx ON metabib.combined_series_field_entry (record, COALESCE(metabib_field::TEXT,'')); +CREATE INDEX metabib_combined_series_field_entry_index_vector_idx ON metabib.combined_series_field_entry USING GIST (index_vector); +CREATE INDEX metabib_combined_series_field_source_idx ON metabib.combined_series_field_entry (metabib_field); + +CREATE OR REPLACE FUNCTION metabib.update_combined_index_vectors(bib_id BIGINT) RETURNS VOID AS $func$ +BEGIN + DELETE FROM metabib.combined_keyword_field_entry WHERE record = bib_id; + INSERT INTO metabib.combined_keyword_field_entry(record, metabib_field, index_vector) + SELECT bib_id, field, strip(COALESCE(string_agg(index_vector::TEXT,' '),'')::tsvector) + FROM metabib.keyword_field_entry WHERE source = bib_id GROUP BY field; + INSERT INTO metabib.combined_keyword_field_entry(record, metabib_field, index_vector) + SELECT bib_id, NULL, strip(COALESCE(string_agg(index_vector::TEXT,' '),'')::tsvector) + FROM metabib.keyword_field_entry WHERE source = bib_id; + + DELETE FROM metabib.combined_title_field_entry WHERE record = bib_id; + INSERT INTO metabib.combined_title_field_entry(record, metabib_field, index_vector) + SELECT bib_id, field, strip(COALESCE(string_agg(index_vector::TEXT,' '),'')::tsvector) + FROM metabib.title_field_entry WHERE source = bib_id GROUP BY field; + INSERT INTO metabib.combined_title_field_entry(record, metabib_field, index_vector) + SELECT bib_id, NULL, strip(COALESCE(string_agg(index_vector::TEXT,' '),'')::tsvector) + FROM metabib.title_field_entry WHERE source = bib_id; + + DELETE FROM metabib.combined_author_field_entry WHERE record = bib_id; + INSERT INTO metabib.combined_author_field_entry(record, metabib_field, index_vector) + SELECT bib_id, field, strip(COALESCE(string_agg(index_vector::TEXT,' '),'')::tsvector) + FROM metabib.author_field_entry WHERE source = bib_id GROUP BY field; + INSERT INTO metabib.combined_author_field_entry(record, metabib_field, index_vector) + SELECT bib_id, NULL, strip(COALESCE(string_agg(index_vector::TEXT,' '),'')::tsvector) + FROM metabib.author_field_entry WHERE source = bib_id; + + DELETE FROM metabib.combined_subject_field_entry WHERE record = bib_id; + INSERT INTO metabib.combined_subject_field_entry(record, metabib_field, index_vector) + SELECT bib_id, field, strip(COALESCE(string_agg(index_vector::TEXT,' '),'')::tsvector) + FROM metabib.subject_field_entry WHERE source = bib_id GROUP BY field; + INSERT INTO metabib.combined_subject_field_entry(record, metabib_field, index_vector) + SELECT bib_id, NULL, strip(COALESCE(string_agg(index_vector::TEXT,' '),'')::tsvector) + FROM metabib.subject_field_entry WHERE source = bib_id; + + DELETE FROM metabib.combined_series_field_entry WHERE record = bib_id; + INSERT INTO metabib.combined_series_field_entry(record, metabib_field, index_vector) + SELECT bib_id, field, strip(COALESCE(string_agg(index_vector::TEXT,' '),'')::tsvector) + FROM metabib.series_field_entry WHERE source = bib_id GROUP BY field; + INSERT INTO metabib.combined_series_field_entry(record, metabib_field, index_vector) + SELECT bib_id, NULL, strip(COALESCE(string_agg(index_vector::TEXT,' '),'')::tsvector) + FROM metabib.series_field_entry WHERE source = bib_id; + + DELETE FROM metabib.combined_identifier_field_entry WHERE record = bib_id; + INSERT INTO metabib.combined_identifier_field_entry(record, metabib_field, index_vector) + SELECT bib_id, field, strip(COALESCE(string_agg(index_vector::TEXT,' '),'')::tsvector) + FROM metabib.identifier_field_entry WHERE source = bib_id GROUP BY field; + INSERT INTO metabib.combined_identifier_field_entry(record, metabib_field, index_vector) + SELECT bib_id, NULL, strip(COALESCE(string_agg(index_vector::TEXT,' '),'')::tsvector) + FROM metabib.identifier_field_entry WHERE source = bib_id; + +END; +$func$ LANGUAGE PLPGSQL; + +CREATE OR REPLACE FUNCTION metabib.reingest_metabib_field_entries( bib_id BIGINT, skip_facet BOOL DEFAULT FALSE, skip_browse BOOL DEFAULT FALSE, skip_search BOOL DEFAULT FALSE ) RETURNS VOID AS $func$ +DECLARE + fclass RECORD; + ind_data metabib.field_entry_template%ROWTYPE; + mbe_row metabib.browse_entry%ROWTYPE; + mbe_id BIGINT; +BEGIN + PERFORM * FROM config.internal_flag WHERE name = 'ingest.assume_inserts_only' AND enabled; + IF NOT FOUND THEN + IF NOT skip_search THEN + FOR fclass IN SELECT * FROM config.metabib_class LOOP + -- RAISE NOTICE 'Emptying out %', fclass.name; + EXECUTE $$DELETE FROM metabib.$$ || fclass.name || $$_field_entry WHERE source = $$ || bib_id; + END LOOP; + END IF; + IF NOT skip_facet THEN + DELETE FROM metabib.facet_entry WHERE source = bib_id; + END IF; + IF NOT skip_browse THEN + DELETE FROM metabib.browse_entry_def_map WHERE source = bib_id; + END IF; + END IF; + + FOR ind_data IN SELECT * FROM biblio.extract_metabib_field_entry( bib_id ) LOOP + IF ind_data.field < 0 THEN + ind_data.field = -1 * ind_data.field; + END IF; + + IF ind_data.facet_field AND NOT skip_facet THEN + INSERT INTO metabib.facet_entry (field, source, value) + VALUES (ind_data.field, ind_data.source, ind_data.value); + END IF; + + IF ind_data.browse_field AND NOT skip_browse THEN + -- A caveat about this SELECT: this should take care of replacing + -- old mbe rows when data changes, but not if normalization (by + -- which I mean specifically the output of + -- evergreen.oils_tsearch2()) changes. It may or may not be + -- expensive to add a comparison of index_vector to index_vector + -- to the WHERE clause below. + SELECT INTO mbe_row * FROM metabib.browse_entry WHERE value = ind_data.value; + IF FOUND THEN + mbe_id := mbe_row.id; + ELSE + INSERT INTO metabib.browse_entry (value) VALUES + (metabib.browse_normalize(ind_data.value, ind_data.field)); + mbe_id := CURRVAL('metabib.browse_entry_id_seq'::REGCLASS); + END IF; + + INSERT INTO metabib.browse_entry_def_map (entry, def, source) + VALUES (mbe_id, ind_data.field, ind_data.source); + END IF; + + IF ind_data.search_field AND NOT skip_search THEN + EXECUTE $$ + INSERT INTO metabib.$$ || ind_data.field_class || $$_field_entry (field, source, value) + VALUES ($$ || + quote_literal(ind_data.field) || $$, $$ || + quote_literal(ind_data.source) || $$, $$ || + quote_literal(ind_data.value) || + $$);$$; + END IF; + + END LOOP; + + IF NOT skip_search THEN + PERFORM metabib.update_combined_index_vectors(bib_id); + END IF; + + RETURN; +END; +$func$ LANGUAGE PLPGSQL; + +DROP FUNCTION IF EXISTS evergreen.oils_tsearch2() CASCADE; +DROP FUNCTION IF EXISTS public.oils_tsearch2() CASCADE; + +CREATE OR REPLACE FUNCTION public.oils_tsearch2 () RETURNS TRIGGER AS $$ +DECLARE + normalizer RECORD; + value TEXT := ''; + temp_vector TEXT := ''; + ts_rec RECORD; + cur_weight "char"; +BEGIN + value := NEW.value; + NEW.index_vector = ''::tsvector; + + IF TG_TABLE_NAME::TEXT ~ 'field_entry$' THEN + FOR normalizer IN + SELECT n.func AS func, + n.param_count AS param_count, + m.params AS params + FROM config.index_normalizer n + JOIN config.metabib_field_index_norm_map m ON (m.norm = n.id) + WHERE field = NEW.field + ORDER BY m.pos LOOP + EXECUTE 'SELECT ' || normalizer.func || '(' || + quote_literal( value ) || + CASE + WHEN normalizer.param_count > 0 + THEN ',' || REPLACE(REPLACE(BTRIM(normalizer.params,'[]'),E'\'',E'\\\''),E'"',E'\'') + ELSE '' + END || + ')' INTO value; + + END LOOP; + NEW.value = value; + END IF; + + IF TG_TABLE_NAME::TEXT ~ 'browse_entry$' THEN + value := ARRAY_TO_STRING( + evergreen.regexp_split_to_array(value, E'\\W+'), ' ' + ); + value := public.search_normalize(value); + NEW.index_vector = to_tsvector(TG_ARGV[0]::regconfig, value); + ELSIF TG_TABLE_NAME::TEXT ~ 'field_entry$' THEN + FOR ts_rec IN + SELECT ts_config, index_weight + FROM config.metabib_class_ts_map + WHERE field_class = TG_ARGV[0] + AND index_lang IS NULL OR EXISTS (SELECT 1 FROM metabib.record_attr WHERE id = NEW.source AND index_lang IN(attrs->'item_lang',attrs->'language')) + AND always OR NOT EXISTS (SELECT 1 FROM config.metabib_field_ts_map WHERE metabib_field = NEW.field) + UNION + SELECT ts_config, index_weight + FROM config.metabib_field_ts_map + WHERE metabib_field = NEW.field + AND index_lang IS NULL OR EXISTS (SELECT 1 FROM metabib.record_attr WHERE id = NEW.source AND index_lang IN(attrs->'item_lang',attrs->'language')) + ORDER BY index_weight ASC + LOOP + IF cur_weight IS NOT NULL AND cur_weight != ts_rec.index_weight THEN + NEW.index_vector = NEW.index_vector || setweight(temp_vector::tsvector,cur_weight); + temp_vector = ''; + END IF; + cur_weight = ts_rec.index_weight; + SELECT INTO temp_vector temp_vector || ' ' || to_tsvector(ts_rec.ts_config::regconfig, value)::TEXT; + END LOOP; + NEW.index_vector = NEW.index_vector || setweight(temp_vector::tsvector,cur_weight); + ELSE + NEW.index_vector = to_tsvector(TG_ARGV[0]::regconfig, value); + END IF; + + RETURN NEW; +END; +$$ LANGUAGE PLPGSQL; + +CREATE TRIGGER authority_full_rec_fti_trigger + BEFORE UPDATE OR INSERT ON authority.full_rec + FOR EACH ROW EXECUTE PROCEDURE oils_tsearch2('keyword'); + +CREATE TRIGGER authority_simple_heading_fti_trigger + BEFORE UPDATE OR INSERT ON authority.simple_heading + FOR EACH ROW EXECUTE PROCEDURE oils_tsearch2('keyword'); + +CREATE TRIGGER metabib_identifier_field_entry_fti_trigger + BEFORE UPDATE OR INSERT ON metabib.identifier_field_entry + FOR EACH ROW EXECUTE PROCEDURE oils_tsearch2('identifier'); + +CREATE TRIGGER metabib_title_field_entry_fti_trigger + BEFORE UPDATE OR INSERT ON metabib.title_field_entry + FOR EACH ROW EXECUTE PROCEDURE oils_tsearch2('title'); + +CREATE TRIGGER metabib_author_field_entry_fti_trigger + BEFORE UPDATE OR INSERT ON metabib.author_field_entry + FOR EACH ROW EXECUTE PROCEDURE oils_tsearch2('author'); + +CREATE TRIGGER metabib_subject_field_entry_fti_trigger + BEFORE UPDATE OR INSERT ON metabib.subject_field_entry + FOR EACH ROW EXECUTE PROCEDURE oils_tsearch2('subject'); + +CREATE TRIGGER metabib_keyword_field_entry_fti_trigger + BEFORE UPDATE OR INSERT ON metabib.keyword_field_entry + FOR EACH ROW EXECUTE PROCEDURE oils_tsearch2('keyword'); + +CREATE TRIGGER metabib_series_field_entry_fti_trigger + BEFORE UPDATE OR INSERT ON metabib.series_field_entry + FOR EACH ROW EXECUTE PROCEDURE oils_tsearch2('series'); + +CREATE TRIGGER metabib_browse_entry_fti_trigger + BEFORE INSERT OR UPDATE ON metabib.browse_entry + FOR EACH ROW EXECUTE PROCEDURE oils_tsearch2('keyword'); + +CREATE TRIGGER metabib_full_rec_fti_trigger + BEFORE UPDATE OR INSERT ON metabib.real_full_rec + FOR EACH ROW EXECUTE PROCEDURE oils_tsearch2('default'); + +INSERT INTO config.ts_config_list(id, name) VALUES + ('simple','Non-Stemmed Simple'), + ('danish_nostop','Danish Stemmed'), + ('dutch_nostop','Dutch Stemmed'), + ('english_nostop','English Stemmed'), + ('finnish_nostop','Finnish Stemmed'), + ('french_nostop','French Stemmed'), + ('german_nostop','German Stemmed'), + ('hungarian_nostop','Hungarian Stemmed'), + ('italian_nostop','Italian Stemmed'), + ('norwegian_nostop','Norwegian Stemmed'), + ('portuguese_nostop','Portuguese Stemmed'), + ('romanian_nostop','Romanian Stemmed'), + ('russian_nostop','Russian Stemmed'), + ('spanish_nostop','Spanish Stemmed'), + ('swedish_nostop','Swedish Stemmed'), + ('turkish_nostop','Turkish Stemmed'); + +INSERT INTO config.metabib_class_ts_map(field_class, ts_config, index_weight, always) VALUES + ('keyword','simple','A',true), + ('keyword','english_nostop','C',true), + ('title','simple','A',true), + ('title','english_nostop','C',true), + ('author','simple','A',true), + ('author','english_nostop','C',true), + ('series','simple','A',true), + ('series','english_nostop','C',true), + ('subject','simple','A',true), + ('subject','english_nostop','C',true), + ('identifier','simple','A',true); + +CREATE OR REPLACE FUNCTION evergreen.rel_bump(terms TEXT[], value TEXT, bumps TEXT[], mults NUMERIC[]) RETURNS NUMERIC AS +$BODY$ +use strict; +my ($terms,$value,$bumps,$mults) = @_; + +my $retval = 1; + +for (my $id = 0; $id < @$bumps; $id++) { + if ($bumps->[$id] eq 'first_word') { + $retval *= $mults->[$id] if ($value =~ /^$terms->[0]/); + } elsif ($bumps->[$id] eq 'full_match') { + my $fullmatch = join(' ', @$terms); + $retval *= $mults->[$id] if ($value =~ /^$fullmatch$/); + } elsif ($bumps->[$id] eq 'word_order') { + my $wordorder = join('.*', @$terms); + $retval *= $mults->[$id] if ($value =~ /$wordorder/); + } +} +return $retval; +$BODY$ LANGUAGE plperlu IMMUTABLE STRICT COST 100; + +UPDATE metabib.identifier_field_entry set value = value; +UPDATE metabib.title_field_entry set value = value; +UPDATE metabib.author_field_entry set value = value; +UPDATE metabib.subject_field_entry set value = value; +UPDATE metabib.keyword_field_entry set value = value; +UPDATE metabib.series_field_entry set value = value; + +SELECT metabib.update_combined_index_vectors(id) + FROM biblio.record_entry + WHERE NOT deleted; + +COMMIT; -- 2.11.0