From fe9db51e9442c938d7d7226aa211b7a3cd09c38d Mon Sep 17 00:00:00 2001 From: Mike Rylander Date: Mon, 24 Feb 2020 11:15:22 -0500 Subject: [PATCH] LP#1864507: Improve browse by removing more ISBD The fix for bug 1308090 introduced an improvement targetted at Facets that removed some extraneous punctuation from author-ish fields. Here we expand that to cover more field types (primarily title), and protect some author data from unhelpful changes. The need for this is primarily driven by the browse interface for patrons. Signed-off-by: Mike Rylander Signed-off-by: Jennifer Weston Signed-off-by: Jane Sandberg --- Open-ILS/src/sql/Pg/030.schema.metabib.sql | 24 +++++-- Open-ILS/src/sql/Pg/950.data.seed-values.sql | 9 +++ Open-ILS/src/sql/Pg/t/lp1308090-facet_punct.pg | 6 ++ ...XX.function.trim_trailing_punct-improvement.sql | 76 ++++++++++++++++++++++ 4 files changed, 109 insertions(+), 6 deletions(-) create mode 100644 Open-ILS/src/sql/Pg/upgrade/XXXX.function.trim_trailing_punct-improvement.sql diff --git a/Open-ILS/src/sql/Pg/030.schema.metabib.sql b/Open-ILS/src/sql/Pg/030.schema.metabib.sql index 86416cac97..40cd60ecaa 100644 --- a/Open-ILS/src/sql/Pg/030.schema.metabib.sql +++ b/Open-ILS/src/sql/Pg/030.schema.metabib.sql @@ -2314,11 +2314,17 @@ $p$ LANGUAGE SQL STABLE; -- This function is used to help clean up facet labels. Due to quirks in -- MARC parsing, some facet labels may be generated with periods or commas -- at the end. This will strip a trailing commas off all the time, and --- periods when they don't look like they are part of initials. --- Smith, John => no change --- Smith, John, => Smith, John --- Smith, John. => Smith, John --- Public, John Q. => no change +-- periods when they don't look like they are part of initials or dotted +-- abbreviations. +-- Smith, John => no change +-- Smith, John, => Smith, John +-- Smith, John. => Smith, John +-- Public, John Q. => no change +-- Public, John, Ph.D. => no change +-- Atlanta -- Georgia -- U.S. => no change +-- Atlanta -- Georgia. => Atlanta, Georgia +-- The fellowship of the rings / => The fellowship of the rings +-- Some title ; => Some title CREATE OR REPLACE FUNCTION metabib.trim_trailing_punctuation ( TEXT ) RETURNS TEXT AS $$ DECLARE result TEXT; @@ -2331,9 +2337,15 @@ BEGIN result := substring(result from '^(.*),$'); ELSIF last_char = '.' THEN - IF substring(result from ' \w\.$') IS NULL THEN + -- must have a single word-character following at least one non-word character + IF substring(result from '\W\w\.$') IS NULL THEN result := substring(result from '^(.*)\.$'); END IF; + + ELSIF last_char IN ('/',':',';','=') THEN -- Dangling subtitle/SoR separator + IF substring(result from ' .$') IS NOT NULL THEN -- must have a space before last_char + result := substring(result from '^(.*) .$'); + END IF; END IF; RETURN result; diff --git a/Open-ILS/src/sql/Pg/950.data.seed-values.sql b/Open-ILS/src/sql/Pg/950.data.seed-values.sql index a3f50050ee..7cd50de78d 100644 --- a/Open-ILS/src/sql/Pg/950.data.seed-values.sql +++ b/Open-ILS/src/sql/Pg/950.data.seed-values.sql @@ -10977,6 +10977,15 @@ INSERT INTO config.metabib_field_index_norm_map (field,norm,pos) WHERE i.func = 'metabib.trim_trailing_punctuation' AND m.id IN (7,8,9,10); +INSERT INTO config.metabib_field_index_norm_map (field,norm,pos) + SELECT m.id, + i.id, + -1 + FROM config.metabib_field m, + config.index_normalizer i + WHERE i.func = 'metabib.trim_trailing_punctuation' + m.field_class='title' AND (m.browse_field OR m.facet_field OR m.display_field) + AND NOT EXISTS (SELECT 1 FROM config.metabib_field_index_norm_map WHERE field = m.id AND norm = i.id); INSERT INTO config.record_attr_index_norm_map (attr,norm,pos) SELECT m.name, i.id, 0 diff --git a/Open-ILS/src/sql/Pg/t/lp1308090-facet_punct.pg b/Open-ILS/src/sql/Pg/t/lp1308090-facet_punct.pg index 07e69f66c0..d68eaf2d3d 100644 --- a/Open-ILS/src/sql/Pg/t/lp1308090-facet_punct.pg +++ b/Open-ILS/src/sql/Pg/t/lp1308090-facet_punct.pg @@ -16,7 +16,13 @@ SELECT is( metabib.trim_trailing_punctuation('Saki'), 'Saki', 'Name no trailing SELECT is( metabib.trim_trailing_punctuation('Smith, John.'), 'Smith, John', 'Chop trailing period'); SELECT is( metabib.trim_trailing_punctuation('Public, John Q.'), 'Public, John Q.', 'Retain trailing period'); SELECT is( metabib.trim_trailing_punctuation('Public, John Q,'), 'Public, John Q', 'Eliminate comma C'); +SELECT is( metabib.trim_trailing_punctuation('Public, John, Ph.D.'), 'Public, John, Ph.D.', 'Retain trailing period in abbr.'); +SELECT is( metabib.trim_trailing_punctuation('Washington DC, U.S.'), 'Washington DC, U.S.', 'Retain trailing period in abbr.'); SELECT is( metabib.trim_trailing_punctuation('(FTC).'), '(FTC)', 'Trailing period'); +SELECT is( metabib.trim_trailing_punctuation('A title with no SoR /'), 'A title with no SoR', 'Remove erroneous trailing slash'); +SELECT is( metabib.trim_trailing_punctuation('A title with no subtitle :'), 'A title with no subtitle', 'Remove erroneous trailing colon'); +SELECT is( metabib.trim_trailing_punctuation('A title ;'), 'A title', 'Remove erroneous trailing semicolon'); +SELECT is( metabib.trim_trailing_punctuation('A title ='), 'A title', 'Remove erroneous trailing equal'); SELECT * FROM finish(); diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.function.trim_trailing_punct-improvement.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.function.trim_trailing_punct-improvement.sql new file mode 100644 index 0000000000..e469f2b2af --- /dev/null +++ b/Open-ILS/src/sql/Pg/upgrade/XXXX.function.trim_trailing_punct-improvement.sql @@ -0,0 +1,76 @@ +BEGIN; + +SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version); + +-- This function is used to help clean up facet labels. Due to quirks in +-- MARC parsing, some facet labels may be generated with periods or commas +-- at the end. This will strip a trailing commas off all the time, and +-- periods when they don't look like they are part of initials or dotted +-- abbreviations. +-- Smith, John => no change +-- Smith, John, => Smith, John +-- Smith, John. => Smith, John +-- Public, John Q. => no change +-- Public, John, Ph.D. => no change +-- Atlanta -- Georgia -- U.S. => no change +-- Atlanta -- Georgia. => Atlanta, Georgia +-- The fellowship of the rings / => The fellowship of the rings +-- Some title ; => Some title +CREATE OR REPLACE FUNCTION metabib.trim_trailing_punctuation ( TEXT ) RETURNS TEXT AS $$ +DECLARE + result TEXT; + last_char TEXT; +BEGIN + result := $1; + last_char = substring(result from '.$'); + + IF last_char = ',' THEN + result := substring(result from '^(.*),$'); + + ELSIF last_char = '.' THEN + -- must have a single word-character following at least one non-word character + IF substring(result from '\W\w\.$') IS NULL THEN + result := substring(result from '^(.*)\.$'); + END IF; + + ELSIF last_char IN ('/',':',';','=') THEN -- Dangling subtitle/SoR separator + IF substring(result from ' .$') IS NOT NULL THEN -- must have a space before last_char + result := substring(result from '^(.*) .$'); + END IF; + END IF; + + RETURN result; + +END; +$$ language 'plpgsql'; + + +INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES ( + 'Trim Trailing Punctuation', + 'Eliminate extraneous trailing ISBD punctuation in text: slashes, colons, commas, and periods', + 'metabib.trim_trailing_punctuation', + 0 +); + +INSERT INTO config.metabib_field_index_norm_map (field,norm,pos) + SELECT m.id, + i.id, + -1 + FROM config.metabib_field m, + config.index_normalizer i + WHERE i.func = 'metabib.trim_trailing_punctuation' + m.field_class='title' AND (m.browse_field OR m.facet_field OR m.display_field) + AND NOT EXISTS (SELECT 1 FROM config.metabib_field_index_norm_map WHERE field = m.id AND norm = i.id); + +COMMIT; + +\qecho A partial reingest is necessary to get the full benefit of this change. +\qecho It will take a while. You can cancel now withoug losing the effect of +\qecho the rest of the upgrade script, and arrange the reingest later. +\qecho + +SELECT metabib.reingest_metabib_field_entries( + id, TRUE, FALSE, FALSE, TRUE, + (SELECT ARRAY_AGG(id) INTO field_list FROM config.metabib_field WHERE field_class='title' AND (browse_field OR facet_field OR display_field)) +) FROM biblio.record_entry; + -- 2.11.0