-- This function is used to help clean up facet labels. Due to quirks in
-- MARC parsing, some facet labels may be generated with periods or commas
-- at the end. This will strip a trailing commas off all the time, and
--- periods when they don't look like they are part of initials.
--- Smith, John => no change
--- Smith, John, => Smith, John
--- Smith, John. => Smith, John
--- Public, John Q. => no change
+-- periods when they don't look like they are part of initials or dotted
+-- abbreviations.
+-- Smith, John => no change
+-- Smith, John, => Smith, John
+-- Smith, John. => Smith, John
+-- Public, John Q. => no change
+-- Public, John, Ph.D. => no change
+-- Atlanta -- Georgia -- U.S. => no change
+-- Atlanta -- Georgia. => Atlanta, Georgia
+-- The fellowship of the rings / => The fellowship of the rings
+-- Some title ; => Some title
CREATE OR REPLACE FUNCTION metabib.trim_trailing_punctuation ( TEXT ) RETURNS TEXT AS $$
DECLARE
result TEXT;
result := substring(result from '^(.*),$');
ELSIF last_char = '.' THEN
- IF substring(result from ' \w\.$') IS NULL THEN
+ -- must have a single word-character following at least one non-word character
+ IF substring(result from '\W\w\.$') IS NULL THEN
result := substring(result from '^(.*)\.$');
END IF;
+
+ ELSIF last_char IN ('/',':',';','=') THEN -- Dangling subtitle/SoR separator
+ IF substring(result from ' .$') IS NOT NULL THEN -- must have a space before last_char
+ result := substring(result from '^(.*) .$');
+ END IF;
END IF;
RETURN result;
WHERE i.func = 'metabib.trim_trailing_punctuation'
AND m.id IN (7,8,9,10);
+INSERT INTO config.metabib_field_index_norm_map (field,norm,pos)
+ SELECT m.id,
+ i.id,
+ -1
+ FROM config.metabib_field m,
+ config.index_normalizer i
+ WHERE i.func = 'metabib.trim_trailing_punctuation'
+ m.field_class='title' AND (m.browse_field OR m.facet_field OR m.display_field)
+ AND NOT EXISTS (SELECT 1 FROM config.metabib_field_index_norm_map WHERE field = m.id AND norm = i.id);
INSERT INTO config.record_attr_index_norm_map (attr,norm,pos)
SELECT m.name, i.id, 0
SELECT is( metabib.trim_trailing_punctuation('Smith, John.'), 'Smith, John', 'Chop trailing period');
SELECT is( metabib.trim_trailing_punctuation('Public, John Q.'), 'Public, John Q.', 'Retain trailing period');
SELECT is( metabib.trim_trailing_punctuation('Public, John Q,'), 'Public, John Q', 'Eliminate comma C');
+SELECT is( metabib.trim_trailing_punctuation('Public, John, Ph.D.'), 'Public, John, Ph.D.', 'Retain trailing period in abbr.');
+SELECT is( metabib.trim_trailing_punctuation('Washington DC, U.S.'), 'Washington DC, U.S.', 'Retain trailing period in abbr.');
SELECT is( metabib.trim_trailing_punctuation('(FTC).'), '(FTC)', 'Trailing period');
+SELECT is( metabib.trim_trailing_punctuation('A title with no SoR /'), 'A title with no SoR', 'Remove erroneous trailing slash');
+SELECT is( metabib.trim_trailing_punctuation('A title with no subtitle :'), 'A title with no subtitle', 'Remove erroneous trailing colon');
+SELECT is( metabib.trim_trailing_punctuation('A title ;'), 'A title', 'Remove erroneous trailing semicolon');
+SELECT is( metabib.trim_trailing_punctuation('A title ='), 'A title', 'Remove erroneous trailing equal');
SELECT * FROM finish();
--- /dev/null
+BEGIN;
+
+SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version);
+
+-- This function is used to help clean up facet labels. Due to quirks in
+-- MARC parsing, some facet labels may be generated with periods or commas
+-- at the end. This will strip a trailing commas off all the time, and
+-- periods when they don't look like they are part of initials or dotted
+-- abbreviations.
+-- Smith, John => no change
+-- Smith, John, => Smith, John
+-- Smith, John. => Smith, John
+-- Public, John Q. => no change
+-- Public, John, Ph.D. => no change
+-- Atlanta -- Georgia -- U.S. => no change
+-- Atlanta -- Georgia. => Atlanta, Georgia
+-- The fellowship of the rings / => The fellowship of the rings
+-- Some title ; => Some title
+CREATE OR REPLACE FUNCTION metabib.trim_trailing_punctuation ( TEXT ) RETURNS TEXT AS $$
+DECLARE
+ result TEXT;
+ last_char TEXT;
+BEGIN
+ result := $1;
+ last_char = substring(result from '.$');
+
+ IF last_char = ',' THEN
+ result := substring(result from '^(.*),$');
+
+ ELSIF last_char = '.' THEN
+ -- must have a single word-character following at least one non-word character
+ IF substring(result from '\W\w\.$') IS NULL THEN
+ result := substring(result from '^(.*)\.$');
+ END IF;
+
+ ELSIF last_char IN ('/',':',';','=') THEN -- Dangling subtitle/SoR separator
+ IF substring(result from ' .$') IS NOT NULL THEN -- must have a space before last_char
+ result := substring(result from '^(.*) .$');
+ END IF;
+ END IF;
+
+ RETURN result;
+
+END;
+$$ language 'plpgsql';
+
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+ 'Trim Trailing Punctuation',
+ 'Eliminate extraneous trailing ISBD punctuation in text: slashes, colons, commas, and periods',
+ 'metabib.trim_trailing_punctuation',
+ 0
+);
+
+INSERT INTO config.metabib_field_index_norm_map (field,norm,pos)
+ SELECT m.id,
+ i.id,
+ -1
+ FROM config.metabib_field m,
+ config.index_normalizer i
+ WHERE i.func = 'metabib.trim_trailing_punctuation'
+ m.field_class='title' AND (m.browse_field OR m.facet_field OR m.display_field)
+ AND NOT EXISTS (SELECT 1 FROM config.metabib_field_index_norm_map WHERE field = m.id AND norm = i.id);
+
+COMMIT;
+
+\qecho A partial reingest is necessary to get the full benefit of this change.
+\qecho It will take a while. You can cancel now withoug losing the effect of
+\qecho the rest of the upgrade script, and arrange the reingest later.
+\qecho
+
+SELECT metabib.reingest_metabib_field_entries(
+ id, TRUE, FALSE, FALSE, TRUE,
+ (SELECT ARRAY_AGG(id) INTO field_list FROM config.metabib_field WHERE field_class='title' AND (browse_field OR facet_field OR display_field))
+) FROM biblio.record_entry;
+