raw_text := REGEXP_REPLACE(raw_text, E'^(\\w+).*?$', E'\\1');
END IF;
- output_text := output_text || REGEXP_REPLACE(raw_text, E'\\s+', '', 'g');
+ output_text := output_text || idx.name || ':' ||
+ REGEXP_REPLACE(raw_text, E'\\s+', '', 'g') || ' ';
END LOOP;
- RETURN output_text;
+ RETURN BTRIM(output_text);
END;
$func$ LANGUAGE PLPGSQL;
--- /dev/null
+-- tests to verify biblio fingerprints avoid conflating
+-- works where words coming from titles and authors might overlap
+--
+-- Two bibs are inserted inside a transaction that is rolled back at the end:
+--   * test-1 has author "Jasper, Frances" and title "Gzarniblat"
+--   * test-2 has no author and title "Gzarniblat Jasper"
+-- The single assertion checks that the metarecord mapped to each bib differs.
+BEGIN;
+
+SELECT plan(1);
+
+-- Fixture 1: the word "Jasper" comes from the author field (tag 100).
+INSERT INTO biblio.record_entry (last_xact_id, marc) VALUES (
+ 'bib-fingerprint-test-1',
+ $record$<record xmlns="http://www.loc.gov/MARC21/slim">
+ <leader>02137cam a2200457 a 4500</leader>
+ <controlfield tag="001">ocn694080497</controlfield>
+ <controlfield tag="005">20160729104757.0</controlfield>
+ <controlfield tag="008">101217s2011 txu b 001 0 eng </controlfield>
+ <datafield tag="100" ind1="0" ind2="0">
+ <subfield code="a">Jasper, Frances</subfield>
+ </datafield>
+ <datafield tag="245" ind1="0" ind2="0">
+ <subfield code="a">Gzarniblat</subfield>
+ </datafield>
+</record>$record$);
+
+-- Fixture 2: the word "Jasper" comes from the title field (tag 245) instead.
+INSERT INTO biblio.record_entry (last_xact_id, marc) VALUES (
+ 'bib-fingerprint-test-2',
+ $record$<record xmlns="http://www.loc.gov/MARC21/slim">
+ <leader>02137cam a2200457 a 4500</leader>
+ <controlfield tag="001">ocn694080497</controlfield>
+ <controlfield tag="005">20160729104757.0</controlfield>
+ <controlfield tag="008">101217s2011 txu b 001 0 eng </controlfield>
+ <datafield tag="245" ind1="0" ind2="0">
+ <subfield code="a">Gzarniblat Jasper</subfield>
+ </datafield>
+</record>$record$);
+
+-- The fixtures are located again through the last_xact_id markers set above.
+SELECT results_ne(
+ $$
+ SELECT metarecord FROM metabib.metarecord_source_map
+ WHERE source = (
+ SELECT id FROM biblio.record_entry WHERE last_xact_id = 'bib-fingerprint-test-1'
+ )
+ $$,
+ $$
+ SELECT metarecord FROM metabib.metarecord_source_map
+ WHERE source = (
+ SELECT id FROM biblio.record_entry WHERE last_xact_id = 'bib-fingerprint-test-2'
+ )
+ $$,
+ 'LP#1528901: same words in title and author do not stick different bibs in same metarecord'
+);
+
+-- Report a mismatch between plan(1) and the number of tests actually run.
+SELECT * FROM finish();
+
+-- Discard the fixture rows.
+ROLLBACK;
--- /dev/null
+BEGIN;
+
+--- SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version);
+
+-- biblio.extract_fingerprint(marc)
+--
+-- Builds a fingerprint string for a bib record from its MARC XML.
+--
+-- For every row of config.biblio_fingerprint (ordered by format, id) the
+-- function evaluates the row's xpath against the record (transformed with
+-- the config.xml_transform whose name equals the row's format), normalizes
+-- the resulting text, and appends "<name>:<text>" to the output.  Entries
+-- are separated by a single space, so text extracted by differently named
+-- entries stays distinguishable in the fingerprint.
+--
+-- Parameters:
+--   marc - the record XML as text
+-- Returns:
+--   NULL when marc is NULL or empty; otherwise the fingerprint with leading
+--   and trailing spaces trimmed.
+CREATE OR REPLACE FUNCTION biblio.extract_fingerprint ( marc text ) RETURNS TEXT AS $func$
+DECLARE
+    idx             config.biblio_fingerprint%ROWTYPE;
+    xfrm            config.xml_transform%ROWTYPE;
+    prev_xfrm       TEXT;   -- name of the transform used on the previous iteration
+    transformed_xml TEXT;   -- cached result of the most recent transform
+    raw_text        TEXT;
+    output_text     TEXT := '';
+BEGIN
+
+    IF marc IS NULL OR marc = '' THEN
+        RETURN NULL;
+    END IF;
+
+    -- Loop over the indexing entries
+    FOR idx IN SELECT * FROM config.biblio_fingerprint ORDER BY format, id LOOP
+
+        SELECT INTO xfrm * from config.xml_transform WHERE name = idx.format;
+
+        -- See if we can skip the XSLT ... it's expensive.  Entries are ordered
+        -- by format, so consecutive entries can reuse the cached transform.
+        IF prev_xfrm IS NULL OR prev_xfrm <> xfrm.name THEN
+            -- Can't skip the transform
+            IF xfrm.xslt <> '---' THEN
+                transformed_xml := oils_xslt_process(marc,xfrm.xslt);
+            ELSE
+                -- '---' marks a transform with no stylesheet: use the record as-is
+                transformed_xml := marc;
+            END IF;
+
+            prev_xfrm := xfrm.name;
+        END IF;
+
+        -- Take the first node matched by the entry's xpath, concatenate all of
+        -- its text nodes, and normalize; fall back to '' when nothing matches.
+        raw_text := COALESCE(
+            naco_normalize(
+                ARRAY_TO_STRING(
+                    oils_xpath(
+                        '//text()',
+                        (oils_xpath(
+                            idx.xpath,
+                            transformed_xml,
+                            ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]]
+                        ))[1]
+                    ),
+                    ''
+                )
+            ),
+            ''
+        );
+
+        -- Drop the first bracketed segment (no 'g' flag, so only one is removed)
+        raw_text := REGEXP_REPLACE(raw_text, E'\\[.+?\\]', E'');
+        -- Drop the whole words "the", "a", "an" and "and" (the compact pattern
+        -- also matches "ad").  \m and \M anchor to word boundaries.
+        raw_text := REGEXP_REPLACE(raw_text, E'\\mthe\\M|\\man?d?\\M', E'', 'g'); -- arg! the pain!
+
+        -- Optionally keep only the first word of what remains
+        IF idx.first_word IS TRUE THEN
+            raw_text := REGEXP_REPLACE(raw_text, E'^(\\w+).*?$', E'\\1');
+        END IF;
+
+        -- Prefix with the entry name and strip all whitespace from the text
+        output_text := output_text || idx.name || ':' ||
+            REGEXP_REPLACE(raw_text, E'\\s+', '', 'g') || ' ';
+
+    END LOOP;
+
+    -- Remove the separator left after the final entry
+    RETURN BTRIM(output_text);
+
+END;
+$func$ LANGUAGE PLPGSQL;
+
+COMMIT;
+
+\qecho Recalculating bib fingerprints
+-- These statements run after the COMMIT above, outside the transaction
+-- that redefines the function.  User triggers are disabled around the bulk
+-- UPDATE so they do not fire for each rewritten row.
+ALTER TABLE biblio.record_entry DISABLE TRIGGER USER;
+UPDATE biblio.record_entry SET fingerprint = biblio.extract_fingerprint(marc) WHERE NOT deleted;
+ALTER TABLE biblio.record_entry ENABLE TRIGGER USER;
+
+\qecho Remapping metarecords
+-- A single pass over the non-deleted bibs, using the fingerprints just stored.
+SELECT metabib.remap_metarecord_for_bib(id, fingerprint)
+FROM biblio.record_entry
+WHERE NOT deleted;