LP#1528901: avoid accidental bib fingerprint collisions
authorGalen Charlton <gmc@esilibrary.com>
Fri, 28 Oct 2016 17:00:45 +0000 (13:00 -0400)
committerMike Rylander <mrylander@gmail.com>
Thu, 16 Feb 2017 15:02:41 +0000 (10:02 -0500)
This patch fixes a problem where the bib fingerprint algorithm
could end up putting completely different works in the same
metarecord. For example,

100 $a Steel, Danielle
245 $a Blue

and

*no 1XX
245 $a Blue steel

previously (with stock config.biblio_fingerprint settings) got
a fingerprint of "bluesteel". With this patch, their fingerprints
are now:

"Title:blue Author:steel"

and

"Title:bluesteel Author:"

The upgrade script supplied with this patch remaps the metarecords
after updating the fingerprints. While existing metarecord holds
may get moved, note that there is no known way of ensuring that a
metarecord hold placed on a collided metarecord will end up attach
to whatever work the patron intended to request.

To test:

[1] Add records for "Blue" and "Blue steel".
[2] Note that they end up on the same metarecord.
[3] Apply the patch and perform the update.
[4] The two bibs should now be on separate metarecords.

Signed-off-by: Galen Charlton <gmc@esilibrary.com>
Signed-off-by: Rogan Hamby <rogan.hamby@gmail.com>
Signed-off-by: Kathy Lussier <klussier@masslnc.org>
Signed-off-by: Mike Rylander <mrylander@gmail.com>
Open-ILS/src/sql/Pg/030.schema.metabib.sql
Open-ILS/src/sql/Pg/t/lp1528901_more_precise_fingerprints.pg [new file with mode: 0644]
Open-ILS/src/sql/Pg/upgrade/XXXX.schema.update_fingerprinting.sql [new file with mode: 0644]

index 71bab56..f79d09e 100644 (file)
@@ -1073,11 +1073,12 @@ BEGIN
             raw_text := REGEXP_REPLACE(raw_text, E'^(\\w+).*?$', E'\\1');
         END IF;
 
-               output_text := output_text || REGEXP_REPLACE(raw_text, E'\\s+', '', 'g');
+               output_text := output_text || idx.name || ':' ||
+                                          REGEXP_REPLACE(raw_text, E'\\s+', '', 'g') || ' ';
 
        END LOOP;
 
-    RETURN output_text;
+    RETURN BTRIM(output_text);
 
 END;
 $func$ LANGUAGE PLPGSQL;
diff --git a/Open-ILS/src/sql/Pg/t/lp1528901_more_precise_fingerprints.pg b/Open-ILS/src/sql/Pg/t/lp1528901_more_precise_fingerprints.pg
new file mode 100644 (file)
index 0000000..bb06244
--- /dev/null
@@ -0,0 +1,50 @@
+-- tests to verify biblio fingerprints avoid conflating
+-- works where words coming from titles and authors might overlap
+BEGIN;
+
+SELECT plan(1);
+
+INSERT INTO biblio.record_entry (last_xact_id, marc) VALUES (
+    'bib-fingerprint-test-1',
+    $record$<record xmlns="http://www.loc.gov/MARC21/slim">
+  <leader>02137cam a2200457 a 4500</leader>
+  <controlfield tag="001">ocn694080497</controlfield>
+  <controlfield tag="005">20160729104757.0</controlfield>
+  <controlfield tag="008">101217s2011    txu      b    001 0 eng  </controlfield>
+  <datafield tag="100" ind1="0" ind2="0">
+    <subfield code="a">Jasper, Frances</subfield>
+  </datafield>
+  <datafield tag="245" ind1="0" ind2="0">
+    <subfield code="a">Gzarniblat</subfield>
+  </datafield>
+</record>$record$);
+
+INSERT INTO biblio.record_entry (last_xact_id, marc) VALUES (
+    'bib-fingerprint-test-2',
+    $record$<record xmlns="http://www.loc.gov/MARC21/slim">
+  <leader>02137cam a2200457 a 4500</leader>
+  <controlfield tag="001">ocn694080497</controlfield>
+  <controlfield tag="005">20160729104757.0</controlfield>
+  <controlfield tag="008">101217s2011    txu      b    001 0 eng  </controlfield>
+  <datafield tag="245" ind1="0" ind2="0">
+    <subfield code="a">Gzarniblat Jasper</subfield>
+  </datafield>
+</record>$record$);
+
+SELECT results_ne(
+    $$
+        SELECT metarecord FROM metabib.metarecord_source_map
+        WHERE source = (
+            SELECT id FROM biblio.record_entry WHERE last_xact_id = 'bib-fingerprint-test-1'
+        )
+    $$,
+    $$ 
+        SELECT metarecord FROM metabib.metarecord_source_map
+        WHERE source = (
+            SELECT id FROM biblio.record_entry WHERE last_xact_id = 'bib-fingerprint-test-2'
+        )
+    $$,
+    'LP#1528901: same words in title and author do not stick different bibs in same metarecord'
+);
+
+ROLLBACK;
diff --git a/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.update_fingerprinting.sql b/Open-ILS/src/sql/Pg/upgrade/XXXX.schema.update_fingerprinting.sql
new file mode 100644 (file)
index 0000000..2eb5ac8
--- /dev/null
@@ -0,0 +1,86 @@
+BEGIN;
+
+--- SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version);
+
+CREATE OR REPLACE FUNCTION biblio.extract_fingerprint ( marc text ) RETURNS TEXT AS $func$
+DECLARE
+       idx             config.biblio_fingerprint%ROWTYPE;
+       xfrm            config.xml_transform%ROWTYPE;
+       prev_xfrm       TEXT;
+       transformed_xml TEXT;
+       xml_node        TEXT;
+       xml_node_list   TEXT[];
+       raw_text        TEXT;
+    output_text TEXT := '';
+BEGIN
+
+    IF marc IS NULL OR marc = '' THEN
+        RETURN NULL;
+    END IF;
+
+       -- Loop over the indexing entries
+       FOR idx IN SELECT * FROM config.biblio_fingerprint ORDER BY format, id LOOP
+
+               SELECT INTO xfrm * from config.xml_transform WHERE name = idx.format;
+
+               -- See if we can skip the XSLT ... it's expensive
+               IF prev_xfrm IS NULL OR prev_xfrm <> xfrm.name THEN
+                       -- Can't skip the transform
+                       IF xfrm.xslt <> '---' THEN
+                               transformed_xml := oils_xslt_process(marc,xfrm.xslt);
+                       ELSE
+                               transformed_xml := marc;
+                       END IF;
+
+                       prev_xfrm := xfrm.name;
+               END IF;
+
+               raw_text := COALESCE(
+            naco_normalize(
+                ARRAY_TO_STRING(
+                    oils_xpath(
+                        '//text()',
+                        (oils_xpath(
+                            idx.xpath,
+                            transformed_xml,
+                            ARRAY[ARRAY[xfrm.prefix, xfrm.namespace_uri]] 
+                        ))[1]
+                    ),
+                    ''
+                )
+            ),
+            ''
+        );
+
+        raw_text := REGEXP_REPLACE(raw_text, E'\\[.+?\\]', E'');
+        raw_text := REGEXP_REPLACE(raw_text, E'\\mthe\\M|\\man?d?d\\M', E'', 'g'); -- arg! the pain!
+
+        IF idx.first_word IS TRUE THEN
+            raw_text := REGEXP_REPLACE(raw_text, E'^(\\w+).*?$', E'\\1');
+        END IF;
+
+               output_text := output_text || idx.name || ':' ||
+                                          REGEXP_REPLACE(raw_text, E'\\s+', '', 'g') || ' ';
+
+       END LOOP;
+
+    RETURN BTRIM(output_text);
+
+END;
+$func$ LANGUAGE PLPGSQL;
+
+COMMIT;
+
+\qecho Recalculating bib fingerprints
+ALTER TABLE biblio.record_entry DISABLE TRIGGER USER;
+UPDATE biblio.record_entry SET fingerprint = biblio.extract_fingerprint(marc) WHERE NOT deleted;
+ALTER TABLE biblio.record_entry ENABLE TRIGGER USER;
+
+SELECT metabib.remap_metarecord_for_bib(id, fingerprint)
+FROM biblio.record_entry
+WHERE NOT deleted;
+
+\qecho Remapping metarecords
+SELECT metabib.remap_metarecord_for_bib(id, fingerprint)
+FROM biblio.record_entry
+WHERE NOT deleted;