JBAS-2019 Vand preserve spec keeps all fields
authorBill Erickson <berickxx@gmail.com>
Mon, 9 Apr 2018 21:07:51 +0000 (17:07 -0400)
committerBill Erickson <berickxx@gmail.com>
Thu, 21 Mar 2019 19:46:23 +0000 (15:46 -0400)
Adjust the new vandelay.replace_field function so that it copies all
"preserved" fields from the source record to the target record, not just
those that have matching spot on the target record.

Signed-off-by: Bill Erickson <berickxx@gmail.com>
KCLS/sql/schema/deploy/vand-preserve-all-fields.sql [new file with mode: 0644]
KCLS/sql/schema/revert/vand-preserve-all-fields.sql [new file with mode: 0644]
KCLS/sql/schema/sqitch.plan

diff --git a/KCLS/sql/schema/deploy/vand-preserve-all-fields.sql b/KCLS/sql/schema/deploy/vand-preserve-all-fields.sql
new file mode 100644 (file)
index 0000000..53b71a2
--- /dev/null
@@ -0,0 +1,230 @@
+-- Deploy kcls-evergreen:vand-preserve-all-fields to pg
+-- requires: vand-import-edit-date-fix
+
+BEGIN;
+
+CREATE OR REPLACE FUNCTION vandelay.replace_field 
+    (target_xml TEXT, source_xml TEXT, field TEXT) RETURNS TEXT AS $_$
+
+    use strict;
+    use MARC::Record;
+    use MARC::Field;
+    use MARC::File::XML (BinaryEncoding => 'UTF-8');
+    use MARC::Charset;
+
+    MARC::Charset->assume_unicode(1);
+
+    my $target_xml = shift;
+    my $source_xml = shift;
+    my $field_spec = shift;
+
+    my $target_r = MARC::Record->new_from_xml($target_xml);
+    my $source_r = MARC::Record->new_from_xml($source_xml);
+
+    return $target_xml unless $target_r && $source_r;
+
+    # Extract the field_spec components into MARC tags, subfields, 
+    # and regex matches.  Copied wholesale from vandelay.strip_field()
+
+    my @field_list = split(',', $field_spec);
+    my %fields;
+    for my $f (@field_list) {
+        $f =~ s/^\s*//; $f =~ s/\s*$//;
+        if ($f =~ /^(.{3})(\w*)(?:\[([^]]*)\])?$/) {
+            my $field = $1;
+            $field =~ s/\s+//;
+            my $sf = $2;
+            $sf =~ s/\s+//;
+            my $match = $3;
+            $match =~ s/^\s*//; $match =~ s/\s*$//;
+            $fields{$field} = { sf => [ split('', $sf) ] };
+            if ($match) {
+                my ($msf,$mre) = split('~', $match);
+                if (length($msf) > 0 and length($mre) > 0) {
+                    $msf =~ s/^\s*//; $msf =~ s/\s*$//;
+                    $mre =~ s/^\s*//; $mre =~ s/\s*$//;
+                    $fields{$field}{match} = { sf => $msf, re => qr/$mre/ };
+                }
+            }
+        }
+    }
+
+    # Returns a flat list of subfield (code, value, code, value, ...)
+    # suitable for adding to a MARC::Field.
+    sub generate_replacement_subfields {
+        my ($source_field, $target_field, @controlled_subfields) = @_;
+
+        # Performing a wholesale field replacment.  
+        # Use the entire source field as-is.
+        return map {$_->[0], $_->[1]} $source_field->subfields
+            unless @controlled_subfields;
+
+        my @new_subfields;
+
+        # Iterate over all target field subfields:
+        # 1. Keep uncontrolled subfields as is.
+        # 2. Replace values for controlled subfields when a
+        #    replacement value exists on the source record.
+        # 3. Delete values for controlled subfields when no 
+        #    replacement value exists on the source record.
+
+        for my $target_sf ($target_field->subfields) {
+            my $subfield = $target_sf->[0];
+            my $target_val = $target_sf->[1];
+
+            if (grep {$_ eq $subfield} @controlled_subfields) {
+                if (my $source_val = $source_field->subfield($subfield)) {
+                    # We have a replacement value
+                    push(@new_subfields, $subfield, $source_val);
+                } else {
+                    # no replacement value for controlled subfield, drop it.
+                }
+            } else {
+                # Field is not controlled.  Copy it over as-is.
+                push(@new_subfields, $subfield, $target_val);
+            }
+        }
+
+        # Iterate over all subfields in the source field and back-fill
+        # any values that exist only in the source field.  Insert these
+        # subfields in the same relative position they exist in the
+        # source field.
+                
+        my @seen_subfields;
+        for my $source_sf ($source_field->subfields) {
+            my $subfield = $source_sf->[0];
+            my $source_val = $source_sf->[1];
+            push(@seen_subfields, $subfield);
+
+            # target field already contains this subfield, 
+            # so it would have been addressed above.
+            next if $target_field->subfield($subfield);
+
+            # Ignore uncontrolled subfields.
+            next unless grep {$_ eq $subfield} @controlled_subfields;
+
+            # Adding a new subfield.  Find its relative position and add
+            # it to the list under construction.  Work backwards from
+            # the list of already seen subfields to find the best slot.
+
+            my $done = 0;
+            for my $seen_sf (reverse(@seen_subfields)) {
+                my $idx = @new_subfields;
+                for my $new_sf (reverse(@new_subfields)) {
+                    $idx--;
+                    next if $idx % 2 == 1; # sf codes are in the even slots
+
+                    if ($new_subfields[$idx] eq $seen_sf) {
+                        splice(@new_subfields, $idx + 2, 0, $subfield, $source_val);
+                        $done = 1;
+                        last;
+                    }
+                }
+                last if $done;
+            }
+
+            # if no slot was found, add to the end of the list.
+            push(@new_subfields, $subfield, $source_val) unless $done;
+        }
+
+        return @new_subfields;
+    }
+
+    # MARC tag loop
+    for my $f (keys %fields) {
+
+        # Loop over affected tags in the target record
+        my $tag_idx = -1;
+        for my $target_field ($target_r->field($f)) {
+
+            # field spec contains a regex for this field.  Confirm field on 
+            # target record matches the specified regex before replacing.
+            if (exists($fields{$f}{match})) {
+                next unless (grep { $_ =~ $fields{$f}{match}{re} } 
+                    $target_field->subfield($fields{$f}{match}{sf}));
+            }
+
+            my @new_subfields;
+            my @controlled_subfields = @{$fields{$f}{sf}};
+
+            # If the target record has multiple matching bib fields,
+            # replace them from matching fields on the source record
+            # in a predictable order to avoid replacing with them with
+            # same source field repeatedly.
+            my @source_fields = $source_r->field($f);
+            my $source_field = $source_fields[++$tag_idx];
+
+            if (!$source_field && @controlled_subfields) {
+                # When there are more target fields than source fields
+                # and we are replacing values for subfields and not
+                # performing wholesale field replacment, use the last
+                # available source field as the input for all remaining
+                # target fields.
+                $source_field = $source_fields[$#source_fields];
+            }
+
+            if (!$source_field) {
+                # No source field exists.  Delete all affected target
+                # data.  This is a little bit counterintuitive, but is
+                # backwards compatible with the previous version of this
+                # function which first deleted all affected data, then
+                # replaced values where possible.
+                if (@controlled_subfields) {
+                    $target_field->delete_subfield($_) for @controlled_subfields;
+                } else {
+                    $target_r->delete_field($target_field);
+                }
+                next;
+            }
+
+            my @new_subfields = generate_replacement_subfields(
+                $source_field, $target_field, @controlled_subfields);
+
+            # Build the replacement field from scratch.  
+            my $replacement_field = MARC::Field->new(
+                $target_field->tag,
+                $target_field->indicator(1),
+                $target_field->indicator(2),
+                @new_subfields
+            );
+
+            $target_field->replace_with($replacement_field);
+        }
+
+        # ---
+        # Handle cases where there are more fields on the source
+        # record than the target record.
+
+        # Skip overflow fields when a regex is defined since there 
+        # will be no target fields to compare the regex to.
+        next if exists($fields{$f}{match});
+
+        # Loop over affected tags in source record and copy any overlow
+        # fields wholesale into the target record, regardless of whether
+        # the replace rule specifies subfields.
+        # Note insert_fields_ordered() will put the overflow fields
+        # directly in front of any existing with the same tag.
+
+        my @source_fields = $source_r->field($f);
+        my @target_fields = $target_r->field($f);
+        my $sf_count = scalar(@source_fields);
+        my $tf_count = scalar(@target_fields);
+        my $cp_index = $tf_count;
+        while ($cp_index < $sf_count) {
+            my $s_field = $source_fields[$cp_index];
+            $target_r->insert_fields_ordered($s_field->clone);
+            $cp_index++;
+        }
+    }
+
+    $target_xml = $target_r->as_xml_record;
+    $target_xml =~ s/^<\?.+?\?>$//mo;
+    $target_xml =~ s/\n//sgo;
+    $target_xml =~ s/>\s+</></sgo;
+
+    return $target_xml;
+
+$_$ LANGUAGE PLPERLU;
+
+
+COMMIT;
diff --git a/KCLS/sql/schema/revert/vand-preserve-all-fields.sql b/KCLS/sql/schema/revert/vand-preserve-all-fields.sql
new file mode 100644 (file)
index 0000000..4ae7a41
--- /dev/null
@@ -0,0 +1,202 @@
+-- Revert kcls-evergreen:vand-preserve-all-fields from pg
+
+BEGIN;
+
+CREATE OR REPLACE FUNCTION vandelay.replace_field 
+    (target_xml TEXT, source_xml TEXT, field TEXT) RETURNS TEXT AS $_$
+
+    use strict;
+    use MARC::Record;
+    use MARC::Field;
+    use MARC::File::XML (BinaryEncoding => 'UTF-8');
+    use MARC::Charset;
+
+    MARC::Charset->assume_unicode(1);
+
+    my $target_xml = shift;
+    my $source_xml = shift;
+    my $field_spec = shift;
+
+    my $target_r = MARC::Record->new_from_xml($target_xml);
+    my $source_r = MARC::Record->new_from_xml($source_xml);
+
+    return $target_xml unless $target_r && $source_r;
+
+    # Extract the field_spec components into MARC tags, subfields, 
+    # and regex matches.  Copied wholesale from vandelay.strip_field()
+
+    my @field_list = split(',', $field_spec);
+    my %fields;
+    for my $f (@field_list) {
+        $f =~ s/^\s*//; $f =~ s/\s*$//;
+        if ($f =~ /^(.{3})(\w*)(?:\[([^]]*)\])?$/) {
+            my $field = $1;
+            $field =~ s/\s+//;
+            my $sf = $2;
+            $sf =~ s/\s+//;
+            my $match = $3;
+            $match =~ s/^\s*//; $match =~ s/\s*$//;
+            $fields{$field} = { sf => [ split('', $sf) ] };
+            if ($match) {
+                my ($msf,$mre) = split('~', $match);
+                if (length($msf) > 0 and length($mre) > 0) {
+                    $msf =~ s/^\s*//; $msf =~ s/\s*$//;
+                    $mre =~ s/^\s*//; $mre =~ s/\s*$//;
+                    $fields{$field}{match} = { sf => $msf, re => qr/$mre/ };
+                }
+            }
+        }
+    }
+
+    # Returns a flat list of subfield (code, value, code, value, ...)
+    # suitable for adding to a MARC::Field.
+    sub generate_replacement_subfields {
+        my ($source_field, $target_field, @controlled_subfields) = @_;
+
+        # Performing a wholesale field replacment.  
+        # Use the entire source field as-is.
+        return map {$_->[0], $_->[1]} $source_field->subfields
+            unless @controlled_subfields;
+
+        my @new_subfields;
+
+        # Iterate over all target field subfields:
+        # 1. Keep uncontrolled subfields as is.
+        # 2. Replace values for controlled subfields when a
+        #    replacement value exists on the source record.
+        # 3. Delete values for controlled subfields when no 
+        #    replacement value exists on the source record.
+
+        for my $target_sf ($target_field->subfields) {
+            my $subfield = $target_sf->[0];
+            my $target_val = $target_sf->[1];
+
+            if (grep {$_ eq $subfield} @controlled_subfields) {
+                if (my $source_val = $source_field->subfield($subfield)) {
+                    # We have a replacement value
+                    push(@new_subfields, $subfield, $source_val);
+                } else {
+                    # no replacement value for controlled subfield, drop it.
+                }
+            } else {
+                # Field is not controlled.  Copy it over as-is.
+                push(@new_subfields, $subfield, $target_val);
+            }
+        }
+
+        # Iterate over all subfields in the source field and back-fill
+        # any values that exist only in the source field.  Insert these
+        # subfields in the same relative position they exist in the
+        # source field.
+                
+        my @seen_subfields;
+        for my $source_sf ($source_field->subfields) {
+            my $subfield = $source_sf->[0];
+            my $source_val = $source_sf->[1];
+            push(@seen_subfields, $subfield);
+
+            # target field already contains this subfield, 
+            # so it would have been addressed above.
+            next if $target_field->subfield($subfield);
+
+            # Ignore uncontrolled subfields.
+            next unless grep {$_ eq $subfield} @controlled_subfields;
+
+            # Adding a new subfield.  Find its relative position and add
+            # it to the list under construction.  Work backwards from
+            # the list of already seen subfields to find the best slot.
+
+            my $done = 0;
+            for my $seen_sf (reverse(@seen_subfields)) {
+                my $idx = @new_subfields;
+                for my $new_sf (reverse(@new_subfields)) {
+                    $idx--;
+                    next if $idx % 2 == 1; # sf codes are in the even slots
+
+                    if ($new_subfields[$idx] eq $seen_sf) {
+                        splice(@new_subfields, $idx + 2, 0, $subfield, $source_val);
+                        $done = 1;
+                        last;
+                    }
+                }
+                last if $done;
+            }
+
+            # if no slot was found, add to the end of the list.
+            push(@new_subfields, $subfield, $source_val) unless $done;
+        }
+
+        return @new_subfields;
+    }
+
+    # MARC tag loop
+    for my $f (keys %fields) {
+        my $tag_idx = -1;
+        for my $target_field ($target_r->field($f)) {
+
+            # field spec contains a regex for this field.  Confirm field on 
+            # target record matches the specified regex before replacing.
+            if (exists($fields{$f}{match})) {
+                next unless (grep { $_ =~ $fields{$f}{match}{re} } 
+                    $target_field->subfield($fields{$f}{match}{sf}));
+            }
+
+            my @new_subfields;
+            my @controlled_subfields = @{$fields{$f}{sf}};
+
+            # If the target record has multiple matching bib fields,
+            # replace them from matching fields on the source record
+            # in a predictable order to avoid replacing with them with
+            # same source field repeatedly.
+            my @source_fields = $source_r->field($f);
+            my $source_field = $source_fields[++$tag_idx];
+
+            if (!$source_field && @controlled_subfields) {
+                # When there are more target fields than source fields
+                # and we are replacing values for subfields and not
+                # performing wholesale field replacment, use the last
+                # available source field as the input for all remaining
+                # target fields.
+                $source_field = $source_fields[$#source_fields];
+            }
+
+            if (!$source_field) {
+                # No source field exists.  Delete all affected target
+                # data.  This is a little bit counterintuitive, but is
+                # backwards compatible with the previous version of this
+                # function which first deleted all affected data, then
+                # replaced values where possible.
+                if (@controlled_subfields) {
+                    $target_field->delete_subfield($_) for @controlled_subfields;
+                } else {
+                    $target_r->delete_field($target_field);
+                }
+                next;
+            }
+
+            my @new_subfields = generate_replacement_subfields(
+                $source_field, $target_field, @controlled_subfields);
+
+            # Build the replacement field from scratch.  
+            my $replacement_field = MARC::Field->new(
+                $target_field->tag,
+                $target_field->indicator(1),
+                $target_field->indicator(2),
+                @new_subfields
+            );
+
+            $target_field->replace_with($replacement_field);
+        }
+    }
+
+    $target_xml = $target_r->as_xml_record;
+    $target_xml =~ s/^<\?.+?\?>$//mo;
+    $target_xml =~ s/\n//sgo;
+    $target_xml =~ s/>\s+</></sgo;
+
+    return $target_xml;
+
+$_$ LANGUAGE PLPERLU;
+
+
+COMMIT;
index cc09f9f..88ef322 100644 (file)
@@ -65,3 +65,4 @@ outreach-si-profile [at-purge-interval-data] 2018-02-06T15:18:44Z Bill Erickson,
 hold-queue-pos-materialized [2.10-to-2.12-upgrade] 2018-03-25T18:50:51Z Bill Erickson,,, <berick@kcls-dev-local> # materialized hold queue posistions
 search-strip-apos-recover [hold-queue-pos-materialized] 2018-03-30T16:15:21Z Bill Erickson,,, <berick@kcls-dev-local> # Recover lost apostrophe stripping in search
 vand-import-edit-date-fix [hold-queue-pos-materialized] 2018-03-30T14:50:53Z Bill Erickson,,, <berick@kcls-dev-local> # Recover lost vandelay edit date changes
+vand-preserve-all-fields [vand-import-edit-date-fix] 2018-04-09T20:15:27Z Bill Erickson,,, <berick@kcls-dev-local> # Bug in vandelay.replace_fields()