From 085c676f429c8ad221e636c5147e90cf1282da64 Mon Sep 17 00:00:00 2001 From: Bill Erickson Date: Mon, 9 Apr 2018 17:07:51 -0400 Subject: [PATCH] JBAS-2019 Vand preserve spec keeps all fields Adjust the new vandelay.replace_field function so that it copies all "preserved" fields from the source record to the target record, not just those that have matching spot on the target record. Signed-off-by: Bill Erickson --- .../sql/schema/deploy/vand-preserve-all-fields.sql | 230 +++++++++++++++++++++ .../sql/schema/revert/vand-preserve-all-fields.sql | 202 ++++++++++++++++++ KCLS/sql/schema/sqitch.plan | 1 + 3 files changed, 433 insertions(+) create mode 100644 KCLS/sql/schema/deploy/vand-preserve-all-fields.sql create mode 100644 KCLS/sql/schema/revert/vand-preserve-all-fields.sql diff --git a/KCLS/sql/schema/deploy/vand-preserve-all-fields.sql b/KCLS/sql/schema/deploy/vand-preserve-all-fields.sql new file mode 100644 index 0000000000..53b71a27b4 --- /dev/null +++ b/KCLS/sql/schema/deploy/vand-preserve-all-fields.sql @@ -0,0 +1,230 @@ +-- Deploy kcls-evergreen:vand-preserve-all-fields to pg +-- requires: vand-import-edit-date-fix + +BEGIN; + +CREATE OR REPLACE FUNCTION vandelay.replace_field + (target_xml TEXT, source_xml TEXT, field TEXT) RETURNS TEXT AS $_$ + + use strict; + use MARC::Record; + use MARC::Field; + use MARC::File::XML (BinaryEncoding => 'UTF-8'); + use MARC::Charset; + + MARC::Charset->assume_unicode(1); + + my $target_xml = shift; + my $source_xml = shift; + my $field_spec = shift; + + my $target_r = MARC::Record->new_from_xml($target_xml); + my $source_r = MARC::Record->new_from_xml($source_xml); + + return $target_xml unless $target_r && $source_r; + + # Extract the field_spec components into MARC tags, subfields, + # and regex matches. Copied wholesale from vandelay.strip_field() + + my @field_list = split(',', $field_spec); + my %fields; + for my $f (@field_list) { + $f =~ s/^\s*//; $f =~ s/\s*$//; + if ($f =~ /^(.{3})(\w*)(?:\[([^]]*)\])?$/) { + my $field = $1; + $field =~ s/\s+//; + my $sf = $2; + $sf =~ s/\s+//; + my $match = $3; + $match =~ s/^\s*//; $match =~ s/\s*$//; + $fields{$field} = { sf => [ split('', $sf) ] }; + if ($match) { + my ($msf,$mre) = split('~', $match); + if (length($msf) > 0 and length($mre) > 0) { + $msf =~ s/^\s*//; $msf =~ s/\s*$//; + $mre =~ s/^\s*//; $mre =~ s/\s*$//; + $fields{$field}{match} = { sf => $msf, re => qr/$mre/ }; + } + } + } + } + + # Returns a flat list of subfield (code, value, code, value, ...) + # suitable for adding to a MARC::Field. + sub generate_replacement_subfields { + my ($source_field, $target_field, @controlled_subfields) = @_; + + # Performing a wholesale field replacment. + # Use the entire source field as-is. + return map {$_->[0], $_->[1]} $source_field->subfields + unless @controlled_subfields; + + my @new_subfields; + + # Iterate over all target field subfields: + # 1. Keep uncontrolled subfields as is. + # 2. Replace values for controlled subfields when a + # replacement value exists on the source record. + # 3. Delete values for controlled subfields when no + # replacement value exists on the source record. + + for my $target_sf ($target_field->subfields) { + my $subfield = $target_sf->[0]; + my $target_val = $target_sf->[1]; + + if (grep {$_ eq $subfield} @controlled_subfields) { + if (my $source_val = $source_field->subfield($subfield)) { + # We have a replacement value + push(@new_subfields, $subfield, $source_val); + } else { + # no replacement value for controlled subfield, drop it. + } + } else { + # Field is not controlled. Copy it over as-is. + push(@new_subfields, $subfield, $target_val); + } + } + + # Iterate over all subfields in the source field and back-fill + # any values that exist only in the source field. Insert these + # subfields in the same relative position they exist in the + # source field. + + my @seen_subfields; + for my $source_sf ($source_field->subfields) { + my $subfield = $source_sf->[0]; + my $source_val = $source_sf->[1]; + push(@seen_subfields, $subfield); + + # target field already contains this subfield, + # so it would have been addressed above. + next if $target_field->subfield($subfield); + + # Ignore uncontrolled subfields. + next unless grep {$_ eq $subfield} @controlled_subfields; + + # Adding a new subfield. Find its relative position and add + # it to the list under construction. Work backwards from + # the list of already seen subfields to find the best slot. + + my $done = 0; + for my $seen_sf (reverse(@seen_subfields)) { + my $idx = @new_subfields; + for my $new_sf (reverse(@new_subfields)) { + $idx--; + next if $idx % 2 == 1; # sf codes are in the even slots + + if ($new_subfields[$idx] eq $seen_sf) { + splice(@new_subfields, $idx + 2, 0, $subfield, $source_val); + $done = 1; + last; + } + } + last if $done; + } + + # if no slot was found, add to the end of the list. + push(@new_subfields, $subfield, $source_val) unless $done; + } + + return @new_subfields; + } + + # MARC tag loop + for my $f (keys %fields) { + + # Loop over affected tags in the target record + my $tag_idx = -1; + for my $target_field ($target_r->field($f)) { + + # field spec contains a regex for this field. Confirm field on + # target record matches the specified regex before replacing. + if (exists($fields{$f}{match})) { + next unless (grep { $_ =~ $fields{$f}{match}{re} } + $target_field->subfield($fields{$f}{match}{sf})); + } + + my @new_subfields; + my @controlled_subfields = @{$fields{$f}{sf}}; + + # If the target record has multiple matching bib fields, + # replace them from matching fields on the source record + # in a predictable order to avoid replacing with them with + # same source field repeatedly. + my @source_fields = $source_r->field($f); + my $source_field = $source_fields[++$tag_idx]; + + if (!$source_field && @controlled_subfields) { + # When there are more target fields than source fields + # and we are replacing values for subfields and not + # performing wholesale field replacment, use the last + # available source field as the input for all remaining + # target fields. + $source_field = $source_fields[$#source_fields]; + } + + if (!$source_field) { + # No source field exists. Delete all affected target + # data. This is a little bit counterintuitive, but is + # backwards compatible with the previous version of this + # function which first deleted all affected data, then + # replaced values where possible. + if (@controlled_subfields) { + $target_field->delete_subfield($_) for @controlled_subfields; + } else { + $target_r->delete_field($target_field); + } + next; + } + + my @new_subfields = generate_replacement_subfields( + $source_field, $target_field, @controlled_subfields); + + # Build the replacement field from scratch. + my $replacement_field = MARC::Field->new( + $target_field->tag, + $target_field->indicator(1), + $target_field->indicator(2), + @new_subfields + ); + + $target_field->replace_with($replacement_field); + } + + # --- + # Handle cases where there are more fields on the source + # record than the target record. + + # Skip overflow fields when a regex is defined since there + # will be no target fields to compare the regex to. + next if exists($fields{$f}{match}); + + # Loop over affected tags in source record and copy any overlow + # fields wholesale into the target record, regardless of whether + # the replace rule specifies subfields. + # Note insert_fields_ordered() will put the overflow fields + # directly in front of any existing with the same tag. + + my @source_fields = $source_r->field($f); + my @target_fields = $target_r->field($f); + my $sf_count = scalar(@source_fields); + my $tf_count = scalar(@target_fields); + my $cp_index = $tf_count; + while ($cp_index < $sf_count) { + my $s_field = $source_fields[$cp_index]; + $target_r->insert_fields_ordered($s_field->clone); + $cp_index++; + } + } + + $target_xml = $target_r->as_xml_record; + $target_xml =~ s/^<\?.+?\?>$//mo; + $target_xml =~ s/\n//sgo; + $target_xml =~ s/>\s+ 'UTF-8'); + use MARC::Charset; + + MARC::Charset->assume_unicode(1); + + my $target_xml = shift; + my $source_xml = shift; + my $field_spec = shift; + + my $target_r = MARC::Record->new_from_xml($target_xml); + my $source_r = MARC::Record->new_from_xml($source_xml); + + return $target_xml unless $target_r && $source_r; + + # Extract the field_spec components into MARC tags, subfields, + # and regex matches. Copied wholesale from vandelay.strip_field() + + my @field_list = split(',', $field_spec); + my %fields; + for my $f (@field_list) { + $f =~ s/^\s*//; $f =~ s/\s*$//; + if ($f =~ /^(.{3})(\w*)(?:\[([^]]*)\])?$/) { + my $field = $1; + $field =~ s/\s+//; + my $sf = $2; + $sf =~ s/\s+//; + my $match = $3; + $match =~ s/^\s*//; $match =~ s/\s*$//; + $fields{$field} = { sf => [ split('', $sf) ] }; + if ($match) { + my ($msf,$mre) = split('~', $match); + if (length($msf) > 0 and length($mre) > 0) { + $msf =~ s/^\s*//; $msf =~ s/\s*$//; + $mre =~ s/^\s*//; $mre =~ s/\s*$//; + $fields{$field}{match} = { sf => $msf, re => qr/$mre/ }; + } + } + } + } + + # Returns a flat list of subfield (code, value, code, value, ...) + # suitable for adding to a MARC::Field. + sub generate_replacement_subfields { + my ($source_field, $target_field, @controlled_subfields) = @_; + + # Performing a wholesale field replacment. + # Use the entire source field as-is. + return map {$_->[0], $_->[1]} $source_field->subfields + unless @controlled_subfields; + + my @new_subfields; + + # Iterate over all target field subfields: + # 1. Keep uncontrolled subfields as is. + # 2. Replace values for controlled subfields when a + # replacement value exists on the source record. + # 3. Delete values for controlled subfields when no + # replacement value exists on the source record. + + for my $target_sf ($target_field->subfields) { + my $subfield = $target_sf->[0]; + my $target_val = $target_sf->[1]; + + if (grep {$_ eq $subfield} @controlled_subfields) { + if (my $source_val = $source_field->subfield($subfield)) { + # We have a replacement value + push(@new_subfields, $subfield, $source_val); + } else { + # no replacement value for controlled subfield, drop it. + } + } else { + # Field is not controlled. Copy it over as-is. + push(@new_subfields, $subfield, $target_val); + } + } + + # Iterate over all subfields in the source field and back-fill + # any values that exist only in the source field. Insert these + # subfields in the same relative position they exist in the + # source field. + + my @seen_subfields; + for my $source_sf ($source_field->subfields) { + my $subfield = $source_sf->[0]; + my $source_val = $source_sf->[1]; + push(@seen_subfields, $subfield); + + # target field already contains this subfield, + # so it would have been addressed above. + next if $target_field->subfield($subfield); + + # Ignore uncontrolled subfields. + next unless grep {$_ eq $subfield} @controlled_subfields; + + # Adding a new subfield. Find its relative position and add + # it to the list under construction. Work backwards from + # the list of already seen subfields to find the best slot. + + my $done = 0; + for my $seen_sf (reverse(@seen_subfields)) { + my $idx = @new_subfields; + for my $new_sf (reverse(@new_subfields)) { + $idx--; + next if $idx % 2 == 1; # sf codes are in the even slots + + if ($new_subfields[$idx] eq $seen_sf) { + splice(@new_subfields, $idx + 2, 0, $subfield, $source_val); + $done = 1; + last; + } + } + last if $done; + } + + # if no slot was found, add to the end of the list. + push(@new_subfields, $subfield, $source_val) unless $done; + } + + return @new_subfields; + } + + # MARC tag loop + for my $f (keys %fields) { + my $tag_idx = -1; + for my $target_field ($target_r->field($f)) { + + # field spec contains a regex for this field. Confirm field on + # target record matches the specified regex before replacing. + if (exists($fields{$f}{match})) { + next unless (grep { $_ =~ $fields{$f}{match}{re} } + $target_field->subfield($fields{$f}{match}{sf})); + } + + my @new_subfields; + my @controlled_subfields = @{$fields{$f}{sf}}; + + # If the target record has multiple matching bib fields, + # replace them from matching fields on the source record + # in a predictable order to avoid replacing with them with + # same source field repeatedly. + my @source_fields = $source_r->field($f); + my $source_field = $source_fields[++$tag_idx]; + + if (!$source_field && @controlled_subfields) { + # When there are more target fields than source fields + # and we are replacing values for subfields and not + # performing wholesale field replacment, use the last + # available source field as the input for all remaining + # target fields. + $source_field = $source_fields[$#source_fields]; + } + + if (!$source_field) { + # No source field exists. Delete all affected target + # data. This is a little bit counterintuitive, but is + # backwards compatible with the previous version of this + # function which first deleted all affected data, then + # replaced values where possible. + if (@controlled_subfields) { + $target_field->delete_subfield($_) for @controlled_subfields; + } else { + $target_r->delete_field($target_field); + } + next; + } + + my @new_subfields = generate_replacement_subfields( + $source_field, $target_field, @controlled_subfields); + + # Build the replacement field from scratch. + my $replacement_field = MARC::Field->new( + $target_field->tag, + $target_field->indicator(1), + $target_field->indicator(2), + @new_subfields + ); + + $target_field->replace_with($replacement_field); + } + } + + $target_xml = $target_r->as_xml_record; + $target_xml =~ s/^<\?.+?\?>$//mo; + $target_xml =~ s/\n//sgo; + $target_xml =~ s/>\s+ # materialized hold queue posistions search-strip-apos-recover [hold-queue-pos-materialized] 2018-03-30T16:15:21Z Bill Erickson,,, # Recover lost apostrophe stripping in search vand-import-edit-date-fix [hold-queue-pos-materialized] 2018-03-30T14:50:53Z Bill Erickson,,, # Recover lost vandelay edit date changes +vand-preserve-all-fields [vand-import-edit-date-fix] 2018-04-09T20:15:27Z Bill Erickson,,, # Bug in vandelay.replace_fields() -- 2.11.0