$_$ LANGUAGE PLPERLU;
-CREATE OR REPLACE FUNCTION vandelay.replace_field ( target_xml TEXT, source_xml TEXT, field TEXT ) RETURNS TEXT AS $_$
-DECLARE
- xml_output TEXT;
- parsed_target TEXT;
- curr_field TEXT;
-BEGIN
+CREATE OR REPLACE FUNCTION vandelay.replace_field
+ (target_xml TEXT, source_xml TEXT, field TEXT) RETURNS TEXT AS $_$
- parsed_target := vandelay.strip_field( target_xml, ''); -- this dance normalizes the format of the xml for the IF below
- xml_output := parsed_target; -- if there are no replace rules, just return the input
+ use strict;
+ use MARC::Record;
+ use MARC::Field;
+ use MARC::File::XML (BinaryEncoding => 'UTF-8');
+ use MARC::Charset;
- FOR curr_field IN SELECT UNNEST( STRING_TO_ARRAY(field, ',') ) LOOP -- naive split, but it's the same we use in the perl
+ MARC::Charset->assume_unicode(1);
- xml_output := vandelay.strip_field( parsed_target, curr_field);
+ my $target_xml = shift;
+ my $source_xml = shift;
+ my $field_spec = shift;
- IF xml_output <> parsed_target AND curr_field ~ E'~' THEN
- -- we removed something, and there was a regexp restriction in the curr_field definition, so proceed
- xml_output := vandelay.add_field( xml_output, source_xml, curr_field, 1 );
- ELSIF curr_field !~ E'~' THEN
- -- No regexp restriction, add the curr_field
- xml_output := vandelay.add_field( xml_output, source_xml, curr_field, 0 );
- END IF;
+ my $target_r = MARC::Record->new_from_xml($target_xml);
+ my $source_r = MARC::Record->new_from_xml($source_xml);
- parsed_target := xml_output; -- in prep for any following loop iterations
+ return $target_xml unless $target_r && $source_r;
- END LOOP;
+ # Extract the field_spec components into MARC tags, subfields,
+ # and regex matches. Copied wholesale from vandelay.strip_field()
- RETURN xml_output;
-END;
-$_$ LANGUAGE PLPGSQL;
+ my @field_list = split(',', $field_spec);
+ my %fields;
+ for my $f (@field_list) {
+ $f =~ s/^\s*//; $f =~ s/\s*$//;
+ if ($f =~ /^(.{3})(\w*)(?:\[([^]]*)\])?$/) {
+ my $field = $1;
+ $field =~ s/\s+//;
+ my $sf = $2;
+ $sf =~ s/\s+//;
+ my $match = $3;
+ $match =~ s/^\s*//; $match =~ s/\s*$//;
+ $fields{$field} = { sf => [ split('', $sf) ] };
+ if ($match) {
+ my ($msf,$mre) = split('~', $match);
+ if (length($msf) > 0 and length($mre) > 0) {
+ $msf =~ s/^\s*//; $msf =~ s/\s*$//;
+ $mre =~ s/^\s*//; $mre =~ s/\s*$//;
+ $fields{$field}{match} = { sf => $msf, re => qr/$mre/ };
+ }
+ }
+ }
+ }
+
+ # Returns a flat list of subfield (code, value, code, value, ...)
+ # suitable for adding to a MARC::Field.
+ sub generate_replacement_subfields {
+ my ($source_field, $target_field, @controlled_subfields) = @_;
+
+ # Performing a wholesale field replacment.
+ # Use the entire source field as-is.
+ return map {$_->[0], $_->[1]} $source_field->subfields
+ unless @controlled_subfields;
+
+ my @new_subfields;
+
+ # Iterate over all target field subfields:
+ # 1. Keep uncontrolled subfields as is.
+ # 2. Replace values for controlled subfields when a
+ # replacement value exists on the source record.
+ # 3. Delete values for controlled subfields when no
+ # replacement value exists on the source record.
+
+ for my $target_sf ($target_field->subfields) {
+ my $subfield = $target_sf->[0];
+ my $target_val = $target_sf->[1];
+
+ if (grep {$_ eq $subfield} @controlled_subfields) {
+ if (my $source_val = $source_field->subfield($subfield)) {
+ # We have a replacement value
+ push(@new_subfields, $subfield, $source_val);
+ } else {
+ # no replacement value for controlled subfield, drop it.
+ }
+ } else {
+ # Field is not controlled. Copy it over as-is.
+ push(@new_subfields, $subfield, $target_val);
+ }
+ }
+
+ # Iterate over all subfields in the source field and back-fill
+ # any values that exist only in the source field. Insert these
+ # subfields in the same relative position they exist in the
+ # source field.
+
+ my @seen_subfields;
+ for my $source_sf ($source_field->subfields) {
+ my $subfield = $source_sf->[0];
+ my $source_val = $source_sf->[1];
+ push(@seen_subfields, $subfield);
+
+ # target field already contains this subfield,
+ # so it would have been addressed above.
+ next if $target_field->subfield($subfield);
+
+ # Ignore uncontrolled subfields.
+ next unless grep {$_ eq $subfield} @controlled_subfields;
+
+ # Adding a new subfield. Find its relative position and add
+ # it to the list under construction. Work backwards from
+ # the list of already seen subfields to find the best slot.
+
+ my $done = 0;
+ for my $seen_sf (reverse(@seen_subfields)) {
+ my $idx = @new_subfields;
+ for my $new_sf (reverse(@new_subfields)) {
+ $idx--;
+ next if $idx % 2 == 1; # sf codes are in the even slots
+
+ if ($new_subfields[$idx] eq $seen_sf) {
+ splice(@new_subfields, $idx + 2, 0, $subfield, $source_val);
+ $done = 1;
+ last;
+ }
+ }
+ last if $done;
+ }
+
+ # if no slot was found, add to the end of the list.
+ push(@new_subfields, $subfield, $source_val) unless $done;
+ }
+
+ return @new_subfields;
+ }
+
+ # MARC tag loop
+ for my $f (keys %fields) {
+ my $tag_idx = -1;
+ for my $target_field ($target_r->field($f)) {
+
+ # field spec contains a regex for this field. Confirm field on
+ # target record matches the specified regex before replacing.
+ if (exists($fields{$f}{match})) {
+ next unless (grep { $_ =~ $fields{$f}{match}{re} }
+ $target_field->subfield($fields{$f}{match}{sf}));
+ }
+
+ my @new_subfields;
+ my @controlled_subfields = @{$fields{$f}{sf}};
+
+ # If the target record has multiple matching bib fields,
+ # replace them from matching fields on the source record
+ # in a predictable order to avoid replacing with them with
+ # same source field repeatedly.
+ my @source_fields = $source_r->field($f);
+ my $source_field = $source_fields[++$tag_idx];
+
+ if (!$source_field && @controlled_subfields) {
+ # When there are more target fields than source fields
+ # and we are replacing values for subfields and not
+ # performing wholesale field replacment, use the last
+ # available source field as the input for all remaining
+ # target fields.
+ $source_field = $source_fields[$#source_fields];
+ }
+
+ if (!$source_field) {
+ # No source field exists. Delete all affected target
+ # data. This is a little bit counterintuitive, but is
+ # backwards compatible with the previous version of this
+ # function which first deleted all affected data, then
+ # replaced values where possible.
+ if (@controlled_subfields) {
+ $target_field->delete_subfield($_) for @controlled_subfields;
+ } else {
+ $target_r->delete_field($target_field);
+ }
+ next;
+ }
+
+ my @new_subfields = generate_replacement_subfields(
+ $source_field, $target_field, @controlled_subfields);
+
+ # Build the replacement field from scratch.
+ my $replacement_field = MARC::Field->new(
+ $target_field->tag,
+ $target_field->indicator(1),
+ $target_field->indicator(2),
+ @new_subfields
+ );
+
+ $target_field->replace_with($replacement_field);
+ }
+ }
+
+ $target_xml = $target_r->as_xml_record;
+ $target_xml =~ s/^<\?.+?\?>$//mo;
+ $target_xml =~ s/\n//sgo;
+ $target_xml =~ s/>\s+</></sgo;
+
+ return $target_xml;
+
+$_$ LANGUAGE PLPERLU;
CREATE OR REPLACE FUNCTION vandelay.merge_record_xml ( target_xml TEXT, source_xml TEXT, add_rule TEXT, replace_preserve_rule TEXT, strip_rule TEXT ) RETURNS TEXT AS $_$
SELECT vandelay.replace_field( vandelay.add_field( vandelay.strip_field( $1, $5) , $2, $3 ), $2, $4);
--- /dev/null
+BEGIN;
+
+-- SELECT evergreen.upgrade_deps_block_check('XXXX', :eg_version);
+
+CREATE OR REPLACE FUNCTION vandelay.replace_field
+ (target_xml TEXT, source_xml TEXT, field TEXT) RETURNS TEXT AS $_$
+
+ use strict;
+ use MARC::Record;
+ use MARC::Field;
+ use MARC::File::XML (BinaryEncoding => 'UTF-8');
+ use MARC::Charset;
+
+ MARC::Charset->assume_unicode(1);
+
+ my $target_xml = shift;
+ my $source_xml = shift;
+ my $field_spec = shift;
+
+ my $target_r = MARC::Record->new_from_xml($target_xml);
+ my $source_r = MARC::Record->new_from_xml($source_xml);
+
+ return $target_xml unless $target_r && $source_r;
+
+ # Extract the field_spec components into MARC tags, subfields,
+ # and regex matches. Copied wholesale from vandelay.strip_field()
+
+ my @field_list = split(',', $field_spec);
+ my %fields;
+ for my $f (@field_list) {
+ $f =~ s/^\s*//; $f =~ s/\s*$//;
+ if ($f =~ /^(.{3})(\w*)(?:\[([^]]*)\])?$/) {
+ my $field = $1;
+ $field =~ s/\s+//;
+ my $sf = $2;
+ $sf =~ s/\s+//;
+ my $match = $3;
+ $match =~ s/^\s*//; $match =~ s/\s*$//;
+ $fields{$field} = { sf => [ split('', $sf) ] };
+ if ($match) {
+ my ($msf,$mre) = split('~', $match);
+ if (length($msf) > 0 and length($mre) > 0) {
+ $msf =~ s/^\s*//; $msf =~ s/\s*$//;
+ $mre =~ s/^\s*//; $mre =~ s/\s*$//;
+ $fields{$field}{match} = { sf => $msf, re => qr/$mre/ };
+ }
+ }
+ }
+ }
+
+ # Returns a flat list of subfield (code, value, code, value, ...)
+ # suitable for adding to a MARC::Field.
+ sub generate_replacement_subfields {
+ my ($source_field, $target_field, @controlled_subfields) = @_;
+
+ # Performing a wholesale field replacment.
+ # Use the entire source field as-is.
+ return map {$_->[0], $_->[1]} $source_field->subfields
+ unless @controlled_subfields;
+
+ my @new_subfields;
+
+ # Iterate over all target field subfields:
+ # 1. Keep uncontrolled subfields as is.
+ # 2. Replace values for controlled subfields when a
+ # replacement value exists on the source record.
+ # 3. Delete values for controlled subfields when no
+ # replacement value exists on the source record.
+
+ for my $target_sf ($target_field->subfields) {
+ my $subfield = $target_sf->[0];
+ my $target_val = $target_sf->[1];
+
+ if (grep {$_ eq $subfield} @controlled_subfields) {
+ if (my $source_val = $source_field->subfield($subfield)) {
+ # We have a replacement value
+ push(@new_subfields, $subfield, $source_val);
+ } else {
+ # no replacement value for controlled subfield, drop it.
+ }
+ } else {
+ # Field is not controlled. Copy it over as-is.
+ push(@new_subfields, $subfield, $target_val);
+ }
+ }
+
+ # Iterate over all subfields in the source field and back-fill
+ # any values that exist only in the source field. Insert these
+ # subfields in the same relative position they exist in the
+ # source field.
+
+ my @seen_subfields;
+ for my $source_sf ($source_field->subfields) {
+ my $subfield = $source_sf->[0];
+ my $source_val = $source_sf->[1];
+ push(@seen_subfields, $subfield);
+
+ # target field already contains this subfield,
+ # so it would have been addressed above.
+ next if $target_field->subfield($subfield);
+
+ # Ignore uncontrolled subfields.
+ next unless grep {$_ eq $subfield} @controlled_subfields;
+
+ # Adding a new subfield. Find its relative position and add
+ # it to the list under construction. Work backwards from
+ # the list of already seen subfields to find the best slot.
+
+ my $done = 0;
+ for my $seen_sf (reverse(@seen_subfields)) {
+ my $idx = @new_subfields;
+ for my $new_sf (reverse(@new_subfields)) {
+ $idx--;
+ next if $idx % 2 == 1; # sf codes are in the even slots
+
+ if ($new_subfields[$idx] eq $seen_sf) {
+ splice(@new_subfields, $idx + 2, 0, $subfield, $source_val);
+ $done = 1;
+ last;
+ }
+ }
+ last if $done;
+ }
+
+ # if no slot was found, add to the end of the list.
+ push(@new_subfields, $subfield, $source_val) unless $done;
+ }
+
+ return @new_subfields;
+ }
+
+ # MARC tag loop
+ for my $f (keys %fields) {
+ my $tag_idx = -1;
+ for my $target_field ($target_r->field($f)) {
+
+ # field spec contains a regex for this field. Confirm field on
+ # target record matches the specified regex before replacing.
+ if (exists($fields{$f}{match})) {
+ next unless (grep { $_ =~ $fields{$f}{match}{re} }
+ $target_field->subfield($fields{$f}{match}{sf}));
+ }
+
+ my @new_subfields;
+ my @controlled_subfields = @{$fields{$f}{sf}};
+
+ # If the target record has multiple matching bib fields,
+ # replace them from matching fields on the source record
+ # in a predictable order to avoid replacing with them with
+ # same source field repeatedly.
+ my @source_fields = $source_r->field($f);
+ my $source_field = $source_fields[++$tag_idx];
+
+ if (!$source_field && @controlled_subfields) {
+ # When there are more target fields than source fields
+ # and we are replacing values for subfields and not
+ # performing wholesale field replacment, use the last
+ # available source field as the input for all remaining
+ # target fields.
+ $source_field = $source_fields[$#source_fields];
+ }
+
+ if (!$source_field) {
+ # No source field exists. Delete all affected target
+ # data. This is a little bit counterintuitive, but is
+ # backwards compatible with the previous version of this
+ # function which first deleted all affected data, then
+ # replaced values where possible.
+ if (@controlled_subfields) {
+ $target_field->delete_subfield($_) for @controlled_subfields;
+ } else {
+ $target_r->delete_field($target_field);
+ }
+ next;
+ }
+
+ my @new_subfields = generate_replacement_subfields(
+ $source_field, $target_field, @controlled_subfields);
+
+ # Build the replacement field from scratch.
+ my $replacement_field = MARC::Field->new(
+ $target_field->tag,
+ $target_field->indicator(1),
+ $target_field->indicator(2),
+ @new_subfields
+ );
+
+ $target_field->replace_with($replacement_field);
+ }
+ }
+
+ $target_xml = $target_r->as_xml_record;
+ $target_xml =~ s/^<\?.+?\?>$//mo;
+ $target_xml =~ s/\n//sgo;
+ $target_xml =~ s/>\s+</></sgo;
+
+ return $target_xml;
+
+$_$ LANGUAGE PLPERLU;
+
+COMMIT;
+