LP1754455 Add Ability to remove fields or subfields in marc_export
authorDan Pearl <dpearl@cwmars.org>
Fri, 16 Mar 2018 17:14:03 +0000 (13:14 -0400)
committerGalen Charlton <gmc@equinoxOLI.org>
Tue, 21 Feb 2023 15:16:22 +0000 (10:16 -0500)
This commit adds a --strip option to marc_export that allows fields
and/or subfields to be removed from MARC records on export.

To test: Try all forms of --strip:
    --strip <tag> e.g. --strip 856
    --strip <tag>/<sub>   e.g.  --strip 856/0
    --strip /<sub>   e.g. --strip /0

Try some regular expressions:
    --strip 8..
    --strip /[ab0]

For all tests, compare the regular output to the stripped output and
the plain (unstripped) output.

Signed-off-by: Dan Pearl <dpearl@cwmars.org>
Signed-off-by: Jason Boyer <JBoyer@equinoxOLI.org>
Signed-off-by: Galen Charlton <gmc@equinoxOLI.org>
Open-ILS/src/support-scripts/marc_export.in
docs/RELEASE_NOTES_NEXT/Administration/marc_export_strip.adoc [new file with mode: 0644]

index 843ddff..78c9c4f 100755 (executable)
@@ -86,6 +86,7 @@ sub new {
                'mfhd',
                'all',
                'replace_001',
+               'strip=s@',
                'location=s',
                'money=s',
                'config=s',
@@ -126,6 +127,8 @@ Usage: $0 [options]
  --type or -t       Record type (BIBLIO, AUTHORITY) [BIBLIO]
  --all or -a        Export all records; ignores input list
  --replace_001      Replace the 001 field value with the record ID
+ --strip tagRE/subRE
+                    Strip fields or subfields matching regular expressions
  --store            Use the given storage backend to connect to the database.
                     Choices are (reporter, cstore, storage) [reporter]
  --since            Export records modified since a certain date and time.
@@ -162,6 +165,14 @@ libraries with the short names "BR1" and "BR2":
 
   $0 --library BR1 --library BR2 --encoding UTF-8 > sys1_bibs.mrc
 
+The --strip option can be used more than once, which implies an "OR" operation.
+If the field argument is omitted (e.g., "/0"), it is treated like "..." (all fields).
+If the subfield argument is omitted (e.g., "100/"), it is treated like "."
+(all subfields).  Examples:
+
+  --strip /0      Remove all 0 subfields
+  --strip 1[23].  Remove fields with tags 120 through 139, inclusive.
+  --strip /       Remove all subfields (probably not useful).
 HELP
         exit;
     }
@@ -193,6 +204,27 @@ HELP
             "Right now that means one of [".
                 join('|',(FORMATS)). "]\n";
     }
+    # Process --strip arguments.  They are in the form tagRE/subRE.  Note
+    # that the RE pieces cannot contain a slash (/), as this would create
+    # ambiguity.  If there is no /subRE, then it's OK, and just the tagRE
+    # is specified.
+    # $opts{strip} is an array-ref for this multi-valued option.
+    my @strip = ();
+
+    foreach my $strip_value (@{$opts{strip}}) {
+        my $trec = {};
+        if ($strip_value =~ /\//) {
+            $trec->{tag} = $`;
+            $trec->{tag} = "..." if ($` eq "");
+            $trec->{subfield} = $';
+            $trec->{subfield} = "." if ($' eq "");
+        } else {
+            # No slash case
+            $trec->{tag} = $strip_value;
+            $trec->{subfield} = '';
+        }
+        push @strip, $trec;
+    }
 
     if ($opts{format} eq 'ARE' && $opts{type} ne 'authority') {
         die "Format ARE is not compatible with type " . $opts{type};
@@ -234,6 +266,8 @@ HELP
     }
     $opts{encoding} = uc($opts{encoding});
 
+    $opts{strip} = \@strip;
+
     $self->{'options'} = \%opts;
     bless $self, $class;
     return $self;
@@ -524,6 +558,43 @@ sub next {
                         $marc->insert_fields_ordered($tcn);
                     }
                 }
+
+                my $strip_arg_ref = $Marque::config->option_value('strip');
+                my @strip = @{$strip_arg_ref};
+                foreach my $strip_ref (@strip) {
+                    my $tagRE = $strip_ref->{tag};
+                    my $subfieldRE = $strip_ref->{subfield};
+
+                    if ( $subfieldRE eq "") {
+                        # Case 1: Field only check, e.g. "--strip 5.[0,1]"
+                        # If the supplied regexp matches the field, then
+                        # delete that field.
+                        foreach my $test_field ($marc->fields()) {
+                            if ($test_field->tag() =~ /$tagRE/) {
+                                # A hit!
+                                $marc->delete_field($test_field);
+                            }
+                        }
+                    } elsif ($subfieldRE ne "" && $tagRE ne "") {
+                        # Case 2: Field & subfield supplied.
+                        # Note a blank tag will have been wildcarded to "...".
+                        # Traverse fields, then traverse subfields if match
+                        # is found.
+                        foreach my $test_field ($marc->fields()) {
+                            if ( !$test_field->is_control_field() &&
+                                 $test_field->tag() =~ /$tagRE/) {
+                                # Traverse all subfields:
+                                foreach my $test_subfield ($test_field->subfields()) {
+                                    my $sfcode = @{$test_subfield}[0];
+                                    if ($sfcode =~ /$subfieldRE/) {
+                                        $test_field->delete_subfield($sfcode);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+
                 if ($Marque::config->option_value('items')) {
                     my @acps = $self->acps_for_bre($r);
                     foreach my $acp (@acps) {
@@ -913,6 +984,44 @@ sub next {
                     $marc->insert_fields_ordered($tcn);
                 }
             }
+
+            my $strip_arg_ref = $Marque::config->option_value('strip');
+            my @strip = @{$strip_arg_ref};
+
+            foreach my $strip_ref (@strip) {
+                my $tagRE = $strip_ref->{tag};
+                my $subfieldRE = $strip_ref->{subfield};
+
+                if ( $subfieldRE eq "") {
+                    # Case 1: Field only check, e.g. "--strip 5.[0,1]"
+                    # If the supplied regexp matches the field, then
+                    # delete that field.
+                    foreach my $test_field ($marc->fields()) {
+                        if ($test_field->tag() =~ /$tagRE/) {
+                            # A hit!
+                            $marc->delete_field($test_field);
+                        }
+                    }
+                } elsif ($subfieldRE ne "" && $tagRE ne "") {
+                    # Case 2: Field & subfield supplied.
+                    # Note a blank tag will have been wildcarded to "...".
+                    # Traverse fields, then traverse subfields if match
+                    # is found.
+                    foreach my $test_field ($marc->fields()) {
+                        if ( !$test_field->is_control_field() &&
+                             $test_field->tag() =~ /$tagRE/) {
+                            # Traverse all subfields:
+                            foreach my $test_subfield ($test_field->subfields()) {
+                                my $sfcode = @{$test_subfield}[0];
+                                if ($sfcode =~ /$subfieldRE/) {
+                                    $test_field->delete_subfield($sfcode);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
             if ($Marque::config->option_value('since')) {
                 my $leader = $marc->leader();
                 if ($U->is_true($r->deleted())) {
diff --git a/docs/RELEASE_NOTES_NEXT/Administration/marc_export_strip.adoc b/docs/RELEASE_NOTES_NEXT/Administration/marc_export_strip.adoc
new file mode 100644 (file)
index 0000000..4fd1153
--- /dev/null
@@ -0,0 +1,34 @@
+--strip option for marc_export
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The --strip option is used to suppress generation of specified elements
+from the marc_export output. The option, which can be specified more than
+once, is in one of these forms:
+
+[source]
+--------
+        --strip <field RE>/<subfield RE>
+        --strip /<subfield RE>
+        --strip <field RE>
+
+Some examples:
+
+        --strip 856/0   [Delete subfield 0's in fields with tag 856.]
+
+Regular expressions are accepted:
+
+        --strip 8../0   [Delete subfield 0's in fields with tag 800-899.]
+
+If the field is omitted, it is as if you specified "..." for the field RE.
+
+    --strip /0      [Delete subfield 0's in all fields.]
+    --strip /[abc]  [Delete subfield a, b or c in all fields.]
+
+If the slash and subfield are omitted, it means to delete the given fields.
+
+    --strip 856    [Delete fields with tag 856]
+
+If the slash is present, but the subfield is omitted, it means "all subfields".
+
+    --strip 856/    [Delete all subfields in fields with tag 856]
+--------