use warnings;
foreach my $file (@ARGV) {
- clean_empty_datafields($file);
+ process_file($file);
}
-sub clean_empty_datafields {
+sub process_file {
my $file = shift;
# Empty datafields anger MARC::File::XML
open(FH, '<', $file) or die $!;
open(CLEAN, '>', "$file.new");
- my ($trim, $lastline) = (0, '');
+ my ($trim, $lastline, $lineno) = (0, '', 1);
while (<FH>) {
if ($_ =~ m#</datafield># and $lastline =~ m#<datafield#) {
+ print STDERR "Empty datafield at line $lineno of file $file\n";
$trim = 1;
} elsif ($trim) {
$trim = 0;
print CLEAN $lastline;
$trim = 0;
}
+
+ # Given questionable input, yaz-marcdump creates invalid XML like this:
+ # <datafield tag="500" ind1=" " ind2=" ">
+ # <subfield code="a">In subtitle "sports" appears as "</subfield>
+ # <subfield code="p">ort</subfield>
+ # <subfield code=""">.</subfield>
+ # </datafield>
+ #
+ # This will at least enable MARC::File::XML to process it:
+ if ($_ =~ m#<subfield code=""">#o) {
+ print STDERR "Bad subfield code \" at line $lineno of file $file\n";
+ $_ =~ s{<subfield code=""">}{<subfield code="a">}o;
+ } elsif ($_ =~ m#<subfield code="<">#o) {
+ print STDERR "Bad subfield code < at line $lineno of file $file\n";
+ $_ =~ s{<subfield code="<">}{<subfield code="a">}o;
+ }
$lastline = $_;
+ $lineno++;
}
print CLEAN $lastline;