From: dbs Date: Thu, 16 Apr 2009 18:33:42 +0000 (+0000) Subject: Clean up: X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=46d4b7874dde51c73775ed04f84f4bd6b0976b31;p=contrib%2FConifer.git Clean up: * allow command line specification of input/output files * provide more granular feedback on breakdown of what got touched git-svn-id: svn://svn.open-ils.org/ILS-Contrib/conifer/trunk@340 6d9bc8c9-1ec2-4278-b937-99fde70a366f --- diff --git a/tools/migration-scripts/fixURIs.pl b/tools/migration-scripts/fixURIs.pl index 378a84a266..70c8897513 100644 --- a/tools/migration-scripts/fixURIs.pl +++ b/tools/migration-scripts/fixURIs.pl @@ -1,68 +1,99 @@ #!/usr/bin/perl -w use strict; +use Getopt::Long; use MARC::File::XML( BinaryEncoding => 'utf8', RecordFormat => 'USMARC' ); -# Clean up URIs prior to batch ingest +# Clean up URIs from MARCXML records prior to batch ingest # * If we detect a proxy URL: # * Ensure ind1 = 4 # * Ensure ind2 = 2 # * Ensure $9 = aou.shortname # * Trim whitespace and other tweaks while we're at it? -my $input = MARC::File::XML->in( shift ); -my $output = MARC::File::XML->out( 'bibs_edited.xml' ); +my ($input_file, $output_file); +GetOptions( + 'input=s' => \$input_file, + 'output=s' => \$output_file +); -my $touched = 0; +if (!$input_file or !$output_file) { + print("Please specify the following options:\n"); + print("\t--input : input file of MARCXML records\n"); + print("\t--output : output file of processed MARCXML records\n"); + exit(); +} + +my $input = MARC::File::XML->in( $input_file ); +my $output = MARC::File::XML->out( $output_file ); + +my ($touched, $url_cnt, $ind1_cnt, $ind2_cnt, $sub9_cnt) = (0, 0, 0, 0, 0); while (my $marc = $input->next()) { - my $edited = 0; - my @uri_fields = $marc->field('856'); - foreach my $uri (@uri_fields) { - my ($orgunit); + my $edited = 0; + my @uri_fields = $marc->field('856'); + foreach my $uri (@uri_fields) { + my ($orgunit); - # There's no way we should have multiples, but let's iterate anyway - my @urls = $uri->subfield('u'); + # There's no way we should have multiples, but let's iterate anyway + my @urls = $uri->subfield('u'); - foreach my $url (@urls) { - if ($url =~ m/librweb.laurentian.ca/o) { - $orgunit = 'OSUL'; - } elsif ($url =~ m/libproxy.auc.ca/o) { - $orgunit = 'OSTMA'; - } elsif ($url =~ m/normedproxy.lakeheadu.ca/o) { - $orgunit = 'OSM'; - } + foreach my $url (@urls) { + # For general use we should factor these out to a hash. Oh well. - if ($orgunit) { - my $clean_url = $url; - $clean_url =~ s/^\s*(.*?)\s*$/$1/o; - if ($url ne $clean_url) { - $uri->update(u => $clean_url); - $edited++; - } + # We're filtering by proxy address, because theoretically anything + # that is not proxied is open to the world to access and doesn't + # need to be treated as a URI particular to that org_unit + if ($url =~ m/librweb.laurentian.ca/o) { + $orgunit = 'OSUL'; + } elsif ($url =~ m/libproxy.auc.ca/o) { + $orgunit = 'OSTMA'; + } elsif ($url =~ m/normedproxy.lakeheadu.ca/o) { + $orgunit = 'OSM'; + } - my $ind1 = $uri->indicator(1); - if ($ind1 and $ind1 ne '1' and $ind1 ne '4') { - $uri->update(ind1 => '4'); - $edited++; - } + if ($orgunit) { + my $clean_url = $url; + $clean_url =~ s/^\s*(.*?)\s*$/$1/o; + if ($url ne $clean_url) { + $uri->update(u => $clean_url); + $edited++; + $url_cnt++; + } - my $ind2 = $uri->indicator(2); - if ($ind2 and $ind2 ne '0' and $ind2 ne '1') { - $uri->update(ind2 => '1'); - $edited++; - } + my $ind1 = $uri->indicator(1); + if ($ind1 and $ind1 ne '1' and $ind1 ne '4') { + $uri->update(ind1 => '4'); + $edited++; + $ind1_cnt++; + } - # Risking that we only have one subfield 9 here - my $aou = $uri->subfield('9'); - if (!$aou or $aou ne $orgunit) { - $uri->update(9 => $orgunit); - $edited++; - } - } - } - } - if ($edited) { - $touched++; - } - $output->write($marc); + my $ind2 = $uri->indicator(2); + if ($ind2 and $ind2 ne '0' and $ind2 ne '1') { + $uri->update(ind2 => '1'); + $edited++; + $ind2_cnt++; + } + + # Risking that we only have one subfield 9 here + # Should be a slight risk as it's not defined in the spec + my $aou = $uri->subfield('9'); + if (!$aou or $aou ne $orgunit) { + $uri->update(9 => $orgunit); + $edited++; + $sub9_cnt++; + } + } + } + } + if ($edited) { + $touched++; + } + $output->write($marc); } $output->close(); +print "Touched $touched records to fix URIs.\n"; +print "\t$url_cnt URLs were touched\n"; +print "\t$ind1_cnt indicator 1 values were touched\n"; +print "\t$ind2_cnt indicator 2 values were touched\n"; +print "\t$sub9_cnt subfield '9' values were touched\n"; + +# vim: et:ts=4:sw=4: