Script for cleaning up URIs to conform to the URIs-as-copies ingest scheme
author dbs <dbs@6d9bc8c9-1ec2-4278-b937-99fde70a366f>
Thu, 16 Apr 2009 18:07:04 +0000 (18:07 +0000)
committer dbs <dbs@6d9bc8c9-1ec2-4278-b937-99fde70a366f>
Thu, 16 Apr 2009 18:07:04 +0000 (18:07 +0000)
git-svn-id: svn://svn.open-ils.org/ILS-Contrib/conifer/trunk@339 6d9bc8c9-1ec2-4278-b937-99fde70a366f

tools/migration-scripts/fixURIs.pl [new file with mode: 0644]

diff --git a/tools/migration-scripts/fixURIs.pl b/tools/migration-scripts/fixURIs.pl
new file mode 100644 (file)
index 0000000..378a84a
--- /dev/null
@@ -0,0 +1,90 @@
+#!/usr/bin/perl -w
+use strict;
+use MARC::File::XML( BinaryEncoding => 'utf8', RecordFormat => 'USMARC' );
+
+# Clean up URIs prior to batch ingest
+#   * If we detect a proxy URL in an 856 $u:
+#     * Trim leading and trailing whitespace from the URL
+#     * Ensure ind1 = 4 (an existing 1 is left alone)
+#     * Ensure ind2 = 0 or 1 (anything else becomes 1)
+#     * Ensure $9 = aou.shortname
+#
+# Usage: fixURIs.pl <input MARCXML file>
+#
+# Every record, edited or not, is written to bibs_edited.xml in the current
+# directory; the number of edited records is reported on STDERR at the end.
+
+# Proxy host pattern => shortname of the org unit the proxy belongs to.
+# Dots are escaped so they match only a literal '.' in the hostname.
+my @proxies = (
+       [ qr/librweb\.laurentian\.ca/,    'OSUL'  ],
+       [ qr/libproxy\.auc\.ca/,          'OSTMA' ],
+       [ qr/normedproxy\.lakeheadu\.ca/, 'OSM'   ],
+);
+
+# Return the org unit shortname for a proxied URL, or undef if the URL does
+# not go through one of the known proxies.
+sub proxy_orgunit {
+       my ($url) = @_;
+       foreach my $proxy (@proxies) {
+               my ($pattern, $shortname) = @$proxy;
+               return $shortname if $url =~ $pattern;
+       }
+       return;
+}
+
+my $infile = shift;
+die "Usage: $0 <input MARCXML file>\n" unless defined $infile;
+
+# in() returns undef when the file cannot be opened; stop here with a clear
+# message rather than failing later on an undefined $input.
+my $input = MARC::File::XML->in( $infile )
+       or die "Unable to open '$infile' for reading\n";
+my $output = MARC::File::XML->out( 'bibs_edited.xml' );
+
+my $touched = 0;
+while (my $marc = $input->next()) {
+       my $edited = 0;
+       foreach my $uri ($marc->field('856')) {
+               # There's no way we should have multiples, but let's iterate anyway.
+               # Risking that update(u => ...) only rewrites the first $u here.
+               foreach my $url ($uri->subfield('u')) {
+                       # Decided per URL so a non-proxy $u never inherits the
+                       # org unit matched by an earlier $u in the same field
+                       my $orgunit = proxy_orgunit($url);
+                       next unless $orgunit;
+
+                       my $clean_url = $url;
+                       $clean_url =~ s/^\s*(.*?)\s*$/$1/;
+                       if ($url ne $clean_url) {
+                               $uri->update(u => $clean_url);
+                               $edited++;
+                       }
+
+                       # Compare as strings: an indicator of '0' is false in
+                       # boolean context but still has to be checked
+                       my $ind1 = $uri->indicator(1);
+                       if (!defined $ind1 or ($ind1 ne '1' and $ind1 ne '4')) {
+                               $uri->update(ind1 => '4');
+                               $edited++;
+                       }
+
+                       my $ind2 = $uri->indicator(2);
+                       if (!defined $ind2 or ($ind2 ne '0' and $ind2 ne '1')) {
+                               $uri->update(ind2 => '1');
+                               $edited++;
+                       }
+
+                       # Risking that we only have one subfield 9 here
+                       my $aou = $uri->subfield('9');
+                       if (!defined $aou or $aou ne $orgunit) {
+                               $uri->update(9 => $orgunit);
+                               $edited++;
+                       }
+               }
+       }
+       $touched++ if $edited;
+       $output->write($marc);
+}
+$output->close();
+print STDERR "$touched record(s) edited\n";