#!/usr/bin/perl
use strict;
use warnings;
use MARC::File::XML( BinaryEncoding => 'utf8', RecordFormat => 'USMARC' );

# Script for cleaning up URIs to conform to the URIs-as-copies ingest scheme.
#
# Provenance: tools/migration-scripts/fixURIs.pl in the Conifer contrib
# repository, commit 658ba17df116668fe804ce2386ada07a519c155e by dbs,
# Thu, 16 Apr 2009 18:07:04 +0000
# (git-svn-id: svn://svn.open-ils.org/ILS-Contrib/conifer/trunk@339
# 6d9bc8c9-1ec2-4278-b937-99fde70a366f).
#
# Usage: fixURIs.pl <marcxml-input-file>
#
# Reads MARCXML records from the file named by the first argument and writes
# every record (edited or not) to 'bibs_edited.xml' in the current directory.
#
# Clean up URIs prior to batch ingest
#   * If we detect a proxy URL in an 856 $u:
#     * Ensure ind1 is 1 or 4 (anything else is forced to 4)
#     * Ensure ind2 is 0 or 1 (anything else is forced to 1)
#     * Ensure $9 = aou.shortname
#     * Trim leading/trailing whitespace from the $u values

# File the cleaned-up records are written to.
my $OUTPUT_PATH = 'bibs_edited.xml';

# Proxy host pattern => aou.shortname to store in $9.  Checked in order; the
# dots are escaped so that only the literal host names match.
my @PROXY_ORG_UNITS = (
    [ qr/librweb\.laurentian\.ca/,    'OSUL'  ],
    [ qr/libproxy\.auc\.ca/,          'OSTMA' ],
    [ qr/normedproxy\.lakeheadu\.ca/, 'OSM'   ],
);

my $input_path = shift @ARGV;
die "Usage: $0 <marcxml-input-file>\n" if !defined $input_path;

# Fail here with a readable message rather than later with a
# "can't call method on undefined value" error.
my $input = MARC::File::XML->in($input_path)
    or die "Unable to open MARC XML input '$input_path'\n";
my $output = MARC::File::XML->out($OUTPUT_PATH)
    or die "Unable to open MARC XML output '$OUTPUT_PATH'\n";

my $touched = 0;    # number of records that needed at least one edit
while (my $marc = $input->next()) {
    my $edited = 0;
    for my $uri ($marc->field('856')) {
        $edited += fix_uri_field($uri);
    }
    $touched++ if $edited;

    # Every record is written out, whether or not it was changed.
    $output->write($marc);
}
$output->close();

warn "$touched record(s) modified; output written to $OUTPUT_PATH\n";

# Return the aou.shortname for the first proxy host pattern that matches
# $url, or nothing (undef in scalar context) when no pattern matches.
sub org_unit_for_url {
    my ($url) = @_;
    return if !defined $url;

    for my $entry (@PROXY_ORG_UNITS) {
        my ($host_re, $org_unit) = @{$entry};
        return $org_unit if $url =~ $host_re;
    }
    return;
}

# Normalize one 856 field in place and return the number of edits made
# (0 when the field holds no proxy URL and is therefore left untouched).
sub fix_uri_field {
    my ($field) = @_;

    # Work on a copy of the [code, value] pairs so nothing is modified
    # until we know the field needs rewriting.
    my @subfields = map { [ @{$_} ] } $field->subfields();

    # There's no way we should have multiple $u, but let's iterate anyway;
    # when several match, the last matching URL decides the org unit.
    my $org_unit;
    for my $subfield (@subfields) {
        my ($code, $value) = @{$subfield};
        next if $code ne 'u';
        my $match = org_unit_for_url($value);
        $org_unit = $match if defined $match;
    }
    return 0 if !defined $org_unit;

    my $edits = 0;

    # Trim whitespace around every $u value.
    for my $subfield (@subfields) {
        my ($code, $value) = @{$subfield};
        next if $code ne 'u' or !defined $value;
        (my $clean = $value) =~ s/\A\s+|\s+\z//g;
        next if $clean eq $value;
        $subfield->[1] = $clean;
        $edits++;
    }

    # update() only rewrites the FIRST occurrence of a subfield, so a
    # trimmed second $u would clobber the first one.  Rebuild the field
    # from the full subfield list instead.
    if ($edits) {
        my $rebuilt = MARC::Field->new(
            $field->tag(),
            $field->indicator(1),
            $field->indicator(2),
            map { @{$_} } @subfields,
        );
        $field->replace_with($rebuilt);
    }

    # Test definedness, not truthiness: an indicator of '0' is false in
    # Perl and would otherwise never be corrected.
    my $ind1 = $field->indicator(1);
    if (!defined $ind1 or ($ind1 ne '1' and $ind1 ne '4')) {
        $field->update(ind1 => '4');
        $edits++;
    }

    my $ind2 = $field->indicator(2);
    if (!defined $ind2 or ($ind2 ne '0' and $ind2 ne '1')) {
        $field->update(ind2 => '1');
        $edits++;
    }

    # Risking that we only have one subfield 9 here: update() changes the
    # first $9, or appends one when the field has none.
    my $current_org_unit = $field->subfield('9');
    if (!defined $current_org_unit or $current_org_unit ne $org_unit) {
        $field->update('9' => $org_unit);
        $edits++;
    }

    return $edits;
}