From 64453a3c8cc76c8de1c541488228355177b8f548 Mon Sep 17 00:00:00 2001 From: dbs Date: Fri, 3 Jul 2009 03:42:38 +0000 Subject: [PATCH] Migration guy - clean up thine own mess git-svn-id: svn://svn.open-ils.org/ILS-Contrib/conifer/trunk@561 6d9bc8c9-1ec2-4278-b937-99fde70a366f --- tools/migration-scripts/fix_windsors_diacritics.pl | 166 +++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 tools/migration-scripts/fix_windsors_diacritics.pl diff --git a/tools/migration-scripts/fix_windsors_diacritics.pl b/tools/migration-scripts/fix_windsors_diacritics.pl new file mode 100644 index 0000000000..c78120267c --- /dev/null +++ b/tools/migration-scripts/fix_windsors_diacritics.pl @@ -0,0 +1,166 @@ +#!/usr/bin/perl +use strict; +use warnings; + +# Let's say you accidentally stripped the diacritics from, oh, 189,000 records during a migration. +# If you still have the original records, you might want to use a script like this to load +# them back into the database. + +require 'oils_header.pl'; +use Error qw/:try/; +use Digest::MD5 qw/md5_hex/; +use OpenSRF::Utils::JSON; +use OpenILS::Application::AppUtils; +use Unicode::Normalize; +use Encode; + +use FileHandle; +use Time::HiRes qw/time/; +use Getopt::Long; +use MARC::Batch; +use MARC::File::XML ( BinaryEncoding => 'utf-8' ); +use MARC::Charset; +use DBI; + +my ($marcfile, $marctype, $enc, $config, $username, $password) = ('/openils/migration/windsor/bib20090430.mrc', 'USMARC', 'UTF8', '/openils/conf/opensrf_core.xml'); + +GetOptions( + 'encoding=s' => \$enc, # set assumed MARC encoding for MARC::Charset + 'config=s' => \$config, # location of OpenSRF core config file, defaults to /openils/conf/opensrf_core.xml + "username=s" => \$username, # EG username + "password=s" => \$password, # EG password +); + +if ($enc) { + MARC::Charset->ignore_errors(1); + MARC::Charset->assume_encoding($enc); +} + +OpenSRF::System->bootstrap_client( config_file => $config ); + +# Login to Evergreen and get an authentication token +my $auth = oils_login($username, $password); +if (!$auth) { + die "Could not retrieve an authentication token"; +} + +select STDERR; $| = 1; +select STDOUT; $| = 1; +binmode STDOUT, ":utf8"; + +my $batch = new MARC::Batch ( $marctype, $marcfile ); +$batch->strict_off(); +$batch->warnings_off(); + +my $starttime = time; +my $rec; +my $count = 0; +my $recs = 0; +PROCESS: while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) { + next if ($rec == -1); + + $count++; + + if ($rec->as_formatted =~ m/[^\x00-\x7f]/) { + $rec_count++; + print "$rec_count of $count\n"; + update_id_field(\$rec); + fix_URIs(\$rec); + update_marc(\$rec); + + # Exit nice and early so that we don't wander off and update a whole batch without testing + if ($rec_count > 0) { + exit; + } + } + +} + +# Set the 001 and 901 to our record ID in Conifer +# Windsor records are offset by 1 million from their legacy ID +sub update_id_field { + my $rec = shift; + + my $tcn = $$rec->field('001'); + my $rec_id = $tcn->data + 1000000; + $tcn->update($rec_id); + my $id_field = MARC::Field->new('901', '', '', 'a' => $rec_id, 'b' => 'Unknown', 'c' => $rec_id); + $$rec->append_fields($id_field); +} + +sub fix_URIs { + my $marc = shift; + + my @uri_fields = $$marc->field('856'); + foreach my $uri (@uri_fields) { + my ($orgunit); + + # There's no way we should have multiples, but let's iterate anyway + my @urls = $uri->subfield('u'); + + foreach my $url (@urls) { + # For general use we should factor these out to a hash. Oh well. + + # We're filtering by proxy address, because theoretically anything + # that is not proxied is open to the world to access and doesn't + # need to be treated as a URI particular to that org_unit + if ($url =~ m/librweb.laurentian.ca/o) { + $orgunit = 'OSUL'; + } elsif ($url =~ m/libproxy.auc.ca/o) { + $orgunit = 'OSTMA'; + } elsif ($url =~ m/normedproxy.lakeheadu.ca/o) { + $orgunit = 'OSM'; + } elsif ($url =~ m/ezproxy.uwindsor.ca/o or $url =~ m/webvoy.uwindsor.ca/o ) { + $orgunit = 'OWA'; + } + + if ($orgunit) { + my $clean_url = $url; + $clean_url =~ s/^\s*(.*?)\s*$/$1/o; + if ($url ne $clean_url) { + $uri->update(u => $clean_url); + } + + my $ind1 = $uri->indicator(1); + if ($ind1 and $ind1 ne '1' and $ind1 ne '4') { + $uri->update(ind1 => '4'); + } + + my $ind2 = $uri->indicator(2); + if ($ind2 and $ind2 ne '0' and $ind2 ne '1') { + $uri->update(ind2 => '1'); + } + + # Risking that we only have one subfield 9 here + # Should be a slight risk as it's not defined in the spec + my $aou = $uri->subfield('9'); + if (!$aou or $aou ne $orgunit) { + $uri->update(9 => $orgunit); + } + } + } + } +} + +sub update_marc { + my $rec = shift; + + # Borrowed from marc2bre.pl to get clean XML + (my $xml = $$rec->as_xml_record()) =~ s/\n//sog; + $xml =~ s/^<\?xml.+\?\s*>//go; + $xml =~ s/>\s+entityize($xml); + $xml =~ s/[\x00-\x1f]//go; + + # Update and ingest this puppy + my $update = OpenILS::Application::AppUtils->simplereq('open-ils.cat', + 'open-ils.cat.biblio.record.xml.update', + ($auth, int($$rec->field('001')->data), $xml) + ); + + # Return the cleaned-up XML in case we want to inspect it + return $xml; +} + + -- 2.11.0