From fd2a5165bada460b7d4745e62ffb953f9714f131 Mon Sep 17 00:00:00 2001 From: Dan Scott Date: Mon, 29 Dec 2014 08:53:25 -0500 Subject: [PATCH] Routine for fixing 44,000 not-good EEBO MARC records Signed-off-by: Dan Scott --- Open-ILS/src/sql/Pg/fix_eebo_uris.sql | 141 ++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 Open-ILS/src/sql/Pg/fix_eebo_uris.sql diff --git a/Open-ILS/src/sql/Pg/fix_eebo_uris.sql b/Open-ILS/src/sql/Pg/fix_eebo_uris.sql new file mode 100644 index 0000000000..509a4cec91 --- /dev/null +++ b/Open-ILS/src/sql/Pg/fix_eebo_uris.sql @@ -0,0 +1,141 @@ +CREATE OR REPLACE FUNCTION conifer.fix_eebo_uris(record BIGINT) RETURNS TEXT AS $func$ +use strict; +use MARC::Record; +use MARC::File::XML (BinaryEncoding => 'UTF-8'); +use MARC::Charset; +use Encode; +use Unicode::Normalize; +use LWP::UserAgent; +use JSON::XS; + +MARC::Charset->assume_unicode(1); + +my $q = spi_prepare('SELECT marc FROM biblio.record_entry WHERE id = $1', 'BIGINT'); +my $marc = spi_exec_prepared($q, $_[0])->{rows}->[0]->{marc}; + +my $record = MARC::Record->new_from_xml($marc); + +my @eights = $record->field('856'); +foreach my $ocho (@eights) { + my @ous = $ocho->subfield('9'); + foreach my $ou (@ous) { + if ($ou eq 'WINDSYS' or $ou eq 'OWA') { + $record->delete_field($ocho); + } + if ($ou eq 'OSUL') { + $ocho->update('9' => 'LUSYS'); + } + } + + # Fix URIs that raise SSL cert issues + my $uri = $ocho->subfield('u'); + my $old_uri = $uri; + if ($uri =~ m#^https://librweb#) { + $uri =~ s{^https://librweb}{http://librweb}; + } + + # Fix URIs with a space after the URL proxy parameter + $uri =~ s{url= http}{url=http}; + + # Point at LU ECCO URIs + $uri =~ s{wind05901}{subd78095}; + + if ($old_uri ne $uri) { + $ocho->update('u' => $uri); + } + + # Provide an indication of the platform + if ($uri =~ m#scholarsportal#) { + $ocho->update('y' => 'Available online / disponible en ligne (ScholarsPortal)'); + } elsif ($uri =~ m#myilibrary#) { + $ocho->update('y' => 'Available online / disponible en ligne (MyiLibrary)'); + } elsif ($uri =~ m#sagepub#) { + $ocho->update('y' => 'Available online / disponible en ligne (Sage)'); + } elsif ($uri =~ m#galegroup.com/ecco#) { + $ocho->update('y' => 'Available online / disponible en ligne (Eighteenth Century Collection Online)'); + } elsif ($uri =~ m#proquest.com/.*xri:eebo#) { + $ocho->update('y' => 'Available online / disponible en ligne (Early English Books Online)'); + } +} + +my %eightflat; +# Dedupe 856 fields as we sometimes have two identical LUSYS entries +my @eights = $record->field('856'); +foreach my $ocho (@eights) { + my $flat = $ocho->as_formatted(); + if (exists $eightflat{$flat}) { + $record->delete_field($ocho); + } else { + $eightflat{$flat} = 1; + } +} + +# Do not need access notes anymore +my @access = $record->field('506'); +foreach my $note (@access) { + my @ous = $note->subfield('9'); + foreach my $ou (@ous) { + if ($ou eq 'OWA') { + $record->delete_field($note); + } + } +} + +# Add explicit publisher relator code +my @added = $record->field('710'); +foreach my $entry (@added) { + next if $entry->subfield('4'); + if ($entry->subfield('a') =~ 'Early English Books Online') { + $entry->add_subfields('4' => 'pbl'); + } +} + +# Update provenance of the records +my @provs = $record->field('040'); +foreach my $prov (@provs) { + $prov->delete_subfield(code => 'd', match => qr/CaOWA/); + my @subfields = $prov->subfield('d'); + my $found = 0; + foreach my $subfield (@subfields) { + if ($subfield eq 'CaOSUL') { + $found = 1; + } + } + if (!$found) { + $prov->add_subfields('d' => 'CaOSUL'); + } +} + +# Fix broken OCLC numbers with trailing 'e' +my @oclcnums = $record->field('035'); +foreach my $num (@oclcnums) { + my @subfields = $num->subfield('a'); + foreach my $subfield (@subfields) { + if ($subfield =~ m#^\(OCoLC\).+e$#) { + $subfield =~ s{e$}{}; + $num->update('a' => $subfield); + } + } +} + +my $xml = $record->as_xml_record(); +$xml =~ s/\n//sgo; +$xml =~ s/^<\?xml.+\?\s*>//go; +$xml =~ s/>\s+entityize() +# to avoid having to set PERL5LIB for PostgreSQL as well + +$xml = NFC($xml); + +# Convert raw ampersands to entities +$xml =~ s/&(?!\S+;)/&/gso; + +# Convert Unicode characters to entities +$xml =~ s/([\x{0080}-\x{fffd}])/sprintf('&#x%X;',ord($1))/sgoe; + +$xml =~ s/[\x00-\x1f]//go; + +return $xml; +$func$ LANGUAGE PLPERLU; -- 2.11.0