From: dbs Date: Fri, 1 Apr 2011 03:09:34 +0000 (+0000) Subject: Commit current ebook processing scripts X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=refs%2Fheads%2Frel_1_6_1;p=contrib%2FConifer.git Commit current ebook processing scripts These scripts are helping me attempt to make sense of the past three years of our electronic book loading, which happened consortially and non-consortially, and with and without a working ingest for multiple located URIs per record. git-svn-id: svn://svn.open-ils.org/ILS-Contrib/conifer/branches/rel_1_6_1@1295 6d9bc8c9-1ec2-4278-b937-99fde70a366f --- diff --git a/tools/ebooks/ebook_reports.pl b/tools/ebooks/ebook_reports.pl new file mode 100644 index 0000000000..76ad7fcf73 --- /dev/null +++ b/tools/ebooks/ebook_reports.pl @@ -0,0 +1,92 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +use DBI; +use Spreadsheet::WriteExcel; + +my %library = ( + id => 109, + name => 'Windsor' +); + +my @ebook_records = qw/ + cambridge-2009-12-01.mrc + cambridge-2010-04-12.mrc + cambridge-2010-08-18.mrc + cambridge-2010-09-30_137.mrc + duke-2010-08-24_92.mrc + duke-2011-02-02_10.mrc + duke-2011-02-14_15.mrc + gibson-chrc-2010-08-06.mrc + gibson-chrc-2010-08-20.mrc + gibson-chrc-2010-09-17.mrc + gibson-chrc-2010-10-21_66.mrc + gibson-chrc-2010-12-02_127.mrc + gibson_chrc-2011-02-23_212.mrc + oxford-2010-03-04.mrc + oxford-2010-04-27.mrc + oxford-2010-09-30_156.mrc + oxford-2010-10-28_49.mrc + oxford_2010-03-04.mrc + springer-2009-12-01.mrc + springer-2010-02-11.mrc + springer-2010-04-28_1218.mrc + springer-2010-06-27_165.mrc + springer-2011-02-17_1751.mrc +/; + +my $dbh = DBI->connect("dbi:Pg:dbname=conifer;host=polaris.cs.uoguelph.ca", "evergreen", "", { RaiseError => 1 }) || die "Can't connect to database.\n"; + +my $workbook = Spreadsheet::WriteExcel->new("/openils/var/web/ebooks/" . $library{"name"} . 
".xls") || die "Can't create workbook: $!\n"; + +my $worksheet = $workbook->add_worksheet("Cover Sheet"); + +my $bold = $workbook->add_format(); +$bold->set_bold(); + +$worksheet->write(5, 0, "Ebook analysis:", $bold); +$worksheet->write(5, 1, $library{"name"}); + +foreach my $marc (@ebook_records) { + add_analysis($marc, $library{"id"}); +} + +$dbh->disconnect(); +$workbook->close(); +exit; + +sub add_analysis { + my ($collection, $library) = @_; + + # Format as a string. Doesn't change to a number when edited + my $format_num = $workbook->add_format(num_format => '@'); + + $worksheet = $workbook->add_worksheet("$collection"); + + my $col = 0; + foreach my $head ("Collection", "Ebook ID", "Record ID", "ISBN", "System Control", "Author", "Title") { + $worksheet->write(0, $col, $head, $bold); + $col++; + } + + my $sth = $dbh->prepare("SELECT collection, id, record, isbn, sysctl, author, title + FROM scratchpad.ebook_missing_record_matches(?, ?)"); + $sth->execute($library, $collection); + + my $row = 1; + while (my $match = $sth->fetchrow_hashref()) { + $col = 0; + $worksheet->write_string($row, $col++, $match->{collection}, $bold); + $worksheet->write($row, $col++, $match->{id}); + $worksheet->write($row, $col++, $match->{record}); + $worksheet->write_string($row, $col++, $match->{isbn}, $format_num); + $worksheet->write_string($row, $col++, $match->{sysctl}); + $worksheet->write_string($row, $col++, $match->{author}); + $worksheet->write_string($row, $col++, $match->{title}); + + $row++; + } +} + diff --git a/tools/ebooks/ebooks.sql b/tools/ebooks/ebooks.sql new file mode 100644 index 0000000000..5358950a96 --- /dev/null +++ b/tools/ebooks/ebooks.sql @@ -0,0 +1,156 @@ +TRUNCATE scratchpad.ebook_collections_to_records; + +TRUNCATE scratchpad.ebook_links_by_institution; + +-- Find any bib records that match on isbn +INSERT INTO scratchpad.ebook_collections_to_records (collection, id, record) SELECT DISTINCT situ.collection, situ.id, rmsr.id AS record FROM scratchpad.ids_to_urls situ INNER JOIN reporter.materialized_simple_record rmsr 
ON ARRAY[situ.isbn] <@ rmsr.isbn WHERE situ.isbn IS NOT NULL AND (collection, situ.id, rmsr.id) NOT IN (SELECT collection, id, record FROM scratchpad.ebook_collections_to_records); + +-- Find any bib records that match on isbn, round 2, because reporter.materialized_simple_record needs updating +INSERT INTO scratchpad.ebook_collections_to_records (collection, id, record) SELECT DISTINCT collection, situ.id, record FROM metabib.full_rec mfr INNER JOIN scratchpad.ids_to_urls situ ON situ.isbn = mfr.value AND mfr.tag = '020' AND mfr.subfield = 'a' WHERE (collection, situ.id, record) NOT IN (SELECT collection, id, record FROM scratchpad.ebook_collections_to_records) AND situ.isbn IS NOT NULL; + +-- Find any bib records that match on system control number +INSERT INTO scratchpad.ebook_collections_to_records (collection, id, record) SELECT DISTINCT collection, situ.id, record FROM metabib.full_rec mfr INNER JOIN scratchpad.ids_to_urls situ ON LOWER(situ.sysctl) = mfr.value AND mfr.tag = '035' AND mfr.subfield = 'a' WHERE (collection, situ.id, record) NOT IN (SELECT collection, id, record FROM scratchpad.ebook_collections_to_records) AND situ.sysctl IS NOT NULL; + +-- Set the canonical number of unique records per batch +INSERT INTO scratchpad.ebook_links_by_institution (library, cnt, collection) SELECT 'ALL', COUNT(*), x.collection FROM (SELECT DISTINCT id, collection FROM scratchpad.ids_to_urls) AS x GROUP BY x.collection ORDER BY x.collection; + +-- Table that maps which ebook record each library has for a given collection record +CREATE TABLE scratchpad.ebook_record_by_library (library INTEGER, record BIGINT, id INTEGER, collection TEXT); + +-- Populate the table +INSERT INTO scratchpad.ebook_record_by_library (library, record, id, collection) SELECT 109, record, id, seb.collection FROM scratchpad.ebook_collections_to_records seb WHERE seb.record IN (SELECT record FROM asset.call_number acn WHERE acn.deleted IS FALSE AND acn.owning_lib = 109) ORDER BY collection, id; 
+ +INSERT INTO scratchpad.ebook_record_by_library (library, record, id, collection) SELECT 103, record, id, seb.collection FROM scratchpad.ebook_collections_to_records seb WHERE seb.record IN (SELECT record FROM asset.call_number acn WHERE acn.deleted IS FALSE AND acn.owning_lib = 103) ORDER BY collection, id; + +INSERT INTO scratchpad.ebook_record_by_library (library, record, id, collection) SELECT 124, record, id, seb.collection FROM scratchpad.ebook_collections_to_records seb WHERE seb.record IN (SELECT record FROM asset.call_number acn WHERE acn.deleted IS FALSE AND acn.owning_lib = 124) ORDER BY collection, id; + +CREATE TYPE scratchpad.ebook AS (id INTEGER, isbn TEXT, sysctl TEXT, author TEXT, title TEXT, url TEXT); + +CREATE OR REPLACE FUNCTION scratchpad.ebook_missing_records (IN INTEGER, IN TEXT) + RETURNS SETOF scratchpad.ebook AS +$$ + SELECT DISTINCT id, isbn, sysctl, author, title, url + FROM scratchpad.ids_to_urls + WHERE collection = $2 + AND id NOT IN ( + SELECT id + FROM scratchpad.ebook_record_by_library + WHERE collection = $2 + AND library = $1 + ) + ORDER BY id; +$$ LANGUAGE SQL; + +-- Create a function to tell us which records a particular library has for a particular collection +CREATE OR REPLACE FUNCTION scratchpad.ebook_has_records (IN INTEGER, IN TEXT, OUT INTEGER, OUT TEXT, OUT TEXT, OUT TEXT, OUT TEXT) + RETURNS SETOF record AS +$$ + SELECT DISTINCT situ.id, isbn, sysctl, author, title + FROM scratchpad.ids_to_urls situ + INNER JOIN scratchpad.ebook_collections_to_records seb + ON situ.id = seb.id AND situ.collection = seb.collection + INNER JOIN asset.call_number acn + ON acn.record = seb.record +-- INNER JOIN asset.uri_call_number_map auricnm +-- ON auricnm.call_number = acn.id + WHERE situ.collection = $2 AND acn.deleted IS FALSE AND acn.owning_lib = $1 + ORDER BY id; +$$ LANGUAGE SQL; + +-- Should have scripted this but went with the power of Vim macros +-- OWA missing records +INSERT INTO scratchpad.ebook_links_by_institution 
(library, collection, cnt) SELECT 'OWA', 'cambridge-2009-12-01.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'cambridge-2009-12-01.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'cambridge-2010-04-12.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'cambridge-2010-04-12.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'cambridge-2010-08-18.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'cambridge-2010-08-18.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'cambridge-2010-09-30_137.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'cambridge-2010-09-30_137.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'duke-2010-08-24_92.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'duke-2010-08-24_92.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'duke-2011-02-02_10.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'duke-2011-02-02_10.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'duke-2011-02-14_15.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'duke-2011-02-14_15.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'gibson-chrc-2010-08-06.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM 
scratchpad.ebook_missing_records(109, 'gibson-chrc-2010-08-06.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'gibson-chrc-2010-08-20.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'gibson-chrc-2010-08-20.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'gibson-chrc-2010-09-17.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'gibson-chrc-2010-09-17.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'gibson-chrc-2010-10-21_66.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'gibson-chrc-2010-10-21_66.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'gibson-chrc-2010-12-02_127.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'gibson-chrc-2010-12-02_127.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'gibson_chrc-2011-02-23_212.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'gibson_chrc-2011-02-23_212.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'oxford-2010-03-04.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'oxford-2010-03-04.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'oxford-2010-04-27.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'oxford-2010-04-27.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution 
(library, collection, cnt) SELECT 'OWA', 'oxford-2010-09-30_156.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'oxford-2010-09-30_156.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'oxford-2010-10-28_49.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'oxford-2010-10-28_49.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'oxford_2010-03-04.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'oxford_2010-03-04.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'springer-2009-12-01.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'springer-2009-12-01.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'springer-2010-02-11.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'springer-2010-02-11.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'springer-2010-04-28_1218.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'springer-2010-04-28_1218.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'springer-2010-06-27_165.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'springer-2010-06-27_165.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'springer-2011-02-17_1751.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM 
scratchpad.ebook_missing_records(109, 'springer-2011-02-17_1751.mrc')) AS foo; + +-- Laurentian ebook links +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'cambridge-2009-12-01.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'cambridge-2009-12-01.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'cambridge-2010-04-12.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'cambridge-2010-04-12.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'cambridge-2010-08-18.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'cambridge-2010-08-18.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'cambridge-2010-09-30_137.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'cambridge-2010-09-30_137.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'duke-2010-08-24_92.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'duke-2010-08-24_92.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'duke-2011-02-02_10.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'duke-2011-02-02_10.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'duke-2011-02-14_15.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'duke-2011-02-14_15.mrc')) AS foo; +INSERT INTO 
scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'gibson-chrc-2010-08-06.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'gibson-chrc-2010-08-06.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'gibson-chrc-2010-08-20.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'gibson-chrc-2010-08-20.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'gibson-chrc-2010-09-17.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'gibson-chrc-2010-09-17.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'gibson-chrc-2010-10-21_66.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'gibson-chrc-2010-10-21_66.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'gibson-chrc-2010-12-02_127.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'gibson-chrc-2010-12-02_127.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'gibson_chrc-2011-02-23_212.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'gibson_chrc-2011-02-23_212.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'oxford-2010-03-04.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'oxford-2010-03-04.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'oxford-2010-04-27.mrc', 
COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'oxford-2010-04-27.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'oxford-2010-09-30_156.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'oxford-2010-09-30_156.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'oxford-2010-10-28_49.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'oxford-2010-10-28_49.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'springer-2009-12-01.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'springer-2009-12-01.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'springer-2010-02-11.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'springer-2010-02-11.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'springer-2010-04-28_1218.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'springer-2010-04-28_1218.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'springer-2010-06-27_165.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'springer-2010-06-27_165.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'springer-2011-02-17_1751.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'springer-2011-02-17_1751.mrc')) 
AS foo; + +-- Algoma ebook links +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'cambridge-2009-12-01.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'cambridge-2009-12-01.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'cambridge-2010-04-12.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'cambridge-2010-04-12.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'cambridge-2010-08-18.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'cambridge-2010-08-18.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'cambridge-2010-09-30_137.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'cambridge-2010-09-30_137.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'duke-2010-08-24_92.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'duke-2010-08-24_92.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'duke-2011-02-02_10.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'duke-2011-02-02_10.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'duke-2011-02-14_15.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'duke-2011-02-14_15.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 
'gibson-chrc-2010-08-06.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'gibson-chrc-2010-08-06.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'gibson-chrc-2010-08-20.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'gibson-chrc-2010-08-20.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'gibson-chrc-2010-09-17.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'gibson-chrc-2010-09-17.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'gibson-chrc-2010-10-21_66.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'gibson-chrc-2010-10-21_66.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'gibson-chrc-2010-12-02_127.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'gibson-chrc-2010-12-02_127.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'gibson_chrc-2011-02-23_212.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'gibson_chrc-2011-02-23_212.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'oxford-2010-03-04.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'oxford-2010-03-04.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'oxford-2010-04-27.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM 
scratchpad.ebook_missing_records(124, 'oxford-2010-04-27.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'oxford-2010-09-30_156.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'oxford-2010-09-30_156.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'oxford-2010-10-28_49.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'oxford-2010-10-28_49.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'springer-2009-12-01.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'springer-2009-12-01.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'springer-2010-02-11.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'springer-2010-02-11.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'springer-2010-04-28_1218.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'springer-2010-04-28_1218.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'springer-2010-06-27_165.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'springer-2010-06-27_165.mrc')) AS foo; +INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'springer-2011-02-17_1751.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'springer-2011-02-17_1751.mrc')) AS foo; + +-- List record issues for springer +SELECT * FROM 
scratchpad.ebook_links_by_institution WHERE collection LIKE 'sprin%' ORDER BY collection, library; + +-- Count how many records are missing from a given library's collection +SELECT COUNT(*) FROM (SELECT DISTINCT collection, id FROM scratchpad.ids_to_urls WHERE collection = 'springer-2009-12-01.mrc' AND id NOT IN (SELECT id FROM scratchpad.ebook_record_by_library WHERE collection = 'springer-2009-12-01.mrc' AND library = 109)) AS foo; + +-- List the records missing from a given library's collection +SELECT * FROM scratchpad.ids_to_urls WHERE collection = 'springer-2009-12-01.mrc' AND id NOT IN (SELECT id FROM scratchpad.ebook_record_by_library WHERE collection = 'springer-2009-12-01.mrc' AND library = 109) + +-- Now, a function to easily generate records to look up as potential matches +CREATE TYPE scratchpad.ebook_missing_record_matches AS (collection TEXT, id INTEGER, record BIGINT, isbn TEXT, sysctl TEXT, author TEXT, title TEXT); + +CREATE FUNCTION scratchpad.ebook_missing_record_matches (IN library INTEGER, IN collection TEXT) + RETURNS SETOF scratchpad.ebook_missing_record_matches AS +$$ +SELECT DISTINCT $2, seb.id, seb.record, situ.isbn, situ.sysctl, situ.author, situ.title + FROM scratchpad.ebook_collections_to_records seb + INNER JOIN scratchpad.ids_to_urls situ + ON situ.collection = $2 AND situ.id = seb.id + WHERE seb.id IN ( + SELECT id FROM scratchpad.ebook_missing_records($1, $2) + ) AND seb.collection = $2 +$$ LANGUAGE SQL; diff --git a/tools/ebooks/map_isbns_to_urls.py b/tools/ebooks/map_isbns_to_urls.py new file mode 100644 index 0000000000..c2efa8ca10 --- /dev/null +++ b/tools/ebooks/map_isbns_to_urls.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +""" +Iterate through a set of records and generate a TSV file containing +every ISBN and system control number mapped to every URL in each record. 
+""" + +#import os, os.path, sys, pymarc, pymarc.marc8, re +import glob, pymarc, pymarc.marc8, re + +def parse_file(infile, writer): + """ + Parse the file of MARC records + """ + reader = pymarc.MARCReader(open(infile, 'rb')) + cnt = 0 + + for record in reader: + cnt = cnt + 1 + + isbn = get_field('020', 'a', record, cnt, infile) + sys_ctl_num = get_field('035', 'a', record, cnt, infile) + + if not (record['856'] and record['856']['u']): + print("* No URL for record %s in file %s" % (cnt, infile)) + continue + + for url_field in record.get_fields('856'): + if url_field.indicator1 != '4': + # print("* Record %d has an 856 with ind1 = %s" % (cnt, url_field.indicator1)) + continue + # if not (url_field.indicator2 == '0' or url_field.indicator2 == '1'): + # print("* Record %d has an 856 with ind2 = %s" % (cnt, url_field.indicator2)) + # continue + for url in url_field.get_subfields('u'): + if url.find('loc.gov') > -1: + # print("* Record %d has an 856 with url containing %s" % (cnt, url)) + continue + isbn = re.sub(r'^\D*(\d+)\D.*?$', r'\1', isbn) + writer.write('%d\t%s\t%s\t%s\t%s\t%s\t%s\n' % ( + cnt, sys_ctl_num, isbn, record.author(), record.title(), url, infile + )) + +def get_field(field, subfield, record, cnt, infile): + """ + Return a field and subfield without complaining + + Should just try/catch this sucker + """ + if not (record[field] and record[field][subfield]): + print("* No [%s][%s] for record %s in file %s" % (field, subfield, cnt, infile)) + return('None') + return(record[field][subfield]) + +if __name__ == '__main__': + + OUTFILE = '/home/dan/Downloads/ebooks/isbns_to_urls.tsv' + tsv_writer = open(OUTFILE, 'w') + for marc in glob.glob('*.mrc'): + parse_file(marc, tsv_writer)