From 61908d31213415c7f8d9d0e48e1ef7d6429da0aa Mon Sep 17 00:00:00 2001
From: Bill Erickson
Date: Tue, 16 Feb 2016 13:09:18 -0500
Subject: [PATCH] Cloning pingest from git.mvlcstaff.org/jason/evergreen_utilities.git

Signed-off-by: Bill Erickson
---
 README     |   7 +-
 pingest.pl | 214 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 215 insertions(+), 6 deletions(-)
 create mode 100644 pingest.pl

diff --git a/README b/README
index 939ae5a7f..ab8c9b663 100644
--- a/README
+++ b/README
@@ -1,8 +1,3 @@
-This repo is for putting random things in.
-You can push to it from your own repos without building off of the master branch, or anything else in it really. For example:
+CLONED FROM http://git.mvlcstaff.org/?p=jason/evergreen_utilities.git;a=summary
 
-git add random git@git.evergreen-ils.org:working/random
-git push random local_branch:user/yourusername/local_branch
-
-Otherwise it follows the rules of working repos.
 
diff --git a/pingest.pl b/pingest.pl
new file mode 100644
index 000000000..28cb031b2
--- /dev/null
+++ b/pingest.pl
@@ -0,0 +1,214 @@
+#!/usr/bin/perl
+# ---------------------------------------------------------------
+# Copyright © 2013,2014 Merrimack Valley Library Consortium
+# Jason Stephenson
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# ---------------------------------------------------------------
+# TODO: Document with POD.
+# This guy parallelizes a reingest.
+use strict;
+use warnings;
+use DBI;
+
+# You will want to adjust the next two constants based on your
+# database size, i.e. the number of bib records, as well as the
+# number of cores on your database server.  Using roughly the number
+# of cores / 2 doesn't seem to have much impact during off-peak times.
+use constant {
+    BATCHSIZE => 10000,
+    MAXCHILD => 8
+};
+
+# Globals for the command line options:
+my $do_browse = 1; # Do the browse reingest.
+my $do_attrs = 1;  # Do the record attributes reingest.
+my $do_search = 1; # Do the search reingest.
+my $do_facets = 1; # Do the facets reingest.
+
+# Command line options to skip the different reingests.  The '-'
+# indicates a minus, i.e. a "no", so to
+# skip the browse reingest:    -browse or -b
+# skip the attribute reingest: -attributes or -a
+# skip the search reingest:    -search or -s
+# skip the facet reingest:     -facets or -f
+foreach (@ARGV) {
+    if (/^-b(?:rowse)?$/) {
+        $do_browse = 0;
+    } elsif (/^-a(?:ttr(?:ibute)?s?)?$/) {
+        $do_attrs = 0;
+    } elsif (/^-s(?:earch)?$/) {
+        $do_search = 0;
+    } elsif (/^-f(?:acets?)?$/) {
+        $do_facets = 0;
+    } else {
+        # TODO: Add a usage() function to report the allowed options.
+        die ("Unrecognized option: $_");
+    }
+}
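+
+# Example invocation (a sketch with hypothetical values; substitute
+# your own connection settings, or rely on the PG* environment
+# variables described further below):
+#
+#   PGHOST=db.example.org PGUSER=evergreen PGDATABASE=evergreen \
+#       ./pingest.pl -browse -facets
+#
+# The above skips the browse and facet ingests, leaving only the
+# record attribute and search ingests to run.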
+
+# "Gimme the keys! I'll drive!"
+my $q = <<END_OF_Q;
+SELECT id
+FROM biblio.record_entry
+WHERE id > 0
+ORDER BY id ASC
+END_OF_Q
+
+# Stuff needed for looping, tracking how many lists of records we
+# have, storing the actual list of records, and the list of the lists
+# of records.
+my ($count, $lists, $records) = (0,0,[]);
+my @lol = ();
+# To do the browse-only ingest:
+my @blist = ();
+
+# All of the DBI->connect() calls in this file assume that you have
+# configured the PGHOST, PGPORT, PGDATABASE, PGUSER, and PGPASSWORD
+# variables in your execution environment.  If you have not, you have
+# two options:
+#
+# 1) configure them
+#
+# 2) edit the DBI->connect() calls in this program so that it can
+# connect to your database.
+my $dbh = DBI->connect('DBI:Pg:');
+
+my $results = $dbh->selectall_arrayref($q);
+foreach my $r (@$results) {
+    my $record = $r->[0];
+    push(@blist, $record); # separate list for the browse-only ingest
+    push(@$records, $record);
+    if (++$count == BATCHSIZE) {
+        $lol[$lists++] = $records;
+        $count = 0;
+        $records = [];
+    }
+}
+$lol[$lists++] = $records if ($count); # Last batch is likely to be
+                                       # small.
+$dbh->disconnect();
+
+# We're going to reuse $count to keep track of the total number of
+# batches processed.
+$count = 0;
+
+# @running keeps track of the running child processes.
+my @running = ();
+
+# We start the browse-only ingest before starting the other ingests.
+browse_ingest(@blist) if ($do_browse);
+
+# We loop until we have processed all of the batches stored in @lol:
+while ($count < $lists) {
+    if (scalar(@lol) && scalar(@running) < MAXCHILD) {
+        # Reuse $records for the lulz.
+        $records = shift(@lol);
+        if ($do_search || $do_facets || $do_attrs) {
+            reingest($records);
+        } else {
+            $count++;
+        }
+    } else {
+        my $pid = wait();
+        if (grep {$_ == $pid} @running) {
+            @running = grep {$_ != $pid} @running;
+            $count++;
+            print "$count of $lists processed\n";
+        }
+    }
+}
+
+# This subroutine forks a process to do the browse-only ingest on the
+# @blist above.  It cannot be parallelized, but it can run in parallel
+# with the other ingests.
+sub browse_ingest {
+    my @list = @_;
+    my $pid = fork();
+    if (!defined($pid)) {
+        die "failed to spawn child";
+    } elsif ($pid > 0) {
+        # Add our browser to the list of running children.
+        push(@running, $pid);
+        # Increment the number of lists, because this list was not
+        # previously counted.
+        $lists++;
+    } elsif ($pid == 0) {
+        my $dbh = DBI->connect('DBI:Pg:');
+        my $sth = $dbh->prepare("SELECT metabib.reingest_metabib_field_entries(?, TRUE, FALSE, TRUE)");
+        foreach (@list) {
+            if ($sth->execute($_)) {
+                my $crap = $sth->fetchall_arrayref();
+            } else {
+                warn ("Browse ingest failed for record $_");
+            }
+        }
+        $dbh->disconnect();
+        exit(0);
+    }
+}
+
+# Fork a child to do the other reingests:
+
+sub reingest {
+    my $list = shift;
+    my $pid = fork();
+    if (!defined($pid)) {
+        die "Failed to spawn a child";
+    } elsif ($pid > 0) {
+        push(@running, $pid);
+    } elsif ($pid == 0) {
+        my $dbh = DBI->connect('DBI:Pg:');
+        reingest_attributes($dbh, $list) if ($do_attrs);
+        reingest_field_entries($dbh, $list) if ($do_facets || $do_search);
+        $dbh->disconnect();
+        exit(0);
+    }
+}
+
+# Reingest metabib field entries on a list of records.
+sub reingest_field_entries {
+    my $dbh = shift;
+    my $list = shift;
+    my $sth = $dbh->prepare("SELECT metabib.reingest_metabib_field_entries(?, ?, TRUE, ?)");
+    # Because the reingest function uses "skip" options, we invert the
+    # logic of the do_* variables.
+    $sth->bind_param(2, ($do_facets) ? 0 : 1);
+    $sth->bind_param(3, ($do_search) ? 0 : 1);
+    foreach (@$list) {
+        $sth->bind_param(1, $_);
+        if ($sth->execute()) {
+            my $crap = $sth->fetchall_arrayref();
+        } else {
+            warn ("metabib.reingest_metabib_field_entries failed for record $_");
+        }
+    }
+}
+
+# Reingest record attributes on a list of records.
+sub reingest_attributes {
+    my $dbh = shift;
+    my $list = shift;
+    my $sth = $dbh->prepare(<<END_OF_INGEST
+SELECT metabib.reingest_record_attributes(rd.id, NULL, rd.marc)
+FROM biblio.record_entry rd
+WHERE rd.id = ?
+END_OF_INGEST
+    );
+    foreach (@$list) {
+        $sth->bind_param(1, $_);
+        if ($sth->execute()) {
+            my $crap = $sth->fetchall_arrayref();
+        } else {
+            warn ("metabib.reingest_record_attributes failed for record $_");
+        }
+    }
+}
-- 
2.11.0