my $ids = $e->json_query($query);
$url_ids = [ map {$_->{id}} @$ids ];
-
- # this is kinda hinky and probably an abuse of order_by, but
- # shuffling is good for spreading out domains and this avoids
- # the necessity of loading all the URLs (could be lots) and
- # shuffling them here.
-
- $url_ids = $e->search_url_verify_url(
- [ {id => $url_ids},
- {order_by => {uvu => 'RANDOM()'}}
- ], {idlist => 1}
- );
}
my $url_count = scalar(@$url_ids);
# Now cycle through the URLs in batches.
my $batch_size = $U->ou_ancestor_setting_value(
- $session->owning_lib,
+ $session->owning_lib,
'url_verify.verification_batch_size', $e) || 5;
my $num_processed = 0; # total number processed, including redirects
}
);
- # Queue up the requests and let them fly
- $multises->request(
- 'open-ils.url_verify.verify_url',
- $auth, $attempt->id, $_) for @$url_ids;
+ sort_and_fire_domains($e, $auth, $attempt, $url_ids, $multises);
# Wait for all requests to be completed
$multises->session_wait(1);
};
}
+# Retrieves the URL domains and sorts them into buckets.
+# Iterates over the buckets and fires the multi-session call.
+# The main drawback to this domain sorting approach is that
+# any domain used a lot more than the others will be the
+# only domain standing after the others are exhausted, which
+# means it will take a beating at the end of the batch.
+#
+# Arguments:
+#   $e        - editor/query handle; only used for json_query here
+#               (presumably a CStoreEditor — confirm against caller)
+#   $auth     - auth token, passed through to the verify_url call
+#   $attempt  - verification attempt object; only ->id is used
+#   $url_ids  - arrayref of url_verify.url (uvu) IDs to process
+#   $multises - multi-session object used to fire parallel requests
+sub sort_and_fire_domains {
+ my ($e, $auth, $attempt, $url_ids, $multises) = @_;
+
+ # there is potential here for data sets to be too large
+ # for delivery, but it's not likely, since we're only
+ # fetching ID and domain.
+ my $urls = $e->json_query(
+ {
+ select => {uvu => ['id', 'domain']},
+ from => 'uvu',
+ where => {id => $url_ids}
+ },
+ # {substream => 1} only if needed
+ );
+
+ # sort them into buckets based on domain name
+ my %domains;
+ for my $url (@$urls) {
+ $domains{$url->{domain}} = [] unless $domains{$url->{domain}};
+ push(@{$domains{$url->{domain}}}, $url->{id});
+ }
+
+ # loop through the domains and fire the verification call.
+ # Round-robin: each pass of the inner loop takes one URL from
+ # every remaining domain, so consecutive requests tend to hit
+ # different hosts.
+ while (keys %domains) {
+ for my $domain (keys %domains) {
+
+ # take one URL for this domain; drop the bucket once empty.
+ # Deleting inside the loop is safe because keys() above
+ # returned a snapshot list of the hash keys.
+ my $url_id = pop(@{$domains{$domain}});
+ delete $domains{$domain} unless @{$domains{$domain}};
+
+ $multises->request(
+ 'open-ils.url_verify.verify_url',
+ $auth, $attempt->id, $url_id);
+ }
+ }
+}
+
__PACKAGE__->register_method(
method => 'verify_url',
return undef;
}
-# temporarily cache some data to avoid a pile
+# temporarily cache some data to avoid a pile
# of data lookups on every URL processed.
my %cache;
sub collect_verify_attempt_and_settings {
if ( !($attempt = $cache{attempt}{$attempt_id}) ) {
- # attempt may have just been created, so
+ # attempt may have just been created, so
# we need to guarantee a write-DB read.
$e->xact_begin;
- $attempt =
+ $attempt =
$e->retrieve_url_verify_verification_attempt([
$attempt_id, {
flesh => 1,
}
-# searches for a completed url_verfication for any url processed
-# within this verification attempt whose full_url matches the
+# searches for a completed url_verification for any url processed
+# within this verification attempt whose full_url matches the
# full_url of the provided URL.
sub find_matching_url_for_attempt {
- my ($e, $attempt, $url) = @_;
+ my ($e, $attempt, $url) = @_;
my $match = $e->json_query({
select => {uvuv => ['id']},
# There could be multiple verifications for matching URLs
# We only want a verification that completed.
# Note also that 2 identical URLs processed within the same
- # sub-batch will have to each be fully processed in their own
+ # sub-batch will have to each be fully processed in their own
# right, since neither knows how the other will ultimately fare.
'+uvuv' => {
res_time => {'!=' => undef}
my $req = HTTP::Request->new(HEAD => $url->full_url);
# simple_request avoids LWP's auto-redirect magic
- my $res = $ua->simple_request($req);
+ my $res = $ua->simple_request($req);
$logger->info(sprintf(
- "url: received HTTP '%s' / '%s' [%s]",
- $res->code,
+ "url: received HTTP '%s' / '%s' [%s]",
+ $res->code,
$res->message,
$url_text
));