From c7dd8f75cb2e14c4a641c6933c7238f2266c758a Mon Sep 17 00:00:00 2001 From: Bill Erickson Date: Fri, 10 Aug 2012 09:10:52 -0400 Subject: [PATCH] URLVerify.pm; docs; url domain looping; cleanup Signed-off-by: Bill Erickson --- .../perlmods/lib/OpenILS/Application/URLVerify.pm | 80 +++++++++++++++------- 1 file changed, 54 insertions(+), 26 deletions(-) diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/URLVerify.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/URLVerify.pm index 23a4b84466..c02af13eea 100644 --- a/Open-ILS/src/perlmods/lib/OpenILS/Application/URLVerify.pm +++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/URLVerify.pm @@ -118,17 +118,6 @@ sub validate_session { my $ids = $e->json_query($query); $url_ids = [ map {$_->{id}} @$ids ]; - - # this is kinda hinky and probably an abuse of order_by, but - # shuffling is good for spreading out domains and this avoids - # the necessity of loading all the URLs (could be lots) and - # shuffling them here. - - $url_ids = $e->search_url_verify_url( - [ {id => $url_ids}, - {order_by => {uvu => 'RANDOM()'}} - ], {idlist => 1} - ); } my $url_count = scalar(@$url_ids); @@ -161,7 +150,7 @@ sub validate_session { # Now cycle through the URLs in batches. 
my $batch_size = $U->ou_ancestor_setting_value( - $session->owning_lib, + $session->owning_lib, 'url_verify.verification_batch_size', $e) || 5; my $num_processed = 0; # total number processed, including redirects @@ -217,10 +206,7 @@ sub validate_session { } ); - # Queue up the requests and let them fly - $multises->request( - 'open-ils.url_verify.verify_url', - $auth, $attempt->id, $_) for @$url_ids; + sort_and_fire_domains($e, $auth, $attempt, $url_ids, $multises); # Wait for all requests to be completed $multises->session_wait(1); @@ -239,6 +225,48 @@ sub validate_session { }; } +# Fetches the domain of each URL and sorts the URL IDs into per-domain buckets. +# Then cycles over the buckets, taking one URL ID per domain on each pass, and +# fires a multi-session verify_url request for it. The main drawback to this +# domain sorting approach is that any domain used a lot more than the others +# will be the only domain standing after the others are exhausted, which +# means it will take a beating at the end of the batch. +sub sort_and_fire_domains { + my ($e, $auth, $attempt, $url_ids, $multises) = @_; + + # there is potential here for data sets to be too large + # for delivery, but it's not likely, since we're only + # fetching ID and domain. 
+ my $urls = $e->json_query( + { + select => {uvu => ['id', 'domain']}, + from => 'uvu', + where => {id => $url_ids} + }, + # {substream => 1} only if needed + ); + + # sort them into buckets based on domain name: domain => [url IDs] + my %domains; + for my $url (@$urls) { + $domains{$url->{domain}} = [] unless $domains{$url->{domain}}; + push(@{$domains{$url->{domain}}}, $url->{id}); + } + + # loop through the domains and fire the verification call; each pass pops one URL ID per remaining domain, and a domain is dropped once its bucket is empty + while (keys %domains) { + for my $domain (keys %domains) { + + my $url_id = pop(@{$domains{$domain}}); + delete $domains{$domain} unless @{$domains{$domain}}; + + $multises->request( + 'open-ils.url_verify.verify_url', + $auth, $attempt->id, $url_id); + } + } +} + __PACKAGE__->register_method( method => 'verify_url', @@ -347,7 +375,7 @@ sub verify_url { return undef; } -# temporarily cache some data to avoid a pile +# temporarily cache some data to avoid a pile # of data lookups on every URL processed. my %cache; sub collect_verify_attempt_and_settings { @@ -366,11 +394,11 @@ sub collect_verify_attempt_and_settings { if ( !($attempt = $cache{attempt}{$attempt_id}) ) { - # attempt may have just been created, so + # attempt may have just been created, so # we need to guarantee a write-DB read. $e->xact_begin; - $attempt = + $attempt = $e->retrieve_url_verify_verification_attempt([ $attempt_id, { flesh => 1, @@ -416,11 +444,11 @@ sub collect_verify_attempt_and_settings { } -# searches for a completed url_verfication for any url processed -# within this verification attempt whose full_url matches the +# searches for a completed url_verification for any url processed +# within this verification attempt whose full_url matches the # full_url of the provided URL. 
sub find_matching_url_for_attempt { - my ($e, $attempt, $url) = @_; + my ($e, $attempt, $url) = @_; my $match = $e->json_query({ select => {uvuv => ['id']}, @@ -439,7 +467,7 @@ sub find_matching_url_for_attempt { # There could be multiple verifications for matching URLs # We only want a verification that completed. # Note also that 2 identical URLs processed within the same - # sub-batch will have to each be fully processed in their own + # sub-batch will have to each be fully processed in their own # right, since neither knows how the other will ultimately fare. '+uvuv' => { res_time => {'!=' => undef} @@ -507,11 +535,11 @@ sub verify_one_url { my $req = HTTP::Request->new(HEAD => $url->full_url); # simple_request avoids LWP's auto-redirect magic - my $res = $ua->simple_request($req); + my $res = $ua->simple_request($req); $logger->info(sprintf( - "url: received HTTP '%s' / '%s' [%s]", - $res->code, + "url: received HTTP '%s' / '%s' [%s]", + $res->code, $res->message, $url_text )); -- 2.11.0