URLVerify.pm; docs; url domain looping; cleanup

author Bill Erickson <berick@esilibrary.com>

Fri, 10 Aug 2012 13:10:52 +0000 (09:10 -0400)

committer Bill Erickson <berick@esilibrary.com>

Fri, 10 Aug 2012 13:10:52 +0000 (09:10 -0400)
author Bill Erickson <berick@esilibrary.com>
Fri, 10 Aug 2012 13:10:52 +0000 (09:10 -0400)
committer Bill Erickson <berick@esilibrary.com>
Fri, 10 Aug 2012 13:10:52 +0000 (09:10 -0400)
diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/URLVerify.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/URLVerify.pm

index 23a4b84..c02af13 100644 (file)
--- a/Open-ILS/src/perlmods/lib/OpenILS/Application/URLVerify.pm
+++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/URLVerify.pm
@@ -118,17 +118,6 @@ sub validate_session {
  
          my $ids = $e->json_query($query);
          $url_ids = [ map {$_->{id}} @$ids ];
-
-        # this is kinda hinky and probably an abuse of order_by, but 
-        # shuffling is good for spreading out domains and this avoids 
-        # the necessity of loading all the URLs (could be lots) and 
-        # shuffling them here.
-
-        $url_ids = $e->search_url_verify_url(
-            [   {id => $url_ids},
-                {order_by => {uvu => 'RANDOM()'}}
-            ],  {idlist => 1}
-        );
      }
  
      my $url_count = scalar(@$url_ids);
@@ -161,7 +150,7 @@ sub validate_session {
      # Now cycle through the URLs in batches.
  
      my $batch_size = $U->ou_ancestor_setting_value(
-        $session->owning_lib, 
+        $session->owning_lib,
          'url_verify.verification_batch_size', $e) || 5;
  
      my $num_processed = 0; # total number processed, including redirects
@@ -217,10 +206,7 @@ sub validate_session {
          }
      );
  
-    # Queue up the requests and let them fly
-    $multises->request(
-        'open-ils.url_verify.verify_url',
-        $auth, $attempt->id, $_) for @$url_ids;
+    sort_and_fire_domains($e, $auth, $attempt, $url_ids, $multises);
  
      # Wait for all requests to be completed
      $multises->session_wait(1);
@@ -239,6 +225,48 @@ sub validate_session {
      };
  }
  
+# Dispatches one verify_url request per URL ID, interleaving domains.
+# URL IDs are grouped by domain; each pass then takes a single ID from
+# every domain that still has some, so back-to-back requests are spread
+# across hosts.  Known drawback: a domain with far more URLs than the
+# others is left alone once the rest are exhausted and takes a beating
+# at the end of the batch.  $url_ids is an arrayref of uvu IDs; the sub
+# is called for its side effects only.
+sub sort_and_fire_domains {
+    my ($e, $auth, $attempt, $url_ids, $multises) = @_;
+
+    # Only id and domain are fetched, so the result set should rarely
+    # be too large for delivery; add {substream => 1} only if needed.
+    my %id_domain_query = (
+        select => {uvu => ['id', 'domain']},
+        from   => 'uvu',
+        where  => {id => $url_ids},
+    );
+    my $rows = $e->json_query(\%id_domain_query);
+
+    # domain => [URL IDs]; the first push creates each bucket
+    my %ids_for;
+    for my $row (@$rows) {
+        push @{ $ids_for{ $row->{domain} } }, $row->{id};
+    }
+
+    # Round-robin over the domains.  keys() hands back a copy of the key
+    # list, so deleting a drained domain inside the inner loop is safe.
+    while (%ids_for) {
+        for my $domain (keys %ids_for) {
+            my $pending = $ids_for{$domain};
+            my $url_id  = pop @$pending;
+
+            # forget the domain once its last ID has been taken
+            delete $ids_for{$domain} unless @$pending;
+
+            $multises->request(
+                'open-ils.url_verify.verify_url',
+                $auth, $attempt->id, $url_id);
+        }
+    }
+}
+
  
  __PACKAGE__->register_method(
      method => 'verify_url',
@@ -347,7 +375,7 @@ sub verify_url {
      return undef;
  }
  
-# temporarily cache some data to avoid a pile 
+# temporarily cache some data to avoid a pile
  # of data lookups on every URL processed.
  my %cache;
  sub collect_verify_attempt_and_settings {
@@ -366,11 +394,11 @@ sub collect_verify_attempt_and_settings {
  
      if ( !($attempt = $cache{attempt}{$attempt_id}) ) {
  
-        # attempt may have just been created, so 
+        # attempt may have just been created, so
          # we need to guarantee a write-DB read.
          $e->xact_begin;
  
-        $attempt = 
+        $attempt =
              $e->retrieve_url_verify_verification_attempt([
                  $attempt_id, {
                      flesh => 1,
@@ -416,11 +444,11 @@ sub collect_verify_attempt_and_settings {
  }
  
  
-# searches for a completed url_verfication for any url processed 
-# within this verification attempt whose full_url matches the 
+# searches for a completed url_verification for any url processed
+# within this verification attempt whose full_url matches the
  # full_url of the provided URL.
  sub find_matching_url_for_attempt {
-    my ($e, $attempt, $url) = @_; 
+    my ($e, $attempt, $url) = @_;
  
      my $match = $e->json_query({
          select => {uvuv => ['id']},
@@ -439,7 +467,7 @@ sub find_matching_url_for_attempt {
              # There could be multiple verifications for matching URLs
              # We only want a verification that completed.
              # Note also that 2 identical URLs processed within the same
-            # sub-batch will have to each be fully processed in their own 
+            # sub-batch will have to each be fully processed in their own
              # right, since neither knows how the other will ultimately fare.
              '+uvuv' => {
                  res_time => {'!=' => undef}
@@ -507,11 +535,11 @@ sub verify_one_url {
      my $req = HTTP::Request->new(HEAD => $url->full_url);
  
      # simple_request avoids LWP's auto-redirect magic
-    my $res = $ua->simple_request($req); 
+    my $res = $ua->simple_request($req);
  
      $logger->info(sprintf(
-        "url: received HTTP '%s' / '%s' [%s]", 
-        $res->code, 
+        "url: received HTTP '%s' / '%s' [%s]",
+        $res->code,
          $res->message,
          $url_text
      ));
author	Bill Erickson <berick@esilibrary.com>
	Fri, 10 Aug 2012 13:10:52 +0000 (09:10 -0400)
committer	Bill Erickson <berick@esilibrary.com>
	Fri, 10 Aug 2012 13:10:52 +0000 (09:10 -0400)