URLVerify.pm; redirects / error handling
author Bill Erickson <berick@esilibrary.com>
Wed, 8 Aug 2012 19:38:39 +0000 (15:38 -0400)
committer Bill Erickson <berick@esilibrary.com>
Wed, 8 Aug 2012 19:38:39 +0000 (15:38 -0400)
Signed-off-by: Bill Erickson <berick@esilibrary.com>
Open-ILS/src/perlmods/lib/OpenILS/Application/URLVerify.pm

index 973c522..b7d4b4f 100644 (file)
@@ -26,25 +26,25 @@ __PACKAGE__->register_method(
             {
                 desc => q/
                     Options (optional).
-                        report_all => bypass response throttling and return all URL sub-process 
-                            responses to the caller.  Not recommened for remote (web, etc.) clients, 
+                        report_all => bypass response throttling and return all URL sub-process
+                            responses to the caller.  Not recommened for remote (web, etc.) clients,
                             because it can be a lot of data.
                         resume_attempt => atttempt_id.  Resume verification after a failure.
-                        resume_with_new_attempt => If true, resume from resume_attempt, but 
+                        resume_with_new_attempt => If true, resume from resume_attempt, but
                             create a new attempt to track the resumption.
                     /,
                 type => 'hash'
             }
         ],
         return => {desc => q/
-            Stream of objects containing the number of URLs to be processed (url_count), 
-            the number processed thus far including redirects (total_processed), 
-            and the current url_verification object (current_verification).  
-            
-            Note that total_processed may ultimately exceed url_count, since it 
+            Stream of objects containing the number of URLs to be processed (url_count),
+            the number processed thus far including redirects (total_processed),
+            and the current url_verification object (current_verification).
+
+            Note that total_processed may ultimately exceed url_count, since it
             includes non-anticipate-able redirects.
 
-            The final response contains url_count, total_processed, and the 
+            The final response contains url_count, total_processed, and the
             verification_attempt object (attempt).
             /
         }
@@ -64,7 +64,7 @@ sub validate_session {
     return $e->die_event unless $e->checkauth;
     return $e->die_event unless $e->allowed('VERIFY_URL');
 
-    my $session = $e->retrieve_url_verify_session($session_id) 
+    my $session = $e->retrieve_url_verify_session($session_id)
         or return $e->die_event;
 
     my $attempt_id = $options->{resume_attempt};
@@ -79,8 +79,8 @@ sub validate_session {
                         join => { cbreb => { # bucket
                             join => { uvs => { # session
                                 filter => {id => $session_id}
-                            }} 
-                        }} 
+                            }}
+                        }}
                     }
                 }
             }
@@ -90,7 +90,7 @@ sub validate_session {
             $logger->info("url: resuming attempt $attempt_id");
 
             # when resuming an existing attempt (that presumably failed
-            # mid-processing), we only want to process URLs that either 
+            # mid-processing), we only want to process URLs that either
             # have no linked url_verification or have an un-completed
             # url_verification.
 
@@ -107,6 +107,15 @@ sub validate_session {
                     ]
                 }
             };
+
+        } else {
+
+            # this is a new run, so we only want to process URLs that
+            # originated from the source records and not from redirects.
+
+            $query->{where} = {
+                '+uvu' => {redirect_from => undef}
+            };
         }
 
         my $ids = $e->json_query($query);
@@ -119,7 +128,7 @@ sub validate_session {
     my $attempt;
     if ($attempt_id and !$options->{resume_with_new_attempt}) {
 
-        $attempt = $e->retrieve_url_verification_attempt($attempt_id) 
+        $attempt = $e->retrieve_url_verification_attempt($attempt_id)
             or return $e->die_event;
 
         # no data was written
@@ -132,13 +141,13 @@ sub validate_session {
         $attempt->usr($e->requestor->id);
         $attempt->start_time('now');
 
-        $e->create_url_verify_verification_attempt($attempt) 
+        $e->create_url_verify_verification_attempt($attempt)
             or return $e->die_event;
 
         $e->commit;
     }
 
-    # END DB TRANSACTION 
+    # END DB TRANSACTION
 
     # Now cycle through the URLs in batches.
 
@@ -146,7 +155,7 @@ sub validate_session {
     my $num_processed = 0; # total number processed, including redirects
     my $resp_window = 1;
 
-    # before we start the real work, let the caller know 
+    # before we start the real work, let the caller know
     # the attempt (id) so recovery is possible.
 
     $client->respond({
@@ -326,9 +335,9 @@ sub verify_url {
         $e->xact_commit;
     }
 
-    # The calling code is likely not multi-threaded, so a 
-    # per-URL (i.e. per-thread) delay would not be possible.  
-    # Applying the delay here allows the caller to process 
+    # The calling code is likely not multi-threaded, so a
+    # per-URL (i.e. per-thread) delay would not be possible.
+    # Applying the delay here allows the caller to process
     # batches of URLs without having to worry about the delay.
     sleep $delay;
 
@@ -369,7 +378,11 @@ sub verify_one_url {
 
     # Now test the URL.
 
-    my $req = Net::HTTP::NB->new(Host => $url->host);
+    my $req;
+    eval {
+        # uses 'die' internally
+        $req = Net::HTTP::NB->new(Host => $url->host);
+    };
 
     if ($req) {
 
@@ -425,12 +438,16 @@ sub verify_one_url {
 
             # request timed out
             $logger->info("url: request timed out for $url_text");
+
+            $vcation->res_code('997');
+            $vcation->res_text('Request Timeout');
         }
 
     } else {
 
         # Error building connection.  Invalid hostname, etc.
 
+        $logger->info("url: error building connection: $@");
         $vcation->res_code('999');
         $vcation->res_text($@);
     }