From: Galen Charlton Date: Thu, 2 Dec 2021 16:48:48 +0000 (-0500) Subject: LP#1953057: improve reset-after-crash handling for Perl apps X-Git-Url: https://old-git.evergreen-ils.org/?a=commitdiff_plain;h=a55887ef21ea2c9a699810a672f5b7cb88deb884;p=working%2FOpenSRF.git LP#1953057: improve reset-after-crash handling for Perl apps This patch more forcibly reaps children and resets the active and idle lists after an uncaught exception in the main run method triggers a reset of the service. This avoids an issue where such a reset would result in the listener thinking it has more running drones than is actually the case, which can make it think it's hit max_children limits sooner than is actually the case. To test ------- [1] Do something to arrange for a busy Perl service to throw an exception in the listener, such as setting up a replication of bug 1953044 or adding code to cause the listener to randomly throw an exception. [2] Observe that the listener does not fully clean up the drones it has at the point of the crash, and consequently may think post-reset that it is closer to the max_children limit than is actually the case. One symptom of this is requests getting added to the backlog queue unecessarily. [3] Apply the patch and repeat step 1. This time, the service should reset itself properly and handle requests and drones post-reset as expected. Signed-off-by: Galen Charlton --- diff --git a/src/perl/lib/OpenSRF/Server.pm b/src/perl/lib/OpenSRF/Server.pm index 52c53d2..53f97c8 100644 --- a/src/perl/lib/OpenSRF/Server.pm +++ b/src/perl/lib/OpenSRF/Server.pm @@ -99,7 +99,17 @@ sub cleanup { $self->{osrf_handle}->disconnect; # clean up our dead children - $self->reap_children(1); + if ($no_exit) { + # reap children in a way that anticipates keeping + # the listener going + $self->forcibly_reap_children(); + } else { + # we're not sticking around, so if setting + # $SIG{CHLD} to ignore means that reap_children() + # doesn't detect the auto-reaped ones, we don't + # care + $self->reap_children(1); + } exit(0) unless $no_exit; } @@ -515,6 +525,44 @@ sub reap_children { } # ---------------------------------------------------------------- +# Cleans up any child processes that (hopefully) have exited in +# the event that a no-exit cleanup was requested (i.e., if the listener +# hit an exception but we're restarting things rather than letting +# the listener die). Note that it's possible that (effectively) +# orphans could be left over. +# ---------------------------------------------------------------- +sub forcibly_reap_children { + my ($self) = @_; + + $chatty and $logger->warn('reaping after a listener crash'); + + # try a normal reap... + $self->reap_children(1); + + # ...then apply the hammer + foreach my $pid (keys %{ $self->{pid_map} }) { + my $child = $self->{pid_map}->{$pid}; + + close($child->{pipe_to_parent}); + close($child->{pipe_to_child}); + + delete $self->{pid_map}->{$pid}; + # unlike the normal reaping process, we don't need + # to put this on a zombie list to clean up later; if we + # get here, we're not in the main listener run loop + delete $child->{$_} for keys %$child; # destroy with a vengeance + } + + # Reset the lists. It's possible that leftover children from the + # previous ->run() invocation are still around, but there's not much + # we can do about those leaks at this point. + $self->{num_children} = 0; + $self->{active_list} = []; + $self->{idle_list} = []; + +} + +# ---------------------------------------------------------------- # Spawn up to max_children processes # ---------------------------------------------------------------- sub spawn_children {