runtests: better detect and handle pipe errors in the controller

Errors reading and writing to the pipes are now better detected and propagated up to the main test loop so it can be cleanly shut down. Such errors are usually due to a runner dying so it doesn't make much sense to try to continue the test run.
2023-06-15 20:10:24 -07:00 · 2023-06-15 20:10:24 -07:00 · 089a84660d
--- a/tests/runner.pm
+++ b/tests/runner.pm
@ -1171,7 +1171,7 @@ sub runner_test_run {
 # Async call runner_clearlocks
 # Called by controller
 sub runnerac_clearlocks {
-    controlleripccall(\&runner_clearlocks, @_);
+    return controlleripccall(\&runner_clearlocks, @_);
 }

 # Async call runner_shutdown
@ -1180,36 +1180,38 @@ sub runnerac_clearlocks {
 # Called by controller
 sub runnerac_shutdown {
    my ($runnerid)=$_[0];
-    controlleripccall(\&runner_shutdown, @_);
+    my $err = controlleripccall(\&runner_shutdown, @_);

    # These have no more use
    close($controllerw{$runnerid});
    undef $controllerw{$runnerid};
    close($controllerr{$runnerid});
    undef $controllerr{$runnerid};
+    return $err;
 }

 # Async call of runner_stopservers
 # Called by controller
 sub runnerac_stopservers {
-    controlleripccall(\&runner_stopservers, @_);
+    return controlleripccall(\&runner_stopservers, @_);
 }

 # Async call of runner_test_preprocess
 # Called by controller
 sub runnerac_test_preprocess {
-    controlleripccall(\&runner_test_preprocess, @_);
+    return controlleripccall(\&runner_test_preprocess, @_);
 }

 # Async call of runner_test_run
 # Called by controller
 sub runnerac_test_run {
-    controlleripccall(\&runner_test_run, @_);
+    return controlleripccall(\&runner_test_run, @_);
 }

 ###################################################################
 # Call an arbitrary function via IPC
 # The first argument is the function reference, the second is the runner ID
+# Returns 0 on success, -1 on error writing to runner
 # Called by controller (indirectly, via a more specific function)
 sub controlleripccall {
    my $funcref = shift @_;
@ -1223,30 +1225,44 @@ sub controlleripccall {
    my $margs = freeze \@_;

    # Send IPC call via pipe
-    syswrite($controllerw{$runnerid}, (pack "L", length($margs)) . $margs);
+    my $err;
+    while(! defined ($err = syswrite($controllerw{$runnerid}, (pack "L", length($margs)) . $margs)) || $err <= 0) {
+        if((!defined $err && ! $!{EINTR}) || (defined $err && $err == 0)) {
+            # Runner has likely died
+            return -1;
+        }
+        # system call was interrupted, probably by ^C; restart it so we stay in sync
+    }

    if(!$multiprocess) {
        # Call the remote function here in single process mode
        ipcrecv();
     }
+     return 0;
 }

 ###################################################################
 # Receive async response of a previous call via IPC
-# The first return value is the runner ID
+# The first return value is the runner ID or undef on error
 # Called by controller
 sub runnerar {
    my ($runnerid) = @_;
    my $err;
    my $datalen;
    while(! defined ($err = sysread($controllerr{$runnerid}, $datalen, 4)) || $err <= 0) {
-        $!{EINTR} || die "error in runnerar: $!\n";
+        if((!defined $err && ! $!{EINTR}) || (defined $err && $err == 0)) {
+            # Runner is likely dead and closed the pipe
+            return undef;
+        }
        # system call was interrupted, probably by ^C; restart it so we stay in sync
    }
    my $len=unpack("L", $datalen);
    my $buf;
    while(! defined ($err = sysread($controllerr{$runnerid}, $buf, $len)) || $err <= 0) {
-        $!{EINTR} || die "error in runnerar: $!\n";
+        if((!defined $err && ! $!{EINTR}) || (defined $err && $err == 0)) {
+            # Runner is likely dead and closed the pipe
+            return undef;
+        }
        # system call was interrupted, probably by ^C; restart it so we stay in sync
    }

@ -1260,7 +1276,9 @@ sub runnerar {
 }

 ###################################################################
-# Returns runner ID if a response from an async call is ready
+# Returns runner ID if a response from an async call is ready or error
+# First value is ready, second is error, however an error case shows up
+# as ready in Linux, so you can't trust it.
 # argument is 0 for nonblocking, undef for blocking, anything else for timeout
 # Called by controller
 sub runnerar_ready {
@ -1279,19 +1297,26 @@ sub runnerar_ready {
    $maxfileno || die "Internal error: no runners are available to wait on\n";

    # Wait for any pipe from any runner to be ready
+    # This may be interrupted and return EINTR, but this is ignored and the
+    # caller will need to later call this function again.
    # TODO: this is relatively slow with hundreds of fds
-    # TODO: handle errors
-    if(select(my $rout=$rin, undef, undef, $blocking)) {
+    my $ein = $rin;
+    if(select(my $rout=$rin, undef, my $eout=$ein, $blocking) >= 1) {
        for my $fd (0..$maxfileno) {
+            # Return an error condition first in case it's both
+            if(vec($eout, $fd, 1)) {
+                return (undef, $idbyfileno{$fd});
+            }
            if(vec($rout, $fd, 1)) {
-                return $idbyfileno{$fd};
+                return ($idbyfileno{$fd}, undef);
            }
        }
        die "Internal pipe readiness inconsistency\n";
    }
-    return undef;
+    return (undef, undef);
 }

+
 ###################################################################
 # Cleanly abort and exit the runner
 # This uses print since there is no longer any controller to write logs.
--- a/tests/runtests.pl
+++ b/tests/runtests.pl
@ -272,6 +272,7 @@ sub catch_usr1 {
 }

 $SIG{USR1} = \&catch_usr1;
+$SIG{PIPE} = 'IGNORE';  # these errors are captured in the read/write calls

 ##########################################################################
 # Clear all possible '*_proxy' environment variables for various protocols
@ -1505,12 +1506,20 @@ sub singletest_check {
            if(!$filename) {
                logmsg "ERROR: section verify=>file$partsuffix ".
                       "has no name attribute\n";
-                runnerac_stopservers($runnerid);
-                # TODO: this is a blocking call that will stall the controller,
-                # but this error condition should never happen except during
-                # development.
-                my ($rid, $unexpected, $logs) = runnerar($runnerid);
-                logmsg $logs;
+                if (runnerac_stopservers($runnerid)) {
+                    logmsg "ERROR: runner $runnerid seems to have died\n";
+                } else {
+
+                    # TODO: this is a blocking call that will stall the controller,
+                    # but this error condition should never happen except during
+                    # development.
+                    my ($rid, $unexpected, $logs) = runnerar($runnerid);
+                    if(!$rid) {
+                        logmsg "ERROR: runner $runnerid seems to have died\n";
+                    } else {
+                        logmsg $logs;
+                    }
+                }
                # timestamp test result verification end
                $timevrfyend{$testnum} = Time::HiRes::time();
                return -1;
@ -1740,7 +1749,11 @@ sub singletest {
            && $clearlocks) {
            # On Windows, lock files can't be deleted when the process still
            # has them open, so kill those processes first
-            runnerac_clearlocks($runnerid, "$logdir/$LOCKDIR");
+            if(runnerac_clearlocks($runnerid, "$logdir/$LOCKDIR")) {
+                logmsg "ERROR: runner $runnerid seems to have died\n";
+                $singletest_state{$runnerid} = ST_INIT;
+                return (-1, 0);
+            }
            $singletest_state{$runnerid} = ST_CLEARLOCKS;
        } else {
            $singletest_state{$runnerid} = ST_INITED;
@ -1751,6 +1764,11 @@ sub singletest {

    } elsif($singletest_state{$runnerid} == ST_CLEARLOCKS) {
        my ($rid, $logs) = runnerar($runnerid);
+        if(!$rid) {
+            logmsg "ERROR: runner $runnerid seems to have died\n";
+            $singletest_state{$runnerid} = ST_INIT;
+            return (-1, 0);
+        }
        logmsg $logs;
        my $logdir = getrunnerlogdir($runnerid);
        cleardir($logdir);
@ -1776,11 +1794,20 @@ sub singletest {
        # Register the test case with the CI environment
        citest_starttest($testnum);

-        runnerac_test_preprocess($runnerid, $testnum);
+        if(runnerac_test_preprocess($runnerid, $testnum)) {
+            logmsg "ERROR: runner $runnerid seems to have died\n";
+            $singletest_state{$runnerid} = ST_INIT;
+            return (-1, 0);
+        }
        $singletest_state{$runnerid} = ST_PREPROCESS;

    } elsif($singletest_state{$runnerid} == ST_PREPROCESS) {
        my ($rid, $why, $error, $logs, $testtimings) = runnerar($runnerid);
+        if(!$rid) {
+            logmsg "ERROR: runner $runnerid seems to have died\n";
+            $singletest_state{$runnerid} = ST_INIT;
+            return (-1, 0);
+        }
        logmsg $logs;
        updatetesttimings($testnum, %$testtimings);
        if($error == -2) {
@ -1813,11 +1840,20 @@ sub singletest {
        my $CURLOUT;
        my $tool;
        my $usedvalgrind;
-        runnerac_test_run($runnerid, $testnum);
+        if(runnerac_test_run($runnerid, $testnum)) {
+            logmsg "ERROR: runner $runnerid seems to have died\n";
+            $singletest_state{$runnerid} = ST_INIT;
+            return (-1, 0);
+        }
        $singletest_state{$runnerid} = ST_RUN;

    } elsif($singletest_state{$runnerid} == ST_RUN) {
        my ($rid, $error, $logs, $testtimings, $cmdres, $CURLOUT, $tool, $usedvalgrind) = runnerar($runnerid);
+        if(!$rid) {
+            logmsg "ERROR: runner $runnerid seems to have died\n";
+            $singletest_state{$runnerid} = ST_INIT;
+            return (-1, 0);
+        }
        logmsg $logs;
        updatetesttimings($testnum, %$testtimings);
        if($error == -1) {
@ -2782,8 +2818,13 @@ while () {
            # this runner is busy running a test
            $runnersrunning{$runnerid} = $testnum;
        } else {
-            # We make this assumption to avoid having to handle $error here
-            die "Internal error: test must not complete on first call";
+            runnerready($runnerid);
+            if($error >= 0) {
+                # We make this simplifying assumption to avoid having to handle
+                # $error properly here, but we must handle the case of runner
+                # death without abending here.
+                die "Internal error: test must not complete on first call";
+            }
        }
    }

@ -2799,7 +2840,15 @@ while () {
    # one immediately. If all runners are busy, wait a fraction of a second
    # for one to finish so we can still loop around to check the abort flag.
    my $runnerwait = scalar(@runnersidle) && scalar(@runtests) ? 0 : 0.5;
-    my $ridready = runnerar_ready($runnerwait);
+    my ($ridready, $riderror) = runnerar_ready($runnerwait);
+    if($ridready && ! defined $runnersrunning{$ridready}) {
+        # On Linux, a closed pipe still shows up as ready instead of error.
+        # Detect this here by seeing if we are expecting it to be ready and
+        # treat it as an error if not.
+        logmsg "ERROR: Runner $ridready is unexpectedly ready; is probably actually dead\n";
+        $riderror = $ridready;
+        undef $ridready;
+    }
    if($ridready) {
        # This runner is ready to be serviced
        my $testnum = $runnersrunning{$ridready};
@ -2812,7 +2861,6 @@ while () {
        } else {
            # Test is complete
            runnerready($ridready);
-print "COMPLETED $testnum \n" if($verbose); #. join(",", keys(%runnersrunning)) . "\n";

            if($error < 0) {
                # not a test we can run
@ -2847,6 +2895,11 @@ print "COMPLETED $testnum \n" if($verbose); #. join(",", keys(%runnersrunning))
            }
        }
    }
+    if($riderror) {
+        logmsg "ERROR: runner $riderror is dead! aborting test run\n";
+        delete $runnersrunning{$riderror} if(defined $runnersrunning{$riderror});
+        $globalabort = 1;
+    }
 }

 my $sofar = time() - $start;