runtests: improve error logging

Give more information about test harness error conditions to help figure out what might be wrong. Print some internal test state when SIGUSR1 is sent to runtests.pl. Ref: #11328
2023-06-14 15:27:07 -07:00 · 2023-06-14 15:27:07 -07:00 · 2fe52412b1
--- a/tests/runner.pm
+++ b/tests/runner.pm
@ -185,9 +185,10 @@ sub runner_init {
        # Create a separate process in multiprocess mode
        my $child = fork();
        if(0 == $child) {
-            # TODO: set up a better signal handler
+            # TODO: set up better signal handlers
            $SIG{INT} = 'IGNORE';
            $SIG{TERM} = 'IGNORE';
+            $SIG{USR1} = 'IGNORE';

            $thisrunnerid = $$;
            print "Runner $thisrunnerid starting\n" if($verbose);
@ -1270,6 +1271,7 @@ sub runnerar_ready {
            $maxfileno = $fd;
        }
    }
+    $maxfileno || die "Internal error: no runners are available to wait on\n";

    # Wait for any pipe from any runner to be ready
    # TODO: this is relatively slow with hundreds of fds
@ -1293,13 +1295,13 @@ sub ipcrecv {
    my $err;
    my $datalen;
    while(! defined ($err = sysread($runnerr, $datalen, 4)) || $err <= 0) {
-        $!{EINTR} || die "error in ipcrecv: $!\n";
+        $!{EINTR} || die "error $err in ipcrecv: $! in runner $$ for $LOGDIR\n";
        # system call was interrupted, probably by ^C; restart it so we stay in sync
    }
    my $len=unpack("L", $datalen);
    my $buf;
    while(! defined ($err = sysread($runnerr, $buf, $len)) || $err <= 0) {
-        $!{EINTR} || die "error in ipcrecv: $!\n";
+        $!{EINTR} || die "error $err in ipcrecv: $! in runner $$ for $LOGDIR\n";
        # system call was interrupted, probably by ^C; restart it so we stay in sync
    }

@ -1336,7 +1338,7 @@ sub ipcrecv {
    # Marshall the results to return
    $buf = freeze \@res;

-    syswrite($runnerw, (pack "L", length($buf)) . $buf);
+    defined syswrite($runnerw, (pack "L", length($buf)) . $buf) || $!{EINTR} || die "error $err in ipcrecv write: $! in runner $$ for $LOGDIR\n";

    return 0;
 }
--- a/tests/runtests.pl
+++ b/tests/runtests.pl
@ -250,12 +250,29 @@ sub singletest_dumplogs {

 sub catch_zap {
    my $signame = shift;
-    logmsg "runtests.pl received SIG$signame, exiting\n";
+    print "runtests.pl received SIG$signame, exiting\r\n";
    $globalabort = 1;
 }
 $SIG{INT} = \&catch_zap;
 $SIG{TERM} = \&catch_zap;

+sub catch_usr1 {
+    print "runtests.pl internal state:\r\n";
+    print scalar(%runnersrunning) . " busy test runner(s) of " . scalar(keys %runnerids) . "\r\n";
+    foreach my $rid (sort(keys(%runnersrunning))) {
+        my $runnernum = "unknown";
+        foreach my $rnum (keys %runnerids) {
+            if($runnerids{$rnum} == $rid) {
+                $runnernum = $rnum;
+                last;
+            }
+        }
+        print "Runner $runnernum (id $rid) running test $runnersrunning{$rid} in state $singletest_state{$rid}\r\n";
+    }
+}
+
+$SIG{USR1} = \&catch_usr1;
+
 ##########################################################################
 # Clear all possible '*_proxy' environment variables for various protocols
 # to prevent them to interfere with our testing!
@ -2734,14 +2751,17 @@ while () {
    if($globalabort) {
        logmsg singletest_dumplogs();
        logmsg "Aborting tests\n";
-        logmsg "Waiting for tests to finish...\n";
+        logmsg "Waiting for " . scalar((keys %runnersrunning)) . " outstanding test(s) to finish...\n";
        # Wait for the last requests to complete and throw them away so
        # that IPC calls & responses stay in sync
        # TODO: send a signal to the runners to interrupt a long test
        foreach my $rid (keys %runnersrunning) {
            runnerar($rid);
            delete $runnersrunning{$rid};
+            logmsg ".";
+            $| = 1;
        }
+        logmsg "\n";
        last;
    }

@ -2770,7 +2790,7 @@ while () {
    # See if we've completed all the tests
    if(!scalar(%runnersrunning)) {
        # No runners are running; we must be done
-        scalar(@runtests) && die 'Internal error: tests to run';
+        scalar(@runtests) && die 'Internal error: still have tests to run';
        last;
    }

@ -2783,7 +2803,7 @@ while () {
    if($ridready) {
        # This runner is ready to be serviced
        my $testnum = $runnersrunning{$ridready};
-        defined $testnum ||  die 'Internal error: test for runner unknown';
+        defined $testnum ||  die "Internal error: test for runner $ridready unknown";
        delete $runnersrunning{$ridready};
        my ($error, $again) = singletest($ridready, $testnum, $countforrunner{$ridready}, $totaltests);
        if($again) {