src/os/signal/signal_test.go | 42 ++++++++++++++++++++++++++++++++++++++++++ src/runtime/proc.go | 52 ++++++++++++++++++++++++++++++++++++++++++++++++---- src/runtime/runtime2.go | 7 +++++-- src/runtime/sigqueue.go | 2 +- diff --git a/src/os/signal/signal_test.go b/src/os/signal/signal_test.go index bbc68af9fbdf4d60a663ec420d87098500bfd6a9..9e5ee8bd88e941152e07e5cf655d1023715b2845 100644 --- a/src/os/signal/signal_test.go +++ b/src/os/signal/signal_test.go @@ -15,6 +15,7 @@ "internal/testenv" "os" "os/exec" "runtime" + "runtime/trace" "strconv" "sync" "syscall" @@ -853,3 +854,44 @@ if got := fmt.Sprint(c); got != want { t.Errorf("c.String() = %q, want %q", got, want) } } + +// #44193 test signal handling while stopping and starting the world. +func TestSignalTrace(t *testing.T) { + done := make(chan struct{}) + quit := make(chan struct{}) + c := make(chan os.Signal, 1) + Notify(c, syscall.SIGHUP) + + // Source and sink for signals busy loop unsynchronized with + // trace starts and stops. We are ultimately validating that + // signals and runtime.(stop|start)TheWorldGC are compatible. + go func() { + defer close(done) + defer Stop(c) + pid := syscall.Getpid() + for { + select { + case <-quit: + return + default: + syscall.Kill(pid, syscall.SIGHUP) + } + waitSig(t, c, syscall.SIGHUP) + } + }() + + for i := 0; i < 100; i++ { + buf := new(bytes.Buffer) + if err := trace.Start(buf); err != nil { + t.Fatalf("[%d] failed to start tracing: %v", i, err) + } + time.After(1 * time.Microsecond) + trace.Stop() + size := buf.Len() + if size == 0 { + t.Fatalf("[%d] trace is empty", i) + } + } + close(quit) + <-done +} diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 73a789c18973e793bda325d3769bfc0ecbb0e98c..cf9770587aa2c2f817a563f615bc527c0259527c 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -1338,6 +1338,9 @@ func mPark() { g := getg() for { notesleep(&g.m.park) + // Note, because of signal handling by this parked m, + // a preemptive mDoFixup() may actually occur via + // mDoFixupAndOSYield(). (See golang.org/issue/44193) noteclear(&g.m.park) if !mDoFixup() { return @@ -1571,6 +1574,22 @@ } for atomic.Load(&sched.sysmonStarting) != 0 { osyield() } + + // We don't want this thread to handle signals for the + // duration of this critical section. The underlying issue + // being that this locked coordinating m is the one monitoring + // for fn() execution by all the other m's of the runtime, + // while no regular go code execution is permitted (the world + // is stopped). If this present m were to get distracted to + // run signal handling code, and find itself waiting for a + // second thread to execute go code before being able to + // return from that signal handling, a deadlock will result. + // (See golang.org/issue/44193.) + lockOSThread() + var sigmask sigset + sigsave(&sigmask) + sigblock(false) + stopTheWorldGC("doAllThreadsSyscall") if atomic.Load(&newmHandoff.haveTemplateThread) != 0 { // Ensure that there are no in-flight thread @@ -1622,6 +1641,7 @@ // all the threads, so we lock here to avoid // the possibility of racing with mp. lock(&mp.mFixup.lock) mp.mFixup.fn = fn + atomic.Store(&mp.mFixup.used, 1) if mp.doesPark { // For non-service threads this will // cause the wakeup to be short lived @@ -1638,9 +1658,7 @@ for mp := allm; done && mp != nil; mp = mp.alllink { if mp.procid == tid { continue } - lock(&mp.mFixup.lock) - done = done && (mp.mFixup.fn == nil) - unlock(&mp.mFixup.lock) + done = atomic.Load(&mp.mFixup.used) == 0 } if done { break @@ -1667,6 +1685,8 @@ mFixupRace.ctx = 0 unlock(&mFixupRace.lock) } startTheWorldGC() + msigrestore(sigmask) + unlockOSThread() } // runSafePointFn runs the safe point function, if any, for this P. @@ -2157,9 +2177,21 @@ // mDoFixup runs any outstanding fixup function for the running m. // Returns true if a fixup was outstanding and actually executed. // +// Note: to avoid deadlocks, and the need for the fixup function +// itself to be async safe, signals are blocked for the working m +// while it holds the mFixup lock. (See golang.org/issue/44193) +// //go:nosplit func mDoFixup() bool { _g_ := getg() + if used := atomic.Load(&_g_.m.mFixup.used); used == 0 { + return false + } + + // slow path - if fixup fn is used, block signals and lock. + var sigmask sigset + sigsave(&sigmask) + sigblock(false) lock(&_g_.m.mFixup.lock) fn := _g_.m.mFixup.fn if fn != nil { @@ -2176,7 +2208,6 @@ // the GC is off so this lack of write barrier // is more obviously safe. throw("GC must be disabled to protect validity of fn value") } - *(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0 if _g_.racectx != 0 || !raceenabled { fn(false) } else { @@ -2191,9 +2222,22 @@ fn(false) _g_.racectx = 0 unlock(&mFixupRace.lock) } + *(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0 + atomic.Store(&_g_.m.mFixup.used, 0) } unlock(&_g_.m.mFixup.lock) + msigrestore(sigmask) return fn != nil +} + +// mDoFixupAndOSYield is called when an m is unable to send a signal +// because the allThreadsSyscall mechanism is in progress. That is, an +// mPark() has been interrupted with this signal handler so we need to +// ensure the fixup is executed from this context. +//go:nosplit +func mDoFixupAndOSYield() { + mDoFixup() + osyield() } // templateThread is a thread in a known-good state that exists solely diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 109f0da1311957ca96e60d1e19f39b3a1be64e79..9a032d8658c707ca28d11c6a1560ed29997cbadc 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -537,10 +537,13 @@ startingtrace bool syscalltick uint32 freelink *m // on sched.freem - // mFixup is used to synchronize OS related m state (credentials etc) - // use mutex to access. + // mFixup is used to synchronize OS related m state + // (credentials etc) use mutex to access. To avoid deadlocks + // an atomic.Load() of used being zero in mDoFixupFn() + // guarantees fn is nil. mFixup struct { lock mutex + used uint32 fn func(bool) bool } diff --git a/src/runtime/sigqueue.go b/src/runtime/sigqueue.go index 28b9e26d0fd43d875fd1830666ef4f40632f7f51..6bed64e51ae1307bdb59e950d9959a3875870312 100644 --- a/src/runtime/sigqueue.go +++ b/src/runtime/sigqueue.go @@ -119,7 +119,7 @@ break Send } case sigFixup: // nothing to do - we need to wait for sigIdle. - osyield() + mDoFixupAndOSYield() } }