M. J. Fromberger 061e6266cf
util/eventbus: allow logging of slow subscribers (#17705)
Add options to the eventbus.Bus to plumb in a logger.

Route that logger in to the subscriber machinery, and trigger a log message to
it when a subscriber fails to respond to its delivered events for 5s or more.

The log message includes the package, filename, and line number of the call
site that created the subscription.

Add tests that verify this works.

Updates #17680

Change-Id: I0546516476b1e13e6a9cf79f19db2fe55e56c698
Signed-off-by: M. J. Fromberger <fromberger@tailscale.com>
2025-10-30 14:40:57 -07:00

339 lines
7.9 KiB
Go

// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
package eventbus
import (
"context"
"log"
"reflect"
"slices"
"sync"
"tailscale.com/types/logger"
"tailscale.com/util/set"
)
type PublishedEvent struct {
Event any
From *Client
}
type RoutedEvent struct {
Event any
From *Client
To []*Client
}
// Bus is an event bus that distributes published events to interested
// subscribers.
type Bus struct {
router *worker
write chan PublishedEvent
snapshot chan chan []PublishedEvent
routeDebug hook[RoutedEvent]
logf logger.Logf
topicsMu sync.Mutex
topics map[reflect.Type][]*subscribeState
// Used for introspection/debugging only, not in the normal event
// publishing path.
clientsMu sync.Mutex
clients set.Set[*Client]
}
// New returns a new bus with default options. It is equivalent to
// calling [NewWithOptions] with zero [BusOptions].
func New() *Bus { return NewWithOptions(BusOptions{}) }
// NewWithOptions returns a new [Bus] with the specified [BusOptions].
// Use [Bus.Client] to construct clients on the bus.
// Use [Publish] to make event publishers.
// Use [Subscribe] and [SubscribeFunc] to make event subscribers.
func NewWithOptions(opts BusOptions) *Bus {
ret := &Bus{
write: make(chan PublishedEvent),
snapshot: make(chan chan []PublishedEvent),
topics: map[reflect.Type][]*subscribeState{},
clients: set.Set[*Client]{},
logf: opts.logger(),
}
ret.router = runWorker(ret.pump)
return ret
}
// BusOptions are optional parameters for a [Bus]. A zero value is ready for
// use and provides defaults as described.
type BusOptions struct {
// Logf, if non-nil, is used for debug logs emitted by the bus and clients,
// publishers, and subscribers under its care. If it is nil, logs are sent
// to [log.Printf].
Logf logger.Logf
}
func (o BusOptions) logger() logger.Logf {
if o.Logf == nil {
return log.Printf
}
return o.Logf
}
// Client returns a new client with no subscriptions. Use [Subscribe]
// to receive events, and [Publish] to emit events.
//
// The client's name is used only for debugging, to tell humans what
// piece of code a publisher/subscriber belongs to. Aim for something
// short but unique, for example "kernel-route-monitor" or "taildrop",
// not "watcher".
func (b *Bus) Client(name string) *Client {
ret := &Client{
name: name,
bus: b,
pub: set.Set[publisher]{},
}
b.clientsMu.Lock()
defer b.clientsMu.Unlock()
b.clients.Add(ret)
return ret
}
// Debugger returns the debugging facility for the bus.
func (b *Bus) Debugger() *Debugger {
return &Debugger{b}
}
// Close closes the bus. It implicitly closes all clients, publishers and
// subscribers attached to the bus.
//
// Close blocks until the bus is fully shut down. The bus is
// permanently unusable after closing.
func (b *Bus) Close() {
b.router.StopAndWait()
b.clientsMu.Lock()
defer b.clientsMu.Unlock()
for c := range b.clients {
c.Close()
}
b.clients = nil
}
func (b *Bus) pump(ctx context.Context) {
var vals queue[PublishedEvent]
acceptCh := func() chan PublishedEvent {
if vals.Full() {
return nil
}
return b.write
}
for {
// Drain all pending events. Note that while we're draining
// events into subscriber queues, we continue to
// opportunistically accept more incoming events, if we have
// queue space for it.
for !vals.Empty() {
val := vals.Peek()
dests := b.dest(reflect.ValueOf(val.Event).Type())
if b.routeDebug.active() {
clients := make([]*Client, len(dests))
for i := range len(dests) {
clients[i] = dests[i].client
}
b.routeDebug.run(RoutedEvent{
Event: val.Event,
From: val.From,
To: clients,
})
}
for _, d := range dests {
evt := DeliveredEvent{
Event: val.Event,
From: val.From,
To: d.client,
}
deliverOne:
for {
select {
case d.write <- evt:
break deliverOne
case <-d.closed():
// Queue closed, don't block but continue
// delivering to others.
break deliverOne
case in := <-acceptCh():
vals.Add(in)
in.From.publishDebug.run(in)
case <-ctx.Done():
return
case ch := <-b.snapshot:
ch <- vals.Snapshot()
}
}
}
vals.Drop()
}
// Inbound queue empty, wait for at least 1 work item before
// resuming.
for vals.Empty() {
select {
case <-ctx.Done():
return
case in := <-b.write:
vals.Add(in)
in.From.publishDebug.run(in)
case ch := <-b.snapshot:
ch <- nil
}
}
}
}
// logger returns a [logger.Logf] to which logs related to bus activity should be written.
func (b *Bus) logger() logger.Logf { return b.logf }
func (b *Bus) dest(t reflect.Type) []*subscribeState {
b.topicsMu.Lock()
defer b.topicsMu.Unlock()
return b.topics[t]
}
func (b *Bus) shouldPublish(t reflect.Type) bool {
if b.routeDebug.active() {
return true
}
b.topicsMu.Lock()
defer b.topicsMu.Unlock()
return len(b.topics[t]) > 0
}
func (b *Bus) listClients() []*Client {
b.clientsMu.Lock()
defer b.clientsMu.Unlock()
return b.clients.Slice()
}
func (b *Bus) snapshotPublishQueue() []PublishedEvent {
resp := make(chan []PublishedEvent)
select {
case b.snapshot <- resp:
return <-resp
case <-b.router.Done():
return nil
}
}
func (b *Bus) subscribe(t reflect.Type, q *subscribeState) (cancel func()) {
b.topicsMu.Lock()
defer b.topicsMu.Unlock()
b.topics[t] = append(b.topics[t], q)
return func() {
b.unsubscribe(t, q)
}
}
func (b *Bus) unsubscribe(t reflect.Type, q *subscribeState) {
b.topicsMu.Lock()
defer b.topicsMu.Unlock()
// Topic slices are accessed by pump without holding a lock, so we
// have to replace the entire slice when unsubscribing.
// Unsubscribing should be infrequent enough that this won't
// matter.
i := slices.Index(b.topics[t], q)
if i < 0 {
return
}
b.topics[t] = slices.Delete(slices.Clone(b.topics[t]), i, i+1)
}
// A worker runs a worker goroutine and helps coordinate its shutdown.
type worker struct {
ctx context.Context
stop context.CancelFunc
stopped chan struct{}
}
// runWorker creates a worker goroutine running fn. The context passed
// to fn is canceled by [worker.Stop].
func runWorker(fn func(context.Context)) *worker {
ctx, stop := context.WithCancel(context.Background())
ret := &worker{
ctx: ctx,
stop: stop,
stopped: make(chan struct{}),
}
go ret.run(fn)
return ret
}
func (w *worker) run(fn func(context.Context)) {
defer close(w.stopped)
fn(w.ctx)
}
// Stop signals the worker goroutine to shut down.
func (w *worker) Stop() { w.stop() }
// Done returns a channel that is closed when the worker goroutine
// exits.
func (w *worker) Done() <-chan struct{} { return w.stopped }
// Wait waits until the worker goroutine has exited.
func (w *worker) Wait() { <-w.stopped }
// StopAndWait signals the worker goroutine to shut down, then waits
// for it to exit.
func (w *worker) StopAndWait() {
w.stop()
<-w.stopped
}
// stopFlag is a value that can be watched for a notification. The
// zero value is ready for use.
//
// The flag is notified by running [stopFlag.Stop]. Stop can be called
// multiple times. Upon the first call to Stop, [stopFlag.Done] is
// closed, all pending [stopFlag.Wait] calls return, and future Wait
// calls return immediately.
//
// A stopFlag can only notify once, and is intended for use as a
// one-way shutdown signal that's lighter than a cancellable
// context.Context.
type stopFlag struct {
// guards the lazy construction of stopped, and the value of
// alreadyStopped.
mu sync.Mutex
stopped chan struct{}
alreadyStopped bool
}
func (s *stopFlag) Stop() {
s.mu.Lock()
defer s.mu.Unlock()
if s.alreadyStopped {
return
}
s.alreadyStopped = true
if s.stopped == nil {
s.stopped = make(chan struct{})
}
close(s.stopped)
}
func (s *stopFlag) Done() <-chan struct{} {
s.mu.Lock()
defer s.mu.Unlock()
if s.stopped == nil {
s.stopped = make(chan struct{})
}
return s.stopped
}
func (s *stopFlag) Wait() {
<-s.Done()
}