prometheus/notifier/manager_test.go
Siavash Safi 2d5d239883
feat(notifier): independent alertmanager queues
Independent Alertmanager queues avoid issues with the queue overflowing when
one or more Alertmanager instances are unavailable, which could result in
lost alert notifications.
The buffered queues are managed per AlertmanagerSet and are dynamically
added/removed with service discovery or configuration reload.

The following metrics now include an extra dimension for the alertmanager label:
- prometheus_notifications_dropped_total
- prometheus_notifications_queue_capacity
- prometheus_notifications_queue_length

This change also includes the test from #14099

Closes #7676

Signed-off-by: Siavash Safi <siavash@cloudflare.com>
2025-06-18 10:15:53 +02:00
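
For illustration, here is a minimal, self-contained sketch of the idea behind this change, not the notifier's actual implementation: each Alertmanager endpoint gets its own bounded buffer and send loop, so a slow or unavailable endpoint only fills (and overflows) its own queue, and drops are counted per endpoint via an "alertmanager" label. The endpointQueue, enqueue, and sendLoop names and the example metric name below are hypothetical; the real code lives in the notifier package's buffer and alertmanagerSet types exercised by the test below.

package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// alert stands in for the notifier's Alert type.
type alert struct{ name string }

// droppedTotal mirrors the idea of prometheus_notifications_dropped_total
// gaining an "alertmanager" label; the metric name here is illustrative only.
var droppedTotal = prometheus.NewCounterVec(
	prometheus.CounterOpts{Name: "example_notifications_dropped_total", Help: "Example only."},
	[]string{"alertmanager"},
)

// endpointQueue is a hypothetical bounded, per-Alertmanager buffer.
type endpointQueue struct {
	url string
	ch  chan alert
}

func newEndpointQueue(url string, capacity int) *endpointQueue {
	return &endpointQueue{url: url, ch: make(chan alert, capacity)}
}

// enqueue never blocks: when the buffer is full, the oldest alert is dropped,
// so a stuck endpoint cannot back-pressure the callers or the other endpoints.
func (q *endpointQueue) enqueue(a alert) {
	for {
		select {
		case q.ch <- a:
			return
		default: // Full: drop the oldest alert and retry.
			select {
			case <-q.ch:
				droppedTotal.WithLabelValues(q.url).Inc()
			default:
			}
		}
	}
}

// sendLoop drains one endpoint's queue independently of all other endpoints.
func (q *endpointQueue) sendLoop(send func(url string, a alert) error) {
	for a := range q.ch {
		if err := send(q.url, a); err != nil {
			fmt.Printf("send to %s failed: %v\n", q.url, err)
		}
	}
}

func main() {
	prometheus.MustRegister(droppedTotal)

	// One buffer and one send loop per discovered Alertmanager (hypothetical URLs).
	buffers := map[string]*endpointQueue{
		"http://am-1:9093": newEndpointQueue("http://am-1:9093", 3),
		"http://am-2:9093": newEndpointQueue("http://am-2:9093", 3),
	}
	for _, q := range buffers {
		go q.sendLoop(func(url string, a alert) error {
			if url == "http://am-1:9093" {
				time.Sleep(time.Hour) // Simulate an unavailable Alertmanager.
			}
			fmt.Printf("delivered %s to %s\n", a.name, url)
			return nil
		})
	}

	// Fan every alert out to every endpoint's own buffer.
	for i := 0; i < 10; i++ {
		for _, q := range buffers {
			q.enqueue(alert{name: fmt.Sprintf("alert-%d", i)})
		}
	}
	// am-2 keeps delivering even though am-1 is stuck; am-1 only overflows its
	// own buffer and increments its own dropped counter.
	time.Sleep(500 * time.Millisecond)
}

The drop-oldest behaviour above mirrors what TestHandlerQueuing asserts for the real buffer: when a queue is full, it is truncated from the front.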


// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package notifier
import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"net/http"
"net/http/httptest"
"net/url"
"strconv"
"testing"
"time"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
config_util "github.com/prometheus/common/config"
"github.com/prometheus/common/model"
"github.com/prometheus/common/promslog"
"github.com/stretchr/testify/require"
"go.uber.org/atomic"
"gopkg.in/yaml.v2"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/discovery"
_ "github.com/prometheus/prometheus/discovery/file"
"github.com/prometheus/prometheus/discovery/targetgroup"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/model/relabel"
)
const maxBatchSize = 256
func TestHandlerSendBatch(t *testing.T) {
h := NewManager(&Options{}, nil)
b := newBuffer(10_000)
h.alertmanagers = map[string]*alertmanagerSet{
"mock": {
ams: []alertmanager{
alertmanagerMock{
urlf: func() string { return "http://mock" },
},
},
cfg: &config.DefaultAlertmanagerConfig,
buffers: map[string]*buffer{"http://mock": b},
},
}
var alerts []*Alert
for i := range make([]struct{}, 2*maxBatchSize+1) {
alerts = append(alerts, &Alert{
Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
})
}
h.Send(alerts...)
expected := append([]*Alert{}, alerts...)
batch := make([]*Alert, maxBatchSize)
b.pop(&batch)
require.NoError(t, alertsEqual(expected[0:maxBatchSize], batch))
b.pop(&batch)
require.NoError(t, alertsEqual(expected[maxBatchSize:2*maxBatchSize], batch))
b.pop(&batch)
require.NoError(t, alertsEqual(expected[2*maxBatchSize:], batch))
}
func alertsEqual(a, b []*Alert) error {
if len(a) != len(b) {
return fmt.Errorf("length mismatch: %v != %v", a, b)
}
for i, alert := range a {
if !labels.Equal(alert.Labels, b[i].Labels) {
return fmt.Errorf("label mismatch at index %d: %s != %s", i, alert.Labels, b[i].Labels)
}
}
return nil
}
func newTestHTTPServerBuilder(expected *[]*Alert, errc chan<- error, u, p string, status *atomic.Int32) *httptest.Server {
return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
var err error
defer func() {
if err == nil {
return
}
select {
case errc <- err:
default:
}
}()
user, pass, _ := r.BasicAuth()
if user != u || pass != p {
err = fmt.Errorf("unexpected user/password: %s/%s != %s/%s", user, pass, u, p)
w.WriteHeader(http.StatusInternalServerError)
return
}
b, err := io.ReadAll(r.Body)
if err != nil {
err = fmt.Errorf("error reading body: %w", err)
w.WriteHeader(http.StatusInternalServerError)
return
}
var alerts []*Alert
err = json.Unmarshal(b, &alerts)
if err == nil {
err = alertsEqual(*expected, alerts)
}
w.WriteHeader(int(status.Load()))
}))
}
func getCounterValue(t *testing.T, metric *prometheus.CounterVec, labels ...string) float64 {
t.Helper()
m := &dto.Metric{}
if err := metric.WithLabelValues(labels...).Write(m); err != nil {
t.Fatal(err)
}
return m.Counter.GetValue()
}
func TestHandlerSendAll(t *testing.T) {
var (
errc = make(chan error, 1)
expected = make([]*Alert, 0)
status1, status2, status3 atomic.Int32
errors1, errors2, errors3 float64
)
status1.Store(int32(http.StatusOK))
status2.Store(int32(http.StatusOK))
status3.Store(int32(http.StatusOK))
server1 := newTestHTTPServerBuilder(&expected, errc, "prometheus", "testing_password", &status1)
server2 := newTestHTTPServerBuilder(&expected, errc, "", "", &status2)
server3 := newTestHTTPServerBuilder(&expected, errc, "", "", &status3)
defer server1.Close()
defer server2.Close()
defer server3.Close()
h := NewManager(&Options{}, nil)
authClient, _ := config_util.NewClientFromConfig(
config_util.HTTPClientConfig{
BasicAuth: &config_util.BasicAuth{
Username: "prometheus",
Password: "testing_password",
},
}, "auth_alertmanager")
h.alertmanagers = make(map[string]*alertmanagerSet)
am1Cfg := config.DefaultAlertmanagerConfig
am1Cfg.Timeout = model.Duration(time.Second)
am2Cfg := config.DefaultAlertmanagerConfig
am2Cfg.Timeout = model.Duration(time.Second)
am3Cfg := config.DefaultAlertmanagerConfig
am3Cfg.Timeout = model.Duration(time.Second)
opts := &Options{Do: do, QueueCapacity: 10_000, MaxBatchSize: maxBatchSize}
logger := slog.New(slog.NewTextHandler(io.Discard, nil))
h.alertmanagers["1"] = &alertmanagerSet{
ams: []alertmanager{
alertmanagerMock{
urlf: func() string { return server1.URL },
},
},
cfg: &am1Cfg,
client: authClient,
buffers: map[string]*buffer{server1.URL: newBuffer(opts.QueueCapacity)},
opts: opts,
metrics: h.metrics,
logger: logger,
}
h.alertmanagers["2"] = &alertmanagerSet{
ams: []alertmanager{
alertmanagerMock{
urlf: func() string { return server2.URL },
},
alertmanagerMock{
urlf: func() string { return server3.URL },
},
},
cfg: &am2Cfg,
buffers: map[string]*buffer{
server2.URL: newBuffer(opts.QueueCapacity),
server3.URL: newBuffer(opts.QueueCapacity),
},
opts: opts,
metrics: h.metrics,
logger: logger,
}
h.alertmanagers["3"] = &alertmanagerSet{
ams: []alertmanager{}, // empty set
cfg: &am3Cfg,
buffers: make(map[string]*buffer),
opts: opts,
metrics: h.metrics,
logger: logger,
}
var alerts []*Alert
for i := range make([]struct{}, maxBatchSize) {
alerts = append(alerts, &Alert{
Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
})
expected = append(expected, &Alert{
Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
})
}
checkNoErr := func() {
t.Helper()
select {
case err := <-errc:
require.NoError(t, err)
default:
}
}
// start send loops
for _, ams := range h.alertmanagers {
for _, am := range ams.ams {
go ams.sendLoop(am)
}
}
// all ams in all sets are up
h.Send(alerts...)
time.Sleep(time.Second)
// snapshot error metrics and check them
errors1 = getCounterValue(t, h.metrics.errors, server1.URL)
errors2 = getCounterValue(t, h.metrics.errors, server2.URL)
errors3 = getCounterValue(t, h.metrics.errors, server3.URL)
require.Zero(t, errors1, "server1 has unexpected send errors")
require.Zero(t, errors2, "server2 has unexpected send errors")
require.Zero(t, errors3, "server3 has unexpected send errors")
checkNoErr()
// the only am in set 1 is down
status1.Store(int32(http.StatusNotFound))
h.Send(alerts...)
time.Sleep(time.Second)
errors1 = getCounterValue(t, h.metrics.errors, server1.URL)
errors2 = getCounterValue(t, h.metrics.errors, server2.URL)
errors3 = getCounterValue(t, h.metrics.errors, server3.URL)
require.NotZero(t, errors1, "server1 has no send errors")
require.Zero(t, errors2, "server2 has unexpected send errors")
require.Zero(t, errors3, "server3 has unexpected send errors")
checkNoErr()
// reset it
status1.Store(int32(http.StatusOK))
// reset metrics
h.metrics.errors.Reset()
// only one of the ams in set 2 is down
status2.Store(int32(http.StatusInternalServerError))
h.Send(alerts...)
time.Sleep(time.Second)
errors1 = getCounterValue(t, h.metrics.errors, server1.URL)
errors2 = getCounterValue(t, h.metrics.errors, server2.URL)
errors3 = getCounterValue(t, h.metrics.errors, server3.URL)
require.Zero(t, errors1, "server1 has unexpected send errors")
require.NotZero(t, errors2, "server2 has no send errors")
require.Zero(t, errors3, "server3 has unexpected send errors")
checkNoErr()
// both ams in set 2 are down
status3.Store(int32(http.StatusInternalServerError))
h.Send(alerts...)
time.Sleep(time.Second)
errors1 = getCounterValue(t, h.metrics.errors, server1.URL)
errors2 = getCounterValue(t, h.metrics.errors, server2.URL)
errors3 = getCounterValue(t, h.metrics.errors, server3.URL)
require.Zero(t, errors1, "server1 has unexpected send errors")
require.NotZero(t, errors2, "server2 has no send errors")
require.NotZero(t, errors3, "server3 has no send errors")
checkNoErr()
// stop send routines by closing buffers
for _, ams := range h.alertmanagers {
for _, q := range ams.buffers {
q.close()
}
}
}
func TestHandlerSendAllRemapPerAm(t *testing.T) {
var (
errc = make(chan error, 1)
expected1 = make([]*Alert, 0)
expected2 = make([]*Alert, 0)
expected3 = make([]*Alert, 0)
status1, status2, status3 atomic.Int32
errors1, errors2, errors3 float64
)
status1.Store(int32(http.StatusOK))
status2.Store(int32(http.StatusOK))
status3.Store(int32(http.StatusOK))
server1 := newTestHTTPServerBuilder(&expected1, errc, "", "", &status1)
server2 := newTestHTTPServerBuilder(&expected2, errc, "", "", &status2)
server3 := newTestHTTPServerBuilder(&expected3, errc, "", "", &status3)
defer server1.Close()
defer server2.Close()
defer server3.Close()
h := NewManager(&Options{}, nil)
h.alertmanagers = make(map[string]*alertmanagerSet)
am1Cfg := config.DefaultAlertmanagerConfig
am1Cfg.Timeout = model.Duration(time.Second)
am2Cfg := config.DefaultAlertmanagerConfig
am2Cfg.Timeout = model.Duration(time.Second)
am2Cfg.AlertRelabelConfigs = []*relabel.Config{
{
SourceLabels: model.LabelNames{"alertnamedrop"},
Action: "drop",
Regex: relabel.MustNewRegexp(".+"),
},
}
am3Cfg := config.DefaultAlertmanagerConfig
am3Cfg.Timeout = model.Duration(time.Second)
am3Cfg.AlertRelabelConfigs = []*relabel.Config{
{
SourceLabels: model.LabelNames{"alertname"},
Action: "drop",
Regex: relabel.MustNewRegexp(".+"),
},
}
opts := &Options{Do: do, QueueCapacity: 10_000, MaxBatchSize: maxBatchSize}
logger := slog.New(slog.NewTextHandler(io.Discard, nil))
h.alertmanagers = map[string]*alertmanagerSet{
// Drop no alerts.
"1": {
ams: []alertmanager{
alertmanagerMock{
urlf: func() string { return server1.URL },
},
},
cfg: &am1Cfg,
buffers: map[string]*buffer{server1.URL: newBuffer(opts.QueueCapacity)},
opts: opts,
metrics: h.metrics,
logger: logger,
},
// Drop only alerts with the "alertnamedrop" label.
"2": {
ams: []alertmanager{
alertmanagerMock{
urlf: func() string { return server2.URL },
},
},
cfg: &am2Cfg,
buffers: map[string]*buffer{server2.URL: newBuffer(opts.QueueCapacity)},
opts: opts,
metrics: h.metrics,
logger: logger,
},
// Drop all alerts.
"3": {
ams: []alertmanager{
alertmanagerMock{
urlf: func() string { return server3.URL },
},
},
cfg: &am3Cfg,
buffers: map[string]*buffer{server3.URL: newBuffer(opts.QueueCapacity)},
opts: opts,
metrics: h.metrics,
logger: logger,
},
// Empty list of Alertmanager endpoints.
"4": {
ams: []alertmanager{},
cfg: &config.DefaultAlertmanagerConfig,
buffers: make(map[string]*buffer),
opts: opts,
metrics: h.metrics,
logger: logger,
},
}
var alerts []*Alert
for i := range make([]struct{}, maxBatchSize/2) {
alerts = append(alerts,
&Alert{
Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
},
&Alert{
Labels: labels.FromStrings("alertname", "test", "alertnamedrop", strconv.Itoa(i)),
},
)
expected1 = append(expected1,
&Alert{
Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
}, &Alert{
Labels: labels.FromStrings("alertname", "test", "alertnamedrop", strconv.Itoa(i)),
},
)
expected2 = append(expected2, &Alert{
Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
})
}
checkNoErr := func() {
t.Helper()
select {
case err := <-errc:
require.NoError(t, err)
default:
}
}
// start send loops
for _, ams := range h.alertmanagers {
for _, am := range ams.ams {
go ams.sendLoop(am)
}
}
// all ams are up
h.Send(alerts...)
time.Sleep(time.Second)
// snapshot error metrics and check them
errors1 = getCounterValue(t, h.metrics.errors, server1.URL)
errors2 = getCounterValue(t, h.metrics.errors, server2.URL)
errors3 = getCounterValue(t, h.metrics.errors, server3.URL)
require.Zero(t, errors1, "server1 has unexpected send errors")
require.Zero(t, errors2, "server2 has unexpected send errors")
require.Zero(t, errors3, "server3 has unexpected send errors")
checkNoErr()
// the only am in set 1 goes down
status1.Store(int32(http.StatusInternalServerError))
h.Send(alerts...)
time.Sleep(time.Second)
errors1 = getCounterValue(t, h.metrics.errors, server1.URL)
errors2 = getCounterValue(t, h.metrics.errors, server2.URL)
errors3 = getCounterValue(t, h.metrics.errors, server3.URL)
require.NotZero(t, errors1, "server1 has no send errors")
require.Zero(t, errors2, "server2 has unexpected send errors")
require.Zero(t, errors3, "server3 has unexpected send errors")
checkNoErr()
// reset set 1
status1.Store(int32(http.StatusOK))
// reset metrics
h.metrics.errors.Reset()
// set 3 loses its only am, but all of its alerts are dropped by relabeling,
// so there is nothing to send and no send errors are recorded
status3.Store(int32(http.StatusInternalServerError))
h.Send(alerts...)
time.Sleep(3 * time.Second)
errors1 = getCounterValue(t, h.metrics.errors, server1.URL)
errors2 = getCounterValue(t, h.metrics.errors, server2.URL)
errors3 = getCounterValue(t, h.metrics.errors, server3.URL)
require.Zero(t, errors1, "server1 has unexpected send errors")
require.Zero(t, errors2, "server2 has unexpected send errors")
require.Zero(t, errors3, "server3 has unexpected send errors")
checkNoErr()
// stop send routines by closing buffers
for _, ams := range h.alertmanagers {
for _, q := range ams.buffers {
q.close()
}
}
// Verify that individual locks are released.
for k := range h.alertmanagers {
h.alertmanagers[k].mtx.Lock()
h.alertmanagers[k].ams = nil
h.alertmanagers[k].mtx.Unlock()
}
}
func TestExternalLabels(t *testing.T) {
h := NewManager(&Options{
QueueCapacity: 3 * maxBatchSize,
MaxBatchSize: maxBatchSize,
ExternalLabels: labels.FromStrings("a", "b"),
RelabelConfigs: []*relabel.Config{
{
SourceLabels: model.LabelNames{"alertname"},
TargetLabel: "a",
Action: "replace",
Regex: relabel.MustNewRegexp("externalrelabelthis"),
Replacement: "c",
},
},
}, nil)
queue := newBuffer(h.opts.QueueCapacity)
h.alertmanagers = map[string]*alertmanagerSet{
"test": {
buffers: map[string]*buffer{"test": queue},
cfg: &config.AlertmanagerConfig{
RelabelConfigs: h.opts.RelabelConfigs,
},
},
}
// This alert should get the external label attached.
h.Send(&Alert{
Labels: labels.FromStrings("alertname", "test"),
})
// This alert should get the external label attached, but then set to "c"
// through relabelling.
h.Send(&Alert{
Labels: labels.FromStrings("alertname", "externalrelabelthis"),
})
alerts := make([]*Alert, maxBatchSize)
queue.pop(&alerts)
expected := []*Alert{
{Labels: labels.FromStrings("alertname", "test", "a", "b")},
{Labels: labels.FromStrings("alertname", "externalrelabelthis", "a", "c")},
}
require.NoError(t, alertsEqual(expected, alerts))
}
func TestHandlerRelabel(t *testing.T) {
h := NewManager(&Options{
QueueCapacity: 3 * maxBatchSize,
MaxBatchSize: maxBatchSize,
RelabelConfigs: []*relabel.Config{
{
SourceLabels: model.LabelNames{"alertname"},
Action: "drop",
Regex: relabel.MustNewRegexp("drop"),
},
{
SourceLabels: model.LabelNames{"alertname"},
TargetLabel: "alertname",
Action: "replace",
Regex: relabel.MustNewRegexp("rename"),
Replacement: "renamed",
},
},
}, nil)
queue := newBuffer(h.opts.QueueCapacity)
h.alertmanagers = map[string]*alertmanagerSet{
"test": {
buffers: map[string]*buffer{"test": queue},
cfg: &config.AlertmanagerConfig{
RelabelConfigs: h.opts.RelabelConfigs,
},
},
}
// This alert should be dropped due to the configuration
h.Send(&Alert{
Labels: labels.FromStrings("alertname", "drop"),
})
// This alert should be replaced due to the configuration
h.Send(&Alert{
Labels: labels.FromStrings("alertname", "rename"),
})
alerts := make([]*Alert, maxBatchSize)
queue.pop(&alerts)
expected := []*Alert{
{Labels: labels.FromStrings("alertname", "renamed")},
}
require.NoError(t, alertsEqual(expected, alerts))
}
func TestHandlerQueuing(t *testing.T) {
var (
expectedc = make(chan []*Alert)
called = make(chan struct{})
done = make(chan struct{})
errc = make(chan error, 1)
)
server := httptest.NewServer(http.HandlerFunc(func(_ http.ResponseWriter, r *http.Request) {
// Notify the test function that we have received something.
select {
case called <- struct{}{}:
case <-done:
return
}
// Wait for the test function to unblock us.
select {
case expected := <-expectedc:
var alerts []*Alert
b, err := io.ReadAll(r.Body)
if err != nil {
panic(err)
}
err = json.Unmarshal(b, &alerts)
if err == nil {
err = alertsEqual(expected, alerts)
}
select {
case errc <- err:
default:
}
case <-done:
}
}))
defer func() {
close(done)
server.Close()
}()
h := NewManager(
&Options{
QueueCapacity: 3 * maxBatchSize,
MaxBatchSize: maxBatchSize,
},
nil,
)
h.alertmanagers = make(map[string]*alertmanagerSet)
am1Cfg := config.DefaultAlertmanagerConfig
am1Cfg.Timeout = model.Duration(time.Second)
h.alertmanagers["1"] = &alertmanagerSet{
ams: []alertmanager{
alertmanagerMock{
urlf: func() string { return server.URL },
},
},
cfg: &am1Cfg,
buffers: map[string]*buffer{server.URL: newBuffer(h.opts.QueueCapacity)},
metrics: h.metrics,
opts: &Options{Do: do, MaxBatchSize: maxBatchSize},
logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
}
for _, ams := range h.alertmanagers {
for _, am := range ams.ams {
go ams.sendLoop(am)
}
}
go h.Run(nil)
defer h.Stop()
var alerts []*Alert
for i := range make([]struct{}, 20*maxBatchSize) {
alerts = append(alerts, &Alert{
Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
})
}
assertAlerts := func(expected []*Alert) {
t.Helper()
for {
select {
case <-called:
expectedc <- expected
case err := <-errc:
require.NoError(t, err)
return
case <-time.After(5 * time.Second):
require.FailNow(t, "Alerts were not pushed.")
}
}
}
// If the batch is larger than the queue capacity, it should be truncated
// from the front.
h.Send(alerts[:4*maxBatchSize]...)
for i := 1; i < 4; i++ {
assertAlerts(alerts[i*maxBatchSize : (i+1)*maxBatchSize])
}
// Send one batch, wait for it to arrive and block the server so the queue fills up.
h.Send(alerts[:maxBatchSize]...)
<-called
// Send several batches while the server is still blocked so the queue
// fills up to its maximum capacity (3*maxBatchSize). Then check that the
// queue is truncated in the front.
h.Send(alerts[1*maxBatchSize : 2*maxBatchSize]...) // this batch should be dropped.
h.Send(alerts[2*maxBatchSize : 3*maxBatchSize]...)
h.Send(alerts[3*maxBatchSize : 4*maxBatchSize]...)
// Send the batch that drops the first one.
h.Send(alerts[4*maxBatchSize : 5*maxBatchSize]...)
// Unblock the server.
expectedc <- alerts[:maxBatchSize]
select {
case err := <-errc:
require.NoError(t, err)
case <-time.After(5 * time.Second):
require.FailNow(t, "Alerts were not pushed.")
}
// Verify that we receive the last 3 batches.
for i := 2; i < 5; i++ {
assertAlerts(alerts[i*maxBatchSize : (i+1)*maxBatchSize])
}
}
type alertmanagerMock struct {
urlf func() string
}
func (a alertmanagerMock) url() *url.URL {
u, err := url.Parse(a.urlf())
if err != nil {
panic(err)
}
return u
}
func TestReload(t *testing.T) {
tests := []struct {
in *targetgroup.Group
out string
}{
{
in: &targetgroup.Group{
Targets: []model.LabelSet{
{
"__address__": "alertmanager:9093",
},
},
},
out: "http://alertmanager:9093/api/v2/alerts",
},
}
n := NewManager(&Options{}, nil)
cfg := &config.Config{}
s := `
alerting:
alertmanagers:
- static_configs:
`
err := yaml.UnmarshalStrict([]byte(s), cfg)
require.NoError(t, err, "Unable to load YAML config.")
require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 1)
err = n.ApplyConfig(cfg)
require.NoError(t, err, "Error applying the config.")
tgs := make(map[string][]*targetgroup.Group)
for _, tt := range tests {
for k := range cfg.AlertingConfig.AlertmanagerConfigs.ToMap() {
tgs[k] = []*targetgroup.Group{
tt.in,
}
break
}
n.reload(tgs)
res := n.Alertmanagers()[0].String()
require.Equal(t, tt.out, res)
}
}
func TestDroppedAlertmanagers(t *testing.T) {
tests := []struct {
in *targetgroup.Group
out string
}{
{
in: &targetgroup.Group{
Targets: []model.LabelSet{
{
"__address__": "alertmanager:9093",
},
},
},
out: "http://alertmanager:9093/api/v2/alerts",
},
}
n := NewManager(&Options{}, nil)
cfg := &config.Config{}
s := `
alerting:
alertmanagers:
- static_configs:
relabel_configs:
- source_labels: ['__address__']
regex: 'alertmanager:9093'
action: drop
`
err := yaml.UnmarshalStrict([]byte(s), cfg)
require.NoError(t, err, "Unable to load YAML config.")
require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 1)
err = n.ApplyConfig(cfg)
require.NoError(t, err, "Error applying the config.")
tgs := make(map[string][]*targetgroup.Group)
for _, tt := range tests {
for k := range cfg.AlertingConfig.AlertmanagerConfigs.ToMap() {
tgs[k] = []*targetgroup.Group{
tt.in,
}
break
}
n.reload(tgs)
res := n.DroppedAlertmanagers()[0].String()
require.Equal(t, res, tt.out)
}
}
func makeInputTargetGroup() *targetgroup.Group {
return &targetgroup.Group{
Targets: []model.LabelSet{
{
model.AddressLabel: model.LabelValue("1.1.1.1:9090"),
model.LabelName("notcommon1"): model.LabelValue("label"),
},
},
Labels: model.LabelSet{
model.LabelName("common"): model.LabelValue("label"),
},
Source: "testsource",
}
}
// TestHangingNotifier ensures that the notifier takes into account SD changes even when there are
// queued alerts. This test reproduces the issues described in https://github.com/prometheus/prometheus/issues/13676
// and https://github.com/prometheus/prometheus/issues/8768.
// TODO: Drop this test as we have independent queues per alertmanager now.
func TestHangingNotifier(t *testing.T) {
const (
batches = 100
alertsCount = maxBatchSize * batches
)
var (
sendTimeout = 100 * time.Millisecond
sdUpdatert = sendTimeout / 2
done = make(chan struct{})
)
defer func() {
close(done)
}()
// Set up a faulty Alertmanager.
var faultyCalled atomic.Bool
faultyServer := httptest.NewServer(http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) {
faultyCalled.Store(true)
select {
case <-done:
case <-time.After(time.Hour):
}
}))
faultyURL, err := url.Parse(faultyServer.URL)
require.NoError(t, err)
// Set up a functional Alertmanager.
var functionalCalled atomic.Bool
functionalServer := httptest.NewServer(http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) {
functionalCalled.Store(true)
}))
functionalURL, err := url.Parse(functionalServer.URL)
require.NoError(t, err)
// Initialize the discovery manager
// This is relevant as the updates aren't sent continuously in real life, but only once per updatert interval.
// The old implementation of TestHangingNotifier didn't take that into account.
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
reg := prometheus.NewRegistry()
sdMetrics, err := discovery.RegisterSDMetrics(reg, discovery.NewRefreshMetrics(reg))
require.NoError(t, err)
sdManager := discovery.NewManager(
ctx,
promslog.NewNopLogger(),
reg,
sdMetrics,
discovery.Name("sd-manager"),
discovery.Updatert(sdUpdatert),
)
go sdManager.Run()
// Set up the notifier with both faulty and functional Alertmanagers.
notifier := NewManager(
&Options{
QueueCapacity: alertsCount,
},
nil,
)
notifier.alertmanagers = make(map[string]*alertmanagerSet)
amCfg := config.DefaultAlertmanagerConfig
amCfg.Timeout = model.Duration(sendTimeout)
notifier.alertmanagers["config-0"] = &alertmanagerSet{
ams: []alertmanager{
alertmanagerMock{
urlf: func() string { return faultyURL.String() },
},
alertmanagerMock{
urlf: func() string { return functionalURL.String() },
},
},
cfg: &amCfg,
metrics: notifier.metrics,
buffers: map[string]*buffer{
faultyURL.String(): newBuffer(notifier.opts.QueueCapacity),
functionalURL.String(): newBuffer(notifier.opts.QueueCapacity),
},
opts: &Options{Do: do, MaxBatchSize: maxBatchSize},
logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
}
for _, ams := range notifier.alertmanagers {
for _, am := range ams.ams {
go ams.sendLoop(am)
}
}
go notifier.Run(sdManager.SyncCh())
defer notifier.Stop()
require.Len(t, notifier.Alertmanagers(), 2)
// Enqueue the alerts.
var alerts []*Alert
for i := range make([]struct{}, alertsCount) {
alerts = append(alerts, &Alert{
Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
})
}
notifier.Send(alerts...)
// Wait for the Alertmanagers to start receiving alerts.
// 10*sdUpdatert is used as an arbitrary timeout here.
timeout := time.After(10 * sdUpdatert)
loop1:
for {
select {
case <-timeout:
t.Fatalf("Timeout waiting for the alertmanagers to be reached for the first time.")
default:
if faultyCalled.Load() && functionalCalled.Load() {
break loop1
}
}
}
// Request to remove the faulty Alertmanager.
c := map[string]discovery.Configs{
"config-0": {
discovery.StaticConfig{
&targetgroup.Group{
Targets: []model.LabelSet{
{
model.AddressLabel: model.LabelValue(functionalURL.Host),
},
},
},
},
},
}
require.NoError(t, sdManager.ApplyConfig(c))
// The notifier should not wait until the alert queue is empty to apply the discovery changes.
// A faulty Alertmanager could cause each alert sending cycle to take up to AlertmanagerConfig.Timeout.
// The queue may never be emptied, as the arrival rate could be larger than the departure rate.
// It could even overflow, and alerts could be dropped.
timeout = time.After(batches * sendTimeout)
loop2:
for {
select {
case <-timeout:
t.Fatalf("Timeout, the faulty alertmanager was not removed in time.")
default:
// The faulty alertmanager was dropped.
if len(notifier.Alertmanagers()) == 1 {
// Guard against TOCTOU.
for _, ams := range notifier.alertmanagers {
for _, q := range ams.buffers {
require.Zero(t, q.len())
}
}
break loop2
}
}
}
}
func TestStop_DrainingDisabled(t *testing.T) {
releaseReceiver := make(chan struct{})
receiverReceivedRequest := make(chan struct{}, 2)
alertsReceived := atomic.NewInt64(0)
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// Let the test know we've received a request.
receiverReceivedRequest <- struct{}{}
var alerts []*Alert
b, err := io.ReadAll(r.Body)
require.NoError(t, err)
err = json.Unmarshal(b, &alerts)
require.NoError(t, err)
alertsReceived.Add(int64(len(alerts)))
// Wait for the test to release us.
<-releaseReceiver
w.WriteHeader(http.StatusOK)
}))
defer func() {
server.Close()
}()
m := NewManager(
&Options{
QueueCapacity: 10,
DrainOnShutdown: false,
},
nil,
)
m.alertmanagers = make(map[string]*alertmanagerSet)
am1Cfg := config.DefaultAlertmanagerConfig
am1Cfg.Timeout = model.Duration(time.Second)
m.alertmanagers["1"] = &alertmanagerSet{
ams: []alertmanager{
alertmanagerMock{
urlf: func() string { return server.URL },
},
},
cfg: &am1Cfg,
buffers: map[string]*buffer{server.URL: newBuffer(m.opts.QueueCapacity)},
opts: &Options{Do: do, MaxBatchSize: maxBatchSize},
metrics: newAlertMetrics(prometheus.DefaultRegisterer, nil),
logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
}
for _, ams := range m.alertmanagers {
for _, am := range ams.ams {
go ams.sendLoop(am)
}
}
notificationManagerStopped := make(chan struct{})
go func() {
defer close(notificationManagerStopped)
m.Run(nil)
}()
// Queue two alerts. The first should be immediately sent to the receiver, which should block until we release it later.
m.Send(&Alert{Labels: labels.FromStrings(labels.AlertName, "alert-1")})
select {
case <-receiverReceivedRequest:
// Nothing more to do.
case <-time.After(time.Second):
require.FailNow(t, "gave up waiting for receiver to receive notification of first alert")
}
m.Send(&Alert{Labels: labels.FromStrings(labels.AlertName, "alert-2")})
// Stop the notification manager, pause to allow the shutdown to be observed, and then allow the receiver to proceed.
m.Stop()
time.Sleep(time.Second)
close(releaseReceiver)
// Wait for the notification manager to stop and confirm only the first notification was sent.
// The second notification should be dropped.
select {
case <-notificationManagerStopped:
// Nothing more to do.
case <-time.After(time.Second):
require.FailNow(t, "gave up waiting for notification manager to stop")
}
// At least one alert must have been delivered before notification manager stops.
require.Positive(t, alertsReceived.Load())
}
func TestStop_DrainingEnabled(t *testing.T) {
releaseReceiver := make(chan struct{})
receiverReceivedRequest := make(chan struct{}, 2)
alertsReceived := atomic.NewInt64(0)
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
var alerts []*Alert
b, err := io.ReadAll(r.Body)
require.NoError(t, err)
err = json.Unmarshal(b, &alerts)
require.NoError(t, err)
alertsReceived.Add(int64(len(alerts)))
// Let the test know we've received a request.
receiverReceivedRequest <- struct{}{}
// Wait for the test to release us.
<-releaseReceiver
w.WriteHeader(http.StatusOK)
}))
defer func() {
server.Close()
}()
m := NewManager(
&Options{
QueueCapacity: 10,
DrainOnShutdown: true,
},
nil,
)
m.alertmanagers = make(map[string]*alertmanagerSet)
am1Cfg := config.DefaultAlertmanagerConfig
am1Cfg.Timeout = model.Duration(time.Second)
m.alertmanagers["1"] = &alertmanagerSet{
ams: []alertmanager{
alertmanagerMock{
urlf: func() string { return server.URL },
},
},
cfg: &am1Cfg,
buffers: map[string]*buffer{server.URL: newBuffer(m.opts.QueueCapacity)},
opts: &Options{Do: do, MaxBatchSize: maxBatchSize},
metrics: m.metrics,
logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
}
for _, ams := range m.alertmanagers {
for _, am := range ams.ams {
go ams.sendLoop(am)
}
}
notificationManagerStopped := make(chan struct{})
go func() {
defer close(notificationManagerStopped)
m.Run(nil)
}()
// Queue two alerts. The first should be immediately sent to the receiver, which should block until we release it later.
m.Send(&Alert{Labels: labels.FromStrings(labels.AlertName, "alert-1")})
select {
case <-receiverReceivedRequest:
// Nothing more to do.
case <-time.After(time.Second):
require.FailNow(t, "gave up waiting for receiver to receive notification of first alert")
}
m.Send(&Alert{Labels: labels.FromStrings(labels.AlertName, "alert-2")})
// Stop the notification manager and allow the receiver to proceed.
m.Stop()
close(releaseReceiver)
// Wait for the notification manager to stop and confirm both notifications were sent.
select {
case <-notificationManagerStopped:
// Nothing more to do.
case <-time.After(400 * time.Millisecond):
require.FailNow(t, "gave up waiting for notification manager to stop")
}
<-receiverReceivedRequest
require.Equal(t, int64(2), alertsReceived.Load())
}
func TestApplyConfig(t *testing.T) {
targetURL := "alertmanager:9093"
targetGroup := &targetgroup.Group{
Targets: []model.LabelSet{
{
"__address__": model.LabelValue(targetURL),
},
},
}
alertmanagerURL := fmt.Sprintf("http://%s/api/v2/alerts", targetURL)
n := NewManager(&Options{}, nil)
cfg := &config.Config{}
s := `
alerting:
alertmanagers:
- file_sd_configs:
- files:
- foo.json
`
// 1. Ensure known alertmanagers are not dropped during ApplyConfig.
require.NoError(t, yaml.UnmarshalStrict([]byte(s), cfg))
require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 1)
// First, apply the config and reload.
require.NoError(t, n.ApplyConfig(cfg))
tgs := map[string][]*targetgroup.Group{"config-0": {targetGroup}}
n.reload(tgs)
require.Len(t, n.Alertmanagers(), 1)
require.Equal(t, alertmanagerURL, n.Alertmanagers()[0].String())
// Reapply the config.
require.NoError(t, n.ApplyConfig(cfg))
// Ensure the known alertmanagers are not dropped.
require.Len(t, n.Alertmanagers(), 1)
require.Equal(t, alertmanagerURL, n.Alertmanagers()[0].String())
// 2. Ensure known alertmanagers are not dropped during ApplyConfig even when
// the config order changes.
s = `
alerting:
alertmanagers:
- static_configs:
- file_sd_configs:
- files:
- foo.json
`
require.NoError(t, yaml.UnmarshalStrict([]byte(s), cfg))
require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 2)
require.NoError(t, n.ApplyConfig(cfg))
require.Len(t, n.Alertmanagers(), 1)
// Ensure no unnecessary alertmanagers are injected.
require.Empty(t, n.alertmanagers["config-0"].ams)
// Ensure the config order is taken into account.
ams := n.alertmanagers["config-1"].ams
require.Len(t, ams, 1)
require.Equal(t, alertmanagerURL, ams[0].url().String())
// 3. Ensure known alertmanagers are reused for new config with identical AlertmanagerConfig.
s = `
alerting:
alertmanagers:
- file_sd_configs:
- files:
- foo.json
- file_sd_configs:
- files:
- foo.json
`
require.NoError(t, yaml.UnmarshalStrict([]byte(s), cfg))
require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 2)
require.NoError(t, n.ApplyConfig(cfg))
require.Len(t, n.Alertmanagers(), 2)
for cfgIdx := range 2 {
ams := n.alertmanagers[fmt.Sprintf("config-%d", cfgIdx)].ams
require.Len(t, ams, 1)
require.Equal(t, alertmanagerURL, ams[0].url().String())
}
// 4. Ensure known alertmanagers are reused only for identical AlertmanagerConfig.
s = `
alerting:
alertmanagers:
- file_sd_configs:
- files:
- foo.json
path_prefix: /bar
- file_sd_configs:
- files:
- foo.json
relabel_configs:
- source_labels: ['__address__']
regex: 'doesntmatter:1234'
action: drop
`
require.NoError(t, yaml.UnmarshalStrict([]byte(s), cfg))
require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 2)
require.NoError(t, n.ApplyConfig(cfg))
require.Empty(t, n.Alertmanagers())
}
// Regression test for https://github.com/prometheus/prometheus/issues/7676
// The test creates a black hole alertmanager that never responds to any requests.
// The alertmanager_config.timeout is set to effectively infinite (1 year).
// We check that the notifier does not hang and throughput is not affected.
func TestNotifierQueueIndependentOfFailedAlertmanager(t *testing.T) {
stopBlackHole := make(chan struct{})
blackHoleAM := newBlackHoleAlertmanager(stopBlackHole)
defer close(stopBlackHole)
doneAlertReceive := make(chan struct{})
immediateAM := newImmediateAlertManager(doneAlertReceive)
h := NewManager(&Options{}, nil)
h.alertmanagers = make(map[string]*alertmanagerSet)
amCfg := config.DefaultAlertmanagerConfig
amCfg.Timeout = model.Duration(time.Hour * 24 * 365)
h.alertmanagers["1"] = &alertmanagerSet{
ams: []alertmanager{
alertmanagerMock{
urlf: func() string { return blackHoleAM.URL },
},
},
cfg: &amCfg,
opts: &Options{Do: do, MaxBatchSize: maxBatchSize},
buffers: map[string]*buffer{blackHoleAM.URL: newBuffer(10)},
metrics: h.metrics,
}
h.alertmanagers["2"] = &alertmanagerSet{
ams: []alertmanager{
alertmanagerMock{
urlf: func() string { return immediateAM.URL },
},
},
cfg: &amCfg,
opts: &Options{Do: do, MaxBatchSize: maxBatchSize},
buffers: map[string]*buffer{immediateAM.URL: newBuffer(10)},
metrics: h.metrics,
}
doneSendAll := make(chan struct{})
go func() {
for _, s := range h.alertmanagers {
for _, am := range s.ams {
go s.sendLoop(am)
}
}
h.Send(&Alert{
Labels: labels.FromStrings("alertname", "test"),
})
close(doneSendAll)
}()
select {
case <-doneAlertReceive:
// This is the happy case, the alert was received by the immediate alertmanager.
case <-time.After(30 * time.Second):
t.Fatal("Timeout waiting for alert to be received by immediate alertmanager")
}
select {
case <-doneSendAll:
// This is the happy case, the Send call returned.
case <-time.After(30 * time.Second):
t.Fatal("Timeout waiting for Send to return")
}
}
func newBlackHoleAlertmanager(stop <-chan struct{}) *httptest.Server {
return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
// Do nothing, wait to be canceled.
<-stop
w.WriteHeader(http.StatusOK)
}))
}
func newImmediateAlertManager(done chan<- struct{}) *httptest.Server {
return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusOK)
close(done)
}))
}