// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package notifier

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"net/http/httptest"
	"net/url"
	"strconv"
	"testing"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
	config_util "github.com/prometheus/common/config"
	"github.com/prometheus/common/model"
	"github.com/prometheus/common/promslog"
	"github.com/stretchr/testify/require"
	"go.uber.org/atomic"
	"gopkg.in/yaml.v2"

	"github.com/prometheus/prometheus/config"
	"github.com/prometheus/prometheus/discovery"
	_ "github.com/prometheus/prometheus/discovery/file"
	"github.com/prometheus/prometheus/discovery/targetgroup"
	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/model/relabel"
)

const maxBatchSize = 256

func TestHandlerSendBatch(t *testing.T) {
	h := NewManager(&Options{}, nil)
	b := newBuffer(10_000)

	h.alertmanagers = map[string]*alertmanagerSet{
		"mock": {
			ams: []alertmanager{
				alertmanagerMock{
					urlf: func() string { return "http://mock" },
				},
			},
			cfg:     &config.DefaultAlertmanagerConfig,
			buffers: map[string]*buffer{"http://mock": b},
		},
	}

	var alerts []*Alert
	for i := range make([]struct{}, 2*maxBatchSize+1) {
		alerts = append(alerts, &Alert{
			Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
		})
	}

	h.Send(alerts...)

	expected := append([]*Alert{}, alerts...)
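	// Drain the queue in batches of maxBatchSize and verify that the order
	// of the queued alerts is preserved across batches.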
	batch := make([]*Alert, maxBatchSize)
	b.pop(&batch)
	require.NoError(t, alertsEqual(expected[0:maxBatchSize], batch))

	b.pop(&batch)
	require.NoError(t, alertsEqual(expected[maxBatchSize:2*maxBatchSize], batch))

	b.pop(&batch)
	require.NoError(t, alertsEqual(expected[2*maxBatchSize:], batch))
}

func alertsEqual(a, b []*Alert) error {
	if len(a) != len(b) {
		return fmt.Errorf("length mismatch: %v != %v", a, b)
	}
	for i, alert := range a {
		if !labels.Equal(alert.Labels, b[i].Labels) {
			return fmt.Errorf("label mismatch at index %d: %s != %s", i, alert.Labels, b[i].Labels)
		}
	}
	return nil
}

func newTestHTTPServerBuilder(expected *[]*Alert, errc chan<- error, u, p string, status *atomic.Int32) *httptest.Server {
	return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		var err error
		defer func() {
			if err == nil {
				return
			}
			select {
			case errc <- err:
			default:
			}
		}()

		user, pass, _ := r.BasicAuth()
		if user != u || pass != p {
			err = fmt.Errorf("unexpected user/password: %s/%s != %s/%s", user, pass, u, p)
			w.WriteHeader(http.StatusInternalServerError)
			return
		}

		b, err := io.ReadAll(r.Body)
		if err != nil {
			err = fmt.Errorf("error reading body: %w", err)
			w.WriteHeader(http.StatusInternalServerError)
			return
		}

		var alerts []*Alert
		err = json.Unmarshal(b, &alerts)
		if err == nil {
			err = alertsEqual(*expected, alerts)
		}
		w.WriteHeader(int(status.Load()))
	}))
}

func getCounterValue(t *testing.T, metric *prometheus.CounterVec, labels ...string) float64 {
	t.Helper()
	m := &dto.Metric{}
	if err := metric.WithLabelValues(labels...).Write(m); err != nil {
		t.Fatal(err)
	}
	return m.Counter.GetValue()
}

func TestHandlerSendAll(t *testing.T) {
	var (
		errc                      = make(chan error, 1)
		expected                  = make([]*Alert, 0)
		status1, status2, status3 atomic.Int32
		errors1, errors2, errors3 float64
	)
	status1.Store(int32(http.StatusOK))
	status2.Store(int32(http.StatusOK))
	status3.Store(int32(http.StatusOK))

	server1 := newTestHTTPServerBuilder(&expected, errc, "prometheus", "testing_password", &status1)
	server2 := newTestHTTPServerBuilder(&expected, errc, "", "", &status2)
	server3 := newTestHTTPServerBuilder(&expected, errc, "", "", &status3)
	defer server1.Close()
	defer server2.Close()
	defer server3.Close()

	h := NewManager(&Options{}, nil)

	authClient, _ := config_util.NewClientFromConfig(
		config_util.HTTPClientConfig{
			BasicAuth: &config_util.BasicAuth{
				Username: "prometheus",
				Password: "testing_password",
			},
		}, "auth_alertmanager")

	h.alertmanagers = make(map[string]*alertmanagerSet)

	am1Cfg := config.DefaultAlertmanagerConfig
	am1Cfg.Timeout = model.Duration(time.Second)

	am2Cfg := config.DefaultAlertmanagerConfig
	am2Cfg.Timeout = model.Duration(time.Second)

	am3Cfg := config.DefaultAlertmanagerConfig
	am3Cfg.Timeout = model.Duration(time.Second)

	opts := &Options{Do: do, QueueCapacity: 10_000, MaxBatchSize: maxBatchSize}
	logger := slog.New(slog.NewTextHandler(io.Discard, nil))

	h.alertmanagers["1"] = &alertmanagerSet{
		ams: []alertmanager{
			alertmanagerMock{
				urlf: func() string { return server1.URL },
			},
		},
		cfg:     &am1Cfg,
		client:  authClient,
		buffers: map[string]*buffer{server1.URL: newBuffer(opts.QueueCapacity)},
		opts:    opts,
		metrics: h.metrics,
		logger:  logger,
	}

	h.alertmanagers["2"] = &alertmanagerSet{
		ams: []alertmanager{
			alertmanagerMock{
				urlf: func() string { return server2.URL },
			},
			alertmanagerMock{
				urlf: func() string { return server3.URL },
			},
		},
		cfg: &am2Cfg,
		buffers: map[string]*buffer{
			server2.URL: newBuffer(opts.QueueCapacity),
			server3.URL: newBuffer(opts.QueueCapacity),
		},
		opts:    opts,
		metrics: h.metrics,
		logger:  logger,
	}
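	// The third set is deliberately empty; it exercises the code path where a
	// set has no Alertmanagers to send to.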
h.alertmanagers["3"] = &alertmanagerSet{ ams: []alertmanager{}, // empty set cfg: &am3Cfg, buffers: make(map[string]*buffer), opts: opts, metrics: h.metrics, logger: logger, } var alerts []*Alert for i := range make([]struct{}, maxBatchSize) { alerts = append(alerts, &Alert{ Labels: labels.FromStrings("alertname", strconv.Itoa(i)), }) expected = append(expected, &Alert{ Labels: labels.FromStrings("alertname", strconv.Itoa(i)), }) } checkNoErr := func() { t.Helper() select { case err := <-errc: require.NoError(t, err) default: } } // start send loops for _, ams := range h.alertmanagers { for _, am := range ams.ams { go ams.sendLoop(am) } } // all ams in all sets are up h.Send(alerts...) time.Sleep(time.Second) // snapshot error metrics and check them errors1 = getCounterValue(t, h.metrics.errors, server1.URL) errors2 = getCounterValue(t, h.metrics.errors, server2.URL) errors3 = getCounterValue(t, h.metrics.errors, server3.URL) require.Zero(t, errors1, "server1 has unexpected send errors") require.Zero(t, errors2, "server2 has unexpected send errors") require.Zero(t, errors3, "server3 has unexpected send errors") checkNoErr() // the only am in set 1 is down status1.Store(int32(http.StatusNotFound)) h.Send(alerts...) time.Sleep(time.Second) errors1 = getCounterValue(t, h.metrics.errors, server1.URL) errors2 = getCounterValue(t, h.metrics.errors, server2.URL) errors3 = getCounterValue(t, h.metrics.errors, server3.URL) require.NotZero(t, errors1, "server1 has no send errors") require.Zero(t, errors2, "server2 has unexpected send errors") require.Zero(t, errors3, "server3 has unexpected send errors") checkNoErr() // reset it status1.Store(int32(http.StatusOK)) // reset metrics h.metrics.errors.Reset() // only one of the ams in set 2 is down status2.Store(int32(http.StatusInternalServerError)) h.Send(alerts...) time.Sleep(time.Second) errors1 = getCounterValue(t, h.metrics.errors, server1.URL) errors2 = getCounterValue(t, h.metrics.errors, server2.URL) errors3 = getCounterValue(t, h.metrics.errors, server3.URL) require.Zero(t, errors1, "server1 has unexpected send errors") require.NotZero(t, errors2, "server2 has no send errors") require.Zero(t, errors3, "server3 has unexpected send errors") checkNoErr() // both ams in set 2 are down status3.Store(int32(http.StatusInternalServerError)) h.Send(alerts...) 
	time.Sleep(time.Second)

	errors1 = getCounterValue(t, h.metrics.errors, server1.URL)
	errors2 = getCounterValue(t, h.metrics.errors, server2.URL)
	errors3 = getCounterValue(t, h.metrics.errors, server3.URL)
	require.Zero(t, errors1, "server1 has unexpected send errors")
	require.NotZero(t, errors2, "server2 has no send errors")
	require.NotZero(t, errors3, "server3 has no send errors")

	checkNoErr()

	// stop send routines by closing buffers
	for _, ams := range h.alertmanagers {
		for _, q := range ams.buffers {
			q.close()
		}
	}
}

func TestHandlerSendAllRemapPerAm(t *testing.T) {
	var (
		errc      = make(chan error, 1)
		expected1 = make([]*Alert, 0)
		expected2 = make([]*Alert, 0)
		expected3 = make([]*Alert, 0)

		status1, status2, status3 atomic.Int32
		errors1, errors2, errors3 float64
	)
	status1.Store(int32(http.StatusOK))
	status2.Store(int32(http.StatusOK))
	status3.Store(int32(http.StatusOK))

	server1 := newTestHTTPServerBuilder(&expected1, errc, "", "", &status1)
	server2 := newTestHTTPServerBuilder(&expected2, errc, "", "", &status2)
	server3 := newTestHTTPServerBuilder(&expected3, errc, "", "", &status3)

	defer server1.Close()
	defer server2.Close()
	defer server3.Close()

	h := NewManager(&Options{}, nil)
	h.alertmanagers = make(map[string]*alertmanagerSet)

	am1Cfg := config.DefaultAlertmanagerConfig
	am1Cfg.Timeout = model.Duration(time.Second)

	am2Cfg := config.DefaultAlertmanagerConfig
	am2Cfg.Timeout = model.Duration(time.Second)
	am2Cfg.AlertRelabelConfigs = []*relabel.Config{
		{
			SourceLabels: model.LabelNames{"alertnamedrop"},
			Action:       "drop",
			Regex:        relabel.MustNewRegexp(".+"),
		},
	}

	am3Cfg := config.DefaultAlertmanagerConfig
	am3Cfg.Timeout = model.Duration(time.Second)
	am3Cfg.AlertRelabelConfigs = []*relabel.Config{
		{
			SourceLabels: model.LabelNames{"alertname"},
			Action:       "drop",
			Regex:        relabel.MustNewRegexp(".+"),
		},
	}

	opts := &Options{Do: do, QueueCapacity: 10_000, MaxBatchSize: maxBatchSize}
	logger := slog.New(slog.NewTextHandler(io.Discard, nil))

	h.alertmanagers = map[string]*alertmanagerSet{
		// Drop no alerts.
		"1": {
			ams: []alertmanager{
				alertmanagerMock{
					urlf: func() string { return server1.URL },
				},
			},
			cfg:     &am1Cfg,
			buffers: map[string]*buffer{server1.URL: newBuffer(opts.QueueCapacity)},
			opts:    opts,
			metrics: h.metrics,
			logger:  logger,
		},
		// Drop only alerts with the "alertnamedrop" label.
		"2": {
			ams: []alertmanager{
				alertmanagerMock{
					urlf: func() string { return server2.URL },
				},
			},
			cfg:     &am2Cfg,
			buffers: map[string]*buffer{server2.URL: newBuffer(opts.QueueCapacity)},
			opts:    opts,
			metrics: h.metrics,
			logger:  logger,
		},
		// Drop all alerts.
		"3": {
			ams: []alertmanager{
				alertmanagerMock{
					urlf: func() string { return server3.URL },
				},
			},
			cfg:     &am3Cfg,
			buffers: map[string]*buffer{server3.URL: newBuffer(opts.QueueCapacity)},
			opts:    opts,
			metrics: h.metrics,
			logger:  logger,
		},
		// Empty list of Alertmanager endpoints.
"4": { ams: []alertmanager{}, cfg: &config.DefaultAlertmanagerConfig, buffers: make(map[string]*buffer), opts: opts, metrics: h.metrics, logger: logger, }, } var alerts []*Alert for i := range make([]struct{}, maxBatchSize/2) { alerts = append(alerts, &Alert{ Labels: labels.FromStrings("alertname", strconv.Itoa(i)), }, &Alert{ Labels: labels.FromStrings("alertname", "test", "alertnamedrop", strconv.Itoa(i)), }, ) expected1 = append(expected1, &Alert{ Labels: labels.FromStrings("alertname", strconv.Itoa(i)), }, &Alert{ Labels: labels.FromStrings("alertname", "test", "alertnamedrop", strconv.Itoa(i)), }, ) expected2 = append(expected2, &Alert{ Labels: labels.FromStrings("alertname", strconv.Itoa(i)), }) } checkNoErr := func() { t.Helper() select { case err := <-errc: require.NoError(t, err) default: } } // start send loops for _, ams := range h.alertmanagers { for _, am := range ams.ams { go ams.sendLoop(am) } } // all ams are up h.Send(alerts...) time.Sleep(time.Second) // snapshot error metrics and check them errors1 = getCounterValue(t, h.metrics.errors, server1.URL) errors2 = getCounterValue(t, h.metrics.errors, server2.URL) errors3 = getCounterValue(t, h.metrics.errors, server3.URL) require.Zero(t, errors1, "server1 has unexpected send errors") require.Zero(t, errors2, "server2 has unexpected send errors") require.Zero(t, errors3, "server3 has unexpected send errors") checkNoErr() // the only am in set 1 goes down status1.Store(int32(http.StatusInternalServerError)) h.Send(alerts...) time.Sleep(time.Second) errors1 = getCounterValue(t, h.metrics.errors, server1.URL) errors2 = getCounterValue(t, h.metrics.errors, server2.URL) errors3 = getCounterValue(t, h.metrics.errors, server3.URL) require.NotZero(t, errors1, "server1 has no send errors") require.Zero(t, errors2, "server2 has unexpected send errors") require.Zero(t, errors3, "server3 has unexpected send errors") checkNoErr() // reset set 1 status1.Store(int32(http.StatusOK)) // reset metrics h.metrics.errors.Reset() // set 3 loses its only am, but all alerts were dropped // so there was nothing to send, keeping sendAll true status3.Store(int32(http.StatusInternalServerError)) h.Send(alerts...) time.Sleep(3 * time.Second) errors1 = getCounterValue(t, h.metrics.errors, server1.URL) errors2 = getCounterValue(t, h.metrics.errors, server2.URL) errors3 = getCounterValue(t, h.metrics.errors, server3.URL) require.Zero(t, errors1, "server1 has unexpected send errors") require.Zero(t, errors2, "server2 has unexpected send errors") require.Zero(t, errors3, "server3 has unexpected send errors") checkNoErr() // stop send routines by closing buffers for _, ams := range h.alertmanagers { for _, q := range ams.buffers { q.close() } } // Verify that individual locks are released. for k := range h.alertmanagers { h.alertmanagers[k].mtx.Lock() h.alertmanagers[k].ams = nil h.alertmanagers[k].mtx.Unlock() } } func TestExternalLabels(t *testing.T) { h := NewManager(&Options{ QueueCapacity: 3 * maxBatchSize, MaxBatchSize: maxBatchSize, ExternalLabels: labels.FromStrings("a", "b"), RelabelConfigs: []*relabel.Config{ { SourceLabels: model.LabelNames{"alertname"}, TargetLabel: "a", Action: "replace", Regex: relabel.MustNewRegexp("externalrelabelthis"), Replacement: "c", }, }, }, nil) queue := newBuffer(h.opts.QueueCapacity) h.alertmanagers = map[string]*alertmanagerSet{ "test": { buffers: map[string]*buffer{"test": queue}, cfg: &config.AlertmanagerConfig{ RelabelConfigs: h.opts.RelabelConfigs, }, }, } // This alert should get the external label attached. 
	h.Send(&Alert{
		Labels: labels.FromStrings("alertname", "test"),
	})

	// This alert should get the external label attached, but then set to "c"
	// through relabelling.
	h.Send(&Alert{
		Labels: labels.FromStrings("alertname", "externalrelabelthis"),
	})

	alerts := make([]*Alert, maxBatchSize)
	queue.pop(&alerts)

	expected := []*Alert{
		{Labels: labels.FromStrings("alertname", "test", "a", "b")},
		{Labels: labels.FromStrings("alertname", "externalrelabelthis", "a", "c")},
	}

	require.NoError(t, alertsEqual(expected, alerts))
}

func TestHandlerRelabel(t *testing.T) {
	h := NewManager(&Options{
		QueueCapacity: 3 * maxBatchSize,
		MaxBatchSize:  maxBatchSize,
		RelabelConfigs: []*relabel.Config{
			{
				SourceLabels: model.LabelNames{"alertname"},
				Action:       "drop",
				Regex:        relabel.MustNewRegexp("drop"),
			},
			{
				SourceLabels: model.LabelNames{"alertname"},
				TargetLabel:  "alertname",
				Action:       "replace",
				Regex:        relabel.MustNewRegexp("rename"),
				Replacement:  "renamed",
			},
		},
	}, nil)

	queue := newBuffer(h.opts.QueueCapacity)
	h.alertmanagers = map[string]*alertmanagerSet{
		"test": {
			buffers: map[string]*buffer{"test": queue},
			cfg: &config.AlertmanagerConfig{
				RelabelConfigs: h.opts.RelabelConfigs,
			},
		},
	}

	// This alert should be dropped due to the configuration
	h.Send(&Alert{
		Labels: labels.FromStrings("alertname", "drop"),
	})

	// This alert should be replaced due to the configuration
	h.Send(&Alert{
		Labels: labels.FromStrings("alertname", "rename"),
	})

	alerts := make([]*Alert, maxBatchSize)
	queue.pop(&alerts)

	expected := []*Alert{
		{Labels: labels.FromStrings("alertname", "renamed")},
	}

	require.NoError(t, alertsEqual(expected, alerts))
}

func TestHandlerQueuing(t *testing.T) {
	var (
		expectedc = make(chan []*Alert)
		called    = make(chan struct{})
		done      = make(chan struct{})
		errc      = make(chan error, 1)
	)

	server := httptest.NewServer(http.HandlerFunc(func(_ http.ResponseWriter, r *http.Request) {
		// Notify the test function that we have received something.
		select {
		case called <- struct{}{}:
		case <-done:
			return
		}

		// Wait for the test function to unblock us.
		select {
		case expected := <-expectedc:
			var alerts []*Alert

			b, err := io.ReadAll(r.Body)
			if err != nil {
				panic(err)
			}

			err = json.Unmarshal(b, &alerts)
			if err == nil {
				err = alertsEqual(expected, alerts)
			}
			select {
			case errc <- err:
			default:
			}
		case <-done:
		}
	}))
	defer func() {
		close(done)
		server.Close()
	}()

	h := NewManager(
		&Options{
			QueueCapacity: 3 * maxBatchSize,
			MaxBatchSize:  maxBatchSize,
		},
		nil,
	)

	h.alertmanagers = make(map[string]*alertmanagerSet)

	am1Cfg := config.DefaultAlertmanagerConfig
	am1Cfg.Timeout = model.Duration(time.Second)

	h.alertmanagers["1"] = &alertmanagerSet{
		ams: []alertmanager{
			alertmanagerMock{
				urlf: func() string { return server.URL },
			},
		},
		cfg:     &am1Cfg,
		buffers: map[string]*buffer{server.URL: newBuffer(h.opts.QueueCapacity)},
		metrics: h.metrics,
		opts:    &Options{Do: do, MaxBatchSize: maxBatchSize},
		logger:  slog.New(slog.NewTextHandler(io.Discard, nil)),
	}

	for _, ams := range h.alertmanagers {
		for _, am := range ams.ams {
			go ams.sendLoop(am)
		}
	}

	go h.Run(nil)
	defer h.Stop()

	var alerts []*Alert
	for i := range make([]struct{}, 20*maxBatchSize) {
		alerts = append(alerts, &Alert{
			Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
		})
	}

	assertAlerts := func(expected []*Alert) {
		t.Helper()
		for {
			select {
			case <-called:
				expectedc <- expected
			case err := <-errc:
				require.NoError(t, err)
				return
			case <-time.After(5 * time.Second):
				require.FailNow(t, "Alerts were not pushed.")
			}
		}
	}

	// If the batch is larger than the queue capacity, it should be truncated
	// from the front.
	h.Send(alerts[:4*maxBatchSize]...)
	for i := 1; i < 4; i++ {
		assertAlerts(alerts[i*maxBatchSize : (i+1)*maxBatchSize])
	}

	// Send one batch, wait for it to arrive and block the server so the queue fills up.
	h.Send(alerts[:maxBatchSize]...)
	<-called

	// Send several batches while the server is still blocked so the queue
	// fills up to its maximum capacity (3*maxBatchSize). Then check that the
	// queue is truncated in the front.
	h.Send(alerts[1*maxBatchSize : 2*maxBatchSize]...) // this batch should be dropped.
	h.Send(alerts[2*maxBatchSize : 3*maxBatchSize]...)
	h.Send(alerts[3*maxBatchSize : 4*maxBatchSize]...)

	// Send the batch that drops the first one.
	h.Send(alerts[4*maxBatchSize : 5*maxBatchSize]...)

	// Unblock the server.
	expectedc <- alerts[:maxBatchSize]
	select {
	case err := <-errc:
		require.NoError(t, err)
	case <-time.After(5 * time.Second):
		require.FailNow(t, "Alerts were not pushed.")
	}

	// Verify that we receive the last 3 batches.
	for i := 2; i < 5; i++ {
		assertAlerts(alerts[i*maxBatchSize : (i+1)*maxBatchSize])
	}
}

type alertmanagerMock struct {
	urlf func() string
}

func (a alertmanagerMock) url() *url.URL {
	u, err := url.Parse(a.urlf())
	if err != nil {
		panic(err)
	}
	return u
}

func TestReload(t *testing.T) {
	tests := []struct {
		in  *targetgroup.Group
		out string
	}{
		{
			in: &targetgroup.Group{
				Targets: []model.LabelSet{
					{
						"__address__": "alertmanager:9093",
					},
				},
			},
			out: "http://alertmanager:9093/api/v2/alerts",
		},
	}

	n := NewManager(&Options{}, nil)

	cfg := &config.Config{}
	s := `
alerting:
  alertmanagers:
  - static_configs:
`
	err := yaml.UnmarshalStrict([]byte(s), cfg)
	require.NoError(t, err, "Unable to load YAML config.")
	require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 1)

	err = n.ApplyConfig(cfg)
	require.NoError(t, err, "Error applying the config.")

	tgs := make(map[string][]*targetgroup.Group)
	for _, tt := range tests {
		for k := range cfg.AlertingConfig.AlertmanagerConfigs.ToMap() {
			tgs[k] = []*targetgroup.Group{
				tt.in,
			}
			break
		}
		n.reload(tgs)
		res := n.Alertmanagers()[0].String()

		require.Equal(t, tt.out, res)
	}
}

func TestDroppedAlertmanagers(t *testing.T) {
	tests := []struct {
		in  *targetgroup.Group
		out string
	}{
		{
			in: &targetgroup.Group{
				Targets: []model.LabelSet{
					{
						"__address__": "alertmanager:9093",
					},
				},
			},
			out: "http://alertmanager:9093/api/v2/alerts",
		},
	}

	n := NewManager(&Options{}, nil)

	cfg := &config.Config{}
	s := `
alerting:
  alertmanagers:
  - static_configs:
    relabel_configs:
    - source_labels: ['__address__']
      regex: 'alertmanager:9093'
      action: drop
`
	err := yaml.UnmarshalStrict([]byte(s), cfg)
	require.NoError(t, err, "Unable to load YAML config.")
	require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 1)

	err = n.ApplyConfig(cfg)
	require.NoError(t, err, "Error applying the config.")

	tgs := make(map[string][]*targetgroup.Group)
	for _, tt := range tests {
		for k := range cfg.AlertingConfig.AlertmanagerConfigs.ToMap() {
			tgs[k] = []*targetgroup.Group{
				tt.in,
			}
			break
		}
		n.reload(tgs)
		res := n.DroppedAlertmanagers()[0].String()

		require.Equal(t, tt.out, res)
	}
}

func makeInputTargetGroup() *targetgroup.Group {
	return &targetgroup.Group{
		Targets: []model.LabelSet{
			{
				model.AddressLabel:            model.LabelValue("1.1.1.1:9090"),
				model.LabelName("notcommon1"): model.LabelValue("label"),
			},
		},
		Labels: model.LabelSet{
			model.LabelName("common"): model.LabelValue("label"),
		},
		Source: "testsource",
	}
}

// TestHangingNotifier ensures that the notifier takes into account SD changes even when there are
// queued alerts.
// This test reproduces the issues described in https://github.com/prometheus/prometheus/issues/13676
// and https://github.com/prometheus/prometheus/issues/8768.
// TODO: Drop this test as we have independent queues per alertmanager now.
func TestHangingNotifier(t *testing.T) {
	const (
		batches     = 100
		alertsCount = maxBatchSize * batches
	)

	var (
		sendTimeout = 100 * time.Millisecond
		sdUpdatert  = sendTimeout / 2

		done = make(chan struct{})
	)

	defer func() {
		close(done)
	}()

	// Set up a faulty Alertmanager.
	var faultyCalled atomic.Bool
	faultyServer := httptest.NewServer(http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) {
		faultyCalled.Store(true)
		select {
		case <-done:
		case <-time.After(time.Hour):
		}
	}))
	faultyURL, err := url.Parse(faultyServer.URL)
	require.NoError(t, err)

	// Set up a functional Alertmanager.
	var functionalCalled atomic.Bool
	functionalServer := httptest.NewServer(http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) {
		functionalCalled.Store(true)
	}))
	functionalURL, err := url.Parse(functionalServer.URL)
	require.NoError(t, err)

	// Initialize the discovery manager.
	// This is relevant as the updates aren't sent continually in real life, but only each updatert.
	// The old implementation of TestHangingNotifier didn't take that into account.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	reg := prometheus.NewRegistry()
	sdMetrics, err := discovery.RegisterSDMetrics(reg, discovery.NewRefreshMetrics(reg))
	require.NoError(t, err)
	sdManager := discovery.NewManager(
		ctx,
		promslog.NewNopLogger(),
		reg,
		sdMetrics,
		discovery.Name("sd-manager"),
		discovery.Updatert(sdUpdatert),
	)
	go sdManager.Run()

	// Set up the notifier with both faulty and functional Alertmanagers.
	notifier := NewManager(
		&Options{
			QueueCapacity: alertsCount,
		},
		nil,
	)
	notifier.alertmanagers = make(map[string]*alertmanagerSet)
	amCfg := config.DefaultAlertmanagerConfig
	amCfg.Timeout = model.Duration(sendTimeout)
	notifier.alertmanagers["config-0"] = &alertmanagerSet{
		ams: []alertmanager{
			alertmanagerMock{
				urlf: func() string { return faultyURL.String() },
			},
			alertmanagerMock{
				urlf: func() string { return functionalURL.String() },
			},
		},
		cfg:     &amCfg,
		metrics: notifier.metrics,
		buffers: map[string]*buffer{
			faultyURL.String():     newBuffer(notifier.opts.QueueCapacity),
			functionalURL.String(): newBuffer(notifier.opts.QueueCapacity),
		},
		opts:   &Options{Do: do, MaxBatchSize: maxBatchSize},
		logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
	}

	for _, ams := range notifier.alertmanagers {
		for _, am := range ams.ams {
			go ams.sendLoop(am)
		}
	}

	go notifier.Run(sdManager.SyncCh())
	defer notifier.Stop()

	require.Len(t, notifier.Alertmanagers(), 2)

	// Enqueue the alerts.
	var alerts []*Alert
	for i := range make([]struct{}, alertsCount) {
		alerts = append(alerts, &Alert{
			Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
		})
	}
	notifier.Send(alerts...)

	// Wait for the Alertmanagers to start receiving alerts.
	// 10*sdUpdatert is used as an arbitrary timeout here.
	timeout := time.After(10 * sdUpdatert)
loop1:
	for {
		select {
		case <-timeout:
			t.Fatalf("Timeout waiting for the alertmanagers to be reached for the first time.")
		default:
			if faultyCalled.Load() && functionalCalled.Load() {
				break loop1
			}
		}
	}

	// Request to remove the faulty Alertmanager.
	c := map[string]discovery.Configs{
		"config-0": {
			discovery.StaticConfig{
				&targetgroup.Group{
					Targets: []model.LabelSet{
						{
							model.AddressLabel: model.LabelValue(functionalURL.Host),
						},
					},
				},
			},
		},
	}
	require.NoError(t, sdManager.ApplyConfig(c))

	// The notifier should not wait until the alerts queue is empty to apply the discovery changes.
	// A faulty Alertmanager could cause each alert sending cycle to take up to AlertmanagerConfig.Timeout.
	// The queue may never be emptied, as the arrival rate could be larger than the departure rate.
	// It could even overflow and alerts could be dropped.
	timeout = time.After(batches * sendTimeout)
loop2:
	for {
		select {
		case <-timeout:
			t.Fatalf("Timeout: the faulty Alertmanager was not removed in time.")
		default:
			// The faulty alertmanager was dropped.
			if len(notifier.Alertmanagers()) == 1 {
				// Guard against TOCTOU.
				for _, ams := range notifier.alertmanagers {
					for _, q := range ams.buffers {
						require.Zero(t, q.len())
					}
				}
				break loop2
			}
		}
	}
}

func TestStop_DrainingDisabled(t *testing.T) {
	releaseReceiver := make(chan struct{})
	receiverReceivedRequest := make(chan struct{}, 2)
	alertsReceived := atomic.NewInt64(0)

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Let the test know we've received a request.
		receiverReceivedRequest <- struct{}{}

		var alerts []*Alert

		b, err := io.ReadAll(r.Body)
		require.NoError(t, err)

		err = json.Unmarshal(b, &alerts)
		require.NoError(t, err)

		alertsReceived.Add(int64(len(alerts)))

		// Wait for the test to release us.
		<-releaseReceiver

		w.WriteHeader(http.StatusOK)
	}))
	defer func() {
		server.Close()
	}()

	m := NewManager(
		&Options{
			QueueCapacity:   10,
			DrainOnShutdown: false,
		},
		nil,
	)

	m.alertmanagers = make(map[string]*alertmanagerSet)

	am1Cfg := config.DefaultAlertmanagerConfig
	am1Cfg.Timeout = model.Duration(time.Second)

	m.alertmanagers["1"] = &alertmanagerSet{
		ams: []alertmanager{
			alertmanagerMock{
				urlf: func() string { return server.URL },
			},
		},
		cfg:     &am1Cfg,
		buffers: map[string]*buffer{server.URL: newBuffer(m.opts.QueueCapacity)},
		opts:    &Options{Do: do, MaxBatchSize: maxBatchSize},
		metrics: newAlertMetrics(prometheus.DefaultRegisterer, nil),
		logger:  slog.New(slog.NewTextHandler(io.Discard, nil)),
	}

	for _, ams := range m.alertmanagers {
		for _, am := range ams.ams {
			go ams.sendLoop(am)
		}
	}

	notificationManagerStopped := make(chan struct{})

	go func() {
		defer close(notificationManagerStopped)
		m.Run(nil)
	}()

	// Queue two alerts. The first should be immediately sent to the receiver, which should block until we release it later.
	m.Send(&Alert{Labels: labels.FromStrings(labels.AlertName, "alert-1")})

	select {
	case <-receiverReceivedRequest:
		// Nothing more to do.
	case <-time.After(time.Second):
		require.FailNow(t, "gave up waiting for receiver to receive notification of first alert")
	}

	m.Send(&Alert{Labels: labels.FromStrings(labels.AlertName, "alert-2")})

	// Stop the notification manager, pause to allow the shutdown to be observed, and then allow the receiver to proceed.
	m.Stop()
	time.Sleep(time.Second)
	close(releaseReceiver)

	// Wait for the notification manager to stop and confirm only the first notification was sent.
	// The second notification should be dropped.
	select {
	case <-notificationManagerStopped:
		// Nothing more to do.
	case <-time.After(time.Second):
		require.FailNow(t, "gave up waiting for notification manager to stop")
	}

	// At least one alert must have been delivered before notification manager stops.
	require.Positive(t, alertsReceived.Load())
}

func TestStop_DrainingEnabled(t *testing.T) {
	releaseReceiver := make(chan struct{})
	receiverReceivedRequest := make(chan struct{}, 2)
	alertsReceived := atomic.NewInt64(0)

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		var alerts []*Alert

		b, err := io.ReadAll(r.Body)
		require.NoError(t, err)

		err = json.Unmarshal(b, &alerts)
		require.NoError(t, err)

		alertsReceived.Add(int64(len(alerts)))

		// Let the test know we've received a request.
		receiverReceivedRequest <- struct{}{}

		// Wait for the test to release us.
		<-releaseReceiver

		w.WriteHeader(http.StatusOK)
	}))
	defer func() {
		server.Close()
	}()

	m := NewManager(
		&Options{
			QueueCapacity:   10,
			DrainOnShutdown: true,
		},
		nil,
	)

	m.alertmanagers = make(map[string]*alertmanagerSet)

	am1Cfg := config.DefaultAlertmanagerConfig
	am1Cfg.Timeout = model.Duration(time.Second)

	m.alertmanagers["1"] = &alertmanagerSet{
		ams: []alertmanager{
			alertmanagerMock{
				urlf: func() string { return server.URL },
			},
		},
		cfg:     &am1Cfg,
		buffers: map[string]*buffer{server.URL: newBuffer(m.opts.QueueCapacity)},
		opts:    &Options{Do: do, MaxBatchSize: maxBatchSize},
		metrics: m.metrics,
		logger:  slog.New(slog.NewTextHandler(io.Discard, nil)),
	}

	for _, ams := range m.alertmanagers {
		for _, am := range ams.ams {
			go ams.sendLoop(am)
		}
	}

	notificationManagerStopped := make(chan struct{})

	go func() {
		defer close(notificationManagerStopped)
		m.Run(nil)
	}()

	// Queue two alerts. The first should be immediately sent to the receiver, which should block until we release it later.
	m.Send(&Alert{Labels: labels.FromStrings(labels.AlertName, "alert-1")})

	select {
	case <-receiverReceivedRequest:
		// Nothing more to do.
	case <-time.After(time.Second):
		require.FailNow(t, "gave up waiting for receiver to receive notification of first alert")
	}

	m.Send(&Alert{Labels: labels.FromStrings(labels.AlertName, "alert-2")})

	// Stop the notification manager and allow the receiver to proceed.
	m.Stop()
	close(releaseReceiver)

	// Wait for the notification manager to stop and confirm both notifications were sent.
	select {
	case <-notificationManagerStopped:
		// Nothing more to do.
	case <-time.After(400 * time.Millisecond):
		require.FailNow(t, "gave up waiting for notification manager to stop")
	}
	<-receiverReceivedRequest
	require.Equal(t, int64(2), alertsReceived.Load())
}

func TestApplyConfig(t *testing.T) {
	targetURL := "alertmanager:9093"
	targetGroup := &targetgroup.Group{
		Targets: []model.LabelSet{
			{
				"__address__": model.LabelValue(targetURL),
			},
		},
	}
	alertmanagerURL := fmt.Sprintf("http://%s/api/v2/alerts", targetURL)

	n := NewManager(&Options{}, nil)
	cfg := &config.Config{}
	s := `
alerting:
  alertmanagers:
  - file_sd_configs:
    - files:
      - foo.json
`
	// 1. Ensure known alertmanagers are not dropped during ApplyConfig.
	require.NoError(t, yaml.UnmarshalStrict([]byte(s), cfg))
	require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 1)

	// First, apply the config and reload.
	require.NoError(t, n.ApplyConfig(cfg))
	tgs := map[string][]*targetgroup.Group{"config-0": {targetGroup}}
	n.reload(tgs)
	require.Len(t, n.Alertmanagers(), 1)
	require.Equal(t, alertmanagerURL, n.Alertmanagers()[0].String())

	// Reapply the config.
	require.NoError(t, n.ApplyConfig(cfg))
	// Ensure the known alertmanagers are not dropped.
	require.Len(t, n.Alertmanagers(), 1)
	require.Equal(t, alertmanagerURL, n.Alertmanagers()[0].String())

	// 2. Ensure known alertmanagers are not dropped during ApplyConfig even when
	// the config order changes.
	s = `
alerting:
  alertmanagers:
  - static_configs:
  - file_sd_configs:
    - files:
      - foo.json
`
	require.NoError(t, yaml.UnmarshalStrict([]byte(s), cfg))
	require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 2)

	require.NoError(t, n.ApplyConfig(cfg))
	require.Len(t, n.Alertmanagers(), 1)
	// Ensure no unnecessary alertmanagers are injected.
	require.Empty(t, n.alertmanagers["config-0"].ams)
	// Ensure the config order is taken into account.
	ams := n.alertmanagers["config-1"].ams
	require.Len(t, ams, 1)
	require.Equal(t, alertmanagerURL, ams[0].url().String())

	// 3. Ensure known alertmanagers are reused for new config with identical AlertmanagerConfig.
	s = `
alerting:
  alertmanagers:
  - file_sd_configs:
    - files:
      - foo.json
  - file_sd_configs:
    - files:
      - foo.json
`
	require.NoError(t, yaml.UnmarshalStrict([]byte(s), cfg))
	require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 2)

	require.NoError(t, n.ApplyConfig(cfg))
	require.Len(t, n.Alertmanagers(), 2)
	for cfgIdx := range 2 {
		ams := n.alertmanagers[fmt.Sprintf("config-%d", cfgIdx)].ams
		require.Len(t, ams, 1)
		require.Equal(t, alertmanagerURL, ams[0].url().String())
	}

	// 4. Ensure known alertmanagers are reused only for identical AlertmanagerConfig.
	s = `
alerting:
  alertmanagers:
  - file_sd_configs:
    - files:
      - foo.json
    path_prefix: /bar
  - file_sd_configs:
    - files:
      - foo.json
    relabel_configs:
    - source_labels: ['__address__']
      regex: 'doesntmatter:1234'
      action: drop
`
	require.NoError(t, yaml.UnmarshalStrict([]byte(s), cfg))
	require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 2)

	require.NoError(t, n.ApplyConfig(cfg))
	require.Empty(t, n.Alertmanagers())
}

// Regression test for https://github.com/prometheus/prometheus/issues/7676
// The test creates a black hole alertmanager that never responds to any requests.
// The alertmanager_config.timeout is set to infinite (1 year).
// We check that the notifier does not hang and throughput is not affected.
func TestNotifierQueueIndependentOfFailedAlertmanager(t *testing.T) {
	stopBlackHole := make(chan struct{})
	blackHoleAM := newBlackHoleAlertmanager(stopBlackHole)
	defer close(stopBlackHole)

	doneAlertReceive := make(chan struct{})
	immediateAM := newImmediateAlertManager(doneAlertReceive)

	h := NewManager(&Options{}, nil)
	h.alertmanagers = make(map[string]*alertmanagerSet)

	amCfg := config.DefaultAlertmanagerConfig
	amCfg.Timeout = model.Duration(time.Hour * 24 * 365)

	h.alertmanagers["1"] = &alertmanagerSet{
		ams: []alertmanager{
			alertmanagerMock{
				urlf: func() string { return blackHoleAM.URL },
			},
		},
		cfg:     &amCfg,
		opts:    &Options{Do: do, MaxBatchSize: maxBatchSize},
		buffers: map[string]*buffer{blackHoleAM.URL: newBuffer(10)},
		metrics: h.metrics,
	}
	h.alertmanagers["2"] = &alertmanagerSet{
		ams: []alertmanager{
			alertmanagerMock{
				urlf: func() string { return immediateAM.URL },
			},
		},
		cfg:     &amCfg,
		opts:    &Options{Do: do, MaxBatchSize: maxBatchSize},
		buffers: map[string]*buffer{immediateAM.URL: newBuffer(10)},
		metrics: h.metrics,
	}

	doneSendAll := make(chan struct{})
	go func() {
		for _, s := range h.alertmanagers {
			for _, am := range s.ams {
				go s.sendLoop(am)
			}
		}
		h.Send(&Alert{
			Labels: labels.FromStrings("alertname", "test"),
		})
		close(doneSendAll)
	}()

	select {
	case <-doneAlertReceive:
		// This is the happy case, the alert was received by the immediate alertmanager.
	case <-time.After(30 * time.Second):
		t.Fatal("Timeout waiting for alert to be received by immediate alertmanager")
	}

	select {
	case <-doneSendAll:
		// This is the happy case, the sendAll function returned.
	case <-time.After(30 * time.Second):
		t.Fatal("Timeout waiting for sendAll to return")
	}
}

func newBlackHoleAlertmanager(stop <-chan struct{}) *httptest.Server {
	return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		// Do nothing, wait to be canceled.
		<-stop
		w.WriteHeader(http.StatusOK)
	}))
}

func newImmediateAlertManager(done chan<- struct{}) *httptest.Server {
	return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		close(done)
	}))
}