Independent Alertmanager queues avoid queue overflow when one or more Alertmanager instances are unavailable, which could otherwise result in lost alert notifications. The buffered queues are managed per AlertmanagerSet and are dynamically added/removed on service discovery updates or configuration reload. The following metrics now include an extra dimension for the alertmanager label:
- prometheus_notifications_dropped_total
- prometheus_notifications_queue_capacity
- prometheus_notifications_queue_length

This change also includes the test from #14099.

Closes #7676

Signed-off-by: Siavash Safi <siavash@cloudflare.com>
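As a rough illustration of the per-endpoint queueing described above, the sketch below keeps one bounded queue per Alertmanager URL, so a full queue for an unreachable instance only drops that instance's oldest alerts. It is a minimal, self-contained example and not the notifier's actual implementation; the amSet type, its queues field, and enqueue are illustrative names.

package main

import "fmt"

// amSet is a minimal sketch of an Alertmanager set that keeps one bounded
// queue per endpoint URL (illustrative types, not the real notifier buffer).
type amSet struct {
	queues map[string]chan string
}

// enqueue adds an alert to every endpoint's queue. When a queue is full, the
// oldest entry for that endpoint is dropped so a slow or unreachable
// Alertmanager never blocks delivery to the others.
func (s *amSet) enqueue(alert string) {
	for url, q := range s.queues {
		select {
		case q <- alert:
		default:
			<-q // this endpoint's queue is full: drop its oldest alert
			q <- alert
			fmt.Printf("dropped one alert queued for %s\n", url)
		}
	}
}

func main() {
	s := &amSet{queues: map[string]chan string{
		"http://am-1:9093": make(chan string, 2),
		"http://am-2:9093": make(chan string, 2),
	}}
	for i := 0; i < 3; i++ {
		s.enqueue(fmt.Sprintf("alert-%d", i))
	}
}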
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package notifier

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"net/http/httptest"
	"net/url"
	"strconv"
	"testing"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
	config_util "github.com/prometheus/common/config"
	"github.com/prometheus/common/model"
	"github.com/prometheus/common/promslog"
	"github.com/stretchr/testify/require"
	"go.uber.org/atomic"
	"gopkg.in/yaml.v2"

	"github.com/prometheus/prometheus/config"
	"github.com/prometheus/prometheus/discovery"
	_ "github.com/prometheus/prometheus/discovery/file"
	"github.com/prometheus/prometheus/discovery/targetgroup"
	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/model/relabel"
)

const maxBatchSize = 256
|
|
|
|
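// TestHandlerSendBatch verifies that alerts passed to Send end up in the
// per-Alertmanager buffer and are popped in order in batches of at most maxBatchSize.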
func TestHandlerSendBatch(t *testing.T) {
|
|
h := NewManager(&Options{}, nil)
|
|
|
|
b := newBuffer(10_000)
|
|
h.alertmanagers = map[string]*alertmanagerSet{
|
|
"mock": {
|
|
ams: []alertmanager{
|
|
alertmanagerMock{
|
|
urlf: func() string { return "http://mock" },
|
|
},
|
|
},
|
|
cfg: &config.DefaultAlertmanagerConfig,
|
|
buffers: map[string]*buffer{"http://mock": b},
|
|
},
|
|
}
|
|
|
|
var alerts []*Alert
|
|
for i := range make([]struct{}, 2*maxBatchSize+1) {
|
|
alerts = append(alerts, &Alert{
|
|
Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
|
|
})
|
|
}
|
|
h.Send(alerts...)
|
|
|
|
expected := append([]*Alert{}, alerts...)
|
|
|
|
batch := make([]*Alert, maxBatchSize)
|
|
|
|
b.pop(&batch)
|
|
require.NoError(t, alertsEqual(expected[0:maxBatchSize], batch))
|
|
|
|
b.pop(&batch)
|
|
require.NoError(t, alertsEqual(expected[maxBatchSize:2*maxBatchSize], batch))
|
|
|
|
b.pop(&batch)
|
|
require.NoError(t, alertsEqual(expected[2*maxBatchSize:], batch))
|
|
}
|
|
|
|
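// alertsEqual returns an error if the two alert slices differ in length or in
// the labels of any alert at the same index.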
func alertsEqual(a, b []*Alert) error {
|
|
if len(a) != len(b) {
|
|
return fmt.Errorf("length mismatch: %v != %v", a, b)
|
|
}
|
|
for i, alert := range a {
|
|
if !labels.Equal(alert.Labels, b[i].Labels) {
|
|
return fmt.Errorf("label mismatch at index %d: %s != %s", i, alert.Labels, b[i].Labels)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
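// newTestHTTPServerBuilder returns a test Alertmanager endpoint that validates
// basic-auth credentials, compares the posted alerts against *expected, reports
// any mismatch on errc, and replies with the status code currently stored in status.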
func newTestHTTPServerBuilder(expected *[]*Alert, errc chan<- error, u, p string, status *atomic.Int32) *httptest.Server {
|
|
return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
var err error
|
|
defer func() {
|
|
if err == nil {
|
|
return
|
|
}
|
|
select {
|
|
case errc <- err:
|
|
default:
|
|
}
|
|
}()
|
|
user, pass, _ := r.BasicAuth()
|
|
if user != u || pass != p {
|
|
err = fmt.Errorf("unexpected user/password: %s/%s != %s/%s", user, pass, u, p)
|
|
w.WriteHeader(http.StatusInternalServerError)
|
|
return
|
|
}
|
|
|
|
b, err := io.ReadAll(r.Body)
|
|
if err != nil {
|
|
err = fmt.Errorf("error reading body: %w", err)
|
|
w.WriteHeader(http.StatusInternalServerError)
|
|
return
|
|
}
|
|
|
|
var alerts []*Alert
|
|
err = json.Unmarshal(b, &alerts)
|
|
if err == nil {
|
|
err = alertsEqual(*expected, alerts)
|
|
}
|
|
w.WriteHeader(int(status.Load()))
|
|
}))
|
|
}
|
|
|
|
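// getCounterValue returns the current value of the counter child identified by
// the given label values.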
func getCounterValue(t *testing.T, metric *prometheus.CounterVec, labels ...string) float64 {
|
|
t.Helper()
|
|
m := &dto.Metric{}
|
|
if err := metric.WithLabelValues(labels...).Write(m); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
return m.Counter.GetValue()
|
|
}
|
|
|
|
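// TestHandlerSendAll sends alerts to several Alertmanager sets and checks the
// per-Alertmanager error counters as individual endpoints fail and recover.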
func TestHandlerSendAll(t *testing.T) {
|
|
var (
|
|
errc = make(chan error, 1)
|
|
expected = make([]*Alert, 0)
|
|
status1, status2, status3 atomic.Int32
|
|
errors1, errors2, errors3 float64
|
|
)
|
|
status1.Store(int32(http.StatusOK))
|
|
status2.Store(int32(http.StatusOK))
|
|
status3.Store(int32(http.StatusOK))
|
|
|
|
server1 := newTestHTTPServerBuilder(&expected, errc, "prometheus", "testing_password", &status1)
|
|
server2 := newTestHTTPServerBuilder(&expected, errc, "", "", &status2)
|
|
server3 := newTestHTTPServerBuilder(&expected, errc, "", "", &status3)
|
|
defer server1.Close()
|
|
defer server2.Close()
|
|
defer server3.Close()
|
|
|
|
h := NewManager(&Options{}, nil)
|
|
|
|
authClient, _ := config_util.NewClientFromConfig(
|
|
config_util.HTTPClientConfig{
|
|
BasicAuth: &config_util.BasicAuth{
|
|
Username: "prometheus",
|
|
Password: "testing_password",
|
|
},
|
|
}, "auth_alertmanager")
|
|
|
|
h.alertmanagers = make(map[string]*alertmanagerSet)
|
|
|
|
am1Cfg := config.DefaultAlertmanagerConfig
|
|
am1Cfg.Timeout = model.Duration(time.Second)
|
|
|
|
am2Cfg := config.DefaultAlertmanagerConfig
|
|
am2Cfg.Timeout = model.Duration(time.Second)
|
|
|
|
am3Cfg := config.DefaultAlertmanagerConfig
|
|
am3Cfg.Timeout = model.Duration(time.Second)
|
|
|
|
opts := &Options{Do: do, QueueCapacity: 10_000, MaxBatchSize: maxBatchSize}
|
|
logger := slog.New(slog.NewTextHandler(io.Discard, nil))
|
|
|
|
h.alertmanagers["1"] = &alertmanagerSet{
|
|
ams: []alertmanager{
|
|
alertmanagerMock{
|
|
urlf: func() string { return server1.URL },
|
|
},
|
|
},
|
|
cfg: &am1Cfg,
|
|
client: authClient,
|
|
buffers: map[string]*buffer{server1.URL: newBuffer(opts.QueueCapacity)},
|
|
opts: opts,
|
|
metrics: h.metrics,
|
|
logger: logger,
|
|
}
|
|
|
|
h.alertmanagers["2"] = &alertmanagerSet{
|
|
ams: []alertmanager{
|
|
alertmanagerMock{
|
|
urlf: func() string { return server2.URL },
|
|
},
|
|
alertmanagerMock{
|
|
urlf: func() string { return server3.URL },
|
|
},
|
|
},
|
|
cfg: &am2Cfg,
|
|
buffers: map[string]*buffer{
|
|
server2.URL: newBuffer(opts.QueueCapacity),
|
|
server3.URL: newBuffer(opts.QueueCapacity),
|
|
},
|
|
opts: opts,
|
|
metrics: h.metrics,
|
|
logger: logger,
|
|
}
|
|
|
|
h.alertmanagers["3"] = &alertmanagerSet{
|
|
ams: []alertmanager{}, // empty set
|
|
cfg: &am3Cfg,
|
|
buffers: make(map[string]*buffer),
|
|
opts: opts,
|
|
metrics: h.metrics,
|
|
logger: logger,
|
|
}
|
|
|
|
var alerts []*Alert
|
|
for i := range make([]struct{}, maxBatchSize) {
|
|
alerts = append(alerts, &Alert{
|
|
Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
|
|
})
|
|
expected = append(expected, &Alert{
|
|
Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
|
|
})
|
|
}
|
|
|
|
checkNoErr := func() {
|
|
t.Helper()
|
|
select {
|
|
case err := <-errc:
|
|
require.NoError(t, err)
|
|
default:
|
|
}
|
|
}
|
|
|
|
// start send loops
|
|
for _, ams := range h.alertmanagers {
|
|
for _, am := range ams.ams {
|
|
go ams.sendLoop(am)
|
|
}
|
|
}
|
|
|
|
// all ams in all sets are up
|
|
h.Send(alerts...)
|
|
time.Sleep(time.Second)
|
|
|
|
// snapshot error metrics and check them
|
|
errors1 = getCounterValue(t, h.metrics.errors, server1.URL)
|
|
errors2 = getCounterValue(t, h.metrics.errors, server2.URL)
|
|
errors3 = getCounterValue(t, h.metrics.errors, server3.URL)
|
|
require.Zero(t, errors1, "server1 has unexpected send errors")
|
|
require.Zero(t, errors2, "server2 has unexpected send errors")
|
|
require.Zero(t, errors3, "server3 has unexpected send errors")
|
|
checkNoErr()
|
|
|
|
// the only am in set 1 is down
|
|
status1.Store(int32(http.StatusNotFound))
|
|
h.Send(alerts...)
|
|
time.Sleep(time.Second)
|
|
|
|
errors1 = getCounterValue(t, h.metrics.errors, server1.URL)
|
|
errors2 = getCounterValue(t, h.metrics.errors, server2.URL)
|
|
errors3 = getCounterValue(t, h.metrics.errors, server3.URL)
|
|
require.NotZero(t, errors1, "server1 has no send errors")
|
|
require.Zero(t, errors2, "server2 has unexpected send errors")
|
|
require.Zero(t, errors3, "server3 has unexpected send errors")
|
|
checkNoErr()
|
|
|
|
// reset it
|
|
status1.Store(int32(http.StatusOK))
|
|
|
|
// reset metrics
|
|
h.metrics.errors.Reset()
|
|
|
|
// only one of the ams in set 2 is down
|
|
status2.Store(int32(http.StatusInternalServerError))
|
|
h.Send(alerts...)
|
|
time.Sleep(time.Second)
|
|
|
|
errors1 = getCounterValue(t, h.metrics.errors, server1.URL)
|
|
errors2 = getCounterValue(t, h.metrics.errors, server2.URL)
|
|
errors3 = getCounterValue(t, h.metrics.errors, server3.URL)
|
|
require.Zero(t, errors1, "server1 has unexpected send errors")
|
|
require.NotZero(t, errors2, "server2 has no send errors")
|
|
require.Zero(t, errors3, "server3 has unexpected send errors")
|
|
checkNoErr()
|
|
|
|
// both ams in set 2 are down
|
|
status3.Store(int32(http.StatusInternalServerError))
|
|
h.Send(alerts...)
|
|
time.Sleep(time.Second)
|
|
|
|
errors1 = getCounterValue(t, h.metrics.errors, server1.URL)
|
|
errors2 = getCounterValue(t, h.metrics.errors, server2.URL)
|
|
errors3 = getCounterValue(t, h.metrics.errors, server3.URL)
|
|
require.Zero(t, errors1, "server1 has unexpected send errors")
|
|
require.NotZero(t, errors2, "server2 has no send errors")
|
|
require.NotZero(t, errors3, "server3 has no send errors")
|
|
checkNoErr()
|
|
|
|
// stop send routines by closing buffers
|
|
for _, ams := range h.alertmanagers {
|
|
for _, q := range ams.buffers {
|
|
q.close()
|
|
}
|
|
}
|
|
}
|
|
|
|
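// TestHandlerSendAllRemapPerAm verifies that alert relabeling is applied per
// Alertmanager set, so each endpoint only receives the alerts that survive its
// own relabel rules, and a set whose alerts are all dropped records no send
// errors even when its endpoint is down.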
func TestHandlerSendAllRemapPerAm(t *testing.T) {
|
|
var (
|
|
errc = make(chan error, 1)
|
|
expected1 = make([]*Alert, 0)
|
|
expected2 = make([]*Alert, 0)
|
|
expected3 = make([]*Alert, 0)
|
|
|
|
status1, status2, status3 atomic.Int32
|
|
errors1, errors2, errors3 float64
|
|
)
|
|
status1.Store(int32(http.StatusOK))
|
|
status2.Store(int32(http.StatusOK))
|
|
status3.Store(int32(http.StatusOK))
|
|
|
|
server1 := newTestHTTPServerBuilder(&expected1, errc, "", "", &status1)
|
|
server2 := newTestHTTPServerBuilder(&expected2, errc, "", "", &status2)
|
|
server3 := newTestHTTPServerBuilder(&expected3, errc, "", "", &status3)
|
|
|
|
defer server1.Close()
|
|
defer server2.Close()
|
|
defer server3.Close()
|
|
|
|
h := NewManager(&Options{}, nil)
|
|
h.alertmanagers = make(map[string]*alertmanagerSet)
|
|
|
|
am1Cfg := config.DefaultAlertmanagerConfig
|
|
am1Cfg.Timeout = model.Duration(time.Second)
|
|
|
|
am2Cfg := config.DefaultAlertmanagerConfig
|
|
am2Cfg.Timeout = model.Duration(time.Second)
|
|
am2Cfg.AlertRelabelConfigs = []*relabel.Config{
|
|
{
|
|
SourceLabels: model.LabelNames{"alertnamedrop"},
|
|
Action: "drop",
|
|
Regex: relabel.MustNewRegexp(".+"),
|
|
},
|
|
}
|
|
|
|
am3Cfg := config.DefaultAlertmanagerConfig
|
|
am3Cfg.Timeout = model.Duration(time.Second)
|
|
am3Cfg.AlertRelabelConfigs = []*relabel.Config{
|
|
{
|
|
SourceLabels: model.LabelNames{"alertname"},
|
|
Action: "drop",
|
|
Regex: relabel.MustNewRegexp(".+"),
|
|
},
|
|
}
|
|
|
|
opts := &Options{Do: do, QueueCapacity: 10_000, MaxBatchSize: maxBatchSize}
|
|
logger := slog.New(slog.NewTextHandler(io.Discard, nil))
|
|
|
|
h.alertmanagers = map[string]*alertmanagerSet{
|
|
// Drop no alerts.
|
|
"1": {
|
|
ams: []alertmanager{
|
|
alertmanagerMock{
|
|
urlf: func() string { return server1.URL },
|
|
},
|
|
},
|
|
cfg: &am1Cfg,
|
|
buffers: map[string]*buffer{server1.URL: newBuffer(opts.QueueCapacity)},
|
|
opts: opts,
|
|
metrics: h.metrics,
|
|
logger: logger,
|
|
},
|
|
// Drop only alerts with the "alertnamedrop" label.
|
|
"2": {
|
|
ams: []alertmanager{
|
|
alertmanagerMock{
|
|
urlf: func() string { return server2.URL },
|
|
},
|
|
},
|
|
cfg: &am2Cfg,
|
|
buffers: map[string]*buffer{server2.URL: newBuffer(opts.QueueCapacity)},
|
|
opts: opts,
|
|
metrics: h.metrics,
|
|
logger: logger,
|
|
},
|
|
// Drop all alerts.
|
|
"3": {
|
|
ams: []alertmanager{
|
|
alertmanagerMock{
|
|
urlf: func() string { return server3.URL },
|
|
},
|
|
},
|
|
cfg: &am3Cfg,
|
|
buffers: map[string]*buffer{server3.URL: newBuffer(opts.QueueCapacity)},
|
|
opts: opts,
|
|
metrics: h.metrics,
|
|
logger: logger,
|
|
},
|
|
// Empty list of Alertmanager endpoints.
|
|
"4": {
|
|
ams: []alertmanager{},
|
|
cfg: &config.DefaultAlertmanagerConfig,
|
|
buffers: make(map[string]*buffer),
|
|
opts: opts,
|
|
metrics: h.metrics,
|
|
logger: logger,
|
|
},
|
|
}
|
|
|
|
var alerts []*Alert
|
|
for i := range make([]struct{}, maxBatchSize/2) {
|
|
alerts = append(alerts,
|
|
&Alert{
|
|
Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
|
|
},
|
|
&Alert{
|
|
Labels: labels.FromStrings("alertname", "test", "alertnamedrop", strconv.Itoa(i)),
|
|
},
|
|
)
|
|
|
|
expected1 = append(expected1,
|
|
&Alert{
|
|
Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
|
|
}, &Alert{
|
|
Labels: labels.FromStrings("alertname", "test", "alertnamedrop", strconv.Itoa(i)),
|
|
},
|
|
)
|
|
|
|
expected2 = append(expected2, &Alert{
|
|
Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
|
|
})
|
|
}
|
|
|
|
checkNoErr := func() {
|
|
t.Helper()
|
|
select {
|
|
case err := <-errc:
|
|
require.NoError(t, err)
|
|
default:
|
|
}
|
|
}
|
|
|
|
// start send loops
|
|
for _, ams := range h.alertmanagers {
|
|
for _, am := range ams.ams {
|
|
go ams.sendLoop(am)
|
|
}
|
|
}
|
|
|
|
// all ams are up
|
|
h.Send(alerts...)
|
|
time.Sleep(time.Second)
|
|
|
|
// snapshot error metrics and check them
|
|
errors1 = getCounterValue(t, h.metrics.errors, server1.URL)
|
|
errors2 = getCounterValue(t, h.metrics.errors, server2.URL)
|
|
errors3 = getCounterValue(t, h.metrics.errors, server3.URL)
|
|
require.Zero(t, errors1, "server1 has unexpected send errors")
|
|
require.Zero(t, errors2, "server2 has unexpected send errors")
|
|
require.Zero(t, errors3, "server3 has unexpected send errors")
|
|
checkNoErr()
|
|
|
|
// the only am in set 1 goes down
|
|
status1.Store(int32(http.StatusInternalServerError))
|
|
h.Send(alerts...)
|
|
time.Sleep(time.Second)
|
|
|
|
errors1 = getCounterValue(t, h.metrics.errors, server1.URL)
|
|
errors2 = getCounterValue(t, h.metrics.errors, server2.URL)
|
|
errors3 = getCounterValue(t, h.metrics.errors, server3.URL)
|
|
require.NotZero(t, errors1, "server1 has no send errors")
|
|
require.Zero(t, errors2, "server2 has unexpected send errors")
|
|
require.Zero(t, errors3, "server3 has unexpected send errors")
|
|
checkNoErr()
|
|
|
|
// reset set 1
|
|
status1.Store(int32(http.StatusOK))
|
|
|
|
// reset metrics
|
|
h.metrics.errors.Reset()
|
|
|
|
// set 3 loses its only am, but all alerts to it are dropped by relabeling,
// so there is nothing to send and no send errors are expected.
|
|
status3.Store(int32(http.StatusInternalServerError))
|
|
h.Send(alerts...)
|
|
time.Sleep(3 * time.Second)
|
|
|
|
errors1 = getCounterValue(t, h.metrics.errors, server1.URL)
|
|
errors2 = getCounterValue(t, h.metrics.errors, server2.URL)
|
|
errors3 = getCounterValue(t, h.metrics.errors, server3.URL)
|
|
require.Zero(t, errors1, "server1 has unexpected send errors")
|
|
require.Zero(t, errors2, "server2 has unexpected send errors")
|
|
require.Zero(t, errors3, "server3 has unexpected send errors")
|
|
checkNoErr()
|
|
|
|
// stop send routines by closing buffers
|
|
for _, ams := range h.alertmanagers {
|
|
for _, q := range ams.buffers {
|
|
q.close()
|
|
}
|
|
}
|
|
|
|
// Verify that individual locks are released.
|
|
for k := range h.alertmanagers {
|
|
h.alertmanagers[k].mtx.Lock()
|
|
h.alertmanagers[k].ams = nil
|
|
h.alertmanagers[k].mtx.Unlock()
|
|
}
|
|
}
|
|
|
|
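// TestExternalLabels checks that external labels are attached to outgoing
// alerts and can be overridden through relabeling.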
func TestExternalLabels(t *testing.T) {
|
|
h := NewManager(&Options{
|
|
QueueCapacity: 3 * maxBatchSize,
|
|
MaxBatchSize: maxBatchSize,
|
|
ExternalLabels: labels.FromStrings("a", "b"),
|
|
RelabelConfigs: []*relabel.Config{
|
|
{
|
|
SourceLabels: model.LabelNames{"alertname"},
|
|
TargetLabel: "a",
|
|
Action: "replace",
|
|
Regex: relabel.MustNewRegexp("externalrelabelthis"),
|
|
Replacement: "c",
|
|
},
|
|
},
|
|
}, nil)
|
|
|
|
queue := newBuffer(h.opts.QueueCapacity)
|
|
h.alertmanagers = map[string]*alertmanagerSet{
|
|
"test": {
|
|
buffers: map[string]*buffer{"test": queue},
|
|
cfg: &config.AlertmanagerConfig{
|
|
RelabelConfigs: h.opts.RelabelConfigs,
|
|
},
|
|
},
|
|
}
|
|
|
|
// This alert should get the external label attached.
|
|
h.Send(&Alert{
|
|
Labels: labels.FromStrings("alertname", "test"),
|
|
})
|
|
|
|
// This alert should get the external label attached, but then set to "c"
|
|
// through relabelling.
|
|
h.Send(&Alert{
|
|
Labels: labels.FromStrings("alertname", "externalrelabelthis"),
|
|
})
|
|
|
|
alerts := make([]*Alert, maxBatchSize)
|
|
queue.pop(&alerts)
|
|
|
|
expected := []*Alert{
|
|
{Labels: labels.FromStrings("alertname", "test", "a", "b")},
|
|
{Labels: labels.FromStrings("alertname", "externalrelabelthis", "a", "c")},
|
|
}
|
|
|
|
require.NoError(t, alertsEqual(expected, alerts))
|
|
}
|
|
|
|
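// TestHandlerRelabel checks that alert relabeling drops and renames alerts
// before they are queued.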
func TestHandlerRelabel(t *testing.T) {
|
|
h := NewManager(&Options{
|
|
QueueCapacity: 3 * maxBatchSize,
|
|
MaxBatchSize: maxBatchSize,
|
|
RelabelConfigs: []*relabel.Config{
|
|
{
|
|
SourceLabels: model.LabelNames{"alertname"},
|
|
Action: "drop",
|
|
Regex: relabel.MustNewRegexp("drop"),
|
|
},
|
|
{
|
|
SourceLabels: model.LabelNames{"alertname"},
|
|
TargetLabel: "alertname",
|
|
Action: "replace",
|
|
Regex: relabel.MustNewRegexp("rename"),
|
|
Replacement: "renamed",
|
|
},
|
|
},
|
|
}, nil)
|
|
|
|
queue := newBuffer(h.opts.QueueCapacity)
|
|
h.alertmanagers = map[string]*alertmanagerSet{
|
|
"test": {
|
|
buffers: map[string]*buffer{"test": queue},
|
|
cfg: &config.AlertmanagerConfig{
|
|
RelabelConfigs: h.opts.RelabelConfigs,
|
|
},
|
|
},
|
|
}
|
|
|
|
// This alert should be dropped due to the configuration
|
|
h.Send(&Alert{
|
|
Labels: labels.FromStrings("alertname", "drop"),
|
|
})
|
|
|
|
// This alert should be replaced due to the configuration
|
|
h.Send(&Alert{
|
|
Labels: labels.FromStrings("alertname", "rename"),
|
|
})
|
|
|
|
alerts := make([]*Alert, maxBatchSize)
|
|
queue.pop(&alerts)
|
|
|
|
expected := []*Alert{
|
|
{Labels: labels.FromStrings("alertname", "renamed")},
|
|
}
|
|
|
|
require.NoError(t, alertsEqual(expected, alerts))
|
|
}
|
|
|
|
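// TestHandlerQueuing verifies that a full per-Alertmanager queue is truncated
// from the front, dropping the oldest batches while the receiver is blocked.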
func TestHandlerQueuing(t *testing.T) {
|
|
var (
|
|
expectedc = make(chan []*Alert)
|
|
called = make(chan struct{})
|
|
done = make(chan struct{})
|
|
errc = make(chan error, 1)
|
|
)
|
|
|
|
server := httptest.NewServer(http.HandlerFunc(func(_ http.ResponseWriter, r *http.Request) {
|
|
// Notify the test function that we have received something.
|
|
select {
|
|
case called <- struct{}{}:
|
|
case <-done:
|
|
return
|
|
}
|
|
|
|
// Wait for the test function to unblock us.
|
|
select {
|
|
case expected := <-expectedc:
|
|
var alerts []*Alert
|
|
|
|
b, err := io.ReadAll(r.Body)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
err = json.Unmarshal(b, &alerts)
|
|
if err == nil {
|
|
err = alertsEqual(expected, alerts)
|
|
}
|
|
select {
|
|
case errc <- err:
|
|
default:
|
|
}
|
|
case <-done:
|
|
}
|
|
}))
|
|
defer func() {
|
|
close(done)
|
|
server.Close()
|
|
}()
|
|
|
|
h := NewManager(
|
|
&Options{
|
|
QueueCapacity: 3 * maxBatchSize,
|
|
MaxBatchSize: maxBatchSize,
|
|
},
|
|
nil,
|
|
)
|
|
|
|
h.alertmanagers = make(map[string]*alertmanagerSet)
|
|
|
|
am1Cfg := config.DefaultAlertmanagerConfig
|
|
am1Cfg.Timeout = model.Duration(time.Second)
|
|
|
|
h.alertmanagers["1"] = &alertmanagerSet{
|
|
ams: []alertmanager{
|
|
alertmanagerMock{
|
|
urlf: func() string { return server.URL },
|
|
},
|
|
},
|
|
cfg: &am1Cfg,
|
|
buffers: map[string]*buffer{server.URL: newBuffer(h.opts.QueueCapacity)},
|
|
metrics: h.metrics,
|
|
opts: &Options{Do: do, MaxBatchSize: maxBatchSize},
|
|
logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
|
|
}
|
|
|
|
for _, ams := range h.alertmanagers {
|
|
for _, am := range ams.ams {
|
|
go ams.sendLoop(am)
|
|
}
|
|
}
|
|
|
|
go h.Run(nil)
|
|
defer h.Stop()
|
|
|
|
var alerts []*Alert
|
|
for i := range make([]struct{}, 20*maxBatchSize) {
|
|
alerts = append(alerts, &Alert{
|
|
Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
|
|
})
|
|
}
|
|
|
|
assertAlerts := func(expected []*Alert) {
|
|
t.Helper()
|
|
for {
|
|
select {
|
|
case <-called:
|
|
expectedc <- expected
|
|
case err := <-errc:
|
|
require.NoError(t, err)
|
|
return
|
|
case <-time.After(5 * time.Second):
|
|
require.FailNow(t, "Alerts were not pushed.")
|
|
}
|
|
}
|
|
}
|
|
|
|
// If the batch is larger than the queue capacity, it should be truncated
|
|
// from the front.
|
|
h.Send(alerts[:4*maxBatchSize]...)
|
|
for i := 1; i < 4; i++ {
|
|
assertAlerts(alerts[i*maxBatchSize : (i+1)*maxBatchSize])
|
|
}
|
|
|
|
// Send one batch, wait for it to arrive and block the server so the queue fills up.
|
|
h.Send(alerts[:maxBatchSize]...)
|
|
<-called
|
|
|
|
// Send several batches while the server is still blocked so the queue
|
|
// fills up to its maximum capacity (3*maxBatchSize). Then check that the
|
|
// queue is truncated in the front.
|
|
h.Send(alerts[1*maxBatchSize : 2*maxBatchSize]...) // this batch should be dropped.
|
|
h.Send(alerts[2*maxBatchSize : 3*maxBatchSize]...)
|
|
h.Send(alerts[3*maxBatchSize : 4*maxBatchSize]...)
|
|
|
|
// Send the batch that drops the first one.
|
|
h.Send(alerts[4*maxBatchSize : 5*maxBatchSize]...)
|
|
|
|
// Unblock the server.
|
|
expectedc <- alerts[:maxBatchSize]
|
|
select {
|
|
case err := <-errc:
|
|
require.NoError(t, err)
|
|
case <-time.After(5 * time.Second):
|
|
require.FailNow(t, "Alerts were not pushed.")
|
|
}
|
|
|
|
// Verify that we receive the last 3 batches.
|
|
for i := 2; i < 5; i++ {
|
|
assertAlerts(alerts[i*maxBatchSize : (i+1)*maxBatchSize])
|
|
}
|
|
}
|
|
|
|
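// alertmanagerMock is a minimal alertmanager implementation whose URL is
// supplied by urlf.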
type alertmanagerMock struct {
|
|
urlf func() string
|
|
}
|
|
|
|
func (a alertmanagerMock) url() *url.URL {
|
|
u, err := url.Parse(a.urlf())
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
return u
|
|
}
|
|
|
|
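// TestReload checks that reloading target groups from service discovery yields
// the expected Alertmanager URLs.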
func TestReload(t *testing.T) {
|
|
tests := []struct {
|
|
in *targetgroup.Group
|
|
out string
|
|
}{
|
|
{
|
|
in: &targetgroup.Group{
|
|
Targets: []model.LabelSet{
|
|
{
|
|
"__address__": "alertmanager:9093",
|
|
},
|
|
},
|
|
},
|
|
out: "http://alertmanager:9093/api/v2/alerts",
|
|
},
|
|
}
|
|
|
|
n := NewManager(&Options{}, nil)
|
|
|
|
cfg := &config.Config{}
|
|
s := `
|
|
alerting:
|
|
alertmanagers:
|
|
- static_configs:
|
|
`
|
|
err := yaml.UnmarshalStrict([]byte(s), cfg)
|
|
require.NoError(t, err, "Unable to load YAML config.")
|
|
require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 1)
|
|
|
|
err = n.ApplyConfig(cfg)
|
|
require.NoError(t, err, "Error applying the config.")
|
|
|
|
tgs := make(map[string][]*targetgroup.Group)
|
|
for _, tt := range tests {
|
|
for k := range cfg.AlertingConfig.AlertmanagerConfigs.ToMap() {
|
|
tgs[k] = []*targetgroup.Group{
|
|
tt.in,
|
|
}
|
|
break
|
|
}
|
|
n.reload(tgs)
|
|
res := n.Alertmanagers()[0].String()
|
|
|
|
require.Equal(t, tt.out, res)
|
|
}
|
|
}
|
|
|
|
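// TestDroppedAlertmanagers checks that targets removed by relabel_configs are
// reported via DroppedAlertmanagers.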
func TestDroppedAlertmanagers(t *testing.T) {
|
|
tests := []struct {
|
|
in *targetgroup.Group
|
|
out string
|
|
}{
|
|
{
|
|
in: &targetgroup.Group{
|
|
Targets: []model.LabelSet{
|
|
{
|
|
"__address__": "alertmanager:9093",
|
|
},
|
|
},
|
|
},
|
|
out: "http://alertmanager:9093/api/v2/alerts",
|
|
},
|
|
}
|
|
|
|
n := NewManager(&Options{}, nil)
|
|
|
|
cfg := &config.Config{}
|
|
s := `
|
|
alerting:
|
|
alertmanagers:
|
|
- static_configs:
|
|
relabel_configs:
|
|
- source_labels: ['__address__']
|
|
regex: 'alertmanager:9093'
|
|
action: drop
|
|
`
|
|
err := yaml.UnmarshalStrict([]byte(s), cfg)
|
|
require.NoError(t, err, "Unable to load YAML config.")
|
|
require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 1)
|
|
|
|
err = n.ApplyConfig(cfg)
|
|
require.NoError(t, err, "Error applying the config.")
|
|
|
|
tgs := make(map[string][]*targetgroup.Group)
|
|
for _, tt := range tests {
|
|
for k := range cfg.AlertingConfig.AlertmanagerConfigs.ToMap() {
|
|
tgs[k] = []*targetgroup.Group{
|
|
tt.in,
|
|
}
|
|
break
|
|
}
|
|
|
|
n.reload(tgs)
|
|
res := n.DroppedAlertmanagers()[0].String()
|
|
|
|
require.Equal(t, res, tt.out)
|
|
}
|
|
}
|
|
|
|
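// makeInputTargetGroup returns a fixed target group with a single target, a
// shared group label, and a test source name.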
func makeInputTargetGroup() *targetgroup.Group {
|
|
return &targetgroup.Group{
|
|
Targets: []model.LabelSet{
|
|
{
|
|
model.AddressLabel: model.LabelValue("1.1.1.1:9090"),
|
|
model.LabelName("notcommon1"): model.LabelValue("label"),
|
|
},
|
|
},
|
|
Labels: model.LabelSet{
|
|
model.LabelName("common"): model.LabelValue("label"),
|
|
},
|
|
Source: "testsource",
|
|
}
|
|
}
|
|
|
|
// TestHangingNotifier ensures that the notifier takes into account SD changes even when there are
|
|
// queued alerts. This test reproduces the issue described in https://github.com/prometheus/prometheus/issues/13676.
|
|
// and https://github.com/prometheus/prometheus/issues/8768.
|
|
// TODO: Drop this test as we have independent queues per alertmanager now.
|
|
func TestHangingNotifier(t *testing.T) {
|
|
const (
|
|
batches = 100
|
|
alertsCount = maxBatchSize * batches
|
|
)
|
|
|
|
var (
|
|
sendTimeout = 100 * time.Millisecond
|
|
sdUpdatert = sendTimeout / 2
|
|
|
|
done = make(chan struct{})
|
|
)
|
|
|
|
defer func() {
|
|
close(done)
|
|
}()
|
|
|
|
// Set up a faulty Alertmanager.
|
|
var faultyCalled atomic.Bool
|
|
faultyServer := httptest.NewServer(http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) {
|
|
faultyCalled.Store(true)
|
|
select {
|
|
case <-done:
|
|
case <-time.After(time.Hour):
|
|
}
|
|
}))
|
|
faultyURL, err := url.Parse(faultyServer.URL)
|
|
require.NoError(t, err)
|
|
|
|
// Set up a functional Alertmanager.
|
|
var functionalCalled atomic.Bool
|
|
functionalServer := httptest.NewServer(http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) {
|
|
functionalCalled.Store(true)
|
|
}))
|
|
functionalURL, err := url.Parse(functionalServer.URL)
|
|
require.NoError(t, err)
|
|
|
|
// Initialize the discovery manager
|
|
// This is relevant as the updates aren't sent continually in real life, but only each updatert.
|
|
// The old implementation of TestHangingNotifier didn't take that into account.
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
defer cancel()
|
|
reg := prometheus.NewRegistry()
|
|
sdMetrics, err := discovery.RegisterSDMetrics(reg, discovery.NewRefreshMetrics(reg))
|
|
require.NoError(t, err)
|
|
sdManager := discovery.NewManager(
|
|
ctx,
|
|
promslog.NewNopLogger(),
|
|
reg,
|
|
sdMetrics,
|
|
discovery.Name("sd-manager"),
|
|
discovery.Updatert(sdUpdatert),
|
|
)
|
|
go sdManager.Run()
|
|
|
|
// Set up the notifier with both faulty and functional Alertmanagers.
|
|
notifier := NewManager(
|
|
&Options{
|
|
QueueCapacity: alertsCount,
|
|
},
|
|
nil,
|
|
)
|
|
notifier.alertmanagers = make(map[string]*alertmanagerSet)
|
|
amCfg := config.DefaultAlertmanagerConfig
|
|
amCfg.Timeout = model.Duration(sendTimeout)
|
|
notifier.alertmanagers["config-0"] = &alertmanagerSet{
|
|
ams: []alertmanager{
|
|
alertmanagerMock{
|
|
urlf: func() string { return faultyURL.String() },
|
|
},
|
|
alertmanagerMock{
|
|
urlf: func() string { return functionalURL.String() },
|
|
},
|
|
},
|
|
cfg: &amCfg,
|
|
metrics: notifier.metrics,
|
|
buffers: map[string]*buffer{
|
|
faultyURL.String(): newBuffer(notifier.opts.QueueCapacity),
|
|
functionalURL.String(): newBuffer(notifier.opts.QueueCapacity),
|
|
},
|
|
opts: &Options{Do: do, MaxBatchSize: maxBatchSize},
|
|
logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
|
|
}
|
|
|
|
for _, ams := range notifier.alertmanagers {
|
|
for _, am := range ams.ams {
|
|
go ams.sendLoop(am)
|
|
}
|
|
}
|
|
|
|
go notifier.Run(sdManager.SyncCh())
|
|
defer notifier.Stop()
|
|
|
|
require.Len(t, notifier.Alertmanagers(), 2)
|
|
|
|
// Enqueue the alerts.
|
|
var alerts []*Alert
|
|
for i := range make([]struct{}, alertsCount) {
|
|
alerts = append(alerts, &Alert{
|
|
Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
|
|
})
|
|
}
|
|
notifier.Send(alerts...)
|
|
|
|
// Wait for the Alertmanagers to start receiving alerts.
|
|
// 10*sdUpdatert is used as an arbitrary timeout here.
|
|
timeout := time.After(10 * sdUpdatert)
|
|
loop1:
|
|
for {
|
|
select {
|
|
case <-timeout:
|
|
t.Fatalf("Timeout waiting for the alertmanagers to be reached for the first time.")
|
|
default:
|
|
if faultyCalled.Load() && functionalCalled.Load() {
|
|
break loop1
|
|
}
|
|
}
|
|
}
|
|
|
|
// Request to remove the faulty Alertmanager.
|
|
c := map[string]discovery.Configs{
|
|
"config-0": {
|
|
discovery.StaticConfig{
|
|
&targetgroup.Group{
|
|
Targets: []model.LabelSet{
|
|
{
|
|
model.AddressLabel: model.LabelValue(functionalURL.Host),
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
}
|
|
require.NoError(t, sdManager.ApplyConfig(c))
|
|
|
|
// The notifier should not wait until the alerts queue is empty to apply the discovery changes
|
|
// A faulty Alertmanager could cause each alert sending cycle to take up to AlertmanagerConfig.Timeout
|
|
// The queue may never be emptied, as the arrival rate could be larger than the departure rate
|
|
// It could even overflow and alerts could be dropped.
|
|
timeout = time.After(batches * sendTimeout)
|
|
loop2:
|
|
for {
|
|
select {
|
|
case <-timeout:
|
|
t.Fatalf("Timeout, the faulty alertmanager not removed on time.")
|
|
default:
|
|
// The faulty alertmanager was dropped.
|
|
if len(notifier.Alertmanagers()) == 1 {
|
|
// Prevent from TOCTOU.
|
|
for _, ams := range notifier.alertmanagers {
|
|
for _, q := range ams.buffers {
|
|
require.Zero(t, q.len())
|
|
}
|
|
}
|
|
break loop2
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
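// TestStop_DrainingDisabled checks that with DrainOnShutdown disabled, Stop
// returns without draining the queue, so alerts still queued may be dropped.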
func TestStop_DrainingDisabled(t *testing.T) {
|
|
releaseReceiver := make(chan struct{})
|
|
receiverReceivedRequest := make(chan struct{}, 2)
|
|
alertsReceived := atomic.NewInt64(0)
|
|
|
|
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
// Let the test know we've received a request.
|
|
receiverReceivedRequest <- struct{}{}
|
|
|
|
var alerts []*Alert
|
|
|
|
b, err := io.ReadAll(r.Body)
|
|
require.NoError(t, err)
|
|
|
|
err = json.Unmarshal(b, &alerts)
|
|
require.NoError(t, err)
|
|
|
|
alertsReceived.Add(int64(len(alerts)))
|
|
|
|
// Wait for the test to release us.
|
|
<-releaseReceiver
|
|
|
|
w.WriteHeader(http.StatusOK)
|
|
}))
|
|
defer func() {
|
|
server.Close()
|
|
}()
|
|
|
|
m := NewManager(
|
|
&Options{
|
|
QueueCapacity: 10,
|
|
DrainOnShutdown: false,
|
|
},
|
|
nil,
|
|
)
|
|
|
|
m.alertmanagers = make(map[string]*alertmanagerSet)
|
|
|
|
am1Cfg := config.DefaultAlertmanagerConfig
|
|
am1Cfg.Timeout = model.Duration(time.Second)
|
|
|
|
m.alertmanagers["1"] = &alertmanagerSet{
|
|
ams: []alertmanager{
|
|
alertmanagerMock{
|
|
urlf: func() string { return server.URL },
|
|
},
|
|
},
|
|
cfg: &am1Cfg,
|
|
buffers: map[string]*buffer{server.URL: newBuffer(m.opts.QueueCapacity)},
|
|
opts: &Options{Do: do, MaxBatchSize: maxBatchSize},
|
|
metrics: newAlertMetrics(prometheus.DefaultRegisterer, nil),
|
|
logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
|
|
}
|
|
|
|
for _, ams := range m.alertmanagers {
|
|
for _, am := range ams.ams {
|
|
go ams.sendLoop(am)
|
|
}
|
|
}
|
|
|
|
notificationManagerStopped := make(chan struct{})
|
|
|
|
go func() {
|
|
defer close(notificationManagerStopped)
|
|
m.Run(nil)
|
|
}()
|
|
|
|
// Queue two alerts. The first should be immediately sent to the receiver, which should block until we release it later.
|
|
m.Send(&Alert{Labels: labels.FromStrings(labels.AlertName, "alert-1")})
|
|
|
|
select {
|
|
case <-receiverReceivedRequest:
|
|
// Nothing more to do.
|
|
case <-time.After(time.Second):
|
|
require.FailNow(t, "gave up waiting for receiver to receive notification of first alert")
|
|
}
|
|
|
|
m.Send(&Alert{Labels: labels.FromStrings(labels.AlertName, "alert-2")})
|
|
|
|
// Stop the notification manager, pause to allow the shutdown to be observed, and then allow the receiver to proceed.
|
|
m.Stop()
|
|
time.Sleep(time.Second)
|
|
close(releaseReceiver)
|
|
|
|
// Wait for the notification manager to stop and confirm only the first notification was sent.
|
|
// The second notification should be dropped.
|
|
select {
|
|
case <-notificationManagerStopped:
|
|
// Nothing more to do.
|
|
case <-time.After(time.Second):
|
|
require.FailNow(t, "gave up waiting for notification manager to stop")
|
|
}
|
|
|
|
// At least one alert must have been delivered before notification manager stops.
|
|
require.Positive(t, alertsReceived.Load())
|
|
}
|
|
|
|
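// TestStop_DrainingEnabled checks that with DrainOnShutdown enabled, Stop waits
// until all queued alerts have been delivered.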
func TestStop_DrainingEnabled(t *testing.T) {
|
|
releaseReceiver := make(chan struct{})
|
|
receiverReceivedRequest := make(chan struct{}, 2)
|
|
alertsReceived := atomic.NewInt64(0)
|
|
|
|
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
var alerts []*Alert
|
|
|
|
b, err := io.ReadAll(r.Body)
|
|
require.NoError(t, err)
|
|
|
|
err = json.Unmarshal(b, &alerts)
|
|
require.NoError(t, err)
|
|
|
|
alertsReceived.Add(int64(len(alerts)))
|
|
|
|
// Let the test know we've received a request.
|
|
receiverReceivedRequest <- struct{}{}
|
|
|
|
// Wait for the test to release us.
|
|
<-releaseReceiver
|
|
|
|
w.WriteHeader(http.StatusOK)
|
|
}))
|
|
defer func() {
|
|
server.Close()
|
|
}()
|
|
|
|
m := NewManager(
|
|
&Options{
|
|
QueueCapacity: 10,
|
|
DrainOnShutdown: true,
|
|
},
|
|
nil,
|
|
)
|
|
|
|
m.alertmanagers = make(map[string]*alertmanagerSet)
|
|
|
|
am1Cfg := config.DefaultAlertmanagerConfig
|
|
am1Cfg.Timeout = model.Duration(time.Second)
|
|
|
|
m.alertmanagers["1"] = &alertmanagerSet{
|
|
ams: []alertmanager{
|
|
alertmanagerMock{
|
|
urlf: func() string { return server.URL },
|
|
},
|
|
},
|
|
cfg: &am1Cfg,
|
|
buffers: map[string]*buffer{server.URL: newBuffer(m.opts.QueueCapacity)},
|
|
opts: &Options{Do: do, MaxBatchSize: maxBatchSize},
|
|
metrics: m.metrics,
|
|
logger: slog.New(slog.NewTextHandler(io.Discard, nil)),
|
|
}
|
|
|
|
for _, ams := range m.alertmanagers {
|
|
for _, am := range ams.ams {
|
|
go ams.sendLoop(am)
|
|
}
|
|
}
|
|
|
|
notificationManagerStopped := make(chan struct{})
|
|
|
|
go func() {
|
|
defer close(notificationManagerStopped)
|
|
m.Run(nil)
|
|
}()
|
|
|
|
// Queue two alerts. The first should be immediately sent to the receiver, which should block until we release it later.
|
|
m.Send(&Alert{Labels: labels.FromStrings(labels.AlertName, "alert-1")})
|
|
|
|
select {
|
|
case <-receiverReceivedRequest:
|
|
// Nothing more to do.
|
|
case <-time.After(time.Second):
|
|
require.FailNow(t, "gave up waiting for receiver to receive notification of first alert")
|
|
}
|
|
|
|
m.Send(&Alert{Labels: labels.FromStrings(labels.AlertName, "alert-2")})
|
|
|
|
// Stop the notification manager and allow the receiver to proceed.
|
|
m.Stop()
|
|
close(releaseReceiver)
|
|
|
|
// Wait for the notification manager to stop and confirm both notifications were sent.
|
|
select {
|
|
case <-notificationManagerStopped:
|
|
// Nothing more to do.
|
|
case <-time.After(400 * time.Millisecond):
|
|
require.FailNow(t, "gave up waiting for notification manager to stop")
|
|
}
|
|
|
|
<-receiverReceivedRequest
|
|
require.Equal(t, int64(2), alertsReceived.Load())
|
|
}
|
|
|
|
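// TestApplyConfig verifies that applying a new configuration keeps known
// Alertmanagers when their AlertmanagerConfig is unchanged, including when the
// config order changes, and discards them otherwise.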
func TestApplyConfig(t *testing.T) {
|
|
targetURL := "alertmanager:9093"
|
|
targetGroup := &targetgroup.Group{
|
|
Targets: []model.LabelSet{
|
|
{
|
|
"__address__": model.LabelValue(targetURL),
|
|
},
|
|
},
|
|
}
|
|
alertmanagerURL := fmt.Sprintf("http://%s/api/v2/alerts", targetURL)
|
|
|
|
n := NewManager(&Options{}, nil)
|
|
cfg := &config.Config{}
|
|
s := `
|
|
alerting:
|
|
alertmanagers:
|
|
- file_sd_configs:
|
|
- files:
|
|
- foo.json
|
|
`
|
|
// 1. Ensure known alertmanagers are not dropped during ApplyConfig.
|
|
require.NoError(t, yaml.UnmarshalStrict([]byte(s), cfg))
|
|
require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 1)
|
|
|
|
// First, apply the config and reload.
|
|
require.NoError(t, n.ApplyConfig(cfg))
|
|
tgs := map[string][]*targetgroup.Group{"config-0": {targetGroup}}
|
|
n.reload(tgs)
|
|
require.Len(t, n.Alertmanagers(), 1)
|
|
require.Equal(t, alertmanagerURL, n.Alertmanagers()[0].String())
|
|
|
|
// Reapply the config.
|
|
require.NoError(t, n.ApplyConfig(cfg))
|
|
// Ensure the known alertmanagers are not dropped.
|
|
require.Len(t, n.Alertmanagers(), 1)
|
|
require.Equal(t, alertmanagerURL, n.Alertmanagers()[0].String())
|
|
|
|
// 2. Ensure known alertmanagers are not dropped during ApplyConfig even when
|
|
// the config order changes.
|
|
s = `
|
|
alerting:
|
|
alertmanagers:
|
|
- static_configs:
|
|
- file_sd_configs:
|
|
- files:
|
|
- foo.json
|
|
`
|
|
require.NoError(t, yaml.UnmarshalStrict([]byte(s), cfg))
|
|
require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 2)
|
|
|
|
require.NoError(t, n.ApplyConfig(cfg))
|
|
require.Len(t, n.Alertmanagers(), 1)
|
|
// Ensure no unnecessary alertmanagers are injected.
|
|
require.Empty(t, n.alertmanagers["config-0"].ams)
|
|
// Ensure the config order is taken into account.
|
|
ams := n.alertmanagers["config-1"].ams
|
|
require.Len(t, ams, 1)
|
|
require.Equal(t, alertmanagerURL, ams[0].url().String())
|
|
|
|
// 3. Ensure known alertmanagers are reused for new config with identical AlertmanagerConfig.
|
|
s = `
|
|
alerting:
|
|
alertmanagers:
|
|
- file_sd_configs:
|
|
- files:
|
|
- foo.json
|
|
- file_sd_configs:
|
|
- files:
|
|
- foo.json
|
|
`
|
|
require.NoError(t, yaml.UnmarshalStrict([]byte(s), cfg))
|
|
require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 2)
|
|
|
|
require.NoError(t, n.ApplyConfig(cfg))
|
|
require.Len(t, n.Alertmanagers(), 2)
|
|
for cfgIdx := range 2 {
|
|
ams := n.alertmanagers[fmt.Sprintf("config-%d", cfgIdx)].ams
|
|
require.Len(t, ams, 1)
|
|
require.Equal(t, alertmanagerURL, ams[0].url().String())
|
|
}
|
|
|
|
// 4. Ensure known alertmanagers are reused only for identical AlertmanagerConfig.
|
|
s = `
|
|
alerting:
|
|
alertmanagers:
|
|
- file_sd_configs:
|
|
- files:
|
|
- foo.json
|
|
path_prefix: /bar
|
|
- file_sd_configs:
|
|
- files:
|
|
- foo.json
|
|
relabel_configs:
|
|
- source_labels: ['__address__']
|
|
regex: 'doesntmatter:1234'
|
|
action: drop
|
|
`
|
|
require.NoError(t, yaml.UnmarshalStrict([]byte(s), cfg))
|
|
require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 2)
|
|
|
|
require.NoError(t, n.ApplyConfig(cfg))
|
|
require.Empty(t, n.Alertmanagers())
|
|
}
|
|
|
|
// Regression test for https://github.com/prometheus/prometheus/issues/7676
|
|
// The test creates a black hole alertmanager that never responds to any requests.
|
|
// The alertmanager_config.timeout is set to infinite (1 year).
|
|
// We check that the notifier does not hang and throughput is not affected.
|
|
func TestNotifierQueueIndependentOfFailedAlertmanager(t *testing.T) {
|
|
stopBlackHole := make(chan struct{})
|
|
blackHoleAM := newBlackHoleAlertmanager(stopBlackHole)
|
|
defer close(stopBlackHole)
|
|
|
|
doneAlertReceive := make(chan struct{})
|
|
immediateAM := newImmediateAlertManager(doneAlertReceive)
|
|
|
|
h := NewManager(&Options{}, nil)
|
|
|
|
h.alertmanagers = make(map[string]*alertmanagerSet)
|
|
|
|
amCfg := config.DefaultAlertmanagerConfig
|
|
amCfg.Timeout = model.Duration(time.Hour * 24 * 365)
|
|
|
|
h.alertmanagers["1"] = &alertmanagerSet{
|
|
ams: []alertmanager{
|
|
alertmanagerMock{
|
|
urlf: func() string { return blackHoleAM.URL },
|
|
},
|
|
},
|
|
cfg: &amCfg,
|
|
opts: &Options{Do: do, MaxBatchSize: maxBatchSize},
|
|
buffers: map[string]*buffer{blackHoleAM.URL: newBuffer(10)},
|
|
metrics: h.metrics,
|
|
}
|
|
|
|
h.alertmanagers["2"] = &alertmanagerSet{
|
|
ams: []alertmanager{
|
|
alertmanagerMock{
|
|
urlf: func() string { return immediateAM.URL },
|
|
},
|
|
},
|
|
cfg: &amCfg,
|
|
opts: &Options{Do: do, MaxBatchSize: maxBatchSize},
|
|
buffers: map[string]*buffer{immediateAM.URL: newBuffer(10)},
|
|
metrics: h.metrics,
|
|
}
|
|
|
|
doneSendAll := make(chan struct{})
|
|
go func() {
|
|
for _, s := range h.alertmanagers {
|
|
for _, am := range s.ams {
|
|
go s.sendLoop(am)
|
|
}
|
|
}
|
|
|
|
h.Send(&Alert{
|
|
Labels: labels.FromStrings("alertname", "test"),
|
|
})
|
|
close(doneSendAll)
|
|
}()
|
|
|
|
select {
|
|
case <-doneAlertReceive:
|
|
// This is the happy case, the alert was received by the immediate alertmanager.
|
|
case <-time.After(30 * time.Second):
|
|
t.Fatal("Timeout waiting for alert to be received by immediate alertmanager")
|
|
}
|
|
|
|
select {
|
|
case <-doneSendAll:
|
|
// This is the happy case, the sendAll function returned.
|
|
case <-time.After(30 * time.Second):
|
|
t.Fatal("Timeout waiting for sendAll to return")
|
|
}
|
|
}
|
|
|
|
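// newBlackHoleAlertmanager returns a test server that never responds until
// stop is closed, simulating an unreachable Alertmanager.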
func newBlackHoleAlertmanager(stop <-chan struct{}) *httptest.Server {
|
|
return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
|
// Do nothing, wait to be canceled.
|
|
<-stop
|
|
w.WriteHeader(http.StatusOK)
|
|
}))
|
|
}
|
|
|
|
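// newImmediateAlertManager returns a test server that responds immediately and
// signals done after the first request.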
func newImmediateAlertManager(done chan<- struct{}) *httptest.Server {
|
|
return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
|
w.WriteHeader(http.StatusOK)
|
|
close(done)
|
|
}))
|
|
}
|