omni/internal/integration/stats_test.go
Artem Chernyshev c9c4c8e10d
Some checks failed
default / default (push) Has been cancelled
default / e2e-backups (push) Has been cancelled
default / e2e-forced-removal (push) Has been cancelled
default / e2e-scaling (push) Has been cancelled
default / e2e-short (push) Has been cancelled
default / e2e-short-secureboot (push) Has been cancelled
default / e2e-templates (push) Has been cancelled
default / e2e-upgrades (push) Has been cancelled
default / e2e-workload-proxy (push) Has been cancelled
test: use go test to build and run Omni integration tests
All test modules were moved under `integration` tag and are now in
`internal/integration` folder: no more `cmd/integration-test`
executable.

New Kres version is able to build the same executable from the tests
directory instead.

All Omni related flags were renamed, for example `--endpoint` ->
`--omni.endpoint`.

2 more functional changes:

- Enabled `--test.failfast` for all test runs.
- Removed finalizers, which were running if the test has failed.

Both of these changes should make it easier to understand the test
failure: Talos node logs won't be cluttered with the finalizer tearing
down the cluster.

Fixes: https://github.com/siderolabs/omni/issues/1171

Signed-off-by: Artem Chernyshev <artem.chernyshev@talos-systems.com>
2025-06-03 15:07:00 +03:00

144 lines
3.8 KiB
Go

// Copyright (c) 2025 Sidero Labs, Inc.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.
//go:build integration
package integration_test
import (
"context"
"errors"
"fmt"
"sort"
"strings"
"testing"
"time"
"github.com/prometheus/client_golang/api"
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/common/model"
"github.com/siderolabs/go-retry/retry"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// statsLimitsHook checks that metrics don't show any spikes of resource reads/writes, controller wakeups.
// This test should only be run after the integration tests set with Talemu enabled as the thresholds are adjusted for it.
// Should have Prometheus running on 9090.
func statsLimitsHook(t *testing.T) {
for _, tt := range []struct {
check func(assert *assert.Assertions, value float64)
name string
query string
}{
{
name: "resource CRUD",
query: `sum(omni_resource_operations_total{operation=~"create|update", type!="MachineStatusLinks.omni.sidero.dev"})`,
check: func(assert *assert.Assertions, value float64) {
limit := float64(12000)
assert.Lessf(value, limit, "resource CRUD operations were expected to be less than %f. "+
"If the limit is exceeded not because of a leak but because you added some new resources/controllers, adjust the limit accordingly.", limit)
},
},
{
name: "queue length",
query: `sum(omni_runtime_qcontroller_queue_length)`,
check: func(assert *assert.Assertions, value float64) { assert.Zero(value) },
},
{
name: "controller wakeups",
query: `sum(omni_runtime_controller_wakeups{controller!="MachineStatusLinkController"})`,
check: func(assert *assert.Assertions, value float64) {
limit := float64(12000)
assert.Lessf(value, limit, "controller wakeups were expected to be less than %f. "+
"If the limit is exceeded not because of a leak but because you added some new resources/controllers, adjust the limit accordingly.", limit)
},
},
} {
t.Run(tt.name, func(t *testing.T) {
t.Parallel()
ctx, cancel := context.WithTimeout(t.Context(), time.Second*16)
defer cancel()
err := retry.Constant(time.Second * 15).Retry(func() error {
promClient, err := api.NewClient(api.Config{
Address: "http://127.0.0.1:9090",
})
if err != nil {
return retry.ExpectedError(err)
}
var (
value model.Value
warnings v1.Warnings
)
agg := assertionAggregator{}
v1api := v1.NewAPI(promClient)
value, warnings, err = v1api.Query(ctx, tt.query, time.Now())
if err != nil {
return retry.ExpectedError(err)
}
if len(warnings) > 0 {
return retry.ExpectedErrorf("prometheus query had warnings %#v", warnings)
}
assert := assert.New(&agg)
switch val := value.(type) {
case *model.Scalar:
tt.check(assert, float64(val.Value))
case model.Vector:
tt.check(assert, float64(val[val.Len()-1].Value))
default:
return fmt.Errorf("unexpected value type %s", val.Type())
}
if agg.hadErrors {
return retry.ExpectedError(errors.New(agg.String()))
}
return nil
})
require.NoError(t, err)
})
}
}
type assertionAggregator struct {
errors map[string]struct{}
hadErrors bool
}
func (agg *assertionAggregator) Errorf(format string, args ...any) {
errorString := fmt.Sprintf(format, args...)
if agg.errors == nil {
agg.errors = map[string]struct{}{}
}
agg.errors[errorString] = struct{}{}
agg.hadErrors = true
}
func (agg *assertionAggregator) String() string {
lines := make([]string, 0, len(agg.errors))
for errorString := range agg.errors {
lines = append(lines, " * "+errorString)
}
sort.Strings(lines)
return strings.Join(lines, "\n")
}