All test modules were moved under the `integration` build tag and now live in the `internal/integration` folder: there is no more `cmd/integration-test` executable. The new Kres version builds the same executable from the tests directory instead. All Omni-related flags were renamed, for example `--endpoint` -> `--omni.endpoint`.

Two more functional changes:

- Enabled `--test.failfast` for all test runs.
- Removed the finalizers that were run when a test had failed.

Both of these changes should make it easier to understand a test failure: Talos node logs won't be cluttered with the finalizer tearing down the cluster.

Fixes: https://github.com/siderolabs/omni/issues/1171

Signed-off-by: Artem Chernyshev <artem.chernyshev@talos-systems.com>
// Copyright (c) 2025 Sidero Labs, Inc.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.

//go:build integration

package integration_test

import (
	"context"
	"errors"
	"fmt"
	"sort"
	"strings"
	"testing"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
	"github.com/prometheus/common/model"
	"github.com/siderolabs/go-retry/retry"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// statsLimitsHook checks that the metrics don't show any spikes in resource reads/writes or controller wakeups.
// This test should only be run after the integration test set with Talemu enabled, as the thresholds are tuned for it.
// Prometheus must be running on port 9090.
func statsLimitsHook(t *testing.T) {
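	// Each case runs a single PromQL query against the local Prometheus and checks the result against a fixed expectation.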
	for _, tt := range []struct {
		check func(assert *assert.Assertions, value float64)
		name  string
		query string
	}{
		{
			name:  "resource CRUD",
			query: `sum(omni_resource_operations_total{operation=~"create|update", type!="MachineStatusLinks.omni.sidero.dev"})`,
			check: func(assert *assert.Assertions, value float64) {
				limit := float64(12000)

				assert.Lessf(value, limit, "resource CRUD operations were expected to be less than %f. "+
					"If the limit is exceeded not because of a leak but because you added some new resources/controllers, adjust the limit accordingly.", limit)
			},
		},
		{
			name:  "queue length",
			query: `sum(omni_runtime_qcontroller_queue_length)`,
			check: func(assert *assert.Assertions, value float64) { assert.Zero(value) },
		},
		{
			name:  "controller wakeups",
			query: `sum(omni_runtime_controller_wakeups{controller!="MachineStatusLinkController"})`,
			check: func(assert *assert.Assertions, value float64) {
				limit := float64(12000)

				assert.Lessf(value, limit, "controller wakeups were expected to be less than %f. "+
					"If the limit is exceeded not because of a leak but because you added some new resources/controllers, adjust the limit accordingly.", limit)
			},
		},
	} {
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()

			ctx, cancel := context.WithTimeout(t.Context(), time.Second*16)
			defer cancel()

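			// Retry for up to 15 seconds: transient Prometheus connection or query errors are treated as retryable rather than failing the check outright.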
			err := retry.Constant(time.Second * 15).Retry(func() error {
				promClient, err := api.NewClient(api.Config{
					Address: "http://127.0.0.1:9090",
				})
				if err != nil {
					return retry.ExpectedError(err)
				}

				var (
					value    model.Value
					warnings v1.Warnings
				)

				agg := assertionAggregator{}

				v1api := v1.NewAPI(promClient)

				value, warnings, err = v1api.Query(ctx, tt.query, time.Now())
				if err != nil {
					return retry.ExpectedError(err)
				}

				if len(warnings) > 0 {
					return retry.ExpectedErrorf("prometheus query had warnings %#v", warnings)
				}

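				// Route assertion failures into the aggregator instead of t, so a failed threshold check surfaces as a retryable error below.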
				assert := assert.New(&agg)

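				// The sum() queries yield an instant vector (normally a single sample); the last sample in the vector is checked. Scalars are handled for completeness.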
				switch val := value.(type) {
				case *model.Scalar:
					tt.check(assert, float64(val.Value))
				case model.Vector:
					tt.check(assert, float64(val[val.Len()-1].Value))
				default:
					return fmt.Errorf("unexpected value type %s", val.Type())
				}

				if agg.hadErrors {
					return retry.ExpectedError(errors.New(agg.String()))
				}

				return nil
			})

			require.NoError(t, err)
		})
	}
}

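// assertionAggregator implements the assert.TestingT interface, collecting assertion failures
// so they can be reported as a single retryable error instead of failing the test immediately.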
type assertionAggregator struct {
	errors    map[string]struct{}
	hadErrors bool
}

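// Errorf records a formatted assertion failure, deduplicating identical messages.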
func (agg *assertionAggregator) Errorf(format string, args ...any) {
	errorString := fmt.Sprintf(format, args...)

	if agg.errors == nil {
		agg.errors = map[string]struct{}{}
	}

	agg.errors[errorString] = struct{}{}
	agg.hadErrors = true
}

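// String returns the collected failures sorted and joined into a single message, one failure per line.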
func (agg *assertionAggregator) String() string {
	lines := make([]string, 0, len(agg.errors))

	for errorString := range agg.errors {
		lines = append(lines, " * "+errorString)
	}

	sort.Strings(lines)

	return strings.Join(lines, "\n")
}