omni/cmd/integration-test/pkg/tests/stats.go
Utku Ozdemir 7c17ed6cf8
Some checks are pending
default / default (push) Waiting to run
default / e2e-backups (push) Blocked by required conditions
default / e2e-forced-removal (push) Blocked by required conditions
default / e2e-scaling (push) Blocked by required conditions
default / e2e-short (push) Blocked by required conditions
default / e2e-short-secureboot (push) Blocked by required conditions
default / e2e-templates (push) Blocked by required conditions
default / e2e-upgrades (push) Blocked by required conditions
default / e2e-workload-proxy (push) Blocked by required conditions
fix: use the correct schematic ID for maintenance upgrades
Maintenance upgrades triggered from the UI were using the wrong schematic ID, causing the machines which use UKI to lose siderolink kernel args and disconnect.

Since we have complex logic to build the correct install image, including the schematic, move it to a central place.

Add a new management endpoint for the maintenance upgrades. UI now calls this endpoint instead of calling the Talos API directly.
The new endpoint builds the install image correctly using the common logic and issues the upgrade.

Signed-off-by: Utku Ozdemir <utku.ozdemir@siderolabs.com>
2025-05-22 20:10:37 +02:00

144 lines
3.8 KiB
Go

// Copyright (c) 2025 Sidero Labs, Inc.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.
package tests
import (
"context"
"errors"
"fmt"
"sort"
"strings"
"testing"
"time"
"github.com/prometheus/client_golang/api"
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/common/model"
"github.com/siderolabs/go-retry/retry"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// AssertStatsLimits checks that metrics don't show any spikes of resource reads/writes, controller wakeups.
//
// This test should only be run after the integration tests set with Talemu enabled, as the thresholds are adjusted for it.
// It queries a Prometheus instance which is expected to be reachable at http://127.0.0.1:9090.
//
// Every check runs as a parallel subtest. The query is retried (retry.Constant with a 15 second setting) until
// Prometheus answers without error or warnings and the returned value satisfies the check; otherwise the subtest fails.
func AssertStatsLimits(testCtx context.Context) TestFunc {
	return func(t *testing.T) {
		for _, tt := range []struct {
			check func(assert *assert.Assertions, value float64)
			name  string
			query string
		}{
			{
				name:  "resource CRUD",
				query: `sum(omni_resource_operations_total{operation=~"create|update", type!="MachineStatusLinks.omni.sidero.dev"})`,
				check: func(assert *assert.Assertions, value float64) {
					limit := float64(12000)

					assert.Lessf(value, limit, "resource CRUD operations were expected to be less than %f. "+
						"If the limit is exceeded not because of a leak but because you added some new resources/controllers, adjust the limit accordingly.", limit)
				},
			},
			{
				name:  "queue length",
				query: `sum(omni_runtime_qcontroller_queue_length)`,
				check: func(assert *assert.Assertions, value float64) { assert.Zero(value) },
			},
			{
				name:  "controller wakeups",
				query: `sum(omni_runtime_controller_wakeups{controller!="MachineStatusLinkController"})`,
				check: func(assert *assert.Assertions, value float64) {
					limit := float64(12000)

					assert.Lessf(value, limit, "controller wakeups were expected to be less than %f. "+
						"If the limit is exceeded not because of a leak but because you added some new resources/controllers, adjust the limit accordingly.", limit)
				},
			},
		} {
			t.Run(tt.name, func(t *testing.T) {
				t.Parallel()

				ctx, cancel := context.WithTimeout(testCtx, time.Second*16)
				defer cancel()

				// The client configuration is constant, so build the client once per subtest
				// instead of on every retry attempt; an error here is not worth retrying.
				promClient, err := api.NewClient(api.Config{
					Address: "http://127.0.0.1:9090",
				})
				require.NoError(t, err)

				v1api := v1.NewAPI(promClient)

				err = retry.Constant(time.Second * 15).Retry(func() error {
					value, warnings, queryErr := v1api.Query(ctx, tt.query, time.Now())
					if queryErr != nil {
						return retry.ExpectedError(queryErr)
					}

					if len(warnings) > 0 {
						return retry.ExpectedErrorf("prometheus query had warnings %#v", warnings)
					}

					// A fresh aggregator per attempt: failures of a previous attempt must not leak into this one.
					agg := assertionAggregator{}
					assertions := assert.New(&agg)

					switch val := value.(type) {
					case *model.Scalar:
						tt.check(assertions, float64(val.Value))
					case model.Vector:
						// Guard against an empty vector: indexing val[len(val)-1] would panic.
						// Report it as a retryable error so the query is attempted again.
						if len(val) == 0 {
							return retry.ExpectedErrorf("prometheus query %q returned no samples", tt.query)
						}

						tt.check(assertions, float64(val[len(val)-1].Value))
					default:
						return fmt.Errorf("unexpected value type %s", val.Type())
					}

					if agg.hadErrors {
						return retry.ExpectedError(errors.New(agg.String()))
					}

					return nil
				})

				require.NoError(t, err)
			})
		}
	}
}
// assertionAggregator records failure messages reported through Errorf instead of
// failing a test directly, so the caller can inspect them afterwards.
type assertionAggregator struct {
	// errors is the set of distinct formatted failure messages; it is created on first use.
	errors map[string]struct{}

	// hadErrors becomes true as soon as Errorf has been called.
	hadErrors bool
}

// Errorf formats the message and remembers it; identical messages are stored only once.
func (agg *assertionAggregator) Errorf(format string, args ...any) {
	agg.hadErrors = true

	if agg.errors == nil {
		agg.errors = make(map[string]struct{})
	}

	agg.errors[fmt.Sprintf(format, args...)] = struct{}{}
}

// String renders the recorded messages as a bullet list (" * " prefix), sorted
// lexicographically and separated by newlines. It returns an empty string when
// nothing was recorded.
func (agg *assertionAggregator) String() string {
	messages := make([]string, 0, len(agg.errors))
	for message := range agg.errors {
		messages = append(messages, message)
	}

	// Every line gets the same prefix, so sorting the bare messages gives the same order as sorting the lines.
	sort.Strings(messages)

	var sb strings.Builder

	for i, message := range messages {
		if i > 0 {
			sb.WriteByte('\n')
		}

		sb.WriteString(" * ")
		sb.WriteString(message)
	}

	return sb.String()
}