From 0b99631a0b64ce8d65ddcf7f40b2168debf11a62 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Tue, 20 May 2025 22:12:13 +0400 Subject: [PATCH] fix: bump apid memory limit Fixes #11046 Test up to the maximum gRPC message size as we support. Signed-off-by: Andrey Smirnov --- internal/integration/api/apid.go | 95 +++++++++++++++++++++++++--- pkg/machinery/constants/constants.go | 2 +- 2 files changed, 87 insertions(+), 10 deletions(-) diff --git a/internal/integration/api/apid.go b/internal/integration/api/apid.go index 1ad88b43a..15d6cc905 100644 --- a/internal/integration/api/apid.go +++ b/internal/integration/api/apid.go @@ -7,15 +7,22 @@ package api import ( + "bytes" "context" + "slices" + "testing" "time" "github.com/cosi-project/runtime/pkg/safe" + "github.com/dustin/go-humanize" "google.golang.org/grpc/codes" "github.com/siderolabs/talos/internal/integration/base" + machineapi "github.com/siderolabs/talos/pkg/machinery/api/machine" "github.com/siderolabs/talos/pkg/machinery/client" "github.com/siderolabs/talos/pkg/machinery/config/machine" + "github.com/siderolabs/talos/pkg/machinery/constants" + "github.com/siderolabs/talos/pkg/machinery/resources/config" "github.com/siderolabs/talos/pkg/machinery/resources/network" ) @@ -35,15 +42,7 @@ func (suite *ApidSuite) SuiteName() string { // SetupTest ... func (suite *ApidSuite) SetupTest() { // make sure API calls have timeout - suite.ctx, suite.ctxCancel = context.WithTimeout(context.Background(), 15*time.Second) - - if suite.Cluster == nil { - suite.T().Skip("information about routable endpoints is not available") - } - - if suite.APISuite.Endpoint != "" { - suite.T().Skip("test skipped as custom endpoint is set") - } + suite.ctx, suite.ctxCancel = context.WithTimeout(context.Background(), time.Minute) } // TearDownTest ... @@ -55,6 +54,14 @@ func (suite *ApidSuite) TearDownTest() { // TestControlPlaneRouting verify access to all nodes via each control plane node as an endpoints. func (suite *ApidSuite) TestControlPlaneRouting() { + if suite.Cluster == nil { + suite.T().Skip("information about routable endpoints is not available") + } + + if suite.APISuite.Endpoint != "" { + suite.T().Skip("test skipped as custom endpoint is set") + } + endpoints := suite.DiscoverNodeInternalIPsByType(suite.ctx, machine.TypeControlPlane) nodes := suite.DiscoverNodeInternalIPs(suite.ctx) @@ -97,6 +104,14 @@ func (suite *ApidSuite) TestControlPlaneRouting() { // TestWorkerNoRouting verifies that worker nodes perform no routing. func (suite *ApidSuite) TestWorkerNoRouting() { + if suite.Cluster == nil { + suite.T().Skip("information about routable endpoints is not available") + } + + if suite.APISuite.Endpoint != "" { + suite.T().Skip("test skipped as custom endpoint is set") + } + endpoints := suite.DiscoverNodeInternalIPsByType(suite.ctx, machine.TypeWorker) nodes := suite.DiscoverNodeInternalIPs(suite.ctx) @@ -155,6 +170,68 @@ func (suite *ApidSuite) TestWorkerNoRouting() { } } +// TestBigPayload verifies that big payloads are handled correctly. +func (suite *ApidSuite) TestBigPayload() { + if testing.Short() { + suite.T().Skip("skipping test in short mode") + } + + node := suite.RandomDiscoveredNodeInternalIP(machine.TypeWorker) + nodeCtx := client.WithNode(suite.ctx, node) + + suite.T().Logf("testing big payload on node %s", node) + + // we are going to simulate a big payload by making machine configuration big enough + cfg, err := safe.StateGetByID[*config.MachineConfig](nodeCtx, suite.Client.COSI, config.ActiveID) + suite.Require().NoError(err) + + originalCfg, err := cfg.Container().Bytes() + suite.Require().NoError(err) + + // the config is encoded twice in the resource gRPC message, so ensure that we can get to the one third of the size + const targetConfigSize = constants.GRPCMaxMessageSize / 3 + + suite.T().Logf("original config size: %d (%s), target size is %d (%s)", + len(originalCfg), humanize.Bytes(uint64(len(originalCfg))), targetConfigSize, humanize.Bytes(uint64(targetConfigSize)), + ) + + bytesToAdd := targetConfigSize - len(originalCfg) + if bytesToAdd <= 0 { + suite.T().Skip("configuration is already big enough") + } + + const commentLine = "# this is a comment line added to make the config bigger and bigger and bigger and bigger all the way\n" + + newConfig := slices.Concat(originalCfg, bytes.Repeat([]byte(commentLine), bytesToAdd/len(commentLine)+1)) + + suite.Assert().Greater(len(newConfig), targetConfigSize) + + _, err = suite.Client.ApplyConfiguration(nodeCtx, &machineapi.ApplyConfigurationRequest{ + Data: newConfig, + Mode: machineapi.ApplyConfigurationRequest_NO_REBOOT, + }) + suite.Require().NoError(err) + + // now get the machine configuration back several times + for range 5 { + cfg, err = safe.StateGetByID[*config.MachineConfig](nodeCtx, suite.Client.COSI, config.ActiveID) + suite.Require().NoError(err) + + // check that the configuration is the same + newCfg, err := cfg.Container().Bytes() + suite.Require().NoError(err) + + suite.Assert().Equal(newConfig, newCfg) + } + + // revert the configuration + _, err = suite.Client.ApplyConfiguration(nodeCtx, &machineapi.ApplyConfigurationRequest{ + Data: originalCfg, + Mode: machineapi.ApplyConfigurationRequest_NO_REBOOT, + }) + suite.Require().NoError(err) +} + func init() { allSuites = append(allSuites, new(ApidSuite)) } diff --git a/pkg/machinery/constants/constants.go b/pkg/machinery/constants/constants.go index 32b4b46b9..80cd7ddb9 100644 --- a/pkg/machinery/constants/constants.go +++ b/pkg/machinery/constants/constants.go @@ -796,7 +796,7 @@ const ( CgroupApidReservedMemory = 16 * 1024 * 1024 // CgroupApidMaxMemory is the hard memory limit for the apid process. - CgroupApidMaxMemory = 40 * 1024 * 1024 + CgroupApidMaxMemory = 128 * 1024 * 1024 // CgroupApidMillicores is the CPU weight for the apid process. CgroupApidMillicores = 500