From c7d4191e78bf0a455ab596f46d4cf212dce694a4 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 2 Jun 2025 21:12:36 +0400 Subject: [PATCH] fix: rework the way CRI config generation is waited for Instead of relying on the fact that CRI patch should modify the generated final CRI config, rely on the specific checksum of the CRI patch to be included into the generated CRI config. This also to resolve Talos hanging on boot when a CRI patch is a no-op (it doesn't change the generated config). Fixes #11132 Signed-off-by: Andrey Smirnov --- .../pkg/controllers/files/cri_config_parts.go | 11 ++- .../v1alpha1/v1alpha1_sequencer_tasks.go | 43 +++++----- internal/integration/api/apply-config.go | 84 ++++++++++++++++++- internal/pkg/toml/merge.go | 33 +++++--- internal/pkg/toml/merge_test.go | 7 +- internal/pkg/toml/testdata/expected.toml | 6 +- pkg/machinery/resources/files/files.go | 3 + 7 files changed, 144 insertions(+), 43 deletions(-) diff --git a/internal/app/machined/pkg/controllers/files/cri_config_parts.go b/internal/app/machined/pkg/controllers/files/cri_config_parts.go index 3d93b63d7..2a06b18e4 100644 --- a/internal/app/machined/pkg/controllers/files/cri_config_parts.go +++ b/internal/app/machined/pkg/controllers/files/cri_config_parts.go @@ -6,6 +6,7 @@ package files import ( "context" + "encoding/hex" "fmt" "path/filepath" "slices" @@ -72,13 +73,21 @@ func (ctrl *CRIConfigPartsController) Run(ctx context.Context, r controller.Runt slices.Sort(parts) - out, err := toml.Merge(parts) + out, checksums, err := toml.Merge(parts) if err != nil { return err } if err := safe.WriterModify(ctx, r, files.NewEtcFileSpec(files.NamespaceName, constants.CRIConfig), func(r *files.EtcFileSpec) error { + for _, key := range r.Metadata().Annotations().Raw() { + r.Metadata().Annotations().Delete(key) + } + + for path, checksum := range checksums { + r.Metadata().Annotations().Set(files.SourceFileAnnotation+":"+path, hex.EncodeToString(checksum)) + } + spec := r.TypedSpec() spec.Contents = out diff --git a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go index b7e4f9f38..ced7dd50d 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go +++ b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go @@ -8,6 +8,8 @@ import ( "bufio" "bytes" "context" + "crypto/sha256" + "encoding/hex" "encoding/json" "errors" "fmt" @@ -662,20 +664,6 @@ func injectCRIConfigPatch(ctx context.Context, st state.State, content []byte) e ctx, cancel := context.WithTimeout(ctx, time.Minute) defer cancel() - ch := make(chan state.Event) - - // wait for the CRI config to be created - if err := st.Watch(ctx, resourcefiles.NewEtcFileSpec(resourcefiles.NamespaceName, constants.CRIConfig).Metadata(), ch); err != nil { - return err - } - - // first update should be received about the existing resource - select { - case <-ch: - case <-ctx.Done(): - return ctx.Err() - } - etcFileSpec := resourcefiles.NewEtcFileSpec(resourcefiles.NamespaceName, constants.CRICustomizationConfigPart) etcFileSpec.TypedSpec().Mode = 0o600 etcFileSpec.TypedSpec().Contents = content @@ -685,24 +673,33 @@ func injectCRIConfigPatch(ctx context.Context, st state.State, content []byte) e return err } - // wait for the CRI config parts controller to generate the merged file - var version resource.Version + checksumRaw := sha256.Sum256(content) + expectedChecksum := hex.EncodeToString(checksumRaw[:]) + expectedAnnotation := resourcefiles.SourceFileAnnotation + ":" + filepath.Join("/etc", etcFileSpec.Metadata().ID()) - select { - case ev := <-ch: - version = ev.Resource.Metadata().Version() - case <-ctx.Done(): - return ctx.Err() + fileSpec, err := st.WatchFor(ctx, resourcefiles.NewEtcFileSpec(resourcefiles.NamespaceName, constants.CRIConfig).Metadata(), + state.WithCondition(func(r resource.Resource) (bool, error) { + spec, ok := r.(*resourcefiles.EtcFileSpec) + if !ok { + return false, nil + } + + value, ok := spec.Metadata().Annotations().Get(expectedAnnotation) + + return ok && value == expectedChecksum, nil + })) + if err != nil { + return fmt.Errorf("error waiting for file %q to be updated: %w", constants.CRIConfig, err) } // wait for the file to be rendered - _, err := st.WatchFor(ctx, resourcefiles.NewEtcFileStatus(resourcefiles.NamespaceName, constants.CRIConfig).Metadata(), state.WithCondition(func(r resource.Resource) (bool, error) { + _, err = st.WatchFor(ctx, resourcefiles.NewEtcFileStatus(resourcefiles.NamespaceName, constants.CRIConfig).Metadata(), state.WithCondition(func(r resource.Resource) (bool, error) { fileStatus, ok := r.(*resourcefiles.EtcFileStatus) if !ok { return false, nil } - return fileStatus.TypedSpec().SpecVersion == version.String(), nil + return fileStatus.TypedSpec().SpecVersion == fileSpec.Metadata().Version().String(), nil })) return err diff --git a/internal/integration/api/apply-config.go b/internal/integration/api/apply-config.go index 3a17586f1..9bdc48f39 100644 --- a/internal/integration/api/apply-config.go +++ b/internal/integration/api/apply-config.go @@ -18,6 +18,7 @@ import ( "github.com/cosi-project/runtime/pkg/resource/rtestutils" "github.com/cosi-project/runtime/pkg/safe" "github.com/siderolabs/gen/ensure" + "github.com/siderolabs/gen/xslices" "github.com/siderolabs/go-pointer" "github.com/siderolabs/go-retry/retry" "github.com/stretchr/testify/assert" @@ -121,10 +122,7 @@ func (suite *ApplyConfigSuite) TestApply() { Mode: machineapi.ApplyConfigurationRequest_REBOOT, }, ) - if err != nil { - // It is expected that the connection will EOF here, so just log the error - suite.Assert().NoErrorf(err, "failed to apply configuration (node %q)", node) - } + suite.Assert().NoErrorf(err, "failed to apply configuration (node %q)", node) return nil }, assertRebootedRebootTimeout, @@ -153,6 +151,84 @@ func (suite *ApplyConfigSuite) TestApply() { ) } +// TestApplyNoOpCRIPatch verifies the apply config with no-op CRI patch. +func (suite *ApplyConfigSuite) TestApplyNoOpCRIPatch() { + if testing.Short() { + suite.T().Skip("skipping in short mode") + } + + if !suite.Capabilities().SupportsReboot { + suite.T().Skip("cluster doesn't support reboot") + } + + node := suite.RandomDiscoveredNodeInternalIP(machine.TypeWorker) + + suite.WaitForBootDone(suite.ctx) + + nodeCtx := client.WithNode(suite.ctx, node) + + provider, err := suite.ReadConfigFromNode(nodeCtx) + suite.Assert().NoErrorf(err, "failed to read existing config from node %q", node) + + // this CRI patch is a no-op, as NRI is already disabled by default, this verifies that CRI config generation handles it correctly. + cfgDataOut := suite.PatchV1Alpha1Config(provider, func(cfg *v1alpha1.Config) { + cfg.MachineConfig.MachineFiles = xslices.Filter(cfg.MachineConfig.MachineFiles, func(file *v1alpha1.MachineFile) bool { + return file.FilePath != "/etc/cri/conf.d/20-customization.part" + }) + + cfg.MachineConfig.MachineFiles = append(cfg.MachineConfig.MachineFiles, + &v1alpha1.MachineFile{ + FilePath: "/etc/cri/conf.d/20-customization.part", + FileOp: "create", + FileContent: `[plugins] + [plugins."io.containerd.nri.v1.nri"] + disable = true`, + }, + ) + }) + + suite.AssertRebooted( + suite.ctx, node, func(nodeCtx context.Context) error { + _, err = suite.Client.ApplyConfiguration( + nodeCtx, &machineapi.ApplyConfigurationRequest{ + Data: cfgDataOut, + Mode: machineapi.ApplyConfigurationRequest_REBOOT, + }, + ) + suite.Assert().NoErrorf(err, "failed to apply configuration (node %q)", node) + + return nil + }, assertRebootedRebootTimeout, + suite.CleanupFailedPods, + ) + + // revert the patch + provider, err = suite.ReadConfigFromNode(nodeCtx) + suite.Assert().NoErrorf(err, "failed to read existing config from node %q", node) + + // this CRI patch is a no-op, as NRI is already disabled by default, this verifies that CRI config generation handles it correctly. + cfgDataOut = suite.PatchV1Alpha1Config(provider, func(cfg *v1alpha1.Config) { + cfg.MachineConfig.MachineFiles = xslices.Filter(cfg.MachineConfig.MachineFiles, func(file *v1alpha1.MachineFile) bool { + return file.FilePath != "/etc/cri/conf.d/20-customization.part" + }) + }) + + suite.AssertRebooted( + suite.ctx, node, func(nodeCtx context.Context) error { + _, err = suite.Client.ApplyConfiguration( + nodeCtx, &machineapi.ApplyConfigurationRequest{ + Data: cfgDataOut, + Mode: machineapi.ApplyConfigurationRequest_REBOOT, + }, + ) + suite.Assert().NoErrorf(err, "failed to apply configuration (node %q)", node) + + return nil + }, assertRebootedRebootTimeout, + suite.CleanupFailedPods, + ) +} + // TestApplyWithoutReboot verifies the apply config API without reboot. func (suite *ApplyConfigSuite) TestApplyWithoutReboot() { for _, mode := range []machineapi.ApplyConfigurationRequest_Mode{ diff --git a/internal/pkg/toml/merge.go b/internal/pkg/toml/merge.go index cc4023de3..e7574b62f 100644 --- a/internal/pkg/toml/merge.go +++ b/internal/pkg/toml/merge.go @@ -6,7 +6,10 @@ package toml import ( "bytes" + "crypto/sha256" + "encoding/hex" "fmt" + "io" "os" "github.com/pelletier/go-toml/v2" @@ -14,37 +17,47 @@ import ( "github.com/siderolabs/talos/pkg/machinery/config/merge" ) -func tomlDecodeFile(path string, dest any) error { +// tomlDecodeFile decodes a TOML file into the provided destination, and returns a sha256 hash of the file content. +func tomlDecodeFile(path string, dest any) ([]byte, error) { f, err := os.Open(path) if err != nil { - return err + return nil, err } defer f.Close() //nolint:errcheck - return toml.NewDecoder(f).Decode(dest) + hash := sha256.New() + + err = toml.NewDecoder(io.TeeReader(f, hash)).Decode(dest) + + return hash.Sum(nil), err } // Merge several TOML documents in files into one. // // Merge process relies on generic map[string]any merge which might not always be correct. -func Merge(parts []string) ([]byte, error) { +// +// Merge returns a sha256 checksum of each file merged. +func Merge(parts []string) ([]byte, map[string][]byte, error) { merged := map[string]any{} + checksums := make(map[string][]byte, len(parts)) var header []byte for _, part := range parts { partial := map[string]any{} - if err := tomlDecodeFile(part, &partial); err != nil { - return nil, fmt.Errorf("error decoding %q: %w", part, err) + hash, err := tomlDecodeFile(part, &partial) + if err != nil { + return nil, nil, fmt.Errorf("error decoding %q: %w", part, err) } if err := merge.Merge(merged, partial); err != nil { - return nil, fmt.Errorf("error merging %q: %w", part, err) + return nil, nil, fmt.Errorf("error merging %q: %w", part, err) } - header = append(header, []byte(fmt.Sprintf("## %s\n", part))...) + header = fmt.Appendf(header, "## %s (sha256:%s)\n", part, hex.EncodeToString(hash)) + checksums[part] = hash } var out bytes.Buffer @@ -53,8 +66,8 @@ func Merge(parts []string) ([]byte, error) { _ = out.WriteByte('\n') if err := toml.NewEncoder(&out).SetIndentTables(true).Encode(merged); err != nil { - return nil, fmt.Errorf("error encoding merged config: %w", err) + return nil, nil, fmt.Errorf("error encoding merged config: %w", err) } - return out.Bytes(), nil + return out.Bytes(), checksums, nil } diff --git a/internal/pkg/toml/merge_test.go b/internal/pkg/toml/merge_test.go index 7c5cf791b..d305ad96b 100644 --- a/internal/pkg/toml/merge_test.go +++ b/internal/pkg/toml/merge_test.go @@ -18,12 +18,15 @@ import ( var expected []byte func TestMerge(t *testing.T) { - out, err := toml.Merge([]string{ + out, checksums, err := toml.Merge([]string{ "testdata/1.toml", "testdata/2.toml", "testdata/3.toml", }) require.NoError(t, err) - assert.Equal(t, expected, out) + assert.Equal(t, string(expected), string(out)) + assert.Contains(t, checksums, "testdata/1.toml") + assert.Contains(t, checksums, "testdata/2.toml") + assert.Contains(t, checksums, "testdata/3.toml") } diff --git a/internal/pkg/toml/testdata/expected.toml b/internal/pkg/toml/testdata/expected.toml index 9bb7f8147..a0d9e09d0 100644 --- a/internal/pkg/toml/testdata/expected.toml +++ b/internal/pkg/toml/testdata/expected.toml @@ -1,6 +1,6 @@ -## testdata/1.toml -## testdata/2.toml -## testdata/3.toml +## testdata/1.toml (sha256:2eac71621235f8666c54ad3f29a77b2e8483bbd7f0717f8613af591fb5609b44) +## testdata/2.toml (sha256:47ae85a638a291b04518413a12b19c51883a17c8f5064193462d3527b4495e36) +## testdata/3.toml (sha256:159608dffd674e5fe351d47166eab59ee93f6523ff336602364edfd7be25c796) version = 2 diff --git a/pkg/machinery/resources/files/files.go b/pkg/machinery/resources/files/files.go index a3c07157c..57b478c70 100644 --- a/pkg/machinery/resources/files/files.go +++ b/pkg/machinery/resources/files/files.go @@ -9,3 +9,6 @@ import "github.com/cosi-project/runtime/pkg/resource" // NamespaceName contains file resources. const NamespaceName resource.Namespace = "files" + +// SourceFileAnnotation is used to annotate a file resource with the source file path(s). +const SourceFileAnnotation = "talos.dev/source-file"