fix: rework the way CRI config generation is waited for

Instead of relying on the fact that CRI patch should modify the
generated final CRI config, rely on the specific checksum of the CRI
patch to be included into the generated CRI config.

This also to resolve Talos hanging on boot when a CRI patch is a no-op
(it doesn't change the generated config).

Fixes #11132

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
This commit is contained in:
Andrey Smirnov 2025-06-02 21:12:36 +04:00
parent 0114183de6
commit c7d4191e78
No known key found for this signature in database
GPG Key ID: FE042E3D4085A811
7 changed files with 144 additions and 43 deletions

View File

@ -6,6 +6,7 @@ package files
import (
"context"
"encoding/hex"
"fmt"
"path/filepath"
"slices"
@ -72,13 +73,21 @@ func (ctrl *CRIConfigPartsController) Run(ctx context.Context, r controller.Runt
slices.Sort(parts)
out, err := toml.Merge(parts)
out, checksums, err := toml.Merge(parts)
if err != nil {
return err
}
if err := safe.WriterModify(ctx, r, files.NewEtcFileSpec(files.NamespaceName, constants.CRIConfig),
func(r *files.EtcFileSpec) error {
for _, key := range r.Metadata().Annotations().Raw() {
r.Metadata().Annotations().Delete(key)
}
for path, checksum := range checksums {
r.Metadata().Annotations().Set(files.SourceFileAnnotation+":"+path, hex.EncodeToString(checksum))
}
spec := r.TypedSpec()
spec.Contents = out

View File

@ -8,6 +8,8 @@ import (
"bufio"
"bytes"
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
@ -662,20 +664,6 @@ func injectCRIConfigPatch(ctx context.Context, st state.State, content []byte) e
ctx, cancel := context.WithTimeout(ctx, time.Minute)
defer cancel()
ch := make(chan state.Event)
// wait for the CRI config to be created
if err := st.Watch(ctx, resourcefiles.NewEtcFileSpec(resourcefiles.NamespaceName, constants.CRIConfig).Metadata(), ch); err != nil {
return err
}
// first update should be received about the existing resource
select {
case <-ch:
case <-ctx.Done():
return ctx.Err()
}
etcFileSpec := resourcefiles.NewEtcFileSpec(resourcefiles.NamespaceName, constants.CRICustomizationConfigPart)
etcFileSpec.TypedSpec().Mode = 0o600
etcFileSpec.TypedSpec().Contents = content
@ -685,24 +673,33 @@ func injectCRIConfigPatch(ctx context.Context, st state.State, content []byte) e
return err
}
// wait for the CRI config parts controller to generate the merged file
var version resource.Version
checksumRaw := sha256.Sum256(content)
expectedChecksum := hex.EncodeToString(checksumRaw[:])
expectedAnnotation := resourcefiles.SourceFileAnnotation + ":" + filepath.Join("/etc", etcFileSpec.Metadata().ID())
select {
case ev := <-ch:
version = ev.Resource.Metadata().Version()
case <-ctx.Done():
return ctx.Err()
fileSpec, err := st.WatchFor(ctx, resourcefiles.NewEtcFileSpec(resourcefiles.NamespaceName, constants.CRIConfig).Metadata(),
state.WithCondition(func(r resource.Resource) (bool, error) {
spec, ok := r.(*resourcefiles.EtcFileSpec)
if !ok {
return false, nil
}
value, ok := spec.Metadata().Annotations().Get(expectedAnnotation)
return ok && value == expectedChecksum, nil
}))
if err != nil {
return fmt.Errorf("error waiting for file %q to be updated: %w", constants.CRIConfig, err)
}
// wait for the file to be rendered
_, err := st.WatchFor(ctx, resourcefiles.NewEtcFileStatus(resourcefiles.NamespaceName, constants.CRIConfig).Metadata(), state.WithCondition(func(r resource.Resource) (bool, error) {
_, err = st.WatchFor(ctx, resourcefiles.NewEtcFileStatus(resourcefiles.NamespaceName, constants.CRIConfig).Metadata(), state.WithCondition(func(r resource.Resource) (bool, error) {
fileStatus, ok := r.(*resourcefiles.EtcFileStatus)
if !ok {
return false, nil
}
return fileStatus.TypedSpec().SpecVersion == version.String(), nil
return fileStatus.TypedSpec().SpecVersion == fileSpec.Metadata().Version().String(), nil
}))
return err

View File

@ -18,6 +18,7 @@ import (
"github.com/cosi-project/runtime/pkg/resource/rtestutils"
"github.com/cosi-project/runtime/pkg/safe"
"github.com/siderolabs/gen/ensure"
"github.com/siderolabs/gen/xslices"
"github.com/siderolabs/go-pointer"
"github.com/siderolabs/go-retry/retry"
"github.com/stretchr/testify/assert"
@ -121,10 +122,7 @@ func (suite *ApplyConfigSuite) TestApply() {
Mode: machineapi.ApplyConfigurationRequest_REBOOT,
},
)
if err != nil {
// It is expected that the connection will EOF here, so just log the error
suite.Assert().NoErrorf(err, "failed to apply configuration (node %q)", node)
}
suite.Assert().NoErrorf(err, "failed to apply configuration (node %q)", node)
return nil
}, assertRebootedRebootTimeout,
@ -153,6 +151,84 @@ func (suite *ApplyConfigSuite) TestApply() {
)
}
// TestApplyNoOpCRIPatch verifies the apply config with no-op CRI patch.
func (suite *ApplyConfigSuite) TestApplyNoOpCRIPatch() {
if testing.Short() {
suite.T().Skip("skipping in short mode")
}
if !suite.Capabilities().SupportsReboot {
suite.T().Skip("cluster doesn't support reboot")
}
node := suite.RandomDiscoveredNodeInternalIP(machine.TypeWorker)
suite.WaitForBootDone(suite.ctx)
nodeCtx := client.WithNode(suite.ctx, node)
provider, err := suite.ReadConfigFromNode(nodeCtx)
suite.Assert().NoErrorf(err, "failed to read existing config from node %q", node)
// this CRI patch is a no-op, as NRI is already disabled by default, this verifies that CRI config generation handles it correctly.
cfgDataOut := suite.PatchV1Alpha1Config(provider, func(cfg *v1alpha1.Config) {
cfg.MachineConfig.MachineFiles = xslices.Filter(cfg.MachineConfig.MachineFiles, func(file *v1alpha1.MachineFile) bool {
return file.FilePath != "/etc/cri/conf.d/20-customization.part"
})
cfg.MachineConfig.MachineFiles = append(cfg.MachineConfig.MachineFiles,
&v1alpha1.MachineFile{
FilePath: "/etc/cri/conf.d/20-customization.part",
FileOp: "create",
FileContent: `[plugins]
[plugins."io.containerd.nri.v1.nri"]
disable = true`,
},
)
})
suite.AssertRebooted(
suite.ctx, node, func(nodeCtx context.Context) error {
_, err = suite.Client.ApplyConfiguration(
nodeCtx, &machineapi.ApplyConfigurationRequest{
Data: cfgDataOut,
Mode: machineapi.ApplyConfigurationRequest_REBOOT,
},
)
suite.Assert().NoErrorf(err, "failed to apply configuration (node %q)", node)
return nil
}, assertRebootedRebootTimeout,
suite.CleanupFailedPods,
)
// revert the patch
provider, err = suite.ReadConfigFromNode(nodeCtx)
suite.Assert().NoErrorf(err, "failed to read existing config from node %q", node)
// this CRI patch is a no-op, as NRI is already disabled by default, this verifies that CRI config generation handles it correctly.
cfgDataOut = suite.PatchV1Alpha1Config(provider, func(cfg *v1alpha1.Config) {
cfg.MachineConfig.MachineFiles = xslices.Filter(cfg.MachineConfig.MachineFiles, func(file *v1alpha1.MachineFile) bool {
return file.FilePath != "/etc/cri/conf.d/20-customization.part"
})
})
suite.AssertRebooted(
suite.ctx, node, func(nodeCtx context.Context) error {
_, err = suite.Client.ApplyConfiguration(
nodeCtx, &machineapi.ApplyConfigurationRequest{
Data: cfgDataOut,
Mode: machineapi.ApplyConfigurationRequest_REBOOT,
},
)
suite.Assert().NoErrorf(err, "failed to apply configuration (node %q)", node)
return nil
}, assertRebootedRebootTimeout,
suite.CleanupFailedPods,
)
}
// TestApplyWithoutReboot verifies the apply config API without reboot.
func (suite *ApplyConfigSuite) TestApplyWithoutReboot() {
for _, mode := range []machineapi.ApplyConfigurationRequest_Mode{

View File

@ -6,7 +6,10 @@ package toml
import (
"bytes"
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"os"
"github.com/pelletier/go-toml/v2"
@ -14,37 +17,47 @@ import (
"github.com/siderolabs/talos/pkg/machinery/config/merge"
)
func tomlDecodeFile(path string, dest any) error {
// tomlDecodeFile decodes a TOML file into the provided destination, and returns a sha256 hash of the file content.
func tomlDecodeFile(path string, dest any) ([]byte, error) {
f, err := os.Open(path)
if err != nil {
return err
return nil, err
}
defer f.Close() //nolint:errcheck
return toml.NewDecoder(f).Decode(dest)
hash := sha256.New()
err = toml.NewDecoder(io.TeeReader(f, hash)).Decode(dest)
return hash.Sum(nil), err
}
// Merge several TOML documents in files into one.
//
// Merge process relies on generic map[string]any merge which might not always be correct.
func Merge(parts []string) ([]byte, error) {
//
// Merge returns a sha256 checksum of each file merged.
func Merge(parts []string) ([]byte, map[string][]byte, error) {
merged := map[string]any{}
checksums := make(map[string][]byte, len(parts))
var header []byte
for _, part := range parts {
partial := map[string]any{}
if err := tomlDecodeFile(part, &partial); err != nil {
return nil, fmt.Errorf("error decoding %q: %w", part, err)
hash, err := tomlDecodeFile(part, &partial)
if err != nil {
return nil, nil, fmt.Errorf("error decoding %q: %w", part, err)
}
if err := merge.Merge(merged, partial); err != nil {
return nil, fmt.Errorf("error merging %q: %w", part, err)
return nil, nil, fmt.Errorf("error merging %q: %w", part, err)
}
header = append(header, []byte(fmt.Sprintf("## %s\n", part))...)
header = fmt.Appendf(header, "## %s (sha256:%s)\n", part, hex.EncodeToString(hash))
checksums[part] = hash
}
var out bytes.Buffer
@ -53,8 +66,8 @@ func Merge(parts []string) ([]byte, error) {
_ = out.WriteByte('\n')
if err := toml.NewEncoder(&out).SetIndentTables(true).Encode(merged); err != nil {
return nil, fmt.Errorf("error encoding merged config: %w", err)
return nil, nil, fmt.Errorf("error encoding merged config: %w", err)
}
return out.Bytes(), nil
return out.Bytes(), checksums, nil
}

View File

@ -18,12 +18,15 @@ import (
var expected []byte
func TestMerge(t *testing.T) {
out, err := toml.Merge([]string{
out, checksums, err := toml.Merge([]string{
"testdata/1.toml",
"testdata/2.toml",
"testdata/3.toml",
})
require.NoError(t, err)
assert.Equal(t, expected, out)
assert.Equal(t, string(expected), string(out))
assert.Contains(t, checksums, "testdata/1.toml")
assert.Contains(t, checksums, "testdata/2.toml")
assert.Contains(t, checksums, "testdata/3.toml")
}

View File

@ -1,6 +1,6 @@
## testdata/1.toml
## testdata/2.toml
## testdata/3.toml
## testdata/1.toml (sha256:2eac71621235f8666c54ad3f29a77b2e8483bbd7f0717f8613af591fb5609b44)
## testdata/2.toml (sha256:47ae85a638a291b04518413a12b19c51883a17c8f5064193462d3527b4495e36)
## testdata/3.toml (sha256:159608dffd674e5fe351d47166eab59ee93f6523ff336602364edfd7be25c796)
version = 2

View File

@ -9,3 +9,6 @@ import "github.com/cosi-project/runtime/pkg/resource"
// NamespaceName contains file resources.
const NamespaceName resource.Namespace = "files"
// SourceFileAnnotation is used to annotate a file resource with the source file path(s).
const SourceFileAnnotation = "talos.dev/source-file"