From 689974bd55ca28ccdd519f11dee0c3b8bc727601 Mon Sep 17 00:00:00 2001 From: Noel Georgi Date: Sat, 25 Apr 2026 23:36:32 +0530 Subject: [PATCH] fix: volume mount permissions Make Talos volume mount options stricter. Fixes: #11946 Signed-off-by: Noel Georgi --- .github/workflows/ci.yaml | 139 +++++++++- ...tegration-qemu-csi-longhorn-triggered.yaml | 24 +- .kres.yaml | 34 ++- api/resource/definitions/block/block.proto | 3 + hack/release.toml | 23 ++ hack/test/patches/ephemeral-insecure.yaml | 6 + .../volumes/volumeconfig/system_volumes.go | 5 + .../volumeconfig/system_volumes_test.go | 40 +++ .../machined/pkg/controllers/block/mount.go | 10 +- .../pkg/controllers/cri/image_cache_config.go | 3 + .../v1alpha1/v1alpha1_sequencer_tasks.go | 6 +- .../app/machined/pkg/system/services/utils.go | 4 +- internal/integration/api/mounts.go | 259 ++++++++++++++++++ internal/integration/base/base.go | 3 + internal/integration/integration_test.go | 64 +++-- internal/integration/k8s/longhorn.go | 158 +++-------- internal/integration/k8s/longhorn_v1.go | 209 ++++++++++++++ internal/pkg/mount/v3/helpers.go | 39 ++- internal/pkg/mount/v3/manager.go | 4 +- internal/pkg/mount/v3/mount.go | 9 +- internal/pkg/mount/v3/point.go | 6 +- internal/pkg/selinux/policy/policy.33 | Bin 44320 -> 44356 bytes .../policy/selinux/common/processes.cil | 4 + .../policy/selinux/services/machined.cil | 5 + .../selinux/policy/selinux/services/udev.cil | 3 + .../resource/definitions/block/block.pb.go | 17 +- .../definitions/block/block_vtproto.pb.go | 33 +++ pkg/machinery/constants/constants.go | 13 +- .../resources/block/volume_config.go | 3 + website/content/v1.14/reference/api.md | 1 + 30 files changed, 913 insertions(+), 214 deletions(-) create mode 100644 hack/test/patches/ephemeral-insecure.yaml create mode 100644 internal/integration/api/mounts.go create mode 100644 internal/integration/k8s/longhorn_v1.go diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 
188e4c159..f8d7d9da3 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1,6 +1,6 @@ # THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT. # -# Generated on 2026-05-05T17:21:59Z by kres 1762ab2. +# Generated on 2026-05-05T14:21:38Z by kres 1762ab2. concurrency: group: ${{ github.head_ref || github.run_id }} @@ -3125,7 +3125,7 @@ jobs: pull-requests: read runs-on: group: large - if: (!startsWith(github.head_ref, 'renovate/') && !startsWith(github.head_ref, 'dependabot/')) && !cancelled() && github.event_name == 'pull_request' + if: (!startsWith(github.head_ref, 'renovate/') && !startsWith(github.head_ref, 'dependabot/')) && !cancelled() strategy: matrix: include: @@ -3404,7 +3404,7 @@ jobs: pull-requests: read runs-on: group: large - if: (!startsWith(github.head_ref, 'renovate/') && !startsWith(github.head_ref, 'dependabot/')) && !cancelled() && github.event_name == 'pull_request' + if: (!startsWith(github.head_ref, 'renovate/') && !startsWith(github.head_ref, 'dependabot/')) && !cancelled() strategy: matrix: include: @@ -3534,7 +3534,7 @@ jobs: pull-requests: read runs-on: group: large - if: (!startsWith(github.head_ref, 'renovate/') && !startsWith(github.head_ref, 'dependabot/')) && !cancelled() && github.event_name == 'pull_request' + if: (!startsWith(github.head_ref, 'renovate/') && !startsWith(github.head_ref, 'dependabot/')) && !cancelled() strategy: matrix: include: @@ -4006,7 +4006,7 @@ jobs: /tmp/logs-*.tar.gz /tmp/support-*.zip retention-days: "5" - integration-qemu-csi-longhorn: + integration-qemu-csi-longhorn-v1: permissions: actions: read contents: write @@ -4015,7 +4015,7 @@ jobs: pull-requests: read runs-on: group: large - if: contains(fromJSON(needs.default.outputs.labels || '[]'), 'integration/qemu-csi') || contains(fromJSON(needs.default.outputs.labels || '[]'), 'integration/extensions') || contains(fromJSON(needs.default.outputs.labels || '[]'), 'integration/qemu-csi-longhorn') || 
contains(fromJSON(needs.default.outputs.labels || '[]'), 'integration/release-gate') + if: contains(fromJSON(needs.default.outputs.labels || '[]'), 'integration/qemu-csi-longhorn-v1') || contains(fromJSON(needs.default.outputs.labels || '[]'), 'integration/qemu-csi') || contains(fromJSON(needs.default.outputs.labels || '[]'), 'integration/extensions') || contains(fromJSON(needs.default.outputs.labels || '[]'), 'integration/qemu-csi-longhorn') || contains(fromJSON(needs.default.outputs.labels || '[]'), 'integration/release-gate') needs: - default steps: @@ -4094,8 +4094,127 @@ jobs: make kubelet-fat-patch - name: e2e-qemu-csi-longhorn env: - EXTRA_TEST_ARGS: -talos.csi=longhorn - GITHUB_STEP_NAME: ${{ github.job}}-e2e-qemu-csi-longhorn + EXTRA_TEST_ARGS: -talos.csi=longhorn-v1 -talos.skip-ephemeral-policy + GITHUB_STEP_NAME: ${{ github.job}}-e2e-qemu-csi-longhorn-v1 + IMAGE_REGISTRY: registry.dev.siderolabs.io + QEMU_CPUS_WORKERS: "3" + QEMU_EXTRA_DISKS: "1" + QEMU_EXTRA_DISKS_DRIVERS: nvme + QEMU_EXTRA_DISKS_SIZE: "12288" + QEMU_MEMORY_WORKERS: "10240" + QEMU_SYSTEM_DISK_SIZE: "20480" + QEMU_WORKERS: "3" + SHORT_INTEGRATION_TEST: "yes" + WITH_CONFIG_PATCH_CONTROLPLANE: '@hack/test/patches/longhorn-cp.yaml' + WITH_CONFIG_PATCH_WORKER: '@_out/installer-extensions-patch.yaml:@_out/kubelet-fat-patch.yaml:@hack/test/patches/longhorn.yaml:@hack/test/patches/ephemeral-insecure.yaml' + run: | + sudo -E make e2e-qemu + - name: save artifacts + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # version: v7.0.1 + with: + name: fio-integration-qemu-csi-longhorn-v1 + path: | + /tmp/fio-*.json + retention-days: "180" + - name: save artifacts + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # version: v7.0.1 + with: + name: talos-logs-integration-qemu-csi-longhorn-v1 + path: |- + /tmp/logs-*.tar.gz + /tmp/support-*.zip + retention-days: "5" + integration-qemu-csi-longhorn-v2: + permissions: + actions: read + contents: 
write + issues: read + packages: write + pull-requests: read + runs-on: + group: large + if: contains(fromJSON(needs.default.outputs.labels || '[]'), 'integration/qemu-csi-longhorn-v2') || contains(fromJSON(needs.default.outputs.labels || '[]'), 'integration/qemu-csi') || contains(fromJSON(needs.default.outputs.labels || '[]'), 'integration/extensions') || contains(fromJSON(needs.default.outputs.labels || '[]'), 'integration/qemu-csi-longhorn') || contains(fromJSON(needs.default.outputs.labels || '[]'), 'integration/release-gate') + needs: + - default + steps: + - name: gather-system-info + id: system-info + uses: kenchan0130/actions-system-info@59699597e84e80085a750998045983daa49274c4 # version: v1.4.0 + continue-on-error: true + - name: print-system-info + run: | + MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024)) + + OUTPUTS=( + "CPU Core: ${{ steps.system-info.outputs.cpu-core }}" + "CPU Model: ${{ steps.system-info.outputs.cpu-model }}" + "Hostname: ${{ steps.system-info.outputs.hostname }}" + "NodeName: ${NODE_NAME}" + "Kernel release: ${{ steps.system-info.outputs.kernel-release }}" + "Kernel version: ${{ steps.system-info.outputs.kernel-version }}" + "Name: ${{ steps.system-info.outputs.name }}" + "Platform: ${{ steps.system-info.outputs.platform }}" + "Release: ${{ steps.system-info.outputs.release }}" + "Total memory: ${MEMORY_GB} GB" + ) + + for OUTPUT in "${OUTPUTS[@]}";do + echo "${OUTPUT}" + done + continue-on-error: true + - name: checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2 + - name: Unshallow + run: | + git fetch --prune --unshallow + - name: Set up Docker Buildx + id: setup-buildx + uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # version: v4.0.0 + with: + driver: remote + endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234 + timeout-minutes: 10 + - name: Download artifacts + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c 
# version: v8.0.1 + with: + name: talos-artifacts + path: _out + - name: Fix artifact permissions + run: | + xargs -a _out/executable-artifacts -I {} chmod +x {} + - name: ci-temp-release-tag + run: | + make ci-temp-release-tag + - name: checkout extensions + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2 + with: + path: _out/extensions + ref: main + repository: siderolabs/extensions + - name: set variables + run: | + cat _out/talos-metadata >> "$GITHUB_ENV" + - name: build extensions + env: + PLATFORM: linux/amd64 + PUSH: "true" + REGISTRY: registry.dev.siderolabs.io + run: | + make iscsi-tools util-linux-tools extensions-metadata -C _out/extensions + - name: installer extensions + env: + EXTENSIONS_FILTER_COMMAND: grep -E '/iscsi-tools|util-linux-tools' + IMAGE_REGISTRY: registry.dev.siderolabs.io + run: | + make installer-with-extensions + - name: kubelet-fat-patch + run: | + make kubelet-fat-patch + - name: e2e-qemu-csi-longhorn + env: + EXTRA_TEST_ARGS: '-talos.csi=longhorn ' + GITHUB_STEP_NAME: ${{ github.job}}-e2e-qemu-csi-longhorn-v2 IMAGE_REGISTRY: registry.dev.siderolabs.io QEMU_CPUS_WORKERS: "3" QEMU_EXTRA_DISKS: "1" @@ -4112,7 +4231,7 @@ jobs: - name: save artifacts uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # version: v7.0.1 with: - name: fio-integration-qemu-csi-longhorn + name: fio-integration-qemu-csi-longhorn-v2 path: | /tmp/fio-*.json retention-days: "180" @@ -4120,7 +4239,7 @@ jobs: if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # version: v7.0.1 with: - name: talos-logs-integration-qemu-csi-longhorn + name: talos-logs-integration-qemu-csi-longhorn-v2 path: |- /tmp/logs-*.tar.gz /tmp/support-*.zip diff --git a/.github/workflows/integration-qemu-csi-longhorn-triggered.yaml b/.github/workflows/integration-qemu-csi-longhorn-triggered.yaml index 66b7bb916..b7afe17b4 100644 --- a/.github/workflows/integration-qemu-csi-longhorn-triggered.yaml +++ 
b/.github/workflows/integration-qemu-csi-longhorn-triggered.yaml @@ -1,6 +1,6 @@ # THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT. # -# Generated on 2026-04-27T17:59:00Z by kres e4dc583. +# Generated on 2026-05-05T14:21:38Z by kres 1762ab2. concurrency: group: ${{ github.head_ref || github.run_id }} @@ -14,11 +14,23 @@ concurrency: name: integration-qemu-csi-longhorn-triggered jobs: default: + name: ${{ matrix.longhornEngine }} permissions: actions: read runs-on: group: large if: github.event.workflow_run.conclusion == 'success' + strategy: + matrix: + include: + - csi: longhorn + longhornEngine: v2 + - csi: longhorn-v1 + extraTestArgs: -talos.skip-ephemeral-policy + extraWorkerPatch: :@hack/test/patches/ephemeral-insecure.yaml + longhornEngine: v1 + fail-fast: false + max-parallel: 2 steps: - name: gather-system-info id: system-info @@ -97,8 +109,8 @@ jobs: make kubelet-fat-patch - name: e2e-qemu-csi-longhorn env: - EXTRA_TEST_ARGS: -talos.csi=longhorn - GITHUB_STEP_NAME: ${{ github.job}}-e2e-qemu-csi-longhorn + EXTRA_TEST_ARGS: -talos.csi=${{ matrix.csi }} ${{ matrix.extraTestArgs }} + GITHUB_STEP_NAME: ${{ github.job}}-e2e-qemu-csi-longhorn-${{ matrix.longhornEngine }} IMAGE_REGISTRY: registry.dev.siderolabs.io QEMU_CPUS_WORKERS: "3" QEMU_EXTRA_DISKS: "1" @@ -109,13 +121,13 @@ jobs: QEMU_WORKERS: "3" SHORT_INTEGRATION_TEST: "yes" WITH_CONFIG_PATCH_CONTROLPLANE: '@hack/test/patches/longhorn-cp.yaml' - WITH_CONFIG_PATCH_WORKER: '@_out/installer-extensions-patch.yaml:@_out/kubelet-fat-patch.yaml:@hack/test/patches/longhorn.yaml' + WITH_CONFIG_PATCH_WORKER: '@_out/installer-extensions-patch.yaml:@_out/kubelet-fat-patch.yaml:@hack/test/patches/longhorn.yaml${{ matrix.extraWorkerPatch }}' run: | sudo -E make e2e-qemu - name: save artifacts uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # version: v7.0.1 with: - name: fio-integration-qemu-csi-longhorn + name: fio-integration-qemu-csi-longhorn-${{ matrix.longhornEngine }} path: | 
/tmp/fio-*.json retention-days: "180" @@ -123,7 +135,7 @@ jobs: if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # version: v7.0.1 with: - name: talos-logs-integration-qemu-csi-longhorn + name: talos-logs-integration-qemu-csi-longhorn-${{ matrix.longhornEngine }} path: |- /tmp/logs-*.tar.gz /tmp/support-*.zip diff --git a/.kres.yaml b/.kres.yaml index 9d2e0983f..2c988992f 100644 --- a/.kres.yaml +++ b/.kres.yaml @@ -89,6 +89,8 @@ spec: integration/qemu-csi: "Run QEMU CSI integration tests" integration/qemu-csi-openebs: "Run QEMU CSI OpenEBS integration tests" integration/qemu-csi-longhorn: "Run QEMU CSI Longhorn integration tests" + integration/qemu-csi-longhorn-v1: "Run QEMU CSI Longhorn v1 integration tests" + integration/qemu-csi-longhorn-v2: "Run QEMU CSI Longhorn v2 integration tests" integration/qemu-csi-rook-ceph: "Run QEMU CSI Rook Ceph integration tests" integration/qemu-default: "Run QEMU integration tests on default variant" integration/qemu-encrypted-vip: "Run QEMU integration tests with disk encryption and virtual IP" @@ -934,7 +936,6 @@ spec: - integration-build-enforcing conditions: - not-cancelled - - on-pull-request runnerGroup: large onWorkflowRun: workflows: [integration-build-enforcing-triggered] @@ -1108,7 +1109,6 @@ spec: - integration-build-enforcing conditions: - not-cancelled - - on-pull-request runnerGroup: large onWorkflowRun: workflows: [integration-build-enforcing-triggered] @@ -1187,7 +1187,6 @@ spec: - integration-build-enforcing conditions: - not-cancelled - - on-pull-request runnerGroup: large onWorkflowRun: workflows: [integration-build-enforcing-triggered] @@ -1478,9 +1477,26 @@ spec: types: [completed] triggerLabels: - integration/qemu-csi - - integration/extensions # since iscsi is tested with longhorn + - integration/extensions # since iscsi is tested with longhorn v1 - integration/qemu-csi-longhorn - integration/release-gate + matrix: + maxParallel: 2 + labelKeys: [longhornEngine] + # v2 
(SPDK) is the default-secure path; v1 needs noexec disabled on + # /var so the instance-manager can exec engine binaries it drops + # under /var/lib/longhorn/engine-binaries/. The v1 path also tells + # MountsSuite to skip the /var policy check via + # -talos.skip-ephemeral-policy. + include: + - longhornEngine: v2 + csi: longhorn + extraWorkerPatch: "" + extraTestArgs: "" + - longhornEngine: v1 + csi: longhorn-v1 + extraWorkerPatch: ":@hack/test/patches/ephemeral-insecure.yaml" + extraTestArgs: "-talos.skip-ephemeral-policy" steps: - name: download-artifacts artifactStep: @@ -1515,7 +1531,7 @@ spec: command: e2e-qemu withSudo: true environment: - GITHUB_STEP_NAME: ${{ github.job}}-e2e-qemu-csi-longhorn + GITHUB_STEP_NAME: ${{ github.job}}-e2e-qemu-csi-longhorn-${{ matrix.longhornEngine }} SHORT_INTEGRATION_TEST: yes QEMU_WORKERS: 3 QEMU_MEMORY_WORKERS: 10240 @@ -1525,13 +1541,13 @@ spec: QEMU_EXTRA_DISKS_SIZE: 12288 QEMU_EXTRA_DISKS_DRIVERS: nvme WITH_CONFIG_PATCH_CONTROLPLANE: "@hack/test/patches/longhorn-cp.yaml" - WITH_CONFIG_PATCH_WORKER: "@_out/installer-extensions-patch.yaml:@_out/kubelet-fat-patch.yaml:@hack/test/patches/longhorn.yaml" - EXTRA_TEST_ARGS: -talos.csi=longhorn + WITH_CONFIG_PATCH_WORKER: "@_out/installer-extensions-patch.yaml:@_out/kubelet-fat-patch.yaml:@hack/test/patches/longhorn.yaml${{ matrix.extraWorkerPatch }}" + EXTRA_TEST_ARGS: -talos.csi=${{ matrix.csi }} ${{ matrix.extraTestArgs }} IMAGE_REGISTRY: registry.dev.siderolabs.io - name: save-fio-benchmark artifactStep: type: upload - artifactName: fio-integration-qemu-csi-longhorn + artifactName: fio-integration-qemu-csi-longhorn-${{ matrix.longhornEngine }} disableExecutableListGeneration: true artifactPath: /tmp/fio-*.json retentionDays: "180" @@ -1540,7 +1556,7 @@ spec: - always artifactStep: type: upload - artifactName: talos-logs-integration-qemu-csi-longhorn + artifactName: talos-logs-integration-qemu-csi-longhorn-${{ matrix.longhornEngine }} disableExecutableListGeneration: true 
artifactPath: /tmp/logs-*.tar.gz additionalArtifacts: diff --git a/api/resource/definitions/block/block.proto b/api/resource/definitions/block/block.proto index 4bd6fc52a..c7a5a35db 100755 --- a/api/resource/definitions/block/block.proto +++ b/api/resource/definitions/block/block.proto @@ -173,6 +173,9 @@ message MountSpec { string bind_target = 9; // Parameters are additional filesystem mount options used when mounting the volume. repeated ParameterSpec parameters = 10; + // Secure applies MOUNT_ATTR_NOSUID\|NODEV\|NOEXEC to the mount. Set for + // config-only mounts; leave false for mounts hosting executables. + bool secure = 11; } // MountStatusSpec is the spec for MountStatus. diff --git a/hack/release.toml b/hack/release.toml index 78074121c..81347d935 100644 --- a/hack/release.toml +++ b/hack/release.toml @@ -77,6 +77,29 @@ List of changes: DHCPv4 search domains are now applied to the resolver configuration. """ +[notes.EPHEMERAL] + title = "noexec on EPHEMERAL (/var)" + description = """\ +The EPHEMERAL volume (`/var`) is now mounted with `noexec` in addition to the existing `nosuid` and `nodev`, +blocking binary execution from `/var`. + +Workloads that exec binaries placed under `/var` will break. +For example, Longhorn v1's `instance-manager` exec's engine binaries the `engine-image` DaemonSet drops under `/var/lib/longhorn/engine-binaries/`, +which now fails with `permission denied`. Affected users can opt out via a `VolumeConfig` document: + +```yaml +apiVersion: v1alpha1 +kind: VolumeConfig +name: EPHEMERAL +mount: + secure: false +``` + +> NOTE: Setting `secure: false` will also disable `nosuid` and `nodev`, which may have security implications. Use with caution. + +Upgrade note: apply this `VolumeConfig` patch *before* upgrading, otherwise affected workloads will fail after the next reboot. Longhorn v2 (SPDK data engine) runs the data plane inside the instance manager process and is not affected. 
+""" + [make_deps] [make_deps.tools] diff --git a/hack/test/patches/ephemeral-insecure.yaml b/hack/test/patches/ephemeral-insecure.yaml new file mode 100644 index 000000000..eeab03d77 --- /dev/null +++ b/hack/test/patches/ephemeral-insecure.yaml @@ -0,0 +1,6 @@ +--- +apiVersion: v1alpha1 +kind: VolumeConfig +name: EPHEMERAL +mount: + secure: false diff --git a/internal/app/machined/pkg/controllers/block/internal/volumes/volumeconfig/system_volumes.go b/internal/app/machined/pkg/controllers/block/internal/volumes/volumeconfig/system_volumes.go index 6b8aeb29e..ef3daffb4 100644 --- a/internal/app/machined/pkg/controllers/block/internal/volumes/volumeconfig/system_volumes.go +++ b/internal/app/machined/pkg/controllers/block/internal/volumes/volumeconfig/system_volumes.go @@ -61,6 +61,7 @@ func GetStateVolumeTransformer(encryptionMeta *runtime.MetaKey, inContainer, isA FileMode: 0o700, UID: 0, GID: 0, + Secure: true, }).WriterFunc() } else { // STATE configuration should be always created, but it depends on the configuration presence @@ -133,6 +134,7 @@ func GetEphemeralVolumeTransformer(inContainer bool) volumeConfigTransformer { UID: 0, GID: 0, ProjectQuotaSupport: cfg.Machine().Features().DiskQuotaSupportEnabled(), + Secure: extraVolumeConfig.Mount().Secure(), }). WithLocator(labelVolumeMatch(constants.EphemeralPartitionLabel)). WithFunc(func(vcs *block.VolumeConfigSpec) error { @@ -185,6 +187,7 @@ func GetOverlayVolumesTransformer(inContainer bool) func(configconfig.Config) ([ FileMode: 0o755, UID: 0, GID: 0, + Secure: overlay.Secure, }).WriterFunc(), }) } @@ -207,6 +210,7 @@ func manageStateNoConfig(encryptionMeta *runtime.MetaKey, isAgent bool) func(vc FileMode: 0o700, UID: 0, GID: 0, + Secure: true, }).WithLocator(match). WithFunc(func(spec *block.VolumeConfigSpec) error { if encryptionMeta != nil { @@ -248,6 +252,7 @@ func manageStateConfigPresent(cfg configconfig.Config) func(vc *block.VolumeConf FileMode: 0o700, UID: 0, GID: 0, + Secure: true, }). 
WithProvisioning(block.ProvisioningSpec{ Wave: block.WaveSystemDisk, diff --git a/internal/app/machined/pkg/controllers/block/internal/volumes/volumeconfig/system_volumes_test.go b/internal/app/machined/pkg/controllers/block/internal/volumes/volumeconfig/system_volumes_test.go index 4106b7e90..cd217dff1 100644 --- a/internal/app/machined/pkg/controllers/block/internal/volumes/volumeconfig/system_volumes_test.go +++ b/internal/app/machined/pkg/controllers/block/internal/volumes/volumeconfig/system_volumes_test.go @@ -448,6 +448,46 @@ func TestEphemeralVolumeTransformerWithExtraConfig(t *testing.T) { }) } +func TestEphemeralVolumeSecure(t *testing.T) { + t.Parallel() + + t.Run("default is secure", func(t *testing.T) { + t.Parallel() + + transformer := volumeconfig.GetEphemeralVolumeTransformer(false) + resources, err := transformer(container.NewV1Alpha1(&baseCfg)) + require.NoError(t, err) + require.Len(t, resources, 1) + + testTransformFunc(t, resources[0].TransformFunc, func(t *testing.T, vc *block.VolumeConfig, err error) { + require.NoError(t, err) + assert.True(t, vc.TypedSpec().Mount.Secure, "EPHEMERAL should be secure by default") + }) + }) + + t.Run("secure=false via VolumeConfig overrides default", func(t *testing.T) { + t.Parallel() + + secureOff := false + ephemeralCfg := blockcfg.NewVolumeConfigV1Alpha1() + ephemeralCfg.MetaName = constants.EphemeralPartitionLabel + ephemeralCfg.MountSpec.MountSecure = &secureOff + + cfg, err := container.New(baseCfg.DeepCopy(), ephemeralCfg) + require.NoError(t, err) + + transformer := volumeconfig.GetEphemeralVolumeTransformer(false) + resources, err := transformer(cfg) + require.NoError(t, err) + require.Len(t, resources, 1) + + testTransformFunc(t, resources[0].TransformFunc, func(t *testing.T, vc *block.VolumeConfig, err error) { + require.NoError(t, err) + assert.False(t, vc.TypedSpec().Mount.Secure, "EPHEMERAL Secure should be overridable via VolumeConfig") + }) + }) +} + func testTransformFunc(t *testing.T, 
transformer func(vc *block.VolumeConfig) error, checkFunc func(t *testing.T, vc *block.VolumeConfig, err error), diff --git a/internal/app/machined/pkg/controllers/block/mount.go b/internal/app/machined/pkg/controllers/block/mount.go index 77a9a7214..779b36263 100644 --- a/internal/app/machined/pkg/controllers/block/mount.go +++ b/internal/app/machined/pkg/controllers/block/mount.go @@ -756,11 +756,19 @@ func (ctrl *MountController) handleOverlayMountOperation( return fmt.Errorf("overlay mount is not supported for %q", volumeStatus.TypedSpec().ParentID) } + overlayOpts := []mount.ManagerOption{ + mount.WithSelinuxLabel(volumeStatus.TypedSpec().MountSpec.SelinuxLabel), + } + + if volumeStatus.TypedSpec().MountSpec.Secure { + overlayOpts = append(overlayOpts, mount.WithSecure()) + } + manager := mount.NewVarOverlay( []string{mountTarget}, mountTarget, logger.Sugar().Infof, - mount.WithSelinuxLabel(volumeStatus.TypedSpec().MountSpec.SelinuxLabel), + overlayOpts..., ) mountpoint, err := manager.Mount() diff --git a/internal/app/machined/pkg/controllers/cri/image_cache_config.go b/internal/app/machined/pkg/controllers/cri/image_cache_config.go index 06584bb89..b54d146ed 100644 --- a/internal/app/machined/pkg/controllers/cri/image_cache_config.go +++ b/internal/app/machined/pkg/controllers/cri/image_cache_config.go @@ -385,6 +385,9 @@ func (ctrl *ImageCacheConfigController) analyzeImageCacheVolumes(ctx context.Con mountRequest.TypedSpec().Requester = ctrl.Name() mountRequest.TypedSpec().VolumeID = volumeID mountRequest.TypedSpec().ReadOnly = !(volumeStatus.Metadata().ID() == VolumeImageCacheDISK && isoPresent) + // Image cache stores OCI image data only; Secure applies + // nosuid+nodev+noexec. 
+ mountRequest.TypedSpec().Secure = true return nil }, diff --git a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go index 7219c035d..fbd7154ba 100644 --- a/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go +++ b/internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer_tasks.go @@ -1460,10 +1460,8 @@ func MountEphemeralPartition(runtime.Sequence, any) (runtime.TaskExecutionFunc, mountRequest.TypedSpec().VolumeID = constants.EphemeralPartitionLabel mountRequest.TypedSpec().Requester = "sequencer" - if cfg := r.Config(); cfg != nil { - vol, _ := cfg.Volumes().ByName(constants.EphemeralPartitionLabel) - mountRequest.TypedSpec().Secure = vol.Mount().Secure() - } + vol, _ := r.Config().Volumes().ByName(constants.EphemeralPartitionLabel) + mountRequest.TypedSpec().Secure = vol.Mount().Secure() if err := r.State().V1Alpha2().Resources().Create(ctx, mountRequest); err != nil { return fmt.Errorf("failed to create EPHEMERAL mount request: %w", err) diff --git a/internal/app/machined/pkg/system/services/utils.go b/internal/app/machined/pkg/system/services/utils.go index b8b5c22bc..37191c20e 100644 --- a/internal/app/machined/pkg/system/services/utils.go +++ b/internal/app/machined/pkg/system/services/utils.go @@ -10,9 +10,9 @@ import ( "path/filepath" specs "github.com/opencontainers/runtime-spec/specs-go" - "golang.org/x/sys/unix" "github.com/siderolabs/talos/internal/pkg/containermode" + mount "github.com/siderolabs/talos/internal/pkg/mount/v3" "github.com/siderolabs/talos/pkg/machinery/constants" ) @@ -30,7 +30,7 @@ func prepareRootfs(id string) error { return fmt.Errorf("failed to create empty executable %q: %w", executablePath, err) } - if err := unix.Mount("/sbin/init", executablePath, "", unix.MS_BIND, ""); err != nil { + if err := mount.BindReadonly("/sbin/init", executablePath); err != nil { return fmt.Errorf("failed to create bind mount for %q: 
%w", executablePath, err) } diff --git a/internal/integration/api/mounts.go b/internal/integration/api/mounts.go new file mode 100644 index 000000000..632fb55a2 --- /dev/null +++ b/internal/integration/api/mounts.go @@ -0,0 +1,259 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +//go:build integration_api + +package api + +import ( + "bufio" + "context" + "fmt" + "io" + "strings" + "time" + + "github.com/siderolabs/talos/internal/integration/base" + "github.com/siderolabs/talos/pkg/machinery/client" + "github.com/siderolabs/talos/pkg/machinery/constants" +) + +// MountsSuite verifies mount flag policy on a running node. +// +// Policy (see siderolabs/talos#11946): +// - every rw mount must carry MOUNT_ATTR_NOSUID, MOUNT_ATTR_NOEXEC, +// MOUNT_ATTR_NODEV unless explicitly exempt +// - device nodes are not allowed outside /dev and /dev/pts: NODEV is +// non-negotiable for every other mountpoint +type MountsSuite struct { + base.APISuite + + ctx context.Context //nolint:containedctx + ctxCancel context.CancelFunc +} + +// SuiteName implements suite.NamedSuite. +func (suite *MountsSuite) SuiteName() string { + return "api.MountsSuite" +} + +// SetupTest sets up the test context. +func (suite *MountsSuite) SetupTest() { + suite.ctx, suite.ctxCancel = context.WithTimeout(context.Background(), 30*time.Second) + + if suite.Cluster == nil || suite.Cluster.Provisioner() != base.ProvisionerQEMU { + suite.T().Skip("skipping mounts test since provisioner is not qemu") + } +} + +// TearDownTest cancels the test context. +func (suite *MountsSuite) TearDownTest() { + if suite.ctxCancel != nil { + suite.ctxCancel() + } +} + +// mountInfo is one parsed entry from /proc/self/mountinfo. 
+type mountInfo struct { + mountPoint string + fsType string + source string + options map[string]struct{} // per-mount options (field 6) +} + +func (m mountInfo) has(opt string) bool { + _, ok := m.options[opt] + + return ok +} + +// parseMountInfo parses /proc/self/mountinfo per Linux kernel docs: +// fields[4] = mount point, fields[5] = per-mount options, after " - ": +// fstype, source, super-options. +func parseMountInfo(r io.Reader) ([]mountInfo, error) { + var out []mountInfo + + scanner := bufio.NewScanner(r) + scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024) + + for scanner.Scan() { + line := scanner.Text() + + pre, post, ok := strings.Cut(line, " - ") + if !ok { + continue + } + + preFields := strings.Fields(pre) + postFields := strings.Fields(post) + + if len(preFields) < 6 || len(postFields) < 2 { + continue + } + + opts := make(map[string]struct{}) + for o := range strings.SplitSeq(preFields[5], ",") { + opts[o] = struct{}{} + } + + out = append(out, mountInfo{ + mountPoint: preFields[4], + options: opts, + fsType: postFields[0], + source: postFields[1], + }) + } + + return out, scanner.Err() +} + +// nodevExempt returns true for mountpoints where device nodes are legitimate. +// Only devtmpfs at /dev and devpts at /dev/pts qualify. +func nodevExempt(m mountInfo) bool { + switch { + case m.fsType == "devtmpfs" && m.mountPoint == "/dev": + return true + case m.fsType == "devpts" && m.mountPoint == "/dev/pts": + return true + } + + return false +} + +// workloadManagedPrefixes lists mount path prefixes that are created by +// kubelet, containerd, or CNI plugins — not by Talos. Their flags are out +// of scope for the Talos mount policy. 
+var workloadManagedPrefixes = []string{ + "/run/containerd/io.containerd.", + "/run/netns/", + "/var/lib/kubelet/pods/", +} + +func workloadManaged(m mountInfo) bool { + for _, p := range workloadManagedPrefixes { + if strings.HasPrefix(m.mountPoint, p) { + return true + } + } + + return false +} + +// noexecExemptPrefixes lists mount path prefixes where executing binaries +// is part of the design. Read-only mounts are exempt elsewhere via the +// `ro` option. /var (EPHEMERAL) is intentionally NOT exempt: containerd +// container exec goes through overlay rootfs at /run/containerd/.../rootfs +// which is a separate mount with its own flags. +var noexecExemptPrefixes = []string{ + "/opt", // CNI plugins, containerd plugins + "/usr/libexec/kubernetes", // kubelet plugins + "/usr/lib/udev", // udev helpers + constants.ExtensionServiceRootfsPath, // /usr/local/lib/containers — extension service rootfs overlays (iscsid, etc.) +} + +func noexecExempt(m mountInfo) bool { + if m.has("ro") { + return true + } + + // devtmpfs and hugetlbfs cannot host regular executable files in any + // way that a userspace exec() would care about; systemd matches this + // stance (see mount_table in systemd/src/shared/mount-setup.c — no + // MS_NOEXEC on /dev). + switch m.fsType { + case "devtmpfs", "hugetlbfs": + return true + } + + for _, p := range noexecExemptPrefixes { + if m.mountPoint == p || strings.HasPrefix(m.mountPoint, p+"/") { + return true + } + } + + return false +} + +// TestNodevPolicy asserts every mount outside /dev and /dev/pts carries nodev. +func (suite *MountsSuite) TestNodevPolicy() { + suite.runPolicy("nodev", nodevExempt, "device nodes only in /dev and /dev/pts") +} + +// TestNosuidPolicy asserts every mount carries nosuid. Talos has no +// legitimate SUID surface — even read-only signed rootfs/extension +// squashfs mounts ship no setuid binaries, so no exemptions. 
+func (suite *MountsSuite) TestNosuidPolicy() { + suite.runPolicy("nosuid", func(m mountInfo) bool { + return false + }, "no SUID binaries anywhere in Talos") +} + +// TestNoexecPolicy asserts every rw mount carries noexec, except +// documented exemptions (EPHEMERAL, /opt/cni, kubelet plugins, udev +// helpers). Read-only mounts are exempt (signed rootfs / extension +// squashfs). +func (suite *MountsSuite) TestNoexecPolicy() { + suite.runPolicy("noexec", noexecExempt, + "binaries should only execute from RO or explicitly exempt mounts") +} + +func (suite *MountsSuite) runPolicy(opt string, exempt func(mountInfo) bool, rationale string) { + for _, node := range suite.DiscoverNodeInternalIPs(suite.ctx) { + suite.Run(node, func() { + suite.checkOptOnNode(node, opt, exempt, rationale) + }) + } +} + +func (suite *MountsSuite) checkOptOnNode(node, opt string, exempt func(mountInfo) bool, rationale string) { + mounts := suite.readMountInfo(node) + + var violations []string + + for _, m := range mounts { + if workloadManaged(m) || exempt(m) { + continue + } + + // /var honors the EPHEMERAL VolumeConfig's mount.secure setting; when + // the cluster was deployed with secure=false skip the assertion to match + // the configured policy rather than the secure-by-default one. + if suite.SkipEphemeralPolicy && m.mountPoint == constants.EphemeralMountPoint { + continue + } + + if !m.has(opt) { + violations = append( + violations, + fmt.Sprintf("%s (fstype=%s, source=%s)", m.mountPoint, m.fsType, m.source), + ) + } + } + + suite.Assert().Empty( + violations, + "mounts missing %s (policy: %s):\n %s", + opt, rationale, strings.Join(violations, "\n "), + ) +} + +// readMountInfo fetches and parses /proc/self/mountinfo from a node. 
+func (suite *MountsSuite) readMountInfo(node string) []mountInfo { + nodeCtx := client.WithNode(suite.ctx, node) + + r, err := suite.Client.Read(nodeCtx, "/proc/self/mountinfo") + suite.Require().NoError(err) + + defer r.Close() //nolint:errcheck + + mounts, err := parseMountInfo(r) + suite.Require().NoError(err) + suite.Require().NotEmpty(mounts) + + return mounts +} + +func init() { + allSuites = append(allSuites, new(MountsSuite)) +} diff --git a/internal/integration/base/base.go b/internal/integration/base/base.go index 1ccd259a6..7c22342a0 100644 --- a/internal/integration/base/base.go +++ b/internal/integration/base/base.go @@ -66,6 +66,9 @@ type TalosSuite struct { Virtiofsd bool // Race informs test suites about race detector being enabled (e.g. for skipping incompatible tests) Race bool + // SkipEphemeralPolicy disables MountsSuite's nosuid/nodev/noexec assertions + // for the EPHEMERAL (/var) mount point. + SkipEphemeralPolicy bool discoveredNodes cluster.Info } diff --git a/internal/integration/integration_test.go b/internal/integration/integration_test.go index ca91b1ab7..b0e5dee7c 100644 --- a/internal/integration/integration_test.go +++ b/internal/integration/integration_test.go @@ -34,15 +34,16 @@ var allSuites []suite.TestingSuite // Flag values. 
var ( - failFast bool - trustedBoot bool - selinuxEnforcing bool - extensionsQEMU bool - extensionsNvidia bool - verifyUKIBooted bool - airgapped bool - virtiofsd bool - race bool + failFast bool + trustedBoot bool + selinuxEnforcing bool + extensionsQEMU bool + extensionsNvidia bool + verifyUKIBooted bool + airgapped bool + virtiofsd bool + race bool + skipEphemeralPolicy bool talosConfig string endpoint string @@ -101,27 +102,28 @@ func TestIntegration(t *testing.T) { for _, s := range allSuites { if configuredSuite, ok := s.(base.ConfiguredSuite); ok { configuredSuite.SetConfig(base.TalosSuite{ - Endpoint: endpoint, - K8sEndpoint: k8sEndpoint, - Cluster: cluster, - TalosConfig: talosConfig, - Version: expectedVersion, - GoVersion: expectedGoVersion, - TalosctlPath: talosctlPath, - KubectlPath: kubectlPath, - HelmPath: helmPath, - KubeStrPath: kubeStrPath, - ExtensionsQEMU: extensionsQEMU, - ExtensionsNvidia: extensionsNvidia, - TrustedBoot: trustedBoot, - SelinuxEnforcing: selinuxEnforcing, - VerifyUKIBooted: verifyUKIBooted, - TalosImage: talosImage, - CSITestName: csiTestName, - CSITestTimeout: csiTestTimeout, - Airgapped: airgapped, - Virtiofsd: virtiofsd, - Race: race, + Endpoint: endpoint, + K8sEndpoint: k8sEndpoint, + Cluster: cluster, + TalosConfig: talosConfig, + Version: expectedVersion, + GoVersion: expectedGoVersion, + TalosctlPath: talosctlPath, + KubectlPath: kubectlPath, + HelmPath: helmPath, + KubeStrPath: kubeStrPath, + ExtensionsQEMU: extensionsQEMU, + ExtensionsNvidia: extensionsNvidia, + TrustedBoot: trustedBoot, + SelinuxEnforcing: selinuxEnforcing, + VerifyUKIBooted: verifyUKIBooted, + TalosImage: talosImage, + CSITestName: csiTestName, + CSITestTimeout: csiTestTimeout, + Airgapped: airgapped, + Virtiofsd: virtiofsd, + Race: race, + SkipEphemeralPolicy: skipEphemeralPolicy, }) } @@ -185,6 +187,8 @@ func init() { flag.StringVar(&csiTestTimeout, "talos.csi.timeout", "15m", "CSI test timeout") flag.BoolVar(&airgapped, "talos.airgapped", false, 
"Marker to skip tests that should not be run on airgapped talos cluster") flag.BoolVar(&virtiofsd, "talos.virtiofsd", false, "Marker to skip tests that should not be run without virtiofsd") + flag.BoolVar(&skipEphemeralPolicy, "talos.skip-ephemeral-policy", false, + "Skip MountsSuite assertions for /var (EPHEMERAL); set when the cluster was deployed with VolumeConfig EPHEMERAL mount.secure=false") flag.StringVar(&provision_test.DefaultSettings.CIDR, "talos.provision.cidr", provision_test.DefaultSettings.CIDR, "CIDR to use to provision clusters (provision tests only)") flag.Var(&provision_test.DefaultSettings.RegistryMirrors, "talos.provision.registry-mirror", "registry mirrors to use (provision tests only)") diff --git a/internal/integration/k8s/longhorn.go b/internal/integration/k8s/longhorn.go index ac46d04bd..45b04822d 100644 --- a/internal/integration/k8s/longhorn.go +++ b/internal/integration/k8s/longhorn.go @@ -7,11 +7,8 @@ package k8s import ( - "bytes" "context" _ "embed" - "strings" - "text/template" "time" "github.com/siderolabs/talos/internal/integration/base" @@ -19,15 +16,6 @@ import ( ) var ( - //go:embed testdata/longhorn-iscsi-volume.yaml - longHornISCSIVolumeManifest []byte - - //go:embed testdata/longhorn-volumeattachment.yaml - longHornISCSIVolumeAttachmentManifestTemplate []byte - - //go:embed testdata/pod-iscsi-volume.yaml - podWithISCSIVolumeTemplate []byte - //go:embed testdata/longhorn-v2-engine-values.yaml longhornEngineV2Values []byte @@ -38,7 +26,12 @@ var ( longhornNodeDiskPatch []byte ) -// LongHornSuite tests deploying Longhorn. +// LongHornSuite tests deploying Longhorn with the v2 (SPDK) data engine. +// +// The v1 engine relies on exec'ing engine binaries the engine-image DaemonSet +// drops under /var/lib/longhorn/engine-binaries/, which is incompatible with +// noexec on /var (see LongHornV1Suite for the v1 path that opts out via the +// ephemeral-insecure VolumeConfig patch). 
type LongHornSuite struct { base.K8sSuite } @@ -48,7 +41,7 @@ func (suite *LongHornSuite) SuiteName() string { return "k8s.LongHornSuite" } -// TestDeploy tests deploying Longhorn and running a simple test. +// TestDeploy tests deploying Longhorn (v2 data engine) and running fio against it. func (suite *LongHornSuite) TestDeploy() { if suite.Cluster == nil { suite.T().Skip("without full cluster state reaching out to the node IP is not reliable") @@ -100,124 +93,35 @@ func (suite *LongHornSuite) TestDeploy() { suite.Require().NoError(suite.WaitForResource(ctx, "longhorn-system", "longhorn.io", "Node", "v1beta2", k8sNode.Name, "{.status.diskStatus.*.conditions[?(@.type==\"Schedulable\")].status}", "True")) suite.PatchK8sObject(ctx, "longhorn-system", "longhorn.io", "Node", "v1beta2", k8sNode.Name, longhornNodeDiskPatch) - } - suite.Run("fio", func() { - suite.Require().NoError(suite.RunFIOTest(ctx, "longhorn", "10G")) - }) + // Wait for the SPDK-managed nvme block disk to finish initializing + // before running fio: replica scheduling on this disk is what fio-v2 + // exercises, and SPDK can take several seconds per node. 
+ suite.Require().NoError(suite.WaitForResource( + ctx, + "longhorn-system", + "longhorn.io", + "Node", + "v1beta2", + k8sNode.Name, + "{.status.diskStatus.nvme.conditions[?(@.type==\"Ready\")].status}", + "True", + )) + suite.Require().NoError(suite.WaitForResource( + ctx, + "longhorn-system", + "longhorn.io", + "Node", + "v1beta2", + k8sNode.Name, + "{.status.diskStatus.nvme.conditions[?(@.type==\"Schedulable\")].status}", + "True", + )) + } suite.Run("fio-v2", func() { suite.Require().NoError(suite.RunFIOTest(ctx, "longhorn-v2", "10G")) }) - - suite.Run("iscsi", func() { - suite.testDeployISCSI(ctx) - }) -} - -//nolint:gocyclo -func (suite *LongHornSuite) testDeployISCSI(ctx context.Context) { - longHornISCSIVolumeManifestUnstructured := suite.ParseManifests(longHornISCSIVolumeManifest) - - defer func() { - cleanUpCtx, cleanupCancel := context.WithTimeout(context.Background(), 2*time.Minute) - defer cleanupCancel() - - suite.DeleteManifests(cleanUpCtx, longHornISCSIVolumeManifestUnstructured) - }() - - suite.ApplyManifests(ctx, longHornISCSIVolumeManifestUnstructured) - - tmpl, err := template.New("longhorn-iscsi-volumeattachment").Parse(string(longHornISCSIVolumeAttachmentManifestTemplate)) - suite.Require().NoError(err) - - var longHornISCSIVolumeAttachmentManifest bytes.Buffer - - node := suite.RandomDiscoveredNodeInternalIP(machine.TypeWorker) - - nodeInfo, err := suite.GetK8sNodeByInternalIP(ctx, node) - if err != nil { - suite.T().Fatalf("failed to get K8s node by internal IP: %v", err) - } - - if err := tmpl.Execute(&longHornISCSIVolumeAttachmentManifest, struct { - NodeID string - }{ - NodeID: nodeInfo.Name, - }); err != nil { - suite.T().Fatalf("failed to render Longhorn ISCSI volume manifest: %v", err) - } - - longHornISCSIVolumeAttachmentManifestUnstructured := suite.ParseManifests(longHornISCSIVolumeAttachmentManifest.Bytes()) - - suite.ApplyManifests(ctx, longHornISCSIVolumeAttachmentManifestUnstructured) - - if err := suite.WaitForResource(ctx, 
"longhorn-system", "longhorn.io", "Volume", "v1beta2", "iscsi", "{.status.robustness}", "healthy"); err != nil { - suite.T().Fatalf("failed to wait for LongHorn Engine to be Ready: %v", err) - } - - if err := suite.WaitForResource(ctx, "longhorn-system", "longhorn.io", "Volume", "v1beta2", "iscsi", "{.status.state}", "attached"); err != nil { - suite.T().Fatalf("failed to wait for LongHorn Engine to be Ready: %v", err) - } - - if err := suite.WaitForResource(ctx, "longhorn-system", "longhorn.io", "Engine", "v1beta2", "iscsi-e-0", "{.status.currentState}", "running"); err != nil { - suite.T().Fatalf("failed to wait for LongHorn Engine to be Ready: %v", err) - } - - unstructured, err := suite.GetUnstructuredResource(ctx, "longhorn-system", "longhorn.io", "Engine", "v1beta2", "iscsi-e-0") - if err != nil { - suite.T().Fatalf("failed to get LongHorn Engine resource: %v", err) - } - - var endpointData string - - if status, ok := unstructured.Object["status"].(map[string]any); ok { - endpointData, ok = status["endpoint"].(string) - if !ok { - suite.T().Fatalf("failed to get LongHorn Engine endpoint") - } - } - - tmpl, err = template.New("pod-iscsi-volume").Parse(string(podWithISCSIVolumeTemplate)) - suite.Require().NoError(err) - - // endpoint is of the form `iscsi://10.244.0.5:3260/iqn.2019-10.io.longhorn:iscsi/1` - // trim the iscsi:// prefix - endpointData = strings.TrimPrefix(endpointData, "iscsi://") - // trim the /1 suffix - endpointData = strings.TrimSuffix(endpointData, "/1") - - targetPortal, IQN, ok := strings.Cut(endpointData, "/") - if !ok { - suite.T().Fatalf("failed to parse endpoint data from %s", endpointData) - } - - var podWithISCSIVolume bytes.Buffer - - if err := tmpl.Execute(&podWithISCSIVolume, struct { - NodeName string - TargetPortal string - IQN string - }{ - NodeName: nodeInfo.Name, - TargetPortal: targetPortal, - IQN: IQN, - }); err != nil { - suite.T().Fatalf("failed to render pod with ISCSI volume manifest: %v", err) - } - - 
podWithISCSIVolumeUnstructured := suite.ParseManifests(podWithISCSIVolume.Bytes()) - - defer func() { - cleanUpCtx, cleanupCancel := context.WithTimeout(context.Background(), time.Minute) - defer cleanupCancel() - - suite.DeleteManifests(cleanUpCtx, podWithISCSIVolumeUnstructured) - }() - - suite.ApplyManifests(ctx, podWithISCSIVolumeUnstructured) - - suite.Require().NoError(suite.WaitForPodToBeRunning(ctx, 3*time.Minute, "default", "iscsipd")) } func init() { diff --git a/internal/integration/k8s/longhorn_v1.go b/internal/integration/k8s/longhorn_v1.go new file mode 100644 index 000000000..57d64e2f5 --- /dev/null +++ b/internal/integration/k8s/longhorn_v1.go @@ -0,0 +1,209 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +//go:build integration_k8s + +package k8s + +import ( + "bytes" + "context" + _ "embed" + "strings" + "text/template" + "time" + + "github.com/siderolabs/talos/internal/integration/base" + "github.com/siderolabs/talos/pkg/machinery/config/machine" +) + +var ( + //go:embed testdata/longhorn-iscsi-volume.yaml + longHornISCSIVolumeManifest []byte + + //go:embed testdata/longhorn-volumeattachment.yaml + longHornISCSIVolumeAttachmentManifestTemplate []byte + + //go:embed testdata/pod-iscsi-volume.yaml + podWithISCSIVolumeTemplate []byte +) + +// LongHornV1Suite tests deploying Longhorn with the v1 data engine. +// +// The v1 engine's instance-manager exec's engine binaries it drops under +// /var/lib/longhorn/engine-binaries/, which only works when /var is mounted +// without noexec. The matching CI matrix entry applies +// hack/test/patches/ephemeral-insecure.yaml to disable Secure on the EPHEMERAL +// VolumeConfig. +type LongHornV1Suite struct { + base.K8sSuite +} + +// SuiteName returns the name of the suite. 
+func (suite *LongHornV1Suite) SuiteName() string { + return "k8s.LongHornV1Suite" +} + +// TestDeploy tests deploying Longhorn (v1 data engine) and running fio + an +// in-tree Kubernetes iscsi volume against a v1 iscsi-frontend Longhorn volume. +func (suite *LongHornV1Suite) TestDeploy() { + if suite.Cluster == nil { + suite.T().Skip("without full cluster state reaching out to the node IP is not reliable") + } + + if suite.CSITestName != "longhorn-v1" { + suite.T().Skip("skipping longhorn-v1 test as it is not enabled") + } + + timeout, err := time.ParseDuration(suite.CSITestTimeout) + if err != nil { + suite.T().Fatalf("failed to parse timeout: %v", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), timeout) + suite.T().Cleanup(cancel) + + if err := suite.HelmInstall( + ctx, + "longhorn-system", + "https://charts.longhorn.io", + LongHornHelmChartVersion, + "longhorn", + "longhorn", + nil, + ); err != nil { + suite.T().Fatalf("failed to install Longhorn chart: %v", err) + } + + nodes := suite.DiscoverNodeInternalIPsByType(ctx, machine.TypeWorker) + + suite.Require().Equal(3, len(nodes), "expected 3 worker nodes") + + for _, node := range nodes { + k8sNode, err := suite.GetK8sNodeByInternalIP(ctx, node) + suite.Require().NoError(err) + + suite.Require().NoError(suite.WaitForResourceToBeAvailable(ctx, 2*time.Minute, "longhorn-system", "longhorn.io", "Node", "v1beta2", k8sNode.Name)) + + suite.Require().NoError(suite.WaitForResource(ctx, "longhorn-system", "longhorn.io", "Node", "v1beta2", k8sNode.Name, "{.status.diskStatus.*.conditions[?(@.type==\"Ready\")].status}", "True")) + suite.Require().NoError(suite.WaitForResource(ctx, "longhorn-system", "longhorn.io", "Node", "v1beta2", k8sNode.Name, "{.status.diskStatus.*.conditions[?(@.type==\"Schedulable\")].status}", "True")) + } + + suite.Run("fio", func() { + suite.Require().NoError(suite.RunFIOTest(ctx, "longhorn", "10G")) + }) + + suite.Run("iscsi", func() { + suite.testDeployISCSI(ctx) + }) +} + 
+//nolint:gocyclo +func (suite *LongHornV1Suite) testDeployISCSI(ctx context.Context) { + longHornISCSIVolumeManifestUnstructured := suite.ParseManifests(longHornISCSIVolumeManifest) + + defer func() { + cleanUpCtx, cleanupCancel := context.WithTimeout(context.Background(), 2*time.Minute) + defer cleanupCancel() + + suite.DeleteManifests(cleanUpCtx, longHornISCSIVolumeManifestUnstructured) + }() + + suite.ApplyManifests(ctx, longHornISCSIVolumeManifestUnstructured) + + tmpl, err := template.New("longhorn-iscsi-volumeattachment").Parse(string(longHornISCSIVolumeAttachmentManifestTemplate)) + suite.Require().NoError(err) + + var longHornISCSIVolumeAttachmentManifest bytes.Buffer + + node := suite.RandomDiscoveredNodeInternalIP(machine.TypeWorker) + + nodeInfo, err := suite.GetK8sNodeByInternalIP(ctx, node) + if err != nil { + suite.T().Fatalf("failed to get K8s node by internal IP: %v", err) + } + + if err := tmpl.Execute(&longHornISCSIVolumeAttachmentManifest, struct { + NodeID string + }{ + NodeID: nodeInfo.Name, + }); err != nil { + suite.T().Fatalf("failed to render Longhorn ISCSI volume manifest: %v", err) + } + + longHornISCSIVolumeAttachmentManifestUnstructured := suite.ParseManifests(longHornISCSIVolumeAttachmentManifest.Bytes()) + + suite.ApplyManifests(ctx, longHornISCSIVolumeAttachmentManifestUnstructured) + + if err := suite.WaitForResource(ctx, "longhorn-system", "longhorn.io", "Volume", "v1beta2", "iscsi", "{.status.robustness}", "healthy"); err != nil { + suite.T().Fatalf("failed to wait for LongHorn Engine to be Ready: %v", err) + } + + if err := suite.WaitForResource(ctx, "longhorn-system", "longhorn.io", "Volume", "v1beta2", "iscsi", "{.status.state}", "attached"); err != nil { + suite.T().Fatalf("failed to wait for LongHorn Engine to be Ready: %v", err) + } + + if err := suite.WaitForResource(ctx, "longhorn-system", "longhorn.io", "Engine", "v1beta2", "iscsi-e-0", "{.status.currentState}", "running"); err != nil { + suite.T().Fatalf("failed to wait 
for LongHorn Engine to be Ready: %v", err) + } + + unstructured, err := suite.GetUnstructuredResource(ctx, "longhorn-system", "longhorn.io", "Engine", "v1beta2", "iscsi-e-0") + if err != nil { + suite.T().Fatalf("failed to get LongHorn Engine resource: %v", err) + } + + var endpointData string + + if status, ok := unstructured.Object["status"].(map[string]any); ok { + endpointData, ok = status["endpoint"].(string) + if !ok { + suite.T().Fatalf("failed to get LongHorn Engine endpoint") + } + } + + tmpl, err = template.New("pod-iscsi-volume").Parse(string(podWithISCSIVolumeTemplate)) + suite.Require().NoError(err) + + // endpoint is of the form `iscsi://10.244.0.5:3260/iqn.2019-10.io.longhorn:iscsi/1` + // trim the iscsi:// prefix + endpointData = strings.TrimPrefix(endpointData, "iscsi://") + // trim the /1 suffix + endpointData = strings.TrimSuffix(endpointData, "/1") + + targetPortal, IQN, ok := strings.Cut(endpointData, "/") + if !ok { + suite.T().Fatalf("failed to parse endpoint data from %s", endpointData) + } + + var podWithISCSIVolume bytes.Buffer + + if err := tmpl.Execute(&podWithISCSIVolume, struct { + NodeName string + TargetPortal string + IQN string + }{ + NodeName: nodeInfo.Name, + TargetPortal: targetPortal, + IQN: IQN, + }); err != nil { + suite.T().Fatalf("failed to render pod with ISCSI volume manifest: %v", err) + } + + podWithISCSIVolumeUnstructured := suite.ParseManifests(podWithISCSIVolume.Bytes()) + + defer func() { + cleanUpCtx, cleanupCancel := context.WithTimeout(context.Background(), time.Minute) + defer cleanupCancel() + + suite.DeleteManifests(cleanUpCtx, podWithISCSIVolumeUnstructured) + }() + + suite.ApplyManifests(ctx, podWithISCSIVolumeUnstructured) + + suite.Require().NoError(suite.WaitForPodToBeRunning(ctx, 3*time.Minute, "default", "iscsipd")) +} + +func init() { + allSuites = append(allSuites, new(LongHornV1Suite)) +} diff --git a/internal/pkg/mount/v3/helpers.go b/internal/pkg/mount/v3/helpers.go index 4c36a3929..219ba741e 
100644 --- a/internal/pkg/mount/v3/helpers.go +++ b/internal/pkg/mount/v3/helpers.go @@ -29,7 +29,8 @@ func discard(string, ...any) {} func NewCgroup2() *Manager { return NewManager( WithTarget(constants.CgroupMountPath), - WithMountAttributes(unix.MOUNT_ATTR_NOSUID|unix.MOUNT_ATTR_NODEV|unix.MOUNT_ATTR_NOEXEC|unix.MOUNT_ATTR_RELATIME), + WithSecure(), + WithMountAttributes(unix.MOUNT_ATTR_RELATIME), WithFsopen( "cgroup2", fsopen.WithBoolParameter("nsdelegate"), @@ -59,6 +60,7 @@ func NewReadOnlyOverlay(sources []string, target string, printer func(string, .. WithPrinter(printer), WithTarget(target), WithReadOnly(), + WithMountAttributes(unix.MOUNT_ATTR_NOSUID|unix.MOUNT_ATTR_NODEV), WithFsopen("overlay", fsOptions...), ) @@ -90,6 +92,7 @@ func NewOverlayWithBasePath(sources []string, target, basePath string, printer f options, WithTarget(target), WithExtraDirs(diff, workdir), + WithMountAttributes(unix.MOUNT_ATTR_NOSUID|unix.MOUNT_ATTR_NODEV), WithFsopen("overlay", fsOptions...), WithPrinter(printer), ) @@ -118,6 +121,7 @@ func Squashfs(target, squashfsFile string, printer func(string, ...any)) (*Manag WithTarget(target), WithPrinter(printer), WithReadOnly(), + WithMountAttributes(unix.MOUNT_ATTR_NOSUID|unix.MOUNT_ATTR_NODEV), WithShared(), WithExtraUnmountCallbacks(func(m *Manager) { dev.Detach() //nolint:errcheck @@ -190,7 +194,7 @@ func Pseudo(printer func(string, ...any)) Managers { WithPrinter(printer), WithTarget("/proc"), WithKeepOpenAfterMount(), - WithMountAttributes(unix.MOUNT_ATTR_NOSUID|unix.MOUNT_ATTR_NOEXEC|unix.MOUNT_ATTR_NODEV), + WithSecure(), WithFsopen("proc"), ), newManager( @@ -198,6 +202,7 @@ func Pseudo(printer func(string, ...any)) Managers { WithPrinter(printer), WithTarget("/sys"), WithKeepOpenAfterMount(), + WithSecure(), WithFsopen("sysfs"), ), ) @@ -210,7 +215,8 @@ func PseudoLate(printer func(string, ...any)) Managers { always, WithPrinter(printer), WithTarget("/run"), - 
WithMountAttributes(unix.MOUNT_ATTR_NOSUID|unix.MOUNT_ATTR_NOEXEC|unix.MOUNT_ATTR_RELATIME), + WithSecure(), + WithMountAttributes(unix.MOUNT_ATTR_RELATIME), WithSelinuxLabel(constants.RunSelinuxLabel), WithRecursiveUnmount(), WithFsopen( @@ -222,6 +228,8 @@ func PseudoLate(printer func(string, ...any)) Managers { always, WithPrinter(printer), WithTarget("/system"), + WithSecure(), + WithMountAttributes(unix.MOUNT_ATTR_RELATIME), WithSelinuxLabel(constants.SystemSelinuxLabel), WithRecursiveUnmount(), WithFsopen( @@ -233,7 +241,7 @@ func PseudoLate(printer func(string, ...any)) Managers { always, WithPrinter(printer), WithTarget("/tmp"), - WithMountAttributes(unix.MOUNT_ATTR_NOSUID|unix.MOUNT_ATTR_NOEXEC|unix.MOUNT_ATTR_NODEV), + WithSecure(), WithFsopen( "tmpfs", fsopen.WithStringParameter("mode", "0755"), @@ -250,7 +258,8 @@ func PseudoSub(printer func(string, ...any)) Managers { always, WithPrinter(printer), WithTarget("/dev/shm"), - WithMountAttributes(unix.MOUNT_ATTR_NOSUID|unix.MOUNT_ATTR_NOEXEC|unix.MOUNT_ATTR_NODEV|unix.MOUNT_ATTR_RELATIME), + WithSecure(), + WithMountAttributes(unix.MOUNT_ATTR_RELATIME), WithFsopen("tmpfs"), ), newManager( @@ -276,48 +285,56 @@ func PseudoSub(printer func(string, ...any)) Managers { always, WithPrinter(printer), WithTarget("/sys/fs/bpf"), + WithSecure(), + WithMountAttributes(unix.MOUNT_ATTR_RELATIME), WithFsopen("bpf"), ), newManager( always, WithPrinter(printer), WithTarget("/sys/kernel/security"), - WithMountAttributes(unix.MOUNT_ATTR_NOSUID|unix.MOUNT_ATTR_NOEXEC|unix.MOUNT_ATTR_NODEV|unix.MOUNT_ATTR_RELATIME), + WithSecure(), + WithMountAttributes(unix.MOUNT_ATTR_RELATIME), WithFsopen("securityfs"), ), newManager( always, WithPrinter(printer), WithTarget("/sys/kernel/tracing"), - WithMountAttributes(unix.MOUNT_ATTR_NOSUID|unix.MOUNT_ATTR_NOEXEC|unix.MOUNT_ATTR_NODEV), + WithSecure(), WithFsopen("tracefs"), ), newManager( always, WithPrinter(printer), WithTarget("/sys/kernel/config"), - 
WithMountAttributes(unix.MOUNT_ATTR_NOSUID|unix.MOUNT_ATTR_NOEXEC|unix.MOUNT_ATTR_NODEV|unix.MOUNT_ATTR_RELATIME), + WithSecure(), + WithMountAttributes(unix.MOUNT_ATTR_RELATIME), WithFsopen("configfs"), ), newManager( always, WithPrinter(printer), WithTarget("/sys/kernel/debug"), - WithMountAttributes(unix.MOUNT_ATTR_NOSUID|unix.MOUNT_ATTR_NOEXEC|unix.MOUNT_ATTR_NODEV|unix.MOUNT_ATTR_RELATIME), + WithSecure(), + WithMountAttributes(unix.MOUNT_ATTR_RELATIME), WithFsopen("debugfs"), ), newManager( selinux.IsEnabled, WithPrinter(printer), WithTarget("/sys/fs/selinux"), - WithMountAttributes(unix.MOUNT_ATTR_NOSUID|unix.MOUNT_ATTR_NOEXEC|unix.MOUNT_ATTR_RELATIME), + WithSecure(), + WithMountAttributes(unix.MOUNT_ATTR_RELATIME), WithFsopen("selinuxfs"), ), newManager( hasEFIVars, WithPrinter(printer), WithTarget(constants.EFIVarsMountPoint), - WithMountAttributes(unix.MOUNT_ATTR_NOSUID|unix.MOUNT_ATTR_NOEXEC|unix.MOUNT_ATTR_NODEV|unix.MOUNT_ATTR_RELATIME|unix.MOUNT_ATTR_RDONLY), + WithSecure(), + WithReadOnly(), + WithMountAttributes(unix.MOUNT_ATTR_RELATIME), WithFsopen("efivarfs"), ), ) diff --git a/internal/pkg/mount/v3/manager.go b/internal/pkg/mount/v3/manager.go index 8830d87b1..a3787bc5c 100644 --- a/internal/pkg/mount/v3/manager.go +++ b/internal/pkg/mount/v3/manager.go @@ -217,9 +217,9 @@ func WithDisableAccessTime() ManagerOption { return WithMountAttributes(unix.MOUNT_ATTR_NOATIME) } -// WithSecure sets MOUNT_ATTR_NOSUID and MOUNT_ATTR_NODEV. +// WithSecure sets MOUNT_ATTR_NOSUID, MOUNT_ATTR_NODEV, and MOUNT_ATTR_NOEXEC. func WithSecure() ManagerOption { - return WithMountAttributes(unix.MOUNT_ATTR_NOSUID | unix.MOUNT_ATTR_NODEV) + return WithMountAttributes(unix.MOUNT_ATTR_NOSUID | unix.MOUNT_ATTR_NODEV | unix.MOUNT_ATTR_NOEXEC) } // WithReadOnly sets the mount as read only. 
diff --git a/internal/pkg/mount/v3/mount.go b/internal/pkg/mount/v3/mount.go index 1535e3b08..e4d225938 100644 --- a/internal/pkg/mount/v3/mount.go +++ b/internal/pkg/mount/v3/mount.go @@ -11,6 +11,11 @@ import ( "golang.org/x/sys/unix" ) +// bindHardenAttr is the baseline attribute set every read-only bind mount +// inherits: read-only, no setuid escalation, no device nodes (per +// siderolabs/talos#11946 — device nodes belong only in /dev and /dev/pts). +const bindHardenAttr = unix.MOUNT_ATTR_RDONLY | unix.MOUNT_ATTR_NOSUID | unix.MOUNT_ATTR_NODEV + // BindReadonly creates a common way to create a readonly bind mounted destination. func BindReadonly(src, dst string) error { sourceFD, err := unix.OpenTree(unix.AT_FDCWD, src, unix.OPEN_TREE_CLONE|unix.OPEN_TREE_CLOEXEC) @@ -21,7 +26,7 @@ func BindReadonly(src, dst string) error { defer unix.Close(sourceFD) //nolint:errcheck if err := unix.MountSetattr(sourceFD, "", unix.AT_EMPTY_PATH, &unix.MountAttr{ - Attr_set: unix.MOUNT_ATTR_RDONLY, + Attr_set: bindHardenAttr, }); err != nil { return fmt.Errorf("failed to set mount attribute: %w", err) } @@ -43,7 +48,7 @@ func BindReadonlyFd(dfd int, dst string) error { defer unix.Close(sourceFD) //nolint:errcheck if err := unix.MountSetattr(sourceFD, "", unix.AT_EMPTY_PATH, &unix.MountAttr{ - Attr_set: unix.MOUNT_ATTR_RDONLY, + Attr_set: bindHardenAttr, }); err != nil { return fmt.Errorf("failed to set mount attribute: %w", err) } diff --git a/internal/pkg/mount/v3/point.go b/internal/pkg/mount/v3/point.go index dc4560370..117bacd4a 100644 --- a/internal/pkg/mount/v3/point.go +++ b/internal/pkg/mount/v3/point.go @@ -333,7 +333,7 @@ func (p *Point) SetDisableAccessTime(disable bool) error { }, 0) } -// SetSecure sets or clears the nosuid and nodev mount attributes. +// SetSecure sets or clears the nosuid, nodev, and noexec mount attributes. 
func (p *Point) SetSecure(secure bool) error { if p.detached { return nil @@ -341,12 +341,12 @@ func (p *Point) SetSecure(secure bool) error { if secure { return p.setattr(&unix.MountAttr{ - Attr_set: unix.MOUNT_ATTR_NOSUID | unix.MOUNT_ATTR_NODEV, + Attr_set: unix.MOUNT_ATTR_NOSUID | unix.MOUNT_ATTR_NODEV | unix.MOUNT_ATTR_NOEXEC, }, 0) } return p.setattr(&unix.MountAttr{ - Attr_clr: unix.MOUNT_ATTR_NOSUID | unix.MOUNT_ATTR_NODEV, + Attr_clr: unix.MOUNT_ATTR_NOSUID | unix.MOUNT_ATTR_NODEV | unix.MOUNT_ATTR_NOEXEC, }, 0) } diff --git a/internal/pkg/selinux/policy/policy.33 b/internal/pkg/selinux/policy/policy.33 index f30d56c48bd4e2b7b038b5e8ae90d4f9bb332a1f..34dfe900785f993c5cd0d39f0897fbb22c040e3b 100644 GIT binary patch delta 65 zcmZ2*i|NQMrVSgc7^60Cw9*#j@?dCWh+$x4U}D&O%e#+-JBvXW%4gV|9&pi=%Z(um KrogQ2l^OtscoBC1 delta 31 ncmX?di)q0vrVSgc7{fPjw9*#b{KC73WphKo1=GzjZ7Z=Zl diff --git a/internal/pkg/selinux/policy/selinux/common/processes.cil b/internal/pkg/selinux/policy/selinux/common/processes.cil index e7aca62ab..ad5b87fa6 100644 --- a/internal/pkg/selinux/policy/selinux/common/processes.cil +++ b/internal/pkg/selinux/policy/selinux/common/processes.cil @@ -101,6 +101,10 @@ ; Allow init to manage processes (allow init_t service_p (fs_classes (rw))) (allow init_t service_p (process_classes (full))) +; Service binaries exec from NOSUID mounts (rootfs squashfs, /sbin/init +; bind into /system/libexec//); init_t needs nosuid_transition +; for every service_p member. 
+(allow init_t service_p (process2 (nosuid_transition))) ; kernel cmdline (allow system_p proc_cmdline_t (fs_classes (ro))) diff --git a/internal/pkg/selinux/policy/selinux/services/machined.cil b/internal/pkg/selinux/policy/selinux/services/machined.cil index 959592d3d..60153a5c8 100644 --- a/internal/pkg/selinux/policy/selinux/services/machined.cil +++ b/internal/pkg/selinux/policy/selinux/services/machined.cil @@ -19,6 +19,8 @@ (call system_socket_f (dbus_client_socket_t)) (allow init_t service_p (process (transition))) +; (process2 nosuid_transition for init_t -> service_p lives in +; common/processes.cil — single source for the allow rule.) ; Manage processes (allow init_t any_p (fs_classes (rw))) (allow init_t any_p (process_classes (full))) @@ -98,6 +100,9 @@ (allow initramfs_t init_exec_t (file (execute))) (typetransition initramfs_t init_exec_t process init_t) (allow initramfs_t init_t (process_classes (full))) +; init_exec_t (/usr/bin/init) lives on the rootfs squashfs which is +; mounted NOSUID — allow the initramfs_t → init_t transition across it. +(allow initramfs_t init_t (process2 (nosuid_transition))) (allow init_t initramfs_t (fd (use))) ; Direct child processes diff --git a/internal/pkg/selinux/policy/selinux/services/udev.cil b/internal/pkg/selinux/policy/selinux/services/udev.cil index be4798156..8c90f0640 100644 --- a/internal/pkg/selinux/policy/selinux/services/udev.cil +++ b/internal/pkg/selinux/policy/selinux/services/udev.cil @@ -60,6 +60,9 @@ (allow kernel_t modprobe_exec_t (file (execute))) (allow kernel_t udev_t (process (all))) ; including transition +; modprobe_exec_t lives on rootfs squashfs (NOSUID); kernel_t is not +; in service_p so this rule is required separately. 
+(allow kernel_t udev_t (process2 (nosuid_transition))) (allow init_t modprobe_exec_t (file (execute))) (allow init_t udev_t (process (all))) ; including transition diff --git a/pkg/machinery/api/resource/definitions/block/block.pb.go b/pkg/machinery/api/resource/definitions/block/block.pb.go index 36e3cdf93..f4e12e68f 100644 --- a/pkg/machinery/api/resource/definitions/block/block.pb.go +++ b/pkg/machinery/api/resource/definitions/block/block.pb.go @@ -1092,7 +1092,10 @@ type MountSpec struct { // BindTarget is an optional path on the host to bind-mount the volume onto. BindTarget string `protobuf:"bytes,9,opt,name=bind_target,json=bindTarget,proto3" json:"bind_target,omitempty"` // Parameters are additional filesystem mount options used when mounting the volume. - Parameters []*ParameterSpec `protobuf:"bytes,10,rep,name=parameters,proto3" json:"parameters,omitempty"` + Parameters []*ParameterSpec `protobuf:"bytes,10,rep,name=parameters,proto3" json:"parameters,omitempty"` + // Secure applies MOUNT_ATTR_NOSUID\|NODEV\|NOEXEC to the mount. Set for + // config-only mounts; leave false for mounts hosting executables. + Secure bool `protobuf:"varint,11,opt,name=secure,proto3" json:"secure,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -1197,6 +1200,13 @@ func (x *MountSpec) GetParameters() []*ParameterSpec { return nil } +func (x *MountSpec) GetSecure() bool { + if x != nil { + return x.Secure + } + return false +} + // MountStatusSpec is the spec for MountStatus. 
type MountStatusSpec struct { state protoimpl.MessageState `protogen:"open.v1"` @@ -2630,7 +2640,7 @@ const file_resource_definitions_block_block_proto_rawDesc = "" + "\tread_only\x18\x05 \x01(\bR\breadOnly\x12\x1a\n" + "\bdetached\x18\x06 \x01(\bR\bdetached\x12.\n" + "\x13disable_access_time\x18\a \x01(\bR\x11disableAccessTime\x12\x16\n" + - "\x06secure\x18\b \x01(\bR\x06secure\"\x82\x03\n" + + "\x06secure\x18\b \x01(\bR\x06secure\"\x9a\x03\n" + "\tMountSpec\x12\x1f\n" + "\vtarget_path\x18\x01 \x01(\tR\n" + "targetPath\x12#\n" + @@ -2646,7 +2656,8 @@ const file_resource_definitions_block_block_proto_rawDesc = "" + "\n" + "parameters\x18\n" + " \x03(\v2/.talos.resource.definitions.block.ParameterSpecR\n" + - "parameters\"\xbd\x03\n" + + "parameters\x12\x16\n" + + "\x06secure\x18\v \x01(\bR\x06secure\"\xbd\x03\n" + "\x0fMountStatusSpec\x12F\n" + "\x04spec\x18\x01 \x01(\v22.talos.resource.definitions.block.MountRequestSpecR\x04spec\x12\x16\n" + "\x06target\x18\x02 \x01(\tR\x06target\x12\x16\n" + diff --git a/pkg/machinery/api/resource/definitions/block/block_vtproto.pb.go b/pkg/machinery/api/resource/definitions/block/block_vtproto.pb.go index 5d303a869..eb1420909 100644 --- a/pkg/machinery/api/resource/definitions/block/block_vtproto.pb.go +++ b/pkg/machinery/api/resource/definitions/block/block_vtproto.pb.go @@ -1042,6 +1042,16 @@ func (m *MountSpec) MarshalToSizedBufferVT(dAtA []byte) (int, error) { i -= len(m.unknownFields) copy(dAtA[i:], m.unknownFields) } + if m.Secure { + i-- + if m.Secure { + dAtA[i] = 1 + } else { + dAtA[i] = 0 + } + i-- + dAtA[i] = 0x58 + } if len(m.Parameters) > 0 { for iNdEx := len(m.Parameters) - 1; iNdEx >= 0; iNdEx-- { size, err := m.Parameters[iNdEx].MarshalToSizedBufferVT(dAtA[:i]) @@ -2798,6 +2808,9 @@ func (m *MountSpec) SizeVT() (n int) { n += 1 + l + protohelpers.SizeOfVarint(uint64(l)) } } + if m.Secure { + n += 2 + } n += len(m.unknownFields) return n } @@ -6317,6 +6330,26 @@ func (m *MountSpec) UnmarshalVT(dAtA []byte) error { 
return err } iNdEx = postIndex + case 11: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Secure", wireType) + } + var v int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return protohelpers.ErrIntOverflow + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + v |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + m.Secure = bool(v != 0) default: iNdEx = preIndex skippy, err := protohelpers.Skip(dAtA[iNdEx:]) diff --git a/pkg/machinery/constants/constants.go b/pkg/machinery/constants/constants.go index 659a71ffb..6c413cca6 100644 --- a/pkg/machinery/constants/constants.go +++ b/pkg/machinery/constants/constants.go @@ -1355,14 +1355,19 @@ const ( type SELinuxLabeledPath struct { Path string Label string + // Secure applies the nosuid+nodev+noexec triplet to the overlay. Set + // for config-only overlays (e.g. /etc/cni, /etc/kubernetes); leave + // false for overlays that host plugin/helper binaries (e.g. /opt, + // /usr/libexec/kubernetes). + Secure bool } // Overlays is the set of paths to create overlay mounts for. var Overlays = []SELinuxLabeledPath{ - {"/etc/cni", CNISELinuxLabel}, - {KubernetesConfigBaseDir, KubernetesConfigSELinuxLabel}, - {"/usr/libexec/kubernetes", KubeletPluginsSELinuxLabel}, - {"/opt", OptSELinuxLabel}, + {Path: "/etc/cni", Label: CNISELinuxLabel, Secure: true}, + {Path: KubernetesConfigBaseDir, Label: KubernetesConfigSELinuxLabel, Secure: true}, + {Path: "/usr/libexec/kubernetes", Label: KubeletPluginsSELinuxLabel}, + {Path: "/opt", Label: OptSELinuxLabel}, } // DefaultDroppedCapabilities is the default set of capabilities to drop. 
diff --git a/pkg/machinery/resources/block/volume_config.go b/pkg/machinery/resources/block/volume_config.go index 59f051138..faa0f589d 100644 --- a/pkg/machinery/resources/block/volume_config.go +++ b/pkg/machinery/resources/block/volume_config.go @@ -250,6 +250,9 @@ type MountSpec struct { BindTarget *string `yaml:"bindTarget,omitempty" protobuf:"9"` // Parameters are additional filesystem mount options used when mounting the volume. Parameters []ParameterSpec `yaml:"parameters,omitempty" protobuf:"10"` + // Secure applies MOUNT_ATTR_NOSUID\|NODEV\|NOEXEC to the mount. Set for + // config-only mounts; leave false for mounts hosting executables. + Secure bool `yaml:"secure,omitempty" protobuf:"11"` } // SymlinkProvisioningSpec is the spec for volume symlink. diff --git a/website/content/v1.14/reference/api.md b/website/content/v1.14/reference/api.md index afda6fca7..b51f3183f 100644 --- a/website/content/v1.14/reference/api.md +++ b/website/content/v1.14/reference/api.md @@ -6331,6 +6331,7 @@ MountSpec is the spec for volume mount. | recursive_relabel | [bool](#bool) | | RecursiveRelabel is the recursive relabel/chown flag for the mount target. | | bind_target | [string](#string) | | BindTarget is an optional path on the host to bind-mount the volume onto. | | parameters | [ParameterSpec](#talos.resource.definitions.block.ParameterSpec) | repeated | Parameters are additional filesystem mount options used when mounting the volume. | +| secure | [bool](#bool) | | Secure applies MOUNT_ATTR_NOSUID\|NODEV\|NOEXEC to the mount. Set for config-only mounts; leave false for mounts hosting executables. |