From 9dc1150e3aa08e5cb85c8fb6ecf0cfec5c613029 Mon Sep 17 00:00:00 2001 From: Noel Georgi Date: Fri, 31 Mar 2023 12:40:31 +0530 Subject: [PATCH] docs: update nvidia instructions Update NVIDIA install docs and add an example of setting `nvidia` as the default runtimeclass. NVIDIA doesn't have published images of vectoradd for CUDA 12, replacing example with running `nvidia-smi` command. Signed-off-by: Noel Georgi --- website/content/v1.4/_index.md | 8 +- .../configuration/nvidia-gpu-proprietary.md | 75 ++++++++----------- .../talos-guides/configuration/nvidia-gpu.md | 75 ++++++++----------- 3 files changed, 70 insertions(+), 88 deletions(-) diff --git a/website/content/v1.4/_index.md b/website/content/v1.4/_index.md index 8fffe2a7e..ffd2c9fe2 100644 --- a/website/content/v1.4/_index.md +++ b/website/content/v1.4/_index.md @@ -4,12 +4,12 @@ no_list: true linkTitle: "Documentation" cascade: type: docs -lastRelease: v1.4.0-alpha.1 +lastRelease: v1.4.0-alpha.3 kubernetesRelease: "1.27.0-rc.0" -prevKubernetesRelease: "1.26.0" +prevKubernetesRelease: "1.26.2" theilaRelease: "v0.2.1" -nvidiaContainerToolkitRelease: "v1.12.0" -nvidiaDriverRelease: "525.89.02" +nvidiaContainerToolkitRelease: "v1.12.1" +nvidiaDriverRelease: "530.41.03" iscsiToolsRelease: "v0.1.4" preRelease: true --- diff --git a/website/content/v1.4/talos-guides/configuration/nvidia-gpu-proprietary.md b/website/content/v1.4/talos-guides/configuration/nvidia-gpu-proprietary.md index 7d6b1e405..ddd909776 100644 --- a/website/content/v1.4/talos-guides/configuration/nvidia-gpu-proprietary.md +++ b/website/content/v1.4/talos-guides/configuration/nvidia-gpu-proprietary.md @@ -6,7 +6,6 @@ aliases: --- > Enabling NVIDIA GPU support on Talos is bound by [NVIDIA EULA](https://www.nvidia.com/en-us/drivers/nvidia-license/). -> Talos GPU support has been promoted to **beta**. These are the steps to enabling NVIDIA support in Talos. 
@@ -171,51 +170,43 @@ helm repo update helm install nvidia-device-plugin nvdp/nvidia-device-plugin --version=0.13.0 --set=runtimeClassName=nvidia ``` -Apply the following manifest to run CUDA pod via nvidia runtime: +## (Optional) Setting the default runtime class as `nvidia` + +> Do note that this will set the default runtime class to `nvidia` for all pods scheduled on the node. + +Create a patch yaml `nvidia-default-runtimeclass.yaml` to update the machine config similar to below: + +```yaml +- op: add + path: /machine/files + value: + - content: | + [plugins] + [plugins."io.containerd.grpc.v1.cri"] + [plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "nvidia" + path: /etc/cri/conf.d/20-customization.part + op: create +``` + +Now apply the patch to all Talos nodes in the cluster having NVIDIA GPUs installed: ```bash -cat < Note the `spec.runtimeClassName` being explicitly set to `nvidia` in the pod spec. + +Run the following command to test the runtime class: ```bash -kubectl get pods -``` - -which should produce an output similar to below: - -```text -NAME READY STATUS RESTARTS AGE -gpu-operator-test 0/1 Completed 0 13s -``` - -```bash -kubectl logs gpu-operator-test -``` - -which should produce an output similar to below: - -```text -[Vector addition of 50000 elements] -Copy input data from the host memory to the CUDA device -CUDA kernel launch with 196 blocks of 256 threads -Copy output data from the CUDA device to the host memory -Test PASSED -Done +kubectl run \ + nvidia-test \ + --restart=Never \ + -ti --rm \ + --image nvcr.io/nvidia/cuda:12.1.0-base-ubuntu22.04 \ + --overrides '{"spec": {"runtimeClassName": "nvidia"}}' \ + nvidia-smi ``` diff --git a/website/content/v1.4/talos-guides/configuration/nvidia-gpu.md b/website/content/v1.4/talos-guides/configuration/nvidia-gpu.md index 54c479d77..287de3f14 100644 --- a/website/content/v1.4/talos-guides/configuration/nvidia-gpu.md +++ 
b/website/content/v1.4/talos-guides/configuration/nvidia-gpu.md @@ -6,7 +6,6 @@ aliases: --- > Enabling NVIDIA GPU support on Talos is bound by [NVIDIA EULA](https://www.nvidia.com/en-us/drivers/nvidia-license/). -> Talos GPU support has been promoted to **beta**. > The Talos published NVIDIA OSS drivers are bound to a specific Talos release. > The extensions versions also needs to be updated when upgrading Talos. @@ -120,51 +119,43 @@ helm repo update helm install nvidia-device-plugin nvdp/nvidia-device-plugin --version=0.13.0 --set=runtimeClassName=nvidia ``` -Apply the following manifest to run CUDA pod via nvidia runtime: +## (Optional) Setting the default runtime class as `nvidia` + +> Do note that this will set the default runtime class to `nvidia` for all pods scheduled on the node. + +Create a patch yaml `nvidia-default-runtimeclass.yaml` to update the machine config similar to below: + +```yaml +- op: add + path: /machine/files + value: + - content: | + [plugins] + [plugins."io.containerd.grpc.v1.cri"] + [plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "nvidia" + path: /etc/cri/conf.d/20-customization.part + op: create +``` + +Now apply the patch to all Talos nodes in the cluster having NVIDIA GPUs installed: ```bash -cat < Note the `spec.runtimeClassName` being explicitly set to `nvidia` in the pod spec. 
+ +Run the following command to test the runtime class: ```bash -kubectl get pods -``` - -which should produce an output similar to below: - -```text -NAME READY STATUS RESTARTS AGE -gpu-operator-test 0/1 Completed 0 13s -``` - -```bash -kubectl logs gpu-operator-test -``` - -which should produce an output similar to below: - -```text -[Vector addition of 50000 elements] -Copy input data from the host memory to the CUDA device -CUDA kernel launch with 196 blocks of 256 threads -Copy output data from the CUDA device to the host memory -Test PASSED -Done +kubectl run \ + nvidia-test \ + --restart=Never \ + -ti --rm \ + --image nvcr.io/nvidia/cuda:12.1.0-base-ubuntu22.04 \ + --overrides '{"spec": {"runtimeClassName": "nvidia"}}' \ + nvidia-smi ```