From 9dc1150e3aa08e5cb85c8fb6ecf0cfec5c613029 Mon Sep 17 00:00:00 2001
From: Noel Georgi <git@frezbo.dev>
Date: Fri, 31 Mar 2023 12:40:31 +0530
Subject: [PATCH] docs: update nvidia instructions

Update NVIDIA install docs and add an example of setting `nvidia` as the
default runtimeclass.

NVIDIA doesn't publish vectoradd sample images for CUDA 12, so replace
the example with one that runs the `nvidia-smi` command.

Signed-off-by: Noel Georgi <git@frezbo.dev>
---
 website/content/v1.4/_index.md                |  8 +-
 .../configuration/nvidia-gpu-proprietary.md   | 75 ++++++++-----------
 .../talos-guides/configuration/nvidia-gpu.md  | 75 ++++++++-----------
 3 files changed, 70 insertions(+), 88 deletions(-)
diff --git a/website/content/v1.4/_index.md b/website/content/v1.4/_index.md
index 8fffe2a7e..ffd2c9fe2 100644
--- a/website/content/v1.4/_index.md
+++ b/website/content/v1.4/_index.md
@@ -4,12 +4,12 @@ no_list: true
 linkTitle: "Documentation"
 cascade:
   type: docs
-lastRelease: v1.4.0-alpha.1
+lastRelease: v1.4.0-alpha.3
 kubernetesRelease: "1.27.0-rc.0"
-prevKubernetesRelease: "1.26.0"
+prevKubernetesRelease: "1.26.2"
 theilaRelease: "v0.2.1"
-nvidiaContainerToolkitRelease: "v1.12.0"
-nvidiaDriverRelease: "525.89.02"
+nvidiaContainerToolkitRelease: "v1.12.1"
+nvidiaDriverRelease: "530.41.03"
 iscsiToolsRelease: "v0.1.4"
 preRelease: true
 ---
diff --git a/website/content/v1.4/talos-guides/configuration/nvidia-gpu-proprietary.md b/website/content/v1.4/talos-guides/configuration/nvidia-gpu-proprietary.md
index 7d6b1e405..ddd909776 100644
--- a/website/content/v1.4/talos-guides/configuration/nvidia-gpu-proprietary.md
+++ b/website/content/v1.4/talos-guides/configuration/nvidia-gpu-proprietary.md
@@ -6,7 +6,6 @@ aliases:
 ---
 
 > Enabling NVIDIA GPU support on Talos is bound by [NVIDIA EULA](https://www.nvidia.com/en-us/drivers/nvidia-license/).
-> Talos GPU support has been promoted to **beta**.
 
 These are the steps to enabling NVIDIA support in Talos.
 
@@ -171,51 +170,43 @@ helm repo update
 helm install nvidia-device-plugin nvdp/nvidia-device-plugin --version=0.13.0 --set=runtimeClassName=nvidia
 ```
 
-Apply the following manifest to run CUDA pod via nvidia runtime:
+## (Optional) Setting the default runtime class as `nvidia`
+
+> Do note that this will set the default runtime class to `nvidia` for all pods scheduled on the node.
+
+Create a YAML patch file `nvidia-default-runtimeclass.yaml` to update the machine config, similar to the one below:
+
+```yaml
+- op: add
+  path: /machine/files
+  value:
+    - content: |
+        [plugins]
+          [plugins."io.containerd.grpc.v1.cri"]
+            [plugins."io.containerd.grpc.v1.cri".containerd]
+              default_runtime_name = "nvidia"
+      path: /etc/cri/conf.d/20-customization.part
+      op: create
+```
+
+Now apply the patch to all Talos nodes in the cluster that have NVIDIA GPUs installed:
 
 ```bash
-cat <<EOF | kubectl apply -f -
----
-apiVersion: v1
-kind: Pod
-metadata:
-  name: gpu-operator-test
-spec:
-  restartPolicy: OnFailure
-  runtimeClassName: nvidia
-  containers:
-  - name: cuda-vector-add
-    image: "nvidia/samples:vectoradd-cuda11.6.0"
-    resources:
-      limits:
-         nvidia.com/gpu: 1
-<<EOF
+talosctl patch mc --patch @nvidia-default-runtimeclass.yaml
 ```
 
-The status can be viewed by running:
+### Testing the runtime class
+
+> Note the `spec.runtimeClassName` being explicitly set to `nvidia` in the pod spec.
+
+Run the following command to test the runtime class:
 
 ```bash
-kubectl get pods
-```
-
-which should produce an output similar to below:
-
-```text
-NAME                READY   STATUS      RESTARTS   AGE
-gpu-operator-test   0/1     Completed   0          13s
-```
-
-```bash
-kubectl logs gpu-operator-test
-```
-
-which should produce an output similar to below:
-
-```text
-[Vector addition of 50000 elements]
-Copy input data from the host memory to the CUDA device
-CUDA kernel launch with 196 blocks of 256 threads
-Copy output data from the CUDA device to the host memory
-Test PASSED
-Done
+kubectl run \
+  nvidia-test \
+  --restart=Never \
+  -ti --rm \
+  --image nvcr.io/nvidia/cuda:12.1.0-base-ubuntu22.04 \
+  --overrides '{"spec": {"runtimeClassName": "nvidia"}}' \
+  nvidia-smi
 ```
diff --git a/website/content/v1.4/talos-guides/configuration/nvidia-gpu.md b/website/content/v1.4/talos-guides/configuration/nvidia-gpu.md
index 54c479d77..287de3f14 100644
--- a/website/content/v1.4/talos-guides/configuration/nvidia-gpu.md
+++ b/website/content/v1.4/talos-guides/configuration/nvidia-gpu.md
@@ -6,7 +6,6 @@ aliases:
 ---
 
 > Enabling NVIDIA GPU support on Talos is bound by [NVIDIA EULA](https://www.nvidia.com/en-us/drivers/nvidia-license/).
-> Talos GPU support has been promoted to **beta**.
 > The Talos published NVIDIA OSS drivers are bound to a specific Talos release.
 > The extensions versions also needs to be updated when upgrading Talos.
 
@@ -120,51 +119,43 @@ helm repo update
 helm install nvidia-device-plugin nvdp/nvidia-device-plugin --version=0.13.0 --set=runtimeClassName=nvidia
 ```
 
-Apply the following manifest to run CUDA pod via nvidia runtime:
+## (Optional) Setting the default runtime class as `nvidia`
+
+> Do note that this will set the default runtime class to `nvidia` for all pods scheduled on the node.
+
+Create a YAML patch file `nvidia-default-runtimeclass.yaml` to update the machine config, similar to the one below:
+
+```yaml
+- op: add
+  path: /machine/files
+  value:
+    - content: |
+        [plugins]
+          [plugins."io.containerd.grpc.v1.cri"]
+            [plugins."io.containerd.grpc.v1.cri".containerd]
+              default_runtime_name = "nvidia"
+      path: /etc/cri/conf.d/20-customization.part
+      op: create
+```
+
+Now apply the patch to all Talos nodes in the cluster that have NVIDIA GPUs installed:
 
 ```bash
-cat <<EOF | kubectl apply -f -
----
-apiVersion: v1
-kind: Pod
-metadata:
-  name: gpu-operator-test
-spec:
-  restartPolicy: OnFailure
-  runtimeClassName: nvidia
-  containers:
-  - name: cuda-vector-add
-    image: "nvidia/samples:vectoradd-cuda11.6.0"
-    resources:
-      limits:
-         nvidia.com/gpu: 1
-<<EOF
+talosctl patch mc --patch @nvidia-default-runtimeclass.yaml
 ```
 
-The status can be viewed by running:
+### Testing the runtime class
+
+> Note the `spec.runtimeClassName` being explicitly set to `nvidia` in the pod spec.
+
+Run the following command to test the runtime class:
 
 ```bash
-kubectl get pods
-```
-
-which should produce an output similar to below:
-
-```text
-NAME                READY   STATUS      RESTARTS   AGE
-gpu-operator-test   0/1     Completed   0          13s
-```
-
-```bash
-kubectl logs gpu-operator-test
-```
-
-which should produce an output similar to below:
-
-```text
-[Vector addition of 50000 elements]
-Copy input data from the host memory to the CUDA device
-CUDA kernel launch with 196 blocks of 256 threads
-Copy output data from the CUDA device to the host memory
-Test PASSED
-Done
+kubectl run \
+  nvidia-test \
+  --restart=Never \
+  -ti --rm \
+  --image nvcr.io/nvidia/cuda:12.1.0-base-ubuntu22.04 \
+  --overrides '{"spec": {"runtimeClassName": "nvidia"}}' \
+  nvidia-smi
 ```