From aae1f28c842b3bd504f7b9faf291b5526dec8fe2 Mon Sep 17 00:00:00 2001
From: Joseph Hanson
Date: Fri, 6 Sep 2024 21:49:49 -0500
Subject: [PATCH] Add nvidia device plugin

---
Review notes (text between the "---" line and the first "diff --git" is
commentary and is not applied to the tree):

  * helmrelease.yaml: the yaml-language-server schema hint now references
    helmrelease-helm-v2.json so that it matches the
    helm.toolkit.fluxcd.io/v2 apiVersion declared in the same manifest
    (ks.yaml already pairs its v1 apiVersion with the -v1 schema).
  * NOTE(review): line breaks and YAML indentation were reconstructed from a
    whitespace-collapsed copy of this patch. In particular, the
    `metadata.annotations` stanza that follows `sourceRef` is placed under
    `spec.chart`; confirm against the original commit.
  * NOTE(review): the abbreviated blob hash on the helmrelease.yaml "index"
    line predates the schema-hint change; regenerate the patch from the
    amended commit if exact hashes matter.

 .../apps/kube-system/kustomization.yaml       |  1 +
 .../nvidia-device-plugin/app/helmrelease.yaml | 42 +++++++++++++++++++
 .../app/kustomization.yaml                    | 13 ++++++
 .../app/resources/values.yml                  | 12 ++++++
 .../app/runtimeclass.yaml                     |  6 +++
 .../kube-system/nvidia-device-plugin/ks.yaml  | 18 ++++++++
 6 files changed, 92 insertions(+)
 create mode 100644 kubernetes/apps/kube-system/nvidia-device-plugin/app/helmrelease.yaml
 create mode 100644 kubernetes/apps/kube-system/nvidia-device-plugin/app/kustomization.yaml
 create mode 100644 kubernetes/apps/kube-system/nvidia-device-plugin/app/resources/values.yml
 create mode 100644 kubernetes/apps/kube-system/nvidia-device-plugin/app/runtimeclass.yaml
 create mode 100644 kubernetes/apps/kube-system/nvidia-device-plugin/ks.yaml

diff --git a/kubernetes/apps/kube-system/kustomization.yaml b/kubernetes/apps/kube-system/kustomization.yaml
index dc37a56..2b28413 100644
--- a/kubernetes/apps/kube-system/kustomization.yaml
+++ b/kubernetes/apps/kube-system/kustomization.yaml
@@ -12,5 +12,6 @@ resources:
   - ./kubelet-csr-approver/ks.yaml
   - ./metrics-server/ks.yaml
   - ./node-feature-discovery/ks.yaml
+  - ./nvidia-device-plugin/ks.yaml
   - ./reloader/ks.yaml
   - ./spegel/ks.yaml
diff --git a/kubernetes/apps/kube-system/nvidia-device-plugin/app/helmrelease.yaml b/kubernetes/apps/kube-system/nvidia-device-plugin/app/helmrelease.yaml
new file mode 100644
index 0000000..6f01dcc
--- /dev/null
+++ b/kubernetes/apps/kube-system/nvidia-device-plugin/app/helmrelease.yaml
@@ -0,0 +1,42 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/helmrelease-helm-v2.json
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: nvidia-device-plugin
+spec:
+  interval: 30m
+  chart:
+    spec:
+      chart: nvidia-device-plugin
+      version: 0.16.2
+      sourceRef:
+        kind: HelmRepository
+        name: nvdp
+        namespace: flux-system
+    metadata:
+      annotations:
+        reloader.stakater.com/auto: "true"
+  install:
+    remediation:
+      retries: 3
+  upgrade:
+    cleanupOnFail: true
+    remediation:
+      retries: 3
+      strategy: rollback
+  values:
+    podAnnotations:
+      configmap.reloader.stakater.com/reload: nvidia-helm-values
+    config:
+      name: nvidia-helm-values
+    runtimeClassName: "nvidia"
+    gfd:
+      enabled: true
+    nfd:
+      enabled: false
+    resources:
+      requests:
+        cpu: 100m
+      limits:
+        memory: 512Mi
diff --git a/kubernetes/apps/kube-system/nvidia-device-plugin/app/kustomization.yaml b/kubernetes/apps/kube-system/nvidia-device-plugin/app/kustomization.yaml
new file mode 100644
index 0000000..c325a94
--- /dev/null
+++ b/kubernetes/apps/kube-system/nvidia-device-plugin/app/kustomization.yaml
@@ -0,0 +1,13 @@
+---
+# yaml-language-server: $schema=https://json.schemastore.org/kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - ./helmrelease.yaml
+  - ./runtimeclass.yaml
+configMapGenerator:
+  - name: nvidia-helm-values
+    files:
+      - values.yaml=./resources/values.yml
+generatorOptions:
+  disableNameSuffixHash: true
diff --git a/kubernetes/apps/kube-system/nvidia-device-plugin/app/resources/values.yml b/kubernetes/apps/kube-system/nvidia-device-plugin/app/resources/values.yml
new file mode 100644
index 0000000..cdf9eca
--- /dev/null
+++ b/kubernetes/apps/kube-system/nvidia-device-plugin/app/resources/values.yml
@@ -0,0 +1,12 @@
+---
+version: v1
+flags:
+  migStrategy: "single"
+  plugin:
+    deviceListStrategy: "envvar"
+    deviceIDStrategy: "uuid"
+sharing:
+  timeSlicing:
+    resources:
+      - name: nvidia.com/gpu
+        replicas: 3
diff --git a/kubernetes/apps/kube-system/nvidia-device-plugin/app/runtimeclass.yaml b/kubernetes/apps/kube-system/nvidia-device-plugin/app/runtimeclass.yaml
new file mode 100644
index 0000000..7ba6add
--- /dev/null
+++ b/kubernetes/apps/kube-system/nvidia-device-plugin/app/runtimeclass.yaml
@@ -0,0 +1,6 @@
+---
+apiVersion: node.k8s.io/v1
+kind: RuntimeClass
+metadata:
+  name: nvidia
+handler: nvidia
diff --git a/kubernetes/apps/kube-system/nvidia-device-plugin/ks.yaml b/kubernetes/apps/kube-system/nvidia-device-plugin/ks.yaml
new file mode 100644
index 0000000..3b0fd69
--- /dev/null
+++ b/kubernetes/apps/kube-system/nvidia-device-plugin/ks.yaml
@@ -0,0 +1,18 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: nvidia-device-plugin
+  namespace: flux-system
+  labels:
+    substitution.flux.home.arpa/enabled: "true"
+spec:
+  targetNamespace: kube-system
+  interval: 10m
+  path: "./kubernetes/apps/kube-system/nvidia-device-plugin/app"
+  prune: true
+  sourceRef:
+    kind: GitRepository
+    name: theshire
+  wait: true