# Stop the background 'kubectl proxy' started by extend-resources.
#
# Arguments: $1 - PID of the proxy process.
# Outputs:   one progress line on stdout.
# Returns:   always 0; a proxy that is already gone is not an error.
function extend-resources-stop-proxy {
    local pid="$1"

    echo "Killing proxy (pid=${pid})..."
    # SIGTERM is enough for a well-behaved proxy; 'wait' reaps the child so
    # the process is really gone (and bash's "Terminated" notice is hidden)
    # before the caller continues or exits.
    kill "${pid}" 2> /dev/null || true
    wait "${pid}" 2> /dev/null || true
}

# Patch every node of the cluster so that it advertises GPUs without any
# physical GPU being present, then run the kuttl suite that verifies it.
# This is intended to allow testing of GPU specific features such as
# histograms.
#
# The node status is patched through a temporary 'kubectl proxy' reached on
# 127.0.0.1:8001 (the address/port the original code relied on).
#
# Globals:   ROOT_DIR (read) - repository root containing test/.
# Arguments: none.
# Outputs:   progress and failure messages on stdout.
# Returns:   0 on success. Calls 'exit 1' when the proxy cannot be reached,
#            the nodes cannot be listed, a node cannot be patched, or the
#            kuttl test fails. The proxy is stopped on every path.
function extend-resources {
    local proxy_url="http://127.0.0.1:8001"
    # '~1' is the JSON-Pointer escape for '/': the resource is nvidia.com/gpu.
    local resource_name="nvidia.com~1gpu"
    local resource_count="8"
    local max_attempts=30
    local proxy_pid attempt node_names node_name patch kuttl_test

    # Start communication with cluster
    kubectl proxy > /dev/null 2>&1 &
    proxy_pid=$!

    echo "Starting background proxy connection (pid=${proxy_pid})..."

    # The proxy needs a moment before it accepts connections, so probe it
    # repeatedly instead of once. Give up early if the proxy process itself
    # has died (e.g. the port is already taken).
    attempt=1
    until curl "${proxy_url}" > /dev/null 2>&1; do
        if ! kill -0 "${proxy_pid}" 2> /dev/null || (( attempt >= max_attempts )); then
            echo "Calling 'kubectl proxy' did not create a successful connection to the API server needed to patch the nodes. Exiting."
            extend-resources-stop-proxy "${proxy_pid}"
            exit 1
        fi
        attempt=$((attempt + 1))
        sleep 1
    done
    echo "Connected to the API server for patching the nodes"

    # A failed or empty listing would otherwise patch nothing and only show
    # up later as a kuttl timeout.
    if ! node_names=$(kubectl get nodes --no-headers -o custom-columns=":metadata.name") \
        || [[ -z "${node_names}" ]]; then
        echo "Failed to list the nodes to patch with GPU resources"
        extend-resources-stop-proxy "${proxy_pid}"
        exit 1
    fi

    # Patch nodes
    patch='[{"op": "add", "path": "/status/capacity/'"${resource_name}"'", "value": "'"${resource_count}"'"}]'
    while IFS= read -r node_name; do
        [[ -n "${node_name}" ]] || continue

        echo "- Patching node (add): ${node_name}"
        # --fail turns an HTTP error answer into a non-zero exit status, so
        # both transport errors and API rejections are detected.
        if ! curl --fail --silent --show-error --output /dev/null \
            --header "Content-Type: application/json-patch+json" \
            --request PATCH \
            --data "${patch}" \
            "${proxy_url}/api/v1/nodes/${node_name}/status"; then
            echo "Failed to patch node '${node_name}' with GPU resources"
            extend-resources-stop-proxy "${proxy_pid}"
            exit 1
        fi

        echo
    done <<< "${node_names}"

    # Stop communication with cluster
    extend-resources-stop-proxy "${proxy_pid}"

    # Run kuttl tests to confirm GPUs were added correctly
    kuttl_test="${ROOT_DIR}/test/kuttl-test-extended-resources.yaml"
    echo "kubectl kuttl test --config ${kuttl_test}"
    if ! kubectl kuttl test --config "${kuttl_test}"; then
        echo "kuttl e2e test '${kuttl_test}' failure, exiting."
        exit 1
    fi
}
b/test/e2e-kuttl-extended-resources/steps/02-install.yaml @@ -0,0 +1,19 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: gpu-job + namespace: extended-resources +spec: + template: + spec: + restartPolicy: Never + containers: + - name: gpu-job + image: ubuntu:latest + command: [ "/bin/bash", "-c", "--" ] + args: [ "sleep 10;" ] + resources: + requests: + nvidia.com/gpu: 8 + limits: + nvidia.com/gpu: 8 diff --git a/test/kuttl-test-extended-resources.yaml b/test/kuttl-test-extended-resources.yaml new file mode 100644 index 000000000..4b325eb27 --- /dev/null +++ b/test/kuttl-test-extended-resources.yaml @@ -0,0 +1,7 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestSuite +testDirs: + - test/e2e-kuttl-extended-resources/ +timeout: 60 +artifactsDir: _output/logs +commands: