Skip to content

Commit

Permalink
Adding GPUs to Kind cluster (#494)
Browse files Browse the repository at this point in the history
* Triggering different build

* Checking if nodes can be listed

* Trigger build

* Resource patching. Extending resources of Kubernetes nodes to include 'fake' GPUs.

* Fixed command to describe nodes

* Kuttl tests for checking if GPUs were added correctly to the nodes

* Fixed namespace issue and node name issue.

* Adding all tests again now that the resource extension passes

* Changed where the extended resources are tested

* Added error checks for 'curl' calls. Rename variables to use lower case.

* Fixed 'if' equal operator
  • Loading branch information
metalcycling authored Jul 25, 2023
1 parent 19ded5c commit a4d16e1
Show file tree
Hide file tree
Showing 7 changed files with 110 additions and 0 deletions.
57 changes: 57 additions & 0 deletions hack/run-e2e-kind.sh
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,63 @@ function setup-mcad-env {
do
echo -n "." && sleep 1;
done
}

function extend-resources {
    # Patch nodes to provide GPU resources without physical GPUs.
    # This is intended to allow testing of GPU specific features such as histograms.
    #
    # A temporary 'kubectl proxy' is started in the background, the
    # 'status.capacity' of every node is extended with 'fake' GPUs through it,
    # the proxy is stopped again and a kuttl test suite confirms that the GPUs
    # were added correctly.
    #
    # Globals:   ROOT_DIR (read) - directory containing 'test/kuttl-test-extended-resources.yaml'
    # Arguments: none
    # Outputs:   progress and error messages on stdout
    # Exits:     with status 1 (terminating the calling script) when the proxy
    #            cannot be reached, a node cannot be patched or the kuttl test
    #            fails. The background proxy is stopped on every path.

    # Address used for both the connectivity probe and the PATCH requests
    # (8001 is the default port of 'kubectl proxy').
    local proxy_address="127.0.0.1:8001"
    # Number of one-second spaced probes before giving up on the proxy.
    local max_attempts=30

    # Variables
    # '~1' is the JSON-Pointer escape for '/', i.e. the resource 'nvidia.com/gpu'.
    local resource_name="nvidia.com~1gpu"
    local resource_count="8"
    # The patch document is the same for every node, so build it only once.
    local patch_body='[{"op": "add", "path": "/status/capacity/'"${resource_name}"'", "value": "'"${resource_count}"'"}]'

    local proxy_pid attempt node_name response patching_status kuttl_test
    local connected="false"
    local failed="false"

    # Start communication with cluster
    kubectl proxy > /dev/null 2>&1 &
    proxy_pid=$!

    echo "Starting background proxy connection (pid=${proxy_pid})..."

    # The proxy needs a moment before it accepts connections, so probe it
    # repeatedly instead of racing it with a single request.
    for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
        if curl "${proxy_address}" > /dev/null 2>&1; then
            connected="true"
            break
        fi

        # No point in waiting any longer if the proxy process already died.
        if ! kill -0 "${proxy_pid}" 2> /dev/null; then
            break
        fi

        sleep 1
    done

    if [[ "${connected}" != "true" ]]; then
        echo "Calling 'kubectl proxy' did not create a successful connection to the kubelet needed to patch the nodes. Exiting."
        failed="true"
    else
        echo "Connected to the kubelet for patching the nodes"

        # Patch nodes
        # Word splitting of the command output is intended here: one node name per word.
        for node_name in $(kubectl get nodes --no-headers -o custom-columns=":metadata.name")
        do
            echo "- Patching node (add): ${node_name}"

            # Assume failure until the request went through AND the response
            # is not a 'Failure' status object; a failing 'curl' or 'jq' must
            # not be mistaken for a successful patch.
            patching_status="Failure"
            if response=$(curl --silent --show-error \
                               --header "Content-Type: application/json-patch+json" \
                               --request PATCH \
                               --data "${patch_body}" \
                               "http://${proxy_address}/api/v1/nodes/${node_name}/status"); then
                patching_status=$(jq -r '.status' <<< "${response}") || patching_status="Failure"
            fi

            if [[ -z "${patching_status}" || "${patching_status}" == "Failure" ]]; then
                echo "Failed to patch node '${node_name}' with GPU resources"
                failed="true"
                break
            fi

            echo
        done
    fi

    # Stop communication with cluster. This also runs on the failure paths so
    # that no orphaned proxy keeps the port busy. SIGTERM is enough for the
    # proxy; 'wait' reaps it and hides the shell's "Terminated" notice.
    echo "Killing proxy (pid=${proxy_pid})..."
    kill "${proxy_pid}" 2> /dev/null || true
    wait "${proxy_pid}" 2> /dev/null || true

    if [[ "${failed}" == "true" ]]; then
        exit 1
    fi

    # Run kuttl tests to confirm GPUs were added correctly
    kuttl_test="${ROOT_DIR}/test/kuttl-test-extended-resources.yaml"
    echo "kubectl kuttl test --config ${kuttl_test}"
    if ! kubectl kuttl test --config "${kuttl_test}"; then
        echo "kuttl e2e test '${kuttl_test}' failure, exiting."
        exit 1
    fi
}

function kuttl-tests {
Expand Down Expand Up @@ -402,6 +458,7 @@ trap cleanup EXIT
# Main e2e sequence: the steps below run in this order.
update_test_host
check-prerequisites
kind-up-cluster
# Patch the cluster nodes with 'fake' GPUs and verify them with a kuttl test.
# NOTE(review): placed after kind-up-cluster, presumably because the nodes
# must exist before they can be patched - confirm.
extend-resources
setup-mcad-env
# MCAD with quotamanagement options is started by kuttl-tests
kuttl-tests
Expand Down
10 changes: 10 additions & 0 deletions test/e2e-kuttl-extended-resources/steps/00-assert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
# Verify that GPUs are a resource for the node
# The expected count "8" matches 'resource_count' used by extend-resources in
# hack/run-e2e-kind.sh when patching the nodes.
# NOTE(review): the script patches 'capacity' while this assert reads
# 'allocatable' - presumably the cluster propagates one to the other; confirm.
# NOTE(review): 'test-worker' is a fixed node name - presumably the worker node
# of the kind cluster created by the e2e script; confirm against the kind config.
apiVersion: v1
kind: Node
metadata:
name: test-worker
status:
allocatable:
nvidia.com/gpu: "8"

5 changes: 5 additions & 0 deletions test/e2e-kuttl-extended-resources/steps/01-assert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Verify that the namespace was created
# ('extended-resources' is the namespace installed by 01-install.yaml)
apiVersion: v1
kind: Namespace
metadata:
name: extended-resources
4 changes: 4 additions & 0 deletions test/e2e-kuttl-extended-resources/steps/01-install.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Create the 'extended-resources' namespace that the GPU job of step 02 runs in
apiVersion: v1
kind: Namespace
metadata:
name: extended-resources
8 changes: 8 additions & 0 deletions test/e2e-kuttl-extended-resources/steps/02-assert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Verify that 'gpu-job' (installed by 02-install.yaml) reports a condition of
# type 'Complete'.
# NOTE(review): only the condition type is matched, not its 'status' value -
# confirm this is strict enough.
apiVersion: batch/v1
kind: Job
metadata:
name: gpu-job
namespace: extended-resources
status:
conditions:
- type: Complete
19 changes: 19 additions & 0 deletions test/e2e-kuttl-extended-resources/steps/02-install.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Job that requests 8 'nvidia.com/gpu' (the same count extend-resources adds to
# each node) and then only sleeps for 10 seconds; no GPU is actually used.
# NOTE(review): presumably the job can only be scheduled when a node advertises
# the 'fake' GPUs, which is what makes this a test of the node patching - confirm.
apiVersion: batch/v1
kind: Job
metadata:
name: gpu-job
namespace: extended-resources
spec:
template:
spec:
restartPolicy: Never
containers:
- name: gpu-job
image: ubuntu:latest
command: [ "/bin/bash", "-c", "--" ]
args: [ "sleep 10;" ]
resources:
requests:
nvidia.com/gpu: 8
limits:
nvidia.com/gpu: 8
7 changes: 7 additions & 0 deletions test/kuttl-test-extended-resources.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# kuttl test suite run by extend-resources in hack/run-e2e-kind.sh to confirm
# that the 'fake' GPUs were added to the nodes.
# NOTE(review): 'testDirs' and 'artifactsDir' are relative paths - presumably
# resolved against the directory kuttl is invoked from; confirm.
# NOTE(review): 'timeout' is presumably in seconds - confirm against the kuttl docs.
# 'commands' is intentionally left empty here (no suite-level commands are listed).
apiVersion: kuttl.dev/v1beta1
kind: TestSuite
testDirs:
- test/e2e-kuttl-extended-resources/
timeout: 60
artifactsDir: _output/logs
commands:

0 comments on commit a4d16e1

Please sign in to comment.