Skip to content

Commit

Permalink
Add Helm chart for kubeflow trainer
Browse files Browse the repository at this point in the history
Signed-off-by: Yi Chen <[email protected]>
  • Loading branch information
ChenYi015 committed Feb 26, 2025
1 parent 48386a9 commit 77c023d
Show file tree
Hide file tree
Showing 54 changed files with 25,241 additions and 276 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@ hack/python-sdk/openapi-generator-cli.jar

# Coverage
cover.out

# Helm
charts/kubeflow-trainer/charts/jobset
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ repos:
hooks:
- id: check-yaml
  args: [--allow-multiple-documents]
  # Skip the Helm chart's YAML files. The chart lives in charts/kubeflow-trainer,
  # so the pattern must name that directory to match anything.
  exclude: '^charts/kubeflow-trainer/.*\.yaml$'
- id: check-json
- id: end-of-file-fixer
- id: trailing-whitespace
Expand Down
90 changes: 71 additions & 19 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,27 @@ endif
SHELL = /usr/bin/env bash -o pipefail
.SHELLFLAGS = -ec

# Absolute path of the directory that contains the last makefile listed in
# MAKEFILE_LIST (evaluated once, at parse time).
PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))
# NOTE(review): REPO is not referenced in the visible part of this Makefile; confirm where it is consumed.
REPO := github.com/kubeflow/trainer
# Directory of the kubeflow-trainer Helm chart; passed to the helm commands in the Helm section.
TRAINER_CHART_DIR := $(PROJECT_DIR)/charts/kubeflow-trainer

# Location to install tool binaries
# (`?=` only assigns when the variable is not already set, so callers can override it).
LOCALBIN ?= $(PROJECT_DIR)/bin

# Tool versions
# Each default below can be overridden from the environment or the make command line.
CONTROLLER_GEN_VERSION ?= v0.17.2
ENVTEST_VERSION ?= release-0.20
ENVTEST_K8S_VERSION ?= 1.31
HELM_VERSION ?= v3.15.3
HELM_UNITTEST_VERSION ?= 0.5.1
HELM_DOCS_VERSION ?= v1.14.2

# Tool binaries
# All tools default to paths under $(LOCALBIN).
CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen
ENVTEST ?= $(LOCALBIN)/setup-envtest
HELM ?= $(LOCALBIN)/helm
HELM_DOCS ?= $(LOCALBIN)/helm-docs

##@ General

# The help target prints out all targets with their descriptions organized
Expand All @@ -29,24 +50,6 @@ help: ## Display this help.

##@ Development

PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))

# Tool Binaries
LOCALBIN ?= $(PROJECT_DIR)/bin
CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen
ENVTEST ?= $(LOCALBIN)/setup-envtest

ENVTEST_K8S_VERSION ?= 1.31

# Instructions to download tools for development.
# NOTE(review): in this capture the text after `tools/` was replaced by an
# e-mail-obfuscation placeholder, so the package name and pinned version are not
# recoverable here. Confirm against the repository history before reusing this line.
.PHONY: envtest
envtest: ## Download the setup-envtest binary if required.
GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-runtime/tools/[email protected]

# NOTE(review): in this capture the text after `cmd/` was replaced by an
# e-mail-obfuscation placeholder, so the package name and pinned version are not
# recoverable here. Confirm against the repository history before reusing this line.
.PHONY: controller-gen
controller-gen: ## Download the controller-gen binary if required.
GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/[email protected]

# Download external CRDs for Go integration testings.
EXTERNAL_CRDS_DIR ?= $(PROJECT_DIR)/manifests/external-crds

Expand All @@ -65,11 +68,14 @@ scheduler-plugins-crd: ## Copy the CRDs from the Scheduler Plugins repository to
# Instructions for code generation.
# Runs controller-gen over the API, controller, runtime, webhook and cert packages.
# NOTE(review): this capture appears to interleave removed and added diff lines,
# so both `output:rbac:artifacts:config=...` and `output:rbac:none` are shown;
# confirm the final option list against the actual Makefile.
.PHONY: manifests
manifests: controller-gen ## Generate manifests.
# Skip outputting the RBAC and webhook manifests as we will sync them from the manifests templated by the Helm chart.
$(CONTROLLER_GEN) "crd:generateEmbeddedObjectMeta=true" rbac:roleName=kubeflow-trainer-controller-manager webhook \
paths="./pkg/apis/trainer/v1alpha1/...;./pkg/controller/...;./pkg/runtime/...;./pkg/webhooks/...;./pkg/util/cert/..." \
output:crd:artifacts:config=manifests/base/crds \
output:rbac:artifacts:config=manifests/base/rbac \
output:webhook:artifacts:config=manifests/base/webhook
output:rbac:none \
output:webhook:artifacts:config=manifests/base/webhook \
output:webhook:none

.PHONY: generate
generate: go-mod-download manifests ## Generate APIs and SDK.
Expand Down Expand Up @@ -124,3 +130,49 @@ test-python-integration: ## Run Python integration test.
pip install -r ./cmd/initializer/dataset/requirements.txt

pytest ./test/integration/initializer

##@ Helm

# Refreshes the chart's dependencies, then runs hack/sync-manifests.sh.
# Depends on `helm` so that $(HELM) exists before the first recipe line runs,
# matching how helm-unittest reaches the binary through helm-unittest-plugin.
.PHONY: sync-manifests
sync-manifests: helm ## Sync Kustomize manifests from manifests templated from Helm chart.
	$(HELM) dependency update $(TRAINER_CHART_DIR)
	hack/sync-manifests.sh

# Runs the helm unittest plugin against the trainer chart, selecting test
# suites with the glob tests/**/*_test.yaml.
.PHONY: helm-unittest
helm-unittest: helm-unittest-plugin ## Run Helm chart unittests.
	$(HELM) unittest $(TRAINER_CHART_DIR) \
		--strict \
		--file "tests/**/*_test.yaml"

# Runs `ct lint` inside the quay.io/helmpack/chart-testing container, with the
# current working directory mounted at /workspace.
.PHONY: helm-lint
helm-lint: ## Run Helm chart lint test.
	docker run --rm \
		--workdir /workspace \
		--volume "$$(pwd):/workspace" \
		quay.io/helmpack/chart-testing:latest \
		ct lint --target-branch master --validate-maintainers=false

# Runs the helm-docs binary from the current directory; values are listed in
# the order they appear in the values file (--sort-values-order=file).
# The helm-docs-plugin prerequisite provides $(HELM_DOCS).
.PHONY: helm-docs
helm-docs: helm-docs-plugin ## Generates markdown documentation for helm charts from requirements and values files.
$(HELM_DOCS) --sort-values-order=file

##@ Dependencies

# Installs setup-envtest at $(ENVTEST_VERSION) into $(LOCALBIN) with `go install`.
# The target is phony, so the install command runs on every invocation.
.PHONY: envtest
envtest: ## Download the setup-envtest binary if necessary.
GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-runtime/tools/setup-envtest@$(ENVTEST_VERSION)

# Installs controller-gen at $(CONTROLLER_GEN_VERSION) into $(LOCALBIN) with `go install`.
# The target is phony, so the install command runs on every invocation.
.PHONY: controller-gen
controller-gen: ## Download the controller-gen binary if necessary.
GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_GEN_VERSION)

# `helm` is a phony alias for the real file target $(HELM).
# $(LOCALBIN) is an order-only prerequisite (after `|`): it must exist, but its
# timestamp is ignored. As a normal prerequisite, any file added to the directory
# would bump its mtime and force helm to be reinstalled.
# NOTE(review): no rule creating $(LOCALBIN) is visible in this section; confirm one exists.
.PHONY: helm
helm: $(HELM) ## Download helm locally if necessary.
$(HELM): | $(LOCALBIN)
	GOBIN=$(LOCALBIN) go install helm.sh/helm/v3/cmd/helm@$(HELM_VERSION)

# Installs the helm-unittest plugin at $(HELM_UNITTEST_VERSION) unless
# `helm plugin list` already prints a line containing "unittest".
# The check uses shell command substitution so it runs as part of the recipe.
# A make-level shell function call would instead run while make expands the
# recipe, which also happens under `make -n`.
.PHONY: helm-unittest-plugin
helm-unittest-plugin: helm ## Download helm unittest plugin locally if necessary.
	if [ -z "$$($(HELM) plugin list | grep unittest)" ]; then \
		echo "Installing helm unittest plugin"; \
		$(HELM) plugin install https://github.com/helm-unittest/helm-unittest.git --version $(HELM_UNITTEST_VERSION); \
	fi

# `helm-docs-plugin` is a phony alias for the real file target $(HELM_DOCS).
# $(LOCALBIN) is an order-only prerequisite (after `|`): it must exist, but its
# timestamp is ignored. As a normal prerequisite, any file added to the directory
# would bump its mtime and force helm-docs to be reinstalled.
# NOTE(review): no rule creating $(LOCALBIN) is visible in this section; confirm one exists.
.PHONY: helm-docs-plugin
helm-docs-plugin: $(HELM_DOCS) ## Download helm-docs plugin locally if necessary.
$(HELM_DOCS): | $(LOCALBIN)
	GOBIN=$(LOCALBIN) go install github.com/norwoodj/helm-docs/cmd/helm-docs@$(HELM_DOCS_VERSION)
39 changes: 39 additions & 0 deletions charts/kubeflow-trainer/.helmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.

.helmignore
ci/

# helm-unittest
.debug
tests/
__snapshot__/

# helm-docs
README.md.gotmpl

# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/

# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~

# Various IDEs
*.tmproj
.project
.idea/
.vscode/

# MacOS
.DS_Store
42 changes: 42 additions & 0 deletions charts/kubeflow-trainer/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#
# Copyright 2024 The Kubeflow authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

apiVersion: v2

name: kubeflow-trainer

description: A Helm chart for deploying Kubeflow trainer on Kubernetes.

version: 2.0.0

appVersion: 2.0.0

type: application

dependencies:
- name: jobset
repository: oci://us-central1-docker.pkg.dev/k8s-staging-images/charts
version: 0.8.0
condition: jobset.install

keywords:
- kubeflow trainer

home: https://github.com/kubeflow/trainer

maintainers:
- name: ChenYi015
url: https://github.com/ChenYi015
106 changes: 106 additions & 0 deletions charts/kubeflow-trainer/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# kubeflow-trainer

![Version: 2.0.0](https://img.shields.io/badge/Version-2.0.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.0.0](https://img.shields.io/badge/AppVersion-2.0.0-informational?style=flat-square)

A Helm chart for deploying Kubeflow trainer on Kubernetes.

**Homepage:** <https://github.com/kubeflow/trainer>

## Introduction

This chart bootstraps a [Kubeflow Trainer](https://github.com/kubeflow/trainer) deployment using the [Helm](https://helm.sh) package manager.

## Prerequisites

- Helm >= 3
- Kubernetes >= 1.20

## Usage

### Add Helm Repo

```bash
helm repo add kubeflow-trainer https://kubeflow.github.io/trainer

helm repo update
```

See [helm repo](https://helm.sh/docs/helm/helm_repo) for command documentation.

### Install the chart

```bash
helm install [RELEASE_NAME] kubeflow-trainer/kubeflow-trainer
```

For example, if you want to create a release with name `kubeflow-trainer` in the `kubeflow-system` namespace:

```shell
helm install kubeflow-trainer kubeflow-trainer/kubeflow-trainer \
--namespace kubeflow-system \
--create-namespace
```

Note that by passing the `--create-namespace` flag to the `helm install` command, `helm` will create the release namespace if it does not exist.

See [helm install](https://helm.sh/docs/helm/helm_install) for command documentation.

### Upgrade the chart

```shell
helm upgrade [RELEASE_NAME] kubeflow-trainer/kubeflow-trainer [flags]
```

See [helm upgrade](https://helm.sh/docs/helm/helm_upgrade) for command documentation.

### Uninstall the chart

```shell
helm uninstall [RELEASE_NAME]
```

This removes all the Kubernetes resources associated with the chart and deletes the release, except for the CRDs, which have to be removed manually.

See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall) for command documentation.

## Values

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| nameOverride | string | `""` | String to partially override release name. |
| fullnameOverride | string | `""` | String to fully override release name. |
| jobset.install | bool | `true` | Whether to install jobset as a dependency managed by trainer. This must be set to `false` if jobset controller/webhook has already been installed into the cluster. |
| commonLabels | object | `{}` | Common labels to add to the resources. |
| image.registry | string | `"docker.io"` | Image registry. |
| image.repository | string | `"kubeflow/trainer-controller-manager"` | Image repository. |
| image.tag | string | If not set, the chart appVersion will be used. | Image tag. |
| image.pullPolicy | string | `"IfNotPresent"` | Image pull policy. |
| image.pullSecrets | list | `[]` | Image pull secrets for private image registry. |
| controller.replicas | int | `1` | Number of replicas of controller. |
| controller.labels | object | `{}` | Extra labels for controller pods. |
| controller.annotations | object | `{}` | Extra annotations for controller pods. |
| controller.volumes | list | `[]` | Volumes for controller pods. |
| controller.nodeSelector | object | `{}` | Node selector for controller pods. |
| controller.affinity | object | `{}` | Affinity for controller pods. |
| controller.tolerations | list | `[]` | List of node taints to tolerate for controller pods. |
| controller.env | list | `[]` | Environment variables for controller containers. |
| controller.envFrom | list | `[]` | Environment variable sources for controller containers. |
| controller.volumeMounts | list | `[]` | Volume mounts for controller containers. |
| controller.resources | object | `{}` | Pod resource requests and limits for controller containers. |
| controller.securityContext | object | `{}` | Security context for controller containers. |
| controller.serviceAccount.create | bool | `true` | Specifies whether to create a service account for the controller. |
| controller.serviceAccount.name | string | `""` | Optional name for the controller service account. |
| controller.serviceAccount.annotations | object | `{}` | Extra annotations for the controller service account. |
| controller.serviceAccount.automountServiceAccountToken | bool | `true` | Auto-mount service account token to the controller pods. |
| webhook.enable | bool | `true` | Specifies whether to enable webhook. |
| webhook.failurePolicy | string | `"Fail"` | Specifies how unrecognized errors are handled. Available options are `Ignore` or `Fail`. |
| runtime.preTraining.torchDistributed.enable | bool | `true` | |
| runtime.preTraining.torchDistributed.image.registry | string | `"docker.io"` | |
| runtime.preTraining.torchDistributed.image.repository | string | `"pytorch/pytorch"` | |
| runtime.preTraining.torchDistributed.image.tag | string | `"2.5.0-cuda12.4-cudnn9-runtime"` | |

## Maintainers

| Name | Email | Url |
| ---- | ------ | --- |
| ChenYi015 | | <https://github.com/ChenYi015> |
70 changes: 70 additions & 0 deletions charts/kubeflow-trainer/README.md.gotmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
{{ template "chart.header" . }}

{{ template "chart.deprecationWarning" . }}

{{ template "chart.badgesSection" . }}

{{ template "chart.description" . }}

{{ template "chart.homepageLine" . }}

## Introduction

This chart bootstraps a [Kubeflow Trainer]({{template "chart.homepage" . }}) deployment using the [Helm](https://helm.sh) package manager.

## Prerequisites

- Helm >= 3
- Kubernetes >= 1.20

## Usage

### Add Helm Repo

```bash
helm repo add kubeflow-trainer https://kubeflow.github.io/trainer

helm repo update
```

See [helm repo](https://helm.sh/docs/helm/helm_repo) for command documentation.

### Install the chart

```bash
helm install [RELEASE_NAME] kubeflow-trainer/kubeflow-trainer
```

For example, if you want to create a release with name `kubeflow-trainer` in the `kubeflow-system` namespace:

```shell
helm install kubeflow-trainer kubeflow-trainer/kubeflow-trainer \
--namespace kubeflow-system \
--create-namespace
```

Note that by passing the `--create-namespace` flag to the `helm install` command, `helm` will create the release namespace if it does not exist.

See [helm install](https://helm.sh/docs/helm/helm_install) for command documentation.

### Upgrade the chart

```shell
helm upgrade [RELEASE_NAME] kubeflow-trainer/kubeflow-trainer [flags]
```

See [helm upgrade](https://helm.sh/docs/helm/helm_upgrade) for command documentation.

### Uninstall the chart

```shell
helm uninstall [RELEASE_NAME]
```

This removes all the Kubernetes resources associated with the chart and deletes the release, except for the CRDs, which have to be removed manually.

See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall) for command documentation.

{{ template "chart.valuesSection" . }}

{{ template "chart.maintainersSection" . }}
Loading

0 comments on commit 77c023d

Please sign in to comment.