Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Helm chart for kubeflow trainer #2435

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@ hack/python-sdk/openapi-generator-cli.jar

# Coverage
cover.out

# Helm
charts/kubeflow-trainer/charts/jobset
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ repos:
hooks:
- id: check-yaml
args: [--allow-multiple-documents]
# Must match the Helm chart directory (charts/kubeflow-trainer).
exclude: '^charts/kubeflow-trainer/.*\.yaml$'
- id: check-json
- id: end-of-file-fixer
- id: trailing-whitespace
Expand Down
90 changes: 71 additions & 19 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,27 @@ endif
SHELL = /usr/bin/env bash -o pipefail
.SHELLFLAGS = -ec

# Absolute directory of this Makefile (taken from the last entry of MAKEFILE_LIST at this point).
PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))
REPO := github.com/kubeflow/trainer
# Directory of the Helm chart passed to the helm commands in the Helm targets.
TRAINER_CHART_DIR := $(PROJECT_DIR)/charts/kubeflow-trainer

# Location to install tool binaries
LOCALBIN ?= $(PROJECT_DIR)/bin

# Tool versions
CONTROLLER_GEN_VERSION ?= v0.17.2
ENVTEST_VERSION ?= release-0.20
ENVTEST_K8S_VERSION ?= 1.31
HELM_VERSION ?= v3.15.3
HELM_UNITTEST_VERSION ?= 0.5.1
HELM_DOCS_VERSION ?= v1.14.2

# Tool binaries
CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen
ENVTEST ?= $(LOCALBIN)/setup-envtest
HELM ?= $(LOCALBIN)/helm
HELM_DOCS ?= $(LOCALBIN)/helm-docs

##@ General

# The help target prints out all targets with their descriptions organized
Expand All @@ -29,24 +50,6 @@ help: ## Display this help.

##@ Development

PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))

# Tool Binaries
LOCALBIN ?= $(PROJECT_DIR)/bin
CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen
ENVTEST ?= $(LOCALBIN)/setup-envtest

ENVTEST_K8S_VERSION ?= 1.31

# Instructions to download tools for development.
.PHONY: envtest
envtest: ## Download the setup-envtest binary if required.
GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-runtime/tools/[email protected]

.PHONY: controller-gen
controller-gen: ## Download the controller-gen binary if required.
GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/[email protected]

# Download external CRDs for Go integration testings.
EXTERNAL_CRDS_DIR ?= $(PROJECT_DIR)/manifests/external-crds

Expand All @@ -65,11 +68,14 @@ scheduler-plugins-crd: ## Copy the CRDs from the Scheduler Plugins repository to
# Instructions for code generation.
# Only the CRDs are written to disk by controller-gen. The RBAC and webhook
# generators still run, but their output is discarded (output:<generator>:none)
# because those manifests are synced from the manifests templated by the Helm chart.
.PHONY: manifests
manifests: controller-gen ## Generate manifests.
	$(CONTROLLER_GEN) "crd:generateEmbeddedObjectMeta=true" rbac:roleName=kubeflow-trainer-controller-manager webhook \
		paths="./pkg/apis/trainer/v1alpha1/...;./pkg/controller/...;./pkg/runtime/...;./pkg/webhooks/...;./pkg/util/cert/..." \
		output:crd:artifacts:config=manifests/base/crds \
		output:rbac:none \
		output:webhook:none

.PHONY: generate
generate: go-mod-download manifests ## Generate APIs and SDK.
Expand Down Expand Up @@ -124,3 +130,49 @@ test-python-integration: ## Run Python integration test.
pip install -r ./cmd/initializer/dataset/requirements.txt

pytest ./test/integration/initializer

##@ Helm

# Refresh the chart dependencies, then run hack/sync-manifests.sh.
# Depends on `helm` so that the $(HELM) binary exists before it is invoked,
# matching the other Helm targets, which reach it through their plugin prerequisites.
.PHONY: sync-manifests
sync-manifests: helm ## Sync Kustomize manifests from manifests templated from Helm chart.
	$(HELM) dependency update $(TRAINER_CHART_DIR)
	hack/sync-manifests.sh

# Run the helm-unittest plugin against the chart in strict mode, using the
# test files matched by tests/**/*_test.yaml.
.PHONY: helm-unittest
helm-unittest: helm-unittest-plugin ## Run Helm chart unittests.
	$(HELM) unittest $(TRAINER_CHART_DIR) \
		--strict \
		--file "tests/**/*_test.yaml"

# Lint the charts with chart-testing (`ct lint`) inside a container; the current
# working directory is mounted at /workspace and used as the working directory.
.PHONY: helm-lint
helm-lint: ## Run Helm chart lint test.
	docker run --rm \
		--workdir /workspace \
		--volume "$$(pwd):/workspace" \
		quay.io/helmpack/chart-testing:latest \
		ct lint --target-branch master --validate-maintainers=false

# Run the helm-docs binary installed by the helm-docs-plugin target.
# --sort-values-order=file presumably keeps the values table in values-file order
# rather than alphabetical order — confirm against the helm-docs documentation.
.PHONY: helm-docs
helm-docs: helm-docs-plugin ## Generates markdown documentation for helm charts from requirements and values files.
$(HELM_DOCS) --sort-values-order=file

##@ Dependencies

# Install setup-envtest at $(ENVTEST_VERSION) into $(LOCALBIN) with `go install`.
# The target is phony, so `go install` is invoked on every run.
.PHONY: envtest
envtest: ## Download the setup-envtest binary if necessary.
GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-runtime/tools/setup-envtest@$(ENVTEST_VERSION)

# Install controller-gen at $(CONTROLLER_GEN_VERSION) into $(LOCALBIN) with `go install`.
# The target is phony, so `go install` is invoked on every run.
.PHONY: controller-gen
controller-gen: ## Download the controller-gen binary if necessary.
GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_GEN_VERSION)

# `helm` is a phony alias for the binary at $(HELM); the file rule below only
# runs when that binary does not exist yet.
.PHONY: helm
helm: $(HELM) ## Download helm locally if necessary.
# The bin directory is created in the recipe instead of being listed as a normal
# prerequisite: a directory's mtime changes whenever a file is added to it, which
# would mark the binary out of date after any other tool is installed.
$(HELM):
	mkdir -p $(LOCALBIN)
	GOBIN=$(LOCALBIN) go install helm.sh/helm/v3/cmd/helm@$(HELM_VERSION)

# Install the helm-unittest plugin at $(HELM_UNITTEST_VERSION) unless
# `helm plugin list` already reports a plugin whose line contains "unittest".
# The check uses a shell command substitution ($$(...)) so that it is evaluated
# by the recipe's shell when the recipe runs, not by make during expansion.
.PHONY: helm-unittest-plugin
helm-unittest-plugin: helm ## Download helm unittest plugin locally if necessary.
	if [ -z "$$($(HELM) plugin list | grep unittest)" ]; then \
		echo "Installing helm unittest plugin"; \
		$(HELM) plugin install https://github.com/helm-unittest/helm-unittest.git --version $(HELM_UNITTEST_VERSION); \
	fi

# `helm-docs-plugin` is a phony alias for the binary at $(HELM_DOCS); the file
# rule below only runs when that binary does not exist yet.
.PHONY: helm-docs-plugin
helm-docs-plugin: $(HELM_DOCS) ## Download helm-docs plugin locally if necessary.
# The bin directory is created in the recipe instead of being listed as a normal
# prerequisite: a directory's mtime changes whenever a file is added to it, which
# would mark the binary out of date after any other tool is installed.
$(HELM_DOCS):
	mkdir -p $(LOCALBIN)
	GOBIN=$(LOCALBIN) go install github.com/norwoodj/helm-docs/cmd/helm-docs@$(HELM_DOCS_VERSION)
39 changes: 39 additions & 0 deletions charts/kubeflow-trainer/.helmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.

.helmignore
ci/

# helm-unittest
.debug
tests/
__snapshot__/

# helm-docs
README.md.gotmpl

# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/

# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~

# Various IDEs
*.tmproj
.project
.idea/
.vscode/

# MacOS
.DS_Store
42 changes: 42 additions & 0 deletions charts/kubeflow-trainer/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#
# Copyright 2024 The Kubeflow authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

apiVersion: v2

name: kubeflow-trainer

description: A Helm chart for deploying Kubeflow trainer on Kubernetes.

version: 2.0.0

appVersion: 2.0.0

type: application

dependencies:
- name: jobset
repository: oci://us-central1-docker.pkg.dev/k8s-staging-images/charts
version: 0.8.0
condition: jobset.install

keywords:
- kubeflow trainer

home: https://github.com/kubeflow/trainer

maintainers:
- name: ChenYi015
url: https://github.com/ChenYi015
106 changes: 106 additions & 0 deletions charts/kubeflow-trainer/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# kubeflow-trainer

![Version: 2.0.0](https://img.shields.io/badge/Version-2.0.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.0.0](https://img.shields.io/badge/AppVersion-2.0.0-informational?style=flat-square)

A Helm chart for deploying Kubeflow trainer on Kubernetes.

**Homepage:** <https://github.com/kubeflow/trainer>

## Introduction

This chart bootstraps a [Kubeflow Trainer](https://github.com/kubeflow/trainer) deployment using the [Helm](https://helm.sh) package manager.

## Prerequisites

- Helm >= 3
- Kubernetes >= 1.20

## Usage

### Add Helm Repo

```bash
helm repo add kubeflow-trainer https://kubeflow.github.io/trainer

helm repo update
```

See [helm repo](https://helm.sh/docs/helm/helm_repo) for command documentation.

### Install the chart

```bash
helm install [RELEASE_NAME] kubeflow-trainer/kubeflow-trainer
```

For example, if you want to create a release with name `kubeflow-trainer` in the `kubeflow-system` namespace:

```shell
helm install kubeflow-trainer kubeflow-trainer/kubeflow-trainer \
--namespace kubeflow-system \
--create-namespace
```

Note that by passing the `--create-namespace` flag to the `helm install` command, `helm` will create the release namespace if it does not exist.

See [helm install](https://helm.sh/docs/helm/helm_install) for command documentation.

### Upgrade the chart

```shell
helm upgrade [RELEASE_NAME] kubeflow-trainer/kubeflow-trainer [flags]
```

See [helm upgrade](https://helm.sh/docs/helm/helm_upgrade) for command documentation.

### Uninstall the chart

```shell
helm uninstall [RELEASE_NAME]
```

This removes all the Kubernetes resources associated with the chart and deletes the release, except for the CRDs, which have to be removed manually.

See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall) for command documentation.

## Values

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| nameOverride | string | `""` | String to partially override release name. |
| fullnameOverride | string | `""` | String to fully override release name. |
| jobset.install | bool | `true` | Whether to install jobset as a dependency managed by trainer. This must be set to `false` if jobset controller/webhook has already been installed into the cluster. |
| commonLabels | object | `{}` | Common labels to add to the resources. |
| image.registry | string | `"docker.io"` | Image registry. |
| image.repository | string | `"kubeflow/trainer-controller-manager"` | Image repository. |
| image.tag | string | If not set, the chart appVersion will be used. | Image tag. |
| image.pullPolicy | string | `"IfNotPresent"` | Image pull policy. |
| image.pullSecrets | list | `[]` | Image pull secrets for private image registry. |
| controller.replicas | int | `1` | Number of replicas of controller. |
| controller.labels | object | `{}` | Extra labels for controller pods. |
| controller.annotations | object | `{}` | Extra annotations for controller pods. |
| controller.volumes | list | `[]` | Volumes for controller pods. |
| controller.nodeSelector | object | `{}` | Node selector for controller pods. |
| controller.affinity | object | `{}` | Affinity for controller pods. |
| controller.tolerations | list | `[]` | List of node taints to tolerate for controller pods. |
| controller.env | list | `[]` | Environment variables for controller containers. |
| controller.envFrom | list | `[]` | Environment variable sources for controller containers. |
| controller.volumeMounts | list | `[]` | Volume mounts for controller containers. |
| controller.resources | object | `{}` | Pod resource requests and limits for controller containers. |
| controller.securityContext | object | `{}` | Security context for controller containers. |
| controller.serviceAccount.create | bool | `true` | Specifies whether to create a service account for the controller. |
| controller.serviceAccount.name | string | `""` | Optional name for the controller service account. |
| controller.serviceAccount.annotations | object | `{}` | Extra annotations for the controller service account. |
| controller.serviceAccount.automountServiceAccountToken | bool | `true` | Auto-mount service account token to the controller pods. |
| webhook.enable | bool | `true` | Specifies whether to enable webhook. |
| webhook.failurePolicy | string | `"Fail"` | Specifies how unrecognized errors are handled. Available options are `Ignore` or `Fail`. |
| runtime.preTraining.torchDistributed.enable | bool | `true` | |
| runtime.preTraining.torchDistributed.image.registry | string | `"docker.io"` | |
| runtime.preTraining.torchDistributed.image.repository | string | `"pytorch/pytorch"` | |
| runtime.preTraining.torchDistributed.image.tag | string | `"2.5.0-cuda12.4-cudnn9-runtime"` | |

## Maintainers

| Name | Email | Url |
| ---- | ------ | --- |
| ChenYi015 | | <https://github.com/ChenYi015> |
70 changes: 70 additions & 0 deletions charts/kubeflow-trainer/README.md.gotmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
{{ template "chart.header" . }}

{{ template "chart.deprecationWarning" . }}

{{ template "chart.badgesSection" . }}

{{ template "chart.description" . }}

{{ template "chart.homepageLine" . }}

## Introduction

This chart bootstraps a [Kubeflow Trainer]({{template "chart.homepage" . }}) deployment using the [Helm](https://helm.sh) package manager.

## Prerequisites

- Helm >= 3
- Kubernetes >= 1.20

## Usage

### Add Helm Repo

```bash
helm repo add kubeflow-trainer https://kubeflow.github.io/trainer

helm repo update
```

See [helm repo](https://helm.sh/docs/helm/helm_repo) for command documentation.

### Install the chart

```bash
helm install [RELEASE_NAME] kubeflow-trainer/kubeflow-trainer
```

For example, if you want to create a release with name `kubeflow-trainer` in the `kubeflow-system` namespace:

```shell
helm install kubeflow-trainer kubeflow-trainer/kubeflow-trainer \
--namespace kubeflow-system \
--create-namespace
```

Note that by passing the `--create-namespace` flag to the `helm install` command, `helm` will create the release namespace if it does not exist.

See [helm install](https://helm.sh/docs/helm/helm_install) for command documentation.

### Upgrade the chart

```shell
helm upgrade [RELEASE_NAME] kubeflow-trainer/kubeflow-trainer [flags]
```

See [helm upgrade](https://helm.sh/docs/helm/helm_upgrade) for command documentation.

### Uninstall the chart

```shell
helm uninstall [RELEASE_NAME]
```

This removes all the Kubernetes resources associated with the chart and deletes the release, except for the CRDs, which have to be removed manually.

See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall) for command documentation.

{{ template "chart.valuesSection" . }}

{{ template "chart.maintainersSection" . }}
Loading
Loading