-
Notifications
You must be signed in to change notification settings - Fork 733
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Yi Chen <[email protected]>
- Loading branch information
Showing
54 changed files
with
25,241 additions
and
276 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,3 +23,6 @@ hack/python-sdk/openapi-generator-cli.jar | |
|
||
# Coverage | ||
cover.out | ||
|
||
# Helm | ||
charts/kubeflow-trainer/charts/jobset |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,27 @@ endif | |
SHELL = /usr/bin/env bash -o pipefail | ||
.SHELLFLAGS = -ec | ||
|
||
PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST)))) | ||
REPO := github.com/kubeflow/trainer | ||
TRAINER_CHART_DIR := $(PROJECT_DIR)/charts/kubeflow-trainer | ||
|
||
# Location to install tool binaries | ||
LOCALBIN ?= $(PROJECT_DIR)/bin | ||
|
||
# Tool versions | ||
CONTROLLER_GEN_VERSION ?= v0.17.2 | ||
ENVTEST_VERSION ?= release-0.20 | ||
ENVTEST_K8S_VERSION ?= 1.31 | ||
HELM_VERSION ?= v3.15.3 | ||
HELM_UNITTEST_VERSION ?= 0.5.1 | ||
HELM_DOCS_VERSION ?= v1.14.2 | ||
|
||
# Tool binaries | ||
CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen | ||
ENVTEST ?= $(LOCALBIN)/setup-envtest | ||
HELM ?= $(LOCALBIN)/helm | ||
HELM_DOCS ?= $(LOCALBIN)/helm-docs | ||
|
||
##@ General | ||
|
||
# The help target prints out all targets with their descriptions organized | ||
|
@@ -29,24 +50,6 @@ help: ## Display this help. | |
|
||
##@ Development | ||
|
||
PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST)))) | ||
|
||
# Tool Binaries | ||
LOCALBIN ?= $(PROJECT_DIR)/bin | ||
CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen | ||
ENVTEST ?= $(LOCALBIN)/setup-envtest | ||
|
||
ENVTEST_K8S_VERSION ?= 1.31 | ||
|
||
# Instructions to download tools for development. | ||
.PHONY: envtest | ||
envtest: ## Download the setup-envtest binary if required. | ||
GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-runtime/tools/[email protected] | ||
|
||
.PHONY: controller-gen | ||
controller-gen: ## Download the controller-gen binary if required. | ||
GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/[email protected] | ||
|
||
# Download external CRDs for Go integration testing. | ||
EXTERNAL_CRDS_DIR ?= $(PROJECT_DIR)/manifests/external-crds | ||
|
||
|
@@ -65,11 +68,14 @@ scheduler-plugins-crd: ## Copy the CRDs from the Scheduler Plugins repository to | |
# Instructions for code generation. | ||
.PHONY: manifests | ||
manifests: controller-gen ## Generate manifests. | ||
# Skip outputting the RBAC and webhook manifests as we will sync them from the manifests templated by the Helm chart. | ||
$(CONTROLLER_GEN) "crd:generateEmbeddedObjectMeta=true" rbac:roleName=kubeflow-trainer-controller-manager webhook \ | ||
paths="./pkg/apis/trainer/v1alpha1/...;./pkg/controller/...;./pkg/runtime/...;./pkg/webhooks/...;./pkg/util/cert/..." \ | ||
output:crd:artifacts:config=manifests/base/crds \ | ||
output:rbac:artifacts:config=manifests/base/rbac \ | ||
output:webhook:artifacts:config=manifests/base/webhook | ||
output:rbac:none \ | ||
output:webhook:artifacts:config=manifests/base/webhook \ | ||
output:webhook:none | ||
|
||
.PHONY: generate | ||
generate: go-mod-download manifests ## Generate APIs and SDK. | ||
|
@@ -124,3 +130,49 @@ test-python-integration: ## Run Python integration test. | |
pip install -r ./cmd/initializer/dataset/requirements.txt | ||
|
||
pytest ./test/integration/initializer | ||
|
||
##@ Helm | ||
|
||
.PHONY: sync-manifests | ||
sync-manifests: ## Sync Kustomize manifests from manifests templated from Helm chart. | ||
$(HELM) dependency update $(TRAINER_CHART_DIR) | ||
hack/sync-manifests.sh | ||
|
||
.PHONY: helm-unittest | ||
helm-unittest: helm-unittest-plugin ## Run Helm chart unittests. | ||
$(HELM) unittest $(TRAINER_CHART_DIR) --strict --file "tests/**/*_test.yaml" | ||
|
||
.PHONY: helm-lint | ||
helm-lint: ## Run Helm chart lint test. | ||
docker run --rm --workdir /workspace --volume "$$(pwd):/workspace" quay.io/helmpack/chart-testing:latest ct lint --target-branch master --validate-maintainers=false | ||
|
||
.PHONY: helm-docs | ||
helm-docs: helm-docs-plugin ## Generates markdown documentation for helm charts from requirements and values files. | ||
$(HELM_DOCS) --sort-values-order=file | ||
|
||
##@ Dependencies | ||
|
||
.PHONY: envtest | ||
envtest: ## Download the setup-envtest binary if necessary. | ||
GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-runtime/tools/setup-envtest@$(ENVTEST_VERSION) | ||
|
||
.PHONY: controller-gen | ||
controller-gen: ## Download the controller-gen binary if necessary. | ||
GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_GEN_VERSION) | ||
|
||
.PHONY: helm | ||
helm: $(HELM) ## Download helm locally if necessary. | ||
$(HELM): $(LOCALBIN) | ||
GOBIN=$(LOCALBIN) go install helm.sh/helm/v3/cmd/helm@$(HELM_VERSION) | ||
|
||
.PHONY: helm-unittest-plugin | ||
helm-unittest-plugin: helm ## Download helm unittest plugin locally if necessary. | ||
if [ -z "$(shell $(HELM) plugin list | grep unittest)" ]; then \ | ||
echo "Installing helm unittest plugin"; \ | ||
$(HELM) plugin install https://github.com/helm-unittest/helm-unittest.git --version $(HELM_UNITTEST_VERSION); \ | ||
fi | ||
|
||
.PHONY: helm-docs-plugin | ||
helm-docs-plugin: $(HELM_DOCS) ## Download helm-docs plugin locally if necessary. | ||
$(HELM_DOCS): $(LOCALBIN) | ||
GOBIN=$(LOCALBIN) go install github.com/norwoodj/helm-docs/cmd/helm-docs@$(HELM_DOCS_VERSION) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
# Patterns to ignore when building packages. | ||
# This supports shell glob matching, relative path matching, and | ||
# negation (prefixed with !). Only one pattern per line. | ||
|
||
.helmignore | ||
ci/ | ||
|
||
# helm-unittest | ||
.debug | ||
tests/ | ||
__snapshot__/ | ||
|
||
# helm-docs | ||
README.md.gotmpl | ||
|
||
# Common VCS dirs | ||
.git/ | ||
.gitignore | ||
.bzr/ | ||
.bzrignore | ||
.hg/ | ||
.hgignore | ||
.svn/ | ||
|
||
# Common backup files | ||
*.swp | ||
*.bak | ||
*.tmp | ||
*.orig | ||
*~ | ||
|
||
# Various IDEs | ||
*.tmproj | ||
.project | ||
.idea/ | ||
.vscode/ | ||
|
||
# MacOS | ||
.DS_Store |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# | ||
# Copyright 2024 The Kubeflow authors. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
apiVersion: v2 | ||
|
||
name: kubeflow-trainer | ||
|
||
description: A Helm chart for deploying Kubeflow trainer on Kubernetes. | ||
|
||
version: 2.0.0 | ||
|
||
appVersion: 2.0.0 | ||
|
||
type: application | ||
|
||
dependencies: | ||
- name: jobset | ||
repository: oci://us-central1-docker.pkg.dev/k8s-staging-images/charts | ||
version: 0.8.0 | ||
condition: jobset.install | ||
|
||
keywords: | ||
- kubeflow trainer | ||
|
||
home: https://github.com/kubeflow/trainer | ||
|
||
maintainers: | ||
- name: ChenYi015 | ||
url: https://github.com/ChenYi015 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
# kubeflow-trainer | ||
|
||
   | ||
|
||
A Helm chart for deploying Kubeflow trainer on Kubernetes. | ||
|
||
**Homepage:** <https://github.com/kubeflow/trainer> | ||
|
||
## Introduction | ||
|
||
This chart bootstraps a [Kubeflow Trainer](https://github.com/kubeflow/trainer) deployment using the [Helm](https://helm.sh) package manager. | ||
|
||
## Prerequisites | ||
|
||
- Helm >= 3 | ||
- Kubernetes >= 1.20 | ||
|
||
## Usage | ||
|
||
### Add Helm Repo | ||
|
||
```bash | ||
helm repo add kubeflow-trainer https://kubeflow.github.io/trainer | ||
|
||
helm repo update | ||
``` | ||
|
||
See [helm repo](https://helm.sh/docs/helm/helm_repo) for command documentation. | ||
|
||
### Install the chart | ||
|
||
```bash | ||
helm install [RELEASE_NAME] kubeflow-trainer/kubeflow-trainer | ||
``` | ||
|
||
For example, if you want to create a release with name `kubeflow-trainer` in the `kubeflow-system` namespace: | ||
|
||
```shell | ||
helm install kubeflow-trainer kubeflow-trainer/kubeflow-trainer \ | ||
--namespace kubeflow-system \ | ||
--create-namespace | ||
``` | ||
|
||
Note that by passing the `--create-namespace` flag to the `helm install` command, `helm` will create the release namespace if it does not exist. | ||
|
||
See [helm install](https://helm.sh/docs/helm/helm_install) for command documentation. | ||
|
||
### Upgrade the chart | ||
|
||
```shell | ||
helm upgrade [RELEASE_NAME] kubeflow-trainer/kubeflow-trainer [flags] | ||
``` | ||
|
||
See [helm upgrade](https://helm.sh/docs/helm/helm_upgrade) for command documentation. | ||
|
||
### Uninstall the chart | ||
|
||
```shell | ||
helm uninstall [RELEASE_NAME] | ||
``` | ||
|
||
This removes all the Kubernetes resources associated with the chart and deletes the release, except for the `crds`, which will have to be removed manually. | ||
|
||
See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall) for command documentation. | ||
|
||
## Values | ||
|
||
| Key | Type | Default | Description | | ||
|-----|------|---------|-------------| | ||
| nameOverride | string | `""` | String to partially override release name. | | ||
| fullnameOverride | string | `""` | String to fully override release name. | | ||
| jobset.install | bool | `true` | Whether to install jobset as a dependency managed by trainer. This must be set to `false` if jobset controller/webhook has already been installed into the cluster. | | ||
| commonLabels | object | `{}` | Common labels to add to the resources. | | ||
| image.registry | string | `"docker.io"` | Image registry. | | ||
| image.repository | string | `"kubeflow/trainer-controller-manager"` | Image repository. | | ||
| image.tag | string | If not set, the chart appVersion will be used. | Image tag. | | ||
| image.pullPolicy | string | `"IfNotPresent"` | Image pull policy. | | ||
| image.pullSecrets | list | `[]` | Image pull secrets for private image registry. | | ||
| controller.replicas | int | `1` | Number of replicas of controller. | | ||
| controller.labels | object | `{}` | Extra labels for controller pods. | | ||
| controller.annotations | object | `{}` | Extra annotations for controller pods. | | ||
| controller.volumes | list | `[]` | Volumes for controller pods. | | ||
| controller.nodeSelector | object | `{}` | Node selector for controller pods. | | ||
| controller.affinity | object | `{}` | Affinity for controller pods. | | ||
| controller.tolerations | list | `[]` | List of node taints to tolerate for controller pods. | | ||
| controller.env | list | `[]` | Environment variables for controller containers. | | ||
| controller.envFrom | list | `[]` | Environment variable sources for controller containers. | | ||
| controller.volumeMounts | list | `[]` | Volume mounts for controller containers. | | ||
| controller.resources | object | `{}` | Pod resource requests and limits for controller containers. | | ||
| controller.securityContext | object | `{}` | Security context for controller containers. | | ||
| controller.serviceAccount.create | bool | `true` | Specifies whether to create a service account for the controller. | | ||
| controller.serviceAccount.name | string | `""` | Optional name for the controller service account. | | ||
| controller.serviceAccount.annotations | object | `{}` | Extra annotations for the controller service account. | | ||
| controller.serviceAccount.automountServiceAccountToken | bool | `true` | Auto-mount service account token to the controller pods. | | ||
| webhook.enable | bool | `true` | Specifies whether to enable webhook. | | ||
| webhook.failurePolicy | string | `"Fail"` | Specifies how unrecognized errors are handled. Available options are `Ignore` or `Fail`. | | ||
| runtime.preTraining.torchDistributed.enable | bool | `true` | | | ||
| runtime.preTraining.torchDistributed.image.registry | string | `"docker.io"` | | | ||
| runtime.preTraining.torchDistributed.image.repository | string | `"pytorch/pytorch"` | | | ||
| runtime.preTraining.torchDistributed.image.tag | string | `"2.5.0-cuda12.4-cudnn9-runtime"` | | | ||
|
||
## Maintainers | ||
|
||
| Name | Email | Url | | ||
| ---- | ------ | --- | | ||
| ChenYi015 | | <https://github.com/ChenYi015> | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
{{ template "chart.header" . }} | ||
|
||
{{ template "chart.deprecationWarning" . }} | ||
|
||
{{ template "chart.badgesSection" . }} | ||
|
||
{{ template "chart.description" . }} | ||
|
||
{{ template "chart.homepageLine" . }} | ||
|
||
## Introduction | ||
|
||
This chart bootstraps a [Kubeflow Trainer]({{template "chart.homepage" . }}) deployment using the [Helm](https://helm.sh) package manager. | ||
|
||
## Prerequisites | ||
|
||
- Helm >= 3 | ||
- Kubernetes >= 1.20 | ||
|
||
## Usage | ||
|
||
### Add Helm Repo | ||
|
||
```bash | ||
helm repo add kubeflow-trainer https://kubeflow.github.io/trainer | ||
|
||
helm repo update | ||
``` | ||
|
||
See [helm repo](https://helm.sh/docs/helm/helm_repo) for command documentation. | ||
|
||
### Install the chart | ||
|
||
```bash | ||
helm install [RELEASE_NAME] kubeflow-trainer/kubeflow-trainer | ||
``` | ||
|
||
For example, if you want to create a release with name `kubeflow-trainer` in the `kubeflow-system` namespace: | ||
|
||
```shell | ||
helm install kubeflow-trainer kubeflow-trainer/kubeflow-trainer \ | ||
--namespace kubeflow-system \ | ||
--create-namespace | ||
``` | ||
|
||
Note that by passing the `--create-namespace` flag to the `helm install` command, `helm` will create the release namespace if it does not exist. | ||
|
||
See [helm install](https://helm.sh/docs/helm/helm_install) for command documentation. | ||
|
||
### Upgrade the chart | ||
|
||
```shell | ||
helm upgrade [RELEASE_NAME] kubeflow-trainer/kubeflow-trainer [flags] | ||
``` | ||
|
||
See [helm upgrade](https://helm.sh/docs/helm/helm_upgrade) for command documentation. | ||
|
||
### Uninstall the chart | ||
|
||
```shell | ||
helm uninstall [RELEASE_NAME] | ||
``` | ||
|
||
This removes all the Kubernetes resources associated with the chart and deletes the release, except for the `crds`, which will have to be removed manually. | ||
|
||
See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall) for command documentation. | ||
|
||
{{ template "chart.valuesSection" . }} | ||
|
||
{{ template "chart.maintainersSection" . }} |
Oops, something went wrong.