diff --git a/.github/actions/install-deps/action.yaml b/.github/actions/install-deps/action.yaml index aba23bb8f9..d69a17375b 100644 --- a/.github/actions/install-deps/action.yaml +++ b/.github/actions/install-deps/action.yaml @@ -16,7 +16,7 @@ runs: # Root path permission workaround for caching https://github.com/actions/cache/issues/845#issuecomment-1252594999 - run: sudo chown "$USER" /usr/local shell: bash - - uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0 + - uses: actions/cache@0c907a75c2c80ebcb7f088228285e798b750cf8f # v4.2.1 id: cache-toolchain with: path: | diff --git a/.github/workflows/semantic-pr.yaml b/.github/workflows/semantic-pr.yaml new file mode 100644 index 0000000000..c2315b7c5d --- /dev/null +++ b/.github/workflows/semantic-pr.yaml @@ -0,0 +1,21 @@ +name: "Semantic Pull Request" + +on: + pull_request_target: + types: + - opened + - edited + - synchronize + - reopened + +permissions: + pull-requests: read + +jobs: + main: + name: Validate a PR title is following the conventional commits + runs-on: ubuntu-latest + steps: + - uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3, to avoid Unpinned tag for 3rd party Action in workflow from CodeQL + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/Makefile b/Makefile index 52b0ad2be5..3029f80f63 100644 --- a/Makefile +++ b/Makefile @@ -22,14 +22,14 @@ uninstall-kwok: ## Uninstall kwok provider build-with-kind: # build with kind assumes the image will be uploaded directly onto the kind control plane, without an image repository $(eval CONTROLLER_IMG=$(shell $(WITH_GOFLAGS) KO_DOCKER_REPO="$(KWOK_REPO)" ko build sigs.k8s.io/karpenter/kwok)) $(eval IMG_REPOSITORY=$(shell echo $(CONTROLLER_IMG) | cut -d ":" -f 1)) - $(eval IMG_TAG=latest) + $(eval IMG_TAG=latest) build: ## Build the Karpenter KWOK controller images using ko build $(eval CONTROLLER_IMG=$(shell $(WITH_GOFLAGS) KO_DOCKER_REPO="$(KWOK_REPO)" ko build -B sigs.k8s.io/karpenter/kwok)) $(eval IMG_REPOSITORY=$(shell echo $(CONTROLLER_IMG) | cut -d "@" -f 1 | cut -d ":" -f 1)) $(eval IMG_TAG=$(shell echo $(CONTROLLER_IMG) | cut -d "@" -f 1 | cut -d ":" -f 2 -s)) $(eval IMG_DIGEST=$(shell echo $(CONTROLLER_IMG) | cut -d "@" -f 2)) - + apply-with-kind: verify build-with-kind ## Deploy the kwok controller from the current state of your git repository into your ~/.kube/config cluster kubectl apply -f kwok/charts/crds helm upgrade --install karpenter kwok/charts --namespace $(KARPENTER_NAMESPACE) --skip-crds \ @@ -38,7 +38,7 @@ apply-with-kind: verify build-with-kind ## Deploy the kwok controller from the c --set controller.image.tag=$(IMG_TAG) \ --set serviceMonitor.enabled=true \ --set-string controller.env[0].name=ENABLE_PROFILING \ - --set-string controller.env[0].value=true + --set-string controller.env[0].value=true e2etests: ## Run the e2e suite against your local cluster cd test && go test \ diff --git a/README.md b/README.md index 2a8ba5e2f1..919ce74b02 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ Karpenter is a multi-cloud project with implementations by the following cloud p - [AlibabaCloud](https://github.com/cloudpilot-ai/karpenter-provider-alibabacloud) - [Cluster API](https://github.com/kubernetes-sigs/karpenter-provider-cluster-api) - [GCP](https://github.com/cloudpilot-ai/karpenter-provider-gcp) +- [Proxmox](https://github.com/sergelogvinov/karpenter-provider-proxmox) ## Community, discussion, contribution, and support diff --git a/go.mod b/go.mod index 
3509b5330b..4fe064423f 100644 --- a/go.mod +++ b/go.mod @@ -6,31 +6,31 @@ require ( github.com/Pallinder/go-randomdata v1.2.0 github.com/avast/retry-go v3.0.0+incompatible github.com/awslabs/operatorpkg v0.0.0-20241205163410-0fff9f28d115 - github.com/docker/docker v27.5.1+incompatible + github.com/docker/docker v28.0.0+incompatible github.com/go-logr/logr v1.4.2 github.com/imdario/mergo v0.3.16 - github.com/klauspost/compress v1.17.9 // indirect + github.com/klauspost/compress v1.17.11 // indirect github.com/mitchellh/hashstructure/v2 v2.0.2 github.com/onsi/ginkgo/v2 v2.22.2 github.com/onsi/gomega v1.36.2 github.com/patrickmn/go-cache v2.1.0+incompatible - github.com/prometheus/client_golang v1.20.5 + github.com/prometheus/client_golang v1.21.0 github.com/prometheus/client_model v0.6.1 github.com/samber/lo v1.49.1 go.uber.org/multierr v1.11.0 go.uber.org/zap v1.27.0 golang.org/x/text v0.22.0 golang.org/x/time v0.10.0 - k8s.io/api v0.32.1 - k8s.io/apiextensions-apiserver v0.32.1 - k8s.io/apimachinery v0.32.1 - k8s.io/client-go v0.32.1 - k8s.io/cloud-provider v0.32.1 - k8s.io/component-base v0.32.1 - k8s.io/csi-translation-lib v0.32.1 + k8s.io/api v0.32.2 + k8s.io/apiextensions-apiserver v0.32.2 + k8s.io/apimachinery v0.32.2 + k8s.io/client-go v0.32.2 + k8s.io/cloud-provider v0.32.2 + k8s.io/component-base v0.32.2 + k8s.io/csi-translation-lib v0.32.2 k8s.io/klog/v2 v2.130.1 k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 - sigs.k8s.io/controller-runtime v0.20.1 + sigs.k8s.io/controller-runtime v0.20.2 ) require ( @@ -38,7 +38,7 @@ require ( github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/emicklei/go-restful/v3 v3.11.0 // indirect - github.com/evanphx/json-patch/v5 v5.9.0 // indirect + github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/fxamacker/cbor/v2 v2.7.0 // indirect github.com/go-logr/zapr v1.3.0 github.com/go-openapi/jsonpointer v0.21.0 // indirect @@ -60,31 +60,32 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pkg/errors v0.9.1 // indirect - github.com/prometheus/common v0.55.0 // indirect + github.com/prometheus/common v0.62.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect github.com/robfig/cron/v3 v3.0.1 github.com/spf13/cobra v1.8.1 // indirect - github.com/spf13/pflag v1.0.5 // indirect + github.com/spf13/pflag v1.0.6 // indirect github.com/x448/float16 v0.8.4 // indirect - golang.org/x/net v0.33.0 // indirect - golang.org/x/oauth2 v0.23.0 // indirect - golang.org/x/sys v0.28.0 // indirect - golang.org/x/term v0.27.0 // indirect - golang.org/x/tools v0.28.0 // indirect + golang.org/x/net v0.35.0 // indirect + golang.org/x/oauth2 v0.24.0 // indirect + golang.org/x/sys v0.30.0 // indirect + golang.org/x/term v0.29.0 // indirect + golang.org/x/tools v0.30.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect - google.golang.org/protobuf v1.36.1 // indirect + google.golang.org/protobuf v1.36.4 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect sigs.k8s.io/structured-merge-diff/v4 v4.4.2 // indirect - sigs.k8s.io/yaml v1.4.0 // indirect + sigs.k8s.io/yaml v1.4.0 ) require ( github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/google/btree v1.1.3 // 
indirect + github.com/rogpeppe/go-internal v1.13.1 // indirect golang.org/x/sync v0.11.0 // indirect ) diff --git a/go.sum b/go.sum index d0ffcc422c..cc0e5410d0 100644 --- a/go.sum +++ b/go.sum @@ -14,14 +14,14 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/docker/docker v27.5.1+incompatible h1:4PYU5dnBYqRQi0294d1FBECqT9ECWeQAIfE8q4YnPY8= -github.com/docker/docker v27.5.1+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/docker v28.0.0+incompatible h1:Olh0KS820sJ7nPsBKChVhk5pzqcwDR15fumfAd/p9hM= +github.com/docker/docker v28.0.0+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/evanphx/json-patch v5.6.0+incompatible h1:jBYDEEiFBPxA0v50tFdvOzQQTCvpL6mnFh5mB2/l16U= github.com/evanphx/json-patch v5.6.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= -github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg= -github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ= +github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= +github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= @@ -68,8 +68,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= -github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= +github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= +github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= @@ -101,25 +101,26 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= 
-github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= +github.com/prometheus/client_golang v1.21.0 h1:DIsaGmiaBkSangBgMtWdNfxbMNdku5IK6iNhrEqWvdA= +github.com/prometheus/client_golang v1.21.0/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg= github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= -github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= -github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= +github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= +github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= -github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= -github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= +github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= +github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/samber/lo v1.49.1 h1:4BIFyVfuQSEpluc7Fua+j1NolZHiEHEpaSEKdsH0tew= github.com/samber/lo v1.49.1/go.mod h1:dO6KHFzUKXgP8LDhU0oI8d2hekjXnGOu0DB8Jecxd6o= github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= +github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= @@ -148,10 +149,10 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= -golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= -golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs= -golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8= +golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk= +golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE= 
+golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -160,10 +161,10 @@ golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= -golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q= -golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= +golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= +golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU= +golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= @@ -174,16 +175,16 @@ golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGm golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.28.0 h1:WuB6qZ4RPCQo5aP3WdKZS7i595EdWqWR8vqJTlwTVK8= -golang.org/x/tools v0.28.0/go.mod h1:dcIOrVd3mfQKTgrDVQHqCPMWy6lnhfhtX3hLXYVLfRw= +golang.org/x/tools v0.30.0 h1:BgcpHewrV5AUp2G9MebG4XPFI1E2W41zU1SaqVA9vJY= +golang.org/x/tools v0.30.0/go.mod h1:c347cR/OJfw5TI+GfX7RUPNMdDRRbjvYTS0jPyvsVtY= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/protobuf v1.36.1 h1:yBPeRvTftaleIgM3PZ/WBIZ7XM/eEYAaEyCwvyjq/gk= -google.golang.org/protobuf v1.36.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +google.golang.org/protobuf v1.36.4 h1:6A3ZDJHn/eNqc1i+IdefRzy/9PokBTPvcqMySR7NNIM= +google.golang.org/protobuf v1.36.4/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 
v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= @@ -194,28 +195,28 @@ gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.32.1 h1:f562zw9cy+GvXzXf0CKlVQ7yHJVYzLfL6JAS4kOAaOc= -k8s.io/api v0.32.1/go.mod h1:/Yi/BqkuueW1BgpoePYBRdDYfjPF5sgTr5+YqDZra5k= -k8s.io/apiextensions-apiserver v0.32.1 h1:hjkALhRUeCariC8DiVmb5jj0VjIc1N0DREP32+6UXZw= -k8s.io/apiextensions-apiserver v0.32.1/go.mod h1:sxWIGuGiYov7Io1fAS2X06NjMIk5CbRHc2StSmbaQto= -k8s.io/apimachinery v0.32.1 h1:683ENpaCBjma4CYqsmZyhEzrGz6cjn1MY/X2jB2hkZs= -k8s.io/apimachinery v0.32.1/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= -k8s.io/client-go v0.32.1 h1:otM0AxdhdBIaQh7l1Q0jQpmo7WOFIk5FFa4bg6YMdUU= -k8s.io/client-go v0.32.1/go.mod h1:aTTKZY7MdxUaJ/KiUs8D+GssR9zJZi77ZqtzcGXIiDg= -k8s.io/cloud-provider v0.32.1 h1:74rRhnfca3o4CsjjnIp/C3ARVuSmyNsxgWPtH0yc9Z0= -k8s.io/cloud-provider v0.32.1/go.mod h1:GECSanFT+EeZ/ToX3xlasjETzMUI+VFu92zHUDUsGHw= -k8s.io/component-base v0.32.1 h1:/5IfJ0dHIKBWysGV0yKTFfacZ5yNV1sulPh3ilJjRZk= -k8s.io/component-base v0.32.1/go.mod h1:j1iMMHi/sqAHeG5z+O9BFNCF698a1u0186zkjMZQ28w= -k8s.io/csi-translation-lib v0.32.1 h1:qqlB+eKiIdUM+GGZfJN/4FMNeuIPIELLxfWfv/LWUYk= -k8s.io/csi-translation-lib v0.32.1/go.mod h1:dc7zXqpUW4FykfAe6TqU32tYewsGhrjI63ZwJWQng3k= +k8s.io/api v0.32.2 h1:bZrMLEkgizC24G9eViHGOPbW+aRo9duEISRIJKfdJuw= +k8s.io/api v0.32.2/go.mod h1:hKlhk4x1sJyYnHENsrdCWw31FEmCijNGPJO5WzHiJ6Y= +k8s.io/apiextensions-apiserver v0.32.2 h1:2YMk285jWMk2188V2AERy5yDwBYrjgWYggscghPCvV4= +k8s.io/apiextensions-apiserver v0.32.2/go.mod h1:GPwf8sph7YlJT3H6aKUWtd0E+oyShk/YHWQHf/OOgCA= +k8s.io/apimachinery v0.32.2 h1:yoQBR9ZGkA6Rgmhbp/yuT9/g+4lxtsGYwW6dR6BDPLQ= +k8s.io/apimachinery v0.32.2/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= +k8s.io/client-go v0.32.2 h1:4dYCD4Nz+9RApM2b/3BtVvBHw54QjMFUl1OLcJG5yOA= +k8s.io/client-go v0.32.2/go.mod h1:fpZ4oJXclZ3r2nDOv+Ux3XcJutfrwjKTCHz2H3sww94= +k8s.io/cloud-provider v0.32.2 h1:8EC+fCYo0r0REczSjOZcVuQPCMxXxCKlgxDbYMrzC30= +k8s.io/cloud-provider v0.32.2/go.mod h1:2s8TeAXhVezp5VISaTxM6vW3yDonOZXoN4Aryz1p1PQ= +k8s.io/component-base v0.32.2 h1:1aUL5Vdmu7qNo4ZsE+569PV5zFatM9hl+lb3dEea2zU= +k8s.io/component-base v0.32.2/go.mod h1:PXJ61Vx9Lg+P5mS8TLd7bCIr+eMJRQTyXe8KvkrvJq0= +k8s.io/csi-translation-lib v0.32.2 h1:aLzAyaoJUc5rgtLi8Xd4No1tet6UpvUsGIgRoGnPSSE= +k8s.io/csi-translation-lib v0.32.2/go.mod h1:PlOKan6Vc0G6a+giQbm36plJ+E1LH+GPRLAVMQMSMcY= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f h1:GA7//TjRY9yWGy1poLzYYJJ4JRdzg3+O6e8I+e+8T5Y= k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f/go.mod h1:R/HEjbvWI0qdfb8viZUeVZm0X6IZnxAydC7YU42CMw4= k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro= k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= -sigs.k8s.io/controller-runtime v0.20.1 h1:JbGMAG/X94NeM3xvjenVUaBjy6Ui4Ogd/J5ZtjZnHaE= -sigs.k8s.io/controller-runtime v0.20.1/go.mod 
h1:BrP3w158MwvB3ZbNpaAcIKkHQ7YGpYnzpoSTZ8E14WU= +sigs.k8s.io/controller-runtime v0.20.2 h1:/439OZVxoEc02psi1h4QO3bHzTgu49bb347Xp4gW1pc= +sigs.k8s.io/controller-runtime v0.20.2/go.mod h1:xg2XB0K5ShQzAgsoujxuKN4LNXR2LfwwHsPj7Iaw+XY= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= sigs.k8s.io/structured-merge-diff/v4 v4.4.2 h1:MdmvkGuXi/8io6ixD5wud3vOLwc1rj0aNqRlpuvjmwA= diff --git a/hack/toolchain.sh b/hack/toolchain.sh index b33f3b09d9..85641d4a08 100755 --- a/hack/toolchain.sh +++ b/hack/toolchain.sh @@ -2,7 +2,7 @@ set -euo pipefail K8S_VERSION="${K8S_VERSION:="1.32.x"}" -KUBEBUILDER_ASSETS="/usr/local/kubebuilder/bin" +KUBEBUILDER_ASSETS="${KUBEBUILDER_ASSETS:=/usr/local/kubebuilder/bin}" main() { tools diff --git a/kwok/cloudprovider/cloudprovider.go b/kwok/cloudprovider/cloudprovider.go index 75f2b816fc..4b6c2f9708 100644 --- a/kwok/cloudprovider/cloudprovider.go +++ b/kwok/cloudprovider/cloudprovider.go @@ -70,7 +70,7 @@ func (c CloudProvider) Delete(ctx context.Context, nodeClaim *v1.NodeClaim) erro } return fmt.Errorf("deleting node, %w", err) } - return nil + return cloudprovider.NewNodeClaimNotFoundError(fmt.Errorf("instance terminated")) } func (c CloudProvider) Get(ctx context.Context, providerID string) (*v1.NodeClaim, error) { diff --git a/pkg/cloudprovider/fake/cloudprovider.go b/pkg/cloudprovider/fake/cloudprovider.go index 946a56ac3f..7062c93b97 100644 --- a/pkg/cloudprovider/fake/cloudprovider.go +++ b/pkg/cloudprovider/fake/cloudprovider.go @@ -87,7 +87,7 @@ func (c *CloudProvider) Reset() { c.NextGetErr = nil c.DeleteCalls = []*v1.NodeClaim{} c.GetCalls = nil - c.Drifted = "drifted" + c.Drifted = "" c.NodeClassGroupVersionKind = []schema.GroupVersionKind{ { Group: "", diff --git a/pkg/controllers/disruption/controller.go b/pkg/controllers/disruption/controller.go index 0d94b49a23..351476ffce 100644 --- a/pkg/controllers/disruption/controller.go +++ b/pkg/controllers/disruption/controller.go @@ -202,7 +202,7 @@ func (c *Controller) disrupt(ctx context.Context, disruption Method) (bool, erro // 3. Add Command to orchestration.Queue to wait to delete the candiates. func (c *Controller) executeCommand(ctx context.Context, m Method, cmd Command, schedulingResults scheduling.Results) error { commandID := uuid.NewUUID() - log.FromContext(ctx).WithValues("command-id", commandID, "reason", strings.ToLower(string(m.Reason()))).Info(fmt.Sprintf("disrupting nodeclaim(s) via %s", cmd)) + log.FromContext(ctx).WithValues(append([]any{"command-id", string(commandID), "reason", strings.ToLower(string(m.Reason()))}, cmd.LogValues()...)...).Info("disrupting node(s)") // Cordon the old nodes before we launch the replacements to prevent new pods from scheduling to the old nodes if err := c.MarkDisrupted(ctx, m, cmd.candidates...); err != nil { diff --git a/pkg/controllers/disruption/emptiness.go b/pkg/controllers/disruption/emptiness.go index b5994ca96e..4fd248ce1c 100644 --- a/pkg/controllers/disruption/emptiness.go +++ b/pkg/controllers/disruption/emptiness.go @@ -104,7 +104,7 @@ func (e *Emptiness) ComputeCommand(ctx context.Context, disruptionBudgetMapping validatedCandidates, err := v.ValidateCandidates(ctx, cmd.candidates...) 
if err != nil { if IsValidationError(err) { - log.FromContext(ctx).V(1).Info(fmt.Sprintf("abandoning empty node consolidation attempt due to pod churn, command is no longer valid, %s", cmd)) + log.FromContext(ctx).V(1).WithValues(cmd.LogValues()...).Info("abandoning empty node consolidation attempt due to pod churn, command is no longer valid") return Command{}, scheduling.Results{}, nil } return Command{}, scheduling.Results{}, err @@ -114,7 +114,7 @@ if lo.ContainsBy(validatedCandidates, func(c *Candidate) bool { return len(c.reschedulablePods) != 0 }) { - log.FromContext(ctx).V(1).Info(fmt.Sprintf("abandoning empty node consolidation attempt due to pod churn, command is no longer valid, %s", cmd)) + log.FromContext(ctx).V(1).WithValues(cmd.LogValues()...).Info("abandoning empty node consolidation attempt due to pod churn, command is no longer valid") return Command{}, scheduling.Results{}, nil } diff --git a/pkg/controllers/disruption/multinodeconsolidation.go b/pkg/controllers/disruption/multinodeconsolidation.go index d89bba91eb..320c78c9ff 100644 --- a/pkg/controllers/disruption/multinodeconsolidation.go +++ b/pkg/controllers/disruption/multinodeconsolidation.go @@ -98,7 +98,7 @@ func (m *MultiNodeConsolidation) ComputeCommand(ctx context.Context, disruptionB if err := NewValidation(m.clock, m.cluster, m.kubeClient, m.provisioner, m.cloudProvider, m.recorder, m.queue, m.Reason()).IsValid(ctx, cmd, consolidationTTL); err != nil { if IsValidationError(err) { - log.FromContext(ctx).V(1).Info(fmt.Sprintf("abandoning multi-node consolidation attempt due to pod churn, command is no longer valid, %s", cmd)) + log.FromContext(ctx).V(1).WithValues(cmd.LogValues()...).Info("abandoning multi-node consolidation attempt due to pod churn, command is no longer valid") return Command{}, scheduling.Results{}, nil } return Command{}, scheduling.Results{}, fmt.Errorf("validating consolidation, %w", err) @@ -129,9 +129,9 @@ func (m *MultiNodeConsolidation) firstNConsolidationOption(ctx context.Context, case <-timeoutCtx.Done(): ConsolidationTimeoutsTotal.Inc(map[string]string{consolidationTypeLabel: m.ConsolidationType()}) if lastSavedCommand.candidates == nil { - return Command{}, scheduling.Results{}, fmt.Errorf("multi-node consolidation timed out after %s without finding a valid command", MultiNodeConsolidationTimeoutDuration) + return Command{}, scheduling.Results{}, fmt.Errorf("multi-node consolidation timed out while considering %d nodes without finding a valid command", (min+max)/2) } - log.FromContext(ctx).V(1).Info(fmt.Sprintf("stopping multi-node consolidation after timeout, returning last valid command %s", lastSavedCommand)) + log.FromContext(ctx).V(1).WithValues(lastSavedCommand.LogValues()...).Info("stopping multi-node consolidation after timeout, returning last valid command") return lastSavedCommand, lastSavedResults, nil default: mid := (min + max) / 2 diff --git a/pkg/controllers/disruption/orchestration/queue.go b/pkg/controllers/disruption/orchestration/queue.go index 17e85c344a..7d49f1caad 100644 --- a/pkg/controllers/disruption/orchestration/queue.go +++ b/pkg/controllers/disruption/orchestration/queue.go @@ -30,6 +30,7 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/util/workqueue" + "k8s.io/klog/v2" "k8s.io/utils/clock" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -63,6
+64,29 @@ type Command struct { lastError error } +func (c *Command) LogValues() []any { + candidateNodes := lo.Map(c.candidates, func(candidate *state.StateNode, _ int) interface{} { + return map[string]interface{}{ + "Node": klog.KObj(candidate.Node), + "NodeClaim": klog.KObj(candidate.NodeClaim), + } + }) + replacementNodes := lo.Map(c.Replacements, func(replacement Replacement, _ int) interface{} { + return map[string]interface{}{ + "NodeClaim": klog.KRef("", replacement.name), + } + }) + return []any{ + "command-id", c.id, + "reason", c.reason, + "decision", c.Decision(), + "disrupted-node-count", len(candidateNodes), + "replacement-node-count", len(replacementNodes), + "disrupted-nodes", candidateNodes, + "replacement-nodes", replacementNodes, + } +} + // Replacement wraps a NodeClaim name with an initialized field to save on readiness checks and identify // when a NodeClaim is first initialized for metrics and events. type Replacement struct { @@ -175,7 +199,7 @@ func (q *Queue) Reconcile(ctx context.Context) (reconcile.Result, error) { if shutdown { panic("unexpected failure, disruption queue has shut down") } - ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("command-id", string(cmd.id))) + ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues(cmd.LogValues()...)) if err := q.waitOrTerminate(ctx, cmd); err != nil { // If recoverable, re-queue and try again. diff --git a/pkg/controllers/disruption/singlenodeconsolidation.go b/pkg/controllers/disruption/singlenodeconsolidation.go index 8de9f69146..ad0196e15e 100644 --- a/pkg/controllers/disruption/singlenodeconsolidation.go +++ b/pkg/controllers/disruption/singlenodeconsolidation.go @@ -84,7 +84,7 @@ func (s *SingleNodeConsolidation) ComputeCommand(ctx context.Context, disruption } if err := v.IsValid(ctx, cmd, consolidationTTL); err != nil { if IsValidationError(err) { - log.FromContext(ctx).V(1).Info(fmt.Sprintf("abandoning single-node consolidation attempt due to pod churn, command is no longer valid, %s", cmd)) + log.FromContext(ctx).V(1).WithValues(cmd.LogValues()...).Info("abandoning single-node consolidation attempt due to pod churn, command is no longer valid") return Command{}, scheduling.Results{}, nil } return Command{}, scheduling.Results{}, fmt.Errorf("validating consolidation, %w", err) diff --git a/pkg/controllers/disruption/types.go b/pkg/controllers/disruption/types.go index b958f777c8..7ef8059d0c 100644 --- a/pkg/controllers/disruption/types.go +++ b/pkg/controllers/disruption/types.go @@ -17,12 +17,12 @@ limitations under the License. 
package disruption import ( - "bytes" "context" "fmt" "github.com/samber/lo" corev1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" "k8s.io/utils/clock" "sigs.k8s.io/controller-runtime/pkg/client" @@ -140,46 +140,34 @@ func (c Command) Decision() Decision { } } -func (c Command) String() string { - var buf bytes.Buffer +func (c Command) LogValues() []any { podCount := lo.Reduce(c.candidates, func(_ int, cd *Candidate, _ int) int { return len(cd.reschedulablePods) }, 0) - fmt.Fprintf(&buf, "%s, terminating %d nodes (%d pods) ", c.Decision(), len(c.candidates), podCount) - for i, old := range c.candidates { - if i != 0 { - fmt.Fprint(&buf, ", ") + + candidateNodes := lo.Map(c.candidates, func(candidate *Candidate, _ int) interface{} { + return map[string]interface{}{ + "Node": klog.KObj(candidate.Node), + "NodeClaim": klog.KObj(candidate.NodeClaim), + "instance-type": candidate.Labels()[corev1.LabelInstanceTypeStable], + "capacity-type": candidate.Labels()[v1.CapacityTypeLabelKey], } - fmt.Fprintf(&buf, "%s", old.Name()) - fmt.Fprintf(&buf, "/%s", old.Labels()[corev1.LabelInstanceTypeStable]) - fmt.Fprintf(&buf, "/%s", old.Labels()[v1.CapacityTypeLabelKey]) - } - if len(c.replacements) == 0 { - return buf.String() - } - odNodeClaims := 0 - spotNodeClaims := 0 - for _, nodeClaim := range c.replacements { - ct := nodeClaim.Requirements.Get(v1.CapacityTypeLabelKey) - if ct.Has(v1.CapacityTypeOnDemand) { - odNodeClaims++ + }) + replacementNodes := lo.Map(c.replacements, func(replacement *scheduling.NodeClaim, _ int) interface{} { + ct := replacement.Requirements.Get(v1.CapacityTypeLabelKey) + m := map[string]interface{}{ + "capacity-type": lo.Ternary[string](ct.Has(v1.CapacityTypeSpot), v1.CapacityTypeSpot, v1.CapacityTypeOnDemand), } - if ct.Has(v1.CapacityTypeSpot) { - spotNodeClaims++ + if len(c.replacements) == 1 { + m["instance-types"] = scheduling.InstanceTypeList(replacement.InstanceTypeOptions) } + return m + }) + + return []any{ + "decision", c.Decision(), + "disrupted-node-count", len(candidateNodes), + "replacement-node-count", len(replacementNodes), + "pod-count", podCount, + "disrupted-nodes", candidateNodes, + "replacement-nodes", replacementNodes, } - // Print list of instance types for the first replacements. - if len(c.replacements) > 1 { - fmt.Fprintf(&buf, " and replacing with %d spot and %d on-demand, from types %s", - spotNodeClaims, odNodeClaims, - scheduling.InstanceTypeList(c.replacements[0].InstanceTypeOptions)) - return buf.String() - } - ct := c.replacements[0].Requirements.Get(v1.CapacityTypeLabelKey) - nodeDesc := "node" - if ct.Len() == 1 { - nodeDesc = fmt.Sprintf("%s node", ct.Any()) - } - fmt.Fprintf(&buf, " and replacing with %s from types %s", - nodeDesc, - scheduling.InstanceTypeList(c.replacements[0].InstanceTypeOptions)) - return buf.String() } diff --git a/pkg/controllers/metrics/pod/controller.go b/pkg/controllers/metrics/pod/controller.go index 8b14ef067c..8af2dfcf61 100644 --- a/pkg/controllers/metrics/pod/controller.go +++ b/pkg/controllers/metrics/pod/controller.go @@ -160,7 +160,7 @@ var ( prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: metrics.PodSubsystem, - Name: "scheduling_undecided_time_seconds", + Name: "provisioning_scheduling_undecided_time_seconds", Help: "The time from when Karpenter has seen a pod without making a scheduling decision for the pod. 
Note: this calculated from a point in memory, not by the pod creation timestamp.", }, []string{podName, podNamespace}, diff --git a/pkg/controllers/metrics/pod/suite_test.go b/pkg/controllers/metrics/pod/suite_test.go index a69794e22f..c9ae4481b0 100644 --- a/pkg/controllers/metrics/pod/suite_test.go +++ b/pkg/controllers/metrics/pod/suite_test.go @@ -254,7 +254,7 @@ var _ = Describe("Pod Metrics", func() { ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) fakeClock.Step(1 * time.Hour) - _, found := FindMetricWithLabelValues("karpenter_pods_scheduling_undecided_time_seconds", map[string]string{ + _, found := FindMetricWithLabelValues("karpenter_pods_provisioning_scheduling_undecided_time_seconds", map[string]string{ "name": p.GetName(), "namespace": p.GetNamespace(), }) @@ -265,7 +265,7 @@ var _ = Describe("Pod Metrics", func() { ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) fakeClock.Step(1 * time.Hour) - _, found = FindMetricWithLabelValues("karpenter_pods_scheduling_undecided_time_seconds", map[string]string{ + _, found = FindMetricWithLabelValues("karpenter_pods_provisioning_scheduling_undecided_time_seconds", map[string]string{ "name": p.GetName(), "namespace": p.GetNamespace(), }) @@ -274,7 +274,7 @@ var _ = Describe("Pod Metrics", func() { cluster.MarkPodSchedulingDecisions(map[*corev1.Pod]error{}, p) ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) - _, found = FindMetricWithLabelValues("karpenter_pods_scheduling_undecided_time_seconds", map[string]string{ + _, found = FindMetricWithLabelValues("karpenter_pods_provisioning_scheduling_undecided_time_seconds", map[string]string{ "name": p.GetName(), "namespace": p.GetNamespace(), }) @@ -287,7 +287,7 @@ var _ = Describe("Pod Metrics", func() { ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) fakeClock.Step(1 * time.Hour) - _, found := FindMetricWithLabelValues("karpenter_pods_scheduling_undecided_time_seconds", map[string]string{ + _, found := FindMetricWithLabelValues("karpenter_pods_provisioning_scheduling_undecided_time_seconds", map[string]string{ "name": p.GetName(), "namespace": p.GetNamespace(), }) @@ -298,7 +298,7 @@ var _ = Describe("Pod Metrics", func() { ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) fakeClock.Step(1 * time.Hour) - _, found = FindMetricWithLabelValues("karpenter_pods_scheduling_undecided_time_seconds", map[string]string{ + _, found = FindMetricWithLabelValues("karpenter_pods_provisioning_scheduling_undecided_time_seconds", map[string]string{ "name": p.GetName(), "namespace": p.GetNamespace(), }) @@ -307,7 +307,7 @@ var _ = Describe("Pod Metrics", func() { ExpectDeleted(ctx, env.Client, p) ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) - _, found = FindMetricWithLabelValues("karpenter_pods_scheduling_undecided_time_seconds", map[string]string{ + _, found = FindMetricWithLabelValues("karpenter_pods_provisioning_scheduling_undecided_time_seconds", map[string]string{ "name": p.GetName(), "namespace": p.GetNamespace(), }) diff --git a/pkg/controllers/node/health/controller.go b/pkg/controllers/node/health/controller.go index 4f63c35a9e..e8c973c543 100644 --- a/pkg/controllers/node/health/controller.go +++ b/pkg/controllers/node/health/controller.go @@ -74,13 +74,13 @@ func (c *Controller) Register(_ context.Context, m manager.Manager) error { func (c *Controller) Reconcile(ctx context.Context, node *corev1.Node) (reconcile.Result, error) { ctx = 
injection.WithControllerName(ctx, "node.health") - ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef(node.Namespace, node.Name))) // Validate that the node is owned by us nodeClaim, err := nodeutils.NodeClaimForNode(ctx, c.kubeClient, node) if err != nil { return reconcile.Result{}, nodeutils.IgnoreDuplicateNodeClaimError(nodeutils.IgnoreNodeClaimNotFoundError(err)) } + ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("NodeClaim", klog.KObj(nodeClaim))) // If a nodeclaim does has a nodepool label, validate the nodeclaims inside the nodepool are healthy (i.e bellow the allowed threshold) // In the case of standalone nodeclaim, validate the nodes inside the cluster are healthy before proceeding diff --git a/pkg/controllers/node/hydration/controller.go b/pkg/controllers/node/hydration/controller.go index 53b091e802..400026f974 100644 --- a/pkg/controllers/node/hydration/controller.go +++ b/pkg/controllers/node/hydration/controller.go @@ -56,7 +56,6 @@ func NewController(kubeClient client.Client, cloudProvider cloudprovider.CloudPr func (c *Controller) Reconcile(ctx context.Context, n *corev1.Node) (reconcile.Result, error) { ctx = injection.WithControllerName(ctx, c.Name()) - ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef(n.Namespace, n.Name))) nc, err := nodeutils.NodeClaimForNode(ctx, c.kubeClient, n) if err != nil { @@ -68,6 +67,7 @@ func (c *Controller) Reconcile(ctx context.Context, n *corev1.Node) (reconcile.R if !nodeclaimutils.IsManaged(nc, c.cloudProvider) { return reconcile.Result{}, nil } + ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("NodeClaim", klog.KObj(nc))) stored := n.DeepCopy() n.Labels = lo.Assign(n.Labels, map[string]string{ diff --git a/pkg/controllers/node/termination/controller.go b/pkg/controllers/node/termination/controller.go index 67e12c6ebf..c94f3ecf03 100644 --- a/pkg/controllers/node/termination/controller.go +++ b/pkg/controllers/node/termination/controller.go @@ -29,6 +29,7 @@ import ( "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/util/workqueue" + "k8s.io/klog/v2" "k8s.io/utils/clock" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" @@ -96,10 +97,12 @@ func (c *Controller) finalize(ctx context.Context, node *corev1.Node) (reconcile if err != nil { return reconcile.Result{}, fmt.Errorf("listing nodeclaims, %w", err) } - if err = c.deleteAllNodeClaims(ctx, nodeClaims...); err != nil { return reconcile.Result{}, fmt.Errorf("deleting nodeclaims, %w", err) } + if len(nodeClaims) != 0 { + ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("NodeClaim", klog.KObj(nodeClaims[0]))) + } nodeTerminationTime, err := c.nodeTerminationTime(node, nodeClaims...) 
if err != nil { diff --git a/pkg/controllers/nodeclaim/consistency/controller.go b/pkg/controllers/nodeclaim/consistency/controller.go index b03f34efd4..2c01484ec8 100644 --- a/pkg/controllers/nodeclaim/consistency/controller.go +++ b/pkg/controllers/nodeclaim/consistency/controller.go @@ -26,6 +26,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/klog/v2" "k8s.io/utils/clock" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" @@ -77,6 +78,10 @@ func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider clou func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) { ctx = injection.WithControllerName(ctx, "nodeclaim.consistency") + if nodeClaim.Status.NodeName != "" { + ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", nodeClaim.Status.NodeName))) + } + if !nodeclaimutils.IsManaged(nodeClaim, c.cloudProvider) { return reconcile.Result{}, nil } diff --git a/pkg/controllers/nodeclaim/disruption/controller.go b/pkg/controllers/nodeclaim/disruption/controller.go index 9aadb36a04..6d81e37dae 100644 --- a/pkg/controllers/nodeclaim/disruption/controller.go +++ b/pkg/controllers/nodeclaim/disruption/controller.go @@ -24,11 +24,13 @@ import ( "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" + "k8s.io/klog/v2" "k8s.io/utils/clock" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -67,6 +69,9 @@ func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider clou // Reconcile executes a control loop for the resource func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) { ctx = injection.WithControllerName(ctx, "nodeclaim.disruption") + if nodeClaim.Status.NodeName != "" { + ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", nodeClaim.Status.NodeName))) + } if !nodeclaimutils.IsManaged(nodeClaim, c.cloudProvider) { return reconcile.Result{}, nil diff --git a/pkg/controllers/nodeclaim/disruption/drift_test.go b/pkg/controllers/nodeclaim/disruption/drift_test.go index 7ba3dd13f0..8ce5ef7f0f 100644 --- a/pkg/controllers/nodeclaim/disruption/drift_test.go +++ b/pkg/controllers/nodeclaim/disruption/drift_test.go @@ -112,8 +112,11 @@ var _ = Describe("Drift", func() { It("should detect stale instance type drift if the instance type offerings aren't compatible with the nodeclaim", func() { cp.InstanceTypes = lo.Map(cp.InstanceTypes, func(it *cloudprovider.InstanceType, _ int) *cloudprovider.InstanceType { if it.Name == nodeClaim.Labels[corev1.LabelInstanceTypeStable] { - newLabels := lo.Keys(nodeClaim.Labels) - it.Requirements = scheduling.NewLabelRequirements(map[string]string{newLabels[0]: test.RandomName()}) + for i := range it.Offerings { + it.Offerings[i].Requirements = scheduling.NewLabelRequirements(map[string]string{ + corev1.LabelTopologyZone: test.RandomName(), + }) + } } return it }) @@ -190,7 +193,6 @@ var _ = Describe("Drift", func() { Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDrifted)).To(BeNil()) }) It("should remove the status condition from the 
nodeClaim if the nodeClaim is no longer drifted", func() { - cp.Drifted = "" nodeClaim.StatusConditions().SetTrue(v1.ConditionTypeDrifted) ExpectApplied(ctx, env.Client, nodePool, nodeClaim) @@ -202,8 +204,6 @@ var _ = Describe("Drift", func() { Context("NodeRequirement Drift", func() { DescribeTable("", func(oldNodePoolReq []v1.NodeSelectorRequirementWithMinValues, newNodePoolReq []v1.NodeSelectorRequirementWithMinValues, labels map[string]string, drifted bool) { - cp.Drifted = "" - nodePool.Spec.Template.Spec.Requirements = oldNodePoolReq nodeClaim.Labels = lo.Assign(nodeClaim.Labels, labels) @@ -353,7 +353,6 @@ var _ = Describe("Drift", func() { ), ) It("should return drifted only on NodeClaims that are drifted from an updated nodePool", func() { - cp.Drifted = "" nodePool.Spec.Template.Spec.Requirements = []v1.NodeSelectorRequirementWithMinValues{ {NodeSelectorRequirement: corev1.NodeSelectorRequirement{Key: v1.CapacityTypeLabelKey, Operator: corev1.NodeSelectorOpIn, Values: []string{v1.CapacityTypeOnDemand}}}, {NodeSelectorRequirement: corev1.NodeSelectorRequirement{Key: corev1.LabelOSStable, Operator: corev1.NodeSelectorOpIn, Values: []string{string(corev1.Linux), string(corev1.Windows)}}}, @@ -408,7 +407,6 @@ var _ = Describe("Drift", func() { Context("NodePool Static Drift", func() { var nodePoolController *hash.Controller BeforeEach(func() { - cp.Drifted = "" nodePoolController = hash.NewController(env.Client, cp) nodePool = &v1.NodePool{ ObjectMeta: nodePool.ObjectMeta, diff --git a/pkg/controllers/nodeclaim/disruption/suite_test.go b/pkg/controllers/nodeclaim/disruption/suite_test.go index 23fbc1641b..a4d3b7606a 100644 --- a/pkg/controllers/nodeclaim/disruption/suite_test.go +++ b/pkg/controllers/nodeclaim/disruption/suite_test.go @@ -142,7 +142,6 @@ var _ = Describe("Disruption", func() { Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeConsolidatable).IsTrue()).To(BeTrue()) }) It("should remove multiple disruption conditions simultaneously", func() { - cp.Drifted = "" nodePool.Spec.Disruption.ConsolidateAfter = v1.MustParseNillableDuration("Never") nodeClaim.StatusConditions().SetTrue(v1.ConditionTypeDrifted) diff --git a/pkg/controllers/nodeclaim/expiration/controller.go b/pkg/controllers/nodeclaim/expiration/controller.go index 1e1f1c7c40..97d411ec18 100644 --- a/pkg/controllers/nodeclaim/expiration/controller.go +++ b/pkg/controllers/nodeclaim/expiration/controller.go @@ -21,6 +21,7 @@ import ( "strings" "time" + "k8s.io/klog/v2" "k8s.io/utils/clock" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" @@ -29,6 +30,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/karpenter/pkg/operator/injection" + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider" "sigs.k8s.io/karpenter/pkg/metrics" @@ -52,6 +55,11 @@ func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider clou } func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) { + ctx = injection.WithControllerName(ctx, c.Name()) + if nodeClaim.Status.NodeName != "" { + ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", nodeClaim.Status.NodeName))) + } + if !nodeclaimutils.IsManaged(nodeClaim, c.cloudProvider) { return reconcile.Result{}, nil } @@ -87,6 +95,10 @@ func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (re return reconcile.Result{}, nil } +func (c 
*Controller) Name() string { + return "nodeclaim.expiration" +} + func (c *Controller) Register(_ context.Context, m manager.Manager) error { return controllerruntime.NewControllerManagedBy(m). Named("nodeclaim.expiration"). diff --git a/pkg/controllers/nodeclaim/garbagecollection/controller.go b/pkg/controllers/nodeclaim/garbagecollection/controller.go index bfaaa28bf0..fc5af465ae 100644 --- a/pkg/controllers/nodeclaim/garbagecollection/controller.go +++ b/pkg/controllers/nodeclaim/garbagecollection/controller.go @@ -100,9 +100,9 @@ func (c *Controller) Reconcile(ctx context.Context) (reconcile.Result, error) { return } log.FromContext(ctx).WithValues( - "NodeClaim", klog.KRef("", nodeClaims[i].Name), + "NodeClaim", klog.KObj(nodeClaims[i]), + "Node", klog.KRef("", nodeClaims[i].Status.NodeName), "provider-id", nodeClaims[i].Status.ProviderID, - "nodepool", nodeClaims[i].Labels[v1.NodePoolLabelKey], ).V(1).Info("garbage collecting nodeclaim with no cloudprovider representation") metrics.NodeClaimsDisruptedTotal.Inc(map[string]string{ metrics.ReasonLabel: "garbage_collected", diff --git a/pkg/controllers/nodeclaim/hydration/controller.go b/pkg/controllers/nodeclaim/hydration/controller.go index 5c9d08837c..f2e32236dd 100644 --- a/pkg/controllers/nodeclaim/hydration/controller.go +++ b/pkg/controllers/nodeclaim/hydration/controller.go @@ -54,7 +54,10 @@ func NewController(kubeClient client.Client, cloudProvider cloudprovider.CloudPr func (c *Controller) Reconcile(ctx context.Context, nc *v1.NodeClaim) (reconcile.Result, error) { ctx = injection.WithControllerName(ctx, c.Name()) - ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("NodeClaim", klog.KRef(nc.Namespace, nc.Name))) + if nc.Status.NodeName != "" { + ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", nc.Status.NodeName))) + } + if !nodeclaimutils.IsManaged(nc, c.cloudProvider) { return reconcile.Result{}, nil } diff --git a/pkg/controllers/nodeclaim/lifecycle/controller.go b/pkg/controllers/nodeclaim/lifecycle/controller.go index 6293af6b94..cc1cbca03a 100644 --- a/pkg/controllers/nodeclaim/lifecycle/controller.go +++ b/pkg/controllers/nodeclaim/lifecycle/controller.go @@ -52,10 +52,6 @@ import ( terminationutil "sigs.k8s.io/karpenter/pkg/utils/termination" ) -type nodeClaimReconciler interface { - Reconcile(context.Context, *v1.NodeClaim) (reconcile.Result, error) -} - // Controller is a NodeClaim Lifecycle controller that manages the lifecycle of the NodeClaim up until its termination // The controller is responsible for ensuring that new Nodes get launched, that they have properly registered with // the cluster as nodes and that they are properly initialized, ensuring that nodeclaims that do not have matching nodes @@ -78,7 +74,7 @@ func NewController(clk clock.Clock, kubeClient client.Client, cloudProvider clou recorder: recorder, launch: &Launch{kubeClient: kubeClient, cloudProvider: cloudProvider, cache: cache.New(time.Minute, time.Second*10), recorder: recorder}, - registration: &Registration{kubeClient: kubeClient}, + registration: &Registration{kubeClient: kubeClient, recorder: recorder}, initialization: &Initialization{kubeClient: kubeClient}, liveness: &Liveness{clock: clk, kubeClient: kubeClient}, } @@ -108,9 +104,15 @@ func (c *Controller) Name() string { return "nodeclaim.lifecycle" } +// nolint:gocyclo func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) { ctx = injection.WithControllerName(ctx, c.Name()) - + if 
nodeClaim.Status.ProviderID != "" { + ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("provider-id", nodeClaim.Status.ProviderID)) + } + if nodeClaim.Status.NodeName != "" { + ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", nodeClaim.Status.NodeName))) + } if !nodeclaimutils.IsManaged(nodeClaim, c.cloudProvider) { return reconcile.Result{}, nil } @@ -137,7 +139,7 @@ func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (re stored = nodeClaim.DeepCopy() var results []reconcile.Result var errs error - for _, reconciler := range []nodeClaimReconciler{ + for _, reconciler := range []reconcile.TypedReconciler[*v1.NodeClaim]{ c.launch, c.registration, c.initialization, @@ -169,7 +171,6 @@ func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (re //nolint:gocyclo func (c *Controller) finalize(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) { - ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", nodeClaim.Status.NodeName), "provider-id", nodeClaim.Status.ProviderID)) if !controllerutil.ContainsFinalizer(nodeClaim, v1.TerminationFinalizer) { return reconcile.Result{}, nil } diff --git a/pkg/controllers/nodeclaim/lifecycle/events.go b/pkg/controllers/nodeclaim/lifecycle/events.go index 040cb03df9..396cdb5839 100644 --- a/pkg/controllers/nodeclaim/lifecycle/events.go +++ b/pkg/controllers/nodeclaim/lifecycle/events.go @@ -44,3 +44,13 @@ func NodeClassNotReadyEvent(nodeClaim *v1.NodeClaim, err error) events.Event { DedupeValues: []string{string(nodeClaim.UID)}, } } + +func UnregisteredTaintMissingEvent(nodeClaim *v1.NodeClaim) events.Event { + return events.Event{ + InvolvedObject: nodeClaim, + Type: corev1.EventTypeWarning, + Reason: events.UnregisteredTaintMissing, + Message: fmt.Sprintf("Missing %s taint which prevents registration related race conditions on Karpenter-managed nodes", v1.UnregisteredTaintKey), + DedupeValues: []string{string(nodeClaim.UID)}, + } +} diff --git a/pkg/controllers/nodeclaim/lifecycle/initialization.go b/pkg/controllers/nodeclaim/lifecycle/initialization.go index 4a4405fbf3..d748ca6ccc 100644 --- a/pkg/controllers/nodeclaim/lifecycle/initialization.go +++ b/pkg/controllers/nodeclaim/lifecycle/initialization.go @@ -23,7 +23,6 @@ import ( "github.com/samber/lo" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" - "k8s.io/klog/v2" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -53,13 +52,11 @@ func (i *Initialization) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) if !nodeClaim.StatusConditions().Get(v1.ConditionTypeRegistered).IsTrue() { return reconcile.Result{}, nil } - ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("provider-id", nodeClaim.Status.ProviderID)) node, err := nodeclaimutils.NodeForNodeClaim(ctx, i.kubeClient, nodeClaim) if err != nil { nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "NodeNotFound", "Node not registered with cluster") return reconcile.Result{}, nil //nolint:nilerr } - ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", node.Name))) if nodeutils.GetCondition(node, corev1.NodeReady).Status != corev1.ConditionTrue { nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "NodeNotReady", "Node status is NotReady") return reconcile.Result{}, nil diff --git 
a/pkg/controllers/nodeclaim/lifecycle/initialization_test.go b/pkg/controllers/nodeclaim/lifecycle/initialization_test.go index ade6be99f5..7827669242 100644 --- a/pkg/controllers/nodeclaim/lifecycle/initialization_test.go +++ b/pkg/controllers/nodeclaim/lifecycle/initialization_test.go @@ -113,50 +113,50 @@ var _ = Describe("Initialization", func() { Entry("should ignore NodeClaims which aren't managed by this Karpenter instance", false), ) It("shouldn't consider the nodeClaim initialized when it has not registered", func() { - nodeClaim := test.NodeClaim(v1.NodeClaim{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - v1.NodePoolLabelKey: nodePool.Name, - }, - }, - Spec: v1.NodeClaimSpec{ - Resources: v1.ResourceRequirements{ - Requests: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("2"), - corev1.ResourceMemory: resource.MustParse("50Mi"), - corev1.ResourcePods: resource.MustParse("5"), - }, - }, - }, - }) + nodeClaim := test.NodeClaim() ExpectApplied(ctx, env.Client, nodePool, nodeClaim) ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - node := test.Node(test.NodeOptions{ + node1 := test.Node(test.NodeOptions{ ProviderID: nodeClaim.Status.ProviderID, }) - ExpectApplied(ctx, env.Client, node) + node2 := test.Node(test.NodeOptions{ + ProviderID: nodeClaim.Status.ProviderID, + }) + ExpectApplied(ctx, env.Client, node1, node2) - _ = ExpectObjectReconcileFailed(ctx, env.Client, nodeClaimController, nodeClaim) - ExpectMakeNodesReady(ctx, env.Client, node) // Remove the not-ready taint + // does not error but will not be registered because this reconcile returned multiple nodes + ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) + ExpectMakeNodesReady(ctx, env.Client, node1, node2) // Remove the not-ready taint nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionFalse)) Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeInitialized).Status).To(Equal(metav1.ConditionUnknown)) - node = ExpectExists(ctx, env.Client, node) - node.Status.Capacity = corev1.ResourceList{ + node1 = ExpectExists(ctx, env.Client, node1) + node1.Status.Capacity = corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("10"), corev1.ResourceMemory: resource.MustParse("100Mi"), corev1.ResourcePods: resource.MustParse("110"), } - node.Status.Allocatable = corev1.ResourceList{ + node1.Status.Allocatable = corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("8"), corev1.ResourceMemory: resource.MustParse("80Mi"), corev1.ResourcePods: resource.MustParse("110"), } - ExpectApplied(ctx, env.Client, node) + node2 = ExpectExists(ctx, env.Client, node2) + node2.Status.Capacity = corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("10"), + corev1.ResourceMemory: resource.MustParse("100Mi"), + corev1.ResourcePods: resource.MustParse("110"), + } + node2.Status.Allocatable = corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("8"), + corev1.ResourceMemory: resource.MustParse("80Mi"), + corev1.ResourcePods: resource.MustParse("110"), + } + ExpectApplied(ctx, env.Client, node1, node2) ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) diff --git a/pkg/controllers/nodeclaim/lifecycle/registration.go b/pkg/controllers/nodeclaim/lifecycle/registration.go index 0cbcaf156e..de92e108cd 100644 --- 
a/pkg/controllers/nodeclaim/lifecycle/registration.go +++ b/pkg/controllers/nodeclaim/lifecycle/registration.go @@ -31,6 +31,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/events" "sigs.k8s.io/karpenter/pkg/metrics" "sigs.k8s.io/karpenter/pkg/scheduling" nodeclaimutils "sigs.k8s.io/karpenter/pkg/utils/nodeclaim" @@ -38,6 +39,7 @@ import ( type Registration struct { kubeClient client.Client + recorder events.Recorder } func (r *Registration) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) { @@ -46,7 +48,6 @@ func (r *Registration) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) ( nodeClaim.StatusConditions().Set(*cond) return reconcile.Result{}, nil } - ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("provider-id", nodeClaim.Status.ProviderID)) node, err := nodeclaimutils.NodeForNodeClaim(ctx, r.kubeClient, nodeClaim) if err != nil { if nodeclaimutils.IsNodeNotFoundError(err) { @@ -62,13 +63,13 @@ func (r *Registration) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) ( _, hasStartupTaint := lo.Find(node.Spec.Taints, func(t corev1.Taint) bool { return t.MatchTaint(&v1.UnregisteredNoExecuteTaint) }) - // check if sync succeeded but setting the registered status condition failed - // if sync succeeded, then the label will be present and the taint will be gone + // if the sync hasn't happened yet and the race protecting startup taint isn't present then log it as missing and proceed + // if the sync has happened then the startup taint has been removed if it was present if _, ok := node.Labels[v1.NodeRegisteredLabelKey]; !ok && !hasStartupTaint { - nodeClaim.StatusConditions().SetFalse(v1.ConditionTypeRegistered, "UnregisteredTaintNotFound", fmt.Sprintf("Invariant violated, %s taint must be present on Karpenter-managed nodes", v1.UnregisteredTaintKey)) - return reconcile.Result{}, fmt.Errorf("missing required startup taint, %s", v1.UnregisteredTaintKey) + log.FromContext(ctx).Error(fmt.Errorf("missing %s taint which prevents registration related race conditions on Karpenter-managed nodes", v1.UnregisteredTaintKey), "node claim registration error") + r.recorder.Publish(UnregisteredTaintMissingEvent(nodeClaim)) } - ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", node.Name))) + ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KObj(node))) if err = r.syncNode(ctx, nodeClaim, node); err != nil { if errors.IsConflict(err) { return reconcile.Result{Requeue: true}, nil diff --git a/pkg/controllers/nodeclaim/lifecycle/registration_test.go b/pkg/controllers/nodeclaim/lifecycle/registration_test.go index 9f9b93459e..254a08cfa4 100644 --- a/pkg/controllers/nodeclaim/lifecycle/registration_test.go +++ b/pkg/controllers/nodeclaim/lifecycle/registration_test.go @@ -23,6 +23,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" + "sigs.k8s.io/karpenter/pkg/events" "sigs.k8s.io/karpenter/pkg/test" . 
"sigs.k8s.io/karpenter/pkg/test/expectations" ) @@ -112,7 +113,7 @@ var _ = Describe("Registration", func() { Expect(node.Labels).To(HaveKeyWithValue(v1.NodeRegisteredLabelKey, "true")) Expect(node.Spec.Taints).To(Not(ContainElement(v1.UnregisteredNoExecuteTaint))) }) - It("should fail registration if the karpenter.sh/unregistered taint is not present on the node and the node isn't labeled as registered", func() { + It("should succeed registration if the karpenter.sh/unregistered taint is not present and emit an event", func() { nodeClaim := test.NodeClaim(v1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ @@ -124,12 +125,23 @@ var _ = Describe("Registration", func() { ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) + // Create a node without the unregistered taint node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID}) ExpectApplied(ctx, env.Client, node) - _ = ExpectObjectReconcileFailed(ctx, env.Client, nodeClaimController, nodeClaim) + ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) + + // Verify the NodeClaim is registered nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(ExpectStatusConditionExists(nodeClaim, v1.ConditionTypeRegistered).Status).To(Equal(metav1.ConditionFalse)) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeRegistered).IsTrue()).To(BeTrue()) + Expect(nodeClaim.Status.NodeName).To(Equal(node.Name)) + + // Verify the node is registered + node = ExpectExists(ctx, env.Client, node) + Expect(node.Labels).To(HaveKeyWithValue(v1.NodeRegisteredLabelKey, "true")) + + Expect(recorder.Calls(events.UnregisteredTaintMissing)).To(Equal(1)) }) + It("should sync the labels to the Node when the Node comes online", func() { nodeClaim := test.NodeClaim(v1.NodeClaim{ ObjectMeta: metav1.ObjectMeta{ diff --git a/pkg/controllers/nodeclaim/lifecycle/suite_test.go b/pkg/controllers/nodeclaim/lifecycle/suite_test.go index eeca378310..29d0b9ac0b 100644 --- a/pkg/controllers/nodeclaim/lifecycle/suite_test.go +++ b/pkg/controllers/nodeclaim/lifecycle/suite_test.go @@ -29,14 +29,12 @@ import ( apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/tools/record" clock "k8s.io/utils/clock/testing" "sigs.k8s.io/karpenter/pkg/apis" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider/fake" nodeclaimlifecycle "sigs.k8s.io/karpenter/pkg/controllers/nodeclaim/lifecycle" - "sigs.k8s.io/karpenter/pkg/events" "sigs.k8s.io/karpenter/pkg/operator/options" "sigs.k8s.io/karpenter/pkg/test" . 
"sigs.k8s.io/karpenter/pkg/test/expectations" @@ -49,6 +47,7 @@ var nodeClaimController *nodeclaimlifecycle.Controller var env *test.Environment var fakeClock *clock.FakeClock var cloudProvider *fake.CloudProvider +var recorder *test.EventRecorder func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) @@ -70,11 +69,12 @@ func removeNodeClaimImmutabilityValidation(crds ...*apiextensionsv1.CustomResour var _ = BeforeSuite(func() { fakeClock = clock.NewFakeClock(time.Now()) + recorder = test.NewEventRecorder() env = test.NewEnvironment(test.WithCRDs(removeNodeClaimImmutabilityValidation(apis.CRDs...)...), test.WithCRDs(v1alpha1.CRDs...), test.WithFieldIndexers(test.NodeProviderIDFieldIndexer(ctx))) ctx = options.ToContext(ctx, test.Options()) cloudProvider = fake.NewCloudProvider() - nodeClaimController = nodeclaimlifecycle.NewController(fakeClock, env.Client, cloudProvider, events.NewRecorder(&record.FakeRecorder{})) + nodeClaimController = nodeclaimlifecycle.NewController(fakeClock, env.Client, cloudProvider, recorder) }) var _ = AfterSuite(func() { @@ -91,6 +91,7 @@ var _ = Describe("Finalizer", func() { var nodePool *v1.NodePool BeforeEach(func() { + recorder.Reset() // Reset the events that we captured during the run nodePool = test.NodePool() }) Context("TerminationFinalizer", func() { diff --git a/pkg/controllers/nodeclaim/lifecycle/termination_test.go b/pkg/controllers/nodeclaim/lifecycle/termination_test.go index d7102a6a74..c6e63780dc 100644 --- a/pkg/controllers/nodeclaim/lifecycle/termination_test.go +++ b/pkg/controllers/nodeclaim/lifecycle/termination_test.go @@ -353,20 +353,13 @@ var _ = Describe("Termination", func() { })) }) It("should not delete Nodes if the NodeClaim is not registered", func() { - ExpectApplied(ctx, env.Client, nodePool, nodeClaim) + node := test.NodeClaimLinkedNode(nodeClaim) + ExpectApplied(ctx, env.Client, nodePool, nodeClaim, node) ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) _, err := cloudProvider.Get(ctx, nodeClaim.Status.ProviderID) Expect(err).ToNot(HaveOccurred()) - - node := test.NodeClaimLinkedNode(nodeClaim) - // Remove the unregistered taint to ensure the NodeClaim can't be marked as registered - node.Spec.Taints = nil - ExpectApplied(ctx, env.Client, node) - _ = ExpectObjectReconcileFailed(ctx, env.Client, nodeClaimController, nodeClaim) - nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) - Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeRegistered).IsFalse()).To(BeTrue()) + ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) Expect(env.Client.Delete(ctx, nodeClaim)).To(Succeed()) ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) diff --git a/pkg/controllers/provisioning/provisioner.go b/pkg/controllers/provisioning/provisioner.go index 8541f33eb7..94d7d109c8 100644 --- a/pkg/controllers/provisioning/provisioner.go +++ b/pkg/controllers/provisioning/provisioner.go @@ -166,7 +166,7 @@ func (p *Provisioner) GetPendingPods(ctx context.Context) ([]*corev1.Pod, error) if err := p.Validate(ctx, po); err != nil { // Mark in memory that this pod is unschedulable p.cluster.MarkPodSchedulingDecisions(map[*corev1.Pod]error{po: fmt.Errorf("ignoring pod, %w", err)}, po) - log.FromContext(ctx).WithValues("Pod", klog.KRef(po.Namespace, po.Name)).V(1).Info(fmt.Sprintf("ignoring pod, %s", err)) + log.FromContext(ctx).WithValues("Pod", klog.KObj(po)).V(1).Info(fmt.Sprintf("ignoring pod, %s", err)) return true } return 
false @@ -220,7 +220,7 @@ func (p *Provisioner) NewScheduler(ctx context.Context, pods []*corev1.Pod, stat } nodePools = lo.Filter(nodePools, func(np *v1.NodePool, _ int) bool { if !np.StatusConditions().IsTrue(status.ConditionReady) { - log.FromContext(ctx).WithValues("NodePool", klog.KRef("", np.Name)).Error(err, "ignoring nodepool, not ready") + log.FromContext(ctx).WithValues("NodePool", klog.KObj(np)).Error(err, "ignoring nodepool, not ready") return false } return np.DeletionTimestamp.IsZero() @@ -242,11 +242,11 @@ func (p *Provisioner) NewScheduler(ctx context.Context, pods []*corev1.Pod, stat // Get instance type options its, err := p.cloudProvider.GetInstanceTypes(ctx, np) if err != nil { - log.FromContext(ctx).WithValues("NodePool", klog.KRef("", np.Name)).Error(err, "skipping, unable to resolve instance types") + log.FromContext(ctx).WithValues("NodePool", klog.KObj(np)).Error(err, "skipping, unable to resolve instance types") continue } if len(its) == 0 { - log.FromContext(ctx).WithValues("NodePool", klog.KRef("", np.Name)).Info("skipping, no resolved instance types found") + log.FromContext(ctx).WithValues("NodePool", klog.KObj(np)).Info("skipping, no resolved instance types found") continue } instanceTypes[np.Name] = its @@ -317,7 +317,7 @@ func (p *Provisioner) Schedule(ctx context.Context) (scheduler.Results, error) { results := s.Solve(ctx, pods).TruncateInstanceTypes(scheduler.MaxInstanceTypes) scheduler.UnschedulablePodsCount.Set(float64(len(results.PodErrors)), map[string]string{scheduler.ControllerLabel: injection.GetControllerName(ctx)}) if len(results.NewNodeClaims) > 0 { - log.FromContext(ctx).WithValues("Pods", pretty.Slice(lo.Map(pods, func(p *corev1.Pod, _ int) string { return klog.KRef(p.Namespace, p.Name).String() }), 5), "duration", time.Since(start)).Info("found provisionable pod(s)") + log.FromContext(ctx).WithValues("Pods", pretty.Slice(lo.Map(pods, func(p *corev1.Pod, _ int) string { return klog.KObj(p).String() }), 5), "duration", time.Since(start)).Info("found provisionable pod(s)") } // Mark in memory when these pods were marked as schedulable or when we made a decision on the pods p.cluster.MarkPodSchedulingDecisions(results.PodErrors, pendingPods...) @@ -344,7 +344,7 @@ func (p *Provisioner) Create(ctx context.Context, n *scheduler.NodeClaim, opts . return req.Key == corev1.LabelInstanceTypeStable }) - log.FromContext(ctx).WithValues("NodeClaim", klog.KRef("", nodeClaim.Name), "requests", nodeClaim.Spec.Resources.Requests, "instance-types", instanceTypeList(instanceTypeRequirement.Values)). + log.FromContext(ctx).WithValues("NodeClaim", klog.KObj(nodeClaim), "requests", nodeClaim.Spec.Resources.Requests, "instance-types", instanceTypeList(instanceTypeRequirement.Values)). 
Info("created nodeclaim") metrics.NodeClaimsCreatedTotal.Inc(map[string]string{ metrics.ReasonLabel: options.Reason, @@ -432,7 +432,7 @@ func (p *Provisioner) injectVolumeTopologyRequirements(ctx context.Context, pods var schedulablePods []*corev1.Pod for _, pod := range pods { if err := p.volumeTopology.Inject(ctx, pod); err != nil { - log.FromContext(ctx).WithValues("Pod", klog.KRef(pod.Namespace, pod.Name)).Error(err, "failed getting volume topology requirements") + log.FromContext(ctx).WithValues("Pod", klog.KObj(pod)).Error(err, "failed getting volume topology requirements") } else { schedulablePods = append(schedulablePods, pod) } diff --git a/pkg/controllers/provisioning/scheduling/existingnode.go b/pkg/controllers/provisioning/scheduling/existingnode.go index 8bf44ff51d..804ca1cc61 100644 --- a/pkg/controllers/provisioning/scheduling/existingnode.go +++ b/pkg/controllers/provisioning/scheduling/existingnode.go @@ -92,11 +92,13 @@ func (n *ExistingNode) Add(ctx context.Context, kubeClient client.Client, pod *v return fmt.Errorf("exceeds node resources") } - nodeRequirements := scheduling.NewRequirements(n.requirements.Values()...) // Check NodeClaim Affinity Requirements - if err = nodeRequirements.Compatible(podData.Requirements); err != nil { + if err = n.requirements.Compatible(podData.Requirements); err != nil { return err } + // avoid creating our temp set of requirements until after we've ensured that at least + // the pod is compatible + nodeRequirements := scheduling.NewRequirements(n.requirements.Values()...) nodeRequirements.Add(podData.Requirements.Values()...) // Check Topology Requirements diff --git a/pkg/controllers/provisioning/scheduling/metrics.go b/pkg/controllers/provisioning/scheduling/metrics.go index 45444176ea..3497d00be8 100644 --- a/pkg/controllers/provisioning/scheduling/metrics.go +++ b/pkg/controllers/provisioning/scheduling/metrics.go @@ -74,7 +74,8 @@ var ( crmetrics.Registry, prometheus.GaugeOpts{ Namespace: metrics.Namespace, - Name: "ignored_pod_count", + Subsystem: schedulerSubsystem, + Name: "ignored_pods_count", Help: "Number of pods ignored during scheduling by Karpenter", }, []string{}, diff --git a/pkg/controllers/provisioning/scheduling/preferences.go b/pkg/controllers/provisioning/scheduling/preferences.go index 53305acd53..9ec3ff4f1b 100644 --- a/pkg/controllers/provisioning/scheduling/preferences.go +++ b/pkg/controllers/provisioning/scheduling/preferences.go @@ -49,7 +49,7 @@ func (p *Preferences) Relax(ctx context.Context, pod *v1.Pod) bool { for _, relaxFunc := range relaxations { if reason := relaxFunc(pod); reason != nil { - log.FromContext(ctx).WithValues("Pod", klog.KRef(pod.Namespace, pod.Name)).V(1).Info(fmt.Sprintf("relaxing soft constraints for pod since it previously failed to schedule, %s", lo.FromPtr(reason))) + log.FromContext(ctx).WithValues("Pod", klog.KObj(pod)).V(1).Info(fmt.Sprintf("relaxing soft constraints for pod since it previously failed to schedule, %s", lo.FromPtr(reason))) return true } } diff --git a/pkg/controllers/provisioning/scheduling/scheduler.go b/pkg/controllers/provisioning/scheduling/scheduler.go index bd37280305..b6ad529bd1 100644 --- a/pkg/controllers/provisioning/scheduling/scheduler.go +++ b/pkg/controllers/provisioning/scheduling/scheduler.go @@ -65,7 +65,7 @@ func NewScheduler(ctx context.Context, kubeClient client.Client, nodePools []*v1 nct.InstanceTypeOptions, _ = filterInstanceTypesByRequirements(instanceTypes[np.Name], nct.Requirements, corev1.ResourceList{}, corev1.ResourceList{}, 
corev1.ResourceList{}) if len(nct.InstanceTypeOptions) == 0 { recorder.Publish(NoCompatibleInstanceTypes(np)) - log.FromContext(ctx).WithValues("NodePool", klog.KRef("", np.Name)).Info("skipping, nodepool requirements filtered out all instance types") + log.FromContext(ctx).WithValues("NodePool", klog.KObj(np)).Info("skipping, nodepool requirements filtered out all instance types") return nil, false } return nct, true @@ -124,7 +124,7 @@ type Results struct { func (r Results) Record(ctx context.Context, recorder events.Recorder, cluster *state.Cluster) { // Report failures and nominations for p, err := range r.PodErrors { - log.FromContext(ctx).WithValues("Pod", klog.KRef(p.Namespace, p.Name)).Error(err, "could not schedule pod") + log.FromContext(ctx).WithValues("Pod", klog.KObj(p)).Error(err, "could not schedule pod") recorder.Publish(PodFailedToScheduleEvent(p, err)) } for _, existing := range r.ExistingNodes { diff --git a/pkg/controllers/provisioning/scheduling/topology.go b/pkg/controllers/provisioning/scheduling/topology.go index fb37880edd..bc9186541b 100644 --- a/pkg/controllers/provisioning/scheduling/topology.go +++ b/pkg/controllers/provisioning/scheduling/topology.go @@ -21,6 +21,7 @@ import ( "errors" "fmt" "math" + "sort" "github.com/awslabs/operatorpkg/option" "github.com/samber/lo" @@ -358,6 +359,13 @@ func (t *Topology) countDomains(ctx context.Context, tg *TopologyGroup) error { } } + // sort our pods by the node they are scheduled to + sort.Slice(pods, func(i, j int) bool { + return pods[i].Spec.NodeName < pods[j].Spec.NodeName + }) + var previousNode *corev1.Node + var previousNodeRequirements scheduling.Requirements + for i, p := range pods { if IgnoredForTopology(&pods[i]) { continue @@ -366,18 +374,32 @@ func (t *Topology) countDomains(ctx context.Context, tg *TopologyGroup) error { if t.excludedPods.Has(string(p.UID)) { continue } - node := &corev1.Node{} - if err := t.kubeClient.Get(ctx, types.NamespacedName{Name: p.Spec.NodeName}, node); err != nil { - // Pods that cannot be evicted can be leaked in the API Server after - // a Node is removed. Since pod bindings are immutable, these pods - // cannot be recovered, and will be deleted by the pod lifecycle - // garbage collector. These pods are not running, and should not - // impact future topology calculations. - if coreerrors.IsNotFound(err) { - continue + var node *corev1.Node + var nodeRequirements scheduling.Requirements + if previousNode != nil && previousNode.Name == p.Spec.NodeName { + // no need to look up the node since we already have it + node = previousNode + nodeRequirements = previousNodeRequirements + } else { + node = &corev1.Node{} + if err := t.kubeClient.Get(ctx, types.NamespacedName{Name: p.Spec.NodeName}, node); err != nil { + // Pods that cannot be evicted can be leaked in the API Server after + // a Node is removed. Since pod bindings are immutable, these pods + // cannot be recovered, and will be deleted by the pod lifecycle + // garbage collector. These pods are not running, and should not + // impact future topology calculations. 
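The added code here sorts the pods by the node they are bound to and reuses the previously fetched Node and its label-derived requirements whenever consecutive pods share a node, so a node shared by many pods is fetched from the API only once per topology pass. A rough, self-contained sketch of that sort-and-reuse pattern; the pod type and getNode lookup below are hypothetical stand-ins for the real client call:

package main

import (
	"fmt"
	"sort"
)

// pod is a stand-in for corev1.Pod; only the scheduled node name matters here.
type pod struct{ nodeName string }

// countDomains counts pods per topology domain, fetching each node's labels at
// most once by sorting the pods so identical node names are adjacent.
func countDomains(pods []pod, getNode func(name string) (map[string]string, error), key string) (map[string]int, error) {
	sort.Slice(pods, func(i, j int) bool { return pods[i].nodeName < pods[j].nodeName })

	counts := map[string]int{}
	prevName := ""
	var prevLabels map[string]string
	fetched := false
	for _, p := range pods {
		if !fetched || p.nodeName != prevName {
			labels, err := getNode(p.nodeName) // the expensive lookup, done once per distinct node
			if err != nil {
				return nil, fmt.Errorf("getting node %s, %w", p.nodeName, err)
			}
			prevName, prevLabels, fetched = p.nodeName, labels, true
		}
		if domain, ok := prevLabels[key]; ok {
			counts[domain]++
		}
	}
	return counts, nil
}

func main() {
	nodes := map[string]map[string]string{
		"node-a": {"topology.kubernetes.io/zone": "zone-1"},
		"node-b": {"topology.kubernetes.io/zone": "zone-2"},
	}
	getNode := func(name string) (map[string]string, error) { return nodes[name], nil }

	counts, err := countDomains([]pod{{"node-a"}, {"node-b"}, {"node-a"}}, getNode, "topology.kubernetes.io/zone")
	fmt.Println(counts, err) // map[zone-1:2 zone-2:1] <nil>
}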
+ if errors.IsNotFound(err) { + continue + } + return fmt.Errorf("getting node %s, %w", p.Spec.NodeName, err) } - return fmt.Errorf("getting node %s, %w", p.Spec.NodeName, err) + nodeRequirements = scheduling.NewLabelRequirements(node.Labels) + + // assign back to previous node so we can hopefully re-use these in the next iteration + previousNode = node + previousNodeRequirements = nodeRequirements } + domain, ok := node.Labels[tg.Key] // Kubelet sets the hostname label, but the node may not be ready yet so there is no label. We fall back and just // treat the node name as the label. It probably is in most cases, but even if not we at least count the existence @@ -390,9 +412,10 @@ func (t *Topology) countDomains(ctx context.Context, tg *TopologyGroup) error { if !ok { continue // Don't include pods if node doesn't contain domain https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/#conventions } + // nodes may or may not be considered for counting purposes for topology spread constraints depending on if they // are selected by the pod's node selectors and required node affinities. If these are unset, the node always counts. - if !tg.nodeFilter.Matches(node.Spec.Taints, scheduling.NewLabelRequirements(node.Labels)) { + if !tg.nodeFilter.Matches(node.Spec.Taints, nodeRequirements) { continue } tg.Record(domain) diff --git a/pkg/controllers/provisioning/scheduling/volumetopology.go b/pkg/controllers/provisioning/scheduling/volumetopology.go index a4dc0be961..14d218eb13 100644 --- a/pkg/controllers/provisioning/scheduling/volumetopology.go +++ b/pkg/controllers/provisioning/scheduling/volumetopology.go @@ -72,7 +72,7 @@ func (v *VolumeTopology) Inject(ctx context.Context, pod *v1.Pod) error { } log.FromContext(ctx). - WithValues("Pod", klog.KRef(pod.Namespace, pod.Name)). + WithValues("Pod", klog.KObj(pod)). V(1).Info(fmt.Sprintf("adding requirements derived from pod volumes, %s", requirements)) return nil } diff --git a/pkg/events/reason.go b/pkg/events/reason.go index 7d8ebc1fd0..982733eb33 100644 --- a/pkg/events/reason.go +++ b/pkg/events/reason.go @@ -44,5 +44,6 @@ const ( // nodeclaim/lifecycle InsufficientCapacityError = "InsufficientCapacityError" + UnregisteredTaintMissing = "UnregisteredTaintMissing" NodeClassNotReady = "NodeClassNotReady" ) diff --git a/pkg/scheduling/requirements.go b/pkg/scheduling/requirements.go index 949f4edccb..bf8f6e1d63 100644 --- a/pkg/scheduling/requirements.go +++ b/pkg/scheduling/requirements.go @@ -172,7 +172,7 @@ func (r Requirements) IsCompatible(requirements Requirements, options ...option. } // Compatible ensures the provided requirements can loosely be met. -func (r Requirements) Compatible(requirements Requirements, options ...option.Function[CompatibilityOptions]) (errs error) { +func (r Requirements) Compatible(requirements Requirements, options ...option.Function[CompatibilityOptions]) error { opts := option.Resolve(options...) // Custom Labels must intersect, but if not defined are denied. 
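The following hunk makes Compatible return on the first incompatible label instead of aggregating errors with multierr, and labelHint drops the edit-distance heuristic in favor of plain substring and suffix matching. A simplified sketch of that hint logic, with a trimmed, illustrative label set and hypothetical helper names:

package main

import (
	"fmt"
	"strings"
)

// wellKnown is a trimmed, illustrative set of labels with known values.
var wellKnown = []string{
	"topology.kubernetes.io/zone",
	"topology.kubernetes.io/region",
	"karpenter.sh/nodepool",
}

// suffix returns the last path segment of a label key, e.g. "zone" for
// "topology.kubernetes.io/zone".
func suffix(key string) string {
	parts := strings.Split(key, "/")
	return parts[len(parts)-1]
}

// labelHint suggests a likely intended label for an unknown key using only
// substring and suffix matching (no edit distance).
func labelHint(key string) string {
	for _, known := range wellKnown {
		if strings.Contains(known, key) || strings.HasSuffix(known, suffix(key)) {
			return fmt.Sprintf(" (typo of %q?)", known)
		}
	}
	return ""
}

func main() {
	// Only the first problem is reported; callers stop at the first unknown label.
	fmt.Printf("label %q does not have known values%s\n", "node.io/zone", labelHint("node.io/zone"))
}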
@@ -183,49 +183,11 @@ func (r Requirements) Compatible(requirements Requirements, options ...option.Fu if operator := requirements.Get(key).Operator(); r.Has(key) || operator == corev1.NodeSelectorOpNotIn || operator == corev1.NodeSelectorOpDoesNotExist { continue } - errs = multierr.Append(errs, fmt.Errorf("label %q does not have known values%s", key, labelHint(r, key, opts.AllowUndefined))) + // break early so we only report the first error + return fmt.Errorf("label %q does not have known values%s", key, labelHint(r, key, opts.AllowUndefined)) } // Well Known Labels must intersect, but if not defined, are allowed. - return multierr.Append(errs, r.Intersects(requirements)) -} - -// editDistance is an implementation of edit distance from Algorithms/DPV -func editDistance(s, t string) int { - min := func(a, b, c int) int { - m := a - if b < m { - m = b - } - if c < m { - m = c - } - return m - } - - m := len(s) - n := len(t) - if m == 0 { - return n - } - if n == 0 { - return m - } - prevRow := make([]int, n) - curRow := make([]int, n) - for j := 1; j < n; j++ { - prevRow[j] = j - } - for i := 1; i < m; i++ { - for j := 1; j < n; j++ { - diff := 0 - if s[i] != t[j] { - diff = 1 - } - curRow[j] = min(prevRow[j]+1, curRow[j-1]+1, prevRow[j-1]+diff) - } - prevRow, curRow = curRow, prevRow - } - return prevRow[n-1] + return r.Intersects(requirements) } func getSuffix(key string) string { @@ -235,7 +197,7 @@ func getSuffix(key string) string { func labelHint(r Requirements, key string, allowedUndefined sets.Set[string]) string { for wellKnown := range allowedUndefined { - if strings.Contains(wellKnown, key) || editDistance(key, wellKnown) < len(wellKnown)/5 { + if strings.Contains(wellKnown, key) { return fmt.Sprintf(" (typo of %q?)", wellKnown) } if strings.HasSuffix(wellKnown, getSuffix(key)) { @@ -243,7 +205,7 @@ func labelHint(r Requirements, key string, allowedUndefined sets.Set[string]) st } } for existing := range r { - if strings.Contains(existing, key) || editDistance(key, existing) < len(existing)/5 { + if strings.Contains(existing, key) { return fmt.Sprintf(" (typo of %q?)", existing) } if strings.HasSuffix(existing, getSuffix(key)) { diff --git a/pkg/scheduling/requirements_test.go b/pkg/scheduling/requirements_test.go index dff6ea3de6..aa6a97ef75 100644 --- a/pkg/scheduling/requirements_test.go +++ b/pkg/scheduling/requirements_test.go @@ -561,10 +561,8 @@ var _ = Describe("Requirements", func() { }, Entry("Zone Label #1", "topology.kubernetesio/zone", `label "topology.kubernetesio/zone" does not have known values (typo of "topology.kubernetes.io/zone"?)`), Entry("Zone Label #1", "node.io/zone", `label "node.io/zone" does not have known values (typo of "topology.kubernetes.io/zone"?)`), - Entry("Zone Label #1", "topology.kubernetesiozone", `label "topology.kubernetesiozone" does not have known values (typo of "topology.kubernetes.io/zone"?)`), Entry("Region Label #1", "topology.kubernetes.io/regio", `label "topology.kubernetes.io/regio" does not have known values (typo of "topology.kubernetes.io/region"?)`), Entry("Region Label #2", "node.kubernetes.io/region", `label "node.kubernetes.io/region" does not have known values (typo of "topology.kubernetes.io/region"?)`), - Entry("NodePool Label #1", "karpenter.shnodepool", `label "karpenter.shnodepool" does not have known values (typo of "karpenter.sh/nodepool"?)`), Entry("NodePool Label #2", "karpenter/nodepool", `label "karpenter/nodepool" does not have known values (typo of "karpenter.sh/nodepool"?)`), ) It("should display an error 
message for unknown labels", func() { @@ -699,17 +697,6 @@ var _ = Describe("Requirements", func() { }) }) -// Keeping this in case we need it, I ran for 1m+ samples and had no issues -// fuzz: elapsed: 2m27s, execs: 1002748 (6130/sec), new interesting: 30 (total: 33) -func FuzzEditDistance(f *testing.F) { - f.Add("foo", "bar") - f.Add("foo", "") - f.Add("", "foo") - f.Fuzz(func(t *testing.T, lhs, rhs string) { - editDistance(lhs, rhs) - }) -} - // TestSchedulingProfile is used to gather profiling metrics, benchmarking is primarily done with standard // Go benchmark functions // go test -tags=test_performance -run=RequirementsProfile diff --git a/pkg/scheduling/volumeusage.go b/pkg/scheduling/volumeusage.go index d8575d1a2e..60efc84d11 100644 --- a/pkg/scheduling/volumeusage.go +++ b/pkg/scheduling/volumeusage.go @@ -88,7 +88,7 @@ func GetVolumes(ctx context.Context, kubeClient client.Client, pod *v1.Pod) (Vol // computing limits, otherwise Karpenter may never be able to update its cluster state. if err != nil { if errors.IsNotFound(err) { - log.FromContext(ctx).WithValues("Pod", klog.KRef(pod.Namespace, pod.Name), "volume", volume.Name).Error(err, "failed tracking CSI volume limits for volume") + log.FromContext(ctx).WithValues("Pod", klog.KObj(pod), "volume", volume.Name).Error(err, "failed tracking CSI volume limits for volume") continue } return nil, fmt.Errorf("failed updating volume limits, %w", err) @@ -134,7 +134,7 @@ func resolveDriver(ctx context.Context, kubeClient client.Client, pod *v1.Pod, v // In either of these cases, a PV must have been previously bound to the PVC and has since been removed. We can // ignore this PVC while computing limits and continue. if storageClassName == "" { - log.FromContext(ctx).WithValues("volume", volumeName, "Pod", klog.KRef(pod.Namespace, pod.Name), "PersistentVolumeClaim", klog.KRef(pvc.Namespace, pvc.Name)).V(1).Info("failed tracking CSI volume limits for volume with unbound PVC, no storage class specified") + log.FromContext(ctx).WithValues("volume", volumeName, "Pod", klog.KObj(pod), "PersistentVolumeClaim", klog.KObj(pvc)).V(1).Info("failed tracking CSI volume limits for volume with unbound PVC, no storage class specified") return "", nil } @@ -145,7 +145,7 @@ func resolveDriver(ctx context.Context, kubeClient client.Client, pod *v1.Pod, v // 2. The StorageClass never existed and was used to bind the PVC to an existing PV, but that PV was removed // In either of these cases, we should ignore the PVC while computing limits and continue. 
if errors.IsNotFound(err) { - log.FromContext(ctx).WithValues("volume", volumeName, "Pod", klog.KRef(pod.Namespace, pod.Name), "PersistentVolumeClaim", klog.KRef(pvc.Namespace, pvc.Name), "StorageClass", klog.KRef("", storageClassName)).V(1).Info(fmt.Sprintf("failed tracking CSI volume limits for volume with unbound PVC, %s", err)) + log.FromContext(ctx).WithValues("volume", volumeName, "Pod", klog.KObj(pod), "PersistentVolumeClaim", klog.KObj(pvc), "StorageClass", klog.KRef("", storageClassName)).V(1).Info(fmt.Sprintf("failed tracking CSI volume limits for volume with unbound PVC, %s", err)) return "", nil } return "", err diff --git a/test/pkg/environment/common/default_kowknodeclass.yaml b/test/pkg/environment/common/default_kowknodeclass.yaml new file mode 100644 index 0000000000..34ea7d2dbf --- /dev/null +++ b/test/pkg/environment/common/default_kowknodeclass.yaml @@ -0,0 +1,4 @@ +apiVersion: karpenter.kwok.sh/v1alpha1 +kind: KWOKNodeClass +metadata: + name: default \ No newline at end of file diff --git a/test/pkg/environment/common/default_nodepool.yaml b/test/pkg/environment/common/default_nodepool.yaml new file mode 100644 index 0000000000..a38cbe478b --- /dev/null +++ b/test/pkg/environment/common/default_nodepool.yaml @@ -0,0 +1,27 @@ +apiVersion: karpenter.sh/v1 +kind: NodePool +metadata: + name: default +spec: + disruption: + consolidationPolicy: WhenEmptyOrUnderutilized + consolidateAfter: Never + budgets: + - nodes: 100% + limits: + cpu: 1000 + memory: 1000Gi + template: + spec: + expireAfter: Never + requirements: + - key: kubernetes.io/os + operator: In + values: ["linux"] + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand"] + nodeClassRef: + group: karpenter.kwok.sh + kind: KWOKNodeClass + name: default \ No newline at end of file diff --git a/test/pkg/environment/common/environment.go b/test/pkg/environment/common/environment.go index 7c24b509be..da6f89021f 100644 --- a/test/pkg/environment/common/environment.go +++ b/test/pkg/environment/common/environment.go @@ -18,7 +18,10 @@ package common import ( "context" + _ "embed" + "flag" "fmt" + "io" "log" "os" "strconv" @@ -30,27 +33,35 @@ import ( "github.com/onsi/gomega" "github.com/samber/lo" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + serializeryaml "k8s.io/apimachinery/pkg/runtime/serializer/yaml" "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/yaml" - "sigs.k8s.io/karpenter/kwok/apis/v1alpha1" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/operator" - "sigs.k8s.io/karpenter/pkg/test" . 
"sigs.k8s.io/karpenter/pkg/utils/testing" //nolint:stylecheck "sigs.k8s.io/karpenter/test/pkg/debug" ) type ContextKey string -const ( - GitRefContextKey = ContextKey("gitRef") +const GitRefContextKey = ContextKey("gitRef") + +// I need to add the the default kwok nodeclass path +// That way it's not defined in code but we use it when we initialize the nodeclass +var ( + //go:embed default_kowknodeclass.yaml + defaultNodeClass []byte + //go:embed default_nodepool.yaml + defaultNodePool []byte + nodeClassPath = flag.String("default-nodeclass", "", "Pass in a default cloud specific node class") + nodePoolPath = flag.String("default-nodepool", "", "Pass in a default karpenter nodepool") ) type Environment struct { @@ -62,6 +73,7 @@ type Environment struct { Config *rest.Config KubeClient kubernetes.Interface Monitor *Monitor + DefaultNodeClass *unstructured.Unstructured OutputDir string StartingNodeCount int @@ -79,7 +91,7 @@ func NewEnvironment(t *testing.T) *Environment { // Get the output dir if it's set outputDir, _ := os.LookupEnv("OUTPUT_DIR") - gomega.SetDefaultEventuallyTimeout(5 * time.Minute) + gomega.SetDefaultEventuallyTimeout(10 * time.Minute) gomega.SetDefaultEventuallyPollingInterval(1 * time.Second) return &Environment{ Context: ctx, @@ -90,6 +102,7 @@ func NewEnvironment(t *testing.T) *Environment { Monitor: NewMonitor(ctx, client), TimeIntervalCollector: debug.NewTimestampCollector(), OutputDir: outputDir, + DefaultNodeClass: decodeNodeClass(), } } @@ -144,42 +157,38 @@ func NewClient(ctx context.Context, config *rest.Config) client.Client { return c } -func (env *Environment) DefaultNodeClass() *v1alpha1.KWOKNodeClass { - return &v1alpha1.KWOKNodeClass{ - ObjectMeta: metav1.ObjectMeta{ - Name: test.RandomName(), - }, +func (env *Environment) DefaultNodePool(nodeClass client.Object) *v1.NodePool { + nodePool := &v1.NodePool{} + if lo.FromPtr(nodePoolPath) == "" { + nodePool = object.Unmarshal[v1.NodePool](defaultNodePool) + } else { + file := lo.Must1(os.ReadFile(lo.FromPtr(nodePoolPath))) + lo.Must0(yaml.Unmarshal(file, nodePool)) } -} -func (env *Environment) DefaultNodePool(nodeClass *v1alpha1.KWOKNodeClass) *v1.NodePool { - nodePool := test.NodePool() + // Update to use the provided default nodeclass nodePool.Spec.Template.Spec.NodeClassRef = &v1.NodeClassReference{ - Name: nodeClass.Name, - Kind: object.GVK(nodeClass).Kind, - Group: object.GVK(nodeClass).Group, - } - nodePool.Spec.Template.Spec.Requirements = []v1.NodeSelectorRequirementWithMinValues{ - { - NodeSelectorRequirement: corev1.NodeSelectorRequirement{ - Key: corev1.LabelOSStable, - Operator: corev1.NodeSelectorOpIn, - Values: []string{string(corev1.Linux)}, - }, - }, - { - NodeSelectorRequirement: corev1.NodeSelectorRequirement{ - Key: v1.CapacityTypeLabelKey, - Operator: corev1.NodeSelectorOpIn, - Values: []string{v1.CapacityTypeOnDemand}, - }, - }, + Kind: env.DefaultNodeClass.GetObjectKind().GroupVersionKind().Kind, + Group: env.DefaultNodeClass.GetObjectKind().GroupVersionKind().Group, + Name: env.DefaultNodeClass.GetName(), } - nodePool.Spec.Disruption.ConsolidateAfter = v1.MustParseNillableDuration("Never") - nodePool.Spec.Template.Spec.ExpireAfter.Duration = nil - nodePool.Spec.Limits = v1.Limits(corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("1000"), - corev1.ResourceMemory: resource.MustParse("1000Gi"), - }) + return nodePool } + +func decodeNodeClass() *unstructured.Unstructured { + // Open the file + if lo.FromPtr(nodeClassPath) == "" { + return 
object.Unmarshal[unstructured.Unstructured](defaultNodeClass) + } + + file := lo.Must1(os.Open(lo.FromPtr(nodeClassPath))) + content := lo.Must1(io.ReadAll(file)) + + decoder := serializeryaml.NewDecodingSerializer(unstructured.UnstructuredJSONScheme) + u := &unstructured.Unstructured{} + _, gvk, _ := decoder.Decode(content, nil, u) + u.SetGroupVersionKind(lo.FromPtr(gvk)) + + return u +} diff --git a/test/pkg/environment/common/setup.go b/test/pkg/environment/common/setup.go index c5f557e368..cebe2f2f88 100644 --- a/test/pkg/environment/common/setup.go +++ b/test/pkg/environment/common/setup.go @@ -36,7 +36,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/apiutil" - "sigs.k8s.io/karpenter/kwok/apis/v1alpha1" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/test" "sigs.k8s.io/karpenter/pkg/utils/pod" @@ -52,7 +51,6 @@ var ( &v1.NodePoolList{}, &corev1.NodeList{}, &v1.NodeClaimList{}, - &v1alpha1.KWOKNodeClassList{}, } CleanableObjects = []client.Object{ &corev1.Pod{}, @@ -67,7 +65,6 @@ var ( &schedulingv1.PriorityClass{}, &corev1.Node{}, &v1.NodeClaim{}, - &v1alpha1.KWOKNodeClass{}, } ) @@ -98,7 +95,7 @@ func (env *Environment) ExpectCleanCluster() { Expect(pods.Items[i].Namespace).ToNot(Equal("default"), fmt.Sprintf("expected no pods in the `default` namespace, found %s/%s", pods.Items[i].Namespace, pods.Items[i].Name)) } - for _, obj := range []client.Object{&v1.NodePool{}, &v1alpha1.KWOKNodeClass{}} { + for _, obj := range []client.Object{&v1.NodePool{}, env.DefaultNodeClass.DeepCopy()} { metaList := &metav1.PartialObjectMetadataList{} gvk := lo.Must(apiutil.GVKForObject(obj, env.Client.Scheme())) metaList.SetGroupVersionKind(gvk) @@ -125,7 +122,9 @@ func (env *Environment) AfterEach() { } func (env *Environment) PrintCluster() { - for _, obj := range ObjectListsToPrint { + nodeClassList := unstructured.UnstructuredList{} + nodeClassList.SetGroupVersionKind(env.DefaultNodeClass.GroupVersionKind()) + for _, obj := range append(ObjectListsToPrint, nodeClassList.DeepCopy()) { gvk := lo.Must(apiutil.GVKForObject(obj, env.Client.Scheme())) By(fmt.Sprintf("printing %s(s)", gvk.Kind)) list := &unstructured.UnstructuredList{} @@ -142,7 +141,7 @@ func (env *Environment) PrintCluster() { func (env *Environment) CleanupObjects(cleanableObjects ...client.Object) { time.Sleep(time.Second) // wait one second to let the caches get up-to-date for deletion wg := sync.WaitGroup{} - for _, obj := range cleanableObjects { + for _, obj := range append(cleanableObjects, env.DefaultNodeClass.DeepCopy()) { wg.Add(1) go func(obj client.Object) { defer wg.Done() @@ -163,7 +162,7 @@ func (env *Environment) CleanupObjects(cleanableObjects ...client.Object) { // If the deletes eventually succeed, we should have no elements here at the end of the test g.Expect(env.Client.List(env, metaList, client.HasLabels([]string{test.DiscoveryLabel}), client.Limit(1))).To(Succeed()) g.Expect(metaList.Items).To(HaveLen(0)) - }).Should(Succeed()) + }).WithTimeout(10 * time.Minute).Should(Succeed()) }(obj) } wg.Wait() diff --git a/test/suites/perf/scheduling_test.go b/test/suites/perf/scheduling_test.go index f15928b335..84423c1b1e 100644 --- a/test/suites/perf/scheduling_test.go +++ b/test/suites/perf/scheduling_test.go @@ -86,7 +86,7 @@ var _ = Describe("Performance", func() { nodeClaims := &v1.NodeClaimList{} g.Expect(env.Client.List(env, nodeClaims, client.MatchingFields{"status.conditions[*].type": v1.ConditionTypeDrifted})).To(Succeed()) 
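The setup changes above stop hard-coding KWOKNodeClass and instead handle whatever NodeClass kind the decoded default carries by working with unstructured objects. A sketch of listing an arbitrary node class kind that way with controller-runtime; the package and function names here are hypothetical:

// Package envutil sketches listing arbitrary NodeClass kinds via unstructured types.
package envutil

import (
	"context"

	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// ListNodeClasses lists every object of the given node class kind without
// compiling against a provider-specific Go type, mirroring how the test
// environment cleans up and prints whatever default NodeClass was decoded.
func ListNodeClasses(ctx context.Context, c client.Client, gvk schema.GroupVersionKind) ([]unstructured.Unstructured, error) {
	list := &unstructured.UnstructuredList{}
	// By convention the list kind is the item kind plus the "List" suffix,
	// e.g. karpenter.kwok.sh/v1alpha1 KWOKNodeClass -> KWOKNodeClassList.
	list.SetGroupVersionKind(gvk.GroupVersion().WithKind(gvk.Kind + "List"))
	if err := c.List(ctx, list); err != nil {
		return nil, err
	}
	return list.Items, nil
}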
g.Expect(len(nodeClaims.Items)).To(Equal(0)) - }).WithTimeout(3 * time.Minute).Should(Succeed()) + }).WithTimeout(10 * time.Minute).Should(Succeed()) env.TimeIntervalCollector.End("Drift") }) It("should do complex provisioning", func() { @@ -148,7 +148,7 @@ var _ = Describe("Performance", func() { nodeClaims := &v1.NodeClaimList{} g.Expect(env.Client.List(env, nodeClaims, client.MatchingFields{"status.conditions[*].type": v1.ConditionTypeDrifted})).To(Succeed()) g.Expect(len(nodeClaims.Items)).To(Equal(0)) - }).WithTimeout(3 * time.Minute).Should(Succeed()) + }).WithTimeout(10 * time.Minute).Should(Succeed()) env.TimeIntervalCollector.End("Drift") }) }) diff --git a/test/suites/perf/suite_test.go b/test/suites/perf/suite_test.go index 0d150ca0c5..d7c973e990 100644 --- a/test/suites/perf/suite_test.go +++ b/test/suites/perf/suite_test.go @@ -24,6 +24,7 @@ import ( . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/labels" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/karpenter/kwok/apis/v1alpha1" @@ -34,7 +35,7 @@ import ( ) var nodePool *v1.NodePool -var nodeClass *v1alpha1.KWOKNodeClass +var nodeClass client.Object var env *common.Environment var testLabels = map[string]string{ @@ -61,7 +62,7 @@ func TestPerf(t *testing.T) { var _ = BeforeEach(func() { env.BeforeEach() - nodeClass = env.DefaultNodeClass() + nodeClass = env.DefaultNodeClass.DeepCopy() nodePool = env.DefaultNodePool(nodeClass) test.ReplaceRequirements(nodePool, v1.NodeSelectorRequirementWithMinValues{ NodeSelectorRequirement: corev1.NodeSelectorRequirement{