diff --git a/go.mod b/go.mod index 2dd1bcea..37a31cd0 100644 --- a/go.mod +++ b/go.mod @@ -12,13 +12,16 @@ require ( github.com/prometheus/client_model v0.3.0 github.com/stretchr/testify v1.8.2 k8s.io/api v0.26.2 - k8s.io/apiextensions-apiserver v0.25.1 + k8s.io/apiextensions-apiserver v0.26.1 k8s.io/apimachinery v0.26.2 k8s.io/apiserver v0.26.2 k8s.io/client-go v0.26.2 k8s.io/klog/v2 v2.90.1 k8s.io/metrics v0.26.2 + k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5 + sigs.k8s.io/controller-runtime v0.14.6 sigs.k8s.io/custom-metrics-apiserver v0.0.0 + sigs.k8s.io/structured-merge-diff/v4 v4.2.3 ) replace sigs.k8s.io/custom-metrics-apiserver => sigs.k8s.io/custom-metrics-apiserver v1.25.1-0.20230306170449-63d8c93851f3 @@ -85,7 +88,7 @@ require ( go.opentelemetry.io/proto/otlp v0.19.0 // indirect go.uber.org/atomic v1.7.0 // indirect go.uber.org/multierr v1.6.0 // indirect - go.uber.org/zap v1.19.0 // indirect + go.uber.org/zap v1.24.0 // indirect golang.org/x/crypto v0.1.0 // indirect golang.org/x/net v0.8.0 // indirect golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b // indirect @@ -93,7 +96,7 @@ require ( golang.org/x/sys v0.6.0 // indirect golang.org/x/term v0.6.0 // indirect golang.org/x/text v0.8.0 // indirect - golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect + golang.org/x/time v0.3.0 // indirect google.golang.org/appengine v1.6.7 // indirect google.golang.org/genproto v0.0.0-20220502173005-c8bf987b8c21 // indirect google.golang.org/grpc v1.49.0 // indirect @@ -107,9 +110,7 @@ require ( k8s.io/component-base v0.26.2 // indirect k8s.io/kms v0.26.2 // indirect k8s.io/kube-openapi v0.0.0-20230303024457-afdc3dddf62d // indirect - k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5 // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.35 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect sigs.k8s.io/yaml v1.3.0 // indirect ) diff --git a/go.sum b/go.sum index 47713209..52d5f3f0 100644 --- a/go.sum +++ b/go.sum @@ -47,7 +47,6 @@ github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kd github.com/antlr/antlr4/runtime/Go/antlr v1.4.10 h1:yL7+Jz0jTC6yykIK/Wh74gnTJnrGr5AyrNMXuA0gves= github.com/antlr/antlr4/runtime/Go/antlr v1.4.10/go.mod h1:F7bn7fEU90QkQ3tnmaTx3LTKLEDqnwWODIYppRQ5hnY= github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8= -github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -388,13 +387,12 @@ go.opentelemetry.io/proto/otlp v0.19.0 h1:IVN6GR+mhC4s5yfcTbmzHYODqvWAp3ZedA2SJP go.opentelemetry.io/proto/otlp v0.19.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= -go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= -go.uber.org/goleak v1.1.12 h1:gZAh5/EyT/HQwlpkCy6wTpqfH9H8Lz8zbm3dZh+OyzA= +go.uber.org/goleak v1.2.0 h1:xqgm/S+aQvhWFTtR0XK3Jvg7z8kGV8P4X14IzwN3Eqk= go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= go.uber.org/zap v1.17.0/go.mod h1:MXVU+bhUf/A7Xi2HNOnopQOrmycQ5Ih87HtOu4q5SSo= -go.uber.org/zap v1.19.0 h1:mZQZefskPPCMIBCSEH0v2/iUqqLrYtaeqwD6FUGUnFE= -go.uber.org/zap v1.19.0/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI= +go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60= +go.uber.org/zap v1.24.0/go.mod h1:2kMP+WWQ8aoFoedH3T2sq6iJ2yDWpHbP0f6MQbS9Gkg= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= @@ -556,8 +554,8 @@ golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 h1:vVKdlvoWBphwdxWKrFZEuM0kGgGLxUOYcY4U/2Vjg44= -golang.org/x/time v0.0.0-20220210224613-90d013bbcef8/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= +golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= @@ -573,7 +571,6 @@ golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgw golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= @@ -741,8 +738,8 @@ honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9 honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= k8s.io/api v0.26.2 h1:dM3cinp3PGB6asOySalOZxEG4CZ0IAdJsrYZXE/ovGQ= k8s.io/api v0.26.2/go.mod h1:1kjMQsFE+QHPfskEcVNgL3+Hp88B80uj0QtSOlj8itU= -k8s.io/apiextensions-apiserver v0.25.1 h1:HEIKlxj6oHaDwHgotEIX/Ld5K/RGuOFwN/TWMiQ5s5s= -k8s.io/apiextensions-apiserver v0.25.1/go.mod h1:67sgnMs2yIO2iV4DpCdS91vlP+pdnVIsG/mz60qRn44= +k8s.io/apiextensions-apiserver v0.26.1 h1:cB8h1SRk6e/+i3NOrQgSFij1B2S0Y0wDoNl66bn8RMI= +k8s.io/apiextensions-apiserver v0.26.1/go.mod h1:AptjOSXDGuE0JICx/Em15PaoO7buLwTs0dGleIHixSM= k8s.io/apimachinery v0.26.2 h1:da1u3D5wfR5u2RpLhE/ZtZS2P7QvDgLZTi9wrNZl/tQ= k8s.io/apimachinery v0.26.2/go.mod h1:ats7nN1LExKHvJ9TmwootT00Yz05MuYqPXEXaVeOy5I= k8s.io/apiserver v0.26.2 h1:Pk8lmX4G14hYqJd1poHGC08G03nIHVqdJMR0SD3IH3o= @@ -766,6 +763,8 @@ rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.35 h1:+xBL5uTc+BkPBwmMi3vYfUJjq+N3K+H6PXeETwf5cPI= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.35/go.mod h1:WxjusMwXlKzfAs4p9km6XJRndVt2FROgMVCE4cdohFo= +sigs.k8s.io/controller-runtime v0.14.6 h1:oxstGVvXGNnMvY7TAESYk+lzr6S3V5VFxQ6d92KcwQA= +sigs.k8s.io/controller-runtime v0.14.6/go.mod h1:WqIdsAY6JBsjfc/CqO0CORmNtoCtE4S6qbPc9s68h+0= sigs.k8s.io/custom-metrics-apiserver v1.25.1-0.20230306170449-63d8c93851f3 h1:puQ5YlyBjhxg+OQ1YPMJXwtk7WhC4E6AlWIQ9pC8jws= sigs.k8s.io/custom-metrics-apiserver v1.25.1-0.20230306170449-63d8c93851f3/go.mod h1:9nUXR/EgdYZto1aQ6yhwOksPR7J979jSyOqic1IgaOo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= diff --git a/pkg/controller/queuejob/metrics.go b/pkg/controller/queuejob/metrics.go new file mode 100644 index 00000000..2ffcd7ae --- /dev/null +++ b/pkg/controller/queuejob/metrics.go @@ -0,0 +1,123 @@ +/* +Copyright 2023 The Multi-Cluster App Dispatcher Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package queuejob + +import ( + arbv1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" + "github.com/prometheus/client_golang/prometheus" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/klog/v2" + "reflect" + "runtime" + "sigs.k8s.io/controller-runtime/pkg/metrics" + "time" +) + +var ( + allocatableCapacityCpu = prometheus.NewGauge(prometheus.GaugeOpts{ + Subsystem: "mcad", + Name: "allocatable_capacity_cpu", + Help: "Allocatable CPU Capacity (in millicores)", + }) + allocatableCapacityMemory = prometheus.NewGauge(prometheus.GaugeOpts{ + Subsystem: "mcad", + Name: "allocatable_capacity_memory", + Help: "Allocatable Memory Capacity", + }) + allocatableCapacityGpu = prometheus.NewGauge(prometheus.GaugeOpts{ + Subsystem: "mcad", + Name: "allocatable_capacity_gpu", + Help: "Allocatable GPU Capacity", + }) + appWrappersCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: "mcad", + Name: "appwrappers_count", + Help: "AppWrappers count per state", + }, []string{"state"}) +) + +func init() { + klog.V(10).Infof("Registering metrics") + metrics.Registry.MustRegister( + allocatableCapacityCpu, + allocatableCapacityMemory, + allocatableCapacityGpu, + appWrappersCount, + ) +} + +func updateMetricsLoop(controller *XController, stopCh <-chan struct{}) { + updateMetricsLoopGeneric(controller, stopCh, time.Minute*1, updateAllocatableCapacity) + updateMetricsLoopGeneric(controller, stopCh, time.Second*5, updateQueue) +} + +func getFuncName(f interface{}) string { + return runtime.FuncForPC(reflect.ValueOf(f).Pointer()).Name() +} + +func updateMetricsLoopGeneric(controller *XController, stopCh <-chan struct{}, d time.Duration, updateFunc func(xController *XController)) { + ticker := time.NewTicker(d) + go func() { + updateFunc(controller) + for { + select { + case <-ticker.C: + if klog.V(10).Enabled() { + klog.Infof("[updateMetricsLoopGeneric] Update metrics loop tick: %v", getFuncName(updateFunc)) + } + updateFunc(controller) + case <-stopCh: + if klog.V(10).Enabled() { + klog.Infof("[updateMetricsLoopGeneric] Exiting update metrics loop: %v", getFuncName(updateFunc)) + } + ticker.Stop() + return + } + } + }() +} + +func updateAllocatableCapacity(controller *XController) { + res := controller.GetAllocatableCapacity() + allocatableCapacityCpu.Set(res.MilliCPU) + allocatableCapacityMemory.Set(res.Memory) + allocatableCapacityGpu.Set(float64(res.GPU)) +} + +func updateQueue(controller *XController) { + awList, err := controller.appWrapperLister.List(labels.Everything()) + + if err != nil { + klog.Errorf("[updateQueue] Unable to obtain the list of AppWrappers: %+v", err) + return + } + stateToAppWrapperCount := map[arbv1.AppWrapperState]int{ + arbv1.AppWrapperStateEnqueued: 0, + arbv1.AppWrapperStateActive: 0, + arbv1.AppWrapperStateDeleted: 0, + arbv1.AppWrapperStateFailed: 0, + arbv1.AppWrapperStateCompleted: 0, + arbv1.AppWrapperStateRunningHoldCompletion: 0, + } + for _, aw := range awList { + state := aw.Status.State + stateToAppWrapperCount[state]++ + } + for state, count := range stateToAppWrapperCount { + appWrappersCount.WithLabelValues(string(state)).Set(float64(count)) + } +} diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 8d356403..26cd0091 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -221,6 +221,10 @@ func (qjm *XController) allocatableCapacity() *clusterstateapi.Resource { return capacity } +func (qjm *XController) GetAllocatableCapacity() *clusterstateapi.Resource { + return qjm.allocatableCapacity() +} + // NewJobController create new AppWrapper Controller func NewJobController(restConfig *rest.Config, mcadConfig *config.MCADConfiguration, extConfig *config.MCADConfigurationExtended) *XController { cc := &XController{ @@ -1456,6 +1460,7 @@ func (cc *XController) Run(stopCh <-chan struct{}) { go cc.appwrapperInformer.Informer().Run(stopCh) cache.WaitForCacheSync(stopCh, cc.appWrapperSynced) + updateMetricsLoop(cc, stopCh) if cc.isDispatcher { go wait.Until(cc.UpdateAgent, 2*time.Second, stopCh) // In the Agent?