From d1936edab08da99f86c3696010ccf332ef3f9f0f Mon Sep 17 00:00:00 2001 From: Robin Deeboonchai Date: Thu, 16 Jan 2025 14:57:20 -0800 Subject: [PATCH] fix: don't crash when vmss not present or has no nodes --- .../cloudprovider/azure/azure_cache.go | 2 +- .../cloudprovider/azure/azure_manager_test.go | 43 +++++++++++++++ .../cloudprovider/azure/azure_scale_set.go | 55 ++++++++++++++----- 3 files changed, 84 insertions(+), 16 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cache.go b/cluster-autoscaler/cloudprovider/azure/azure_cache.go index 14ce9254e2c5..334048164dfc 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_cache.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_cache.go @@ -170,7 +170,7 @@ func (m *azureCache) regenerate() error { if err != nil { return err } - klog.V(4).Infof("regenerate: found nodes for node group %s: %+v", ng.Id(), instances) + klog.V(4).Infof("regenerate: found %d nodes for node group %s: %+v", len(instances), ng.Id(), instances) for _, instance := range instances { ref := azureRef{Name: instance.Id} diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go index b8262a81348a..0733982e2653 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go @@ -1201,6 +1201,49 @@ func TestGetScaleSetOptions(t *testing.T) { assert.Equal(t, *opts, defaultOptions) } +// TestVMSSNotFound ensures that AzureManager is still able to be built +// if one nodeGroup (VMSS) is not found. Previously, we would fail on manager creation +// if even one expected nodeGroup was not found. When manager creation errored out, +// BuildAzure returns log.Fatalf() which caused CAS to crash. +func TestVMSSNotFound(t *testing.T) { + // client setup + ctrl := gomock.NewController(t) + defer ctrl.Finish() + mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) + mockVMClient := mockvmclient.NewMockInterface(ctrl) + mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) + client := azClient{} + client.virtualMachineScaleSetsClient = mockVMSSClient + client.virtualMachinesClient = mockVMClient + client.virtualMachineScaleSetVMsClient = mockVMSSVMClient + + // Expect that no vmss are present in the vmss client + mockVMSSVMClient.EXPECT().List(gomock.Any(), "fakeId", testASG, gomock.Any()).Return([]compute.VirtualMachineScaleSetVM{}, nil).AnyTimes() + mockVMClient.EXPECT().List(gomock.Any(), "fakeId").Return([]compute.VirtualMachine{}, nil).AnyTimes() + mockVMSSClient.EXPECT().List(gomock.Any(), "fakeId").Return([]compute.VirtualMachineScaleSet{}, nil).AnyTimes() + + // Add explicit node group to look for during init + ngdo := cloudprovider.NodeGroupDiscoveryOptions{ + NodeGroupSpecs: []string{ + fmt.Sprintf("%d:%d:%s", 1, 3, testASG), + }, + } + + // We expect the initial BuildAzure flow to pass when a NodeGroup is detected + // that doesn't have a corresponding VMSS in the cache. + t.Run("should not error when VMSS not found in cache", func(t *testing.T) { + manager, err := createAzureManagerInternal(strings.NewReader(validAzureCfg), ngdo, &client) + assert.NoError(t, err) + // expect one nodegroup to be present + nodeGroups := manager.getNodeGroups() + assert.Len(t, nodeGroups, 1) + assert.Equal(t, nodeGroups[0].Id(), testASG) + // expect no scale sets to be present + scaleSets := manager.azureCache.getScaleSets() + assert.Len(t, scaleSets, 0) + }) +} + func assertStructsMinimallyEqual(t *testing.T, struct1, struct2 interface{}) bool { return compareStructFields(t, reflect.ValueOf(struct1), reflect.ValueOf(struct2)) } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index 64d5b93a2c8b..1b57dee63e0f 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -167,7 +167,12 @@ func (scaleSet *ScaleSet) Autoprovisioned() bool { func (scaleSet *ScaleSet) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) { template, err := scaleSet.getVMSSFromCache() if err != nil { - return nil, err + klog.Errorf("failed to get information for VMSS: %s", scaleSet.Name) + // Note: We don't return an error here and instead accept defaults. + // Every invocation of GetOptions() returns an error if this condition is met: + // `if err != nil && err != cloudprovider.ErrNotImplemented` + // The error return value is intended to only capture unimplemented. + return nil, nil } return scaleSet.manager.GetScaleSetOptions(*template.Name, defaults), nil } @@ -187,14 +192,14 @@ func (scaleSet *ScaleSet) getVMSSFromCache() (compute.VirtualMachineScaleSet, er return allVMSS[scaleSet.Name], nil } -func (scaleSet *ScaleSet) getCurSize() (int64, error) { +func (scaleSet *ScaleSet) getCurSize() (int64, *GetVMSSFailedError) { scaleSet.sizeMutex.Lock() defer scaleSet.sizeMutex.Unlock() set, err := scaleSet.getVMSSFromCache() if err != nil { klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, err) - return -1, err + return -1, newGetVMSSFailedError(err, true) } // // Remove check for returning in-memory size when VMSS is in updating state @@ -231,7 +236,7 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) { set, rerr = scaleSet.manager.azClient.virtualMachineScaleSetsClient.Get(ctx, scaleSet.manager.config.ResourceGroup, scaleSet.Name) if rerr != nil { klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, rerr) - return -1, err + return -1, newGetVMSSFailedError(rerr.Error(), rerr.IsNotFound()) } } @@ -253,12 +258,11 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) { // getScaleSetSize gets Scale Set size. func (scaleSet *ScaleSet) getScaleSetSize() (int64, error) { - // First, get the size of the ScaleSet reported by API - // -1 indiciates the ScaleSet hasn't been initialized - size, err := scaleSet.getCurSize() - if size == -1 || err != nil { - klog.V(3).Infof("getScaleSetSize: either size is -1 (actual: %d) or error exists (actual err:%v)", size, err) - return size, err + // First, get the current size of the ScaleSet + size, getVMSSError := scaleSet.getCurSize() + if size == -1 || getVMSSError != nil { + klog.V(3).Infof("getScaleSetSize: either size is -1 (actual: %d) or error exists (actual err:%v)", size, getVMSSError.error) + return size, getVMSSError.error } return size, nil } @@ -657,10 +661,13 @@ func (scaleSet *ScaleSet) TemplateNodeInfo() (*schedulerframework.NodeInfo, erro // Nodes returns a list of all nodes that belong to this node group. func (scaleSet *ScaleSet) Nodes() ([]cloudprovider.Instance, error) { - curSize, err := scaleSet.getCurSize() - if err != nil { - klog.Errorf("Failed to get current size for vmss %q: %v", scaleSet.Name, err) - return nil, err + curSize, getVMSSError := scaleSet.getCurSize() + if getVMSSError != nil { + klog.Errorf("Failed to get current size for vmss %q: %v", scaleSet.Name, getVMSSError.error) + if getVMSSError.notFound { + return []cloudprovider.Instance{}, nil // Don't return error if VMSS not found + } + return nil, getVMSSError.error // We want to return error if other errors occur. } scaleSet.instanceMutex.Lock() @@ -673,7 +680,7 @@ func (scaleSet *ScaleSet) Nodes() ([]cloudprovider.Instance, error) { } // Forcefully updating the instanceCache as the instanceCacheSize didn't match curSize or cache is invalid. - err = scaleSet.updateInstanceCache() + err := scaleSet.updateInstanceCache() if err != nil { return nil, err } @@ -892,3 +899,21 @@ func (scaleSet *ScaleSet) verifyNodeGroup(instance *azureRef, commonNgID string) } return nil } + +// GetVMSSFailedError is used to differentiate between +// NotFound and other errors +type GetVMSSFailedError struct { + notFound bool + error error +} + +func newGetVMSSFailedError(error error, notFound bool) *GetVMSSFailedError { + return &GetVMSSFailedError{ + error: error, + notFound: notFound, + } +} + +func (v *GetVMSSFailedError) Error() string { + return v.error.Error() +}