Skip to content

Commit

Permalink
fix: don't crash when vmss not present or has no nodes
Browse files Browse the repository at this point in the history
  • Loading branch information
comtalyst committed Jan 16, 2025
1 parent ea52310 commit 97dd5fe
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 16 deletions.
2 changes: 1 addition & 1 deletion cluster-autoscaler/cloudprovider/azure/azure_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ func (m *azureCache) regenerate() error {
if err != nil {
return err
}
klog.V(4).Infof("regenerate: found nodes for node group %s: %+v", ng.Id(), instances)
klog.V(4).Infof("regenerate: found %d nodes for node group %s: %+v", len(instances), ng.Id(), instances)

for _, instance := range instances {
ref := azureRef{Name: instance.Id}
Expand Down
43 changes: 43 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1249,6 +1249,49 @@ func TestGetScaleSetOptions(t *testing.T) {
assert.Equal(t, *opts, defaultOptions)
}

// TestVMSSNotFound ensures that AzureManager is still able to be built
// if one nodeGroup (VMSS) is not found. Previously, we would fail on manager creation
// if even one expected nodeGroup was not found. When manager creation errored out,
// BuildAzure returns log.Fatalf() which caused CAS to crash.
func TestVMSSNotFound(t *testing.T) {
// client setup
ctrl := gomock.NewController(t)
defer ctrl.Finish()
mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
mockVMClient := mockvmclient.NewMockInterface(ctrl)
mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
client := azClient{}
client.virtualMachineScaleSetsClient = mockVMSSClient
client.virtualMachinesClient = mockVMClient
client.virtualMachineScaleSetVMsClient = mockVMSSVMClient

// Expect that no vmss are present in the vmss client
mockVMSSVMClient.EXPECT().List(gomock.Any(), "fakeId", testASG, gomock.Any()).Return([]compute.VirtualMachineScaleSetVM{}, nil).AnyTimes()
mockVMClient.EXPECT().List(gomock.Any(), "fakeId").Return([]compute.VirtualMachine{}, nil).AnyTimes()
mockVMSSClient.EXPECT().List(gomock.Any(), "fakeId").Return([]compute.VirtualMachineScaleSet{}, nil).AnyTimes()

// Add explicit node group to look for during init
ngdo := cloudprovider.NodeGroupDiscoveryOptions{
NodeGroupSpecs: []string{
fmt.Sprintf("%d:%d:%s", 1, 3, testASG),
},
}

// We expect the initial BuildAzure flow to pass when a NodeGroup is detected
// that doesn't have a corresponding VMSS in the cache.
t.Run("should not error when VMSS not found in cache", func(t *testing.T) {
manager, err := createAzureManagerInternal(strings.NewReader(validAzureCfg), ngdo, &client)
assert.NoError(t, err)
// expect one nodegroup to be present
nodeGroups := manager.getNodeGroups()
assert.Len(t, nodeGroups, 1)
assert.Equal(t, nodeGroups[0].Id(), testASG)
// expect no scale sets to be present
scaleSets := manager.azureCache.getScaleSets()
assert.Len(t, scaleSets, 0)
})
}

func assertStructsMinimallyEqual(t *testing.T, struct1, struct2 interface{}) bool {
return compareStructFields(t, reflect.ValueOf(struct1), reflect.ValueOf(struct2))
}
Expand Down
55 changes: 40 additions & 15 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,12 @@ func (scaleSet *ScaleSet) Autoprovisioned() bool {
func (scaleSet *ScaleSet) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
template, err := scaleSet.getVMSSFromCache()
if err != nil {
return nil, err
klog.Errorf("failed to get information for VMSS: %s", scaleSet.Name)
// Note: We don't return an error here and instead accept defaults.
// Every invocation of GetOptions() returns an error if this condition is met:
// `if err != nil && err != cloudprovider.ErrNotImplemented`
// The error return value is intended to only capture unimplemented.
return nil, nil
}
return scaleSet.manager.GetScaleSetOptions(*template.Name, defaults), nil
}
Expand All @@ -187,14 +192,14 @@ func (scaleSet *ScaleSet) getVMSSFromCache() (compute.VirtualMachineScaleSet, er
return allVMSS[scaleSet.Name], nil
}

func (scaleSet *ScaleSet) getCurSize() (int64, error) {
func (scaleSet *ScaleSet) getCurSize() (int64, *GetVMSSFailedError) {
scaleSet.sizeMutex.Lock()
defer scaleSet.sizeMutex.Unlock()

set, err := scaleSet.getVMSSFromCache()
if err != nil {
klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, err)
return -1, err
return -1, newGetVMSSFailedError(err, true)
}

// // Remove check for returning in-memory size when VMSS is in updating state
Expand Down Expand Up @@ -231,7 +236,7 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) {
set, rerr = scaleSet.manager.azClient.virtualMachineScaleSetsClient.Get(ctx, scaleSet.manager.config.ResourceGroup, scaleSet.Name)
if rerr != nil {
klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, rerr)
return -1, err
return -1, newGetVMSSFailedError(rerr.Error(), rerr.IsNotFound())
}
}

Expand All @@ -253,12 +258,11 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) {

// getScaleSetSize gets Scale Set size.
func (scaleSet *ScaleSet) getScaleSetSize() (int64, error) {
// First, get the size of the ScaleSet reported by API
// -1 indiciates the ScaleSet hasn't been initialized
size, err := scaleSet.getCurSize()
if size == -1 || err != nil {
klog.V(3).Infof("getScaleSetSize: either size is -1 (actual: %d) or error exists (actual err:%v)", size, err)
return size, err
// First, get the current size of the ScaleSet
size, getVMSSError := scaleSet.getCurSize()
if size == -1 || getVMSSError != nil {
klog.V(3).Infof("getScaleSetSize: either size is -1 (actual: %d) or error exists (actual err:%v)", size, getVMSSError.error)
return size, getVMSSError.error
}
return size, nil
}
Expand Down Expand Up @@ -666,10 +670,13 @@ func (scaleSet *ScaleSet) TemplateNodeInfo() (*framework.NodeInfo, error) {

// Nodes returns a list of all nodes that belong to this node group.
func (scaleSet *ScaleSet) Nodes() ([]cloudprovider.Instance, error) {
curSize, err := scaleSet.getCurSize()
if err != nil {
klog.Errorf("Failed to get current size for vmss %q: %v", scaleSet.Name, err)
return nil, err
curSize, getVMSSError := scaleSet.getCurSize()
if getVMSSError != nil {
klog.Errorf("Failed to get current size for vmss %q: %v", scaleSet.Name, getVMSSError.error)
if getVMSSError.notFound {
return []cloudprovider.Instance{}, nil // Don't return error if VMSS not found
}
return nil, getVMSSError.error // We want to return error if other errors occur.
}

scaleSet.instanceMutex.Lock()
Expand All @@ -682,7 +689,7 @@ func (scaleSet *ScaleSet) Nodes() ([]cloudprovider.Instance, error) {
}

// Forcefully updating the instanceCache as the instanceCacheSize didn't match curSize or cache is invalid.
err = scaleSet.updateInstanceCache()
err := scaleSet.updateInstanceCache()
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -901,3 +908,21 @@ func (scaleSet *ScaleSet) verifyNodeGroup(instance *azureRef, commonNgID string)
}
return nil
}

// GetVMSSFailedError is used to differentiate between
// NotFound and other errors
type GetVMSSFailedError struct {
notFound bool
error error
}

func newGetVMSSFailedError(error error, notFound bool) *GetVMSSFailedError {
return &GetVMSSFailedError{
error: error,
notFound: notFound,
}
}

func (v *GetVMSSFailedError) Error() string {
return v.error.Error()
}

0 comments on commit 97dd5fe

Please sign in to comment.