From 9623899a3862f30812e0680ecb9577f534b49e3a Mon Sep 17 00:00:00 2001 From: Lorenz Bischof Date: Tue, 17 Jul 2018 10:48:44 +0200 Subject: [PATCH 1/7] Check capacity when node notready before alerting --- daemon/client/checks/openshift.go | 71 +++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 18 deletions(-) diff --git a/daemon/client/checks/openshift.go b/daemon/client/checks/openshift.go index e83cc26..2811513 100644 --- a/daemon/client/checks/openshift.go +++ b/daemon/client/checks/openshift.go @@ -6,6 +6,7 @@ import ( "fmt" "log" "os/exec" + "regexp" "strconv" "strings" "time" @@ -36,34 +37,68 @@ func CheckMasterApis(urls string) error { func CheckOcGetNodes() error { log.Println("Checking oc get nodes output") - out, err := runOcGetNodes() + availablePodHardLimit, err := getAvailablePodHardLimit() if err != nil { return err } - if strings.Contains(out, "NotReady") { - // Wait a few seconds and see if still NotReady - // to avoid wrong alerts - time.Sleep(10 * time.Second) - - out2, err := runOcGetNodes() + var notReadyCount int + var out string + for i := 0; i < 5; i++ { + out, err := runOcGetNodes() if err != nil { return err } - if strings.Contains(out2, "NotReady") { - time.Sleep(10 * time.Second) - - out3, err := runOcGetNodes() - if err != nil { - return err - } - if strings.Contains(out3, "NotReady") { - return errors.New("Some node is not ready! 'oc get nodes' output contained NotReady. Output: " + out3) - } + notReadyCount = nodesNotReady(out) + if notReadyCount*100 < availablePodHardLimit { + return nil } + // wait a few seconds and then check again + time.Sleep(10 * time.Second) } + return fmt.Errorf("%v nodes are not ready! AvailablePodHardLimit: %v (notReady*100>=hardLimit) Output: %v", notReadyCount, availablePodHardLimit, out) +} - return nil +func getAvailablePodHardLimit() (int, error) { + totalPods, err := getTotalPods() + if err != nil { + return 0, err + } + totalCapacity, err := getTotalPodCapacity() + if err != nil { + return 0, err + } + return totalCapacity - totalPods, nil +} + +func nodesNotReady(output string) int { + r := regexp.MustCompile("NotReady") + matches := r.FindAllStringIndex(output, -1) + return len(matches) +} + +func getTotalPods() (int, error) { + out, err := exec.Command("bash", "-c", "oc get pods --all-namespaces | grep -v Error | grep -v Completed | wc -l").Output() + if err != nil { + return 0, errors.New("Could not parse oc get pods output: " + err.Error()) + } + i, err := strconv.Atoi(string(out)) + if err != nil { + return 0, errors.New("Could not parse oc get pods output: " + err.Error()) + } + return i, nil +} + +func getTotalPodCapacity() (int, error) { + out, err := exec.Command("bash", "-c", "oc describe nodes -l purpose=workingnode | grep Capacity -A4 | grep pods | awk '{ print $2 }' | paste -sd+ | bc").Output() + if err != nil { + return 0, errors.New("Could not parse oc describe nodes output: " + err.Error()) + } + i, err := strconv.Atoi(string(out)) + if err != nil { + return 0, errors.New("Could not parse oc describe nodes output: " + err.Error()) + } + return i, nil } func runOcGetNodes() (string, error) { From a3edcc95bd680b08f5a72716523a426a4a43fbf1 Mon Sep 17 00:00:00 2001 From: lbischof Date: Mon, 10 Sep 2018 12:07:11 +0200 Subject: [PATCH 2/7] fmt --- daemon/client/checks/openshift.go | 38 +++++++++++++++---------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/daemon/client/checks/openshift.go b/daemon/client/checks/openshift.go index a7cc303..0f971b5 100644 --- a/daemon/client/checks/openshift.go +++ b/daemon/client/checks/openshift.go @@ -63,24 +63,24 @@ func CheckOcGetNodes(buildNodes bool) error { func CheckOcGetNodesRelaxed() error { log.Println("Checking oc get nodes output") - availablePodHardLimit, err := getAvailablePodHardLimit() - if err != nil { - return err - } + availablePodHardLimit, err := getAvailablePodHardLimit() + if err != nil { + return err + } - var notReadyCount int + var notReadyCount int var out string for i := 0; i < 5; i++ { out, err := runOcGetNodes(false) if err != nil { return err } - notReadyCount = nodesNotReady(out) - if notReadyCount*100 < availablePodHardLimit { - return nil - } - // wait a few seconds and then check again - time.Sleep(10 * time.Second) + notReadyCount = nodesNotReady(out) + if notReadyCount*100 < availablePodHardLimit { + return nil + } + // wait a few seconds and then check again + time.Sleep(10 * time.Second) } return errors.New("Capacity overload! Workernode " + getNotReadyNodeNames(out) + " is not ready! 'oc get nodes' output contained NotReady. Output: " + out) } @@ -116,7 +116,7 @@ func getTotalPods() (int, error) { } func getTotalPodCapacity(output string) (int, error) { - out, err := exec.Command("bash", "-c", "oc describe nodes " + getReadyWorkingNodeNames(output) + " | grep Capacity -A4 | grep pods | awk '{ print $2 }' | paste -sd+ | bc").Output() + out, err := exec.Command("bash", "-c", "oc describe nodes "+getReadyWorkingNodeNames(output)+" | grep Capacity -A4 | grep pods | awk '{ print $2 }' | paste -sd+ | bc").Output() if err != nil { return 0, errors.New("Could not parse oc describe nodes output: " + err.Error()) } @@ -144,14 +144,14 @@ func getReadyWorkingNodeNames(out string) string { var ReadyWorkingNodes []string for _, line := range lines { if strings.Contains(line, "NotReady") { - continue - } + continue + } if strings.Contains(line, "SchedulingDisabled") { - continue - } + continue + } if strings.Contains(line, "purpose=buildnode") { - continue - } + continue + } s := strings.Fields(line)[0] ReadyWorkingNodes = append(ReadyWorkingNodes, s) @@ -159,8 +159,6 @@ func getReadyWorkingNodeNames(out string) string { return strings.Join(ReadyWorkingNodes, " ") } - - func runOcGetNodes(buildNodes bool) (string, error) { buildNodes_grep_params := "-v" if buildNodes { From 195c934543ec34a96e3535b8997f0ab8dfa18672 Mon Sep 17 00:00:00 2001 From: lbischof Date: Mon, 10 Sep 2018 12:11:07 +0200 Subject: [PATCH 3/7] Fix errors --- daemon/client/checks/openshift.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/daemon/client/checks/openshift.go b/daemon/client/checks/openshift.go index 0f971b5..e7b2040 100644 --- a/daemon/client/checks/openshift.go +++ b/daemon/client/checks/openshift.go @@ -63,11 +63,6 @@ func CheckOcGetNodes(buildNodes bool) error { func CheckOcGetNodesRelaxed() error { log.Println("Checking oc get nodes output") - availablePodHardLimit, err := getAvailablePodHardLimit() - if err != nil { - return err - } - var notReadyCount int var out string for i := 0; i < 5; i++ { @@ -76,6 +71,10 @@ func CheckOcGetNodesRelaxed() error { return err } notReadyCount = nodesNotReady(out) + availablePodHardLimit, err := getAvailablePodHardLimit(out) + if err != nil { + return err + } if notReadyCount*100 < availablePodHardLimit { return nil } From f78bbb1cabc2807b9610854a6de98cab46d683c1 Mon Sep 17 00:00:00 2001 From: lbischof Date: Mon, 10 Sep 2018 14:05:18 +0200 Subject: [PATCH 4/7] Fix checks --- daemon/client/checks/openshift.go | 26 ++++++++++++++++---------- daemon/client/handlers/major.go | 5 +++++ 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/daemon/client/checks/openshift.go b/daemon/client/checks/openshift.go index e7b2040..8481ebd 100644 --- a/daemon/client/checks/openshift.go +++ b/daemon/client/checks/openshift.go @@ -55,7 +55,7 @@ func CheckOcGetNodes(buildNodes bool) error { if buildNodes { purpose = "Buildnode " } else { - purpose = "Workernode " + purpose = "Workingnode " } return errors.New(purpose + getNotReadyNodeNames(out) + " is not ready! 'oc get nodes' output contained NotReady. Output: " + out) } @@ -64,14 +64,16 @@ func CheckOcGetNodesRelaxed() error { log.Println("Checking oc get nodes output") var notReadyCount int + var availablePodHardLimit int var out string + var err error for i := 0; i < 5; i++ { - out, err := runOcGetNodes(false) + out, err = runOcGetNodes(false) if err != nil { return err } notReadyCount = nodesNotReady(out) - availablePodHardLimit, err := getAvailablePodHardLimit(out) + availablePodHardLimit, err = getAvailablePodHardLimit(out) if err != nil { return err } @@ -81,7 +83,7 @@ func CheckOcGetNodesRelaxed() error { // wait a few seconds and then check again time.Sleep(10 * time.Second) } - return errors.New("Capacity overload! Workernode " + getNotReadyNodeNames(out) + " is not ready! 'oc get nodes' output contained NotReady. Output: " + out) + return fmt.Errorf("Capacity overload! Workernode %v is not ready! AvailablePodHardLimit: %v 'oc get nodes' output contained NotReady. Output: %v", getNotReadyNodeNames(out), availablePodHardLimit, out) } func getAvailablePodHardLimit(output string) (int, error) { @@ -107,7 +109,8 @@ func getTotalPods() (int, error) { if err != nil { return 0, errors.New("Could not parse oc get pods output: " + err.Error()) } - i, err := strconv.Atoi(string(out)) + trimmed := strings.TrimSpace(string(out)) + i, err := strconv.Atoi(trimmed) if err != nil { return 0, errors.New("Could not parse oc get pods output: " + err.Error()) } @@ -119,7 +122,8 @@ func getTotalPodCapacity(output string) (int, error) { if err != nil { return 0, errors.New("Could not parse oc describe nodes output: " + err.Error()) } - i, err := strconv.Atoi(string(out)) + trimmed := strings.TrimSpace(string(out)) + i, err := strconv.Atoi(trimmed) if err != nil { return 0, errors.New("Could not parse oc describe nodes output: " + err.Error()) } @@ -142,6 +146,9 @@ func getReadyWorkingNodeNames(out string) string { lines := strings.Split(out, "\n") var ReadyWorkingNodes []string for _, line := range lines { + if line == "" { + continue + } if strings.Contains(line, "NotReady") { continue } @@ -151,9 +158,8 @@ func getReadyWorkingNodeNames(out string) string { if strings.Contains(line, "purpose=buildnode") { continue } - s := strings.Fields(line)[0] - ReadyWorkingNodes = append(ReadyWorkingNodes, s) - + s := strings.Fields(line) + ReadyWorkingNodes = append(ReadyWorkingNodes, s[0]) } return strings.Join(ReadyWorkingNodes, " ") } @@ -163,7 +169,7 @@ func runOcGetNodes(buildNodes bool) (string, error) { if buildNodes { buildNodes_grep_params = "" } - out, err := exec.Command("bash", "-c", fmt.Sprintf("oc get nodes --show-labels | grep -v monitoring=false | grep -v SchedulingDisabled | grep %s purpose=buildnode || test $? -eq 1", buildNodes_grep_params)).Output() + out, err := exec.Command("bash", "-c", fmt.Sprintf("oc get nodes --show-labels --no-headers | grep -v monitoring=false | grep -v SchedulingDisabled | grep %s purpose=buildnode || test $? -eq 1", buildNodes_grep_params)).Output() if err != nil { msg := "Could not parse oc get nodes output: " + err.Error() log.Println(msg) diff --git a/daemon/client/handlers/major.go b/daemon/client/handlers/major.go index 2e30b82..27e4369 100644 --- a/daemon/client/handlers/major.go +++ b/daemon/client/handlers/major.go @@ -57,6 +57,11 @@ func HandleMajorChecks(daemonType string, w http.ResponseWriter, r *http.Request errors = append(errors, err.Error()) } + // check notready working nodes but only alert if no more capacity is available + if err := checks.CheckOcGetNodesRelaxed(false); err != nil { + errors = append(errors, err.Error()) + } + if err := checks.CheckEtcdHealth(etcdIps, ""); err != nil { errors = append(errors, err.Error()) } From a6da1059086b7e3f02098b2d07b59a5fc5b653c2 Mon Sep 17 00:00:00 2001 From: lbischof Date: Mon, 10 Sep 2018 14:12:35 +0200 Subject: [PATCH 5/7] Add max pods config option --- daemon/client/checks/common.go | 7 +++++++ daemon/client/checks/openshift.go | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/daemon/client/checks/common.go b/daemon/client/checks/common.go index da92679..49476d3 100644 --- a/daemon/client/checks/common.go +++ b/daemon/client/checks/common.go @@ -132,6 +132,13 @@ func getIpsForName(n string) []net.IP { return ips } +func getEnv(key, fallback string) string { + if value, ok := os.LookupEnv(key); ok { + return value + } + return fallback +} + func checkHttp(toCall string) error { log.Println("Checking access to:", toCall) if strings.HasPrefix(toCall, "https") { diff --git a/daemon/client/checks/openshift.go b/daemon/client/checks/openshift.go index 8481ebd..ebda8fe 100644 --- a/daemon/client/checks/openshift.go +++ b/daemon/client/checks/openshift.go @@ -77,7 +77,7 @@ func CheckOcGetNodesRelaxed() error { if err != nil { return err } - if notReadyCount*100 < availablePodHardLimit { + if notReadyCount*getEnv("OPENSHIFT_MAX_PODS", 100) < availablePodHardLimit { return nil } // wait a few seconds and then check again From 58994ac19efda0b03fadc667f84d23f4648bcc2b Mon Sep 17 00:00:00 2001 From: lbischof Date: Mon, 10 Sep 2018 14:17:45 +0200 Subject: [PATCH 6/7] Fix checks --- daemon/client/checks/common.go | 1 + daemon/client/checks/openshift.go | 6 +++++- daemon/client/handlers/major.go | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/daemon/client/checks/common.go b/daemon/client/checks/common.go index 49476d3..7231c79 100644 --- a/daemon/client/checks/common.go +++ b/daemon/client/checks/common.go @@ -7,6 +7,7 @@ import ( "log" "net" "net/http" + "os" "os/exec" "regexp" "strconv" diff --git a/daemon/client/checks/openshift.go b/daemon/client/checks/openshift.go index ebda8fe..f721030 100644 --- a/daemon/client/checks/openshift.go +++ b/daemon/client/checks/openshift.go @@ -77,7 +77,11 @@ func CheckOcGetNodesRelaxed() error { if err != nil { return err } - if notReadyCount*getEnv("OPENSHIFT_MAX_PODS", 100) < availablePodHardLimit { + max_pods, err := strconv.Atoi(getEnv("OPENSHIFT_MAX_PODS", "100")) + if err != nil { + return errors.New("Could not parse OPENSHIFT_MAX_PODS environment variable: " + err.Error()) + } + if notReadyCount*max_pods < availablePodHardLimit { return nil } // wait a few seconds and then check again diff --git a/daemon/client/handlers/major.go b/daemon/client/handlers/major.go index 27e4369..b4bb416 100644 --- a/daemon/client/handlers/major.go +++ b/daemon/client/handlers/major.go @@ -58,7 +58,7 @@ func HandleMajorChecks(daemonType string, w http.ResponseWriter, r *http.Request } // check notready working nodes but only alert if no more capacity is available - if err := checks.CheckOcGetNodesRelaxed(false); err != nil { + if err := checks.CheckOcGetNodesRelaxed(); err != nil { errors = append(errors, err.Error()) } From 834534751fceedd5a7a3ae49653320d409170891 Mon Sep 17 00:00:00 2001 From: lbischof Date: Mon, 10 Sep 2018 14:29:28 +0200 Subject: [PATCH 7/7] Fix typo --- daemon/client/checks/openshift.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/daemon/client/checks/openshift.go b/daemon/client/checks/openshift.go index f721030..0928607 100644 --- a/daemon/client/checks/openshift.go +++ b/daemon/client/checks/openshift.go @@ -87,7 +87,7 @@ func CheckOcGetNodesRelaxed() error { // wait a few seconds and then check again time.Sleep(10 * time.Second) } - return fmt.Errorf("Capacity overload! Workernode %v is not ready! AvailablePodHardLimit: %v 'oc get nodes' output contained NotReady. Output: %v", getNotReadyNodeNames(out), availablePodHardLimit, out) + return fmt.Errorf("Capacity overload! Workingnode %v is not ready! AvailablePodHardLimit: %v 'oc get nodes' output contained NotReady. Output: %v", getNotReadyNodeNames(out), availablePodHardLimit, out) } func getAvailablePodHardLimit(output string) (int, error) {