Skip to content

Commit

Permalink
Merge pull request #10 from oscp/check_capacity_when_node_notready
Browse files Browse the repository at this point in the history
Check capacity when node notready before alerting
  • Loading branch information
c0desurfer authored Sep 20, 2018
2 parents c9d343e + 8345347 commit 7d8b49e
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 3 deletions.
8 changes: 8 additions & 0 deletions daemon/client/checks/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"log"
"net"
"net/http"
"os"
"os/exec"
"regexp"
"strconv"
Expand Down Expand Up @@ -132,6 +133,13 @@ func getIpsForName(n string) []net.IP {
return ips
}

// getEnv returns the value of the environment variable named key, or fallback
// when the variable is not set. A variable that is set to the empty string
// counts as set and yields "".
func getEnv(key, fallback string) string {
	value, found := os.LookupEnv(key)
	if !found {
		return fallback
	}
	return value
}

func checkHttp(toCall string) error {
log.Println("Checking access to:", toCall)
if strings.HasPrefix(toCall, "https") {
Expand Down
102 changes: 99 additions & 3 deletions daemon/client/checks/openshift.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"fmt"
"log"
"os/exec"
"regexp"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -35,7 +36,6 @@ func CheckMasterApis(urls string) error {

func CheckOcGetNodes(buildNodes bool) error {
log.Println("Checking oc get nodes output")

var out string
var err error
for i := 0; i < 5; i++ {
Expand All @@ -55,11 +55,85 @@ func CheckOcGetNodes(buildNodes bool) error {
if buildNodes {
purpose = "Buildnode "
} else {
purpose = "Workernode "
purpose = "Workingnode "
}
return errors.New(purpose + getNotReadyNodeNames(out) + " is not ready! 'oc get nodes' output contained NotReady. Output: " + out)
}

func CheckOcGetNodesRelaxed() error {
log.Println("Checking oc get nodes output")

var notReadyCount int
var availablePodHardLimit int
var out string
var err error
for i := 0; i < 5; i++ {
out, err = runOcGetNodes(false)
if err != nil {
return err
}
notReadyCount = nodesNotReady(out)
availablePodHardLimit, err = getAvailablePodHardLimit(out)
if err != nil {
return err
}
max_pods, err := strconv.Atoi(getEnv("OPENSHIFT_MAX_PODS", "100"))
if err != nil {
return errors.New("Could not parse OPENSHIFT_MAX_PODS environment variable: " + err.Error())
}
if notReadyCount*max_pods < availablePodHardLimit {
return nil
}
// wait a few seconds and then check again
time.Sleep(10 * time.Second)
}
return fmt.Errorf("Capacity overload! Workingnode %v is not ready! AvailablePodHardLimit: %v 'oc get nodes' output contained NotReady. Output: %v", getNotReadyNodeNames(out), availablePodHardLimit, out)
}

// getAvailablePodHardLimit returns the total pod capacity derived from the
// given 'oc get nodes' output minus the current pod count. The pod count is
// fetched first; the first error encountered is returned with a zero result.
func getAvailablePodHardLimit(output string) (int, error) {
	var (
		used, capacity int
		err            error
	)
	if used, err = getTotalPods(); err != nil {
		return 0, err
	}
	if capacity, err = getTotalPodCapacity(output); err != nil {
		return 0, err
	}
	remaining := capacity - used
	return remaining, nil
}

// notReadyPattern matches the literal text "NotReady". It is compiled once at
// package scope so nodesNotReady does not recompile it on every call.
var notReadyPattern = regexp.MustCompile("NotReady")

// nodesNotReady returns the number of non-overlapping occurrences of
// "NotReady" anywhere in output (the match is not restricted to a particular
// column of the node listing).
func nodesNotReady(output string) int {
	return len(notReadyPattern.FindAllStringIndex(output, -1))
}

// getTotalPods returns the number of pods across all namespaces whose listing
// line contains neither "Error" nor "Completed".
//
// The listing is requested with --no-headers so the column header row is not
// counted as a pod, and oc is invoked directly (not through a shell pipeline)
// so that a failing oc call surfaces as an error instead of a count of 0.
func getTotalPods() (int, error) {
	out, err := exec.Command("oc", "get", "pods", "--all-namespaces", "--no-headers").Output()
	if err != nil {
		return 0, errors.New("Could not parse oc get pods output: " + err.Error())
	}

	total := 0
	for _, line := range strings.Split(string(out), "\n") {
		if strings.TrimSpace(line) == "" {
			continue
		}
		// Same exclusions the previous `grep -v Error | grep -v Completed` applied.
		if strings.Contains(line, "Error") || strings.Contains(line, "Completed") {
			continue
		}
		total++
	}
	return total, nil
}

// getTotalPodCapacity sums the pod capacity reported by 'oc describe nodes'
// for the ready working nodes found in output (an 'oc get nodes' listing).
//
// It returns 0 when output contains no ready working node. An error is
// returned when the command fails or its result is not an integer.
func getTotalPodCapacity(output string) (int, error) {
	nodeNames := getReadyWorkingNodeNames(output)
	if nodeNames == "" {
		// Without any node name `oc describe nodes` describes every node in
		// the cluster, which would report capacity that is not actually
		// available on ready working nodes.
		return 0, nil
	}

	out, err := exec.Command("bash", "-c", "oc describe nodes "+nodeNames+" | grep Capacity -A4 | grep pods | awk '{ print $2 }' | paste -sd+ | bc").Output()
	if err != nil {
		return 0, errors.New("Could not parse oc describe nodes output: " + err.Error())
	}
	capacity, err := strconv.Atoi(strings.TrimSpace(string(out)))
	if err != nil {
		return 0, errors.New("Could not parse oc describe nodes output: " + err.Error())
	}
	return capacity, nil
}

func getNotReadyNodeNames(out string) string {
lines := strings.Split(out, "\n")
var notReadyNodes []string
Expand All @@ -72,12 +146,34 @@ func getNotReadyNodeNames(out string) string {
return strings.Join(notReadyNodes, ", ")
}

// getReadyWorkingNodeNames extracts the first whitespace-separated field (the
// node name) from every line of out that does not contain "NotReady",
// "SchedulingDisabled" or "purpose=buildnode", and returns the names joined
// by single spaces. Empty and whitespace-only lines are ignored.
func getReadyWorkingNodeNames(out string) string {
	var readyWorkingNodes []string
	for _, line := range strings.Split(out, "\n") {
		fields := strings.Fields(line)
		if len(fields) == 0 {
			// Covers "" as well as whitespace-only lines (e.g. a stray "\r"),
			// which would otherwise make fields[0] panic.
			continue
		}
		switch {
		case strings.Contains(line, "NotReady"),
			strings.Contains(line, "SchedulingDisabled"),
			strings.Contains(line, "purpose=buildnode"):
			continue
		}
		readyWorkingNodes = append(readyWorkingNodes, fields[0])
	}
	return strings.Join(readyWorkingNodes, " ")
}

func runOcGetNodes(buildNodes bool) (string, error) {
buildNodes_grep_params := "-v"
if buildNodes {
buildNodes_grep_params = ""
}
out, err := exec.Command("bash", "-c", fmt.Sprintf("oc get nodes --show-labels | grep -v monitoring=false | grep -v SchedulingDisabled | grep %s purpose=buildnode || test $? -eq 1", buildNodes_grep_params)).Output()
out, err := exec.Command("bash", "-c", fmt.Sprintf("oc get nodes --show-labels --no-headers | grep -v monitoring=false | grep -v SchedulingDisabled | grep %s purpose=buildnode || test $? -eq 1", buildNodes_grep_params)).Output()
if err != nil {
msg := "Could not parse oc get nodes output: " + err.Error()
log.Println(msg)
Expand Down
5 changes: 5 additions & 0 deletions daemon/client/handlers/major.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ func HandleMajorChecks(daemonType string, w http.ResponseWriter, r *http.Request
errors = append(errors, err.Error())
}

// check notready working nodes but only alert if no more capacity is available
if err := checks.CheckOcGetNodesRelaxed(); err != nil {
errors = append(errors, err.Error())
}

if err := checks.CheckEtcdHealth(etcdIps, ""); err != nil {
errors = append(errors, err.Error())
}
Expand Down

0 comments on commit 7d8b49e

Please sign in to comment.