Add --priority keeper option.

Sentinel will promote a keeper with a higher priority than the current one if
this is possible. In async mode this is a bit non-deterministic because we
always elect the node with the highest LSN, and under heavy load the
prioritized node might never report an LSN higher than its stronger
competitors. However, if the nodes are equal this should happen at some point.
In sync mode, we can just elect any of the synchronous standbys.

Implements sorintlab#492
arssher committed Mar 11, 2019
1 parent e776b3a commit a3b812a
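For orientation, the sketch below illustrates the selection rule described in the commit message: the candidate with the highest reported LSN wins, and keeper priority only breaks ties. This is a simplified, self-contained example; the `candidate` type and `pickNewMaster` helper are illustrative stand-ins (ordered best-first for readability), not the actual sentinel code shown in the diff further down.

```go
package main

import (
	"fmt"
	"sort"
)

// candidate is an illustrative stand-in for a standby DB together with the
// status reported by its keeper: XLogPos is the last reported LSN and
// Priority is the value passed to the keeper via --priority.
type candidate struct {
	UID      string
	XLogPos  uint64
	Priority int
}

// pickNewMaster orders candidates best-first: the highest LSN wins, and on
// equal LSNs the keeper with the higher priority wins. This mirrors the rule
// from the commit message; it is not stolon's actual implementation.
func pickNewMaster(cs []candidate) candidate {
	sort.Slice(cs, func(i, j int) bool {
		if cs[i].XLogPos != cs[j].XLogPos {
			return cs[i].XLogPos > cs[j].XLogPos
		}
		return cs[i].Priority > cs[j].Priority
	})
	return cs[0]
}

func main() {
	cs := []candidate{
		{UID: "db1", XLogPos: 100, Priority: 0},
		{UID: "db2", XLogPos: 100, Priority: 1}, // same LSN, higher priority
		{UID: "db3", XLogPos: 90, Priority: 2},  // higher priority but lagging
	}
	// db2 wins: priority only breaks ties, it never outweighs a higher LSN.
	fmt.Println(pickNewMaster(cs).UID) // db2
}
```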
Showing 8 changed files with 357 additions and 39 deletions.
3 changes: 3 additions & 0 deletions cmd/keeper/cmd/keeper.go
@@ -94,6 +94,7 @@ type config struct {
uid string
dataDir string
debug bool
priority int
pgListenAddress string
pgPort string
pgBinPath string
@@ -134,6 +135,7 @@ func init() {
CmdKeeper.PersistentFlags().StringVar(&cfg.pgSUPassword, "pg-su-password", "", "postgres superuser password. Only one of --pg-su-password or --pg-su-passwordfile must be provided. Must be the same for all keepers.")
CmdKeeper.PersistentFlags().StringVar(&cfg.pgSUPasswordFile, "pg-su-passwordfile", "", "postgres superuser password file. Only one of --pg-su-password or --pg-su-passwordfile must be provided. Must be the same for all keepers)")
CmdKeeper.PersistentFlags().BoolVar(&cfg.debug, "debug", false, "enable debug logging")
CmdKeeper.PersistentFlags().IntVar(&cfg.priority, "priority", 0, "keeper priority, integer. Stolon will promote an available keeper with higher priority than the current master, if this is possible. Default is 0.")

CmdKeeper.PersistentFlags().MarkDeprecated("id", "please use --uid")
CmdKeeper.PersistentFlags().MarkDeprecated("debug", "use --log-level=debug instead")
@@ -533,6 +535,7 @@ func (p *PostgresKeeper) updateKeeperInfo() error {
Maj: maj,
Min: min,
},
Priority: p.cfg.priority,
PostgresState: p.getLastPGState(),
}

108 changes: 71 additions & 37 deletions cmd/sentinel/cmd/sentinel.go
@@ -234,6 +234,7 @@ func (s *Sentinel) updateKeepersStatus(cd *cluster.ClusterData, keepersInfo clus
k.Status.BootUUID = ki.BootUUID
k.Status.PostgresBinaryVersion.Maj = ki.PostgresBinaryVersion.Maj
k.Status.PostgresBinaryVersion.Min = ki.PostgresBinaryVersion.Min
k.Status.Priority = ki.Priority
}
}

@@ -689,12 +690,17 @@ func (s *Sentinel) validStandbysByStatus(cd *cluster.ClusterData) (map[string]*c
return goodStandbys, failedStandbys, convergingStandbys
}

// dbSlice implements sort interface to sort by XLogPos
type dbSlice []*cluster.DB

func (p dbSlice) Len() int { return len(p) }
func (p dbSlice) Less(i, j int) bool { return p[i].Status.XLogPos < p[j].Status.XLogPos }
func (p dbSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
// sort dbs by XLogPos and keeper's priority
func sortDBs(cd *cluster.ClusterData, dbs []*cluster.DB) {
sort.Slice(dbs, func(i, j int) bool {
if dbs[i].Status.XLogPos != dbs[j].Status.XLogPos {
return dbs[i].Status.XLogPos < dbs[j].Status.XLogPos
}
pi := cd.Keepers[dbs[i].Spec.KeeperUID].Status.Priority
pj := cd.Keepers[dbs[j].Spec.KeeperUID].Status.Priority
return pi < pj
})
}

func (s *Sentinel) findBestStandbys(cd *cluster.ClusterData, masterDB *cluster.DB) []*cluster.DB {
goodStandbys, _, _ := s.validStandbysByStatus(cd)
@@ -716,7 +722,7 @@ func (s *Sentinel) findBestStandbys(cd *cluster.ClusterData, masterDB *cluster.D
bestDBs = append(bestDBs, db)
}
// Sort by XLogPos
sort.Sort(dbSlice(bestDBs))
sortDBs(cd, bestDBs)
return bestDBs
}

@@ -746,11 +752,56 @@ func (s *Sentinel) findBestNewMasters(cd *cluster.ClusterData, masterDB *cluster
bestNewMasters = append(bestNewMasters, db)
}
// Sort by XLogPos
sort.Sort(dbSlice(bestNewMasters))
sortDBs(cd, bestNewMasters)
log.Debugf("bestNewMasters: %s", spew.Sdump(bestNewMasters))
return bestNewMasters
}

// Return the DB that can become the new master. This function mostly takes care of
// sync mode; in the async case, the new master is just the first element of findBestNewMasters.
func (s *Sentinel) findBestNewMaster(cd *cluster.ClusterData, curMasterDB *cluster.DB, logErrors bool) *cluster.DB {
bestNewMasters := s.findBestNewMasters(cd, curMasterDB)
if len(bestNewMasters) == 0 {
if logErrors {
log.Errorw("no eligible masters")
}
return nil
}

// if synchronous replication is enabled, only choose new master in the synchronous replication standbys.
var bestNewMasterDB *cluster.DB = nil
if curMasterDB.Spec.SynchronousReplication == true {
commonSyncStandbys := util.CommonElements(curMasterDB.Status.SynchronousStandbys, curMasterDB.Spec.SynchronousStandbys)
if len(commonSyncStandbys) == 0 {
if logErrors {
log.Errorw("cannot choose synchronous standby since there are no common elements between the latest master reported synchronous standbys and the db spec ones", "reported", curMasterDB.Status.SynchronousStandbys, "spec", curMasterDB.Spec.SynchronousStandbys)
}
return nil
}
// In synchronous mode there is no need to choose the DB with the
// highest LSN; all found dbs must be in sync, so pick the one
// with the highest priority.
var newMasterPriority int
for _, nm := range bestNewMasters {
if util.StringInSlice(commonSyncStandbys, nm.UID) {
nmPriority := cd.Keepers[nm.Spec.KeeperUID].Status.Priority
if (bestNewMasterDB == nil) || (nmPriority > newMasterPriority) {
bestNewMasterDB = nm
newMasterPriority = nmPriority
}
}
}
if bestNewMasterDB == nil {
if logErrors {
log.Errorw("cannot choose synchronous standby since there's not match between the possible masters and the usable synchronousStandbys", "reported", curMasterDB.Status.SynchronousStandbys, "spec", curMasterDB.Spec.SynchronousStandbys, "common", commonSyncStandbys, "possibleMasters", bestNewMasters)
}
}
} else {
bestNewMasterDB = bestNewMasters[0]
}
return bestNewMasterDB
}

func (s *Sentinel) updateCluster(cd *cluster.ClusterData, pis cluster.ProxiesInfo) (*cluster.ClusterData, error) {
// take a cd deepCopy to check that the code isn't changing it (it'll be a bug)
origcd := cd.DeepCopy()
@@ -983,37 +1034,20 @@ func (s *Sentinel) updateCluster(cd *cluster.ClusterData, pis cluster.ProxiesInf
masterOK = false
}

if !masterOK {
log.Infow("trying to find a new master to replace failed master")
bestNewMasters := s.findBestNewMasters(newcd, curMasterDB)
if len(bestNewMasters) == 0 {
log.Errorw("no eligible masters")
bestNewMasterDB := s.findBestNewMaster(newcd, curMasterDB, !masterOK)
if bestNewMasterDB != nil {
if !masterOK {
log.Infow("electing db as the new master", "db", bestNewMasterDB.UID, "keeper", bestNewMasterDB.Spec.KeeperUID)
wantedMasterDBUID = bestNewMasterDB.UID
} else {
// if synchronous replication is enabled, only choose new master in the synchronous replication standbys.
var bestNewMasterDB *cluster.DB
if curMasterDB.Spec.SynchronousReplication == true {
commonSyncStandbys := util.CommonElements(curMasterDB.Status.SynchronousStandbys, curMasterDB.Spec.SynchronousStandbys)
if len(commonSyncStandbys) == 0 {
log.Warnw("cannot choose synchronous standby since there are no common elements between the latest master reported synchronous standbys and the db spec ones", "reported", curMasterDB.Status.SynchronousStandbys, "spec", curMasterDB.Spec.SynchronousStandbys)
} else {
for _, nm := range bestNewMasters {
if util.StringInSlice(commonSyncStandbys, nm.UID) {
bestNewMasterDB = nm
break
}
}
if bestNewMasterDB == nil {
log.Warnw("cannot choose synchronous standby since there's not match between the possible masters and the usable synchronousStandbys", "reported", curMasterDB.Status.SynchronousStandbys, "spec", curMasterDB.Spec.SynchronousStandbys, "common", commonSyncStandbys, "possibleMasters", bestNewMasters)
}
}
} else {
bestNewMasterDB = bestNewMasters[0]
}
if bestNewMasterDB != nil {
log.Infow("electing db as the new master", "db", bestNewMasterDB.UID, "keeper", bestNewMasterDB.Spec.KeeperUID)
// Even if the current master is ok, we probably still
// want to change it if there is a ready DB with higher
// keeper priority.
curMasterPriority := cd.Keepers[curMasterDB.Spec.KeeperUID].Status.Priority
newMasterPriority := cd.Keepers[bestNewMasterDB.Spec.KeeperUID].Status.Priority
if newMasterPriority > curMasterPriority {
log.Infow("electing db as the new master because it has higher priority", "db", bestNewMasterDB.UID, "keeper", bestNewMasterDB.Spec.KeeperUID, "currPriority", curMasterPriority, "newPriority", newMasterPriority)
wantedMasterDBUID = bestNewMasterDB.UID
} else {
log.Errorw("no eligible masters")
}
}
}
184 changes: 184 additions & 0 deletions cmd/sentinel/cmd/sentinel_test.go
@@ -4960,6 +4960,190 @@ func TestUpdateCluster(t *testing.T) {
},
},
},
// #26 Test keeper's priority. One master and one healthy
// standby. Master is ok, but standby has higher priority and
// gets elected.
{
cd: &cluster.ClusterData{
Cluster: &cluster.Cluster{
UID: "cluster1",
Generation: 1,
Spec: &cluster.ClusterSpec{
ConvergenceTimeout: &cluster.Duration{Duration: cluster.DefaultConvergenceTimeout},
InitTimeout: &cluster.Duration{Duration: cluster.DefaultInitTimeout},
SyncTimeout: &cluster.Duration{Duration: cluster.DefaultSyncTimeout},
MaxStandbysPerSender: cluster.Uint16P(cluster.DefaultMaxStandbysPerSender),
},
Status: cluster.ClusterStatus{
CurrentGeneration: 1,
Phase: cluster.ClusterPhaseNormal,
Master: "db1",
},
},
Keepers: cluster.Keepers{
"keeper1": &cluster.Keeper{
UID: "keeper1",
Spec: &cluster.KeeperSpec{},
Status: cluster.KeeperStatus{
Healthy: true,
LastHealthyTime: now,
},
},
"keeper2": &cluster.Keeper{
UID: "keeper2",
Spec: &cluster.KeeperSpec{},
Status: cluster.KeeperStatus{
Healthy: true,
LastHealthyTime: now,
Priority: 1,
},
},
},
DBs: cluster.DBs{
"db1": &cluster.DB{
UID: "db1",
Generation: 1,
ChangeTime: time.Time{},
Spec: &cluster.DBSpec{
KeeperUID: "keeper1",
RequestTimeout: cluster.Duration{Duration: cluster.DefaultRequestTimeout},
MaxStandbys: cluster.DefaultMaxStandbys,
AdditionalWalSenders: cluster.DefaultAdditionalWalSenders,
InitMode: cluster.DBInitModeNone,
SynchronousReplication: false,
Role: common.RoleMaster,
Followers: []string{"db2"},
SynchronousStandbys: nil,
ExternalSynchronousStandbys: nil,
},
Status: cluster.DBStatus{
Healthy: true,
CurrentGeneration: 1,
},
},
"db2": &cluster.DB{
UID: "db2",
Generation: 1,
ChangeTime: time.Time{},
Spec: &cluster.DBSpec{
KeeperUID: "keeper2",
RequestTimeout: cluster.Duration{Duration: cluster.DefaultRequestTimeout},
MaxStandbys: cluster.DefaultMaxStandbys,
AdditionalWalSenders: cluster.DefaultAdditionalWalSenders,
InitMode: cluster.DBInitModeNone,
SynchronousReplication: false,
Role: common.RoleStandby,
Followers: []string{},
FollowConfig: &cluster.FollowConfig{
Type: cluster.FollowTypeInternal,
DBUID: "db1",
},
SynchronousStandbys: nil,
ExternalSynchronousStandbys: nil,
},
Status: cluster.DBStatus{
Healthy: true,
CurrentGeneration: 1,
},
},
},
Proxy: &cluster.Proxy{
Generation: 1,
Spec: cluster.ProxySpec{
MasterDBUID: "db1",
EnabledProxies: []string{},
},
},
},
outcd: &cluster.ClusterData{
Cluster: &cluster.Cluster{
UID: "cluster1",
Generation: 1,
Spec: &cluster.ClusterSpec{
ConvergenceTimeout: &cluster.Duration{Duration: cluster.DefaultConvergenceTimeout},
InitTimeout: &cluster.Duration{Duration: cluster.DefaultInitTimeout},
SyncTimeout: &cluster.Duration{Duration: cluster.DefaultSyncTimeout},
MaxStandbysPerSender: cluster.Uint16P(cluster.DefaultMaxStandbysPerSender),
},
Status: cluster.ClusterStatus{
CurrentGeneration: 1,
Phase: cluster.ClusterPhaseNormal,
Master: "db2",
},
},
Keepers: cluster.Keepers{
"keeper1": &cluster.Keeper{
UID: "keeper1",
Spec: &cluster.KeeperSpec{},
Status: cluster.KeeperStatus{
Healthy: true,
LastHealthyTime: now,
},
},
"keeper2": &cluster.Keeper{
UID: "keeper2",
Spec: &cluster.KeeperSpec{},
Status: cluster.KeeperStatus{
Healthy: true,
LastHealthyTime: now,
Priority: 1,
},
},
},
DBs: cluster.DBs{
"db1": &cluster.DB{
UID: "db1",
Generation: 2,
ChangeTime: time.Time{},
Spec: &cluster.DBSpec{
KeeperUID: "keeper1",
RequestTimeout: cluster.Duration{Duration: cluster.DefaultRequestTimeout},
MaxStandbys: cluster.DefaultMaxStandbys,
AdditionalWalSenders: cluster.DefaultAdditionalWalSenders,
InitMode: cluster.DBInitModeNone,
SynchronousReplication: false,
Role: common.RoleMaster,
Followers: []string{},
SynchronousStandbys: nil,
ExternalSynchronousStandbys: nil,
},
Status: cluster.DBStatus{
Healthy: true,
CurrentGeneration: 1,
},
},
"db2": &cluster.DB{
UID: "db2",
Generation: 2,
ChangeTime: time.Time{},
Spec: &cluster.DBSpec{
KeeperUID: "keeper2",
RequestTimeout: cluster.Duration{Duration: cluster.DefaultRequestTimeout},
MaxStandbys: cluster.DefaultMaxStandbys,
AdditionalWalSenders: cluster.DefaultAdditionalWalSenders,
InitMode: cluster.DBInitModeNone,
SynchronousReplication: false,
Role: common.RoleMaster,
Followers: []string{},
FollowConfig: nil,
SynchronousStandbys: nil,
ExternalSynchronousStandbys: nil,
},
Status: cluster.DBStatus{
Healthy: true,
CurrentGeneration: 1,
},
},
},
Proxy: &cluster.Proxy{
Generation: 2,
Spec: cluster.ProxySpec{
MasterDBUID: "",
EnabledProxies: []string{},
},
},
},
},
}

for i, tt := range tests {
Expand Down
3 changes: 2 additions & 1 deletion doc/commands/stolon-keeper.md
@@ -31,6 +31,7 @@ stolon-keeper [flags]
--pg-su-password string postgres superuser password. Only one of --pg-su-password or --pg-su-passwordfile must be provided. Must be the same for all keepers.
--pg-su-passwordfile string postgres superuser password file. Only one of --pg-su-password or --pg-su-passwordfile must be provided. Must be the same for all keepers)
--pg-su-username string postgres superuser user name. Used for keeper managed instance access and pg_rewind based synchronization. It'll be created on db initialization. Defaults to the name of the effective user running stolon-keeper. Must be the same for all keepers. (default "motaboy")
--priority int keeper priority, integer. Stolon will promote an available keeper with higher priority than the current master, if this is possible. Default is 0.
--store-backend string store backend type (etcdv2/etcd, etcdv3, consul or kubernetes)
--store-ca-file string verify certificates of HTTPS-enabled store servers using this CA bundle
--store-cert-file string certificate file for client identification to the store
@@ -41,4 +42,4 @@ stolon-keeper [flags]
--uid string keeper uid (must be unique in the cluster and can contain only lower-case letters, numbers and the underscore character). If not provided a random uid will be generated.
```

###### Auto generated by spf13/cobra on 21-Aug-2018
###### Auto generated by spf13/cobra on 11-Mar-2019
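As a usage note (an illustration, not part of this commit's diff): since --priority defaults to 0, running a single keeper with, say, `--priority 1` while the others keep the default makes the sentinel prefer that keeper for promotion whenever it is an eligible candidate, as described in the commit message above.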
3 changes: 3 additions & 0 deletions internal/cluster/cluster.go
@@ -552,6 +552,8 @@ type KeeperStatus struct {
PostgresBinaryVersion PostgresBinaryVersion `json:"postgresBinaryVersion,omitempty"`

ForceFail bool `json:"forceFail,omitempty"`

Priority int `json:"priority,omitempty"`
}

type Keeper struct {
@@ -575,6 +577,7 @@ func NewKeeperFromKeeperInfo(ki *KeeperInfo) *Keeper {
Healthy: true,
LastHealthyTime: time.Now(),
BootUUID: ki.BootUUID,
Priority: ki.Priority,
},
}
}