[Tarantool-patches] [PATCH 5/6] replication: use 'score' to find a join-master

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Sat Jun 5 02:37:59 MSK 2021


The patch refactors the algorithm of finding a join-master (in
replicaset_find_join_master()) to use scores instead of multiple
iterations with different criteria.

The original code was relatively fine as long as it had only
one parameter to change - whether it should skip
`box.cfg{read_only = true}` nodes.

However, it was clearly "on the edge" of acceptable complexity
due to a second, non-configurable parameter: whether a replica
is in read-only state regardless of its config.

It is going to get more complicated when the algorithm takes
into account a third parameter: whether an instance is
bootstrapped.

Then it should make decisions like "among bootstrapped nodes try
to prefer instances not having read_only=true, and not being in
read-only state". The easiest way to do so is to use
scores/weights incremented according to the instance's parameters
matching certain "good points".

Part of #5613
---
 src/box/replication.cc | 62 ++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 39 deletions(-)

diff --git a/src/box/replication.cc b/src/box/replication.cc
index 990f6239c..d33e70f28 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -960,71 +960,55 @@ replicaset_next(struct replica *replica)
  * replicas, choose a read-only replica with biggest vclock
  * as a leader, in hope it will become read-write soon.
  */
-static struct replica *
-replicaset_round(bool skip_ro)
+struct replica *
+replicaset_find_join_master(void)
 {
 	struct replica *leader = NULL;
+	int leader_score = -1;
 	replicaset_foreach(replica) {
 		struct applier *applier = replica->applier;
 		if (applier == NULL)
 			continue;
 		const struct ballot *ballot = &applier->ballot;
-		/**
-		 * While bootstrapping a new cluster, read-only
-		 * replicas shouldn't be considered as a leader.
-		 * The only exception if there is no read-write
-		 * replicas since there is still a possibility
-		 * that all replicas exist in cluster table.
-		 */
-		if (skip_ro && ballot->is_ro_cfg)
-			continue;
-		if (leader == NULL) {
-			leader = replica;
-			continue;
-		}
-		const struct ballot *leader_ballot = &leader->applier->ballot;
+		int score = 0;
 		/*
-		 * Try to find a replica which has already left
-		 * orphan mode.
+		 * Prefer instances not configured as read-only via box.cfg, and
+		 * not being in read-only state due to any other reason. The
+		 * config is stronger because if it is configured as read-only,
+		 * it is in read-only state for sure, until the config is
+		 * changed.
 		 */
-		if (ballot->is_ro && !leader_ballot->is_ro)
+		if (!ballot->is_ro_cfg)
+			score += 5;
+		if (!ballot->is_ro)
+			score += 1;
+		if (leader_score < score)
+			goto elect;
+		if (score < leader_score)
 			continue;
+		const struct ballot *leader_ballot;
+		leader_ballot = &leader->applier->ballot;
 		/*
 		 * Choose the replica with the most advanced
 		 * vclock. If there are two or more replicas
 		 * with the same vclock, prefer the one with
 		 * the lowest uuid.
 		 */
-		int cmp = vclock_compare_ignore0(&ballot->vclock,
-						 &leader_ballot->vclock);
+		int cmp;
+		cmp = vclock_compare_ignore0(&ballot->vclock,
+					     &leader_ballot->vclock);
 		if (cmp < 0)
 			continue;
 		if (cmp == 0 && tt_uuid_compare(&replica->uuid,
 						&leader->uuid) > 0)
 			continue;
+	elect:
 		leader = replica;
+		leader_score = score;
 	}
 	return leader;
 }
 
-struct replica *
-replicaset_find_join_master(void)
-{
-	bool skip_ro = true;
-	/**
-	 * Two loops, first prefers read-write replicas among others.
-	 * Second for backward compatibility, if there is no such
-	 * replicas at all.
-	 */
-	struct replica *leader = replicaset_round(skip_ro);
-	if (leader == NULL) {
-		skip_ro = false;
-		leader = replicaset_round(skip_ro);
-	}
-
-	return leader;
-}
-
 struct replica *
 replica_by_uuid(const struct tt_uuid *uuid)
 {
-- 
2.24.3 (Apple Git-128)



More information about the Tarantool-patches mailing list