[PATCH 2/2] replication: fix cluster node rebootstrap

Vladimir Davydov vdavydov.dev at gmail.com
Tue Feb 6 15:02:16 MSK 2018


When a tarantool instance starts for the first time (the local directory
is empty), it chooses the peer with the lowest UUID as the bootstrap
master. As a result, one cannot reliably rebootstrap a cluster node
(delete all local files and restart): if the node happens to have the
lowest UUID in the cluster after restart, it will assume that it is the
leader of a new cluster and bootstrap locally, splitting the cluster in
two.

To fix this problem, let's always give preference to peers with a higher
vclock when choosing a bootstrap master and only fall back on selection
by UUID if two or more peers have the same vclock. To achieve that, we
need to introduce a new iproto request type for fetching the current
vclock of a tarantool instance (we cannot squeeze the vclock in the
greeting, because the latter is already packed). The new request type is
called IPROTO_REQUEST_VOTE so that in future it can be reused for a more
sophisticated leader election algorithm. It has no body and does not
require authentication. In reply to such a request, a tarantool instance
will send IPROTO_OK and its current vclock. If the version of the master
is >= 1.7.7, an applier will send IPROTO_REQUEST_VOTE to fetch the
master's vclock before trying to authenticate. The vclock will then be
used to determine the node to bootstrap from.

Closes #3108
---
 src/box/applier.cc                      | 17 +++++++++++-
 src/box/applier.h                       |  2 ++
 src/box/box.cc                          |  2 +-
 src/box/iproto.cc                       |  8 ++++++
 src/box/iproto_constants.h              |  2 ++
 src/box/replication.cc                  | 29 +++++++++++++++++++++
 src/box/replication.h                   |  6 +++++
 src/box/xrow.c                          | 45 ++++++++++++++++++++++++++++++++
 src/box/xrow.h                          | 31 ++++++++++++++++++++++
 test/replication/autobootstrap.result   | 46 +++++++++++++++++++++++++++++++++
 test/replication/autobootstrap.test.lua | 21 +++++++++++++++
 11 files changed, 207 insertions(+), 2 deletions(-)

diff --git a/src/box/applier.cc b/src/box/applier.cc
index 93aa8c40..4938b43c 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -155,6 +155,7 @@ applier_connect(struct applier *applier)
 	if (coio->fd >= 0)
 		return;
 	char greetingbuf[IPROTO_GREETING_SIZE];
+	struct xrow_header row;
 
 	struct uri *uri = &applier->uri;
 	/*
@@ -197,6 +198,21 @@ applier_connect(struct applier *applier)
 	/* Don't display previous error messages in box.info.replication */
 	diag_clear(&fiber()->diag);
 
+	/*
+	 * Tarantool >= 1.7.7: send an IPROTO_REQUEST_VOTE message
+	 * to fetch the master's vclock before proceeding to "join".
+	 * It will be used for leader election on bootstrap.
+	 */
+	if (applier->version_id >= version_id(1, 7, 7)) {
+		xrow_encode_request_vote(&row);
+		coio_write_xrow(coio, &row);
+		coio_read_xrow(coio, ibuf, &row);
+		if (row.type != IPROTO_OK)
+			xrow_decode_error_xc(&row);
+		vclock_create(&applier->vclock);
+		xrow_decode_vclock_xc(&row, &applier->vclock);
+	}
+
 	applier_set_state(applier, APPLIER_CONNECTED);
 
 	/* Detect connection to itself */
@@ -209,7 +225,6 @@ applier_connect(struct applier *applier)
 
 	/* Authenticate */
 	applier_set_state(applier, APPLIER_AUTH);
-	struct xrow_header row;
 	xrow_encode_auth_xc(&row, greeting.salt, greeting.salt_len, uri->login,
 			    uri->login_len, uri->password, uri->password_len);
 	coio_write_xrow(coio, &row);
diff --git a/src/box/applier.h b/src/box/applier.h
index 6dffb6b6..f25d6cb2 100644
--- a/src/box/applier.h
+++ b/src/box/applier.h
@@ -93,6 +93,8 @@ struct applier {
 	struct uri uri;
 	/** Remote version encoded as a number, see version_id() macro */
 	uint32_t version_id;
+	/** Remote vclock at time of connect. */
+	struct vclock vclock;
 	/** Remote address */
 	union {
 		struct sockaddr addr;
diff --git a/src/box/box.cc b/src/box/box.cc
index 7cd0e279..d0f95f22 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1562,7 +1562,7 @@ static void
 bootstrap(const struct tt_uuid *replicaset_uuid, bool *is_bootstrap_leader)
 {
 	/* Use the first replica by URI as a bootstrap leader */
-	struct replica *master = replicaset_first();
+	struct replica *master = replicaset_leader();
 	assert(master == NULL || master->applier != NULL);
 
 	if (master != NULL && !tt_uuid_is_equal(&master->uuid, &INSTANCE_UUID)) {
diff --git a/src/box/iproto.cc b/src/box/iproto.cc
index 8876acb8..37313fa5 100644
--- a/src/box/iproto.cc
+++ b/src/box/iproto.cc
@@ -981,6 +981,9 @@ iproto_msg_decode(struct iproto_msg *msg, const char **pos, const char *reqend,
 		cmsg_init(&msg->base, subscribe_route);
 		*stop_input = true;
 		break;
+	case IPROTO_REQUEST_VOTE:
+		cmsg_init(&msg->base, misc_route);
+		break;
 	case IPROTO_AUTH:
 		if (xrow_decode_auth(&msg->header, &msg->auth))
 			goto error;
@@ -1342,6 +1345,11 @@ tx_process_misc(struct cmsg *m)
 			iproto_reply_ok_xc(out, msg->header.sync,
 					   ::schema_version);
 			break;
+		case IPROTO_REQUEST_VOTE:
+			iproto_reply_vclock_xc(out, msg->header.sync,
+					       ::schema_version,
+					       &replicaset.vclock);
+			break;
 		default:
 			unreachable();
 		}
diff --git a/src/box/iproto_constants.h b/src/box/iproto_constants.h
index b0964124..95184248 100644
--- a/src/box/iproto_constants.h
+++ b/src/box/iproto_constants.h
@@ -153,6 +153,8 @@ enum iproto_type {
 	IPROTO_JOIN = 65,
 	/** Replication SUBSCRIBE command */
 	IPROTO_SUBSCRIBE = 66,
+	/** Vote request command for master election */
+	IPROTO_REQUEST_VOTE = 67,
 
 	/** Vinyl run info stored in .index file */
 	VY_INDEX_RUN_INFO = 100,
diff --git a/src/box/replication.cc b/src/box/replication.cc
index fe926d31..35efd8ad 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -643,6 +643,35 @@ replicaset_next(struct replica *replica)
 }
 
 struct replica *
+replicaset_leader(void)
+{
+	struct replica *leader = NULL;
+	replicaset_foreach(replica) {
+		if (replica->applier == NULL)
+			continue;
+		if (leader == NULL) {
+			leader = replica;
+			continue;
+		}
+		/*
+		 * Choose the replica with the most advanced
+		 * vclock. If there are two or more replicas
+		 * with the same vclock, prefer the one with
+		 * the lowest uuid.
+		 */
+		int cmp = vclock_compare(&replica->applier->vclock,
+					 &leader->applier->vclock);
+		if (cmp < 0)
+			continue;
+		if (cmp == 0 && tt_uuid_compare(&replica->uuid,
+						&leader->uuid) > 0)
+			continue;
+		leader = replica;
+	}
+	return leader;
+}
+
+struct replica *
 replica_by_uuid(const struct tt_uuid *uuid)
 {
 	struct replica key;
diff --git a/src/box/replication.h b/src/box/replication.h
index 54f809b1..a7595f61 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -264,6 +264,12 @@ enum {
 struct replica *
 replica_by_uuid(const struct tt_uuid *uuid);
 
+/**
+ * Return the replica set leader.
+ */
+struct replica *
+replicaset_leader(void);
+
 struct replica *
 replicaset_first(void);
 
diff --git a/src/box/xrow.c b/src/box/xrow.c
index cdc26ebc..f4852564 100644
--- a/src/box/xrow.c
+++ b/src/box/xrow.c
@@ -261,6 +261,44 @@ iproto_reply_ok(struct obuf *out, uint64_t sync, uint32_t schema_version)
 }
 
 int
+iproto_reply_vclock(struct obuf *out, uint64_t sync, uint32_t schema_version,
+		    const struct vclock *vclock)
+{
+	uint32_t replicaset_size = vclock_size(vclock);
+	size_t max_size = IPROTO_HEADER_LEN + mp_sizeof_map(1) +
+		mp_sizeof_uint(UINT32_MAX) + mp_sizeof_map(replicaset_size) +
+		replicaset_size * (mp_sizeof_uint(UINT32_MAX) +
+				   mp_sizeof_uint(UINT64_MAX));
+
+	char *buf = obuf_reserve(out, max_size);
+	if (buf == NULL) {
+		diag_set(OutOfMemory, max_size,
+			 "obuf_alloc", "buf");
+		return -1;
+	}
+
+	char *data = buf + IPROTO_HEADER_LEN;
+	data = mp_encode_map(data, 1);
+	data = mp_encode_uint(data, IPROTO_VCLOCK);
+	data = mp_encode_map(data, replicaset_size);
+	struct vclock_iterator it;
+	vclock_iterator_init(&it, vclock);
+	vclock_foreach(&it, replica) {
+		data = mp_encode_uint(data, replica.id);
+		data = mp_encode_uint(data, replica.lsn);
+	}
+	size_t size = data - buf;
+	assert(size <= max_size);
+
+	iproto_header_encode(buf, IPROTO_OK, sync, schema_version,
+			     size - IPROTO_HEADER_LEN);
+
+	char *ptr = obuf_alloc(out, size);
+	assert(ptr == buf);
+	return 0;
+}
+
+int
 iproto_reply_error(struct obuf *out, const struct error *e, uint64_t sync,
 		   uint32_t schema_version)
 {
@@ -750,6 +788,13 @@ error:
 	box_error_set(__FILE__, __LINE__, code, error);
 }
 
+void
+xrow_encode_request_vote(struct xrow_header *row)
+{
+	memset(row, 0, sizeof(*row));
+	row->type = IPROTO_REQUEST_VOTE;
+}
+
 int
 xrow_encode_subscribe(struct xrow_header *row,
 		      const struct tt_uuid *replicaset_uuid,
diff --git a/src/box/xrow.h b/src/box/xrow.h
index 859bd1d2..d407d151 100644
--- a/src/box/xrow.h
+++ b/src/box/xrow.h
@@ -223,6 +223,13 @@ xrow_encode_auth(struct xrow_header *row, const char *salt, size_t salt_len,
 		 size_t password_len);
 
 /**
+ * Encode a vote request for master election.
+ * @param row[out] Row to encode into.
+ */
+void
+xrow_encode_request_vote(struct xrow_header *row);
+
+/**
  * Encode SUBSCRIBE command.
  * @param[out] Row.
  * @param replicaset_uuid Replica set uuid.
@@ -361,6 +368,21 @@ int
 iproto_reply_ok(struct obuf *out, uint64_t sync, uint32_t schema_version);
 
 /**
+ * Encode iproto header with IPROTO_OK response code
+ * and vclock in the body.
+ * @param out Encode to.
+ * @param sync Request sync.
+ * @param schema_version.
+ * @param vclock.
+ *
+ * @retval  0 Success.
+ * @retval -1 Memory error.
+ */
+int
+iproto_reply_vclock(struct obuf *out, uint64_t sync, uint32_t schema_version,
+		    const struct vclock *vclock);
+
+/**
  * Write an error packet int output buffer. Doesn't throw if out
  * of memory
  */
@@ -594,6 +616,15 @@ iproto_reply_ok_xc(struct obuf *out, uint64_t sync, uint32_t schema_version)
 		diag_raise();
 }
 
+/** @copydoc iproto_reply_vclock. */
+static inline void
+iproto_reply_vclock_xc(struct obuf *out, uint64_t sync, uint32_t schema_version,
+		       const struct vclock *vclock)
+{
+	if (iproto_reply_vclock(out, sync, schema_version, vclock) != 0)
+		diag_raise();
+}
+
 #endif
 
 #endif /* TARANTOOL_XROW_H_INCLUDED */
diff --git a/test/replication/autobootstrap.result b/test/replication/autobootstrap.result
index e0ab6f57..ae4e5540 100644
--- a/test/replication/autobootstrap.result
+++ b/test/replication/autobootstrap.result
@@ -155,6 +155,52 @@ box.space.test_u:select()
 ---
 - - [1, 2, 3, 4]
 ...
+--
+-- Rebootstrap one node and check that others follow.
+--
+_ = test_run:cmd("switch autobootstrap1")
+---
+...
+_ = test_run:cmd("restart server autobootstrap1 with cleanup=1")
+_ = box.space.test_u:replace({5, 6, 7, 8})
+---
+...
+box.space.test_u:select()
+---
+- - [1, 2, 3, 4]
+  - [5, 6, 7, 8]
+...
+_ = test_run:cmd("switch default")
+---
+...
+test_run:wait_fullmesh(SERVERS)
+---
+...
+vclock = test_run:get_vclock("autobootstrap1")
+---
+...
+_ = test_run:wait_vclock("autobootstrap2", vclock)
+---
+...
+_ = test_run:wait_vclock("autobootstrap3", vclock)
+---
+...
+_ = test_run:cmd("switch autobootstrap2")
+---
+...
+box.space.test_u:select()
+---
+- - [1, 2, 3, 4]
+  - [5, 6, 7, 8]
+...
+_ = test_run:cmd("switch autobootstrap3")
+---
+...
+box.space.test_u:select()
+---
+- - [1, 2, 3, 4]
+  - [5, 6, 7, 8]
+...
 _ = test_run:cmd("switch default")
 ---
 ...
diff --git a/test/replication/autobootstrap.test.lua b/test/replication/autobootstrap.test.lua
index e7f624b0..d8c12283 100644
--- a/test/replication/autobootstrap.test.lua
+++ b/test/replication/autobootstrap.test.lua
@@ -72,6 +72,27 @@ box.space.test_u:select()
 _ = test_run:cmd("switch autobootstrap3")
 box.space.test_u:select()
 
+--
+-- Rebootstrap one node and check that others follow.
+--
+_ = test_run:cmd("switch autobootstrap1")
+_ = test_run:cmd("restart server autobootstrap1 with cleanup=1")
+
+_ = box.space.test_u:replace({5, 6, 7, 8})
+box.space.test_u:select()
+
+_ = test_run:cmd("switch default")
+test_run:wait_fullmesh(SERVERS)
+
+vclock = test_run:get_vclock("autobootstrap1")
+_ = test_run:wait_vclock("autobootstrap2", vclock)
+_ = test_run:wait_vclock("autobootstrap3", vclock)
+
+_ = test_run:cmd("switch autobootstrap2")
+box.space.test_u:select()
+_ = test_run:cmd("switch autobootstrap3")
+box.space.test_u:select()
+
 _ = test_run:cmd("switch default")
 
 --
-- 
2.11.0




More information about the Tarantool-patches mailing list