[PATCH 2/2] replication: fix cluster node rebootstrap
Vladimir Davydov
vdavydov.dev at gmail.com
Tue Feb 6 15:02:16 MSK 2018
When a tarantool instance starts for the first time (the local directory
is empty), it chooses the peer with the lowest UUID as the bootstrap
master. As a result, one cannot reliably rebootstrap a cluster node
(delete all local files and restart): if the node happens to have the
lowest UUID in the cluster after restart, it will assume that it is the
leader of a new cluster and bootstrap locally, splitting the cluster in
two.
To fix this problem, let's always give preference to peers with a higher
vclock when choosing a bootstrap master and only fall back on selection
by UUID if two or more peers have the same vclock. To achieve that, we
need to introduce a new iproto request type for fetching the current
vclock of a tarantool instance (we cannot squeeze the vclock in the
greeting, because the latter is already packed). The new request type is
called IPROTO_REQUEST_VOTE so that in future it can be reused for a more
sophisticated leader election algorithm. It has no body and does not
require authentication. In reply to such a request, a tarantool instance
will send IPROTO_OK and its current vclock. If the version of the master
is >= 1.7.7, an applier will send IPROTO_REQUEST_VOTE to fetch the
master's vclock before trying to authenticate. The vclock will then be
used to determine the node to bootstrap from.
Closes #3108
---
src/box/applier.cc | 17 +++++++++++-
src/box/applier.h | 2 ++
src/box/box.cc | 2 +-
src/box/iproto.cc | 8 ++++++
src/box/iproto_constants.h | 2 ++
src/box/replication.cc | 29 +++++++++++++++++++++
src/box/replication.h | 6 +++++
src/box/xrow.c | 45 ++++++++++++++++++++++++++++++++
src/box/xrow.h | 31 ++++++++++++++++++++++
test/replication/autobootstrap.result | 46 +++++++++++++++++++++++++++++++++
test/replication/autobootstrap.test.lua | 21 +++++++++++++++
11 files changed, 207 insertions(+), 2 deletions(-)
diff --git a/src/box/applier.cc b/src/box/applier.cc
index 93aa8c40..4938b43c 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -155,6 +155,7 @@ applier_connect(struct applier *applier)
if (coio->fd >= 0)
return;
char greetingbuf[IPROTO_GREETING_SIZE];
+ struct xrow_header row;
struct uri *uri = &applier->uri;
/*
@@ -197,6 +198,21 @@ applier_connect(struct applier *applier)
/* Don't display previous error messages in box.info.replication */
diag_clear(&fiber()->diag);
+ /*
+ * Tarantool >= 1.7.7: send an IPROTO_REQUEST_VOTE message
+ * to fetch the master's vclock before proceeding to "join".
+ * It will be used for leader election on bootstrap.
+ */
+ if (applier->version_id >= version_id(1, 7, 7)) {
+ xrow_encode_request_vote(&row);
+ coio_write_xrow(coio, &row);
+ coio_read_xrow(coio, ibuf, &row);
+ if (row.type != IPROTO_OK)
+ xrow_decode_error_xc(&row);
+ vclock_create(&applier->vclock);
+ xrow_decode_vclock_xc(&row, &applier->vclock);
+ }
+
applier_set_state(applier, APPLIER_CONNECTED);
/* Detect connection to itself */
@@ -209,7 +225,6 @@ applier_connect(struct applier *applier)
/* Authenticate */
applier_set_state(applier, APPLIER_AUTH);
- struct xrow_header row;
xrow_encode_auth_xc(&row, greeting.salt, greeting.salt_len, uri->login,
uri->login_len, uri->password, uri->password_len);
coio_write_xrow(coio, &row);
diff --git a/src/box/applier.h b/src/box/applier.h
index 6dffb6b6..f25d6cb2 100644
--- a/src/box/applier.h
+++ b/src/box/applier.h
@@ -93,6 +93,8 @@ struct applier {
struct uri uri;
/** Remote version encoded as a number, see version_id() macro */
uint32_t version_id;
+ /** Remote vclock at time of connect. */
+ struct vclock vclock;
/** Remote address */
union {
struct sockaddr addr;
diff --git a/src/box/box.cc b/src/box/box.cc
index 7cd0e279..d0f95f22 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1562,7 +1562,7 @@ static void
bootstrap(const struct tt_uuid *replicaset_uuid, bool *is_bootstrap_leader)
{
/* Use the first replica by URI as a bootstrap leader */
- struct replica *master = replicaset_first();
+ struct replica *master = replicaset_leader();
assert(master == NULL || master->applier != NULL);
if (master != NULL && !tt_uuid_is_equal(&master->uuid, &INSTANCE_UUID)) {
diff --git a/src/box/iproto.cc b/src/box/iproto.cc
index 8876acb8..37313fa5 100644
--- a/src/box/iproto.cc
+++ b/src/box/iproto.cc
@@ -981,6 +981,9 @@ iproto_msg_decode(struct iproto_msg *msg, const char **pos, const char *reqend,
cmsg_init(&msg->base, subscribe_route);
*stop_input = true;
break;
+ case IPROTO_REQUEST_VOTE:
+ cmsg_init(&msg->base, misc_route);
+ break;
case IPROTO_AUTH:
if (xrow_decode_auth(&msg->header, &msg->auth))
goto error;
@@ -1342,6 +1345,11 @@ tx_process_misc(struct cmsg *m)
iproto_reply_ok_xc(out, msg->header.sync,
::schema_version);
break;
+ case IPROTO_REQUEST_VOTE:
+ iproto_reply_vclock_xc(out, msg->header.sync,
+ ::schema_version,
+ &replicaset.vclock);
+ break;
default:
unreachable();
}
diff --git a/src/box/iproto_constants.h b/src/box/iproto_constants.h
index b0964124..95184248 100644
--- a/src/box/iproto_constants.h
+++ b/src/box/iproto_constants.h
@@ -153,6 +153,8 @@ enum iproto_type {
IPROTO_JOIN = 65,
/** Replication SUBSCRIBE command */
IPROTO_SUBSCRIBE = 66,
+ /** Vote request command for master election */
+ IPROTO_REQUEST_VOTE = 67,
/** Vinyl run info stored in .index file */
VY_INDEX_RUN_INFO = 100,
diff --git a/src/box/replication.cc b/src/box/replication.cc
index fe926d31..35efd8ad 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -643,6 +643,35 @@ replicaset_next(struct replica *replica)
}
struct replica *
+replicaset_leader(void)
+{
+ struct replica *leader = NULL;
+ replicaset_foreach(replica) {
+ if (replica->applier == NULL)
+ continue;
+ if (leader == NULL) {
+ leader = replica;
+ continue;
+ }
+ /*
+ * Choose the replica with the most advanced
+ * vclock. If there are two or more replicas
+ * with the same vclock, prefer the one with
+ * the lowest uuid.
+ */
+ int cmp = vclock_compare(&replica->applier->vclock,
+ &leader->applier->vclock);
+ if (cmp < 0)
+ continue;
+ if (cmp == 0 && tt_uuid_compare(&replica->uuid,
+ &leader->uuid) > 0)
+ continue;
+ leader = replica;
+ }
+ return leader;
+}
+
+struct replica *
replica_by_uuid(const struct tt_uuid *uuid)
{
struct replica key;
diff --git a/src/box/replication.h b/src/box/replication.h
index 54f809b1..a7595f61 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -264,6 +264,12 @@ enum {
struct replica *
replica_by_uuid(const struct tt_uuid *uuid);
+/**
+ * Return the replica set leader.
+ */
+struct replica *
+replicaset_leader(void);
+
struct replica *
replicaset_first(void);
diff --git a/src/box/xrow.c b/src/box/xrow.c
index cdc26ebc..f4852564 100644
--- a/src/box/xrow.c
+++ b/src/box/xrow.c
@@ -261,6 +261,44 @@ iproto_reply_ok(struct obuf *out, uint64_t sync, uint32_t schema_version)
}
int
+iproto_reply_vclock(struct obuf *out, uint64_t sync, uint32_t schema_version,
+ const struct vclock *vclock)
+{
+ uint32_t replicaset_size = vclock_size(vclock);
+ size_t max_size = IPROTO_HEADER_LEN + mp_sizeof_map(1) +
+ mp_sizeof_uint(UINT32_MAX) + mp_sizeof_map(replicaset_size) +
+ replicaset_size * (mp_sizeof_uint(UINT32_MAX) +
+ mp_sizeof_uint(UINT64_MAX));
+
+ char *buf = obuf_reserve(out, max_size);
+ if (buf == NULL) {
+ diag_set(OutOfMemory, max_size,
+ "obuf_alloc", "buf");
+ return -1;
+ }
+
+ char *data = buf + IPROTO_HEADER_LEN;
+ data = mp_encode_map(data, 1);
+ data = mp_encode_uint(data, IPROTO_VCLOCK);
+ data = mp_encode_map(data, replicaset_size);
+ struct vclock_iterator it;
+ vclock_iterator_init(&it, vclock);
+ vclock_foreach(&it, replica) {
+ data = mp_encode_uint(data, replica.id);
+ data = mp_encode_uint(data, replica.lsn);
+ }
+ size_t size = data - buf;
+ assert(size <= max_size);
+
+ iproto_header_encode(buf, IPROTO_OK, sync, schema_version,
+ size - IPROTO_HEADER_LEN);
+
+ char *ptr = obuf_alloc(out, size);
+ assert(ptr == buf);
+ return 0;
+}
+
+int
iproto_reply_error(struct obuf *out, const struct error *e, uint64_t sync,
uint32_t schema_version)
{
@@ -750,6 +788,13 @@ error:
box_error_set(__FILE__, __LINE__, code, error);
}
+void
+xrow_encode_request_vote(struct xrow_header *row)
+{
+ memset(row, 0, sizeof(*row));
+ row->type = IPROTO_REQUEST_VOTE;
+}
+
int
xrow_encode_subscribe(struct xrow_header *row,
const struct tt_uuid *replicaset_uuid,
diff --git a/src/box/xrow.h b/src/box/xrow.h
index 859bd1d2..d407d151 100644
--- a/src/box/xrow.h
+++ b/src/box/xrow.h
@@ -223,6 +223,13 @@ xrow_encode_auth(struct xrow_header *row, const char *salt, size_t salt_len,
size_t password_len);
/**
+ * Encode a vote request for master election.
+ * @param row[out] Row to encode into.
+ */
+void
+xrow_encode_request_vote(struct xrow_header *row);
+
+/**
* Encode SUBSCRIBE command.
* @param[out] Row.
* @param replicaset_uuid Replica set uuid.
@@ -361,6 +368,21 @@ int
iproto_reply_ok(struct obuf *out, uint64_t sync, uint32_t schema_version);
/**
+ * Encode iproto header with IPROTO_OK response code
+ * and vclock in the body.
+ * @param out Encode to.
+ * @param sync Request sync.
+ * @param schema_version.
+ * @param vclock.
+ *
+ * @retval 0 Success.
+ * @retval -1 Memory error.
+ */
+int
+iproto_reply_vclock(struct obuf *out, uint64_t sync, uint32_t schema_version,
+ const struct vclock *vclock);
+
+/**
* Write an error packet int output buffer. Doesn't throw if out
* of memory
*/
@@ -594,6 +616,15 @@ iproto_reply_ok_xc(struct obuf *out, uint64_t sync, uint32_t schema_version)
diag_raise();
}
+/** @copydoc iproto_reply_vclock. */
+static inline void
+iproto_reply_vclock_xc(struct obuf *out, uint64_t sync, uint32_t schema_version,
+ const struct vclock *vclock)
+{
+ if (iproto_reply_vclock(out, sync, schema_version, vclock) != 0)
+ diag_raise();
+}
+
#endif
#endif /* TARANTOOL_XROW_H_INCLUDED */
diff --git a/test/replication/autobootstrap.result b/test/replication/autobootstrap.result
index e0ab6f57..ae4e5540 100644
--- a/test/replication/autobootstrap.result
+++ b/test/replication/autobootstrap.result
@@ -155,6 +155,52 @@ box.space.test_u:select()
---
- - [1, 2, 3, 4]
...
+--
+-- Rebootstrap one node and check that others follow.
+--
+_ = test_run:cmd("switch autobootstrap1")
+---
+...
+_ = test_run:cmd("restart server autobootstrap1 with cleanup=1")
+_ = box.space.test_u:replace({5, 6, 7, 8})
+---
+...
+box.space.test_u:select()
+---
+- - [1, 2, 3, 4]
+ - [5, 6, 7, 8]
+...
+_ = test_run:cmd("switch default")
+---
+...
+test_run:wait_fullmesh(SERVERS)
+---
+...
+vclock = test_run:get_vclock("autobootstrap1")
+---
+...
+_ = test_run:wait_vclock("autobootstrap2", vclock)
+---
+...
+_ = test_run:wait_vclock("autobootstrap3", vclock)
+---
+...
+_ = test_run:cmd("switch autobootstrap2")
+---
+...
+box.space.test_u:select()
+---
+- - [1, 2, 3, 4]
+ - [5, 6, 7, 8]
+...
+_ = test_run:cmd("switch autobootstrap3")
+---
+...
+box.space.test_u:select()
+---
+- - [1, 2, 3, 4]
+ - [5, 6, 7, 8]
+...
_ = test_run:cmd("switch default")
---
...
diff --git a/test/replication/autobootstrap.test.lua b/test/replication/autobootstrap.test.lua
index e7f624b0..d8c12283 100644
--- a/test/replication/autobootstrap.test.lua
+++ b/test/replication/autobootstrap.test.lua
@@ -72,6 +72,27 @@ box.space.test_u:select()
_ = test_run:cmd("switch autobootstrap3")
box.space.test_u:select()
+--
+-- Rebootstrap one node and check that others follow.
+--
+_ = test_run:cmd("switch autobootstrap1")
+_ = test_run:cmd("restart server autobootstrap1 with cleanup=1")
+
+_ = box.space.test_u:replace({5, 6, 7, 8})
+box.space.test_u:select()
+
+_ = test_run:cmd("switch default")
+test_run:wait_fullmesh(SERVERS)
+
+vclock = test_run:get_vclock("autobootstrap1")
+_ = test_run:wait_vclock("autobootstrap2", vclock)
+_ = test_run:wait_vclock("autobootstrap3", vclock)
+
+_ = test_run:cmd("switch autobootstrap2")
+box.space.test_u:select()
+_ = test_run:cmd("switch autobootstrap3")
+box.space.test_u:select()
+
_ = test_run:cmd("switch default")
--
--
2.11.0
More information about the Tarantool-patches
mailing list