* [PATCH] replication: fix bug with read-only replica as a bootstrap leader
@ 2018-04-11 16:02 Konstantin Belyavskiy
2018-04-11 16:11 ` [tarantool-patches] " Georgy Kirichenko
2018-04-13 8:54 ` Vladimir Davydov
0 siblings, 2 replies; 9+ messages in thread
From: Konstantin Belyavskiy @ 2018-04-11 16:02 UTC (permalink / raw)
To: vdavydov, georgy; +Cc: tarantool-patches
ticket: https://github.com/tarantool/tarantool/issues/3257
branch: https://github.com/tarantool/tarantool/compare/gh-3257-fix-bug-with-ro-replica-as-a-leader
When bootstrapping a new cluster, each replica from replicaset can
be chosen as a leader, but if it is 'read-only', bootstrap will
failed with an error.
Fixed it by eliminating read-only replicas from voting by adding
access rights information to IPROTO_REQUEST_VOTE reply.
Closes #3257
---
src/box/applier.cc | 3 +-
src/box/applier.h | 2 +
src/box/iproto.cc | 11 +++--
src/box/iproto_constants.c | 2 +-
src/box/iproto_constants.h | 1 +
src/box/replication.cc | 8 ++++
src/box/xrow.c | 29 ++++++++++---
src/box/xrow.h | 54 ++++++++++++++++++-----
test/replication/replica_uuid_ro.lua | 33 ++++++++++++++
test/replication/replica_uuid_ro1.lua | 1 +
test/replication/replica_uuid_ro2.lua | 1 +
test/replication/replicaset_ro_mostly.result | 59 ++++++++++++++++++++++++++
test/replication/replicaset_ro_mostly.test.lua | 30 +++++++++++++
13 files changed, 212 insertions(+), 22 deletions(-)
create mode 100644 test/replication/replica_uuid_ro.lua
create mode 120000 test/replication/replica_uuid_ro1.lua
create mode 120000 test/replication/replica_uuid_ro2.lua
create mode 100644 test/replication/replicaset_ro_mostly.result
create mode 100644 test/replication/replicaset_ro_mostly.test.lua
diff --git a/src/box/applier.cc b/src/box/applier.cc
index 9aa951c34..b3a13260d 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -223,7 +223,8 @@ applier_connect(struct applier *applier)
if (row.type != IPROTO_OK)
xrow_decode_error_xc(&row);
vclock_create(&applier->vclock);
- xrow_decode_vclock_xc(&row, &applier->vclock);
+ xrow_decode_request_vote_xc(&row, &applier->vclock,
+ &applier->read_only);
}
applier_set_state(applier, APPLIER_CONNECTED);
diff --git a/src/box/applier.h b/src/box/applier.h
index f25d6cb26..f47cf330f 100644
--- a/src/box/applier.h
+++ b/src/box/applier.h
@@ -95,6 +95,8 @@ struct applier {
uint32_t version_id;
/** Remote vclock at time of connect. */
struct vclock vclock;
+ /** Remote access rights, true if read-only, default: false */
+ bool read_only;
/** Remote address */
union {
struct sockaddr addr;
diff --git a/src/box/iproto.cc b/src/box/iproto.cc
index db5820806..51970bb28 100644
--- a/src/box/iproto.cc
+++ b/src/box/iproto.cc
@@ -60,6 +60,8 @@
#include "iproto_constants.h"
#include "rmean.h"
#include "errinj.h"
+#include "applier.h"
+#include "cfg.h"
/* The number of iproto messages in flight */
enum { IPROTO_MSG_MAX = 768 };
@@ -1352,6 +1354,7 @@ tx_process_misc(struct cmsg *m)
goto error;
try {
+ bool read_only = false;
switch (msg->header.type) {
case IPROTO_AUTH:
box_process_auth(&msg->auth);
@@ -1363,9 +1366,11 @@ tx_process_misc(struct cmsg *m)
::schema_version);
break;
case IPROTO_REQUEST_VOTE:
- iproto_reply_vclock_xc(out, msg->header.sync,
- ::schema_version,
- &replicaset.vclock);
+ read_only = cfg_geti("read_only");
+ iproto_reply_request_vote_xc(out, msg->header.sync,
+ ::schema_version,
+ &replicaset.vclock,
+ read_only);
break;
default:
unreachable();
diff --git a/src/box/iproto_constants.c b/src/box/iproto_constants.c
index cd7b1d03b..eaba6259e 100644
--- a/src/box/iproto_constants.c
+++ b/src/box/iproto_constants.c
@@ -40,10 +40,10 @@ const unsigned char iproto_key_type[IPROTO_KEY_MAX] =
/* 0x04 */ MP_DOUBLE, /* IPROTO_TIMESTAMP */
/* 0x05 */ MP_UINT, /* IPROTO_SCHEMA_VERSION */
/* 0x06 */ MP_UINT, /* IPROTO_SERVER_VERSION */
+ /* 0x07 */ MP_UINT, /* IPROTO_SERVER_READONLY */
/* }}} */
/* {{{ unused */
- /* 0x07 */ MP_UINT,
/* 0x08 */ MP_UINT,
/* 0x09 */ MP_UINT,
/* 0x0a */ MP_UINT,
diff --git a/src/box/iproto_constants.h b/src/box/iproto_constants.h
index 951842485..aae004584 100644
--- a/src/box/iproto_constants.h
+++ b/src/box/iproto_constants.h
@@ -58,6 +58,7 @@ enum iproto_key {
IPROTO_TIMESTAMP = 0x04,
IPROTO_SCHEMA_VERSION = 0x05,
IPROTO_SERVER_VERSION = 0x06,
+ IPROTO_SERVER_READONLY = 0x07,
/* Leave a gap for other keys in the header. */
IPROTO_SPACE_ID = 0x10,
IPROTO_INDEX_ID = 0x11,
diff --git a/src/box/replication.cc b/src/box/replication.cc
index b1c84d36c..423de2c88 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -685,6 +685,14 @@ replicaset_leader(void)
replicaset_foreach(replica) {
if (replica->applier == NULL)
continue;
+ /**
+ * While bootstrapping a new cluster,
+ * read-only replicas shouldn't be considered
+ * as a leader.
+ */
+ if (replica->applier->read_only &&
+ replica->applier->vclock.signature == 0)
+ continue;
if (leader == NULL) {
leader = replica;
continue;
diff --git a/src/box/xrow.c b/src/box/xrow.c
index f48525645..f8ea73286 100644
--- a/src/box/xrow.c
+++ b/src/box/xrow.c
@@ -261,14 +261,16 @@ iproto_reply_ok(struct obuf *out, uint64_t sync, uint32_t schema_version)
}
int
-iproto_reply_vclock(struct obuf *out, uint64_t sync, uint32_t schema_version,
- const struct vclock *vclock)
+iproto_reply_request_vote(struct obuf *out, uint64_t sync,
+ uint32_t schema_version, const struct vclock *vclock,
+ bool read_only)
{
uint32_t replicaset_size = vclock_size(vclock);
- size_t max_size = IPROTO_HEADER_LEN + mp_sizeof_map(1) +
+ size_t max_size = IPROTO_HEADER_LEN + mp_sizeof_map(2) +
mp_sizeof_uint(UINT32_MAX) + mp_sizeof_map(replicaset_size) +
replicaset_size * (mp_sizeof_uint(UINT32_MAX) +
- mp_sizeof_uint(UINT64_MAX));
+ mp_sizeof_uint(UINT64_MAX)) +
+ mp_sizeof_uint(UINT32_MAX) + mp_sizeof_bool(true);
char *buf = obuf_reserve(out, max_size);
if (buf == NULL) {
@@ -278,7 +280,9 @@ iproto_reply_vclock(struct obuf *out, uint64_t sync, uint32_t schema_version,
}
char *data = buf + IPROTO_HEADER_LEN;
- data = mp_encode_map(data, 1);
+ data = mp_encode_map(data, 2);
+ data = mp_encode_uint(data, IPROTO_SERVER_READONLY);
+ data = mp_encode_bool(data, read_only);
data = mp_encode_uint(data, IPROTO_VCLOCK);
data = mp_encode_map(data, replicaset_size);
struct vclock_iterator it;
@@ -837,7 +841,7 @@ xrow_encode_subscribe(struct xrow_header *row,
int
xrow_decode_subscribe(struct xrow_header *row, struct tt_uuid *replicaset_uuid,
struct tt_uuid *instance_uuid, struct vclock *vclock,
- uint32_t *version_id)
+ uint32_t *version_id, bool *read_only)
{
if (row->bodycnt == 0) {
diag_set(ClientError, ER_INVALID_MSGPACK, "request body");
@@ -852,6 +856,9 @@ xrow_decode_subscribe(struct xrow_header *row, struct tt_uuid *replicaset_uuid,
return -1;
}
+ /* For backward compatibility initialize read-only with false. */
+ if (read_only)
+ *read_only = false;
const char *lsnmap = NULL;
d = data;
uint32_t map_size = mp_decode_map(&d);
@@ -896,6 +903,16 @@ xrow_decode_subscribe(struct xrow_header *row, struct tt_uuid *replicaset_uuid,
}
*version_id = mp_decode_uint(&d);
break;
+ case IPROTO_SERVER_READONLY:
+ if (read_only == NULL)
+ goto skip;
+ if (mp_typeof(*d) != MP_BOOL) {
+ diag_set(ClientError, ER_INVALID_MSGPACK,
+ "invalid STATUS");
+ return -1;
+ }
+ *read_only = mp_decode_bool(&d);
+ break;
default: skip:
mp_next(&d); /* value */
}
diff --git a/src/box/xrow.h b/src/box/xrow.h
index d407d151b..919a8ea3f 100644
--- a/src/box/xrow.h
+++ b/src/box/xrow.h
@@ -251,6 +251,8 @@ xrow_encode_subscribe(struct xrow_header *row,
* @param[out] replicaset_uuid.
* @param[out] instance_uuid.
* @param[out] vclock.
+ * @param[out] version_id.
+ * @param[out] read_only.
*
* @retval 0 Success.
* @retval -1 Memory or format error.
@@ -258,7 +260,7 @@ xrow_encode_subscribe(struct xrow_header *row,
int
xrow_decode_subscribe(struct xrow_header *row, struct tt_uuid *replicaset_uuid,
struct tt_uuid *instance_uuid, struct vclock *vclock,
- uint32_t *version_id);
+ uint32_t *version_id, bool *read_only);
/**
* Encode JOIN command.
@@ -282,7 +284,8 @@ xrow_encode_join(struct xrow_header *row, const struct tt_uuid *instance_uuid);
static inline int
xrow_decode_join(struct xrow_header *row, struct tt_uuid *instance_uuid)
{
- return xrow_decode_subscribe(row, NULL, instance_uuid, NULL, NULL);
+ return xrow_decode_subscribe(row, NULL, instance_uuid, NULL, NULL,
+ NULL);
}
/**
@@ -307,7 +310,23 @@ xrow_encode_vclock(struct xrow_header *row, const struct vclock *vclock);
static inline int
xrow_decode_vclock(struct xrow_header *row, struct vclock *vclock)
{
- return xrow_decode_subscribe(row, NULL, NULL, vclock, NULL);
+ return xrow_decode_subscribe(row, NULL, NULL, vclock, NULL, NULL);
+}
+
+/**
+ * Decode peer vclock and access rights (a response to VOTE command).
+ * @param row Row to decode.
+ * @param[out] vclock.
+ * @param[out] read_only.
+ *
+ * @retval 0 Success.
+ * @retval -1 Memory or format error.
+ */
+static inline int
+xrow_decode_request_vote(struct xrow_header *row, struct vclock *vclock,
+ bool *read_only)
+{
+ return xrow_decode_subscribe(row, NULL, NULL, vclock, NULL, read_only);
}
/**
@@ -369,18 +388,20 @@ iproto_reply_ok(struct obuf *out, uint64_t sync, uint32_t schema_version);
/**
* Encode iproto header with IPROTO_OK response code
- * and vclock in the body.
+ * vclock and access rights in the body.
* @param out Encode to.
* @param sync Request sync.
* @param schema_version.
* @param vclock.
+ * @param read_only.
*
* @retval 0 Success.
* @retval -1 Memory error.
*/
int
-iproto_reply_vclock(struct obuf *out, uint64_t sync, uint32_t schema_version,
- const struct vclock *vclock);
+iproto_reply_request_vote(struct obuf *out, uint64_t sync,
+ uint32_t schema_version, const struct vclock *vclock,
+ bool read_only);
/**
* Write an error packet int output buffer. Doesn't throw if out
@@ -571,7 +592,7 @@ xrow_decode_subscribe_xc(struct xrow_header *row,
uint32_t *replica_version_id)
{
if (xrow_decode_subscribe(row, replicaset_uuid, instance_uuid,
- vclock, replica_version_id) != 0)
+ vclock, replica_version_id, NULL) != 0)
diag_raise();
}
@@ -608,6 +629,15 @@ xrow_decode_vclock_xc(struct xrow_header *row, struct vclock *vclock)
diag_raise();
}
+/** @copydoc xrow_decode_request_vote. */
+static inline void
+xrow_decode_request_vote_xc(struct xrow_header *row, struct vclock *vclock,
+ bool *read_only)
+{
+ if (xrow_decode_request_vote(row, vclock, read_only) != 0)
+ diag_raise();
+}
+
/** @copydoc iproto_reply_ok. */
static inline void
iproto_reply_ok_xc(struct obuf *out, uint64_t sync, uint32_t schema_version)
@@ -616,12 +646,14 @@ iproto_reply_ok_xc(struct obuf *out, uint64_t sync, uint32_t schema_version)
diag_raise();
}
-/** @copydoc iproto_reply_vclock. */
+/** @copydoc iproto_reply_request_vote_xc. */
static inline void
-iproto_reply_vclock_xc(struct obuf *out, uint64_t sync, uint32_t schema_version,
- const struct vclock *vclock)
+iproto_reply_request_vote_xc(struct obuf *out, uint64_t sync,
+ uint32_t schema_version,
+ const struct vclock *vclock, bool read_only)
{
- if (iproto_reply_vclock(out, sync, schema_version, vclock) != 0)
+ if (iproto_reply_request_vote(out, sync, schema_version,
+ vclock, read_only) != 0)
diag_raise();
}
diff --git a/test/replication/replica_uuid_ro.lua b/test/replication/replica_uuid_ro.lua
new file mode 100644
index 000000000..dd5dae9e8
--- /dev/null
+++ b/test/replication/replica_uuid_ro.lua
@@ -0,0 +1,33 @@
+#!/usr/bin/env tarantool
+
+-- get instance name from filename (replica_uuid_ro1.lua => replica_uuid_ro1)
+local INSTANCE_ID = string.match(arg[0], "%d")
+local USER = 'cluster'
+local PASSWORD = 'somepassword'
+local SOCKET_DIR = require('fio').cwd()
+local function instance_uri(instance_id)
+ --return 'localhost:'..(3310 + instance_id)
+ return SOCKET_DIR..'/replica_uuid_ro'..instance_id..'.sock';
+end
+
+-- start console first
+require('console').listen(os.getenv('ADMIN'))
+
+box.cfg({
+ instance_uuid = arg[1];
+ listen = instance_uri(INSTANCE_ID);
+-- log_level = 7;
+ replication = {
+ USER..':'..PASSWORD..'@'..instance_uri(1);
+ USER..':'..PASSWORD..'@'..instance_uri(2);
+ };
+ read_only = (INSTANCE_ID ~= '1' and true or false);
+})
+
+box.once("bootstrap", function()
+ local test_run = require('test_run').new()
+ box.schema.user.create(USER, { password = PASSWORD })
+ box.schema.user.grant(USER, 'replication')
+ box.schema.space.create('test', {engine = test_run:get_cfg('engine')})
+ box.space.test:create_index('primary')
+end)
diff --git a/test/replication/replica_uuid_ro1.lua b/test/replication/replica_uuid_ro1.lua
new file mode 120000
index 000000000..342d71c57
--- /dev/null
+++ b/test/replication/replica_uuid_ro1.lua
@@ -0,0 +1 @@
+replica_uuid_ro.lua
\ No newline at end of file
diff --git a/test/replication/replica_uuid_ro2.lua b/test/replication/replica_uuid_ro2.lua
new file mode 120000
index 000000000..342d71c57
--- /dev/null
+++ b/test/replication/replica_uuid_ro2.lua
@@ -0,0 +1 @@
+replica_uuid_ro.lua
\ No newline at end of file
diff --git a/test/replication/replicaset_ro_mostly.result b/test/replication/replicaset_ro_mostly.result
new file mode 100644
index 000000000..d753a182d
--- /dev/null
+++ b/test/replication/replicaset_ro_mostly.result
@@ -0,0 +1,59 @@
+-- gh-3257 check bootstrap with read-only replica in cluster.
+-- Old behaviour: failed, since read-only is chosen by uuid.
+test_run = require('test_run').new()
+---
+...
+SERVERS = {'replica_uuid_ro1', 'replica_uuid_ro2'}
+---
+...
+uuid = require('uuid')
+---
+...
+uuid1 = uuid.new()
+---
+...
+uuid2 = uuid.new()
+---
+...
+function sort_cmp(a, b) return a.time_low > b.time_low and true or false end
+---
+...
+function sort(t) table.sort(t, sort_cmp) return t end
+---
+...
+UUID = sort({uuid1, uuid2}, sort_cmp)
+---
+...
+create_cluster_cmd1 = 'create server %s with script="replication/%s.lua"'
+---
+...
+create_cluster_cmd2 = 'start server %s with args="%s", wait_load=False, wait=False'
+---
+...
+test_run:cmd("setopt delimiter ';'")
+---
+- true
+...
+function create_cluster_uuid(servers, uuids)
+ for i, name in ipairs(servers) do
+ test_run:cmd(create_cluster_cmd1:format(name, name))
+ test_run:cmd(create_cluster_cmd2:format(name, uuids[i]))
+ end
+end;
+---
+...
+test_run:cmd("setopt delimiter ''");
+---
+- true
+...
+-- Deploy a cluster.
+create_cluster_uuid(SERVERS, UUID)
+---
+...
+test_run:wait_fullmesh(SERVERS)
+---
+...
+-- Cleanup.
+test_run:drop_cluster(SERVERS)
+---
+...
diff --git a/test/replication/replicaset_ro_mostly.test.lua b/test/replication/replicaset_ro_mostly.test.lua
new file mode 100644
index 000000000..539ca5a13
--- /dev/null
+++ b/test/replication/replicaset_ro_mostly.test.lua
@@ -0,0 +1,30 @@
+-- gh-3257 check bootstrap with read-only replica in cluster.
+-- Old behaviour: failed, since read-only is chosen by uuid.
+test_run = require('test_run').new()
+
+SERVERS = {'replica_uuid_ro1', 'replica_uuid_ro2'}
+
+uuid = require('uuid')
+uuid1 = uuid.new()
+uuid2 = uuid.new()
+function sort_cmp(a, b) return a.time_low > b.time_low and true or false end
+function sort(t) table.sort(t, sort_cmp) return t end
+UUID = sort({uuid1, uuid2}, sort_cmp)
+
+create_cluster_cmd1 = 'create server %s with script="replication/%s.lua"'
+create_cluster_cmd2 = 'start server %s with args="%s", wait_load=False, wait=False'
+
+test_run:cmd("setopt delimiter ';'")
+function create_cluster_uuid(servers, uuids)
+ for i, name in ipairs(servers) do
+ test_run:cmd(create_cluster_cmd1:format(name, name))
+ test_run:cmd(create_cluster_cmd2:format(name, uuids[i]))
+ end
+end;
+test_run:cmd("setopt delimiter ''");
+
+-- Deploy a cluster.
+create_cluster_uuid(SERVERS, UUID)
+test_run:wait_fullmesh(SERVERS)
+-- Cleanup.
+test_run:drop_cluster(SERVERS)
--
2.14.3 (Apple Git-98)
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [tarantool-patches] [PATCH] replication: fix bug with read-only replica as a bootstrap leader
2018-04-11 16:02 [PATCH] replication: fix bug with read-only replica as a bootstrap leader Konstantin Belyavskiy
@ 2018-04-11 16:11 ` Georgy Kirichenko
2018-04-13 8:54 ` Vladimir Davydov
1 sibling, 0 replies; 9+ messages in thread
From: Georgy Kirichenko @ 2018-04-11 16:11 UTC (permalink / raw)
To: tarantool-patches; +Cc: Konstantin Belyavskiy, vdavydov
[-- Attachment #1: Type: text/plain, Size: 609 bytes --]
Generally looks good
Vladimir, please take a look on this
On Wednesday, April 11, 2018 7:02:27 PM MSK Konstantin Belyavskiy wrote:
> ticket: https://github.com/tarantool/tarantool/issues/3257
> branch:
> https://github.com/tarantool/tarantool/compare/gh-3257-fix-bug-with-ro-repl
> ica-as-a-leader
>
> When bootstrapping a new cluster, each replica from replicaset can
> be chosen as a leader, but if it is 'read-only', bootstrap will
> failed with an error.
> Fixed it by eliminating read-only replicas from voting by adding
> access rights information to IPROTO_REQUEST_VOTE reply.
>
> Closes #3257
> ---
[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] replication: fix bug with read-only replica as a bootstrap leader
2018-04-11 16:02 [PATCH] replication: fix bug with read-only replica as a bootstrap leader Konstantin Belyavskiy
2018-04-11 16:11 ` [tarantool-patches] " Georgy Kirichenko
@ 2018-04-13 8:54 ` Vladimir Davydov
2018-04-13 11:13 ` [tarantool-patches] " Konstantin Belyavskiy
1 sibling, 1 reply; 9+ messages in thread
From: Vladimir Davydov @ 2018-04-13 8:54 UTC (permalink / raw)
To: Konstantin Belyavskiy; +Cc: georgy, tarantool-patches
On Wed, Apr 11, 2018 at 07:02:27PM +0300, Konstantin Belyavskiy wrote:
> ticket: https://github.com/tarantool/tarantool/issues/3257
> branch: https://github.com/tarantool/tarantool/compare/gh-3257-fix-bug-with-ro-replica-as-a-leader
>
> When bootstrapping a new cluster, each replica from replicaset can
> be chosen as a leader, but if it is 'read-only', bootstrap will
> failed with an error.
> Fixed it by eliminating read-only replicas from voting by adding
> access rights information to IPROTO_REQUEST_VOTE reply.
>
> Closes #3257
> ---
> src/box/applier.cc | 3 +-
> src/box/applier.h | 2 +
> src/box/iproto.cc | 11 +++--
> src/box/iproto_constants.c | 2 +-
> src/box/iproto_constants.h | 1 +
> src/box/replication.cc | 8 ++++
> src/box/xrow.c | 29 ++++++++++---
> src/box/xrow.h | 54 ++++++++++++++++++-----
> test/replication/replica_uuid_ro.lua | 33 ++++++++++++++
> test/replication/replica_uuid_ro1.lua | 1 +
> test/replication/replica_uuid_ro2.lua | 1 +
> test/replication/replicaset_ro_mostly.result | 59 ++++++++++++++++++++++++++
> test/replication/replicaset_ro_mostly.test.lua | 30 +++++++++++++
> 13 files changed, 212 insertions(+), 22 deletions(-)
> create mode 100644 test/replication/replica_uuid_ro.lua
> create mode 120000 test/replication/replica_uuid_ro1.lua
> create mode 120000 test/replication/replica_uuid_ro2.lua
> create mode 100644 test/replication/replicaset_ro_mostly.result
> create mode 100644 test/replication/replicaset_ro_mostly.test.lua
>
> diff --git a/src/box/applier.cc b/src/box/applier.cc
> index 9aa951c34..b3a13260d 100644
> --- a/src/box/applier.cc
> +++ b/src/box/applier.cc
> @@ -223,7 +223,8 @@ applier_connect(struct applier *applier)
> if (row.type != IPROTO_OK)
> xrow_decode_error_xc(&row);
> vclock_create(&applier->vclock);
> - xrow_decode_vclock_xc(&row, &applier->vclock);
> + xrow_decode_request_vote_xc(&row, &applier->vclock,
> + &applier->read_only);
> }
>
> applier_set_state(applier, APPLIER_CONNECTED);
> diff --git a/src/box/applier.h b/src/box/applier.h
> index f25d6cb26..f47cf330f 100644
> --- a/src/box/applier.h
> +++ b/src/box/applier.h
> @@ -95,6 +95,8 @@ struct applier {
> uint32_t version_id;
> /** Remote vclock at time of connect. */
> struct vclock vclock;
> + /** Remote access rights, true if read-only, default: false */
The comment is misleading IMO as this variable doesn't have anything
to do with access rights. It just means that the master is running in
read-only mode. Please fix.
> + bool read_only;
I'd call it master_is_ro, because it's not the applier which is read
only - it's the master.
> /** Remote address */
> union {
> struct sockaddr addr;
> diff --git a/src/box/iproto.cc b/src/box/iproto.cc
> index db5820806..51970bb28 100644
> --- a/src/box/iproto.cc
> +++ b/src/box/iproto.cc
> @@ -60,6 +60,8 @@
> #include "iproto_constants.h"
> #include "rmean.h"
> #include "errinj.h"
> +#include "applier.h"
> +#include "cfg.h"
>
> /* The number of iproto messages in flight */
> enum { IPROTO_MSG_MAX = 768 };
> @@ -1352,6 +1354,7 @@ tx_process_misc(struct cmsg *m)
> goto error;
>
> try {
> + bool read_only = false;
> switch (msg->header.type) {
> case IPROTO_AUTH:
> box_process_auth(&msg->auth);
> @@ -1363,9 +1366,11 @@ tx_process_misc(struct cmsg *m)
> ::schema_version);
> break;
> case IPROTO_REQUEST_VOTE:
> - iproto_reply_vclock_xc(out, msg->header.sync,
> - ::schema_version,
> - &replicaset.vclock);
> + read_only = cfg_geti("read_only");
We have box_is_ro() for this, which also takes into account "orphan"
mode.
> + iproto_reply_request_vote_xc(out, msg->header.sync,
> + ::schema_version,
> + &replicaset.vclock,
> + read_only);
Nit: you don't need an extra variable here - let's pass box_is_ro()
directly to iproto_reply_request_vote_xc().
> break;
> default:
> unreachable();
> diff --git a/src/box/iproto_constants.c b/src/box/iproto_constants.c
> index cd7b1d03b..eaba6259e 100644
> --- a/src/box/iproto_constants.c
> +++ b/src/box/iproto_constants.c
> @@ -40,10 +40,10 @@ const unsigned char iproto_key_type[IPROTO_KEY_MAX] =
> /* 0x04 */ MP_DOUBLE, /* IPROTO_TIMESTAMP */
> /* 0x05 */ MP_UINT, /* IPROTO_SCHEMA_VERSION */
> /* 0x06 */ MP_UINT, /* IPROTO_SERVER_VERSION */
> + /* 0x07 */ MP_UINT, /* IPROTO_SERVER_READONLY */
> /* }}} */
>
> /* {{{ unused */
> - /* 0x07 */ MP_UINT,
> /* 0x08 */ MP_UINT,
> /* 0x09 */ MP_UINT,
> /* 0x0a */ MP_UINT,
> diff --git a/src/box/iproto_constants.h b/src/box/iproto_constants.h
> index 951842485..aae004584 100644
> --- a/src/box/iproto_constants.h
> +++ b/src/box/iproto_constants.h
> @@ -58,6 +58,7 @@ enum iproto_key {
> IPROTO_TIMESTAMP = 0x04,
> IPROTO_SCHEMA_VERSION = 0x05,
> IPROTO_SERVER_VERSION = 0x06,
> + IPROTO_SERVER_READONLY = 0x07,
IPROTO_SERVER_IS_RO, may be?
> /* Leave a gap for other keys in the header. */
> IPROTO_SPACE_ID = 0x10,
> IPROTO_INDEX_ID = 0x11,
> diff --git a/src/box/replication.cc b/src/box/replication.cc
> index b1c84d36c..423de2c88 100644
> --- a/src/box/replication.cc
> +++ b/src/box/replication.cc
> @@ -685,6 +685,14 @@ replicaset_leader(void)
> replicaset_foreach(replica) {
> if (replica->applier == NULL)
> continue;
> + /**
> + * While bootstrapping a new cluster,
> + * read-only replicas shouldn't be considered
> + * as a leader.
> + */
> + if (replica->applier->read_only &&
> + replica->applier->vclock.signature == 0)
> + continue;
What if you're adding a new replica to an existing cluster some nodes of
which are read only? AFAIU you want to choose an rw replica in this case
as well.
> if (leader == NULL) {
> leader = replica;
> continue;
> diff --git a/test/replication/replicaset_ro_mostly.test.lua b/test/replication/replicaset_ro_mostly.test.lua
> new file mode 100644
> index 000000000..539ca5a13
> --- /dev/null
> +++ b/test/replication/replicaset_ro_mostly.test.lua
> @@ -0,0 +1,30 @@
> +-- gh-3257 check bootstrap with read-only replica in cluster.
> +-- Old behaviour: failed, since read-only is chosen by uuid.
> +test_run = require('test_run').new()
> +
> +SERVERS = {'replica_uuid_ro1', 'replica_uuid_ro2'}
> +
> +uuid = require('uuid')
> +uuid1 = uuid.new()
> +uuid2 = uuid.new()
> +function sort_cmp(a, b) return a.time_low > b.time_low and true or false end
> +function sort(t) table.sort(t, sort_cmp) return t end
> +UUID = sort({uuid1, uuid2}, sort_cmp)
I don't think you really need to set UUID explicitly. If you don't, the
test will fail quite often anyway.
> +
> +create_cluster_cmd1 = 'create server %s with script="replication/%s.lua"'
> +create_cluster_cmd2 = 'start server %s with args="%s", wait_load=False, wait=False'
> +
> +test_run:cmd("setopt delimiter ';'")
> +function create_cluster_uuid(servers, uuids)
> + for i, name in ipairs(servers) do
> + test_run:cmd(create_cluster_cmd1:format(name, name))
> + test_run:cmd(create_cluster_cmd2:format(name, uuids[i]))
> + end
> +end;
> +test_run:cmd("setopt delimiter ''");
> +
> +-- Deploy a cluster.
> +create_cluster_uuid(SERVERS, UUID)
> +test_run:wait_fullmesh(SERVERS)
> +-- Cleanup.
> +test_run:drop_cluster(SERVERS)
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [tarantool-patches] Re: [PATCH] replication: fix bug with read-only replica as a bootstrap leader
2018-04-13 8:54 ` Vladimir Davydov
@ 2018-04-13 11:13 ` Konstantin Belyavskiy
2018-04-13 11:15 ` Re[2]: " Konstantin Belyavskiy
2018-04-13 12:38 ` Vladimir Davydov
0 siblings, 2 replies; 9+ messages in thread
From: Konstantin Belyavskiy @ 2018-04-13 11:13 UTC (permalink / raw)
To: tarantool-patches, Vladimir Davydov, georgy
[-- Attachment #1: Type: text/plain, Size: 8819 bytes --]
Please take a look at newer version.
>Пятница, 13 апреля 2018, 11:55 +03:00 от Vladimir Davydov <vdavydov.dev@gmail.com>:
>
>On Wed, Apr 11, 2018 at 07:02:27PM +0300, Konstantin Belyavskiy wrote:
>> ticket: https://github.com/tarantool/tarantool/issues/3257
>> branch: https://github.com/tarantool/tarantool/compare/gh-3257-fix-bug-with-ro-replica-as-a-leader
>>
>> When bootstrapping a new cluster, each replica from replicaset can
>> be chosen as a leader, but if it is 'read-only', bootstrap will
>> failed with an error.
>> Fixed it by eliminating read-only replicas from voting by adding
>> access rights information to IPROTO_REQUEST_VOTE reply.
>>
>> Closes #3257
>> ---
>> src/box/applier.cc | 3 +-
>> src/box/applier.h | 2 +
>> src/box/iproto.cc | 11 +++--
>> src/box/iproto_constants.c | 2 +-
>> src/box/iproto_constants.h | 1 +
>> src/box/replication.cc | 8 ++++
>> src/box/xrow.c | 29 ++++++++++---
>> src/box/xrow.h | 54 ++++++++++++++++++-----
>> test/replication/replica_uuid_ro.lua | 33 ++++++++++++++
>> test/replication/replica_uuid_ro1.lua | 1 +
>> test/replication/replica_uuid_ro2.lua | 1 +
>> test/replication/replicaset_ro_mostly.result | 59 ++++++++++++++++++++++++++
>> test/replication/replicaset_ro_mostly.test.lua | 30 +++++++++++++
>> 13 files changed, 212 insertions(+), 22 deletions(-)
>> create mode 100644 test/replication/replica_uuid_ro.lua
>> create mode 120000 test/replication/replica_uuid_ro1.lua
>> create mode 120000 test/replication/replica_uuid_ro2.lua
>> create mode 100644 test/replication/replicaset_ro_mostly.result
>> create mode 100644 test/replication/replicaset_ro_mostly.test.lua
>>
>> diff --git a/src/box/applier.cc b/src/box/applier.cc
>> index 9aa951c34..b3a13260d 100644
>> --- a/src/box/applier.cc
>> +++ b/src/box/applier.cc
>> @@ -223,7 +223,8 @@ applier_connect(struct applier *applier)
>> if (row.type != IPROTO_OK)
>> xrow_decode_error_xc(&row);
>> vclock_create(&applier->vclock);
>> - xrow_decode_vclock_xc(&row, &applier->vclock);
>> + xrow_decode_request_vote_xc(&row, &applier->vclock,
>> + &applier->read_only);
>> }
>>
>> applier_set_state(applier, APPLIER_CONNECTED);
>> diff --git a/src/box/applier.h b/src/box/applier.h
>> index f25d6cb26..f47cf330f 100644
>> --- a/src/box/applier.h
>> +++ b/src/box/applier.h
>> @@ -95,6 +95,8 @@ struct applier {
>> uint32_t version_id;
>> /** Remote vclock at time of connect. */
>> struct vclock vclock;
>> + /** Remote access rights, true if read-only, default: false */
>
>The comment is misleading IMO as this variable doesn't have anything
>to do with access rights. It just means that the master is running in
>read-only mode. Please fix.
Fixed
Remote peer mode, true if read-only, default: false
>
>
>> + bool read_only;
>
>I'd call it master_is_ro, because it's not the applier which is read
>only - it's the master.
remote_is_ro
>
>
>> /** Remote address */
>> union {
>> struct sockaddr addr;
>> diff --git a/src/box/iproto.cc b/src/box/iproto.cc
>> index db5820806..51970bb28 100644
>> --- a/src/box/iproto.cc
>> +++ b/src/box/iproto.cc
>> @@ -60,6 +60,8 @@
>> #include "iproto_constants.h"
>> #include "rmean.h"
>> #include "errinj.h"
>> +#include "applier.h"
>> +#include "cfg.h"
>>
>> /* The number of iproto messages in flight */
>> enum { IPROTO_MSG_MAX = 768 };
>> @@ -1352,6 +1354,7 @@ tx_process_misc(struct cmsg *m)
>> goto error;
>>
>> try {
>> + bool read_only = false;
>> switch (msg->header.type) {
>> case IPROTO_AUTH:
>> box_process_auth(&msg->auth);
>> @@ -1363,9 +1366,11 @@ tx_process_misc(struct cmsg *m)
>> ::schema_version);
>> break;
>> case IPROTO_REQUEST_VOTE:
>> - iproto_reply_vclock_xc(out, msg->header.sync,
>> - ::schema_version,
>> - &replicaset.vclock);
>
>> + read_only = cfg_geti("read_only");
>
>We have box_is_ro() for this, which also takes into account "orphan"
>mode.
No, during bootstrap it is in "orphan" mode. So this will actually crash it.
Also after bootstrap we should use a node with the most recent vclock.
>
>
>> + iproto_reply_request_vote_xc(out, msg->header.sync,
>> + ::schema_version,
>> + &replicaset.vclock,
>> + read_only);
>
>Nit: you don't need an extra variable here - let's pass box_is_ro()
>directly to iproto_reply_request_vote_xc().
To be honest, I don't think it's necessary.
iproto_reply_request_vote_xc just pass all parameters to
iproto_reply_request_vote
and it construct reply from these params.
So it will create a mixed logic then some params are get internally, others
from input.
Let's leave it as is.
>
>> break;
>> default:
>> unreachable();
>> diff --git a/src/box/iproto_constants.c b/src/box/iproto_constants.c
>> index cd7b1d03b..eaba6259e 100644
>> --- a/src/box/iproto_constants.c
>> +++ b/src/box/iproto_constants.c
>> @@ -40,10 +40,10 @@ const unsigned char iproto_key_type[IPROTO_KEY_MAX] =
>> /* 0x04 */ MP_DOUBLE, /* IPROTO_TIMESTAMP */
>> /* 0x05 */ MP_UINT, /* IPROTO_SCHEMA_VERSION */
>> /* 0x06 */ MP_UINT, /* IPROTO_SERVER_VERSION */
>> + /* 0x07 */ MP_UINT, /* IPROTO_SERVER_READONLY */
>> /* }}} */
>>
>> /* {{{ unused */
>> - /* 0x07 */ MP_UINT,
>> /* 0x08 */ MP_UINT,
>> /* 0x09 */ MP_UINT,
>> /* 0x0a */ MP_UINT,
>> diff --git a/src/box/iproto_constants.h b/src/box/iproto_constants.h
>> index 951842485..aae004584 100644
>> --- a/src/box/iproto_constants.h
>> +++ b/src/box/iproto_constants.h
>> @@ -58,6 +58,7 @@ enum iproto_key {
>> IPROTO_TIMESTAMP = 0x04,
>> IPROTO_SCHEMA_VERSION = 0x05,
>> IPROTO_SERVER_VERSION = 0x06,
>> + IPROTO_SERVER_READONLY = 0x07,
>
>IPROTO_SERVER_IS_RO, may be?
Ok, fixed.
>
>
>> /* Leave a gap for other keys in the header. */
>> IPROTO_SPACE_ID = 0x10,
>> IPROTO_INDEX_ID = 0x11,
>> diff --git a/src/box/replication.cc b/src/box/replication.cc
>> index b1c84d36c..423de2c88 100644
>> --- a/src/box/replication.cc
>> +++ b/src/box/replication.cc
>> @@ -685,6 +685,14 @@ replicaset_leader(void)
>> replicaset_foreach(replica) {
>> if (replica->applier == NULL)
>> continue;
>> + /**
>> + * While bootstrapping a new cluster,
>> + * read-only replicas shouldn't be considered
>> + * as a leader.
>> + */
>> + if (replica->applier->read_only &&
>> + replica->applier->vclock.signature == 0)
>> + continue;
>
>What if you're adding a new replica to an existing cluster some nodes of
>which are read only? AFAIU you want to choose an rw replica in this case
>as well.
No, we improve only one case: bootstrapping a new cluster (and do not change others).
In other cases it's more difficult and may be we need to develop a specific protocol
to solve it in general case. But it's a case for a new ticket.
>
>
>> if (leader == NULL) {
>> leader = replica;
>> continue;
>
>> diff --git a/test/replication/replicaset_ro_mostly.test.lua b/test/replication/replicaset_ro_mostly.test.lua
>> new file mode 100644
>> index 000000000..539ca5a13
>> --- /dev/null
>> +++ b/test/replication/replicaset_ro_mostly.test.lua
>> @@ -0,0 +1,30 @@
>> +-- gh-3257 check bootstrap with read-only replica in cluster.
>> +-- Old behaviour: failed, since read-only is chosen by uuid.
>> +test_run = require('test_run').new()
>> +
>> +SERVERS = {'replica_uuid_ro1', 'replica_uuid_ro2'}
>> +
>> +uuid = require('uuid')
>> +uuid1 = uuid.new()
>> +uuid2 = uuid.new()
>> +function sort_cmp(a, b) return a.time_low > b.time_low and true or false end
>> +function sort(t) table.sort(t, sort_cmp) return t end
>> +UUID = sort({uuid1, uuid2}, sort_cmp)
>
>I don't think you really need to set UUID explicitly. If you don't, the
>test will fail quite often anyway.
It's to make it 100% reproducible. In other cases it's a probability test (not good).
>
>
>> +
>> +create_cluster_cmd1 = 'create server %s with script="replication/%s.lua"'
>> +create_cluster_cmd2 = 'start server %s with args="%s", wait_load=False, wait=False'
>> +
>> +test_run:cmd("setopt delimiter ';'")
>> +function create_cluster_uuid(servers, uuids)
>> + for i, name in ipairs(servers) do
>> + test_run:cmd(create_cluster_cmd1:format(name, name))
>> + test_run:cmd(create_cluster_cmd2:format(name, uuids[i]))
>> + end
>> +end;
>> +test_run:cmd("setopt delimiter ''");
>> +
>> +-- Deploy a cluster.
>> +create_cluster_uuid(SERVERS, UUID)
>> +test_run:wait_fullmesh(SERVERS)
>> +-- Cleanup.
>> +test_run:drop_cluster(SERVERS)
>
Best regards,
Konstantin Belyavskiy
k.belyavskiy@tarantool.org
[-- Attachment #2: Type: text/html, Size: 12563 bytes --]
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re[2]: [tarantool-patches] Re: [PATCH] replication: fix bug with read-only replica as a bootstrap leader
2018-04-13 11:13 ` [tarantool-patches] " Konstantin Belyavskiy
@ 2018-04-13 11:15 ` Konstantin Belyavskiy
2018-04-13 12:38 ` Vladimir Davydov
1 sibling, 0 replies; 9+ messages in thread
From: Konstantin Belyavskiy @ 2018-04-13 11:15 UTC (permalink / raw)
To: tarantool-patches, Vladimir Davydov, georgy
[-- Attachment #1: Type: text/plain, Size: 9315 bytes --]
>Пятница, 13 апреля 2018, 14:13 +03:00 от Konstantin Belyavskiy <k.belyavskiy@tarantool.org>:
>
>Please take a look at newer version.
>
>>Пятница, 13 апреля 2018, 11:55 +03:00 от Vladimir Davydov < vdavydov.dev@gmail.com >:
>>
>>On Wed, Apr 11, 2018 at 07:02:27PM +0300, Konstantin Belyavskiy wrote:
>>> ticket: https://github.com/tarantool/tarantool/issues/3257
>>> branch: https://github.com/tarantool/tarantool/compare/gh-3257-fix-bug-with-ro-replica-as-a-leader
>>>
>>> When bootstrapping a new cluster, each replica from replicaset can
>>> be chosen as a leader, but if it is 'read-only', bootstrap will
>>> failed with an error.
>>> Fixed it by eliminating read-only replicas from voting by adding
>>> access rights information to IPROTO_REQUEST_VOTE reply.
>>>
>>> Closes #3257
>>> ---
>>> src/box/applier.cc | 3 +-
>>> src/box/applier.h | 2 +
>>> src/box/iproto.cc | 11 +++--
>>> src/box/iproto_constants.c | 2 +-
>>> src/box/iproto_constants.h | 1 +
>>> src/box/replication.cc | 8 ++++
>>> src/box/xrow.c | 29 ++++++++++---
>>> src/box/xrow.h | 54 ++++++++++++++++++-----
>>> test/replication/replica_uuid_ro.lua | 33 ++++++++++++++
>>> test/replication/replica_uuid_ro1.lua | 1 +
>>> test/replication/replica_uuid_ro2.lua | 1 +
>>> test/replication/replicaset_ro_mostly.result | 59 ++++++++++++++++++++++++++
>>> test/replication/replicaset_ro_mostly.test.lua | 30 +++++++++++++
>>> 13 files changed, 212 insertions(+), 22 deletions(-)
>>> create mode 100644 test/replication/replica_uuid_ro.lua
>>> create mode 120000 test/replication/replica_uuid_ro1.lua
>>> create mode 120000 test/replication/replica_uuid_ro2.lua
>>> create mode 100644 test/replication/replicaset_ro_mostly.result
>>> create mode 100644 test/replication/replicaset_ro_mostly.test.lua
>>>
>>> diff --git a/src/box/applier.cc b/src/box/applier.cc
>>> index 9aa951c34..b3a13260d 100644
>>> --- a/src/box/applier.cc
>>> +++ b/src/box/applier.cc
>>> @@ -223,7 +223,8 @@ applier_connect(struct applier *applier)
>>> if (row.type != IPROTO_OK)
>>> xrow_decode_error_xc(&row);
>>> vclock_create(&applier->vclock);
>>> - xrow_decode_vclock_xc(&row, &applier->vclock);
>>> + xrow_decode_request_vote_xc(&row, &applier->vclock,
>>> + &applier->read_only);
>>> }
>>>
>>> applier_set_state(applier, APPLIER_CONNECTED);
>>> diff --git a/src/box/applier.h b/src/box/applier.h
>>> index f25d6cb26..f47cf330f 100644
>>> --- a/src/box/applier.h
>>> +++ b/src/box/applier.h
>>> @@ -95,6 +95,8 @@ struct applier {
>>> uint32_t version_id;
>>> /** Remote vclock at time of connect. */
>>> struct vclock vclock;
>>> + /** Remote access rights, true if read-only, default: false */
>>
>>The comment is misleading IMO as this variable doesn't have anything
>>to do with access rights. It just means that the master is running in
>>read-only mode. Please fix.
>Fixed
>Remote peer mode, true if read-only, default: false
>>
>>
>>> + bool read_only;
>>
>>I'd call it master_is_ro, because it's not the applier which is read
>>only - it's the master.
>remote_is_ro
>>
>>
>>> /** Remote address */
>>> union {
>>> struct sockaddr addr;
>>> diff --git a/src/box/iproto.cc b/src/box/iproto.cc
>>> index db5820806..51970bb28 100644
>>> --- a/src/box/iproto.cc
>>> +++ b/src/box/iproto.cc
>>> @@ -60,6 +60,8 @@
>>> #include "iproto_constants.h"
>>> #include "rmean.h"
>>> #include "errinj.h"
>>> +#include "applier.h"
>>> +#include "cfg.h"
>>>
>>> /* The number of iproto messages in flight */
>>> enum { IPROTO_MSG_MAX = 768 };
>>> @@ -1352,6 +1354,7 @@ tx_process_misc(struct cmsg *m)
>>> goto error;
>>>
>>> try {
>>> + bool read_only = false;
>>> switch (msg->header.type) {
>>> case IPROTO_AUTH:
>>> box_process_auth(&msg->auth);
>>> @@ -1363,9 +1366,11 @@ tx_process_misc(struct cmsg *m)
>>> ::schema_version);
>>> break;
>>> case IPROTO_REQUEST_VOTE:
>>> - iproto_reply_vclock_xc(out, msg->header.sync,
>>> - ::schema_version,
>>> - &replicaset.vclock);
>>
>>> + read_only = cfg_geti("read_only");
>>
>>We have box_is_ro() for this, which also takes into account "orphan"
>>mode.
>No, during bootstrap it is in "orphan" mode. So this will actually crash it.
>Also after bootstrap we should use a node with the most recent vclock.
>>
>>
>>> + iproto_reply_request_vote_xc(out, msg->header.sync,
>>> + ::schema_version,
>>> + &replicaset.vclock,
>>> + read_only);
>>
>>Nit: you don't need an extra variable here - let's pass box_is_ro()
>>directly to iproto_reply_request_vote_xc().
>To be honest, I don't think it's necessary.
>iproto_reply_request_vote_xc just pass all parameters to
>iproto_reply_request_vote
>and it construct reply from these params.
>So it will create a mixed logic then some params are get internally, others
>from input.
>Let's leave it as is.
Sorry, understand, just remove a variable and make it one line shorter.
>>
>>> break;
>>> default:
>>> unreachable();
>>> diff --git a/src/box/iproto_constants.c b/src/box/iproto_constants.c
>>> index cd7b1d03b..eaba6259e 100644
>>> --- a/src/box/iproto_constants.c
>>> +++ b/src/box/iproto_constants.c
>>> @@ -40,10 +40,10 @@ const unsigned char iproto_key_type[IPROTO_KEY_MAX] =
>>> /* 0x04 */ MP_DOUBLE, /* IPROTO_TIMESTAMP */
>>> /* 0x05 */ MP_UINT, /* IPROTO_SCHEMA_VERSION */
>>> /* 0x06 */ MP_UINT, /* IPROTO_SERVER_VERSION */
>>> + /* 0x07 */ MP_UINT, /* IPROTO_SERVER_READONLY */
>>> /* }}} */
>>>
>>> /* {{{ unused */
>>> - /* 0x07 */ MP_UINT,
>>> /* 0x08 */ MP_UINT,
>>> /* 0x09 */ MP_UINT,
>>> /* 0x0a */ MP_UINT,
>>> diff --git a/src/box/iproto_constants.h b/src/box/iproto_constants.h
>>> index 951842485..aae004584 100644
>>> --- a/src/box/iproto_constants.h
>>> +++ b/src/box/iproto_constants.h
>>> @@ -58,6 +58,7 @@ enum iproto_key {
>>> IPROTO_TIMESTAMP = 0x04,
>>> IPROTO_SCHEMA_VERSION = 0x05,
>>> IPROTO_SERVER_VERSION = 0x06,
>>> + IPROTO_SERVER_READONLY = 0x07,
>>
>>IPROTO_SERVER_IS_RO, may be?
>Ok, fixed.
>>
>>
>>> /* Leave a gap for other keys in the header. */
>>> IPROTO_SPACE_ID = 0x10,
>>> IPROTO_INDEX_ID = 0x11,
>>> diff --git a/src/box/replication.cc b/src/box/replication.cc
>>> index b1c84d36c..423de2c88 100644
>>> --- a/src/box/replication.cc
>>> +++ b/src/box/replication.cc
>>> @@ -685,6 +685,14 @@ replicaset_leader(void)
>>> replicaset_foreach(replica) {
>>> if (replica->applier == NULL)
>>> continue;
>>> + /**
>>> + * While bootstrapping a new cluster,
>>> + * read-only replicas shouldn't be considered
>>> + * as a leader.
>>> + */
>>> + if (replica->applier->read_only &&
>>> + replica->applier->vclock.signature == 0)
>>> + continue;
>>
>>What if you're adding a new replica to an existing cluster some nodes of
>>which are read only? AFAIU you want to choose an rw replica in this case
>>as well.
>No, we improve only one case: bootstrapping a new cluster (and do not change others).
>In other cases it's more difficult and may be we need to develop a specific protocol
>to solve it in general case. But it's a case for a new ticket.
>>
>>
>>> if (leader == NULL) {
>>> leader = replica;
>>> continue;
>>
>>> diff --git a/test/replication/replicaset_ro_mostly.test.lua b/test/replication/replicaset_ro_mostly.test.lua
>>> new file mode 100644
>>> index 000000000..539ca5a13
>>> --- /dev/null
>>> +++ b/test/replication/replicaset_ro_mostly.test.lua
>>> @@ -0,0 +1,30 @@
>>> +-- gh-3257 check bootstrap with read-only replica in cluster.
>>> +-- Old behaviour: failed, since read-only is chosen by uuid.
>>> +test_run = require('test_run').new()
>>> +
>>> +SERVERS = {'replica_uuid_ro1', 'replica_uuid_ro2'}
>>> +
>>> +uuid = require('uuid')
>>> +uuid1 = uuid.new()
>>> +uuid2 = uuid.new()
>>> +function sort_cmp(a, b) return a.time_low > b.time_low and true or false end
>>> +function sort(t) table.sort(t, sort_cmp) return t end
>>> +UUID = sort({uuid1, uuid2}, sort_cmp)
>>
>>I don't think you really need to set UUID explicitly. If you don't, the
>>test will fail quite often anyway.
>It's to make it 100% reproducible. In other cases it's a probability test (not good).
>>
>>
>>> +
>>> +create_cluster_cmd1 = 'create server %s with script="replication/%s.lua"'
>>> +create_cluster_cmd2 = 'start server %s with args="%s", wait_load=False, wait=False'
>>> +
>>> +test_run:cmd("setopt delimiter ';'")
>>> +function create_cluster_uuid(servers, uuids)
>>> + for i, name in ipairs(servers) do
>>> + test_run:cmd(create_cluster_cmd1:format(name, name))
>>> + test_run:cmd(create_cluster_cmd2:format(name, uuids[i]))
>>> + end
>>> +end;
>>> +test_run:cmd("setopt delimiter ''");
>>> +
>>> +-- Deploy a cluster.
>>> +create_cluster_uuid(SERVERS, UUID)
>>> +test_run:wait_fullmesh(SERVERS)
>>> +-- Cleanup.
>>> +test_run:drop_cluster(SERVERS)
>>
>
>
>Best regards,
>Konstantin Belyavskiy
>k.belyavskiy@tarantool.org
Best regards,
Konstantin Belyavskiy
k.belyavskiy@tarantool.org
[-- Attachment #2: Type: text/html, Size: 14180 bytes --]
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [tarantool-patches] Re: [PATCH] replication: fix bug with read-only replica as a bootstrap leader
2018-04-13 11:13 ` [tarantool-patches] " Konstantin Belyavskiy
2018-04-13 11:15 ` Re[2]: " Konstantin Belyavskiy
@ 2018-04-13 12:38 ` Vladimir Davydov
2018-04-18 10:34 ` Vladimir Davydov
1 sibling, 1 reply; 9+ messages in thread
From: Vladimir Davydov @ 2018-04-13 12:38 UTC (permalink / raw)
To: Konstantin Belyavskiy; +Cc: tarantool-patches, georgy
On Fri, Apr 13, 2018 at 02:13:34PM +0300, Konstantin Belyavskiy wrote:
> Please take a look at newer version.
When you fix something like this, please include the new version of the
patch in the email.
The patch itself looks OK to me. Posting it here for the record.
> From 983b98a91dcd70a3013de373b55294a6af468e17 Mon Sep 17 00:00:00 2001
> From: Konstantin Belyavskiy <k.belyavskiy@tarantool.org>
> Date: Mon, 9 Apr 2018 16:32:26 +0300
> Subject: [PATCH] replication: fix bug with read-only replica as a bootstrap
> leader
>
> When bootstrapping a new cluster, each replica from replicaset can
> be chosen as a leader, but if it is 'read-only', bootstrap will
> failed with an error.
> Fixed it by eliminating read-only replicas from voting by adding
> access rights information to IPROTO_REQUEST_VOTE reply.
>
> Closes #3257
>
> diff --git a/src/box/applier.cc b/src/box/applier.cc
> index 9aa951c3..127a8c90 100644
> --- a/src/box/applier.cc
> +++ b/src/box/applier.cc
> @@ -223,7 +223,8 @@ applier_connect(struct applier *applier)
> if (row.type != IPROTO_OK)
> xrow_decode_error_xc(&row);
> vclock_create(&applier->vclock);
> - xrow_decode_vclock_xc(&row, &applier->vclock);
> + xrow_decode_request_vote_xc(&row, &applier->vclock,
> + &applier->remote_is_ro);
> }
>
> applier_set_state(applier, APPLIER_CONNECTED);
> diff --git a/src/box/applier.h b/src/box/applier.h
> index f25d6cb2..392113e2 100644
> --- a/src/box/applier.h
> +++ b/src/box/applier.h
> @@ -95,6 +95,8 @@ struct applier {
> uint32_t version_id;
> /** Remote vclock at time of connect. */
> struct vclock vclock;
> + /** Remote peer mode, true if read-only, default: false */
> + bool remote_is_ro;
> /** Remote address */
> union {
> struct sockaddr addr;
> diff --git a/src/box/iproto.cc b/src/box/iproto.cc
> index db582080..81938ce0 100644
> --- a/src/box/iproto.cc
> +++ b/src/box/iproto.cc
> @@ -60,6 +60,8 @@
> #include "iproto_constants.h"
> #include "rmean.h"
> #include "errinj.h"
> +#include "applier.h"
> +#include "cfg.h"
>
> /* The number of iproto messages in flight */
> enum { IPROTO_MSG_MAX = 768 };
> @@ -1363,9 +1365,10 @@ tx_process_misc(struct cmsg *m)
> ::schema_version);
> break;
> case IPROTO_REQUEST_VOTE:
> - iproto_reply_vclock_xc(out, msg->header.sync,
> - ::schema_version,
> - &replicaset.vclock);
> + iproto_reply_request_vote_xc(out, msg->header.sync,
> + ::schema_version,
> + &replicaset.vclock,
> + cfg_geti("read_only"));
> break;
> default:
> unreachable();
> diff --git a/src/box/iproto_constants.c b/src/box/iproto_constants.c
> index cd7b1d03..3735a91a 100644
> --- a/src/box/iproto_constants.c
> +++ b/src/box/iproto_constants.c
> @@ -40,10 +40,10 @@ const unsigned char iproto_key_type[IPROTO_KEY_MAX] =
> /* 0x04 */ MP_DOUBLE, /* IPROTO_TIMESTAMP */
> /* 0x05 */ MP_UINT, /* IPROTO_SCHEMA_VERSION */
> /* 0x06 */ MP_UINT, /* IPROTO_SERVER_VERSION */
> + /* 0x07 */ MP_UINT, /* IPROTO_SERVER_IS_RO */
> /* }}} */
>
> /* {{{ unused */
> - /* 0x07 */ MP_UINT,
> /* 0x08 */ MP_UINT,
> /* 0x09 */ MP_UINT,
> /* 0x0a */ MP_UINT,
> diff --git a/src/box/iproto_constants.h b/src/box/iproto_constants.h
> index 95184248..35812228 100644
> --- a/src/box/iproto_constants.h
> +++ b/src/box/iproto_constants.h
> @@ -58,6 +58,7 @@ enum iproto_key {
> IPROTO_TIMESTAMP = 0x04,
> IPROTO_SCHEMA_VERSION = 0x05,
> IPROTO_SERVER_VERSION = 0x06,
> + IPROTO_SERVER_IS_RO = 0x07,
> /* Leave a gap for other keys in the header. */
> IPROTO_SPACE_ID = 0x10,
> IPROTO_INDEX_ID = 0x11,
> diff --git a/src/box/replication.cc b/src/box/replication.cc
> index b1c84d36..e29d4ee1 100644
> --- a/src/box/replication.cc
> +++ b/src/box/replication.cc
> @@ -685,6 +685,14 @@ replicaset_leader(void)
> replicaset_foreach(replica) {
> if (replica->applier == NULL)
> continue;
> + /**
> + * While bootstrapping a new cluster,
> + * read-only replicas shouldn't be considered
> + * as a leader.
> + */
> + if (replica->applier->remote_is_ro &&
> + replica->applier->vclock.signature == 0)
> + continue;
> if (leader == NULL) {
> leader = replica;
> continue;
> diff --git a/src/box/xrow.c b/src/box/xrow.c
> index f4852564..b3f81a86 100644
> --- a/src/box/xrow.c
> +++ b/src/box/xrow.c
> @@ -261,14 +261,16 @@ iproto_reply_ok(struct obuf *out, uint64_t sync, uint32_t schema_version)
> }
>
> int
> -iproto_reply_vclock(struct obuf *out, uint64_t sync, uint32_t schema_version,
> - const struct vclock *vclock)
> +iproto_reply_request_vote(struct obuf *out, uint64_t sync,
> + uint32_t schema_version, const struct vclock *vclock,
> + bool read_only)
> {
> uint32_t replicaset_size = vclock_size(vclock);
> - size_t max_size = IPROTO_HEADER_LEN + mp_sizeof_map(1) +
> + size_t max_size = IPROTO_HEADER_LEN + mp_sizeof_map(2) +
> mp_sizeof_uint(UINT32_MAX) + mp_sizeof_map(replicaset_size) +
> replicaset_size * (mp_sizeof_uint(UINT32_MAX) +
> - mp_sizeof_uint(UINT64_MAX));
> + mp_sizeof_uint(UINT64_MAX)) +
> + mp_sizeof_uint(UINT32_MAX) + mp_sizeof_bool(true);
>
> char *buf = obuf_reserve(out, max_size);
> if (buf == NULL) {
> @@ -278,7 +280,9 @@ iproto_reply_vclock(struct obuf *out, uint64_t sync, uint32_t schema_version,
> }
>
> char *data = buf + IPROTO_HEADER_LEN;
> - data = mp_encode_map(data, 1);
> + data = mp_encode_map(data, 2);
> + data = mp_encode_uint(data, IPROTO_SERVER_IS_RO);
> + data = mp_encode_bool(data, read_only);
> data = mp_encode_uint(data, IPROTO_VCLOCK);
> data = mp_encode_map(data, replicaset_size);
> struct vclock_iterator it;
> @@ -837,7 +841,7 @@ xrow_encode_subscribe(struct xrow_header *row,
> int
> xrow_decode_subscribe(struct xrow_header *row, struct tt_uuid *replicaset_uuid,
> struct tt_uuid *instance_uuid, struct vclock *vclock,
> - uint32_t *version_id)
> + uint32_t *version_id, bool *read_only)
> {
> if (row->bodycnt == 0) {
> diag_set(ClientError, ER_INVALID_MSGPACK, "request body");
> @@ -852,6 +856,9 @@ xrow_decode_subscribe(struct xrow_header *row, struct tt_uuid *replicaset_uuid,
> return -1;
> }
>
> + /* For backward compatibility initialize read-only with false. */
> + if (read_only)
> + *read_only = false;
> const char *lsnmap = NULL;
> d = data;
> uint32_t map_size = mp_decode_map(&d);
> @@ -896,6 +903,16 @@ xrow_decode_subscribe(struct xrow_header *row, struct tt_uuid *replicaset_uuid,
> }
> *version_id = mp_decode_uint(&d);
> break;
> + case IPROTO_SERVER_IS_RO:
> + if (read_only == NULL)
> + goto skip;
> + if (mp_typeof(*d) != MP_BOOL) {
> + diag_set(ClientError, ER_INVALID_MSGPACK,
> + "invalid STATUS");
> + return -1;
> + }
> + *read_only = mp_decode_bool(&d);
> + break;
> default: skip:
> mp_next(&d); /* value */
> }
> diff --git a/src/box/xrow.h b/src/box/xrow.h
> index d407d151..b10bf26d 100644
> --- a/src/box/xrow.h
> +++ b/src/box/xrow.h
> @@ -251,6 +251,8 @@ xrow_encode_subscribe(struct xrow_header *row,
> * @param[out] replicaset_uuid.
> * @param[out] instance_uuid.
> * @param[out] vclock.
> + * @param[out] version_id.
> + * @param[out] read_only.
> *
> * @retval 0 Success.
> * @retval -1 Memory or format error.
> @@ -258,7 +260,7 @@ xrow_encode_subscribe(struct xrow_header *row,
> int
> xrow_decode_subscribe(struct xrow_header *row, struct tt_uuid *replicaset_uuid,
> struct tt_uuid *instance_uuid, struct vclock *vclock,
> - uint32_t *version_id);
> + uint32_t *version_id, bool *read_only);
>
> /**
> * Encode JOIN command.
> @@ -282,7 +284,8 @@ xrow_encode_join(struct xrow_header *row, const struct tt_uuid *instance_uuid);
> static inline int
> xrow_decode_join(struct xrow_header *row, struct tt_uuid *instance_uuid)
> {
> - return xrow_decode_subscribe(row, NULL, instance_uuid, NULL, NULL);
> + return xrow_decode_subscribe(row, NULL, instance_uuid, NULL, NULL,
> + NULL);
> }
>
> /**
> @@ -307,7 +310,23 @@ xrow_encode_vclock(struct xrow_header *row, const struct vclock *vclock);
> static inline int
> xrow_decode_vclock(struct xrow_header *row, struct vclock *vclock)
> {
> - return xrow_decode_subscribe(row, NULL, NULL, vclock, NULL);
> + return xrow_decode_subscribe(row, NULL, NULL, vclock, NULL, NULL);
> +}
> +
> +/**
> + * Decode peer vclock and access rights (a response to VOTE command).
> + * @param row Row to decode.
> + * @param[out] vclock.
> + * @param[out] read_only.
> + *
> + * @retval 0 Success.
> + * @retval -1 Memory or format error.
> + */
> +static inline int
> +xrow_decode_request_vote(struct xrow_header *row, struct vclock *vclock,
> + bool *read_only)
> +{
> + return xrow_decode_subscribe(row, NULL, NULL, vclock, NULL, read_only);
> }
>
> /**
> @@ -374,13 +393,15 @@ iproto_reply_ok(struct obuf *out, uint64_t sync, uint32_t schema_version);
> * @param sync Request sync.
> * @param schema_version.
> * @param vclock.
> + * @param read_only.
> *
> * @retval 0 Success.
> * @retval -1 Memory error.
> */
> int
> -iproto_reply_vclock(struct obuf *out, uint64_t sync, uint32_t schema_version,
> - const struct vclock *vclock);
> +iproto_reply_request_vote(struct obuf *out, uint64_t sync,
> + uint32_t schema_version, const struct vclock *vclock,
> + bool read_only);
>
> /**
> * Write an error packet int output buffer. Doesn't throw if out
> @@ -571,7 +592,7 @@ xrow_decode_subscribe_xc(struct xrow_header *row,
> uint32_t *replica_version_id)
> {
> if (xrow_decode_subscribe(row, replicaset_uuid, instance_uuid,
> - vclock, replica_version_id) != 0)
> + vclock, replica_version_id, NULL) != 0)
> diag_raise();
> }
>
> @@ -608,6 +629,15 @@ xrow_decode_vclock_xc(struct xrow_header *row, struct vclock *vclock)
> diag_raise();
> }
>
> +/** @copydoc xrow_decode_request_vote. */
> +static inline void
> +xrow_decode_request_vote_xc(struct xrow_header *row, struct vclock *vclock,
> + bool *read_only)
> +{
> + if (xrow_decode_request_vote(row, vclock, read_only) != 0)
> + diag_raise();
> +}
> +
> /** @copydoc iproto_reply_ok. */
> static inline void
> iproto_reply_ok_xc(struct obuf *out, uint64_t sync, uint32_t schema_version)
> @@ -616,12 +646,14 @@ iproto_reply_ok_xc(struct obuf *out, uint64_t sync, uint32_t schema_version)
> diag_raise();
> }
>
> -/** @copydoc iproto_reply_vclock. */
> +/** @copydoc iproto_reply_request_vote_xc. */
> static inline void
> -iproto_reply_vclock_xc(struct obuf *out, uint64_t sync, uint32_t schema_version,
> - const struct vclock *vclock)
> +iproto_reply_request_vote_xc(struct obuf *out, uint64_t sync,
> + uint32_t schema_version,
> + const struct vclock *vclock, bool read_only)
> {
> - if (iproto_reply_vclock(out, sync, schema_version, vclock) != 0)
> + if (iproto_reply_request_vote(out, sync, schema_version,
> + vclock, read_only) != 0)
> diag_raise();
> }
>
> diff --git a/test/replication/replica_uuid_ro.lua b/test/replication/replica_uuid_ro.lua
> new file mode 100644
> index 00000000..dd5dae9e
> --- /dev/null
> +++ b/test/replication/replica_uuid_ro.lua
> @@ -0,0 +1,33 @@
> +#!/usr/bin/env tarantool
> +
> +-- get instance name from filename (replica_uuid_ro1.lua => replica_uuid_ro1)
> +local INSTANCE_ID = string.match(arg[0], "%d")
> +local USER = 'cluster'
> +local PASSWORD = 'somepassword'
> +local SOCKET_DIR = require('fio').cwd()
> +local function instance_uri(instance_id)
> + --return 'localhost:'..(3310 + instance_id)
> + return SOCKET_DIR..'/replica_uuid_ro'..instance_id..'.sock';
> +end
> +
> +-- start console first
> +require('console').listen(os.getenv('ADMIN'))
> +
> +box.cfg({
> + instance_uuid = arg[1];
> + listen = instance_uri(INSTANCE_ID);
> +-- log_level = 7;
> + replication = {
> + USER..':'..PASSWORD..'@'..instance_uri(1);
> + USER..':'..PASSWORD..'@'..instance_uri(2);
> + };
> + read_only = (INSTANCE_ID ~= '1' and true or false);
> +})
> +
> +box.once("bootstrap", function()
> + local test_run = require('test_run').new()
> + box.schema.user.create(USER, { password = PASSWORD })
> + box.schema.user.grant(USER, 'replication')
> + box.schema.space.create('test', {engine = test_run:get_cfg('engine')})
> + box.space.test:create_index('primary')
> +end)
> diff --git a/test/replication/replica_uuid_ro1.lua b/test/replication/replica_uuid_ro1.lua
> new file mode 120000
> index 00000000..342d71c5
> --- /dev/null
> +++ b/test/replication/replica_uuid_ro1.lua
> @@ -0,0 +1 @@
> +replica_uuid_ro.lua
> \ No newline at end of file
> diff --git a/test/replication/replica_uuid_ro2.lua b/test/replication/replica_uuid_ro2.lua
> new file mode 120000
> index 00000000..342d71c5
> --- /dev/null
> +++ b/test/replication/replica_uuid_ro2.lua
> @@ -0,0 +1 @@
> +replica_uuid_ro.lua
> \ No newline at end of file
> diff --git a/test/replication/replicaset_ro_mostly.result b/test/replication/replicaset_ro_mostly.result
> new file mode 100644
> index 00000000..d753a182
> --- /dev/null
> +++ b/test/replication/replicaset_ro_mostly.result
> @@ -0,0 +1,59 @@
> +-- gh-3257 check bootstrap with read-only replica in cluster.
> +-- Old behaviour: failed, since read-only is chosen by uuid.
> +test_run = require('test_run').new()
> +---
> +...
> +SERVERS = {'replica_uuid_ro1', 'replica_uuid_ro2'}
> +---
> +...
> +uuid = require('uuid')
> +---
> +...
> +uuid1 = uuid.new()
> +---
> +...
> +uuid2 = uuid.new()
> +---
> +...
> +function sort_cmp(a, b) return a.time_low > b.time_low and true or false end
> +---
> +...
> +function sort(t) table.sort(t, sort_cmp) return t end
> +---
> +...
> +UUID = sort({uuid1, uuid2}, sort_cmp)
> +---
> +...
> +create_cluster_cmd1 = 'create server %s with script="replication/%s.lua"'
> +---
> +...
> +create_cluster_cmd2 = 'start server %s with args="%s", wait_load=False, wait=False'
> +---
> +...
> +test_run:cmd("setopt delimiter ';'")
> +---
> +- true
> +...
> +function create_cluster_uuid(servers, uuids)
> + for i, name in ipairs(servers) do
> + test_run:cmd(create_cluster_cmd1:format(name, name))
> + test_run:cmd(create_cluster_cmd2:format(name, uuids[i]))
> + end
> +end;
> +---
> +...
> +test_run:cmd("setopt delimiter ''");
> +---
> +- true
> +...
> +-- Deploy a cluster.
> +create_cluster_uuid(SERVERS, UUID)
> +---
> +...
> +test_run:wait_fullmesh(SERVERS)
> +---
> +...
> +-- Cleanup.
> +test_run:drop_cluster(SERVERS)
> +---
> +...
> diff --git a/test/replication/replicaset_ro_mostly.test.lua b/test/replication/replicaset_ro_mostly.test.lua
> new file mode 100644
> index 00000000..539ca5a1
> --- /dev/null
> +++ b/test/replication/replicaset_ro_mostly.test.lua
> @@ -0,0 +1,30 @@
> +-- gh-3257 check bootstrap with read-only replica in cluster.
> +-- Old behaviour: failed, since read-only is chosen by uuid.
> +test_run = require('test_run').new()
> +
> +SERVERS = {'replica_uuid_ro1', 'replica_uuid_ro2'}
> +
> +uuid = require('uuid')
> +uuid1 = uuid.new()
> +uuid2 = uuid.new()
> +function sort_cmp(a, b) return a.time_low > b.time_low and true or false end
> +function sort(t) table.sort(t, sort_cmp) return t end
> +UUID = sort({uuid1, uuid2}, sort_cmp)
> +
> +create_cluster_cmd1 = 'create server %s with script="replication/%s.lua"'
> +create_cluster_cmd2 = 'start server %s with args="%s", wait_load=False, wait=False'
> +
> +test_run:cmd("setopt delimiter ';'")
> +function create_cluster_uuid(servers, uuids)
> + for i, name in ipairs(servers) do
> + test_run:cmd(create_cluster_cmd1:format(name, name))
> + test_run:cmd(create_cluster_cmd2:format(name, uuids[i]))
> + end
> +end;
> +test_run:cmd("setopt delimiter ''");
> +
> +-- Deploy a cluster.
> +create_cluster_uuid(SERVERS, UUID)
> +test_run:wait_fullmesh(SERVERS)
> +-- Cleanup.
> +test_run:drop_cluster(SERVERS)
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [tarantool-patches] Re: [PATCH] replication: fix bug with read-only replica as a bootstrap leader
2018-04-13 12:38 ` Vladimir Davydov
@ 2018-04-18 10:34 ` Vladimir Davydov
0 siblings, 0 replies; 9+ messages in thread
From: Vladimir Davydov @ 2018-04-18 10:34 UTC (permalink / raw)
To: Konstantin Belyavskiy; +Cc: tarantool-patches, georgy
On Fri, Apr 13, 2018 at 03:38:46PM +0300, Vladimir Davydov wrote:
> On Fri, Apr 13, 2018 at 02:13:34PM +0300, Konstantin Belyavskiy wrote:
> > Please take a look at newer version.
>
> When you fix something like this, please include the new version of the
> patch in the email.
>
> The patch itself looks OK to me. Posting it here for the record.
>
> > From 983b98a91dcd70a3013de373b55294a6af468e17 Mon Sep 17 00:00:00 2001
> > From: Konstantin Belyavskiy <k.belyavskiy@tarantool.org>
> > Date: Mon, 9 Apr 2018 16:32:26 +0300
> > Subject: [PATCH] replication: fix bug with read-only replica as a bootstrap
> > leader
> >
> > When bootstrapping a new cluster, each replica from replicaset can
> > be chosen as a leader, but if it is 'read-only', bootstrap will
> > failed with an error.
> > Fixed it by eliminating read-only replicas from voting by adding
> > access rights information to IPROTO_REQUEST_VOTE reply.
> >
> > Closes #3257
Pushed to 1.9.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] replication: fix bug with read-only replica as a bootstrap leader
2018-05-22 15:40 Konstantin Belyavskiy
@ 2018-05-22 16:45 ` Konstantin Osipov
0 siblings, 0 replies; 9+ messages in thread
From: Konstantin Osipov @ 2018-05-22 16:45 UTC (permalink / raw)
To: Konstantin Belyavskiy; +Cc: vdavydov, georgy, tarantool-patches
* Konstantin Belyavskiy <k.belyavskiy@tarantool.org> [18/05/22 18:41]:
It's OK to push.
As a nitpick, I'd prefer extracting leader search loop into a
routine, rather than adding a goto label.
I'm not crazy about goto, but i think in this case the could would
win from not having it.
> Another broken case. Adding a new replica to cluster:
> + if (replica->applier->remote_is_ro &&
> + replica->applier->vclock.signature == 0)
> In this case we may got an ER_READONLY, since signature is not 0.
> So leader election now has two phases:
> 1. To select among read-write replicas.
> 2. If no such found, try old algorithm for backward compatibility
> (case then all replicas exist in cluster table).
>
> Closes #3257
> ---
> https://github.com/tarantool/tarantool/issues/3257
> https://github.com/tarantool/tarantool/tree/kbelyavs/gh-3257-fix-bug-with-read-only-as-a-leader
>
> src/box/replication.cc | 22 +++++++++++++++++-----
> test/replication/replica_uuid_ro3.lua | 1 +
> test/replication/replicaset_ro_mostly.result | 20 ++++++++++++++++++++
> test/replication/replicaset_ro_mostly.test.lua | 8 ++++++++
> 4 files changed, 46 insertions(+), 5 deletions(-)
> create mode 120000 test/replication/replica_uuid_ro3.lua
>
> diff --git a/src/box/replication.cc b/src/box/replication.cc
> index 0b770c913..8dcf1f656 100644
> --- a/src/box/replication.cc
> +++ b/src/box/replication.cc
> @@ -691,16 +691,24 @@ struct replica *
> replicaset_leader(void)
> {
> struct replica *leader = NULL;
> + bool skip_ro = true;
> + /**
> + * Two loops, first prefers read-write replicas among others.
> + * Second for backward compatibility, if there is no such
> + * replicas at all.
> + */
> +loop:
> replicaset_foreach(replica) {
> if (replica->applier == NULL)
> continue;
> /**
> - * While bootstrapping a new cluster,
> - * read-only replicas shouldn't be considered
> - * as a leader.
> + * While bootstrapping a new cluster, read-only
> + * replicas shouldn't be considered as a leader.
> + * The only exception if there is no read-write
> + * replicas since there is still a possibility
> + * that all replicas exist in cluster table.
> */
> - if (replica->applier->remote_is_ro &&
> - replica->applier->vclock.signature == 0)
> + if (skip_ro && replica->applier->remote_is_ro)
> continue;
> if (leader == NULL) {
> leader = replica;
> @@ -721,6 +729,10 @@ replicaset_leader(void)
> continue;
> leader = replica;
> }
> + if (skip_ro && leader == NULL) {
> + skip_ro = false;
> + goto loop;
> + }
> return leader;
> }
>
> diff --git a/test/replication/replica_uuid_ro3.lua b/test/replication/replica_uuid_ro3.lua
> new file mode 120000
> index 000000000..342d71c57
> --- /dev/null
> +++ b/test/replication/replica_uuid_ro3.lua
> @@ -0,0 +1 @@
> +replica_uuid_ro.lua
> \ No newline at end of file
> diff --git a/test/replication/replicaset_ro_mostly.result b/test/replication/replicaset_ro_mostly.result
> index d753a182d..b9e8f1fe8 100644
> --- a/test/replication/replicaset_ro_mostly.result
> +++ b/test/replication/replicaset_ro_mostly.result
> @@ -53,6 +53,26 @@ create_cluster_uuid(SERVERS, UUID)
> test_run:wait_fullmesh(SERVERS)
> ---
> ...
> +-- Add third replica
> +name = 'replica_uuid_ro3'
> +---
> +...
> +test_run:cmd(create_cluster_cmd1:format(name, name))
> +---
> +- true
> +...
> +test_run:cmd(create_cluster_cmd2:format(name, uuid.new()))
> +---
> +- true
> +...
> +test_run:cmd('switch replica_uuid_ro3')
> +---
> +- true
> +...
> +test_run:cmd('switch default')
> +---
> +- true
> +...
> -- Cleanup.
> test_run:drop_cluster(SERVERS)
> ---
> diff --git a/test/replication/replicaset_ro_mostly.test.lua b/test/replication/replicaset_ro_mostly.test.lua
> index 539ca5a13..f2c2d0d11 100644
> --- a/test/replication/replicaset_ro_mostly.test.lua
> +++ b/test/replication/replicaset_ro_mostly.test.lua
> @@ -26,5 +26,13 @@ test_run:cmd("setopt delimiter ''");
> -- Deploy a cluster.
> create_cluster_uuid(SERVERS, UUID)
> test_run:wait_fullmesh(SERVERS)
> +
> +-- Add third replica
> +name = 'replica_uuid_ro3'
> +test_run:cmd(create_cluster_cmd1:format(name, name))
> +test_run:cmd(create_cluster_cmd2:format(name, uuid.new()))
> +test_run:cmd('switch replica_uuid_ro3')
> +test_run:cmd('switch default')
> +
> -- Cleanup.
> test_run:drop_cluster(SERVERS)
> --
> 2.14.3 (Apple Git-98)
--
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH] replication: fix bug with read-only replica as a bootstrap leader
@ 2018-05-22 15:40 Konstantin Belyavskiy
2018-05-22 16:45 ` Konstantin Osipov
0 siblings, 1 reply; 9+ messages in thread
From: Konstantin Belyavskiy @ 2018-05-22 15:40 UTC (permalink / raw)
To: kostja, vdavydov, georgy; +Cc: tarantool-patches
Another broken case. Adding a new replica to cluster:
+ if (replica->applier->remote_is_ro &&
+ replica->applier->vclock.signature == 0)
In this case we may got an ER_READONLY, since signature is not 0.
So leader election now has two phases:
1. To select among read-write replicas.
2. If no such found, try old algorithm for backward compatibility
(case then all replicas exist in cluster table).
Closes #3257
---
https://github.com/tarantool/tarantool/issues/3257
https://github.com/tarantool/tarantool/tree/kbelyavs/gh-3257-fix-bug-with-read-only-as-a-leader
src/box/replication.cc | 22 +++++++++++++++++-----
test/replication/replica_uuid_ro3.lua | 1 +
test/replication/replicaset_ro_mostly.result | 20 ++++++++++++++++++++
test/replication/replicaset_ro_mostly.test.lua | 8 ++++++++
4 files changed, 46 insertions(+), 5 deletions(-)
create mode 120000 test/replication/replica_uuid_ro3.lua
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 0b770c913..8dcf1f656 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -691,16 +691,24 @@ struct replica *
replicaset_leader(void)
{
struct replica *leader = NULL;
+ bool skip_ro = true;
+ /**
+ * Two loops, first prefers read-write replicas among others.
+ * Second for backward compatibility, if there is no such
+ * replicas at all.
+ */
+loop:
replicaset_foreach(replica) {
if (replica->applier == NULL)
continue;
/**
- * While bootstrapping a new cluster,
- * read-only replicas shouldn't be considered
- * as a leader.
+ * While bootstrapping a new cluster, read-only
+ * replicas shouldn't be considered as a leader.
+ * The only exception if there is no read-write
+ * replicas since there is still a possibility
+ * that all replicas exist in cluster table.
*/
- if (replica->applier->remote_is_ro &&
- replica->applier->vclock.signature == 0)
+ if (skip_ro && replica->applier->remote_is_ro)
continue;
if (leader == NULL) {
leader = replica;
@@ -721,6 +729,10 @@ replicaset_leader(void)
continue;
leader = replica;
}
+ if (skip_ro && leader == NULL) {
+ skip_ro = false;
+ goto loop;
+ }
return leader;
}
diff --git a/test/replication/replica_uuid_ro3.lua b/test/replication/replica_uuid_ro3.lua
new file mode 120000
index 000000000..342d71c57
--- /dev/null
+++ b/test/replication/replica_uuid_ro3.lua
@@ -0,0 +1 @@
+replica_uuid_ro.lua
\ No newline at end of file
diff --git a/test/replication/replicaset_ro_mostly.result b/test/replication/replicaset_ro_mostly.result
index d753a182d..b9e8f1fe8 100644
--- a/test/replication/replicaset_ro_mostly.result
+++ b/test/replication/replicaset_ro_mostly.result
@@ -53,6 +53,26 @@ create_cluster_uuid(SERVERS, UUID)
test_run:wait_fullmesh(SERVERS)
---
...
+-- Add third replica
+name = 'replica_uuid_ro3'
+---
+...
+test_run:cmd(create_cluster_cmd1:format(name, name))
+---
+- true
+...
+test_run:cmd(create_cluster_cmd2:format(name, uuid.new()))
+---
+- true
+...
+test_run:cmd('switch replica_uuid_ro3')
+---
+- true
+...
+test_run:cmd('switch default')
+---
+- true
+...
-- Cleanup.
test_run:drop_cluster(SERVERS)
---
diff --git a/test/replication/replicaset_ro_mostly.test.lua b/test/replication/replicaset_ro_mostly.test.lua
index 539ca5a13..f2c2d0d11 100644
--- a/test/replication/replicaset_ro_mostly.test.lua
+++ b/test/replication/replicaset_ro_mostly.test.lua
@@ -26,5 +26,13 @@ test_run:cmd("setopt delimiter ''");
-- Deploy a cluster.
create_cluster_uuid(SERVERS, UUID)
test_run:wait_fullmesh(SERVERS)
+
+-- Add third replica
+name = 'replica_uuid_ro3'
+test_run:cmd(create_cluster_cmd1:format(name, name))
+test_run:cmd(create_cluster_cmd2:format(name, uuid.new()))
+test_run:cmd('switch replica_uuid_ro3')
+test_run:cmd('switch default')
+
-- Cleanup.
test_run:drop_cluster(SERVERS)
--
2.14.3 (Apple Git-98)
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2018-05-22 16:45 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-04-11 16:02 [PATCH] replication: fix bug with read-only replica as a bootstrap leader Konstantin Belyavskiy
2018-04-11 16:11 ` [tarantool-patches] " Georgy Kirichenko
2018-04-13 8:54 ` Vladimir Davydov
2018-04-13 11:13 ` [tarantool-patches] " Konstantin Belyavskiy
2018-04-13 11:15 ` Re[2]: " Konstantin Belyavskiy
2018-04-13 12:38 ` Vladimir Davydov
2018-04-18 10:34 ` Vladimir Davydov
2018-05-22 15:40 Konstantin Belyavskiy
2018-05-22 16:45 ` Konstantin Osipov
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox