From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Date: Fri, 13 Apr 2018 11:54:59 +0300 From: Vladimir Davydov Subject: Re: [PATCH] replication: fix bug with read-only replica as a bootstrap leader Message-ID: <20180413085459.x2zuxqufp73r7cmi@esperanza> References: <20180411160227.51871-1-k.belyavskiy@tarantool.org> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20180411160227.51871-1-k.belyavskiy@tarantool.org> To: Konstantin Belyavskiy Cc: georgy@tarantool.org, tarantool-patches@freelists.org List-ID: On Wed, Apr 11, 2018 at 07:02:27PM +0300, Konstantin Belyavskiy wrote: > ticket: https://github.com/tarantool/tarantool/issues/3257 > branch: https://github.com/tarantool/tarantool/compare/gh-3257-fix-bug-with-ro-replica-as-a-leader > > When bootstrapping a new cluster, each replica from replicaset can > be chosen as a leader, but if it is 'read-only', bootstrap will > failed with an error. > Fixed it by eliminating read-only replicas from voting by adding > access rights information to IPROTO_REQUEST_VOTE reply. > > Closes #3257 > --- > src/box/applier.cc | 3 +- > src/box/applier.h | 2 + > src/box/iproto.cc | 11 +++-- > src/box/iproto_constants.c | 2 +- > src/box/iproto_constants.h | 1 + > src/box/replication.cc | 8 ++++ > src/box/xrow.c | 29 ++++++++++--- > src/box/xrow.h | 54 ++++++++++++++++++----- > test/replication/replica_uuid_ro.lua | 33 ++++++++++++++ > test/replication/replica_uuid_ro1.lua | 1 + > test/replication/replica_uuid_ro2.lua | 1 + > test/replication/replicaset_ro_mostly.result | 59 ++++++++++++++++++++++++++ > test/replication/replicaset_ro_mostly.test.lua | 30 +++++++++++++ > 13 files changed, 212 insertions(+), 22 deletions(-) > create mode 100644 test/replication/replica_uuid_ro.lua > create mode 120000 test/replication/replica_uuid_ro1.lua > create mode 120000 test/replication/replica_uuid_ro2.lua > create mode 100644 test/replication/replicaset_ro_mostly.result > create mode 100644 test/replication/replicaset_ro_mostly.test.lua > > diff --git a/src/box/applier.cc b/src/box/applier.cc > index 9aa951c34..b3a13260d 100644 > --- a/src/box/applier.cc > +++ b/src/box/applier.cc > @@ -223,7 +223,8 @@ applier_connect(struct applier *applier) > if (row.type != IPROTO_OK) > xrow_decode_error_xc(&row); > vclock_create(&applier->vclock); > - xrow_decode_vclock_xc(&row, &applier->vclock); > + xrow_decode_request_vote_xc(&row, &applier->vclock, > + &applier->read_only); > } > > applier_set_state(applier, APPLIER_CONNECTED); > diff --git a/src/box/applier.h b/src/box/applier.h > index f25d6cb26..f47cf330f 100644 > --- a/src/box/applier.h > +++ b/src/box/applier.h > @@ -95,6 +95,8 @@ struct applier { > uint32_t version_id; > /** Remote vclock at time of connect. */ > struct vclock vclock; > + /** Remote access rights, true if read-only, default: false */ The comment is misleading IMO as this variable doesn't have anything to do with access rights. It just means that the master is running in read-only mode. Please fix. > + bool read_only; I'd call it master_is_ro, because it's not the applier which is read only - it's the master. > /** Remote address */ > union { > struct sockaddr addr; > diff --git a/src/box/iproto.cc b/src/box/iproto.cc > index db5820806..51970bb28 100644 > --- a/src/box/iproto.cc > +++ b/src/box/iproto.cc > @@ -60,6 +60,8 @@ > #include "iproto_constants.h" > #include "rmean.h" > #include "errinj.h" > +#include "applier.h" > +#include "cfg.h" > > /* The number of iproto messages in flight */ > enum { IPROTO_MSG_MAX = 768 }; > @@ -1352,6 +1354,7 @@ tx_process_misc(struct cmsg *m) > goto error; > > try { > + bool read_only = false; > switch (msg->header.type) { > case IPROTO_AUTH: > box_process_auth(&msg->auth); > @@ -1363,9 +1366,11 @@ tx_process_misc(struct cmsg *m) > ::schema_version); > break; > case IPROTO_REQUEST_VOTE: > - iproto_reply_vclock_xc(out, msg->header.sync, > - ::schema_version, > - &replicaset.vclock); > + read_only = cfg_geti("read_only"); We have box_is_ro() for this, which also takes into account "orphan" mode. > + iproto_reply_request_vote_xc(out, msg->header.sync, > + ::schema_version, > + &replicaset.vclock, > + read_only); Nit: you don't need an extra variable here - let's pass box_is_ro() directly to iproto_reply_request_vote_xc(). > break; > default: > unreachable(); > diff --git a/src/box/iproto_constants.c b/src/box/iproto_constants.c > index cd7b1d03b..eaba6259e 100644 > --- a/src/box/iproto_constants.c > +++ b/src/box/iproto_constants.c > @@ -40,10 +40,10 @@ const unsigned char iproto_key_type[IPROTO_KEY_MAX] = > /* 0x04 */ MP_DOUBLE, /* IPROTO_TIMESTAMP */ > /* 0x05 */ MP_UINT, /* IPROTO_SCHEMA_VERSION */ > /* 0x06 */ MP_UINT, /* IPROTO_SERVER_VERSION */ > + /* 0x07 */ MP_UINT, /* IPROTO_SERVER_READONLY */ > /* }}} */ > > /* {{{ unused */ > - /* 0x07 */ MP_UINT, > /* 0x08 */ MP_UINT, > /* 0x09 */ MP_UINT, > /* 0x0a */ MP_UINT, > diff --git a/src/box/iproto_constants.h b/src/box/iproto_constants.h > index 951842485..aae004584 100644 > --- a/src/box/iproto_constants.h > +++ b/src/box/iproto_constants.h > @@ -58,6 +58,7 @@ enum iproto_key { > IPROTO_TIMESTAMP = 0x04, > IPROTO_SCHEMA_VERSION = 0x05, > IPROTO_SERVER_VERSION = 0x06, > + IPROTO_SERVER_READONLY = 0x07, IPROTO_SERVER_IS_RO, may be? > /* Leave a gap for other keys in the header. */ > IPROTO_SPACE_ID = 0x10, > IPROTO_INDEX_ID = 0x11, > diff --git a/src/box/replication.cc b/src/box/replication.cc > index b1c84d36c..423de2c88 100644 > --- a/src/box/replication.cc > +++ b/src/box/replication.cc > @@ -685,6 +685,14 @@ replicaset_leader(void) > replicaset_foreach(replica) { > if (replica->applier == NULL) > continue; > + /** > + * While bootstrapping a new cluster, > + * read-only replicas shouldn't be considered > + * as a leader. > + */ > + if (replica->applier->read_only && > + replica->applier->vclock.signature == 0) > + continue; What if you're adding a new replica to an existing cluster some nodes of which are read only? AFAIU you want to choose an rw replica in this case as well. > if (leader == NULL) { > leader = replica; > continue; > diff --git a/test/replication/replicaset_ro_mostly.test.lua b/test/replication/replicaset_ro_mostly.test.lua > new file mode 100644 > index 000000000..539ca5a13 > --- /dev/null > +++ b/test/replication/replicaset_ro_mostly.test.lua > @@ -0,0 +1,30 @@ > +-- gh-3257 check bootstrap with read-only replica in cluster. > +-- Old behaviour: failed, since read-only is chosen by uuid. > +test_run = require('test_run').new() > + > +SERVERS = {'replica_uuid_ro1', 'replica_uuid_ro2'} > + > +uuid = require('uuid') > +uuid1 = uuid.new() > +uuid2 = uuid.new() > +function sort_cmp(a, b) return a.time_low > b.time_low and true or false end > +function sort(t) table.sort(t, sort_cmp) return t end > +UUID = sort({uuid1, uuid2}, sort_cmp) I don't think you really need to set UUID explicitly. If you don't, the test will fail quite often anyway. > + > +create_cluster_cmd1 = 'create server %s with script="replication/%s.lua"' > +create_cluster_cmd2 = 'start server %s with args="%s", wait_load=False, wait=False' > + > +test_run:cmd("setopt delimiter ';'") > +function create_cluster_uuid(servers, uuids) > + for i, name in ipairs(servers) do > + test_run:cmd(create_cluster_cmd1:format(name, name)) > + test_run:cmd(create_cluster_cmd2:format(name, uuids[i])) > + end > +end; > +test_run:cmd("setopt delimiter ''"); > + > +-- Deploy a cluster. > +create_cluster_uuid(SERVERS, UUID) > +test_run:wait_fullmesh(SERVERS) > +-- Cleanup. > +test_run:drop_cluster(SERVERS)