Please take a look at newer version. >Пятница, 13 апреля 2018, 11:55 +03:00 от Vladimir Davydov : > >On Wed, Apr 11, 2018 at 07:02:27PM +0300, Konstantin Belyavskiy wrote: >> ticket: https://github.com/tarantool/tarantool/issues/3257 >> branch: https://github.com/tarantool/tarantool/compare/gh-3257-fix-bug-with-ro-replica-as-a-leader >> >> When bootstrapping a new cluster, each replica from replicaset can >> be chosen as a leader, but if it is 'read-only', bootstrap will >> failed with an error. >> Fixed it by eliminating read-only replicas from voting by adding >> access rights information to IPROTO_REQUEST_VOTE reply. >> >> Closes #3257 >> --- >> src/box/applier.cc | 3 +- >> src/box/applier.h | 2 + >> src/box/iproto.cc | 11 +++-- >> src/box/iproto_constants.c | 2 +- >> src/box/iproto_constants.h | 1 + >> src/box/replication.cc | 8 ++++ >> src/box/xrow.c | 29 ++++++++++--- >> src/box/xrow.h | 54 ++++++++++++++++++----- >> test/replication/replica_uuid_ro.lua | 33 ++++++++++++++ >> test/replication/replica_uuid_ro1.lua | 1 + >> test/replication/replica_uuid_ro2.lua | 1 + >> test/replication/replicaset_ro_mostly.result | 59 ++++++++++++++++++++++++++ >> test/replication/replicaset_ro_mostly.test.lua | 30 +++++++++++++ >> 13 files changed, 212 insertions(+), 22 deletions(-) >> create mode 100644 test/replication/replica_uuid_ro.lua >> create mode 120000 test/replication/replica_uuid_ro1.lua >> create mode 120000 test/replication/replica_uuid_ro2.lua >> create mode 100644 test/replication/replicaset_ro_mostly.result >> create mode 100644 test/replication/replicaset_ro_mostly.test.lua >> >> diff --git a/src/box/applier.cc b/src/box/applier.cc >> index 9aa951c34..b3a13260d 100644 >> --- a/src/box/applier.cc >> +++ b/src/box/applier.cc >> @@ -223,7 +223,8 @@ applier_connect(struct applier *applier) >> if (row.type != IPROTO_OK) >> xrow_decode_error_xc(&row); >> vclock_create(&applier->vclock); >> - xrow_decode_vclock_xc(&row, &applier->vclock); >> + xrow_decode_request_vote_xc(&row, &applier->vclock, >> + &applier->read_only); >> } >> >> applier_set_state(applier, APPLIER_CONNECTED); >> diff --git a/src/box/applier.h b/src/box/applier.h >> index f25d6cb26..f47cf330f 100644 >> --- a/src/box/applier.h >> +++ b/src/box/applier.h >> @@ -95,6 +95,8 @@ struct applier { >> uint32_t version_id; >> /** Remote vclock at time of connect. */ >> struct vclock vclock; >> + /** Remote access rights, true if read-only, default: false */ > >The comment is misleading IMO as this variable doesn't have anything >to do with access rights. It just means that the master is running in >read-only mode. Please fix. Fixed Remote peer mode, true if read-only, default: false > > >> + bool read_only; > >I'd call it master_is_ro, because it's not the applier which is read >only - it's the master. remote_is_ro > > >> /** Remote address */ >> union { >> struct sockaddr addr; >> diff --git a/src/box/iproto.cc b/src/box/iproto.cc >> index db5820806..51970bb28 100644 >> --- a/src/box/iproto.cc >> +++ b/src/box/iproto.cc >> @@ -60,6 +60,8 @@ >> #include "iproto_constants.h" >> #include "rmean.h" >> #include "errinj.h" >> +#include "applier.h" >> +#include "cfg.h" >> >> /* The number of iproto messages in flight */ >> enum { IPROTO_MSG_MAX = 768 }; >> @@ -1352,6 +1354,7 @@ tx_process_misc(struct cmsg *m) >> goto error; >> >> try { >> + bool read_only = false; >> switch (msg->header.type) { >> case IPROTO_AUTH: >> box_process_auth(&msg->auth); >> @@ -1363,9 +1366,11 @@ tx_process_misc(struct cmsg *m) >> ::schema_version); >> break; >> case IPROTO_REQUEST_VOTE: >> - iproto_reply_vclock_xc(out, msg->header.sync, >> - ::schema_version, >> - &replicaset.vclock); > >> + read_only = cfg_geti("read_only"); > >We have box_is_ro() for this, which also takes into account "orphan" >mode. No, during bootstrap it is in "orphan" mode. So this will actually crash it. Also after bootstrap we should use a node with the most recent vclock. > > >> + iproto_reply_request_vote_xc(out, msg->header.sync, >> + ::schema_version, >> + &replicaset.vclock, >> + read_only); > >Nit: you don't need an extra variable here - let's pass box_is_ro() >directly to iproto_reply_request_vote_xc(). To be honest, I don't think it's necessary. iproto_reply_request_vote_xc just pass all parameters to  iproto_reply_request_vote and it construct reply from these params. So it will create a mixed logic then some params are get internally, others from input. Let's leave it as is. > >> break; >> default: >> unreachable(); >> diff --git a/src/box/iproto_constants.c b/src/box/iproto_constants.c >> index cd7b1d03b..eaba6259e 100644 >> --- a/src/box/iproto_constants.c >> +++ b/src/box/iproto_constants.c >> @@ -40,10 +40,10 @@ const unsigned char iproto_key_type[IPROTO_KEY_MAX] = >> /* 0x04 */ MP_DOUBLE, /* IPROTO_TIMESTAMP */ >> /* 0x05 */ MP_UINT, /* IPROTO_SCHEMA_VERSION */ >> /* 0x06 */ MP_UINT, /* IPROTO_SERVER_VERSION */ >> + /* 0x07 */ MP_UINT, /* IPROTO_SERVER_READONLY */ >> /* }}} */ >> >> /* {{{ unused */ >> - /* 0x07 */ MP_UINT, >> /* 0x08 */ MP_UINT, >> /* 0x09 */ MP_UINT, >> /* 0x0a */ MP_UINT, >> diff --git a/src/box/iproto_constants.h b/src/box/iproto_constants.h >> index 951842485..aae004584 100644 >> --- a/src/box/iproto_constants.h >> +++ b/src/box/iproto_constants.h >> @@ -58,6 +58,7 @@ enum iproto_key { >> IPROTO_TIMESTAMP = 0x04, >> IPROTO_SCHEMA_VERSION = 0x05, >> IPROTO_SERVER_VERSION = 0x06, >> + IPROTO_SERVER_READONLY = 0x07, > >IPROTO_SERVER_IS_RO, may be? Ok, fixed. > > >> /* Leave a gap for other keys in the header. */ >> IPROTO_SPACE_ID = 0x10, >> IPROTO_INDEX_ID = 0x11, >> diff --git a/src/box/replication.cc b/src/box/replication.cc >> index b1c84d36c..423de2c88 100644 >> --- a/src/box/replication.cc >> +++ b/src/box/replication.cc >> @@ -685,6 +685,14 @@ replicaset_leader(void) >> replicaset_foreach(replica) { >> if (replica->applier == NULL) >> continue; >> + /** >> + * While bootstrapping a new cluster, >> + * read-only replicas shouldn't be considered >> + * as a leader. >> + */ >> + if (replica->applier->read_only && >> + replica->applier->vclock.signature == 0) >> + continue; > >What if you're adding a new replica to an existing cluster some nodes of >which are read only? AFAIU you want to choose an rw replica in this case >as well. No, we improve only one case: bootstrapping a new cluster (and do not change others). In other cases it's more difficult and may be we need to develop a specific protocol to solve it in general case. But it's a case for a new ticket. > > >> if (leader == NULL) { >> leader = replica; >> continue; > >> diff --git a/test/replication/replicaset_ro_mostly.test.lua b/test/replication/replicaset_ro_mostly.test.lua >> new file mode 100644 >> index 000000000..539ca5a13 >> --- /dev/null >> +++ b/test/replication/replicaset_ro_mostly.test.lua >> @@ -0,0 +1,30 @@ >> +-- gh-3257 check bootstrap with read-only replica in cluster. >> +-- Old behaviour: failed, since read-only is chosen by uuid. >> +test_run = require('test_run').new() >> + >> +SERVERS = {'replica_uuid_ro1', 'replica_uuid_ro2'} >> + >> +uuid = require('uuid') >> +uuid1 = uuid.new() >> +uuid2 = uuid.new() >> +function sort_cmp(a, b) return a.time_low > b.time_low and true or false end >> +function sort(t) table.sort(t, sort_cmp) return t end >> +UUID = sort({uuid1, uuid2}, sort_cmp) > >I don't think you really need to set UUID explicitly. If you don't, the >test will fail quite often anyway. It's to make it 100% reproducible. In other cases it's a probability test (not good). > > >> + >> +create_cluster_cmd1 = 'create server %s with script="replication/%s.lua"' >> +create_cluster_cmd2 = 'start server %s with args="%s", wait_load=False, wait=False' >> + >> +test_run:cmd("setopt delimiter ';'") >> +function create_cluster_uuid(servers, uuids) >> + for i, name in ipairs(servers) do >> + test_run:cmd(create_cluster_cmd1:format(name, name)) >> + test_run:cmd(create_cluster_cmd2:format(name, uuids[i])) >> + end >> +end; >> +test_run:cmd("setopt delimiter ''"); >> + >> +-- Deploy a cluster. >> +create_cluster_uuid(SERVERS, UUID) >> +test_run:wait_fullmesh(SERVERS) >> +-- Cleanup. >> +test_run:drop_cluster(SERVERS) > Best regards, Konstantin Belyavskiy k.belyavskiy@tarantool.org