Please take a look at newer version.

Пятница, 13 апреля 2018, 11:55 +03:00 от Vladimir Davydov <vdavydov.dev@gmail.com>:

On Wed, Apr 11, 2018 at 07:02:27PM +0300, Konstantin Belyavskiy wrote:
> ticket: https://github.com/tarantool/tarantool/issues/3257
> branch: https://github.com/tarantool/tarantool/compare/gh-3257-fix-bug-with-ro-replica-as-a-leader
>
> When bootstrapping a new cluster, each replica from replicaset can
> be chosen as a leader, but if it is 'read-only', bootstrap will
> failed with an error.
> Fixed it by eliminating read-only replicas from voting by adding
> access rights information to IPROTO_REQUEST_VOTE reply.
>
> Closes #3257
> ---
> src/box/applier.cc | 3 +-
> src/box/applier.h | 2 +
> src/box/iproto.cc | 11 +++--
> src/box/iproto_constants.c | 2 +-
> src/box/iproto_constants.h | 1 +
> src/box/replication.cc | 8 ++++
> src/box/xrow.c | 29 ++++++++++---
> src/box/xrow.h | 54 ++++++++++++++++++-----
> test/replication/replica_uuid_ro.lua | 33 ++++++++++++++
> test/replication/replica_uuid_ro1.lua | 1 +
> test/replication/replica_uuid_ro2.lua | 1 +
> test/replication/replicaset_ro_mostly.result | 59 ++++++++++++++++++++++++++
> test/replication/replicaset_ro_mostly.test.lua | 30 +++++++++++++
> 13 files changed, 212 insertions(+), 22 deletions(-)
> create mode 100644 test/replication/replica_uuid_ro.lua
> create mode 120000 test/replication/replica_uuid_ro1.lua
> create mode 120000 test/replication/replica_uuid_ro2.lua
> create mode 100644 test/replication/replicaset_ro_mostly.result
> create mode 100644 test/replication/replicaset_ro_mostly.test.lua
>
> diff --git a/src/box/applier.cc b/src/box/applier.cc
> index 9aa951c34..b3a13260d 100644
> --- a/src/box/applier.cc
> +++ b/src/box/applier.cc
> @@ -223,7 +223,8 @@ applier_connect(struct applier *applier)
> if (row.type != IPROTO_OK)
> xrow_decode_error_xc(&row);
> vclock_create(&applier->vclock);
> - xrow_decode_vclock_xc(&row, &applier->vclock);
> + xrow_decode_request_vote_xc(&row, &applier->vclock,
> + &applier->read_only);
> }
>
> applier_set_state(applier, APPLIER_CONNECTED);
> diff --git a/src/box/applier.h b/src/box/applier.h
> index f25d6cb26..f47cf330f 100644
> --- a/src/box/applier.h
> +++ b/src/box/applier.h
> @@ -95,6 +95,8 @@ struct applier {
> uint32_t version_id;
> /** Remote vclock at time of connect. */
> struct vclock vclock;
> + /** Remote access rights, true if read-only, default: false */

The comment is misleading IMO as this variable doesn't have anything
to do with access rights. It just means that the master is running in
read-only mode. Please fix.
Fixed

Remote peer mode, true if read-only, default: false



> + bool read_only;

I'd call it master_is_ro, because it's not the applier which is read
only - it's the master.
remote_is_ro


> /** Remote address */
> union {
> struct sockaddr addr;
> diff --git a/src/box/iproto.cc b/src/box/iproto.cc
> index db5820806..51970bb28 100644
> --- a/src/box/iproto.cc
> +++ b/src/box/iproto.cc
> @@ -60,6 +60,8 @@
> #include "iproto_constants.h"
> #include "rmean.h"
> #include "errinj.h"
> +#include "applier.h"
> +#include "cfg.h"
>
> /* The number of iproto messages in flight */
> enum { IPROTO_MSG_MAX = 768 };
> @@ -1352,6 +1354,7 @@ tx_process_misc(struct cmsg *m)
> goto error;
>
> try {
> + bool read_only = false;
> switch (msg->header.type) {
> case IPROTO_AUTH:
> box_process_auth(&msg->auth);
> @@ -1363,9 +1366,11 @@ tx_process_misc(struct cmsg *m)
> ::schema_version);
> break;
> case IPROTO_REQUEST_VOTE:
> - iproto_reply_vclock_xc(out, msg->header.sync,
> - ::schema_version,
> - &replicaset.vclock);

> + read_only = cfg_geti("read_only");

We have box_is_ro() for this, which also takes into account "orphan"
mode.
No, during bootstrap it is in "orphan" mode. So this will actually crash it.
Also after bootstrap we should use a node with the most recent vclock.


> + iproto_reply_request_vote_xc(out, msg->header.sync,
> + ::schema_version,
> + &replicaset.vclock,
> + read_only);

Nit: you don't need an extra variable here - let's pass box_is_ro()
directly to iproto_reply_request_vote_xc().
To be honest, I don't think it's necessary.
iproto_reply_request_vote_xc just pass all parameters to 
iproto_reply_request_vote
and it construct reply from these params.
So it will create a mixed logic then some params are get internally, others
from input.
Let's leave it as is.

> break;
> default:
> unreachable();
> diff --git a/src/box/iproto_constants.c b/src/box/iproto_constants.c
> index cd7b1d03b..eaba6259e 100644
> --- a/src/box/iproto_constants.c
> +++ b/src/box/iproto_constants.c
> @@ -40,10 +40,10 @@ const unsigned char iproto_key_type[IPROTO_KEY_MAX] =
> /* 0x04 */ MP_DOUBLE, /* IPROTO_TIMESTAMP */
> /* 0x05 */ MP_UINT, /* IPROTO_SCHEMA_VERSION */
> /* 0x06 */ MP_UINT, /* IPROTO_SERVER_VERSION */
> + /* 0x07 */ MP_UINT, /* IPROTO_SERVER_READONLY */
> /* }}} */
>
> /* {{{ unused */
> - /* 0x07 */ MP_UINT,
> /* 0x08 */ MP_UINT,
> /* 0x09 */ MP_UINT,
> /* 0x0a */ MP_UINT,
> diff --git a/src/box/iproto_constants.h b/src/box/iproto_constants.h
> index 951842485..aae004584 100644
> --- a/src/box/iproto_constants.h
> +++ b/src/box/iproto_constants.h
> @@ -58,6 +58,7 @@ enum iproto_key {
> IPROTO_TIMESTAMP = 0x04,
> IPROTO_SCHEMA_VERSION = 0x05,
> IPROTO_SERVER_VERSION = 0x06,
> + IPROTO_SERVER_READONLY = 0x07,

IPROTO_SERVER_IS_RO, may be?
Ok, fixed.


> /* Leave a gap for other keys in the header. */
> IPROTO_SPACE_ID = 0x10,
> IPROTO_INDEX_ID = 0x11,
> diff --git a/src/box/replication.cc b/src/box/replication.cc
> index b1c84d36c..423de2c88 100644
> --- a/src/box/replication.cc
> +++ b/src/box/replication.cc
> @@ -685,6 +685,14 @@ replicaset_leader(void)
> replicaset_foreach(replica) {
> if (replica->applier == NULL)
> continue;
> + /**
> + * While bootstrapping a new cluster,
> + * read-only replicas shouldn't be considered
> + * as a leader.
> + */
> + if (replica->applier->read_only &&
> + replica->applier->vclock.signature == 0)
> + continue;

What if you're adding a new replica to an existing cluster some nodes of
which are read only? AFAIU you want to choose an rw replica in this case
as well.
No, we improve only one case: bootstrapping a new cluster (and do not change others).
In other cases it's more difficult and may be we need to develop a specific protocol
to solve it in general case. But it's a case for a new ticket.


> if (leader == NULL) {
> leader = replica;
> continue;

> diff --git a/test/replication/replicaset_ro_mostly.test.lua b/test/replication/replicaset_ro_mostly.test.lua
> new file mode 100644
> index 000000000..539ca5a13
> --- /dev/null
> +++ b/test/replication/replicaset_ro_mostly.test.lua
> @@ -0,0 +1,30 @@
> +-- gh-3257 check bootstrap with read-only replica in cluster.
> +-- Old behaviour: failed, since read-only is chosen by uuid.
> +test_run = require('test_run').new()
> +
> +SERVERS = {'replica_uuid_ro1', 'replica_uuid_ro2'}
> +
> +uuid = require('uuid')
> +uuid1 = uuid.new()
> +uuid2 = uuid.new()
> +function sort_cmp(a, b) return a.time_low > b.time_low and true or false end
> +function sort(t) table.sort(t, sort_cmp) return t end
> +UUID = sort({uuid1, uuid2}, sort_cmp)

I don't think you really need to set UUID explicitly. If you don't, the
test will fail quite often anyway.
It's to make it 100% reproducible. In other cases it's a probability test (not good).


> +
> +create_cluster_cmd1 = 'create server %s with script="replication/%s.lua"'
> +create_cluster_cmd2 = 'start server %s with args="%s", wait_load=False, wait=False'
> +
> +test_run:cmd("setopt delimiter ';'")
> +function create_cluster_uuid(servers, uuids)
> + for i, name in ipairs(servers) do
> + test_run:cmd(create_cluster_cmd1:format(name, name))
> + test_run:cmd(create_cluster_cmd2:format(name, uuids[i]))
> + end
> +end;
> +test_run:cmd("setopt delimiter ''");
> +
> +-- Deploy a cluster.
> +create_cluster_uuid(SERVERS, UUID)
> +test_run:wait_fullmesh(SERVERS)
> +-- Cleanup.
> +test_run:drop_cluster(SERVERS)



Best regards,
Konstantin Belyavskiy
k.belyavskiy@tarantool.org