[Tarantool-patches] [PATCH v3 07/12] box: introduce `box.ctl.demote`
Serge Petrenko
sergepetrenko at tarantool.org
Tue Aug 3 10:56:47 MSK 2021
01.08.2021 19:19, Vladislav Shpilevoy пишет:
> Hi! Thanks for the patch!
>
> I think we are on the finish line here, see 4 small
> comments below. After them and after you fix the failing
> vinyl test, the patchset will probably be finished!
As discussed, fixed vinyl/errinj test by making
iproto_write_error() use blocking writes in debug build.
Here's the relevant commit (indentation is broken in the letter):
=============================================
commit c6283769a887429f81d0005b2354691ace22514d
Author: Serge Petrenko <sergepetrenko at tarantool.org>
Date: Mon Aug 2 19:33:36 2021 +0300
iproto: make iproto_write_error() blocking in debug
iproto_write_error() used to be blocking until the commit
4dac37a66b0674c345e036faa9984c9ae0d70382 (iproto: remove
iproto_write_error_blocking())
Actually, it should block until the error is written to the socket, because
some tests (vinyl/errinj.test.lua, for example) rely on that.
Do not make iproto_write_error() blocking in release builds for safety
reasons, as stated in the commit above. But make it blocking in debug builds
for testing's sake.
Part-of #6034
diff --git a/src/box/iproto.cc b/src/box/iproto.cc
index 3ed641eea..5cc69b77f 100644
--- a/src/box/iproto.cc
+++ b/src/box/iproto.cc
@@ -32,6 +32,7 @@
#include <string.h>
#include <stdarg.h>
#include <stdio.h>
+#include <fcntl.h>
#include <msgpuck.h>
#include <small/ibuf.h>
@@ -557,6 +558,20 @@ struct iproto_connection
struct iproto_thread *iproto_thread;
};
+#ifdef NDEBUG
+#define iproto_write_error(fd, e, schema_version,
sync) \
+ iproto_do_write_error(fd, e, schema_version, sync);
+#else
+#define iproto_write_error(fd, e, schema_version, sync) do
{ \
+ int flags = fcntl(fd, F_GETFL,
0); \
+ if (flags >=
0) \
+ fcntl(fd, F_SETFL, flags & (~O_NONBLOCK)); \
+ iproto_do_write_error(fd, e, schema_version,
sync); \
+ if (flags >=
0) \
+ fcntl(fd, F_SETFL, flags); \
+} while (0);
+#endif
+
/**
* Return true if we have not enough spare messages
* in the message pool.
diff --git a/src/box/xrow.c b/src/box/xrow.c
index 116be01ce..5c5da4808 100644
--- a/src/box/xrow.c
+++ b/src/box/xrow.c
@@ -543,8 +543,8 @@ iproto_reply_error(struct obuf *out, const struct
error *e, uint64_t sync,
}
void
-iproto_write_error(int fd, const struct error *e, uint32_t schema_version,
- uint64_t sync)
+iproto_do_write_error(int fd, const struct error *e, uint32_t
schema_version,
+ uint64_t sync)
{
bool is_error = false;
struct mpstream stream;
diff --git a/src/box/xrow.h b/src/box/xrow.h
index 3867e0c0e..30d6b8639 100644
--- a/src/box/xrow.h
+++ b/src/box/xrow.h
@@ -719,8 +719,8 @@ iproto_reply_chunk(struct obuf *buf, struct obuf_svp
*svp, uint64_t sync,
/** Write error directly to a socket. */
void
-iproto_write_error(int fd, const struct error *e, uint32_t schema_version,
- uint64_t sync);
+iproto_do_write_error(int fd, const struct error *e, uint32_t
schema_version,
+ uint64_t sync);
enum {
/* Maximal length of protocol name in handshake */
=============================================
I've also noticed some failures in box-py/iproto.test.py,
so I've made the following changes in these 3 commits:
d61435f4a replication: send current Raft term in join response
f0e4d1b73 replication: send latest effective promote in initial join
2d2ba4d35 replication: add META stage to JOIN
=============================================
diff --git a/src/box/box.cc b/src/box/box.cc
index f3da02231..e7b8ddda5 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -2724,7 +2724,7 @@ box_process_join(struct ev_io *io, struct
xrow_header *header)
/* Decode JOIN request */
struct tt_uuid instance_uuid = uuid_nil;
- uint32_t replica_version_id;
+ uint32_t replica_version_id = 0;
xrow_decode_join_xc(header, &instance_uuid, &replica_version_id);
/* Check that bootstrap has been finished */
diff --git a/src/box/relay.cc b/src/box/relay.cc
index 805b5e7ff..2947468ba 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -444,20 +444,20 @@ relay_initial_join(int fd, uint64_t sync, struct
vclock *vclock,
*/
if (replica_version_id > 0) {
/* Mark the beginning of the metadata stream. */
- row.type = IPROTO_JOIN_META;
- coio_write_xrow(&relay->io, &row);
+ xrow_encode_type(&row, IPROTO_JOIN_META);
+ xstream_write(&relay->stream, &row);
xrow_encode_raft(&row, &fiber()->gc, &raft_req);
- coio_write_xrow(&relay->io, &row);
+ xstream_write(&relay->stream, &row);
char body[XROW_SYNCHRO_BODY_LEN_MAX];
xrow_encode_synchro(&row, body, &req);
row.replica_id = req.replica_id;
- coio_write_xrow(&relay->io, &row);
+ xstream_write(&relay->stream, &row);
/* Mark the end of the metadata stream. */
- row.type = IPROTO_JOIN_SNAPSHOT;
- coio_write_xrow(&relay->io, &row);
+ xrow_encode_type(&row, IPROTO_JOIN_SNAPSHOT);
+ xstream_write(&relay->stream, &row);
}
/* Send read view to the replica. */
diff --git a/src/box/xrow.c b/src/box/xrow.c
index 5c5da4808..8ab8b2768 100644
--- a/src/box/xrow.c
+++ b/src/box/xrow.c
@@ -1730,6 +1730,13 @@ xrow_encode_timestamp(struct xrow_header *row,
uint32_t replica_id, double tm)
row->tm = tm;
}
+void
+xrow_encode_type(struct xrow_header *row, uint16_t type)
+{
+ memset(row, 0, sizeof(*row));
+ row->type = type;
+}
+
void
greeting_encode(char *greetingbuf, uint32_t version_id,
const struct tt_uuid *uuid, const char *salt, uint32_t salt_len)
diff --git a/src/box/xrow.h b/src/box/xrow.h
index 30d6b8639..c6e8ed0fd 100644
--- a/src/box/xrow.h
+++ b/src/box/xrow.h
@@ -570,6 +570,14 @@ xrow_decode_subscribe_response(struct xrow_header *row,
void
xrow_encode_timestamp(struct xrow_header *row, uint32_t replica_id,
double tm);
+/**
+ * Encode any bodyless message.
+ * @param row[out] Row to encode into.
+ * @param type Message type.
+ */
+void
+xrow_encode_type(struct xrow_header *row, uint16_t type);
+
/**
* Fast encode xrow header using the specified header fields.
* It is faster than the xrow_header_encode, because uses
=============================================
>
> There are only 2 things which bother me. They are not bugs
> and we can work on them in the next quarter.
>
> 1) Assume you have election_mode = 'manual'. And you are a
> leader. You call box.ctl.demote() and stop being a leader.
> But the limbo is still yours. If now you switch election_mode to
> 'off', you need to call box.ctl.demote() again to free the
> limbo.
This is an inconvenience rather than a bug IMO.
I couldn't find a solution right away.
> 2) In the last commit I see we make too much actions to ensure
> we are a writable leader. Perhaps in the future we should not
> report box.info.election.state == 'leader' until promote is
> written and should not say the instance is writable.
Yes, maybe. Need to think about that.
>
> I don't have a plan for either of these ideas yet.
>
>> diff --git a/src/box/box.cc b/src/box/box.cc
>> index 41f665e38..a34e05e94 100644
>> --- a/src/box/box.cc
>> +++ b/src/box/box.cc
>> @@ -1679,20 +1679,44 @@ box_issue_promote(uint32_t prev_leader_id, int64_t promote_lsn)
>> assert(txn_limbo_is_empty(&txn_limbo));
>> }
>>
>> +/* A guard to block multiple simultaneous box_promote() invocations. */
> 1. For out of function comments we usually use /** as an opening.
Fixed, sorry.
>> +static bool in_box_promote = false;
> 2. Could you please use `is_` prefix here? `is_in_box_promote`.
Ok.
>> +
>> +int
>> +box_promote_qsync(void)
>> +{
>> + assert(!in_box_promote);
>> + assert(is_box_configured);
>> + struct raft *raft = box_raft();
>> + in_box_promote = true;
>> + auto promote_guard = make_scoped_guard([&] {
>> + in_box_promote = false;
>> + });
>> + if (raft->state != RAFT_STATE_LEADER)
>> + return 0;
> 3. This condition is not reachable, according to what I see in
> box_raft_worker_f().
Indeed. Changed, please, see the diff below.
>> + assert(box_election_mode == ELECTION_MODE_MANUAL ||
>> + box_election_mode == ELECTION_MODE_CANDIDATE);
>> + if (txn_limbo_replica_term(&txn_limbo, instance_id) == raft->term)
>> + return 0;
>> + int64_t wait_lsn = box_wait_limbo_acked(TIMEOUT_INFINITY);
>> + if (wait_lsn < 0)
>> + return -1;
> 4. Perhaps this better be panic? Because infinity timeout should
> not ever be reached. And then the function becomes void, because
> would not be able to fail anymore.
No. Actually, there are still multiple reasons for box_wait_limbo_acked()
to fail: the quorum may be reached, but then increased,
the fiber might be cancelled, new synchronous transactions might appear
while waiting.
I think box_promote_qsync() should be retried in the raft worker, like it was
done for box_promote().
Also there's box_check_promote_term_intact() inside box_wait_limbo_acked(),
which could cause box_promote_qsync() to fail, but we shouldn't panic in
this case.
Here's the diff for this commit (box: split manual and automatic promotion):
============================================
diff --git a/src/box/box.cc b/src/box/box.cc
index dad5e4557..53ed64e51 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1679,23 +1679,20 @@ box_issue_promote(uint32_t prev_leader_id,
int64_t promote_lsn)
assert(txn_limbo_is_empty(&txn_limbo));
}
-/* A guard to block multiple simultaneous box_promote() invocations. */
-static bool in_box_promote = false;
+/** A guard to block multiple simultaneous box_promote() invocations. */
+static bool is_in_box_promote = false;
int
box_promote_qsync(void)
{
- assert(!in_box_promote);
+ assert(!is_in_box_promote);
assert(is_box_configured);
struct raft *raft = box_raft();
- in_box_promote = true;
+ is_in_box_promote = true;
auto promote_guard = make_scoped_guard([&] {
- in_box_promote = false;
+ is_in_box_promote = false;
});
- if (raft->state != RAFT_STATE_LEADER)
- return 0;
- assert(box_election_mode == ELECTION_MODE_MANUAL ||
- box_election_mode == ELECTION_MODE_CANDIDATE);
+ assert(raft->state == RAFT_STATE_LEADER);
if (txn_limbo_replica_term(&txn_limbo, instance_id) == raft->term)
return 0;
int64_t wait_lsn = box_wait_limbo_acked(TIMEOUT_INFINITY);
@@ -1708,15 +1705,15 @@ box_promote_qsync(void)
int
box_promote(void)
{
- if (in_box_promote) {
+ if (is_in_box_promote) {
diag_set(ClientError, ER_UNSUPPORTED, "box.ctl.promote",
"simultaneous invocations");
return -1;
}
struct raft *raft = box_raft();
- in_box_promote = true;
+ is_in_box_promote = true;
auto promote_guard = make_scoped_guard([&] {
- in_box_promote = false;
+ is_in_box_promote = false;
});
/*
* Do nothing when box isn't configured and when PROMOTE was already
@@ -1742,7 +1739,7 @@ box_promote(void)
case ELECTION_MODE_MANUAL:
if (raft->state == RAFT_STATE_LEADER)
return 0;
- in_box_promote = false;
+ is_in_box_promote = false;
return box_run_elections();
case ELECTION_MODE_CANDIDATE:
/*
diff --git a/src/box/raft.c b/src/box/raft.c
index f8d13aa32..bc69f7f1b 100644
--- a/src/box/raft.c
+++ b/src/box/raft.c
@@ -83,6 +83,24 @@ box_raft_request_to_msg(const struct raft_request
*req, struct raft_msg *msg)
};
}
+static void
+box_raft_update_synchro_queue(struct raft *raft)
+{
+ assert(raft == box_raft());
+ if (raft->state != RAFT_STATE_LEADER)
+ return;
+ int rc = 0;
+ uint32_t errcode = 0;
+ do {
+ rc = box_promote_qsync();
+ if (rc != 0) {
+ struct error *err = diag_last_error(diag_get());
+ errcode = box_error_code(err);
+ diag_log();
+ }
+ } while (rc != 0 && errcode == ER_QUORUM_WAIT &&
!fiber_is_cancelled());
+}
+
static int
box_raft_worker_f(va_list args)
{
@@ -93,9 +111,7 @@ box_raft_worker_f(va_list args)
box_raft_has_work = false;
raft_process_async(raft);
- if (raft->state == RAFT_STATE_LEADER &&
- box_promote_qsync() != 0)
- diag_log();
+ box_raft_update_synchro_queue(raft);
if (!box_raft_has_work)
fiber_yield();
============================================
>> + box_issue_promote(txn_limbo.owner_id, wait_lsn);
>> + return 0;
>> +}
>> commit 7980cb3096f2616a2851f8d97db8091f0d82879c
>> Author: Serge Petrenko<sergepetrenko at tarantool.org>
>> Date: Mon Jun 28 11:52:44 2021 +0300
>>
>> box: allow calling promote on a candidate
>>
>> Part of #6034
>>
>> diff --git a/test/replication/gh-6034-election-candidate-promote.result b/test/replication/gh-6034-election-candidate-promote.result
>> new file mode 100644
>> index 000000000..2b4bc0213
>> --- /dev/null
>> +++ b/test/replication/gh-6034-election-candidate-promote.result
> 5. The test name format `gh-####-...` is obligatory only for bug tests.
> This patch seems to be adding a feature.
Ok:
===============================
diff --git a/test/replication/gh-6034-election-candidate-promote.result
b/test/replication/election-candidate-promote.result
similarity index 100%
rename from test/replication/gh-6034-election-candidate-promote.result
rename to test/replication/election-candidate-promote.result
diff --git
a/test/replication/gh-6034-election-candidate-promote.test.lua
b/test/replication/election-candidate-promote.test.lua
similarity index 100%
rename from test/replication/gh-6034-election-candidate-promote.test.lua
rename to test/replication/election-candidate-promote.test.lua
diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg
index 1ec1a94eb..1a3c991f0 100644
--- a/test/replication/suite.cfg
+++ b/test/replication/suite.cfg
@@ -57,8 +57,8 @@
"gh-6057-qsync-confirm-async-no-wal.test.lua": {},
"gh-6094-rs-uuid-mismatch.test.lua": {},
"gh-6127-election-join-new.test.lua": {},
- "gh-6034-election-candidate-promote.test.lua": {},
"gh-6035-applier-filter.test.lua": {},
+ "election-candidate-promote.test.lua": {},
"*": {
"memtx": {"engine": "memtx"},
"vinyl": {"engine": "vinyl"}
===============================
--
Serge Petrenko
More information about the Tarantool-patches
mailing list