[Tarantool-patches] [PATCH v3 07/12] box: introduce `box.ctl.demote`

Serge Petrenko sergepetrenko at tarantool.org
Tue Aug 3 10:56:47 MSK 2021



01.08.2021 19:19, Vladislav Shpilevoy пишет:
> Hi! Thanks for the patch!
>
> I think we are on the finish line here, see 4 small
> comments below. After them and after you fix the failing
> vinyl test, the patchset will probably be finished!

As discussed, fixed vinyl/errinj test by making
iproto_write_error() use blocking writes in debug build.

Here's the relevant commit (indentation is broken in the letter) :

=============================================

commit c6283769a887429f81d0005b2354691ace22514d
Author: Serge Petrenko <sergepetrenko at tarantool.org>
Date:   Mon Aug 2 19:33:36 2021 +0300

     iproto: make iproto_write_error() blocking in debug

     iproto_write_error() used to be blocking until the commit
     4dac37a66b0674c345e036faa9984c9ae0d70382 (iproto: remove
     iproto_write_error_blocking())
     Actually, it should block until the error is written to socket, because
     some tests (vinyl/errinj.test.lua, for example) rely on that.

     Do not make iproto_write_error() blocking in release builds for safety
     reasons, as stated in commit above. But make it blocking in debug for
     testing sake.

     Part-of #6034

diff --git a/src/box/iproto.cc b/src/box/iproto.cc
index 3ed641eea..5cc69b77f 100644
--- a/src/box/iproto.cc
+++ b/src/box/iproto.cc
@@ -32,6 +32,7 @@
  #include <string.h>
  #include <stdarg.h>
  #include <stdio.h>
+#include <fcntl.h>

  #include <msgpuck.h>
  #include <small/ibuf.h>
@@ -557,6 +558,20 @@ struct iproto_connection
      struct iproto_thread *iproto_thread;
  };

+#ifdef NDEBUG
+#define iproto_write_error(fd, e, schema_version, 
sync)                         \
+    iproto_do_write_error(fd, e, schema_version, sync);
+#else
+#define iproto_write_error(fd, e, schema_version, sync) do 
{                    \
+    int flags = fcntl(fd, F_GETFL, 
0);                                      \
+    if (flags >= 
0)                                                         \
+        fcntl(fd, F_SETFL, flags & (~O_NONBLOCK));                      \
+    iproto_do_write_error(fd, e, schema_version, 
sync);                     \
+    if (flags >= 
0)                                                         \
+        fcntl(fd, F_SETFL, flags);                                      \
+} while (0);
+#endif
+
  /**
   * Return true if we have not enough spare messages
   * in the message pool.
diff --git a/src/box/xrow.c b/src/box/xrow.c
index 116be01ce..5c5da4808 100644
--- a/src/box/xrow.c
+++ b/src/box/xrow.c
@@ -543,8 +543,8 @@ iproto_reply_error(struct obuf *out, const struct 
error *e, uint64_t sync,
  }

  void
-iproto_write_error(int fd, const struct error *e, uint32_t schema_version,
-           uint64_t sync)
+iproto_do_write_error(int fd, const struct error *e, uint32_t 
schema_version,
+              uint64_t sync)
  {
      bool is_error = false;
      struct mpstream stream;
diff --git a/src/box/xrow.h b/src/box/xrow.h
index 3867e0c0e..30d6b8639 100644
--- a/src/box/xrow.h
+++ b/src/box/xrow.h
@@ -719,8 +719,8 @@ iproto_reply_chunk(struct obuf *buf, struct obuf_svp 
*svp, uint64_t sync,

  /** Write error directly to a socket. */
  void
-iproto_write_error(int fd, const struct error *e, uint32_t schema_version,
-           uint64_t sync);
+iproto_do_write_error(int fd, const struct error *e, uint32_t 
schema_version,
+              uint64_t sync);

  enum {
      /* Maximal length of protocol name in handshake */

=============================================

I've also noticed some failures in box-py/iproto.test.py,
so I've made the following changes in these 3 commits:
d61435f4a replication: send current Raft term in join response
f0e4d1b73 replication: send latest effective promote in initial join
2d2ba4d35 replication: add META stage to JOIN

=============================================

diff --git a/src/box/box.cc b/src/box/box.cc
index f3da02231..e7b8ddda5 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -2724,7 +2724,7 @@ box_process_join(struct ev_io *io, struct 
xrow_header *header)

      /* Decode JOIN request */
      struct tt_uuid instance_uuid = uuid_nil;
-    uint32_t replica_version_id;
+    uint32_t replica_version_id = 0;
      xrow_decode_join_xc(header, &instance_uuid, &replica_version_id);

      /* Check that bootstrap has been finished */
diff --git a/src/box/relay.cc b/src/box/relay.cc
index 805b5e7ff..2947468ba 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -444,20 +444,20 @@ relay_initial_join(int fd, uint64_t sync, struct 
vclock *vclock,
       */
      if (replica_version_id > 0) {
          /* Mark the beginning of the metadata stream. */
-        row.type = IPROTO_JOIN_META;
-        coio_write_xrow(&relay->io, &row);
+        xrow_encode_type(&row, IPROTO_JOIN_META);
+        xstream_write(&relay->stream, &row);

          xrow_encode_raft(&row, &fiber()->gc, &raft_req);
-        coio_write_xrow(&relay->io, &row);
+        xstream_write(&relay->stream, &row);

          char body[XROW_SYNCHRO_BODY_LEN_MAX];
          xrow_encode_synchro(&row, body, &req);
          row.replica_id = req.replica_id;
-        coio_write_xrow(&relay->io, &row);
+        xstream_write(&relay->stream, &row);

          /* Mark the end of the metadata stream. */
-        row.type = IPROTO_JOIN_SNAPSHOT;
-        coio_write_xrow(&relay->io, &row);
+        xrow_encode_type(&row, IPROTO_JOIN_SNAPSHOT);
+        xstream_write(&relay->stream, &row);
      }

      /* Send read view to the replica. */
diff --git a/src/box/xrow.c b/src/box/xrow.c
index 5c5da4808..8ab8b2768 100644
--- a/src/box/xrow.c
+++ b/src/box/xrow.c
@@ -1730,6 +1730,13 @@ xrow_encode_timestamp(struct xrow_header *row, 
uint32_t replica_id, double tm)
      row->tm = tm;
  }

+void
+xrow_encode_type(struct xrow_header *row, uint16_t type)
+{
+    memset(row, 0, sizeof(*row));
+    row->type = type;
+}
+
  void
  greeting_encode(char *greetingbuf, uint32_t version_id,
          const struct tt_uuid *uuid, const char *salt, uint32_t salt_len)
diff --git a/src/box/xrow.h b/src/box/xrow.h
index 30d6b8639..c6e8ed0fd 100644
--- a/src/box/xrow.h
+++ b/src/box/xrow.h
@@ -570,6 +570,14 @@ xrow_decode_subscribe_response(struct xrow_header *row,
  void
  xrow_encode_timestamp(struct xrow_header *row, uint32_t replica_id, 
double tm);

+/**
+ * Encode any bodyless message.
+ * @param row[out] Row to encode into.
+ * @param type Message type.
+ */
+void
+xrow_encode_type(struct xrow_header *row, uint16_t type);
+
  /**
   * Fast encode xrow header using the specified header fields.
   * It is faster than the xrow_header_encode, because uses

=============================================
>
> There are only 2 things which bother me. They are not bugs
> and we can work on them in the next quarter.
>
> 1) Assume you have election_mode = 'manual'. And you are a
> leader. You call box.ctl.demote() and stop being a leader.
> But the limbo is still yours. If now you switch election_mode to
> 'off', you need to call box.ctl.demote() again to free the
> limbo.

This is an inconvenience rather than a bug IMO.
I couldn't find a solution right away.

> 2) In the last commit I see we make too much actions to ensure
> we are a writable leader. Perhaps in the future we should not
> report box.info.election.state == 'leader' until promote is
> written and should not say the instance is writable.

Yes, maybe. Need to think about that.

>
> I don't have a plan for either of these ideas yet.
>
>> diff --git a/src/box/box.cc b/src/box/box.cc
>> index 41f665e38..a34e05e94 100644
>> --- a/src/box/box.cc
>> +++ b/src/box/box.cc
>> @@ -1679,20 +1679,44 @@ box_issue_promote(uint32_t prev_leader_id, int64_t promote_lsn)
>>   	assert(txn_limbo_is_empty(&txn_limbo));
>>   }
>>   
>> +/* A guard to block multiple simultaneous box_promote() invocations. */
> 1. For out of function comments we usually use /** as an opening.

Fixed, sorry.

>> +static bool in_box_promote = false;
> 2. Could you please use `is_` prefix here? `is_in_box_promote`.

Ok.

>> +
>> +int
>> +box_promote_qsync(void)
>> +{
>> +	assert(!in_box_promote);
>> +	assert(is_box_configured);
>> +	struct raft *raft = box_raft();
>> +	in_box_promote = true;
>> +	auto promote_guard = make_scoped_guard([&] {
>> +		in_box_promote = false;
>> +	});
>> +	if (raft->state != RAFT_STATE_LEADER)
>> +		return 0;
> 3. This condition is not reachable, according to what I see in
> box_raft_worker_f().

Indeed. Changed, please, see the diff below.

>> +	assert(box_election_mode == ELECTION_MODE_MANUAL ||
>> +	       box_election_mode == ELECTION_MODE_CANDIDATE);
>> +	if (txn_limbo_replica_term(&txn_limbo, instance_id) == raft->term)
>> +		return 0;
>> +	int64_t wait_lsn = box_wait_limbo_acked(TIMEOUT_INFINITY);
>> +	if (wait_lsn < 0)
>> +		return -1;
> 4. Perhaps this better be panic? Because infinity timeout should
> not ever be reached. And then the function becomes void, because
> would not be able to fail anymore.

No. Actually, there are still multiple reasons for box_wait_limbo_acked()
to fail: the quorum may be reached, but then increased,
the fiber might be cancelled, new synchronous transactions might appear
while waiting.

I think box_promote_qsync() should be retried in raft worker, like it was
done for box_promote().

Also there's box_check_promote_term_intact() inside box_wait_limbo_acked(),
which could cause box_promote_qsync() to fail, but we shouldn't panic in
this case.


Here's the diff for this commit (box: split manual and automatic promotion):

============================================

diff --git a/src/box/box.cc b/src/box/box.cc
index dad5e4557..53ed64e51 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1679,23 +1679,20 @@ box_issue_promote(uint32_t prev_leader_id, 
int64_t promote_lsn)
      assert(txn_limbo_is_empty(&txn_limbo));
  }

-/* A guard to block multiple simultaneous box_promote() invocations. */
-static bool in_box_promote = false;
+/** A guard to block multiple simultaneous box_promote() invocations. */
+static bool is_in_box_promote = false;

  int
  box_promote_qsync(void)
  {
-    assert(!in_box_promote);
+    assert(!is_in_box_promote);
      assert(is_box_configured);
      struct raft *raft = box_raft();
-    in_box_promote = true;
+    is_in_box_promote = true;
      auto promote_guard = make_scoped_guard([&] {
-        in_box_promote = false;
+        is_in_box_promote = false;
      });
-    if (raft->state != RAFT_STATE_LEADER)
-        return 0;
-    assert(box_election_mode == ELECTION_MODE_MANUAL ||
-           box_election_mode == ELECTION_MODE_CANDIDATE);
+    assert(raft->state == RAFT_STATE_LEADER);
      if (txn_limbo_replica_term(&txn_limbo, instance_id) == raft->term)
          return 0;
      int64_t wait_lsn = box_wait_limbo_acked(TIMEOUT_INFINITY);
@@ -1708,15 +1705,15 @@ box_promote_qsync(void)
  int
  box_promote(void)
  {
-    if (in_box_promote) {
+    if (is_in_box_promote) {
          diag_set(ClientError, ER_UNSUPPORTED, "box.ctl.promote",
               "simultaneous invocations");
          return -1;
      }
      struct raft *raft = box_raft();
-    in_box_promote = true;
+    is_in_box_promote = true;
      auto promote_guard = make_scoped_guard([&] {
-        in_box_promote = false;
+        is_in_box_promote = false;
      });
      /*
       * Do nothing when box isn't configured and when PROMOTE was already
@@ -1742,7 +1739,7 @@ box_promote(void)
      case ELECTION_MODE_MANUAL:
          if (raft->state == RAFT_STATE_LEADER)
              return 0;
-        in_box_promote = false;
+        is_in_box_promote = false;
          return box_run_elections();
      case ELECTION_MODE_CANDIDATE:
          /*
diff --git a/src/box/raft.c b/src/box/raft.c
index f8d13aa32..bc69f7f1b 100644
--- a/src/box/raft.c
+++ b/src/box/raft.c
@@ -83,6 +83,24 @@ box_raft_request_to_msg(const struct raft_request 
*req, struct raft_msg *msg)
      };
  }

+static void
+box_raft_update_synchro_queue(struct raft *raft)
+{
+    assert(raft == box_raft());
+    if (raft->state != RAFT_STATE_LEADER)
+        return;
+    int rc = 0;
+    uint32_t errcode = 0;
+    do {
+        rc = box_promote_qsync();
+        if (rc != 0) {
+            struct error *err = diag_last_error(diag_get());
+            errcode = box_error_code(err);
+            diag_log();
+        }
+    } while (rc != 0 && errcode == ER_QUORUM_WAIT && 
!fiber_is_cancelled());
+}
+
  static int
  box_raft_worker_f(va_list args)
  {
@@ -93,9 +111,7 @@ box_raft_worker_f(va_list args)
          box_raft_has_work = false;

          raft_process_async(raft);
-        if (raft->state == RAFT_STATE_LEADER &&
-            box_promote_qsync() != 0)
-            diag_log();
+        box_raft_update_synchro_queue(raft);

          if (!box_raft_has_work)
              fiber_yield();


============================================
>> +	box_issue_promote(txn_limbo.owner_id, wait_lsn);
>> +	return 0;
>> +}
>> commit 7980cb3096f2616a2851f8d97db8091f0d82879c
>> Author: Serge Petrenko<sergepetrenko at tarantool.org>
>> Date:   Mon Jun 28 11:52:44 2021 +0300
>>
>>      box: allow calling promote on a candidate
>>      
>>      Part of #6034
>>
>> diff --git a/test/replication/gh-6034-election-candidate-promote.result b/test/replication/gh-6034-election-candidate-promote.result
>> new file mode 100644
>> index 000000000..2b4bc0213
>> --- /dev/null
>> +++ b/test/replication/gh-6034-election-candidate-promote.result
> 5. The test name format `gh-####-...` is obligatory only for bug tests.
> This patch seems to be adding a feature.

Ok:

===============================
diff --git a/test/replication/gh-6034-election-candidate-promote.result 
b/test/replication/election-candidate-promote.result
similarity index 100%
rename from test/replication/gh-6034-election-candidate-promote.result
rename to test/replication/election-candidate-promote.result
diff --git 
a/test/replication/gh-6034-election-candidate-promote.test.lua 
b/test/replication/election-candidate-promote.test.lua
similarity index 100%
rename from test/replication/gh-6034-election-candidate-promote.test.lua
rename to test/replication/election-candidate-promote.test.lua
diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg
index 1ec1a94eb..1a3c991f0 100644
--- a/test/replication/suite.cfg
+++ b/test/replication/suite.cfg
@@ -57,8 +57,8 @@
      "gh-6057-qsync-confirm-async-no-wal.test.lua": {},
      "gh-6094-rs-uuid-mismatch.test.lua": {},
      "gh-6127-election-join-new.test.lua": {},
-    "gh-6034-election-candidate-promote.test.lua": {},
      "gh-6035-applier-filter.test.lua": {},
+    "election-candidate-promote.test.lua": {},
      "*": {
          "memtx": {"engine": "memtx"},
          "vinyl": {"engine": "vinyl"}
===============================

-- 
Serge Petrenko



More information about the Tarantool-patches mailing list