[Tarantool-patches] [PATCH v4 13/12] replication: send accumulated Raft messages after relay start

Serge Petrenko sergepetrenko at tarantool.org
Sun Apr 18 15:00:43 MSK 2021


It may happen that a Raft leader fails to send a broadcast to a
freshly connected follower.

Here's what happens: a follower subscribes to a candidate during
ongoing elections. box_process_subscribe() sends out the current node's
Raft state, which is candidate. Suppose a relay from the follower to the
candidate is already set up. The follower immediately responds to the
vote request. This makes the candidate become the leader. But the
candidate's relay is not yet ready to process Raft messages, and the
is_leader message from the candidate gets rejected. Once the relay
starts, it relays all the xlogs, but the follower rejects all the data,
because it hasn't received the is_leader notification from the candidate.

Fix this by sending the last rejected message as soon as relay starts
dispatching Raft messages.

Follow-up #5445
---

Hey, guys, take a look please. This fixes flaky 
replication/gh-5445-leader-inconsistency
and should probably fix replication/election_qsync_stress as well.

  src/box/relay.cc | 79 ++++++++++++++++++++++++++++++++++++++----------
  1 file changed, 63 insertions(+), 16 deletions(-)

diff --git a/src/box/relay.cc b/src/box/relay.cc
index 7be33ee31..9fdd02bc1 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -160,6 +160,16 @@ struct relay {
           * anonymous replica, for example.
           */
          bool is_raft_enabled;
+        /** Is set to true by the first Raft broadcast which comes while
+         * the relay is not yet ready to dispatch Raft messages.
+         */
+        bool has_pending_broadcast;
+        /**
+         * A Raft broadcast which should be pushed once relay notifies
+         * tx it needs Raft updates. Otherwise this message would be
+         * lost until some new Raft event happens.
+         */
+        struct raft_request pending_broadcast;
      } tx;
  };

@@ -626,6 +636,10 @@ struct relay_is_raft_enabled_msg {
      bool value;
      /** Flag to wait for the flag being set, in a relay thread. */
      bool is_finished;
+    /** Whether this message carries a pending raft broadcast to relay. */
+    bool has_pending_broadcast;
+    /** The raft request relay should send upon this message's return. */
+    struct raft_request req;
  };

  /** TX thread part of the Raft flag setting, first hop. */
@@ -635,14 +649,28 @@ tx_set_is_raft_enabled(struct cmsg *base)
      struct relay_is_raft_enabled_msg *msg =
          (struct relay_is_raft_enabled_msg *)base;
      msg->relay->tx.is_raft_enabled = msg->value;
+    if (msg->relay->tx.has_pending_broadcast) {
+        msg->has_pending_broadcast = true;
+        msg->req = msg->relay->tx.pending_broadcast;
+    }
  }

+static void
+relay_send_raft(struct relay *relay, struct raft_request *req);
+
  /** Relay thread part of the Raft flag setting, second hop. */
  static void
  relay_set_is_raft_enabled(struct cmsg *base)
  {
      struct relay_is_raft_enabled_msg *msg =
          (struct relay_is_raft_enabled_msg *)base;
+    /*
+     * There might have been some pending Raft broadcasts. Send the last of
+     * them as soon as relay is set up.
+     */
+    if (msg->has_pending_broadcast)
+        relay_send_raft(msg->relay, &msg->req);
+
      msg->is_finished = true;
  }

@@ -938,25 +966,41 @@ struct relay_raft_msg {
      struct relay *relay;
  };

+/**
+ * Send a Raft message to the peer. This is done asynchronously, out of scope
+ * of recover_remaining_wals loop.
+ */
  static void
-relay_raft_msg_push(struct cmsg *base)
+relay_send_raft(struct relay *relay, struct raft_request *req)
  {
-    struct relay_raft_msg *msg = (struct relay_raft_msg *)base;
      struct xrow_header row;
-    xrow_encode_raft(&row, &fiber()->gc, &msg->req);
+    xrow_encode_raft(&row, &fiber()->gc, req);
      try {
-        /*
-         * Send the message before restarting the recovery. Otherwise
-         * all the rows would be sent from under a non-leader role and
-         * would be ignored again.
-         */
-        relay_send(msg->relay, &row);
-        if (msg->req.state == RAFT_STATE_LEADER)
-            relay_restart_recovery(msg->relay);
+        relay_send(relay, &row);
      } catch (Exception *e) {
-        relay_set_error(msg->relay, e);
+        relay_set_error(relay, e);
          fiber_cancel(fiber());
      }
+}
+
+static void
+relay_raft_msg_push(struct cmsg *base)
+{
+    struct relay_raft_msg *msg = (struct relay_raft_msg *)base;
+    /*
+     * Send the message before restarting the recovery. Otherwise
+     * all the rows would be sent from under a non-leader role and
+     * would be ignored again.
+     */
+    relay_send_raft(msg->relay, &msg->req);
+    if (msg->req.state == RAFT_STATE_LEADER) {
+        try {
+            relay_restart_recovery(msg->relay);
+        } catch (Exception *e) {
+            relay_set_error(msg->relay, e);
+            fiber_cancel(fiber());
+        }
+    }
      free(msg);
  }

@@ -964,12 +1008,15 @@ void
  relay_push_raft(struct relay *relay, const struct raft_request *req)
  {
      /*
-     * Raft updates don't stack. They are thrown away if can't be pushed
-     * now. This is fine, as long as relay's live much longer that the
-     * timeouts in Raft are set.
+     * Remember the latest Raft update. It might be a notification that
+     * this node is a leader. If sometime later we find out this node needs
+     * Raft updates, we need to send is_leader notification.
       */
-    if (!relay->tx.is_raft_enabled)
+    if (!relay->tx.is_raft_enabled) {
+        relay->tx.has_pending_broadcast = true;
+        relay->tx.pending_broadcast = *req;
          return;
+    }
      /*
       * XXX: the message should be preallocated. It should
       * work like Kharon in IProto. Relay should have 2 raft
-- 
2.24.3 (Apple Git-128)




More information about the Tarantool-patches mailing list