[Tarantool-patches] [PATCH 2/3] applier: don't miss WAL writes happened during ACK send
Vladislav Shpilevoy
v.shpilevoy at tarantool.org
Sun Jul 5 18:20:15 MSK 2020
The applier has a writer fiber that sends the instance's vclock to
the master after each WAL write or when the heartbeat timeout passes.
However, it missed WAL writes that happened *while* an ACK for a
previous WAL write was being sent. That made the applier sleep for
the heartbeat timeout even though it still had unsent data. This is
not a problem for async replication, but it becomes a bug when sync
transactions appear: for them an ACK should be sent as soon as
possible.
Part of #5100
---
src/box/applier.cc | 36 ++++++++++++++++++++++++++++--------
src/box/applier.h | 7 +++++++
2 files changed, 35 insertions(+), 8 deletions(-)
diff --git a/src/box/applier.cc b/src/box/applier.cc
index c636864d7..fccde877d 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -154,12 +154,14 @@ applier_writer_f(va_list ap)
* messages so we don't need to send ACKs every
* replication_timeout seconds any more.
*/
- if (applier->version_id >= version_id(1, 7, 7))
- fiber_cond_wait_timeout(&applier->writer_cond,
- TIMEOUT_INFINITY);
- else
- fiber_cond_wait_timeout(&applier->writer_cond,
- replication_timeout);
+ if (!applier->has_acks_to_send) {
+ if (applier->version_id >= version_id(1, 7, 7))
+ fiber_cond_wait_timeout(&applier->writer_cond,
+ TIMEOUT_INFINITY);
+ else
+ fiber_cond_wait_timeout(&applier->writer_cond,
+ replication_timeout);
+ }
/*
* A writer fiber is going to be awaken after a commit or
* a heartbeat message. So this is an appropriate place to
@@ -173,9 +175,16 @@ applier_writer_f(va_list ap)
applier->state != APPLIER_FOLLOW)
continue;
try {
+ applier->has_acks_to_send = false;
struct xrow_header xrow;
xrow_encode_vclock(&xrow, &replicaset.vclock);
coio_write_xrow(&io, &xrow);
+ /*
+ * Even if new ACK is requested during the
+ * write, don't send it again right away.
+ * Otherwise there is a risk of staying in this
+ * loop for a long time.
+ */
} catch (SocketError *e) {
/*
* There is no point trying to send ACKs if
@@ -928,6 +937,17 @@ fail:
return -1;
}
+/**
+ * Notify the applier's write fiber that there are more ACKs to
+ * send to master.
+ */
+static inline void
+applier_signal_ack(struct applier *applier)
+{
+ fiber_cond_signal(&applier->writer_cond);
+ applier->has_acks_to_send = true;
+}
+
/*
* A trigger to update an applier state after a replication commit.
*/
@@ -936,7 +956,7 @@ applier_on_commit(struct trigger *trigger, void *event)
{
(void) event;
struct applier *applier = (struct applier *)trigger->data;
- fiber_cond_signal(&applier->writer_cond);
+ applier_signal_ack(applier);
return 0;
}
@@ -1101,7 +1121,7 @@ applier_subscribe(struct applier *applier)
*/
if (stailq_first_entry(&rows, struct applier_tx_row,
next)->row.lsn == 0)
- fiber_cond_signal(&applier->writer_cond);
+ applier_signal_ack(applier);
else if (applier_apply_tx(&rows) != 0)
diag_raise();
diff --git a/src/box/applier.h b/src/box/applier.h
index c9fdc2955..6e979a806 100644
--- a/src/box/applier.h
+++ b/src/box/applier.h
@@ -80,6 +80,13 @@ struct applier {
struct fiber *writer;
/** Writer cond. */
struct fiber_cond writer_cond;
+ /**
+ * True if the applier has vclocks not sent to the remote
+ * master. The flag is needed because during sending one
+ * vclock (ACK), it can be updated again. So just one
+ * condition variable is not enough.
+ */
+ bool has_acks_to_send;
/** Finite-state machine */
enum applier_state state;
/** Local time of this replica when the last row has been received */
--
2.21.1 (Apple Git-122.3)
More information about the Tarantool-patches
mailing list