Tarantool development patches archive
 help / color / mirror / Atom feed
From: Serge Petrenko via Tarantool-patches <tarantool-patches@dev.tarantool.org>
To: v.shpilevoy@tarantool.org, gorcunov@gmail.com
Cc: tarantool-patches@dev.tarantool.org
Subject: [Tarantool-patches] [PATCH 6/9] raft: keep track of greatest known term and filter replication sources based on that
Date: Sun, 11 Apr 2021 20:56:01 +0300	[thread overview]
Message-ID: <f0f142cdbe8ef87bec48dbdfbf1e125a72ac50bd.1618163409.git.sergepetrenko@tarantool.org> (raw)
In-Reply-To: <cover.1618163409.git.sergepetrenko@tarantool.org>

Start writing the actual leader term together with the PROMOTE request
and process terms in PROMOTE requests on receiver side.

Make applier only apply synchronous transactions from the instance which
has the greatest term as received in PROMOTE requests.

Closes #5445
---
 ...very => qsync-multi-statement-recovery.md} |  0
 changelogs/unreleased/raft-promote.md         |  4 +++
 src/box/applier.cc                            | 18 +++++++++++
 src/box/box.cc                                | 15 ++++++----
 src/lib/raft/raft.c                           |  1 +
 src/lib/raft/raft.h                           | 30 +++++++++++++++++++
 6 files changed, 63 insertions(+), 5 deletions(-)
 rename changelogs/unreleased/{qsync-multi-statement-recovery => qsync-multi-statement-recovery.md} (100%)
 create mode 100644 changelogs/unreleased/raft-promote.md

diff --git a/changelogs/unreleased/qsync-multi-statement-recovery b/changelogs/unreleased/qsync-multi-statement-recovery.md
similarity index 100%
rename from changelogs/unreleased/qsync-multi-statement-recovery
rename to changelogs/unreleased/qsync-multi-statement-recovery.md
diff --git a/changelogs/unreleased/raft-promote.md b/changelogs/unreleased/raft-promote.md
new file mode 100644
index 000000000..e5dac599c
--- /dev/null
+++ b/changelogs/unreleased/raft-promote.md
@@ -0,0 +1,4 @@
+## bugfix/replication
+
+* Fix a bug in synchronous replication when rolled back transactions could
+  reappear once a sufficiently old instance reconnected (gh-5445).
diff --git a/src/box/applier.cc b/src/box/applier.cc
index e8cbbe27a..926d2f7ea 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -849,6 +849,9 @@ apply_synchro_row(struct xrow_header *row)
 
 	txn_limbo_process(&txn_limbo, &req);
 
+	if (req.type == IPROTO_PROMOTE)
+		raft_source_update_term(box_raft(), req.origin_id, req.term);
+
 	struct synchro_entry *entry;
 	entry = synchro_entry_new(row, &req);
 	if (entry == NULL)
@@ -1044,6 +1047,21 @@ applier_apply_tx(struct applier *applier, struct stailq *rows)
 		}
 	}
 
+	/*
+	 * All the synchronous rows coming from outdated instances are ignored
+	 * and replaced with NOPs to save vclock consistency.
+	 */
+	struct applier_tx_row *item;
+	if (raft_source_has_outdated_term(box_raft(), applier->instance_id) &&
+	    (last_row->wait_sync ||
+	     (iproto_type_is_synchro_request(first_row->type) &&
+	     !iproto_type_is_promote_request(first_row->type)))) {
+		stailq_foreach_entry(item, rows, next) {
+			struct xrow_header *row = &item->row;
+			row->type = IPROTO_NOP;
+			row->bodycnt = 0;
+		}
+	}
 	if (unlikely(iproto_type_is_synchro_request(first_row->type))) {
 		/*
 		 * Synchro messages are not transactions, in terms
diff --git a/src/box/box.cc b/src/box/box.cc
index b093341d3..9b6323b3f 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -426,6 +426,10 @@ wal_stream_apply_synchro_row(struct wal_stream *stream, struct xrow_header *row)
 		return -1;
 	}
 	txn_limbo_process(&txn_limbo, &syn_req);
+	if (syn_req.type == IPROTO_PROMOTE) {
+			raft_source_update_term(box_raft(), syn_req.origin_id,
+						syn_req.term);
+	}
 	return 0;
 }
 
@@ -1556,11 +1560,10 @@ box_clear_synchro_queue(bool try_wait)
 			rc = -1;
 		} else {
 promote:
-			/*
-			 * Term parameter is unused now, We'll pass
-			 * box_raft()->term there later.
-			 */
-			txn_limbo_write_promote(&txn_limbo, wait_lsn, 0);
+			/* We cannot possibly get here in a volatile state. */
+			assert(box_raft()->volatile_term == box_raft()->term);
+			txn_limbo_write_promote(&txn_limbo, wait_lsn,
+						box_raft()->term);
 			struct synchro_request req = {
 				.type = 0, /* unused */
 				.replica_id = 0, /* unused */
@@ -1570,6 +1573,8 @@ promote:
 			};
 			txn_limbo_read_promote(&txn_limbo, &req);
 			assert(txn_limbo_is_empty(&txn_limbo));
+			raft_source_update_term(box_raft(), req.origin_id,
+						req.lsn);
 		}
 	}
 	in_clear_synchro_queue = false;
diff --git a/src/lib/raft/raft.c b/src/lib/raft/raft.c
index 4ea4fc3f8..e9ce8cade 100644
--- a/src/lib/raft/raft.c
+++ b/src/lib/raft/raft.c
@@ -985,6 +985,7 @@ raft_create(struct raft *raft, const struct raft_vtab *vtab)
 		.death_timeout = 5,
 		.vtab = vtab,
 	};
+	vclock_create(&raft->term_map);
 	raft_ev_timer_init(&raft->timer, raft_sm_schedule_new_election_cb,
 			   0, 0);
 	raft->timer.data = raft;
diff --git a/src/lib/raft/raft.h b/src/lib/raft/raft.h
index e447f6634..cba45a67d 100644
--- a/src/lib/raft/raft.h
+++ b/src/lib/raft/raft.h
@@ -207,6 +207,19 @@ struct raft {
 	 * subsystems, such as Raft.
 	 */
 	const struct vclock *vclock;
+	/**
+	 * The biggest term seen by this instance and persisted in WAL as part
+	 * of a PROMOTE request. May be smaller than @a term, while there are
+	 * ongoing elections, or the leader is already known, but this instance
+	 * hasn't read its PROMOTE request yet.
+	 * During other times must be equal to @a term.
+	 */
+	uint64_t greatest_known_term;
+	/**
+	 * Latest terms received with PROMOTE entries from remote instances.
+	 * Raft uses them to determine data from which sources may be applied.
+	 */
+	struct vclock term_map;
 	/** State machine timed event trigger. */
 	struct ev_timer timer;
 	/** Configured election timeout in seconds. */
@@ -243,6 +256,13 @@ raft_is_source_allowed(const struct raft *raft, uint32_t source_id)
 	return !raft->is_enabled || raft->leader == source_id;
 }
 
+static inline bool
+raft_source_has_outdated_term(const struct raft *raft, uint32_t source_id)
+{
+	uint64_t source_term = vclock_get(&raft->term_map, source_id);
+	return raft->is_enabled && source_term < raft->greatest_known_term;
+}
+
 /** Check if Raft is enabled. */
 static inline bool
 raft_is_enabled(const struct raft *raft)
@@ -250,6 +270,16 @@ raft_is_enabled(const struct raft *raft)
 	return raft->is_enabled;
 }
 
+static inline void
+raft_source_update_term(struct raft *raft, uint32_t source_id, uint64_t term)
+{
+	if ((uint64_t) vclock_get(&raft->term_map, source_id) >= term)
+		return;
+	vclock_follow(&raft->term_map, source_id, term);
+	if (term > raft->greatest_known_term)
+		raft->greatest_known_term = term;
+}
+
 /** Process a raft entry stored in WAL/snapshot. */
 void
 raft_process_recovery(struct raft *raft, const struct raft_msg *req);
-- 
2.24.3 (Apple Git-128)


  parent reply	other threads:[~2021-04-11 17:59 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-11 17:55 [Tarantool-patches] [PATCH 0/9] raft: introduce manual elections and fix a bug with re-applying rolled back transactions Serge Petrenko via Tarantool-patches
2021-04-11 17:55 ` [Tarantool-patches] [PATCH 1/9] wal: enrich row's meta information with sync replication flags Serge Petrenko via Tarantool-patches
2021-04-12 13:06   ` Cyrill Gorcunov via Tarantool-patches
2021-04-13 13:26     ` Serge Petrenko via Tarantool-patches
2021-04-12 19:21   ` Serge Petrenko via Tarantool-patches
2021-04-11 17:55 ` [Tarantool-patches] [PATCH 2/9] xrow: introduce a PROMOTE entry Serge Petrenko via Tarantool-patches
2021-04-11 17:55 ` [Tarantool-patches] [PATCH 3/9] box: actualise iproto_key_type array Serge Petrenko via Tarantool-patches
2021-04-11 17:55 ` [Tarantool-patches] [PATCH 4/9] box: make clear_synchro_queue() write a PROMOTE entry instead of CONFIRM + ROLLBACK Serge Petrenko via Tarantool-patches
2021-04-11 17:56 ` [Tarantool-patches] [PATCH 5/9] box: write PROMOTE even for empty limbo Serge Petrenko via Tarantool-patches
2021-04-11 17:56 ` Serge Petrenko via Tarantool-patches [this message]
2021-04-12 19:23   ` [Tarantool-patches] [PATCH 6/9] raft: keep track of greatest known term and filter replication sources based on that Serge Petrenko via Tarantool-patches
2021-04-11 17:56 ` [Tarantool-patches] [PATCH 7/9] replication: introduce a new election mode: "manual" Serge Petrenko via Tarantool-patches
2021-04-11 17:56 ` [Tarantool-patches] [PATCH 8/9] Support manual elections in `box.ctl.clear_synchro_queue()` Serge Petrenko via Tarantool-patches
2021-04-12 19:23   ` Serge Petrenko via Tarantool-patches
2021-04-11 17:56 ` [Tarantool-patches] [PATCH 9/9] box.ctl: rename clear_synchro_queue to promote Serge Petrenko via Tarantool-patches
2021-04-12 19:24   ` Serge Petrenko via Tarantool-patches

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=f0f142cdbe8ef87bec48dbdfbf1e125a72ac50bd.1618163409.git.sergepetrenko@tarantool.org \
    --to=tarantool-patches@dev.tarantool.org \
    --cc=gorcunov@gmail.com \
    --cc=sergepetrenko@tarantool.org \
    --cc=v.shpilevoy@tarantool.org \
    --subject='Re: [Tarantool-patches] [PATCH 6/9] raft: keep track of greatest known term and filter replication sources based on that' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox