[Tarantool-patches] [PATCH v22 0/3] qsync: implement packet filtering (part 1)

Cyrill Gorcunov gorcunov at gmail.com
Mon Oct 11 22:16:32 MSK 2021


Guys, please take a look once time permit, any comments are highly appreciated!

v19 (by Vlad):
 - do not modify box_issue_promote and demote (while they are
   still simply utter code duplication but whatever)
 - make txn_limbo_process being void
 - make txn_limbo_process_begin/commit/rollback being void
 - the real processing of request under the lock named as
   txn_limbo_process_core
 - testcase completely reworked (kudos to SergeP)
 - note that if we import test to the master branch without
   ordering pass it will fire assertion
 - dropped off debug info from box.info interface

v20 (by SergeP):
 - use guard for ACK processing and parameters change
 - rework test

v21 (by SergeP):
 - drop warning from txn_limbo_ack
 - rework test to use cluster helpers and ERRINJ_WAL_WRITE_COUNT
   error injection, same time drop modification of election_replica
   script
v22 (by SergeP):
 - use limbo emptiness test _after_ owner_id test
 - drop redundant assert in limbo commit/rollback
   since we're unlocking a latch anyway where own
   assertion present
 - in test: drop excessive wait_cond and setup wal
   delay earlier

branch gorcunov/gh-6036-rollback-confirm-22
issue https://github.com/tarantool/tarantool/issues/6036
previous series https://lists.tarantool.org/tarantool-patches/20211008175809.349501-1-gorcunov@gmail.com/

Cyrill Gorcunov (3):
  latch: add latch_is_locked helper
  qsync: order access to the limbo terms
  test: add gh-6036-qsync-order test

 src/box/applier.cc                            |  12 +-
 src/box/box.cc                                |  15 +-
 src/box/relay.cc                              |  11 +-
 src/box/txn.c                                 |   2 +-
 src/box/txn_limbo.c                           |  49 ++++-
 src/box/txn_limbo.h                           |  78 ++++++-
 src/lib/core/latch.h                          |  11 +
 test/replication/gh-6036-qsync-order.result   | 200 ++++++++++++++++++
 test/replication/gh-6036-qsync-order.test.lua |  96 +++++++++
 test/replication/suite.cfg                    |   1 +
 test/replication/suite.ini                    |   2 +-
 test/unit/snap_quorum_delay.cc                |   5 +-
 12 files changed, 451 insertions(+), 31 deletions(-)
 create mode 100644 test/replication/gh-6036-qsync-order.result
 create mode 100644 test/replication/gh-6036-qsync-order.test.lua


base-commit: ce5752ce235324fcefd5a3d0503fd3f8a1800d38
-- 
2.31.1

-- Summary patch against v21 --
diff --git a/src/box/txn_limbo.c b/src/box/txn_limbo.c
index 176a83cb5..8f9bc11c7 100644
--- a/src/box/txn_limbo.c
+++ b/src/box/txn_limbo.c
@@ -546,8 +546,6 @@ void
 txn_limbo_ack(struct txn_limbo *limbo, uint32_t owner_id,
 	      uint32_t replica_id, int64_t lsn)
 {
-	if (rlist_empty(&limbo->queue))
-		return;
 	/*
 	 * ACKs are collected only by the transactions originator
 	 * (which is the single master in 100% so far). Other instances
@@ -560,6 +558,15 @@ txn_limbo_ack(struct txn_limbo *limbo, uint32_t owner_id,
 	 */
 	if (!txn_limbo_is_owner(limbo, owner_id))
 		return;
+
+	/*
+	 * Test for empty queue is done _after_ txn_limbo_is_owner
+	 * call because we need to be sure that limbo is not been
+	 * changed under our feets while we're reading it.
+	 */
+	if (rlist_empty(&limbo->queue))
+		return;
+
 	/*
 	 * If limbo is currently writing a rollback, it means that the whole
 	 * queue will be rolled back. Because rollback is written only for
@@ -815,8 +822,6 @@ txn_limbo_process(struct txn_limbo *limbo,
 void
 txn_limbo_on_parameters_change(struct txn_limbo *limbo)
 {
-	if (rlist_empty(&limbo->queue))
-		return;
 	/*
 	 * In case if we're not current leader (ie not owning the
 	 * limbo) then we should not confirm anything, otherwise
@@ -826,6 +831,9 @@ txn_limbo_on_parameters_change(struct txn_limbo *limbo)
 	if (!txn_limbo_is_owner(limbo, instance_id))
 		return;
 
+	if (rlist_empty(&limbo->queue))
+		return;
+
 	struct txn_limbo_entry *e;
 	int64_t confirm_lsn = -1;
 	rlist_foreach_entry(e, &limbo->queue, in_queue) {
diff --git a/src/box/txn_limbo.h b/src/box/txn_limbo.h
index aaff444e4..33cacef8f 100644
--- a/src/box/txn_limbo.h
+++ b/src/box/txn_limbo.h
@@ -349,7 +349,6 @@ txn_limbo_process_begin(struct txn_limbo *limbo)
 static inline void
 txn_limbo_process_commit(struct txn_limbo *limbo)
 {
-	assert(latch_is_locked(&limbo->promote_latch));
 	latch_unlock(&limbo->promote_latch);
 }
 
@@ -357,7 +356,6 @@ txn_limbo_process_commit(struct txn_limbo *limbo)
 static inline void
 txn_limbo_process_rollback(struct txn_limbo *limbo)
 {
-	assert(latch_is_locked(&limbo->promote_latch));
 	latch_unlock(&limbo->promote_latch);
 }
 
diff --git a/test/replication/gh-6036-qsync-order.result b/test/replication/gh-6036-qsync-order.result
index eb3e808cb..464a131a4 100644
--- a/test/replication/gh-6036-qsync-order.result
+++ b/test/replication/gh-6036-qsync-order.result
@@ -76,10 +76,6 @@ test_run:switch("election_replica2")
  | ---
  | - true
  | ...
-test_run:wait_cond(function() return box.space.test:get{1} ~= nil end)
- | ---
- | - true
- | ...
 box.cfg({                                   \
     replication = {                         \
         "unix/:./election_replica2.sock",   \
@@ -106,6 +102,10 @@ test_run:switch("election_replica3")
 write_cnt = box.error.injection.get("ERRINJ_WAL_WRITE_COUNT")
  | ---
  | ...
+box.error.injection.set("ERRINJ_WAL_DELAY", true)
+ | ---
+ | - ok
+ | ...
 --
 -- Make election_replica2 been a leader and start writting data,
 -- the PROMOTE request get queued on election_replica3 and not
@@ -128,10 +128,6 @@ test_run:wait_cond(function() return box.error.injection.get("ERRINJ_WAL_WRITE_C
  | ---
  | - true
  | ...
-box.error.injection.set("ERRINJ_WAL_DELAY", true)
- | ---
- | - ok
- | ...
 test_run:switch("election_replica2")
  | ---
  | - true
diff --git a/test/replication/gh-6036-qsync-order.test.lua b/test/replication/gh-6036-qsync-order.test.lua
index b8df170b8..6350e9303 100644
--- a/test/replication/gh-6036-qsync-order.test.lua
+++ b/test/replication/gh-6036-qsync-order.test.lua
@@ -37,7 +37,6 @@ box.cfg({                                   \
 --
 -- Drop connection between election_replica2 and election_replica1.
 test_run:switch("election_replica2")
-test_run:wait_cond(function() return box.space.test:get{1} ~= nil end)
 box.cfg({                                   \
     replication = {                         \
         "unix/:./election_replica2.sock",   \
@@ -57,6 +56,7 @@ box.cfg({                                   \
 -- fall into forever sleep.
 test_run:switch("election_replica3")
 write_cnt = box.error.injection.get("ERRINJ_WAL_WRITE_COUNT")
+box.error.injection.set("ERRINJ_WAL_DELAY", true)
 --
 -- Make election_replica2 been a leader and start writting data,
 -- the PROMOTE request get queued on election_replica3 and not
@@ -68,7 +68,6 @@ test_run:switch("election_replica2")
 box.ctl.promote()
 test_run:switch("election_replica3")
 test_run:wait_cond(function() return box.error.injection.get("ERRINJ_WAL_WRITE_COUNT") > write_cnt end)
-box.error.injection.set("ERRINJ_WAL_DELAY", true)
 test_run:switch("election_replica2")
 _ = require('fiber').create(function() box.space.test:insert{2} end)
 


More information about the Tarantool-patches mailing list