[Tarantool-patches] [PATCH 1/1] recovery: handle local sync txns during recovery

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Sat Jul 11 20:05:19 MSK 2020


Recovery uses txn_commit_async() so as not to block the recovery
process when a synchronous transaction is met. They are either
committed later when CONFIRM is read, or stay in the limbo after
recovery.

However txn_commit_async() assumed it is used for remote
transactions only, and had some assertions about that. One of them
crashed in case master restarted and had any synchronous
transaction in WAL.

The patch makes txn_commit_async() not assume anything about
transaction's origin.

Closes #5163
---
Branch: http://github.com/tarantool/tarantool/tree/gerold103/gh-5163-qsync-restart-crash
Issue: https://github.com/tarantool/tarantool/issues/5163

 src/box/txn.c                                 | 15 +++++++--
 src/box/txn_limbo.c                           | 10 ++++++
 src/box/txn_limbo.h                           | 14 ++++++++
 .../gh-5163-qsync-restart-crash.result        | 32 +++++++++++++++++++
 .../gh-5163-qsync-restart-crash.test.lua      | 14 ++++++++
 5 files changed, 83 insertions(+), 2 deletions(-)
 create mode 100644 test/replication/gh-5163-qsync-restart-crash.result
 create mode 100644 test/replication/gh-5163-qsync-restart-crash.test.lua

diff --git a/src/box/txn.c b/src/box/txn.c
index b3d41214d..39bc84a0c 100644
--- a/src/box/txn.c
+++ b/src/box/txn.c
@@ -756,8 +756,14 @@ txn_commit_async(struct txn *txn)
 
 		if (txn_has_flag(txn, TXN_WAIT_ACK)) {
 			int64_t lsn = req->rows[txn->n_applier_rows - 1]->lsn;
-			txn_limbo_assign_remote_lsn(&txn_limbo, limbo_entry,
-						    lsn);
+			/*
+			 * Can't tell whether it is local or not -
+			 * async commit is used both by applier
+			 * and during recovery. Use general LSN
+			 * assignment to let the limbo rule this
+			 * out.
+			 */
+			txn_limbo_assign_lsn(&txn_limbo, limbo_entry, lsn);
 		}
 
 		/*
@@ -844,6 +850,11 @@ txn_commit(struct txn *txn)
 	if (is_sync) {
 		if (txn_has_flag(txn, TXN_WAIT_ACK)) {
 			int64_t lsn = req->rows[req->n_rows - 1]->lsn;
+			/*
+			 * Use local LSN assignment. Because
+			 * blocking commit is used by local
+			 * transactions only.
+			 */
 			txn_limbo_assign_local_lsn(&txn_limbo, limbo_entry,
 						   lsn);
 			/* Local WAL write is a first 'ACK'. */
diff --git a/src/box/txn_limbo.c b/src/box/txn_limbo.c
index 207487a4d..9623b71cf 100644
--- a/src/box/txn_limbo.c
+++ b/src/box/txn_limbo.c
@@ -148,6 +148,16 @@ txn_limbo_assign_local_lsn(struct txn_limbo *limbo,
 	entry->ack_count = ack_count;
 }
 
+void
+txn_limbo_assign_lsn(struct txn_limbo *limbo, struct txn_limbo_entry *entry,
+		     int64_t lsn)
+{
+	if (limbo->instance_id == instance_id)
+		txn_limbo_assign_local_lsn(limbo, entry, lsn);
+	else
+		txn_limbo_assign_remote_lsn(limbo, entry, lsn);
+}
+
 static int
 txn_limbo_write_rollback(struct txn_limbo *limbo, int64_t lsn);
 
diff --git a/src/box/txn_limbo.h b/src/box/txn_limbo.h
index 54e1af75c..f88ec2bcc 100644
--- a/src/box/txn_limbo.h
+++ b/src/box/txn_limbo.h
@@ -188,6 +188,20 @@ void
 txn_limbo_assign_local_lsn(struct txn_limbo *limbo,
 			   struct txn_limbo_entry *entry, int64_t lsn);
 
+/**
+ * Assign an LSN to a limbo entry. Works both with local and
+ * remote transactions. The function exists to be used in a
+ * context, where a transaction is not known whether it is local
+ * or not. For example, when a transaction is committed not bound
+ * to any fiber (txn_commit_async()), it can be created by applier
+ * (then it is remote) or by recovery (then it is local). Besides,
+ * recovery can commit remote transactions as well, when works on
+ * a replica - it will recover data received from master.
+ */
+void
+txn_limbo_assign_lsn(struct txn_limbo *limbo, struct txn_limbo_entry *entry,
+		     int64_t lsn);
+
 /**
  * Ack all transactions up to the given LSN on behalf of the
  * replica with the specified ID.
diff --git a/test/replication/gh-5163-qsync-restart-crash.result b/test/replication/gh-5163-qsync-restart-crash.result
new file mode 100644
index 000000000..e57bc76d1
--- /dev/null
+++ b/test/replication/gh-5163-qsync-restart-crash.result
@@ -0,0 +1,32 @@
+-- test-run result file version 2
+test_run = require('test_run').new()
+ | ---
+ | ...
+engine = test_run:get_cfg('engine')
+ | ---
+ | ...
+
+--
+-- gh-5163: master during recovery treated local transactions as
+-- remote and crashed.
+--
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+
+box.space.sync:replace{1}
+ | ---
+ | - [1]
+ | ...
+test_run:cmd('restart server default')
+ | 
+box.space.sync:select{}
+ | ---
+ | - - [1]
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
diff --git a/test/replication/gh-5163-qsync-restart-crash.test.lua b/test/replication/gh-5163-qsync-restart-crash.test.lua
new file mode 100644
index 000000000..d5aca4749
--- /dev/null
+++ b/test/replication/gh-5163-qsync-restart-crash.test.lua
@@ -0,0 +1,14 @@
+test_run = require('test_run').new()
+engine = test_run:get_cfg('engine')
+
+--
+-- gh-5163: master during recovery treated local transactions as
+-- remote and crashed.
+--
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+_ = box.space.sync:create_index('pk')
+
+box.space.sync:replace{1}
+test_run:cmd('restart server default')
+box.space.sync:select{}
+box.space.sync:drop()
-- 
2.21.1 (Apple Git-122.3)



More information about the Tarantool-patches mailing list