[Tarantool-patches] [PATCH 1/2] txn: single failure point for WAL and TX commit errors
Vladislav Shpilevoy
v.shpilevoy at tarantool.org
Mon Jul 20 23:43:05 MSK 2020
Txn_commit() uses journal_write() function to send requests to WAL
thread. journal_write() can return 0/-1, but these codes have
nothing to do with the actual write result. journal_write() only
signals whether the interaction with WAL thread finished
successfully. It tells nothing about what happened in this
interaction.
To check WAL write result need to look at journal_entry.res field.
As a result, there were 2 failure points to handle. One of them
wasn't handled for synchronous transactions. Not counting 4
failure points in other places:
- When can't prepare the transaction before commit;
- When can't allocate a journal entry;
- When can't append a new entry to the qsync limbo;
- When synchronous transaction completion wait fails.
This patch merges all the failure points into one place for
txn_commit().
Closes #5146
---
src/box/txn.c | 55 +++++++++-----------
test/replication/qsync_errinj.result | 69 ++++++++++++++++++++++++++
test/replication/qsync_errinj.test.lua | 28 +++++++++++
3 files changed, 120 insertions(+), 32 deletions(-)
diff --git a/src/box/txn.c b/src/box/txn.c
index 62b91d6bb..375c01f85 100644
--- a/src/box/txn.c
+++ b/src/box/txn.c
@@ -797,11 +797,8 @@ txn_commit(struct txn *txn)
txn->fiber = fiber();
- if (txn_prepare(txn) != 0) {
- txn_rollback(txn);
- txn_free(txn);
- return -1;
- }
+ if (txn_prepare(txn) != 0)
+ goto rollback;
if (txn_commit_nop(txn)) {
txn_free(txn);
@@ -809,11 +806,8 @@ txn_commit(struct txn *txn)
}
req = txn_journal_entry_new(txn);
- if (req == NULL) {
- txn_rollback(txn);
- txn_free(txn);
- return -1;
- }
+ if (req == NULL)
+ goto rollback;
bool is_sync = txn_has_flag(txn, TXN_WAIT_SYNC);
if (is_sync) {
@@ -829,24 +823,17 @@ txn_commit(struct txn *txn)
* wouldn't be acceptable.
*/
limbo_entry = txn_limbo_append(&txn_limbo, origin_id, txn);
- if (limbo_entry == NULL) {
- txn_rollback(txn);
- txn_free(txn);
- return -1;
- }
+ if (limbo_entry == NULL)
+ goto rollback;
}
fiber_set_txn(fiber(), NULL);
- if (journal_write(req) != 0) {
+ if (journal_write(req) != 0 || req->res < 0) {
if (is_sync)
txn_limbo_abort(&txn_limbo, limbo_entry);
- fiber_set_txn(fiber(), txn);
- txn_rollback(txn);
- txn_free(txn);
-
diag_set(ClientError, ER_WAL_IO);
diag_log();
- return -1;
+ goto rollback;
}
if (is_sync) {
if (txn_has_flag(txn, TXN_WAIT_ACK)) {
@@ -861,25 +848,29 @@ txn_commit(struct txn *txn)
/* Local WAL write is a first 'ACK'. */
txn_limbo_ack(&txn_limbo, txn_limbo.instance_id, lsn);
}
- if (txn_limbo_wait_complete(&txn_limbo, limbo_entry) < 0) {
- txn_free(txn);
- return -1;
- }
+ if (txn_limbo_wait_complete(&txn_limbo, limbo_entry) < 0)
+ goto rollback;
}
if (!txn_has_flag(txn, TXN_IS_DONE)) {
txn->signature = req->res;
txn_complete(txn);
}
-
- int res = txn->signature >= 0 ? 0 : -1;
- if (res != 0) {
- diag_set(ClientError, ER_WAL_IO);
- diag_log();
- }
+ assert(txn->signature >= 0);
/* Synchronous transactions are freed by the calling fiber. */
txn_free(txn);
- return res;
+ return 0;
+
+rollback:
+ assert(txn->fiber != NULL);
+ if (!txn_has_flag(txn, TXN_IS_DONE)) {
+ fiber_set_txn(fiber(), txn);
+ txn_rollback(txn);
+ } else {
+ assert(in_txn() == NULL);
+ }
+ txn_free(txn);
+ return -1;
}
void
diff --git a/test/replication/qsync_errinj.result b/test/replication/qsync_errinj.result
index 49429cc80..1c6e357f7 100644
--- a/test/replication/qsync_errinj.result
+++ b/test/replication/qsync_errinj.result
@@ -214,6 +214,75 @@ box.space.sync:select{13}, box.space.sync:select{14}, box.space.sync:select{15}
| - - [15]
| ...
+--
+-- gh-5146: txn module should correctly handle errors from WAL
+-- thread, not only TX thread errors.
+--
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_quorum = 3, replication_synchro_timeout = 1000}
+ | ---
+ | ...
+box.space.sync:truncate()
+ | ---
+ | ...
+-- Make a next row stuck in WAL thread.
+box.error.injection.set("ERRINJ_WAL_DELAY", true)
+ | ---
+ | - ok
+ | ...
+ok1, err1 = nil
+ | ---
+ | ...
+f1 = fiber.create(function() \
+ ok1, err1 = pcall(box.space.sync.replace, box.space.sync, {1}) \
+end)
+ | ---
+ | ...
+-- When the suspended write will be set free, it will stumble into
+-- an error.
+box.error.injection.set("ERRINJ_WAL_ROTATE", true)
+ | ---
+ | - ok
+ | ...
+-- Set the suspected write free. TX thread will receive success
+-- in terms of talking to WAL, but will get error inside the WAL
+-- response.
+box.error.injection.set("ERRINJ_WAL_DELAY", false)
+ | ---
+ | - ok
+ | ...
+test_run:wait_cond(function() return f1:status() == 'dead' end)
+ | ---
+ | - true
+ | ...
+ok1, err1
+ | ---
+ | - false
+ | - Failed to write to disk
+ | ...
+box.error.injection.set("ERRINJ_WAL_ROTATE", false)
+ | ---
+ | - ok
+ | ...
+box.cfg{replication_synchro_quorum = 2}
+ | ---
+ | ...
+box.space.sync:replace{2}
+ | ---
+ | - [2]
+ | ...
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+box.space.sync:select{}
+ | ---
+ | - - [2]
+ | ...
+
test_run:cmd('switch default')
| ---
| - true
diff --git a/test/replication/qsync_errinj.test.lua b/test/replication/qsync_errinj.test.lua
index fe8bb4387..da554c775 100644
--- a/test/replication/qsync_errinj.test.lua
+++ b/test/replication/qsync_errinj.test.lua
@@ -77,6 +77,34 @@ box.space.sync:select{13}, box.space.sync:select{14}, box.space.sync:select{15}
test_run:switch('replica')
box.space.sync:select{13}, box.space.sync:select{14}, box.space.sync:select{15}
+--
+-- gh-5146: txn module should correctly handle errors from WAL
+-- thread, not only TX thread errors.
+--
+test_run:switch('default')
+box.cfg{replication_synchro_quorum = 3, replication_synchro_timeout = 1000}
+box.space.sync:truncate()
+-- Make a next row stuck in WAL thread.
+box.error.injection.set("ERRINJ_WAL_DELAY", true)
+ok1, err1 = nil
+f1 = fiber.create(function() \
+ ok1, err1 = pcall(box.space.sync.replace, box.space.sync, {1}) \
+end)
+-- When the suspended write will be set free, it will stumble into
+-- an error.
+box.error.injection.set("ERRINJ_WAL_ROTATE", true)
+-- Set the suspected write free. TX thread will receive success
+-- in terms of talking to WAL, but will get error inside the WAL
+-- response.
+box.error.injection.set("ERRINJ_WAL_DELAY", false)
+test_run:wait_cond(function() return f1:status() == 'dead' end)
+ok1, err1
+box.error.injection.set("ERRINJ_WAL_ROTATE", false)
+box.cfg{replication_synchro_quorum = 2}
+box.space.sync:replace{2}
+test_run:switch('replica')
+box.space.sync:select{}
+
test_run:cmd('switch default')
box.cfg{ \
--
2.21.1 (Apple Git-122.3)
More information about the Tarantool-patches
mailing list