From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtpng3.m.smailru.net (smtpng3.m.smailru.net [94.100.177.149]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id 66C01445320 for ; Thu, 30 Jul 2020 02:39:07 +0300 (MSK) From: Vladislav Shpilevoy Date: Thu, 30 Jul 2020 01:39:05 +0200 Message-Id: <04de5b4a0f7859909155cb9f52e9fd45a338a0e8.1596065870.git.v.shpilevoy@tarantool.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [Tarantool-patches] [PATCH 1/1] txn_limbo: panic when synchro WAL write fails List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: tarantool-patches@dev.tarantool.org, gorcunov@gmail.com CONFIRM and ROLLBACK go to WAL. Their WAL write can fail just like any other WAL write. However it is not clear what to do in that case, especially in case of ROLLBACK fail. The patch adds panic() stub so as to at least terminate the instance. Before the patch it would work like nothing happened, with undefined behaviour. Closes #5159 --- Branch: http://github.com/tarantool/tarantool/tree/gerold103/gh-5159-qsync-panic Issue: https://github.com/tarantool/tarantool/issues/5159 @ChangeLog * Instance will terminate if a synchronous transaction confirmation or rollback fail. Before it was undefined behaviour (gh-5159). src/box/txn_limbo.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/src/box/txn_limbo.c b/src/box/txn_limbo.c index b6725ae21..4593abef2 100644 --- a/src/box/txn_limbo.c +++ b/src/box/txn_limbo.c @@ -178,7 +178,7 @@ txn_limbo_assign_lsn(struct txn_limbo *limbo, struct txn_limbo_entry *entry, txn_limbo_assign_remote_lsn(limbo, entry, lsn); } -static int +static void txn_limbo_write_rollback(struct txn_limbo *limbo, int64_t lsn); int @@ -258,7 +258,7 @@ complete: return 0; } -static int +static void txn_limbo_write_confirm_rollback(struct txn_limbo *limbo, int64_t lsn, bool is_confirm) { @@ -271,7 +271,7 @@ txn_limbo_write_confirm_rollback(struct txn_limbo *limbo, int64_t lsn, struct txn *txn = txn_begin(); if (txn == NULL) - return -1; + goto rollback; int res = 0; if (is_confirm) { @@ -299,23 +299,32 @@ txn_limbo_write_confirm_rollback(struct txn_limbo *limbo, int64_t lsn, goto rollback; if (txn_commit_stmt(txn, &request) != 0) goto rollback; + if (txn_commit(txn) != 0) + goto rollback; + return; - return txn_commit(txn); rollback: - txn_rollback(txn); - return -1; + /* + * XXX: the stub is supposed to be removed once it is defined what to do + * when a synchro request WAL write fails. One of the possible + * solutions: log the error, keep the limbo queue as is and probably put + * in rollback mode. Then provide a hook to call manually when WAL + * problems are fixed. Or retry automatically with some period. + */ + panic("Could write a synchro request to WAL: lsn = %lld, type = %s\n", + lsn, is_confirm ? "CONFIRM" : "ROLLBACK"); } /** * Write a confirmation entry to WAL. After it's written all the * transactions waiting for confirmation may be finished. */ -static int +static void txn_limbo_write_confirm(struct txn_limbo *limbo, int64_t lsn) { assert(lsn > limbo->confirmed_lsn); limbo->confirmed_lsn = lsn; - return txn_limbo_write_confirm_rollback(limbo, lsn, true); + txn_limbo_write_confirm_rollback(limbo, lsn, true); } void @@ -361,15 +370,14 @@ txn_limbo_read_confirm(struct txn_limbo *limbo, int64_t lsn) * transactions following the current one and waiting for * confirmation must be rolled back. */ -static int +static void txn_limbo_write_rollback(struct txn_limbo *limbo, int64_t lsn) { assert(lsn > limbo->confirmed_lsn); assert(!limbo->is_in_rollback); limbo->is_in_rollback = true; - int rc = txn_limbo_write_confirm_rollback(limbo, lsn, false); + txn_limbo_write_confirm_rollback(limbo, lsn, false); limbo->is_in_rollback = false; - return rc; } void @@ -445,13 +453,7 @@ txn_limbo_ack(struct txn_limbo *limbo, uint32_t replica_id, int64_t lsn) } if (confirm_lsn == -1 || confirm_lsn <= limbo->confirmed_lsn) return; - if (txn_limbo_write_confirm(limbo, confirm_lsn) != 0) { - // TODO: what to do here?. - // We already failed writing the CONFIRM - // message. What are the chances we'll be - // able to write ROLLBACK? - return; - } + txn_limbo_write_confirm(limbo, confirm_lsn); txn_limbo_read_confirm(limbo, confirm_lsn); } @@ -587,10 +589,7 @@ txn_limbo_on_parameters_change(struct txn_limbo *limbo) } } if (confirm_lsn > limbo->confirmed_lsn) { - if (txn_limbo_write_confirm(limbo, confirm_lsn) != 0) { - panic("Couldn't write CONFIRM to WAL"); - return; - } + txn_limbo_write_confirm(limbo, confirm_lsn); txn_limbo_read_confirm(limbo, confirm_lsn); } /* -- 2.21.1 (Apple Git-122.3)