Box.snapshot() could include rolled back data in case synchronous
transaction ROLLBACK arrived during WAL rotation in preparation of
a checkpoint.
More specifically, snapshot consists of fixating the engines'
content (creation of a read-view), doing WAL rotation, and writing
the snapshot itself. All data changes after content fixation won't
go into the snap. So if ROLLBACK arrives during WAL rotation, the
fixated content will have rolled back data, not present in the
newest dataset.
The patch makes it fail if during WAL rotation anything was rolled
back. The bug sometimes appeared in an existing test about qsync
snapshots, but with a very poor reproducibility. In a new test
file it is reproduced 100% without the patch.
Closes #5167
---
Branch:
http://github.com/tarantool/tarantool/tree/gerold103/gh-5167-flaky-qsync-snapshotsIssue:
https://github.com/tarantool/tarantool/issues/5167@ChangeLog
* Snapshot could contain changes from a rolled back synchronous transaction (gh-5167).
src/box/gc.c | 12 +-
src/box/wal.c | 4 +
src/lib/core/errinj.h | 1 +
test/box/errinj.result | 1 +
.../gh-5167-qsync-rollback-snap.result | 165 ++++++++++++++++++
.../gh-5167-qsync-rollback-snap.test.lua | 67 +++++++
test/replication/suite.ini | 2 +-
7 files changed, 250 insertions(+), 2 deletions(-)
create mode 100644 test/replication/gh-5167-qsync-rollback-snap.result
create mode 100644 test/replication/gh-5167-qsync-rollback-snap.test.lua
diff --git a/src/box/gc.c b/src/box/gc.c
index d50a64d66..76f7c6325 100644
--- a/src/box/gc.c
+++ b/src/box/gc.c
@@ -382,6 +382,7 @@ gc_do_checkpoint(bool is_scheduled)
{
int rc;
struct wal_checkpoint checkpoint;
+ int64_t limbo_rollback_count = txn_limbo.rollback_count;
assert(!gc.checkpoint_is_in_progress);
gc.checkpoint_is_in_progress = true;
@@ -396,7 +397,16 @@ gc_do_checkpoint(bool is_scheduled)
rc = wal_begin_checkpoint(&checkpoint);
if (rc != 0)
goto out;
-
+ /*
+ * Check if the checkpoint contains rolled back data. That makes the
+ * checkpoint not self-sufficient - it needs the xlog file with
+ * ROLLBACK. Drop it.
+ */
+ if (txn_limbo.rollback_count != limbo_rollback_count) {
+ rc = -1;
+ diag_set(ClientError, ER_SYNC_ROLLBACK);
+ goto out;
+ }
/*
* Wait the confirms on all "sync" transactions before
* create a snapshot.
diff --git a/src/box/wal.c b/src/box/wal.c
index 220e68245..d8c92aa36 100644
--- a/src/box/wal.c
+++ b/src/box/wal.c
@@ -662,6 +662,7 @@ wal_begin_checkpoint_f(struct cbus_call_msg *data)
}
vclock_copy(&msg->vclock, &writer->vclock);
msg->wal_size = writer->checkpoint_wal_size;
+ ERROR_INJECT_SLEEP(ERRINJ_WAL_DELAY);
return 0;
}
@@ -1272,6 +1273,9 @@ wal_write_async(struct journal *journal, struct journal_entry *entry)
writer->last_entry = entry;
batch->approx_len += entry->approx_len;
writer->wal_pipe.n_input += entry->n_rows * XROW_IOVMAX;
+#ifndef NDEBUG
+ ++errinj(ERRINJ_WAL_WRITE_COUNT, ERRINJ_INT)->iparam;
+#endif
cpipe_flush_input(&writer->wal_pipe);
return 0;
diff --git a/src/lib/core/errinj.h b/src/lib/core/errinj.h
index aace8736f..814c57c2e 100644
--- a/src/lib/core/errinj.h
+++ b/src/lib/core/errinj.h
@@ -82,6 +82,7 @@ struct errinj {
_(ERRINJ_WAL_DELAY, ERRINJ_BOOL, {.bparam = false}) \
_(ERRINJ_WAL_DELAY_COUNTDOWN, ERRINJ_INT, {.iparam = -1}) \
_(ERRINJ_WAL_FALLOCATE, ERRINJ_INT, {.iparam = 0}) \
+ _(ERRINJ_WAL_WRITE_COUNT, ERRINJ_INT, {.iparam = 0}) \
_(ERRINJ_INDEX_ALLOC, ERRINJ_BOOL, {.bparam = false}) \
_(ERRINJ_TUPLE_ALLOC, ERRINJ_BOOL, {.bparam = false}) \
_(ERRINJ_TUPLE_FIELD, ERRINJ_BOOL, {.bparam = false}) \
diff --git a/test/box/errinj.result b/test/box/errinj.result
index 4bea0f46f..613d22c51 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -114,6 +114,7 @@ evals
- ERRINJ_WAL_ROTATE: false
- ERRINJ_WAL_SYNC: false
- ERRINJ_WAL_WRITE: false
+ - ERRINJ_WAL_WRITE_COUNT: 3
- ERRINJ_WAL_WRITE_DISK: false
- ERRINJ_WAL_WRITE_EOF: false
- ERRINJ_WAL_WRITE_PARTIAL: -1
diff --git a/test/replication/gh-5167-qsync-rollback-snap.result b/test/replication/gh-5167-qsync-rollback-snap.result
new file mode 100644
index 000000000..06f58526c
--- /dev/null
+++ b/test/replication/gh-5167-qsync-rollback-snap.result
@@ -0,0 +1,165 @@
+-- test-run result file version 2
+test_run = require('test_run').new()
+ | ---
+ | ...
+engine = test_run:get_cfg('engine')
+ | ---
+ | ...
+
+orig_synchro_quorum = box.cfg.replication_synchro_quorum
+ | ---
+ | ...
+orig_synchro_timeout = box.cfg.replication_synchro_timeout
+ | ---
+ | ...
+box.schema.user.grant('guest', 'super')
+ | ---
+ | ...
+
+test_run:cmd('create server replica with rpl_master=default,\
+ script="replication/replica.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server replica with wait=True, wait_load=True')
+ | ---
+ | - true
+ | ...
+
+--
+-- gh-5167:
+--
+fiber = require('fiber')
+ | ---
+ | ...
+box.cfg{replication_synchro_quorum = 2, replication_synchro_timeout = 1000}
+ | ---
+ | ...
+_ = box.schema.space.create('sync', {is_sync = true, engine = engine})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+-- Write something to flush the current master's state to replica.
+_ = box.space.sync:insert{1}
+ | ---
+ | ...
+_ = box.space.sync:delete{1}
+ | ---
+ | ...
+
+box.cfg{replication_synchro_quorum = 3}
+ | ---
+ | ...
+ok, err = nil
+ | ---
+ | ...
+f = fiber.create(function() \
+ ok, err = pcall(box.space.sync.insert, box.space.sync, {1}) \
+end)
+ | ---
+ | ...
+
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+fiber = require('fiber')
+ | ---
+ | ...
+test_run:wait_cond(function() return box.space.sync:count() == 1 end)
+ | ---
+ | - true
+ | ...
+-- Snapshot will stuck in WAL thread on rotation before starting wait on the
+-- limbo.
+box.error.injection.set("ERRINJ_WAL_DELAY", true)
+ | ---
+ | - ok
+ | ...
+wal_write_count = box.error.injection.get("ERRINJ_WAL_WRITE_COUNT")
+ | ---
+ | ...
+ok, err = nil
+ | ---
+ | ...
+f = fiber.create(function() ok, err = pcall(box.snapshot) end)
+ | ---
+ | ...
+
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_timeout = 0.0001}
+ | ---
+ | ...
+test_run:wait_cond(function() return f:status() == 'dead' end)
+ | ---
+ | - true
+ | ...
+ok, err
+ | ---
+ | - false
+ | - Quorum collection for a synchronous transaction is timed out
+ | ...
+box.space.sync:select{}
+ | ---
+ | - []
+ | ...
+
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+-- Rollback was received. Note, it is not legit to look for space:count() == 0.
+-- Because ideally ROLLBACK should not be applied before written to WAL. That
+-- means count() will be > 0 until WAL write succeeds.
+test_run:wait_cond(function() \
+ return box.error.injection.get("ERRINJ_WAL_WRITE_COUNT") > wal_write_count \
+end)
+ | ---
+ | - true
+ | ...
+-- Now WAL rotation is done. Snapshot will fail, because will see that a
+-- rollback happened during that. Meaning that the rotated WAL contains
+-- not confirmed data, and it can't be used as a checkpoint.
+box.error.injection.set("ERRINJ_WAL_DELAY", false)
+ | ---
+ | - ok
+ | ...
+test_run:wait_cond(function() return f:status() == 'dead' end)
+ | ---
+ | - true
+ | ...
+ok, err
+ | ---
+ | - false
+ | - A rollback for a synchronous transaction is received
+ | ...
+
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+test_run:cmd('stop server replica')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica')
+ | ---
+ | - true
+ | ...
+box.schema.user.revoke('guest', 'super')
+ | ---
+ | ...
+box.cfg{ \
+ replication_synchro_quorum = orig_synchro_quorum, \
+ replication_synchro_timeout = orig_synchro_timeout, \
+}
+ | ---
+ | ...
diff --git a/test/replication/gh-5167-qsync-rollback-snap.test.lua b/test/replication/gh-5167-qsync-rollback-snap.test.lua
new file mode 100644
index 000000000..475727e61
--- /dev/null
+++ b/test/replication/gh-5167-qsync-rollback-snap.test.lua
@@ -0,0 +1,67 @@
+test_run = require('test_run').new()
+engine = test_run:get_cfg('engine')
+
+orig_synchro_quorum = box.cfg.replication_synchro_quorum
+orig_synchro_timeout = box.cfg.replication_synchro_timeout
+box.schema.user.grant('guest', 'super')
+
+test_run:cmd('create server replica with rpl_master=default,\
+ script="replication/replica.lua"')
+test_run:cmd('start server replica with wait=True, wait_load=True')
+
+--
+-- gh-5167:
+--
+fiber = require('fiber')
+box.cfg{replication_synchro_quorum = 2, replication_synchro_timeout = 1000}
+_ = box.schema.space.create('sync', {is_sync = true, engine = engine})
+_ = box.space.sync:create_index('pk')
+-- Write something to flush the current master's state to replica.
+_ = box.space.sync:insert{1}
+_ = box.space.sync:delete{1}
+
+box.cfg{replication_synchro_quorum = 3}
+ok, err = nil
+f = fiber.create(function() \
+ ok, err = pcall(box.space.sync.insert, box.space.sync, {1}) \
+end)
+
+test_run:switch('replica')
+fiber = require('fiber')
+test_run:wait_cond(function() return box.space.sync:count() == 1 end)
+-- Snapshot will stuck in WAL thread on rotation before starting wait on the
+-- limbo.
+box.error.injection.set("ERRINJ_WAL_DELAY", true)
+wal_write_count = box.error.injection.get("ERRINJ_WAL_WRITE_COUNT")
+ok, err = nil
+f = fiber.create(function() ok, err = pcall(box.snapshot) end)
+
+test_run:switch('default')
+box.cfg{replication_synchro_timeout = 0.0001}
+test_run:wait_cond(function() return f:status() == 'dead' end)
+ok, err
+box.space.sync:select{}
+
+test_run:switch('replica')
+-- Rollback was received. Note, it is not legit to look for space:count() == 0.
+-- Because ideally ROLLBACK should not be applied before written to WAL. That
+-- means count() will be > 0 until WAL write succeeds.
+test_run:wait_cond(function() \
+ return box.error.injection.get("ERRINJ_WAL_WRITE_COUNT") > wal_write_count \
+end)
+-- Now WAL rotation is done. Snapshot will fail, because will see that a
+-- rollback happened during that. Meaning that the rotated WAL contains
+-- not confirmed data, and it can't be used as a checkpoint.
+box.error.injection.set("ERRINJ_WAL_DELAY", false)
+test_run:wait_cond(function() return f:status() == 'dead' end)
+ok, err
+
+test_run:switch('default')
+box.space.sync:drop()
+test_run:cmd('stop server replica')
+test_run:cmd('delete server replica')
+box.schema.user.revoke('guest', 'super')
+box.cfg{ \
+ replication_synchro_quorum = orig_synchro_quorum, \
+ replication_synchro_timeout = orig_synchro_timeout, \
+}
diff --git a/test/replication/suite.ini b/test/replication/suite.ini
index 73f73eb89..247289d58 100644
--- a/test/replication/suite.ini
+++ b/test/replication/suite.ini
@@ -3,7 +3,7 @@ core = tarantool
script = master.lua
description = tarantool/box, replication
disabled = consistent.test.lua
-release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua qsync_advanced.test.lua qsync_errinj.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua gh-4739-vclock-assert.test.lua gh-4730-applier-rollback.test.lua gh-5140-qsync-casc-rollback.test.lua gh-5144-qsync-dup-confirm.test.lua
+release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua qsync_advanced.test.lua qsync_errinj.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua gh-4739-vclock-assert.test.lua gh-4730-applier-rollback.test.lua gh-5140-qsync-casc-rollback.test.lua gh-5144-qsync-dup-confirm.test.lua gh-5167-rollback-snap.test.lua
config = suite.cfg
lua_libs = lua/fast_replica.lua lua/rlimit.lua
use_unix_sockets = True
--
2.21.1 (Apple Git-122.3)