Hi!
Thanks for the patch, LGTM.
--
Serge Petrenko
 
 
Вторник, 11 августа 2020, 1:18 +03:00 от Vladislav Shpilevoy <v.shpilevoy@tarantool.org>:
 
Box.snapshot() could include rolled back data in case synchronous
transaction ROLLBACK arrived during WAL rotation in preparation of
a checkpoint.

More specifically, snapshot consists of fixating the engines'
content (creation of a read-view), doing WAL rotation, and writing
the snapshot itself. All data changes after content fixation won't
go into the snap. So if ROLLBACK arrives during WAL rotation, the
fixated content will have rolled back data, not present in the
newest dataset.

The patch makes it fail if during WAL rotation anything was rolled
back. The bug sometimes appeared in an existing test about qsync
snapshots, but with a very poor reproducibility. In a new test
file it is reproduced 100% without the patch.

Closes #5167
---
Branch: http://github.com/tarantool/tarantool/tree/gerold103/gh-5167-flaky-qsync-snapshots
Issue: https://github.com/tarantool/tarantool/issues/5167

@ChangeLog
* Snapshot could contain changes from a rolled back synchronous transaction (gh-5167).

 src/box/gc.c | 12 +-
 src/box/wal.c | 4 +
 src/lib/core/errinj.h | 1 +
 test/box/errinj.result | 1 +
 .../gh-5167-qsync-rollback-snap.result | 165 ++++++++++++++++++
 .../gh-5167-qsync-rollback-snap.test.lua | 67 +++++++
 test/replication/suite.ini | 2 +-
 7 files changed, 250 insertions(+), 2 deletions(-)
 create mode 100644 test/replication/gh-5167-qsync-rollback-snap.result
 create mode 100644 test/replication/gh-5167-qsync-rollback-snap.test.lua

diff --git a/src/box/gc.c b/src/box/gc.c
index d50a64d66..76f7c6325 100644
--- a/src/box/gc.c
+++ b/src/box/gc.c
@@ -382,6 +382,7 @@ gc_do_checkpoint(bool is_scheduled)
 {
  int rc;
  struct wal_checkpoint checkpoint;
+ int64_t limbo_rollback_count = txn_limbo.rollback_count;
 
  assert(!gc.checkpoint_is_in_progress);
  gc.checkpoint_is_in_progress = true;
@@ -396,7 +397,16 @@ gc_do_checkpoint(bool is_scheduled)
  rc = wal_begin_checkpoint(&checkpoint);
  if (rc != 0)
  goto out;
-
+ /*
+ * Check if the checkpoint contains rolled back data. That makes the
+ * checkpoint not self-sufficient - it needs the xlog file with
+ * ROLLBACK. Drop it.
+ */
+ if (txn_limbo.rollback_count != limbo_rollback_count) {
+ rc = -1;
+ diag_set(ClientError, ER_SYNC_ROLLBACK);
+ goto out;
+ }
  /*
  * Wait the confirms on all "sync" transactions before
  * create a snapshot.
diff --git a/src/box/wal.c b/src/box/wal.c
index 220e68245..d8c92aa36 100644
--- a/src/box/wal.c
+++ b/src/box/wal.c
@@ -662,6 +662,7 @@ wal_begin_checkpoint_f(struct cbus_call_msg *data)
  }
  vclock_copy(&msg->vclock, &writer->vclock);
  msg->wal_size = writer->checkpoint_wal_size;
+ ERROR_INJECT_SLEEP(ERRINJ_WAL_DELAY);
  return 0;
 }
 
@@ -1272,6 +1273,9 @@ wal_write_async(struct journal *journal, struct journal_entry *entry)
  writer->last_entry = entry;
  batch->approx_len += entry->approx_len;
  writer->wal_pipe.n_input += entry->n_rows * XROW_IOVMAX;
+#ifndef NDEBUG
+ ++errinj(ERRINJ_WAL_WRITE_COUNT, ERRINJ_INT)->iparam;
+#endif
  cpipe_flush_input(&writer->wal_pipe);
  return 0;
 
diff --git a/src/lib/core/errinj.h b/src/lib/core/errinj.h
index aace8736f..814c57c2e 100644
--- a/src/lib/core/errinj.h
+++ b/src/lib/core/errinj.h
@@ -82,6 +82,7 @@ struct errinj {
  _(ERRINJ_WAL_DELAY, ERRINJ_BOOL, {.bparam = false}) \
  _(ERRINJ_WAL_DELAY_COUNTDOWN, ERRINJ_INT, {.iparam = -1}) \
  _(ERRINJ_WAL_FALLOCATE, ERRINJ_INT, {.iparam = 0}) \
+ _(ERRINJ_WAL_WRITE_COUNT, ERRINJ_INT, {.iparam = 0}) \
  _(ERRINJ_INDEX_ALLOC, ERRINJ_BOOL, {.bparam = false}) \
  _(ERRINJ_TUPLE_ALLOC, ERRINJ_BOOL, {.bparam = false}) \
  _(ERRINJ_TUPLE_FIELD, ERRINJ_BOOL, {.bparam = false}) \
diff --git a/test/box/errinj.result b/test/box/errinj.result
index 4bea0f46f..613d22c51 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -114,6 +114,7 @@ evals
   - ERRINJ_WAL_ROTATE: false
   - ERRINJ_WAL_SYNC: false
   - ERRINJ_WAL_WRITE: false
+ - ERRINJ_WAL_WRITE_COUNT: 3
   - ERRINJ_WAL_WRITE_DISK: false
   - ERRINJ_WAL_WRITE_EOF: false
   - ERRINJ_WAL_WRITE_PARTIAL: -1
diff --git a/test/replication/gh-5167-qsync-rollback-snap.result b/test/replication/gh-5167-qsync-rollback-snap.result
new file mode 100644
index 000000000..06f58526c
--- /dev/null
+++ b/test/replication/gh-5167-qsync-rollback-snap.result
@@ -0,0 +1,165 @@
+-- test-run result file version 2
+test_run = require('test_run').new()
+ | ---
+ | ...
+engine = test_run:get_cfg('engine')
+ | ---
+ | ...
+
+orig_synchro_quorum = box.cfg.replication_synchro_quorum
+ | ---
+ | ...
+orig_synchro_timeout = box.cfg.replication_synchro_timeout
+ | ---
+ | ...
+box.schema.user.grant('guest', 'super')
+ | ---
+ | ...
+
+test_run:cmd('create server replica with rpl_master=default,\
+ script="replication/replica.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server replica with wait=True, wait_load=True')
+ | ---
+ | - true
+ | ...
+
+--
+-- gh-5167:
+--
+fiber = require('fiber')
+ | ---
+ | ...
+box.cfg{replication_synchro_quorum = 2, replication_synchro_timeout = 1000}
+ | ---
+ | ...
+_ = box.schema.space.create('sync', {is_sync = true, engine = engine})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+-- Write something to flush the current master's state to replica.
+_ = box.space.sync:insert{1}
+ | ---
+ | ...
+_ = box.space.sync:delete{1}
+ | ---
+ | ...
+
+box.cfg{replication_synchro_quorum = 3}
+ | ---
+ | ...
+ok, err = nil
+ | ---
+ | ...
+f = fiber.create(function() \
+ ok, err = pcall(box.space.sync.insert, box.space.sync, {1}) \
+end)
+ | ---
+ | ...
+
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+fiber = require('fiber')
+ | ---
+ | ...
+test_run:wait_cond(function() return box.space.sync:count() == 1 end)
+ | ---
+ | - true
+ | ...
+-- Snapshot will stuck in WAL thread on rotation before starting wait on the
+-- limbo.
+box.error.injection.set("ERRINJ_WAL_DELAY", true)
+ | ---
+ | - ok
+ | ...
+wal_write_count = box.error.injection.get("ERRINJ_WAL_WRITE_COUNT")
+ | ---
+ | ...
+ok, err = nil
+ | ---
+ | ...
+f = fiber.create(function() ok, err = pcall(box.snapshot) end)
+ | ---
+ | ...
+
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_timeout = 0.0001}
+ | ---
+ | ...
+test_run:wait_cond(function() return f:status() == 'dead' end)
+ | ---
+ | - true
+ | ...
+ok, err
+ | ---
+ | - false
+ | - Quorum collection for a synchronous transaction is timed out
+ | ...
+box.space.sync:select{}
+ | ---
+ | - []
+ | ...
+
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+-- Rollback was received. Note, it is not legit to look for space:count() == 0.
+-- Because ideally ROLLBACK should not be applied before written to WAL. That
+-- means count() will be > 0 until WAL write succeeds.
+test_run:wait_cond(function() \
+ return box.error.injection.get("ERRINJ_WAL_WRITE_COUNT") > wal_write_count \
+end)
+ | ---
+ | - true
+ | ...
+-- Now WAL rotation is done. Snapshot will fail, because will see that a
+-- rollback happened during that. Meaning that the rotated WAL contains
+-- not confirmed data, and it can't be used as a checkpoint.
+box.error.injection.set("ERRINJ_WAL_DELAY", false)
+ | ---
+ | - ok
+ | ...
+test_run:wait_cond(function() return f:status() == 'dead' end)
+ | ---
+ | - true
+ | ...
+ok, err
+ | ---
+ | - false
+ | - A rollback for a synchronous transaction is received
+ | ...
+
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+test_run:cmd('stop server replica')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica')
+ | ---
+ | - true
+ | ...
+box.schema.user.revoke('guest', 'super')
+ | ---
+ | ...
+box.cfg{ \
+ replication_synchro_quorum = orig_synchro_quorum, \
+ replication_synchro_timeout = orig_synchro_timeout, \
+}
+ | ---
+ | ...
diff --git a/test/replication/gh-5167-qsync-rollback-snap.test.lua b/test/replication/gh-5167-qsync-rollback-snap.test.lua
new file mode 100644
index 000000000..475727e61
--- /dev/null
+++ b/test/replication/gh-5167-qsync-rollback-snap.test.lua
@@ -0,0 +1,67 @@
+test_run = require('test_run').new()
+engine = test_run:get_cfg('engine')
+
+orig_synchro_quorum = box.cfg.replication_synchro_quorum
+orig_synchro_timeout = box.cfg.replication_synchro_timeout
+box.schema.user.grant('guest', 'super')
+
+test_run:cmd('create server replica with rpl_master=default,\
+ script="replication/replica.lua"')
+test_run:cmd('start server replica with wait=True, wait_load=True')
+
+--
+-- gh-5167:
+--
+fiber = require('fiber')
+box.cfg{replication_synchro_quorum = 2, replication_synchro_timeout = 1000}
+_ = box.schema.space.create('sync', {is_sync = true, engine = engine})
+_ = box.space.sync:create_index('pk')
+-- Write something to flush the current master's state to replica.
+_ = box.space.sync:insert{1}
+_ = box.space.sync:delete{1}
+
+box.cfg{replication_synchro_quorum = 3}
+ok, err = nil
+f = fiber.create(function() \
+ ok, err = pcall(box.space.sync.insert, box.space.sync, {1}) \
+end)
+
+test_run:switch('replica')
+fiber = require('fiber')
+test_run:wait_cond(function() return box.space.sync:count() == 1 end)
+-- Snapshot will stuck in WAL thread on rotation before starting wait on the
+-- limbo.
+box.error.injection.set("ERRINJ_WAL_DELAY", true)
+wal_write_count = box.error.injection.get("ERRINJ_WAL_WRITE_COUNT")
+ok, err = nil
+f = fiber.create(function() ok, err = pcall(box.snapshot) end)
+
+test_run:switch('default')
+box.cfg{replication_synchro_timeout = 0.0001}
+test_run:wait_cond(function() return f:status() == 'dead' end)
+ok, err
+box.space.sync:select{}
+
+test_run:switch('replica')
+-- Rollback was received. Note, it is not legit to look for space:count() == 0.
+-- Because ideally ROLLBACK should not be applied before written to WAL. That
+-- means count() will be > 0 until WAL write succeeds.
+test_run:wait_cond(function() \
+ return box.error.injection.get("ERRINJ_WAL_WRITE_COUNT") > wal_write_count \
+end)
+-- Now WAL rotation is done. Snapshot will fail, because will see that a
+-- rollback happened during that. Meaning that the rotated WAL contains
+-- not confirmed data, and it can't be used as a checkpoint.
+box.error.injection.set("ERRINJ_WAL_DELAY", false)
+test_run:wait_cond(function() return f:status() == 'dead' end)
+ok, err
+
+test_run:switch('default')
+box.space.sync:drop()
+test_run:cmd('stop server replica')
+test_run:cmd('delete server replica')
+box.schema.user.revoke('guest', 'super')
+box.cfg{ \
+ replication_synchro_quorum = orig_synchro_quorum, \
+ replication_synchro_timeout = orig_synchro_timeout, \
+}
diff --git a/test/replication/suite.ini b/test/replication/suite.ini
index 73f73eb89..247289d58 100644
--- a/test/replication/suite.ini
+++ b/test/replication/suite.ini
@@ -3,7 +3,7 @@ core = tarantool
 script = master.lua
 description = tarantool/box, replication
 disabled = consistent.test.lua
-release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua qsync_advanced.test.lua qsync_errinj.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua gh-4739-vclock-assert.test.lua gh-4730-applier-rollback.test.lua gh-5140-qsync-casc-rollback.test.lua gh-5144-qsync-dup-confirm.test.lua
+release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua qsync_advanced.test.lua qsync_errinj.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua gh-4739-vclock-assert.test.lua gh-4730-applier-rollback.test.lua gh-5140-qsync-casc-rollback.test.lua gh-5144-qsync-dup-confirm.test.lua gh-5167-rollback-snap.test.lua
 config = suite.cfg
 lua_libs = lua/fast_replica.lua lua/rlimit.lua
 use_unix_sockets = True
--
2.21.1 (Apple Git-122.3)