Среда, 26 февраля 2020, 14:58 +03:00 от Konstantin Osipov <kostja.osipov@gmail.com>:
 
* Serge Petrenko <sergepetrenko@tarantool.org> [20/02/26 14:22]:
> I don’t think I can. The test that comes with an issue is a stress test,
> relying on running it with multiple workers simultaneously.
> It reproduces the problem when run with 4 workers on one of my PCs,
> and with 20 workers on the other.
> I think we don’t have the appropriate testing infrastructure to run the same
> test with multiple workers at the same time, and I couldn’t come up with a
> single test which would reproduce the same problem.

Is there a place in which you can inject a sleep to make the
problem much easier to reproduce?

What about injecting a sleep into the WAL code on the replica, at the
place that increments the local replicaset vclock?
 
Thanks for the suggestion! I hadn’t thought of it for some reason.
I’ve written a test; the diff is below.

Then you will be much more likely to receive a record from the
peer before you have incremented the record vclock locally, and the
bug will be reproducible with a single master.

--
Konstantin Osipov, Moscow, Russia
 

diff --git a/src/box/wal.c b/src/box/wal.c

index 27bff662a..35ba7b072 100644

--- a/src/box/wal.c

+++ b/src/box/wal.c

@@ -278,8 +278,13 @@ tx_schedule_commit(struct cmsg *msg)

  /* Closes the input valve. */

  stailq_concat(&writer->rollback, &batch->rollback);

  }

+

+ ERROR_INJECT(ERRINJ_REPLICASET_VCLOCK_UPDATE, { goto skip_update; });

  /* Update the tx vclock to the latest written by wal. */

  vclock_copy(&replicaset.vclock, &batch->vclock);

+#ifndef NDEBUG

+skip_update:

+#endif

  tx_schedule_queue(&batch->commit);

  mempool_free(&writer->msg_pool, container_of(msg, struct wal_msg, base));

}

diff --git a/src/lib/core/errinj.h b/src/lib/core/errinj.h

index ed0cba903..58fe158fd 100644

--- a/src/lib/core/errinj.h

+++ b/src/lib/core/errinj.h

@@ -136,7 +136,8 @@ struct errinj {

  _(ERRINJ_SWIM_FD_ONLY, ERRINJ_BOOL, {.bparam = false}) \

  _(ERRINJ_DYN_MODULE_COUNT, ERRINJ_INT, {.iparam = 0}) \

  _(ERRINJ_FIBER_MADVISE, ERRINJ_BOOL, {.bparam = false}) \

- _(ERRINJ_FIBER_MPROTECT, ERRINJ_INT, {.iparam = -1})

+ _(ERRINJ_FIBER_MPROTECT, ERRINJ_INT, {.iparam = -1}) \

+ _(ERRINJ_REPLICASET_VCLOCK_UPDATE, ERRINJ_BOOL, {.bparam = false}) \

 

ENUM0(errinj_id, ERRINJ_LIST);

extern struct errinj errinjs[];

diff --git a/test/box/errinj.result b/test/box/errinj.result

index daa27ed24..eb0905238 100644

--- a/test/box/errinj.result

+++ b/test/box/errinj.result

@@ -64,6 +64,7 @@ evals

   - ERRINJ_RELAY_REPORT_INTERVAL: 0

   - ERRINJ_RELAY_SEND_DELAY: false

   - ERRINJ_RELAY_TIMEOUT: 0

+  - ERRINJ_REPLICASET_VCLOCK_UPDATE: false

   - ERRINJ_REPLICA_JOIN_DELAY: false

   - ERRINJ_SIO_READ_MAX: -1

   - ERRINJ_SNAP_COMMIT_DELAY: false

diff --git a/test/replication/gh-4739-vclock-assert.result b/test/replication/gh-4739-vclock-assert.result

new file mode 100644

index 000000000..7dc2f7118

--- /dev/null

+++ b/test/replication/gh-4739-vclock-assert.result

@@ -0,0 +1,82 @@

+-- test-run result file version 2

+env = require('test_run')

+ | ---

+ | ...

+test_run = env.new()

+ | ---

+ | ...

+

+SERVERS = {'rebootstrap1', 'rebootstrap2'}

+ | ---

+ | ...

+test_run:create_cluster(SERVERS, "replication")

+ | ---

+ | ...

+test_run:wait_fullmesh(SERVERS)

+ | ---

+ | ...

+

+test_run:cmd('switch rebootstrap1')

+ | ---

+ | - true

+ | ...

+fiber = require('fiber')

+ | ---

+ | ...

+-- Stop updating replicaset vclock to simulate a situation, when

+-- a row is already relayed to the remote master, but the local

+-- vclock update hasn't happened yet.

+box.error.injection.set('ERRINJ_REPLICASET_VCLOCK_UPDATE', true)

+ | ---

+ | - ok

+ | ...

+lsn = box.info.lsn

+ | ---

+ | ...

+box.space._schema:replace{'something'}

+ | ---

+ | - ['something']

+ | ...

+-- Vclock isn't updated.

+box.info.lsn == lsn

+ | ---

+ | - true

+ | ...

+

+-- Wait until the remote instance gets the row.

+while test_run:get_vclock('rebootstrap2')[box.info.id] == lsn do\

+    fiber.sleep(0.01)\

+end

+ | ---

+ | ...

+

+-- Restart the remote instance. This will make the first instance

+-- resubscribe without entering orphan mode.

+test_run:cmd('restart server rebootstrap2')

+ | ---

+ | - true

+ | ...

+test_run:cmd('switch rebootstrap1')

+ | ---

+ | - true

+ | ...

+-- Wait until resubscribe is sent

+fiber.sleep(2 * box.cfg.replication_timeout)

+ | ---

+ | ...

+box.info.replication[2].upstream.status

+ | ---

+ | - sync

+ | ...

+

+box.error.injection.set('ERRINJ_REPLICASET_VCLOCK_UPDATE', false)

+ | ---

+ | - ok

+ | ...

+test_run:cmd('switch default')

+ | ---

+ | - true

+ | ...

+test_run:drop_cluster(SERVERS)

+ | ---

+ | ...

diff --git a/test/replication/gh-4739-vclock-assert.test.lua b/test/replication/gh-4739-vclock-assert.test.lua

new file mode 100644

index 000000000..26dc781e2

--- /dev/null

+++ b/test/replication/gh-4739-vclock-assert.test.lua

@@ -0,0 +1,34 @@

+env = require('test_run')

+test_run = env.new()

+

+SERVERS = {'rebootstrap1', 'rebootstrap2'}

+test_run:create_cluster(SERVERS, "replication")

+test_run:wait_fullmesh(SERVERS)

+

+test_run:cmd('switch rebootstrap1')

+fiber = require('fiber')

+-- Stop updating replicaset vclock to simulate a situation, when

+-- a row is already relayed to the remote master, but the local

+-- vclock update hasn't happened yet.

+box.error.injection.set('ERRINJ_REPLICASET_VCLOCK_UPDATE', true)

+lsn = box.info.lsn

+box.space._schema:replace{'something'}

+-- Vclock isn't updated.

+box.info.lsn == lsn

+

+-- Wait until the remote instance gets the row.

+while test_run:get_vclock('rebootstrap2')[box.info.id] == lsn do\

+    fiber.sleep(0.01)\

+end

+

+-- Restart the remote instance. This will make the first instance

+-- resubscribe without entering orphan mode.

+test_run:cmd('restart server rebootstrap2')

+test_run:cmd('switch rebootstrap1')

+-- Wait until resubscribe is sent

+fiber.sleep(2 * box.cfg.replication_timeout)

+box.info.replication[2].upstream.status

+

+box.error.injection.set('ERRINJ_REPLICASET_VCLOCK_UPDATE', false)

+test_run:cmd('switch default')

+test_run:drop_cluster(SERVERS)

diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg

index 429c64df3..90fd53ca6 100644

--- a/test/replication/suite.cfg

+++ b/test/replication/suite.cfg

@@ -15,6 +15,7 @@

     "gh-4402-info-errno.test.lua": {},

     "gh-4605-empty-password.test.lua": {},

     "gh-4606-admin-creds.test.lua": {},

+    "gh-4739-vclock-assert.test.lua": {},

     "*": {

         "memtx": {"engine": "memtx"},

         "vinyl": {"engine": "vinyl"}

diff --git a/test/replication/suite.ini b/test/replication/suite.ini

index ed1de3140..b4e09744a 100644

--- a/test/replication/suite.ini

+++ b/test/replication/suite.ini

@@ -3,7 +3,7 @@ core = tarantool

script =  master.lua

description = tarantool/box, replication

disabled = consistent.test.lua

-release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua

+release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua gh-4739-vclock-assert.test.lua

config = suite.cfg

lua_libs = lua/fast_replica.lua lua/rlimit.lua

use_unix_sockets = True

 
--
Serge Petrenko