From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtp59.i.mail.ru (smtp59.i.mail.ru [217.69.128.39]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id E6E554696C3 for ; Tue, 7 Apr 2020 13:26:35 +0300 (MSK) Content-Type: text/plain; charset=utf-8 Mime-Version: 1.0 (Mac OS X Mail 13.4 \(3608.80.23.2.2\)) From: Serge Petrenko In-Reply-To: <20200404161524.7466-9-gorcunov@gmail.com> Date: Tue, 7 Apr 2020 13:26:34 +0300 Content-Transfer-Encoding: quoted-printable Message-Id: References: <20200404161524.7466-1-gorcunov@gmail.com> <20200404161524.7466-9-gorcunov@gmail.com> Subject: Re: [Tarantool-patches] [PATCH v11 8/8] test: add replication/applier-rollback List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Cyrill Gorcunov Cc: tml Hi! Thanks for the patch! Please find my comments below. > 4 =D0=B0=D0=BF=D1=80. 2020 =D0=B3., =D0=B2 19:15, Cyrill Gorcunov = =D0=BD=D0=B0=D0=BF=D0=B8=D1=81=D0=B0=D0=BB(=D0=B0): >=20 > Test that diag_raise doesn't happen if async transaction > fails inside replication procedure. >=20 > Side note: I don't like merging tests with patches in > general and I hate doing so for big tests with a passion > because it hides the patch code itself. So here is a > separate patch on top of the fix. I like that. It was easy to check the test without the fix. >=20 > Test-of #4730 >=20 > Signed-off-by: Cyrill Gorcunov > --- > src/box/txn.c | 13 ++ > src/lib/core/errinj.h | 1 + > test/box/errinj.result | 1 + > test/replication/applier-rollback-slave.lua | 16 ++ > test/replication/applier-rollback.result | 162 ++++++++++++++++++++ > test/replication/applier-rollback.test.lua | 81 ++++++++++ It=E2=80=99s a bugfix, so please name the test = gh-4730-something-something.test.lua > test/replication/suite.ini | 2 +- > 7 files changed, 275 insertions(+), 1 deletion(-) > create mode 100644 test/replication/applier-rollback-slave.lua > create mode 100644 test/replication/applier-rollback.result > create mode 100644 test/replication/applier-rollback.test.lua >=20 > diff --git a/src/box/txn.c b/src/box/txn.c > index f9c3e3675..488aa4bdd 100644 > --- a/src/box/txn.c > +++ b/src/box/txn.c > @@ -34,6 +34,7 @@ > #include "journal.h" > #include > #include "xrow.h" > +#include "errinj.h" >=20 > double too_long_threshold; >=20 > @@ -576,6 +577,18 @@ txn_commit_async(struct txn *txn) > { > struct journal_entry *req; >=20 > + ERROR_INJECT(ERRINJ_TXN_COMMIT_ASYNC, { > + diag_set(ClientError, ER_INJECTION, > + "txn commit async injection"); > + /* > + * Log it for the testing sake: we grep > + * output to mark this event. > + */ > + diag_log(); > + txn_rollback(txn); > + return -1; > + }); > + > if (txn_prepare(txn) !=3D 0) { > txn_rollback(txn); > return -1; > diff --git a/src/lib/core/errinj.h b/src/lib/core/errinj.h > index ee6c57a0d..7577ed11a 100644 > --- a/src/lib/core/errinj.h > +++ b/src/lib/core/errinj.h > @@ -139,6 +139,7 @@ struct errinj { > _(ERRINJ_FIBER_MPROTECT, ERRINJ_INT, {.iparam =3D -1}) \ > _(ERRINJ_RELAY_FASTER_THAN_TX, ERRINJ_BOOL, {.bparam =3D false}) = \ > _(ERRINJ_INDEX_RESERVE, ERRINJ_BOOL, {.bparam =3D false})\ > + _(ERRINJ_TXN_COMMIT_ASYNC, ERRINJ_BOOL, {.bparam =3D false})\ >=20 > ENUM0(errinj_id, ERRINJ_LIST); > extern struct errinj errinjs[]; > diff --git a/test/box/errinj.result b/test/box/errinj.result > index 0d3fedeb3..de877b708 100644 > --- a/test/box/errinj.result > +++ b/test/box/errinj.result > @@ -76,6 +76,7 @@ evals > - ERRINJ_TUPLE_ALLOC: false > - ERRINJ_TUPLE_FIELD: false > - ERRINJ_TUPLE_FORMAT_COUNT: -1 > + - ERRINJ_TXN_COMMIT_ASYNC: false > - ERRINJ_VYRUN_DATA_READ: false > - ERRINJ_VY_COMPACTION_DELAY: false > - ERRINJ_VY_DELAY_PK_LOOKUP: false > diff --git a/test/replication/applier-rollback-slave.lua = b/test/replication/applier-rollback-slave.lua > new file mode 100644 > index 000000000..26fb10055 > --- /dev/null > +++ b/test/replication/applier-rollback-slave.lua Better name it replica_applier_rollback.lua for the sake of consistency with other instance file names. > @@ -0,0 +1,16 @@ > +-- > +-- vim: ts=3D4 sw=3D4 et > +-- > + > +print('arg', arg) > + > +box.cfg({ > + replication =3D os.getenv("MASTER"), > + listen =3D os.getenv("LISTEN"), > + memtx_memory =3D 107374182, > + replication_timeout =3D 0.1, > + replication_connect_timeout =3D 0.5, > + read_only =3D true, > +}) > + > +require('console').listen(os.getenv('ADMIN')) > diff --git a/test/replication/applier-rollback.result = b/test/replication/applier-rollback.result > new file mode 100644 > index 000000000..3c659f460 > --- /dev/null > +++ b/test/replication/applier-rollback.result > @@ -0,0 +1,162 @@ > +-- test-run result file version 2 > +#!/usr/bin/env tarantool > + | --- > + | ... > +-- > +-- vim: ts=3D4 sw=3D4 et > +-- > + > +test_run =3D require('test_run').new() > + | --- > + | ... > + > +errinj =3D box.error.injection > + | --- > + | ... > +engine =3D test_run:get_cfg('engine') > + | --- > + | ... > + > +-- > +-- Allow replica to connect to us > +box.schema.user.grant('guest', 'replication') > + | --- > + | ... > + > +-- > +-- Create replica instance, we're the master and > +-- start it, no data to sync yet though > +test_run:cmd("create server replica_slave with rpl_master=3Ddefault, = script=3D'replication/applier-rollback-slave.lua'") > + | --- > + | - true > + | ... > +test_run:cmd("start server replica_slave") > + | --- > + | - true > + | ... > + > +-- > +-- Fill initial data on the master instance > +test_run:cmd('switch default') > + | --- > + | - true > + | ... > + > +_ =3D box.schema.space.create('test', {engine=3Dengine}) > + | --- > + | ... > +s =3D box.space.test > + | --- > + | ... > + > +s:format({{name =3D 'id', type =3D 'unsigned'}, {name =3D = 'band_name', type =3D 'string'}}) > + | --- > + | ... > + > +_ =3D s:create_index('primary', {type =3D 'tree', parts =3D {'id'}}) > + | --- > + | ... > +s:insert({1, '1'}) > + | --- > + | - [1, '1'] > + | ... > +s:insert({2, '2'}) > + | --- > + | - [2, '2'] > + | ... > +s:insert({3, '3'}) > + | --- > + | - [3, '3'] > + | ... > + > +-- > +-- To make sure we're running > +box.info.status > + | --- > + | - running > + | ... > + > +-- > +-- Wait for data from master get propagated > +test_run:wait_lsn('replica_slave', 'default') > + | --- > + | ... > + > +-- > +-- Now inject error into slave instance > +test_run:cmd('switch replica_slave') > + | --- > + | - true > + | ... > + > +-- > +-- To make sure we're running > +box.info.status > + | --- > + | - running > + | ... > + > +-- > +-- To fail inserting new record. > +errinj =3D box.error.injection > + | --- > + | ... > +errinj.set('ERRINJ_TXN_COMMIT_ASYNC', true) > + | --- > + | - ok > + | ... > + > +-- > +-- Jump back to master node and write new > +-- entry which should cause error to happen > +-- on slave instance > +test_run:cmd('switch default') > + | --- > + | - true > + | ... > +s:insert({4, '4'}) > + | --- > + | - [4, '4'] > + | ... > + > +-- > +-- Wait for error to trigger > +test_run:cmd('switch replica_slave') > + | --- > + | - true > + | ... > +fiber =3D require('fiber') > + | --- > + | ... > +while test_run:grep_log('replica_slave', 'ER_INJECTION:[^\n]*') =3D=3D = nil do fiber.sleep(0.1) end > + | --- > + | ... > + > +---- > +---- Such error cause the applier to be > +---- cancelled and reaped, thus stop the > +---- slave node and cleanup > +test_run:cmd('switch default') > + | --- > + | - true > + | ... > + > +-- > +-- Cleanup > +test_run:cmd("stop server replica_slave") > + | --- > + | - true > + | ... > +test_run:cmd("delete server replica_slave") > + | --- > + | - true > + | ... > +box.cfg{replication=3D""} > + | --- > + | ... > +box.space.test:drop() > + | --- > + | ... > +box.schema.user.revoke('guest', 'replication') > + | --- > + | ... > diff --git a/test/replication/applier-rollback.test.lua = b/test/replication/applier-rollback.test.lua > new file mode 100644 > index 000000000..2c32af5c6 > --- /dev/null > +++ b/test/replication/applier-rollback.test.lua > @@ -0,0 +1,81 @@ > +#!/usr/bin/env tarantool > +-- > +-- vim: ts=3D4 sw=3D4 et > +-- > + > +test_run =3D require('test_run').new() > + > +errinj =3D box.error.injection You don=E2=80=99t need the errinj on master, only on replica, AFAICS. > +engine =3D test_run:get_cfg('engine') Why test both engines? I suggest you run the test with no arguments. (you=E2=80=99ll have to add a line to replication/suite.cfg for that) > + > +-- > +-- Allow replica to connect to us > +box.schema.user.grant('guest', 'replication') > + > +-- > +-- Create replica instance, we're the master and > +-- start it, no data to sync yet though > +test_run:cmd("create server replica_slave with rpl_master=3Ddefault, = script=3D'replication/applier-rollback-slave.lua'") > +test_run:cmd("start server replica_slave") > + > +-- > +-- Fill initial data on the master instance > +test_run:cmd('switch default') > + > +_ =3D box.schema.space.create('test', {engine=3Dengine}) > +s =3D box.space.test > + > +s:format({{name =3D 'id', type =3D 'unsigned'}, {name =3D = 'band_name', type =3D 'string'}}) > + > +_ =3D s:create_index('primary', {type =3D 'tree', parts =3D {'id'}}) > +s:insert({1, '1'}) > +s:insert({2, '2'}) > +s:insert({3, '3'}) > + > +-- > +-- To make sure we're running > +box.info.status > + This should always evaluate to =E2=80=98running=E2=80=99, you=E2=80=99ve = just inserted some values and got a result back, so I=E2=80=99d omit this check. > +-- > +-- Wait for data from master get propagated > +test_run:wait_lsn('replica_slave', 'default') > + > +-- > +-- Now inject error into slave instance > +test_run:cmd('switch replica_slave') > + > +-- > +-- To make sure we're running > +box.info.status > + > +-- > +-- To fail inserting new record. > +errinj =3D box.error.injection > +errinj.set('ERRINJ_TXN_COMMIT_ASYNC', true) > + > +-- > +-- Jump back to master node and write new > +-- entry which should cause error to happen > +-- on slave instance > +test_run:cmd('switch default') > +s:insert({4, '4'}) > + > +-- > +-- Wait for error to trigger > +test_run:cmd('switch replica_slave') > +fiber =3D require('fiber') > +while test_run:grep_log('replica_slave', 'ER_INJECTION:[^\n]*') =3D=3D = nil do fiber.sleep(0.1) end > + > +---- > +---- Such error cause the applier to be > +---- cancelled and reaped, thus stop the > +---- slave node and cleanup > +test_run:cmd('switch default') > + > +-- > +-- Cleanup > +test_run:cmd("stop server replica_slave") > +test_run:cmd("delete server replica_slave") > +box.cfg{replication=3D""} You didn=E2=80=99t set box.cfg.replication, so you shouldn=E2=80=99t = reset it. > +box.space.test:drop() > +box.schema.user.revoke('guest', 'replication') > diff --git a/test/replication/suite.ini b/test/replication/suite.ini > index b4e09744a..f6c924762 100644 > --- a/test/replication/suite.ini > +++ b/test/replication/suite.ini > @@ -3,7 +3,7 @@ core =3D tarantool > script =3D master.lua > description =3D tarantool/box, replication > disabled =3D consistent.test.lua > -release_disabled =3D catch.test.lua errinj.test.lua gc.test.lua = gc_no_space.test.lua before_replace.test.lua quorum.test.lua = recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua = gh-4739-vclock-assert.test.lua > +release_disabled =3D catch.test.lua errinj.test.lua gc.test.lua = gc_no_space.test.lua before_replace.test.lua quorum.test.lua = recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua = gh-4739-vclock-assert.test.lua applier-rollback.test.lua > config =3D suite.cfg > lua_libs =3D lua/fast_replica.lua lua/rlimit.lua > use_unix_sockets =3D True > --=20 > 2.20.1 >=20 -- Serge Petrenko sergepetrenko@tarantool.org