From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Date: Wed, 29 Aug 2018 12:52:47 +0300 From: Vladimir Davydov Subject: Re: [tarantool-patches] [PATCH v2] xlog: add request details to panic message for broken LSN Message-ID: <20180829095247.xbrk7euqwnssvl5q@esperanza> References: <1535368944-23713-1-git-send-email-ztarvos@gmail.com> <20180827170407.2igpxsf4zprp7vdu@esperanza> <20180828162740.GA11135@localhost.localdomain> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20180828162740.GA11135@localhost.localdomain> To: Sergei Kalashnikov Cc: tarantool-patches List-ID: On Tue, Aug 28, 2018 at 07:27:41PM +0300, Sergei Kalashnikov wrote: > diff --git a/src/box/relay.cc b/src/box/relay.cc > index 138e66d..9f07655 100644 > --- a/src/box/relay.cc > +++ b/src/box/relay.cc > @@ -676,6 +676,12 @@ relay_send_row(struct xstream *stream, struct xrow_header *packet) > if (packet->replica_id != relay->replica->id || > packet->lsn <= vclock_get(&relay->local_vclock_at_subscribe, > packet->replica_id)) { > + struct errinj *inj = errinj(ERRINJ_RELAY_BREAK_LSN, > + ERRINJ_INT); > + if (inj != NULL && packet->lsn == inj->iparam) { > + packet->lsn = inj->iparam - 1; > + say_warn("injected broken lsn: %lld", packet->lsn); Nit: int64_t isn't guaranteed to be equivalent to long long. Please converse packet->lsn to long long explicitly. > + } > relay_send(relay, packet); > } > } > diff --git a/src/box/wal.c b/src/box/wal.c > index cd068a4..ade2c71 100644 > --- a/src/box/wal.c > +++ b/src/box/wal.c > @@ -186,6 +186,11 @@ xlog_write_entry(struct xlog *l, struct journal_entry *entry) > struct xrow_header **row = entry->rows; > for (; row < entry->rows + entry->n_rows; row++) { > (*row)->tm = ev_now(loop()); > + struct errinj *inj = errinj(ERRINJ_XLOG_BREAK_LSN, ERRINJ_INT); > + if (inj != NULL && inj->iparam == (*row)->lsn) { > + (*row)->lsn = inj->iparam - 1; > + say_warn("injected broken lsn: %lld", (*row)->lsn); Ditto. > + } > if (xlog_write_row(l, *row) < 0) { > /* > * Rollback all un-written rows > diff --git a/src/errinj.h b/src/errinj.h > index 64d13b0..c658cd7 100644 > --- a/src/errinj.h > +++ b/src/errinj.h > @@ -118,6 +118,8 @@ struct errinj { > _(ERRINJ_VY_LOG_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \ > _(ERRINJ_VY_RUN_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \ > _(ERRINJ_VY_INDEX_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \ > + _(ERRINJ_RELAY_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \ > + _(ERRINJ_XLOG_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \ Nit: please rename ERRINJ_XLOG_BREAK_LSN to ERRINJ_WAL_BREAK_LSN - it's used by the WAL thread, after all. > diff --git a/test/xlog/panic_on_broken_lsn.test.lua b/test/xlog/panic_on_broken_lsn.test.lua > new file mode 100644 > index 0000000..d4abf6f > --- /dev/null > +++ b/test/xlog/panic_on_broken_lsn.test.lua > @@ -0,0 +1,72 @@ > +-- Issue 3105: Test logging of request with broken lsn before panicking > +-- Two cases are covered: Recovery and Joining a new replica > +env = require('test_run') > +test_run = env.new() > +test_run:cmd('restart server default with cleanup=True') We strive to make our tests run as fast as possible, that's why we don't restart the default instance before/after each test. Instead we clean up only what we have to. In your case, it means revoking privileges you granted, dropping spaces you created, and disabling error injections you enabled. Since in your test you assume that a new replica is assigned id 2, you should call test_run:cleanup_cluster() in the beginning of this test. This is much cheaper than restarting the default instance. If you don't restart the default instance, LSNs and space ids might be different from run to run. That's OK, you can filter them out in the test output, e.g. see https://github.com/tarantool/tarantool/blob/ac17838c9634ba7ebe329290ebd3314e37ec2255/test/replication/status.test.lua#L117 > + > +fio = require('fio') > +test_run:cmd("setopt delimiter ';'") > +function grep_file_tail(filepath, bytes, pattern) > + local fh = fio.open(filepath, {'O_RDONLY'}) > + local size = fh:seek(0, 'SEEK_END') > + if size < bytes then > + bytes = size > + end > + fh:seek(-bytes, 'SEEK_END') > + local line = fh:read(bytes) > + fh:close() > + return string.match(line, pattern) > +end; > +test_run:cmd("setopt delimiter ''"); > + > +-- Testing case of panic on recovery > +test_run:cmd('create server panic with script="xlog/panic.lua"') > +test_run:cmd('start server panic') > +test_run:switch('panic') > + > +box.space._schema:replace{"t0", "v0"} > +lsn = box.info.vclock[1] > +box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", lsn + 1) > +box.space._schema:replace{"t0", "v1"} > +box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", -1) > + > +test_run:switch('default') > +test_run:cmd('stop server panic') > + > +dirname = fio.pathjoin(fio.cwd(), "panic") > +xlogs = fio.glob(dirname .. "/*.xlog") > +fio.unlink(xlogs[#xlogs]) I don't understand why you need to delete the last xlog here. > + > +test_run:cmd('start server panic with crash_expected=True') > + > +logpath = fio.pathjoin(fio.cwd(), 'panic.log') > + > +-- Check that log contains the mention of broken LSN and the request printout > +grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 1, new: 1, req: {.*}") > + > +test_run:cmd('cleanup server panic') > +test_run:cmd('delete server panic') > + > +-- Testing case of panic on joining a new replica > +box.schema.user.grant('guest', 'super') Please instead use box.schema.user.grant('guest', 'replication') > +_ = box.schema.space.create('test') > +_ = box.space.test:create_index('pk') > +box.space.test:auto_increment{'v0'} > +lsn = box.info.vclock[1] > +box.error.injection.set("ERRINJ_RELAY_BREAK_LSN", lsn + 1) > +box.space.test:auto_increment{'v1'} > + > +test_run:cmd('create server replica with rpl_master=default, script="xlog/replica.lua"') > +test_run:cmd('start server replica with crash_expected=True') > +fiber = require('fiber') > +while box.info.replication == nil do fiber.sleep(0.001) end > +while box.info.replication[2].downstream.status ~= "stopped" do fiber.sleep(0.001) end > + > +test_run:switch('default') Pointless 'switch' - the test is already running on behalf of the default instance here. > +logpath = fio.pathjoin(fio.cwd(), 'replica.log') > +-- Check that log contains the mention of broken LSN and the request printout > +grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 5, new: 5, req: {.*}") > + > +test_run:cmd('cleanup server replica') > +test_run:cmd('delete server replica') > +test_run:cmd('restart server default with cleanup=True')