From: Sergei Kalashnikov <ztarvos@gmail.com> To: tarantool-patches@freelists.org Subject: [tarantool-patches] [PATCH] xlog: added request details to panic message for broken LSN Date: Thu, 9 Aug 2018 18:02:13 +0300 [thread overview] Message-ID: <CAPSug8N_+mEcTLZj9q+mOtYqzY1chrR-7Xghae4jsZ7YvCQH1A@mail.gmail.com> (raw) Aid the debugging of replication issues related to out-of-order requests. Adds the details of request/tuple to the diagnostic message whenever possible. Fixes: #3105 --- Ticket:https://github.com/tarantool/tarantool/issues/3105 Branch:ztarvos/gh-3105-log-request-broken-lsn src/box/applier.cc | 8 +- src/box/recovery.cc | 4 +- src/box/vclock.c | 6 + src/box/vclock.h | 22 +++ src/box/wal.c | 12 +- src/box/xrow.c | 24 +++ src/box/xrow.h | 15 ++ test/xlog/log_request_broken_lsn_panic.result | 200 ++++++++++++++++++++++++ test/xlog/log_request_broken_lsn_panic.test.lua | 105 +++++++++++++ test/xlog/panic_broken_lsn.lua | 11 ++ 10 files changed, 395 insertions(+), 12 deletions(-) create mode 100644 test/xlog/log_request_broken_lsn_panic.result create mode 100644 test/xlog/log_request_broken_lsn_panic.test.lua create mode 100644 test/xlog/panic_broken_lsn.lua diff --git a/src/box/applier.cc b/src/box/applier.cc index dbb4d05..108f657 100644 --- a/src/box/applier.cc +++ b/src/box/applier.cc @@ -336,8 +336,8 @@ applier_join(struct applier *applier) coio_read_xrow(coio, ibuf, &row); applier->last_row_time = ev_monotonic_now(loop()); if (iproto_type_is_dml(row.type)) { - vclock_follow(&replicaset.vclock, row.replica_id, - row.lsn); + xrow_vclock_follow(&replicaset.vclock, row.replica_id, + row.lsn, &row); xstream_write_xc(applier->subscribe_stream, &row); } else if (row.type == IPROTO_OK) { /* @@ -503,8 +503,8 @@ applier_subscribe(struct applier *applier) * the row is skipped when the replication * is resumed. */ - vclock_follow(&replicaset.vclock, row.replica_id, - row.lsn); + vclock_follow_unsafe(&replicaset.vclock, row.replica_id, + row.lsn); struct replica *replica = replica_by_id(row.replica_id); struct latch *latch = (replica ? &replica->order_latch : &replicaset.applier.order_latch); diff --git a/src/box/recovery.cc b/src/box/recovery.cc index fe14def..c3eac62 100644 --- a/src/box/recovery.cc +++ b/src/box/recovery.cc @@ -135,7 +135,7 @@ recovery_scan(struct recovery *r, struct vclock *end_vclock) return; struct xrow_header row; while (xlog_cursor_next(&cursor, &row, true) == 0) - vclock_follow(end_vclock, row.replica_id, row.lsn); + xrow_vclock_follow(end_vclock, row.replica_id, row.lsn, &row); xlog_cursor_close(&cursor, false); } @@ -266,7 +266,7 @@ recover_xlog(struct recovery *r, struct xstream *stream, * in case of forced recovery, when we skip the * failed row anyway. */ - vclock_follow(&r->vclock, row.replica_id, row.lsn); + vclock_follow_unsafe(&r->vclock, row.replica_id, row.lsn); if (xstream_write(stream, &row) == 0) { ++row_count; if (row_count % 100000 == 0) diff --git a/src/box/vclock.c b/src/box/vclock.c index 2189ea8..a8f2d99 100644 --- a/src/box/vclock.c +++ b/src/box/vclock.c @@ -49,6 +49,12 @@ vclock_follow(struct vclock *vclock, uint32_t replica_id, int64_t lsn) (unsigned) replica_id, (long long) prev_lsn, (long long) lsn); } + return vclock_follow_unsafe(vclock, replica_id, lsn); +} + +int64_t +vclock_follow_unsafe(struct vclock *vclock, uint32_t replica_id, int64_t lsn) { + int64_t prev_lsn = vclock->lsn[replica_id]; /* Easier add each time than check. */ vclock->map |= 1 << replica_id; vclock->lsn[replica_id] = lsn; diff --git a/src/box/vclock.h b/src/box/vclock.h index 928717c..e389074 100644 --- a/src/box/vclock.h +++ b/src/box/vclock.h @@ -199,10 +199,32 @@ vclock_sum(const struct vclock *vclock) return vclock->signature; } +/** + * Update vclock with the next LSN value for given replica id. + * The function will cause panic if the next LSN happens to be + * out of order. + * + * @param vclock Vector clock. + * @param replica_id Replica identifier. + * @param lsn Next lsn. + */ int64_t vclock_follow(struct vclock *vclock, uint32_t replica_id, int64_t lsn); /** + * Update vclock with the next LSN value for given replica id. + * No checks are performed by this function. It may only be + * used when prior checks are performed with regard to validity + * of its parameters. + * + * @param vclock Vector clock. + * @param replica_id Replica identifier. + * @param lsn Next lsn. + */ +int64_t +vclock_follow_unsafe(struct vclock *vclock, uint32_t replica_id, int64_t lsn); + +/** * \brief Format vclock to YAML-compatible string representation: * { replica_id: lsn, replica_id:lsn }) * \param vclock vclock diff --git a/src/box/wal.c b/src/box/wal.c index 41762a5..b206cb0 100644 --- a/src/box/wal.c +++ b/src/box/wal.c @@ -662,8 +662,8 @@ wal_assign_lsn(struct wal_writer *writer, struct xrow_header **row, (*row)->lsn = vclock_inc(&writer->vclock, instance_id); (*row)->replica_id = instance_id; } else { - vclock_follow(&writer->vclock, (*row)->replica_id, - (*row)->lsn); + xrow_vclock_follow(&writer->vclock, (*row)->replica_id, + (*row)->lsn, *row); } } } @@ -888,9 +888,9 @@ wal_write(struct journal *journal, struct journal_entry *entry) */ if (vclock_get(&replicaset.vclock, instance_id) < (*last)->lsn) { - vclock_follow(&replicaset.vclock, - instance_id, - (*last)->lsn); + vclock_follow_unsafe(&replicaset.vclock, + instance_id, + (*last)->lsn); } break; } @@ -910,7 +910,7 @@ wal_write_in_wal_mode_none(struct journal *journal, int64_t new_lsn = vclock_get(&writer->vclock, instance_id); if (new_lsn > old_lsn) { /* There were local writes, promote vclock. */ - vclock_follow(&replicaset.vclock, instance_id, new_lsn); + vclock_follow_unsafe(&replicaset.vclock, instance_id, new_lsn); } return vclock_sum(&writer->vclock); } diff --git a/src/box/xrow.c b/src/box/xrow.c index 269a6e6..558d8d5 100644 --- a/src/box/xrow.c +++ b/src/box/xrow.c @@ -1194,3 +1194,27 @@ greeting_decode(const char *greetingbuf, struct greeting *greeting) return 0; } + +void +xrow_vclock_follow(struct vclock* vclock, uint32_t replica_id, int64_t lsn, + const struct xrow_header *row) { + assert(lsn >= 0); + assert(replica_id < VCLOCK_MAX); + int64_t prev_lsn = vclock->lsn[replica_id]; + if (lsn <= prev_lsn) { + const char* req_str = NULL; + if (row != NULL) { + struct request req; + if (xrow_decode_dml((struct xrow_header *)row, &req, 0) == 0) + req_str = request_str(&req); + } + /* Never confirm LSN out of order. */ + panic("LSN for %u is used twice or COMMIT order is broken: " + "confirmed: %lld, new: %lld, req: %s", + (unsigned) replica_id, + (long long) prev_lsn, + (long long) lsn, + req_str?req_str:"n/a"); + } + vclock_follow_unsafe(vclock, replica_id, lsn); +} diff --git a/src/box/xrow.h b/src/box/xrow.h index 9887382..4f82399 100644 --- a/src/box/xrow.h +++ b/src/box/xrow.h @@ -519,6 +519,21 @@ xrow_to_iovec(const struct xrow_header *row, struct iovec *out); void xrow_decode_error(struct xrow_header *row); +/** + * Update vclock with the next LSN value for given replica id. + * The function will cause panic if the next LSN happens to be + * out of order. The details of provided row are included into + * diagnostic message. + * + * @param vclock Vector clock. + * @param replica_id Replica identifier. + * @param lsn Next lsn. + * @param row Data row. + */ +void +xrow_vclock_follow(struct vclock* vclock, uint32_t replica_id, int64_t lsn, + const struct xrow_header *row); + #if defined(__cplusplus) } /* extern "C" */ diff --git a/test/xlog/log_request_broken_lsn_panic.result b/test/xlog/log_request_broken_lsn_panic.result new file mode 100644 index 0000000..89bed53 --- /dev/null +++ b/test/xlog/log_request_broken_lsn_panic.result @@ -0,0 +1,200 @@ +-- Issue 3105: Test logging of request with broken lsn before panicking +env = require('test_run') +--- +... +test_run = env.new() +--- +... +test_run:cmd('create server panic_broken_lsn with script="xlog/panic_broken_lsn.lua"') +--- +- true +... +test_run:cmd('start server panic_broken_lsn') +--- +- true +... +test_run:switch('panic_broken_lsn') +--- +- true +... +box.space._schema:replace{"t0", "v0"} +--- +- ['t0', 'v0'] +... +box.snapshot() +--- +- ok +... +box.space._schema:replace{"t0", "v1"} +--- +- ['t0', 'v1'] +... +box.snapshot() +--- +- ok +... +box.space._schema:replace{"t0", "v2"} +--- +- ['t0', 'v2'] +... +box.snapshot() +--- +- ok +... +test_run:switch('default') +--- +- true +... +test_run:cmd('stop server panic_broken_lsn') +--- +- true +... +fio = require('fio') +--- +... +dirname = fio.pathjoin(fio.cwd(), "panic_broken_lsn") +--- +... +xlogs = fio.glob(dirname .. "/*.xlog") +--- +... +wal1_name = xlogs[#xlogs - 2] +--- +... +wal2_name = xlogs[#xlogs - 1] +--- +... +snaps = fio.glob(dirname .. "/*.snap") +--- +... +-- Remove empty xlog +fio.unlink(xlogs[#xlogs]) +--- +- true +... +-- Remove two last snapshots +fio.unlink(snaps[#snaps]) +--- +- true +... +fio.unlink(snaps[#snaps - 1]) +--- +- true +... +buffer = require('buffer') +--- +... +ffi = require('ffi') +--- +... +test_run:cmd("setopt delimiter ';'") +--- +- true +... +function read_file(filepath) + local fh = fio.open(filepath, {'O_RDONLY'}) + local size = fh:stat().size + local buf = buffer.ibuf() + fh:read(buf:reserve(size)) + fh:close() + buf:alloc(size) + return buf +end; +--- +... +function find_marker_pos(buf) + local sz = buf:size() + local data = ffi.string(buf.rpos, sz) + local cnt = 0 + for i = 1, sz do + local b = string.byte(data, i) + if (cnt == 0 and b == 213) then + cnt = 1 + elseif (cnt == 1 and b == 186) then + cnt = 2 + elseif (cnt == 2 and b == 11) then + cnt = 3 + elseif (cnt == 3 and b == 171) then + return i - 3 + else + cnt = 0 + end + end + return 0 +end; +--- +... +function run_panic() + local tarantool_bin = arg[-1] + local fmt = [[/bin/sh -c 'cd "%s" && "%s" ../panic_broken_lsn.lua']] + local cmd = string.format(fmt, dirname, tarantool_bin) + local res = os.execute(cmd) + return res +end; +--- +... +test_run:cmd("setopt delimiter ''"); +--- +- true +... +-- Read WAL 1 and find position of data +buf1 = read_file(wal1_name) +--- +... +pos1 = find_marker_pos(buf1) +--- +... +-- Read WAL 2 and find position of data +buf2 = read_file(wal2_name) +--- +... +pos2 = find_marker_pos(buf2) +--- +... +-- Create fake WAL file with header of WAL 2 and data of WAL 1 +tmp_file_name = wal2_name .. ".tmp" +--- +... +fh3 = fio.open(tmp_file_name, {'O_WRONLY', 'O_CREAT'}) +--- +... +fh3:write(buf2.rpos, pos2) +--- +- true +... +fh3:write(buf1.rpos + pos1, buf1:size() - pos1) +--- +- true +... +fh3:close() +--- +- true +... +fio.chmod(tmp_file_name, 0x1B0) +--- +- true +... +-- Replace WAL 2 with fake WAL file +fio.unlink(wal2_name) +--- +- true +... +fio.copyfile(tmp_file_name, wal2_name) +--- +- true +... +fio.unlink(tmp_file_name) +--- +- true +... +-- Try to start tarantool with xlog containing broken LSN +run_panic() +--- +- 256 +... +-- Check that log contains the mention of broken LSN and the request printout +test_run:grep_log('default', "LSN for 1 is used twice or COMMIT order is broken: confirmed: 2, new: 2, req: {.*}") +--- +- 'LSN for 1 is used twice or COMMIT order is broken: confirmed: 2, new: 2, req: {type: + ''REPLACE'', lsn: 2, space_id: 272, index_id: 0, tuple: ["t0", "v1"]}' +... diff --git a/test/xlog/log_request_broken_lsn_panic.test.lua b/test/xlog/log_request_broken_lsn_panic.test.lua new file mode 100644 index 0000000..f87024c --- /dev/null +++ b/test/xlog/log_request_broken_lsn_panic.test.lua @@ -0,0 +1,105 @@ +-- Issue 3105: Test logging of request with broken lsn before panicking +env = require('test_run') +test_run = env.new() + +test_run:cmd('create server panic_broken_lsn with script="xlog/panic_broken_lsn.lua"') +test_run:cmd('start server panic_broken_lsn') +test_run:switch('panic_broken_lsn') + +box.space._schema:replace{"t0", "v0"} +box.snapshot() +box.space._schema:replace{"t0", "v1"} +box.snapshot() +box.space._schema:replace{"t0", "v2"} +box.snapshot() + +test_run:switch('default') +test_run:cmd('stop server panic_broken_lsn') + +fio = require('fio') + +dirname = fio.pathjoin(fio.cwd(), "panic_broken_lsn") + +xlogs = fio.glob(dirname .. "/*.xlog") + +wal1_name = xlogs[#xlogs - 2] +wal2_name = xlogs[#xlogs - 1] + +snaps = fio.glob(dirname .. "/*.snap") + +-- Remove empty xlog +fio.unlink(xlogs[#xlogs]) + +-- Remove two last snapshots +fio.unlink(snaps[#snaps]) +fio.unlink(snaps[#snaps - 1]) + +buffer = require('buffer') +ffi = require('ffi') + +test_run:cmd("setopt delimiter ';'") +function read_file(filepath) + local fh = fio.open(filepath, {'O_RDONLY'}) + local size = fh:stat().size + local buf = buffer.ibuf() + fh:read(buf:reserve(size)) + fh:close() + buf:alloc(size) + return buf +end; +function find_marker_pos(buf) + local sz = buf:size() + local data = ffi.string(buf.rpos, sz) + local cnt = 0 + for i = 1, sz do + local b = string.byte(data, i) + if (cnt == 0 and b == 213) then + cnt = 1 + elseif (cnt == 1 and b == 186) then + cnt = 2 + elseif (cnt == 2 and b == 11) then + cnt = 3 + elseif (cnt == 3 and b == 171) then + return i - 3 + else + cnt = 0 + end + end + return 0 +end; +function run_panic() + local tarantool_bin = arg[-1] + local fmt = [[/bin/sh -c 'cd "%s" && "%s" ../panic_broken_lsn.lua']] + local cmd = string.format(fmt, dirname, tarantool_bin) + local res = os.execute(cmd) + return res +end; +test_run:cmd("setopt delimiter ''"); + +-- Read WAL 1 and find position of data +buf1 = read_file(wal1_name) +pos1 = find_marker_pos(buf1) + +-- Read WAL 2 and find position of data +buf2 = read_file(wal2_name) +pos2 = find_marker_pos(buf2) + +-- Create fake WAL file with header of WAL 2 and data of WAL 1 +tmp_file_name = wal2_name .. ".tmp" + +fh3 = fio.open(tmp_file_name, {'O_WRONLY', 'O_CREAT'}) +fh3:write(buf2.rpos, pos2) +fh3:write(buf1.rpos + pos1, buf1:size() - pos1) +fh3:close() +fio.chmod(tmp_file_name, 0x1B0) + +-- Replace WAL 2 with fake WAL file +fio.unlink(wal2_name) +fio.copyfile(tmp_file_name, wal2_name) +fio.unlink(tmp_file_name) + +-- Try to start tarantool with xlog containing broken LSN +run_panic() + +-- Check that log contains the mention of broken LSN and the request printout +test_run:grep_log('default', "LSN for 1 is used twice or COMMIT order is broken: confirmed: 2, new: 2, req: {.*}") diff --git a/test/xlog/panic_broken_lsn.lua b/test/xlog/panic_broken_lsn.lua new file mode 100644 index 0000000..4d869a3 --- /dev/null +++ b/test/xlog/panic_broken_lsn.lua @@ -0,0 +1,11 @@ +#!/usr/bin/env tarantool +os = require('os') + +box.cfg{ + listen = os.getenv("LISTEN"), + memtx_memory = 107374182, + pid_file = "tarantool.pid", + checkpoint_count = 3 +} + +require('console').listen(os.getenv('ADMIN')) -- 1.8.3.1
next reply other threads:[~2018-08-09 15:02 UTC|newest] Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top 2018-08-09 15:02 Sergei Kalashnikov [this message] 2018-08-14 16:17 ` Vladimir Davydov
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=CAPSug8N_+mEcTLZj9q+mOtYqzY1chrR-7Xghae4jsZ7YvCQH1A@mail.gmail.com \ --to=ztarvos@gmail.com \ --cc=tarantool-patches@freelists.org \ --subject='Re: [tarantool-patches] [PATCH] xlog: added request details to panic message for broken LSN' \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox