Tarantool development patches archive
 help / color / mirror / Atom feed
* [tarantool-patches] [PATCH v2] xlog: add request details to panic message for broken LSN
@ 2018-08-27 11:22 Sergei Kalashnikov
  2018-08-27 17:04 ` Vladimir Davydov
  0 siblings, 1 reply; 6+ messages in thread
From: Sergei Kalashnikov @ 2018-08-27 11:22 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Sergei Kalashnikov

Aid the debugging of replication issues related to out-of-order
requests. Adds the details of request/tuple to the diagnostic
message whenever possible.

Closes #3105
---
https://github.com/tarantool/tarantool/issues/3105
https://github.com/tarantool/tarantool/commits/ztarvos/gh-3105-log-request-broken-lsn

 src/box/applier.cc                              |   6 +-
 src/box/recovery.cc                             |   4 +-
 src/box/relay.cc                                |   8 +
 src/box/vclock.c                                |   8 +-
 src/box/vclock.h                                |   8 +
 src/box/wal.c                                   |   8 +-
 src/box/xlog.c                                  |   8 +
 src/box/xrow.h                                  |  29 ++++
 src/errinj.h                                    |   2 +
 test/box/errinj.result                          |   4 +
 test/xlog/log_request_broken_lsn_panic.result   | 187 ++++++++++++++++++++++++
 test/xlog/log_request_broken_lsn_panic.test.lua |  78 ++++++++++
 test/xlog/panic_replica.lua                     |   9 ++
 test/xlog/suite.ini                             |   2 +-
 14 files changed, 342 insertions(+), 19 deletions(-)
 create mode 100644 test/xlog/log_request_broken_lsn_panic.result
 create mode 100644 test/xlog/log_request_broken_lsn_panic.test.lua
 create mode 100644 test/xlog/panic_replica.lua

diff --git a/src/box/applier.cc b/src/box/applier.cc
index dbb4d05..5d87867 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -336,8 +336,7 @@ applier_join(struct applier *applier)
 		coio_read_xrow(coio, ibuf, &row);
 		applier->last_row_time = ev_monotonic_now(loop());
 		if (iproto_type_is_dml(row.type)) {
-			vclock_follow(&replicaset.vclock, row.replica_id,
-				      row.lsn);
+			vclock_follow_xrow(&replicaset.vclock, &row);
 			xstream_write_xc(applier->subscribe_stream, &row);
 		} else if (row.type == IPROTO_OK) {
 			/*
@@ -503,8 +502,7 @@ applier_subscribe(struct applier *applier)
 			 * the row is skipped when the replication
 			 * is resumed.
 			 */
-			vclock_follow(&replicaset.vclock, row.replica_id,
-				      row.lsn);
+			vclock_follow_xrow(&replicaset.vclock, &row);
 			struct replica *replica = replica_by_id(row.replica_id);
 			struct latch *latch = (replica ? &replica->order_latch :
 					       &replicaset.applier.order_latch);
diff --git a/src/box/recovery.cc b/src/box/recovery.cc
index fe14def..77357f0 100644
--- a/src/box/recovery.cc
+++ b/src/box/recovery.cc
@@ -135,7 +135,7 @@ recovery_scan(struct recovery *r, struct vclock *end_vclock)
 		return;
 	struct xrow_header row;
 	while (xlog_cursor_next(&cursor, &row, true) == 0)
-		vclock_follow(end_vclock, row.replica_id, row.lsn);
+		vclock_follow_xrow(end_vclock, &row);
 	xlog_cursor_close(&cursor, false);
 }
 
@@ -266,7 +266,7 @@ recover_xlog(struct recovery *r, struct xstream *stream,
 		 * in case of forced recovery, when we skip the
 		 * failed row anyway.
 		 */
-		vclock_follow(&r->vclock,  row.replica_id, row.lsn);
+		vclock_follow_xrow(&r->vclock, &row);
 		if (xstream_write(stream, &row) == 0) {
 			++row_count;
 			if (row_count % 100000 == 0)
diff --git a/src/box/relay.cc b/src/box/relay.cc
index 138e66d..2d0b651 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -678,4 +678,12 @@ relay_send_row(struct xstream *stream, struct xrow_header *packet)
 				      packet->replica_id)) {
 		relay_send(relay, packet);
 	}
+#ifndef NDEBUG
+	struct errinj *inj = errinj(ERRINJ_RELAY_BREAK_LSN, ERRINJ_INT);
+	if (inj != NULL && packet->lsn == inj->iparam) {
+		relay_send(relay, packet);
+		diag_set(ClientError, ER_INJECTION,
+			 "relay broken lsn injection lsn");
+	}
+#endif
 }
diff --git a/src/box/vclock.c b/src/box/vclock.c
index 2189ea8..b5eb280 100644
--- a/src/box/vclock.c
+++ b/src/box/vclock.c
@@ -42,13 +42,7 @@ vclock_follow(struct vclock *vclock, uint32_t replica_id, int64_t lsn)
 	assert(lsn >= 0);
 	assert(replica_id < VCLOCK_MAX);
 	int64_t prev_lsn = vclock->lsn[replica_id];
-	if (lsn <= prev_lsn) {
-		/* Never confirm LSN out of order. */
-		panic("LSN for %u is used twice or COMMIT order is broken: "
-		      "confirmed: %lld, new: %lld",
-		      (unsigned) replica_id,
-		      (long long) prev_lsn, (long long) lsn);
-	}
+	assert(lsn > prev_lsn);
 	/* Easier add each time than check. */
 	vclock->map |= 1 << replica_id;
 	vclock->lsn[replica_id] = lsn;
diff --git a/src/box/vclock.h b/src/box/vclock.h
index 928717c..111e291 100644
--- a/src/box/vclock.h
+++ b/src/box/vclock.h
@@ -199,6 +199,14 @@ vclock_sum(const struct vclock *vclock)
 	return vclock->signature;
 }
 
+/**
+ * Update vclock with the next LSN value for given replica id.
+ *
+ * @param vclock Vector clock.
+ * @param replica_id Replica identifier.
+ * @param lsn Next lsn.
+ * @return previous lsn value.
+ */
 int64_t
 vclock_follow(struct vclock *vclock, uint32_t replica_id, int64_t lsn);
 
diff --git a/src/box/wal.c b/src/box/wal.c
index cd068a4..9cc1901 100644
--- a/src/box/wal.c
+++ b/src/box/wal.c
@@ -652,8 +652,7 @@ wal_assign_lsn(struct wal_writer *writer, struct xrow_header **row,
 			(*row)->lsn = vclock_inc(&writer->vclock, instance_id);
 			(*row)->replica_id = instance_id;
 		} else {
-			vclock_follow(&writer->vclock, (*row)->replica_id,
-				      (*row)->lsn);
+			vclock_follow_xrow(&writer->vclock, *row);
 		}
 	}
 }
@@ -878,9 +877,8 @@ wal_write(struct journal *journal, struct journal_entry *entry)
 				 */
 				if (vclock_get(&replicaset.vclock,
 					       instance_id) < (*last)->lsn) {
-					vclock_follow(&replicaset.vclock,
-						      instance_id,
-						      (*last)->lsn);
+					vclock_follow_xrow(&replicaset.vclock,
+							   *last);
 				}
 				break;
 			}
diff --git a/src/box/xlog.c b/src/box/xlog.c
index d399a72..46cf7ed 100644
--- a/src/box/xlog.c
+++ b/src/box/xlog.c
@@ -1246,6 +1246,14 @@ xlog_write_row(struct xlog *log, const struct xrow_header *packet)
 	size_t page_offset = obuf_size(&log->obuf);
 	/** encode row into iovec */
 	struct iovec iov[XROW_IOVMAX];
+#ifndef NDEBUG
+	struct errinj *inj = errinj(ERRINJ_XLOG_BREAK_LSN, ERRINJ_INT);
+	if (inj != NULL && inj->iparam == packet->lsn) {
+		((struct xrow_header *)packet)->lsn = inj->iparam - 1;
+		diag_set(ClientError, ER_INJECTION,
+			 "xlog broken lsn injection lsn");
+	}
+#endif
 	/** don't write sync to the disk */
 	int iovcnt = xrow_header_encode(packet, 0, iov, 0);
 	if (iovcnt < 0) {
diff --git a/src/box/xrow.h b/src/box/xrow.h
index 4721670..f7eda91 100644
--- a/src/box/xrow.h
+++ b/src/box/xrow.h
@@ -522,6 +522,35 @@ xrow_to_iovec(const struct xrow_header *row, struct iovec *out);
 void
 xrow_decode_error(struct xrow_header *row);
 
+/**
+ * Update vclock with the next LSN value for given replica id.
+ * The function will cause panic if the next LSN happens to be
+ * out of order. The details of provided row are included into
+ * diagnostic message.
+ *
+ * @param vclock Vector clock.
+ * @param row Data row.
+ * @return Previous LSN value.
+ */
+static inline int64_t
+vclock_follow_xrow(struct vclock* vclock, const struct xrow_header *row)
+{
+	assert(row);
+	assert(row->replica_id < VCLOCK_MAX);
+	if (row->lsn <= vclock->lsn[row->replica_id]) {
+		struct request req;
+		xrow_decode_dml((struct xrow_header *)row, &req, 0);
+		/* Never confirm LSN out of order. */
+		panic("LSN for %u is used twice or COMMIT order is broken: "
+		      "confirmed: %lld, new: %lld, req: %s",
+		      (unsigned) row->replica_id,
+		      (long long) vclock->lsn[row->replica_id],
+		      (long long) row->lsn,
+		      request_str(&req));
+	}
+	return vclock_follow(vclock, row->replica_id, row->lsn);
+}
+
 #if defined(__cplusplus)
 } /* extern "C" */
 
diff --git a/src/errinj.h b/src/errinj.h
index 64d13b0..c658cd7 100644
--- a/src/errinj.h
+++ b/src/errinj.h
@@ -118,6 +118,8 @@ struct errinj {
 	_(ERRINJ_VY_LOG_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
 	_(ERRINJ_VY_RUN_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
 	_(ERRINJ_VY_INDEX_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
+	_(ERRINJ_RELAY_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \
+	_(ERRINJ_XLOG_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \
 
 ENUM0(errinj_id, ERRINJ_LIST);
 extern struct errinj errinjs[];
diff --git a/test/box/errinj.result b/test/box/errinj.result
index c7e4ce2..dca2c2d 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -16,6 +16,8 @@ errinj.info()
     state: 0
   ERRINJ_WAL_WRITE:
     state: false
+  ERRINJ_XLOG_BREAK_LSN:
+    state: -1
   ERRINJ_HTTPC_EXECUTE:
     state: false
   ERRINJ_VYRUN_DATA_READ:
@@ -60,6 +62,8 @@ errinj.info()
     state: false
   ERRINJ_WAL_WRITE_DISK:
     state: false
+  ERRINJ_RELAY_BREAK_LSN:
+    state: -1
   ERRINJ_VY_LOG_FILE_RENAME:
     state: false
   ERRINJ_VY_RUN_WRITE:
diff --git a/test/xlog/log_request_broken_lsn_panic.result b/test/xlog/log_request_broken_lsn_panic.result
new file mode 100644
index 0000000..2fd29c1
--- /dev/null
+++ b/test/xlog/log_request_broken_lsn_panic.result
@@ -0,0 +1,187 @@
+-- Issue 3105: Test logging of request with broken lsn before panicking
+-- Two cases are covered: Recovery and Initial Join
+env = require('test_run')
+---
+...
+test_run = env.new()
+---
+...
+fio = require('fio')
+---
+...
+test_run:cmd("setopt delimiter ';'")
+---
+- true
+...
+function grep_file_tail(filepath, bytes, pattern)
+    local fh = fio.open(filepath, {'O_RDONLY'})
+    local size = fh:seek(0, 'SEEK_END')
+    if size < bytes then
+        bytes = size
+    end
+    fh:seek(-bytes, 'SEEK_END')
+    local line = fh:read(bytes)
+    fh:close()
+    return string.match(line, pattern)
+end;
+---
+...
+test_run:cmd("setopt delimiter ''");
+---
+- true
+...
+-- Testing case of panic on recovery
+test_run:cmd('create server replica with script="xlog/panic_replica.lua"')
+---
+- true
+...
+test_run:cmd('start server replica')
+---
+- true
+...
+test_run:switch('replica')
+---
+- true
+...
+box.space._schema:replace{"t0", "v0"}
+---
+- ['t0', 'v0']
+...
+lsn = box.info.vclock[1]
+---
+...
+box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", lsn + 1)
+---
+- ok
+...
+box.space._schema:replace{"t0", "v1"}
+---
+- ['t0', 'v1']
+...
+box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", -1)
+---
+- ok
+...
+test_run:switch('default')
+---
+- true
+...
+test_run:cmd('stop server replica')
+---
+- true
+...
+dirname = fio.pathjoin(fio.cwd(), "panic_replica")
+---
+...
+xlogs = fio.glob(dirname .. "/*.xlog")
+---
+...
+fio.unlink(xlogs[#xlogs])
+---
+- true
+...
+test_run:cmd('start server replica with crash_expected=True')
+---
+- false
+...
+logpath = fio.pathjoin(fio.cwd(), 'panic_replica.log')
+---
+...
+-- Check that log contains the mention of broken LSN and the request printout
+grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 1, new: 1, req: {.*}")
+---
+- 'LSN for 1 is used twice or COMMIT order is broken: confirmed: 1, new: 1, req: {type:
+  ''REPLACE'', lsn: 1, space_id: 272, index_id: 0, tuple: ["t0", "v1"]}'
+...
+test_run:cmd('cleanup server replica')
+---
+- true
+...
+test_run:cmd('delete server replica')
+---
+- true
+...
+-- Testing case of panic on initial join
+test_run:cmd('create server leader with script="xlog/replica.lua"')
+---
+- true
+...
+test_run:cmd("start server leader")
+---
+- true
+...
+test_run:switch('leader')
+---
+- true
+...
+box.schema.user.grant('guest', 'super')
+---
+...
+_ = box.schema.space.create('test')
+---
+...
+_ = box.space.test:create_index('pk')
+---
+...
+box.space.test:auto_increment{'v0'}
+---
+- [1, 'v0']
+...
+lsn = box.info.vclock[1]
+---
+...
+box.error.injection.set("ERRINJ_RELAY_BREAK_LSN", lsn + 1)
+---
+- ok
+...
+box.space.test:auto_increment{'v1'}
+---
+- [2, 'v1']
+...
+test_run:cmd('create server replica with rpl_master=leader, script="xlog/panic_replica.lua"')
+---
+- true
+...
+test_run:cmd('start server replica with crash_expected=True')
+---
+- false
+...
+fiber = require('fiber')
+---
+...
+while box.info.replication == nil do fiber.sleep(0.001) end
+---
+...
+while box.info.replication[2].downstream.status ~= "stopped" do fiber.sleep(0.001) end
+---
+...
+test_run:switch('default')
+---
+- true
+...
+-- Check that log contains the mention of broken LSN and the request printout
+grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 6, new: 6, req: {.*}")
+---
+- 'LSN for 1 is used twice or COMMIT order is broken: confirmed: 6, new: 6, req: {type:
+  ''INSERT'', lsn: 6, space_id: 512, index_id: 0, tuple: [2, "v1"]}'
+...
+test_run:cmd('stop server leader')
+---
+- true
+...
+test_run:cmd('cleanup server leader')
+---
+- true
+...
+test_run:cmd('delete server leader')
+---
+- true
+...
+test_run:cmd('cleanup server replica')
+---
+- true
+...
+test_run:cmd('delete server replica')
+---
+- true
+...
diff --git a/test/xlog/log_request_broken_lsn_panic.test.lua b/test/xlog/log_request_broken_lsn_panic.test.lua
new file mode 100644
index 0000000..e0b3295
--- /dev/null
+++ b/test/xlog/log_request_broken_lsn_panic.test.lua
@@ -0,0 +1,78 @@
+-- Issue 3105: Test logging of request with broken lsn before panicking
+-- Two cases are covered: Recovery and Initial Join
+env = require('test_run')
+test_run = env.new()
+
+fio = require('fio')
+test_run:cmd("setopt delimiter ';'")
+function grep_file_tail(filepath, bytes, pattern)
+    local fh = fio.open(filepath, {'O_RDONLY'})
+    local size = fh:seek(0, 'SEEK_END')
+    if size < bytes then
+        bytes = size
+    end
+    fh:seek(-bytes, 'SEEK_END')
+    local line = fh:read(bytes)
+    fh:close()
+    return string.match(line, pattern)
+end;
+test_run:cmd("setopt delimiter ''");
+
+-- Testing case of panic on recovery
+test_run:cmd('create server replica with script="xlog/panic_replica.lua"')
+test_run:cmd('start server replica')
+test_run:switch('replica')
+
+box.space._schema:replace{"t0", "v0"}
+lsn = box.info.vclock[1]
+box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", lsn + 1)
+box.space._schema:replace{"t0", "v1"}
+box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", -1)
+
+test_run:switch('default')
+test_run:cmd('stop server replica')
+
+dirname = fio.pathjoin(fio.cwd(), "panic_replica")
+xlogs = fio.glob(dirname .. "/*.xlog")
+fio.unlink(xlogs[#xlogs])
+
+test_run:cmd('start server replica with crash_expected=True')
+
+logpath = fio.pathjoin(fio.cwd(), 'panic_replica.log')
+
+-- Check that log contains the mention of broken LSN and the request printout
+grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 1, new: 1, req: {.*}")
+
+test_run:cmd('cleanup server replica')
+test_run:cmd('delete server replica')
+
+-- Testing case of panic on initial join
+test_run:cmd('create server leader with script="xlog/replica.lua"')
+test_run:cmd("start server leader")
+test_run:switch('leader')
+
+box.schema.user.grant('guest', 'super')
+_ = box.schema.space.create('test')
+_ = box.space.test:create_index('pk')
+
+box.space.test:auto_increment{'v0'}
+lsn = box.info.vclock[1]
+box.error.injection.set("ERRINJ_RELAY_BREAK_LSN", lsn + 1)
+box.space.test:auto_increment{'v1'}
+
+test_run:cmd('create server replica with rpl_master=leader, script="xlog/panic_replica.lua"')
+test_run:cmd('start server replica with crash_expected=True')
+fiber = require('fiber')
+while box.info.replication == nil do fiber.sleep(0.001) end
+while box.info.replication[2].downstream.status ~= "stopped" do fiber.sleep(0.001) end
+
+test_run:switch('default')
+-- Check that log contains the mention of broken LSN and the request printout
+grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 6, new: 6, req: {.*}")
+
+test_run:cmd('stop server leader')
+test_run:cmd('cleanup server leader')
+test_run:cmd('delete server leader')
+
+test_run:cmd('cleanup server replica')
+test_run:cmd('delete server replica')
diff --git a/test/xlog/panic_replica.lua b/test/xlog/panic_replica.lua
new file mode 100644
index 0000000..ed0ba38
--- /dev/null
+++ b/test/xlog/panic_replica.lua
@@ -0,0 +1,9 @@
+#!/usr/bin/env tarantool
+
+box.cfg({
+    listen              = os.getenv("LISTEN"),
+    replication         = os.getenv("MASTER"),
+    memtx_memory        = 107374182,
+})
+
+require('console').listen(os.getenv('ADMIN'))
diff --git a/test/xlog/suite.ini b/test/xlog/suite.ini
index 8ee9720..f27b0f9 100644
--- a/test/xlog/suite.ini
+++ b/test/xlog/suite.ini
@@ -4,7 +4,7 @@ description = tarantool write ahead log tests
 script = xlog.lua
 disabled = snap_io_rate.test.lua
 valgrind_disabled =
-release_disabled = errinj.test.lua panic_on_lsn_gap.test.lua
+release_disabled = errinj.test.lua panic_on_lsn_gap.test.lua log_request_broken_lsn_panic.test.lua
 config = suite.cfg
 use_unix_sockets = True
 long_run = snap_io_rate.test.lua
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [tarantool-patches] [PATCH v2] xlog: add request details to panic message for broken LSN
  2018-08-27 11:22 [tarantool-patches] [PATCH v2] xlog: add request details to panic message for broken LSN Sergei Kalashnikov
@ 2018-08-27 17:04 ` Vladimir Davydov
  2018-08-28 16:27   ` Sergei Kalashnikov
  0 siblings, 1 reply; 6+ messages in thread
From: Vladimir Davydov @ 2018-08-27 17:04 UTC (permalink / raw)
  To: Sergei Kalashnikov; +Cc: tarantool-patches

On Mon, Aug 27, 2018 at 02:22:24PM +0300, Sergei Kalashnikov wrote:
> Aid the debugging of replication issues related to out-of-order
> requests. Adds the details of request/tuple to the diagnostic
> message whenever possible.
> 
> Closes #3105
> ---
> https://github.com/tarantool/tarantool/issues/3105
> https://github.com/tarantool/tarantool/commits/ztarvos/gh-3105-log-request-broken-lsn

Please write a brief change log here, after the hyperlinks,
when you re-submit a patch, e.g.

https://www.freelists.org/post/tarantool-patches/PATCH-v3-replication-do-not-ignore-replication-connect-quorum

> diff --git a/src/box/relay.cc b/src/box/relay.cc
> index 138e66d..2d0b651 100644
> --- a/src/box/relay.cc
> +++ b/src/box/relay.cc
> @@ -678,4 +678,12 @@ relay_send_row(struct xstream *stream, struct xrow_header *packet)
>  				      packet->replica_id)) {
>  		relay_send(relay, packet);
>  	}
> +#ifndef NDEBUG

You don't really need 'ifndef NDEBUG' here. Please remove.

> +	struct errinj *inj = errinj(ERRINJ_RELAY_BREAK_LSN, ERRINJ_INT);
> +	if (inj != NULL && packet->lsn == inj->iparam) {
> +		relay_send(relay, packet);
> +		diag_set(ClientError, ER_INJECTION,
> +			 "relay broken lsn injection lsn");

There's no point in setting diag here. Please remove. At the same time,
I think it would be useful if you logged the LSN of the row you mangled
with say_warn() or say_error().

Also, IMO the code would be clearer if you modified LSN of the row
before sending it instead of sending two rows.

> +	}
> +#endif
>  }
> diff --git a/src/box/xlog.c b/src/box/xlog.c
> index d399a72..46cf7ed 100644
> --- a/src/box/xlog.c
> +++ b/src/box/xlog.c
> @@ -1246,6 +1246,14 @@ xlog_write_row(struct xlog *log, const struct xrow_header *packet)
>  	size_t page_offset = obuf_size(&log->obuf);
>  	/** encode row into iovec */
>  	struct iovec iov[XROW_IOVMAX];
> +#ifndef NDEBUG

Again, ifndef and diag aren't really needed here while logging would
probably make sense.

> +	struct errinj *inj = errinj(ERRINJ_XLOG_BREAK_LSN, ERRINJ_INT);
> +	if (inj != NULL && inj->iparam == packet->lsn) {
> +		((struct xrow_header *)packet)->lsn = inj->iparam - 1;
> +		diag_set(ClientError, ER_INJECTION,
> +			 "xlog broken lsn injection lsn");
> +	}
> +#endif

I'd expect to see this error injection in wal.c - xlog_write_row() is
used not only for writing xlogs, but also run and snap files, we don't
want to mangle those.

>  	/** don't write sync to the disk */
>  	int iovcnt = xrow_header_encode(packet, 0, iov, 0);
>  	if (iovcnt < 0) {
> diff --git a/src/box/xrow.h b/src/box/xrow.h
> index 4721670..f7eda91 100644
> --- a/src/box/xrow.h
> +++ b/src/box/xrow.h
> @@ -522,6 +522,35 @@ xrow_to_iovec(const struct xrow_header *row, struct iovec *out);
>  void
>  xrow_decode_error(struct xrow_header *row);
>  
> +/**
> + * Update vclock with the next LSN value for given replica id.
> + * The function will cause panic if the next LSN happens to be
> + * out of order. The details of provided row are included into
> + * diagnostic message.
> + *
> + * @param vclock Vector clock.
> + * @param row Data row.
> + * @return Previous LSN value.
> + */
> +static inline int64_t
> +vclock_follow_xrow(struct vclock* vclock, const struct xrow_header *row)
> +{
> +	assert(row);
> +	assert(row->replica_id < VCLOCK_MAX);
> +	if (row->lsn <= vclock->lsn[row->replica_id]) {
> +		struct request req;
> +		xrow_decode_dml((struct xrow_header *)row, &req, 0);

xrow_decode_dml() may fail, you should handle it.

> +		/* Never confirm LSN out of order. */
> +		panic("LSN for %u is used twice or COMMIT order is broken: "
> +		      "confirmed: %lld, new: %lld, req: %s",
> +		      (unsigned) row->replica_id,
> +		      (long long) vclock->lsn[row->replica_id],
> +		      (long long) row->lsn,
> +		      request_str(&req));
> +	}
> +	return vclock_follow(vclock, row->replica_id, row->lsn);
> +}
> +
>  #if defined(__cplusplus)
>  } /* extern "C" */
>  
> diff --git a/test/xlog/log_request_broken_lsn_panic.test.lua b/test/xlog/log_request_broken_lsn_panic.test.lua

The test doesn't pass when run twice in a row:

  yes xlog/log_request_broken_lsn_panic.test.lua | head -n 10 | xargs ./test-run.py -j 1 --long

Please fix.

Also, the test name is rather cumbersome IMHO. What about
xlog/panic_on_broken_lsn.test.lua ?

> new file mode 100644
> index 0000000..e0b3295
> --- /dev/null
> +++ b/test/xlog/log_request_broken_lsn_panic.test.lua
> @@ -0,0 +1,78 @@
> +-- Issue 3105: Test logging of request with broken lsn before panicking
> +-- Two cases are covered: Recovery and Initial Join
> +env = require('test_run')
> +test_run = env.new()
> +
> +fio = require('fio')
> +test_run:cmd("setopt delimiter ';'")
> +function grep_file_tail(filepath, bytes, pattern)
> +    local fh = fio.open(filepath, {'O_RDONLY'})
> +    local size = fh:seek(0, 'SEEK_END')
> +    if size < bytes then
> +        bytes = size
> +    end
> +    fh:seek(-bytes, 'SEEK_END')
> +    local line = fh:read(bytes)
> +    fh:close()
> +    return string.match(line, pattern)
> +end;
> +test_run:cmd("setopt delimiter ''");
> +
> +-- Testing case of panic on recovery
> +test_run:cmd('create server replica with script="xlog/panic_replica.lua"')
> +test_run:cmd('start server replica')
> +test_run:switch('replica')
> +
> +box.space._schema:replace{"t0", "v0"}
> +lsn = box.info.vclock[1]
> +box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", lsn + 1)
> +box.space._schema:replace{"t0", "v1"}
> +box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", -1)
> +
> +test_run:switch('default')
> +test_run:cmd('stop server replica')
> +
> +dirname = fio.pathjoin(fio.cwd(), "panic_replica")
> +xlogs = fio.glob(dirname .. "/*.xlog")
> +fio.unlink(xlogs[#xlogs])
> +
> +test_run:cmd('start server replica with crash_expected=True')
> +
> +logpath = fio.pathjoin(fio.cwd(), 'panic_replica.log')
> +
> +-- Check that log contains the mention of broken LSN and the request printout
> +grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 1, new: 1, req: {.*}")
> +
> +test_run:cmd('cleanup server replica')
> +test_run:cmd('delete server replica')
> +
> +-- Testing case of panic on initial join

In tarantool the process of joining a new replica consists of two
phases: initial join, when the last checkpoint is sent, and final join,
when xlogs are sent. So this is final join, actually. Let's use phrase
'joining a new replica' in comments to avoid confusion.

> +test_run:cmd('create server leader with script="xlog/replica.lua"')

Why do you need a separate master? Why can't you simply bootstrap a
replica from the default instance? Then you wouldn't need to add the
new script, xlog/panic_replica.lua. I surmise this is because you need
to access box.info.replication below while on the default server the
replica doesn't necessarily have id 2. If this is the reason, then you
might want to look at test_run.cleanup_cluster().

> +test_run:cmd("start server leader")
> +test_run:switch('leader')
> +
> +box.schema.user.grant('guest', 'super')

box.schema.user.grant('guest', 'replication')

> +_ = box.schema.space.create('test')
> +_ = box.space.test:create_index('pk')
> +
> +box.space.test:auto_increment{'v0'}
> +lsn = box.info.vclock[1]
> +box.error.injection.set("ERRINJ_RELAY_BREAK_LSN", lsn + 1)
> +box.space.test:auto_increment{'v1'}
> +
> +test_run:cmd('create server replica with rpl_master=leader, script="xlog/panic_replica.lua"')
> +test_run:cmd('start server replica with crash_expected=True')
> +fiber = require('fiber')
> +while box.info.replication == nil do fiber.sleep(0.001) end
> +while box.info.replication[2].downstream.status ~= "stopped" do fiber.sleep(0.001) end
> +
> +test_run:switch('default')
> +-- Check that log contains the mention of broken LSN and the request printout
> +grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 6, new: 6, req: {.*}")
> +
> +test_run:cmd('stop server leader')
> +test_run:cmd('cleanup server leader')
> +test_run:cmd('delete server leader')
> +
> +test_run:cmd('cleanup server replica')
> +test_run:cmd('delete server replica')

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [tarantool-patches] [PATCH v2] xlog: add request details to panic message for broken LSN
  2018-08-27 17:04 ` Vladimir Davydov
@ 2018-08-28 16:27   ` Sergei Kalashnikov
  2018-08-29  9:52     ` Vladimir Davydov
  0 siblings, 1 reply; 6+ messages in thread
From: Sergei Kalashnikov @ 2018-08-28 16:27 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: tarantool-patches

Hi Vladimir,

Thank you for review. Please find my answers inline.
I also included the fixed patch at the end of this mail.


On Mon, Aug 27, 2018 at 08:04:07PM +0300, Vladimir Davydov wrote:
> On Mon, Aug 27, 2018 at 02:22:24PM +0300, Sergei Kalashnikov wrote:
> > Aid the debugging of replication issues related to out-of-order
> > requests. Adds the details of request/tuple to the diagnostic
> > message whenever possible.
> > 
> > Closes #3105
> > ---
> > https://github.com/tarantool/tarantool/issues/3105
> > https://github.com/tarantool/tarantool/commits/ztarvos/gh-3105-log-request-broken-lsn
> 
> Please write a brief change log here, after the hyperlinks,
> when you re-submit a patch, e.g.
> 
> https://www.freelists.org/post/tarantool-patches/PATCH-v3-replication-do-not-ignore-replication-connect-quorum
> 
OK, got it!

> > diff --git a/src/box/relay.cc b/src/box/relay.cc
> > index 138e66d..2d0b651 100644
> > --- a/src/box/relay.cc
> > +++ b/src/box/relay.cc
> > @@ -678,4 +678,12 @@ relay_send_row(struct xstream *stream, struct xrow_header *packet)
> >  				      packet->replica_id)) {
> >  		relay_send(relay, packet);
> >  	}
> > +#ifndef NDEBUG
> 
> You don't really need 'ifndef NDEBUG' here. Please remove.
> 
Done.

> > +	struct errinj *inj = errinj(ERRINJ_RELAY_BREAK_LSN, ERRINJ_INT);
> > +	if (inj != NULL && packet->lsn == inj->iparam) {
> > +		relay_send(relay, packet);
> > +		diag_set(ClientError, ER_INJECTION,
> > +			 "relay broken lsn injection lsn");
> 
> There's no point in setting diag here. Please remove. At the same time,
> I think it would be useful if you logged the LSN of the row you mangled
> with say_warn() or say_error().
> 
Fixed. Added say_warn with the lsn.

> Also, IMO the code would be clearer if you modified LSN of the row
> before sending it instead of sending two rows.
> 
Done.

> > +	}
> > +#endif
> >  }
> > diff --git a/src/box/xlog.c b/src/box/xlog.c
> > index d399a72..46cf7ed 100644
> > --- a/src/box/xlog.c
> > +++ b/src/box/xlog.c
> > @@ -1246,6 +1246,14 @@ xlog_write_row(struct xlog *log, const struct xrow_header *packet)
> >  	size_t page_offset = obuf_size(&log->obuf);
> >  	/** encode row into iovec */
> >  	struct iovec iov[XROW_IOVMAX];
> > +#ifndef NDEBUG
> 
> Again, ifndef and diag aren't really needed here while logging would
> probably make sense.
> 
Fixed.

> > +	struct errinj *inj = errinj(ERRINJ_XLOG_BREAK_LSN, ERRINJ_INT);
> > +	if (inj != NULL && inj->iparam == packet->lsn) {
> > +		((struct xrow_header *)packet)->lsn = inj->iparam - 1;
> > +		diag_set(ClientError, ER_INJECTION,
> > +			 "xlog broken lsn injection lsn");
> > +	}
> > +#endif
> 
> I'd expect to see this error injection in wal.c - xlog_write_row() is
> used not only for writing xlogs, but also run and snap files, we don't
> want to mangle those.
> 
Moved to wal.c

> >  	/** don't write sync to the disk */
> >  	int iovcnt = xrow_header_encode(packet, 0, iov, 0);
> >  	if (iovcnt < 0) {
> > diff --git a/src/box/xrow.h b/src/box/xrow.h
> > index 4721670..f7eda91 100644
> > --- a/src/box/xrow.h
> > +++ b/src/box/xrow.h
> > @@ -522,6 +522,35 @@ xrow_to_iovec(const struct xrow_header *row, struct iovec *out);
> >  void
> >  xrow_decode_error(struct xrow_header *row);
> >  
> > +/**
> > + * Update vclock with the next LSN value for given replica id.
> > + * The function will cause panic if the next LSN happens to be
> > + * out of order. The details of provided row are included into
> > + * diagnostic message.
> > + *
> > + * @param vclock Vector clock.
> > + * @param row Data row.
> > + * @return Previous LSN value.
> > + */
> > +static inline int64_t
> > +vclock_follow_xrow(struct vclock* vclock, const struct xrow_header *row)
> > +{
> > +	assert(row);
> > +	assert(row->replica_id < VCLOCK_MAX);
> > +	if (row->lsn <= vclock->lsn[row->replica_id]) {
> > +		struct request req;
> > +		xrow_decode_dml((struct xrow_header *)row, &req, 0);
> 
> xrow_decode_dml() may fail, you should handle it.
> 
Agree, fixed.
> > +		/* Never confirm LSN out of order. */
> > +		panic("LSN for %u is used twice or COMMIT order is broken: "
> > +		      "confirmed: %lld, new: %lld, req: %s",
> > +		      (unsigned) row->replica_id,
> > +		      (long long) vclock->lsn[row->replica_id],
> > +		      (long long) row->lsn,
> > +		      request_str(&req));
> > +	}
> > +	return vclock_follow(vclock, row->replica_id, row->lsn);
> > +}
> > +
> >  #if defined(__cplusplus)
> >  } /* extern "C" */
> >  
> > diff --git a/test/xlog/log_request_broken_lsn_panic.test.lua b/test/xlog/log_request_broken_lsn_panic.test.lua
> 
> The test doesn't pass when run twice in a row:
> 
>   yes xlog/log_request_broken_lsn_panic.test.lua | head -n 10 | xargs ./test-run.py -j 1 --long
> 
> Please fix.

I corrected the test. My understanding is that the issue stemmed
from the fact that I used the xlog/replica.lua script without specifying
repl_master parameter and on the second test run, the environment already
contained the MASTER variable set from the previous run, so it caused a
failed replication attempt.

In order to fix this, I replaced the replica.lua script with panic.lua
since the latter doesn't expect replication config.

> 
> Also, the test name is rather cumbersome IMHO. What about
> xlog/panic_on_broken_lsn.test.lua ?
> 
OK.

> > new file mode 100644
> > index 0000000..e0b3295
> > --- /dev/null
> > +++ b/test/xlog/log_request_broken_lsn_panic.test.lua
> > @@ -0,0 +1,78 @@
> > +-- Issue 3105: Test logging of request with broken lsn before panicking
> > +-- Two cases are covered: Recovery and Initial Join
> > +env = require('test_run')
> > +test_run = env.new()
> > +
> > +fio = require('fio')
> > +test_run:cmd("setopt delimiter ';'")
> > +function grep_file_tail(filepath, bytes, pattern)
> > +    local fh = fio.open(filepath, {'O_RDONLY'})
> > +    local size = fh:seek(0, 'SEEK_END')
> > +    if size < bytes then
> > +        bytes = size
> > +    end
> > +    fh:seek(-bytes, 'SEEK_END')
> > +    local line = fh:read(bytes)
> > +    fh:close()
> > +    return string.match(line, pattern)
> > +end;
> > +test_run:cmd("setopt delimiter ''");
> > +
> > +-- Testing case of panic on recovery
> > +test_run:cmd('create server replica with script="xlog/panic_replica.lua"')
> > +test_run:cmd('start server replica')
> > +test_run:switch('replica')
> > +
> > +box.space._schema:replace{"t0", "v0"}
> > +lsn = box.info.vclock[1]
> > +box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", lsn + 1)
> > +box.space._schema:replace{"t0", "v1"}
> > +box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", -1)
> > +
> > +test_run:switch('default')
> > +test_run:cmd('stop server replica')
> > +
> > +dirname = fio.pathjoin(fio.cwd(), "panic_replica")
> > +xlogs = fio.glob(dirname .. "/*.xlog")
> > +fio.unlink(xlogs[#xlogs])
> > +
> > +test_run:cmd('start server replica with crash_expected=True')
> > +
> > +logpath = fio.pathjoin(fio.cwd(), 'panic_replica.log')
> > +
> > +-- Check that log contains the mention of broken LSN and the request printout
> > +grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 1, new: 1, req: {.*}")
> > +
> > +test_run:cmd('cleanup server replica')
> > +test_run:cmd('delete server replica')
> > +
> > +-- Testing case of panic on initial join
> 
> In tarantool the process of joining a new replica consists of two
> phases: initial join, when the last checkpoint is sent, and final join,
> when xlogs are sent. So this is final join, actually. Let's use phrase
> 'joining a new replica' in comments to avoid confusion.
> 
OK.

> > +test_run:cmd('create server leader with script="xlog/replica.lua"')
> 
> Why do you need a separate master? Why can't you simply bootstrap a
> replica from the default instance? Then you wouldn't need to add the
> new script, xlog/panic_replica.lua. I surmise this is because you need
> to access box.info.replication below while on the default server the
> replica doesn't necessarily have id 2. If this is the reason, then you
> might want to look at test_run.cleanup_cluster().
> 

In the fixed patch, I removed the extra script xlog/panic_replica.lua.
I also added the default server restart to the beginning and the very
end of the test. I expect this will guarantee that replica will always 
have the id 2.

> > +test_run:cmd("start server leader")
> > +test_run:switch('leader')
> > +
> > +box.schema.user.grant('guest', 'super')
> 
> box.schema.user.grant('guest', 'replication')
> 
> > +_ = box.schema.space.create('test')
> > +_ = box.space.test:create_index('pk')
> > +
> > +box.space.test:auto_increment{'v0'}
> > +lsn = box.info.vclock[1]
> > +box.error.injection.set("ERRINJ_RELAY_BREAK_LSN", lsn + 1)
> > +box.space.test:auto_increment{'v1'}
> > +
> > +test_run:cmd('create server replica with rpl_master=leader, script="xlog/panic_replica.lua"')
> > +test_run:cmd('start server replica with crash_expected=True')
> > +fiber = require('fiber')
> > +while box.info.replication == nil do fiber.sleep(0.001) end
> > +while box.info.replication[2].downstream.status ~= "stopped" do fiber.sleep(0.001) end
> > +
> > +test_run:switch('default')
> > +-- Check that log contains the mention of broken LSN and the request printout
> > +grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 6, new: 6, req: {.*}")
> > +
> > +test_run:cmd('stop server leader')
> > +test_run:cmd('cleanup server leader')
> > +test_run:cmd('delete server leader')
> > +
> > +test_run:cmd('cleanup server replica')
> > +test_run:cmd('delete server replica')

Please find the fixed patch below.
============================================================================

From 8299d697f0dff8d552c84d1450f70cf5fbde7e55 Mon Sep 17 00:00:00 2001
From: Sergei Kalashnikov <ztarvos@gmail.com>
Date: Tue, 28 Aug 2018 18:05:34 +0300
Subject: [PATCH v2] xlog: add request details to panic message for broken LSN

Aid the debugging of replication issues related to out-of-order
requests. Adds the details of request/tuple to the diagnostic
message whenever possible.

Closes #3105
---
https://github.com/tarantool/tarantool/issues/3105
https://github.com/tarantool/tarantool/commits/ztarvos/gh-3105-log-request-broken-lsn

Changes in v2:
 - Removed vclock_follow_unsafe()
 - Renamed the new function vclock_follow_xrow(),
   removed extra params and moved it to the header.
 - Added error injections and simplified the test.
 - Added test for replica join case.
 - Removed trailing spaces.


 src/box/applier.cc                     |   6 +-
 src/box/recovery.cc                    |   4 +-
 src/box/relay.cc                       |   6 ++
 src/box/vclock.c                       |   8 +-
 src/box/vclock.h                       |   8 ++
 src/box/wal.c                          |  13 ++-
 src/box/xrow.h                         |  31 ++++++
 src/errinj.h                           |   2 +
 test/box/errinj.result                 |   4 +
 test/xlog/panic_on_broken_lsn.result   | 168 +++++++++++++++++++++++++++++++++
 test/xlog/panic_on_broken_lsn.test.lua |  72 ++++++++++++++
 test/xlog/suite.ini                    |   2 +-
 12 files changed, 305 insertions(+), 19 deletions(-)
 create mode 100644 test/xlog/panic_on_broken_lsn.result
 create mode 100644 test/xlog/panic_on_broken_lsn.test.lua

diff --git a/src/box/applier.cc b/src/box/applier.cc
index dbb4d05..5d87867 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -336,8 +336,7 @@ applier_join(struct applier *applier)
 		coio_read_xrow(coio, ibuf, &row);
 		applier->last_row_time = ev_monotonic_now(loop());
 		if (iproto_type_is_dml(row.type)) {
-			vclock_follow(&replicaset.vclock, row.replica_id,
-				      row.lsn);
+			vclock_follow_xrow(&replicaset.vclock, &row);
 			xstream_write_xc(applier->subscribe_stream, &row);
 		} else if (row.type == IPROTO_OK) {
 			/*
@@ -503,8 +502,7 @@ applier_subscribe(struct applier *applier)
 			 * the row is skipped when the replication
 			 * is resumed.
 			 */
-			vclock_follow(&replicaset.vclock, row.replica_id,
-				      row.lsn);
+			vclock_follow_xrow(&replicaset.vclock, &row);
 			struct replica *replica = replica_by_id(row.replica_id);
 			struct latch *latch = (replica ? &replica->order_latch :
 					       &replicaset.applier.order_latch);
diff --git a/src/box/recovery.cc b/src/box/recovery.cc
index fe14def..77357f0 100644
--- a/src/box/recovery.cc
+++ b/src/box/recovery.cc
@@ -135,7 +135,7 @@ recovery_scan(struct recovery *r, struct vclock *end_vclock)
 		return;
 	struct xrow_header row;
 	while (xlog_cursor_next(&cursor, &row, true) == 0)
-		vclock_follow(end_vclock, row.replica_id, row.lsn);
+		vclock_follow_xrow(end_vclock, &row);
 	xlog_cursor_close(&cursor, false);
 }
 
@@ -266,7 +266,7 @@ recover_xlog(struct recovery *r, struct xstream *stream,
 		 * in case of forced recovery, when we skip the
 		 * failed row anyway.
 		 */
-		vclock_follow(&r->vclock,  row.replica_id, row.lsn);
+		vclock_follow_xrow(&r->vclock, &row);
 		if (xstream_write(stream, &row) == 0) {
 			++row_count;
 			if (row_count % 100000 == 0)
diff --git a/src/box/relay.cc b/src/box/relay.cc
index 138e66d..9f07655 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -676,6 +676,12 @@ relay_send_row(struct xstream *stream, struct xrow_header *packet)
 	if (packet->replica_id != relay->replica->id ||
 	    packet->lsn <= vclock_get(&relay->local_vclock_at_subscribe,
 				      packet->replica_id)) {
+		struct errinj *inj = errinj(ERRINJ_RELAY_BREAK_LSN,
+					    ERRINJ_INT);
+		if (inj != NULL && packet->lsn == inj->iparam) {
+			packet->lsn = inj->iparam - 1;
+			say_warn("injected broken lsn: %lld", packet->lsn);
+		}
 		relay_send(relay, packet);
 	}
 }
diff --git a/src/box/vclock.c b/src/box/vclock.c
index 2189ea8..b5eb280 100644
--- a/src/box/vclock.c
+++ b/src/box/vclock.c
@@ -42,13 +42,7 @@ vclock_follow(struct vclock *vclock, uint32_t replica_id, int64_t lsn)
 	assert(lsn >= 0);
 	assert(replica_id < VCLOCK_MAX);
 	int64_t prev_lsn = vclock->lsn[replica_id];
-	if (lsn <= prev_lsn) {
-		/* Never confirm LSN out of order. */
-		panic("LSN for %u is used twice or COMMIT order is broken: "
-		      "confirmed: %lld, new: %lld",
-		      (unsigned) replica_id,
-		      (long long) prev_lsn, (long long) lsn);
-	}
+	assert(lsn > prev_lsn);
 	/* Easier add each time than check. */
 	vclock->map |= 1 << replica_id;
 	vclock->lsn[replica_id] = lsn;
diff --git a/src/box/vclock.h b/src/box/vclock.h
index 928717c..111e291 100644
--- a/src/box/vclock.h
+++ b/src/box/vclock.h
@@ -199,6 +199,14 @@ vclock_sum(const struct vclock *vclock)
 	return vclock->signature;
 }
 
+/**
+ * Update vclock with the next LSN value for given replica id.
+ *
+ * @param vclock Vector clock.
+ * @param replica_id Replica identifier.
+ * @param lsn Next lsn.
+ * @return previous lsn value.
+ */
 int64_t
 vclock_follow(struct vclock *vclock, uint32_t replica_id, int64_t lsn);
 
diff --git a/src/box/wal.c b/src/box/wal.c
index cd068a4..ade2c71 100644
--- a/src/box/wal.c
+++ b/src/box/wal.c
@@ -186,6 +186,11 @@ xlog_write_entry(struct xlog *l, struct journal_entry *entry)
 	struct xrow_header **row = entry->rows;
 	for (; row < entry->rows + entry->n_rows; row++) {
 		(*row)->tm = ev_now(loop());
+		struct errinj *inj = errinj(ERRINJ_XLOG_BREAK_LSN, ERRINJ_INT);
+		if (inj != NULL && inj->iparam == (*row)->lsn) {
+			(*row)->lsn = inj->iparam - 1;
+			say_warn("injected broken lsn: %lld", (*row)->lsn);
+		}
 		if (xlog_write_row(l, *row) < 0) {
 			/*
 			 * Rollback all un-written rows
@@ -652,8 +657,7 @@ wal_assign_lsn(struct wal_writer *writer, struct xrow_header **row,
 			(*row)->lsn = vclock_inc(&writer->vclock, instance_id);
 			(*row)->replica_id = instance_id;
 		} else {
-			vclock_follow(&writer->vclock, (*row)->replica_id,
-				      (*row)->lsn);
+			vclock_follow_xrow(&writer->vclock, *row);
 		}
 	}
 }
@@ -878,9 +882,8 @@ wal_write(struct journal *journal, struct journal_entry *entry)
 				 */
 				if (vclock_get(&replicaset.vclock,
 					       instance_id) < (*last)->lsn) {
-					vclock_follow(&replicaset.vclock,
-						      instance_id,
-						      (*last)->lsn);
+					vclock_follow_xrow(&replicaset.vclock,
+							   *last);
 				}
 				break;
 			}
diff --git a/src/box/xrow.h b/src/box/xrow.h
index 4721670..3fc007a 100644
--- a/src/box/xrow.h
+++ b/src/box/xrow.h
@@ -522,6 +522,37 @@ xrow_to_iovec(const struct xrow_header *row, struct iovec *out);
 void
 xrow_decode_error(struct xrow_header *row);
 
+/**
+ * Update vclock with the next LSN value for given replica id.
+ * The function will cause panic if the next LSN happens to be
+ * out of order. The details of provided row are included into
+ * diagnostic message.
+ *
+ * @param vclock Vector clock.
+ * @param row Data row.
+ * @return Previous LSN value.
+ */
+static inline int64_t
+vclock_follow_xrow(struct vclock* vclock, const struct xrow_header *row)
+{
+	assert(row);
+	assert(row->replica_id < VCLOCK_MAX);
+	if (row->lsn <= vclock->lsn[row->replica_id]) {
+		struct request req;
+		const char *req_str = "n/a";
+		if (xrow_decode_dml((struct xrow_header *)row, &req, 0) == 0)
+			req_str = request_str(&req);
+		/* Never confirm LSN out of order. */
+		panic("LSN for %u is used twice or COMMIT order is broken: "
+		      "confirmed: %lld, new: %lld, req: %s",
+		      (unsigned) row->replica_id,
+		      (long long) vclock->lsn[row->replica_id],
+		      (long long) row->lsn,
+		      req_str);
+	}
+	return vclock_follow(vclock, row->replica_id, row->lsn);
+}
+
 #if defined(__cplusplus)
 } /* extern "C" */
 
diff --git a/src/errinj.h b/src/errinj.h
index 64d13b0..c658cd7 100644
--- a/src/errinj.h
+++ b/src/errinj.h
@@ -118,6 +118,8 @@ struct errinj {
 	_(ERRINJ_VY_LOG_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
 	_(ERRINJ_VY_RUN_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
 	_(ERRINJ_VY_INDEX_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
+	_(ERRINJ_RELAY_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \
+	_(ERRINJ_XLOG_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \
 
 ENUM0(errinj_id, ERRINJ_LIST);
 extern struct errinj errinjs[];
diff --git a/test/box/errinj.result b/test/box/errinj.result
index c7e4ce2..dca2c2d 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -16,6 +16,8 @@ errinj.info()
     state: 0
   ERRINJ_WAL_WRITE:
     state: false
+  ERRINJ_XLOG_BREAK_LSN:
+    state: -1
   ERRINJ_HTTPC_EXECUTE:
     state: false
   ERRINJ_VYRUN_DATA_READ:
@@ -60,6 +62,8 @@ errinj.info()
     state: false
   ERRINJ_WAL_WRITE_DISK:
     state: false
+  ERRINJ_RELAY_BREAK_LSN:
+    state: -1
   ERRINJ_VY_LOG_FILE_RENAME:
     state: false
   ERRINJ_VY_RUN_WRITE:
diff --git a/test/xlog/panic_on_broken_lsn.result b/test/xlog/panic_on_broken_lsn.result
new file mode 100644
index 0000000..b14514b
--- /dev/null
+++ b/test/xlog/panic_on_broken_lsn.result
@@ -0,0 +1,168 @@
+-- Issue 3105: Test logging of request with broken lsn before panicking
+-- Two cases are covered: Recovery and Joining a new replica
+env = require('test_run')
+---
+...
+test_run = env.new()
+---
+...
+test_run:cmd('restart server default with cleanup=True')
+fio = require('fio')
+---
+...
+test_run:cmd("setopt delimiter ';'")
+---
+- true
+...
+function grep_file_tail(filepath, bytes, pattern)
+    local fh = fio.open(filepath, {'O_RDONLY'})
+    local size = fh:seek(0, 'SEEK_END')
+    if size < bytes then
+        bytes = size
+    end
+    fh:seek(-bytes, 'SEEK_END')
+    local line = fh:read(bytes)
+    fh:close()
+    return string.match(line, pattern)
+end;
+---
+...
+test_run:cmd("setopt delimiter ''");
+---
+- true
+...
+-- Testing case of panic on recovery
+test_run:cmd('create server panic with script="xlog/panic.lua"')
+---
+- true
+...
+test_run:cmd('start server panic')
+---
+- true
+...
+test_run:switch('panic')
+---
+- true
+...
+box.space._schema:replace{"t0", "v0"}
+---
+- ['t0', 'v0']
+...
+lsn = box.info.vclock[1]
+---
+...
+box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", lsn + 1)
+---
+- ok
+...
+box.space._schema:replace{"t0", "v1"}
+---
+- ['t0', 'v1']
+...
+box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", -1)
+---
+- ok
+...
+test_run:switch('default')
+---
+- true
+...
+test_run:cmd('stop server panic')
+---
+- true
+...
+dirname = fio.pathjoin(fio.cwd(), "panic")
+---
+...
+xlogs = fio.glob(dirname .. "/*.xlog")
+---
+...
+fio.unlink(xlogs[#xlogs])
+---
+- true
+...
+test_run:cmd('start server panic with crash_expected=True')
+---
+- false
+...
+logpath = fio.pathjoin(fio.cwd(), 'panic.log')
+---
+...
+-- Check that log contains the mention of broken LSN and the request printout
+grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 1, new: 1, req: {.*}")
+---
+- 'LSN for 1 is used twice or COMMIT order is broken: confirmed: 1, new: 1, req: {type:
+  ''REPLACE'', lsn: 1, space_id: 272, index_id: 0, tuple: ["t0", "v1"]}'
+...
+test_run:cmd('cleanup server panic')
+---
+- true
+...
+test_run:cmd('delete server panic')
+---
+- true
+...
+-- Testing case of panic on joining a new replica
+box.schema.user.grant('guest', 'super')
+---
+...
+_ = box.schema.space.create('test')
+---
+...
+_ = box.space.test:create_index('pk')
+---
+...
+box.space.test:auto_increment{'v0'}
+---
+- [1, 'v0']
+...
+lsn = box.info.vclock[1]
+---
+...
+box.error.injection.set("ERRINJ_RELAY_BREAK_LSN", lsn + 1)
+---
+- ok
+...
+box.space.test:auto_increment{'v1'}
+---
+- [2, 'v1']
+...
+test_run:cmd('create server replica with rpl_master=default, script="xlog/replica.lua"')
+---
+- true
+...
+test_run:cmd('start server replica with crash_expected=True')
+---
+- false
+...
+fiber = require('fiber')
+---
+...
+while box.info.replication == nil do fiber.sleep(0.001) end
+---
+...
+while box.info.replication[2].downstream.status ~= "stopped" do fiber.sleep(0.001) end
+---
+...
+test_run:switch('default')
+---
+- true
+...
+logpath = fio.pathjoin(fio.cwd(), 'replica.log')
+---
+...
+-- Check that log contains the mention of broken LSN and the request printout
+grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 5, new: 5, req: {.*}")
+---
+- 'LSN for 1 is used twice or COMMIT order is broken: confirmed: 5, new: 5, req: {type:
+  ''INSERT'', lsn: 5, space_id: 512, index_id: 0, tuple: [2, "v1"]}'
+...
+test_run:cmd('cleanup server replica')
+---
+- true
+...
+test_run:cmd('delete server replica')
+---
+- true
+...
+test_run:cmd('restart server default with cleanup=True')
diff --git a/test/xlog/panic_on_broken_lsn.test.lua b/test/xlog/panic_on_broken_lsn.test.lua
new file mode 100644
index 0000000..d4abf6f
--- /dev/null
+++ b/test/xlog/panic_on_broken_lsn.test.lua
@@ -0,0 +1,72 @@
+-- Issue 3105: Test logging of request with broken lsn before panicking
+-- Two cases are covered: Recovery and Joining a new replica
+env = require('test_run')
+test_run = env.new()
+test_run:cmd('restart server default with cleanup=True')
+
+fio = require('fio')
+test_run:cmd("setopt delimiter ';'")
+function grep_file_tail(filepath, bytes, pattern)
+    local fh = fio.open(filepath, {'O_RDONLY'})
+    local size = fh:seek(0, 'SEEK_END')
+    if size < bytes then
+        bytes = size
+    end
+    fh:seek(-bytes, 'SEEK_END')
+    local line = fh:read(bytes)
+    fh:close()
+    return string.match(line, pattern)
+end;
+test_run:cmd("setopt delimiter ''");
+
+-- Testing case of panic on recovery
+test_run:cmd('create server panic with script="xlog/panic.lua"')
+test_run:cmd('start server panic')
+test_run:switch('panic')
+
+box.space._schema:replace{"t0", "v0"}
+lsn = box.info.vclock[1]
+box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", lsn + 1)
+box.space._schema:replace{"t0", "v1"}
+box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", -1)
+
+test_run:switch('default')
+test_run:cmd('stop server panic')
+
+dirname = fio.pathjoin(fio.cwd(), "panic")
+xlogs = fio.glob(dirname .. "/*.xlog")
+fio.unlink(xlogs[#xlogs])
+
+test_run:cmd('start server panic with crash_expected=True')
+
+logpath = fio.pathjoin(fio.cwd(), 'panic.log')
+
+-- Check that log contains the mention of broken LSN and the request printout
+grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 1, new: 1, req: {.*}")
+
+test_run:cmd('cleanup server panic')
+test_run:cmd('delete server panic')
+
+-- Testing case of panic on joining a new replica
+box.schema.user.grant('guest', 'super')
+_ = box.schema.space.create('test')
+_ = box.space.test:create_index('pk')
+box.space.test:auto_increment{'v0'}
+lsn = box.info.vclock[1]
+box.error.injection.set("ERRINJ_RELAY_BREAK_LSN", lsn + 1)
+box.space.test:auto_increment{'v1'}
+
+test_run:cmd('create server replica with rpl_master=default, script="xlog/replica.lua"')
+test_run:cmd('start server replica with crash_expected=True')
+fiber = require('fiber')
+while box.info.replication == nil do fiber.sleep(0.001) end
+while box.info.replication[2].downstream.status ~= "stopped" do fiber.sleep(0.001) end
+
+test_run:switch('default')
+logpath = fio.pathjoin(fio.cwd(), 'replica.log')
+-- Check that log contains the mention of broken LSN and the request printout
+grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 5, new: 5, req: {.*}")
+
+test_run:cmd('cleanup server replica')
+test_run:cmd('delete server replica')
+test_run:cmd('restart server default with cleanup=True')
diff --git a/test/xlog/suite.ini b/test/xlog/suite.ini
index 8ee9720..ffc812e 100644
--- a/test/xlog/suite.ini
+++ b/test/xlog/suite.ini
@@ -4,7 +4,7 @@ description = tarantool write ahead log tests
 script = xlog.lua
 disabled = snap_io_rate.test.lua
 valgrind_disabled =
-release_disabled = errinj.test.lua panic_on_lsn_gap.test.lua
+release_disabled = errinj.test.lua panic_on_lsn_gap.test.lua panic_on_broken_lsn.test.lua
 config = suite.cfg
 use_unix_sockets = True
 long_run = snap_io_rate.test.lua
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [tarantool-patches] [PATCH v2] xlog: add request details to panic message for broken LSN
  2018-08-28 16:27   ` Sergei Kalashnikov
@ 2018-08-29  9:52     ` Vladimir Davydov
  2018-08-29 15:21       ` Sergei Kalashnikov
  0 siblings, 1 reply; 6+ messages in thread
From: Vladimir Davydov @ 2018-08-29  9:52 UTC (permalink / raw)
  To: Sergei Kalashnikov; +Cc: tarantool-patches

On Tue, Aug 28, 2018 at 07:27:41PM +0300, Sergei Kalashnikov wrote:
> diff --git a/src/box/relay.cc b/src/box/relay.cc
> index 138e66d..9f07655 100644
> --- a/src/box/relay.cc
> +++ b/src/box/relay.cc
> @@ -676,6 +676,12 @@ relay_send_row(struct xstream *stream, struct xrow_header *packet)
>  	if (packet->replica_id != relay->replica->id ||
>  	    packet->lsn <= vclock_get(&relay->local_vclock_at_subscribe,
>  				      packet->replica_id)) {
> +		struct errinj *inj = errinj(ERRINJ_RELAY_BREAK_LSN,
> +					    ERRINJ_INT);
> +		if (inj != NULL && packet->lsn == inj->iparam) {
> +			packet->lsn = inj->iparam - 1;
> +			say_warn("injected broken lsn: %lld", packet->lsn);

Nit: int64_t isn't guaranteed to be equivalent to long long. Please
converse packet->lsn to long long explicitly.

> +		}
>  		relay_send(relay, packet);
>  	}
>  }
> diff --git a/src/box/wal.c b/src/box/wal.c
> index cd068a4..ade2c71 100644
> --- a/src/box/wal.c
> +++ b/src/box/wal.c
> @@ -186,6 +186,11 @@ xlog_write_entry(struct xlog *l, struct journal_entry *entry)
>  	struct xrow_header **row = entry->rows;
>  	for (; row < entry->rows + entry->n_rows; row++) {
>  		(*row)->tm = ev_now(loop());
> +		struct errinj *inj = errinj(ERRINJ_XLOG_BREAK_LSN, ERRINJ_INT);
> +		if (inj != NULL && inj->iparam == (*row)->lsn) {
> +			(*row)->lsn = inj->iparam - 1;
> +			say_warn("injected broken lsn: %lld", (*row)->lsn);

Ditto.

> +		}
>  		if (xlog_write_row(l, *row) < 0) {
>  			/*
>  			 * Rollback all un-written rows
> diff --git a/src/errinj.h b/src/errinj.h
> index 64d13b0..c658cd7 100644
> --- a/src/errinj.h
> +++ b/src/errinj.h
> @@ -118,6 +118,8 @@ struct errinj {
>  	_(ERRINJ_VY_LOG_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
>  	_(ERRINJ_VY_RUN_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
>  	_(ERRINJ_VY_INDEX_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
> +	_(ERRINJ_RELAY_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \
> +	_(ERRINJ_XLOG_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \

Nit: please rename ERRINJ_XLOG_BREAK_LSN to ERRINJ_WAL_BREAK_LSN -
it's used by the WAL thread, after all.

> diff --git a/test/xlog/panic_on_broken_lsn.test.lua b/test/xlog/panic_on_broken_lsn.test.lua
> new file mode 100644
> index 0000000..d4abf6f
> --- /dev/null
> +++ b/test/xlog/panic_on_broken_lsn.test.lua
> @@ -0,0 +1,72 @@
> +-- Issue 3105: Test logging of request with broken lsn before panicking
> +-- Two cases are covered: Recovery and Joining a new replica
> +env = require('test_run')
> +test_run = env.new()
> +test_run:cmd('restart server default with cleanup=True')

We strive to make our tests run as fast as possible, that's why we don't
restart the default instance before/after each test. Instead we clean up
only what we have to. In your case, it means revoking privileges you
granted, dropping spaces you created, and disabling error injections you
enabled.

Since in your test you assume that a new replica is assigned id 2, you
should call test_run:cleanup_cluster() in the beginning of this test.
This is much cheaper than restarting the default instance.

If you don't restart the default instance, LSNs and space ids might be
different from run to run. That's OK, you can filter them out in the
test output, e.g. see

https://github.com/tarantool/tarantool/blob/ac17838c9634ba7ebe329290ebd3314e37ec2255/test/replication/status.test.lua#L117

> +
> +fio = require('fio')
> +test_run:cmd("setopt delimiter ';'")
> +function grep_file_tail(filepath, bytes, pattern)
> +    local fh = fio.open(filepath, {'O_RDONLY'})
> +    local size = fh:seek(0, 'SEEK_END')
> +    if size < bytes then
> +        bytes = size
> +    end
> +    fh:seek(-bytes, 'SEEK_END')
> +    local line = fh:read(bytes)
> +    fh:close()
> +    return string.match(line, pattern)
> +end;
> +test_run:cmd("setopt delimiter ''");
> +
> +-- Testing case of panic on recovery
> +test_run:cmd('create server panic with script="xlog/panic.lua"')
> +test_run:cmd('start server panic')
> +test_run:switch('panic')
> +
> +box.space._schema:replace{"t0", "v0"}
> +lsn = box.info.vclock[1]
> +box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", lsn + 1)
> +box.space._schema:replace{"t0", "v1"}
> +box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", -1)
> +
> +test_run:switch('default')
> +test_run:cmd('stop server panic')
> +
> +dirname = fio.pathjoin(fio.cwd(), "panic")
> +xlogs = fio.glob(dirname .. "/*.xlog")
> +fio.unlink(xlogs[#xlogs])

I don't understand why you need to delete the last xlog here.

> +
> +test_run:cmd('start server panic with crash_expected=True')
> +
> +logpath = fio.pathjoin(fio.cwd(), 'panic.log')
> +
> +-- Check that log contains the mention of broken LSN and the request printout
> +grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 1, new: 1, req: {.*}")
> +
> +test_run:cmd('cleanup server panic')
> +test_run:cmd('delete server panic')
> +
> +-- Testing case of panic on joining a new replica
> +box.schema.user.grant('guest', 'super')

Please instead use

box.schema.user.grant('guest', 'replication')

> +_ = box.schema.space.create('test')
> +_ = box.space.test:create_index('pk')
> +box.space.test:auto_increment{'v0'}
> +lsn = box.info.vclock[1]
> +box.error.injection.set("ERRINJ_RELAY_BREAK_LSN", lsn + 1)
> +box.space.test:auto_increment{'v1'}
> +
> +test_run:cmd('create server replica with rpl_master=default, script="xlog/replica.lua"')
> +test_run:cmd('start server replica with crash_expected=True')
> +fiber = require('fiber')
> +while box.info.replication == nil do fiber.sleep(0.001) end
> +while box.info.replication[2].downstream.status ~= "stopped" do fiber.sleep(0.001) end
> +
> +test_run:switch('default')

Pointless 'switch' - the test is already running on behalf of the
default instance here.

> +logpath = fio.pathjoin(fio.cwd(), 'replica.log')
> +-- Check that log contains the mention of broken LSN and the request printout
> +grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 5, new: 5, req: {.*}")
> +
> +test_run:cmd('cleanup server replica')
> +test_run:cmd('delete server replica')
> +test_run:cmd('restart server default with cleanup=True')

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [tarantool-patches] [PATCH v2] xlog: add request details to panic message for broken LSN
  2018-08-29  9:52     ` Vladimir Davydov
@ 2018-08-29 15:21       ` Sergei Kalashnikov
  2018-08-29 16:31         ` Vladimir Davydov
  0 siblings, 1 reply; 6+ messages in thread
From: Sergei Kalashnikov @ 2018-08-29 15:21 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: tarantool-patches

On Wed, Aug 29, 2018 at 12:52:47PM +0300, Vladimir Davydov wrote:
> On Tue, Aug 28, 2018 at 07:27:41PM +0300, Sergei Kalashnikov wrote:
> > diff --git a/src/box/relay.cc b/src/box/relay.cc
> > index 138e66d..9f07655 100644
> > --- a/src/box/relay.cc
> > +++ b/src/box/relay.cc
> > @@ -676,6 +676,12 @@ relay_send_row(struct xstream *stream, struct xrow_header *packet)
> >  	if (packet->replica_id != relay->replica->id ||
> >  	    packet->lsn <= vclock_get(&relay->local_vclock_at_subscribe,
> >  				      packet->replica_id)) {
> > +		struct errinj *inj = errinj(ERRINJ_RELAY_BREAK_LSN,
> > +					    ERRINJ_INT);
> > +		if (inj != NULL && packet->lsn == inj->iparam) {
> > +			packet->lsn = inj->iparam - 1;
> > +			say_warn("injected broken lsn: %lld", packet->lsn);
> 
> Nit: int64_t isn't guaranteed to be equivalent to long long. Please
> converse packet->lsn to long long explicitly.
> 

OK, fixed.

> > +		}
> >  		relay_send(relay, packet);
> >  	}
> >  }
> > diff --git a/src/box/wal.c b/src/box/wal.c
> > index cd068a4..ade2c71 100644
> > --- a/src/box/wal.c
> > +++ b/src/box/wal.c
> > @@ -186,6 +186,11 @@ xlog_write_entry(struct xlog *l, struct journal_entry *entry)
> >  	struct xrow_header **row = entry->rows;
> >  	for (; row < entry->rows + entry->n_rows; row++) {
> >  		(*row)->tm = ev_now(loop());
> > +		struct errinj *inj = errinj(ERRINJ_XLOG_BREAK_LSN, ERRINJ_INT);
> > +		if (inj != NULL && inj->iparam == (*row)->lsn) {
> > +			(*row)->lsn = inj->iparam - 1;
> > +			say_warn("injected broken lsn: %lld", (*row)->lsn);
> 
> Ditto.
> 

Fixed.

> > +		}
> >  		if (xlog_write_row(l, *row) < 0) {
> >  			/*
> >  			 * Rollback all un-written rows
> > diff --git a/src/errinj.h b/src/errinj.h
> > index 64d13b0..c658cd7 100644
> > --- a/src/errinj.h
> > +++ b/src/errinj.h
> > @@ -118,6 +118,8 @@ struct errinj {
> >  	_(ERRINJ_VY_LOG_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
> >  	_(ERRINJ_VY_RUN_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
> >  	_(ERRINJ_VY_INDEX_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
> > +	_(ERRINJ_RELAY_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \
> > +	_(ERRINJ_XLOG_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \
> 
> Nit: please rename ERRINJ_XLOG_BREAK_LSN to ERRINJ_WAL_BREAK_LSN -
> it's used by the WAL thread, after all.
> 

Done.

> > diff --git a/test/xlog/panic_on_broken_lsn.test.lua b/test/xlog/panic_on_broken_lsn.test.lua
> > new file mode 100644
> > index 0000000..d4abf6f
> > --- /dev/null
> > +++ b/test/xlog/panic_on_broken_lsn.test.lua
> > @@ -0,0 +1,72 @@
> > +-- Issue 3105: Test logging of request with broken lsn before panicking
> > +-- Two cases are covered: Recovery and Joining a new replica
> > +env = require('test_run')
> > +test_run = env.new()
> > +test_run:cmd('restart server default with cleanup=True')
> 
> We strive to make our tests run as fast as possible, that's why we don't
> restart the default instance before/after each test. Instead we clean up
> only what we have to. In your case, it means revoking privileges you
> granted, dropping spaces you created, and disabling error injections you
> enabled.
> 
> Since in your test you assume that a new replica is assigned id 2, you
> should call test_run:cleanup_cluster() in the beginning of this test.
> This is much cheaper than restarting the default instance.
> 
> If you don't restart the default instance, LSNs and space ids might be
> different from run to run. That's OK, you can filter them out in the
> test output, e.g. see
> 
> https://github.com/tarantool/tarantool/blob/ac17838c9634ba7ebe329290ebd3314e37ec2255/test/replication/status.test.lua#L117
> 

I changed the test accordingly. Thanks for valuable info.

> > +
> > +fio = require('fio')
> > +test_run:cmd("setopt delimiter ';'")
> > +function grep_file_tail(filepath, bytes, pattern)
> > +    local fh = fio.open(filepath, {'O_RDONLY'})
> > +    local size = fh:seek(0, 'SEEK_END')
> > +    if size < bytes then
> > +        bytes = size
> > +    end
> > +    fh:seek(-bytes, 'SEEK_END')
> > +    local line = fh:read(bytes)
> > +    fh:close()
> > +    return string.match(line, pattern)
> > +end;
> > +test_run:cmd("setopt delimiter ''");
> > +
> > +-- Testing case of panic on recovery
> > +test_run:cmd('create server panic with script="xlog/panic.lua"')
> > +test_run:cmd('start server panic')
> > +test_run:switch('panic')
> > +
> > +box.space._schema:replace{"t0", "v0"}
> > +lsn = box.info.vclock[1]
> > +box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", lsn + 1)
> > +box.space._schema:replace{"t0", "v1"}
> > +box.error.injection.set("ERRINJ_XLOG_BREAK_LSN", -1)
> > +
> > +test_run:switch('default')
> > +test_run:cmd('stop server panic')
> > +
> > +dirname = fio.pathjoin(fio.cwd(), "panic")
> > +xlogs = fio.glob(dirname .. "/*.xlog")
> > +fio.unlink(xlogs[#xlogs])
> 
> I don't understand why you need to delete the last xlog here.
> 

This last xlog is always empty, so recovery_scan() skips the vclock_follow()
call and it doesn't panic unless I delete the file. I'm not sure why it gets
created since I didn't create checkpoints in the test explicitly. Later on, the
actual row recovery happens, but recover_xlog() silently skips unordered lsn.


> > +
> > +test_run:cmd('start server panic with crash_expected=True')
> > +
> > +logpath = fio.pathjoin(fio.cwd(), 'panic.log')
> > +
> > +-- Check that log contains the mention of broken LSN and the request printout
> > +grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 1, new: 1, req: {.*}")
> > +
> > +test_run:cmd('cleanup server panic')
> > +test_run:cmd('delete server panic')
> > +
> > +-- Testing case of panic on joining a new replica
> > +box.schema.user.grant('guest', 'super')
> 
> Please instead use
> 
> box.schema.user.grant('guest', 'replication')
> 

Done.

> > +_ = box.schema.space.create('test')
> > +_ = box.space.test:create_index('pk')
> > +box.space.test:auto_increment{'v0'}
> > +lsn = box.info.vclock[1]
> > +box.error.injection.set("ERRINJ_RELAY_BREAK_LSN", lsn + 1)
> > +box.space.test:auto_increment{'v1'}
> > +
> > +test_run:cmd('create server replica with rpl_master=default, script="xlog/replica.lua"')
> > +test_run:cmd('start server replica with crash_expected=True')
> > +fiber = require('fiber')
> > +while box.info.replication == nil do fiber.sleep(0.001) end
> > +while box.info.replication[2].downstream.status ~= "stopped" do fiber.sleep(0.001) end
> > +
> > +test_run:switch('default')
> 
> Pointless 'switch' - the test is already running on behalf of the
> default instance here.
> 

Fixed.

> > +logpath = fio.pathjoin(fio.cwd(), 'replica.log')
> > +-- Check that log contains the mention of broken LSN and the request printout
> > +grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 5, new: 5, req: {.*}")
> > +
> > +test_run:cmd('cleanup server replica')
> > +test_run:cmd('delete server replica')
> > +test_run:cmd('restart server default with cleanup=True')

Please find below the fixed patch.
===================================

From 2199d4e7e44fc8f6d93aca2774890b2dfd71f716 Mon Sep 17 00:00:00 2001
From: Sergei Kalashnikov <ztarvos@gmail.com>
Date: Wed, 29 Aug 2018 17:35:42 +0300
Subject: [PATCH v2] xlog: add request details to panic message for broken LSN

Aid the debugging of replication issues related to out-of-order
requests. Adds the details of request/tuple to the diagnostic
message whenever possible.

Closes #3105
---
https://github.com/tarantool/tarantool/issues/3105
https://github.com/tarantool/tarantool/commits/ztarvos/gh-3105-log-request-broken-lsn

Changes in v2:
 - Removed vclock_follow_unsafe()
 - Renamed the new function vclock_follow_xrow(),
   removed extra params and moved it to the header.
 - Added error injections and simplified the test.
 - Added test for replica join case.
 - Removed trailing spaces.


 src/box/applier.cc                     |   6 +-
 src/box/recovery.cc                    |   4 +-
 src/box/relay.cc                       |   7 ++
 src/box/vclock.c                       |   8 +-
 src/box/vclock.h                       |   8 ++
 src/box/wal.c                          |  14 ++-
 src/box/xrow.h                         |  31 ++++++
 src/errinj.h                           |   2 +
 test/box/errinj.result                 |   4 +
 test/xlog/panic_on_broken_lsn.result   | 187 +++++++++++++++++++++++++++++++++
 test/xlog/panic_on_broken_lsn.test.lua |  77 ++++++++++++++
 test/xlog/suite.ini                    |   2 +-
 12 files changed, 331 insertions(+), 19 deletions(-)
 create mode 100644 test/xlog/panic_on_broken_lsn.result
 create mode 100644 test/xlog/panic_on_broken_lsn.test.lua

diff --git a/src/box/applier.cc b/src/box/applier.cc
index dbb4d05..5d87867 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -336,8 +336,7 @@ applier_join(struct applier *applier)
 		coio_read_xrow(coio, ibuf, &row);
 		applier->last_row_time = ev_monotonic_now(loop());
 		if (iproto_type_is_dml(row.type)) {
-			vclock_follow(&replicaset.vclock, row.replica_id,
-				      row.lsn);
+			vclock_follow_xrow(&replicaset.vclock, &row);
 			xstream_write_xc(applier->subscribe_stream, &row);
 		} else if (row.type == IPROTO_OK) {
 			/*
@@ -503,8 +502,7 @@ applier_subscribe(struct applier *applier)
 			 * the row is skipped when the replication
 			 * is resumed.
 			 */
-			vclock_follow(&replicaset.vclock, row.replica_id,
-				      row.lsn);
+			vclock_follow_xrow(&replicaset.vclock, &row);
 			struct replica *replica = replica_by_id(row.replica_id);
 			struct latch *latch = (replica ? &replica->order_latch :
 					       &replicaset.applier.order_latch);
diff --git a/src/box/recovery.cc b/src/box/recovery.cc
index fe14def..77357f0 100644
--- a/src/box/recovery.cc
+++ b/src/box/recovery.cc
@@ -135,7 +135,7 @@ recovery_scan(struct recovery *r, struct vclock *end_vclock)
 		return;
 	struct xrow_header row;
 	while (xlog_cursor_next(&cursor, &row, true) == 0)
-		vclock_follow(end_vclock, row.replica_id, row.lsn);
+		vclock_follow_xrow(end_vclock, &row);
 	xlog_cursor_close(&cursor, false);
 }
 
@@ -266,7 +266,7 @@ recover_xlog(struct recovery *r, struct xstream *stream,
 		 * in case of forced recovery, when we skip the
 		 * failed row anyway.
 		 */
-		vclock_follow(&r->vclock,  row.replica_id, row.lsn);
+		vclock_follow_xrow(&r->vclock, &row);
 		if (xstream_write(stream, &row) == 0) {
 			++row_count;
 			if (row_count % 100000 == 0)
diff --git a/src/box/relay.cc b/src/box/relay.cc
index 138e66d..b0bc03e 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -676,6 +676,13 @@ relay_send_row(struct xstream *stream, struct xrow_header *packet)
 	if (packet->replica_id != relay->replica->id ||
 	    packet->lsn <= vclock_get(&relay->local_vclock_at_subscribe,
 				      packet->replica_id)) {
+		struct errinj *inj = errinj(ERRINJ_RELAY_BREAK_LSN,
+					    ERRINJ_INT);
+		if (inj != NULL && packet->lsn == inj->iparam) {
+			packet->lsn = inj->iparam - 1;
+			say_warn("injected broken lsn: %lld",
+				 (long long) packet->lsn);
+		}
 		relay_send(relay, packet);
 	}
 }
diff --git a/src/box/vclock.c b/src/box/vclock.c
index 2189ea8..b5eb280 100644
--- a/src/box/vclock.c
+++ b/src/box/vclock.c
@@ -42,13 +42,7 @@ vclock_follow(struct vclock *vclock, uint32_t replica_id, int64_t lsn)
 	assert(lsn >= 0);
 	assert(replica_id < VCLOCK_MAX);
 	int64_t prev_lsn = vclock->lsn[replica_id];
-	if (lsn <= prev_lsn) {
-		/* Never confirm LSN out of order. */
-		panic("LSN for %u is used twice or COMMIT order is broken: "
-		      "confirmed: %lld, new: %lld",
-		      (unsigned) replica_id,
-		      (long long) prev_lsn, (long long) lsn);
-	}
+	assert(lsn > prev_lsn);
 	/* Easier add each time than check. */
 	vclock->map |= 1 << replica_id;
 	vclock->lsn[replica_id] = lsn;
diff --git a/src/box/vclock.h b/src/box/vclock.h
index 928717c..111e291 100644
--- a/src/box/vclock.h
+++ b/src/box/vclock.h
@@ -199,6 +199,14 @@ vclock_sum(const struct vclock *vclock)
 	return vclock->signature;
 }
 
+/**
+ * Update vclock with the next LSN value for given replica id.
+ *
+ * @param vclock Vector clock.
+ * @param replica_id Replica identifier.
+ * @param lsn Next lsn.
+ * @return previous lsn value.
+ */
 int64_t
 vclock_follow(struct vclock *vclock, uint32_t replica_id, int64_t lsn);
 
diff --git a/src/box/wal.c b/src/box/wal.c
index cd068a4..a11e0ad 100644
--- a/src/box/wal.c
+++ b/src/box/wal.c
@@ -186,6 +186,12 @@ xlog_write_entry(struct xlog *l, struct journal_entry *entry)
 	struct xrow_header **row = entry->rows;
 	for (; row < entry->rows + entry->n_rows; row++) {
 		(*row)->tm = ev_now(loop());
+		struct errinj *inj = errinj(ERRINJ_WAL_BREAK_LSN, ERRINJ_INT);
+		if (inj != NULL && inj->iparam == (*row)->lsn) {
+			(*row)->lsn = inj->iparam - 1;
+			say_warn("injected broken lsn: %lld",
+				 (long long) (*row)->lsn);
+		}
 		if (xlog_write_row(l, *row) < 0) {
 			/*
 			 * Rollback all un-written rows
@@ -652,8 +658,7 @@ wal_assign_lsn(struct wal_writer *writer, struct xrow_header **row,
 			(*row)->lsn = vclock_inc(&writer->vclock, instance_id);
 			(*row)->replica_id = instance_id;
 		} else {
-			vclock_follow(&writer->vclock, (*row)->replica_id,
-				      (*row)->lsn);
+			vclock_follow_xrow(&writer->vclock, *row);
 		}
 	}
 }
@@ -878,9 +883,8 @@ wal_write(struct journal *journal, struct journal_entry *entry)
 				 */
 				if (vclock_get(&replicaset.vclock,
 					       instance_id) < (*last)->lsn) {
-					vclock_follow(&replicaset.vclock,
-						      instance_id,
-						      (*last)->lsn);
+					vclock_follow_xrow(&replicaset.vclock,
+							   *last);
 				}
 				break;
 			}
diff --git a/src/box/xrow.h b/src/box/xrow.h
index 4721670..3fc007a 100644
--- a/src/box/xrow.h
+++ b/src/box/xrow.h
@@ -522,6 +522,37 @@ xrow_to_iovec(const struct xrow_header *row, struct iovec *out);
 void
 xrow_decode_error(struct xrow_header *row);
 
+/**
+ * Update vclock with the next LSN value for given replica id.
+ * The function will cause panic if the next LSN happens to be
+ * out of order. The details of provided row are included into
+ * diagnostic message.
+ *
+ * @param vclock Vector clock.
+ * @param row Data row.
+ * @return Previous LSN value.
+ */
+static inline int64_t
+vclock_follow_xrow(struct vclock* vclock, const struct xrow_header *row)
+{
+	assert(row);
+	assert(row->replica_id < VCLOCK_MAX);
+	if (row->lsn <= vclock->lsn[row->replica_id]) {
+		struct request req;
+		const char *req_str = "n/a";
+		if (xrow_decode_dml((struct xrow_header *)row, &req, 0) == 0)
+			req_str = request_str(&req);
+		/* Never confirm LSN out of order. */
+		panic("LSN for %u is used twice or COMMIT order is broken: "
+		      "confirmed: %lld, new: %lld, req: %s",
+		      (unsigned) row->replica_id,
+		      (long long) vclock->lsn[row->replica_id],
+		      (long long) row->lsn,
+		      req_str);
+	}
+	return vclock_follow(vclock, row->replica_id, row->lsn);
+}
+
 #if defined(__cplusplus)
 } /* extern "C" */
 
diff --git a/src/errinj.h b/src/errinj.h
index 64d13b0..b6d4a4c 100644
--- a/src/errinj.h
+++ b/src/errinj.h
@@ -118,6 +118,8 @@ struct errinj {
 	_(ERRINJ_VY_LOG_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
 	_(ERRINJ_VY_RUN_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
 	_(ERRINJ_VY_INDEX_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
+	_(ERRINJ_RELAY_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \
+	_(ERRINJ_WAL_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \
 
 ENUM0(errinj_id, ERRINJ_LIST);
 extern struct errinj errinjs[];
diff --git a/test/box/errinj.result b/test/box/errinj.result
index c7e4ce2..8dae761 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -58,8 +58,12 @@ errinj.info()
     state: 0
   ERRINJ_XLOG_META:
     state: false
+  ERRINJ_WAL_BREAK_LSN:
+    state: -1
   ERRINJ_WAL_WRITE_DISK:
     state: false
+  ERRINJ_RELAY_BREAK_LSN:
+    state: -1
   ERRINJ_VY_LOG_FILE_RENAME:
     state: false
   ERRINJ_VY_RUN_WRITE:
diff --git a/test/xlog/panic_on_broken_lsn.result b/test/xlog/panic_on_broken_lsn.result
new file mode 100644
index 0000000..ad10217
--- /dev/null
+++ b/test/xlog/panic_on_broken_lsn.result
@@ -0,0 +1,187 @@
+-- Issue 3105: Test logging of request with broken lsn before panicking
+-- Two cases are covered: Recovery and Joining a new replica
+env = require('test_run')
+---
+...
+test_run = env.new()
+---
+...
+test_run:cleanup_cluster()
+---
+...
+fio = require('fio')
+---
+...
+test_run:cmd("setopt delimiter ';'")
+---
+- true
+...
+function grep_file_tail(filepath, bytes, pattern)
+    local fh = fio.open(filepath, {'O_RDONLY'})
+    local size = fh:seek(0, 'SEEK_END')
+    if size < bytes then
+        bytes = size
+    end
+    fh:seek(-bytes, 'SEEK_END')
+    local line = fh:read(bytes)
+    fh:close()
+    return string.match(line, pattern)
+end;
+---
+...
+test_run:cmd("setopt delimiter ''");
+---
+- true
+...
+-- Testing case of panic on recovery
+test_run:cmd('create server panic with script="xlog/panic.lua"')
+---
+- true
+...
+test_run:cmd('start server panic')
+---
+- true
+...
+test_run:switch('panic')
+---
+- true
+...
+box.space._schema:replace{"t0", "v0"}
+---
+- ['t0', 'v0']
+...
+lsn = box.info.vclock[1]
+---
+...
+box.error.injection.set("ERRINJ_WAL_BREAK_LSN", lsn + 1)
+---
+- ok
+...
+box.space._schema:replace{"t0", "v1"}
+---
+- ['t0', 'v1']
+...
+box.error.injection.set("ERRINJ_WAL_BREAK_LSN", -1)
+---
+- ok
+...
+test_run:switch('default')
+---
+- true
+...
+test_run:cmd('stop server panic')
+---
+- true
+...
+dirname = fio.pathjoin(fio.cwd(), "panic")
+---
+...
+xlogs = fio.glob(dirname .. "/*.xlog")
+---
+...
+fio.unlink(xlogs[#xlogs])
+---
+- true
+...
+test_run:cmd('start server panic with crash_expected=True')
+---
+- false
+...
+logpath = fio.pathjoin(fio.cwd(), 'panic.log')
+---
+...
+-- Check that log contains the mention of broken LSN and the request printout
+grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 1, new: 1, req: {.*}")
+---
+- 'LSN for 1 is used twice or COMMIT order is broken: confirmed: 1, new: 1, req: {type:
+  ''REPLACE'', lsn: 1, space_id: 272, index_id: 0, tuple: ["t0", "v1"]}'
+...
+test_run:cmd('cleanup server panic')
+---
+- true
+...
+test_run:cmd('delete server panic')
+---
+- true
+...
+-- Testing case of panic on joining a new replica
+box.schema.user.grant('guest', 'replication')
+---
+...
+_ = box.schema.space.create('test')
+---
+...
+_ = box.space.test:create_index('pk')
+---
+...
+box.space.test:auto_increment{'v0'}
+---
+- [1, 'v0']
+...
+lsn = box.info.vclock[1]
+---
+...
+box.error.injection.set("ERRINJ_RELAY_BREAK_LSN", lsn + 1)
+---
+- ok
+...
+box.space.test:auto_increment{'v1'}
+---
+- [2, 'v1']
+...
+test_run:cmd('create server replica with rpl_master=default, script="xlog/replica.lua"')
+---
+- true
+...
+test_run:cmd('start server replica with crash_expected=True')
+---
+- false
+...
+fiber = require('fiber')
+---
+...
+while box.info.replication == nil do fiber.sleep(0.001) end
+---
+...
+while box.info.replication[2].downstream.status ~= "stopped" do fiber.sleep(0.001) end
+---
+...
+box.error.injection.set("ERRINJ_RELAY_BREAK_LSN", -1)
+---
+- ok
+...
+logpath = fio.pathjoin(fio.cwd(), 'replica.log')
+---
+...
+-- Check that log contains the mention of broken LSN and the request printout
+test_run:cmd("push filter 'lsn: "..lsn..", space_id: [0-9]+' to 'lsn: <lsn>, space_id: <space_id>'")
+---
+- true
+...
+test_run:cmd("push filter 'confirmed: "..lsn..", new: "..lsn.."' to '<lsn>'")
+---
+- true
+...
+grep_file_tail(logpath, 256, "(LSN for 1 is used twice or COMMIT order is broken: confirmed: "..lsn..", new: "..lsn.."), req: ({.*})")
+---
+- 'LSN for 1 is used twice or COMMIT order is broken: <lsn>'
+- '{type: ''INSERT'', lsn: <lsn>, space_id: <space_id>, index_id: 0, tuple: [2, "v1"]}'
+...
+test_run:cmd("clear filter")
+---
+- true
+...
+test_run:cmd('cleanup server replica')
+---
+- true
+...
+test_run:cmd('delete server replica')
+---
+- true
+...
+box.space.test:drop()
+---
+...
+box.schema.user.revoke('guest', 'replication')
+---
+...
diff --git a/test/xlog/panic_on_broken_lsn.test.lua b/test/xlog/panic_on_broken_lsn.test.lua
new file mode 100644
index 0000000..dd450d1
--- /dev/null
+++ b/test/xlog/panic_on_broken_lsn.test.lua
@@ -0,0 +1,77 @@
+-- Issue 3105: Test logging of request with broken lsn before panicking
+-- Two cases are covered: Recovery and Joining a new replica
+env = require('test_run')
+test_run = env.new()
+test_run:cleanup_cluster()
+
+fio = require('fio')
+test_run:cmd("setopt delimiter ';'")
+function grep_file_tail(filepath, bytes, pattern)
+    local fh = fio.open(filepath, {'O_RDONLY'})
+    local size = fh:seek(0, 'SEEK_END')
+    if size < bytes then
+        bytes = size
+    end
+    fh:seek(-bytes, 'SEEK_END')
+    local line = fh:read(bytes)
+    fh:close()
+    return string.match(line, pattern)
+end;
+test_run:cmd("setopt delimiter ''");
+
+-- Testing case of panic on recovery
+test_run:cmd('create server panic with script="xlog/panic.lua"')
+test_run:cmd('start server panic')
+test_run:switch('panic')
+
+box.space._schema:replace{"t0", "v0"}
+lsn = box.info.vclock[1]
+box.error.injection.set("ERRINJ_WAL_BREAK_LSN", lsn + 1)
+box.space._schema:replace{"t0", "v1"}
+box.error.injection.set("ERRINJ_WAL_BREAK_LSN", -1)
+
+test_run:switch('default')
+test_run:cmd('stop server panic')
+
+dirname = fio.pathjoin(fio.cwd(), "panic")
+xlogs = fio.glob(dirname .. "/*.xlog")
+fio.unlink(xlogs[#xlogs])
+
+test_run:cmd('start server panic with crash_expected=True')
+
+logpath = fio.pathjoin(fio.cwd(), 'panic.log')
+
+-- Check that log contains the mention of broken LSN and the request printout
+grep_file_tail(logpath, 256, "LSN for 1 is used twice or COMMIT order is broken: confirmed: 1, new: 1, req: {.*}")
+
+test_run:cmd('cleanup server panic')
+test_run:cmd('delete server panic')
+
+-- Testing case of panic on joining a new replica
+box.schema.user.grant('guest', 'replication')
+_ = box.schema.space.create('test')
+_ = box.space.test:create_index('pk')
+box.space.test:auto_increment{'v0'}
+lsn = box.info.vclock[1]
+box.error.injection.set("ERRINJ_RELAY_BREAK_LSN", lsn + 1)
+box.space.test:auto_increment{'v1'}
+
+test_run:cmd('create server replica with rpl_master=default, script="xlog/replica.lua"')
+test_run:cmd('start server replica with crash_expected=True')
+fiber = require('fiber')
+while box.info.replication == nil do fiber.sleep(0.001) end
+while box.info.replication[2].downstream.status ~= "stopped" do fiber.sleep(0.001) end
+box.error.injection.set("ERRINJ_RELAY_BREAK_LSN", -1)
+
+logpath = fio.pathjoin(fio.cwd(), 'replica.log')
+-- Check that log contains the mention of broken LSN and the request printout
+test_run:cmd("push filter 'lsn: "..lsn..", space_id: [0-9]+' to 'lsn: <lsn>, space_id: <space_id>'")
+test_run:cmd("push filter 'confirmed: "..lsn..", new: "..lsn.."' to '<lsn>'")
+grep_file_tail(logpath, 256, "(LSN for 1 is used twice or COMMIT order is broken: confirmed: "..lsn..", new: "..lsn.."), req: ({.*})")
+test_run:cmd("clear filter")
+
+test_run:cmd('cleanup server replica')
+test_run:cmd('delete server replica')
+
+box.space.test:drop()
+box.schema.user.revoke('guest', 'replication')
diff --git a/test/xlog/suite.ini b/test/xlog/suite.ini
index 8ee9720..ffc812e 100644
--- a/test/xlog/suite.ini
+++ b/test/xlog/suite.ini
@@ -4,7 +4,7 @@ description = tarantool write ahead log tests
 script = xlog.lua
 disabled = snap_io_rate.test.lua
 valgrind_disabled =
-release_disabled = errinj.test.lua panic_on_lsn_gap.test.lua
+release_disabled = errinj.test.lua panic_on_lsn_gap.test.lua panic_on_broken_lsn.test.lua
 config = suite.cfg
 use_unix_sockets = True
 long_run = snap_io_rate.test.lua
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [tarantool-patches] [PATCH v2] xlog: add request details to panic message for broken LSN
  2018-08-29 15:21       ` Sergei Kalashnikov
@ 2018-08-29 16:31         ` Vladimir Davydov
  0 siblings, 0 replies; 6+ messages in thread
From: Vladimir Davydov @ 2018-08-29 16:31 UTC (permalink / raw)
  To: Sergei Kalashnikov; +Cc: tarantool-patches

On Wed, Aug 29, 2018 at 06:21:15PM +0300, Sergei Kalashnikov wrote:
> > > +dirname = fio.pathjoin(fio.cwd(), "panic")
> > > +xlogs = fio.glob(dirname .. "/*.xlog")
> > > +fio.unlink(xlogs[#xlogs])
> > 
> > I don't understand why you need to delete the last xlog here.
> > 
> 
> This last xlog is always empty, so recovery_scan() skips the vclock_follow()
> call and it doesn't panic unless I delete the file. I'm not sure why it gets
> created since I didn't create checkpoints in the test explicitly. Later on, the
> actual row recovery happens, but recover_xlog() silently skips unordered lsn.

OK, I see. We create an empty xlog on shutdown so that we can find out
the instance vclock on recovery without rescanning the last xlog. This
is needed for replica rebootstrap functionality.

Since recover_xlog() doesn't check LSN order, I guess we could drop
this test case altogether then - anyway, the primary application of
this patch is replication, not local recovery. Well, let's leave it
since it's already implemented and works well.

Thanks for addressing all my comments. I pushed the patch to 1.10.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2018-08-29 16:31 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-08-27 11:22 [tarantool-patches] [PATCH v2] xlog: add request details to panic message for broken LSN Sergei Kalashnikov
2018-08-27 17:04 ` Vladimir Davydov
2018-08-28 16:27   ` Sergei Kalashnikov
2018-08-29  9:52     ` Vladimir Davydov
2018-08-29 15:21       ` Sergei Kalashnikov
2018-08-29 16:31         ` Vladimir Davydov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox