[tarantool-patches] [PATCH 1/3] Do not promote wal vclock for failed writes

Wed Feb 6 16:56:56 MSK 2019

On Wed, Feb 06, 2019 at 11:29:57AM +0300, Georgy Kirichenko wrote:
> Increase replica lsn only if row was successfully written to disk. This
> prevents wal from lsn gaps in case of IO errors.

Why are gaps that bad? Please say a few words of explanation in the
commit message.

> 
> Obsoletes xlog/panic_on_lsn_gap.test.
> 
> Needs for #2283

Needed for

> ---
>  src/box/wal.c                       |  19 +-
>  test/box/errinj.result              |  23 ++
>  test/box/errinj.test.lua            |   8 +
>  test/xlog/errinj.result             |   1 -
>  test/xlog/panic_on_lsn_gap.result   | 377 ----------------------------
>  test/xlog/panic_on_lsn_gap.test.lua | 136 ----------
>  6 files changed, 44 insertions(+), 520 deletions(-)
>  delete mode 100644 test/xlog/panic_on_lsn_gap.result
>  delete mode 100644 test/xlog/panic_on_lsn_gap.test.lua
> 
> diff --git a/src/box/wal.c b/src/box/wal.c
> index 3b50d3629..a55b544aa 100644
> --- a/src/box/wal.c
> +++ b/src/box/wal.c
> @@ -901,16 +901,16 @@ wal_writer_begin_rollback(struct wal_writer *writer)
>  }
>  
>  static void
> -wal_assign_lsn(struct wal_writer *writer, struct xrow_header **row,
> +wal_assign_lsn(struct vclock *vclock, struct xrow_header **row,
>  	       struct xrow_header **end)
>  {
>  	/** Assign LSN to all local rows. */
>  	for ( ; row < end; row++) {
>  		if ((*row)->replica_id == 0) {
> -			(*row)->lsn = vclock_inc(&writer->vclock, instance_id);
> +			(*row)->lsn = vclock_inc(vclock, instance_id);
>  			(*row)->replica_id = instance_id;
>  		} else {
> -			vclock_follow_xrow(&writer->vclock, *row);
> +			vclock_follow_xrow(vclock, *row);
>  		}
>  	}
>  }
> @@ -922,6 +922,11 @@ wal_write_to_disk(struct cmsg *msg)
>  	struct wal_msg *wal_msg = (struct wal_msg *) msg;
>  	struct error *error;
>  
> +	/* Local vclock copy. */

Bad comment. Can you elaborate why you need it, please? E.g.

	We don't want to promote the WAL vclock if write fails,
	because that would result in LSN gaps, which are difficult
	to handle in relay. So we promote a local copy instead
	and sync it only if entries have been successfully flushed
	to disk.

Or something like that.

> +	struct vclock vclock;
> +	vclock_create(&vclock);
> +	vclock_copy(&vclock, &writer->vclock);
> +
>  	struct errinj *inj = errinj(ERRINJ_WAL_DELAY, ERRINJ_BOOL);
>  	while (inj != NULL && inj->bparam)
>  		usleep(10);
> @@ -974,14 +979,15 @@ wal_write_to_disk(struct cmsg *msg)
>  	struct journal_entry *entry;
>  	struct stailq_entry *last_committed = NULL;
>  	stailq_foreach_entry(entry, &wal_msg->commit, fifo) {
> -		wal_assign_lsn(writer, entry->rows, entry->rows + entry->n_rows);
> -		entry->res = vclock_sum(&writer->vclock);
> +		wal_assign_lsn(&vclock, entry->rows, entry->rows + entry->n_rows);
> +		entry->res = vclock_sum(&vclock);
>  		rc = xlog_write_entry(l, entry);
>  		if (rc < 0)
>  			goto done;
>  		if (rc > 0) {
>  			writer->checkpoint_wal_size += rc;
>  			last_committed = &entry->fifo;
> +			vclock_copy(&writer->vclock, &vclock);
>  		}
>  		/* rc == 0: the write is buffered in xlog_tx */
>  	}
> @@ -991,6 +997,7 @@ wal_write_to_disk(struct cmsg *msg)
>  
>  	writer->checkpoint_wal_size += rc;
>  	last_committed = stailq_last(&wal_msg->commit);
> +	vclock_copy(&writer->vclock, &vclock);
>  
>  	/*
>  	 * Notify TX if the checkpoint threshold has been exceeded.
> @@ -1185,7 +1192,7 @@ wal_write_in_wal_mode_none(struct journal *journal,
>  			   struct journal_entry *entry)
>  {
>  	struct wal_writer *writer = (struct wal_writer *) journal;
> -	wal_assign_lsn(writer, entry->rows, entry->rows + entry->n_rows);
> +	wal_assign_lsn(&writer->vclock, entry->rows, entry->rows + entry->n_rows);
>  	int64_t old_lsn = vclock_get(&replicaset.vclock, instance_id);
>  	int64_t new_lsn = vclock_get(&writer->vclock, instance_id);
>  	if (new_lsn > old_lsn) {
> diff --git a/test/box/errinj.result b/test/box/errinj.result
> index 12303670e..1d9a16d8d 100644
> --- a/test/box/errinj.result
> +++ b/test/box/errinj.result
> @@ -141,6 +141,15 @@ errinj.set("ERRINJ_TESTING", false)
>  ---
>  - ok
>  ...
> +env = require('test_run')
> +---
> +...
> +test_run = env.new()
> +---
> +...

Why? You don't seem to need it.

> +lsn1 = box.info.vclock[box.info.id]
> +---
> +...
>  -- Check how well we handle a failed log write
>  errinj.set("ERRINJ_WAL_IO", true)
>  ---
> @@ -161,6 +170,11 @@ space:insert{1}
>  ---
>  - [1]
>  ...
> +-- Check vclock was promoted only one time
> +box.info.vclock[box.info.id] == lsn1 + 1
> +---
> +- true
> +...
>  errinj.set("ERRINJ_WAL_IO", true)
>  ---
>  - ok
> @@ -180,6 +194,15 @@ errinj.set("ERRINJ_WAL_IO", false)
>  ---
>  - ok
>  ...
> +space:update(1, {{'=', 2, 2}})
> +---
> +- [1, 2]
> +...
> +-- Check vclock was promoted only two times
> +box.info.vclock[box.info.id] == lsn1 + 2
> +---
> +- true
> +...

This test will pass with and without your patch, because we already
don't promote box.info.lsn on failed WAL write. I think you need to
restart the server to check this properly.

Other than that, the patch is fine by me.