From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <vdavydov.dev@gmail.com>
Date: Wed, 21 Aug 2019 13:44:39 +0300
From: Vladimir Davydov <vdavydov.dev@gmail.com>
Subject: Re: [tarantool-patches] [PATCH 2/7] Refactoring: Track wal files
 using gc state.
Message-ID: <20190821104439.GZ13834@esperanza>
References: <cover.1565676868.git.georgy@tarantool.org>
 <f98bf7a3c33fb445748c6aa4839e8d0794e56853.1565676868.git.georgy@tarantool.org>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <f98bf7a3c33fb445748c6aa4839e8d0794e56853.1565676868.git.georgy@tarantool.org>
To: Georgy Kirichenko <georgy@tarantool.org>
Cc: tarantool-patches@freelists.org
List-ID: <tarantool-patches.dev.tarantool.org>

On Tue, Aug 13, 2019 at 09:27:40AM +0300, Georgy Kirichenko wrote:
> Move wal files tracking from wal into gc struct because current garbage
> collection was complicated and illogical:
> 1. wal tracks xlogs ->
>    relay catches on_close_log and sends events to gc ->
>    gc tracks the least relay signature and sends vclock to wal
>    wal collect logs
>    gc collects checkpoints independently
> 2. in case of no space errors wal collects logs (making some checkpoints
>      pointless) ->
>    wal notifies gc
>    gc deactivates outdated relays (this is a BUG because relay may catch
>    the currenct state after deactivating)
> This flow does not allow in memory replication because relay would not
> have xlog boundaries yet.
> The flow after the patch is more straightforward:
>    wal informs gc about log open/close events
>    gc tracks both checkpoints and logs
>    relay catches on_close_log and send events to gc (this will be
>    changed after in memroy replication patch)
>    wal requests gc to free space in case of no space error
>    gc could consistently free checkpoints and logs.
> 
> As relay notifies tx about ACK state changes gc would be able to track
> the oldest used used wal and perform garbage collection as well as relay
> deactivating.

Overall, I feel like it must be a responsibility of the WAL thread to
delete old files, because the TX thread deals with a stream of rows, it
doesn't care how this stream is split into files or whether it's split
at all. For example, we could use a ring buffer to store WAL records
rather than a bunch of files. Moving garbage collection to TX will make
any such transformation difficult to achieve. I'm inclined to think that
instead of moving WAL file tracking and removal from the WAL thread to
the TX thread, we should move garbage collection infrastructure to the
WAL thread. Judging by the following patches, there's a link between
a relay and a WAL anyway so perhaps we could use it for updating the
garbage collector state right in the WAL? Just a thought, not insisting.

> diff --git a/src/box/memtx_engine.c b/src/box/memtx_engine.c
> index 59ad16823..9b3f2233e 100644
> --- a/src/box/memtx_engine.c
> +++ b/src/box/memtx_engine.c
> @@ -568,7 +568,7 @@ checkpoint_f(va_list ap)
>  	}
>  
>  	struct xlog snap;
> -	if (xdir_create_xlog(&ckpt->dir, &snap, &ckpt->vclock) != 0)
> +	if (xdir_create_xlog(&ckpt->dir, &snap, &ckpt->vclock, NULL) != 0)
>  		return -1;
>  
>  	say_info("saving snapshot `%s'", snap.filename);
> diff --git a/src/box/recovery.cc b/src/box/recovery.cc
> index d122d618a..45c4f6820 100644
> --- a/src/box/recovery.cc
> +++ b/src/box/recovery.cc
> @@ -118,22 +118,17 @@ recovery_new(const char *wal_dirname, bool force_recovery,
>  }
>  
>  void
> -recovery_scan(struct recovery *r, struct vclock *end_vclock,
> -	      struct vclock *gc_vclock)
> +recovery_scan(struct recovery *r, struct vclock *end_vclock)
>  {
>  	xdir_scan_xc(&r->wal_dir);
>  
>  	if (xdir_last_vclock(&r->wal_dir, end_vclock) < 0 ||
>  	    vclock_compare(end_vclock, &r->vclock) < 0) {
>  		/* No xlogs after last checkpoint. */
> -		vclock_copy(gc_vclock, &r->vclock);
>  		vclock_copy(end_vclock, &r->vclock);
>  		return;
>  	}
>  
> -	if (xdir_first_vclock(&r->wal_dir, gc_vclock) < 0)
> -		unreachable();
> -
>  	/* Scan the last xlog to find end vclock. */
>  	struct xlog_cursor cursor;
>  	if (xdir_open_cursor(&r->wal_dir, vclock_sum(end_vclock), &cursor) != 0)
> diff --git a/src/box/recovery.h b/src/box/recovery.h
> index 6e68abc0b..2a03cfc2f 100644
> --- a/src/box/recovery.h
> +++ b/src/box/recovery.h
> @@ -75,8 +75,7 @@ recovery_delete(struct recovery *r);
>   * WAL directory.
>   */
>  void
> -recovery_scan(struct recovery *r,  struct vclock *end_vclock,
> -	      struct vclock *gc_vclock);
> +recovery_scan(struct recovery *r, struct vclock *end_vclock);
>  
>  void
>  recovery_follow_local(struct recovery *r, struct xstream *stream,
> diff --git a/src/box/vy_log.c b/src/box/vy_log.c
> index cb291f3c8..d97eff9b8 100644
> --- a/src/box/vy_log.c
> +++ b/src/box/vy_log.c
> @@ -949,7 +949,7 @@ vy_log_open(struct xlog *xlog)
>  	}
>  
>  	if (xdir_create_xlog(&vy_log.dir, xlog,
> -			     &vy_log.last_checkpoint) < 0)
> +			     &vy_log.last_checkpoint, NULL) < 0)
>  		goto fail;
>  
>  	struct xrow_header row;
> @@ -2585,7 +2585,7 @@ vy_log_create(const struct vclock *vclock, struct vy_recovery *recovery)
>  	rlist_foreach_entry(lsm, &recovery->lsms, in_recovery) {
>  		/* Create the log file on the first write. */
>  		if (!xlog_is_open(&xlog) &&
> -		    xdir_create_xlog(&vy_log.dir, &xlog, vclock) != 0)
> +		    xdir_create_xlog(&vy_log.dir, &xlog, vclock, NULL) != 0)
>  			goto err_create_xlog;
>  
>  		if (vy_log_append_lsm(&xlog, lsm) != 0)
> diff --git a/src/box/wal.c b/src/box/wal.c
> index 5d8dcc4f7..a09ab7187 100644
> --- a/src/box/wal.c
> +++ b/src/box/wal.c
> @@ -801,20 +759,69 @@ wal_opt_rotate(struct wal_writer *writer)
>  		return 0;
>  
>  	if (xdir_create_xlog(&writer->wal_dir, &writer->current_wal,
> -			     &writer->vclock) != 0) {
> +			     &writer->vclock, &writer->prev_vclock) != 0) {
>  		diag_log();
>  		return -1;
>  	}
> -	/*
> -	 * Keep track of the new WAL vclock. Required for garbage
> -	 * collection, see wal_collect_garbage().
> -	 */
> -	xdir_add_vclock(&writer->wal_dir, &writer->vclock);
> +	vclock_copy(&writer->prev_vclock, &writer->vclock);
>  
> +	wal_notify_log_action(writer, WAL_LOG_OPEN);
>  	wal_notify_watchers(writer, WAL_EVENT_ROTATE);
>  	return 0;
>  }
>  
> +struct gc_force_wal_msg {
> +	struct cmsg base;
> +	struct fiber_cond done_cond;
> +	bool done;
> +	int rc;
> +};
> +
> +static void
> +wal_gc_wal_force_done(struct cmsg *base)
> +{
> +	struct gc_force_wal_msg *msg = container_of(base,
> +						   struct gc_force_wal_msg,
> +						   base);
> +	msg->done = true;
> +	fiber_cond_signal(&msg->done_cond);
> +}
> +
> +static int
> +tx_gc_force_wal_f(va_list ap)
> +{
> +	struct wal_writer *writer = &wal_writer_singleton;
> +	struct gc_force_wal_msg *msg = va_arg(ap, struct gc_force_wal_msg *);
> +	static struct cmsg_hop respond_route[] = {
> +		{wal_gc_wal_force_done, NULL}
> +	};
> +	msg->rc = gc_force_wal_cleanup();

The idea was to avoid direct calls from WAL to gc, because those are
different subsystems and WAL is a more low-level one. Using callbacks
allowed to avoid a dependency loop. Now there are still callbacks, but
certain methods are called directly. We should either remove callbacks
altogether turning this code into a monolith or add another callback,
I guess.

> +	cmsg_init(&msg->base, respond_route);
> +	cpipe_push(&writer->wal_pipe, &msg->base);
> +	return 0;
> +}
> +
> +void
> +tx_gc_wal_force(struct cmsg *base)
> +{
> +	struct wal_writer *writer = &wal_writer_singleton;
> +	struct gc_force_wal_msg *msg = container_of(base,
> +						   struct gc_force_wal_msg,
> +						   base);
> +	struct fiber *gc_fiber = fiber_new("wal_gc_fiber", tx_gc_force_wal_f);
> +	if (gc_fiber == NULL) {
> +		struct cmsg_hop respond_route[] = {
> +			{wal_gc_wal_force_done, NULL}
> +		};
> +		msg->rc = -1;
> +		cmsg_init(&msg->base, respond_route);
> +		cpipe_push(&writer->wal_pipe, &msg->base);
> +		return;
> +	}
> +	fiber_start(gc_fiber, msg);
> +	return;
> +}
> +
>  /**
>   * Make sure there's enough disk space to append @len bytes
>   * of data to the current WAL.
> @@ -825,17 +832,10 @@ wal_opt_rotate(struct wal_writer *writer)
>  static int
>  wal_fallocate(struct wal_writer *writer, size_t len)
>  {
> -	bool warn_no_space = true, notify_gc = false;
>  	struct xlog *l = &writer->current_wal;
>  	struct errinj *errinj = errinj(ERRINJ_WAL_FALLOCATE, ERRINJ_INT);
>  	int rc = 0;
>  
> -	/*
> -	 * Max LSN that can be collected in case of ENOSPC -
> -	 * we must not delete WALs necessary for recovery.
> -	 */
> -	int64_t gc_lsn = vclock_sum(&writer->checkpoint_vclock);
> -
>  	/*
>  	 * The actual write size can be greater than the sum size
>  	 * of encoded rows (compression, fixheaders). Double the
> @@ -856,45 +856,23 @@ retry:
>  	}
>  	if (errno != ENOSPC)
>  		goto error;
> -	if (!xdir_has_garbage(&writer->wal_dir, gc_lsn))
> -		goto error;
> -
> -	if (warn_no_space) {
> -		say_crit("ran out of disk space, try to delete old WAL files");
> -		warn_no_space = false;
> -	}

I think we shouldn't remove this warning.

> -
> -	xdir_collect_garbage(&writer->wal_dir, gc_lsn, XDIR_GC_REMOVE_ONE);
> -	notify_gc = true;
> -	goto retry;
> +	static struct cmsg_hop gc_wal_force_route[] = {
> +		{tx_gc_wal_force, NULL}
> +	};
> +	struct gc_force_wal_msg msg;
> +	msg.done = false;
> +	fiber_cond_create(&msg.done_cond);
> +	cmsg_init(&msg.base, gc_wal_force_route);
> +	cpipe_push(&writer->tx_prio_pipe, &msg.base);
> +
> +	while (!msg.done)
> +		fiber_cond_wait(&msg.done_cond);
> +	if (msg.rc == 0)
> +		goto retry;
>  error:
>  	diag_log();
>  	rc = -1;