[PATCH v2] replication: remove old snapshot files not needed by replicas

Vladimir Davydov vdavydov.dev at gmail.com
Thu Jun 28 16:00:26 MSK 2018


On Thu, Jun 28, 2018 at 03:09:08PM +0300, Serge Petrenko wrote:
> Garbage collection doesn't distinguish consumers which need checkpoint
> files, such as backup, and the ones, who only need WALS, such as
> replicas. A disconnected replica will 'hold' all checkpoint files, created
> after it got unsynchronised, even though it doesn't need them, which may
> lead to disk space shortage. To fix this, we store consumer's type, and
> treat consumers differently during garbage collection: now only the old
> WALS are stored for replicas, and old checkpoints are stored for backup,
> if any. Also changed the tests to check updated garbage collection correctly.
> 
> Closes #3444
> ---
> https://github.com/tarantool/tarantool/tree/sergepetrenko/gh-3444-remove-old-shapshots-for-replicas
> https://github.com/tarantool/tarantool/issues/3444

After fixing your code according to review comments, you should amend
your old commit, not make a new one (take a look at `git commit --amend`
and `git revert -i`), then send the amended patch to the mailing list.
Sometimes, when the change set is fairly small (which isn't the case
this time), you might want to send an incremental diff in reply to the
email with review remarks, but you should amend your branch anyway.
Please do. Also, see a few minor comments below.

> 
> Changes in v2:
> - prefixed variable names with prefix 'checkpoint_'
>   instead of 'snap_', so that there is no
>   confusion with memtx snapshots
> - same with changing variable name
>   xlog_only to wal_only
> - rewrote gc_run so that there is only a single
>   loop over checkpoints, and also one excess old
>   WAL is removed (it was unneeded, but kept due
>   to a mistake). Now wal_collect_garbage or
>   engine_collect_garbage are called only if they
>   have work to do.
> - fix tests to correctly check the amount of xlogs
>   kept by garbage collection
> 
>  src/box/gc.c                 | 90 +++++++++++++++++++-------------------------
>  src/box/gc.h                 | 10 ++---
>  test/replication/gc.result   | 26 ++++++++++++-
>  test/replication/gc.test.lua | 14 ++++++-
>  4 files changed, 80 insertions(+), 60 deletions(-)
> 
> diff --git a/src/box/gc.c b/src/box/gc.c
> index 288cc7236..06d2fbd35 100644
> --- a/src/box/gc.c
> +++ b/src/box/gc.c
> @@ -161,12 +161,12 @@ gc_free(void)
>  	latch_destroy(&gc.latch);
>  }
>  
> -/** Find the consumer that uses the oldest snapshot */
> +/** Find the consumer that uses the oldest checkpoint */

Comment-style: a dot at the end of the sentence is missing.

>  struct gc_consumer *
> -gc_first_snap(gc_tree_t *consumers)
> +gc_tree_first_checkpoint(gc_tree_t *consumers)
>  {
>  	struct gc_consumer *consumer = gc_tree_first(consumers);
> -	while (consumer != NULL && consumer->xlog_only)
> +	while (consumer != NULL && consumer->wal_only)
>  		consumer = gc_tree_next(consumers, consumer);
>  	return consumer;
>  }
> @@ -179,15 +179,16 @@ gc_run(void)
>  
>  	/* Look up the consumer that uses the oldest WAL */
>  	struct gc_consumer *leftmost = gc_tree_first(&gc.consumers);
> -	/* Look up the consumer that uses the oldest snapshot. */
> -	struct gc_consumer *leftmost_snap = gc_first_snap(&gc.consumers);
> +	/* Look up the consumer that uses the oldest checkpoint. */
> +	struct gc_consumer *leftmost_checkpoint =
> +	  gc_tree_first_checkpoint(&gc.consumers);

Coding-style: when continuing an expression to the next line, indent it
with tabs, please:

	struct gc_consumer *leftmost_checkpoint =
		gc_tree_first_checkpoint(&gc.consumers).

>  
>  	/*
>  	 * Find the oldest checkpoint that must be preserved.
> -	 * We have to maintain @checkpoint_count oldest snapshots,
> -	 * plus we can't remove snapshots that are still in use.
> +	 * We have to maintain @checkpoint_count oldest checkpoints,
> +	 * plus we can't remove checkpoints that are still in use.
>  	 */
> -	int64_t gc_xlog_signature = -1;
> +	int64_t gc_checkpoint_signature = -1;
>  
>  	struct checkpoint_iterator checkpoints;
>  	checkpoint_iterator_init(&checkpoints);
> @@ -196,37 +197,20 @@ gc_run(void)
>  	while ((vclock = checkpoint_iterator_prev(&checkpoints)) != NULL) {
>  		if (--checkpoint_count > 0)
>  			continue;
> -		if (leftmost != NULL &&
> -		    leftmost->signature < vclock_sum(vclock))
> +		if (leftmost_checkpoint != NULL &&
> +		    leftmost_checkpoint->signature < vclock_sum(vclock))
>  			continue;
> -		gc_xlog_signature = vclock_sum(vclock);
> +		gc_checkpoint_signature = vclock_sum(vclock);
>  		break;
>  	}
>  
> -	int64_t gc_snap_signature = -1;
> -	checkpoint_count = gc.checkpoint_count;
> +	int64_t gc_wal_signature = MIN(gc_checkpoint_signature, leftmost != NULL ?
> +	  leftmost->signature : INT64_MAX);

Ditto.

>  
> -	checkpoint_iterator_init(&checkpoints);
> -
> -	while ((vclock = checkpoint_iterator_prev(&checkpoints)) != NULL) {
> -		if (--checkpoint_count > 0)
> -			continue;
> -		if (leftmost_snap != NULL &&
> -		    leftmost_snap->signature < vclock_sum(vclock))
> -			continue;
> -		gc_snap_signature = vclock_sum(vclock);
> -		break;
> -	}
> -
> -	if (gc_snap_signature <= gc.snap_signature &&
> -	    gc_xlog_signature <= gc.xlog_signature)
> +	if (gc_checkpoint_signature <= gc.checkpoint_signature &&
> +	    gc_wal_signature <= gc.wal_signature)
>  		return; /* nothing to do */
>  
> -	if (gc_snap_signature > gc.snap_signature)
> -		gc.snap_signature = gc_snap_signature;
> -	if (gc_xlog_signature > gc.xlog_signature)
> -		gc.xlog_signature = gc_xlog_signature;
> -
>  	/*
>  	 * Engine callbacks may sleep, because they use coio for
>  	 * removing files. Make sure we won't try to remove the
> @@ -234,6 +218,7 @@ gc_run(void)
>  	 * executions.
>  	 */
>  	latch_lock(&gc.latch);
> +
>  	/*
>  	 * Run garbage collection.
>  	 *

Please try to avoid stray hunks, like this one, when preparing a patch.
They complicate review.

> @@ -241,8 +226,17 @@ gc_run(void)
>  	 * collection for memtx snapshots first and abort if it
>  	 * fails - see comment to memtx_engine_collect_garbage().
>  	 */
> -	if (engine_collect_garbage(gc_snap_signature) == 0)
> -		wal_collect_garbage(gc_xlog_signature);
> +	int rc = 0;
> +
> +	if (gc_checkpoint_signature > gc.checkpoint_signature) {
> +		gc.checkpoint_signature = gc_checkpoint_signature;
> +		rc = engine_collect_garbage(gc_checkpoint_signature);
> +	}
> +	if (gc_wal_signature > gc.wal_signature) {
> +		gc.wal_signature = gc_wal_signature;
> +		if (rc == 0)
> +			wal_collect_garbage(gc_wal_signature);
> +	}

I guess, in case engine_collect_garbage() fails and we don't call
wal_collect_garbage(), we shouldn't advance gc.wal_signature:

	if (rc == 0 && gc_wal_signature > gc.wal_signature) {
		gc.wal_signature = gc_wal_signature;
		wal_collect_garbage(gc_wal_signature);
	}



More information about the Tarantool-patches mailing list