Re[2]: [PATCH v2] replication: remove old snapshot files not needed by replicas

Sergey Petrenko sergepetrenko at tarantool.org
Thu Jun 28 17:45:12 MSK 2018


>Четверг, 28 июня 2018, 16:00 +03:00 от Vladimir Davydov <vdavydov.dev at gmail.com>:
>
>On Thu, Jun 28, 2018 at 03:09:08PM +0300, Serge Petrenko wrote:
>> Garbage collection doesn't distinguish consumers which need checkpoint
>> files, such as backup, and the ones, who only need WALS, such as
>> replicas. A disconnected replica will 'hold' all checkpoint files, created
>> after it got unsynchronised, even though it doesn't need them, which may
>> lead to disk space shortage. To fix this, we store consumer's type, and
>> treat consumers differently during garbage collection: now only the old
>> WALS are stored for replicas, and old checkpoints are stored for backup,
>> if any. Also changed the tests to check updated garbage collection correctly.
>> 
>> Closes #3444
>> ---
>>  https://github.com/tarantool/tarantool/tree/sergepetrenko/gh-3444-remove-old-shapshots-for-replicas
>>  https://github.com/tarantool/tarantool/issues/3444
>
>After fixing your code according to review comments, you should amend
>your old commit, not make a new one (take a look at `git commit --amend`
>and `git revert -i`), then send the amended patch to the mailing list.
>Sometimes, when the change set is fairly small (which isn't the case
>this time), you might want to send an incremental diff in reply to the
>email with review remarks, but you should amend your branch anyway.
>Please do. Also, see a few minor comments below. 
Sorry, all done.
>
>> 
>> Changes in v2:
>> - prefixed variable names with prefix 'checkpoint_'
>>   instead of 'snap_', so that there is no
>>   confusion with memtx snapshots
>> - same with changing variable name
>>   xlog_only to wal_only
>> - rewrote gc_run so that there is only a single
>>   loop over checkpoints, and also one excess old
>>   WAL is removed (it was unneeded, but kept due
>>   to a mistake). Now wal_collect_garbage or
>>   engine_collect_garbage are called only if they
>>   have work to do.
>> - fix tests to correctly check the amount of xlogs
>>   kept by garbage collection
>> 
>>  src/box/gc.c                 | 90 +++++++++++++++++++-------------------------
>>  src/box/gc.h                 | 10 ++---
>>  test/replication/gc.result   | 26 ++++++++++++-
>>  test/replication/gc.test.lua | 14 ++++++-
>>  4 files changed, 80 insertions(+), 60 deletions(-)
>> 
>> diff --git a/src/box/gc.c b/src/box/gc.c
>> index 288cc7236..06d2fbd35 100644
>> --- a/src/box/gc.c
>> +++ b/src/box/gc.c
>> @@ -161,12 +161,12 @@ gc_free(void)
>>  	latch_destroy(&gc.latch);
>>  }
>> 
>> -/** Find the consumer that uses the oldest snapshot */
>> +/** Find the consumer that uses the oldest checkpoint */
>
>Comment-style: a dot at the end of the sentence is missing. 
Fixed.
>
>>  struct gc_consumer *
>> -gc_first_snap(gc_tree_t *consumers)
>> +gc_tree_first_checkpoint(gc_tree_t *consumers)
>>  {
>>  	struct gc_consumer *consumer = gc_tree_first(consumers);
>> -	while (consumer != NULL && consumer->xlog_only)
>> +	while (consumer != NULL && consumer->wal_only)
>>  		consumer = gc_tree_next(consumers, consumer);
>>  	return consumer;
>>  }
>> @@ -179,15 +179,16 @@ gc_run(void)
>> 
>>  	/* Look up the consumer that uses the oldest WAL */
>>  	struct gc_consumer *leftmost = gc_tree_first(&gc.consumers);
>> -	/* Look up the consumer that uses the oldest snapshot. */
>> -	struct gc_consumer *leftmost_snap = gc_first_snap(&gc.consumers);
>> +	/* Look up the consumer that uses the oldest checkpoint. */
>> +	struct gc_consumer *leftmost_checkpoint =
>> +	  gc_tree_first_checkpoint(&gc.consumers);
>
>Coding-style: when continuing an expression to the next line, indent it
>with tabs, please:
>
>struct gc_consumer *leftmost_checkpoint =
>gc_tree_first_checkpoint(&gc.consumers). 
Fixed.
>
>> 
>>  	/*
>>  	 * Find the oldest checkpoint that must be preserved.
>> -	 * We have to maintain @checkpoint_count oldest snapshots,
>> -	 * plus we can't remove snapshots that are still in use.
>> +	 * We have to maintain @checkpoint_count oldest checkpoints,
>> +	 * plus we can't remove checkpoints that are still in use.
>>  	 */
>> -	int64_t gc_xlog_signature = -1;
>> +	int64_t gc_checkpoint_signature = -1;
>> 
>>  	struct checkpoint_iterator checkpoints;
>>  	checkpoint_iterator_init(&checkpoints);
>> @@ -196,37 +197,20 @@ gc_run(void)
>>  	while ((vclock = checkpoint_iterator_prev(&checkpoints)) != NULL) {
>>  		if (--checkpoint_count > 0)
>>  			continue;
>> -		if (leftmost != NULL &&
>> -		    leftmost->signature < vclock_sum(vclock))
>> +		if (leftmost_checkpoint != NULL &&
>> +		    leftmost_checkpoint->signature < vclock_sum(vclock))
>>  			continue;
>> -		gc_xlog_signature = vclock_sum(vclock);
>> +		gc_checkpoint_signature = vclock_sum(vclock);
>>  		break;
>>  	}
>> 
>> -	int64_t gc_snap_signature = -1;
>> -	checkpoint_count = gc.checkpoint_count;
>> +	int64_t gc_wal_signature = MIN(gc_checkpoint_signature, leftmost != NULL ?
>> +	  leftmost->signature : INT64_MAX);
>
>Ditto. 
Fixed.
>
>> 
>> -	checkpoint_iterator_init(&checkpoints);
>> -
>> -	while ((vclock = checkpoint_iterator_prev(&checkpoints)) != NULL) {
>> -		if (--checkpoint_count > 0)
>> -			continue;
>> -		if (leftmost_snap != NULL &&
>> -		    leftmost_snap->signature < vclock_sum(vclock))
>> -			continue;
>> -		gc_snap_signature = vclock_sum(vclock);
>> -		break;
>> -	}
>> -
>> -	if (gc_snap_signature <= gc.snap_signature &&
>> -	    gc_xlog_signature <= gc.xlog_signature)
>> +	if (gc_checkpoint_signature <= gc.checkpoint_signature &&
>> +	    gc_wal_signature <= gc.wal_signature)
>>  		return; /* nothing to do */
>> 
>> -	if (gc_snap_signature > gc.snap_signature)
>> -		gc.snap_signature = gc_snap_signature;
>> -	if (gc_xlog_signature > gc.xlog_signature)
>> -		gc.xlog_signature = gc_xlog_signature;
>> -
>>  	/*
>>  	 * Engine callbacks may sleep, because they use coio for
>>  	 * removing files. Make sure we won't try to remove the
>> @@ -234,6 +218,7 @@ gc_run(void)
>>  	 * executions.
>>  	 */
>>  	latch_lock(&gc.latch);
>> +
>>  	/*
>>  	 * Run garbage collection.
>>  	 *
>
>Please try to avoid stray hunks, like this one, when preparing a patch.
>They complicate review. 
Wym?
>
>> @@ -241,8 +226,17 @@ gc_run(void)
>>  	 * collection for memtx snapshots first and abort if it
>>  	 * fails - see comment to memtx_engine_collect_garbage().
>>  	 */
>> -	if (engine_collect_garbage(gc_snap_signature) == 0)
>> -		wal_collect_garbage(gc_xlog_signature);
>> +	int rc = 0;
>> +
>> +	if (gc_checkpoint_signature > gc.checkpoint_signature) {
>> +		gc.checkpoint_signature = gc_checkpoint_signature;
>> +		rc = engine_collect_garbage(gc_checkpoint_signature);
>> +	}
>> +	if (gc_wal_signature > gc.wal_signature) {
>> +		gc.wal_signature = gc_wal_signature;
>> +		if (rc == 0)
>> +			wal_collect_garbage(gc_wal_signature);
>> +	}
>
>I guess, in case engine_collect_garbage() fails and we don't call
>wal_collect_garbage(), we shouldn't advance gc.wal_signature:
>
>if (rc == 0 && gc_wal_signature > gc.wal_signature) {
>gc.wal_signature = gc_wal_signature;
>wal_collect_garbage(gc_wal_signature);
>}
That's how it was done previously. First update gc.signature, and only then
try to run encgine_collect_garbage() and wal_collect_garbage(), so I decided
to leave it as is.

---
src/box/gc.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/src/box/gc.c b/src/box/gc.c
index 06d2fbd35..4d8f321ff 100644
--- a/src/box/gc.c
+++ b/src/box/gc.c
@@ -161,7 +161,7 @@ gc_free(void)
latch_destroy(&gc.latch);
}

-/** Find the consumer that uses the oldest checkpoint */
+/** Find the consumer that uses the oldest checkpoint. */
struct gc_consumer *
gc_tree_first_checkpoint(gc_tree_t *consumers)
{
@@ -181,7 +181,7 @@ gc_run(void)
struct gc_consumer *leftmost = gc_tree_first(&gc.consumers);
/* Look up the consumer that uses the oldest checkpoint. */
struct gc_consumer *leftmost_checkpoint =
- gc_tree_first_checkpoint(&gc.consumers);
+ gc_tree_first_checkpoint(&gc.consumers);

/*
* Find the oldest checkpoint that must be preserved.
@@ -198,17 +198,17 @@ gc_run(void)
if (--checkpoint_count > 0)
continue;
if (leftmost_checkpoint != NULL &&
- leftmost_checkpoint->signature < vclock_sum(vclock))
+ leftmost_checkpoint->signature < vclock_sum(vclock))
continue;
gc_checkpoint_signature = vclock_sum(vclock);
break;
}

int64_t gc_wal_signature = MIN(gc_checkpoint_signature, leftmost != NULL ?
- leftmost->signature : INT64_MAX);
+ leftmost->signature : INT64_MAX);

if (gc_checkpoint_signature <= gc.checkpoint_signature &&
- gc_wal_signature <= gc.wal_signature)
+ gc_wal_signature <= gc.wal_signature)
return; /* nothing to do */

/*
-- 
2.15.2 (Apple Git-101.1)

-- 
Best regards,
Sergey Petrenko
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.tarantool.org/pipermail/tarantool-patches/attachments/20180628/084b15ad/attachment.html>


More information about the Tarantool-patches mailing list