[Tarantool-patches] [PATCH v5 3/4] gc: rely on minimal vclock components instead of signatures

Serge Petrenko sergepetrenko at tarantool.org
Tue Apr 7 15:40:25 MSK 2020



> 4 апр. 2020 г., в 23:51, Vladislav Shpilevoy <v.shpilevoy at tarantool.org> написал(а):
> 
> Thanks for the patch!
> 

Thanks for the review!

> See 5 comments below.
> 
> On 30/03/2020 13:04, Serge Petrenko wrote:
>> The current WAL GC implementation tracks consumers (i.e. remote replicas) by
>> their vclock signature, which is the sum of all vclock components.
>> This approach is wrong, and this can be shown by a little example.
>> The example will be a little synthetic, but it'll illustrate the problem.
>> Say, you have 2 masters, A and B with ids 1 and 2 respectively, and a replica C
>> with id 3. Say, С replicates from both A and B, and there is no replication
>> between A and B (say, the instances were reconfigured to not replicate from each
>> other). Now, say replica C has followed A and B to vclock {1:5, 2:13}. At the
>> same time, A has lsn 10 and B has lsn 15. A and B do not know about each other’s
>> changes, so A’s vclock is {1:10} and B’s vclock is {2:15}. Now imagine A does a
>> snapshot and creates a new xlog with signature 10. A’s directory will look like:
>> 00…000.xlog 00…010.snap 00….010.xlog
>> Replica C reports its vclock {1:5, 2:13} to A, A uses the vclock to update the
>> corresponding GC consumer. Since signatures are used, GC consumer is assigned a
>> signature = 13 + 5 = 18. This is greater than the signature of the last xlog on
>> A (which is 10), so the previous xlog (00…00.xlog) can be deleted (at least A
>> assumes it can be). Actually, replica still needs 00…00.xlog, because it
>> contains rows corresponding to vclocks {1:6} - {1:10}, which haven’t been
>> replicated yet.
>> 
>> If instead of using vclock signatures, gc consumers used vclocks, such a problem
>> wouldn’t arise. Replia would report its vclock {1:5, 2:13}. The vclock is NOT
> 
> 1. Replia -> Replica.

Yep, thanks.

> 
>> strictly greater than A’s most recent xlog vclock ({1:10}), so the previous log
>> is kept until replica reports a vclock {1:10, 2:something} or {1:11, …} and so
>> on.
>> 
>> Rewrite gc to perform cleanup based on finding minimal vclock
>> components present in at least one of the consumer vclocks instead of
>> just comparing vclock signatures.
>> 
>> Prerequisite #4114
>> ---
>> src/box/gc.c     | 41 +++++++++++++++----------
>> src/box/vclock.h | 78 ++++++++++++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 103 insertions(+), 16 deletions(-)
>> 
>> diff --git a/src/box/gc.c b/src/box/gc.c
>> index f5c387f9d..a2d0a515c 100644
>> --- a/src/box/gc.c
>> +++ b/src/box/gc.c
>> @@ -66,16 +66,15 @@ static int
>> gc_checkpoint_fiber_f(va_list);
>> 
>> /**
>> - * Comparator used for ordering gc_consumer objects by signature
>> - * in a binary tree.
>> + * Comparator used for ordering gc_consumer objects
>> + * lexicographically by their vclock in a binary tree.
>>  */
>> static inline int
>> gc_consumer_cmp(const struct gc_consumer *a, const struct gc_consumer *b)
>> {
>> -	if (vclock_sum(&a->vclock) < vclock_sum(&b->vclock))
>> -		return -1;
>> -	if (vclock_sum(&a->vclock) > vclock_sum(&b->vclock))
>> -		return 1;
>> +	int rc = vclock_lex_compare(&a->vclock, &b->vclock);
>> +	if (rc != 0)
>> +		return rc;
>> 	if ((intptr_t)a < (intptr_t)b)
>> 		return -1;
>> 	if ((intptr_t)a > (intptr_t)b)
>> @@ -187,19 +186,29 @@ gc_run_cleanup(void)
>> 	/* At least one checkpoint must always be available. */
>> 	assert(checkpoint != NULL);
>> 
>> +	/* Find the vclock of the oldest WAL row to keep. */
>> +	struct vclock min_vclock;
>> +	struct gc_consumer *consumer = gc_tree_first(&gc.consumers);
>> 	/*
>> -	 * Find the vclock of the oldest WAL row to keep.
>> +	 * Vclock of the oldest WAL row to keep is a by-component
>> +	 * minimum of all consumer vclocks and the oldest
>> +	 * checkpoint vclock. This ensures that all rows needed by
>> +	 * at least one consumer are kept.
>> 	 * Note, we must keep all WALs created after the
>> 	 * oldest checkpoint, even if no consumer needs them.
>> 	 */
>> -	const struct vclock *vclock = (gc_tree_empty(&gc.consumers) ? NULL :
>> -				       &gc_tree_first(&gc.consumers)->vclock);
>> -	if (vclock == NULL ||
>> -	    vclock_sum(vclock) > vclock_sum(&checkpoint->vclock))
>> -		vclock = &checkpoint->vclock;
>> -
>> -	if (vclock_sum(vclock) > vclock_sum(&gc.vclock)) {
>> -		vclock_copy(&gc.vclock, vclock);
>> +	vclock_copy(&min_vclock, &checkpoint->vclock);
>> +	while (consumer != NULL) {
>> +		/*
>> +		 * Consumers will never need rows signed
>> +		 * with a zero instance id (local rows).
>> +		 */
>> +		vclock_min_ignore0(&min_vclock, &consumer->vclock);
>> +		consumer = gc_tree_next(&gc.consumers, consumer);
>> +	}
>> +
>> +	if (vclock_sum(&min_vclock) > vclock_sum(&gc.vclock)) {
> 
> 2. Why do we still use sum here?

Doesn’t matter. In this case min_vclock can only get bigger over time,
so comparing signatures is the same as comparing vclocks.
We’re just checking whether min_vclock has changed, and if it has,
it got bigger.

> 
>> +		vclock_copy(&gc.vclock, &min_vclock);
>> 		run_wal_gc = true;
>> 	}
>> 
>> @@ -222,7 +231,7 @@ gc_run_cleanup(void)
>> 	if (run_engine_gc)
>> 		engine_collect_garbage(&checkpoint->vclock);
>> 	if (run_wal_gc)
>> -		wal_collect_garbage(vclock);
>> +		wal_collect_garbage(&min_vclock);
>> }
>> 
>> static int
>> diff --git a/src/box/vclock.h b/src/box/vclock.h
>> index 2a3a29020..36fcac966 100644
>> --- a/src/box/vclock.h
>> +++ b/src/box/vclock.h
>> @@ -348,6 +348,84 @@ vclock_compare_ignore0(const struct vclock *a, const struct vclock *b)
>> 	return vclock_compare_generic(a, b, true);
>> }
>> 
>> +/**
>> + * Compare vclocks lexicographically.
>> + * The following affirmation holds: if a < b in terms of
>> + * vclock_compare, then a < b in terms of vclock_lex_compare.
>> + * However, a < b in terms of vclock_lex_compare doesn't mean
>> + * than a < b in terms of vclock_compare. Example:
>> + * {1:5, 2:10} < {1:7, 2:3} (vclock_lex_compare), but the vclocks
>> + * are incomparable in terms of vclock_compare.
>> + * All vclocks are comparable lexicographically.
>> + *
>> + * \param a vclock
>> + * \param b vclock
>> + * \retval 1 if \a a is lexicographically greater than \a b
>> + * \retval -1 if \a a is lexicographically less than \a b
>> + * \retval 0 if vclocks are equal
> 3. Please, be consistent with other comments' syntax. I.e. use @
> instead of \, start sentences with capital letters, end with dots.

Ok, will fix.

> 
>> + */
>> +static inline int
>> +vclock_lex_compare(const struct vclock *a, const struct vclock *b)
>> +{
>> +	vclock_map_t map = a->map | b->map;
>> +	struct bit_iterator it;
>> +	bit_iterator_init(&it, &map, sizeof(map), true);
>> +	for(size_t replica_id = bit_iterator_next(&it); replica_id < VCLOCK_MAX;
>> +	    replica_id = bit_iterator_next(&it)) {
>> +		int64_t lsn_a = vclock_get(a, replica_id);
>> +		int64_t lsn_b = vclock_get(b, replica_id);
>> +		if (lsn_a < lsn_b) return -1;
>> +		if (lsn_a > lsn_b) return 1;
> 
> 4. Could you please put if's body on separate lines?

No problem.

> 
>> +	}
>> +	return 0;
>> +}
>> +
>> +/**
>> + * Return a vclock, which is a componentwise minimum between
>> + * vclocks \a a and \a b.
>> + *
>> + * \param[in,out] a vclock containing the minimum components.
>> + * \param b vclock to compare with.
>> + * \param ignore_zero whether to ignore 0th vclock component
>> + *                    or not.
>> + */
>> +static inline void
>> +vclock_min_generic(struct vclock *a, const struct vclock *b, bool ignore_zero)
>> +{
>> +	vclock_map_t map = a->map | b->map;
>> +	struct bit_iterator it;
>> +	bit_iterator_init(&it, &map, sizeof(map), true);
>> +	size_t replica_id = bit_iterator_next(&it);
>> +	if (replica_id == 0 && ignore_zero)
>> +		replica_id = bit_iterator_next(&it);
>> +
>> +	for( ; replica_id < VCLOCK_MAX; replica_id = bit_iterator_next(&it)) {
>> +		int64_t lsn_a = vclock_get(a, replica_id);
>> +		int64_t lsn_b = vclock_get(b, replica_id);
>> +		if (lsn_a <= lsn_b)
>> +			continue;
>> +		vclock_reset(a, replica_id, lsn_b);
>> +	}
>> +}
>> +
>> +/**
>> + * \sa vclock_min_generic
>> + */
>> +static inline void
>> +vclock_min(struct vclock *a, const struct vclock *b)
> 
> 5. This function is never used, so you can drop it,
> rename vclock_min_generic to vclock_min_ignore0, and
> inline ignore_zero to true.

Ok.

diff --git a/src/box/vclock.h b/src/box/vclock.h
index b79895e31..ed519f66b 100644
--- a/src/box/vclock.h
+++ b/src/box/vclock.h
@@ -365,11 +365,11 @@ vclock_compare_ignore0(const struct vclock *a, const struct vclock *b)
  * are incomparable in terms of vclock_compare.
  * All vclocks are comparable lexicographically.
  *
- * \param a vclock
- * \param b vclock
- * \retval 1 if \a a is lexicographically greater than \a b
- * \retval -1 if \a a is lexicographically less than \a b
- * \retval 0 if vclocks are equal
+ * @param a Vclock.
+ * @param b Vclock.
+ * @retval 1 If @a a is lexicographically greater than @a b.
+ * @retval -1 If @a a is lexicographically less than @a b.
+ * @retval 0 If vclocks are equal.
  */
 static inline int
 vclock_lex_compare(const struct vclock *a, const struct vclock *b)
@@ -381,23 +381,23 @@ vclock_lex_compare(const struct vclock *a, const struct vclock *b)
 	    replica_id = bit_iterator_next(&it)) {
 		int64_t lsn_a = vclock_get(a, replica_id);
 		int64_t lsn_b = vclock_get(b, replica_id);
-		if (lsn_a < lsn_b) return -1;
-		if (lsn_a > lsn_b) return 1;
+		if (lsn_a < lsn_b)
+			return -1;
+		if (lsn_a > lsn_b)
+			return 1;
 	}
 	return 0;
 }
 
 /**
  * Return a vclock, which is a componentwise minimum between
- * vclocks \a a and \a b.
+ * vclocks @a a and @a b. Do not take vclock[0] into account.
  *
- * \param[in,out] a vclock containing the minimum components.
- * \param b vclock to compare with.
- * \param ignore_zero whether to ignore 0th vclock component
- *                    or not.
+ * @param[in,out] a Vclock containing the minimum components.
+ * @param b Vclock to compare with.
  */
 static inline void
-vclock_min_generic(struct vclock *a, const struct vclock *b, bool ignore_zero)
+vclock_min_ignore0(struct vclock *a, const struct vclock *b)
 {
 	vclock_map_t map = a->map | b->map;
 	struct bit_iterator it;
@@ -415,24 +415,6 @@ vclock_min_generic(struct vclock *a, const struct vclock *b, bool ignore_zero)
 	}
 }
 
-/**
- * \sa vclock_min_generic
- */
-static inline void
-vclock_min(struct vclock *a, const struct vclock *b)
-{
-	return vclock_min_generic(a, b, false);
-}
-
-/**
- * \sa vclock_min_generic
- */
-static inline void
-vclock_min_ignore0(struct vclock *a, const struct vclock *b)
-{
-	return vclock_min_generic(a, b, true);
-}
-
 /**
  * @brief vclockset - a set of vclocks
  */


--
Serge Petrenko
sergepetrenko at tarantool.org




More information about the Tarantool-patches mailing list