[Tarantool-patches] [PATCH v3 2/4] gc: rely on minimal vclock components instead of signatures

Serge Petrenko sergepetrenko at tarantool.org
Mon Mar 23 19:19:33 MSK 2020


The current WAL GC implementation tracks consumers (i.e. remote replicas) by
their vclock signature, which is the sum of all vclock components.
This approach is wrong, and this can be shown by a little example.
The example will be a little synthetic, but it'll illustrate the problem.
Say, you have 2 masters, A and B with ids 1 and 2 respectively, and a replica C
with id 3. Say, С replicates from both A and B, and there is no replication
between A and B (say, the instances were reconfigured to not replicate from each
other). Now, say replica C has followed A and B to vclock {1:5, 2:13}. At the
same time, A has lsn 10 and B has lsn 15. A and B do not know about each other’s
changes, so A’s vclock is {1:10} and B’s vclock is {2:15}. Now imagine A does a
snapshot and creates a new xlog with signature 10. A’s directory will look like:
00…000.xlog 00…010.snap 00….010.xlog
Replica C reports its vclock {1:5, 2:13} to A, A uses the vclock to update the
corresponding GC consumer. Since signatures are used, GC consumer is assigned a
signature = 13 + 5 = 18. This is greater than the signature of the last xlog on
A (which is 10), so the previous xlog (00…00.xlog) can be deleted (at least A
assumes it can be). Actually, replica still needs 00…00.xlog, because it
contains rows corresponding to vclocks {1:6} - {1:10}, which haven’t been
replicated yet.

If instead of using vclock signatures, gc consumers used vclocks, such a problem
wouldn’t arise. Replia would report its vclock {1:5, 2:13}. The vclock is NOT
strictly greater than A’s most recent xlog vclock ({1:10}), so the previous log
is kept until replica reports a vclock {1:10, 2:something} or {1:11, …} and so
on.

Rewrite gc to perform cleanup based on finding minimal vclock
components present in at least one of the consumer vclocks instead of
just comparing vclock signatures.

Prerequisite #4114
---
 src/box/gc.c     | 29 +++++++++++++++----------
 src/box/vclock.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 11 deletions(-)

diff --git a/src/box/gc.c b/src/box/gc.c
index f5c387f9d..b63f65af0 100644
--- a/src/box/gc.c
+++ b/src/box/gc.c
@@ -66,16 +66,15 @@ static int
 gc_checkpoint_fiber_f(va_list);
 
 /**
- * Comparator used for ordering gc_consumer objects by signature
- * in a binary tree.
+ * Comparator used for ordering gc_consumer objects
+ * lexicographically by their vclock in a binary tree.
  */
 static inline int
 gc_consumer_cmp(const struct gc_consumer *a, const struct gc_consumer *b)
 {
-	if (vclock_sum(&a->vclock) < vclock_sum(&b->vclock))
-		return -1;
-	if (vclock_sum(&a->vclock) > vclock_sum(&b->vclock))
-		return 1;
+	int rc = vclock_lex_compare(&a->vclock, &b->vclock);
+	if (rc != 0)
+		return rc;
 	if ((intptr_t)a < (intptr_t)b)
 		return -1;
 	if ((intptr_t)a > (intptr_t)b)
@@ -192,11 +191,19 @@ gc_run_cleanup(void)
 	 * Note, we must keep all WALs created after the
 	 * oldest checkpoint, even if no consumer needs them.
 	 */
-	const struct vclock *vclock = (gc_tree_empty(&gc.consumers) ? NULL :
-				       &gc_tree_first(&gc.consumers)->vclock);
-	if (vclock == NULL ||
-	    vclock_sum(vclock) > vclock_sum(&checkpoint->vclock))
-		vclock = &checkpoint->vclock;
+	struct vclock min_vclock;
+	struct vclock *vclock;
+	struct gc_consumer *consumer = gc_tree_first(&gc.consumers);
+	if (consumer != NULL) {
+		vclock_copy(&min_vclock, &consumer->vclock);
+		while ((consumer = gc_tree_next(&gc.consumers, consumer)) != NULL) {
+			vclock_min(&min_vclock, &consumer->vclock);
+		}
+		vclock_min(&min_vclock, &checkpoint->vclock);
+		vclock = &min_vclock;
+	} else {
+		vclock =  &checkpoint->vclock;
+	}
 
 	if (vclock_sum(vclock) > vclock_sum(&gc.vclock)) {
 		vclock_copy(&gc.vclock, vclock);
diff --git a/src/box/vclock.h b/src/box/vclock.h
index 7dc69015f..7ac9af1bb 100644
--- a/src/box/vclock.h
+++ b/src/box/vclock.h
@@ -335,6 +335,61 @@ vclock_compare_ignore0(const struct vclock *a, const struct vclock *b)
 	return vclock_compare_generic(a, b, true);
 }
 
+/**
+ * Compare vclocks lexicographically.
+ * The following affirmation holds: if a < b in terms of
+ * vclock_compare, then a < b in terms of vclock_lex_compare.
+ * However, a < b in terms of vclock_lex_compare doesn't mean
+ * than a < b in terms of vclock_compare. Example:
+ * {1:5, 2:10} < {1:7, 2:3} (vclock_lex_compare), but the vclocks
+ * are incomparable in terms of vclock_compare.
+ * All vclocks are comparable lexicographically.
+ *
+ * \param a vclock
+ * \param b vclock
+ * \retval 1 if \a a is lexicographically greater than \a b
+ * \retval -1 if \a a is lexicographically less than \a b
+ * \retval 0 if vclocks are equal
+ */
+static inline int
+vclock_lex_compare(const struct vclock *a, const struct vclock *b)
+{
+	vclock_map_t map = a->map | b->map;
+	struct bit_iterator it;
+	bit_iterator_init(&it, &map, sizeof(map), true);
+	for(size_t replica_id = bit_iterator_next(&it); replica_id < VCLOCK_MAX;
+	    replica_id = bit_iterator_next(&it)) {
+		int64_t lsn_a = vclock_get(a, replica_id);
+		int64_t lsn_b = vclock_get(b, replica_id);
+		if (lsn_a < lsn_b) return -1;
+		if (lsn_a > lsn_b) return 1;
+	}
+	return 0;
+}
+
+/**
+ * Return a vclock, which is a componentwise minimum between
+ * vclocks \a a and \a b.
+ *
+ * \param[in,out] a vclock containing the minimum components.
+ * \param b vclock to compare with.
+ */
+static inline void
+vclock_min(struct vclock *a, const struct vclock *b)
+{
+	vclock_map_t map = a->map | b->map;
+	struct bit_iterator it;
+	bit_iterator_init(&it, &map, sizeof(map), true);
+	for(size_t replica_id = bit_iterator_next(&it); replica_id < VCLOCK_MAX;
+	    replica_id = bit_iterator_next(&it)) {
+		int64_t lsn_a = vclock_get(a, replica_id);
+		int64_t lsn_b = vclock_get(b, replica_id);
+		if (lsn_a <= lsn_b)
+			continue;
+		vclock_set(a, replica_id, lsn_b);
+	}
+}
+
 /**
  * @brief vclockset - a set of vclocks
  */
-- 
2.21.1 (Apple Git-122.3)



More information about the Tarantool-patches mailing list