[Tarantool-patches] [PATCH v5 3/4] gc: rely on minimal vclock components instead of signatures

Serge Petrenko sergepetrenko at tarantool.org
Mon Mar 30 14:04:10 MSK 2020


The current WAL GC implementation tracks consumers (i.e. remote replicas) by
their vclock signature, which is the sum of all vclock components.
This approach is wrong, and this can be shown by a little example.
The example will be a little synthetic, but it'll illustrate the problem.
Say, you have 2 masters, A and B with ids 1 and 2 respectively, and a replica C
with id 3. Say, С replicates from both A and B, and there is no replication
between A and B (say, the instances were reconfigured to not replicate from each
other). Now, say replica C has followed A and B to vclock {1:5, 2:13}. At the
same time, A has lsn 10 and B has lsn 15. A and B do not know about each other’s
changes, so A’s vclock is {1:10} and B’s vclock is {2:15}. Now imagine A does a
snapshot and creates a new xlog with signature 10. A’s directory will look like:
00…000.xlog 00…010.snap 00….010.xlog
Replica C reports its vclock {1:5, 2:13} to A, A uses the vclock to update the
corresponding GC consumer. Since signatures are used, GC consumer is assigned a
signature = 13 + 5 = 18. This is greater than the signature of the last xlog on
A (which is 10), so the previous xlog (00…00.xlog) can be deleted (at least A
assumes it can be). Actually, replica still needs 00…00.xlog, because it
contains rows corresponding to vclocks {1:6} - {1:10}, which haven’t been
replicated yet.

If instead of using vclock signatures, gc consumers used vclocks, such a problem
wouldn’t arise. Replia would report its vclock {1:5, 2:13}. The vclock is NOT
strictly greater than A’s most recent xlog vclock ({1:10}), so the previous log
is kept until replica reports a vclock {1:10, 2:something} or {1:11, …} and so
on.

Rewrite gc to perform cleanup based on finding minimal vclock
components present in at least one of the consumer vclocks instead of
just comparing vclock signatures.

Prerequisite #4114
---
 src/box/gc.c     | 41 +++++++++++++++----------
 src/box/vclock.h | 78 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+), 16 deletions(-)

diff --git a/src/box/gc.c b/src/box/gc.c
index f5c387f9d..a2d0a515c 100644
--- a/src/box/gc.c
+++ b/src/box/gc.c
@@ -66,16 +66,15 @@ static int
 gc_checkpoint_fiber_f(va_list);
 
 /**
- * Comparator used for ordering gc_consumer objects by signature
- * in a binary tree.
+ * Comparator used for ordering gc_consumer objects
+ * lexicographically by their vclock in a binary tree.
  */
 static inline int
 gc_consumer_cmp(const struct gc_consumer *a, const struct gc_consumer *b)
 {
-	if (vclock_sum(&a->vclock) < vclock_sum(&b->vclock))
-		return -1;
-	if (vclock_sum(&a->vclock) > vclock_sum(&b->vclock))
-		return 1;
+	int rc = vclock_lex_compare(&a->vclock, &b->vclock);
+	if (rc != 0)
+		return rc;
 	if ((intptr_t)a < (intptr_t)b)
 		return -1;
 	if ((intptr_t)a > (intptr_t)b)
@@ -187,19 +186,29 @@ gc_run_cleanup(void)
 	/* At least one checkpoint must always be available. */
 	assert(checkpoint != NULL);
 
+	/* Find the vclock of the oldest WAL row to keep. */
+	struct vclock min_vclock;
+	struct gc_consumer *consumer = gc_tree_first(&gc.consumers);
 	/*
-	 * Find the vclock of the oldest WAL row to keep.
+	 * Vclock of the oldest WAL row to keep is a by-component
+	 * minimum of all consumer vclocks and the oldest
+	 * checkpoint vclock. This ensures that all rows needed by
+	 * at least one consumer are kept.
 	 * Note, we must keep all WALs created after the
 	 * oldest checkpoint, even if no consumer needs them.
 	 */
-	const struct vclock *vclock = (gc_tree_empty(&gc.consumers) ? NULL :
-				       &gc_tree_first(&gc.consumers)->vclock);
-	if (vclock == NULL ||
-	    vclock_sum(vclock) > vclock_sum(&checkpoint->vclock))
-		vclock = &checkpoint->vclock;
-
-	if (vclock_sum(vclock) > vclock_sum(&gc.vclock)) {
-		vclock_copy(&gc.vclock, vclock);
+	vclock_copy(&min_vclock, &checkpoint->vclock);
+	while (consumer != NULL) {
+		/*
+		 * Consumers will never need rows signed
+		 * with a zero instance id (local rows).
+		 */
+		vclock_min_ignore0(&min_vclock, &consumer->vclock);
+		consumer = gc_tree_next(&gc.consumers, consumer);
+	}
+
+	if (vclock_sum(&min_vclock) > vclock_sum(&gc.vclock)) {
+		vclock_copy(&gc.vclock, &min_vclock);
 		run_wal_gc = true;
 	}
 
@@ -222,7 +231,7 @@ gc_run_cleanup(void)
 	if (run_engine_gc)
 		engine_collect_garbage(&checkpoint->vclock);
 	if (run_wal_gc)
-		wal_collect_garbage(vclock);
+		wal_collect_garbage(&min_vclock);
 }
 
 static int
diff --git a/src/box/vclock.h b/src/box/vclock.h
index 2a3a29020..36fcac966 100644
--- a/src/box/vclock.h
+++ b/src/box/vclock.h
@@ -348,6 +348,84 @@ vclock_compare_ignore0(const struct vclock *a, const struct vclock *b)
 	return vclock_compare_generic(a, b, true);
 }
 
+/**
+ * Compare vclocks lexicographically.
+ * The following affirmation holds: if a < b in terms of
+ * vclock_compare, then a < b in terms of vclock_lex_compare.
+ * However, a < b in terms of vclock_lex_compare doesn't mean
+ * than a < b in terms of vclock_compare. Example:
+ * {1:5, 2:10} < {1:7, 2:3} (vclock_lex_compare), but the vclocks
+ * are incomparable in terms of vclock_compare.
+ * All vclocks are comparable lexicographically.
+ *
+ * \param a vclock
+ * \param b vclock
+ * \retval 1 if \a a is lexicographically greater than \a b
+ * \retval -1 if \a a is lexicographically less than \a b
+ * \retval 0 if vclocks are equal
+ */
+static inline int
+vclock_lex_compare(const struct vclock *a, const struct vclock *b)
+{
+	vclock_map_t map = a->map | b->map;
+	struct bit_iterator it;
+	bit_iterator_init(&it, &map, sizeof(map), true);
+	for(size_t replica_id = bit_iterator_next(&it); replica_id < VCLOCK_MAX;
+	    replica_id = bit_iterator_next(&it)) {
+		int64_t lsn_a = vclock_get(a, replica_id);
+		int64_t lsn_b = vclock_get(b, replica_id);
+		if (lsn_a < lsn_b) return -1;
+		if (lsn_a > lsn_b) return 1;
+	}
+	return 0;
+}
+
+/**
+ * Return a vclock, which is a componentwise minimum between
+ * vclocks \a a and \a b.
+ *
+ * \param[in,out] a vclock containing the minimum components.
+ * \param b vclock to compare with.
+ * \param ignore_zero whether to ignore 0th vclock component
+ *                    or not.
+ */
+static inline void
+vclock_min_generic(struct vclock *a, const struct vclock *b, bool ignore_zero)
+{
+	vclock_map_t map = a->map | b->map;
+	struct bit_iterator it;
+	bit_iterator_init(&it, &map, sizeof(map), true);
+	size_t replica_id = bit_iterator_next(&it);
+	if (replica_id == 0 && ignore_zero)
+		replica_id = bit_iterator_next(&it);
+
+	for( ; replica_id < VCLOCK_MAX; replica_id = bit_iterator_next(&it)) {
+		int64_t lsn_a = vclock_get(a, replica_id);
+		int64_t lsn_b = vclock_get(b, replica_id);
+		if (lsn_a <= lsn_b)
+			continue;
+		vclock_reset(a, replica_id, lsn_b);
+	}
+}
+
+/**
+ * \sa vclock_min_generic
+ */
+static inline void
+vclock_min(struct vclock *a, const struct vclock *b)
+{
+	return vclock_min_generic(a, b, false);
+}
+
+/**
+ * \sa vclock_min_generic
+ */
+static inline void
+vclock_min_ignore0(struct vclock *a, const struct vclock *b)
+{
+	return vclock_min_generic(a, b, true);
+}
+
 /**
  * @brief vclockset - a set of vclocks
  */
-- 
2.21.1 (Apple Git-122.3)



More information about the Tarantool-patches mailing list