[PATCH v2 05/14] bloom: do not use tuple_common_key_parts when constructing tuple bloom

Vladimir Davydov vdavydov.dev at gmail.com
Wed Mar 13 11:52:51 MSK 2019


Tuple bloom filter is an array of bloom filters, each of which reflects
lookups by all possible partial keys. To optimize the overall bloom
filter size, we need to know how many unique elements there are for each
partial key. To achieve that, we require the caller to pass the number
of key parts that have been hashed for the given tuple. Here's how it
looks in Vinyl:

        uint32_t hashed_parts = writer->last_stmt == NULL ? 0 :
                tuple_common_key_parts(stmt, writer->last_stmt,
                                       writer->key_def);
        tuple_bloom_builder_add(writer->bloom, stmt,
                                writer->key_def, hashed_parts);

Actually, there's no need in such a requirement as instead we can
calculate the hash value for the given tuple, compare it with the hash
of the tuple added last time, and add the new hash only if the two
values differ. This should be accurate enough while allowing us to get
rid of the cumbersome tuple_common_key_parts helper. Note, such a check
will only work if tuples are added in the order defined by the key
definition, but that already holds - anyway, one wouldn't be able to
use tuple_common_key_parts either if it wasn't true.
---
 src/box/key_def.h        | 11 -----------
 src/box/tuple_bloom.c    | 15 ++++++++-------
 src/box/tuple_bloom.h    |  8 ++++----
 src/box/tuple_compare.cc | 33 ---------------------------------
 src/box/vy_run.c         | 10 ++--------
 5 files changed, 14 insertions(+), 63 deletions(-)

diff --git a/src/box/key_def.h b/src/box/key_def.h
index dd62f6a3..e3a34ce1 100644
--- a/src/box/key_def.h
+++ b/src/box/key_def.h
@@ -512,17 +512,6 @@ tuple_extract_key_raw(const char *data, const char *data_end,
 }
 
 /**
- * Return the length of the longest common prefix of two tuples.
- * @param tuple_a first tuple
- * @param tuple_b second tuple
- * @param key_def key defintion
- * @return number of key parts the two tuples have in common
- */
-uint32_t
-tuple_common_key_parts(const struct tuple *tuple_a, const struct tuple *tuple_b,
-		       struct key_def *key_def);
-
-/**
  * Compare keys using the key definition.
  * @param key_a key parts with MessagePack array header
  * @param part_count_a the number of parts in the key_a
diff --git a/src/box/tuple_bloom.c b/src/box/tuple_bloom.c
index cf887c7b..39d218dc 100644
--- a/src/box/tuple_bloom.c
+++ b/src/box/tuple_bloom.c
@@ -72,8 +72,7 @@ tuple_bloom_builder_delete(struct tuple_bloom_builder *builder)
 
 int
 tuple_bloom_builder_add(struct tuple_bloom_builder *builder,
-			const struct tuple *tuple, struct key_def *key_def,
-			uint32_t hashed_parts)
+			const struct tuple *tuple, struct key_def *key_def)
 {
 	assert(builder->part_count == key_def->part_count);
 
@@ -82,17 +81,20 @@ tuple_bloom_builder_add(struct tuple_bloom_builder *builder,
 	uint32_t total_size = 0;
 
 	for (uint32_t i = 0; i < key_def->part_count; i++) {
+		struct tuple_hash_array *hash_arr = &builder->parts[i];
 		total_size += tuple_hash_key_part(&h, &carry, tuple,
 						  &key_def->parts[i]);
-		if (i < hashed_parts) {
+		uint32_t hash = PMurHash32_Result(h, carry, total_size);
+		if (hash_arr->count > 0 &&
+		    hash_arr->values[hash_arr->count - 1] == hash) {
 			/*
 			 * This part is already in the bloom, proceed
-			 * to the next one. Note, we can't skip to
-			 * hashed_parts, as we need to compute the hash.
+			 * to the next one. Note, this check only works
+			 * if tuples are added in the order defined by
+			 * the key definition.
 			 */
 			continue;
 		}
-		struct tuple_hash_array *hash_arr = &builder->parts[i];
 		if (hash_arr->count >= hash_arr->capacity) {
 			uint32_t capacity = MAX(hash_arr->capacity * 2, 1024U);
 			uint32_t *values = realloc(hash_arr->values,
@@ -105,7 +107,6 @@ tuple_bloom_builder_add(struct tuple_bloom_builder *builder,
 			hash_arr->capacity = capacity;
 			hash_arr->values = values;
 		}
-		uint32_t hash = PMurHash32_Result(h, carry, total_size);
 		hash_arr->values[hash_arr->count++] = hash;
 	}
 	return 0;
diff --git a/src/box/tuple_bloom.h b/src/box/tuple_bloom.h
index b05fee18..8380801e 100644
--- a/src/box/tuple_bloom.h
+++ b/src/box/tuple_bloom.h
@@ -111,14 +111,14 @@ tuple_bloom_builder_delete(struct tuple_bloom_builder *builder);
  * @param builder - bloom filter builder
  * @param tuple - tuple to add
  * @param key_def - key definition
- * @param hashed_parts - number of key parts that have already
- *  been added to the builder
  * @return 0 on success, -1 on OOM
+ *
+ * For the bloom size calculation to be accurate, tuples
+ * must be added in the order defined by the key definition.
  */
 int
 tuple_bloom_builder_add(struct tuple_bloom_builder *builder,
-			const struct tuple *tuple, struct key_def *key_def,
-			uint32_t hashed_parts);
+			const struct tuple *tuple, struct key_def *key_def);
 
 /**
  * Create a new tuple bloom filter.
diff --git a/src/box/tuple_compare.cc b/src/box/tuple_compare.cc
index cf4519cc..dfe30b19 100644
--- a/src/box/tuple_compare.cc
+++ b/src/box/tuple_compare.cc
@@ -425,39 +425,6 @@ tuple_compare_field_with_hint(const char *field_a, enum mp_type a_type,
 	}
 }
 
-uint32_t
-tuple_common_key_parts(const struct tuple *tuple_a, const struct tuple *tuple_b,
-		       struct key_def *key_def)
-{
-	uint32_t i;
-	struct tuple_format *tuple_a_format = tuple_format(tuple_a);
-	struct tuple_format *tuple_b_format = tuple_format(tuple_b);
-	const char *tuple_a_raw = tuple_data(tuple_a);
-	const char *tuple_b_raw = tuple_data(tuple_b);
-	const uint32_t *tuple_a_field_map = tuple_field_map(tuple_a);
-	const uint32_t *tuple_b_field_map = tuple_field_map(tuple_b);
-	for (i = 0; i < key_def->part_count; i++) {
-		struct key_part *part = (struct key_part *)&key_def->parts[i];
-		const char *field_a =
-			tuple_field_raw_by_part(tuple_a_format, tuple_a_raw,
-						tuple_a_field_map, part);
-		const char *field_b =
-			tuple_field_raw_by_part(tuple_b_format, tuple_b_raw,
-						tuple_b_field_map, part);
-		enum mp_type a_type = field_a != NULL ?
-				      mp_typeof(*field_a) : MP_NIL;
-		enum mp_type b_type = field_b != NULL ?
-				      mp_typeof(*field_b) : MP_NIL;
-		if (a_type == MP_NIL && b_type == MP_NIL)
-			continue;
-		if (a_type == MP_NIL || b_type == MP_NIL ||
-		    tuple_compare_field_with_hint(field_a, a_type,
-				field_b, b_type, part->type, part->coll) != 0)
-			break;
-	}
-	return i;
-}
-
 template<bool is_nullable, bool has_optional_parts, bool has_json_paths>
 static inline int
 tuple_compare_slowpath(const struct tuple *tuple_a, const struct tuple *tuple_b,
diff --git a/src/box/vy_run.c b/src/box/vy_run.c
index 3c096b0a..e2123fe7 100644
--- a/src/box/vy_run.c
+++ b/src/box/vy_run.c
@@ -2178,11 +2178,8 @@ static int
 vy_run_writer_write_to_page(struct vy_run_writer *writer, struct tuple *stmt)
 {
 	if (writer->bloom != NULL) {
-		uint32_t hashed_parts = writer->last_stmt == NULL ? 0 :
-			tuple_common_key_parts(stmt, writer->last_stmt,
-					       writer->key_def);
 		tuple_bloom_builder_add(writer->bloom, stmt,
-					writer->key_def, hashed_parts);
+					writer->key_def);
 	}
 	if (writer->last_stmt != NULL)
 		vy_stmt_unref_if_possible(writer->last_stmt);
@@ -2409,11 +2406,8 @@ vy_run_rebuild_index(struct vy_run *run, const char *dir,
 			if (tuple == NULL)
 				goto close_err;
 			if (bloom_builder != NULL) {
-				uint32_t hashed_parts = prev_tuple == NULL ? 0 :
-					tuple_common_key_parts(prev_tuple,
-							       tuple, key_def);
 				tuple_bloom_builder_add(bloom_builder, tuple,
-							key_def, hashed_parts);
+							key_def);
 			}
 			key = tuple_extract_key(tuple, cmp_def, NULL);
 			if (prev_tuple != NULL)
-- 
2.11.0




More information about the Tarantool-patches mailing list