From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Vladimir Davydov Subject: [PATCH v2 05/14] bloom: do not use tuple_common_key_parts when constructing tuple bloom Date: Wed, 13 Mar 2019 11:52:51 +0300 Message-Id: In-Reply-To: References: In-Reply-To: References: To: kostja@tarantool.org Cc: tarantool-patches@freelists.org List-ID: Tuple bloom filter is an array of bloom filters, each of which reflects lookups by all possible partial keys. To optimize the overall bloom filter size, we need to know how many unique elements there are for each partial key. To achieve that, we require the caller to pass the number of key parts that have been hashed for the given tuple. Here's how it looks in Vinyl: uint32_t hashed_parts = writer->last_stmt == NULL ? 0 : tuple_common_key_parts(stmt, writer->last_stmt, writer->key_def); tuple_bloom_builder_add(writer->bloom, stmt, writer->key_def, hashed_parts); Actually, there's no need in such a requirement as instead we can calculate the hash value for the given tuple, compare it with the hash of the tuple added last time, and add the new hash only if the two values differ. This should be accurate enough while allowing us to get rid of the cumbersome tuple_common_key_parts helper. Note, such a check will only work if tuples are added in the order defined by the key definition, but that already holds - anyway, one wouldn't be able to use tuple_common_key_parts either if it wasn't true. --- src/box/key_def.h | 11 ----------- src/box/tuple_bloom.c | 15 ++++++++------- src/box/tuple_bloom.h | 8 ++++---- src/box/tuple_compare.cc | 33 --------------------------------- src/box/vy_run.c | 10 ++-------- 5 files changed, 14 insertions(+), 63 deletions(-) diff --git a/src/box/key_def.h b/src/box/key_def.h index dd62f6a3..e3a34ce1 100644 --- a/src/box/key_def.h +++ b/src/box/key_def.h @@ -512,17 +512,6 @@ tuple_extract_key_raw(const char *data, const char *data_end, } /** - * Return the length of the longest common prefix of two tuples. - * @param tuple_a first tuple - * @param tuple_b second tuple - * @param key_def key defintion - * @return number of key parts the two tuples have in common - */ -uint32_t -tuple_common_key_parts(const struct tuple *tuple_a, const struct tuple *tuple_b, - struct key_def *key_def); - -/** * Compare keys using the key definition. * @param key_a key parts with MessagePack array header * @param part_count_a the number of parts in the key_a diff --git a/src/box/tuple_bloom.c b/src/box/tuple_bloom.c index cf887c7b..39d218dc 100644 --- a/src/box/tuple_bloom.c +++ b/src/box/tuple_bloom.c @@ -72,8 +72,7 @@ tuple_bloom_builder_delete(struct tuple_bloom_builder *builder) int tuple_bloom_builder_add(struct tuple_bloom_builder *builder, - const struct tuple *tuple, struct key_def *key_def, - uint32_t hashed_parts) + const struct tuple *tuple, struct key_def *key_def) { assert(builder->part_count == key_def->part_count); @@ -82,17 +81,20 @@ tuple_bloom_builder_add(struct tuple_bloom_builder *builder, uint32_t total_size = 0; for (uint32_t i = 0; i < key_def->part_count; i++) { + struct tuple_hash_array *hash_arr = &builder->parts[i]; total_size += tuple_hash_key_part(&h, &carry, tuple, &key_def->parts[i]); - if (i < hashed_parts) { + uint32_t hash = PMurHash32_Result(h, carry, total_size); + if (hash_arr->count > 0 && + hash_arr->values[hash_arr->count - 1] == hash) { /* * This part is already in the bloom, proceed - * to the next one. Note, we can't skip to - * hashed_parts, as we need to compute the hash. + * to the next one. Note, this check only works + * if tuples are added in the order defined by + * the key definition. */ continue; } - struct tuple_hash_array *hash_arr = &builder->parts[i]; if (hash_arr->count >= hash_arr->capacity) { uint32_t capacity = MAX(hash_arr->capacity * 2, 1024U); uint32_t *values = realloc(hash_arr->values, @@ -105,7 +107,6 @@ tuple_bloom_builder_add(struct tuple_bloom_builder *builder, hash_arr->capacity = capacity; hash_arr->values = values; } - uint32_t hash = PMurHash32_Result(h, carry, total_size); hash_arr->values[hash_arr->count++] = hash; } return 0; diff --git a/src/box/tuple_bloom.h b/src/box/tuple_bloom.h index b05fee18..8380801e 100644 --- a/src/box/tuple_bloom.h +++ b/src/box/tuple_bloom.h @@ -111,14 +111,14 @@ tuple_bloom_builder_delete(struct tuple_bloom_builder *builder); * @param builder - bloom filter builder * @param tuple - tuple to add * @param key_def - key definition - * @param hashed_parts - number of key parts that have already - * been added to the builder * @return 0 on success, -1 on OOM + * + * For the bloom size calculation to be accurate, tuples + * must be added in the order defined by the key definition. */ int tuple_bloom_builder_add(struct tuple_bloom_builder *builder, - const struct tuple *tuple, struct key_def *key_def, - uint32_t hashed_parts); + const struct tuple *tuple, struct key_def *key_def); /** * Create a new tuple bloom filter. diff --git a/src/box/tuple_compare.cc b/src/box/tuple_compare.cc index cf4519cc..dfe30b19 100644 --- a/src/box/tuple_compare.cc +++ b/src/box/tuple_compare.cc @@ -425,39 +425,6 @@ tuple_compare_field_with_hint(const char *field_a, enum mp_type a_type, } } -uint32_t -tuple_common_key_parts(const struct tuple *tuple_a, const struct tuple *tuple_b, - struct key_def *key_def) -{ - uint32_t i; - struct tuple_format *tuple_a_format = tuple_format(tuple_a); - struct tuple_format *tuple_b_format = tuple_format(tuple_b); - const char *tuple_a_raw = tuple_data(tuple_a); - const char *tuple_b_raw = tuple_data(tuple_b); - const uint32_t *tuple_a_field_map = tuple_field_map(tuple_a); - const uint32_t *tuple_b_field_map = tuple_field_map(tuple_b); - for (i = 0; i < key_def->part_count; i++) { - struct key_part *part = (struct key_part *)&key_def->parts[i]; - const char *field_a = - tuple_field_raw_by_part(tuple_a_format, tuple_a_raw, - tuple_a_field_map, part); - const char *field_b = - tuple_field_raw_by_part(tuple_b_format, tuple_b_raw, - tuple_b_field_map, part); - enum mp_type a_type = field_a != NULL ? - mp_typeof(*field_a) : MP_NIL; - enum mp_type b_type = field_b != NULL ? - mp_typeof(*field_b) : MP_NIL; - if (a_type == MP_NIL && b_type == MP_NIL) - continue; - if (a_type == MP_NIL || b_type == MP_NIL || - tuple_compare_field_with_hint(field_a, a_type, - field_b, b_type, part->type, part->coll) != 0) - break; - } - return i; -} - template static inline int tuple_compare_slowpath(const struct tuple *tuple_a, const struct tuple *tuple_b, diff --git a/src/box/vy_run.c b/src/box/vy_run.c index 3c096b0a..e2123fe7 100644 --- a/src/box/vy_run.c +++ b/src/box/vy_run.c @@ -2178,11 +2178,8 @@ static int vy_run_writer_write_to_page(struct vy_run_writer *writer, struct tuple *stmt) { if (writer->bloom != NULL) { - uint32_t hashed_parts = writer->last_stmt == NULL ? 0 : - tuple_common_key_parts(stmt, writer->last_stmt, - writer->key_def); tuple_bloom_builder_add(writer->bloom, stmt, - writer->key_def, hashed_parts); + writer->key_def); } if (writer->last_stmt != NULL) vy_stmt_unref_if_possible(writer->last_stmt); @@ -2409,11 +2406,8 @@ vy_run_rebuild_index(struct vy_run *run, const char *dir, if (tuple == NULL) goto close_err; if (bloom_builder != NULL) { - uint32_t hashed_parts = prev_tuple == NULL ? 0 : - tuple_common_key_parts(prev_tuple, - tuple, key_def); tuple_bloom_builder_add(bloom_builder, tuple, - key_def, hashed_parts); + key_def); } key = tuple_extract_key(tuple, cmp_def, NULL); if (prev_tuple != NULL) -- 2.11.0