From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Vladimir Davydov Subject: [PATCH v2 5/5] bloom: drop spectrum Date: Wed, 28 Mar 2018 22:05:01 +0300 Message-Id: In-Reply-To: References: In-Reply-To: References: To: kostja@tarantool.org Cc: tarantool-patches@freelists.org List-ID: As it was pointed out earlier, the bloom spectrum concept is rather dubious, because its overhead for a reasonable false positive rate is about 10 bytes per record while storing all hashes in an array takes only 4 bytes per record so one can stash all hashes and count records first, then create the optimal bloom filter and add all hashes there. --- src/lib/salad/bloom.c | 58 ---------------------------------------------- src/lib/salad/bloom.h | 63 -------------------------------------------------- test/unit/bloom.cc | 63 -------------------------------------------------- test/unit/bloom.result | 9 -------- 4 files changed, 193 deletions(-) diff --git a/src/lib/salad/bloom.c b/src/lib/salad/bloom.c index 09683fb8..d68692ba 100644 --- a/src/lib/salad/bloom.c +++ b/src/lib/salad/bloom.c @@ -113,61 +113,3 @@ bloom_load_table(struct bloom *bloom, const char *table, struct quota *quota) memcpy(bloom->table, table, size); return 0; } - -void -bloom_spectrum_choose(struct bloom_spectrum *spectrum, struct bloom *bloom) -{ - assert(spectrum->chosen_one < 0); - spectrum->chosen_one = 0; - uint32_t number_of_values = spectrum->count_expected; - for (int i = 1; i < BLOOM_SPECTRUM_SIZE; i++) { - number_of_values = number_of_values * 4 / 5; - if (number_of_values < 1) - number_of_values = 1; - if (spectrum->count_collected > number_of_values) - break; - spectrum->chosen_one = i; - } - /* Move the chosen one to result bloom structure */ - *bloom = spectrum->vector[spectrum->chosen_one]; - memset(&spectrum->vector[spectrum->chosen_one], 0, - sizeof(spectrum->vector[spectrum->chosen_one])); -} - -int -bloom_spectrum_create(struct bloom_spectrum *spectrum, - uint32_t max_number_of_values, double false_positive_rate, - struct quota *quota) -{ - spectrum->count_expected = max_number_of_values; - spectrum->count_collected = 0; - spectrum->chosen_one = -1; - for (uint32_t i = 0; i < BLOOM_SPECTRUM_SIZE; i++) { - int rc = bloom_create(&spectrum->vector[i], - max_number_of_values, - false_positive_rate, quota); - if (rc) { - for (uint32_t j = 0; j < i; j++) - bloom_destroy(&spectrum->vector[i], quota); - return rc; - } - - max_number_of_values = max_number_of_values * 4 / 5; - if (max_number_of_values < 1) - max_number_of_values = 1; - } - return 0; -} - -void -bloom_spectrum_destroy(struct bloom_spectrum *spectrum, struct quota *quota) -{ - for (int i = 0; i < BLOOM_SPECTRUM_SIZE; i++) { - if (i != spectrum->chosen_one) - bloom_destroy(&spectrum->vector[i], quota); - } -} - -/* }}} API definition */ - - diff --git a/src/lib/salad/bloom.h b/src/lib/salad/bloom.h index 81d3480e..32deb81a 100644 --- a/src/lib/salad/bloom.h +++ b/src/lib/salad/bloom.h @@ -57,8 +57,6 @@ extern "C" { enum { /* Expected cache line of target processor */ BLOOM_CACHE_LINE = 64, - /* Number of different bloom filter in bloom spectrum */ - BLOOM_SPECTRUM_SIZE = 10, }; typedef uint32_t bloom_hash_t; @@ -164,59 +162,6 @@ bloom_store(const struct bloom *bloom, char *table); int bloom_load_table(struct bloom *bloom, const char *table, struct quota *quota); -/** - * Bloom spectrum that allows to create and fill a bloom filter in case when - * there is no enough knowledge about final number of elements. - * It consists of several bloom filter (set of filters) of different sizes - * and fills them all. After filling it allows to choose more efficient bloom - * filter. - */ -struct bloom_spectrum { - uint32_t count_expected; - uint32_t count_collected; - int chosen_one; - struct bloom vector[BLOOM_SPECTRUM_SIZE]; -}; - -/** - * Create a bloom spectrum - * @param spectrum - spectrum to init - * @param max_number_of_values - upper bound of estimation about - * number of elements - * @param false_positive_rate - desired false positive rate - * @param quota - quota for memory allocation - * @return 0 - OK, -1 - memory error - */ -int -bloom_spectrum_create(struct bloom_spectrum *spectrum, - uint32_t max_number_of_values, double false_positive_rate, - struct quota *quota); - -/** - * Add a value into the data set - * @param spectrum - spectrum to add to - * @param hash - a hash of a value - */ -static void -bloom_spectrum_add(struct bloom_spectrum *spectrum, bloom_hash_t hash); - -/** - * Choose best bloom filter after filling the set. - * Must be used only once. - * @param spectrum - spectrum to choose from - * @param bloom - target structure that will hold the best bloom filter - */ -void -bloom_spectrum_choose(struct bloom_spectrum *spectrum, struct bloom *bloom); - -/** - * Destroy spectrum and free all data (except the chosen one) - * @param spectrum - spectrum to destroy - * @param quota - quota for memory deallocation - */ -void -bloom_spectrum_destroy(struct bloom_spectrum *spectrum, struct quota *quota); - /* }}} API declaration */ /* {{{ API definition */ @@ -263,14 +208,6 @@ bloom_maybe_has(const struct bloom *bloom, bloom_hash_t hash) return true; } -static inline void -bloom_spectrum_add(struct bloom_spectrum *spectrum, bloom_hash_t hash) -{ - spectrum->count_collected++; - for (uint32_t i = 0; i < BLOOM_SPECTRUM_SIZE; i++) - bloom_add(&spectrum->vector[i], hash); -} - /* }}} API definition */ #if defined(__cplusplus) diff --git a/test/unit/bloom.cc b/test/unit/bloom.cc index ac5796ac..b8a2ef13 100644 --- a/test/unit/bloom.cc +++ b/test/unit/bloom.cc @@ -102,72 +102,9 @@ store_load_test() cout << "memory after destruction = " << quota_used(&q) << endl << endl; } -void -spectrum_test() -{ - cout << "*** " << __func__ << " ***" << endl; - struct quota q; - quota_init(&q, 1005000); - double p = 0.01; - uint32_t count = 4000; - struct bloom_spectrum spectrum; - struct bloom bloom; - - /* using (count) */ - bloom_spectrum_create(&spectrum, count, p, &q); - for (uint32_t i = 0; i < count; i++) { - bloom_spectrum_add(&spectrum, h(i)); - } - bloom_spectrum_choose(&spectrum, &bloom); - bloom_spectrum_destroy(&spectrum, &q); - - uint64_t false_positive = 0; - uint64_t error_count = 0; - for (uint32_t i = 0; i < count; i++) { - if (!bloom_maybe_has(&bloom, h(i))) - error_count++; - } - for (uint32_t i = count; i < 2 * count; i++) { - if (bloom_maybe_has(&bloom, h(i))) - false_positive++; - } - bool fpr_rate_is_good = false_positive < 1.5 * p * count; - cout << "bloom table size = " << bloom.table_size << endl; - cout << "error_count = " << error_count << endl; - cout << "fpr_rate_is_good = " << fpr_rate_is_good << endl; - bloom_destroy(&bloom, &q); - - /* same test using (count * 10) */ - bloom_spectrum_create(&spectrum, count * 10, p, &q); - for (uint32_t i = 0; i < count; i++) { - bloom_spectrum_add(&spectrum, h(i)); - } - bloom_spectrum_choose(&spectrum, &bloom); - bloom_spectrum_destroy(&spectrum, &q); - - false_positive = 0; - error_count = 0; - for (uint32_t i = 0; i < count; i++) { - if (!bloom_maybe_has(&bloom, h(i))) - error_count++; - } - for (uint32_t i = count; i < 2 * count; i++) { - if (bloom_maybe_has(&bloom, h(i))) - false_positive++; - } - fpr_rate_is_good = false_positive < 1.5 * p * count; - cout << "bloom table size = " << bloom.table_size << endl; - cout << "error_count = " << error_count << endl; - cout << "fpr_rate_is_good = " << fpr_rate_is_good << endl; - bloom_destroy(&bloom, &q); - - cout << "memory after destruction = " << quota_used(&q) << endl << endl; -} - int main(void) { simple_test(); store_load_test(); - spectrum_test(); } diff --git a/test/unit/bloom.result b/test/unit/bloom.result index 39177f51..1b1b1f94 100644 --- a/test/unit/bloom.result +++ b/test/unit/bloom.result @@ -8,12 +8,3 @@ error_count = 0 fp_rate_too_big = 0 memory after destruction = 0 -*** spectrum_test *** -bloom table size = 79 -error_count = 0 -fpr_rate_is_good = 1 -bloom table size = 106 -error_count = 0 -fpr_rate_is_good = 1 -memory after destruction = 0 - -- 2.11.0