From: Vladimir Davydov <vdavydov.dev@gmail.com> To: kostja@tarantool.org Cc: tarantool-patches@freelists.org Subject: [PATCH v2 5/5] bloom: drop spectrum Date: Wed, 28 Mar 2018 22:05:01 +0300 [thread overview] Message-ID: <d73650ccee5b063616b8d2a1196dc26eb7abd53f.1522262496.git.vdavydov.dev@gmail.com> (raw) In-Reply-To: <cover.1522262496.git.vdavydov.dev@gmail.com> In-Reply-To: <cover.1522262496.git.vdavydov.dev@gmail.com> As it was pointed out earlier, the bloom spectrum concept is rather dubious, because its overhead for a reasonable false positive rate is about 10 bytes per record while storing all hashes in an array takes only 4 bytes per record so one can stash all hashes and count records first, then create the optimal bloom filter and add all hashes there. --- src/lib/salad/bloom.c | 58 ---------------------------------------------- src/lib/salad/bloom.h | 63 -------------------------------------------------- test/unit/bloom.cc | 63 -------------------------------------------------- test/unit/bloom.result | 9 -------- 4 files changed, 193 deletions(-) diff --git a/src/lib/salad/bloom.c b/src/lib/salad/bloom.c index 09683fb8..d68692ba 100644 --- a/src/lib/salad/bloom.c +++ b/src/lib/salad/bloom.c @@ -113,61 +113,3 @@ bloom_load_table(struct bloom *bloom, const char *table, struct quota *quota) memcpy(bloom->table, table, size); return 0; } - -void -bloom_spectrum_choose(struct bloom_spectrum *spectrum, struct bloom *bloom) -{ - assert(spectrum->chosen_one < 0); - spectrum->chosen_one = 0; - uint32_t number_of_values = spectrum->count_expected; - for (int i = 1; i < BLOOM_SPECTRUM_SIZE; i++) { - number_of_values = number_of_values * 4 / 5; - if (number_of_values < 1) - number_of_values = 1; - if (spectrum->count_collected > number_of_values) - break; - spectrum->chosen_one = i; - } - /* Move the chosen one to result bloom structure */ - *bloom = spectrum->vector[spectrum->chosen_one]; - memset(&spectrum->vector[spectrum->chosen_one], 0, - sizeof(spectrum->vector[spectrum->chosen_one])); -} - -int -bloom_spectrum_create(struct bloom_spectrum *spectrum, - uint32_t max_number_of_values, double false_positive_rate, - struct quota *quota) -{ - spectrum->count_expected = max_number_of_values; - spectrum->count_collected = 0; - spectrum->chosen_one = -1; - for (uint32_t i = 0; i < BLOOM_SPECTRUM_SIZE; i++) { - int rc = bloom_create(&spectrum->vector[i], - max_number_of_values, - false_positive_rate, quota); - if (rc) { - for (uint32_t j = 0; j < i; j++) - bloom_destroy(&spectrum->vector[i], quota); - return rc; - } - - max_number_of_values = max_number_of_values * 4 / 5; - if (max_number_of_values < 1) - max_number_of_values = 1; - } - return 0; -} - -void -bloom_spectrum_destroy(struct bloom_spectrum *spectrum, struct quota *quota) -{ - for (int i = 0; i < BLOOM_SPECTRUM_SIZE; i++) { - if (i != spectrum->chosen_one) - bloom_destroy(&spectrum->vector[i], quota); - } -} - -/* }}} API definition */ - - diff --git a/src/lib/salad/bloom.h b/src/lib/salad/bloom.h index 81d3480e..32deb81a 100644 --- a/src/lib/salad/bloom.h +++ b/src/lib/salad/bloom.h @@ -57,8 +57,6 @@ extern "C" { enum { /* Expected cache line of target processor */ BLOOM_CACHE_LINE = 64, - /* Number of different bloom filter in bloom spectrum */ - BLOOM_SPECTRUM_SIZE = 10, }; typedef uint32_t bloom_hash_t; @@ -164,59 +162,6 @@ bloom_store(const struct bloom *bloom, char *table); int bloom_load_table(struct bloom *bloom, const char *table, struct quota *quota); -/** - * Bloom spectrum that allows to create and fill a bloom filter in case when - * there is no enough knowledge about final number of elements. - * It consists of several bloom filter (set of filters) of different sizes - * and fills them all. After filling it allows to choose more efficient bloom - * filter. - */ -struct bloom_spectrum { - uint32_t count_expected; - uint32_t count_collected; - int chosen_one; - struct bloom vector[BLOOM_SPECTRUM_SIZE]; -}; - -/** - * Create a bloom spectrum - * @param spectrum - spectrum to init - * @param max_number_of_values - upper bound of estimation about - * number of elements - * @param false_positive_rate - desired false positive rate - * @param quota - quota for memory allocation - * @return 0 - OK, -1 - memory error - */ -int -bloom_spectrum_create(struct bloom_spectrum *spectrum, - uint32_t max_number_of_values, double false_positive_rate, - struct quota *quota); - -/** - * Add a value into the data set - * @param spectrum - spectrum to add to - * @param hash - a hash of a value - */ -static void -bloom_spectrum_add(struct bloom_spectrum *spectrum, bloom_hash_t hash); - -/** - * Choose best bloom filter after filling the set. - * Must be used only once. - * @param spectrum - spectrum to choose from - * @param bloom - target structure that will hold the best bloom filter - */ -void -bloom_spectrum_choose(struct bloom_spectrum *spectrum, struct bloom *bloom); - -/** - * Destroy spectrum and free all data (except the chosen one) - * @param spectrum - spectrum to destroy - * @param quota - quota for memory deallocation - */ -void -bloom_spectrum_destroy(struct bloom_spectrum *spectrum, struct quota *quota); - /* }}} API declaration */ /* {{{ API definition */ @@ -263,14 +208,6 @@ bloom_maybe_has(const struct bloom *bloom, bloom_hash_t hash) return true; } -static inline void -bloom_spectrum_add(struct bloom_spectrum *spectrum, bloom_hash_t hash) -{ - spectrum->count_collected++; - for (uint32_t i = 0; i < BLOOM_SPECTRUM_SIZE; i++) - bloom_add(&spectrum->vector[i], hash); -} - /* }}} API definition */ #if defined(__cplusplus) diff --git a/test/unit/bloom.cc b/test/unit/bloom.cc index ac5796ac..b8a2ef13 100644 --- a/test/unit/bloom.cc +++ b/test/unit/bloom.cc @@ -102,72 +102,9 @@ store_load_test() cout << "memory after destruction = " << quota_used(&q) << endl << endl; } -void -spectrum_test() -{ - cout << "*** " << __func__ << " ***" << endl; - struct quota q; - quota_init(&q, 1005000); - double p = 0.01; - uint32_t count = 4000; - struct bloom_spectrum spectrum; - struct bloom bloom; - - /* using (count) */ - bloom_spectrum_create(&spectrum, count, p, &q); - for (uint32_t i = 0; i < count; i++) { - bloom_spectrum_add(&spectrum, h(i)); - } - bloom_spectrum_choose(&spectrum, &bloom); - bloom_spectrum_destroy(&spectrum, &q); - - uint64_t false_positive = 0; - uint64_t error_count = 0; - for (uint32_t i = 0; i < count; i++) { - if (!bloom_maybe_has(&bloom, h(i))) - error_count++; - } - for (uint32_t i = count; i < 2 * count; i++) { - if (bloom_maybe_has(&bloom, h(i))) - false_positive++; - } - bool fpr_rate_is_good = false_positive < 1.5 * p * count; - cout << "bloom table size = " << bloom.table_size << endl; - cout << "error_count = " << error_count << endl; - cout << "fpr_rate_is_good = " << fpr_rate_is_good << endl; - bloom_destroy(&bloom, &q); - - /* same test using (count * 10) */ - bloom_spectrum_create(&spectrum, count * 10, p, &q); - for (uint32_t i = 0; i < count; i++) { - bloom_spectrum_add(&spectrum, h(i)); - } - bloom_spectrum_choose(&spectrum, &bloom); - bloom_spectrum_destroy(&spectrum, &q); - - false_positive = 0; - error_count = 0; - for (uint32_t i = 0; i < count; i++) { - if (!bloom_maybe_has(&bloom, h(i))) - error_count++; - } - for (uint32_t i = count; i < 2 * count; i++) { - if (bloom_maybe_has(&bloom, h(i))) - false_positive++; - } - fpr_rate_is_good = false_positive < 1.5 * p * count; - cout << "bloom table size = " << bloom.table_size << endl; - cout << "error_count = " << error_count << endl; - cout << "fpr_rate_is_good = " << fpr_rate_is_good << endl; - bloom_destroy(&bloom, &q); - - cout << "memory after destruction = " << quota_used(&q) << endl << endl; -} - int main(void) { simple_test(); store_load_test(); - spectrum_test(); } diff --git a/test/unit/bloom.result b/test/unit/bloom.result index 39177f51..1b1b1f94 100644 --- a/test/unit/bloom.result +++ b/test/unit/bloom.result @@ -8,12 +8,3 @@ error_count = 0 fp_rate_too_big = 0 memory after destruction = 0 -*** spectrum_test *** -bloom table size = 79 -error_count = 0 -fpr_rate_is_good = 1 -bloom table size = 106 -error_count = 0 -fpr_rate_is_good = 1 -memory after destruction = 0 - -- 2.11.0
prev parent reply other threads:[~2018-03-28 19:05 UTC|newest] Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top 2018-03-28 19:04 [PATCH v2 0/5] vinyl: multi-part bloom filter Vladimir Davydov 2018-03-28 19:04 ` [PATCH v2 1/5] bloom: use malloc for bitmap allocations Vladimir Davydov 2018-03-28 19:04 ` [PATCH v2 2/5] bloom: rename bloom_possible_has to bloom_maybe_has Vladimir Davydov 2018-03-28 19:04 ` [PATCH v2 3/5] vinyl: introduce bloom filters for partial key lookups Vladimir Davydov 2018-03-28 19:05 ` [PATCH v2 4/5] bloom: optimize tuple bloom filter size Vladimir Davydov 2018-03-28 19:05 ` Vladimir Davydov [this message]
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=d73650ccee5b063616b8d2a1196dc26eb7abd53f.1522262496.git.vdavydov.dev@gmail.com \ --to=vdavydov.dev@gmail.com \ --cc=kostja@tarantool.org \ --cc=tarantool-patches@freelists.org \ --subject='Re: [PATCH v2 5/5] bloom: drop spectrum' \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox