[PATCH 4/4] bloom: drop spectrum
Vladimir Davydov
vdavydov.dev at gmail.com
Sat Feb 24 11:32:40 MSK 2018
As it was pointed out earlier, the bloom spectrum concept is rather
dubious, because its overhead for a reasonable false positive rate is
about 10 bytes per record while storing all hashes in an array takes
only 4 bytes per record so one can stash all hashes and count records
first, then create the optimal bloom filter and add all hashes there.
---
src/lib/salad/bloom.c | 58 ----------------------------------------------
src/lib/salad/bloom.h | 63 --------------------------------------------------
test/unit/bloom.cc | 63 --------------------------------------------------
test/unit/bloom.result | 9 --------
4 files changed, 193 deletions(-)
diff --git a/src/lib/salad/bloom.c b/src/lib/salad/bloom.c
index c38265dc..2e13cedd 100644
--- a/src/lib/salad/bloom.c
+++ b/src/lib/salad/bloom.c
@@ -116,61 +116,3 @@ bloom_load_table(struct bloom *bloom, const char *table, struct quota *quota)
memcpy(bloom->table, table, size);
return 0;
}
-
-void
-bloom_spectrum_choose(struct bloom_spectrum *spectrum, struct bloom *bloom)
-{
- assert(spectrum->chosen_one < 0);
- spectrum->chosen_one = 0;
- uint32_t number_of_values = spectrum->count_expected;
- for (int i = 1; i < BLOOM_SPECTRUM_SIZE; i++) {
- number_of_values = number_of_values * 4 / 5;
- if (number_of_values < 1)
- number_of_values = 1;
- if (spectrum->count_collected > number_of_values)
- break;
- spectrum->chosen_one = i;
- }
- /* Move the chosen one to result bloom structure */
- *bloom = spectrum->vector[spectrum->chosen_one];
- memset(&spectrum->vector[spectrum->chosen_one], 0,
- sizeof(spectrum->vector[spectrum->chosen_one]));
-}
-
-int
-bloom_spectrum_create(struct bloom_spectrum *spectrum,
- uint32_t max_number_of_values, double false_positive_rate,
- struct quota *quota)
-{
- spectrum->count_expected = max_number_of_values;
- spectrum->count_collected = 0;
- spectrum->chosen_one = -1;
- for (uint32_t i = 0; i < BLOOM_SPECTRUM_SIZE; i++) {
- int rc = bloom_create(&spectrum->vector[i],
- max_number_of_values,
- false_positive_rate, quota);
- if (rc) {
- for (uint32_t j = 0; j < i; j++)
- bloom_destroy(&spectrum->vector[i], quota);
- return rc;
- }
-
- max_number_of_values = max_number_of_values * 4 / 5;
- if (max_number_of_values < 1)
- max_number_of_values = 1;
- }
- return 0;
-}
-
-void
-bloom_spectrum_destroy(struct bloom_spectrum *spectrum, struct quota *quota)
-{
- for (int i = 0; i < BLOOM_SPECTRUM_SIZE; i++) {
- if (i != spectrum->chosen_one)
- bloom_destroy(&spectrum->vector[i], quota);
- }
-}
-
-/* }}} API definition */
-
-
diff --git a/src/lib/salad/bloom.h b/src/lib/salad/bloom.h
index 8efcd8f3..29fe595b 100644
--- a/src/lib/salad/bloom.h
+++ b/src/lib/salad/bloom.h
@@ -57,8 +57,6 @@ extern "C" {
enum {
/* Expected cache line of target processor */
BLOOM_CACHE_LINE = 64,
- /* Number of different bloom filter in bloom spectrum */
- BLOOM_SPECTRUM_SIZE = 10,
};
typedef uint32_t bloom_hash_t;
@@ -164,59 +162,6 @@ bloom_store(const struct bloom *bloom, char *table);
int
bloom_load_table(struct bloom *bloom, const char *table, struct quota *quota);
-/**
- * Bloom spectrum that allows to create and fill a bloom filter in case when
- * there is no enough knowledge about final number of elements.
- * It consists of several bloom filter (set of filters) of different sizes
- * and fills them all. After filling it allows to choose more efficient bloom
- * filter.
- */
-struct bloom_spectrum {
- uint32_t count_expected;
- uint32_t count_collected;
- int chosen_one;
- struct bloom vector[BLOOM_SPECTRUM_SIZE];
-};
-
-/**
- * Create a bloom spectrum
- * @param spectrum - spectrum to init
- * @param max_number_of_values - upper bound of estimation about
- * number of elements
- * @param false_positive_rate - desired false positive rate
- * @param quota - quota for memory allocation
- * @return 0 - OK, -1 - memory error
- */
-int
-bloom_spectrum_create(struct bloom_spectrum *spectrum,
- uint32_t max_number_of_values, double false_positive_rate,
- struct quota *quota);
-
-/**
- * Add a value into the data set
- * @param spectrum - spectrum to add to
- * @param hash - a hash of a value
- */
-static void
-bloom_spectrum_add(struct bloom_spectrum *spectrum, bloom_hash_t hash);
-
-/**
- * Choose best bloom filter after filling the set.
- * Must be used only once.
- * @param spectrum - spectrum to choose from
- * @param bloom - target structure that will hold the best bloom filter
- */
-void
-bloom_spectrum_choose(struct bloom_spectrum *spectrum, struct bloom *bloom);
-
-/**
- * Destroy spectrum and free all data (except the chosen one)
- * @param spectrum - spectrum to destroy
- * @param quota - quota for memory deallocation
- */
-void
-bloom_spectrum_destroy(struct bloom_spectrum *spectrum, struct quota *quota);
-
/* }}} API declaration */
/* {{{ API definition */
@@ -263,14 +208,6 @@ bloom_possible_has(const struct bloom *bloom, bloom_hash_t hash)
return true;
}
-static inline void
-bloom_spectrum_add(struct bloom_spectrum *spectrum, bloom_hash_t hash)
-{
- spectrum->count_collected++;
- for (uint32_t i = 0; i < BLOOM_SPECTRUM_SIZE; i++)
- bloom_add(&spectrum->vector[i], hash);
-}
-
/* }}} API definition */
#if defined(__cplusplus)
diff --git a/test/unit/bloom.cc b/test/unit/bloom.cc
index e78ab73e..e32ba4bd 100644
--- a/test/unit/bloom.cc
+++ b/test/unit/bloom.cc
@@ -102,72 +102,9 @@ store_load_test()
cout << "memory after destruction = " << quota_used(&q) << endl << endl;
}
-void
-spectrum_test()
-{
- cout << "*** " << __func__ << " ***" << endl;
- struct quota q;
- quota_init(&q, 1005000);
- double p = 0.01;
- uint32_t count = 4000;
- struct bloom_spectrum spectrum;
- struct bloom bloom;
-
- /* using (count) */
- bloom_spectrum_create(&spectrum, count, p, &q);
- for (uint32_t i = 0; i < count; i++) {
- bloom_spectrum_add(&spectrum, h(i));
- }
- bloom_spectrum_choose(&spectrum, &bloom);
- bloom_spectrum_destroy(&spectrum, &q);
-
- uint64_t false_positive = 0;
- uint64_t error_count = 0;
- for (uint32_t i = 0; i < count; i++) {
- if (!bloom_possible_has(&bloom, h(i)))
- error_count++;
- }
- for (uint32_t i = count; i < 2 * count; i++) {
- if (bloom_possible_has(&bloom, h(i)))
- false_positive++;
- }
- bool fpr_rate_is_good = false_positive < 1.5 * p * count;
- cout << "bloom table size = " << bloom.table_size << endl;
- cout << "error_count = " << error_count << endl;
- cout << "fpr_rate_is_good = " << fpr_rate_is_good << endl;
- bloom_destroy(&bloom, &q);
-
- /* same test using (count * 10) */
- bloom_spectrum_create(&spectrum, count * 10, p, &q);
- for (uint32_t i = 0; i < count; i++) {
- bloom_spectrum_add(&spectrum, h(i));
- }
- bloom_spectrum_choose(&spectrum, &bloom);
- bloom_spectrum_destroy(&spectrum, &q);
-
- false_positive = 0;
- error_count = 0;
- for (uint32_t i = 0; i < count; i++) {
- if (!bloom_possible_has(&bloom, h(i)))
- error_count++;
- }
- for (uint32_t i = count; i < 2 * count; i++) {
- if (bloom_possible_has(&bloom, h(i)))
- false_positive++;
- }
- fpr_rate_is_good = false_positive < 1.5 * p * count;
- cout << "bloom table size = " << bloom.table_size << endl;
- cout << "error_count = " << error_count << endl;
- cout << "fpr_rate_is_good = " << fpr_rate_is_good << endl;
- bloom_destroy(&bloom, &q);
-
- cout << "memory after destruction = " << quota_used(&q) << endl << endl;
-}
-
int
main(void)
{
simple_test();
store_load_test();
- spectrum_test();
}
diff --git a/test/unit/bloom.result b/test/unit/bloom.result
index 39177f51..1b1b1f94 100644
--- a/test/unit/bloom.result
+++ b/test/unit/bloom.result
@@ -8,12 +8,3 @@ error_count = 0
fp_rate_too_big = 0
memory after destruction = 0
-*** spectrum_test ***
-bloom table size = 79
-error_count = 0
-fpr_rate_is_good = 1
-bloom table size = 106
-error_count = 0
-fpr_rate_is_good = 1
-memory after destruction = 0
-
--
2.11.0
More information about the Tarantool-patches
mailing list