[PATCH 4/4] bloom: drop spectrum

Vladimir Davydov vdavydov.dev at gmail.com
Sat Feb 24 11:32:40 MSK 2018


As it was pointed out earlier, the bloom spectrum concept is rather
dubious, because its overhead for a reasonable false positive rate is
about 10 bytes per record while storing all hashes in an array takes
only 4 bytes per record so one can stash all hashes and count records
first, then create the optimal bloom filter and add all hashes there.
---
 src/lib/salad/bloom.c  | 58 ----------------------------------------------
 src/lib/salad/bloom.h  | 63 --------------------------------------------------
 test/unit/bloom.cc     | 63 --------------------------------------------------
 test/unit/bloom.result |  9 --------
 4 files changed, 193 deletions(-)

diff --git a/src/lib/salad/bloom.c b/src/lib/salad/bloom.c
index c38265dc..2e13cedd 100644
--- a/src/lib/salad/bloom.c
+++ b/src/lib/salad/bloom.c
@@ -116,61 +116,3 @@ bloom_load_table(struct bloom *bloom, const char *table, struct quota *quota)
 	memcpy(bloom->table, table, size);
 	return 0;
 }
-
-void
-bloom_spectrum_choose(struct bloom_spectrum *spectrum, struct bloom *bloom)
-{
-	assert(spectrum->chosen_one < 0);
-	spectrum->chosen_one = 0;
-	uint32_t number_of_values = spectrum->count_expected;
-	for (int i = 1; i < BLOOM_SPECTRUM_SIZE; i++) {
-		number_of_values = number_of_values * 4 / 5;
-		if (number_of_values < 1)
-			number_of_values = 1;
-		if (spectrum->count_collected > number_of_values)
-			break;
-		spectrum->chosen_one = i;
-	}
-	/* Move the chosen one to result bloom structure */
-	*bloom = spectrum->vector[spectrum->chosen_one];
-	memset(&spectrum->vector[spectrum->chosen_one], 0,
-	       sizeof(spectrum->vector[spectrum->chosen_one]));
-}
-
-int
-bloom_spectrum_create(struct bloom_spectrum *spectrum,
-		      uint32_t max_number_of_values, double false_positive_rate,
-		      struct quota *quota)
-{
-	spectrum->count_expected = max_number_of_values;
-	spectrum->count_collected = 0;
-	spectrum->chosen_one = -1;
-	for (uint32_t i = 0; i < BLOOM_SPECTRUM_SIZE; i++) {
-		int rc = bloom_create(&spectrum->vector[i],
-				      max_number_of_values,
-				      false_positive_rate, quota);
-		if (rc) {
-			for (uint32_t j = 0; j < i; j++)
-				bloom_destroy(&spectrum->vector[i], quota);
-			return rc;
-		}
-
-		max_number_of_values = max_number_of_values * 4 / 5;
-		if (max_number_of_values < 1)
-			max_number_of_values = 1;
-	}
-	return 0;
-}
-
-void
-bloom_spectrum_destroy(struct bloom_spectrum *spectrum, struct quota *quota)
-{
-	for (int i = 0; i < BLOOM_SPECTRUM_SIZE; i++) {
-		if (i != spectrum->chosen_one)
-			bloom_destroy(&spectrum->vector[i], quota);
-	}
-}
-
-/* }}} API definition */
-
-
diff --git a/src/lib/salad/bloom.h b/src/lib/salad/bloom.h
index 8efcd8f3..29fe595b 100644
--- a/src/lib/salad/bloom.h
+++ b/src/lib/salad/bloom.h
@@ -57,8 +57,6 @@ extern "C" {
 enum {
 	/* Expected cache line of target processor */
 	BLOOM_CACHE_LINE = 64,
-	/* Number of different bloom filter in bloom spectrum */
-	BLOOM_SPECTRUM_SIZE = 10,
 };
 
 typedef uint32_t bloom_hash_t;
@@ -164,59 +162,6 @@ bloom_store(const struct bloom *bloom, char *table);
 int
 bloom_load_table(struct bloom *bloom, const char *table, struct quota *quota);
 
-/**
- * Bloom spectrum that allows to create and fill a bloom filter in case when
- * there is no enough knowledge about final number of elements.
- * It consists of several bloom filter (set of filters) of different sizes
- * and fills them all. After filling it allows to choose more efficient bloom
- * filter.
- */
-struct bloom_spectrum {
-	uint32_t count_expected;
-	uint32_t count_collected;
-	int chosen_one;
-	struct bloom vector[BLOOM_SPECTRUM_SIZE];
-};
-
-/**
- * Create a bloom spectrum
- * @param spectrum - spectrum to init
- * @param max_number_of_values - upper bound of estimation about
- *  number of elements
- * @param false_positive_rate - desired false positive rate
- * @param quota - quota for memory allocation
- * @return 0 - OK, -1 - memory error
- */
-int
-bloom_spectrum_create(struct bloom_spectrum *spectrum,
-		      uint32_t max_number_of_values, double false_positive_rate,
-		      struct quota *quota);
-
-/**
- * Add a value into the data set
- * @param spectrum - spectrum to add to
- * @param hash - a hash of a value
- */
-static void
-bloom_spectrum_add(struct bloom_spectrum *spectrum, bloom_hash_t hash);
-
-/**
- * Choose best bloom filter after filling the set.
- * Must be used only once.
- * @param spectrum - spectrum to choose from
- * @param bloom - target structure that will hold the best bloom filter
- */
-void
-bloom_spectrum_choose(struct bloom_spectrum *spectrum, struct bloom *bloom);
-
-/**
- * Destroy spectrum and free all data (except the chosen one)
- * @param spectrum - spectrum to destroy
- * @param quota - quota for memory deallocation
- */
-void
-bloom_spectrum_destroy(struct bloom_spectrum *spectrum, struct quota *quota);
-
 /* }}} API declaration */
 
 /* {{{ API definition */
@@ -263,14 +208,6 @@ bloom_possible_has(const struct bloom *bloom, bloom_hash_t hash)
 	return true;
 }
 
-static inline void
-bloom_spectrum_add(struct bloom_spectrum *spectrum, bloom_hash_t hash)
-{
-	spectrum->count_collected++;
-	for (uint32_t i = 0; i < BLOOM_SPECTRUM_SIZE; i++)
-		bloom_add(&spectrum->vector[i], hash);
-}
-
 /* }}} API definition */
 
 #if defined(__cplusplus)
diff --git a/test/unit/bloom.cc b/test/unit/bloom.cc
index e78ab73e..e32ba4bd 100644
--- a/test/unit/bloom.cc
+++ b/test/unit/bloom.cc
@@ -102,72 +102,9 @@ store_load_test()
 	cout << "memory after destruction = " << quota_used(&q) << endl << endl;
 }
 
-void
-spectrum_test()
-{
-	cout << "*** " << __func__ << " ***" << endl;
-	struct quota q;
-	quota_init(&q, 1005000);
-	double p = 0.01;
-	uint32_t count = 4000;
-	struct bloom_spectrum spectrum;
-	struct bloom bloom;
-
-	/* using (count) */
-	bloom_spectrum_create(&spectrum, count, p, &q);
-	for (uint32_t i = 0; i < count; i++) {
-		bloom_spectrum_add(&spectrum, h(i));
-	}
-	bloom_spectrum_choose(&spectrum, &bloom);
-	bloom_spectrum_destroy(&spectrum, &q);
-
-	uint64_t false_positive = 0;
-	uint64_t error_count = 0;
-	for (uint32_t i = 0; i < count; i++) {
-		if (!bloom_possible_has(&bloom, h(i)))
-			error_count++;
-	}
-	for (uint32_t i = count; i < 2 * count; i++) {
-		if (bloom_possible_has(&bloom, h(i)))
-			false_positive++;
-	}
-	bool fpr_rate_is_good = false_positive < 1.5 * p * count;
-	cout << "bloom table size = " << bloom.table_size << endl;
-	cout << "error_count = " << error_count << endl;
-	cout << "fpr_rate_is_good = " << fpr_rate_is_good << endl;
-	bloom_destroy(&bloom, &q);
-
-	/* same test using (count * 10) */
-	bloom_spectrum_create(&spectrum, count * 10, p, &q);
-	for (uint32_t i = 0; i < count; i++) {
-		bloom_spectrum_add(&spectrum, h(i));
-	}
-	bloom_spectrum_choose(&spectrum, &bloom);
-	bloom_spectrum_destroy(&spectrum, &q);
-
-	false_positive = 0;
-	error_count = 0;
-	for (uint32_t i = 0; i < count; i++) {
-		if (!bloom_possible_has(&bloom, h(i)))
-			error_count++;
-	}
-	for (uint32_t i = count; i < 2 * count; i++) {
-		if (bloom_possible_has(&bloom, h(i)))
-			false_positive++;
-	}
-	fpr_rate_is_good = false_positive < 1.5 * p * count;
-	cout << "bloom table size = " << bloom.table_size << endl;
-	cout << "error_count = " << error_count << endl;
-	cout << "fpr_rate_is_good = " << fpr_rate_is_good << endl;
-	bloom_destroy(&bloom, &q);
-
-	cout << "memory after destruction = " << quota_used(&q) << endl << endl;
-}
-
 int
 main(void)
 {
 	simple_test();
 	store_load_test();
-	spectrum_test();
 }
diff --git a/test/unit/bloom.result b/test/unit/bloom.result
index 39177f51..1b1b1f94 100644
--- a/test/unit/bloom.result
+++ b/test/unit/bloom.result
@@ -8,12 +8,3 @@ error_count = 0
 fp_rate_too_big = 0
 memory after destruction = 0
 
-*** spectrum_test ***
-bloom table size = 79
-error_count = 0
-fpr_rate_is_good = 1
-bloom table size = 106
-error_count = 0
-fpr_rate_is_good = 1
-memory after destruction = 0
-
-- 
2.11.0




More information about the Tarantool-patches mailing list