Tarantool development patches archive
 help / color / mirror / Atom feed
From: Vladimir Davydov <vdavydov.dev@gmail.com>
To: kostja@tarantool.org
Cc: tarantool-patches@freelists.org
Subject: [PATCH v2 5/5] bloom: drop spectrum
Date: Wed, 28 Mar 2018 22:05:01 +0300	[thread overview]
Message-ID: <d73650ccee5b063616b8d2a1196dc26eb7abd53f.1522262496.git.vdavydov.dev@gmail.com> (raw)
In-Reply-To: <cover.1522262496.git.vdavydov.dev@gmail.com>
In-Reply-To: <cover.1522262496.git.vdavydov.dev@gmail.com>

As it was pointed out earlier, the bloom spectrum concept is rather
dubious, because its overhead for a reasonable false positive rate is
about 10 bytes per record while storing all hashes in an array takes
only 4 bytes per record so one can stash all hashes and count records
first, then create the optimal bloom filter and add all hashes there.
---
 src/lib/salad/bloom.c  | 58 ----------------------------------------------
 src/lib/salad/bloom.h  | 63 --------------------------------------------------
 test/unit/bloom.cc     | 63 --------------------------------------------------
 test/unit/bloom.result |  9 --------
 4 files changed, 193 deletions(-)

diff --git a/src/lib/salad/bloom.c b/src/lib/salad/bloom.c
index 09683fb8..d68692ba 100644
--- a/src/lib/salad/bloom.c
+++ b/src/lib/salad/bloom.c
@@ -113,61 +113,3 @@ bloom_load_table(struct bloom *bloom, const char *table, struct quota *quota)
 	memcpy(bloom->table, table, size);
 	return 0;
 }
-
-void
-bloom_spectrum_choose(struct bloom_spectrum *spectrum, struct bloom *bloom)
-{
-	assert(spectrum->chosen_one < 0);
-	spectrum->chosen_one = 0;
-	uint32_t number_of_values = spectrum->count_expected;
-	for (int i = 1; i < BLOOM_SPECTRUM_SIZE; i++) {
-		number_of_values = number_of_values * 4 / 5;
-		if (number_of_values < 1)
-			number_of_values = 1;
-		if (spectrum->count_collected > number_of_values)
-			break;
-		spectrum->chosen_one = i;
-	}
-	/* Move the chosen one to result bloom structure */
-	*bloom = spectrum->vector[spectrum->chosen_one];
-	memset(&spectrum->vector[spectrum->chosen_one], 0,
-	       sizeof(spectrum->vector[spectrum->chosen_one]));
-}
-
-int
-bloom_spectrum_create(struct bloom_spectrum *spectrum,
-		      uint32_t max_number_of_values, double false_positive_rate,
-		      struct quota *quota)
-{
-	spectrum->count_expected = max_number_of_values;
-	spectrum->count_collected = 0;
-	spectrum->chosen_one = -1;
-	for (uint32_t i = 0; i < BLOOM_SPECTRUM_SIZE; i++) {
-		int rc = bloom_create(&spectrum->vector[i],
-				      max_number_of_values,
-				      false_positive_rate, quota);
-		if (rc) {
-			for (uint32_t j = 0; j < i; j++)
-				bloom_destroy(&spectrum->vector[i], quota);
-			return rc;
-		}
-
-		max_number_of_values = max_number_of_values * 4 / 5;
-		if (max_number_of_values < 1)
-			max_number_of_values = 1;
-	}
-	return 0;
-}
-
-void
-bloom_spectrum_destroy(struct bloom_spectrum *spectrum, struct quota *quota)
-{
-	for (int i = 0; i < BLOOM_SPECTRUM_SIZE; i++) {
-		if (i != spectrum->chosen_one)
-			bloom_destroy(&spectrum->vector[i], quota);
-	}
-}
-
-/* }}} API definition */
-
-
diff --git a/src/lib/salad/bloom.h b/src/lib/salad/bloom.h
index 81d3480e..32deb81a 100644
--- a/src/lib/salad/bloom.h
+++ b/src/lib/salad/bloom.h
@@ -57,8 +57,6 @@ extern "C" {
 enum {
 	/* Expected cache line of target processor */
 	BLOOM_CACHE_LINE = 64,
-	/* Number of different bloom filter in bloom spectrum */
-	BLOOM_SPECTRUM_SIZE = 10,
 };
 
 typedef uint32_t bloom_hash_t;
@@ -164,59 +162,6 @@ bloom_store(const struct bloom *bloom, char *table);
 int
 bloom_load_table(struct bloom *bloom, const char *table, struct quota *quota);
 
-/**
- * Bloom spectrum that allows to create and fill a bloom filter in case when
- * there is no enough knowledge about final number of elements.
- * It consists of several bloom filter (set of filters) of different sizes
- * and fills them all. After filling it allows to choose more efficient bloom
- * filter.
- */
-struct bloom_spectrum {
-	uint32_t count_expected;
-	uint32_t count_collected;
-	int chosen_one;
-	struct bloom vector[BLOOM_SPECTRUM_SIZE];
-};
-
-/**
- * Create a bloom spectrum
- * @param spectrum - spectrum to init
- * @param max_number_of_values - upper bound of estimation about
- *  number of elements
- * @param false_positive_rate - desired false positive rate
- * @param quota - quota for memory allocation
- * @return 0 - OK, -1 - memory error
- */
-int
-bloom_spectrum_create(struct bloom_spectrum *spectrum,
-		      uint32_t max_number_of_values, double false_positive_rate,
-		      struct quota *quota);
-
-/**
- * Add a value into the data set
- * @param spectrum - spectrum to add to
- * @param hash - a hash of a value
- */
-static void
-bloom_spectrum_add(struct bloom_spectrum *spectrum, bloom_hash_t hash);
-
-/**
- * Choose best bloom filter after filling the set.
- * Must be used only once.
- * @param spectrum - spectrum to choose from
- * @param bloom - target structure that will hold the best bloom filter
- */
-void
-bloom_spectrum_choose(struct bloom_spectrum *spectrum, struct bloom *bloom);
-
-/**
- * Destroy spectrum and free all data (except the chosen one)
- * @param spectrum - spectrum to destroy
- * @param quota - quota for memory deallocation
- */
-void
-bloom_spectrum_destroy(struct bloom_spectrum *spectrum, struct quota *quota);
-
 /* }}} API declaration */
 
 /* {{{ API definition */
@@ -263,14 +208,6 @@ bloom_maybe_has(const struct bloom *bloom, bloom_hash_t hash)
 	return true;
 }
 
-static inline void
-bloom_spectrum_add(struct bloom_spectrum *spectrum, bloom_hash_t hash)
-{
-	spectrum->count_collected++;
-	for (uint32_t i = 0; i < BLOOM_SPECTRUM_SIZE; i++)
-		bloom_add(&spectrum->vector[i], hash);
-}
-
 /* }}} API definition */
 
 #if defined(__cplusplus)
diff --git a/test/unit/bloom.cc b/test/unit/bloom.cc
index ac5796ac..b8a2ef13 100644
--- a/test/unit/bloom.cc
+++ b/test/unit/bloom.cc
@@ -102,72 +102,9 @@ store_load_test()
 	cout << "memory after destruction = " << quota_used(&q) << endl << endl;
 }
 
-void
-spectrum_test()
-{
-	cout << "*** " << __func__ << " ***" << endl;
-	struct quota q;
-	quota_init(&q, 1005000);
-	double p = 0.01;
-	uint32_t count = 4000;
-	struct bloom_spectrum spectrum;
-	struct bloom bloom;
-
-	/* using (count) */
-	bloom_spectrum_create(&spectrum, count, p, &q);
-	for (uint32_t i = 0; i < count; i++) {
-		bloom_spectrum_add(&spectrum, h(i));
-	}
-	bloom_spectrum_choose(&spectrum, &bloom);
-	bloom_spectrum_destroy(&spectrum, &q);
-
-	uint64_t false_positive = 0;
-	uint64_t error_count = 0;
-	for (uint32_t i = 0; i < count; i++) {
-		if (!bloom_maybe_has(&bloom, h(i)))
-			error_count++;
-	}
-	for (uint32_t i = count; i < 2 * count; i++) {
-		if (bloom_maybe_has(&bloom, h(i)))
-			false_positive++;
-	}
-	bool fpr_rate_is_good = false_positive < 1.5 * p * count;
-	cout << "bloom table size = " << bloom.table_size << endl;
-	cout << "error_count = " << error_count << endl;
-	cout << "fpr_rate_is_good = " << fpr_rate_is_good << endl;
-	bloom_destroy(&bloom, &q);
-
-	/* same test using (count * 10) */
-	bloom_spectrum_create(&spectrum, count * 10, p, &q);
-	for (uint32_t i = 0; i < count; i++) {
-		bloom_spectrum_add(&spectrum, h(i));
-	}
-	bloom_spectrum_choose(&spectrum, &bloom);
-	bloom_spectrum_destroy(&spectrum, &q);
-
-	false_positive = 0;
-	error_count = 0;
-	for (uint32_t i = 0; i < count; i++) {
-		if (!bloom_maybe_has(&bloom, h(i)))
-			error_count++;
-	}
-	for (uint32_t i = count; i < 2 * count; i++) {
-		if (bloom_maybe_has(&bloom, h(i)))
-			false_positive++;
-	}
-	fpr_rate_is_good = false_positive < 1.5 * p * count;
-	cout << "bloom table size = " << bloom.table_size << endl;
-	cout << "error_count = " << error_count << endl;
-	cout << "fpr_rate_is_good = " << fpr_rate_is_good << endl;
-	bloom_destroy(&bloom, &q);
-
-	cout << "memory after destruction = " << quota_used(&q) << endl << endl;
-}
-
 int
 main(void)
 {
 	simple_test();
 	store_load_test();
-	spectrum_test();
 }
diff --git a/test/unit/bloom.result b/test/unit/bloom.result
index 39177f51..1b1b1f94 100644
--- a/test/unit/bloom.result
+++ b/test/unit/bloom.result
@@ -8,12 +8,3 @@ error_count = 0
 fp_rate_too_big = 0
 memory after destruction = 0
 
-*** spectrum_test ***
-bloom table size = 79
-error_count = 0
-fpr_rate_is_good = 1
-bloom table size = 106
-error_count = 0
-fpr_rate_is_good = 1
-memory after destruction = 0
-
-- 
2.11.0

      parent reply	other threads:[~2018-03-28 19:05 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-03-28 19:04 [PATCH v2 0/5] vinyl: multi-part bloom filter Vladimir Davydov
2018-03-28 19:04 ` [PATCH v2 1/5] bloom: use malloc for bitmap allocations Vladimir Davydov
2018-03-28 19:04 ` [PATCH v2 2/5] bloom: rename bloom_possible_has to bloom_maybe_has Vladimir Davydov
2018-03-28 19:04 ` [PATCH v2 3/5] vinyl: introduce bloom filters for partial key lookups Vladimir Davydov
2018-03-28 19:05 ` [PATCH v2 4/5] bloom: optimize tuple bloom filter size Vladimir Davydov
2018-03-28 19:05 ` Vladimir Davydov [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=d73650ccee5b063616b8d2a1196dc26eb7abd53f.1522262496.git.vdavydov.dev@gmail.com \
    --to=vdavydov.dev@gmail.com \
    --cc=kostja@tarantool.org \
    --cc=tarantool-patches@freelists.org \
    --subject='Re: [PATCH v2 5/5] bloom: drop spectrum' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox