[Tarantool-patches] [PATCH v5] core: introduce evenly distributed int64 random in range
Ilya Kosarev
i.kosarev at tarantool.org
Fri Nov 13 14:53:18 MSK 2020
Tarantool codebase had at least two functions to generate random
integer in given range and both of them had problems at least with
integer overflow. This patch brings nice functions to generate random
int64_t in given range without overflow while preserving uniform random
generator distribution using unbiased bitmask with rejection method.
It is now possible to use xoshiro256++ PRNG or random bytes as a basis.
Most relevant replacements have been made. Needed tests are introduced.
Closes #5075
---
Branch: https://github.com/tarantool/tarantool/tree/i.kosarev/gh-5075-fine-random-in-range
Issue: https://github.com/tarantool/tarantool/issues/5075
Changes in v2:
- implemented fine 64-bit random
- implemented fine range bounding
Changes in v3:
- added tests
- fixed some hardcoded sizes
- fixed potential compilations issues, leading to compilation fail or low performance
- style fixed
- added needed comments
Changes in v4:
- added more tests
- style fixes
- made xoshiro state print more safe
Changes in v5:
- fix test output
- remove extra change
src/lib/core/CMakeLists.txt | 2 +-
src/lib/core/random.c | 85 +++++++++++++++++++++++++++++++++++++
src/lib/core/random.h | 43 +++++++++++++++++++
src/lib/swim/swim.c | 26 ++----------
test/unit/CMakeLists.txt | 4 +-
test/unit/histogram.c | 12 ++----
test/unit/random.c | 59 +++++++++++++++++++++++++
test/unit/random.result | 59 +++++++++++++++++++++++++
test/unit/swim_test_utils.c | 1 +
9 files changed, 258 insertions(+), 33 deletions(-)
create mode 100644 test/unit/random.c
create mode 100644 test/unit/random.result
diff --git a/src/lib/core/CMakeLists.txt b/src/lib/core/CMakeLists.txt
index 13ed1e7ab..7c62fc5ce 100644
--- a/src/lib/core/CMakeLists.txt
+++ b/src/lib/core/CMakeLists.txt
@@ -39,7 +39,7 @@ endif()
add_library(core STATIC ${core_sources})
-target_link_libraries(core salad small uri decNumber ${LIBEV_LIBRARIES}
+target_link_libraries(core salad small uri decNumber bit ${LIBEV_LIBRARIES}
${LIBEIO_LIBRARIES} ${LIBCORO_LIBRARIES}
${MSGPUCK_LIBRARIES} ${ICU_LIBRARIES})
diff --git a/src/lib/core/random.c b/src/lib/core/random.c
index f4fa75b1c..3e6810d3c 100644
--- a/src/lib/core/random.c
+++ b/src/lib/core/random.c
@@ -35,10 +35,14 @@
#include <sys/time.h>
#include <unistd.h>
#include <stdlib.h>
+#include "bit/bit.h"
#include "say.h"
+#include "tt_static.h"
static int rfd;
+static uint64_t state[4];
+
void
random_init(void)
{
@@ -63,6 +67,8 @@ random_init(void)
srand:
srandom(seed);
srand(seed);
+
+ random_bytes((char *)state, sizeof(state));
}
void
@@ -97,3 +103,82 @@ rand:
while (generated < size)
buf[generated++] = rand();
}
+
+uint64_t
+real_random(void)
+{
+ uint64_t result;
+ random_bytes((char *)&result, sizeof(result));
+ return result;
+}
+
+/**
+ * Helper function for the xoshiro256++ pseudo random generator:
+ * rotate left.
+ */
+static inline uint64_t
+rotl(uint64_t x, int k)
+{
+ return (x << k) | (x >> (64 - k));
+}
+
+/**
+ * xoshiro256++ pseudo random generator.
+ * http://prng.di.unimi.it/
+ * State is initialized with random_bytes().
+ *
+ * It is fast and doesn’t fail any known statistical test.
+ * About 2 times faster than conventional LCG rand() and
+ * mersenne-twister algorithms. Also both of them do fail
+ * some statistical tests.
+ * Here are some other reasons to choose xoshiro over
+ * mersenne-twister: https://arxiv.org/pdf/1910.06437.pdf
+ */
+uint64_t
+xoshiro_random(void)
+{
+ const uint64_t result = rotl(state[0] + state[3], 23) + state[0];
+ const uint64_t t = state[1] << 17;
+ state[2] ^= state[0];
+ state[3] ^= state[1];
+ state[1] ^= state[2];
+ state[0] ^= state[3];
+ state[2] ^= t;
+ state[3] = rotl(state[3], 45);
+ return result;
+}
+
+const char *
+xoshiro_state_str(void)
+{
+ return tt_sprintf("%llu %llu %llu %llu", (unsigned long long)state[0],
+ (unsigned long long)state[1],
+ (unsigned long long)state[2],
+ (unsigned long long)state[3]);
+}
+
+int64_t
+real_random_in_range(int64_t min, int64_t max)
+{
+ assert(max >= min);
+ uint64_t range = (uint64_t)max - min;
+ uint64_t mask = UINT64_MAX >> bit_clz_u64(range | 1);
+ uint64_t result;
+ do {
+ result = real_random() & mask;
+ } while (result > range);
+ return min + result;
+}
+
+int64_t
+pseudo_random_in_range(int64_t min, int64_t max)
+{
+ assert(max >= min);
+ uint64_t range = (uint64_t)max - min;
+ uint64_t mask = UINT64_MAX >> bit_clz_u64(range | 1);
+ uint64_t result;
+ do {
+ result = xoshiro_random() & mask;
+ } while (result > range);
+ return min + result;
+}
diff --git a/src/lib/core/random.h b/src/lib/core/random.h
index 26109b14b..9108eab58 100644
--- a/src/lib/core/random.h
+++ b/src/lib/core/random.h
@@ -32,6 +32,7 @@
*/
#include <stddef.h>
+#include <stdint.h>
#if defined(__cplusplus)
extern "C" {
@@ -45,6 +46,48 @@ random_free(void);
void
random_bytes(char *buf, size_t size);
+/**
+ * Just 8 random_bytes().
+ */
+uint64_t
+real_random(void);
+
+/**
+ * xoshiro256++ pseudo random generator.
+ * http://prng.di.unimi.it/
+ * State is initialized with random_bytes().
+ */
+uint64_t
+xoshiro_random(void);
+
+const char *
+xoshiro_state_str(void);
+
+/**
+ * Return a random int64_t number within given boundaries,
+ * including both.
+ *
+ * Instead of blindly calculating a modulo, this function uses
+ * unbiased bitmask with rejection method to provide number in
+ * given boundaries while preserving uniform distribution and
+ * avoiding overflow. It uses random bytes as a basis.
+ */
+int64_t
+real_random_in_range(int64_t min, int64_t max);
+
+/**
+ * Return a pseudo random int64_t number within given boundaries,
+ * including both.
+ *
+ * Instead of blindly calculating a modulo, this function uses
+ * unbiased bitmask with rejection method to provide number in
+ * given boundaries while preserving uniform distribution and
+ * avoiding overflow. It uses xoshiro256++ as an uint64 pseudo
+ * random generator.
+ */
+int64_t
+pseudo_random_in_range(int64_t min, int64_t max);
+
#if defined(__cplusplus)
}
#endif /* extern "C" */
diff --git a/src/lib/swim/swim.c b/src/lib/swim/swim.c
index 396bd7c45..9b5458635 100644
--- a/src/lib/swim/swim.c
+++ b/src/lib/swim/swim.c
@@ -34,6 +34,7 @@
#include "swim_ev.h"
#include "uri/uri.h"
#include "fiber.h"
+#include "random.h"
#include "msgpuck.h"
#include "assoc.h"
#include "sio.h"
@@ -188,25 +189,6 @@ enum {
INDIRECT_PING_COUNT = 2,
};
-/**
- * Return a random number within given boundaries.
- *
- * Instead of blindly calculating a modulo, scale the random
- * number down the given boundaries to preserve the original
- * distribution. The result belongs to range [start, end].
- */
-static inline int
-swim_scaled_rand(int start, int end)
-{
- assert(end >= start);
- /*
- * RAND_MAX is likely to be INT_MAX - hardly SWIM will
- * ever be used in such a huge cluster.
- */
- assert(end - start < RAND_MAX);
- return rand() / (RAND_MAX / (end - start + 1) + 1);
-}
-
/** Calculate UUID hash to use as a member table key. */
static inline uint32_t
swim_uuid_hash(const struct tt_uuid *uuid)
@@ -980,7 +962,7 @@ swim_shuffle_members(struct swim *swim)
for (mh_int_t node = mh_first(members), end = mh_end(members);
node != end; node = mh_next(members, node), ++i) {
swim->shuffled[i] = *mh_swim_table_node(members, node);
- int j = swim_scaled_rand(0, i);
+ int j = pseudo_random_in_range(0, i);
SWAP(swim->shuffled[i], swim->shuffled[j]);
}
}
@@ -1069,7 +1051,7 @@ swim_encode_anti_entropy(struct swim *swim, struct swim_packet *packet)
swim_member_payload_bin_create(&payload_header);
struct mh_swim_table_t *t = swim->members;
int i = 0, member_count = mh_size(t);
- int rnd = swim_scaled_rand(0, member_count - 1);
+ int rnd = pseudo_random_in_range(0, member_count - 1);
for (mh_int_t rc = mh_swim_table_random(t, rnd), end = mh_end(t);
i < member_count; ++i) {
struct swim_member *m = *mh_swim_table_node(t, rc);
@@ -1380,7 +1362,7 @@ swim_send_indirect_pings(struct swim *swim, const struct swim_member *dst)
{
struct mh_swim_table_t *t = swim->members;
int member_count = mh_size(t);
- int rnd = swim_scaled_rand(0, member_count - 1);
+ int rnd = pseudo_random_in_range(0, member_count - 1);
mh_int_t rc = mh_swim_table_random(t, rnd), end = mh_end(t);
for (int member_i = 0, task_i = 0; member_i < member_count &&
task_i < INDIRECT_PING_COUNT; ++member_i) {
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index e05466473..70cd1b2c3 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -52,6 +52,8 @@ add_executable(base64.test base64.c)
target_link_libraries(base64.test misc unit)
add_executable(uuid.test uuid.c core_test_utils.c)
target_link_libraries(uuid.test uuid unit)
+add_executable(random.test random.c core_test_utils.c)
+target_link_libraries(random.test core unit)
add_executable(bps_tree.test bps_tree.cc)
target_link_libraries(bps_tree.test small misc)
@@ -153,7 +155,7 @@ target_link_libraries(json.test json unit ${ICU_LIBRARIES})
add_executable(rmean.test rmean.cc core_test_utils.c)
target_link_libraries(rmean.test stat unit)
-add_executable(histogram.test histogram.c)
+add_executable(histogram.test histogram.c core_test_utils.c)
target_link_libraries(histogram.test stat unit)
add_executable(ratelimit.test ratelimit.c)
target_link_libraries(ratelimit.test unit)
diff --git a/test/unit/histogram.c b/test/unit/histogram.c
index 3056d20fe..ef5cd714f 100644
--- a/test/unit/histogram.c
+++ b/test/unit/histogram.c
@@ -5,6 +5,7 @@
#include "memory.h"
#include "unit.h"
#include "trivia/util.h"
+#include "core/random.h"
static int
int64_cmp(const void *p1, const void *p2)
@@ -46,13 +47,6 @@ gen_rand_data(size_t *p_len)
return data;
}
-static int64_t
-gen_rand_value(int64_t min, int64_t max)
-{
- assert(max >= min);
- return min + rand() % (max - min + 1);
-}
-
static void
test_counts(void)
{
@@ -97,7 +91,7 @@ test_discard(void)
struct histogram *hist = histogram_new(buckets, n_buckets);
- size_t bucket_sz = gen_rand_value(2, 10);
+ size_t bucket_sz = pseudo_random_in_range(2, 10);
size_t data_len = (n_buckets + 1) * bucket_sz;
int64_t *data = calloc(data_len, sizeof(*data));
@@ -105,7 +99,7 @@ test_discard(void)
int64_t min = (b == 0 ? INT64_MIN : buckets[b - 1] + 1);
int64_t max = (b == n_buckets ? INT64_MAX : buckets[b]);
for (size_t i = 0; i < bucket_sz; i++)
- data[b * bucket_sz + i] = gen_rand_value(min, max);
+ data[b * bucket_sz + i] = pseudo_random_in_range(min, max);
}
for (size_t i = 0; i < data_len; i++)
diff --git a/test/unit/random.c b/test/unit/random.c
new file mode 100644
index 000000000..54c5ca97c
--- /dev/null
+++ b/test/unit/random.c
@@ -0,0 +1,59 @@
+#include <core/random.h>
+
+#include <stdio.h>
+
+#include "unit.h"
+
+static void
+test_random_in_range_one(int64_t min, int64_t max)
+{
+ plan(2);
+ int64_t result = pseudo_random_in_range(min, max);
+ is(min <= result && result <= max, 1, "pseudo random is in range "
+ "%lld %lld", (long long)min,
+ (long long)max);
+
+ result = real_random_in_range(min, max);
+ is(min <= result && result <= max, 1, "real random is in range "
+ "%lld %lld", (long long)min,
+ (long long)max);
+ check_plan();
+}
+
+static void
+test_random_in_range(void)
+{
+ header();
+
+ test_random_in_range_one(INT64_MIN, INT64_MAX);
+ test_random_in_range_one(INT64_MIN, INT64_MIN);
+ test_random_in_range_one(INT64_MAX, INT64_MAX);
+ test_random_in_range_one(-1, -1);
+ test_random_in_range_one(0, 0);
+ test_random_in_range_one(1, 1);
+
+ test_random_in_range_one(INT64_MIN + 1, INT64_MAX - 1);
+ test_random_in_range_one(INT64_MIN / 2, INT64_MAX / 2);
+ test_random_in_range_one(INT64_MIN, INT64_MIN / 2);
+ test_random_in_range_one(INT64_MAX / 2, INT64_MAX);
+ test_random_in_range_one(-2, -1);
+ test_random_in_range_one(1, 2);
+ test_random_in_range_one(-1, 1);
+ test_random_in_range_one(0, 1);
+
+ footer();
+}
+
+int
+main(void)
+{
+ random_init();
+
+ plan(14);
+
+ test_random_in_range();
+
+ random_free();
+
+ return check_plan();
+}
diff --git a/test/unit/random.result b/test/unit/random.result
new file mode 100644
index 000000000..43ffcfd6f
--- /dev/null
+++ b/test/unit/random.result
@@ -0,0 +1,59 @@
+1..14
+ *** test_random_in_range ***
+ 1..2
+ ok 1 - pseudo random is in range -9223372036854775808 9223372036854775807
+ ok 2 - real random is in range -9223372036854775808 9223372036854775807
+ok 1 - subtests
+ 1..2
+ ok 1 - pseudo random is in range -9223372036854775808 -9223372036854775808
+ ok 2 - real random is in range -9223372036854775808 -9223372036854775808
+ok 2 - subtests
+ 1..2
+ ok 1 - pseudo random is in range 9223372036854775807 9223372036854775807
+ ok 2 - real random is in range 9223372036854775807 9223372036854775807
+ok 3 - subtests
+ 1..2
+ ok 1 - pseudo random is in range -1 -1
+ ok 2 - real random is in range -1 -1
+ok 4 - subtests
+ 1..2
+ ok 1 - pseudo random is in range 0 0
+ ok 2 - real random is in range 0 0
+ok 5 - subtests
+ 1..2
+ ok 1 - pseudo random is in range 1 1
+ ok 2 - real random is in range 1 1
+ok 6 - subtests
+ 1..2
+ ok 1 - pseudo random is in range -9223372036854775807 9223372036854775806
+ ok 2 - real random is in range -9223372036854775807 9223372036854775806
+ok 7 - subtests
+ 1..2
+ ok 1 - pseudo random is in range -4611686018427387904 4611686018427387903
+ ok 2 - real random is in range -4611686018427387904 4611686018427387903
+ok 8 - subtests
+ 1..2
+ ok 1 - pseudo random is in range -9223372036854775808 -4611686018427387904
+ ok 2 - real random is in range -9223372036854775808 -4611686018427387904
+ok 9 - subtests
+ 1..2
+ ok 1 - pseudo random is in range 4611686018427387903 9223372036854775807
+ ok 2 - real random is in range 4611686018427387903 9223372036854775807
+ok 10 - subtests
+ 1..2
+ ok 1 - pseudo random is in range -2 -1
+ ok 2 - real random is in range -2 -1
+ok 11 - subtests
+ 1..2
+ ok 1 - pseudo random is in range 1 2
+ ok 2 - real random is in range 1 2
+ok 12 - subtests
+ 1..2
+ ok 1 - pseudo random is in range -1 1
+ ok 2 - real random is in range -1 1
+ok 13 - subtests
+ 1..2
+ ok 1 - pseudo random is in range 0 1
+ ok 2 - real random is in range 0 1
+ok 14 - subtests
+ *** test_random_in_range: done ***
diff --git a/test/unit/swim_test_utils.c b/test/unit/swim_test_utils.c
index 9dbd28a9f..114e26149 100644
--- a/test/unit/swim_test_utils.c
+++ b/test/unit/swim_test_utils.c
@@ -876,6 +876,7 @@ swim_run_test(const char *log_file, fiber_func test)
* same seed.
*/
say_info("Random seed = %llu", (unsigned long long) seed);
+ say_info("xoshiro random state = %s", xoshiro_state_str());
struct fiber *main_fiber = fiber_new("main", test);
fiber_set_joinable(main_fiber, true);
--
2.17.1
More information about the Tarantool-patches
mailing list