[Tarantool-patches] [PATCH 08/11] digest: eliminate UBs from guava()

Fri Jun 5 02:43:15 MSK 2020

Guava hash function follows the algorithm described in the paper
"A Fast, Minimal Memory, Consistent Hash Algorithm", John Lamping,
Eric Veach. But the implementation somewhy is ported from Java
instead of taking a ready to use C function right from the paper.

Java version, ported to C as is, leads to undefined behaviour:
- signed integer overflow;
- double value outside of integer range assigned to the integer.

Here is the full old function:

    static const int64_t K = 2862933555777941757;
    static const double  D = 0x1.0p31;

    static inline double lcg(int64_t *state)
    {
        return (double)((int32_t)(((uint64_t)*state >> 33) + 1)) / D;
    }

    int32_t
    guava(int64_t state, int32_t buckets)
    {
        int32_t candidate = 0;
        int32_t next;
        while (1) {
            state = K * state + 1;
            next = (int32_t)((candidate + 1) / lcg(&state));
            if (next >= 0 && next < buckets)
                candidate = next;
            else
                return candidate;
        }
    }

Signed integer overflow happened in this line:

    state = K * state + 1;

This UB is fixed by changing state type to uint64_t. This is ok
and does not change behaviour, because overflow for signed
integers in reality behaves the same as for unsigned integers,
in terms of binary representation. Change to uint64_t didn't lead
to change of any single line of the result assembly code.

Double -> int32_t truncation overflow happened in the next line:

    next = (int32_t)((candidate + 1) / lcg(&state));

Right expression can become something out of int32_t range. This
is UB, but in reality the double value on the right is truncated
to either INT32_MIN, if it is < INT32_MIN, or INT32_MAX, if it is
> INT32_MAX.

This is fixed by making 'next' and 'candidate' variables int64_t.

This fixed the UB, because the right expression can't become
< INT64_MIN and > INT64_MAX. Indeed, max possible value of 'next'
from the expression above is:

    (candidate + 1) / lcg(&state) < ((2^31 - 1) + 1) / 2^-31
        = 2^62 < INT64_MAX

Min possible value of 'next' is:

    (-2^31 + 1) / 2^-31 > -2^62 > INT64_MAX

'Candidate' is assumed to be inside int32_t range, because it is
initialized from the previous 'next' value. If it would be out of
int32_t, it would become either bigger than 'buckets', which is
int32_t, or < 0, and the cycle would stop on the previous
iteration.

The patch fixes the UBs, but guava() function is still broken.
Because it returns values different than the function in the
original paper from John Lamping and Eric Veach. Here is the
correct function:

    int32_t
    correct_guava(uint64_t key, int32_t num_buckets)
    {
        int64_t b = -1, j = 0;
        while (j < num_buckets) {
            b=j;
            key = key * 2862933555777941757ULL + 1;
            j = (b + 1) * ((double)(1LL << 31) / (double)((key >> 33) + 1));
        }
        return b;
    }

correct_guava(6356101026326471242, 813154869) = 362866355,
                            guava(<the same>) = 2.

correct_guava(112571758688054605, 1355446793) = 907865430,
                            guava(<the same>) = 907865431.

And there are more examples. It means, guava() returns something
with an unknown distrubution, consistency, and not described in
the paper. That is a subject for a separate patch to deprecate
this function and add a correct one.

Part of #4609
---
 src/lib/salad/guava.c  | 18 +++++++++++-------
 src/lib/salad/guava.h  |  2 +-
 src/lua/digest.lua     |  2 +-
 test/app/digest.result |  2 +-
 4 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/lib/salad/guava.c b/src/lib/salad/guava.c
index 46bf1ac15..16c1e8cc4 100644
--- a/src/lib/salad/guava.c
+++ b/src/lib/salad/guava.c
@@ -39,22 +39,26 @@
  * John Lamping, Eric Veach
  */
 
-static const int64_t K = 2862933555777941757;
+static const uint64_t K = 2862933555777941757ULL;
 static const double  D = 0x1.0p31;
 
-static inline double lcg(int64_t *state)
+static inline double lcg(uint64_t *state)
 {
-	return (double )((int32_t)(((uint64_t )*state >> 33) + 1)) / D;
+	return (double)((int32_t)((*state >> 33) + 1)) / D;
 }
 
+/**
+ * The function does not conform to the original paper, and
+ * therefore has incorrect behaviour. It should be deprecated.
+ */
 int32_t
-guava(int64_t state, int32_t buckets)
+guava(uint64_t state, int32_t buckets)
 {
-	int32_t candidate = 0;
-	int32_t next;
+	int64_t candidate = 0;
+	int64_t next;
 	while (1) {
 		state = K * state + 1;
-		next = (int32_t)((candidate + 1) / lcg(&state));
+		next = (candidate + 1) / lcg(&state);
 		if (next >= 0 && next < buckets)
 			candidate = next;
 		else
diff --git a/src/lib/salad/guava.h b/src/lib/salad/guava.h
index 9ed93d641..bf0181f1e 100644
--- a/src/lib/salad/guava.h
+++ b/src/lib/salad/guava.h
@@ -39,7 +39,7 @@ extern "C" {
 #endif
 
 int32_t
-guava(int64_t state, int32_t buckets);
+guava(uint64_t state, int32_t buckets);
 
 #if defined(__cplusplus)
 } /* extern C */
diff --git a/src/lua/digest.lua b/src/lua/digest.lua
index 6ed91cfa2..6beaf562a 100644
--- a/src/lua/digest.lua
+++ b/src/lua/digest.lua
@@ -14,7 +14,7 @@ ffi.cdef[[
 
     typedef uint32_t (*crc32_func)(uint32_t crc,
         const unsigned char *buf, unsigned int len);
-    extern int32_t guava(int64_t state, int32_t buckets);
+    extern int32_t guava(uint64_t state, int32_t buckets);
     extern crc32_func crc32_calc;
 
     /* base64 */
diff --git a/test/app/digest.result b/test/app/digest.result
index ecfa518c9..d946c6a3b 100644
--- a/test/app/digest.result
+++ b/test/app/digest.result
@@ -357,7 +357,7 @@ digest.base64_decode(123)
 ...
 digest.guava('hello', 0)
 ---
-- error: 'bad argument #1 to ''?'' (cannot convert ''string'' to ''int64_t'')'
+- error: 'bad argument #1 to ''?'' (cannot convert ''string'' to ''uint64_t'')'
 ...
 digest.guava(1, 'nope_')
 ---
-- 
2.21.1 (Apple Git-122.3)