[tarantool-patches] Re: [PATCH 1/6] sql: refactor sql_atoi64()

Wed Jun 12 00:11:51 MSK 2019

Hi! Thanks for the patchset! See 8 comments below,
review fixes at the end of the email, and on the branch
in a separate commit.

On 07/06/2019 18:37, Nikita Pettik wrote:
> We are going to allow using unsigned values in SQL and extend range of
> INTEGER type. Hence, we should be able to parse and operate on integers
> in range of [2^63, 2^64 - 1]. Current mechanism which involves
> sql_atoi64() function doesn't allow this.
> 
> Let's refactor this function: firstly, get rid of manual parsing and use
> strtoll() and strtoull() functions from standard library. Then, let's
> return sign of parsed literal.> In case of success now function returns 0,
> -1 otherwise.
> 
> This patch also inlines sql_dec_or_hex_to_i64() to place of its only
> usage: it makes code cleaner and more straightforward.
> 
> Needed for #3810
> Needed for #4015
> ---
> diff --git a/src/box/sql/expr.c b/src/box/sql/expr.c
> index ba7eea59d..158631416 100644
> --- a/src/box/sql/expr.c
> +++ b/src/box/sql/expr.c> @@ -3325,29 +3326,53 @@ expr_code_int(struct Parse *parse, struct Expr *expr, bool is_neg,
>  		if (is_neg)
>  			i = -i;
>  		sqlVdbeAddOp2(v, OP_Integer, i, mem);
> +		return;
> +	}
> +	int64_t value;
> +	const char *z = expr->u.zToken;
> +	assert(z != NULL);
> +	const char *sign = is_neg ? "-" : "";
> +	if (z[0] == '0' && (z[1] == 'x' || z[1] == 'X')) {
> +		uint64_t u = 0;
> +		int i, k;
> +		for (i = 2; z[i] == '0'; i++);
> +		for (k = i; sqlIsxdigit(z[k]); k++)
> +			u = u * 16 + sqlHexToInt(z[k]);

1. Why not to use strtoll and strtol? They accept 'base' argument.
Set it to 16, and they will do the same, but probably faster.

> +		memcpy(&value, &u, 8);
> +		if (z[k] != 0 || k - i > 16) {

2. You treat tail spaces as an error. Not sure if it is right.

> +			diag_set(ClientError, ER_HEX_LITERAL_MAX, sign, z,
> +				 strlen(z) - 2, 16);
> +			parse->is_aborted = true;
> +			return;
> +		}
>  	} else {
> -		int64_t value;
> -		const char *z = expr->u.zToken;
> -		assert(z != NULL);
> -		int c = sql_dec_or_hex_to_i64(z, &value);
> -		if (c == 1 || (c == 2 && !is_neg) ||
> -		    (is_neg && value == SMALLEST_INT64)) {
> -			const char *sign = is_neg ? "-" : "";
> -			if (sql_strnicmp(z, "0x", 2) == 0) {
> -				diag_set(ClientError, ER_HEX_LITERAL_MAX, sign,
> -					 z, strlen(z) - 2, 16);
> -			} else {
> -				diag_set(ClientError, ER_INT_LITERAL_MAX, sign,
> -					 z, INT64_MIN, INT64_MAX);
> -			}
> +		size_t len = strlen(z);
> +		if (is_neg) {
> +			/*
> +			 * UINT64_MAX is constant, so each integer
> +			 * literal certainly contains less digits.
> +			 * It makes no sense to continue processing
> +			 * string if it contains more characters.
> +			 */
> +			if (len > UINT64_MAX_DIGIT_COUNT)
> +				goto int_overflow;
> +			char *neg_z = tt_static_buf();
> +			neg_z[0] = '-';
> +			memcpy(neg_z + 1, z, len++);
> +			neg_z[len]= '\0';
> +			z = neg_z;
> +		}

3. This looks like a crutch. Why do you need strlen() and number of
digits here? Firstly, sql_atoi64 already returns an error in case of
too long integer. Secondly, if you wanted to add '-', it could be done
much faster - make 'value = -value;' in case of 'neg' with a preliminary
check that the value is not too big. Will it work?

> +		bool unused;
> +		int rc = sql_atoi64(z, &value, &unused, len);
> +		if (rc != 0) {
> +int_overflow:
> +			diag_set(ClientError, ER_INT_LITERAL_MAX, z, INT64_MIN,
> +				 INT64_MAX);
>  			parse->is_aborted = true;
> -		} else {
> -			if (is_neg)
> -				value = c == 2 ? SMALLEST_INT64 : -value;
> -			sqlVdbeAddOp4Dup8(v, OP_Int64, 0, mem, 0,
> -					      (u8 *)&value, P4_INT64);
> +			return;
>  		}
>  	}
> +	sqlVdbeAddOp4Dup8(v, OP_Int64, 0, mem, 0, (u8 *)&value, P4_INT64);
>  }> diff --git a/src/box/sql/util.c b/src/box/sql/util.c
> index d5c93f8aa..2e7c298e1 100644
> --- a/src/box/sql/util.c
> +++ b/src/box/sql/util.c
> @@ -593,120 +593,42 @@ sqlAtoF(const char *z, double *pResult, int length)
>  int
> -sql_atoi64(const char *z, int64_t *val, int length)
> +sql_atoi64(const char *z, int64_t *val, bool *is_neg, int length)
>  {
> -	int incr = 1;
> -	u64 u = 0;
> -	int neg = 0;		/* assume positive */
> -	int i;
> -	int c = 0;
> -	int nonNum = 0;		/* True if input contains UTF16 with high byte non-zero */
> -	const char *zStart;
> -	const char *zEnd = z + length;
> -	incr = 1;
> -	while (z < zEnd && sqlIsspace(*z))
> -		z += incr;
> -	if (z < zEnd) {
> -		if (*z == '-') {
> -			neg = 1;
> -			z += incr;
> -		} else if (*z == '+') {
> -			z += incr;
> -		}
> -	}
> -	zStart = z;
> -	/* Skip leading zeros. */
> -	while (z < zEnd && z[0] == '0') {
> -		z += incr;
> -	}
> -	for (i = 0; &z[i] < zEnd && (c = z[i]) >= '0' && c <= '9';
> -	     i += incr) {
> -		u = u * 10 + c - '0';
> -	}
> -	if (u > LARGEST_INT64) {
> -		*val = neg ? SMALLEST_INT64 : LARGEST_INT64;
> -	} else if (neg) {
> -		*val = -(i64) u;
> +	*is_neg = false;
> +	const char *str_end = z + length;
> +	for (; z < str_end && sqlIsspace(*z); z++)
> +	/* invalid format */

4. Usually we start sentences from capital letters and finish
with dots.

> +	if (z >= str_end)
> +		return -1;
> +	if (*z == '-')
> +		*is_neg = true;
> +
> +	char* end = NULL;
> +	errno = 0;
> +	if (*is_neg) {

5. I don't like this double check. 4 lines above you already
know if the value is negative, but you double check it here.

> +		*val = strtoll(z, &end, 10);
>  	} else {
> -		*val = (i64) u;
> +		uint64_t u_val = strtoull(z, &end, 10);
> +		if (u_val > INT64_MAX)
> +			return -1;
> +		*val = u_val;
>  	}
> -	if (&z[i] < zEnd || (i == 0 && zStart == z) || i > 19 * incr ||
> -	    nonNum) {
> -		/* zNum is empty or contains non-numeric text or is longer
> -		 * than 19 digits (thus guaranteeing that it is too large)
> -		 */
> -		return 1;
> -	} else if (i < 19 * incr) {
> -		/* Less than 19 digits, so we know that it fits in 64 bits */
> -		assert(u <= LARGEST_INT64);
> -		return 0;
> -	} else {
> -		/* zNum is a 19-digit numbers.  Compare it against 9223372036854775808. */
> -		c = compare2pow63(z, incr);
> -		if (c < 0) {
> -			/* zNum is less than 9223372036854775808 so it fits */
> -			assert(u <= LARGEST_INT64);
> -			return 0;
> -		} else if (c > 0) {
> -			/* zNum is greater than 9223372036854775808 so it overflows */
> -			return 1;
> -		} else {
> -			/* zNum is exactly 9223372036854775808.  Fits if negative.  The
> -			 * special case 2 overflow if positive
> -			 */
> -			assert(u - 1 == LARGEST_INT64);
> -			return neg ? 0 : 2;
> -		}
> -	}
> -}
> +	/* No digits were found, e.g. an empty string. */
> +	if (end == z)
> +		return -1;

6. You have already checked emptiness above with
this condition: 'if (z >= str_end)'.

> +	/*
> +	 * String has been parsed, but there's
> +	 * additional character(s) remained.
> +	 */
> +	if (*end != '\0')
> +		return -1;

7. What if other symbols are spaces?

> +	/* Overflow and underflow errors. */
> +	if (errno != 0)
> +		return -1;

8. You didn't remove 'sql_atoi64() == SQL_OK' in some
places.

Consider my review fixes here and on the branch in a
separate commit. These changes and the comments are arguable,
probably I am wrong somewhere.

=============================================================

diff --git a/src/box/sql/expr.c b/src/box/sql/expr.c
index 158631416..0df6c16d1 100644
--- a/src/box/sql/expr.c
+++ b/src/box/sql/expr.c
@@ -3363,8 +3363,7 @@ expr_code_int(struct Parse *parse, struct Expr *expr, bool is_neg,
 			z = neg_z;
 		}
 		bool unused;
-		int rc = sql_atoi64(z, &value, &unused, len);
-		if (rc != 0) {
+		if (sql_atoi64(z, &value, &unused, len) != 0) {
 int_overflow:
 			diag_set(ClientError, ER_INT_LITERAL_MAX, z, INT64_MIN,
 				 INT64_MAX);
diff --git a/src/box/sql/util.c b/src/box/sql/util.c
index 2e7c298e1..de469ec36 100644
--- a/src/box/sql/util.c
+++ b/src/box/sql/util.c
@@ -38,6 +38,7 @@
  */
 #include "sqlInt.h"
 #include <stdarg.h>
+#include <ctype.h>
 #if HAVE_ISNAN || SQL_HAVE_ISNAN
 #include <math.h>
 #endif
@@ -596,38 +597,29 @@ sqlAtoF(const char *z, double *pResult, int length)
 int
 sql_atoi64(const char *z, int64_t *val, bool *is_neg, int length)
 {
-	*is_neg = false;
 	const char *str_end = z + length;
-	for (; z < str_end && sqlIsspace(*z); z++)
-	/* invalid format */
+	for (; z < str_end && isspace(*z); z++)
 	if (z >= str_end)
 		return -1;
-	if (*z == '-')
-		*is_neg = true;
-
-	char* end = NULL;
+	char *end = NULL;
 	errno = 0;
-	if (*is_neg) {
+	if (*z == '-') {
+		*is_neg = true;
 		*val = strtoll(z, &end, 10);
 	} else {
+		*is_neg = false;
 		uint64_t u_val = strtoull(z, &end, 10);
 		if (u_val > INT64_MAX)
 			return -1;
 		*val = u_val;
 	}
-	/* No digits were found, e.g. an empty string. */
-	if (end == z)
-		return -1;
-	/*
-	 * String has been parsed, but there's
-	 * additional character(s) remained.
-	 */
-	if (*end != '\0')
-		return -1;
 	/* Overflow and underflow errors. */
 	if (errno != 0)
 		return -1;
-
+	for (; *end != 0; ++end) {
+		if (! isspace(*end))
+			return -1;
+	}
 	return 0;
 }
 
diff --git a/src/box/sql/vdbe.c b/src/box/sql/vdbe.c
index 6ecdb26fc..11a92372b 100644
--- a/src/box/sql/vdbe.c
+++ b/src/box/sql/vdbe.c
@@ -386,7 +386,7 @@ static u16 SQL_NOINLINE computeNumericType(Mem *pMem)
 	if (sqlAtoF(pMem->z, &pMem->u.r, pMem->n)==0)
 		return 0;
 	bool is_neg;
-	if (sql_atoi64(pMem->z, (int64_t *)&pMem->u.i, &is_neg, pMem->n)==SQL_OK)
+	if (sql_atoi64(pMem->z, &pMem->u.i, &is_neg, pMem->n) == 0)
 		return MEM_Int;
 	return MEM_Real;
 }
diff --git a/src/box/sql/vdbemem.c b/src/box/sql/vdbemem.c
index 3a361d066..503f199f1 100644
--- a/src/box/sql/vdbemem.c
+++ b/src/box/sql/vdbemem.c
@@ -467,7 +467,7 @@ sqlVdbeIntValue(Mem * pMem, int64_t *i)
 	} else if (flags & (MEM_Str)) {
 		assert(pMem->z || pMem->n == 0);
 		bool is_neg;
-		if (sql_atoi64(pMem->z, (int64_t *)i, &is_neg, pMem->n) == 0)
+		if (sql_atoi64(pMem->z, i, &is_neg, pMem->n) == 0)
 			return 0;
 	}
 	return -1;
@@ -588,8 +588,8 @@ sqlVdbeMemNumerify(Mem * pMem)
 	if ((pMem->flags & (MEM_Int | MEM_Real | MEM_Null)) == 0) {
 		assert((pMem->flags & (MEM_Blob | MEM_Str)) != 0);
 		bool is_neg;
-		if (0 == sql_atoi64(pMem->z, (int64_t *)&pMem->u.i, &is_neg,
-				    pMem->n)) {
+		if (sql_atoi64(pMem->z, &pMem->u.i, &is_neg,
+			       pMem->n) == 0) {
 			MemSetTypeFlag(pMem, MEM_Int);
 		} else {
 			double v;
@@ -655,8 +655,7 @@ sqlVdbeMemCast(Mem * pMem, enum field_type type)
 		return SQL_OK;
 	if ((pMem->flags & MEM_Blob) != 0 && type == FIELD_TYPE_NUMBER) {
 		bool is_neg;
-		if (sql_atoi64(pMem->z, (int64_t *) &pMem->u.i, &is_neg,
-			       pMem->n) == 0) {
+		if (sql_atoi64(pMem->z, &pMem->u.i, &is_neg, pMem->n) == 0) {
 			MemSetTypeFlag(pMem, MEM_Real);
 			pMem->u.r = pMem->u.i;
 			return 0;
@@ -688,7 +687,7 @@ sqlVdbeMemCast(Mem * pMem, enum field_type type)
 	case FIELD_TYPE_INTEGER:
 		if ((pMem->flags & MEM_Blob) != 0) {
 			bool is_neg;
-			if (sql_atoi64(pMem->z, (int64_t *) &pMem->u.i, &is_neg,
+			if (sql_atoi64(pMem->z, &pMem->u.i, &is_neg,
 				       pMem->n) != 0)
 				return -1;
 			MemSetTypeFlag(pMem, MEM_Int);