[Tarantool-patches] [PATCH v1 1/1] sql: check arguments types of built-in functions

Sat Oct 10 00:08:32 MSK 2020

Hi! Thanks for the patch!

See 12 comments/questions below.

1.

	tarantool> box.execute("SELECT GREATEST(1, '2')")
	---
	- metadata:
	  - name: COLUMN_1
	    type: scalar
	  rows:
	  - ['2']
	...

Is it supposed to happen? Shouldn't the function check that all
arguments are of the same type? The same for LEAST().

2. A common problem for all functions where you check for their
arguments being NULL:

	tarantool> box.execute([[SELECT POSITION(100, NULL)]])
	---
	- metadata:
	  - name: COLUMN_1
	    type: integer
	  rows:
	  - [null]
	...

Yes, the second argument is NULL, but the first one mismatches its
type. So this should be an error (I think so, but may be wrong).
The same for SUBSTR(), ROUND(), LIKE(), REPLACE().

3. Is it intended?

	tarantool> box.execute([[SELECT PRINTF(100)]])
	---
	- metadata:
	  - name: COLUMN_1
	    type: string
	  rows:
	  - ['100']
	...

AFAIR, the function is not a standard one, so I don't know if we
want to allow to omit format argument and just stringify whatever
is passed first. Nonetheless I mention it here in case you wanted
to fix it but missed.

4. HEX() in the comment says its argument is blob. But it takes
anything:

	tarantool> box.execute([[SELECT HEX(100)]])
	---
	- metadata:
	  - name: COLUMN_1
	    type: string
	  rows:
	  - ['313030']
	...

Don't know if it is intended. Likely not.

5. TRIM() takes non-string arguments. Definitely a mistake.

	tarantool> box.execute([[SELECT TRIM(100)]])
	---
	- metadata:
	  - name: COLUMN_1
	    type: string
	  rows:
	  - ['100']
	...

> diff --git a/src/box/sql/func.c b/src/box/sql/func.c
> index 0aedb2d3d..d348c3d09 100644
> --- a/src/box/sql/func.c
> +++ b/src/box/sql/func.c
> @@ -467,30 +467,21 @@ lengthFunc(sql_context * context, int argc, sql_value ** argv)
>  
>  	assert(argc == 1);
>  	UNUSED_PARAMETER(argc);
> -	switch (sql_value_type(argv[0])) {
> -	case MP_BIN:
> -	case MP_ARRAY:
> -	case MP_MAP:
> -	case MP_INT:
> -	case MP_UINT:
> -	case MP_BOOL:
> -	case MP_DOUBLE:{
> -			sql_result_uint(context, sql_value_bytes(argv[0]));
> -			break;
> -		}
> -	case MP_STR:{
> -			const unsigned char *z = sql_value_text(argv[0]);
> -			if (z == 0)
> -				return;
> -			len = sql_utf8_char_count(z, sql_value_bytes(argv[0]));
> -			sql_result_uint(context, len);
> -			break;
> -		}
> -	default:{
> -			sql_result_null(context);
> -			break;
> -		}
> -	}
> +	enum mp_type type = mem_mp_type(argv[0]);
> +	if (type == MP_NIL)
> +		return sql_result_null(context);
> +	if (type == MP_STR) {
> +		const unsigned char *z = sql_value_text(argv[0]);
> +		if (z == NULL)
> +			return sql_result_null(context);
> +		len = sql_utf8_char_count(z, sql_value_bytes(argv[0]));
> +		return sql_result_uint(context, len);
> +	}
> +	if (type == MP_BIN)
> +		return sql_result_uint(context, sql_value_bytes(argv[0]));
> +	diag_set(ClientError, ER_SQL_TYPE_MISMATCH,
> +		 sql_value_to_diag_str(argv[0]), "string or varbinary");
> +	context->is_aborted = true;

6. What was wrong with leaving it a switch-case? Here and in other places.
For more than 2 similarly looking checks switch usually looks better, and
probably works better as well (but this I don't know for sure, I just
read some stuff about comilers being able to generate kind of smart goto
table for swithches).

>  }
>  
>  /*
> @@ -566,29 +537,23 @@ position_func(struct sql_context *context, int argc, struct Mem **argv)
>  
>  	if (haystack_type == MP_NIL || needle_type == MP_NIL)
>  		return;
> -	/*
> -	 * Position function can be called only with string
> -	 * or blob params.
> -	 */
> -	struct Mem *inconsistent_type_arg = NULL;
> -	if (needle_type != MP_STR && needle_type != MP_BIN)
> -		inconsistent_type_arg = needle;
> -	if (haystack_type != MP_STR && haystack_type != MP_BIN)
> -		inconsistent_type_arg = haystack;
> -	if (inconsistent_type_arg != NULL) {
> -		diag_set(ClientError, ER_INCONSISTENT_TYPES,
> -			 "text or varbinary",
> -			 mem_type_to_str(inconsistent_type_arg));
> +	if (needle_type != MP_STR && needle_type != MP_BIN) {
> +		diag_set(ClientError, ER_SQL_TYPE_MISMATCH,
> +			 sql_value_to_diag_str(needle), "string or varbinary");
> +		context->is_aborted = true;
> +		return;
> +	}
> +	if (haystack_type != MP_STR && haystack_type != MP_BIN) {
> +		diag_set(ClientError, ER_SQL_TYPE_MISMATCH,
> +			 sql_value_to_diag_str(haystack),
> +			 "string or varbinary");
>  		context->is_aborted = true;
>  		return;

7. I think you can drop this condition. Because if needle_type is
STR or BIN and bellow you check haystack_type == needle_type,
then haystack_type is also correct.

>  	}
> -	/*
> -	 * Both params of Position function must be of the same
> -	 * type.
> -	 */
>  	if (haystack_type != needle_type) {
> -		diag_set(ClientError, ER_INCONSISTENT_TYPES,
> -			 mem_type_to_str(needle), mem_type_to_str(haystack));
> +		diag_set(ClientError, ER_SQL_TYPE_MISMATCH,
> +			 sql_value_to_diag_str(haystack),
> +			 mem_type_to_str(needle));
>  		context->is_aborted = true;
>  		return;
>  	}
> @@ -735,12 +700,24 @@ substrFunc(sql_context * context, int argc, sql_value ** argv)
>  		context->is_aborted = true;
>  		return;
>  	}
> -	if (sql_value_is_null(argv[1])
> +	if (sql_value_is_null(argv[0]) || sql_value_is_null(argv[1])
>  	    || (argc == 3 && sql_value_is_null(argv[2]))
>  	    ) {
>  		return;

8. I realized that it looks strange, that some functions allow to pass
NULL into non-main parameters and then return NULL. But some does not.
For example, this function allows to pass NULL instead of the integer
parameters. But function randomBlob() does not. Is it intended?

Not related to your patch. But if it is a bug, then better file a ticket.
(Kirill will throw it into the wishlist, but at least our conscience will
be clean.)

> @@ -988,9 +979,9 @@ randomBlob(sql_context * context, int argc, sql_value ** argv)
>  	unsigned char *p;
>  	assert(argc == 1);
>  	UNUSED_PARAMETER(argc);
> -	if (mp_type_is_bloblike(sql_value_type(argv[0]))) {
> +	if (!mp_type_is_numeric(sql_value_type(argv[0]))) {
>  		diag_set(ClientError, ER_SQL_TYPE_MISMATCH,
> -			 sql_value_to_diag_str(argv[0]), "numeric");
> +			 sql_value_to_diag_str(argv[0]), "integer");

9. You check for numeric, but print integer? So if I pass 1.1 it will
pass, and will return 1 random byte. But should throw an error. Is there
a test on that?

>  		context->is_aborted = true;
>  		return;
>  	}
> @@ -1238,14 +1229,17 @@ likeFunc(sql_context *context, int argc, sql_value **argv)
>  	int rhs_type = sql_value_type(argv[0]);
>  	int lhs_type = sql_value_type(argv[1]);
>  
> -	if (lhs_type != MP_STR || rhs_type != MP_STR) {
> -		if (lhs_type == MP_NIL || rhs_type == MP_NIL)
> -			return;
> -		char *inconsistent_type = rhs_type != MP_STR ?
> -					  mem_type_to_str(argv[0]) :
> -					  mem_type_to_str(argv[1]);
> -		diag_set(ClientError, ER_INCONSISTENT_TYPES, "text",
> -			 inconsistent_type);
> +	if (lhs_type == MP_NIL || rhs_type == MP_NIL)
> +		return;

10. So if I pass NIL and 100, it wil pass? Not sure it should. The
second type still mismatches. The same in some other updated functions.

> +	if (rhs_type != MP_STR) {
> +		diag_set(ClientError, ER_SQL_TYPE_MISMATCH,
> +			 sql_value_to_diag_str(argv[0]), "string");
> +		context->is_aborted = true;
> +		return;
> +	}
> +	if (lhs_type != MP_STR) {
> +		diag_set(ClientError, ER_SQL_TYPE_MISMATCH,
> +			 sql_value_to_diag_str(argv[1]), "string");
>  		context->is_aborted = true;
>  		return;
>  	}
> @@ -1476,10 +1479,16 @@ charFunc(sql_context * context, int argc, sql_value ** argv)
>  	for (i = 0; i < argc; i++) {
>  		uint64_t x;
>  		unsigned c;
> -		if (sql_value_type(argv[i]) == MP_INT)
> -			x = 0xfffd;
> -		else
> -			x = sql_value_uint64(argv[i]);
> +		if (sql_value_is_null(argv[i]))
> +			continue;
> +		if (!mp_type_is_numeric(mem_mp_type(argv[i]))) {
> +			diag_set(ClientError, ER_SQL_TYPE_MISMATCH,
> +				 sql_value_to_diag_str(argv[i]), "integer");
> +			context->is_aborted = true;
> +			return;
> +		}
> +		int y = sql_value_int(argv[i]);
> +		x = y < 0 ? 0xfffd : y;

11. Why was it using 64bit, but now uses simple int?

>  		if (x > 0x10ffff)
>  			x = 0xfffd;
>  		c = (unsigned)(x & 0x1fffff);
> @@ -1570,6 +1585,43 @@ replaceFunc(sql_context * context, int argc, sql_value ** argv)
>  
>  	assert(argc == 3);
>  	UNUSED_PARAMETER(argc);
> +	enum mp_type type0 = mem_mp_type(argv[0]);
> +	enum mp_type type1 = mem_mp_type(argv[1]);
> +	enum mp_type type2 = mem_mp_type(argv[2]);
> +	if (type0 == MP_NIL || type1 == MP_NIL || type2 == MP_NIL)
> +		return;
> +	if (type0 != MP_STR && type0 != MP_BIN) {
> +		diag_set(ClientError, ER_SQL_TYPE_MISMATCH,
> +			 sql_value_to_diag_str(argv[0]), "string or varbinary");
> +		context->is_aborted = true;
> +		return;
> +	}
> +	if (type1 != MP_STR && type1 != MP_BIN) {
> +		diag_set(ClientError, ER_SQL_TYPE_MISMATCH,
> +			 sql_value_to_diag_str(argv[1]), "string or varbinary");
> +		context->is_aborted = true;
> +		return;
> +	}
> +	if (type2 != MP_STR && type2 != MP_BIN) {
> +		diag_set(ClientError, ER_SQL_TYPE_MISMATCH,
> +			 sql_value_to_diag_str(argv[2]), "string or varbinary");
> +		context->is_aborted = true;
> +		return;
> +	}

12. It is enough to check type0 and that the other types are equal
to type0.

> +	if (type1 != type0) {
> +		diag_set(ClientError, ER_SQL_TYPE_MISMATCH,
> +			 sql_value_to_diag_str(argv[1]),
> +			 mem_type_to_str(argv[0]));
> +		context->is_aborted = true;
> +		return;
> +	}
> +	if (type2 != type0) {
> +		diag_set(ClientError, ER_SQL_TYPE_MISMATCH,
> +			 sql_value_to_diag_str(argv[2]),
> +			 mem_type_to_str(argv[0]));
> +		context->is_aborted = true;
> +		return;
> +	}
>  	zStr = sql_value_text(argv[0]);
>  	if (zStr == 0)
>  		return;