[tarantool-patches] Re: [PATCH 2/2] sql: make aggregate functions types more strict

Tue Apr 9 17:52:41 MSK 2019

Firstly, please consider Konstantin’s comments.

Then, I’ve found assertion fault:

tarantool> create table t(id int primary key, b scalar)
tarantool> insert into t values(1, 1), (2, '1'), (3, 4.123)
tarantool> select min(b) from t
Assertion failed: (((p->szMalloc > 0 && p->z == p->zMalloc) ? 1 : 0) + ((p->flags & MEM_Dyn) != 0 ? 1 : 0) + ((p->flags & MEM_Ephem) != 0 ? 1 : 0) + ((p->flags & MEM_Static) != 0 ? 1 : 0) == 1), function sqlVdbeCheckMemInvariants, file /Users/n.pettik/tarantool/src/box/sql/vdbemem.c, line 87.
Abort trap: 6

tarantool> create table t(id int primary key, a int, b text)
tarantool> insert into t values(1,1,'1’)
tarantool> select sum(b) from t
---
- metadata:
  - name: sum(b)
    type: number
  rows:
  - [1]
…

So, your statement below that SUM accepts only numeric arguments
is false. If string can be converted to number, it is OK:

tarantool> select a=b from t;
---
- metadata:
  - name: a=b
    type: integer
  rows:
  - [1]
…

Next:

tarantool> select min(b) from t
---
- metadata:
  - name: min(b)
    type: scalar
  rows:
  - ['1']
…

Could you please fix this behaviour (scalar->int) in a separate patch?
If it is not that easy, file a ticket.

> diff --git a/src/box/sql/func.c b/src/box/sql/func.c
> index b1bfc886e..379f28252 100644
> --- a/src/box/sql/func.c
> +++ b/src/box/sql/func.c
> @@ -45,6 +45,17 @@
> #include <unicode/uchar.h>
> #include <unicode/ucol.h>
> 
> +/*

/**

> + * This structure is for keeping context during work of
> + * aggregate function.
> + */
> +struct aggregate_context {

Let’s change name to minmax_context.

> +    /** Value being aggregated. (e.g. current MAX or current counter value). */
> +    Mem value;
> +    /** Reference value to keep track of previous argument's type. */
> +    Mem reference_value;
> +};

I agree that keeping struct Men is too much (especially as values not as pointers).
We need only type of that value. What is more, you need only one member:
“value being aggregated” comes from argv[0].

> @@ -1588,16 +1604,57 @@ static void
> minmaxStep(sql_context *context, int not_used, sql_value **argv)
> {
> 	UNUSED_PARAMETER(not_used);
> -	sql_value *arg = argv[0];
> -	sql_value *best = sql_aggregate_context(context, sizeof(*best));
> +	struct aggregate_context *aggr_context =
> +		(struct aggregate_context *) sql_aggregate_context(context,
> +								   sizeof(*aggr_context));

This way looks better:

@@ -1604,9 +1604,8 @@ static void
 minmaxStep(sql_context *context, int not_used, sql_value **argv)
 {
        UNUSED_PARAMETER(not_used);
-       struct aggregate_context *aggr_context =
-               (struct aggregate_context *) sql_aggregate_context(context,
-                                                                  sizeof(*aggr_context));
+       struct aggregate_context *aggr_context = (struct aggregate_context *)
+               sql_aggregate_context(context, sizeof(*aggr_context));
        if (aggr_context == NULL)
                retur

> +	if (aggr_context == NULL)
> +		return;
> 
> +	sql_value *arg = argv[0];
> +	sql_value *best = &(aggr_context->value);
> 	if (best == NULL)
> 		return;
> 
> -	if (sql_value_type(argv[0]) == SQL_NULL) {
> +	enum sql_type sql_type = sql_value_type(arg);
> +
> +	if (sql_type == SQL_NULL) {
> 		if (best->flags != 0)
> 			sqlSkipAccumulatorLoad(context);
> 	} else if (best->flags != 0) {
> +		/*
> +		 * During proceeding of the function, arguments
> +		 * of different types may be encountered (if
> +		 * SCALAR type column is proceeded). Some types
> +		 * are compatible (INTEGER and FLOAT) and others
> +		 * are not (TEXT and BLOB are not compatible with
> +		 * any other type). In the later case an error
> +		 * is raised.
> +		 */
> +		sql_value *reference_value = &aggr_context->reference_value;
> +		if (reference_value->flags == 0)
> +			sqlVdbeMemCopy(reference_value, arg);

AFAIR flags == 0 is a situation when we are processing first
entry (i.e. aggregate_context is initialised right now).

> +		enum sql_type ref_sql_type = sql_value_type(reference_value);
> +
> +		bool is_compatible = true;

-> types_are_compatible

> +		if (sql_type != ref_sql_type) {
> +			is_compatible = false;
> +			if ((sql_type == SQL_INTEGER || sql_type == SQL_FLOAT) &&
> +			    (ref_sql_type == SQL_INTEGER ||
> +			     ref_sql_type == SQL_FLOAT)) {
> +				is_compatible = true;
> +			}
> +		}
> +		if (!is_compatible) {
> +			diag_set(ClientError, ER_INCONSISTENT_TYPES,
> +				 mem_type_to_str(reference_value),
> +				 mem_type_to_str(arg));
> +			context->fErrorOrAux = 1;
> +			context->isError = SQL_TARANTOOL_ERROR;
> +			sqlVdbeMemRelease(best);
> +			sqlVdbeMemRelease(reference_value);

Both of these values are allocated in scope of aggregate context.
I guess there’s no need to provide any cleanup on these variables.

> +			return;
> +		}
> +
> 		int max;
> 		int cmp;
> 		struct coll *coll = sqlGetFuncCollSeq(context);
> @@ -1625,13 +1682,16 @@ minmaxStep(sql_context *context, int not_used, sql_value **argv)
> static void
> minMaxFinalize(sql_context * context)
> {
> -	sql_value *pRes;
> -	pRes = (sql_value *) sql_aggregate_context(context, 0);
> +	struct aggregate_context *func_context =
> +		(struct aggregate_context *) sql_aggregate_context(context, sizeof(*func_context));
> +	sql_value *pRes = &(func_context->value);
> +
> 	if (pRes != NULL) {
> 		if (pRes->flags != 0) {
> 			sql_result_value(context, pRes);
> 		}
> 		sqlVdbeMemRelease(pRes);
> +		sqlVdbeMemRelease(&func_context->reference_value);
> 	}
> }
> 
> diff --git a/src/box/sql/select.c b/src/box/sql/select.c
> index b1ec8c758..624083b22 100644
> --- a/src/box/sql/select.c
> +++ b/src/box/sql/select.c
> @@ -4340,13 +4340,22 @@ pushDownWhereTerms(Parse * pParse,	/* Parse context (for malloc() and error repo
>  * argument, this function checks if the following are true:
>  *
>  *    * the query contains just a single aggregate function,
> - *    * the aggregate function is either min() or max(), and
> - *    * the argument to the aggregate function is a column value.
> + *    * the aggregate function is either min() or max(),
> + *    * the argument to the aggregate function is a column value,
> + *    * the type of column is not SCALAR.
>  *
>  * If all of the above are true, then WHERE_ORDERBY_MIN or WHERE_ORDERBY_MAX
>  * is returned as appropriate. Also, *ppMinMax is set to point to the
>  * list of arguments passed to the aggregate before returning.
>  *
> + * The requirement of column type not being SCALAR follows from
> + * the purpose of the function. The purpose of the function is
> + * to answer the question: "Should MIN/MAX call be optimised by
> + * using ORDER ON clause code?" If the type of column is SCALAR

There’s no ORDER ON clause.

> + * then we have to iterate over all rows to check if their types
> + * are compatible. Hence, the optimisation should not be used.
> + * For details please see: https://github.com/tarantool/tarantool/issues/4032
> + *
>  * Or, if the conditions above are not met, *ppMinMax is set to 0 and
>  * WHERE_ORDERBY_NORMAL is returned.
>  */
> @@ -4364,6 +4373,8 @@ minMaxQuery(AggInfo * pAggInfo, ExprList ** ppMinMax)
> 		if (pEList && pEList->nExpr == 1
> 		    && pEList->a[0].pExpr->op == TK_AGG_COLUMN) {
> 			const char *zFunc = pExpr->u.zToken;
> +			if (sql_expr_type(pEList->a[0].pExpr) == FIELD_TYPE_SCALAR)
> +				return eRet;

I see no difference between byte code generated for SCALAR and other
columns (SELECT MIN(a) FROM t;). Investigate this case please.

> 			if (sqlStrICmp(zFunc, "min") == 0) {
> 				eRet = WHERE_ORDERBY_MIN;
> 				*ppMinMax = pEList;
> 
> diff --git a/test/sql-tap/select1.test.lua b/test/sql-tap/select1.test.lua
> index be63e815d..a4ce19d73 100755
> --- a/test/sql-tap/select1.test.lua
> +++ b/test/sql-tap/select1.test.lua
> @@ -1,6 +1,6 @@
> #!/usr/bin/env tarantool
> test = require("sqltester")
> -test:plan(173)
> +test:plan(172)
> 
> --!./tcltestrunner.lua
> -- ["set","testdir",[["file","dirname",["argv0"]]]]
> @@ -503,16 +503,6 @@ test:do_catchsql_test(
>         -- </select1-2.17>
>     })
> 
> -test:do_execsql_test(
> -    "select1-2.17.1",
> -    [[
> -        SELECT sum(a) FROM t3
> -    ]], {
> -        -- <select1-2.17.1>
> -        44.0
> -        -- </select1-2.17.1>
> -    })

Don’t delete test, just change it to catch an error.

> -
> test:do_catchsql_test(
>     "select1-2.18”