[tarantool-patches] Re: [PATCH 2/2] sql: make aggregate functions types more strict

Tue Apr 23 18:38:47 MSK 2019

Firstly I’d like to answer to Konstantin’s comments.

> * Ivan Koptelov <ivan.koptelov at tarantool.org> [19/04/05 18:02]:
>> +/*
>> + * This structure is for keeping context during work of
>> + * aggregate function.
>> + */
>> +struct aggregate_context {
>> +    /** Value being aggregated. (e.g. current MAX or current counter value). */
>> +    Mem value;
>> +    /** Reference value to keep track of previous argument's type. */
>> +    Mem reference_value;
>> +};
> 
> Why not call this struct agg_value?
I think Nikita’s idea is better - ‘minmax_context’ - because it’s only used in minmax implementation.

> 
> Besides, keeping a reference to the previous argument is an
> overkill. Why not keep a type instead, and assign it to
> FIELD_TYPE_SCALAR initially and change to a more specific type
> after the first assignment?
I agree. Instead of using a reference to Mem, now I keep only
reference type.

> 
>> +		} else {
>> +			diag_set(ClientError, ER_INCONSISTENT_TYPES,
>> +				 "INTEGER or FLOAT", mem_type_to_str(argv[0]));
>> +			context->fErrorOrAux = 1;
>> +			context->isError = SQL_TARANTOOL_ERROR;
> 
> This message would look confusing. Could we get rid of "or" in the
> message and be more specific about what is inconsistent?
Fixed.

@@ -1524,8 +1528,12 @@ sumStep(sql_context * context, int argc, sql_value ** argv)
 			p->rSum += sql_value_double(argv[0]);
 			p->approx = 1;
 		} else {
-			diag_set(ClientError, ER_INCONSISTENT_TYPES,
-				 "INTEGER or FLOAT", mem_type_to_str(argv[0]));
+			diag_set(ClientError, ER_ILLEGAL_PARAMS,
+				"SUM, TOTAL and AVG aggregate functions can "
+				"be called only with INTEGER, FLOAT or SCALAR "
+				"arguments. In case of SCALAR arguments, they "
+				"all must have numeric values.",
+				 mem_type_to_str(argv[0]));
 			context->fErrorOrAux = 1;
 			context->isError = SQL_TARANTOOL_ERROR;
 		}

> 
>> +		if (sql_type != ref_sql_type) {
>> +			is_compatible = false;
>> +			if ((sql_type == SQL_INTEGER || sql_type == SQL_FLOAT) &&
>> +			    (ref_sql_type == SQL_INTEGER ||
>> +			     ref_sql_type == SQL_FLOAT)) {
>> +				is_compatible = true;
> 
> This is a very hot path and doing so much work to check
> compatibility is a) clumsy when reading b) slow c) hard to
> maintain.
> 
> Please use a compatibility matrix statically defined as a 8x8
> bitmap.
Fixed:

@@ -1630,30 +1636,23 @@ minmaxStep(sql_context *context, int not_used, sql_value **argv)
 		 * any other type). In the later case an error
 		 * is raised.
 		 */
-		sql_value *reference_value = &aggr_context->reference_value;
-		if (reference_value->flags == 0)
-			sqlVdbeMemCopy(reference_value, arg);
-		enum sql_type ref_sql_type = sql_value_type(reference_value);
+		if (minmax_context->reference_type == 0)
+			minmax_context->reference_type = sql_type;
+		int ref_sql_type = minmax_context->reference_type;
 
-		bool is_compatible = true;
 		if (sql_type != ref_sql_type) {
-			is_compatible = false;
-			if ((sql_type == SQL_INTEGER || sql_type == SQL_FLOAT) &&
-			    (ref_sql_type == SQL_INTEGER ||
-			     ref_sql_type == SQL_FLOAT)) {
-				is_compatible = true;
+			bool types_are_compatible = (sql_type & MEM_NumMask) &&
+						    (ref_sql_type & MEM_NumMask);
+			if (!types_are_compatible) {
+				diag_set(ClientError, ER_INCONSISTENT_TYPES,
+					 type_to_str(ref_sql_type),
+					 type_to_str(sql_type));
+				context->fErrorOrAux = 1;
+				context->isError = SQL_TARANTOOL_ERROR;
+				sqlVdbeMemRelease(best);
+				return;
 			}
 		}
-		if (!is_compatible) {
-			diag_set(ClientError, ER_INCONSISTENT_TYPES,
-				 mem_type_to_str(reference_value),
-				 mem_type_to_str(arg));
-			context->fErrorOrAux = 1;
-			context->isError = SQL_TARANTOOL_ERROR;
-			sqlVdbeMemRelease(best);
-			sqlVdbeMemRelease(reference_value);
-			return;
-		}

diff --git a/src/box/sql/vdbe.c b/src/box/sql/vdbe.c
index ed7bf8870..960030b52 100644
--- a/src/box/sql/vdbe.c
+++ b/src/box/sql/vdbe.c
@@ -618,10 +618,9 @@ vdbe_add_new_autoinc_id(struct Vdbe *vdbe, int64_t id)
 }
 
 char *
-mem_type_to_str(const struct Mem *p)
+type_to_str(int type)
 {
-	assert(p != NULL);
-	switch (p->flags & MEM_PURE_TYPE_MASK) {
+	switch (type) {
 	case MEM_Null:
 		return "NULL";
 	case MEM_Str:
@@ -639,6 +638,19 @@ mem_type_to_str(const struct Mem *p)
 	}
 }
 
+int
+mem_type(const struct Mem *p)
+{
+	assert(p != NULL);
+	return p->flags & MEM_PURE_TYPE_MASK;
+}
+
+char *
+mem_type_to_str(const struct Mem *p)
+{
+	return type_to_str(mem_type(p));
+}
+
 /*
  * Execute as much of a VDBE program as we can.
  * This is the core of sql_step().
diff --git a/src/box/sql/vdbeInt.h b/src/box/sql/vdbeInt.h
index c84f22caf..553c4f225 100644
--- a/src/box/sql/vdbeInt.h
+++ b/src/box/sql/vdbeInt.h
@@ -228,6 +228,7 @@ struct Mem {
 #define MEM_Str       0x0002	/* Value is a string */
 #define MEM_Int       0x0004	/* Value is an integer */
 #define MEM_Real      0x0008	/* Value is a real number */
+#define MEM_NumMask   0x000c	/* Value is integer or real (mask) */
 #define MEM_Blob      0x0010	/* Value is a BLOB */
 #define MEM_Bool      0x0020    /* Value is a bool */
 #define MEM_Ptr       0x0040	/* Value is a generic pointer */
@@ -262,11 +263,25 @@ enum {
 	MEM_PURE_TYPE_MASK = 0x1f
 };
 
+/**
+ * Returns one of MEM_Null, MEM_Str, MEM_Int, MEM_Real, MEM_Blob
+ * or MEM_Bool. Used for error detection and reporting.
+ */
+int
+mem_type(const struct Mem *p);
+
 /**
  * Simple type to str convertor. It is used to simplify
  * error reporting.
  */
 char *
+type_to_str(int type);
+
+/**
+ * Returns string representing type of the given Mem. It is used
+ * to simplify error reporting.
+ */
+char *
 mem_type_to_str(const struct Mem *p);

> On 9 Apr 2019, at 17:52, n.pettik <korablev at tarantool.org> wrote:


> 
> Then, I’ve found assertion fault:
> 
> tarantool> create table t(id int primary key, b scalar)
> tarantool> insert into t values	
> tarantool> select min(b) from t
> Assertion failed: (((p->szMalloc > 0 && p->z == p->zMalloc) ? 1 : 0) + ((p->flags & MEM_Dyn) != 0 ? 1 : 0) + ((p->flags & MEM_Ephem) != 0 ? 1 : 0) + ((p->flags & MEM_Static) != 0 ? 1 : 0) == 1), function sqlVdbeCheckMemInvariants, file /Users/n.pettik/tarantool/src/box/sql/vdbemem.c, line 87.
> Abort trap: 6
It was fixed after removing reference_value from minmax_context.
> tarantool> create table t(id int primary key, a int, b text)
> tarantool> insert into t values(1,1,'1’)
> tarantool> select sum(b) from t
> ---
> - metadata:
>  - name: sum(b)
>    type: number
>  rows:
>  - [1]
> …
> 
> So, your statement below that SUM accepts only numeric arguments
> is false. If string can be converted to number, it is OK:
> 
> tarantool> select a=b from t;
> ---
> - metadata:
>  - name: a=b
>    type: integer
>  rows:
>  - [1]
> …
> 
> Next:
> 
> tarantool> select min(b) from t
> ---
> - metadata:
>  - name: min(b)
>    type: scalar
>  rows:
>  - ['1']
> …
> 
> Could you please fix this behaviour (scalar->int) in a separate patch?
> If it is not that easy, file a ticket.
It is not easy and we already have a ticket. Fixed commit message.
> 
>> diff --git a/src/box/sql/func.c b/src/box/sql/func.c
>> index b1bfc886e..379f28252 100644
>> --- a/src/box/sql/func.c
>> +++ b/src/box/sql/func.c
>> @@ -45,6 +45,17 @@
>> #include <unicode/uchar.h>
>> #include <unicode/ucol.h>
>> 
>> +/*
> 
> /**
> 
>> + * This structure is for keeping context during work of
>> + * aggregate function.
>> + */
>> +struct aggregate_context {
> 
> Let’s change name to minmax_context.
Fixed.

diff --git a/src/box/sql/func.c b/src/box/sql/func.c
index 379f28252..258df0d7e 100644
--- a/src/box/sql/func.c
+++ b/src/box/sql/func.c
@@ -45,15 +45,19 @@
 #include <unicode/uchar.h>
 #include <unicode/ucol.h>
 
-/*
+/**
  * This structure is for keeping context during work of
- * aggregate function.
+ * min/max aggregate functions.
  */
-struct aggregate_context {
-    /** Value being aggregated. (e.g. current MAX or current counter value). */
-    Mem value;
-    /** Reference value to keep track of previous argument's type. */
-    Mem reference_value;
+struct minmax_context {
+	/** Value being aggregated i.e. current MAX or MIN. */
+	Mem best;
+	/**
+	 * Reference type to keep track of previous argument's types.
+	 * One of MEM_Null, MEM_Str, MEM_Int, MEM_Real, MEM_Blob
+	 * or MEM_Bool.
+	 */
+	int reference_type;
 };

> 
>> +    /** Value being aggregated. (e.g. current MAX or current counter value). */
>> +    Mem value;
>> +    /** Reference value to keep track of previous argument's type. */
>> +    Mem reference_value;
>> +};
> 
> I agree that keeping struct Men is too much (especially as values not as pointers).
> We need only type of that value. What is more, you need only one member:
> “value being aggregated” comes from argv[0].
> 
Ok, now we keep reference type (instead of reference value) in minmax_context.
But we still need "value being aggregated” because argv[0] actually does not
contain it.
>> @@ -1588,16 +1604,57 @@ static void
>> minmaxStep(sql_context *context, int not_used, sql_value **argv)
>> {
>> 	UNUSED_PARAMETER(not_used);
>> -	sql_value *arg = argv[0];
>> -	sql_value *best = sql_aggregate_context(context, sizeof(*best));
>> +	struct aggregate_context *aggr_context =
>> +		(struct aggregate_context *) sql_aggregate_context(context,
>> +								   sizeof(*aggr_context));
> 
> This way looks better:
> 
> @@ -1604,9 +1604,8 @@ static void
> minmaxStep(sql_context *context, int not_used, sql_value **argv)
> {
>        UNUSED_PARAMETER(not_used);
> -       struct aggregate_context *aggr_context =
> -               (struct aggregate_context *) sql_aggregate_context(context,
> -                                                                  sizeof(*aggr_context));
> +       struct aggregate_context *aggr_context = (struct aggregate_context *)
> +               sql_aggregate_context(context, sizeof(*aggr_context));
>        if (aggr_context == NULL)
>                retur
Ok, fixed.

@@ -1604,20 +1612,18 @@ static void
 minmaxStep(sql_context *context, int not_used, sql_value **argv)
 {
 	UNUSED_PARAMETER(not_used);
-	struct aggregate_context *aggr_context =
-		(struct aggregate_context *) sql_aggregate_context(context,
-								   sizeof(*aggr_context));
-	if (aggr_context == NULL)
+	struct minmax_context *minmax_context = (struct minmax_context *)
+		sql_aggregate_context(context, sizeof(*minmax_context));
+	if (minmax_context == NULL)
 		return;

> 
>> +	if (aggr_context == NULL)
>> +		return;
>> 
>> +	sql_value *arg = argv[0];
>> +	sql_value *best = &(aggr_context->value);
>> 	if (best == NULL)
>> 		return;
>> 
>> -	if (sql_value_type(argv[0]) == SQL_NULL) {
>> +	enum sql_type sql_type = sql_value_type(arg);
>> +
>> +	if (sql_type == SQL_NULL) {
>> 		if (best->flags != 0)
>> 			sqlSkipAccumulatorLoad(context);
>> 	} else if (best->flags != 0) {
>> +		/*
>> +		 * During proceeding of the function, arguments
>> +		 * of different types may be encountered (if
>> +		 * SCALAR type column is proceeded). Some types
>> +		 * are compatible (INTEGER and FLOAT) and others
>> +		 * are not (TEXT and BLOB are not compatible with
>> +		 * any other type). In the later case an error
>> +		 * is raised.
>> +		 */
>> +		sql_value *reference_value = &aggr_context->reference_value;
>> +		if (reference_value->flags == 0)
>> +			sqlVdbeMemCopy(reference_value, arg);
> 
> AFAIR flags == 0 is a situation when we are processing first
> entry (i.e. aggregate_context is initialised right now).
Also fixed:

@@ -1630,30 +1636,23 @@ minmaxStep(sql_context *context, int not_used, sql_value **argv)
 		 * any other type). In the later case an error
 		 * is raised.
 		 */
-		sql_value *reference_value = &aggr_context->reference_value;
-		if (reference_value->flags == 0)
-			sqlVdbeMemCopy(reference_value, arg);
-		enum sql_type ref_sql_type = sql_value_type(reference_value);
+		if (minmax_context->reference_type == 0)
+			minmax_context->reference_type = sql_type;
+		int ref_sql_type = minmax_context->reference_type;

> 
>> +		enum sql_type ref_sql_type = sql_value_type(reference_value);
>> +
>> +		bool is_compatible = true;
> 
> -> types_are_compatible
Ok, fixed:

@@ -1630,30 +1636,23 @@ minmaxStep(sql_context *context, int not_used, sql_value **argv)
-		bool is_compatible = true;
 		if (sql_type != ref_sql_type) {
-			is_compatible = false;
-			if ((sql_type == SQL_INTEGER || sql_type == SQL_FLOAT) &&
-			    (ref_sql_type == SQL_INTEGER ||
-			     ref_sql_type == SQL_FLOAT)) {
-				is_compatible = true;
+			bool types_are_compatible = (sql_type & MEM_NumMask) &&
+						    (ref_sql_type & MEM_NumMask);
+			if (!types_are_compatible) {
+				diag_set(ClientError, ER_INCONSISTENT_TYPES,
+					 type_to_str(ref_sql_type),
+					 type_to_str(sql_type));
+				context->fErrorOrAux = 1;
+				context->isError = SQL_TARANTOOL_ERROR;
+				sqlVdbeMemRelease(best);
+				return;
 			}
 		}
-		if (!is_compatible) {
-			diag_set(ClientError, ER_INCONSISTENT_TYPES,
-				 mem_type_to_str(reference_value),
-				 mem_type_to_str(arg));
-			context->fErrorOrAux = 1;
-			context->isError = SQL_TARANTOOL_ERROR;
-			sqlVdbeMemRelease(best);
-			sqlVdbeMemRelease(reference_value);
-			return;
-		}
> 
>> +		if (sql_type != ref_sql_type) {
>> +			is_compatible = false;
>> +			if ((sql_type == SQL_INTEGER || sql_type == SQL_FLOAT) &&
>> +			    (ref_sql_type == SQL_INTEGER ||
>> +			     ref_sql_type == SQL_FLOAT)) {
>> +				is_compatible = true;
>> +			}
>> +		}
>> +		if (!is_compatible) {
>> +			diag_set(ClientError, ER_INCONSISTENT_TYPES,
>> +				 mem_type_to_str(reference_value),
>> +				 mem_type_to_str(arg));
>> +			context->fErrorOrAux = 1;
>> +			context->isError = SQL_TARANTOOL_ERROR;
>> +			sqlVdbeMemRelease(best);
>> +			sqlVdbeMemRelease(reference_value);
> 
> Both of these values are allocated in scope of aggregate context.
> I guess there’s no need to provide any cleanup on these variables.
I check this in debug and you are right. Because of inner ‘if’ this call actually
does not do nothing, except checking inner invariants of Mem. Removed. 

diff --git a/src/box/sql/func.c b/src/box/sql/func.c
index 258df0d7e..0738f44e6 100644
--- a/src/box/sql/func.c
+++ b/src/box/sql/func.c
@@ -1649,7 +1649,6 @@ minmaxStep(sql_context *context, int not_used, sql_value **argv)
                                         type_to_str(sql_type));
                                context->fErrorOrAux = 1;
                                context->isError = SQL_TARANTOOL_ERROR;
-                               sqlVdbeMemRelease(best);
                                return;
                        }
                }
@@ -1689,7 +1688,6 @@ minMaxFinalize(sql_context * context)
                if (res->flags != 0) {
                        sql_result_value(context, res);
                }
-               sqlVdbeMemRelease(res);
        }
 }

> 

>> +			return;
>> +		}
>> +
>> 		int max;
>> 		int cmp;
>> 		struct coll *coll = sqlGetFuncCollSeq(context);
>> @@ -1625,13 +1682,16 @@ minmaxStep(sql_context *context, int not_used, sql_value **argv)
>> static void
>> minMaxFinalize(sql_context * context)
>> {
>> -	sql_value *pRes;
>> -	pRes = (sql_value *) sql_aggregate_context(context, 0);
>> +	struct aggregate_context *func_context =
>> +		(struct aggregate_context *) sql_aggregate_context(context, sizeof(*func_context));
>> +	sql_value *pRes = &(func_context->value);
>> +
>> 	if (pRes != NULL) {
>> 		if (pRes->flags != 0) {
>> 			sql_result_value(context, pRes);
>> 		}
>> 		sqlVdbeMemRelease(pRes);
>> +		sqlVdbeMemRelease(&func_context->reference_value);
>> 	}
>> }
>> 
>> diff --git a/src/box/sql/select.c b/src/box/sql/select.c
>> index b1ec8c758..624083b22 100644
>> --- a/src/box/sql/select.c
>> +++ b/src/box/sql/select.c
>> @@ -4340,13 +4340,22 @@ pushDownWhereTerms(Parse * pParse,	/* Parse context (for malloc() and error repo
>> * argument, this function checks if the following are true:
>> *
>> *    * the query contains just a single aggregate function,
>> - *    * the aggregate function is either min() or max(), and
>> - *    * the argument to the aggregate function is a column value.
>> + *    * the aggregate function is either min() or max(),
>> + *    * the argument to the aggregate function is a column value,
>> + *    * the type of column is not SCALAR.
>> *
>> * If all of the above are true, then WHERE_ORDERBY_MIN or WHERE_ORDERBY_MAX
>> * is returned as appropriate. Also, *ppMinMax is set to point to the
>> * list of arguments passed to the aggregate before returning.
>> *
>> + * The requirement of column type not being SCALAR follows from
>> + * the purpose of the function. The purpose of the function is
>> + * to answer the question: "Should MIN/MAX call be optimised by
>> + * using ORDER ON clause code?" If the type of column is SCALAR
> 
> There’s no ORDER ON clause.
Sorry, fixed the comment.

diff --git a/src/box/sql/select.c b/src/box/sql/select.c
index 624083b22..dd87c5b70 100644
--- a/src/box/sql/select.c
+++ b/src/box/sql/select.c
@@ -4351,7 +4351,7 @@ pushDownWhereTerms(Parse * pParse,        /* Parse context (for malloc() and error repo
  * The requirement of column type not being SCALAR follows from
  * the purpose of the function. The purpose of the function is
  * to answer the question: "Should MIN/MAX call be optimised by
- * using ORDER ON clause code?" If the type of column is SCALAR
+ * using ORDER BY clause code?" If the type of column is SCALAR
  * then we have to iterate over all rows to check if their types
  * are compatible. Hence, the optimisation should not be used.

> 
>> + * then we have to iterate over all rows to check if their types
>> + * are compatible. Hence, the optimisation should not be used.
>> + * For details please see: https://github.com/tarantool/tarantool/issues/4032
>> + *
>> * Or, if the conditions above are not met, *ppMinMax is set to 0 and
>> * WHERE_ORDERBY_NORMAL is returned.
>> */
>> @@ -4364,6 +4373,8 @@ minMaxQuery(AggInfo * pAggInfo, ExprList ** ppMinMax)
>> 		if (pEList && pEList->nExpr == 1
>> 		    && pEList->a[0].pExpr->op == TK_AGG_COLUMN) {
>> 			const char *zFunc = pExpr->u.zToken;
>> +			if (sql_expr_type(pEList->a[0].pExpr) == FIELD_TYPE_SCALAR)
>> +				return eRet;
> 
> I see no difference between byte code generated for SCALAR and other
> columns (SELECT MIN(a) FROM t;). Investigate this case please.
> 
I am not quite understand what is the problem with this.
If the following conditions are true, then the result of MIN/MAX is retrieved without
usage of corresponding function in func:
 
 * the query contains just a single aggregate function,
 * the aggregate function is either min() or max(),
 * the argument to the aggregate function is a column value,
 * the type of column is not SCALAR.

And the changes above disables this optimisation if column type is SCALAR.
I also added test for this case.
>> 			if (sqlStrICmp(zFunc, "min") == 0) {
>> 				eRet = WHERE_ORDERBY_MIN;
>> 				*ppMinMax = pEList;
>> 
>> diff --git a/test/sql-tap/select1.test.lua b/test/sql-tap/select1.test.lua
>> index be63e815d..a4ce19d73 100755
>> --- a/test/sql-tap/select1.test.lua
>> +++ b/test/sql-tap/select1.test.lua
>> @@ -1,6 +1,6 @@
>> #!/usr/bin/env tarantool
>> test = require("sqltester")
>> -test:plan(173)	
>> +test:plan(172)
>> 
>> --!./tcltestrunner.lua
>> -- ["set","testdir",[["file","dirname",["argv0"]]]]
>> @@ -503,16 +503,6 @@ test:do_catchsql_test(
>>        -- </select1-2.17>
>>    })
>> 
>> -test:do_execsql_test(
>> -    "select1-2.17.1",
>> -    [[
>> -        SELECT sum(a) FROM t3
>> -    ]], {
>> -        -- <select1-2.17.1>
>> -        44.0
>> -        -- </select1-2.17.1>
>> -    })
> 
> Don’t delete test, just change it to catch an error.
Ok:

diff --git a/test/sql-tap/select1.test.lua b/test/sql-tap/select1.test.lua
index a4ce19d73..b743d900c 100755
--- a/test/sql-tap/select1.test.lua
+++ b/test/sql-tap/select1.test.lua
@@ -1,6 +1,6 @@
 #!/usr/bin/env tarantool
 test = require("sqltester")
-test:plan(172)
+test:plan(173)
 
 --!./tcltestrunner.lua
 -- ["set","testdir",[["file","dirname",["argv0"]]]]
@@ -503,6 +503,18 @@ test:do_catchsql_test(
         -- </select1-2.17>
     })
 
+test:do_catchsql_test(
+    "select1-2.17.1",
+    [[
+        SELECT sum(a) FROM t3
+    ]], {
+        -- <select1-2.17.1>
+        1, "Illegal parameters, SUM, TOTAL and AVG aggregate functions can " ..
+            "be called only with INTEGER, FLOAT or SCALAR arguments. In " ..
+            "case of SCALAR arguments, they all must have numeric values."
+        -- </select1-2.17.1>
+    })
+

> 
>> -
>> test:do_catchsql_test(
>>    "select1-2.18”
> 
> 
>