[tarantool-patches] Re: [PATCH 2/2] sql: make aggregate functions types more strict

Wed Apr 24 20:37:10 MSK 2019

>>> +		if (sql_type != ref_sql_type) {
>>> +			is_compatible = false;
>>> +			if ((sql_type == SQL_INTEGER || sql_type == SQL_FLOAT) &&
>>> +			    (ref_sql_type == SQL_INTEGER ||
>>> +			     ref_sql_type == SQL_FLOAT)) {
>>> +				is_compatible = true;
>> 
>> This is a very hot path and doing so much work to check
>> compatibility is a) clumsy when reading b) slow c) hard to
>> maintain.
>> 
>> Please use a compatibility matrix statically defined as a 8x8
>> bitmap.
> Fixed:

It is not what was suggested. Look at field_type1_contains_type2() and
field_type_compatibility as examples.

> diff --git a/src/box/sql/vdbeInt.h b/src/box/sql/vdbeInt.h
> index c84f22caf..553c4f225 100644
> --- a/src/box/sql/vdbeInt.h
> +++ b/src/box/sql/vdbeInt.h
> @@ -228,6 +228,7 @@ struct Mem {
> #define MEM_Str       0x0002	/* Value is a string */
> #define MEM_Int       0x0004	/* Value is an integer */
> #define MEM_Real      0x0008	/* Value is a real number */
> +#define MEM_NumMask   0x000c	/* Value is integer or real (mask) */

This mask covers neither Blob nor Bool values.

> #define MEM_Blob      0x0010	/* Value is a BLOB */
> #define MEM_Bool      0x0020    /* Value is a bool */
> #define MEM_Ptr       0x0040	/* Value is a generic pointer */
> @@ -262,11 +263,25 @@ enum {
> 	MEM_PURE_TYPE_MASK = 0x1f
> };
> 
> diff --git a/src/box/sql/select.c b/src/box/sql/select.c
> index 624083b22..dd87c5b70 100644
> --- a/src/box/sql/select.c
> +++ b/src/box/sql/select.c
> @@ -4351,7 +4351,7 @@ pushDownWhereTerms(Parse * pParse,        /* Parse context (for malloc() and error repo
>  * The requirement of column type not being SCALAR follows from
>  * the purpose of the function. The purpose of the function is
>  * to answer the question: "Should MIN/MAX call be optimised by
> - * using ORDER ON clause code?" If the type of column is SCALAR
> + * using ORDER BY clause code?" If the type of column is SCALAR
>  * then we have to iterate over all rows to check if their types
>  * are compatible. Hence, the optimisation should not be used.
> 
>> 
>>> + * then we have to iterate over all rows to check if their types
>>> + * are compatible. Hence, the optimisation should not be used.
>>> + * For details please see: https://github.com/tarantool/tarantool/issues/4032
>>> + *
>>> * Or, if the conditions above are not met, *ppMinMax is set to 0 and
>>> * WHERE_ORDERBY_NORMAL is returned.
>>> */
>>> @@ -4364,6 +4373,8 @@ minMaxQuery(AggInfo * pAggInfo, ExprList ** ppMinMax)
>>> 		if (pEList && pEList->nExpr == 1
>>> 		    && pEList->a[0].pExpr->op == TK_AGG_COLUMN) {
>>> 			const char *zFunc = pExpr->u.zToken;
>>> +			if (sql_expr_type(pEList->a[0].pExpr) == FIELD_TYPE_SCALAR)
>>> +				return eRet;
>> 
>> I see no difference between byte code generated for SCALAR and other
>> columns (SELECT MIN(a) FROM t;). Investigate this case please.
>> 
> I am not quite understand what is the problem with this.
> If the following conditions are true, then the result of MIN/MAX is retrieved without
> usage of corresponding function in func:
> 
> * the query contains just a single aggregate function,
> * the aggregate function is either min() or max(),
> * the argument to the aggregate function is a column value,
> * the type of column is not SCALAR.
> 
> And the changes above disables this optimisation if column type is SCALAR.
> I also added test for this case.

This optimisation disables index search for min/max queries and SCALAR.
I’m not sure that it is what we want. Nevertheless it makes usage of min/max
uniform (in both cases error is raised), it dramatically slows down execution.
I’ve asked server team, and we decided to allow all comparisons with SCALAR
field type. Now we can’t tell scalar from non-scalar fields. To achieve this we should
add field_type member to struct Mem and check field_type before comparisons.

Theoretically speaking, these checks introduced in minmaxStep still may occur
useful: if user register function, which may return data of different types and use
this function as argument of min/max functions.

Should current patch be pushed before handling of scalars is introduced - IDK.

Overall, I suggest following diff. Note that you don’t need even
‘reference_type’ in minmax context.

diff --git a/src/box/sql/func.c b/src/box/sql/func.c
index 0738f44e6..cc63f8b5f 100644
--- a/src/box/sql/func.c
+++ b/src/box/sql/func.c
@@ -51,13 +51,7 @@
  */
 struct minmax_context {
        /** Value being aggregated i.e. current MAX or MIN. */
-       Mem best;
-       /**
-        * Reference type to keep track of previous argument's types.
-        * One of MEM_Null, MEM_Str, MEM_Int, MEM_Real, MEM_Blob
-        * or MEM_Bool.
-        */
-       int reference_type;
+       struct Mem best;
 };
 
 static UConverter* pUtf8conv;
@@ -1618,12 +1612,10 @@ minmaxStep(sql_context *context, int not_used, sql_value **argv)
                return;
 
        sql_value *arg = argv[0];
-       sql_value *best = &(minmax_context->best);
-       if (best == NULL)
-               return;
+       sql_value *best = &minmax_context->best;
 
-       int sql_type = mem_type(arg);
-       if (sql_type == MEM_Null) {
+       int mem_type_current = arg->flags & MEM_PURE_TYPE_MASK;
+       if (mem_type_current == MEM_Null) {
                if (best->flags != 0)
                        sqlSkipAccumulatorLoad(context);
        } else if (best->flags != 0) {
@@ -1636,25 +1628,20 @@ minmaxStep(sql_context *context, int not_used, sql_value **argv)
                 * any other type). In the later case an error
                 * is raised.
                 */
-               if (minmax_context->reference_type == 0)
-                       minmax_context->reference_type = sql_type;
-               int ref_sql_type = minmax_context->reference_type;
-
-               if (sql_type != ref_sql_type) {
-                       bool types_are_compatible = (sql_type & MEM_NumMask) &&
-                                                   (ref_sql_type & MEM_NumMask);
+               int mem_type_best = best->flags & MEM_PURE_TYPE_MASK;
+               if (mem_type_best != mem_type_current) {
+                       bool types_are_compatible = (mem_type_best & MEM_NumMask) &&
+                                                   (mem_type_current & MEM_NumMask);
                        if (!types_are_compatible) {
                                diag_set(ClientError, ER_INCONSISTENT_TYPES,
-                                        type_to_str(ref_sql_type),
-                                        type_to_str(sql_type));
+                                        mem_type_to_str(best),
+                                        mem_type_to_str(arg));
                                context->fErrorOrAux = 1;
                                context->isError = SQL_TARANTOOL_ERROR;
                                return;
                        }
                }
 
-               int max;
-               int cmp;
                struct coll *coll = sqlGetFuncCollSeq(context);
                /* This step function is used for both the min() and max() aggregates,
                 * the only difference between the two being that the sense of the
@@ -1664,9 +1651,9 @@ minmaxStep(sql_context *context, int not_used, sql_value **argv)
                 * Therefore the next statement sets variable 'max' to 1 for the max()
                 * aggregate, or 0 for min().
                 */
-               max = sql_user_data(context) != 0;
-               cmp = sqlMemCompare(best, arg, coll);
-               if ((max != 0 && cmp < 0) || (max == 0 && cmp > 0)) {
+               bool max = sql_user_data(context) != 0;
+               int cmp_res = sqlMemCompare(best, arg, coll);
+               if ((max && cmp_res < 0) || (! max && cmp_res > 0)) {
                        sqlVdbeMemCopy(best, arg);
                } else {
                        sqlSkipAccumulatorLoad(context);
@@ -1682,12 +1669,13 @@ minMaxFinalize(sql_context * context)
 {
        struct minmax_context *minmax_context = (struct minmax_context *)
                sql_aggregate_context(context, sizeof(*minmax_context));
-       sql_value *res = &(minmax_context->best);
+       sql_value *res = &minmax_context->best;
 
        if (res != NULL) {
                if (res->flags != 0) {
                        sql_result_value(context, res);
                }
+               sqlVdbeMemRelease(res);
        }
 }
 
diff --git a/src/box/sql/select.c b/src/box/sql/select.c
index dd87c5b70..b1ec8c758 100644
--- a/src/box/sql/select.c
+++ b/src/box/sql/select.c
@@ -4340,22 +4340,13 @@ pushDownWhereTerms(Parse * pParse,      /* Parse context (for malloc() and error repo
  * argument, this function checks if the following are true:
  *
  *    * the query contains just a single aggregate function,
- *    * the aggregate function is either min() or max(),
- *    * the argument to the aggregate function is a column value,
- *    * the type of column is not SCALAR.
+ *    * the aggregate function is either min() or max(), and
+ *    * the argument to the aggregate function is a column value.
  *
  * If all of the above are true, then WHERE_ORDERBY_MIN or WHERE_ORDERBY_MAX
  * is returned as appropriate. Also, *ppMinMax is set to point to the
  * list of arguments passed to the aggregate before returning.
  *
- * The requirement of column type not being SCALAR follows from
- * the purpose of the function. The purpose of the function is
- * to answer the question: "Should MIN/MAX call be optimised by
- * using ORDER BY clause code?" If the type of column is SCALAR
- * then we have to iterate over all rows to check if their types
- * are compatible. Hence, the optimisation should not be used.
- * For details please see: https://github.com/tarantool/tarantool/issues/4032
- *
  * Or, if the conditions above are not met, *ppMinMax is set to 0 and
  * WHERE_ORDERBY_NORMAL is returned.
  */
@@ -4373,8 +4364,6 @@ minMaxQuery(AggInfo * pAggInfo, ExprList ** ppMinMax)
                if (pEList && pEList->nExpr == 1
                    && pEList->a[0].pExpr->op == TK_AGG_COLUMN) {
                        const char *zFunc = pExpr->u.zToken;
-                       if (sql_expr_type(pEList->a[0].pExpr) == FIELD_TYPE_SCALAR)
-                               return eRet;
                        if (sqlStrICmp(zFunc, "min") == 0) {
                                eRet = WHERE_ORDERBY_MIN;
                                *ppMinMax = pEList;
diff --git a/src/box/sql/vdbe.c b/src/box/sql/vdbe.c
index 960030b52..ed7bf8870 100644
--- a/src/box/sql/vdbe.c
+++ b/src/box/sql/vdbe.c
@@ -618,9 +618,10 @@ vdbe_add_new_autoinc_id(struct Vdbe *vdbe, int64_t id)
 }
 
 char *
-type_to_str(int type)
+mem_type_to_str(const struct Mem *p)
 {
-       switch (type) {
+       assert(p != NULL);
+       switch (p->flags & MEM_PURE_TYPE_MASK) {
        case MEM_Null:
                return "NULL";
        case MEM_Str:
@@ -638,19 +639,6 @@ type_to_str(int type)
        }
 }
 
-int
-mem_type(const struct Mem *p)
-{
-       assert(p != NULL);
-       return p->flags & MEM_PURE_TYPE_MASK;
-}
-
-char *
-mem_type_to_str(const struct Mem *p)
-{
-       return type_to_str(mem_type(p));
-}
-
 /*
  * Execute as much of a VDBE program as we can.
  * This is the core of sql_step().
diff --git a/src/box/sql/vdbeInt.h b/src/box/sql/vdbeInt.h
index 553c4f225..2b60bd6b5 100644
--- a/src/box/sql/vdbeInt.h
+++ b/src/box/sql/vdbeInt.h
@@ -263,25 +263,11 @@ enum {
        MEM_PURE_TYPE_MASK = 0x1f
 };
 
-/**
- * Returns one of MEM_Null, MEM_Str, MEM_Int, MEM_Real, MEM_Blob
- * or MEM_Bool. Used for error detection and reporting.
- */
-int
-mem_type(const struct Mem *p);
-
 /**
  * Simple type to str convertor. It is used to simplify
  * error reporting.
  */
 char *
-type_to_str(int type);
-
-/**
- * Returns string representing type of the given Mem. It is used
- * to simplify error reporting.
- */
-char *
 mem_type_to_str(const struct Mem *p);