Tarantool development patches archive
 help / color / mirror / Atom feed
* [tarantool-patches] [PATCH 0/3] Change collation compatibility rules according to ANSI SQL
@ 2018-10-25 11:00 Nikita Pettik
  2018-10-25 11:00 ` [tarantool-patches] [PATCH 1/3] sql: do not add explicit <COLLATE "BINARY"> clause Nikita Pettik
                   ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Nikita Pettik @ 2018-10-25 11:00 UTC (permalink / raw)
  To: tarantool-patches; +Cc: v.shpilevoy, Nikita Pettik

Branch: https://github.com/tarantool/tarantool/tree/np/gh-3185-ban-comparison-with-different-collations
Issue: https://github.com/tarantool/tarantool/issues/3185

First two patches are auxiliary and self-explanatory.

The third patch adds rules to collation compatibility in order to
restrict usage of different collations within one comparison.
Originally, SQLite uses doubtful approach: if collations of LHS and RHS
can not be used together, then collation of LHS is chosen.  Such
behaviour quite misleading and results in the fact that swap of LHS and
RHS values may change result of operation.  We've decided to add
restrictions on collations of LHS and RHS to disallow messing different
collations as it is stated in ANSI SQL.

Nikita Pettik (3):
  sql: do not add explicit <COLLATE "BINARY"> clause
  Add surrogate ID for BINARY collation
  sql: change collation compatibility rules

 src/box/errcode.h                 |   1 +
 src/box/key_def.c                 |   2 +-
 src/box/key_def.h                 |  17 ++++++
 src/box/lua/space.cc              |   2 +-
 src/box/sql/callback.c            |   6 +-
 src/box/sql/expr.c                |  70 +++++++++++++++++------
 src/box/sql/fkey.c                |   2 -
 src/box/sql/select.c              | 115 ++++++++++++++++++++------------------
 src/box/sql/sqliteInt.h           |  28 +++++++++-
 src/box/sql/where.c               |  13 ++---
 src/box/sql/whereexpr.c           |  38 ++++++-------
 src/box/tuple_format.c            |   2 +-
 test/box/misc.result              |   1 +
 test/sql-tap/e_select1.test.lua   |  12 ++--
 test/sql-tap/in4.test.lua         |   2 +-
 test/sql-tap/join.test.lua        |   2 +-
 test/sql-tap/tkt3493.test.lua     |   4 +-
 test/sql-tap/transitive1.test.lua |   4 +-
 test/sql-tap/with1.test.lua       |   2 +-
 test/sql/collation.result         |  79 ++++++++++++++++++++++++++
 test/sql/collation.test.lua       |  30 ++++++++++
 21 files changed, 313 insertions(+), 119 deletions(-)

-- 
2.15.1

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [tarantool-patches] [PATCH 1/3] sql: do not add explicit <COLLATE "BINARY"> clause
  2018-10-25 11:00 [tarantool-patches] [PATCH 0/3] Change collation compatibility rules according to ANSI SQL Nikita Pettik
@ 2018-10-25 11:00 ` Nikita Pettik
  2018-10-25 11:00 ` [tarantool-patches] [PATCH 2/3] Add surrogate ID for BINARY collation Nikita Pettik
  2018-10-25 11:00 ` [tarantool-patches] [PATCH 3/3] sql: change collation compatibility rules Nikita Pettik
  2 siblings, 0 replies; 17+ messages in thread
From: Nikita Pettik @ 2018-10-25 11:00 UTC (permalink / raw)
  To: tarantool-patches; +Cc: v.shpilevoy, Nikita Pettik

We don't need to add explicit COLLATE "BINARY" clause since binary
collation is anyway default. On the other hand, it may confuse
due to ambiugty when comparing two terms.

Needed for #3185
---
 src/box/sql/fkey.c      |  2 --
 src/box/sql/select.c    |  4 ----
 src/box/sql/whereexpr.c | 25 ++++++++++++-------------
 3 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/src/box/sql/fkey.c b/src/box/sql/fkey.c
index b2d2a19e7..ee1de8e18 100644
--- a/src/box/sql/fkey.c
+++ b/src/box/sql/fkey.c
@@ -313,8 +313,6 @@ exprTableRegister(Parse * pParse,	/* Parsing and code generating context */
 			pExpr->iTable = regBase + iCol + 1;
 			char affinity = pTab->def->fields[iCol].affinity;
 			pExpr->affinity = affinity;
-			pExpr = sqlite3ExprAddCollateString(pParse, pExpr,
-							    "binary");
 		} else {
 			pExpr->iTable = regBase;
 			pExpr->affinity = AFFINITY_INTEGER;
diff --git a/src/box/sql/select.c b/src/box/sql/select.c
index 02c0a5d26..1ec92aac7 100644
--- a/src/box/sql/select.c
+++ b/src/box/sql/select.c
@@ -2311,10 +2311,6 @@ sql_multiselect_orderby_to_key_info(struct Parse *parse, struct Select *s,
 				order_by->a[i].pExpr =
 					sqlite3ExprAddCollateString(parse, term,
 								    name);
-			} else {
-				order_by->a[i].pExpr =
-					sqlite3ExprAddCollateString(parse, term,
-								    "BINARY");
 			}
 		}
 		part->coll_id = id;
diff --git a/src/box/sql/whereexpr.c b/src/box/sql/whereexpr.c
index 612868695..9fa6ce15d 100644
--- a/src/box/sql/whereexpr.c
+++ b/src/box/sql/whereexpr.c
@@ -1133,7 +1133,6 @@ exprAnalyze(SrcList * pSrc,	/* the FROM clause */
 		Expr *pNewExpr2;
 		int idxNew1;
 		int idxNew2;
-		const char *zCollSeqName;	/* Name of collating sequence */
 		const u16 wtFlags = TERM_LIKEOPT | TERM_VIRTUAL | TERM_DYNAMIC;
 
 		pLeft = pExpr->x.pList->a[1].pExpr;
@@ -1171,24 +1170,24 @@ exprAnalyze(SrcList * pSrc,	/* the FROM clause */
 			}
 			*pC = c + 1;
 		}
-		/* Support only for unicode_ci indexes by now */
-		zCollSeqName = noCase ? "unicode_ci" : "BINARY";
 		pNewExpr1 = sqlite3ExprDup(db, pLeft, 0);
-		pNewExpr1 = sqlite3PExpr(pParse, TK_GE,
-					 sqlite3ExprAddCollateString(pParse,
-								     pNewExpr1,
-								     zCollSeqName),
-					 pStr1);
+		if (noCase) {
+			pNewExpr1 =
+				sqlite3ExprAddCollateString(pParse, pNewExpr1,
+							    "unicode_ci");
+		}
+		pNewExpr1 = sqlite3PExpr(pParse, TK_GE, pNewExpr1, pStr1);
 		transferJoinMarkings(pNewExpr1, pExpr);
 		idxNew1 = whereClauseInsert(pWC, pNewExpr1, wtFlags);
 		testcase(idxNew1 == 0);
 		exprAnalyze(pSrc, pWC, idxNew1);
 		pNewExpr2 = sqlite3ExprDup(db, pLeft, 0);
-		pNewExpr2 = sqlite3PExpr(pParse, TK_LT,
-					 sqlite3ExprAddCollateString(pParse,
-								     pNewExpr2,
-								     zCollSeqName),
-					 pStr2);
+		if (noCase) {
+			pNewExpr2 =
+				sqlite3ExprAddCollateString(pParse, pNewExpr2,
+							    "unicode_ci");
+		}
+		pNewExpr2 = sqlite3PExpr(pParse, TK_LT, pNewExpr2, pStr2);
 		transferJoinMarkings(pNewExpr2, pExpr);
 		idxNew2 = whereClauseInsert(pWC, pNewExpr2, wtFlags);
 		testcase(idxNew2 == 0);
-- 
2.15.1

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [tarantool-patches] [PATCH 2/3] Add surrogate ID for BINARY collation
  2018-10-25 11:00 [tarantool-patches] [PATCH 0/3] Change collation compatibility rules according to ANSI SQL Nikita Pettik
  2018-10-25 11:00 ` [tarantool-patches] [PATCH 1/3] sql: do not add explicit <COLLATE "BINARY"> clause Nikita Pettik
@ 2018-10-25 11:00 ` Nikita Pettik
  2018-10-31 12:34   ` [tarantool-patches] " Vladislav Shpilevoy
  2018-10-25 11:00 ` [tarantool-patches] [PATCH 3/3] sql: change collation compatibility rules Nikita Pettik
  2 siblings, 1 reply; 17+ messages in thread
From: Nikita Pettik @ 2018-10-25 11:00 UTC (permalink / raw)
  To: tarantool-patches; +Cc: v.shpilevoy, Nikita Pettik

BINARY collation is not really collation - no one object represents it.
It is set by default, unless otherwise stated. However, according to
ANSI SQL there is difference between "no collation" and "explicitly
specified BINARY collation". So, alongside with COLL_NONE id indicating
the absence of collation, lets introduce COLL_BINARY id to outline the
fact that BINARY collation is set and should be forced during string
comparison.

Part of #3185
---
 src/box/key_def.c           |  2 +-
 src/box/key_def.h           | 17 +++++++++++++++++
 src/box/lua/space.cc        |  2 +-
 src/box/sql/callback.c      |  6 +++++-
 src/box/tuple_format.c      |  2 +-
 test/sql/collation.result   | 16 ++++++++++++++++
 test/sql/collation.test.lua |  8 ++++++++
 7 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/src/box/key_def.c b/src/box/key_def.c
index 3a560bb06..dd47e5d75 100644
--- a/src/box/key_def.c
+++ b/src/box/key_def.c
@@ -174,7 +174,7 @@ key_def_new(const struct key_part_def *parts, uint32_t part_count)
 	for (uint32_t i = 0; i < part_count; i++) {
 		const struct key_part_def *part = &parts[i];
 		struct coll *coll = NULL;
-		if (part->coll_id != COLL_NONE) {
+		if (! coll_is_missing(part->coll_id)) {
 			struct coll_id *coll_id = coll_by_id(part->coll_id);
 			if (coll_id == NULL) {
 				diag_set(ClientError, ER_WRONG_INDEX_OPTIONS,
diff --git a/src/box/key_def.h b/src/box/key_def.h
index 20e79f9fe..ecdc199d9 100644
--- a/src/box/key_def.h
+++ b/src/box/key_def.h
@@ -78,6 +78,23 @@ extern const struct key_part_def key_part_def_default;
  */
 #define COLL_NONE UINT32_MAX
 
+/**
+ * In SQL explicitly specified binary collation and absence of
+ * any collation are different in behaviour: according to ANSI
+ * it is prohibited to compare strings with different explicitly
+ * indicated collations. However, if one of collation is default,
+ * (i.e. absent) the second one will be forced.
+ * So, lets introduce another id to indicate explicitly specified
+ * binary collation.
+ */
+#define COLL_BINARY (UINT32_MAX - 1)
+
+static inline bool
+coll_is_missing(uint32_t coll_id)
+{
+	return coll_id == COLL_NONE || coll_id == COLL_BINARY;
+}
+
 /** Descriptor of a single part in a multipart key. */
 struct key_part {
 	/** Tuple field index for this part */
diff --git a/src/box/lua/space.cc b/src/box/lua/space.cc
index c75ba4782..0207639a1 100644
--- a/src/box/lua/space.cc
+++ b/src/box/lua/space.cc
@@ -299,7 +299,7 @@ lbox_fillspace(struct lua_State *L, struct space *space, int i)
 			lua_pushboolean(L, key_part_is_nullable(part));
 			lua_setfield(L, -2, "is_nullable");
 
-			if (part->coll_id != COLL_NONE) {
+			if (! coll_is_missing(part->coll_id)) {
 				struct coll_id *coll_id =
 					coll_by_id(part->coll_id);
 				assert(coll_id != NULL);
diff --git a/src/box/sql/callback.c b/src/box/sql/callback.c
index 3cf3a835d..d4789257f 100644
--- a/src/box/sql/callback.c
+++ b/src/box/sql/callback.c
@@ -42,10 +42,14 @@
 struct coll *
 sql_get_coll_seq(Parse *parser, const char *name, uint32_t *coll_id)
 {
-	if (name == NULL || strcasecmp(name, "binary") == 0) {
+	if (name == NULL) {
 		*coll_id = COLL_NONE;
 		return NULL;
 	}
+	if (strcasecmp(name, "binary") == 0) {
+		*coll_id = COLL_BINARY;
+		return NULL;
+	}
 	struct coll_id *p = coll_by_name(name, strlen(name));
 	if (p == NULL) {
 		*coll_id = COLL_NONE;
diff --git a/src/box/tuple_format.c b/src/box/tuple_format.c
index 1b36a53d6..2f28f18df 100644
--- a/src/box/tuple_format.c
+++ b/src/box/tuple_format.c
@@ -67,7 +67,7 @@ tuple_format_create(struct tuple_format *format, struct key_def * const *keys,
 		format->fields[i].nullable_action = fields[i].nullable_action;
 		struct coll *coll = NULL;
 		uint32_t cid = fields[i].coll_id;
-		if (cid != COLL_NONE) {
+		if (! coll_is_missing(cid)) {
 			struct coll_id *coll_id = coll_by_id(cid);
 			if (coll_id == NULL) {
 				diag_set(ClientError,ER_WRONG_COLLATION_OPTIONS,
diff --git a/test/sql/collation.result b/test/sql/collation.result
index 79ba9abc0..f6254f866 100644
--- a/test/sql/collation.result
+++ b/test/sql/collation.result
@@ -107,6 +107,22 @@ cn:execute('select 1 limit ? collate not_exist', {1})
 cn:close()
 ---
 ...
+-- Explicitly set BINARY collation has ID.
+--
+box.sql.execute("CREATE TABLE t (id INT PRIMARY KEY, a TEXT, b TEXT COLLATE BINARY);")
+---
+...
+box.space.T:format()[2]['collation']
+---
+- null
+...
+box.space.T:format()[3]['collation']
+---
+- 4294967294
+...
+box.sql.execute("DROP TABLE t;")
+---
+...
 box.schema.user.revoke('guest', 'read,write,execute', 'universe')
 ---
 ...
diff --git a/test/sql/collation.test.lua b/test/sql/collation.test.lua
index 935dea824..f9d653717 100644
--- a/test/sql/collation.test.lua
+++ b/test/sql/collation.test.lua
@@ -42,4 +42,12 @@ cn = remote.connect(box.cfg.listen)
 cn:execute('select 1 limit ? collate not_exist', {1})
 
 cn:close()
+
+-- Explicitly set BINARY collation has ID.
+--
+box.sql.execute("CREATE TABLE t (id INT PRIMARY KEY, a TEXT, b TEXT COLLATE BINARY);")
+box.space.T:format()[2]['collation']
+box.space.T:format()[3]['collation']
+box.sql.execute("DROP TABLE t;")
+
 box.schema.user.revoke('guest', 'read,write,execute', 'universe')
-- 
2.15.1

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [tarantool-patches] [PATCH 3/3] sql: change collation compatibility rules
  2018-10-25 11:00 [tarantool-patches] [PATCH 0/3] Change collation compatibility rules according to ANSI SQL Nikita Pettik
  2018-10-25 11:00 ` [tarantool-patches] [PATCH 1/3] sql: do not add explicit <COLLATE "BINARY"> clause Nikita Pettik
  2018-10-25 11:00 ` [tarantool-patches] [PATCH 2/3] Add surrogate ID for BINARY collation Nikita Pettik
@ 2018-10-25 11:00 ` Nikita Pettik
  2018-10-31 12:34   ` [tarantool-patches] " Vladislav Shpilevoy
  2 siblings, 1 reply; 17+ messages in thread
From: Nikita Pettik @ 2018-10-25 11:00 UTC (permalink / raw)
  To: tarantool-patches; +Cc: v.shpilevoy, Nikita Pettik

Before this patch our SQL implementation relied on strange rules
when comparing strings with different collations:

- if either operand has an explicit collating function assignment using
  the postfix COLLATE operator, then the explicit collating function is
  used for comparison, with precedence to the collating function of the
  left operand;

- if either operand is a column, then the collating function of that
  column is used with precedence to the left operand.

The main concern in this implementation is that collation of the left
operand is forced over right one (even if both operands come with
explicit COLLATE clause). This contradicts ANSI SQL and seems to be
quite misleading, since if user simple swap LHS and RHS - result of
query may change.

Lets introduce restrictions concerning collations compatibilities.
Collations of LHS and RHS are compatible (i.e. no "Illegal mix of
collations" is thrown) if:

- one of collations is mentioned alongside with explicit COLLATE clause,
  which forces this collation over another one. It is allowed to have
  the same forced collations;

- both collations are derived from table columns and they are the same;

- one collation is derived from table column and another one is not
  specified (i.e. COLL_NONE).

The compound SELECT operators UNION, INTERSECT and EXCEPT perform implicit
comparisons between values. Hence, all new rules stated above are
applied to parts of compound SELECT. Otherwise, an error is raised.
In other words, before current patch queries like below were allowed:

SELECT 'abc' COLLATE binary UNION SELECT 'ABC' COLLATE "unicode_ci";
---
- - ['ABC']
  - ['abc']
...

If we swap collations, we will get another result:

SELECT 'ABC' COLLATE "unicode_ci" UNION SELECT 'abc' COLLATE BINARY
---
- - ['abc']
...

Now such queries are illegal.

Closes #3185
---
 src/box/errcode.h                 |   1 +
 src/box/sql/expr.c                |  70 ++++++++++++++++++------
 src/box/sql/select.c              | 111 +++++++++++++++++++++-----------------
 src/box/sql/sqliteInt.h           |  28 +++++++++-
 src/box/sql/where.c               |  13 +++--
 src/box/sql/whereexpr.c           |  13 ++---
 test/box/misc.result              |   1 +
 test/sql-tap/e_select1.test.lua   |  12 ++---
 test/sql-tap/in4.test.lua         |   2 +-
 test/sql-tap/join.test.lua        |   2 +-
 test/sql-tap/tkt3493.test.lua     |   4 +-
 test/sql-tap/transitive1.test.lua |   4 +-
 test/sql-tap/with1.test.lua       |   2 +-
 test/sql/collation.result         |  63 ++++++++++++++++++++++
 test/sql/collation.test.lua       |  22 ++++++++
 15 files changed, 252 insertions(+), 96 deletions(-)

diff --git a/src/box/errcode.h b/src/box/errcode.h
index 04f4f34ee..d16702978 100644
--- a/src/box/errcode.h
+++ b/src/box/errcode.h
@@ -223,6 +223,7 @@ struct errcode_record {
 	/*168 */_(ER_DROP_FK_CONSTRAINT,	"Failed to drop foreign key constraint '%s': %s") \
 	/*169 */_(ER_NO_SUCH_CONSTRAINT,	"Constraint %s does not exist") \
 	/*170 */_(ER_CONSTRAINT_EXISTS,		"Constraint %s already exists") \
+	/*171 */_(ER_ILLEGAL_COLLATION_MIX,	"Illegal mix of collations") \
 
 /*
  * !IMPORTANT! Please follow instructions at start of the file
diff --git a/src/box/sql/expr.c b/src/box/sql/expr.c
index a13de4f0e..bd25ed656 100644
--- a/src/box/sql/expr.c
+++ b/src/box/sql/expr.c
@@ -161,10 +161,10 @@ sqlite3ExprSkipCollate(Expr * pExpr)
 }
 
 struct coll *
-sql_expr_coll(Parse *parse, Expr *p, bool *is_found, uint32_t *coll_id)
+sql_expr_coll(Parse *parse, Expr *p, bool *is_explicit_coll, uint32_t *coll_id)
 {
 	struct coll *coll = NULL;
-	*is_found = false;
+	*is_explicit_coll = false;
 	*coll_id = COLL_NONE;
 	while (p != NULL) {
 		int op = p->op;
@@ -177,7 +177,7 @@ sql_expr_coll(Parse *parse, Expr *p, bool *is_found, uint32_t *coll_id)
 		if (op == TK_COLLATE ||
 		    (op == TK_REGISTER && p->op2 == TK_COLLATE)) {
 			coll = sql_get_coll_seq(parse, p->u.zToken, coll_id);
-			*is_found = true;
+			*is_explicit_coll = true;
 			break;
 		}
 		if ((op == TK_AGG_COLUMN || op == TK_COLUMN ||
@@ -190,7 +190,6 @@ sql_expr_coll(Parse *parse, Expr *p, bool *is_found, uint32_t *coll_id)
 			if (j >= 0) {
 				coll = sql_column_collation(p->space_def, j,
 							    coll_id);
-				*is_found = true;
 			}
 			break;
 		}
@@ -316,25 +315,62 @@ binaryCompareP5(Expr * pExpr1, Expr * pExpr2, int jumpIfNull)
 	return aff;
 }
 
+int
+collations_are_compatible(uint32_t lhs_id, bool is_lhs_forced,
+			  uint32_t rhs_id, bool is_rhs_forced,
+			  uint32_t *res_id)
+{
+	assert(res_id != NULL);
+	if (is_lhs_forced && is_rhs_forced) {
+		if (lhs_id != rhs_id)
+			return -1;
+	}
+	if (is_lhs_forced) {
+		*res_id = lhs_id;
+		return 0;
+	}
+	if (is_rhs_forced) {
+		*res_id = rhs_id;
+		return 0;
+	}
+	if (lhs_id != rhs_id) {
+		if (lhs_id == COLL_NONE) {
+			*res_id = rhs_id;
+			return 0;
+		}
+		if (rhs_id == COLL_NONE) {
+			*res_id = lhs_id;
+			return 0;
+		}
+		return -1;
+	}
+	*res_id = lhs_id;
+	return 0;
+}
+
 struct coll *
 sql_binary_compare_coll_seq(Parse *parser, Expr *left, Expr *right,
 			    uint32_t *coll_id)
 {
-	struct coll *coll;
-	bool is_found;
 	assert(left != NULL);
-	if ((left->flags & EP_Collate) != 0) {
-		coll = sql_expr_coll(parser, left, &is_found, coll_id);
-	} else if (right != NULL && (right->flags & EP_Collate) != 0) {
-		coll = sql_expr_coll(parser, right, &is_found, coll_id);
-	} else {
-		coll = sql_expr_coll(parser, left, &is_found, coll_id);
-		if (! is_found) {
-			coll = sql_expr_coll(parser, right, &is_found,
-					     coll_id);
-		}
+	bool is_lhs_forced;
+	bool is_rhs_forced;
+	uint32_t lhs_coll_id;
+	uint32_t rhs_coll_id;
+	struct coll *lhs_coll = sql_expr_coll(parser, left, &is_lhs_forced,
+					      &lhs_coll_id);
+	struct coll *rhs_coll = sql_expr_coll(parser, right, &is_rhs_forced,
+					      &rhs_coll_id);
+	if (collations_are_compatible(lhs_coll_id, is_lhs_forced,
+				      rhs_coll_id, is_rhs_forced,
+				      coll_id) != 0) {
+		diag_set(ClientError, ER_ILLEGAL_COLLATION_MIX);
+		parser->rc = SQL_TARANTOOL_ERROR;
+		parser->nErr++;
+		*coll_id = COLL_NONE;
+		return NULL;
 	}
-	return coll;
+	return *coll_id == rhs_coll_id ? rhs_coll : lhs_coll;;
 }
 
 /*
diff --git a/src/box/sql/select.c b/src/box/sql/select.c
index 1ec92aac7..c3403670b 100644
--- a/src/box/sql/select.c
+++ b/src/box/sql/select.c
@@ -1024,7 +1024,7 @@ selectInnerLoop(Parse * pParse,		/* The parser context */
 								  regPrev + i);
 						VdbeCoverage(v);
 					}
-					if (is_found) {
+					if (coll != NULL) {
 						sqlite3VdbeChangeP4(v, -1,
 								    (const char *)coll,
 								    P4_COLLSEQ);
@@ -2035,8 +2035,9 @@ sqlite3SelectAddColumnTypeAndCollation(Parse * pParse,		/* Parsing contexts */
 		bool is_found;
 		uint32_t coll_id;
 		if (pTab->def->fields[i].coll_id == COLL_NONE &&
-		    sql_expr_coll(pParse, p, &is_found, &coll_id) && is_found)
+		    sql_expr_coll(pParse, p, &is_found, &coll_id) && coll_id != COLL_NONE)
 			pTab->def->fields[i].coll_id = coll_id;
+		assert(pTab->def->fields[i].coll_id != COLL_BINARY);
 	}
 }
 
@@ -2223,47 +2224,53 @@ computeLimitRegisters(Parse * pParse, Select * p, int iBreak)
 }
 
 #ifndef SQLITE_OMIT_COMPOUND_SELECT
-static struct coll *
-multi_select_coll_seq_r(Parse *parser, Select *p, int n, bool *is_found,
-			uint32_t *coll_id)
+/**
+ * This function determines resulting collation sequence for
+ * @n-th column of the result set for the compound SELECT
+ * statement. Since compound SELECT performs implicit comparisons
+ * between values, all parts of compound queries must use
+ * the same collation. Otherwise, an error is raised.
+ *
+ * @param parser Parse context.
+ * @param p Select meta-information.
+ * @param n Column number of the result set.
+ * @param is_forced_coll Used if we fall into recursion.
+ *        For most-outer call it is unused. Used to indicate that
+ *        explicit COLLATE clause is used.
+ * @retval Id of collation to be used during string comparison.
+ */
+static uint32_t
+multi_select_coll_seq(struct Parse *parser, struct Select *p, int n,
+		      bool *is_forced_coll)
 {
-	struct coll *coll;
+	bool is_prior_forced = false;
+	bool is_current_forced;
+	uint32_t prior_coll_id = COLL_NONE;
+	uint32_t current_coll_id;
 	if (p->pPrior != NULL) {
-		coll = multi_select_coll_seq_r(parser, p->pPrior, n, is_found,
-					       coll_id);
-	} else {
-		coll = NULL;
-		*coll_id = COLL_NONE;
+		prior_coll_id = multi_select_coll_seq(parser, p->pPrior, n,
+						      &is_prior_forced);
 	}
-	assert(n >= 0);
-	/* iCol must be less than p->pEList->nExpr.  Otherwise an error would
-	 * have been thrown during name resolution and we would not have gotten
-	 * this far
+	/*
+	 * Column number must be less than p->pEList->nExpr.
+	 * Otherwise an error would have been thrown during name
+	 * resolution and we would not have gotten this far.
 	 */
-	if (!*is_found && ALWAYS(n < p->pEList->nExpr)) {
-		coll = sql_expr_coll(parser, p->pEList->a[n].pExpr, is_found,
-				     coll_id);
-	}
-	return coll;
-}
-
-/**
- * The collating sequence for the compound select is taken from the
- * left-most term of the select that has a collating sequence.
- * @param parser Parser.
- * @param p Select.
- * @param n Column number.
- * @param[out] coll_id Collation identifer.
- * @retval The appropriate collating sequence for the n-th column
- *         of the result set for the compound-select statement
- *         "p".
- * @retval NULL The column has no default collating sequence.
- */
-static inline struct coll *
-multi_select_coll_seq(Parse *parser, Select *p, int n, uint32_t *coll_id)
-{
-	bool is_found = false;
-	return multi_select_coll_seq_r(parser, p, n, &is_found, coll_id);
+	assert(n >= 0 && n < p->pEList->nExpr);
+	sql_expr_coll(parser, p->pEList->a[n].pExpr, &is_current_forced,
+		      &current_coll_id);
+	uint32_t res_coll_id;
+	if (collations_are_compatible(prior_coll_id, is_prior_forced,
+				      current_coll_id, is_current_forced,
+				      &res_coll_id) != 0) {
+		diag_set(ClientError, ER_ILLEGAL_COLLATION_MIX);
+		parser->rc = SQL_TARANTOOL_ERROR;
+		parser->nErr++;
+		*is_forced_coll = false;
+		return COLL_NONE;
+	}
+	*is_forced_coll = (is_prior_forced || is_current_forced);
+	return res_coll_id;
 }
 
 /**
@@ -2300,12 +2307,13 @@ sql_multiselect_orderby_to_key_info(struct Parse *parse, struct Select *s,
 		struct ExprList_item *item = &order_by->a[i];
 		struct Expr *term = item->pExpr;
 		uint32_t id;
+		bool unused;
 		if ((term->flags & EP_Collate) != 0) {
-			bool is_found = false;
-			sql_expr_coll(parse, term, &is_found, &id);
+			sql_expr_coll(parse, term, &unused, &id);
 		} else {
-			multi_select_coll_seq(parse, s,
-					      item->u.x.iOrderByCol - 1, &id);
+			id = multi_select_coll_seq(parse, s,
+						   item->u.x.iOrderByCol - 1,
+						   &unused);
 			if (id != COLL_NONE) {
 				const char *name = coll_by_id(id)->name;
 				order_by->a[i].pExpr =
@@ -2947,9 +2955,10 @@ multiSelect(Parse * pParse,	/* Parsing context */
 		struct sql_key_info *key_info = sql_key_info_new(db, nCol);
 		if (key_info == NULL)
 			goto multi_select_end;
+		bool unused;
 		for (int i = 0; i < nCol; i++) {
-			multi_select_coll_seq(pParse, p, i,
-					      &key_info->parts[i].coll_id);
+			key_info->parts[i].coll_id =
+				multi_select_coll_seq(pParse, p, i, &unused);
 		}
 
 		for (struct Select *pLoop = p; pLoop; pLoop = pLoop->pPrior) {
@@ -3374,10 +3383,12 @@ multiSelectOrderBy(Parse * pParse,	/* Parsing context */
 		pParse->nMem += expr_count + 1;
 		sqlite3VdbeAddOp2(v, OP_Integer, 0, regPrev);
 		key_info_dup = sql_key_info_new(db, expr_count);
+		bool unused;
 		if (key_info_dup != NULL) {
 			for (int i = 0; i < expr_count; i++) {
-				multi_select_coll_seq(pParse, p, i,
-					&key_info_dup->parts[i].coll_id);
+				key_info_dup->parts[i].coll_id =
+					multi_select_coll_seq(pParse, p, i,
+							      &unused);
 			}
 		}
 	}
@@ -5351,12 +5362,12 @@ updateAccumulator(Parse * pParse, AggInfo * pAggInfo)
 			struct ExprList_item *pItem;
 			int j;
 			assert(pList != 0);	/* pList!=0 if pF->pFunc has NEEDCOLL */
-			bool is_found = false;
+			bool unused;
 			uint32_t id;
-			for (j = 0, pItem = pList->a; !is_found && j < nArg;
+			for (j = 0, pItem = pList->a; coll == NULL && j < nArg;
 			     j++, pItem++) {
 				coll = sql_expr_coll(pParse, pItem->pExpr,
-						     &is_found, &id);
+						     &unused, &id);
 			}
 			if (regHit == 0 && pAggInfo->nAccumulator)
 				regHit = ++pParse->nMem;
diff --git a/src/box/sql/sqliteInt.h b/src/box/sql/sqliteInt.h
index 53188e74d..ae82f4643 100644
--- a/src/box/sql/sqliteInt.h
+++ b/src/box/sql/sqliteInt.h
@@ -4293,13 +4293,14 @@ const char *sqlite3ErrStr(int);
  *
  * @param parse Parsing context.
  * @param expr Expression to scan.
- * @param[out] is_found Flag set if collation was found.
+ * @param[out] is_found Flag set if explicit COLLATE clause is used.
  * @param[out] coll_id Collation identifier.
  *
  * @retval Pointer to collation.
  */
 struct coll *
-sql_expr_coll(Parse * pParse, Expr * pExpr, bool *is_found, uint32_t *coll_id);
+sql_expr_coll(Parse * pParse, Expr * pExpr, bool *is_explicit_coll,
+	      uint32_t *coll_id);
 
 Expr *sqlite3ExprAddCollateToken(Parse * pParse, Expr *, const Token *, int);
 Expr *sqlite3ExprAddCollateString(Parse *, Expr *, const char *);
@@ -4593,6 +4594,29 @@ int sqlite3VdbeParameterIndex(Vdbe *, const char *, int);
 int sqlite3TransferBindings(sqlite3_stmt *, sqlite3_stmt *);
 int sqlite3Reprepare(Vdbe *);
 void sqlite3ExprListCheckLength(Parse *, ExprList *, const char *);
+
+/**
+ * This function verifies that two collations (to be more precise
+ * their ids) are compatible. In terms of SQL ANSI they are
+ * compatible if:
+ *  - one of collations is mentioned alongside with explicit
+ *    COLLATE clause, which forces this collation over another
+ *    one. It is allowed to have the same forced collations;
+ * - both collations are derived from table columns and they
+ *   are the same;
+ * - one collation is derived from table column and another
+ *   one is not specified (i.e. COLL_NONE);
+ * In all other cases they are not accounted to be compatible
+ * and error should be raised.
+ * Collation to be used in comparison operator is returned
+ * via @res_id: in case one of collations is absent, then
+ * the second one is utilized.
+ */
+int
+collations_are_compatible(uint32_t lhs_id, bool is_lhs_forced,
+			  uint32_t rhs_id, bool is_rhs_forced,
+			  uint32_t *res_id);
+
 /**
  * Return a pointer to the collation sequence that should be used
  * by a binary comparison operator comparing left and right.
diff --git a/src/box/sql/where.c b/src/box/sql/where.c
index 01dcc49cb..b0a9f0651 100644
--- a/src/box/sql/where.c
+++ b/src/box/sql/where.c
@@ -304,11 +304,11 @@ whereScanNext(WhereScan * pScan)
 								Parse *pParse =
 									pWC->pWInfo->pParse;
 								assert(pX->pLeft);
-								uint32_t id;
+								uint32_t unused;
 								struct coll *coll =
 									sql_binary_compare_coll_seq(
 										pParse, pX->pLeft,
-										pX->pRight, &id);
+										pX->pRight, &unused);
 								if (coll != pScan->coll)
 									continue;
 							}
@@ -559,7 +559,7 @@ findIndexCol(Parse * pParse,	/* Parse context */
 			struct coll *coll = sql_expr_coll(pParse,
 							  pList->a[i].pExpr,
 							  &is_found, &id);
-			if (is_found && coll == part_to_match->coll)
+			if (coll == part_to_match->coll)
 				return i;
 		}
 	}
@@ -2289,8 +2289,8 @@ whereRangeVectorLen(Parse * pParse,	/* Parsing context */
 		    sqlite3TableColumnAffinity(space->def, pLhs->iColumn);
 		if (aff != idxaff)
 			break;
-		uint32_t id;
-		pColl = sql_binary_compare_coll_seq(pParse, pLhs, pRhs, &id);
+		uint32_t unused;
+		pColl = sql_binary_compare_coll_seq(pParse, pLhs, pRhs, &unused);
 		if (pColl == 0)
 			break;
 		if (idx_def->key_def->parts[i + nEq].coll != pColl)
@@ -3409,8 +3409,7 @@ wherePathSatisfiesOrderBy(WhereInfo * pWInfo,	/* The WHERE clause */
 								      &is_found, &id);
 						struct coll *idx_coll =
 							idx_def->key_def->parts[j].coll;
-						if (is_found &&
-						    coll != idx_coll)
+						if (coll != idx_coll)
 							continue;
 					}
 					isMatch = 1;
diff --git a/src/box/sql/whereexpr.c b/src/box/sql/whereexpr.c
index 9fa6ce15d..971f428b7 100644
--- a/src/box/sql/whereexpr.c
+++ b/src/box/sql/whereexpr.c
@@ -168,7 +168,7 @@ exprCommute(Parse * pParse, Expr * pExpr)
 			bool is_found;
 			uint32_t id;
 			sql_expr_coll(pParse, pExpr->pLeft, &is_found, &id);
-			if (is_found) {
+			if (id != COLL_NONE) {
 				/*
 				 * Neither X nor Y have COLLATE
 				 * operators, but X has a
@@ -837,15 +837,16 @@ termIsEquivalence(Parse * pParse, Expr * pExpr)
 	    ) {
 		return 0;
 	}
-	uint32_t id;
+	uint32_t unused;
 	struct coll *coll1 =
 	    sql_binary_compare_coll_seq(pParse, pExpr->pLeft, pExpr->pRight,
-					&id);
+					&unused);
 	if (coll1 == NULL)
 		return 1;
-	bool unused;
-	coll1 = sql_expr_coll(pParse, pExpr->pLeft, &unused, &id);
-	struct coll *coll2 = sql_expr_coll(pParse, pExpr->pRight, &unused, &id);
+	bool unused1;
+	coll1 = sql_expr_coll(pParse, pExpr->pLeft, &unused1, &unused);
+	struct coll *coll2 = sql_expr_coll(pParse, pExpr->pRight, &unused1,
+					   &unused);
 	return coll1 != NULL && coll1 == coll2;
 }
 
diff --git a/test/box/misc.result b/test/box/misc.result
index 3d7317caf..e8c87db68 100644
--- a/test/box/misc.result
+++ b/test/box/misc.result
@@ -497,6 +497,7 @@ t;
   168: box.error.DROP_FK_CONSTRAINT
   169: box.error.NO_SUCH_CONSTRAINT
   170: box.error.CONSTRAINT_EXISTS
+  171: box.error.ILLEGAL_COLLATION_MIX
 ...
 test_run:cmd("setopt delimiter ''");
 ---
diff --git a/test/sql-tap/e_select1.test.lua b/test/sql-tap/e_select1.test.lua
index c8ad9039d..44b45461b 100755
--- a/test/sql-tap/e_select1.test.lua
+++ b/test/sql-tap/e_select1.test.lua
@@ -1,6 +1,6 @@
 #!/usr/bin/env tarantool
 test = require("sqltester")
-test:plan(541)
+test:plan(539)
 
 --!./tcltestrunner.lua
 -- 2010 July 16
@@ -1690,12 +1690,10 @@ test:do_select_tests(
         {"1", "SELECT 'abc'                UNION SELECT 'ABC'", {"ABC",  "abc"}},
         {"2", "SELECT 'abc' COLLATE \"unicode_ci\" UNION SELECT 'ABC'", {"ABC" }},
         {"3", "SELECT 'abc'                UNION SELECT 'ABC' COLLATE \"unicode_ci\"", {"ABC" }},
-        {"4", "SELECT 'abc' COLLATE binary UNION SELECT 'ABC' COLLATE \"unicode_ci\"", {"ABC",  "abc"}},
-        {"5", "SELECT 'abc' COLLATE \"unicode_ci\" UNION SELECT 'ABC' COLLATE binary", {"ABC" }},
-        {"6", "SELECT a FROM y1 UNION SELECT b FROM y1", {"abc" }},
-        {"7", "SELECT b FROM y1 UNION SELECT a FROM y1", {"Abc",  "abc"}},
-        {"8", "SELECT a FROM y1 UNION SELECT c FROM y1", {"aBC" }},
-        {"9", "SELECT a FROM y1 UNION SELECT c COLLATE binary FROM y1", {"aBC" }},
+        {"4", "SELECT 'abc' COLLATE binary UNION SELECT 'ABC'", {"ABC",  "abc"}},
+        {"5", "SELECT 'abc' COLLATE \"unicode_ci\" UNION SELECT 'ABC'", {"ABC" }},
+        {"6", "SELECT a FROM y1 UNION SELECT c FROM y1", {"aBC" }},
+        {"7", "SELECT a FROM y1 UNION SELECT c COLLATE binary FROM y1", {"Abc", "aBC" }},
     })
 
 -- EVIDENCE-OF: R-32706-07403 No affinity transformations are applied to
diff --git a/test/sql-tap/in4.test.lua b/test/sql-tap/in4.test.lua
index ac39c5fca..3ea0da653 100755
--- a/test/sql-tap/in4.test.lua
+++ b/test/sql-tap/in4.test.lua
@@ -531,7 +531,7 @@ test:do_execsql_test(
         SELECT c FROM t4a WHERE a=b ORDER BY c;
     ]], {
         -- <in4-4.1>
-        3
+        1, 3
         -- </in4-4.1>
     })
 
diff --git a/test/sql-tap/join.test.lua b/test/sql-tap/join.test.lua
index ac05a98b2..dcd1fbd4c 100755
--- a/test/sql-tap/join.test.lua
+++ b/test/sql-tap/join.test.lua
@@ -1003,7 +1003,7 @@ test:do_execsql_test(
         SELECT a,b FROM t2 NATURAL JOIN t1 
     ]], {
         -- <join-11.7>
-        "two", 2
+        "one", 1, "two", 2
         -- </join-11.7>
     })
 
diff --git a/test/sql-tap/tkt3493.test.lua b/test/sql-tap/tkt3493.test.lua
index 85f3c089f..c074d9446 100755
--- a/test/sql-tap/tkt3493.test.lua
+++ b/test/sql-tap/tkt3493.test.lua
@@ -246,7 +246,7 @@ test:do_execsql_test(
 test:do_execsql_test(
     "tkt3493-3.1",
     [[
-        CREATE TABLE t2(a COLLATE "unicode_ci" PRIMARY KEY, b COLLATE BINARY);
+        CREATE TABLE t2(a COLLATE "unicode_ci" PRIMARY KEY, b);
         INSERT INTO t2 VALUES('aBc', 'DeF');
     ]], {
         -- <tkt3493-3.1>
@@ -304,7 +304,7 @@ test:do_execsql_test(
         SELECT b>a FROM t2 GROUP BY a, b
     ]], {
         -- <tkt3493-3.3.3>
-        0
+        1
         -- </tkt3493-3.3.3>
     })
 
diff --git a/test/sql-tap/transitive1.test.lua b/test/sql-tap/transitive1.test.lua
index ed3238f04..3c0a36ba9 100755
--- a/test/sql-tap/transitive1.test.lua
+++ b/test/sql-tap/transitive1.test.lua
@@ -503,7 +503,7 @@ test:do_execsql_test(
         SELECT * FROM c1 WHERE x=y AND z=y AND z='abc';
     ]], {
         -- <transitive1-570>
-
+        1, "ABC", "ABC", "abc"
         -- </transitive1-570>
     })
 
@@ -516,7 +516,7 @@ test:do_execsql_test(
         SELECT * FROM c1 WHERE x=y AND z=y AND z='abc';
     ]], {
         -- <transitive1-570eqp>
-        "/SEARCH TABLE C1 USING COVERING INDEX C1X/"
+        "/SCAN TABLE C1/"
         -- </transitive1-570eqp>
     })
 
diff --git a/test/sql-tap/with1.test.lua b/test/sql-tap/with1.test.lua
index c6a895875..9c04cf2a6 100755
--- a/test/sql-tap/with1.test.lua
+++ b/test/sql-tap/with1.test.lua
@@ -880,7 +880,7 @@ test:do_execsql_test("10.8.4.2", [[
   SELECT a FROM tst UNION ALL SELECT b COLLATE "unicode_ci" FROM tst ORDER BY 1;
 ]], {
   -- <10.8.4.2>
-  "A", "B", "C", "a", "b", "c"
+  "a", "A", "b", "B", "c", "C"
   -- </10.8.4.2>
 })
 
diff --git a/test/sql/collation.result b/test/sql/collation.result
index f6254f866..c5934d4e2 100644
--- a/test/sql/collation.result
+++ b/test/sql/collation.result
@@ -123,6 +123,69 @@ box.space.T:format()[3]['collation']
 box.sql.execute("DROP TABLE t;")
 ---
 ...
+-- gh-3185: collations of LHS and RHS must be compatible.
+--
+box.sql.execute("CREATE TABLE t (id INT PRIMARY KEY, a TEXT, b TEXT COLLATE BINARY, c TEXT COLLATE \"unicode_ci\");")
+---
+...
+box.sql.execute("SELECT * FROM t WHERE a = b;")
+---
+- []
+...
+box.sql.execute("SELECT * FROM t WHERE a COLLATE BINARY = b;")
+---
+- []
+...
+box.sql.execute("SELECT * FROM t WHERE b = c;")
+---
+- error: Illegal mix of collations
+...
+box.sql.execute("SELECT * FROM t WHERE b COLLATE BINARY = c;")
+---
+- []
+...
+box.sql.execute("SELECT * FROM t WHERE a = c;")
+---
+- []
+...
+box.sql.execute("SELECT * FROM t WHERE a COLLATE BINARY = c COLLATE \"unicode\";")
+---
+- error: Illegal mix of collations
+...
+-- Compound queries perform implicit comparisons between values.
+-- Hence, rules for collations compatibilities are the same.
+--
+box.sql.execute("SELECT 'abc' COLLATE binary UNION SELECT 'ABC' COLLATE \"unicode_ci\"")
+---
+- error: Illegal mix of collations
+...
+box.sql.execute("SELECT 'abc' COLLATE \"unicode_ci\" UNION SELECT 'ABC' COLLATE binary")
+---
+- error: Illegal mix of collations
+...
+box.sql.execute("SELECT c FROM t UNION SELECT b FROM t;")
+---
+- error: Illegal mix of collations
+...
+box.sql.execute("SELECT b FROM t UNION SELECT a FROM t;")
+---
+- []
+...
+box.sql.execute("SELECT a FROM t UNION SELECT c FROM t;")
+---
+- []
+...
+box.sql.execute("SELECT c COLLATE BINARY FROM t UNION SELECT a FROM t;")
+---
+- []
+...
+box.sql.execute("SELECT b COLLATE \"unicode\" FROM t UNION SELECT a FROM t;")
+---
+- []
+...
+box.sql.execute("DROP TABLE t;")
+---
+...
 box.schema.user.revoke('guest', 'read,write,execute', 'universe')
 ---
 ...
diff --git a/test/sql/collation.test.lua b/test/sql/collation.test.lua
index f9d653717..decab2560 100644
--- a/test/sql/collation.test.lua
+++ b/test/sql/collation.test.lua
@@ -50,4 +50,26 @@ box.space.T:format()[2]['collation']
 box.space.T:format()[3]['collation']
 box.sql.execute("DROP TABLE t;")
 
+-- gh-3185: collations of LHS and RHS must be compatible.
+--
+box.sql.execute("CREATE TABLE t (id INT PRIMARY KEY, a TEXT, b TEXT COLLATE BINARY, c TEXT COLLATE \"unicode_ci\");")
+box.sql.execute("SELECT * FROM t WHERE a = b;")
+box.sql.execute("SELECT * FROM t WHERE a COLLATE BINARY = b;")
+box.sql.execute("SELECT * FROM t WHERE b = c;")
+box.sql.execute("SELECT * FROM t WHERE b COLLATE BINARY = c;")
+box.sql.execute("SELECT * FROM t WHERE a = c;")
+box.sql.execute("SELECT * FROM t WHERE a COLLATE BINARY = c COLLATE \"unicode\";")
+
+-- Compound queries perform implicit comparisons between values.
+-- Hence, rules for collations compatibilities are the same.
+--
+box.sql.execute("SELECT 'abc' COLLATE binary UNION SELECT 'ABC' COLLATE \"unicode_ci\"")
+box.sql.execute("SELECT 'abc' COLLATE \"unicode_ci\" UNION SELECT 'ABC' COLLATE binary")
+box.sql.execute("SELECT c FROM t UNION SELECT b FROM t;")
+box.sql.execute("SELECT b FROM t UNION SELECT a FROM t;")
+box.sql.execute("SELECT a FROM t UNION SELECT c FROM t;")
+box.sql.execute("SELECT c COLLATE BINARY FROM t UNION SELECT a FROM t;")
+box.sql.execute("SELECT b COLLATE \"unicode\" FROM t UNION SELECT a FROM t;")
+
+box.sql.execute("DROP TABLE t;")
 box.schema.user.revoke('guest', 'read,write,execute', 'universe')
-- 
2.15.1

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [tarantool-patches] Re: [PATCH 3/3] sql: change collation compatibility rules
  2018-10-25 11:00 ` [tarantool-patches] [PATCH 3/3] sql: change collation compatibility rules Nikita Pettik
@ 2018-10-31 12:34   ` Vladislav Shpilevoy
  2018-11-12 23:46     ` n.pettik
  0 siblings, 1 reply; 17+ messages in thread
From: Vladislav Shpilevoy @ 2018-10-31 12:34 UTC (permalink / raw)
  To: Nikita Pettik, tarantool-patches

Thanks for the patch! See 5 comments below.

> diff --git a/src/box/sql/expr.c b/src/box/sql/expr.c
> index a13de4f0e..bd25ed656 100644
> --- a/src/box/sql/expr.c
> +++ b/src/box/sql/expr.c
> @@ -316,25 +315,62 @@ binaryCompareP5(Expr * pExpr1, Expr * pExpr2, int jumpIfNull)
>   	return aff;
>   }
>   
> +int
> +collations_are_compatible(uint32_t lhs_id, bool is_lhs_forced,
> +			  uint32_t rhs_id, bool is_rhs_forced,
> +			  uint32_t *res_id)

1. Why not to rename to _check_compatibility() and set
diag right here?

> +{
> +	assert(res_id != NULL);
> +	if (is_lhs_forced && is_rhs_forced) {
> +		if (lhs_id != rhs_id)
> +			return -1;
> +	}
> +	if (is_lhs_forced) {
> +		*res_id = lhs_id;
> +		return 0;
> +	}
> +	if (is_rhs_forced) {
> +		*res_id = rhs_id;
> +		return 0;
> +	}
> +	if (lhs_id != rhs_id) {
> +		if (lhs_id == COLL_NONE) {
> +			*res_id = rhs_id;
> +			return 0;
> +		}
> +		if (rhs_id == COLL_NONE) {
> +			*res_id = lhs_id;
> +			return 0;
> +		}
> +		return -1;
> +	}
> +	*res_id = lhs_id;
> +	return 0;
> +}
> +
>   struct coll *
>   sql_binary_compare_coll_seq(Parse *parser, Expr *left, Expr *right,
>   			    uint32_t *coll_id)
>   {
> -	struct coll *coll;
> -	bool is_found;
>   	assert(left != NULL);
> -	if ((left->flags & EP_Collate) != 0) {
> -		coll = sql_expr_coll(parser, left, &is_found, coll_id);
> -	} else if (right != NULL && (right->flags & EP_Collate) != 0) {
> -		coll = sql_expr_coll(parser, right, &is_found, coll_id);
> -	} else {
> -		coll = sql_expr_coll(parser, left, &is_found, coll_id);
> -		if (! is_found) {
> -			coll = sql_expr_coll(parser, right, &is_found,
> -					     coll_id);
> -		}
> +	bool is_lhs_forced;
> +	bool is_rhs_forced;
> +	uint32_t lhs_coll_id;
> +	uint32_t rhs_coll_id;
> +	struct coll *lhs_coll = sql_expr_coll(parser, left, &is_lhs_forced,
> +					      &lhs_coll_id);
> +	struct coll *rhs_coll = sql_expr_coll(parser, right, &is_rhs_forced,
> +					      &rhs_coll_id);
> +	if (collations_are_compatible(lhs_coll_id, is_lhs_forced,
> +				      rhs_coll_id, is_rhs_forced,
> +				      coll_id) != 0) {
> +		diag_set(ClientError, ER_ILLEGAL_COLLATION_MIX);
> +		parser->rc = SQL_TARANTOOL_ERROR;
> +		parser->nErr++;
> +		*coll_id = COLL_NONE;

3. Why do you need to set coll_id in a case of an error? As I
understand, after an error the compilation is terminated.

> +		return NULL;
>   	}
> -	return coll;
> +	return *coll_id == rhs_coll_id ? rhs_coll : lhs_coll;;
>   }
>   
>   /*
> diff --git a/src/box/sql/select.c b/src/box/sql/select.c
> index 1ec92aac7..c3403670b 100644
> --- a/src/box/sql/select.c
> +++ b/src/box/sql/select.c
> @@ -2035,8 +2035,9 @@ sqlite3SelectAddColumnTypeAndCollation(Parse * pParse,		/* Parsing contexts */
>   		bool is_found;
>   		uint32_t coll_id;
>   		if (pTab->def->fields[i].coll_id == COLL_NONE &&
> -		    sql_expr_coll(pParse, p, &is_found, &coll_id) && is_found)
> +		    sql_expr_coll(pParse, p, &is_found, &coll_id) && coll_id != COLL_NONE)
>   			pTab->def->fields[i].coll_id = coll_id;
> +		assert(pTab->def->fields[i].coll_id != COLL_BINARY);

4. Why? I can create a table with binary collation after the
previous commit.

>   	}
>   }
>   
> @@ -2223,47 +2224,53 @@ computeLimitRegisters(Parse * pParse, Select * p, int iBreak)
> +static uint32_t
> +multi_select_coll_seq(struct Parse *parser, struct Select *p, int n,
> +		      bool *is_forced_coll)
>   {
> -	struct coll *coll;
> +	bool is_prior_forced = false;
> +	bool is_current_forced;
> +	uint32_t prior_coll_id = COLL_NONE;
> +	uint32_t current_coll_id;
>   	if (p->pPrior != NULL) {
> -		coll = multi_select_coll_seq_r(parser, p->pPrior, n, is_found,
> -					       coll_id);
> -	} else {
> -		coll = NULL;
> -		*coll_id = COLL_NONE;
> +		prior_coll_id = multi_select_coll_seq(parser, p->pPrior, n,
> +						      &is_prior_forced);
>   	}
> -	assert(n >= 0);
> -	/* iCol must be less than p->pEList->nExpr.  Otherwise an error would
> -	 * have been thrown during name resolution and we would not have gotten
> -	 * this far
> +	/*
> +	 * Column number must be less than p->pEList->nExpr.
> +	 * Otherwise an error would have been thrown during name
> +	 * resolution and we would not have gotten this far.

5. 'have gotten'? Maybe 'have got'?

>   	 */
> -	if (!*is_found && ALWAYS(n < p->pEList->nExpr)) {
> -		coll = sql_expr_coll(parser, p->pEList->a[n].pExpr, is_found,
> -				     coll_id);
> -	}
> -	return coll;
> -}

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [tarantool-patches] Re: [PATCH 2/3] Add surrogate ID for BINARY collation
  2018-10-25 11:00 ` [tarantool-patches] [PATCH 2/3] Add surrogate ID for BINARY collation Nikita Pettik
@ 2018-10-31 12:34   ` Vladislav Shpilevoy
  2018-10-31 15:47     ` n.pettik
  0 siblings, 1 reply; 17+ messages in thread
From: Vladislav Shpilevoy @ 2018-10-31 12:34 UTC (permalink / raw)
  To: Nikita Pettik, tarantool-patches

Hi! Thanks for the patch! See 3 comments below.

> diff --git a/src/box/key_def.h b/src/box/key_def.h
> index 20e79f9fe..ecdc199d9 100644
> --- a/src/box/key_def.h
> +++ b/src/box/key_def.h
> @@ -78,6 +78,23 @@ extern const struct key_part_def key_part_def_default;
>    */
>   #define COLL_NONE UINT32_MAX
>   
> +/**
> + * In SQL explicitly specified binary collation and absence of
> + * any collation are different in behaviour: according to ANSI
> + * it is prohibited to compare strings with different explicitly
> + * indicated collations. However, if one of collation is default,
> + * (i.e. absent) the second one will be forced.
> + * So, lets introduce another id to indicate explicitly specified
> + * binary collation.

1. Sorry, I am not sure that we can use imperative while
describing not functions.

2. I see, that you actually created a 'phantom' collation.
A collation, that has no a record in the collation cache,
but is visible to a user via space format. I think for
externally visible changes you should consult Kostja.
Alternatively, it is possible to create binary collation
in the same way as unicode and unicode_ci - via insertion
into _collation in upgrade script.

Also, I see a bug that we can create a collation in
_collation with id = COLL_NONE and COLL_BINARY, but which
actually are not NONE nor BINARY. Storing such identifiers
in _collation should be prohibited (if we will leave current
'phantom' binary collation as is). Furthermore COLL_NONE
for unknown reason is declared in key_def.h instead of
coll_id.h. It should be moved out. It is worth to create a
separate commit with refactoring right before this one.

> + */
> +#define COLL_BINARY (UINT32_MAX - 1)
> +
> +static inline bool
> +coll_is_missing(uint32_t coll_id)
> +{
> +	return coll_id == COLL_NONE || coll_id == COLL_BINARY;
> +}

Hence, this should be moved to coll_id.h as well and
renamed to coll_id_is_missing.

> diff --git a/test/sql/collation.test.lua b/test/sql/collation.test.lua
> index 935dea824..f9d653717 100644
> --- a/test/sql/collation.test.lua
> +++ b/test/sql/collation.test.lua
> @@ -42,4 +42,12 @@ cn = remote.connect(box.cfg.listen)
>   cn:execute('select 1 limit ? collate not_exist', {1})
>   
>   cn:close()
> +
> +-- Explicitly set BINARY collation has ID.

3. Please, add more tests, especially for box when you set id to
4294967294.

> +--
> +box.sql.execute("CREATE TABLE t (id INT PRIMARY KEY, a TEXT, b TEXT COLLATE BINARY);")
> +box.space.T:format()[2]['collation']
> +box.space.T:format()[3]['collation']
> +box.sql.execute("DROP TABLE t;")
> +
>   box.schema.user.revoke('guest', 'read,write,execute', 'universe')
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [tarantool-patches] Re: [PATCH 2/3] Add surrogate ID for BINARY collation
  2018-10-31 12:34   ` [tarantool-patches] " Vladislav Shpilevoy
@ 2018-10-31 15:47     ` n.pettik
  2018-11-01 11:37       ` Konstantin Osipov
  0 siblings, 1 reply; 17+ messages in thread
From: n.pettik @ 2018-10-31 15:47 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Vladislav Shpilevoy



> On 31 Oct 2018, at 15:34, Vladislav Shpilevoy <v.shpilevoy@tarantool.org> wrote:
> 
> Hi! Thanks for the patch! See 3 comments below.
> 
>> diff --git a/src/box/key_def.h b/src/box/key_def.h
>> index 20e79f9fe..ecdc199d9 100644
>> --- a/src/box/key_def.h
>> +++ b/src/box/key_def.h
>> @@ -78,6 +78,23 @@ extern const struct key_part_def key_part_def_default;
>>   */
>>  #define COLL_NONE UINT32_MAX
>>  +/**
>> + * In SQL explicitly specified binary collation and absence of
>> + * any collation are different in behaviour: according to ANSI
>> + * it is prohibited to compare strings with different explicitly
>> + * indicated collations. However, if one of collation is default,
>> + * (i.e. absent) the second one will be forced.
>> + * So, lets introduce another id to indicate explicitly specified
>> + * binary collation.
> 
> 1. Sorry, I am not sure that we can use imperative while
> describing not functions.

OK, I’ll re-phrase it (still strange rule).

> 
> 2. I see, that you actually created a 'phantom' collation.
> A collation, that has no a record in the collation cache,
> but is visible to a user via space format. I think for
> externally visible changes you should consult Kostja.
> Alternatively, it is possible to create binary collation
> in the same way as unicode and unicode_ci - via insertion
> into _collation in upgrade script.
> 
> Also, I see a bug that we can create a collation in
> _collation with id = COLL_NONE and COLL_BINARY, but which
> actually are not NONE nor BINARY. Storing such identifiers
> in _collation should be prohibited (if we will leave current
> 'phantom' binary collation as is). Furthermore COLL_NONE
> for unknown reason is declared in key_def.h instead of
> coll_id.h. It should be moved out. It is worth to create a
> separate commit with refactoring right before this one.

I asked Vladimir before implementing this patch, and we
decided to avoid adding real collation struct to cache,
since in this case we would get *tiny but still* overhead
in the form of calling additional collations functions. However,
I agree that creating collations with ids COLL_NONE and
COLL_BINARY should be banned. I am going to re-ask
Vladimir and Konstantin and in case they don’t mind,
I will expose patch-set with additional commit containing
these checks.

> 
>> + */
>> +#define COLL_BINARY (UINT32_MAX - 1)
>> +
>> +static inline bool
>> +coll_is_missing(uint32_t coll_id)
>> +{
>> +	return coll_id == COLL_NONE || coll_id == COLL_BINARY;
>> +}
> 
> Hence, this should be moved to coll_id.h as well and
> renamed to coll_id_is_missing.

Ok, I am going to put this refactoring in a separate commit as well
and resend patch-set as v2.

> 
>> diff --git a/test/sql/collation.test.lua b/test/sql/collation.test.lua
>> index 935dea824..f9d653717 100644
>> --- a/test/sql/collation.test.lua
>> +++ b/test/sql/collation.test.lua
>> @@ -42,4 +42,12 @@ cn = remote.connect(box.cfg.listen)
>>  cn:execute('select 1 limit ? collate not_exist', {1})
>>    cn:close()
>> +
>> +-- Explicitly set BINARY collation has ID.
> 
> 3. Please, add more tests, especially for box when you set id to
> 4294967294.
> 
>> +--
>> +box.sql.execute("CREATE TABLE t (id INT PRIMARY KEY, a TEXT, b TEXT COLLATE BINARY);")
>> +box.space.T:format()[2]['collation']
>> +box.space.T:format()[3]['collation']
>> +box.sql.execute("DROP TABLE t;")
>> +
>>  box.schema.user.revoke('guest', 'read,write,execute', 'universe')

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [tarantool-patches] Re: [PATCH 2/3] Add surrogate ID for BINARY collation
  2018-10-31 15:47     ` n.pettik
@ 2018-11-01 11:37       ` Konstantin Osipov
  2018-11-01 12:22         ` Vladislav Shpilevoy
  0 siblings, 1 reply; 17+ messages in thread
From: Konstantin Osipov @ 2018-11-01 11:37 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Vladislav Shpilevoy

* n.pettik <korablev@tarantool.org> [18/10/31 18:52]:

Sorry for a last-minute comment, but is there any reason why id
has to be  4294967294? Why not use the next spare id, it's 3
AFAIR?

-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [tarantool-patches] Re: [PATCH 2/3] Add surrogate ID for BINARY collation
  2018-11-01 11:37       ` Konstantin Osipov
@ 2018-11-01 12:22         ` Vladislav Shpilevoy
  2018-11-01 12:58           ` Konstantin Osipov
  0 siblings, 1 reply; 17+ messages in thread
From: Vladislav Shpilevoy @ 2018-11-01 12:22 UTC (permalink / raw)
  To: Konstantin Osipov, tarantool-patches



On 01/11/2018 14:37, Konstantin Osipov wrote:
> * n.pettik <korablev@tarantool.org> [18/10/31 18:52]:
> 
> Sorry for a last-minute comment, but is there any reason why id
> has to be  4294967294? Why not use the next spare id, it's 3
> AFAIR?


I guess, because

1) It is not real collation and is not presented in
_collation. So for a user it would be strange to see
a gap between 2 and 4 in _collation, which can not be
set.

2) Some advanced users could already create their own
collations, so 3 may be occupied.

3) Actually binary collation == no collation and it
is consistent to has its ID near COLL_NONE, in a "special
range" of collation identifiers.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [tarantool-patches] Re: [PATCH 2/3] Add surrogate ID for BINARY collation
  2018-11-01 12:22         ` Vladislav Shpilevoy
@ 2018-11-01 12:58           ` Konstantin Osipov
  2018-11-01 13:08             ` n.pettik
  0 siblings, 1 reply; 17+ messages in thread
From: Konstantin Osipov @ 2018-11-01 12:58 UTC (permalink / raw)
  To: Vladislav Shpilevoy; +Cc: tarantool-patches

* Vladislav Shpilevoy <v.shpilevoy@tarantool.org> [18/11/01 15:23]:
> 
> On 01/11/2018 14:37, Konstantin Osipov wrote:
> > * n.pettik <korablev@tarantool.org> [18/10/31 18:52]:
> > 
> > Sorry for a last-minute comment, but is there any reason why id
> > has to be  4294967294? Why not use the next spare id, it's 3
> > AFAIR?
> 
> 
> I guess, because
> 
> 1) It is not real collation and is not presented in
> _collation. So for a user it would be strange to see
> a gap between 2 and 4 in _collation, which can not be
> set.

Let's insert it there.

> 2) Some advanced users could already create their own
> collations, so 3 may be occupied.

No, they couldn't.

> 3) Actually binary collation == no collation and it
> is consistent to has its ID near COLL_NONE, in a "special
> range" of collation identifiers.

Uhm, AFAIU we have two binary collations. One is "collation is not
set" and another is "collation binary". Which one did you mean
now?

-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [tarantool-patches] Re: [PATCH 2/3] Add surrogate ID for BINARY collation
  2018-11-01 12:58           ` Konstantin Osipov
@ 2018-11-01 13:08             ` n.pettik
  2018-11-01 15:39               ` Konstantin Osipov
  0 siblings, 1 reply; 17+ messages in thread
From: n.pettik @ 2018-11-01 13:08 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Konstantin Osipov, Vladislav Shpilevoy

[-- Attachment #1: Type: text/plain, Size: 1525 bytes --]



> On 1 Nov 2018, at 15:58, Konstantin Osipov <kostja@tarantool.org> wrote:
> 
> * Vladislav Shpilevoy <v.shpilevoy@tarantool.org <mailto:v.shpilevoy@tarantool.org>> [18/11/01 15:23]:
>> 
>> On 01/11/2018 14:37, Konstantin Osipov wrote:
>>> * n.pettik <korablev@tarantool.org> [18/10/31 18:52]:
>>> 
>>> Sorry for a last-minute comment, but is there any reason why id
>>> has to be  4294967294? Why not use the next spare id, it's 3
>>> AFAIR?
>> 
>> 
>> I guess, because
>> 
>> 1) It is not real collation and is not presented in
>> _collation. So for a user it would be strange to see
>> a gap between 2 and 4 in _collation, which can not be
>> set.
> 
> Let's insert it there.

So, you insist on id == 3, right? Again, if user process select
rom _collation space, one won’t see entry with id == 3.
On the other hand, if user attempts at inserting id == 3,
one will get an error.

> 
>> 3) Actually binary collation == no collation and it
>> is consistent to has its ID near COLL_NONE, in a "special
>> range" of collation identifiers.
> 
> Uhm, AFAIU we have two binary collations. One is "collation is not
> set" and another is "collation binary". Which one did you mean
> now?

FIrst one is not collation at all. It is rather “absence” of any collation.
The second one is sort of “surrogate” and in terms of functionality
means the same. However, its id will be stored in space format in
order to indicate that BINARY collation should be forced during
comparisons.


[-- Attachment #2: Type: text/html, Size: 8518 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [tarantool-patches] Re: [PATCH 2/3] Add surrogate ID for BINARY collation
  2018-11-01 13:08             ` n.pettik
@ 2018-11-01 15:39               ` Konstantin Osipov
       [not found]                 ` <95CB17D5-E3ED-4B05-A289-983E2FD0DE37@gmail.com>
  0 siblings, 1 reply; 17+ messages in thread
From: Konstantin Osipov @ 2018-11-01 15:39 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Vladislav Shpilevoy

* n.pettik <korablev@tarantool.org> [18/11/01 16:11]:
> >> I guess, because
> >> 
> >> 1) It is not real collation and is not presented in
> >> _collation. So for a user it would be strange to see
> >> a gap between 2 and 4 in _collation, which can not be
> >> set.
> > 
> > Let's insert it there.
> 
> So, you insist on id == 3, right? Again, if user process select
> rom _collation space, one won’t see entry with id == 3.
> On the other hand, if user attempts at inserting id == 3,
> one will get an error.

No, I don't insist yet. Why not insert a special row in there?

> >> is consistent to has its ID near COLL_NONE, in a "special
> >> range" of collation identifiers.
> > 
> > Uhm, AFAIU we have two binary collations. One is "collation is not
> > set" and another is "collation binary". Which one did you mean
> > now?
> 
> FIrst one is not collation at all. It is rather “absence” of any collation.
> The second one is sort of “surrogate” and in terms of functionality
> means the same. However, its id will be stored in space format in
> order to indicate that BINARY collation should be forced during
> comparisons.

I think we could use internal ids to reference both cases. For
these both ids we could have surrogate rows in _coll system space,
they won't harm. This will make things easier in the future. 

This is going to be the same mess as with NO ACTION and DEFAULT,
which are mostly the same, but not quite, so we'd better prepare.

-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [tarantool-patches] Re: [PATCH 2/3] Add surrogate ID for BINARY collation
       [not found]                 ` <95CB17D5-E3ED-4B05-A289-983E2FD0DE37@gmail.com>
@ 2018-11-01 17:45                   ` n.pettik
  2018-11-01 20:00                   ` Konstantin Osipov
  1 sibling, 0 replies; 17+ messages in thread
From: n.pettik @ 2018-11-01 17:45 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Vladislav Shpilevoy, Konstantin Osipov

[-- Attachment #1: Type: text/plain, Size: 3870 bytes --]

I have occasionally sent mail from wrong address, so
you might miss it. My apologies, I resend it from right one.

> On 1 Nov 2018, at 19:31, Никита Петтик <kitnerh@gmail.com> wrote:
> 
>> On 1 Nov 2018, at 18:39, Konstantin Osipov <kostja@tarantool.org> wrote:
>> 
>> * n.pettik <korablev@tarantool.org> [18/11/01 16:11]:
>>>>> I guess, because
>>>>> 
>>>>> 1) It is not real collation and is not presented in
>>>>> _collation. So for a user it would be strange to see
>>>>> a gap between 2 and 4 in _collation, which can not be
>>>>> set.
>>>> 
>>>> Let's insert it there.
>>> 
>>> So, you insist on id == 3, right? Again, if user process select
>>> rom _collation space, one won’t see entry with id == 3.
>>> On the other hand, if user attempts at inserting id == 3,
>>> one will get an error.
>> 
>> No, I don't insist yet. Why not insert a special row in there?
> 
> Because insertion to _collation would result in creation
> of collation objects. Meanwhile, in fact we need only ID
> to distinguish BINARY and no-collation. The rest is the
> same for them. So, it makes sense to store only ID within
> space format. That is my point.
> 
>>>>> is consistent to has its ID near COLL_NONE, in a "special
>>>>> range" of collation identifiers.
>>>> 
>>>> Uhm, AFAIU we have two binary collations. One is "collation is not
>>>> set" and another is "collation binary". Which one did you mean
>>>> now?
>>> 
>>> FIrst one is not collation at all. It is rather “absence” of any collation.
>>> The second one is sort of “surrogate” and in terms of functionality
>>> means the same. However, its id will be stored in space format in
>>> order to indicate that BINARY collation should be forced during
>>> comparisons.
>> 
>> I think we could use internal ids to reference both cases. For
>> these both ids we could have surrogate rows in _coll system space,
>> they won't harm. This will make things easier in the future. 
> 
> Ok,  how do you suggest to call “absence” of collation? Like this:
> 
> box.space._collation:select()
> 
> ---
> - - [1, 'unicode', 1, 'ICU', '', {}]
>  - [2, 'unicode_ci', 1, 'ICU', '', {'strength': 'primary’}]
>  - [3, ‘none', 1, 'ICU', '', {}]
> ...
> 
> It is nonsense, IMHO. No collation is like “no collation at all” -
> nothing represents it, especially visible for user. With BINARY
> collation it would look even more suspicious:
> 
> - - [1, 'unicode', 1, 'ICU', '', {}]
>  - [2, 'unicode_ci', 1, 'ICU', '', {'strength': 'primary’}]
>  - [3, ‘none', 1, 'ICU', '', {}]
>  - [4, ‘binary', 1, 'ICU', '', {}]
> 
> It would confuse users who don’t use SQL: in Tarantool NoSQL
> there is no difference between “binary” and “no-collation”.
> Moreover, to keep things consistent, we would have to 	make
> default collation be ’none’ instead of absence of collation.
> It means that field def without explicitly set collation would
> have ’none’ collation in format. For instance:
> 
> *before*
> 
> - [{'affinity': 66, 'type': ’string', 'nullable_action': 'abort', 'name': 'ID', 'is_nullable': false}]
> 
> *after*
> 
> - [{'collation': 3, 'affinity': 66, 'type': 'string', 'nullable_action': 'abort',
>    'name': 'ID', 'is_nullable': false}]
> 
>> This is going to be the same mess as with NO ACTION and DEFAULT,
>> which are mostly the same, but not quite, so we'd better prepare.
> 
> It is considered to be mess due to SQLite legacy. On the other hand, all
> these manipulations with collations follow SQL ANSI.
> 
> All points considered, I would prefer to introduce only another one ID
> (alongside with COLL_NONE ID) and prohibit to create collations with
> these ids. OR, add surrogate “binary collation” to _collation with id == 3,
> but not both “binary” and “none”.


[-- Attachment #2: Type: text/html, Size: 37755 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [tarantool-patches] Re: [PATCH 2/3] Add surrogate ID for BINARY collation
       [not found]                 ` <95CB17D5-E3ED-4B05-A289-983E2FD0DE37@gmail.com>
  2018-11-01 17:45                   ` n.pettik
@ 2018-11-01 20:00                   ` Konstantin Osipov
  2018-11-01 20:06                     ` Konstantin Osipov
  2018-11-01 20:20                     ` n.pettik
  1 sibling, 2 replies; 17+ messages in thread
From: Konstantin Osipov @ 2018-11-01 20:00 UTC (permalink / raw)
  To: Никита
	Петтик
  Cc: tarantool-patches, Vladislav Shpilevoy

* Никита Петтик <kitnerh@gmail.com> [18/11/01 19:34]:
> >>>> 1) It is not real collation and is not presented in
> >>>> _collation. So for a user it would be strange to see
> >>>> a gap between 2 and 4 in _collation, which can not be
> >>>> set.
> >>> Let's insert it there.
> >> So, you insist on id == 3, right? Again, if user process select
> >> rom _collation space, one won’t see entry with id == 3.
> >> On the other hand, if user attempts at inserting id == 3,
> >> one will get an error.
> > No, I don't insist yet. Why not insert a special row in there?
> 
> Because insertion to _collation would result in creation
> of collation objects. 

Not necessarily. We can add a special treatment of these ids to 
insert triggers. Or we can set a different collation type for
these - which is equivalent for special treatment. _coll system
space already supports non-icu collations, so this is one such
collation.

> Meanwhile, in fact we need only ID to distinguish BINARY and
> no-collation. The rest is the same for them. So, it makes sense
> to store only ID within space format. That is my point.
> 
> >>>> is consistent to has its ID near COLL_NONE, in a "special
> >>>> range" of collation identifiers.
> >>> 
> >>> Uhm, AFAIU we have two binary collations. One is "collation is not
> >>> set" and another is "collation binary". Which one did you mean
> >>> now?
> >> 
> >> FIrst one is not collation at all. It is rather “absence” of any collation.
> >> The second one is sort of “surrogate” and in terms of functionality
> >> means the same. However, its id will be stored in space format in
> >> order to indicate that BINARY collation should be forced during
> >> comparisons.
> > 
> > I think we could use internal ids to reference both cases. For
> > these both ids we could have surrogate rows in _coll system space,
> > they won't harm. This will make things easier in the future. 
> 
> Ok,  how do you suggest to call “absence” of collation? Like this:
> 
> box.space._collation:select()
> 
> ---
> - - [1, 'unicode', 1, 'ICU', '', {}]
>   - [2, 'unicode_ci', 1, 'ICU', '', {'strength': 'primary’}]
>   - [3, ‘none', 1, 'ICU', '', {}]
> ...
> 
> It is nonsense, IMHO. No collation is like “no collation at all” -
> nothing represents it, especially visible for user. With BINARY
> collation it would look even more suspicious:
> 
> - - [1, 'unicode', 1, 'ICU', '', {}]
>   - [2, 'unicode_ci', 1, 'ICU', '', {'strength': 'primary’}]
>   - [3, ‘none', 1, 'ICU', '', {}]
>   - [4, ‘binary', 1, 'ICU', '', {}]

Yes, I believe this is the thing. Looks pretty good to me. 
> 
> It would confuse users who don’t use SQL: in Tarantool NoSQL
> there is no difference between “binary” and “no-collation”.

This is temporary. The deeper SQL penetrates box layer the more
nosql will have the same semantics as SQL.

> Moreover, to keep things consistent, we would have to 	make
> default collation be ’none’ instead of absence of collation.
> It means that field def without explicitly set collation would
> have ’none’ collation in format. For instance:

Then id of 'none' should be 0, not 3.
> 
> *before*
> 
> - [{'affinity': 66, 'type': ’string', 'nullable_action': 'abort', 'name': 'ID', 'is_nullable': false}]
> 
> *after*
> 
> - [{'collation': 3, 'affinity': 66, 'type': 'string', 'nullable_action': 'abort',
>     'name': 'ID', 'is_nullable': false}]
> 
> > This is going to be the same mess as with NO ACTION and DEFAULT,
> > which are mostly the same, but not quite, so we'd better prepare.
> 
> It is considered to be mess due to SQLite legacy. On the other hand, all
> these manipulations with collations follow SQL ANSI.

No, it's a mess due to ANSI not SQLite. And the distinction
between absence of collation and binary collation is also coming
from ANSI. 
> 
> All points considered, I would prefer to introduce only another one ID
> (alongside with COLL_NONE ID) and prohibit to create collations with
> these ids. OR, add surrogate “binary collation” to _collation with id == 3,
> but not both “binary” and “none”.

We could also ask what PeterG thinks.

-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [tarantool-patches] Re: [PATCH 2/3] Add surrogate ID for BINARY collation
  2018-11-01 20:00                   ` Konstantin Osipov
@ 2018-11-01 20:06                     ` Konstantin Osipov
  2018-11-01 20:20                     ` n.pettik
  1 sibling, 0 replies; 17+ messages in thread
From: Konstantin Osipov @ 2018-11-01 20:06 UTC (permalink / raw)
  To: Никита
	Петтик
  Cc: tarantool-patches, Vladislav Shpilevoy

My point is - in future you may want an object with some kind of
context to represent binary and absent collation. Right now an id
is sufficient in all contexts, but in future it may be easier to
treat all collations the same way.

For example, collation "weight" in comparisons could be
represent as an integer number. As long as you don't have an
object for binary and "none" collations, you will have to treat
them in a special way, while you could simply compare their
strength with each other. The same is about collation name - you
may need to print collation name in an error message. If "none"
and "binary" are only ids, you will, once again, have to treat
them separately.

* Konstantin Osipov <kostja@tarantool.org> [18/11/01 23:00]:

-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [tarantool-patches] Re: [PATCH 2/3] Add surrogate ID for BINARY collation
  2018-11-01 20:00                   ` Konstantin Osipov
  2018-11-01 20:06                     ` Konstantin Osipov
@ 2018-11-01 20:20                     ` n.pettik
  1 sibling, 0 replies; 17+ messages in thread
From: n.pettik @ 2018-11-01 20:20 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Konstantin Osipov, Vladislav Shpilevoy



> On 1 Nov 2018, at 23:00, Konstantin Osipov <kostja@tarantool.org> wrote:
> 
> * Никита Петтик <kitnerh@gmail.com> [18/11/01 19:34]:
>>>>>> 1) It is not real collation and is not presented in
>>>>>> _collation. So for a user it would be strange to see
>>>>>> a gap between 2 and 4 in _collation, which can not be
>>>>>> set.
>>>>> Let's insert it there.
>>>> So, you insist on id == 3, right? Again, if user process select
>>>> rom _collation space, one won’t see entry with id == 3.
>>>> On the other hand, if user attempts at inserting id == 3,
>>>> one will get an error.
>>> No, I don't insist yet. Why not insert a special row in there?
>> 
>> Because insertion to _collation would result in creation
>> of collation objects. 
> 
> Not necessarily. We can add a special treatment of these ids to 
> insert triggers. Or we can set a different collation type for
> these - which is equivalent for special treatment. _coll system
> space already supports non-icu collations, so this is one such
> collation.

Not really. I look at coll.c:

struct coll *
coll_new(const struct coll_def *def)
{
       assert(def->type == COLL_TYPE_ICU);
...

> 
>> Meanwhile, in fact we need only ID to distinguish BINARY and
>> no-collation. The rest is the same for them. So, it makes sense
>> to store only ID within space format. That is my point.
>> 
>>>>>> is consistent to has its ID near COLL_NONE, in a "special
>>>>>> range" of collation identifiers.
>>>>> 
>>>>> Uhm, AFAIU we have two binary collations. One is "collation is not
>>>>> set" and another is "collation binary". Which one did you mean
>>>>> now?
>>>> 
>>>> FIrst one is not collation at all. It is rather “absence” of any collation.
>>>> The second one is sort of “surrogate” and in terms of functionality
>>>> means the same. However, its id will be stored in space format in
>>>> order to indicate that BINARY collation should be forced during
>>>> comparisons.
>>> 
>>> I think we could use internal ids to reference both cases. For
>>> these both ids we could have surrogate rows in _coll system space,
>>> they won't harm. This will make things easier in the future. 
>> 
>> Ok,  how do you suggest to call “absence” of collation? Like this:
>> 
>> box.space._collation:select()
>> 
>> ---
>> - - [1, 'unicode', 1, 'ICU', '', {}]
>>  - [2, 'unicode_ci', 1, 'ICU', '', {'strength': 'primary’}]
>>  - [3, ‘none', 1, 'ICU', '', {}]
>> ...
>> 
>> It is nonsense, IMHO. No collation is like “no collation at all” -
>> nothing represents it, especially visible for user. With BINARY
>> collation it would look even more suspicious:
>> 
>> - - [1, 'unicode', 1, 'ICU', '', {}]
>>  - [2, 'unicode_ci', 1, 'ICU', '', {'strength': 'primary’}]
>>  - [3, ‘none', 1, 'ICU', '', {}]
>>  - [4, ‘binary', 1, 'ICU', '', {}]
> 
> Yes, I believe this is the thing. Looks pretty good to me. 
>> 
>> It would confuse users who don’t use SQL: in Tarantool NoSQL
>> there is no difference between “binary” and “no-collation”.
> 
> This is temporary. The deeper SQL penetrates box layer the more
> nosql will have the same semantics as SQL.
> 
>> Moreover, to keep things consistent, we would have to 	make
>> default collation be ’none’ instead of absence of collation.
>> It means that field def without explicitly set collation would
>> have ’none’ collation in format. For instance:
> 
> Then id of 'none' should be 0, not 3.
>> 
>> *before*
>> 
>> - [{'affinity': 66, 'type': ’string', 'nullable_action': 'abort', 'name': 'ID', 'is_nullable': false}]
>> 
>> *after*
>> 
>> - [{'collation': 3, 'affinity': 66, 'type': 'string', 'nullable_action': 'abort',
>>    'name': 'ID', 'is_nullable': false}]
>> 
>>> This is going to be the same mess as with NO ACTION and DEFAULT,
>>> which are mostly the same, but not quite, so we'd better prepare.
>> 
>> It is considered to be mess due to SQLite legacy. On the other hand, all
>> these manipulations with collations follow SQL ANSI.
> 
> No, it's a mess due to ANSI not SQLite. And the distinction
> between absence of collation and binary collation is also coming
> from ANSI. 
>> 
>> All points considered, I would prefer to introduce only another one ID
>> (alongside with COLL_NONE ID) and prohibit to create collations with
>> these ids. OR, add surrogate “binary collation” to _collation with id == 3,
>> but not both “binary” and “none”.
> 
> We could also ask what PeterG thinks.

I’d rather not - it is not question concerning user-visible behaviour,
but only internal implementation. Anyway, I would better follow your
way, instead of arguing. To sum up we are introducing two additional
collations to represent “none” (id == 0) and “binary” (id == 3) collations
and making “none” collation be default, which in turn means that it would
be presented in format, even if user didn’t specify it explicitly.
That’s my plan by now. SQL part of this patch is obvious and already done.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [tarantool-patches] Re: [PATCH 3/3] sql: change collation compatibility rules
  2018-10-31 12:34   ` [tarantool-patches] " Vladislav Shpilevoy
@ 2018-11-12 23:46     ` n.pettik
  0 siblings, 0 replies; 17+ messages in thread
From: n.pettik @ 2018-11-12 23:46 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Vladislav Shpilevoy

After discussion with Konstantin in @dev mailing list, we have
decided to introduce real entities in _collation space to represent
binary and “none” collations. Hence, I am going to send updated
patch-set as second iteration (v2).
Also, I’ve took your comments into consideration.

>> diff --git a/src/box/sql/expr.c b/src/box/sql/expr.c
>> index a13de4f0e..bd25ed656 100644
>> --- a/src/box/sql/expr.c
>> +++ b/src/box/sql/expr.c
>> @@ -316,25 +315,62 @@ binaryCompareP5(Expr * pExpr1, Expr * pExpr2, int jumpIfNull)
>>  	return aff;
>>  }
>>  +int
>> +collations_are_compatible(uint32_t lhs_id, bool is_lhs_forced,
>> +			  uint32_t rhs_id, bool is_rhs_forced,
>> +			  uint32_t *res_id)
> 
> 1. Why not to rename to _check_compatibility() and set
> diag right here?

Just another way to implement this. In updated patch I did
it as you suggested.

>> +{
>> +	assert(res_id != NULL);
>> +	if (is_lhs_forced && is_rhs_forced) {
>> +		if (lhs_id != rhs_id)
>> +			return -1;
>> +	}
>> +	if (is_lhs_forced) {
>> +		*res_id = lhs_id;
>> +		return 0;
>> +	}
>> +	if (is_rhs_forced) {
>> +		*res_id = rhs_id;
>> +		return 0;
>> +	}
>> +	if (lhs_id != rhs_id) {
>> +		if (lhs_id == COLL_NONE) {
>> +			*res_id = rhs_id;
>> +			return 0;
>> +		}
>> +		if (rhs_id == COLL_NONE) {
>> +			*res_id = lhs_id;
>> +			return 0;
>> +		}
>> +		return -1;
>> +	}
>> +	*res_id = lhs_id;
>> +	return 0;
>> +}
>> +
>>  struct coll *
>>  sql_binary_compare_coll_seq(Parse *parser, Expr *left, Expr *right,
>>  			    uint32_t *coll_id)
>>  {
>> -	struct coll *coll;
>> -	bool is_found;
>>  	assert(left != NULL);
>> -	if ((left->flags & EP_Collate) != 0) {
>> -		coll = sql_expr_coll(parser, left, &is_found, coll_id);
>> -	} else if (right != NULL && (right->flags & EP_Collate) != 0) {
>> -		coll = sql_expr_coll(parser, right, &is_found, coll_id);
>> -	} else {
>> -		coll = sql_expr_coll(parser, left, &is_found, coll_id);
>> -		if (! is_found) {
>> -			coll = sql_expr_coll(parser, right, &is_found,
>> -					     coll_id);
>> -		}
>> +	bool is_lhs_forced;
>> +	bool is_rhs_forced;
>> +	uint32_t lhs_coll_id;
>> +	uint32_t rhs_coll_id;
>> +	struct coll *lhs_coll = sql_expr_coll(parser, left, &is_lhs_forced,
>> +					      &lhs_coll_id);
>> +	struct coll *rhs_coll = sql_expr_coll(parser, right, &is_rhs_forced,
>> +					      &rhs_coll_id);
>> +	if (collations_are_compatible(lhs_coll_id, is_lhs_forced,
>> +				      rhs_coll_id, is_rhs_forced,
>> +				      coll_id) != 0) {
>> +		diag_set(ClientError, ER_ILLEGAL_COLLATION_MIX);
>> +		parser->rc = SQL_TARANTOOL_ERROR;
>> +		parser->nErr++;
>> +		*coll_id = COLL_NONE;
> 
> 3. Why do you need to set coll_id in a case of an error? As I
> understand, after an error the compilation is terminated.

Just in case, it shouldn’t be really used. I removed it.

> 
>> +		return NULL;
>>  	}
>> -	return coll;
>> +	return *coll_id == rhs_coll_id ? rhs_coll : lhs_coll;;
>>  }
>>    /*
>> diff --git a/src/box/sql/select.c b/src/box/sql/select.c
>> index 1ec92aac7..c3403670b 100644
>> --- a/src/box/sql/select.c
>> +++ b/src/box/sql/select.c
>> @@ -2035,8 +2035,9 @@ sqlite3SelectAddColumnTypeAndCollation(Parse * pParse,		/* Parsing contexts */
>>  		bool is_found;
>>  		uint32_t coll_id;
>>  		if (pTab->def->fields[i].coll_id == COLL_NONE &&
>> -		    sql_expr_coll(pParse, p, &is_found, &coll_id) && is_found)
>> +		    sql_expr_coll(pParse, p, &is_found, &coll_id) && coll_id != COLL_NONE)
>>  			pTab->def->fields[i].coll_id = coll_id;
>> +		assert(pTab->def->fields[i].coll_id != COLL_BINARY);
> 
> 4. Why? I can create a table with binary collation after the
> previous commit.

I guess it is debug remains. Removed.

> 
>>  	}
>>  }
>>  @@ -2223,47 +2224,53 @@ computeLimitRegisters(Parse * pParse, Select * p, int iBreak)
>> +static uint32_t
>> +multi_select_coll_seq(struct Parse *parser, struct Select *p, int n,
>> +		      bool *is_forced_coll)
>>  {
>> -	struct coll *coll;
>> +	bool is_prior_forced = false;
>> +	bool is_current_forced;
>> +	uint32_t prior_coll_id = COLL_NONE;
>> +	uint32_t current_coll_id;
>>  	if (p->pPrior != NULL) {
>> -		coll = multi_select_coll_seq_r(parser, p->pPrior, n, is_found,
>> -					       coll_id);
>> -	} else {
>> -		coll = NULL;
>> -		*coll_id = COLL_NONE;
>> +		prior_coll_id = multi_select_coll_seq(parser, p->pPrior, n,
>> +						      &is_prior_forced);
>>  	}
>> -	assert(n >= 0);
>> -	/* iCol must be less than p->pEList->nExpr.  Otherwise an error would
>> -	 * have been thrown during name resolution and we would not have gotten
>> -	 * this far
>> +	/*
>> +	 * Column number must be less than p->pEList->nExpr.
>> +	 * Otherwise an error would have been thrown during name
>> +	 * resolution and we would not have gotten this far.
> 
> 5. 'have gotten'? Maybe 'have got’?

Yep, fixed.

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2018-11-12 23:46 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-25 11:00 [tarantool-patches] [PATCH 0/3] Change collation compatibility rules according to ANSI SQL Nikita Pettik
2018-10-25 11:00 ` [tarantool-patches] [PATCH 1/3] sql: do not add explicit <COLLATE "BINARY"> clause Nikita Pettik
2018-10-25 11:00 ` [tarantool-patches] [PATCH 2/3] Add surrogate ID for BINARY collation Nikita Pettik
2018-10-31 12:34   ` [tarantool-patches] " Vladislav Shpilevoy
2018-10-31 15:47     ` n.pettik
2018-11-01 11:37       ` Konstantin Osipov
2018-11-01 12:22         ` Vladislav Shpilevoy
2018-11-01 12:58           ` Konstantin Osipov
2018-11-01 13:08             ` n.pettik
2018-11-01 15:39               ` Konstantin Osipov
     [not found]                 ` <95CB17D5-E3ED-4B05-A289-983E2FD0DE37@gmail.com>
2018-11-01 17:45                   ` n.pettik
2018-11-01 20:00                   ` Konstantin Osipov
2018-11-01 20:06                     ` Konstantin Osipov
2018-11-01 20:20                     ` n.pettik
2018-10-25 11:00 ` [tarantool-patches] [PATCH 3/3] sql: change collation compatibility rules Nikita Pettik
2018-10-31 12:34   ` [tarantool-patches] " Vladislav Shpilevoy
2018-11-12 23:46     ` n.pettik

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox