[tarantool-patches] [PATCH v2 7/7] sql: store regular identifiers in case-normal form

Wed Feb 27 14:13:18 MSK 2019

Introduced a new sql_normalize_name routine performing SQL name
conversion to case-normal form via unicode character folding.
For example, ß is converted to SS. The result is similar to SQL
UPPER function.

Closes #3931
---
 src/box/lua/lua_sql.c                 | 11 +++--
 src/box/sql/build.c                   | 36 +++++++++-----
 src/box/sql/expr.c                    | 71 ++++++++++++++++++---------
 src/box/sql/parse.y                   | 26 ++++++++--
 src/box/sql/select.c                  | 20 ++++++--
 src/box/sql/sqlInt.h                  | 26 +++++++++-
 src/box/sql/trigger.c                 | 18 +++++--
 src/box/sql/util.c                    | 42 ++++++++++------
 test/sql-tap/identifier_case.test.lua | 12 +++--
 9 files changed, 188 insertions(+), 74 deletions(-)

diff --git a/src/box/lua/lua_sql.c b/src/box/lua/lua_sql.c
index f5a7b7819..c27ca818e 100644
--- a/src/box/lua/lua_sql.c
+++ b/src/box/lua/lua_sql.c
@@ -176,13 +176,16 @@ lbox_sql_create_function(struct lua_State *L)
 	}
 	size_t name_len;
 	const char *name = lua_tolstring(L, 1, &name_len);
+	int normalized_name_len = sql_normalize_name(NULL, 0, name, name_len);
+	if (normalized_name_len < 0)
+		return luaT_error(L);
 	char *normalized_name = (char *) region_alloc(&fiber()->gc,
-						      name_len + 1);
+						      normalized_name_len + 1);
 	if (normalized_name == NULL)
 		return luaL_error(L, "out of memory");
-	memcpy(normalized_name, name, name_len);
-	normalized_name[name_len] = '\0';
-	sqlNormalizeName(normalized_name);
+	if (sql_normalize_name(normalized_name, normalized_name_len + 1, name,
+			       name_len) < 0)
+		return luaT_error(L);
 	struct lua_sql_func_info *func_info =
 		(struct lua_sql_func_info *) malloc(sizeof(*func_info));
 	if (func_info == NULL)
diff --git a/src/box/sql/build.c b/src/box/sql/build.c
index 24f20836b..d167a5714 100644
--- a/src/box/sql/build.c
+++ b/src/box/sql/build.c
@@ -236,13 +236,20 @@ char *
 sql_name_from_token(struct sql *db, struct Token *name_token)
 {
 	assert(name_token != NULL && name_token->z != NULL);
-	char *name = sqlDbStrNDup(db, (char *)name_token->z, name_token->n);
+	int name_len =
+		sql_normalize_name(NULL, 0, name_token->z, name_token->n);
+	if (name_len < 0)
+		return NULL;
+	char *name = sqlDbMallocRawNN(db, name_len + 1);
 	if (name == NULL) {
-		diag_set(OutOfMemory, name_token->n + 1, "sqlDbStrNDup",
-			 "name");
+		diag_set(OutOfMemory, name_len + 1, "sqlDbMallocRawNN", "name");
+		return NULL;
+	}
+	if (sql_normalize_name(name, name_len + 1, name_token->z,
+			       name_token->n) < 0) {
+		sqlDbFree(db, name);
 		return NULL;
 	}
-	sqlNormalizeName(name);
 	return name;
 }
 
@@ -441,17 +448,16 @@ sqlAddColumn(Parse * pParse, Token * pName, struct type_def *type_def)
 	if (sql_field_retrieve(pParse, def, def->field_count) == NULL)
 		return;
 	struct region *region = &pParse->region;
-	z = region_alloc(region, pName->n + 1);
+	int name_len = sql_normalize_name(NULL, 0, pName->z, pName->n);
+	if (name_len < 0)
+		goto tarantool_error;
+	z = region_alloc(region, name_len + 1);
 	if (z == NULL) {
-		diag_set(OutOfMemory, pName->n + 1,
-			 "region_alloc", "z");
-		pParse->rc = SQL_TARANTOOL_ERROR;
-		pParse->nErr++;
-		return;
+		diag_set(OutOfMemory, name_len + 1, "region_alloc", "z");
+		goto tarantool_error;
 	}
-	memcpy(z, pName->z, pName->n);
-	z[pName->n] = 0;
-	sqlNormalizeName(z);
+	if (sql_normalize_name(z, name_len + 1, pName->z, pName->n) < 0)
+		goto tarantool_error;
 	for (uint32_t i = 0; i < def->field_count; i++) {
 		if (strcmp(z, def->fields[i].name) == 0) {
 			diag_set(ClientError, ER_SPACE_FIELD_IS_DUPLICATE, z);
@@ -472,6 +478,10 @@ sqlAddColumn(Parse * pParse, Token * pName, struct type_def *type_def)
 	column_def->type = type_def->type;
 	def->field_count++;
 	pParse->constraintName.n = 0;
+	return;
+tarantool_error:
+	pParse->rc = SQL_TARANTOOL_ERROR;
+	pParse->nErr++;
 }
 
 void
diff --git a/src/box/sql/expr.c b/src/box/sql/expr.c
index dd5e2c28d..c03873c02 100644
--- a/src/box/sql/expr.c
+++ b/src/box/sql/expr.c
@@ -863,7 +863,16 @@ sql_expr_create(struct sql *db, int op, const Token *token, bool dequote)
 	if (token != NULL) {
 		if (op != TK_INTEGER || token->z == NULL ||
 		    sqlGetInt32(token->z, &val) == 0) {
-			extra_sz = token->n + 1;
+			if (op == TK_ID || op == TK_COLLATE ||
+			    op == TK_FUNCTION) {
+				extra_sz = sql_normalize_name(NULL, 0, token->z,
+							      token->n);
+				if (extra_sz < 0)
+					return NULL;
+			} else {
+				extra_sz = token->n;
+			}
+			extra_sz += 1;
 			assert(val >= 0);
 		}
 	}
@@ -889,15 +898,20 @@ sql_expr_create(struct sql *db, int op, const Token *token, bool dequote)
 	} else {
 		expr->u.zToken = (char *)&expr[1];
 		assert(token->z != NULL || token->n == 0);
-		memcpy(expr->u.zToken, token->z, token->n);
-		expr->u.zToken[token->n] = '\0';
-		if (dequote) {
-			if (expr->u.zToken[0] == '"')
-				expr->flags |= EP_DblQuoted;
-			if (expr->op == TK_ID || expr->op == TK_COLLATE ||
-			    expr->op == TK_FUNCTION)
-				sqlNormalizeName(expr->u.zToken);
-			else
+		if (dequote && expr->u.zToken[0] == '"')
+			expr->flags |= EP_DblQuoted;
+		if (dequote &&
+		    (expr->op == TK_ID || expr->op == TK_COLLATE ||
+		     expr->op == TK_FUNCTION)) {
+			if (sql_normalize_name(expr->u.zToken, extra_sz,
+					       token->z, token->n) < 0) {
+				sqlDbFree(db, expr);
+				return NULL;
+			}
+		} else {
+			memcpy(expr->u.zToken, token->z, token->n);
+			expr->u.zToken[token->n] = '\0';
+			if (dequote)
 				sqlDequote(expr->u.zToken);
 		}
 	}
@@ -1778,18 +1792,31 @@ sqlExprListSetName(Parse * pParse,	/* Parsing context */
     )
 {
 	assert(pList != 0 || pParse->db->mallocFailed != 0);
-	if (pList) {
-		struct ExprList_item *pItem;
-		assert(pList->nExpr > 0);
-		pItem = &pList->a[pList->nExpr - 1];
-		assert(pItem->zName == 0);
-		pItem->zName = sqlDbStrNDup(pParse->db, pName->z, pName->n);
-		if (dequote)
-			sqlNormalizeName(pItem->zName);
-		/* n = 0 in case of select * */
-		if (pName->n != 0)
-			sqlCheckIdentifierName(pParse, pItem->zName);
-	}
+	if (pList == NULL || pName->n == 0)
+		return;
+	assert(pList->nExpr > 0);
+	struct ExprList_item *item = &pList->a[pList->nExpr - 1];
+	assert(item->zName == NULL);
+	if (dequote) {
+		int name_len = sql_normalize_name(NULL, 0, pName->z, pName->n);
+		if (name_len < 0)
+			goto tarantool_error;
+		item->zName = sqlDbMallocRawNN(pParse->db, name_len + 1);
+		if (item->zName == NULL)
+			return;
+		if (sql_normalize_name(item->zName, name_len + 1, pName->z,
+				       pName->n) < 0)
+			goto tarantool_error;
+	} else {
+		item->zName = sqlDbStrNDup(pParse->db, pName->z, pName->n);
+		if (item->zName == NULL)
+			return;
+	}
+	sqlCheckIdentifierName(pParse, item->zName);
+	return;
+tarantool_error:
+	pParse->rc = SQL_TARANTOOL_ERROR;
+	pParse->nErr++;
 }
 
 /*
diff --git a/src/box/sql/parse.y b/src/box/sql/parse.y
index d25995ec0..3204833e9 100644
--- a/src/box/sql/parse.y
+++ b/src/box/sql/parse.y
@@ -836,7 +836,16 @@ idlist(A) ::= nm(Y). {
   ** that created the expression.
   */
   static void spanExpr(ExprSpan *pOut, Parse *pParse, int op, Token t){
-    Expr *p = sqlDbMallocRawNN(pParse->db, sizeof(Expr)+t.n+1);
+    int name_len = 0;
+    struct Expr *p = NULL;
+    if (op != TK_VARIABLE) {
+      name_len = sql_normalize_name(NULL, 0, t.z, t.n);
+      if (name_len < 0)
+        goto tarantool_error;
+    } else {
+      name_len = t.n;
+    }
+    p = sqlDbMallocRawNN(pParse->db, sizeof(Expr) + name_len + 1);
     if( p ){
       memset(p, 0, sizeof(Expr));
       switch (op) {
@@ -869,10 +878,12 @@ idlist(A) ::= nm(Y). {
       p->flags = EP_Leaf;
       p->iAgg = -1;
       p->u.zToken = (char*)&p[1];
-      memcpy(p->u.zToken, t.z, t.n);
-      p->u.zToken[t.n] = 0;
-      if (op != TK_VARIABLE){
-        sqlNormalizeName(p->u.zToken);
+      if (op != TK_VARIABLE) {
+        if (sql_normalize_name(p->u.zToken, name_len + 1, t.z, t.n) < 0)
+          goto tarantool_error;
+      } else {
+        memcpy(p->u.zToken, t.z, t.n);
+        p->u.zToken[t.n] = 0;
       }
 #if SQL_MAX_EXPR_DEPTH>0
       p->nHeight = 1;
@@ -881,6 +892,11 @@ idlist(A) ::= nm(Y). {
     pOut->pExpr = p;
     pOut->zStart = t.z;
     pOut->zEnd = &t.z[t.n];
+    return;
+tarantool_error:
+    sqlDbFree(pParse->db, p);
+    pParse->rc = SQL_TARANTOOL_ERROR;
+    pParse->nErr++;
   }
 }
 
diff --git a/src/box/sql/select.c b/src/box/sql/select.c
index 2ae27e6c4..7b2a38101 100644
--- a/src/box/sql/select.c
+++ b/src/box/sql/select.c
@@ -4195,10 +4195,18 @@ flattenSubquery(Parse * pParse,		/* Parsing context */
 		pList = pParent->pEList;
 		for (i = 0; i < pList->nExpr; i++) {
 			if (pList->a[i].zName == 0) {
-				char *zName =
-				    sqlDbStrDup(db, pList->a[i].zSpan);
-				sqlNormalizeName(zName);
-				pList->a[i].zName = zName;
+				char *str = pList->a[i].zSpan;
+				int len = strlen(str);
+				int name_len =
+					sql_normalize_name(NULL, 0, str, len);
+				if (name_len < 0)
+					goto tarantool_error;
+				char *name = sqlDbMallocRawNN(db, name_len + 1);
+				if (name != NULL &&
+				    sql_normalize_name(name, name_len + 1, str,
+						       len) < 0)
+					goto tarantool_error;
+				pList->a[i].zName = name;
 			}
 		}
 		if (pSub->pOrderBy) {
@@ -4276,6 +4284,10 @@ flattenSubquery(Parse * pParse,		/* Parsing context */
 	}
 #endif
 
+	return 1;
+tarantool_error:
+	pParse->rc = SQL_TARANTOOL_ERROR;
+	pParse->nErr++;
 	return 1;
 }
 
diff --git a/src/box/sql/sqlInt.h b/src/box/sql/sqlInt.h
index 82aad0077..026a409c4 100644
--- a/src/box/sql/sqlInt.h
+++ b/src/box/sql/sqlInt.h
@@ -3218,7 +3218,31 @@ void sqlTreeViewWith(TreeView *, const With *);
 void sqlSetString(char **, sql *, const char *);
 void sqlErrorMsg(Parse *, const char *, ...);
 void sqlDequote(char *);
-void sqlNormalizeName(char *z);
+
+/**
+ * Perform SQL name normalization: cast name to the upper-case
+ * (via Unicode Character Folding). Casing is locale-dependent
+ * and context-sensitive. The result may be longer or shorter
+ * than the original. The source string and the destination buffer
+ * must not overlap.
+ * For example, ß is converted to SS.
+ * The result is similar to SQL UPPER function.
+ * @param dst A buffer for the result string. The result will be
+ *            NUL-terminated if the buffer is large enough. The
+ *            contents is undefined in case of failure.
+ * @param dst_size The size of the buffer (number of bytes).
+ *                 If it is 0, then dest may be NULL and the
+ *                 function will only return the length of the
+ *                 result without writing any of the result
+ *                 string.
+ * @param src The original string.
+ * @param src_len The length of the original string.
+ * @retval The length of the result string, on success.
+ * @retval < 0 otherwise.
+ */
+int
+sql_normalize_name(char *dst, int dst_size, const char *src, int src_len);
+
 void sqlTokenInit(Token *, char *);
 int sqlKeywordCode(const unsigned char *, int);
 int sqlRunParser(Parse *, const char *, char **);
diff --git a/src/box/sql/trigger.c b/src/box/sql/trigger.c
index af62a5eff..d43c7bad4 100644
--- a/src/box/sql/trigger.c
+++ b/src/box/sql/trigger.c
@@ -278,15 +278,23 @@ sql_trigger_select_step(struct sql *db, struct Select *select)
 static struct TriggerStep *
 sql_trigger_step_allocate(struct sql *db, u8 op, struct Token *target_name)
 {
-	int size = sizeof(TriggerStep) + target_name->n + 1;
-	struct TriggerStep *trigger_step = sqlDbMallocZero(db, size);
+	struct TriggerStep *trigger_step = NULL;
+	int name_len =
+		sql_normalize_name(NULL, 0, target_name->z, target_name->n);
+	if (name_len < 0)
+		return NULL;
+	trigger_step = sqlDbMallocZero(db, sizeof(TriggerStep) + name_len + 1);
 	if (trigger_step == NULL) {
-		diag_set(OutOfMemory, size, "sqlDbMallocZero", "trigger_step");
+		diag_set(OutOfMemory, name_len + 1, "sqlDbMallocZero",
+			 "trigger_step");
 		return NULL;
 	}
 	char *z = (char *)&trigger_step[1];
-	memcpy(z, target_name->z, target_name->n);
-	sqlNormalizeName(z);
+	if (sql_normalize_name(z, name_len + 1, target_name->z,
+				target_name->n) < 0) {
+		sqlDbFree(db, trigger_step);
+		return NULL;
+	}
 	trigger_step->zTarget = z;
 	trigger_step->op = op;
 	return trigger_step;
diff --git a/src/box/sql/util.c b/src/box/sql/util.c
index c89e2e8ab..924149d62 100644
--- a/src/box/sql/util.c
+++ b/src/box/sql/util.c
@@ -41,6 +41,7 @@
 #if HAVE_ISNAN || SQL_HAVE_ISNAN
 #include <math.h>
 #endif
+#include <unicode/ucasemap.h>
 
 /*
  * Routine needed to support the testcase() macro.
@@ -292,23 +293,34 @@ sqlDequote(char *z)
 	z[j] = 0;
 }
 
-
-void
-sqlNormalizeName(char *z)
+int
+sql_normalize_name(char *dst, int dst_size, const char *src, int src_len)
 {
-	char quote;
-	int i=0;
-	if (z == 0)
-		return;
-	quote = z[0];
-	if (sqlIsquote(quote)){
-		sqlDequote(z);
-		return;
-	}
-	while(z[i]!=0){
-		z[i] = (char)sqlToupper(z[i]);
-		i++;
+	assert(src != NULL);
+	if (sqlIsquote(src[0])){
+		if (dst_size == 0)
+			return src_len;
+		memcpy(dst, src, src_len);
+		dst[src_len] = '\0';
+		sqlDequote(dst);
+		return src_len;
 	}
+	UErrorCode status = U_ZERO_ERROR;
+	UCaseMap *case_map = ucasemap_open(NULL, 0, &status);
+	if (case_map == NULL)
+		goto error;
+	int len = ucasemap_utf8ToUpper(case_map, dst, dst_size, src, src_len,
+				       &status);
+	ucasemap_close(case_map);
+	if (!U_SUCCESS(status) &&
+	    !(dst_size == 0 && status == U_BUFFER_OVERFLOW_ERROR))
+		goto error;
+	return len;
+error:
+	diag_set(CollationError,
+		 "string conversion to the uppercase failed: %s",
+		 u_errorName(status));
+	return -1;
 }
 
 /*
diff --git a/test/sql-tap/identifier_case.test.lua b/test/sql-tap/identifier_case.test.lua
index 923d5e66a..aaa4cc85a 100755
--- a/test/sql-tap/identifier_case.test.lua
+++ b/test/sql-tap/identifier_case.test.lua
@@ -1,6 +1,6 @@
 #!/usr/bin/env tarantool
 test = require("sqltester")
-test:plan(71)
+test:plan(73)
 
 local test_prefix = "identifier_case-"
 
@@ -13,8 +13,10 @@ local data = {
     { 6,  [[ "Table1" ]], {0} },
     -- non ASCII characters case is not supported
     { 7,  [[ русский ]], {0} },
-    { 8,  [[ Русский ]], {0} },
-    { 9,  [[ "русский" ]], {"/already exists/"} },
+    { 8,  [[ "русский" ]], {0} },
+    { 9,  [[ Großschreibweise ]], {0} },
+    { 10,  [[ Русский ]], {"/already exists/"} },
+    { 11,  [[ Grossschreibweise ]], {"/already exists/"} },
 }
 
 for _, row in ipairs(data) do
@@ -35,7 +37,7 @@ data = {
     { 5, [[ "table1" ]], {5}},
     { 6, [[ "Table1" ]], {6}},
     { 7, [[ русский ]], {7}},
-    { 8, [[ Русский ]], {8}},
+    { 8, [[ "русский" ]], {8}},
 }
 
 for _, row in ipairs(data) do
@@ -66,7 +68,7 @@ test:do_test(
     function ()
         return test:drop_all_tables()
     end,
-    3)
+    4)
 
 data = {
     { 1,  [[ columnn ]], {0} },
-- 
2.20.1