[tarantool-patches] Re: [PATCH v1 1/1] sql: fix perf degradation on name normalization

Kirill Shcherbatov kshcherbatov at tarantool.org
Thu Apr 4 21:08:26 MSK 2019


> 1. Missed diag_set.
> 2. Again.
Sorry, accidentally dropped it

> 3. That name was ok to be local for utf8.c, but now
> it is global, and 'root_map' in the whole scope of
> tarantool looks ambiguous. What it is? MessagePack map?
> Lua table map? RB-tree map? What is 'root'? I propose
> 
>     icu_ucase_default_map
Songs good.

==========================================
Diff:
==========================================
diff --git a/src/box/sql/util.c b/src/box/sql/util.c
index 2f3c17c9a..52a86635c 100644
--- a/src/box/sql/util.c
+++ b/src/box/sql/util.c
@@ -268,9 +268,9 @@ sql_normalize_name(char *dst, int dst_size, const char *src, int src_len)
 		return src_len + 1;
 	}
 	UErrorCode status = U_ZERO_ERROR;
-	assert(root_map != NULL);
-	int len = ucasemap_utf8ToUpper(root_map, dst, dst_size, src,
-				       src_len, &status);
+	assert(icu_ucase_default_map != NULL);
+	int len = ucasemap_utf8ToUpper(icu_ucase_default_map, dst, dst_size,
+				       src, src_len, &status);
 	assert(U_SUCCESS(status) || status == U_BUFFER_OVERFLOW_ERROR);
 	return len + 1;
 }
@@ -310,7 +310,7 @@ sql_normalized_name_region_new(struct region *r, const char *name, int len)
 	size_t region_svp = region_used(r);
 	char *res = region_alloc(r, size);
 	if (res == NULL)
-		return NULL;
+		goto oom_error;
 	int rc = sql_normalize_name(res, size, name, len);
 	if (rc <= size)
 		return res;
@@ -319,10 +319,14 @@ sql_normalized_name_region_new(struct region *r, const char *name, int len)
 	region_truncate(r, region_svp);
 	res = region_alloc(r, size);
 	if (res == NULL)
-		return NULL;
+		goto oom_error;
 	if (sql_normalize_name(res, size, name, len) > size)
 		unreachable();
 	return res;
+
+oom_error:
+	diag_set(OutOfMemory, size, "region_alloc", "res");
+	return NULL;
 }
 
 /* Convenient short-hand */
diff --git a/src/lib/coll/coll.c b/src/lib/coll/coll.c
index 21f2489d4..495eb6d64 100644
--- a/src/lib/coll/coll.c
+++ b/src/lib/coll/coll.c
@@ -37,7 +37,7 @@
 #include <unicode/ucasemap.h>
 #include <trivia/config.h>
 
-struct UCaseMap *root_map = NULL;
+struct UCaseMap *icu_ucase_default_map = NULL;
 
 #define mh_name _coll
 struct mh_coll_key_t {
@@ -419,14 +419,14 @@ coll_init()
 {
 	UErrorCode err = U_ZERO_ERROR;
 	coll_cache = mh_coll_new();
-	root_map = ucasemap_open("", 0, &err);
-	if (coll_cache == NULL || root_map == NULL)
+	icu_ucase_default_map = ucasemap_open("", 0, &err);
+	if (coll_cache == NULL || icu_ucase_default_map == NULL)
 		panic("Can not create system collations cache");
 }
 
 void
 coll_free()
 {
-	ucasemap_close(root_map);
+	ucasemap_close(icu_ucase_default_map);
 	mh_coll_delete(coll_cache);
 }
diff --git a/src/lib/coll/coll.h b/src/lib/coll/coll.h
index 713c9646e..1f5b29a2e 100644
--- a/src/lib/coll/coll.h
+++ b/src/lib/coll/coll.h
@@ -54,7 +54,7 @@ typedef size_t (*coll_hint_f)(const char *s, size_t s_len, char *buf,
 struct UCollator;
 
 /** Default universal casemap for case transformations. */
-extern struct UCaseMap *root_map;
+extern struct UCaseMap *icu_ucase_default_map;
 
 /**
  * Collation. It has no unique features like name, id or owner.
diff --git a/src/lua/utf8.c b/src/lua/utf8.c
index 311698bda..d829b57b4 100644
--- a/src/lua/utf8.c
+++ b/src/lua/utf8.c
@@ -61,11 +61,11 @@ utf8_str_to_case(struct lua_State *L, const char *src, int src_bsize,
 		}
 		int real_bsize;
 		if (is_to_upper) {
-			real_bsize = ucasemap_utf8ToUpper(root_map, dst,
+			real_bsize = ucasemap_utf8ToUpper(icu_ucase_default_map, dst,
 							  dst_bsize, src,
 							  src_bsize, &err);
 		} else {
-			real_bsize = ucasemap_utf8ToLower(root_map, dst,
+			real_bsize = ucasemap_utf8ToLower(icu_ucase_default_map, dst,
 							  dst_bsize, src,
 							  src_bsize, &err);
 		}
@@ -444,7 +444,7 @@ static const struct luaL_Reg utf8_lib[] = {
 void
 tarantool_lua_utf8_init(struct lua_State *L)
 {
-	assert(root_map != NULL);
+	assert(icu_ucase_default_map != NULL);
 	struct coll_def def;
 	memset(&def, 0, sizeof(def));
 	def.icu.strength = COLL_ICU_STRENGTH_TERTIARY;

==========================================
Patch:
==========================================
>From a779238e296a78830c68bdee305bb305336c5a34 Mon Sep 17 00:00:00 2001
Message-Id: <a779238e296a78830c68bdee305bb305336c5a34.1554401202.git.kshcherbatov at tarantool.org>
From: Kirill Shcherbatov <kshcherbatov at tarantool.org>
Date: Thu, 4 Apr 2019 15:53:02 +0300
Subject: [PATCH v1 1/1] sql: fix perf degradation on name normalization

Because sql_normalize_name used to be called twice - to estimate
the size of the name buffer and to process data querying the
UCaseMap object each time performance in SQL felt by 15%.

This patch should eliminate some of the negative effects of using
ICU for name normalization.

Thanks @avtikhon for a bechmark

Follow up e7558062d3559e6bcc18f91eacb88269428321dc
---
 src/box/sql/expr.c       | 29 +++++++--------
 src/box/sql/parse.y      | 21 +++++------
 src/box/sql/sqlInt.h     |  9 ++---
 src/box/sql/trigger.c    | 22 +++++++-----
 src/box/sql/util.c       | 77 ++++++++++++++++++++++------------------
 src/lib/coll/coll.c      |  8 ++++-
 src/lib/coll/coll.h      |  3 ++
 src/lua/utf8.c           | 15 ++------
 test/sql/errinj.result   |  9 ++++-
 test/sql/errinj.test.lua |  2 ++
 10 files changed, 105 insertions(+), 90 deletions(-)

diff --git a/src/box/sql/expr.c b/src/box/sql/expr.c
index 4b98bd175..0d61be344 100644
--- a/src/box/sql/expr.c
+++ b/src/box/sql/expr.c
@@ -965,22 +965,13 @@ sql_expr_new(struct sql *db, int op, const struct Token *token)
 struct Expr *
 sql_expr_new_dequoted(struct sql *db, int op, const struct Token *token)
 {
-	int extra_size = 0;
-	bool is_name = false;
+	int extra_size = 0, rc;
 	if (token != NULL) {
 		int val;
 		assert(token->z != NULL || token->n == 0);
 		if (sql_expr_token_to_int(op, token, &val) == 0)
 			return sql_expr_new_int(db, val);
-		is_name = op == TK_ID || op == TK_COLLATE || op == TK_FUNCTION;
-		if (is_name) {
-			extra_size = sql_normalize_name(NULL, 0, token->z,
-							token->n);
-			if (extra_size < 0)
-				return NULL;
-		} else {
-			extra_size = token->n + 1;
-		}
+		extra_size = token->n + 1;
 	}
 	struct Expr *e = sql_expr_new_empty(db, op, extra_size);
 	if (e == NULL || token == NULL || token->n == 0)
@@ -988,14 +979,20 @@ sql_expr_new_dequoted(struct sql *db, int op, const struct Token *token)
 	e->u.zToken = (char *) &e[1];
 	if (token->z[0] == '"')
 		e->flags |= EP_DblQuoted;
-	if (! is_name) {
+	if (op != TK_ID && op != TK_COLLATE && op != TK_FUNCTION) {
 		memcpy(e->u.zToken, token->z, token->n);
 		e->u.zToken[token->n] = '\0';
 		sqlDequote(e->u.zToken);
-	} else if (sql_normalize_name(e->u.zToken, extra_size, token->z,
-				      token->n) < 0) {
-		sql_expr_delete(db, e, false);
-		return NULL;
+	} else if ((rc = sql_normalize_name(e->u.zToken, extra_size, token->z,
+					    token->n)) > extra_size) {
+		extra_size = rc;
+		e = sqlDbReallocOrFree(db, e, sizeof(*e) + extra_size);
+		if (e == NULL)
+			return NULL;
+		e->u.zToken = (char *) &e[1];
+		if (sql_normalize_name(e->u.zToken, extra_size, token->z,
+				       token->n) > extra_size)
+			unreachable();
 	}
 	return e;
 }
diff --git a/src/box/sql/parse.y b/src/box/sql/parse.y
index ed5c05436..4581ef6e7 100644
--- a/src/box/sql/parse.y
+++ b/src/box/sql/parse.y
@@ -893,15 +893,8 @@ idlist(A) ::= nm(Y). {
   ** that created the expression.
   */
   static void spanExpr(ExprSpan *pOut, Parse *pParse, int op, Token t){
-    int name_sz = 0;
     struct Expr *p = NULL;
-    if (op != TK_VARIABLE) {
-      name_sz = sql_normalize_name(NULL, 0, t.z, t.n);
-      if (name_sz < 0)
-        goto tarantool_error;
-    } else {
-      name_sz = t.n + 1;
-    }
+    int name_sz = t.n + 1;
     p = sqlDbMallocRawNN(pParse->db, sizeof(Expr) + name_sz);
     if( p ){
       memset(p, 0, sizeof(Expr));
@@ -936,8 +929,16 @@ idlist(A) ::= nm(Y). {
       p->iAgg = -1;
       p->u.zToken = (char*)&p[1];
       if (op != TK_VARIABLE) {
-        if (sql_normalize_name(p->u.zToken, name_sz, t.z, t.n) < 0)
-          goto tarantool_error;
+        int rc = sql_normalize_name(p->u.zToken, name_sz, t.z, t.n);
+        if (rc > name_sz) {
+          name_sz = rc;
+          p = sqlDbReallocOrFree(pParse->db, p, sizeof(*p) + name_sz);
+          if (p == NULL)
+            goto tarantool_error;
+          p->u.zToken = (char *) &p[1];
+          if (sql_normalize_name(p->u.zToken, name_sz, t.z, t.n) > name_sz)
+              unreachable();
+        }
       } else {
         memcpy(p->u.zToken, t.z, t.n);
         p->u.zToken[t.n] = 0;
diff --git a/src/box/sql/sqlInt.h b/src/box/sql/sqlInt.h
index d8dc03284..b322602dc 100644
--- a/src/box/sql/sqlInt.h
+++ b/src/box/sql/sqlInt.h
@@ -3175,15 +3175,10 @@ void sqlDequote(char *);
  * @param dst A buffer for the result string. The result will be
  *        0-terminated if the buffer is large enough. The contents
  *        is undefined in case of failure.
- * @param dst_size The size of the buffer (number of bytes). If it
- *        is 0, then dest may be NULL and the function will only
- *        return the length of the result without writing any of
- *        the result string
+ * @param dst_size The size of the buffer (number of bytes).
  * @param src The original string.
  * @param src_len The length of the original string.
- * @retval The count of bytes written(or need to be written) on
- *         success.
- * @retval < 0 Otherwise. The diag message is set.
+ * @retval The count of bytes written (or need to be written).
  */
 int
 sql_normalize_name(char *dst, int dst_size, const char *src, int src_len);
diff --git a/src/box/sql/trigger.c b/src/box/sql/trigger.c
index 20b2c970c..14e4198a8 100644
--- a/src/box/sql/trigger.c
+++ b/src/box/sql/trigger.c
@@ -278,10 +278,7 @@ sql_trigger_select_step(struct sql *db, struct Select *select)
 static struct TriggerStep *
 sql_trigger_step_new(struct sql *db, u8 op, struct Token *target_name)
 {
-	int name_size =
-		sql_normalize_name(NULL, 0, target_name->z, target_name->n);
-	if (name_size < 0)
-		return NULL;
+	int name_size = target_name->n + 1;
 	int size = sizeof(struct TriggerStep) + name_size;
 	struct TriggerStep *trigger_step = sqlDbMallocZero(db, size);
 	if (trigger_step == NULL) {
@@ -289,10 +286,19 @@ sql_trigger_step_new(struct sql *db, u8 op, struct Token *target_name)
 		return NULL;
 	}
 	char *z = (char *)&trigger_step[1];
-	if (sql_normalize_name(z, name_size, target_name->z,
-			       target_name->n) < 0) {
-		sqlDbFree(db, trigger_step);
-		return NULL;
+	int rc = sql_normalize_name(z, name_size, target_name->z,
+				    target_name->n);
+	if (rc > name_size) {
+		name_size = rc;
+		trigger_step = sqlDbReallocOrFree(db, trigger_step,
+						  sizeof(*trigger_step) +
+						  name_size);
+		if (trigger_step == NULL)
+			return NULL;
+		z = (char *) &trigger_step[1];
+		if (sql_normalize_name(z, name_size, target_name->z,
+				       target_name->n) > name_size)
+			unreachable();
 	}
 	trigger_step->zTarget = z;
 	trigger_step->op = op;
diff --git a/src/box/sql/util.c b/src/box/sql/util.c
index a13efa682..52a86635c 100644
--- a/src/box/sql/util.c
+++ b/src/box/sql/util.c
@@ -41,6 +41,7 @@
 #if HAVE_ISNAN || SQL_HAVE_ISNAN
 #include <math.h>
 #endif
+#include "coll/coll.h"
 #include <unicode/ucasemap.h>
 #include "errinj.h"
 
@@ -259,67 +260,73 @@ int
 sql_normalize_name(char *dst, int dst_size, const char *src, int src_len)
 {
 	assert(src != NULL);
+	assert(dst != NULL && dst_size > 0);
 	if (sqlIsquote(src[0])){
-		if (dst_size == 0)
-			return src_len + 1;
 		memcpy(dst, src, src_len);
 		dst[src_len] = '\0';
 		sqlDequote(dst);
 		return src_len + 1;
 	}
 	UErrorCode status = U_ZERO_ERROR;
-	ERROR_INJECT(ERRINJ_SQL_NAME_NORMALIZATION, {
-		status = U_MEMORY_ALLOCATION_ERROR;
-		goto error;
-	});
-	UCaseMap *case_map = ucasemap_open(NULL, 0, &status);
-	if (case_map == NULL)
-		goto error;
-	int len = ucasemap_utf8ToUpper(case_map, dst, dst_size, src, src_len,
-				       &status);
-	ucasemap_close(case_map);
-	assert(U_SUCCESS(status) ||
-	       (dst_size == 0 && status == U_BUFFER_OVERFLOW_ERROR));
+	assert(icu_ucase_default_map != NULL);
+	int len = ucasemap_utf8ToUpper(icu_ucase_default_map, dst, dst_size,
+				       src, src_len, &status);
+	assert(U_SUCCESS(status) || status == U_BUFFER_OVERFLOW_ERROR);
 	return len + 1;
-error:
-	diag_set(CollationError,
-		 "string conversion to the uppercase failed: %s",
-		 u_errorName(status));
-	return -1;
 }
 
 char *
 sql_normalized_name_db_new(struct sql *db, const char *name, int len)
 {
-	int size = sql_normalize_name(NULL, 0, name, len);
-	if (size < 0)
+	int size = len + 1;
+	ERROR_INJECT(ERRINJ_SQL_NAME_NORMALIZATION, {
+		diag_set(OutOfMemory, size, "sqlDbMallocRawNN", "res");
 		return NULL;
+	});
 	char *res = sqlDbMallocRawNN(db, size);
-	if (res == NULL) {
-		diag_set(OutOfMemory, size, "sqlDbMallocRawNN", "res");
+	if (res == NULL)
 		return NULL;
-	}
-	if (sql_normalize_name(res, size, name, len) < 0) {
-		sqlDbFree(db, res);
+	int rc = sql_normalize_name(res, size, name, len);
+	if (rc <= size)
+		return res;
+
+	size = rc;
+	res = sqlDbReallocOrFree(db, res, size);
+	if (res == NULL)
 		return NULL;
-	}
+	if (sql_normalize_name(res, size, name, len) > size)
+		unreachable();
 	return res;
 }
 
 char *
 sql_normalized_name_region_new(struct region *r, const char *name, int len)
 {
-	int size = sql_normalize_name(NULL, 0, name, len);
-	if (size < 0)
-		return NULL;
-	char *res = (char *) region_alloc(r, size);
-	if (res == NULL) {
+	int size = len + 1;
+	ERROR_INJECT(ERRINJ_SQL_NAME_NORMALIZATION, {
 		diag_set(OutOfMemory, size, "region_alloc", "res");
 		return NULL;
-	}
-	if (sql_normalize_name(res, size, name, len) < 0)
-		return NULL;
+	});
+	size_t region_svp = region_used(r);
+	char *res = region_alloc(r, size);
+	if (res == NULL)
+		goto oom_error;
+	int rc = sql_normalize_name(res, size, name, len);
+	if (rc <= size)
+		return res;
+
+	size = rc;
+	region_truncate(r, region_svp);
+	res = region_alloc(r, size);
+	if (res == NULL)
+		goto oom_error;
+	if (sql_normalize_name(res, size, name, len) > size)
+		unreachable();
 	return res;
+
+oom_error:
+	diag_set(OutOfMemory, size, "region_alloc", "res");
+	return NULL;
 }
 
 /* Convenient short-hand */
diff --git a/src/lib/coll/coll.c b/src/lib/coll/coll.c
index b83f0fdc7..495eb6d64 100644
--- a/src/lib/coll/coll.c
+++ b/src/lib/coll/coll.c
@@ -34,8 +34,11 @@
 #include "diag.h"
 #include "assoc.h"
 #include <unicode/ucol.h>
+#include <unicode/ucasemap.h>
 #include <trivia/config.h>
 
+struct UCaseMap *icu_ucase_default_map = NULL;
+
 #define mh_name _coll
 struct mh_coll_key_t {
 	const char *str;
@@ -414,13 +417,16 @@ coll_unref(struct coll *coll)
 void
 coll_init()
 {
+	UErrorCode err = U_ZERO_ERROR;
 	coll_cache = mh_coll_new();
-	if (coll_cache == NULL)
+	icu_ucase_default_map = ucasemap_open("", 0, &err);
+	if (coll_cache == NULL || icu_ucase_default_map == NULL)
 		panic("Can not create system collations cache");
 }
 
 void
 coll_free()
 {
+	ucasemap_close(icu_ucase_default_map);
 	mh_coll_delete(coll_cache);
 }
diff --git a/src/lib/coll/coll.h b/src/lib/coll/coll.h
index c505804f6..1f5b29a2e 100644
--- a/src/lib/coll/coll.h
+++ b/src/lib/coll/coll.h
@@ -53,6 +53,9 @@ typedef size_t (*coll_hint_f)(const char *s, size_t s_len, char *buf,
 
 struct UCollator;
 
+/** Default universal casemap for case transformations. */
+extern struct UCaseMap *icu_ucase_default_map;
+
 /**
  * Collation. It has no unique features like name, id or owner.
  * Only functional part - comparator, locale, ICU settings.
diff --git a/src/lua/utf8.c b/src/lua/utf8.c
index de026a921..d829b57b4 100644
--- a/src/lua/utf8.c
+++ b/src/lua/utf8.c
@@ -38,9 +38,6 @@
 
 extern struct ibuf *tarantool_lua_ibuf;
 
-/** Default universal casemap for case transformations. */
-static UCaseMap *root_map = NULL;
-
 /** Collations for cmp/casecmp functions. */
 static struct coll *unicode_coll = NULL;
 static struct coll *unicode_ci_coll = NULL;
@@ -64,11 +61,11 @@ utf8_str_to_case(struct lua_State *L, const char *src, int src_bsize,
 		}
 		int real_bsize;
 		if (is_to_upper) {
-			real_bsize = ucasemap_utf8ToUpper(root_map, dst,
+			real_bsize = ucasemap_utf8ToUpper(icu_ucase_default_map, dst,
 							  dst_bsize, src,
 							  src_bsize, &err);
 		} else {
-			real_bsize = ucasemap_utf8ToLower(root_map, dst,
+			real_bsize = ucasemap_utf8ToLower(icu_ucase_default_map, dst,
 							  dst_bsize, src,
 							  src_bsize, &err);
 		}
@@ -447,12 +444,7 @@ static const struct luaL_Reg utf8_lib[] = {
 void
 tarantool_lua_utf8_init(struct lua_State *L)
 {
-	UErrorCode err = U_ZERO_ERROR;
-	root_map = ucasemap_open("", 0, &err);
-	if (root_map == NULL) {
-		luaL_error(L, tt_sprintf("error in ICU ucasemap_open: %s",
-					 u_errorName(err)));
-	}
+	assert(icu_ucase_default_map != NULL);
 	struct coll_def def;
 	memset(&def, 0, sizeof(def));
 	def.icu.strength = COLL_ICU_STRENGTH_TERTIARY;
@@ -474,7 +466,6 @@ error_coll:
 void
 tarantool_lua_utf8_free()
 {
-	ucasemap_close(root_map);
 	if (unicode_coll != NULL)
 		coll_unref(unicode_coll);
 	if (unicode_ci_coll != NULL)
diff --git a/test/sql/errinj.result b/test/sql/errinj.result
index dead00f7d..5a427e855 100644
--- a/test/sql/errinj.result
+++ b/test/sql/errinj.result
@@ -450,7 +450,14 @@ errinj.set("ERRINJ_SQL_NAME_NORMALIZATION", true)
 ...
 box.execute("CREATE TABLE hello (id INT primary key,x INT,y INT);")
 ---
-- error: 'string conversion to the uppercase failed: U_MEMORY_ALLOCATION_ERROR'
+- error: Failed to allocate 6 bytes in sqlDbMallocRawNN for res
+...
+dummy_f = function(int) return 1 end
+---
+...
+box.internal.sql_create_function("counter1", "INT", dummy_f, -1, false)
+---
+- error: Failed to allocate 9 bytes in region_alloc for res
 ...
 errinj.set("ERRINJ_SQL_NAME_NORMALIZATION", false)
 ---
diff --git a/test/sql/errinj.test.lua b/test/sql/errinj.test.lua
index 736e66957..7f4b30baf 100644
--- a/test/sql/errinj.test.lua
+++ b/test/sql/errinj.test.lua
@@ -146,4 +146,6 @@ errinj.set("ERRINJ_WAL_DELAY", false)
 errinj = box.error.injection
 errinj.set("ERRINJ_SQL_NAME_NORMALIZATION", true)
 box.execute("CREATE TABLE hello (id INT primary key,x INT,y INT);")
+dummy_f = function(int) return 1 end
+box.internal.sql_create_function("counter1", "INT", dummy_f, -1, false)
 errinj.set("ERRINJ_SQL_NAME_NORMALIZATION", false)
-- 
2.21.0





More information about the Tarantool-patches mailing list