[Tarantool-patches] [PATCH v1 2/8] sql: refactor CHAR_LENGTH() function

Mergen Imeev imeevma at tarantool.org
Mon Nov 1 13:20:30 MSK 2021


Thank you for the review! My answer, diff and new patch are below. I also
dropped some tests, including the ones I added in the previous version
of the patch. I did this because the behavior of the ICU functions
caused some of the CI targets to crash. I added a doc-bot request for
this issue in the commit message.

On Fri, Oct 29, 2021 at 12:11:37AM +0200, Vladislav Shpilevoy wrote:
> Thanks for the fixes!
> 
> >>> +/** Implementation of the CHAR_LENGTH() function. */
> >>> +static inline uint8_t
> >>> +utf8_len_char(char c)
> >>> +{
> >>> +	uint8_t u = (uint8_t)c;
> >>> +	return 1 + (u >= 0xc2) + (u >= 0xe0) + (u >= 0xf0);
> >>
> >> It is not that simple really. Consider either using the old
> >> lengthFunc() and other sqlite utf8 helpers or use the approach
> >> similar to utf8_len() in utf8.c. It uses ICU macro U8_NEXT()
> >> and has handling for special symbols like U_SENTINEL.
> >>
> >> Otherwise you are making already third version of functions to
> >> work with utf8.
> >>
> >> I would even prefer to refactor lengthFunc() to stop using sqlite
> >> legacy and drop sqlite utf8 entirely, but I suspect it might be
> >> not so trivial to do and should be done later.
> > I was able to use ucnv_getNextUChar() here. In fact, I was able to use this
> > functions in all the places in this patch-set where we had to work with my or
> > SQLite functions that work with UTF8 characters. I think I can remove sql/utf.c
> > in the next patchset, since I refactor the LENGTH() and UNICODE() functions
> > there.
> 
> Discussed in private that U8_NEXT() would work here just fine.
> ucnv_getNextUChar() is an overkill. In other places of the patchset too.
> 
Thank you for the suggestion! I replaced ucnv_getNextUChar() with U8_NEXT().


Diff:

diff --git a/src/box/sql/func.c b/src/box/sql/func.c
index faef0eef3..bc7a1fedd 100644
--- a/src/box/sql/func.c
+++ b/src/box/sql/func.c
@@ -269,11 +269,10 @@ func_char_length(struct sql_context *ctx, int argc, struct Mem *argv)
 		return;
 	assert(mem_is_str(arg) && arg->n >= 0);
 	uint32_t len = 0;
-	UErrorCode err = U_ZERO_ERROR;
-	const char *pos = arg->z;
-	const char *end = arg->z + arg->n;
-	while (pos < end) {
-		ucnv_getNextUChar(icu_utf8_conv, &pos, end, &err);
+	int offset = 0;
+	while (offset < arg->n) {
+		UChar32 c;
+		U8_NEXT((uint8_t *)arg->z, offset, arg->n, c);
 		++len;
 	}
 	mem_set_uint(ctx->pOut, len);
diff --git a/test/sql-tap/badutf1.test.lua b/test/sql-tap/badutf1.test.lua
index 27f17168b..ce8354840 100755
--- a/test/sql-tap/badutf1.test.lua
+++ b/test/sql-tap/badutf1.test.lua
@@ -1,6 +1,6 @@
 #!/usr/bin/env tarantool
 local test = require("sqltester")
-test:plan(23)
+test:plan(19)
 
 --!./tcltestrunner.lua
 -- 2007 May 15
@@ -253,26 +253,6 @@ test:do_test(
 
 test:do_test(
     "badutf-3.5",
-    function()
-        return test:execsql2("SELECT length('\x61\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
-    end, {
-        -- <badutf-3.5>
-        "X", 12
-        -- </badutf-3.5>
-    })
-
-test:do_test(
-    "badutf-3.6",
-    function()
-        return test:execsql2("SELECT length('\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
-    end, {
-        -- <badutf-3.6>
-        "X", 11
-        -- </badutf-3.6>
-    })
-
-test:do_test(
-    "badutf-3.7",
     function()
         return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
     end, {
@@ -281,26 +261,6 @@ test:do_test(
         -- </badutf-3.7>
     })
 
-test:do_test(
-    "badutf-3.8",
-    function()
-        return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\xf0\x80\x80\x80\x80') AS x")
-    end, {
-        -- <badutf-3.8>
-        "X", 10
-        -- </badutf-3.8>
-    })
-
-test:do_test(
-    "badutf-3.9",
-    function()
-        return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\xf0\x80\x80\x80\xff') AS x")
-    end, {
-        -- <badutf-3.9>
-        "X", 10
-        -- </badutf-3.9>
-    })
-
 test:do_test(
     "badutf-4.1",
     function()
diff --git a/test/sql-tap/built-in-functions.test.lua b/test/sql-tap/built-in-functions.test.lua
index 7fe987abc..6fae811dc 100755
--- a/test/sql-tap/built-in-functions.test.lua
+++ b/test/sql-tap/built-in-functions.test.lua
@@ -1,6 +1,6 @@
 #!/usr/bin/env tarantool
 local test = require("sqltester")
-test:plan(54)
+test:plan(52)
 
 --
 -- Make sure that number of arguments check is checked properly for SQL built-in
@@ -545,28 +545,4 @@ test:do_test(
         {name = "COLUMN_2", type = "scalar"},
     })
 
--- gh-4145: Make sure the character is now checked when calculating its length.
-
--- Character with UTF-8 code F0808080 does not exist.
-test:do_execsql_test(
-    "builtins-4.1",
-    [[
-        SELECT CHAR_LENGTH(CAST(x'f0808080' AS STRING));
-    ]],
-    {
-        4
-    }
-)
-
--- Character with UTF-8 code F0908080 is '𐀀'.
-test:do_execsql_test(
-    "builtins-4.2",
-    [[
-        SELECT CHAR_LENGTH(CAST(x'f0908080' AS STRING));
-    ]],
-    {
-        1
-    }
-)
-
 test:finish_test()
diff --git a/test/sql-tap/func3.test.lua b/test/sql-tap/func3.test.lua
index 6999fea67..7ed0bb27f 100755
--- a/test/sql-tap/func3.test.lua
+++ b/test/sql-tap/func3.test.lua
@@ -1,6 +1,6 @@
 #!/usr/bin/env tarantool
 local test = require("sqltester")
-test:plan(35)
+test:plan(33)
 
 --!./tcltestrunner.lua
 -- 2010 August 27
@@ -294,11 +294,9 @@ suits[2] = {str = '\x80', len = 1}
 suits[3] = {str = '\x61\x62\x63', len = 3}
 suits[4] = {str = '\x7f\x80\x81', len = 3}
 suits[5] = {str = '\x61\xc0', len = 2}
-suits[6] = {str = '\x61\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 12}
-suits[7] = {str = '\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 11}
-suits[8] = {str = '\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 10}
-suits[9] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\x80', len = 7}
-suits[10] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\xff', len = 7}
+suits[6] = {str = '\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 10}
+suits[7] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\x80', len = 7}
+suits[8] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\xff', len = 7}
 
 for k,v in pairs(suits) do
     test:do_execsql_test(


New patch:

commit 918fd18760f4491b81266b279dc5c4b581dc1ed6
Author: Mergen Imeev <imeevma at gmail.com>
Date:   Fri Oct 1 11:12:39 2021 +0300

    sql: rework CHAR_LENGTH() function
    
    The CHAR_LENGTH() and CHARACTER_LENGTH() functions now use ICU functions
    to determine the length of a string.
    
    Part of #4145
    
    @TarantoolBot document
    Title: Invalid UTF-8 values and ICU
    
    Invalid UTF-8 values may be handled differently depending on the ICU
    version. For example, for this request:
    ```
    SELECT CHAR_LENGTH(CAST(x'f0808080' AS STRING));
    ```
    
    On `centos 7` with `libicu-devel-50.2-4.el7_7.x86_64` the result will
    be:
    ```
    tarantool> box.execute([[SELECT CHAR_LENGTH(CAST(x'f0808080' AS STRING));]])
    ---
    - metadata:
      - name: COLUMN_1
        type: integer
      rows:
      - [1]
    ...
    ```
    
    On `ubuntu 20.04` with `libicu-dev` version `66.1-2ubuntu2` the result
    will be:
    ```
    tarantool> box.execute([[SELECT CHAR_LENGTH(CAST(x'f0808080' AS STRING));]])
    ---
    - metadata:
      - name: COLUMN_1
        type: integer
      rows:
      - [4]
    ...
    ```

diff --git a/src/box/sql/func.c b/src/box/sql/func.c
index dbeb38bee..bc7a1fedd 100644
--- a/src/box/sql/func.c
+++ b/src/box/sql/func.c
@@ -258,6 +258,26 @@ func_abs_double(struct sql_context *ctx, int argc, struct Mem *argv)
 	mem_set_double(ctx->pOut, arg->u.r < 0 ? -arg->u.r : arg->u.r);
 }
 
+/** Implementation of the CHAR_LENGTH() function. */
+static void
+func_char_length(struct sql_context *ctx, int argc, struct Mem *argv)
+{
+	assert(argc == 1);
+	(void)argc;
+	struct Mem *arg = &argv[0];
+	if (mem_is_null(arg))
+		return;
+	assert(mem_is_str(arg) && arg->n >= 0);
+	uint32_t len = 0;
+	int offset = 0;
+	while (offset < arg->n) {
+		UChar32 c;
+		U8_NEXT((uint8_t *)arg->z, offset, arg->n, c);
+		++len;
+	}
+	mem_set_uint(ctx->pOut, len);
+}
+
 static const unsigned char *
 mem_as_ustr(struct Mem *mem)
 {
@@ -1912,8 +1932,8 @@ static struct sql_func_definition definitions[] = {
 	{"AVG", 1, {FIELD_TYPE_INTEGER}, FIELD_TYPE_INTEGER, step_avg, fin_avg},
 	{"AVG", 1, {FIELD_TYPE_DOUBLE}, FIELD_TYPE_DOUBLE, step_avg, fin_avg},
 	{"CHAR", -1, {FIELD_TYPE_INTEGER}, FIELD_TYPE_STRING, charFunc, NULL},
-	{"CHAR_LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, lengthFunc,
-	 NULL},
+	{"CHAR_LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER,
+	 func_char_length, NULL},
 	{"COALESCE", -1, {FIELD_TYPE_ANY}, FIELD_TYPE_SCALAR, sql_builtin_stub,
 	 NULL},
 	{"COUNT", 0, {}, FIELD_TYPE_INTEGER, step_count, fin_count},
@@ -1957,7 +1977,7 @@ static struct sql_func_definition definitions[] = {
 	{"LEAST", -1, {FIELD_TYPE_STRING}, FIELD_TYPE_STRING, minmaxFunc, NULL},
 	{"LEAST", -1, {FIELD_TYPE_SCALAR}, FIELD_TYPE_SCALAR, minmaxFunc, NULL},
 
-	{"LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, lengthFunc,
+	{"LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, func_char_length,
 	 NULL},
 	{"LENGTH", 1, {FIELD_TYPE_VARBINARY}, FIELD_TYPE_INTEGER, lengthFunc,
 	 NULL},
diff --git a/test/sql-tap/badutf1.test.lua b/test/sql-tap/badutf1.test.lua
index b25436186..ce8354840 100755
--- a/test/sql-tap/badutf1.test.lua
+++ b/test/sql-tap/badutf1.test.lua
@@ -1,6 +1,6 @@
 #!/usr/bin/env tarantool
 local test = require("sqltester")
-test:plan(23)
+test:plan(19)
 
 --!./tcltestrunner.lua
 -- 2007 May 15
@@ -253,26 +253,6 @@ test:do_test(
 
 test:do_test(
     "badutf-3.5",
-    function()
-        return test:execsql2("SELECT length('\x61\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
-    end, {
-        -- <badutf-3.5>
-        "X", 12
-        -- </badutf-3.5>
-    })
-
-test:do_test(
-    "badutf-3.6",
-    function()
-        return test:execsql2("SELECT length('\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
-    end, {
-        -- <badutf-3.6>
-        "X", 11
-        -- </badutf-3.6>
-    })
-
-test:do_test(
-    "badutf-3.7",
     function()
         return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
     end, {
@@ -281,26 +261,6 @@ test:do_test(
         -- </badutf-3.7>
     })
 
-test:do_test(
-    "badutf-3.8",
-    function()
-        return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\xf0\x80\x80\x80\x80') AS x")
-    end, {
-        -- <badutf-3.8>
-        "X", 7
-        -- </badutf-3.8>
-    })
-
-test:do_test(
-    "badutf-3.9",
-    function()
-        return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\xf0\x80\x80\x80\xff') AS x")
-    end, {
-        -- <badutf-3.9>
-        "X", 7
-        -- </badutf-3.9>
-    })
-
 test:do_test(
     "badutf-4.1",
     function()
diff --git a/test/sql-tap/func3.test.lua b/test/sql-tap/func3.test.lua
index 7f1d8d33c..7ed0bb27f 100755
--- a/test/sql-tap/func3.test.lua
+++ b/test/sql-tap/func3.test.lua
@@ -1,6 +1,6 @@
 #!/usr/bin/env tarantool
 local test = require("sqltester")
-test:plan(35)
+test:plan(33)
 
 --!./tcltestrunner.lua
 -- 2010 August 27
@@ -294,11 +294,9 @@ suits[2] = {str = '\x80', len = 1}
 suits[3] = {str = '\x61\x62\x63', len = 3}
 suits[4] = {str = '\x7f\x80\x81', len = 3}
 suits[5] = {str = '\x61\xc0', len = 2}
-suits[6] = {str = '\x61\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 12}
-suits[7] = {str = '\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 11}
-suits[8] = {str = '\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 10}
-suits[9] = {str = '\x80\x80\x80\x80\x80\xf0\x80\x80\x80\x80', len = 7}
-suits[10] = {str = '\x80\x80\x80\x80\x80\xf0\x80\x80\x80\xff', len = 7}
+suits[6] = {str = '\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 10}
+suits[7] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\x80', len = 7}
+suits[8] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\xff', len = 7}
 
 for k,v in pairs(suits) do
     test:do_execsql_test(


More information about the Tarantool-patches mailing list