[Tarantool-patches] [PATCH v1 2/8] sql: rework CHAR_LENGTH() function

imeevma at tarantool.org imeevma at tarantool.org
Thu Nov 11 13:45:27 MSK 2021


The CHAR_LENGTH() and CHARACTER_LENGTH() functions, as well as LENGTH()
called with a STRING argument, now use ICU (the U8_NEXT macro) to
determine the length of a string in characters.

Part of #4145

@TarantoolBot document
Title: Invalid UTF-8 values and ICU

Invalid UTF-8 byte sequences may be counted differently depending on the
ICU version, so CHAR_LENGTH() can return different results on different
platforms. For example, for this request:
```
SELECT CHAR_LENGTH(CAST(x'f0808080' AS STRING));
```

On `centos 7` with `libicu-devel-50.2-4.el7_7.x86_64` the result will
be:
```
tarantool> box.execute([[SELECT CHAR_LENGTH(CAST(x'f0808080' AS STRING));]])
---
- metadata:
  - name: COLUMN_1
    type: integer
  rows:
  - [1]
...
```

On `ubuntu 20.04` with `libicu-dev` version `66.1-2ubuntu2` the result
will be:
```
tarantool> box.execute([[SELECT CHAR_LENGTH(CAST(x'f0808080' AS STRING));]])
---
- metadata:
  - name: COLUMN_1
    type: integer
  rows:
  - [4]
...
```
---
 src/box/sql/func.c            | 26 +++++++++++++++++++---
 test/sql-tap/badutf1.test.lua | 42 +----------------------------------
 test/sql-tap/func3.test.lua   | 10 ++++-----
 3 files changed, 28 insertions(+), 50 deletions(-)

diff --git a/src/box/sql/func.c b/src/box/sql/func.c
index dbeb38bee..bc7a1fedd 100644
--- a/src/box/sql/func.c
+++ b/src/box/sql/func.c
@@ -258,6 +258,26 @@ func_abs_double(struct sql_context *ctx, int argc, struct Mem *argv)
 	mem_set_double(ctx->pOut, arg->u.r < 0 ? -arg->u.r : arg->u.r);
 }
 
+/** Implementation of the CHAR_LENGTH() function. */
+static void
+func_char_length(struct sql_context *ctx, int argc, struct Mem *argv)
+{
+	assert(argc == 1);
+	(void)argc;
+	struct Mem *arg = &argv[0];
+	if (mem_is_null(arg))
+		return;
+	assert(mem_is_str(arg) && arg->n >= 0);
+	uint32_t len = 0;
+	int offset = 0;
+	while (offset < arg->n) {
+		UChar32 c;
+		U8_NEXT((uint8_t *)arg->z, offset, arg->n, c);
+		++len;
+	}
+	mem_set_uint(ctx->pOut, len);
+}
+
 static const unsigned char *
 mem_as_ustr(struct Mem *mem)
 {
@@ -1912,8 +1932,8 @@ static struct sql_func_definition definitions[] = {
 	{"AVG", 1, {FIELD_TYPE_INTEGER}, FIELD_TYPE_INTEGER, step_avg, fin_avg},
 	{"AVG", 1, {FIELD_TYPE_DOUBLE}, FIELD_TYPE_DOUBLE, step_avg, fin_avg},
 	{"CHAR", -1, {FIELD_TYPE_INTEGER}, FIELD_TYPE_STRING, charFunc, NULL},
-	{"CHAR_LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, lengthFunc,
-	 NULL},
+	{"CHAR_LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER,
+	 func_char_length, NULL},
 	{"COALESCE", -1, {FIELD_TYPE_ANY}, FIELD_TYPE_SCALAR, sql_builtin_stub,
 	 NULL},
 	{"COUNT", 0, {}, FIELD_TYPE_INTEGER, step_count, fin_count},
@@ -1957,7 +1977,7 @@ static struct sql_func_definition definitions[] = {
 	{"LEAST", -1, {FIELD_TYPE_STRING}, FIELD_TYPE_STRING, minmaxFunc, NULL},
 	{"LEAST", -1, {FIELD_TYPE_SCALAR}, FIELD_TYPE_SCALAR, minmaxFunc, NULL},
 
-	{"LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, lengthFunc,
+	{"LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, func_char_length,
 	 NULL},
 	{"LENGTH", 1, {FIELD_TYPE_VARBINARY}, FIELD_TYPE_INTEGER, lengthFunc,
 	 NULL},
diff --git a/test/sql-tap/badutf1.test.lua b/test/sql-tap/badutf1.test.lua
index b25436186..ce8354840 100755
--- a/test/sql-tap/badutf1.test.lua
+++ b/test/sql-tap/badutf1.test.lua
@@ -1,6 +1,6 @@
 #!/usr/bin/env tarantool
 local test = require("sqltester")
-test:plan(23)
+test:plan(19)
 
 --!./tcltestrunner.lua
 -- 2007 May 15
@@ -253,26 +253,6 @@ test:do_test(
 
 test:do_test(
     "badutf-3.5",
-    function()
-        return test:execsql2("SELECT length('\x61\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
-    end, {
-        -- <badutf-3.5>
-        "X", 12
-        -- </badutf-3.5>
-    })
-
-test:do_test(
-    "badutf-3.6",
-    function()
-        return test:execsql2("SELECT length('\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
-    end, {
-        -- <badutf-3.6>
-        "X", 11
-        -- </badutf-3.6>
-    })
-
-test:do_test(
-    "badutf-3.7",
     function()
         return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
     end, {
@@ -281,26 +261,6 @@ test:do_test(
         -- </badutf-3.7>
     })
 
-test:do_test(
-    "badutf-3.8",
-    function()
-        return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\xf0\x80\x80\x80\x80') AS x")
-    end, {
-        -- <badutf-3.8>
-        "X", 7
-        -- </badutf-3.8>
-    })
-
-test:do_test(
-    "badutf-3.9",
-    function()
-        return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\xf0\x80\x80\x80\xff') AS x")
-    end, {
-        -- <badutf-3.9>
-        "X", 7
-        -- </badutf-3.9>
-    })
-
 test:do_test(
     "badutf-4.1",
     function()
diff --git a/test/sql-tap/func3.test.lua b/test/sql-tap/func3.test.lua
index 8d6268bb7..32c807103 100755
--- a/test/sql-tap/func3.test.lua
+++ b/test/sql-tap/func3.test.lua
@@ -1,6 +1,6 @@
 #!/usr/bin/env tarantool
 local test = require("sqltester")
-test:plan(35)
+test:plan(33)
 
 --!./tcltestrunner.lua
 -- 2010 August 27
@@ -294,11 +294,9 @@ suits[2] = {str = '\x80', len = 1}
 suits[3] = {str = '\x61\x62\x63', len = 3}
 suits[4] = {str = '\x7f\x80\x81', len = 3}
 suits[5] = {str = '\x61\xc0', len = 2}
-suits[6] = {str = '\x61\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 12}
-suits[7] = {str = '\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 11}
-suits[8] = {str = '\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 10}
-suits[9] = {str = '\x80\x80\x80\x80\x80\xf0\x80\x80\x80\x80', len = 7}
-suits[10] = {str = '\x80\x80\x80\x80\x80\xf0\x80\x80\x80\xff', len = 7}
+suits[6] = {str = '\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 10}
+suits[7] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\x80', len = 7}
+suits[8] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\xff', len = 7}
 
 for k,v in pairs(suits) do
     test:do_execsql_test(
-- 
2.25.1



More information about the Tarantool-patches mailing list