[Tarantool-patches] [PATCH v1 2/8] sql: refactor CHAR_LENGTH() function
Mergen Imeev
imeevma at tarantool.org
Mon Nov 1 13:20:30 MSK 2021
Thank you for the review! My answer, diff and new patch are below. I also
dropped some tests, including the ones I added in the previous version
of the patch. I did this because the behavior of the ICU functions
caused some of the CI targets to crash. I added a doc-bot request for
this issue in the commit message.
On Fri, Oct 29, 2021 at 12:11:37AM +0200, Vladislav Shpilevoy wrote:
> Thanks for the fixes!
>
> >>> +/** Implementation of the CHAR_LENGTH() function. */
> >>> +static inline uint8_t
> >>> +utf8_len_char(char c)
> >>> +{
> >>> + uint8_t u = (uint8_t)c;
> >>> + return 1 + (u >= 0xc2) + (u >= 0xe0) + (u >= 0xf0);
> >>
> >> It is not that simple really. Consider either using the old
> >> lengthFunc() and other sqlite utf8 helpers or use the approach
> >> similar to utf8_len() in utf8.c. It uses ICU macro U8_NEXT()
> >> and has handling for special symbols like U_SENTINEL.
> >>
> >> Otherwise you are making already third version of functions to
> >> work with utf8.
> >>
> >> I would even prefer to refactor lengthFunc() to stop using sqlite
> >> legacy and drop sqlite utf8 entirely, but I suspect it might be
> >> not so trivial to do and should be done later.
> > I was able to use ucnv_getNextUChar() here. In fact, I was able to use this
> > functions in all the places in this patch-set where we had to work with my or
> > SQLite functions that work with UTF8 characters. I think I can remove sql/utf.c
> > in the next patchset, since I refactor the LENGTH() and UNICODE() functions
> > there.
>
> Discussed in private that U8_NEXT() would work here just fine.
> ucnv_getNextUChar() is an overkill. In other places of the patchset too.
>
Thank you for the suggestion! I replaced ucnv_getNextUChar() with U8_NEXT().
Diff:
diff --git a/src/box/sql/func.c b/src/box/sql/func.c
index faef0eef3..bc7a1fedd 100644
--- a/src/box/sql/func.c
+++ b/src/box/sql/func.c
@@ -269,11 +269,10 @@ func_char_length(struct sql_context *ctx, int argc, struct Mem *argv)
return;
assert(mem_is_str(arg) && arg->n >= 0);
uint32_t len = 0;
- UErrorCode err = U_ZERO_ERROR;
- const char *pos = arg->z;
- const char *end = arg->z + arg->n;
- while (pos < end) {
- ucnv_getNextUChar(icu_utf8_conv, &pos, end, &err);
+ int offset = 0;
+ while (offset < arg->n) {
+ UChar32 c;
+ U8_NEXT((uint8_t *)arg->z, offset, arg->n, c);
++len;
}
mem_set_uint(ctx->pOut, len);
diff --git a/test/sql-tap/badutf1.test.lua b/test/sql-tap/badutf1.test.lua
index 27f17168b..ce8354840 100755
--- a/test/sql-tap/badutf1.test.lua
+++ b/test/sql-tap/badutf1.test.lua
@@ -1,6 +1,6 @@
#!/usr/bin/env tarantool
local test = require("sqltester")
-test:plan(23)
+test:plan(19)
--!./tcltestrunner.lua
-- 2007 May 15
@@ -253,26 +253,6 @@ test:do_test(
test:do_test(
"badutf-3.5",
- function()
- return test:execsql2("SELECT length('\x61\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
- end, {
- -- <badutf-3.5>
- "X", 12
- -- </badutf-3.5>
- })
-
-test:do_test(
- "badutf-3.6",
- function()
- return test:execsql2("SELECT length('\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
- end, {
- -- <badutf-3.6>
- "X", 11
- -- </badutf-3.6>
- })
-
-test:do_test(
- "badutf-3.7",
function()
return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
end, {
@@ -281,26 +261,6 @@ test:do_test(
-- </badutf-3.7>
})
-test:do_test(
- "badutf-3.8",
- function()
- return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\xf0\x80\x80\x80\x80') AS x")
- end, {
- -- <badutf-3.8>
- "X", 10
- -- </badutf-3.8>
- })
-
-test:do_test(
- "badutf-3.9",
- function()
- return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\xf0\x80\x80\x80\xff') AS x")
- end, {
- -- <badutf-3.9>
- "X", 10
- -- </badutf-3.9>
- })
-
test:do_test(
"badutf-4.1",
function()
diff --git a/test/sql-tap/built-in-functions.test.lua b/test/sql-tap/built-in-functions.test.lua
index 7fe987abc..6fae811dc 100755
--- a/test/sql-tap/built-in-functions.test.lua
+++ b/test/sql-tap/built-in-functions.test.lua
@@ -1,6 +1,6 @@
#!/usr/bin/env tarantool
local test = require("sqltester")
-test:plan(54)
+test:plan(52)
--
-- Make sure that number of arguments check is checked properly for SQL built-in
@@ -545,28 +545,4 @@ test:do_test(
{name = "COLUMN_2", type = "scalar"},
})
--- gh-4145: Make sure the character is now checked when calculating its length.
-
--- Character with UTF-8 code F0808080 does not exist.
-test:do_execsql_test(
- "builtins-4.1",
- [[
- SELECT CHAR_LENGTH(CAST(x'f0808080' AS STRING));
- ]],
- {
- 4
- }
-)
-
--- Character with UTF-8 code F0908080 is '𐀀'.
-test:do_execsql_test(
- "builtins-4.2",
- [[
- SELECT CHAR_LENGTH(CAST(x'f0908080' AS STRING));
- ]],
- {
- 1
- }
-)
-
test:finish_test()
diff --git a/test/sql-tap/func3.test.lua b/test/sql-tap/func3.test.lua
index 6999fea67..7ed0bb27f 100755
--- a/test/sql-tap/func3.test.lua
+++ b/test/sql-tap/func3.test.lua
@@ -1,6 +1,6 @@
#!/usr/bin/env tarantool
local test = require("sqltester")
-test:plan(35)
+test:plan(33)
--!./tcltestrunner.lua
-- 2010 August 27
@@ -294,11 +294,9 @@ suits[2] = {str = '\x80', len = 1}
suits[3] = {str = '\x61\x62\x63', len = 3}
suits[4] = {str = '\x7f\x80\x81', len = 3}
suits[5] = {str = '\x61\xc0', len = 2}
-suits[6] = {str = '\x61\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 12}
-suits[7] = {str = '\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 11}
-suits[8] = {str = '\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 10}
-suits[9] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\x80', len = 7}
-suits[10] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\xff', len = 7}
+suits[6] = {str = '\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 10}
+suits[7] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\x80', len = 7}
+suits[8] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\xff', len = 7}
for k,v in pairs(suits) do
test:do_execsql_test(
New patch:
commit 918fd18760f4491b81266b279dc5c4b581dc1ed6
Author: Mergen Imeev <imeevma at gmail.com>
Date: Fri Oct 1 11:12:39 2021 +0300
sql: rework CHAR_LENGTH() function
The CHAR_LENGTH() and CHARACTER_LENGTH() functions now use ICU functions
to determine the length of a string.
Part of #4145
@TarantoolBot document
Title: Invalid UTF-8 values and ICU
Invalid UTF-8 values may be handled differently depending on the ICU
version. For example, for this request:
```
SELECT CHAR_LENGTH(CAST(x'f0808080' AS STRING));
```
On `centos 7` with `libicu-devel-50.2-4.el7_7.x86_64` the result will
be:
```
tarantool> box.execute([[SELECT CHAR_LENGTH(CAST(x'f0808080' AS STRING));]])
---
- metadata:
- name: COLUMN_1
type: integer
rows:
- [1]
...
```
On `ubuntu 20.04` with `libicu-dev` version `66.1-2ubuntu2` the result
will be:
```
tarantool> box.execute([[SELECT CHAR_LENGTH(CAST(x'f0808080' AS STRING));]])
---
- metadata:
- name: COLUMN_1
type: integer
rows:
- [4]
...
```
diff --git a/src/box/sql/func.c b/src/box/sql/func.c
index dbeb38bee..bc7a1fedd 100644
--- a/src/box/sql/func.c
+++ b/src/box/sql/func.c
@@ -258,6 +258,26 @@ func_abs_double(struct sql_context *ctx, int argc, struct Mem *argv)
mem_set_double(ctx->pOut, arg->u.r < 0 ? -arg->u.r : arg->u.r);
}
+/** Implementation of the CHAR_LENGTH() function. */
+static void
+func_char_length(struct sql_context *ctx, int argc, struct Mem *argv)
+{
+ assert(argc == 1);
+ (void)argc;
+ struct Mem *arg = &argv[0];
+ if (mem_is_null(arg))
+ return;
+ assert(mem_is_str(arg) && arg->n >= 0);
+ uint32_t len = 0;
+ int offset = 0;
+ while (offset < arg->n) {
+ UChar32 c;
+ U8_NEXT((uint8_t *)arg->z, offset, arg->n, c);
+ ++len;
+ }
+ mem_set_uint(ctx->pOut, len);
+}
+
static const unsigned char *
mem_as_ustr(struct Mem *mem)
{
@@ -1912,8 +1932,8 @@ static struct sql_func_definition definitions[] = {
{"AVG", 1, {FIELD_TYPE_INTEGER}, FIELD_TYPE_INTEGER, step_avg, fin_avg},
{"AVG", 1, {FIELD_TYPE_DOUBLE}, FIELD_TYPE_DOUBLE, step_avg, fin_avg},
{"CHAR", -1, {FIELD_TYPE_INTEGER}, FIELD_TYPE_STRING, charFunc, NULL},
- {"CHAR_LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, lengthFunc,
- NULL},
+ {"CHAR_LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER,
+ func_char_length, NULL},
{"COALESCE", -1, {FIELD_TYPE_ANY}, FIELD_TYPE_SCALAR, sql_builtin_stub,
NULL},
{"COUNT", 0, {}, FIELD_TYPE_INTEGER, step_count, fin_count},
@@ -1957,7 +1977,7 @@ static struct sql_func_definition definitions[] = {
{"LEAST", -1, {FIELD_TYPE_STRING}, FIELD_TYPE_STRING, minmaxFunc, NULL},
{"LEAST", -1, {FIELD_TYPE_SCALAR}, FIELD_TYPE_SCALAR, minmaxFunc, NULL},
- {"LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, lengthFunc,
+ {"LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, func_char_length,
NULL},
{"LENGTH", 1, {FIELD_TYPE_VARBINARY}, FIELD_TYPE_INTEGER, lengthFunc,
NULL},
diff --git a/test/sql-tap/badutf1.test.lua b/test/sql-tap/badutf1.test.lua
index b25436186..ce8354840 100755
--- a/test/sql-tap/badutf1.test.lua
+++ b/test/sql-tap/badutf1.test.lua
@@ -1,6 +1,6 @@
#!/usr/bin/env tarantool
local test = require("sqltester")
-test:plan(23)
+test:plan(19)
--!./tcltestrunner.lua
-- 2007 May 15
@@ -253,26 +253,6 @@ test:do_test(
test:do_test(
"badutf-3.5",
- function()
- return test:execsql2("SELECT length('\x61\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
- end, {
- -- <badutf-3.5>
- "X", 12
- -- </badutf-3.5>
- })
-
-test:do_test(
- "badutf-3.6",
- function()
- return test:execsql2("SELECT length('\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
- end, {
- -- <badutf-3.6>
- "X", 11
- -- </badutf-3.6>
- })
-
-test:do_test(
- "badutf-3.7",
function()
return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
end, {
@@ -281,26 +261,6 @@ test:do_test(
-- </badutf-3.7>
})
-test:do_test(
- "badutf-3.8",
- function()
- return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\xf0\x80\x80\x80\x80') AS x")
- end, {
- -- <badutf-3.8>
- "X", 7
- -- </badutf-3.8>
- })
-
-test:do_test(
- "badutf-3.9",
- function()
- return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\xf0\x80\x80\x80\xff') AS x")
- end, {
- -- <badutf-3.9>
- "X", 7
- -- </badutf-3.9>
- })
-
test:do_test(
"badutf-4.1",
function()
diff --git a/test/sql-tap/func3.test.lua b/test/sql-tap/func3.test.lua
index 7f1d8d33c..7ed0bb27f 100755
--- a/test/sql-tap/func3.test.lua
+++ b/test/sql-tap/func3.test.lua
@@ -1,6 +1,6 @@
#!/usr/bin/env tarantool
local test = require("sqltester")
-test:plan(35)
+test:plan(33)
--!./tcltestrunner.lua
-- 2010 August 27
@@ -294,11 +294,9 @@ suits[2] = {str = '\x80', len = 1}
suits[3] = {str = '\x61\x62\x63', len = 3}
suits[4] = {str = '\x7f\x80\x81', len = 3}
suits[5] = {str = '\x61\xc0', len = 2}
-suits[6] = {str = '\x61\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 12}
-suits[7] = {str = '\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 11}
-suits[8] = {str = '\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 10}
-suits[9] = {str = '\x80\x80\x80\x80\x80\xf0\x80\x80\x80\x80', len = 7}
-suits[10] = {str = '\x80\x80\x80\x80\x80\xf0\x80\x80\x80\xff', len = 7}
+suits[6] = {str = '\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 10}
+suits[7] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\x80', len = 7}
+suits[8] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\xff', len = 7}
for k,v in pairs(suits) do
test:do_execsql_test(
More information about the Tarantool-patches
mailing list