Tarantool development patches archive
 help / color / mirror / Atom feed
From: Mergen Imeev via Tarantool-patches <tarantool-patches@dev.tarantool.org>
To: kyukhin@tarantool.org
Cc: tarantool-patches@dev.tarantool.org
Subject: [Tarantool-patches] [PATCH v1 2/8] sql: rework CHAR_LENGTH() function
Date: Thu, 11 Nov 2021 13:45:27 +0300	[thread overview]
Message-ID: <52ed733133228494d12510e453ecaec3d274ecc7.1636627366.git.imeevma@gmail.com> (raw)
In-Reply-To: <cover.1636627365.git.imeevma@gmail.com>

The CHAR_LENGTH() and CHARACTER_LENGTH() functions, as well as LENGTH()
for STRING arguments, now use the ICU U8_NEXT() macro to count the
characters in a string.

Part of #4145

@TarantoolBot document
Title: Invalid UTF-8 values and ICU

Invalid UTF-8 sequences may be handled differently depending on the ICU
version. For example, for this request:
```
SELECT CHAR_LENGTH(CAST(x'f0808080' AS STRING));
```

On `centos 7` with `libicu-devel-50.2-4.el7_7.x86_64` the result will
be:
```
tarantool> box.execute([[SELECT CHAR_LENGTH(CAST(x'f0808080' AS STRING));]])
---
- metadata:
  - name: COLUMN_1
    type: integer
  rows:
  - [1]
...
```

On `ubuntu 20.04` with `libicu-dev` version `66.1-2ubuntu2` the result
will be:
```
tarantool> box.execute([[SELECT CHAR_LENGTH(CAST(x'f0808080' AS STRING));]])
---
- metadata:
  - name: COLUMN_1
    type: integer
  rows:
  - [4]
...
```
---
 src/box/sql/func.c            | 26 +++++++++++++++++++---
 test/sql-tap/badutf1.test.lua | 42 +----------------------------------
 test/sql-tap/func3.test.lua   | 10 ++++-----
 3 files changed, 28 insertions(+), 50 deletions(-)

diff --git a/src/box/sql/func.c b/src/box/sql/func.c
index dbeb38bee..bc7a1fedd 100644
--- a/src/box/sql/func.c
+++ b/src/box/sql/func.c
@@ -258,6 +258,26 @@ func_abs_double(struct sql_context *ctx, int argc, struct Mem *argv)
 	mem_set_double(ctx->pOut, arg->u.r < 0 ? -arg->u.r : arg->u.r);
 }
 
+/** Implementation of the CHAR_LENGTH() function. */
+static void
+func_char_length(struct sql_context *ctx, int argc, struct Mem *argv)
+{
+	assert(argc == 1);
+	(void)argc;
+	struct Mem *arg = &argv[0];
+	if (mem_is_null(arg))
+		return;
+	assert(mem_is_str(arg) && arg->n >= 0);
+	uint32_t len = 0;
+	int offset = 0;
+	while (offset < arg->n) {
+		UChar32 c;
+		U8_NEXT((uint8_t *)arg->z, offset, arg->n, c);
+		++len;
+	}
+	mem_set_uint(ctx->pOut, len);
+}
+
 static const unsigned char *
 mem_as_ustr(struct Mem *mem)
 {
@@ -1912,8 +1932,8 @@ static struct sql_func_definition definitions[] = {
 	{"AVG", 1, {FIELD_TYPE_INTEGER}, FIELD_TYPE_INTEGER, step_avg, fin_avg},
 	{"AVG", 1, {FIELD_TYPE_DOUBLE}, FIELD_TYPE_DOUBLE, step_avg, fin_avg},
 	{"CHAR", -1, {FIELD_TYPE_INTEGER}, FIELD_TYPE_STRING, charFunc, NULL},
-	{"CHAR_LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, lengthFunc,
-	 NULL},
+	{"CHAR_LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER,
+	 func_char_length, NULL},
 	{"COALESCE", -1, {FIELD_TYPE_ANY}, FIELD_TYPE_SCALAR, sql_builtin_stub,
 	 NULL},
 	{"COUNT", 0, {}, FIELD_TYPE_INTEGER, step_count, fin_count},
@@ -1957,7 +1977,7 @@ static struct sql_func_definition definitions[] = {
 	{"LEAST", -1, {FIELD_TYPE_STRING}, FIELD_TYPE_STRING, minmaxFunc, NULL},
 	{"LEAST", -1, {FIELD_TYPE_SCALAR}, FIELD_TYPE_SCALAR, minmaxFunc, NULL},
 
-	{"LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, lengthFunc,
+	{"LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, func_char_length,
 	 NULL},
 	{"LENGTH", 1, {FIELD_TYPE_VARBINARY}, FIELD_TYPE_INTEGER, lengthFunc,
 	 NULL},
diff --git a/test/sql-tap/badutf1.test.lua b/test/sql-tap/badutf1.test.lua
index b25436186..ce8354840 100755
--- a/test/sql-tap/badutf1.test.lua
+++ b/test/sql-tap/badutf1.test.lua
@@ -1,6 +1,6 @@
 #!/usr/bin/env tarantool
 local test = require("sqltester")
-test:plan(23)
+test:plan(19)
 
 --!./tcltestrunner.lua
 -- 2007 May 15
@@ -253,26 +253,6 @@ test:do_test(
 
 test:do_test(
     "badutf-3.5",
-    function()
-        return test:execsql2("SELECT length('\x61\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
-    end, {
-        -- <badutf-3.5>
-        "X", 12
-        -- </badutf-3.5>
-    })
-
-test:do_test(
-    "badutf-3.6",
-    function()
-        return test:execsql2("SELECT length('\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
-    end, {
-        -- <badutf-3.6>
-        "X", 11
-        -- </badutf-3.6>
-    })
-
-test:do_test(
-    "badutf-3.7",
     function()
         return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
     end, {
@@ -281,26 +261,6 @@ test:do_test(
         -- </badutf-3.7>
     })
 
-test:do_test(
-    "badutf-3.8",
-    function()
-        return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\xf0\x80\x80\x80\x80') AS x")
-    end, {
-        -- <badutf-3.8>
-        "X", 7
-        -- </badutf-3.8>
-    })
-
-test:do_test(
-    "badutf-3.9",
-    function()
-        return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\xf0\x80\x80\x80\xff') AS x")
-    end, {
-        -- <badutf-3.9>
-        "X", 7
-        -- </badutf-3.9>
-    })
-
 test:do_test(
     "badutf-4.1",
     function()
diff --git a/test/sql-tap/func3.test.lua b/test/sql-tap/func3.test.lua
index 8d6268bb7..32c807103 100755
--- a/test/sql-tap/func3.test.lua
+++ b/test/sql-tap/func3.test.lua
@@ -1,6 +1,6 @@
 #!/usr/bin/env tarantool
 local test = require("sqltester")
-test:plan(35)
+test:plan(33)
 
 --!./tcltestrunner.lua
 -- 2010 August 27
@@ -294,11 +294,9 @@ suits[2] = {str = '\x80', len = 1}
 suits[3] = {str = '\x61\x62\x63', len = 3}
 suits[4] = {str = '\x7f\x80\x81', len = 3}
 suits[5] = {str = '\x61\xc0', len = 2}
-suits[6] = {str = '\x61\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 12}
-suits[7] = {str = '\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 11}
-suits[8] = {str = '\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 10}
-suits[9] = {str = '\x80\x80\x80\x80\x80\xf0\x80\x80\x80\x80', len = 7}
-suits[10] = {str = '\x80\x80\x80\x80\x80\xf0\x80\x80\x80\xff', len = 7}
+suits[6] = {str = '\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 10}
+suits[7] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\x80', len = 7}
+suits[8] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\xff', len = 7}
 
 for k,v in pairs(suits) do
     test:do_execsql_test(
-- 
2.25.1


  parent reply	other threads:[~2021-11-11 10:46 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-11-11 10:45 [Tarantool-patches] [PATCH v1 0/8] Rework standard function Mergen Imeev via Tarantool-patches
2021-11-11 10:45 ` [Tarantool-patches] [PATCH v1 1/8] sql: refactor ABS() function Mergen Imeev via Tarantool-patches
2021-11-11 10:45 ` Mergen Imeev via Tarantool-patches [this message]
2021-11-11 10:45 ` [Tarantool-patches] [PATCH v1 3/8] sql: refactor UPPER() and LOWER() functions Mergen Imeev via Tarantool-patches
2021-11-11 10:45 ` [Tarantool-patches] [PATCH v1 4/8] sql: refactor NULLIF() function Mergen Imeev via Tarantool-patches
2021-11-11 10:45 ` [Tarantool-patches] [PATCH v1 5/8] sql: rework TRIM() function Mergen Imeev via Tarantool-patches
2021-11-11 10:45 ` [Tarantool-patches] [PATCH v1 6/8] sql: rework POSITION() function Mergen Imeev via Tarantool-patches
2021-11-11 10:45 ` [Tarantool-patches] [PATCH v1 7/8] sql: rework SUBSTR() function Mergen Imeev via Tarantool-patches
2021-11-11 10:45 ` [Tarantool-patches] [PATCH v1 8/8] sql: refactor LIKE() function Mergen Imeev via Tarantool-patches
2021-11-11 10:59 ` [Tarantool-patches] [PATCH v1 0/8] Rework standard function Kirill Yukhin via Tarantool-patches

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=52ed733133228494d12510e453ecaec3d274ecc7.1636627366.git.imeevma@gmail.com \
    --to=tarantool-patches@dev.tarantool.org \
    --cc=imeevma@tarantool.org \
    --cc=kyukhin@tarantool.org \
    --subject='Re: [Tarantool-patches] [PATCH v1 2/8] sql: rework CHAR_LENGTH() function' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.