From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from [87.239.111.99] (localhost [127.0.0.1]) by dev.tarantool.org (Postfix) with ESMTP id 2AB756F3E5; Thu, 11 Nov 2021 13:46:28 +0300 (MSK) DKIM-Filter: OpenDKIM Filter v2.11.0 dev.tarantool.org 2AB756F3E5 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=tarantool.org; s=dev; t=1636627588; bh=6nkp9vi5y9SMICHcyI/0+w4H0IJ19t16tGdP2ZR7lps=; h=To:Cc:Date:In-Reply-To:References:Subject:List-Id: List-Unsubscribe:List-Archive:List-Post:List-Help:List-Subscribe: From:Reply-To:From; b=Vbeqw9UTOmflQ9g6UjuF+dEeC6M7b+0fKRYY2SYCj01u94cKYZ5Eo2TAvzeJFidPS vtGCzX+rEikMBnPWZEJiFWfPCmmcpmQcfBY9MdyGjo9fhWYz62L+hIwO39Ix2u3sQl Cwj4bdN3XSYkJ5W7aSADAo4XAzNa5+5vqtXscFHQ= Received: from smtpng1.i.mail.ru (smtpng1.i.mail.ru [94.100.181.251]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id 87BA76CE33 for ; Thu, 11 Nov 2021 13:45:28 +0300 (MSK) DKIM-Filter: OpenDKIM Filter v2.11.0 dev.tarantool.org 87BA76CE33 Received: by smtpng1.m.smailru.net with esmtpa (envelope-from ) id 1ml7aB-0002tz-Kd; Thu, 11 Nov 2021 13:45:27 +0300 To: kyukhin@tarantool.org Cc: tarantool-patches@dev.tarantool.org Date: Thu, 11 Nov 2021 13:45:27 +0300 Message-Id: <52ed733133228494d12510e453ecaec3d274ecc7.1636627366.git.imeevma@gmail.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: References: MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-4EC0790: 10 X-7564579A: 646B95376F6C166E X-77F55803: 4F1203BC0FB41BD9731B3922EC0639796A433260860B48E60A001AF1EB3A7EE600894C459B0CD1B9306623E57DA466373120EDCD96F81DEEBD7B9CED87D2556722AF0AD8BA8DF889 X-7FA49CB5: FF5795518A3D127A4AD6D5ED66289B5278DA827A17800CE7C8DD139BC0BB8586EA1F7E6F0F101C67BD4B6F7A4D31EC0BCC500DACC3FED6E28638F802B75D45FF8AA50765F79006377A7A7D315BEE81B48638F802B75D45FF36EB9D2243A4F8B5A6FCA7DBDB1FC311F39EFFDF887939037866D6147AF826D8810A48F9B5902B0211280491371E45D1117882F4460429724CE54428C33FAD305F5C1EE8F4F765FCF1175FABE1C0F9B6A471835C12D1D9774AD6D5ED66289B52BA9C0B312567BB23117882F44604297287769387670735209ECD01F8117BC8BEA471835C12D1D977C4224003CC8364762BB6847A3DEAEFB0F43C7A68FF6260569E8FC8737B5C2249EC8D19AE6D49635B68655334FD4449CB9ECD01F8117BC8BEAAAE862A0553A39223F8577A6DFFEA7C289736CE4F78F08343847C11F186F3C59DAA53EE0834AAEE X-C1DE0DAB: C20DE7B7AB408E4181F030C43753B8186998911F362727C414F749A5E30D975C7E9FEBB9C11794A1EFD1CC5AA24036951762CD700ECBDCB69C2B6934AE262D3EE7EAB7254005DCED7532B743992DF240BDC6A1CF3F042BAD6DF99611D93F60EF309DFB797F6729CB699F904B3F4130E343918A1A30D5E7FCCB5012B2E24CD356 X-C8649E89: 4E36BF7865823D7055A7F0CF078B5EC49A30900B95165D340A4C04F5DECA7EE9B9C659363BEFF2647E2C386DB95D3326FEB684A54D3BBAA7E86C32C34A12B3191D7E09C32AA3244C2119995C930C08DA0C328079A308300FD9ADFF0C0BDB8D1F729B2BEF169E0186 X-D57D3AED: 3ZO7eAau8CL7WIMRKs4sN3D3tLDjz0dLbV79QFUyzQ2Ujvy7cMT6pYYqY16iZVKkSc3dCLJ7zSJH7+u4VD18S7Vl4ZUrpaVfd2+vE6kuoey4m4VkSEu530nj6fImhcD4MUrOEAnl0W826KZ9Q+tr5ycPtXkTV4k65bRjmOUUP8cvGozZ33TWg5HZplvhhXbhDGzqmQDTd6OAevLeAnq3Ra9uf7zvY2zzsIhlcp/Y7m53TZgf2aB4JOg4gkr2bioj4t8MBgWr8bKHuxolXXPrFA== X-Mailru-Sender: 689FA8AB762F7393C37E3C1AEC41BA5D5CDE0F70263C3416ACC82157DBF57A6583D72C36FC87018B9F80AB2734326CD2FB559BB5D741EB96352A0ABBE4FDA4210A04DAD6CC59E33667EA787935ED9F1B X-Mras: Ok Subject: [Tarantool-patches] [PATCH v1 2/8] sql: rework CHAR_LENGTH() function X-BeenThere: tarantool-patches@dev.tarantool.org X-Mailman-Version: 2.1.34 Precedence: list List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , From: Mergen Imeev via Tarantool-patches Reply-To: imeevma@tarantool.org Errors-To: tarantool-patches-bounces@dev.tarantool.org Sender: "Tarantool-patches" The CHAR_LENGTH() and CHARACTER_LENGTH() functions now use ICU functions to determine the length of a string. Part of #4145 @TarantoolBot document Title: Invalid UTF-8 values and ICU Invalid UTF-8 values may be handled differently depending on the ICU version. For example, for this request: ``` SELECT CHAR_LENGTH(CAST(x'f0808080' AS STRING)); ``` On `centos 7` with `libicu-devel-50.2-4.el7_7.x86_64` the result will be: ``` tarantool> box.execute([[SELECT CHAR_LENGTH(CAST(x'f0808080' AS STRING));]]) --- - metadata: - name: COLUMN_1 type: integer rows: - [1] ... ``` On `ubuntu 20.04` with `libicu-dev` version `66.1-2ubuntu2` the result will be: ``` tarantool> box.execute([[SELECT CHAR_LENGTH(CAST(x'f0808080' AS STRING));]]) --- - metadata: - name: COLUMN_1 type: integer rows: - [4] ... ``` --- src/box/sql/func.c | 26 +++++++++++++++++++--- test/sql-tap/badutf1.test.lua | 42 +---------------------------------- test/sql-tap/func3.test.lua | 10 ++++----- 3 files changed, 28 insertions(+), 50 deletions(-) diff --git a/src/box/sql/func.c b/src/box/sql/func.c index dbeb38bee..bc7a1fedd 100644 --- a/src/box/sql/func.c +++ b/src/box/sql/func.c @@ -258,6 +258,26 @@ func_abs_double(struct sql_context *ctx, int argc, struct Mem *argv) mem_set_double(ctx->pOut, arg->u.r < 0 ? -arg->u.r : arg->u.r); } +/** Implementation of the CHAR_LENGTH() function. */ +static void +func_char_length(struct sql_context *ctx, int argc, struct Mem *argv) +{ + assert(argc == 1); + (void)argc; + struct Mem *arg = &argv[0]; + if (mem_is_null(arg)) + return; + assert(mem_is_str(arg) && arg->n >= 0); + uint32_t len = 0; + int offset = 0; + while (offset < arg->n) { + UChar32 c; + U8_NEXT((uint8_t *)arg->z, offset, arg->n, c); + ++len; + } + mem_set_uint(ctx->pOut, len); +} + static const unsigned char * mem_as_ustr(struct Mem *mem) { @@ -1912,8 +1932,8 @@ static struct sql_func_definition definitions[] = { {"AVG", 1, {FIELD_TYPE_INTEGER}, FIELD_TYPE_INTEGER, step_avg, fin_avg}, {"AVG", 1, {FIELD_TYPE_DOUBLE}, FIELD_TYPE_DOUBLE, step_avg, fin_avg}, {"CHAR", -1, {FIELD_TYPE_INTEGER}, FIELD_TYPE_STRING, charFunc, NULL}, - {"CHAR_LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, lengthFunc, - NULL}, + {"CHAR_LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, + func_char_length, NULL}, {"COALESCE", -1, {FIELD_TYPE_ANY}, FIELD_TYPE_SCALAR, sql_builtin_stub, NULL}, {"COUNT", 0, {}, FIELD_TYPE_INTEGER, step_count, fin_count}, @@ -1957,7 +1977,7 @@ static struct sql_func_definition definitions[] = { {"LEAST", -1, {FIELD_TYPE_STRING}, FIELD_TYPE_STRING, minmaxFunc, NULL}, {"LEAST", -1, {FIELD_TYPE_SCALAR}, FIELD_TYPE_SCALAR, minmaxFunc, NULL}, - {"LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, lengthFunc, + {"LENGTH", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, func_char_length, NULL}, {"LENGTH", 1, {FIELD_TYPE_VARBINARY}, FIELD_TYPE_INTEGER, lengthFunc, NULL}, diff --git a/test/sql-tap/badutf1.test.lua b/test/sql-tap/badutf1.test.lua index b25436186..ce8354840 100755 --- a/test/sql-tap/badutf1.test.lua +++ b/test/sql-tap/badutf1.test.lua @@ -1,6 +1,6 @@ #!/usr/bin/env tarantool local test = require("sqltester") -test:plan(23) +test:plan(19) --!./tcltestrunner.lua -- 2007 May 15 @@ -253,26 +253,6 @@ test:do_test( test:do_test( "badutf-3.5", - function() - return test:execsql2("SELECT length('\x61\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x") - end, { - -- - "X", 12 - -- - }) - -test:do_test( - "badutf-3.6", - function() - return test:execsql2("SELECT length('\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x") - end, { - -- - "X", 11 - -- - }) - -test:do_test( - "badutf-3.7", function() return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x") end, { @@ -281,26 +261,6 @@ test:do_test( -- }) -test:do_test( - "badutf-3.8", - function() - return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\xf0\x80\x80\x80\x80') AS x") - end, { - -- - "X", 7 - -- - }) - -test:do_test( - "badutf-3.9", - function() - return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\xf0\x80\x80\x80\xff') AS x") - end, { - -- - "X", 7 - -- - }) - test:do_test( "badutf-4.1", function() diff --git a/test/sql-tap/func3.test.lua b/test/sql-tap/func3.test.lua index 8d6268bb7..32c807103 100755 --- a/test/sql-tap/func3.test.lua +++ b/test/sql-tap/func3.test.lua @@ -1,6 +1,6 @@ #!/usr/bin/env tarantool local test = require("sqltester") -test:plan(35) +test:plan(33) --!./tcltestrunner.lua -- 2010 August 27 @@ -294,11 +294,9 @@ suits[2] = {str = '\x80', len = 1} suits[3] = {str = '\x61\x62\x63', len = 3} suits[4] = {str = '\x7f\x80\x81', len = 3} suits[5] = {str = '\x61\xc0', len = 2} -suits[6] = {str = '\x61\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 12} -suits[7] = {str = '\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 11} -suits[8] = {str = '\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 10} -suits[9] = {str = '\x80\x80\x80\x80\x80\xf0\x80\x80\x80\x80', len = 7} -suits[10] = {str = '\x80\x80\x80\x80\x80\xf0\x80\x80\x80\xff', len = 7} +suits[6] = {str = '\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80', len = 10} +suits[7] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\x80', len = 7} +suits[8] = {str = '\x80\x80\x80\x80\x80\xf0\x90\x80\x80\xff', len = 7} for k,v in pairs(suits) do test:do_execsql_test( -- 2.25.1