From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from [87.239.111.99] (localhost [127.0.0.1]) by dev.tarantool.org (Postfix) with ESMTP id 194706F3E5; Thu, 11 Nov 2021 13:47:59 +0300 (MSK) DKIM-Filter: OpenDKIM Filter v2.11.0 dev.tarantool.org 194706F3E5 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=tarantool.org; s=dev; t=1636627679; bh=weZ41Nupil/vtwWpsQUTYwx88EzHr1pkArrOLJJatss=; h=To:Cc:Date:In-Reply-To:References:Subject:List-Id: List-Unsubscribe:List-Archive:List-Post:List-Help:List-Subscribe: From:Reply-To:From; b=L0vlZ55FLotORXB73Srupj22bEGCi79KCzRmUvaZiz6p3poSHT/16ltqnFtnndR7u UeXGCfmZIPGqt/532PgnN19seF/q86st8iz/0bNbwKrkRuwbxuhh2/z10VzkpsK//R AE762640KQGlIrHzQJXVuyDSnp6A5D9xJ/4labmE= Received: from smtpng1.i.mail.ru (smtpng1.i.mail.ru [94.100.181.251]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id AFE206CE3D for ; Thu, 11 Nov 2021 13:45:33 +0300 (MSK) DKIM-Filter: OpenDKIM Filter v2.11.0 dev.tarantool.org AFE206CE3D Received: by smtpng1.m.smailru.net with esmtpa (envelope-from ) id 1ml7aG-00030U-IC; Thu, 11 Nov 2021 13:45:33 +0300 To: kyukhin@tarantool.org Cc: tarantool-patches@dev.tarantool.org Date: Thu, 11 Nov 2021 13:45:32 +0300 Message-Id: <8df5ebe445bdca18afb7d0420a30b3371a293001.1636627366.git.imeevma@gmail.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: References: MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-4EC0790: 10 X-7564579A: 646B95376F6C166E X-77F55803: 4F1203BC0FB41BD9731B3922EC063979F8160E2FE2FDCF123B0537AAB149205200894C459B0CD1B96F1A0213E51F0D6336D3E598A0E5D28DBD7B9CED87D25567E81B399B8E64663B X-7FA49CB5: 
FF5795518A3D127A4AD6D5ED66289B5278DA827A17800CE752E71F0C64B7C834EA1F7E6F0F101C67BD4B6F7A4D31EC0BCC500DACC3FED6E28638F802B75D45FF8AA50765F790063707C4856229E8E7E48638F802B75D45FF36EB9D2243A4F8B5A6FCA7DBDB1FC311F39EFFDF887939037866D6147AF826D8D3F88ED8C52DD505CE961FF3D39C95AD117882F4460429724CE54428C33FAD305F5C1EE8F4F765FCAA867293B0326636D2E47CDBA5A96583BD4B6F7A4D31EC0BC014FD901B82EE079FA2833FD35BB23D27C277FBC8AE2E8BF1175FABE1C0F9B6A471835C12D1D977C4224003CC8364762BB6847A3DEAEFB0F43C7A68FF6260569E8FC8737B5C2249EC8D19AE6D49635B68655334FD4449CB9ECD01F8117BC8BEAAAE862A0553A39223F8577A6DFFEA7C289736CE4F78F08343847C11F186F3C59DAA53EE0834AAEE X-C1DE0DAB: C20DE7B7AB408E4181F030C43753B8186998911F362727C414F749A5E30D975C7E9FEBB9C11794A1CCAB208905FE2E763AC5CCAB4B2097B59C2B6934AE262D3EE7EAB7254005DCED7532B743992DF240BDC6A1CF3F042BAD6DF99611D93F60EF309DFB797F6729CB699F904B3F4130E343918A1A30D5E7FCCB5012B2E24CD356 X-C8649E89: 4E36BF7865823D7055A7F0CF078B5EC49A30900B95165D34181D1E89D5A0B42F5527367F688AD2731078E8E2B2DB70C06825A3E83407B1F8E5C9BC77FDDDFC1B1D7E09C32AA3244C9E8C22E50EF4F67DD80457B920356ED295A9E0DC41E9A4CF729B2BEF169E0186 X-D57D3AED: 3ZO7eAau8CL7WIMRKs4sN3D3tLDjz0dLbV79QFUyzQ2Ujvy7cMT6pYYqY16iZVKkSc3dCLJ7zSJH7+u4VD18S7Vl4ZUrpaVfd2+vE6kuoey4m4VkSEu530nj6fImhcD4MUrOEAnl0W826KZ9Q+tr5ycPtXkTV4k65bRjmOUUP8cvGozZ33TWg5HZplvhhXbhDGzqmQDTd6OAevLeAnq3Ra9uf7zvY2zzsIhlcp/Y7m53TZgf2aB4JOg4gkr2bioj4t8MBgWr8bLFisnmMY6A0Q== X-Mailru-Sender: 689FA8AB762F7393C37E3C1AEC41BA5D875443143C70AC10288BA1BA8CFDE3CC83D72C36FC87018B9F80AB2734326CD2FB559BB5D741EB96352A0ABBE4FDA4210A04DAD6CC59E33667EA787935ED9F1B X-Mras: Ok Subject: [Tarantool-patches] [PATCH v1 5/8] sql: rework TRIM() function X-BeenThere: tarantool-patches@dev.tarantool.org X-Mailman-Version: 2.1.34 Precedence: list List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , From: Mergen Imeev via Tarantool-patches Reply-To: imeevma@tarantool.org Errors-To: 
tarantool-patches-bounces@dev.tarantool.org Sender: "Tarantool-patches" This patch refactors TRIM() and fixes an issue with incorrect trimming of some VARBINARY values. Also, TRIM() now uses ICU functions instead of hand-written ones. Part of #4415 --- src/box/sql/func.c | 361 ++++++++++++++++------------------ test/sql-tap/badutf1.test.lua | 41 ++-- 2 files changed, 198 insertions(+), 204 deletions(-) diff --git a/src/box/sql/func.c b/src/box/sql/func.c index 7d54a39cd..ba6b9246d 100644 --- a/src/box/sql/func.c +++ b/src/box/sql/func.c @@ -343,6 +343,172 @@ func_nullif(struct sql_context *ctx, int argc, struct Mem *argv) ctx->is_aborted = true; } +/** Implementation of the TRIM() function. */ +static inline int +trim_bin_end(const char *str, int end, const char *octets, int octets_size, + int flags) +{ + if ((flags & TRIM_TRAILING) == 0) + return end; + while (end > 0) { + bool is_trimmed = false; + char c = str[end - 1]; + for (int i = 0; i < octets_size && !is_trimmed; ++i) + is_trimmed = c == octets[i]; + if (!is_trimmed) + break; + --end; + } + return end; +} + +static inline int +trim_bin_start(const char *str, int end, const char *octets, int octets_size, + int flags) +{ + if ((flags & TRIM_LEADING) == 0) + return 0; + int start = 0; + while (start < end) { + bool is_trimmed = false; + char c = str[start]; + for (int i = 0; i < octets_size && !is_trimmed; ++i) + is_trimmed = c == octets[i]; + if (!is_trimmed) + break; + ++start; + } + return start; +} + +static void +func_trim_bin(struct sql_context *ctx, int argc, struct Mem *argv) +{ + if (mem_is_null(&argv[0]) || (argc == 3 && mem_is_null(&argv[2]))) + return; + assert(argc == 2 || (argc == 3 && mem_is_bin(&argv[2]))); + assert(mem_is_bin(&argv[0]) && mem_is_uint(&argv[1])); + const char *str = argv[0].z; + int size = argv[0].n; + const char *octets; + int octets_size; + if (argc == 3) { + octets = argv[2].z; + octets_size = argv[2].n; + } else { + octets = "\0"; + octets_size = 1; + } + + int flags = 
argv[1].u.u; + int end = trim_bin_end(str, size, octets, octets_size, flags); + int start = trim_bin_start(str, end, octets, octets_size, flags); + + if (start >= end) + return mem_set_bin_static(ctx->pOut, "", 0); + if (mem_copy_bin(ctx->pOut, &str[start], end - start) != 0) + ctx->is_aborted = true; +} + +static inline int +trim_str_end(const char *str, int end, const char *chars, uint8_t *chars_len, + int chars_count, int flags) +{ + if ((flags & TRIM_TRAILING) == 0) + return end; + while (end > 0) { + bool is_trimmed = false; + const char *c = chars; + int len; + for (int i = 0; i < chars_count && !is_trimmed; ++i) { + len = chars_len[i]; + const char *s = str + end - len; + is_trimmed = len <= end && memcmp(c, s, len) == 0; + c += len; + } + if (!is_trimmed) + break; + assert(len > 0); + end -= len; + } + return end; +} + +static inline int +trim_str_start(const char *str, int end, const char *chars, uint8_t *chars_len, + int chars_count, int flags) +{ + if ((flags & TRIM_LEADING) == 0) + return 0; + int start = 0; + while (start < end) { + bool is_trimmed = false; + const char *c = chars; + int len; + for (int i = 0; i < chars_count && !is_trimmed; ++i) { + len = chars_len[i]; + const char *s = str + start; + is_trimmed = start + len <= end && + memcmp(c, s, len) == 0; + c += len; + } + if (!is_trimmed) + break; + assert(len > 0); + start += len; + } + return start; +} + +static void +func_trim_str(struct sql_context *ctx, int argc, struct Mem *argv) +{ + if (mem_is_null(&argv[0]) || (argc == 3 && mem_is_null(&argv[2]))) + return; + assert(argc == 2 || (argc == 3 && mem_is_str(&argv[2]))); + assert(mem_is_str(&argv[0]) && mem_is_uint(&argv[1])); + const char *str = argv[0].z; + int size = argv[0].n; + const char *chars; + int chars_size; + if (argc == 3) { + chars = argv[2].z; + chars_size = argv[2].n; + } else { + chars = " "; + chars_size = 1; + } + + struct region *region = &fiber()->gc; + size_t svp = region_used(region); + uint8_t *chars_len = 
region_alloc(region, chars_size); + if (chars_len == NULL) { + ctx->is_aborted = true; + diag_set(OutOfMemory, chars_size, "region_alloc", "chars_len"); + return; + } + int chars_count = 0; + + int offset = 0; + while (offset < chars_size) { + UChar32 c; + int prev = offset; + U8_NEXT((uint8_t *)chars, offset, chars_size, c); + chars_len[chars_count++] = offset - prev; + } + + uint64_t flags = argv[1].u.u; + int end = trim_str_end(str, size, chars, chars_len, chars_count, flags); + int start = trim_str_start(str, end, chars, chars_len, chars_count, + flags); + region_truncate(region, svp); + + if (start >= end) + return mem_set_str0_static(ctx->pOut, ""); + if (mem_copy_str(ctx->pOut, &str[start], end - start) != 0) + ctx->is_aborted = true; +} + static const unsigned char * mem_as_ustr(struct Mem *mem) { @@ -1527,193 +1693,6 @@ replaceFunc(struct sql_context *context, int argc, struct Mem *argv) mem_set_bin_dynamic(context->pOut, (char *)zOut, j); } -/** - * Remove characters included in @a trim_set from @a input_str - * until encounter a character that doesn't belong to @a trim_set. - * Remove from the side specified by @a flags. - * @param context SQL context. - * @param flags Trim specification: left, right or both. - * @param trim_set The set of characters for trimming. - * @param char_len Lengths of each UTF-8 character in @a trim_set. - * @param char_cnt A number of UTF-8 characters in @a trim_set. - * @param input_str Input string for trimming. - * @param input_str_sz Input string size in bytes. 
- */ -static void -trim_procedure(struct sql_context *context, enum trim_side_mask flags, - const unsigned char *trim_set, const uint8_t *char_len, - int char_cnt, const unsigned char *input_str, int input_str_sz) -{ - if (char_cnt == 0) - goto finish; - int i, len; - const unsigned char *z; - if ((flags & TRIM_LEADING) != 0) { - while (input_str_sz > 0) { - z = trim_set; - for (i = 0; i < char_cnt; ++i, z += len) { - len = char_len[i]; - if (len <= input_str_sz - && memcmp(input_str, z, len) == 0) - break; - } - if (i >= char_cnt) - break; - input_str += len; - input_str_sz -= len; - } - } - if ((flags & TRIM_TRAILING) != 0) { - while (input_str_sz > 0) { - z = trim_set; - for (i = 0; i < char_cnt; ++i, z += len) { - len = char_len[i]; - if (len <= input_str_sz - && memcmp(&input_str[input_str_sz - len], - z, len) == 0) - break; - } - if (i >= char_cnt) - break; - input_str_sz -= len; - } - } -finish: - if (context->func->def->returns == FIELD_TYPE_STRING) - mem_copy_str(context->pOut, (char *)input_str, input_str_sz); - else - mem_copy_bin(context->pOut, (char *)input_str, input_str_sz); -} - -/** - * Prepare arguments for trimming procedure. Allocate memory for - * @a char_len (array of lengths each character in @a trim_set) - * and fill it. - * - * @param context SQL context. - * @param trim_set The set of characters for trimming. - * @param[out] char_len Lengths of each character in @ trim_set. - * @retval >=0 A number of UTF-8 characters in @a trim_set. - * @retval -1 Memory allocation error. - */ -static int -trim_prepare_char_len(struct sql_context *context, - const unsigned char *trim_set, int trim_set_sz, - uint8_t **char_len) -{ - /* - * Count the number of UTF-8 characters passing through - * the entire char set, but not up to the '\0' or X'00' - * character. This allows to handle trimming set - * containing such characters. 
- */ - int char_cnt = sql_utf8_char_count(trim_set, trim_set_sz); - if (char_cnt == 0) { - *char_len = NULL; - return 0; - } - - if ((*char_len = (uint8_t *)contextMalloc(context, char_cnt)) == NULL) - return -1; - - int i = 0, j = 0; - while(j < char_cnt) { - int old_i = i; - SQL_UTF8_FWD_1(trim_set, i, trim_set_sz); - (*char_len)[j++] = i - old_i; - } - - return char_cnt; -} - -/** - * Normalize args from @a argv input array when it has two args. - * - * Case: TRIM() - * Call trimming procedure with TRIM_BOTH as the flags and " " as - * the trimming set. - * - * Case: TRIM(LEADING/TRAILING/BOTH FROM ) - * If user has specified side keyword only, then call trimming - * procedure with the specified side and " " as the trimming set. - */ -static void -trim_func_two_args(struct sql_context *context, sql_value *arg1, - sql_value *arg2) -{ - const unsigned char *trim_set; - if (mem_is_bin(arg1)) - trim_set = (const unsigned char *)"\0"; - else - trim_set = (const unsigned char *)" "; - const unsigned char *input_str; - if ((input_str = mem_as_ustr(arg1)) == NULL) - return; - - int input_str_sz = mem_len_unsafe(arg1); - assert(arg2->type == MEM_TYPE_UINT); - uint8_t len_one = 1; - trim_procedure(context, arg2->u.u, trim_set, - &len_one, 1, input_str, input_str_sz); -} - -/** - * Normalize args from @a argv input array when it has three args. - * - * Case: TRIM( FROM ) - * If user has specified only, call trimming procedure with - * TRIM_BOTH as the flags and that trimming set. - * - * Case: TRIM(LEADING/TRAILING/BOTH FROM ) - * If user has specified side keyword and , then - * call trimming procedure with that args. 
- */ -static void -trim_func_three_args(struct sql_context *context, sql_value *arg1, - sql_value *arg2, sql_value *arg3) -{ - assert(arg2->type == MEM_TYPE_UINT); - const unsigned char *input_str, *trim_set; - if ((input_str = mem_as_ustr(arg1)) == NULL || - (trim_set = mem_as_ustr(arg3)) == NULL) - return; - - int trim_set_sz = mem_len_unsafe(arg3); - int input_str_sz = mem_len_unsafe(arg1); - uint8_t *char_len; - int char_cnt = trim_prepare_char_len(context, trim_set, trim_set_sz, - &char_len); - if (char_cnt == -1) - return; - trim_procedure(context, arg2->u.u, trim_set, char_len, - char_cnt, input_str, input_str_sz); - sql_free(char_len); -} - -/** - * Normalize args from @a argv input array when it has one, - * two or three args. - * - * This is a dispatcher function that calls corresponding - * implementation depending on the number of arguments. -*/ -static void -trim_func(struct sql_context *context, int argc, struct Mem *argv) -{ - switch (argc) { - case 2: - trim_func_two_args(context, &argv[0], &argv[1]); - break; - case 3: - trim_func_three_args(context, &argv[0], &argv[1], &argv[2]); - break; - default: - diag_set(ClientError, ER_FUNC_WRONG_ARG_COUNT, "TRIM", - "2 or 3", argc); - context->is_aborted = true; - } -} - /* * Compute the soundex encoding of a word. 
* @@ -2040,14 +2019,14 @@ static struct sql_func_definition definitions[] = { fin_total}, {"TRIM", 2, {FIELD_TYPE_STRING, FIELD_TYPE_INTEGER}, - FIELD_TYPE_STRING, trim_func, NULL}, + FIELD_TYPE_STRING, func_trim_str, NULL}, {"TRIM", 3, {FIELD_TYPE_STRING, FIELD_TYPE_INTEGER, FIELD_TYPE_STRING}, - FIELD_TYPE_STRING, trim_func, NULL}, + FIELD_TYPE_STRING, func_trim_str, NULL}, {"TRIM", 2, {FIELD_TYPE_VARBINARY, FIELD_TYPE_INTEGER}, - FIELD_TYPE_VARBINARY, trim_func, NULL}, + FIELD_TYPE_VARBINARY, func_trim_bin, NULL}, {"TRIM", 3, {FIELD_TYPE_VARBINARY, FIELD_TYPE_INTEGER, FIELD_TYPE_VARBINARY}, - FIELD_TYPE_VARBINARY, trim_func, NULL}, + FIELD_TYPE_VARBINARY, func_trim_bin, NULL}, {"TYPEOF", 1, {FIELD_TYPE_ANY}, FIELD_TYPE_STRING, typeofFunc, NULL}, {"UNICODE", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, unicodeFunc, diff --git a/test/sql-tap/badutf1.test.lua b/test/sql-tap/badutf1.test.lua index ce8354840..d1e17ca3e 100755 --- a/test/sql-tap/badutf1.test.lua +++ b/test/sql-tap/badutf1.test.lua @@ -1,6 +1,6 @@ #!/usr/bin/env tarantool local test = require("sqltester") -test:plan(19) +test:plan(20) --!./tcltestrunner.lua -- 2007 May 15 @@ -296,47 +296,62 @@ test:do_test( test:do_test( "badutf-4.4", function() - return test:execsql2([[SELECT hex(CAST(TRIM(x'ff80' FROM ]].. - [[x'808080f0808080ff') AS VARBINARY)) AS x]]) + return test:execsql2([[ + SELECT hex(TRIM(x'ff80' FROM x'808080f0808080ff')) AS x; + ]]) end, { -- - "X", "808080F0808080FF" + "X", "F0" -- }) test:do_test( "badutf-4.5", function() - return test:execsql2([[SELECT hex(CAST(TRIM(x'ff80' FROM ]].. - [[x'ff8080f0808080ff') AS VARBINARY)) AS x]]) + return test:execsql2([[ + SELECT hex(TRIM(x'ff80' FROM x'ff8080f0808080ff')) AS x; + ]]) end, { -- - "X", "80F0808080FF" + "X", "F0" -- }) test:do_test( "badutf-4.6", function() - return test:execsql2([[SELECT hex(CAST(TRIM(x'ff80' FROM ]].. 
- [[x'ff80f0808080ff') AS VARBINARY)) AS x]]) + return test:execsql2([[ + SELECT hex(TRIM(x'ff80' FROM x'ff80f0808080ff')) AS x; + ]]) end, { -- - "X", "F0808080FF" + "X", "F0" -- }) test:do_test( "badutf-4.7", function() - return test:execsql2([[SELECT hex(CAST(TRIM(x'ff8080' FROM ]].. - [[x'ff80f0808080ff') AS VARBINARY)) AS x]]) + return test:execsql2([[ + SELECT hex(TRIM(x'ff8080' FROM x'ff80f0808080ff')) AS x; + ]]) end, { -- - "X", "FF80F0808080FF" + "X", "F0" -- }) +-- gh-4145: Make sure that TRIM() works properly with VARBINARY. +test:do_execsql_test( + "badutf-5", + [[ + SELECT HEX(TRIM(x'ff1234' from x'1234125678123412')); + ]], + { + '5678' + } +) + --db2("close") -- 2.25.1