From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from [87.239.111.99] (localhost [127.0.0.1]) by dev.tarantool.org (Postfix) with ESMTP id 500CC6DB05; Fri, 1 Oct 2021 19:31:56 +0300 (MSK) DKIM-Filter: OpenDKIM Filter v2.11.0 dev.tarantool.org 500CC6DB05 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=tarantool.org; s=dev; t=1633105916; bh=mIiKoEaKjN8C6PipKTuPDbFnWHK90SkMuyGVh7+ZU3g=; h=To:Cc:Date:In-Reply-To:References:Subject:List-Id: List-Unsubscribe:List-Archive:List-Post:List-Help:List-Subscribe: From:Reply-To:From; b=PHmgIpkV6TwhdT6KOMd6o5LcUjREeVf9gMk9eQSgyxuuYN88Eql1RyhWzBZNBniQi Svp9XEAaVcztduFjdJGE+yg3zXJjcWX521Sk45k/oZ+YpNkYJLJjfrTW42WQ6L8d0q G1X5CIFBWhWHH41Iff5/ox12boHdOc717pGw0E0s= Received: from smtpng1.i.mail.ru (smtpng1.i.mail.ru [94.100.181.251]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id 7DBA46DB07 for ; Fri, 1 Oct 2021 19:29:33 +0300 (MSK) DKIM-Filter: OpenDKIM Filter v2.11.0 dev.tarantool.org 7DBA46DB07 Received: by smtpng1.m.smailru.net with esmtpa (envelope-from ) id 1mWLPg-0004Jw-Qe; Fri, 01 Oct 2021 19:29:33 +0300 To: v.shpilevoy@tarantool.org Cc: tarantool-patches@dev.tarantool.org Date: Fri, 1 Oct 2021 19:29:32 +0300 Message-Id: <9424a550a61d32fd8ec2e578ea9cc00b45fbe4ed.1633105483.git.imeevma@gmail.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: References: MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-4EC0790: 10 X-7564579A: B8F34718100C35BD X-77F55803: 4F1203BC0FB41BD96A58C36AA2E996499DED20B83AFF2DE90C450EFC080BE8B7182A05F53808504037775CA61D78B0BFE3124BC6535BD2D7B5E743AFD73B4F9A82A87BA86DC72BE1 X-7FA49CB5: 
FF5795518A3D127A4AD6D5ED66289B5278DA827A17800CE78981306C6E927004EA1F7E6F0F101C67BD4B6F7A4D31EC0BCC500DACC3FED6E28638F802B75D45FF8AA50765F7900637CD1DFD3ABA64F6568638F802B75D45FF36EB9D2243A4F8B5A6FCA7DBDB1FC311F39EFFDF887939037866D6147AF826D8B3EE2F30A642B69B86A645823F0977F3117882F4460429724CE54428C33FAD305F5C1EE8F4F765FCAA867293B0326636D2E47CDBA5A96583BD4B6F7A4D31EC0BC014FD901B82EE079FA2833FD35BB23D27C277FBC8AE2E8BAA867293B0326636D2E47CDBA5A96583BA9C0B312567BB231DD303D21008E29813377AFFFEAFD269A417C69337E82CC2E827F84554CEF50127C277FBC8AE2E8BA83251EDC214901ED5E8D9A59859A8B66F6A3E018CF4DC80089D37D7C0E48F6C5571747095F342E88FB05168BE4CE3AF X-C1DE0DAB: C20DE7B7AB408E4181F030C43753B8186998911F362727C414F749A5E30D975CBBB17C150BCA6793C6FDBC2ADFD12930C4A8CF46E0A7A20F9C2B6934AE262D3EE7EAB7254005DCED7532B743992DF240BDC6A1CF3F042BAD6DF99611D93F60EF783E2B6F79C23BED699F904B3F4130E343918A1A30D5E7FCCB5012B2E24CD356 X-C8649E89: 4E36BF7865823D7055A7F0CF078B5EC49A30900B95165D349F4E6BC39AAD02BB685B93DB1143DA7BF20FCB0E8A5934A363F736915B5FE949F5F2B10E722986D41D7E09C32AA3244C67CD98A2F5B6B618FFA2E3CFFD712220A995755A1445935E729B2BEF169E0186 X-D57D3AED: 3ZO7eAau8CL7WIMRKs4sN3D3tLDjz0dLbV79QFUyzQ2Ujvy7cMT6pYYqY16iZVKkSc3dCLJ7zSJH7+u4VD18S7Vl4ZUrpaVfd2+vE6kuoey4m4VkSEu530nj6fImhcD4MUrOEAnl0W826KZ9Q+tr5ycPtXkTV4k65bRjmOUUP8cvGozZ33TWg5HZplvhhXbhDGzqmQDTd6OAevLeAnq3Ra9uf7zvY2zzsIhlcp/Y7m53TZgf2aB4JOg4gkr2bioj/2kGGGWRGj6Um5BZP9ObBg== X-Mailru-Sender: 689FA8AB762F7393C37E3C1AEC41BA5DEC894A0BF6F0A72679EEF08DD38C19B383D72C36FC87018B9F80AB2734326CD2FB559BB5D741EB96352A0ABBE4FDA4210A04DAD6CC59E33667EA787935ED9F1B X-Mras: Ok Subject: [Tarantool-patches] [PATCH v1 5/8] sql: rework TRIM() function X-BeenThere: tarantool-patches@dev.tarantool.org X-Mailman-Version: 2.1.34 Precedence: list List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , From: Mergen Imeev via Tarantool-patches Reply-To: imeevma@tarantool.org Errors-To: 
tarantool-patches-bounces@dev.tarantool.org Sender: "Tarantool-patches" This patch refactors TRIM() and fixes an issue with incorrect trimming of some VARBINARY values. Part of #4415 --- src/box/sql/func.c | 365 ++++++++++++++++------------------ test/sql-tap/badutf1.test.lua | 41 ++-- 2 files changed, 202 insertions(+), 204 deletions(-) diff --git a/src/box/sql/func.c b/src/box/sql/func.c index 17d5fab8f..1d1a8b0cd 100644 --- a/src/box/sql/func.c +++ b/src/box/sql/func.c @@ -360,6 +360,176 @@ func_nullif(struct sql_context *ctx, int argc, struct Mem *argv) ctx->is_aborted = true; } +static inline void +return_empty_str(struct sql_context *ctx, bool is_str) +{ + return is_str ? mem_set_str_static(ctx->pOut, "", 0) : + mem_set_bin_static(ctx->pOut, "", 0); +} + +/** Implementation of the TRIM() function. */ +static inline int +trim_bin_end(const char *str, int end, const char *octets, int octets_size, + int flags) +{ + if ((flags & TRIM_TRAILING) == 0) + return end; + while (end > 0) { + bool is_trimmed = false; + char c = str[end - 1]; + for (int i = 0; i < octets_size && !is_trimmed; ++i) + is_trimmed = c == octets[i]; + if (!is_trimmed) + break; + --end; + } + return end; +} + +static inline int +trim_bin_start(const char *str, int end, const char *octets, int octets_size, + int flags) +{ + if ((flags & TRIM_LEADING) == 0) + return 0; + int start = 0; + while (start < end) { + bool is_trimmed = false; + char c = str[start]; + for (int i = 0; i < octets_size && !is_trimmed; ++i) + is_trimmed = c == octets[i]; + if (!is_trimmed) + break; + ++start; + } + return start; +} + +static void +func_trim_bin(struct sql_context *ctx, int argc, struct Mem *argv) +{ + if (mem_is_null(&argv[0]) || (argc == 3 && mem_is_null(&argv[2]))) + return; + assert(argc == 2 || (argc == 3 && mem_is_bin(&argv[2]))); + assert(mem_is_bin(&argv[0]) && mem_is_uint(&argv[1])); + const char *str = argv[0].z; + int size = argv[0].n; + const char *octets; + int octets_size; + if (argc == 3) { + 
octets = argv[2].z; + octets_size = argv[2].n; + } else { + octets = "\0"; + octets_size = 1; + } + + int flags = argv[1].u.u; + int end = trim_bin_end(str, size, octets, octets_size, flags); + int start = trim_bin_start(str, end, octets, octets_size, flags); + + if (start >= end) + return return_empty_str(ctx, false); + if (mem_copy_bin(ctx->pOut, &str[start], end - start) != 0) + ctx->is_aborted = true; +} + +static inline int +trim_str_end(const char *str, int end, const char *chars, uint8_t *chars_len, + int chars_count, int flags) +{ + if ((flags & TRIM_TRAILING) == 0) + return end; + while (end > 0) { + bool is_trimmed = false; + const char *c = chars; + int len; + for (int i = 0; i < chars_count && !is_trimmed; ++i) { + len = chars_len[i]; + const char *s = str + end - len; + is_trimmed = len <= end && memcmp(c, s, len) == 0; + c += len; + } + if (!is_trimmed) + break; + assert(len > 0); + end -= len; + } + return end; +} + +static inline int +trim_str_start(const char *str, int end, const char *chars, uint8_t *chars_len, + int chars_count, int flags) +{ + if ((flags & TRIM_LEADING) == 0) + return 0; + int start = 0; + while (start < end) { + bool is_trimmed = false; + const char *c = chars; + int len; + for (int i = 0; i < chars_count && !is_trimmed; ++i) { + len = chars_len[i]; + const char *s = str + start; + is_trimmed = start + len <= end && + memcmp(c, s, len) == 0; + c += len; + } + if (!is_trimmed) + break; + assert(len > 0); + start += len; + } + return start; +} + +static void +func_trim_str(struct sql_context *ctx, int argc, struct Mem *argv) +{ + if (mem_is_null(&argv[0]) || (argc == 3 && mem_is_null(&argv[2]))) + return; + assert(argc == 2 || (argc == 3 && mem_is_str(&argv[2]))); + assert(mem_is_str(&argv[0]) && mem_is_uint(&argv[1])); + const char *str = argv[0].z; + int size = argv[0].n; + const char *chars; + int chars_size; + if (argc == 3) { + chars = argv[2].z; + chars_size = argv[2].n; + } else { + chars = " "; + chars_size = 1; + } + + 
uint8_t *chars_len = sqlDbMallocRawNN(sql_get(), + chars_size * sizeof(uint8_t)); + if (chars_len == NULL) { + ctx->is_aborted = true; + return; + } + int chars_count = 0; + int i = 0; + while (i < chars_size) { + uint8_t len = utf8_len_char(chars[i]); + i += len; + if (i <= chars_size) + chars_len[chars_count++] = len; + } + + uint64_t flags = argv[1].u.u; + int end = trim_str_end(str, size, chars, chars_len, chars_count, flags); + int start = trim_str_start(str, end, chars, chars_len, chars_count, + flags); + sqlDbFree(sql_get(), chars_len); + + if (start >= end) + return mem_set_str0_static(ctx->pOut, ""); + if (mem_copy_str(ctx->pOut, &str[start], end - start) != 0) + ctx->is_aborted = true; +} + static const unsigned char * mem_as_ustr(struct Mem *mem) { @@ -1545,193 +1715,6 @@ replaceFunc(struct sql_context *context, int argc, struct Mem *argv) mem_set_bin_dynamic(context->pOut, (char *)zOut, j); } -/** - * Remove characters included in @a trim_set from @a input_str - * until encounter a character that doesn't belong to @a trim_set. - * Remove from the side specified by @a flags. - * @param context SQL context. - * @param flags Trim specification: left, right or both. - * @param trim_set The set of characters for trimming. - * @param char_len Lengths of each UTF-8 character in @a trim_set. - * @param char_cnt A number of UTF-8 characters in @a trim_set. - * @param input_str Input string for trimming. - * @param input_str_sz Input string size in bytes. 
- */ -static void -trim_procedure(struct sql_context *context, enum trim_side_mask flags, - const unsigned char *trim_set, const uint8_t *char_len, - int char_cnt, const unsigned char *input_str, int input_str_sz) -{ - if (char_cnt == 0) - goto finish; - int i, len; - const unsigned char *z; - if ((flags & TRIM_LEADING) != 0) { - while (input_str_sz > 0) { - z = trim_set; - for (i = 0; i < char_cnt; ++i, z += len) { - len = char_len[i]; - if (len <= input_str_sz - && memcmp(input_str, z, len) == 0) - break; - } - if (i >= char_cnt) - break; - input_str += len; - input_str_sz -= len; - } - } - if ((flags & TRIM_TRAILING) != 0) { - while (input_str_sz > 0) { - z = trim_set; - for (i = 0; i < char_cnt; ++i, z += len) { - len = char_len[i]; - if (len <= input_str_sz - && memcmp(&input_str[input_str_sz - len], - z, len) == 0) - break; - } - if (i >= char_cnt) - break; - input_str_sz -= len; - } - } -finish: - if (context->func->def->returns == FIELD_TYPE_STRING) - mem_copy_str(context->pOut, (char *)input_str, input_str_sz); - else - mem_copy_bin(context->pOut, (char *)input_str, input_str_sz); -} - -/** - * Prepare arguments for trimming procedure. Allocate memory for - * @a char_len (array of lengths each character in @a trim_set) - * and fill it. - * - * @param context SQL context. - * @param trim_set The set of characters for trimming. - * @param[out] char_len Lengths of each character in @ trim_set. - * @retval >=0 A number of UTF-8 characters in @a trim_set. - * @retval -1 Memory allocation error. - */ -static int -trim_prepare_char_len(struct sql_context *context, - const unsigned char *trim_set, int trim_set_sz, - uint8_t **char_len) -{ - /* - * Count the number of UTF-8 characters passing through - * the entire char set, but not up to the '\0' or X'00' - * character. This allows to handle trimming set - * containing such characters. 
- */ - int char_cnt = sql_utf8_char_count(trim_set, trim_set_sz); - if (char_cnt == 0) { - *char_len = NULL; - return 0; - } - - if ((*char_len = (uint8_t *)contextMalloc(context, char_cnt)) == NULL) - return -1; - - int i = 0, j = 0; - while(j < char_cnt) { - int old_i = i; - SQL_UTF8_FWD_1(trim_set, i, trim_set_sz); - (*char_len)[j++] = i - old_i; - } - - return char_cnt; -} - -/** - * Normalize args from @a argv input array when it has two args. - * - * Case: TRIM() - * Call trimming procedure with TRIM_BOTH as the flags and " " as - * the trimming set. - * - * Case: TRIM(LEADING/TRAILING/BOTH FROM ) - * If user has specified side keyword only, then call trimming - * procedure with the specified side and " " as the trimming set. - */ -static void -trim_func_two_args(struct sql_context *context, sql_value *arg1, - sql_value *arg2) -{ - const unsigned char *trim_set; - if (mem_is_bin(arg1)) - trim_set = (const unsigned char *)"\0"; - else - trim_set = (const unsigned char *)" "; - const unsigned char *input_str; - if ((input_str = mem_as_ustr(arg1)) == NULL) - return; - - int input_str_sz = mem_len_unsafe(arg1); - assert(arg2->type == MEM_TYPE_UINT); - uint8_t len_one = 1; - trim_procedure(context, arg2->u.u, trim_set, - &len_one, 1, input_str, input_str_sz); -} - -/** - * Normalize args from @a argv input array when it has three args. - * - * Case: TRIM( FROM ) - * If user has specified only, call trimming procedure with - * TRIM_BOTH as the flags and that trimming set. - * - * Case: TRIM(LEADING/TRAILING/BOTH FROM ) - * If user has specified side keyword and , then - * call trimming procedure with that args. 
- */ -static void -trim_func_three_args(struct sql_context *context, sql_value *arg1, - sql_value *arg2, sql_value *arg3) -{ - assert(arg2->type == MEM_TYPE_UINT); - const unsigned char *input_str, *trim_set; - if ((input_str = mem_as_ustr(arg1)) == NULL || - (trim_set = mem_as_ustr(arg3)) == NULL) - return; - - int trim_set_sz = mem_len_unsafe(arg3); - int input_str_sz = mem_len_unsafe(arg1); - uint8_t *char_len; - int char_cnt = trim_prepare_char_len(context, trim_set, trim_set_sz, - &char_len); - if (char_cnt == -1) - return; - trim_procedure(context, arg2->u.u, trim_set, char_len, - char_cnt, input_str, input_str_sz); - sql_free(char_len); -} - -/** - * Normalize args from @a argv input array when it has one, - * two or three args. - * - * This is a dispatcher function that calls corresponding - * implementation depending on the number of arguments. -*/ -static void -trim_func(struct sql_context *context, int argc, struct Mem *argv) -{ - switch (argc) { - case 2: - trim_func_two_args(context, &argv[0], &argv[1]); - break; - case 3: - trim_func_three_args(context, &argv[0], &argv[1], &argv[2]); - break; - default: - diag_set(ClientError, ER_FUNC_WRONG_ARG_COUNT, "TRIM", - "2 or 3", argc); - context->is_aborted = true; - } -} - /* * Compute the soundex encoding of a word. 
* @@ -2058,14 +2041,14 @@ static struct sql_func_definition definitions[] = { fin_total}, {"TRIM", 2, {FIELD_TYPE_STRING, FIELD_TYPE_INTEGER}, - FIELD_TYPE_STRING, trim_func, NULL}, + FIELD_TYPE_STRING, func_trim_str, NULL}, {"TRIM", 3, {FIELD_TYPE_STRING, FIELD_TYPE_INTEGER, FIELD_TYPE_STRING}, - FIELD_TYPE_STRING, trim_func, NULL}, + FIELD_TYPE_STRING, func_trim_str, NULL}, {"TRIM", 2, {FIELD_TYPE_VARBINARY, FIELD_TYPE_INTEGER}, - FIELD_TYPE_VARBINARY, trim_func, NULL}, + FIELD_TYPE_VARBINARY, func_trim_bin, NULL}, {"TRIM", 3, {FIELD_TYPE_VARBINARY, FIELD_TYPE_INTEGER, FIELD_TYPE_VARBINARY}, - FIELD_TYPE_VARBINARY, trim_func, NULL}, + FIELD_TYPE_VARBINARY, func_trim_bin, NULL}, {"TYPEOF", 1, {FIELD_TYPE_ANY}, FIELD_TYPE_STRING, typeofFunc, NULL}, {"UNICODE", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, unicodeFunc, diff --git a/test/sql-tap/badutf1.test.lua b/test/sql-tap/badutf1.test.lua index b25436186..9fae91621 100755 --- a/test/sql-tap/badutf1.test.lua +++ b/test/sql-tap/badutf1.test.lua @@ -1,6 +1,6 @@ #!/usr/bin/env tarantool local test = require("sqltester") -test:plan(23) +test:plan(24) --!./tcltestrunner.lua -- 2007 May 15 @@ -336,47 +336,62 @@ test:do_test( test:do_test( "badutf-4.4", function() - return test:execsql2([[SELECT hex(CAST(TRIM(x'ff80' FROM ]].. - [[x'808080f0808080ff') AS VARBINARY)) AS x]]) + return test:execsql2([[ + SELECT hex(TRIM(x'ff80' FROM x'808080f0808080ff')) AS x; + ]]) end, { -- - "X", "808080F0808080FF" + "X", "F0" -- }) test:do_test( "badutf-4.5", function() - return test:execsql2([[SELECT hex(CAST(TRIM(x'ff80' FROM ]].. - [[x'ff8080f0808080ff') AS VARBINARY)) AS x]]) + return test:execsql2([[ + SELECT hex(TRIM(x'ff80' FROM x'ff8080f0808080ff')) AS x; + ]]) end, { -- - "X", "80F0808080FF" + "X", "F0" -- }) test:do_test( "badutf-4.6", function() - return test:execsql2([[SELECT hex(CAST(TRIM(x'ff80' FROM ]].. 
- [[x'ff80f0808080ff') AS VARBINARY)) AS x]]) + return test:execsql2([[ + SELECT hex(TRIM(x'ff80' FROM x'ff80f0808080ff')) AS x; + ]]) end, { -- - "X", "F0808080FF" + "X", "F0" -- }) test:do_test( "badutf-4.7", function() - return test:execsql2([[SELECT hex(CAST(TRIM(x'ff8080' FROM ]].. - [[x'ff80f0808080ff') AS VARBINARY)) AS x]]) + return test:execsql2([[ + SELECT hex(TRIM(x'ff8080' FROM x'ff80f0808080ff')) AS x; + ]]) end, { -- - "X", "FF80F0808080FF" + "X", "F0" -- }) +-- gh-4145: Make sure that TRIM() properly work with VARBINARY. +test:do_execsql_test( + "badutf-5", + [[ + SELECT HEX(TRIM(x'ff1234' from x'1234125678123412')); + ]], + { + '5678' + } +) + --db2("close") -- 2.25.1