From: Mergen Imeev via Tarantool-patches <tarantool-patches@dev.tarantool.org> To: v.shpilevoy@tarantool.org, tarantool-patches@dev.tarantool.org Subject: Re: [Tarantool-patches] [PATCH v1 5/8] sql: rework TRIM() function Date: Wed, 20 Oct 2021 20:05:23 +0300 [thread overview] Message-ID: <20211020170523.GD203963@tarantool.org> (raw) In-Reply-To: <9424a550a61d32fd8ec2e578ea9cc00b45fbe4ed.1633105483.git.imeevma@gmail.com> Thank you for the review! I replaced self-created function by ucnv_getNextUChar(). Diff and new patch below. On Fri, Oct 01, 2021 at 07:29:32PM +0300, Mergen Imeev via Tarantool-patches wrote: <cut> Diff: diff --git a/src/box/sql/func.c b/src/box/sql/func.c index ed7a85c83..1294ff5b3 100644 --- a/src/box/sql/func.c +++ b/src/box/sql/func.c @@ -494,12 +494,14 @@ func_trim_str(struct sql_context *ctx, int argc, struct Mem *argv) return; } int chars_count = 0; - int i = 0; - while (i < chars_size) { - uint8_t len = utf8_len_char(chars[i]); - i += len; - if (i <= chars_size) - chars_len[chars_count++] = len; + + UErrorCode err = U_ZERO_ERROR; + const char *pos_start = chars; + const char *pos_end = chars + chars_size; + while (pos_start < pos_end) { + const char *cur = pos_start; + ucnv_getNextUChar(icu_utf8_conv, &pos_start, pos_end, &err); + chars_len[chars_count++] = pos_start - cur; } uint64_t flags = argv[1].u.u; New patch: commit d91a925090822bf074751f96b65d26bb35f5e5e2 Author: Mergen Imeev <imeevma@gmail.com> Date: Tue Sep 21 19:45:36 2021 +0300 sql: rework TRIM() function This patch refactoring TRIM() and fixes an issue with incorrect trimming of some VARBINARY values. Also, TRIM() now use ICU functions instead of self-created. Part of #4415 diff --git a/src/box/sql/func.c b/src/box/sql/func.c index d36c83501..1294ff5b3 100644 --- a/src/box/sql/func.c +++ b/src/box/sql/func.c @@ -344,6 +344,178 @@ func_nullif(struct sql_context *ctx, int argc, struct Mem *argv) ctx->is_aborted = true; } +static inline void +return_empty_str(struct sql_context *ctx, bool is_str) +{ + return is_str ? mem_set_str_static(ctx->pOut, "", 0) : + mem_set_bin_static(ctx->pOut, "", 0); +} + +/** Implementation of the TRIM() function. */ +static inline int +trim_bin_end(const char *str, int end, const char *octets, int octets_size, + int flags) +{ + if ((flags & TRIM_TRAILING) == 0) + return end; + while (end > 0) { + bool is_trimmed = false; + char c = str[end - 1]; + for (int i = 0; i < octets_size && !is_trimmed; ++i) + is_trimmed = c == octets[i]; + if (!is_trimmed) + break; + --end; + } + return end; +} + +static inline int +trim_bin_start(const char *str, int end, const char *octets, int octets_size, + int flags) +{ + if ((flags & TRIM_LEADING) == 0) + return 0; + int start = 0; + while (start < end) { + bool is_trimmed = false; + char c = str[start]; + for (int i = 0; i < octets_size && !is_trimmed; ++i) + is_trimmed = c == octets[i]; + if (!is_trimmed) + break; + ++start; + } + return start; +} + +static void +func_trim_bin(struct sql_context *ctx, int argc, struct Mem *argv) +{ + if (mem_is_null(&argv[0]) || (argc == 3 && mem_is_null(&argv[2]))) + return; + assert(argc == 2 || (argc == 3 && mem_is_bin(&argv[2]))); + assert(mem_is_bin(&argv[0]) && mem_is_uint(&argv[1])); + const char *str = argv[0].z; + int size = argv[0].n; + const char *octets; + int octets_size; + if (argc == 3) { + octets = argv[2].z; + octets_size = argv[2].n; + } else { + octets = "\0"; + octets_size = 1; + } + + int flags = argv[1].u.u; + int end = trim_bin_end(str, size, octets, octets_size, flags); + int start = trim_bin_start(str, end, octets, octets_size, flags); + + if (start >= end) + return return_empty_str(ctx, false); + if (mem_copy_bin(ctx->pOut, &str[start], end - start) != 0) + ctx->is_aborted = true; +} + +static inline int +trim_str_end(const char *str, int end, const char *chars, uint8_t *chars_len, + int chars_count, int flags) +{ + if ((flags & TRIM_TRAILING) == 0) + return end; + while (end > 0) { + bool is_trimmed = false; + const char *c = chars; + int len; + for (int i = 0; i < chars_count && !is_trimmed; ++i) { + len = chars_len[i]; + const char *s = str + end - len; + is_trimmed = len <= end && memcmp(c, s, len) == 0; + c += len; + } + if (!is_trimmed) + break; + assert(len > 0); + end -= len; + } + return end; +} + +static inline int +trim_str_start(const char *str, int end, const char *chars, uint8_t *chars_len, + int chars_count, int flags) +{ + if ((flags & TRIM_LEADING) == 0) + return 0; + int start = 0; + while (start < end) { + bool is_trimmed = false; + const char *c = chars; + int len; + for (int i = 0; i < chars_count && !is_trimmed; ++i) { + len = chars_len[i]; + const char *s = str + start; + is_trimmed = start + len <= end && + memcmp(c, s, len) == 0; + c += len; + } + if (!is_trimmed) + break; + assert(len > 0); + start += len; + } + return start; +} + +static void +func_trim_str(struct sql_context *ctx, int argc, struct Mem *argv) +{ + if (mem_is_null(&argv[0]) || (argc == 3 && mem_is_null(&argv[2]))) + return; + assert(argc == 2 || (argc == 3 && mem_is_str(&argv[2]))); + assert(mem_is_str(&argv[0]) && mem_is_uint(&argv[1])); + const char *str = argv[0].z; + int size = argv[0].n; + const char *chars; + int chars_size; + if (argc == 3) { + chars = argv[2].z; + chars_size = argv[2].n; + } else { + chars = " "; + chars_size = 1; + } + + uint8_t *chars_len = sqlDbMallocRawNN(sql_get(), + chars_size * sizeof(uint8_t)); + if (chars_len == NULL) { + ctx->is_aborted = true; + return; + } + int chars_count = 0; + + UErrorCode err = U_ZERO_ERROR; + const char *pos_start = chars; + const char *pos_end = chars + chars_size; + while (pos_start < pos_end) { + const char *cur = pos_start; + ucnv_getNextUChar(icu_utf8_conv, &pos_start, pos_end, &err); + chars_len[chars_count++] = pos_start - cur; + } + + uint64_t flags = argv[1].u.u; + int end = trim_str_end(str, size, chars, chars_len, chars_count, flags); + int start = trim_str_start(str, end, chars, chars_len, chars_count, + flags); + sqlDbFree(sql_get(), chars_len); + + if (start >= end) + return mem_set_str0_static(ctx->pOut, ""); + if (mem_copy_str(ctx->pOut, &str[start], end - start) != 0) + ctx->is_aborted = true; +} + static const unsigned char * mem_as_ustr(struct Mem *mem) { @@ -1528,193 +1700,6 @@ replaceFunc(struct sql_context *context, int argc, struct Mem *argv) mem_set_bin_dynamic(context->pOut, (char *)zOut, j); } -/** - * Remove characters included in @a trim_set from @a input_str - * until encounter a character that doesn't belong to @a trim_set. - * Remove from the side specified by @a flags. - * @param context SQL context. - * @param flags Trim specification: left, right or both. - * @param trim_set The set of characters for trimming. - * @param char_len Lengths of each UTF-8 character in @a trim_set. - * @param char_cnt A number of UTF-8 characters in @a trim_set. - * @param input_str Input string for trimming. - * @param input_str_sz Input string size in bytes. - */ -static void -trim_procedure(struct sql_context *context, enum trim_side_mask flags, - const unsigned char *trim_set, const uint8_t *char_len, - int char_cnt, const unsigned char *input_str, int input_str_sz) -{ - if (char_cnt == 0) - goto finish; - int i, len; - const unsigned char *z; - if ((flags & TRIM_LEADING) != 0) { - while (input_str_sz > 0) { - z = trim_set; - for (i = 0; i < char_cnt; ++i, z += len) { - len = char_len[i]; - if (len <= input_str_sz - && memcmp(input_str, z, len) == 0) - break; - } - if (i >= char_cnt) - break; - input_str += len; - input_str_sz -= len; - } - } - if ((flags & TRIM_TRAILING) != 0) { - while (input_str_sz > 0) { - z = trim_set; - for (i = 0; i < char_cnt; ++i, z += len) { - len = char_len[i]; - if (len <= input_str_sz - && memcmp(&input_str[input_str_sz - len], - z, len) == 0) - break; - } - if (i >= char_cnt) - break; - input_str_sz -= len; - } - } -finish: - if (context->func->def->returns == FIELD_TYPE_STRING) - mem_copy_str(context->pOut, (char *)input_str, input_str_sz); - else - mem_copy_bin(context->pOut, (char *)input_str, input_str_sz); -} - -/** - * Prepare arguments for trimming procedure. Allocate memory for - * @a char_len (array of lengths each character in @a trim_set) - * and fill it. - * - * @param context SQL context. - * @param trim_set The set of characters for trimming. - * @param[out] char_len Lengths of each character in @ trim_set. - * @retval >=0 A number of UTF-8 characters in @a trim_set. - * @retval -1 Memory allocation error. - */ -static int -trim_prepare_char_len(struct sql_context *context, - const unsigned char *trim_set, int trim_set_sz, - uint8_t **char_len) -{ - /* - * Count the number of UTF-8 characters passing through - * the entire char set, but not up to the '\0' or X'00' - * character. This allows to handle trimming set - * containing such characters. - */ - int char_cnt = sql_utf8_char_count(trim_set, trim_set_sz); - if (char_cnt == 0) { - *char_len = NULL; - return 0; - } - - if ((*char_len = (uint8_t *)contextMalloc(context, char_cnt)) == NULL) - return -1; - - int i = 0, j = 0; - while(j < char_cnt) { - int old_i = i; - SQL_UTF8_FWD_1(trim_set, i, trim_set_sz); - (*char_len)[j++] = i - old_i; - } - - return char_cnt; -} - -/** - * Normalize args from @a argv input array when it has two args. - * - * Case: TRIM(<str>) - * Call trimming procedure with TRIM_BOTH as the flags and " " as - * the trimming set. - * - * Case: TRIM(LEADING/TRAILING/BOTH FROM <str>) - * If user has specified side keyword only, then call trimming - * procedure with the specified side and " " as the trimming set. - */ -static void -trim_func_two_args(struct sql_context *context, sql_value *arg1, - sql_value *arg2) -{ - const unsigned char *trim_set; - if (mem_is_bin(arg1)) - trim_set = (const unsigned char *)"\0"; - else - trim_set = (const unsigned char *)" "; - const unsigned char *input_str; - if ((input_str = mem_as_ustr(arg1)) == NULL) - return; - - int input_str_sz = mem_len_unsafe(arg1); - assert(arg2->type == MEM_TYPE_UINT); - uint8_t len_one = 1; - trim_procedure(context, arg2->u.u, trim_set, - &len_one, 1, input_str, input_str_sz); -} - -/** - * Normalize args from @a argv input array when it has three args. - * - * Case: TRIM(<character_set> FROM <str>) - * If user has specified <character_set> only, call trimming procedure with - * TRIM_BOTH as the flags and that trimming set. - * - * Case: TRIM(LEADING/TRAILING/BOTH <character_set> FROM <str>) - * If user has specified side keyword and <character_set>, then - * call trimming procedure with that args. - */ -static void -trim_func_three_args(struct sql_context *context, sql_value *arg1, - sql_value *arg2, sql_value *arg3) -{ - assert(arg2->type == MEM_TYPE_UINT); - const unsigned char *input_str, *trim_set; - if ((input_str = mem_as_ustr(arg1)) == NULL || - (trim_set = mem_as_ustr(arg3)) == NULL) - return; - - int trim_set_sz = mem_len_unsafe(arg3); - int input_str_sz = mem_len_unsafe(arg1); - uint8_t *char_len; - int char_cnt = trim_prepare_char_len(context, trim_set, trim_set_sz, - &char_len); - if (char_cnt == -1) - return; - trim_procedure(context, arg2->u.u, trim_set, char_len, - char_cnt, input_str, input_str_sz); - sql_free(char_len); -} - -/** - * Normalize args from @a argv input array when it has one, - * two or three args. - * - * This is a dispatcher function that calls corresponding - * implementation depending on the number of arguments. -*/ -static void -trim_func(struct sql_context *context, int argc, struct Mem *argv) -{ - switch (argc) { - case 2: - trim_func_two_args(context, &argv[0], &argv[1]); - break; - case 3: - trim_func_three_args(context, &argv[0], &argv[1], &argv[2]); - break; - default: - diag_set(ClientError, ER_FUNC_WRONG_ARG_COUNT, "TRIM", - "2 or 3", argc); - context->is_aborted = true; - } -} - /* * Compute the soundex encoding of a word. * @@ -2041,14 +2026,14 @@ static struct sql_func_definition definitions[] = { fin_total}, {"TRIM", 2, {FIELD_TYPE_STRING, FIELD_TYPE_INTEGER}, - FIELD_TYPE_STRING, trim_func, NULL}, + FIELD_TYPE_STRING, func_trim_str, NULL}, {"TRIM", 3, {FIELD_TYPE_STRING, FIELD_TYPE_INTEGER, FIELD_TYPE_STRING}, - FIELD_TYPE_STRING, trim_func, NULL}, + FIELD_TYPE_STRING, func_trim_str, NULL}, {"TRIM", 2, {FIELD_TYPE_VARBINARY, FIELD_TYPE_INTEGER}, - FIELD_TYPE_VARBINARY, trim_func, NULL}, + FIELD_TYPE_VARBINARY, func_trim_bin, NULL}, {"TRIM", 3, {FIELD_TYPE_VARBINARY, FIELD_TYPE_INTEGER, FIELD_TYPE_VARBINARY}, - FIELD_TYPE_VARBINARY, trim_func, NULL}, + FIELD_TYPE_VARBINARY, func_trim_bin, NULL}, {"TYPEOF", 1, {FIELD_TYPE_ANY}, FIELD_TYPE_STRING, typeofFunc, NULL}, {"UNICODE", 1, {FIELD_TYPE_STRING}, FIELD_TYPE_INTEGER, unicodeFunc, diff --git a/test/sql-tap/badutf1.test.lua b/test/sql-tap/badutf1.test.lua index 27f17168b..e8b162ecc 100755 --- a/test/sql-tap/badutf1.test.lua +++ b/test/sql-tap/badutf1.test.lua @@ -1,6 +1,6 @@ #!/usr/bin/env tarantool local test = require("sqltester") -test:plan(23) +test:plan(24) --!./tcltestrunner.lua -- 2007 May 15 @@ -336,47 +336,62 @@ test:do_test( test:do_test( "badutf-4.4", function() - return test:execsql2([[SELECT hex(CAST(TRIM(x'ff80' FROM ]].. - [[x'808080f0808080ff') AS VARBINARY)) AS x]]) + return test:execsql2([[ + SELECT hex(TRIM(x'ff80' FROM x'808080f0808080ff')) AS x; + ]]) end, { -- <badutf-4.4> - "X", "808080F0808080FF" + "X", "F0" -- </badutf-4.4> }) test:do_test( "badutf-4.5", function() - return test:execsql2([[SELECT hex(CAST(TRIM(x'ff80' FROM ]].. - [[x'ff8080f0808080ff') AS VARBINARY)) AS x]]) + return test:execsql2([[ + SELECT hex(TRIM(x'ff80' FROM x'ff8080f0808080ff')) AS x; + ]]) end, { -- <badutf-4.5> - "X", "80F0808080FF" + "X", "F0" -- </badutf-4.5> }) test:do_test( "badutf-4.6", function() - return test:execsql2([[SELECT hex(CAST(TRIM(x'ff80' FROM ]].. - [[x'ff80f0808080ff') AS VARBINARY)) AS x]]) + return test:execsql2([[ + SELECT hex(TRIM(x'ff80' FROM x'ff80f0808080ff')) AS x; + ]]) end, { -- <badutf-4.6> - "X", "F0808080FF" + "X", "F0" -- </badutf-4.6> }) test:do_test( "badutf-4.7", function() - return test:execsql2([[SELECT hex(CAST(TRIM(x'ff8080' FROM ]].. - [[x'ff80f0808080ff') AS VARBINARY)) AS x]]) + return test:execsql2([[ + SELECT hex(TRIM(x'ff8080' FROM x'ff80f0808080ff')) AS x; + ]]) end, { -- <badutf-4.7> - "X", "FF80F0808080FF" + "X", "F0" -- </badutf-4.7> }) +-- gh-4145: Make sure that TRIM() properly work with VARBINARY. +test:do_execsql_test( + "badutf-5", + [[ + SELECT HEX(TRIM(x'ff1234' from x'1234125678123412')); + ]], + { + '5678' + } +) + --db2("close")
next prev parent reply other threads:[~2021-10-20 17:05 UTC|newest] Thread overview: 36+ messages / expand[flat|nested] mbox.gz Atom feed top 2021-10-01 16:29 [Tarantool-patches] [PATCH v1 0/8] Rework standard function Mergen Imeev via Tarantool-patches 2021-10-01 16:29 ` [Tarantool-patches] [PATCH v1 1/8] sql: refactor ABS() funcion Mergen Imeev via Tarantool-patches 2021-10-08 21:55 ` Vladislav Shpilevoy via Tarantool-patches 2021-10-20 16:52 ` Mergen Imeev via Tarantool-patches 2021-10-28 22:11 ` Vladislav Shpilevoy via Tarantool-patches 2021-11-01 10:11 ` Mergen Imeev via Tarantool-patches 2021-11-01 13:37 ` Vladislav Shpilevoy via Tarantool-patches 2021-10-01 16:29 ` [Tarantool-patches] [PATCH v1 2/8] sql: refactor CHAR_LENGTH() function Mergen Imeev via Tarantool-patches 2021-10-08 21:56 ` Vladislav Shpilevoy via Tarantool-patches 2021-10-20 16:58 ` Mergen Imeev via Tarantool-patches 2021-10-28 22:11 ` Vladislav Shpilevoy via Tarantool-patches 2021-11-01 10:20 ` Mergen Imeev via Tarantool-patches 2021-10-01 16:29 ` [Tarantool-patches] [PATCH v1 3/8] sql: refactor UPPER() and LOWER() functions Mergen Imeev via Tarantool-patches 2021-10-20 17:02 ` Mergen Imeev via Tarantool-patches 2021-10-01 16:29 ` [Tarantool-patches] [PATCH v1 4/8] sql: refactor NULLIF() function Mergen Imeev via Tarantool-patches 2021-10-01 16:29 ` [Tarantool-patches] [PATCH v1 5/8] sql: rework TRIM() function Mergen Imeev via Tarantool-patches 2021-10-20 17:05 ` Mergen Imeev via Tarantool-patches [this message] 2021-10-28 22:12 ` Vladislav Shpilevoy via Tarantool-patches 2021-11-01 10:35 ` Mergen Imeev via Tarantool-patches 2021-10-01 16:29 ` [Tarantool-patches] [PATCH v1 6/8] sql: rework POSITION() function Mergen Imeev via Tarantool-patches 2021-10-08 21:58 ` Vladislav Shpilevoy via Tarantool-patches 2021-10-20 17:08 ` Mergen Imeev via Tarantool-patches 2021-11-01 10:41 ` Mergen Imeev via Tarantool-patches 2021-10-01 16:29 ` [Tarantool-patches] [PATCH v1 7/8] sql: rework SUBSTR() function Mergen Imeev via Tarantool-patches 2021-10-08 22:02 ` Vladislav Shpilevoy via Tarantool-patches 2021-10-20 17:15 ` Mergen Imeev via Tarantool-patches 2021-10-28 22:13 ` Vladislav Shpilevoy via Tarantool-patches 2021-11-01 10:45 ` Mergen Imeev via Tarantool-patches 2021-10-01 16:29 ` [Tarantool-patches] [PATCH v1 8/8] sql: refactor LIKE() function Mergen Imeev via Tarantool-patches 2021-10-08 22:02 ` Vladislav Shpilevoy via Tarantool-patches 2021-10-20 17:19 ` Mergen Imeev via Tarantool-patches 2021-11-01 10:48 ` Mergen Imeev via Tarantool-patches 2021-11-01 10:53 ` Mergen Imeev via Tarantool-patches 2021-10-04 13:32 ` [Tarantool-patches] [PATCH v1 0/8] Rework standard function Mergen Imeev via Tarantool-patches 2021-11-01 13:38 ` Vladislav Shpilevoy via Tarantool-patches 2021-11-11 10:45 Mergen Imeev via Tarantool-patches 2021-11-11 10:45 ` [Tarantool-patches] [PATCH v1 5/8] sql: rework TRIM() function Mergen Imeev via Tarantool-patches
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20211020170523.GD203963@tarantool.org \ --to=tarantool-patches@dev.tarantool.org \ --cc=imeevma@tarantool.org \ --cc=v.shpilevoy@tarantool.org \ --subject='Re: [Tarantool-patches] [PATCH v1 5/8] sql: rework TRIM() function' \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox