[tarantool-patches] Re: [PATCH] sql: LIKE/LENGTH process '\0'
i.koptelov
ivan.koptelov at tarantool.org
Thu Feb 7 18:14:36 MSK 2019
> On 5 Feb 2019, at 16:50, n.pettik <korablev at tarantool.org> wrote:
>
>
>> On 29/01/2019 19:35, n.pettik wrote:
>>>> Fixes LIKE and LENGTH functions. '\0' now treated as
>>>>
>>> Nit: is treated.
>> Fixed.
>>>> a usual symbol. Strings with '\0' are now processed
>>>> entirely. Consider examples:
>>>>
>>>> LENGTH(CHAR(65,00,65)) == 3
>>>> LIKE(CHAR(65,00,65), CHAR(65,00,66)) == False
>>>>
>>> Also, I see that smth wrong with text in this mail again
>> I hope now the mail text is ok.
>
> Not quite. It is still highlighted in some way. Have no idea.
Oh, I am really sorry. Does it look ok now? (I changed the mailing client)
>> src/box/sql/func.c | 88 +++++++++++++-----
>> src/box/sql/vdbeInt.h | 2 +-
>> test/sql-tap/func.test.lua | 220 ++++++++++++++++++++++++++++++++++++++++++++-
>> 3 files changed, 284 insertions(+), 26 deletions(-)
>>
>> diff --git a/src/box/sql/func.c b/src/box/sql/func.c
>> index e46b162d9..2978af983 100644
>> --- a/src/box/sql/func.c
>> +++ b/src/box/sql/func.c
>> @@ -128,6 +128,30 @@ typeofFunc(sqlite3_context * context, int NotUsed, sqlite3_value ** argv)
>> sqlite3_result_text(context, z, -1, SQLITE_STATIC);
>> }
>>
>> +/**
>> + * Return number of chars in the given string.
>> + *
>> + * Number of chars != byte size of string because some characters
>> + * are encoded with more than one byte. Also note that all
>> + * characters from 'str' to 'str + byte_len' would be counted,
>> + * even if there is a '\0' somewhere between them.
>> + * @param str String to be counted.
>> + * @param byte_len Byte length of given string.
>> + * @return
>>
> Return what?
Fixed.
>> + */
>> +static int
>> +count_chars(const unsigned char *str, size_t byte_len)
>>
> Quite poor naming. I would call it utf8_str_len or
> smth with utf8 prefix. Mb it is worth to put it some utils source file.
> Also, consider using native U8_NEXT function from utf8.c,
> instead of custom SQLITE_SKIP_UTF8. It may be not so fast
> but safer I suppose. I don't insist though.
>> +{
> What if str is NULL? Add at least an assertion.
>> + int n_chars = 0;
>> + const unsigned char *prev_z;
>> + for (size_t cnt = 0; cnt < byte_len; cnt += (str - prev_z)) {
>> + n_chars++;
>> + prev_z = str;
>> + SQLITE_SKIP_UTF8(str);
>> + }
>> + return n_chars;
>> +}
>>
> You can rewrite this function in a simpler way without using SQLITE macroses.
> Read this topic: https://stackoverflow.com/questions/3911536/utf-8-unicode-whats-with-0xc0-and-0x80/3911566#3911566
> It is quite useful. You may borrow implementation from there.
Both usage of U8_NEXT and the solution from stack overflow causes sql-tap/badutf1 test to fail,
while func.test (and other tests) are OK. In other words, SQLITE_SKIP_UTF8 and U8_NEXT
proceeds Invalid byte sequences differently.
As far as I understand, the purpose of the test is to check that malformed UTF-8 would not cause a crash.
So, I just used U8_NEXT and fixed the test.
(Since the function is used only in func.c, I prefer to keep it there and don’t move to separate utils file)
diff --git a/src/box/sql/func.c b/src/box/sql/func.c
index 2978af983..8ad75adb8 100644
--- a/src/box/sql/func.c
+++ b/src/box/sql/func.c
@@ -129,27 +129,29 @@ typeofFunc(sqlite3_context * context, int NotUsed, sqlite3_value ** argv)
}
/**
- * Return number of chars in the given string.
+ * Return number of symbols in the given string.
*
- * Number of chars != byte size of string because some characters
+ * Number of symbols != byte size of string because some symbols
* are encoded with more than one byte. Also note that all
- * characters from 'str' to 'str + byte_len' would be counted,
+ * symbols from 'str' to 'str + byte_len' would be counted,
* even if there is a '\0' somewhere between them.
* @param str String to be counted.
* @param byte_len Byte length of given string.
- * @return
+ * @return number of symbols in the given string.
*/
static int
-count_chars(const unsigned char *str, size_t byte_len)
+char_count(const unsigned char *str, size_t byte_len)
{
- int n_chars = 0;
- const unsigned char *prev_z;
- for (size_t cnt = 0; cnt < byte_len; cnt += (str - prev_z)) {
- n_chars++;
- prev_z = str;
- SQLITE_SKIP_UTF8(str);
+ int symbol_len = 0;
+ int offset = 0;
+ UChar32 res;
+ while (offset < (int) byte_len) {
+ U8_NEXT(str, offset, (int) byte_len, res)
+ if (res < 0)
+ break;
+ symbol_len++;
}
- return n_chars;
+ return symbol_len;
}
/*
@@ -174,7 +176,7 @@ lengthFunc(sqlite3_context * context, int argc, sqlite3_value ** argv)
const unsigned char *z = sqlite3_value_text(argv[0]);
if (z == 0)
return;
- len = count_chars(z, sqlite3_value_bytes(argv[0]));
+ len = char_count(z, sqlite3_value_bytes(argv[0]));
sqlite3_result_int(context, len);
break;
}
@@ -361,7 +363,7 @@ substrFunc(sqlite3_context * context, int argc, sqlite3_value ** argv)
return;
len = 0;
if (p1 < 0)
- len = count_chars(z, sqlite3_value_bytes(argv[0]));
+ len = char_count(z, sqlite3_value_bytes(argv[0]));
}
#ifdef SQLITE_SUBSTR_COMPATIBILITY
/* If SUBSTR_COMPATIBILITY is defined then substr(X,0,N) work the same as
@@ -410,7 +412,7 @@ substrFunc(sqlite3_context * context, int argc, sqlite3_value ** argv)
* used because '\0' is not supposed to be
* end-of-string symbol.
*/
- int n_chars = count_chars(z, sqlite3_value_bytes(argv[0]));
+ int n_chars = char_count(z, sqlite3_value_bytes(argv[0]));
int cnt = 0;
while (cnt < n_chars && p1) {
SQLITE_SKIP_UTF8(z);
diff --git a/test/sql-tap/badutf1.test.lua b/test/sql-tap/badutf1.test.lua
index 534c762ba..0a90d1b17 100755
--- a/test/sql-tap/badutf1.test.lua
+++ b/test/sql-tap/badutf1.test.lua
@@ -215,7 +215,7 @@ test:do_test(
return test:execsql2("SELECT length('\x80') AS x")
end, {
-- <badutf-3.1>
- "X", 1
+ "X", 0
-- </badutf-3.1>
})
@@ -235,7 +235,7 @@ test:do_test(
return test:execsql2("SELECT length('\x7f\x80\x81') AS x")
end, {
-- <badutf-3.3>
- "X", 3
+ "X", 1
-- </badutf-3.3>
})
@@ -245,7 +245,7 @@ test:do_test(
return test:execsql2("SELECT length('\x61\xc0') AS x")
end, {
-- <badutf-3.4>
- "X", 2
+ "X", 1
-- </badutf-3.4>
})
@@ -255,7 +255,7 @@ test:do_test(
return test:execsql2("SELECT length('\x61\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
end, {
-- <badutf-3.5>
- "X", 2
+ "X", 1
-- </badutf-3.5>
})
@@ -265,7 +265,7 @@ test:do_test(
return test:execsql2("SELECT length('\xc0\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
end, {
-- <badutf-3.6>
- "X", 1
+ "X", 0
-- </badutf-3.6>
})
@@ -275,7 +275,7 @@ test:do_test(
return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80') AS x")
end, {
-- <badutf-3.7>
- "X", 10
+ "X", 0
-- </badutf-3.7>
})
@@ -285,7 +285,7 @@ test:do_test(
return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\xf0\x80\x80\x80\x80') AS x")
end, {
-- <badutf-3.8>
- "X", 6
+ "X", 0
-- </badutf-3.8>
})
@@ -295,7 +295,7 @@ test:do_test(
return test:execsql2("SELECT length('\x80\x80\x80\x80\x80\xf0\x80\x80\x80\xff') AS x")
end, {
-- <badutf-3.9>
- "X", 7
+ "X", 0
-- </badutf-3.9>
})
>> +
>> /*
>> * Implementation of the length() function
>> */
>> @@ -150,11 +174,7 @@ lengthFunc(sqlite3_context * context, int argc, sqlite3_value ** argv)
>> const unsigned char *z = sqlite3_value_text(argv[0]);
>> if (z == 0)
>> return;
>> - len = 0;
>> - while (*z) {
>> - len++;
>> - SQLITE_SKIP_UTF8(z);
>> - }
>> + len = count_chars(z, sqlite3_value_bytes(argv[0]));
>> sqlite3_result_int(context, len);
>> break;
>> }
>> @@ -340,11 +360,8 @@ substrFunc(sqlite3_context * context, int argc, sqlite3_value ** argv)
>> if (z == 0)
>> return;
>> len = 0;
>> - if (p1 < 0) {
>> - for (z2 = z; *z2; len++) {
>> - SQLITE_SKIP_UTF8(z2);
>> - }
>> - }
>> + if (p1 < 0)
>> + len = count_chars(z, sqlite3_value_bytes(argv[0]));
>> }
>> #ifdef SQLITE_SUBSTR_COMPATIBILITY
>> /* If SUBSTR_COMPATIBILITY is defined then substr(X,0,N) work the same as
>> @@ -388,12 +405,21 @@ substrFunc(sqlite3_context * context, int argc, sqlite3_value ** argv)
>> }
>> assert(p1 >= 0 && p2 >= 0);
>> if (p0type != SQLITE_BLOB) {
>> - while (*z && p1) {
>> + /*
>> + * In the code below 'cnt' and 'n_chars' is
>> + * used because '\0' is not supposed to be
>> + * end-of-string symbol.
>> + */
>> + int n_chars = count_chars(z, sqlite3_value_bytes(argv[0]));
>>
> I’d better call it char_count or symbol_count or char_count.
>> diff --git a/test/sql-tap/func.test.lua b/test/sql-tap/func.test.lua
>> index b7de1d955..8c712bd5e 100755
>> --- a/test/sql-tap/func.test.lua
>> +++ b/test/sql-tap/func.test.lua
>> +-- REPLACE
>> +test:do_execsql_test(
>> + "func-62",
>> + "SELECT REPLACE(CHAR(00,65,00,65), CHAR(00), CHAR(65)) LIKE 'AAAA';",
>> + {1})
>> +
>> +test:do_execsql_test(
>> + "func-63",
>> + "SELECT REPLACE(CHAR(00,65,00,65), CHAR(65), CHAR(00)) \
>> + LIKE CHAR(00,00,00,00);",
>> + {1})
>> +
>> +-- SUBSTR
>> +test:do_execsql_test(
>> + "func-64",
>> + "SELECT SUBSTR(CHAR(65,00,66,67), 3, 2) LIKE CHAR(66, 67);",
>> + {1})
>> +
>> +test:do_execsql_test(
>> + "func-65",
>> + "SELECT SUBSTR(CHAR(00,00,00,65), 1, 4) LIKE CHAR(00,00,00,65);",
>> + {1})
>> +
>>
> Just wondering: why do you use LIKE function almost in all tests?
>
To compare actual result with expected one. But now I think It would be more readable like this:
diff --git a/test/sql-tap/func.test.lua b/test/sql-tap/func.test.lua
index 8c712bd5e..7d1611fc2 100755
--- a/test/sql-tap/func.test.lua
+++ b/test/sql-tap/func.test.lua
@@ -2795,61 +2795,60 @@ test:do_execsql_test(
-- REPLACE
test:do_execsql_test(
"func-62",
- "SELECT REPLACE(CHAR(00,65,00,65), CHAR(00), CHAR(65)) LIKE 'AAAA';",
- {1})
+ "SELECT REPLACE(CHAR(00,65,00,65), CHAR(00), CHAR(65));",
+ {'AAAA'})
test:do_execsql_test(
"func-63",
- "SELECT REPLACE(CHAR(00,65,00,65), CHAR(65), CHAR(00)) \
- LIKE CHAR(00,00,00,00);",
- {1})
+ "SELECT REPLACE(CHAR(00,65,00,65), CHAR(65), CHAR(00));",
+ {string.char(00,00,00,00)})
-- SUBSTR
test:do_execsql_test(
"func-64",
- "SELECT SUBSTR(CHAR(65,00,66,67), 3, 2) LIKE CHAR(66, 67);",
- {1})
+ "SELECT SUBSTR(CHAR(65,00,66,67), 3, 2);",
+ {string.char(66,67)})
test:do_execsql_test(
"func-65",
- "SELECT SUBSTR(CHAR(00,00,00,65), 1, 4) LIKE CHAR(00,00,00,65);",
- {1})
+ "SELECT SUBSTR(CHAR(00,00,00,65), 1, 4);",
+ {string.char(00,00,00,65)})
-- UPPER
test:do_execsql_test(
"func-66",
- "SELECT UPPER(CHAR(00,97,00,98,00)) LIKE CHAR(00,65,00,66,00);",
- {1})
+ "SELECT UPPER(CHAR(00,97,00,98,00));",
+ {string.char(00,65,00,66,00)})
-- LOWER
test:do_execsql_test(
"func-67",
- "SELECT LOWER(CHAR(00,65,00,66,00)) LIKE CHAR(00,97,00,98,00);",
- {1})
+ "SELECT LOWER(CHAR(00,65,00,66,00));",
+ {string.char(00,97,00,98,00)})
-- HEX
test:do_execsql_test(
"func-68",
- "SELECT HEX(CHAR(00,65,00,65,00)) LIKE ('0041004100');",
- {1})
+ "SELECT HEX(CHAR(00,65,00,65,00));",
+ {'0041004100'})
-- TRIM
test:do_execsql_test(
"func-69",
- "SELECT TRIM(CHAR(32,00,32,00,32)) LIKE (CHAR(00,32,00));",
- {1})
+ "SELECT TRIM(CHAR(32,00,32,00,32));",
+ {string.char(00,32,00)})
-- LTRIM
test:do_execsql_test(
"func-70",
- "SELECT LTRIM(CHAR(32,00,32,00,32)) LIKE (CHAR(00,32,00,32));",
- {1})
+ "SELECT LTRIM(CHAR(32,00,32,00,32));",
+ {string.char(00,32,00,32)})
-- RTRIM
test:do_execsql_test(
"func-71",
- "SELECT RTRIM(CHAR(32,00,32,00,32)) LIKE (CHAR(32,00,32,00));",
- {1})
+ "SELECT RTRIM(CHAR(32,00,32,00,32));",
+ {string.char(32,00,32,00)})
-- GROUP_CONCAT
test:do_execsql_test(
More information about the Tarantool-patches
mailing list