Hello, Alex!

Thanks for the review!

чт, 19 июл. 2018 г. в 14:56, Alex Khatskevich <avkhatskevich@tarantool.org>:

Hi.

I have some questions related to the func.c file, however before that I would ask you to fix tests.

General ideas:
1. Those tests are regresson tests (it just tests that problem will not appear in the future).
We name those tests in the following manear: gh-XXXX-short-description.test.lua

Done.

2. The thing you test is not related to a table and other columns.
Please, convert the tests to the next format: {[[select '' like '_B';]], {1}]]}.
To make it more readable, you can do it like `like_testcases` in `sql-tap/collation.test.lua`.

Done.

3. There is two extra things that should be tested:
1. When string or pattern ends with incorrect unicode symbol (e.g. half of the whole unicode symbol)
2. String or pattern contains incorrect unicode symbol.

Refactored test with this idea taken into account.

Now comparing function is only supposed to work with TEXT types,

which led to the part of #3572 propositions.

Also added error output in case pattern contains invalid symbol &

now if string contains invalid symbol it can't be matched with whatever

pattern. Some minor fixes to patternCompare function as well.

Here's diff:

diff --git a/src/box/sql/func.c b/src/box/sql/func.c

index c06e3bd..5b53076 100644

--- a/src/box/sql/func.c

+++ b/src/box/sql/func.c

@@ -617,13 +617,17 @@ struct compareInfo {

u8 noCase; /* true to ignore case differences */

};

-/*

- * For LIKE and GLOB matching on EBCDIC machines, assume that every

- * character is exactly one byte in size. Also, provde the Utf8Read()

- * macro for fast reading of the next character in the common case where

- * the next character is ASCII.

+/**

+ * Providing there are symbols in string s this

+ * macro returns UTF-8 code of character and

+ * promotes pointer to the next symbol in the string.

+ * Otherwise return code is SQL_END_OF_STRING.

-#define Utf8Read(s, e) ucnv_getNextUChar(pUtf8conv, &s, e, &status)

+#define Utf8Read(s, e) (((s) < (e)) ?\

+ ucnv_getNextUChar(pUtf8conv, &(s), (e), &(status)) : 0)

+#define SQL_END_OF_STRING 0

+#define SQL_INVALID_UTF8_SYMBOL 0xfffd

static const struct compareInfo globInfo = { '*', '?', '[', 0 };

@@ -643,51 +647,61 @@ static const struct compareInfo likeInfoAlt = { '%', '_', 0, 0 };

#define SQLITE_MATCH 0

#define SQLITE_NOMATCH 1

#define SQLITE_NOWILDCARDMATCH 2

+#define SQL_PROHIBITED_PATTERN 3

-/*

- * Compare two UTF-8 strings for equality where the first string is

- * a GLOB or LIKE expression. Return values:

- *

- * SQLITE_MATCH: Match

- * SQLITE_NOMATCH: No match

- * SQLITE_NOWILDCARDMATCH: No match in spite of having * or % wildcards.

+/**

+ * Compare two UTF-8 strings for equality where the first string

+ * is a GLOB or LIKE expression.

* Globbing rules:

- * '*' Matches any sequence of zero or more characters.

+ * '*' Matches any sequence of zero or more characters.

- * '?' Matches exactly one character.

+ * '?' Matches exactly one character.

- * [...] Matches one character from the enclosed list of

- * characters.

+ * [...] Matches one character from the enclosed list of

+ * characters.

- * [^...] Matches one character not in the enclosed list.

+ * [^...] Matches one character not in the enclosed list.

- * With the [...] and [^...] matching, a ']' character can be included

- * in the list by making it the first character after '[' or '^'. A

- * range of characters can be specified using '-'. Example:

- * "[a-z]" matches any single lower-case letter. To match a '-', make

- * it the last character in the list.

+ * With the [...] and [^...] matching, a ']' character can be

+ * included in the list by making it the first character after

+ * '[' or '^'. A range of characters can be specified using '-'.

+ * Example: "[a-z]" matches any single lower-case letter.

+ * To match a '-', make it the last character in the list.

* Like matching rules:

- * '%' Matches any sequence of zero or more characters

+ * '%' Matches any sequence of zero or more characters.

- ** '_' Matches any one character

+ ** '_' Matches any one character.

* Ec Where E is the "esc" character and c is any other

- * character, including '%', '_', and esc, match exactly c.

+ * character, including '%', '_', and esc, match

+ * exactly c.

* The comments within this routine usually assume glob matching.

- * This routine is usually quick, but can be N**2 in the worst case.

+ * This routine is usually quick, but can be N**2 in the worst

+ * case.

+ *

+ * @param pattern String containing comparison pattern.

+ * @param string String being compared.

+ * @param compareInfo Information about how to compare.

+ * @param matchOther The escape char (LIKE) or '[' (GLOB).

+ *

+ * @retval SQLITE_MATCH: Match.

+ * SQLITE_NOMATCH: No match.

+ * SQLITE_NOWILDCARDMATCH: No match in spite of having *

+ * or % wildcards.

+ * SQL_PROHIBITED_PATTERN Pattern contains invalid

+ * symbol.

static int

-patternCompare(const char * pattern, /* The glob pattern */

- const char * string, /* The string to compare against the glob */

- const struct compareInfo *pInfo, /* Information about how to do the compare */

- UChar32 matchOther /* The escape char (LIKE) or '[' (GLOB) */

- )

+sql_utf8_pattern_compare(const char * pattern,

+ const char * string,

+ const struct compareInfo *pInfo,

+ UChar32 matchOther)

{

UChar32 c, c2; /* Next pattern and input string chars */

UChar32 matchOne = pInfo->matchOne; /* "?" or "_" */

@@ -698,29 +712,41 @@ patternCompare(const char * pattern, /* The glob pattern */

const char * string_end = string + strlen(string);

UErrorCode status = U_ZERO_ERROR;

- while (pattern < pattern_end){

- c = Utf8Read(pattern, pattern_end);

+ while ((c = Utf8Read(pattern, pattern_end)) != SQL_END_OF_STRING) {

+ if (c == SQL_INVALID_UTF8_SYMBOL)

+ return SQL_PROHIBITED_PATTERN;

if (c == matchAll) { /* Match "*" */

/* Skip over multiple "*" characters in the pattern. If there

* are also "?" characters, skip those as well, but consume a

* single character of the input string for each "?" skipped

- while (pattern < pattern_end){

- c = Utf8Read(pattern, pattern_end);

+ while ((c = Utf8Read(pattern, pattern_end)) !=

+ SQL_END_OF_STRING) {

+ if (c == SQL_INVALID_UTF8_SYMBOL)

+ return SQL_PROHIBITED_PATTERN;

if (c != matchAll && c != matchOne)

break;

- if (c == matchOne

- && Utf8Read(string, string_end) == 0) {

+ if (c == matchOne &&

+ (c2 = Utf8Read(string, string_end)) ==

+ SQL_END_OF_STRING)

return SQLITE_NOWILDCARDMATCH;

- }

+ if (c2 == SQL_INVALID_UTF8_SYMBOL)

+ return SQLITE_NOMATCH;

}

/* "*" at the end of the pattern matches */

- if (pattern == pattern_end)

+ if (c == SQL_END_OF_STRING) {

+ while ((c2 = Utf8Read(string, string_end)) !=

+ SQL_END_OF_STRING)

+ if (c2 == SQL_INVALID_UTF8_SYMBOL)

+ return SQLITE_NOMATCH;

return SQLITE_MATCH;

+ }

if (c == matchOther) {

if (pInfo->matchSet == 0) {

c = Utf8Read(pattern, pattern_end);

- if (c == 0)

+ if (c == SQL_INVALID_UTF8_SYMBOL)

+ return SQL_PROHIBITED_PATTERN;

+ if (c == SQL_END_OF_STRING)

return SQLITE_NOWILDCARDMATCH;

} else {

/* "[...]" immediately follows the "*". We have to do a slow

@@ -729,13 +755,16 @@ patternCompare(const char * pattern, /* The glob pattern */

assert(matchOther < 0x80); /* '[' is a single-byte character */

while (string < string_end) {

int bMatch =

- patternCompare(&pattern[-1],

- string,

- pInfo,

- matchOther);

+ sql_utf8_pattern_compare(

+ &pattern[-1],

+ string,

+ pInfo,

+ matchOther);

if (bMatch != SQLITE_NOMATCH)

return bMatch;

- Utf8Read(string, string_end);

+ c = Utf8Read(string, string_end);

+ if (c == SQL_INVALID_UTF8_SYMBOL)

+ return SQLITE_NOMATCH;

}

return SQLITE_NOWILDCARDMATCH;

}

@@ -764,6 +793,8 @@ patternCompare(const char * pattern, /* The glob pattern */

* languages.

c2 = Utf8Read(string, string_end);

+ if (c2 == SQL_INVALID_UTF8_SYMBOL)

+ return SQLITE_NOMATCH;

if (!noCase) {

if (c2 != c)

continue;

@@ -771,9 +802,10 @@ patternCompare(const char * pattern, /* The glob pattern */

if (c2 != c && u_tolower(c2) != c)

continue;

}

- bMatch =

- patternCompare(pattern, string,

- pInfo, matchOther);

+ bMatch = sql_utf8_pattern_compare(pattern,

+ string,

+ pInfo,

+ matchOther);

if (bMatch != SQLITE_NOMATCH)

return bMatch;

}

@@ -782,7 +814,9 @@ patternCompare(const char * pattern, /* The glob pattern */

if (c == matchOther) {

if (pInfo->matchSet == 0) {

c = Utf8Read(pattern, pattern_end);

- if (c == 0)

+ if (c == SQL_INVALID_UTF8_SYMBOL)

+ return SQL_PROHIBITED_PATTERN;

+ if (c == SQL_END_OF_STRING)

return SQLITE_NOMATCH;

zEscaped = pattern;

} else {

@@ -790,23 +824,33 @@ patternCompare(const char * pattern, /* The glob pattern */

int seen = 0;

int invert = 0;

c = Utf8Read(string, string_end);

+ if (c == SQL_INVALID_UTF8_SYMBOL)

+ return SQLITE_NOMATCH;

if (string == string_end)

return SQLITE_NOMATCH;

c2 = Utf8Read(pattern, pattern_end);

+ if (c2 == SQL_INVALID_UTF8_SYMBOL)

+ return SQL_PROHIBITED_PATTERN;

if (c2 == '^') {

invert = 1;

c2 = Utf8Read(pattern, pattern_end);

+ if (c2 == SQL_INVALID_UTF8_SYMBOL)

+ return SQL_PROHIBITED_PATTERN;

}

if (c2 == ']') {

if (c == ']')

seen = 1;

c2 = Utf8Read(pattern, pattern_end);

+ if (c2 == SQL_INVALID_UTF8_SYMBOL)

+ return SQL_PROHIBITED_PATTERN;

}

- while (c2 && c2 != ']') {

+ while (c2 != SQL_END_OF_STRING && c2 != ']') {

if (c2 == '-' && pattern[0] != ']'

&& pattern < pattern_end

&& prior_c > 0) {

c2 = Utf8Read(pattern, pattern_end);

+ if (c2 == SQL_INVALID_UTF8_SYMBOL)

+ return SQL_PROHIBITED_PATTERN;

if (c >= prior_c && c <= c2)

seen = 1;

prior_c = 0;

@@ -817,14 +861,19 @@ patternCompare(const char * pattern, /* The glob pattern */

prior_c = c2;

}

c2 = Utf8Read(pattern, pattern_end);

+ if (c2 == SQL_INVALID_UTF8_SYMBOL)

+ return SQL_PROHIBITED_PATTERN;

}

- if (pattern == pattern_end || (seen ^ invert) == 0) {

+ if (pattern == pattern_end ||

+ (seen ^ invert) == 0) {

return SQLITE_NOMATCH;

}

continue;

}

c2 = Utf8Read(string, string_end);

+ if (c2 == SQL_INVALID_UTF8_SYMBOL)

+ return SQLITE_NOMATCH;

if (c == c2)

continue;

if (noCase){

@@ -839,7 +888,8 @@ patternCompare(const char * pattern, /* The glob pattern */

c == u_tolower(c2))

continue;

}

- if (c == matchOne && pattern != zEscaped && c2 != 0)

+ if (c == matchOne && pattern != zEscaped &&

+ c2 != SQL_END_OF_STRING)

continue;

return SQLITE_NOMATCH;

}

@@ -853,8 +903,7 @@ patternCompare(const char * pattern, /* The glob pattern */

int

sqlite3_strglob(const char *zGlobPattern, const char *zString)

{

- return patternCompare(zGlobPattern, zString, &globInfo,

- '[');

+ return sql_utf8_pattern_compare(zGlobPattern, zString, &globInfo, '[');

}

@@ -864,7 +913,7 @@ sqlite3_strglob(const char *zGlobPattern, const char *zString)

int

sqlite3_strlike(const char *zPattern, const char *zStr, unsigned int esc)

{

- return patternCompare(zPattern, zStr, &likeInfoNorm, esc);

+ return sql_utf8_pattern_compare(zPattern, zStr, &likeInfoNorm, esc);

}

@@ -910,8 +959,9 @@ likeFunc(sqlite3_context * context, int argc, sqlite3_value ** argv)

zB = (const char *) sqlite3_value_text(argv[0]);

zA = (const char *) sqlite3_value_text(argv[1]);

- /* Limit the length of the LIKE or GLOB pattern to avoid problems

- * of deep recursion and N*N behavior in patternCompare().

+ /* Limit the length of the LIKE or GLOB pattern to avoid

+ * problems of deep recursion and N*N behavior in

+ * sql_utf8_pattern_compare().

nPat = sqlite3_value_bytes(argv[0]);

testcase(nPat == db->aLimit[SQLITE_LIMIT_LIKE_PATTERN_LENGTH]);

@@ -947,7 +997,12 @@ likeFunc(sqlite3_context * context, int argc, sqlite3_value ** argv)

sqlite3_like_count++;

#endif

int res;

- res = patternCompare(zB, zA, pInfo, escape);

+ res = sql_utf8_pattern_compare(zB, zA, pInfo, escape);

+ if (res == SQL_PROHIBITED_PATTERN) {

+ sqlite3_result_error(context, "LIKE or GLOB pattern can only"

+ " contain UTF-8 characters", -1);

+ return;

+ }

sqlite3_result_int(context, res == SQLITE_MATCH);

}

diff --git a/test/sql-tap/e_expr.test.lua b/test/sql-tap/e_expr.test.lua

index 13d3a96..051210a 100755

--- a/test/sql-tap/e_expr.test.lua

+++ b/test/sql-tap/e_expr.test.lua

@@ -1,6 +1,6 @@

#!/usr/bin/env tarantool

test = require("sqltester")

-test:plan(12431)

+test:plan(10665)

--!./tcltestrunner.lua

-- 2010 July 16

@@ -77,8 +77,10 @@ local operations = {

{"<>", "ne1"},

{"!=", "ne2"},

{"IS", "is"},

- {"LIKE", "like"},

- {"GLOB", "glob"},

+-- NOTE: This test needs refactoring after deletion of GLOB &

+-- type restrictions for LIKE.

+-- {"LIKE", "like"},

+-- {"GLOB", "glob"},

{"AND", "and"},

{"OR", "or"},

{"MATCH", "match"},

@@ -96,7 +98,7 @@ operations = {

{"+", "-"},

{"<<", ">>", "&", "|"},

{"<", "<=", ">", ">="},

- {"=", "==", "!=", "<>", "LIKE", "GLOB"}, --"MATCH", "REGEXP"},

+ {"=", "==", "!=", "<>"}, --"LIKE", "GLOB"}, "MATCH", "REGEXP"},

{"AND"},

{"OR"},

}

@@ -475,6 +477,7 @@ for _, op in ipairs(oplist) do

end

---------------------------------------------------------------------------

-- Test the IS and IS NOT operators.

diff --git a/test/sql-tap/gh-3251-string-pattern-comparison.lua b/test/sql-tap/gh-3251-string-pattern-comparison.lua

new file mode 100755

index 0000000..0202efc

--- /dev/null

+++ b/test/sql-tap/gh-3251-string-pattern-comparison.lua

@@ -0,0 +1,238 @@

+#!/usr/bin/env tarantool

+test = require("sqltester")

+test:plan(106)

+local prefix = "like-test-"

+-- Unciode byte sequences.

+local valid_testcases = {

+ '\x01',

+ '\x09',

+ '\x1F',

+ '\x7F',

+ '\xC2\x80',

+ '\xC2\x90',

+ '\xC2\x9F',

+ '\xE2\x80\xA8',

+ '\x20\x0B',

+ '\xE2\x80\xA9',

+-- Non-Unicode byte sequences.

+local invalid_testcases = {

+ '\xE2\x80',

+ '\xFE\xFF',

+ '\xC2',

+ '\xED\xB0\x80',

+ '\xD0',

+local like_test_cases =

+ {"1.1",

+ [[

+ CREATE TABLE t2 (column1 INTEGER,

+ column2 VARCHAR(100),

+ column3 BLOB,

+ column4 FLOAT,

+ PRIMARY KEY (column1, column2));

+ INSERT INTO t2 VALUES (1, 'AB', X'4142', 5.5);

+ INSERT INTO t2 VALUES (1, 'CD', X'2020', 1E4);

+ INSERT INTO t2 VALUES (2, 'AB', X'2020', 12.34567);

+ INSERT INTO t2 VALUES (-1000, '', X'', 0.0);

+ CREATE TABLE t1 (a INT PRIMARY KEY, str VARCHAR(100));

+ INSERT INTO t1 VALUES (1, 'ab');

+ INSERT INTO t1 VALUES (2, 'abCDF');

+ INSERT INTO t1 VALUES (3, 'CDF');

+ CREATE TABLE t (s1 CHAR(2) PRIMARY KEY, s2 CHAR(2));

+ INSERT INTO t VALUES ('AB', 'AB');

+ ]], {0}},

+ {"1.2",

+ [[

+ SELECT column1, column2, column1 * column4 FROM

+ t2 WHERE column2 LIKE '_B';

+ ]],

+ {0, {1, 'AB', 5.5, 2, 'AB', 24.69134}} },

+ {"1.3",

+ "SELECT column1, column2 FROM t2 WHERE column2 LIKE '%B';",

+ {0, {1, 'AB', 2, 'AB'}} },

+ {"1.4",

+ "SELECT column1, column2 FROM t2 WHERE column2 LIKE 'A__';",

+ {0, {}} },

+ {"1.5",

+ "SELECT column1, column2 FROM t2 WHERE column2 LIKE 'A_';",

+ {0, {1, 'AB', 2, 'AB'}} },

+ {"1.6",

+ "SELECT column1, column2 FROM t2 WHERE column2 LIKE 'A';",

+ {0, {}} },

+ {"1.7",

+ "SELECT column1, column2 FROM t2 WHERE column2 LIKE '_';",

+ {0, {}} },

+ {"1.8",

+ "SELECT * FROM t WHERE s1 LIKE '%A';",

+ {0, {}} },

+ {"1.9",

+ "SELECT * FROM t WHERE s1 LIKE '%C';",

+ {0, {}} },

+ {"1.10",

+ "SELECT * FROM t1 WHERE str LIKE '%df';",

+ {0, {2, 'abCDF', 3, 'CDF'}} },

+ {"1.11",

+ "SELECT * FROM t1 WHERE str LIKE 'a_';",

+ {0, {1, 'ab'}} },

+ {"1.12",

+ "SELECT column1, column2 FROM t2 WHERE column2 LIKE '__';",

+ {0, {1, 'AB', 1, 'CD', 2, 'AB'}} },

+ {"1.13",

+ "SELECT str FROM t1 WHERE str LIKE 'ab%';",

+ {0, {'ab', 'abCDF'}} },

+ {"1.14",

+ "SELECT str FROM t1 WHERE str LIKE 'abC%';",

+ {0, {'abCDF'}} },

+ {"1.15",

+ "SELECT str FROM t1 WHERE str LIKE 'a_%';",

+ {0, {'ab', 'abCDF'}} },

+ {"1.16",

+ [[

+ DROP TABLE t1;

+ DROP TABLE t2;

+ DROP TABLE t;

+ ]], {0}},

+test:do_catchsql_set_test(like_test_cases, prefix)

+-- Invalid testcases.

+for i, tested_string in ipairs(invalid_testcases) do

+ local test_name = prefix .. "2." .. tostring(i)

+ local test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "';"

+-- We should raise an error if pattern contains invalid characters.

+ test:do_catchsql_test(

+ test_name,

+ test_itself, {

+ -- <test_name>

+ 1, "LIKE or GLOB pattern can only contain UTF-8 characters"

+ -- <test_name>

+ })

+ test_name = prefix .. "3." .. tostring(i)

+ test_itself = "SELECT 'abc' LIKE 'abc" .. tested_string .. "';"

+ test:do_catchsql_test(

+ test_name,

+ test_itself, {

+ -- <test_name>

+ 1, "LIKE or GLOB pattern can only contain UTF-8 characters"

+ -- <test_name>

+ })

+ test_name = prefix .. "4." .. tostring(i)

+ test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "c';"

+ test:do_catchsql_test(

+ test_name,

+ test_itself, {

+ -- <test_name>

+ 1, "LIKE or GLOB pattern can only contain UTF-8 characters"

+ -- <test_name>

+ })

+-- Just skipping if row value predicand contains invalid character.

+ test_name = prefix .. "5." .. tostring(i)

+ test_itself = "SELECT 'ab" .. tested_string .. "' LIKE 'abc';"

+ test:do_execsql_test(

+ test_name,

+ test_itself, {

+ -- <test_name>

+ 0

+ -- <test_name>

+ })

+ test_name = prefix .. "6." .. tostring(i)

+ test_itself = "SELECT 'abc" .. tested_string .. "' LIKE 'abc';"

+ test:do_execsql_test(

+ test_name,

+ test_itself, {

+ -- <test_name>

+ 0

+ -- <test_name>

+ })

+ test_name = prefix .. "7." .. tostring(i)

+ test_itself = "SELECT 'ab" .. tested_string .. "c' LIKE 'abc';"

+ test:do_execsql_test(

+ test_name,

+ test_itself, {

+ -- <test_name>

+ 0

+ -- <test_name>

+ })

+end

+-- Valid testcases.

+for i, tested_string in ipairs(valid_testcases) do

+ test_name = prefix .. "8." .. tostring(i)

+ local test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "';"

+ test:do_execsql_test(

+ test_name,

+ test_itself, {

+ -- <test_name>

+ 0

+ -- <test_name>

+ })

+ test_name = prefix .. "9." .. tostring(i)

+ test_itself = "SELECT 'abc' LIKE 'abc" .. tested_string .. "';"

+ test:do_execsql_test(

+ test_name,

+ test_itself, {

+ -- <test_name>

+ 0

+ -- <test_name>

+ })

+ test_name = prefix .. "10." .. tostring(i)

+ test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "c';"

+ test:do_execsql_test(

+ test_name,

+ test_itself, {

+ -- <test_name>

+ 0

+ -- <test_name>

+ })

+ test_name = prefix .. "11." .. tostring(i)

+ test_itself = "SELECT 'ab" .. tested_string .. "' LIKE 'abc';"

+ test:do_execsql_test(

+ test_name,

+ test_itself, {

+ -- <test_name>

+ 0

+ -- <test_name>

+ })

+ test_name = prefix .. "12." .. tostring(i)

+ test_itself = "SELECT 'abc" .. tested_string .. "' LIKE 'abc';"

+ test:do_execsql_test(

+ test_name,

+ test_itself, {

+ -- <test_name>

+ 0

+ -- <test_name>

+ })

+ test_name = prefix .. "13." .. tostring(i)

+ test_itself = "SELECT 'ab" .. tested_string .. "c' LIKE 'abc';"

+ test:do_execsql_test(

+ test_name,

+ test_itself, {

+ -- <test_name>

+ 0

+ -- <test_name>

+ })

+end

+test:finish_test()