[tarantool-patches] Re: [PATCH] sql: LIKE & GLOB pattern comparison issue
Nikita Tatunov
hollow653 at gmail.com
Tue Jul 31 16:27:21 MSK 2018
Hello, thank you for review!
Diff in the end.
пт, 27 июл. 2018 г. в 23:22, Alexander Turenko <
alexander.turenko at tarantool.org>:
> Minor comments are below. Looks good for me in general.
>
> Please, review the patch with Lesha after the fixes.
>
> WBR, Alexander Turenko.
>
> > > > With the patch applied an error will be returned in case there's
> > > > invalid UTF-8 symbol in pattern & pattern will not be matched
> > > > with the string that contains it. Some minor corrections to function
> > > > were made as well.
> > > >
> > >
> > > Nitpicking: 'error will be returned' is third situatuon, other then
> > > 'matched' and 'not matched'.
> > >
> > >
> > I guess commit message is about changes. Do I really need to
> > mention valid cases that didn't change?
> >
>
> Ok, now I got that 'it' is 'invalid symbol'. Misunderstood it before as
> description of the one case when a pattern has invalid UTF-8 symbols.
> The sentence is correct. Maybe it can be clarified like so: '& correct
> UTF-8 pattern will not be matched with a string with invalid UTF-8
> symbols'. Or in another way you like.
>
> 'Some minor corrections to function were made as well' -- describe
> certain behaviour or code changes (like 'fixed infinite loop during
> pattern matching and incorrect matching as well') or remove this
> abstract sentence.
>
>
Fixed.
> > > The function itself looks good except lines longer then 80 columns.
> > >
> >
> > As i said before I don't think breaking these lines will increase
> > readability.
> >
>
> There are two little length exceed place in the code, it needs some
> refactoring to decrease code blocks nesting level. I think we can lease
> it as is for now (if other reviewer don't ask for that).
>
> But comments within the function are worth to fix to fit the limit.
>
>
Fixed.
> > +test:do_catchsql_set_test(like_test_cases, prefix)
> > +
> > +-- Invalid testcases.
> > +
> > +for i, tested_string in ipairs(invalid_testcases) do
> > +
> > +-- We should raise an error if pattern contains invalid characters.
>
> Again, indent comments to the same level as code in the block.
>
> If a comment is related to several code blocks (with empty lines btw),
> then separate it with the empty line from the code (to don't
> misinterpret it as related to the first block).
>
>
Fixed
> > + local test_name = prefix .. "2." .. tostring(i)
> > + local test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "';"
> > + test:do_catchsql_test(
> > + test_name,
> > + test_itself, {
> > + -- <test_name>
> > + 1, "LIKE or GLOB pattern can only contain UTF-8 characters"
> > + -- <test_name>
> > + })
> > +
>
> Are there reason to format it like so? Maybe just:
>
> test:do_catchsql_test(test_name, test_itself,
> {1, "LIKE or GLOB pattern can only contain UTF-8 characters"})
>
Fixed.
diff --git a/src/box/sql/func.c b/src/box/sql/func.c
index 2f989d4..7f93ef6 100644
--- a/src/box/sql/func.c
+++ b/src/box/sql/func.c
@@ -703,11 +703,16 @@ sql_utf8_pattern_compare(const char * pattern,
const struct compareInfo *pInfo,
UChar32 matchOther)
{
- UChar32 c, c2; /* Next pattern and input string chars */
- UChar32 matchOne = pInfo->matchOne; /* "?" or "_" */
- UChar32 matchAll = pInfo->matchAll; /* "*" or "%" */
- UChar32 noCase = pInfo->noCase; /* True if uppercase==lowercase */
- const char *zEscaped = 0; /* One past the last escaped input char */
+ /* Next pattern and input string chars */
+ UChar32 c, c2;
+ /* "?" or "_" */
+ UChar32 matchOne = pInfo->matchOne;
+ /* "*" or "%" */
+ UChar32 matchAll = pInfo->matchAll;
+ /* True if uppercase==lowercase */
+ UChar32 noCase = pInfo->noCase;
+ /* One past the last escaped input char */
+ const char *zEscaped = 0;
const char * pattern_end = pattern + strlen(pattern);
const char * string_end = string + strlen(string);
UErrorCode status = U_ZERO_ERROR;
@@ -716,9 +721,11 @@ sql_utf8_pattern_compare(const char * pattern,
if (c == SQL_INVALID_UTF8_SYMBOL)
return SQL_PROHIBITED_PATTERN;
if (c == matchAll) { /* Match "*" */
- /* Skip over multiple "*" characters in the pattern. If there
- * are also "?" characters, skip those as well, but consume a
- * single character of the input string for each "?" skipped
+ /* Skip over multiple "*" characters in
+ * the pattern. If there are also "?"
+ * characters, skip those as well, but
+ * consume a single character of the
+ * input string for each "?" skipped.
*/
while ((c = Utf8Read(pattern, pattern_end)) !=
SQL_END_OF_STRING) {
@@ -733,7 +740,9 @@ sql_utf8_pattern_compare(const char * pattern,
if (c2 == SQL_INVALID_UTF8_SYMBOL)
return SQLITE_NOMATCH;
}
- /* "*" at the end of the pattern matches */
+ /*
+ * "*" at the end of the pattern matches.
+ */
if (c == SQL_END_OF_STRING) {
while ((c2 = Utf8Read(string, string_end)) !=
SQL_END_OF_STRING)
@@ -749,10 +758,14 @@ sql_utf8_pattern_compare(const char * pattern,
if (c == SQL_END_OF_STRING)
return SQLITE_NOWILDCARDMATCH;
} else {
- /* "[...]" immediately follows the "*". We have to do a slow
- * recursive search in this case, but it is an unusual case.
+ /* "[...]" immediately
+ * follows the "*". We
+ * have to do a slow
+ * recursive search in
+ * this case, but it is
+ * an unusual case.
*/
- assert(matchOther < 0x80); /* '[' is a single-byte character */
+ assert(matchOther < 0x80);
while (string < string_end) {
int bMatch =
sql_utf8_pattern_compare(
@@ -770,14 +783,17 @@ sql_utf8_pattern_compare(const char * pattern,
}
}
- /* At this point variable c contains the first character of the
- * pattern string past the "*". Search in the input string for the
- * first matching character and recursively continue the match from
- * that point.
+ /* At this point variable c contains the
+ * first character of the pattern string
+ * past the "*". Search in the input
+ * string for the first matching
+ * character and recursively continue the
+ * match from that point.
*
- * For a case-insensitive search, set variable cx to be the same as
- * c but in the other case and search the input string for either
- * c or cx.
+ * For a case-insensitive search, set
+ * variable cx to be the same as c but in
+ * the other case and search the input
+ * string for either c or cx.
*/
int bMatch;
@@ -785,12 +801,14 @@ sql_utf8_pattern_compare(const char * pattern,
c = u_tolower(c);
while (string < string_end){
/**
- * This loop could have been implemented
- * without if converting c2 to lower case
- * (by holding c_upper and c_lower), however
- * it is implemented this way because lower
- * works better with German and Turkish
- * languages.
+ * This loop could have been
+ * implemented without if
+ * converting c2 to lower case
+ * by holding c_upper and
+ * c_lower,however it is
+ * implemented this way because
+ * lower works better with German
+ * and Turkish languages.
*/
c2 = Utf8Read(string, string_end);
if (c2 == SQL_INVALID_UTF8_SYMBOL)
@@ -878,11 +896,12 @@ sql_utf8_pattern_compare(const char * pattern,
continue;
if (noCase){
/**
- * Small optimisation. Reduce number of calls
- * to u_tolower function.
- * SQL standards suggest use to_upper for symbol
- * normalisation. However, using to_lower allows to
- * respect Turkish 'İ' in default locale.
+ * Small optimisation. Reduce number of
+ * calls to u_tolower function. SQL
+ * standards suggest use to_upper for
+ * symbol normalisation. However, using
+ * to_lower allows to respect Turkish 'İ'
+ * in default locale.
*/
if (u_tolower(c) == c2 ||
c == u_tolower(c2))
diff --git a/test/sql-tap/gh-3251-string-pattern-comparison.test.lua
b/test/sql-tap/gh-3251-string-pattern-comparison.test.lua
index a823bc6..a8e4a12 100755
--- a/test/sql-tap/gh-3251-string-pattern-comparison.test.lua
+++ b/test/sql-tap/gh-3251-string-pattern-comparison.test.lua
@@ -6,276 +6,208 @@ local prefix = "like-test-"
-- Unicode byte sequences.
local valid_testcases = {
- '\x01',
- '\x09',
- '\x1F',
- '\x7F',
- '\xC2\x80',
- '\xC2\x90',
- '\xC2\x9F',
- '\xE2\x80\xA8',
- '\x20\x0B',
- '\xE2\x80\xA9',
+ '\x01',
+ '\x09',
+ '\x1F',
+ '\x7F',
+ '\xC2\x80',
+ '\xC2\x90',
+ '\xC2\x9F',
+ '\xE2\x80\xA8',
+ '\x20\x0B',
+ '\xE2\x80\xA9',
}
-- Non-Unicode byte sequences.
local invalid_testcases = {
- '\xE2\x80',
- '\xFE\xFF',
- '\xC2',
- '\xED\xB0\x80',
- '\xD0',
+ '\xE2\x80',
+ '\xFE\xFF',
+ '\xC2',
+ '\xED\xB0\x80',
+ '\xD0',
}
local like_test_cases =
{
- {"1.1",
- "SELECT 'AB' LIKE '_B';",
- {0, {1}} },
- {"1.2",
- "SELECT 'CD' LIKE '_B';",
- {0, {0}} },
- {"1.3",
- "SELECT '' LIKE '_B';",
- {0, {0}} },
- {"1.4",
- "SELECT 'AB' LIKE '%B';",
- {0, {1}} },
- {"1.5",
- "SELECT 'CD' LIKE '%B';",
- {0, {0}} },
- {"1.6",
- "SELECT '' LIKE '%B';",
- {0, {0}} },
- {"1.7",
- "SELECT 'AB' LIKE 'A__';",
- {0, {0}} },
- {"1.8",
- "SELECT 'CD' LIKE 'A__';",
- {0, {0}} },
- {"1.9",
- "SELECT '' LIKE 'A__';",
- {0, {0}} },
- {"1.10",
- "SELECT 'AB' LIKE 'A_';",
- {0, {1}} },
- {"1.11",
- "SELECT 'CD' LIKE 'A_';",
- {0, {0}} },
- {"1.12",
- "SELECT '' LIKE 'A_';",
- {0, {0}} },
- {"1.13",
- "SELECT 'AB' LIKE 'A';",
- {0, {0}} },
- {"1.14",
- "SELECT 'CD' LIKE 'A';",
- {0, {0}} },
- {"1.15",
- "SELECT '' LIKE 'A';",
- {0, {0}} },
- {"1.16",
- "SELECT 'AB' LIKE '_';",
- {0, {0}} },
- {"1.17",
- "SELECT 'CD' LIKE '_';",
- {0, {0}} },
- {"1.18",
- "SELECT '' LIKE '_';",
- {0, {0}} },
- {"1.19",
- "SELECT 'AB' LIKE '__';",
- {0, {1}} },
- {"1.20",
- "SELECT 'CD' LIKE '__';",
- {0, {1}} },
- {"1.21",
- "SELECT '' LIKE '__';",
- {0, {0}} },
- {"1.22",
- "SELECT 'AB' LIKE '%A';",
- {0, {0}} },
- {"1.23",
- "SELECT 'AB' LIKE '%C';",
- {0, {0}} },
- {"1.24",
- "SELECT 'ab' LIKE '%df';",
- {0, {0}} },
- {"1.25",
- "SELECT 'abCDF' LIKE '%df';",
- {0, {1}} },
- {"1.26",
- "SELECT 'CDF' LIKE '%df';",
- {0, {1}} },
- {"1.27",
- "SELECT 'ab' LIKE 'a_';",
- {0, {1}} },
- {"1.28",
- "SELECT 'abCDF' LIKE 'a_';",
- {0, {0}} },
- {"1.29",
- "SELECT 'CDF' LIKE 'a_';",
- {0, {0}} },
- {"1.30",
- "SELECT 'ab' LIKE 'ab%';",
- {0, {1}} },
- {"1.31",
- "SELECT 'abCDF' LIKE 'ab%';",
- {0, {1}} },
- {"1.32",
- "SELECT 'CDF' LIKE 'ab%';",
- {0, {0}} },
- {"1.33",
- "SELECT 'ab' LIKE 'abC%';",
- {0, {0}} },
- {"1.34",
- "SELECT 'abCDF' LIKE 'abC%';",
- {0, {1}} },
- {"1.35",
- "SELECT 'CDF' LIKE 'abC%';",
- {0, {0}} },
- {"1.36",
- "SELECT 'ab' LIKE 'a_%';",
- {0, {1}} },
- {"1.37",
- "SELECT 'abCDF' LIKE 'a_%';",
- {0, {1}} },
- {"1.38",
- "SELECT 'CDF' LIKE 'a_%';",
- {0, {0}} },
+ {"1.1",
+ "SELECT 'AB' LIKE '_B';",
+ {0, {1}} },
+ {"1.2",
+ "SELECT 'CD' LIKE '_B';",
+ {0, {0}} },
+ {"1.3",
+ "SELECT '' LIKE '_B';",
+ {0, {0}} },
+ {"1.4",
+ "SELECT 'AB' LIKE '%B';",
+ {0, {1}} },
+ {"1.5",
+ "SELECT 'CD' LIKE '%B';",
+ {0, {0}} },
+ {"1.6",
+ "SELECT '' LIKE '%B';",
+ {0, {0}} },
+ {"1.7",
+ "SELECT 'AB' LIKE 'A__';",
+ {0, {0}} },
+ {"1.8",
+ "SELECT 'CD' LIKE 'A__';",
+ {0, {0}} },
+ {"1.9",
+ "SELECT '' LIKE 'A__';",
+ {0, {0}} },
+ {"1.10",
+ "SELECT 'AB' LIKE 'A_';",
+ {0, {1}} },
+ {"1.11",
+ "SELECT 'CD' LIKE 'A_';",
+ {0, {0}} },
+ {"1.12",
+ "SELECT '' LIKE 'A_';",
+ {0, {0}} },
+ {"1.13",
+ "SELECT 'AB' LIKE 'A';",
+ {0, {0}} },
+ {"1.14",
+ "SELECT 'CD' LIKE 'A';",
+ {0, {0}} },
+ {"1.15",
+ "SELECT '' LIKE 'A';",
+ {0, {0}} },
+ {"1.16",
+ "SELECT 'AB' LIKE '_';",
+ {0, {0}} },
+ {"1.17",
+ "SELECT 'CD' LIKE '_';",
+ {0, {0}} },
+ {"1.18",
+ "SELECT '' LIKE '_';",
+ {0, {0}} },
+ {"1.19",
+ "SELECT 'AB' LIKE '__';",
+ {0, {1}} },
+ {"1.20",
+ "SELECT 'CD' LIKE '__';",
+ {0, {1}} },
+ {"1.21",
+ "SELECT '' LIKE '__';",
+ {0, {0}} },
+ {"1.22",
+ "SELECT 'AB' LIKE '%A';",
+ {0, {0}} },
+ {"1.23",
+ "SELECT 'AB' LIKE '%C';",
+ {0, {0}} },
+ {"1.24",
+ "SELECT 'ab' LIKE '%df';",
+ {0, {0}} },
+ {"1.25",
+ "SELECT 'abCDF' LIKE '%df';",
+ {0, {1}} },
+ {"1.26",
+ "SELECT 'CDF' LIKE '%df';",
+ {0, {1}} },
+ {"1.27",
+ "SELECT 'ab' LIKE 'a_';",
+ {0, {1}} },
+ {"1.28",
+ "SELECT 'abCDF' LIKE 'a_';",
+ {0, {0}} },
+ {"1.29",
+ "SELECT 'CDF' LIKE 'a_';",
+ {0, {0}} },
+ {"1.30",
+ "SELECT 'ab' LIKE 'ab%';",
+ {0, {1}} },
+ {"1.31",
+ "SELECT 'abCDF' LIKE 'ab%';",
+ {0, {1}} },
+ {"1.32",
+ "SELECT 'CDF' LIKE 'ab%';",
+ {0, {0}} },
+ {"1.33",
+ "SELECT 'ab' LIKE 'abC%';",
+ {0, {0}} },
+ {"1.34",
+ "SELECT 'abCDF' LIKE 'abC%';",
+ {0, {1}} },
+ {"1.35",
+ "SELECT 'CDF' LIKE 'abC%';",
+ {0, {0}} },
+ {"1.36",
+ "SELECT 'ab' LIKE 'a_%';",
+ {0, {1}} },
+ {"1.37",
+ "SELECT 'abCDF' LIKE 'a_%';",
+ {0, {1}} },
+ {"1.38",
+ "SELECT 'CDF' LIKE 'a_%';",
+ {0, {0}} },
}
test:do_catchsql_set_test(like_test_cases, prefix)
-- Invalid testcases.
-
for i, tested_string in ipairs(invalid_testcases) do
--- We should raise an error if pattern contains invalid characters.
- local test_name = prefix .. "2." .. tostring(i)
- local test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "';"
- test:do_catchsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 1, "LIKE or GLOB pattern can only contain UTF-8 characters"
- -- <test_name>
- })
+ -- We should raise an error in case
+ -- pattern contains invalid characters.
- test_name = prefix .. "3." .. tostring(i)
- test_itself = "SELECT 'abc' LIKE 'abc" .. tested_string .. "';"
- test:do_catchsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 1, "LIKE or GLOB pattern can only contain UTF-8 characters"
- -- <test_name>
- })
+ local test_name = prefix .. "2." .. tostring(i)
+ local test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "';"
+ test:do_catchsql_test(test_name, test_itself,
+ {1, "LIKE or GLOB pattern can only contain UTF-8
characters"})
- test_name = prefix .. "4." .. tostring(i)
- test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "c';"
- test:do_catchsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 1, "LIKE or GLOB pattern can only contain UTF-8 characters"
- -- <test_name>
- })
+ test_name = prefix .. "3." .. tostring(i)
+ test_itself = "SELECT 'abc' LIKE 'abc" .. tested_string .. "';"
+ test:do_catchsql_test(test_name, test_itself,
+ {1, "LIKE or GLOB pattern can only contain UTF-8
characters"})
--- Just skipping if row value predicand contains invalid character.
+ test_name = prefix .. "4." .. tostring(i)
+ test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "c';"
+ test:do_catchsql_test(test_name, test_itself,
+ {1, "LIKE or GLOB pattern can only contain UTF-8
characters"})
- test_name = prefix .. "5." .. tostring(i)
- test_itself = "SELECT 'ab" .. tested_string .. "' LIKE 'abc';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+ -- Just skipping if row value predicand contains invalid character.
- test_name = prefix .. "6." .. tostring(i)
- test_itself = "SELECT 'abc" .. tested_string .. "' LIKE 'abc';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+ test_name = prefix .. "5." .. tostring(i)
+ test_itself = "SELECT 'ab" .. tested_string .. "' LIKE 'abc';"
+ test:do_execsql_test(test_name, test_itself, {0})
- test_name = prefix .. "7." .. tostring(i)
- test_itself = "SELECT 'ab" .. tested_string .. "c' LIKE 'abc';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+ test_name = prefix .. "6." .. tostring(i)
+ test_itself = "SELECT 'abc" .. tested_string .. "' LIKE 'abc';"
+ test:do_execsql_test(test_name, test_itself, {0})
+
+ test_name = prefix .. "7." .. tostring(i)
+ test_itself = "SELECT 'ab" .. tested_string .. "c' LIKE 'abc';"
+ test:do_execsql_test(test_name, test_itself, {0})
end
-- Valid testcases.
-
for i, tested_string in ipairs(valid_testcases) do
test_name = prefix .. "8." .. tostring(i)
local test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+ test:do_execsql_test(test_name, test_itself, {0})
test_name = prefix .. "9." .. tostring(i)
test_itself = "SELECT 'abc' LIKE 'abc" .. tested_string .. "';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+ test:do_execsql_test(test_name, test_itself, {0})
test_name = prefix .. "10." .. tostring(i)
test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "c';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+ test:do_execsql_test(test_name, test_itself, {0})
+
test_name = prefix .. "11." .. tostring(i)
test_itself = "SELECT 'ab" .. tested_string .. "' LIKE 'abc';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+ test:do_execsql_test(test_name, test_itself, {0})
test_name = prefix .. "12." .. tostring(i)
test_itself = "SELECT 'abc" .. tested_string .. "' LIKE 'abc';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+ test:do_execsql_test(test_name, test_itself, {0})
test_name = prefix .. "13." .. tostring(i)
test_itself = "SELECT 'ab" .. tested_string .. "c' LIKE 'abc';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+ test:do_execsql_test(test_name, test_itself, {0})
end
test:finish_test()
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.tarantool.org/pipermail/tarantool-patches/attachments/20180731/c2555776/attachment.html>
More information about the Tarantool-patches
mailing list