[tarantool-patches] Re: [PATCH] sql: LIKE & GLOB pattern comparison issue

Tue Jul 31 16:27:21 MSK 2018

Hello, thank you for review!
Diff in the end.


пт, 27 июл. 2018 г. в 23:22, Alexander Turenko <
alexander.turenko at tarantool.org>:

> Minor comments are below. Looks good for me in general.
>
> Please, review the patch with Lesha after the fixes.
>
> WBR, Alexander Turenko.
>
> > > > With the patch applied an error will be returned in case there's
> > > > invalid UTF-8 symbol in pattern & pattern will not be matched
> > > > with the string that contains it. Some minor corrections to function
> > > > were made as well.
> > > >
> > >
> > > Nitpicking: 'error will be returned' is third situatuon, other then
> > > 'matched' and 'not matched'.
> > >
> > >
> > I guess commit message is about changes. Do I really need to
> > mention valid cases that didn't change?
> >
>
> Ok, now I got that 'it' is 'invalid symbol'. Misunderstood it before as
> description of the one case when a pattern has invalid UTF-8 symbols.
> The sentence is correct. Maybe it can be clarified like so: '& correct
> UTF-8 pattern will not be matched with a string with invalid UTF-8
> symbols'. Or in another way you like.
>
> 'Some minor corrections to function were made as well' -- describe
> certain behaviour or code changes (like 'fixed infinite loop during
> pattern matching and incorrect matching as well') or remove this
> abstract sentence.
>
>
Fixed.


> > > The function itself looks good except lines longer then 80 columns.
> > >
> >
> > As i said before I don't think breaking these lines will increase
> > readability.
> >
>
> There are two little length exceed place in the code, it needs some
> refactoring to decrease code blocks nesting level. I think we can lease
> it as is for now (if other reviewer don't ask for that).
>
> But comments within the function are worth to fix to fit the limit.
>
>
Fixed.


> > +test:do_catchsql_set_test(like_test_cases, prefix)
> > +
> > +-- Invalid testcases.
> > +
> > +for i, tested_string in ipairs(invalid_testcases) do
> > +
> > +-- We should raise an error if pattern contains invalid characters.
>
> Again, indent comments to the same level as code in the block.
>
> If a comment is related to several code blocks (with empty lines btw),
> then separate it with the empty line from the code (to don't
> misinterpret it as related to the first block).
>
>
Fixed


> > + local test_name = prefix .. "2." .. tostring(i)
> > + local test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "';"
> > + test:do_catchsql_test(
> > + test_name,
> > + test_itself, {
> > + -- <test_name>
> > + 1, "LIKE or GLOB pattern can only contain UTF-8 characters"
> > + -- <test_name>
> > + })
> > +
>
> Are there reason to format it like so? Maybe just:
>
> test:do_catchsql_test(test_name, test_itself,
>     {1, "LIKE or GLOB pattern can only contain UTF-8 characters"})
>

Fixed.

diff --git a/src/box/sql/func.c b/src/box/sql/func.c
index 2f989d4..7f93ef6 100644
--- a/src/box/sql/func.c
+++ b/src/box/sql/func.c
@@ -703,11 +703,16 @@ sql_utf8_pattern_compare(const char * pattern,
  const struct compareInfo *pInfo,
  UChar32 matchOther)
 {
- UChar32 c, c2; /* Next pattern and input string chars */
- UChar32 matchOne = pInfo->matchOne; /* "?" or "_" */
- UChar32 matchAll = pInfo->matchAll; /* "*" or "%" */
- UChar32 noCase = pInfo->noCase; /* True if uppercase==lowercase */
- const char *zEscaped = 0; /* One past the last escaped input char */
+ /* Next pattern and input string chars */
+ UChar32 c, c2;
+ /* "?" or "_" */
+ UChar32 matchOne = pInfo->matchOne;
+ /* "*" or "%" */
+ UChar32 matchAll = pInfo->matchAll;
+ /* True if uppercase==lowercase */
+ UChar32 noCase = pInfo->noCase;
+ /* One past the last escaped input char */
+ const char *zEscaped = 0;
  const char * pattern_end = pattern + strlen(pattern);
  const char * string_end = string + strlen(string);
  UErrorCode status = U_ZERO_ERROR;
@@ -716,9 +721,11 @@ sql_utf8_pattern_compare(const char * pattern,
  if (c == SQL_INVALID_UTF8_SYMBOL)
  return SQL_PROHIBITED_PATTERN;
  if (c == matchAll) { /* Match "*" */
- /* Skip over multiple "*" characters in the pattern.  If there
- * are also "?" characters, skip those as well, but consume a
- * single character of the input string for each "?" skipped
+ /* Skip over multiple "*" characters in
+ * the pattern. If there are also "?"
+ * characters, skip those as well, but
+ * consume a single character of the
+ * input string for each "?" skipped.
  */
  while ((c = Utf8Read(pattern, pattern_end)) !=
         SQL_END_OF_STRING) {
@@ -733,7 +740,9 @@ sql_utf8_pattern_compare(const char * pattern,
  if (c2 == SQL_INVALID_UTF8_SYMBOL)
  return SQLITE_NOMATCH;
  }
- /* "*" at the end of the pattern matches */
+ /*
+ * "*" at the end of the pattern matches.
+ */
  if (c == SQL_END_OF_STRING) {
  while ((c2 = Utf8Read(string, string_end)) !=
         SQL_END_OF_STRING)
@@ -749,10 +758,14 @@ sql_utf8_pattern_compare(const char * pattern,
  if (c == SQL_END_OF_STRING)
  return SQLITE_NOWILDCARDMATCH;
  } else {
- /* "[...]" immediately follows the "*".  We have to do a slow
- * recursive search in this case, but it is an unusual case.
+ /* "[...]" immediately
+ * follows the "*". We
+ * have to do a slow
+ * recursive search in
+ * this case, but it is
+ * an unusual case.
  */
- assert(matchOther < 0x80); /* '[' is a single-byte character */
+ assert(matchOther < 0x80);
  while (string < string_end) {
  int bMatch =
      sql_utf8_pattern_compare(
@@ -770,14 +783,17 @@ sql_utf8_pattern_compare(const char * pattern,
  }
  }

- /* At this point variable c contains the first character of the
- * pattern string past the "*".  Search in the input string for the
- * first matching character and recursively continue the match from
- * that point.
+ /* At this point variable c contains the
+ * first character of the pattern string
+ * past the "*". Search in the input
+ * string for the first matching
+ * character and recursively continue the
+ * match from that point.
  *
- * For a case-insensitive search, set variable cx to be the same as
- * c but in the other case and search the input string for either
- * c or cx.
+ * For a case-insensitive search, set
+ * variable cx to be the same as c but in
+ * the other case and search the input
+ * string for either c or cx.
  */

  int bMatch;
@@ -785,12 +801,14 @@ sql_utf8_pattern_compare(const char * pattern,
  c = u_tolower(c);
  while (string < string_end){
  /**
- * This loop could have been implemented
- * without if converting c2 to lower case
- * (by holding c_upper and c_lower), however
- * it is implemented this way because lower
- * works better with German and Turkish
- * languages.
+ * This loop could have been
+ * implemented without if
+ * converting c2 to lower case
+ * by holding c_upper and
+ * c_lower,however it is
+ * implemented this way because
+ * lower works better with German
+ * and Turkish languages.
  */
  c2 = Utf8Read(string, string_end);
  if (c2 == SQL_INVALID_UTF8_SYMBOL)
@@ -878,11 +896,12 @@ sql_utf8_pattern_compare(const char * pattern,
  continue;
  if (noCase){
  /**
- * Small optimisation. Reduce number of calls
- * to u_tolower function.
- * SQL standards suggest use to_upper for symbol
- * normalisation. However, using to_lower allows to
- * respect Turkish 'İ' in default locale.
+ * Small optimisation. Reduce number of
+ * calls to u_tolower function. SQL
+ * standards suggest use to_upper for
+ * symbol normalisation. However, using
+ * to_lower allows to respect Turkish 'İ'
+ * in default locale.
  */
  if (u_tolower(c) == c2 ||
      c == u_tolower(c2))
diff --git a/test/sql-tap/gh-3251-string-pattern-comparison.test.lua
b/test/sql-tap/gh-3251-string-pattern-comparison.test.lua
index a823bc6..a8e4a12 100755
--- a/test/sql-tap/gh-3251-string-pattern-comparison.test.lua
+++ b/test/sql-tap/gh-3251-string-pattern-comparison.test.lua
@@ -6,276 +6,208 @@ local prefix = "like-test-"

 -- Unicode byte sequences.
 local valid_testcases = {
- '\x01',
- '\x09',
- '\x1F',
- '\x7F',
- '\xC2\x80',
- '\xC2\x90',
- '\xC2\x9F',
- '\xE2\x80\xA8',
- '\x20\x0B',
- '\xE2\x80\xA9',
+    '\x01',
+    '\x09',
+    '\x1F',
+    '\x7F',
+    '\xC2\x80',
+    '\xC2\x90',
+    '\xC2\x9F',
+    '\xE2\x80\xA8',
+    '\x20\x0B',
+    '\xE2\x80\xA9',
 }

 -- Non-Unicode byte sequences.
 local invalid_testcases = {
- '\xE2\x80',
- '\xFE\xFF',
- '\xC2',
- '\xED\xB0\x80',
- '\xD0',
+    '\xE2\x80',
+    '\xFE\xFF',
+    '\xC2',
+    '\xED\xB0\x80',
+    '\xD0',
 }

 local like_test_cases =
 {
- {"1.1",
- "SELECT 'AB' LIKE '_B';",
- {0, {1}} },
- {"1.2",
- "SELECT 'CD' LIKE '_B';",
- {0, {0}} },
- {"1.3",
- "SELECT '' LIKE '_B';",
- {0, {0}} },
- {"1.4",
- "SELECT 'AB' LIKE '%B';",
- {0, {1}} },
- {"1.5",
- "SELECT 'CD' LIKE '%B';",
- {0, {0}} },
- {"1.6",
- "SELECT '' LIKE '%B';",
- {0, {0}} },
- {"1.7",
- "SELECT 'AB' LIKE 'A__';",
- {0, {0}} },
- {"1.8",
- "SELECT 'CD' LIKE 'A__';",
- {0, {0}} },
- {"1.9",
- "SELECT '' LIKE 'A__';",
- {0, {0}} },
- {"1.10",
- "SELECT 'AB' LIKE 'A_';",
- {0, {1}} },
- {"1.11",
- "SELECT 'CD' LIKE 'A_';",
- {0, {0}} },
- {"1.12",
- "SELECT '' LIKE 'A_';",
- {0, {0}} },
- {"1.13",
- "SELECT 'AB' LIKE 'A';",
- {0, {0}} },
- {"1.14",
- "SELECT 'CD' LIKE 'A';",
- {0, {0}} },
- {"1.15",
- "SELECT '' LIKE 'A';",
- {0, {0}} },
- {"1.16",
- "SELECT 'AB' LIKE '_';",
- {0, {0}} },
- {"1.17",
- "SELECT 'CD' LIKE '_';",
- {0, {0}} },
- {"1.18",
- "SELECT '' LIKE '_';",
- {0, {0}} },
- {"1.19",
- "SELECT 'AB' LIKE '__';",
- {0, {1}} },
- {"1.20",
- "SELECT 'CD' LIKE '__';",
- {0, {1}} },
- {"1.21",
- "SELECT '' LIKE '__';",
- {0, {0}} },
- {"1.22",
- "SELECT 'AB' LIKE '%A';",
- {0, {0}} },
- {"1.23",
- "SELECT 'AB' LIKE '%C';",
- {0, {0}} },
- {"1.24",
- "SELECT 'ab' LIKE '%df';",
- {0, {0}} },
- {"1.25",
- "SELECT 'abCDF' LIKE '%df';",
- {0, {1}} },
- {"1.26",
- "SELECT 'CDF' LIKE '%df';",
- {0, {1}} },
- {"1.27",
- "SELECT 'ab' LIKE 'a_';",
- {0, {1}} },
- {"1.28",
- "SELECT 'abCDF' LIKE 'a_';",
- {0, {0}} },
- {"1.29",
- "SELECT 'CDF' LIKE 'a_';",
- {0, {0}} },
- {"1.30",
- "SELECT 'ab' LIKE 'ab%';",
- {0, {1}} },
- {"1.31",
- "SELECT 'abCDF' LIKE 'ab%';",
- {0, {1}} },
- {"1.32",
- "SELECT 'CDF' LIKE 'ab%';",
- {0, {0}} },
- {"1.33",
- "SELECT 'ab' LIKE 'abC%';",
- {0, {0}} },
- {"1.34",
- "SELECT 'abCDF' LIKE 'abC%';",
- {0, {1}} },
- {"1.35",
- "SELECT 'CDF' LIKE 'abC%';",
- {0, {0}} },
- {"1.36",
- "SELECT 'ab' LIKE 'a_%';",
- {0, {1}} },
- {"1.37",
- "SELECT 'abCDF' LIKE 'a_%';",
- {0, {1}} },
- {"1.38",
- "SELECT 'CDF' LIKE 'a_%';",
- {0, {0}} },
+    {"1.1",
+        "SELECT 'AB' LIKE '_B';",
+        {0, {1}} },
+    {"1.2",
+        "SELECT 'CD' LIKE '_B';",
+        {0, {0}} },
+    {"1.3",
+        "SELECT '' LIKE '_B';",
+        {0, {0}} },
+    {"1.4",
+        "SELECT 'AB' LIKE '%B';",
+        {0, {1}} },
+    {"1.5",
+        "SELECT 'CD' LIKE '%B';",
+        {0, {0}} },
+    {"1.6",
+        "SELECT '' LIKE '%B';",
+        {0, {0}} },
+    {"1.7",
+        "SELECT 'AB' LIKE 'A__';",
+        {0, {0}} },
+    {"1.8",
+        "SELECT 'CD' LIKE 'A__';",
+        {0, {0}} },
+    {"1.9",
+        "SELECT '' LIKE 'A__';",
+        {0, {0}} },
+    {"1.10",
+        "SELECT 'AB' LIKE 'A_';",
+        {0, {1}} },
+    {"1.11",
+        "SELECT 'CD' LIKE 'A_';",
+        {0, {0}} },
+    {"1.12",
+        "SELECT '' LIKE 'A_';",
+        {0, {0}} },
+    {"1.13",
+        "SELECT 'AB' LIKE 'A';",
+        {0, {0}} },
+    {"1.14",
+        "SELECT 'CD' LIKE 'A';",
+        {0, {0}} },
+    {"1.15",
+        "SELECT '' LIKE 'A';",
+        {0, {0}} },
+    {"1.16",
+        "SELECT 'AB' LIKE '_';",
+        {0, {0}} },
+    {"1.17",
+        "SELECT 'CD' LIKE '_';",
+        {0, {0}} },
+    {"1.18",
+        "SELECT '' LIKE '_';",
+        {0, {0}} },
+    {"1.19",
+        "SELECT 'AB' LIKE '__';",
+        {0, {1}} },
+    {"1.20",
+        "SELECT 'CD' LIKE '__';",
+        {0, {1}} },
+    {"1.21",
+        "SELECT '' LIKE '__';",
+        {0, {0}} },
+    {"1.22",
+        "SELECT 'AB' LIKE '%A';",
+        {0, {0}} },
+    {"1.23",
+        "SELECT 'AB' LIKE '%C';",
+        {0, {0}} },
+    {"1.24",
+        "SELECT 'ab' LIKE '%df';",
+        {0, {0}} },
+    {"1.25",
+        "SELECT 'abCDF' LIKE '%df';",
+        {0, {1}} },
+    {"1.26",
+        "SELECT 'CDF' LIKE '%df';",
+        {0, {1}} },
+    {"1.27",
+        "SELECT 'ab' LIKE 'a_';",
+        {0, {1}} },
+    {"1.28",
+        "SELECT 'abCDF' LIKE 'a_';",
+        {0, {0}} },
+    {"1.29",
+        "SELECT 'CDF' LIKE 'a_';",
+        {0, {0}} },
+    {"1.30",
+        "SELECT 'ab' LIKE 'ab%';",
+        {0, {1}} },
+    {"1.31",
+        "SELECT 'abCDF' LIKE 'ab%';",
+        {0, {1}} },
+    {"1.32",
+        "SELECT 'CDF' LIKE 'ab%';",
+        {0, {0}} },
+    {"1.33",
+        "SELECT 'ab' LIKE 'abC%';",
+        {0, {0}} },
+    {"1.34",
+        "SELECT 'abCDF' LIKE 'abC%';",
+        {0, {1}} },
+    {"1.35",
+        "SELECT 'CDF' LIKE 'abC%';",
+        {0, {0}} },
+    {"1.36",
+        "SELECT 'ab' LIKE 'a_%';",
+        {0, {1}} },
+    {"1.37",
+        "SELECT 'abCDF' LIKE 'a_%';",
+        {0, {1}} },
+    {"1.38",
+        "SELECT 'CDF' LIKE 'a_%';",
+        {0, {0}} },
 }

 test:do_catchsql_set_test(like_test_cases, prefix)

 -- Invalid testcases.
-
 for i, tested_string in ipairs(invalid_testcases) do

--- We should raise an error if pattern contains invalid characters.
- local test_name = prefix .. "2." .. tostring(i)
- local test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "';"
- test:do_catchsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 1, "LIKE or GLOB pattern can only contain UTF-8 characters"
- -- <test_name>
- })
+    -- We should raise an error in case
+    -- pattern contains invalid characters.

- test_name = prefix .. "3." .. tostring(i)
- test_itself = "SELECT 'abc' LIKE 'abc" .. tested_string .. "';"
- test:do_catchsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 1, "LIKE or GLOB pattern can only contain UTF-8 characters"
- -- <test_name>
- })
+    local test_name = prefix .. "2." .. tostring(i)
+    local test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "';"
+    test:do_catchsql_test(test_name, test_itself,
+                          {1, "LIKE or GLOB pattern can only contain UTF-8
characters"})

- test_name = prefix .. "4." .. tostring(i)
- test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "c';"
- test:do_catchsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 1, "LIKE or GLOB pattern can only contain UTF-8 characters"
- -- <test_name>
- })
+    test_name = prefix .. "3." .. tostring(i)
+    test_itself = "SELECT 'abc' LIKE 'abc" .. tested_string .. "';"
+    test:do_catchsql_test(test_name, test_itself,
+                          {1, "LIKE or GLOB pattern can only contain UTF-8
characters"})

--- Just skipping if row value predicand contains invalid character.
+    test_name = prefix .. "4." .. tostring(i)
+    test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "c';"
+    test:do_catchsql_test(test_name, test_itself,
+                          {1, "LIKE or GLOB pattern can only contain UTF-8
characters"})

- test_name = prefix .. "5." .. tostring(i)
- test_itself = "SELECT 'ab" .. tested_string .. "' LIKE 'abc';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+    -- Just skipping if row value predicand contains invalid character.

- test_name = prefix .. "6." .. tostring(i)
- test_itself = "SELECT 'abc" .. tested_string .. "' LIKE 'abc';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+    test_name = prefix .. "5." .. tostring(i)
+    test_itself = "SELECT 'ab" .. tested_string .. "' LIKE 'abc';"
+    test:do_execsql_test(test_name, test_itself, {0})

- test_name = prefix .. "7." .. tostring(i)
- test_itself = "SELECT 'ab" .. tested_string .. "c' LIKE 'abc';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+    test_name = prefix .. "6." .. tostring(i)
+    test_itself = "SELECT 'abc" .. tested_string .. "' LIKE 'abc';"
+    test:do_execsql_test(test_name, test_itself, {0})
+
+    test_name = prefix .. "7." .. tostring(i)
+    test_itself = "SELECT 'ab" .. tested_string .. "c' LIKE 'abc';"
+    test:do_execsql_test(test_name, test_itself, {0})
 end

 -- Valid testcases.
-
 for i, tested_string in ipairs(valid_testcases) do
  test_name = prefix .. "8." .. tostring(i)
  local test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+ test:do_execsql_test(test_name, test_itself, {0})

  test_name = prefix .. "9." .. tostring(i)
  test_itself = "SELECT 'abc' LIKE 'abc" .. tested_string .. "';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+ test:do_execsql_test(test_name, test_itself, {0})

  test_name = prefix .. "10." .. tostring(i)
  test_itself = "SELECT 'abc' LIKE 'ab" .. tested_string .. "c';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+ test:do_execsql_test(test_name, test_itself, {0})
+
  test_name = prefix .. "11." .. tostring(i)
  test_itself = "SELECT 'ab" .. tested_string .. "' LIKE 'abc';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+ test:do_execsql_test(test_name, test_itself, {0})

  test_name = prefix .. "12." .. tostring(i)
  test_itself = "SELECT 'abc" .. tested_string .. "' LIKE 'abc';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+ test:do_execsql_test(test_name, test_itself, {0})

  test_name = prefix .. "13." .. tostring(i)
  test_itself = "SELECT 'ab" .. tested_string .. "c' LIKE 'abc';"
- test:do_execsql_test(
- test_name,
- test_itself, {
- -- <test_name>
- 0
- -- <test_name>
- })
+ test:do_execsql_test(test_name, test_itself, {0})
 end

 test:finish_test()
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.tarantool.org/pipermail/tarantool-patches/attachments/20180731/c2555776/attachment.html>