From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id AABAC25251 for ; Wed, 24 Jul 2019 17:02:24 -0400 (EDT) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id aDIIowskjOQq for ; Wed, 24 Jul 2019 17:02:24 -0400 (EDT) Received: from smtp58.i.mail.ru (smtp58.i.mail.ru [217.69.128.38]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id 6607D2521F for ; Wed, 24 Jul 2019 17:02:23 -0400 (EDT) Content-Type: text/plain; charset=utf-8 Mime-Version: 1.0 (Mac OS X Mail 12.2 \(3445.102.3\)) Subject: [tarantool-patches] Re: [PATCH 2/2] sql: make LIKE predicate dependent on collation From: Roman Khabibov In-Reply-To: Date: Thu, 25 Jul 2019 00:02:20 +0300 Content-Transfer-Encoding: quoted-printable Message-Id: <9DFC7444-7C8D-4DB0-B059-948122587CE4@tarantool.org> References: <67ec7b5425d16078e45571c99ba9b58859b3c7b8.1563057282.git.roman.habibov@tarantool.org> Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-Help: List-Unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-Subscribe: List-Owner: List-post: List-Archive: To: tarantool-patches@freelists.org Cc: "n. pettik" Hi! Thanks for the review. > On Jul 17, 2019, at 19:19, n.pettik wrote: >=20 >=20 >> On 14 Jul 2019, at 01:51, Roman Khabibov = wrote: >>=20 >> According to ANSI, LIKE should match characters taking into >> account passed collation. >>=20 >> ISO/IEC JTC 1/SC 32 2011, Part 2: Foundation, page 445 >=20 > Nit: it=E2=80=99s likely to be meaningless to indicate exact page; > I=E2=80=99d better outline number of section. >=20 > Moreover, I guess this ticket deserves doc request > explaining user-visible changes. Done. > I=E2=80=99ve pushed code-style fixes at the top of your branch. >=20 >> 7 files changed, 187 insertions(+), 111 deletions(-) >>=20 >> diff --git a/src/box/sql/func.c b/src/box/sql/func.c >> index 4e4d14cf7..b660d05c5 100644 >> --- a/src/box/sql/func.c >> +++ b/src/box/sql/func.c >>=20 >> @@ -699,7 +723,7 @@ enum pattern_match_status { >> * @param string String being compared. >> * @param pattern_end Ptr to pattern last symbol. >> * @param string_end Ptr to string last symbol. >> - * @param is_like_ci true if LIKE is case insensitive. >> + * @param coll Pointer to collation. >> * @param match_other The escape char for LIKE. >> * >> * @retval One of pattern_match_status values. >> @@ -709,7 +733,7 @@ sql_utf8_pattern_compare(const char *pattern, >> const char *string, >> const char *pattern_end, >> const char *string_end, >> - const int is_like_ci, >> + struct coll *coll, >> UChar32 match_other) >> { >> /* Next pattern and input string chars. */ >> @@ -717,9 +741,13 @@ sql_utf8_pattern_compare(const char *pattern, >> /* One past the last escaped input char. */ >> const char *zEscaped =3D 0; >> UErrorCode status =3D U_ZERO_ERROR; >> + const char *pat_char_ptr; >> + const char *str_char_ptr; >> + int pat_char_len; >> + int str_char_len; >=20 > Declaring variables without initialisation is considered > to be bad practise. Also intend to use appropriate types - > in this case size_t. >=20 >> while (pattern < pattern_end) { >> - c =3D Utf8Read(pattern, pattern_end); >> + c =3D step_utf8_char(&pattern, pattern_end, &pat_char_ptr, = &pat_char_len); >=20 > Broken indentation. Thank you. >> if (c =3D=3D SQL_INVALID_UTF8_SYMBOL) >> return INVALID_PATTERN; >> if (c =3D=3D MATCH_ALL_WILDCARD) { >>=20 >> @@ -847,9 +867,10 @@ sql_utf8_pattern_compare(const char *pattern, >> int >> sql_strlike_cs(const char *zPattern, const char *zStr, unsigned int = esc) >> { >=20 > I guess you can remove both strike_cs and strlike_ci functions. Maybe in 2.3.0. r:tarantool r.khabibov$ grep -r strlike_c $HOME/tarantool/src /Users/r.khabibov/tarantool/src/box/sql/analyze.c: if = (sql_strlike_cs("unordered%", z, '[') =3D=3D 0) /Users/r.khabibov/tarantool/src/box/sql/analyze.c: else if = (sql_strlike_cs("noskipscan%", z, '[') =3D=3D 0) /Users/r.khabibov/tarantool/src/box/sql/vdbe.c: || = sql_strlike_ci("DELETE%", p->zSql, 0) !=3D 0 >=20 >> + struct coll_id *p =3D coll_by_name("unicode", = strlen("unicode")); >> return sql_utf8_pattern_compare(zPattern, zStr, >> zPattern + strlen(zPattern), >> - zStr + strlen(zStr), 0, esc); >> + zStr + strlen(zStr), p->coll, = esc); >> } >>=20 >> @@ -228,13 +228,11 @@ operatorMask(int op) >> * In order for the operator to be optimizible, the RHS must be a >> * string literal that does not begin with a wildcard. The LHS >> * must be a column that may only be NULL, a string, or a BLOB, >> - * never a number. The collating sequence for the column on the >> - * LHS must be appropriate for the operator. >> + * never a number. >> * >> * @param pParse Parsing and code generating context. >> * @param pExpr Test this expression. >> - * @param ppPrefix Pointer to TK_STRING expression with >> - * pattern prefix. >> + * @param ppPrefix Pointer to expression with pattern prefix. >=20 > If smth has changed, please underline this in comment. >=20 >> * @param pisComplete True if the only wildcard is '%' in the >> * last character. >> * @retval True if the given expr is a LIKE operator & is >> @@ -276,9 +274,20 @@ like_optimization_is_valid(Parse *pParse, Expr = *pExpr, Expr **ppPrefix, >> */ >> return 0; >> } >> + >> + /* Only for "binary" and "unicode_ci" collations. */ >=20 > Why only two collations? Why not =E2=80=9Cunicode=E2=80=9D for = instance? > What if collation is a part of expression, not field: > =E2=80=A6 a COLLATE =E2=80=9Cunicode=E2=80=9D LIKE =E2=80=A6 ? For instance, we consider the following query '... WHERE x LIKE "A%" = COLLATE "unicode"' (strength - tertiary). The optimization will be: "A" = <=3D x < "B". Let's take x =3D=3D "aaa". Comparison of "aaa" and "A" = gives us '1'. Comparison of "aaa" and "B" gives us '-1'. In other = words, "A" <=3D "aaa" < "B", but it is a bad result, because "aaa" is = not LIKE "A%" COLLATE "unicode". Now, let's take "unicode_ci" as the collation (strength - primary). Comparison of "aaa" and "B" gives us '-1'. Comparison of "aaa" and "A" = gives us '1'. "A" <=3D "aaa" < "B". Here, we may accept this result. As I understand, it depends on a collation strength. We might check = collation strength and filter appropriate collations this way (in 2.3.0), but I=E2=80=99m not = sure 100%. I have chosen =E2=80=9Cbinary=E2=80=9D and =E2=80=9Cunicode=E2=80=9D, because they = most frequently used, IMHO. >> + struct field_def *field_def =3D pLeft->space_def->fields + = pLeft->iColumn; >> + if (field_def->coll_id !=3D 0 && field_def->coll_id !=3D 2 && >> + field_def->coll_id !=3D 3) >=20 > It=E2=80=99s definitively bad practice to rely on collation=E2=80=99s = id. >=20 >> + return 0; >> assert(pLeft->iColumn !=3D (-1)); /* Because IPK never has = AFF_TEXT */ >>=20 >> pRight =3D sqlExprSkipCollate(pList->a[0].pExpr); >> + const char *coll_name =3D NULL; >> + if (pRight !=3D pList->a[0].pExpr) { >> + assert(pList->a[0].pExpr->op =3D=3D TK_COLLATE); >> + coll_name =3D pList->a[0].pExpr->u.zToken; >=20 > Please, accompany this snippet with comment. >=20 >> + } >> op =3D pRight->op; >> if (op =3D=3D TK_VARIABLE) { >> Vdbe *pReprepare =3D pParse->pReprepare; >> @@ -308,6 +317,10 @@ like_optimization_is_valid(Parse *pParse, Expr = *pExpr, Expr **ppPrefix, >> pParse->is_aborted =3D true; >> else >> pPrefix->u.zToken[cnt] =3D 0; >> + if (coll_name !=3D NULL) >> + pPrefix =3D = sqlExprAddCollateString(pParse, >> + = pPrefix, >> + = coll_name); >=20 > The same: I don=E2=80=99t understand what=E2=80=99s going on here. > Please, add explanation to the code. +/** + * Skip TK_COLLATE in @a expr and save root collation name to @a + * coll_name if any. + * + * Wrapper for sqlExprSkipCollate(). + * + * @param expr Expression. + * @param[out] coll_name Collation name. + * @retval Pointer to non-collate node. + */ +static struct Expr* +skip_coll_and_get_name(struct Expr* expr, const char **coll_name) +{ + struct Expr *ret =3D sqlExprSkipCollate(expr); + if (ret !=3D expr) { + assert(expr->op =3D=3D TK_COLLATE); + *coll_name =3D expr->u.zToken; + } else { + *coll_name =3D NULL; + } + return ret; +} + /** * Check to see if the given expression is a LIKE operator that * can be optimized using inequality constraints. @@ -228,13 +251,11 @@ operatorMask(int op) * In order for the operator to be optimizible, the RHS must be a * string literal that does not begin with a wildcard. The LHS * must be a column that may only be NULL, a string, or a BLOB, - * never a number. The collating sequence for the column on the - * LHS must be appropriate for the operator. + * never a number. * * @param pParse Parsing and code generating context. * @param pExpr Test this expression. - * @param ppPrefix Pointer to TK_STRING expression with - * pattern prefix. + * @param ppPrefix Pointer to expression with pattern prefix. * @param pisComplete True if the only wildcard is '%' in the * last character. * @retval True if the given expr is a LIKE operator & is @@ -266,7 +287,9 @@ like_optimization_is_valid(Parse *pParse, Expr = *pExpr, Expr **ppPrefix, return 0; } pList =3D pExpr->x.pList; - pLeft =3D pList->a[1].pExpr; + const char *l_coll_name =3D NULL; + pLeft =3D skip_coll_and_get_name(pList->a[1].pExpr, = &l_coll_name); + /* Value might be numeric */ if (pLeft->op !=3D TK_COLUMN || sql_expr_type(pLeft) !=3D FIELD_TYPE_STRING) { @@ -278,7 +301,34 @@ like_optimization_is_valid(Parse *pParse, Expr = *pExpr, Expr **ppPrefix, } assert(pLeft->iColumn !=3D (-1)); /* Because IPK never has = AFF_TEXT */ =20 - pRight =3D sqlExprSkipCollate(pList->a[0].pExpr); + const char *r_coll_name =3D NULL; + pRight =3D skip_coll_and_get_name(pList->a[0].pExpr, = &r_coll_name); + + /* Only for "binary" and "unicode_ci" collations. */ + if (r_coll_name !=3D NULL) { + if (strcmp(r_coll_name, "binary") !=3D 0 && + strcmp(r_coll_name, "unicode_ci") !=3D 0) + return 0; + } else if (l_coll_name !=3D NULL) { + if (strcmp(l_coll_name, "binary") !=3D 0 && + strcmp(l_coll_name, "unicode_ci") !=3D 0) + return 0; + } else { + /* + * If both arguments haven't then we + * should check implicit collation. + */ + struct field_def *field_def =3D pLeft->space_def->fields = + + pLeft->iColumn; + uint32_t none_id =3D coll_by_name("none", = strlen("none"))->id; + uint32_t u_ci_id =3D + coll_by_name("unicode_ci", = strlen("unicode_ci"))->id; + uint32_t bin_id =3D coll_by_name("binary", = strlen("binary"))->id; + if (field_def->coll_id !=3D none_id && = field_def->coll_id !=3D + u_ci_id && field_def->coll_id !=3D bin_id) + return 0; + } + op =3D pRight->op; if (op =3D=3D TK_VARIABLE) { Vdbe *pReprepare =3D pParse->pReprepare; @@ -308,6 +358,15 @@ like_optimization_is_valid(Parse *pParse, Expr = *pExpr, Expr **ppPrefix, pParse->is_aborted =3D true; else pPrefix->u.zToken[cnt] =3D 0; + /* + * If was typed to 's RHS + * add it result expression. + */ + if (r_coll_name !=3D NULL) + pPrefix =3D + sqlExprAddCollateString(pParse, + pPrefix, + = r_coll_name); *ppPrefix =3D pPrefix; if (op =3D=3D TK_VARIABLE) { Vdbe *v =3D pParse->pVdbe; >> @@ -977,8 +990,6 @@ exprAnalyze(SrcList * pSrc, /* the FROM = clause */ >> Expr *pStr1 =3D 0; >> /* RHS of LIKE ends with wildcard. */ >> int isComplete =3D 0; >> - /* uppercase equivalent to lowercase. */ >> - int noCase =3D 0; >> /* Top-level operator. pExpr->op. */ >> int op; >> /* Parsing context. */ >> @@ -1165,59 +1176,22 @@ exprAnalyze(SrcList * pSrc, /* the FROM = clause */ >> const u16 wtFlags =3D TERM_LIKEOPT | TERM_VIRTUAL | = TERM_DYNAMIC; >>=20 >> pLeft =3D pExpr->x.pList->a[1].pExpr; >> - pStr2 =3D sqlExprDup(db, pStr1, 0); >> - >> - /* >> - * Convert the lower bound to upper-case and the >> - * upper bound to lower-case (upper-case is less >> - * than lower-case in ASCII) so that the range >> - * constraints also work for BLOBs. >> - */ >> - if (noCase && !pParse->db->mallocFailed) { >> - int i; >> - char c; >> - pTerm->wtFlags |=3D TERM_LIKE; >> - for (i =3D 0; (c =3D pStr1->u.zToken[i]) !=3D 0; = i++) { >> - pStr1->u.zToken[i] =3D sqlToupper(c); >> - pStr2->u.zToken[i] =3D sqlTolower(c); >> - } >> - } >> + pStr2 =3D sqlExprDup(db, sqlExprSkipCollate(pStr1), 0); >>=20 >> if (!db->mallocFailed) { >> u8 c, *pC; /* Last character before the = first wildcard */ >> pC =3D (u8 *) & pStr2->u. >> zToken[sqlStrlen30(pStr2->u.zToken) - 1]; >> c =3D *pC; >> - if (noCase) { >> - /* The point is to increment the last = character before the first >> - * wildcard. But if we increment '@', = that will push it into the >> - * alphabetic range where case = conversions will mess up the >> - * inequality. To avoid this, make sure = to also run the full >> - * LIKE on all candidate expressions by = clearing the isComplete flag >> - */ >> - if (c =3D=3D 'A' - 1) >> - isComplete =3D 0; >> - c =3D sqlUpperToLower[c]; >> - } >=20 > Are you sure that this code removal wouldn=E2=80=99t broke smth? Now, comparison is occurred with collations, so we don=E2=80=99t need to = translate characters to upper or lower case. >> *pC =3D c + 1; >> } >> pNewExpr1 =3D sqlExprDup(db, pLeft, 0); >> - if (noCase) { >> - pNewExpr1 =3D >> - sqlExprAddCollateString(pParse, = pNewExpr1, >> - = "unicode_ci"); >> - } >> pNewExpr1 =3D sqlPExpr(pParse, TK_GE, pNewExpr1, pStr1); >> transferJoinMarkings(pNewExpr1, pExpr); >> idxNew1 =3D whereClauseInsert(pWC, pNewExpr1, wtFlags); >> testcase(idxNew1 =3D=3D 0); >> exprAnalyze(pSrc, pWC, idxNew1); >> pNewExpr2 =3D sqlExprDup(db, pLeft, 0); >> - if (noCase) { >> - pNewExpr2 =3D >> - sqlExprAddCollateString(pParse, = pNewExpr2, >> - = "unicode_ci"); >> - } >> pNewExpr2 =3D sqlPExpr(pParse, TK_LT, pNewExpr2, pStr2); >> transferJoinMarkings(pNewExpr2, pExpr); >> idxNew2 =3D whereClauseInsert(pWC, pNewExpr2, wtFlags); >> diff --git a/test/sql-tap/collation.test.lua = b/test/sql-tap/collation.test.lua >> index 79547361c..9e404282a 100755 >> --- a/test/sql-tap/collation.test.lua >> +++ b/test/sql-tap/collation.test.lua >> @@ -1,6 +1,6 @@ >> #!/usr/bin/env tarantool >> test =3D require("sqltester") >> -test:plan(174) >> +test:plan(173) >=20 > Please add test verifying that arguments of incompatible > collations can=E2=80=99t participate in LIKE predicate. diff --git a/test/sql-tap/gh-3251-string-pattern-comparison.test.lua = b/test/sql-tap/gh-3251-string-pattern-comparison.test.lua index 95e8c98d0..641469211 100755 --- a/test/sql-tap/gh-3251-string-pattern-comparison.test.lua +++ b/test/sql-tap/gh-3251-string-pattern-comparison.test.lua @@ -1,6 +1,6 @@ #!/usr/bin/env tarantool test =3D require("sqltester") -test:plan(128) +test:plan(162) =20 local prefix =3D "like-test-" =20 @@ -79,10 +79,10 @@ local like_test_cases =3D "SELECT '=D1=91=D1=84' LIKE '%=C5=93=D8=B4';", {0, {false}} }, {"1.25", - "SELECT '=D1=91=D1=84=C3=85=C5=92=D8=B4' LIKE '%=C5=93=D8=B4';", + "SELECT '=D1=91=D1=84=C3=85=C5=92=D8=B4' LIKE '%=C5=93=D8=B4' = COLLATE \"unicode_ci\";", {0, {true}} }, {"1.26", - "SELECT '=C3=85=C5=92=D8=B4' LIKE '%=C5=93=D8=B4';", + "SELECT '=C3=85=C5=92=D8=B4' LIKE '%=C5=93=D8=B4' COLLATE = \"unicode_ci\";", {0, {true}} }, {"1.27", "SELECT '=D1=91=D1=84' LIKE '=D1=91_';", @@ -120,6 +120,108 @@ local like_test_cases =3D {"1.38", "SELECT '=C3=85=C5=92=D8=B4' LIKE '=D1=91_%';", {0, {false}} }, + {"1.39", + "SELECT 'A' LIKE 'A' COLLATE \"unicode\";", + {0, {true}} }, + {"1.40", + "SELECT 'A' LIKE 'a' COLLATE \"unicode\";", + {0, {false}} }, + {"1.41", + "SELECT 'Ab' COLLATE \"unicode\" LIKE 'ab';", + {0, {false}} }, + {"1.42", + "SELECT 'ss' LIKE '=C3=9F' COLLATE \"unicode\";", + {0, {false}} }, + {"1.43", + "SELECT '=D0=AF' LIKE '=D1=8F' COLLATE \"unicode\";", + {0, {false}} }, + {"1.44", + "SELECT 'A=D0=AFB' LIKE 'A=D1=8FB' COLLATE \"unicode\";", + {0, {false}} }, + {"1.45", + "SELECT '=D0=87' LIKE '=D1=97' COLLATE \"unicode\";", + {0, {false}} }, + {"1.46", + "SELECT 'Ab' LIKE '_b' COLLATE \"unicode\";", + {0, {true}} }, + {"1.47", + "SELECT 'A' LIKE '_' COLLATE \"unicode\";", + {0, {true}} }, + {"1.48", + "SELECT 'AB' LIKE '%B' COLLATE \"unicode\";", + {0, {true}} }, + {"1.49", + "SELECT 'A' LIKE 'A' COLLATE \"unicode_ci\";", + {0, {true}} }, + {"1.50", + "SELECT 'A' COLLATE \"unicode_ci\" LIKE 'a';", + {0, {true}} }, + {"1.51", + "SELECT '=D0=AF' LIKE '=D1=8F' COLLATE \"unicode_ci\";", + {0, {true}} }, + {"1.52", + "SELECT 'A=D0=AFB' LIKE 'A=D1=8FB' COLLATE \"unicode_ci\";", + {0, {true}} }, + {"1.53", + "SELECT '=D0=87' LIKE '=D1=97' COLLATE \"unicode_ci\";", + {0, {true}} }, + {"1.54", + "SELECT 'Ab' LIKE 'ab' COLLATE \"unicode_ci\";", + {0, {true}} }, + {"1.55", + "SELECT 'ba' LIKE 'ab' COLLATE \"unicode_ci\";", + {0, {false}} }, + {"1.56", + "SELECT 'Aaa' LIKE 'A%' COLLATE \"unicode_ci\";", + {0, {true}} }, + {"1.57", + "SELECT 'aaa' LIKE 'A%' COLLATE \"unicode_ci\";", + {0, {true}} }, + {"1.58", + "SELECT 'A' LIKE '_' COLLATE \"unicode_ci\";", + {0, {true}} }, + {"1.59", + "SELECT 'ss' LIKE '=C3=9F' COLLATE = \"unicode_de__phonebook_s1\";", + {0, {false}} }, + {"1.60", + "SELECT 'ss' LIKE '=C3=9F' COLLATE = \"unicode_de__phonebook_s3\";", + {0, {false}} }, + {"1.61", + "SELECT '=D0=87' LIKE '=D1=97' COLLATE \"unicode_uk_s1\";", + {0, {true}} }, + {"1.62", + "SELECT '=D0=87' LIKE '=D1=97' COLLATE \"unicode_uk_s3\";", + {0, {false}} }, + {"1.63", + "SELECT '=D0=87' COLLATE \"unicode_uk_s3\" LIKE '=D1=97' = COLLATE \"unicode_uk_s3\";", + {0, {false}} }, + {"1.64", + "SELECT '%a_' LIKE '=E0=B6=B8%A=E0=B6=B8_' COLLATE \"unicode\" = ESCAPE '=E0=B6=B8';", + {0, {false}} }, + {"1.65", + "SELECT '%a_' COLLATE \"unicode\" LIKE '=E0=B6=B8%A=E0=B6=B8_' = COLLATE \"unicode\" ESCAPE '=E0=B6=B8' COLLATE \"unicode\";", + {0, {false}} }, + {"1.66", + "SELECT '%a_' LIKE '=E0=B6=B8%A=E0=B6=B8_' ESCAPE '=E0=B6=B8' = COLLATE \"unicode_ci\";", + {0, {true}} }, + {"1.67", + "SELECT '%_' LIKE 'a%a_' ESCAPE 'A' COLLATE \"unicode_ci\";", + {0, {false}} }, + {"1.68", + "SELECT '%_' LIKE 'a%a_' ESCAPE 'a' COLLATE \"unicode_ci\";", + {0, {true}} }, + {"1.69", + "SELECT '=D0=87' COLLATE \"unicode\" LIKE '=D1=97' COLLATE = \"unicode_uk_s3\";", + {1, "Illegal mix of collations"} }, + {"1.70", + "SELECT '%a_' COLLATE \"unicode_ci\" LIKE '=E0=B6=B8%A=E0=B6=B8_'= COLLATE \"unicode\" ESCAPE '=E0=B6=B8';", + {1, "Illegal mix of collations"} }, + {"1.71", + "SELECT '%a_' COLLATE \"unicode\" LIKE '=E0=B6=B8%A=E0=B6=B8_' = ESCAPE '=E0=B6=B8' COLLATE \"unicode_ci\";", + {1, "Illegal mix of collations"} }, + {"1.72", + "SELECT '%_' LIKE 'a%a_' COLLATE \"unicode\" ESCAPE 'A' COLLATE = \"unicode_ci\";", + {1, "Illegal mix of collations"} } } >>=20 >> local prefix =3D "collation-" >>=20 >> @@ -477,13 +477,13 @@ for _, data_collation in = ipairs(data_collations) do >> end >> end >>=20 >> --- Like uses collation (only for unicode_ci and binary) >> +-- uses collation. If has explicit , use it >> +-- instead of implicit. >> local like_testcases =3D >> { >> {"2.0", >> [[ >> - CREATE TABLE tx1 (s1 VARCHAR(5) PRIMARY KEY); >> - CREATE INDEX I1 on tx1(s1 collate "unicode_ci=E2=80=9D); >=20 > Why did you drop index? Dependece of I1 using is conditioned by the following pieces of code or = the value of noCase or the value of pragma case insensitive: - if (noCase) { - pNewExpr1 =3D - sqlExprAddCollateString(pParse, pNewExpr1, - "unicode_ci"); - } - if (noCase) { - pNewExpr2 =3D - sqlExprAddCollateString(pParse, pNewExpr2, - "unicode_ci"); - } I have removed this code, so covering index isn't used anymore, primary = key is used now. I can't say why it worked this way, but I can say, that now there is no = notion "case/nocase". Comparison is provided with implicit or explicit = collation. IMHO, covering index should not affect at all. >> + CREATE TABLE tx1 (s1 VARCHAR(5) PRIMARY KEY COLLATE = "unicode_ci"); >> INSERT INTO tx1 VALUES('aaa'); >> INSERT INTO tx1 VALUES('Aab'); >> INSERT INTO tx1 VALUES('=C4=B0ac'); >> @@ -491,35 +491,32 @@ local like_testcases =3D >> ]], {0}}, >> {"2.1.1", >> "SELECT * FROM tx1 WHERE s1 LIKE 'A%' order by s1;", >> - {0, {"Aab", "aaa"}} }, >> + {0, {"aaa","Aab"}} }, >=20 > Why order of results has changed? Because I added collation to s1. >=20 >> {"2.1.2", >> "EXPLAIN QUERY PLAN SELECT * FROM tx1 WHERE s1 LIKE 'A%';", >> - {0, {0, 0, 0, "SEARCH TABLE TX1 USING COVERING INDEX I1 = (S1>? AND S1> + {0, {0, 0, 0, "SEARCH TABLE TX1 USING PRIMARY KEY (S1>? AND = S1=20 > I guess this is due to dropped index. I dropped index, because it don=E2=80=99t affect on this query now. >> {"2.2.0", >> - "PRAGMA case_sensitive_like =3D true;", >> - {0}}, >> - {"2.2.1", >> - "SELECT * FROM tx1 WHERE s1 LIKE 'A%' order by s1;", >> + "SELECT * FROM tx1 WHERE s1 LIKE 'A%' COLLATE \"unicode\" = order by s1;", >> {0, {"Aab"}} }, >> - {"2.2.2", >> + {"2.2.1", >> "EXPLAIN QUERY PLAN SELECT * FROM tx1 WHERE s1 LIKE 'A%';", >> {0, {0, 0, 0, "/USING PRIMARY KEY/"}} }, >> {"2.3.0", >> - "PRAGMA case_sensitive_like =3D false;", >> - {0}}, >> - {"2.3.1", >> "SELECT * FROM tx1 WHERE s1 LIKE 'i%' order by s1;", >> - {0, {"iad", "=C4=B0ac"}}}, >> - {"2.3.2", >> - "SELECT * FROM tx1 WHERE s1 LIKE '=C4=B0%'order by s1;", >> - {0, {"iad", "=C4=B0ac"}} }, >> + {0, {"=C4=B0ac", "iad"}}}, >> + {"2.3.1", >> + "SELECT * FROM tx1 WHERE s1 LIKE '=C4=B0%' COLLATE = \"unicode\" order by s1;", >> + {0, {"=C4=B0ac"}} }, >> {"2.4.0", >> [[ >> INSERT INTO tx1 VALUES('=D0=AF=D0=81=D0=AE'); >> ]], {0} }, >> {"2.4.1", >> + "SELECT * FROM tx1 WHERE s1 LIKE '=D1=8F=D1=91=D1=8E' = COLLATE \"unicode\";", >> + {0, {}} }, >> + {"2.4.2", >> "SELECT * FROM tx1 WHERE s1 LIKE '=D1=8F=D1=91=D1=8E';", >> - {0, {"=D0=AF=D0=81=D0=AE"}} }, >> + {0, {"=D0=AF=D0=81=D0=AE"}} } >=20 > Stray diff. The pragma don=E2=80=99t exist now, so I use case sensitive/insensitive = collations in this test. And meanwhile, I check that explicit collation has the highest priority. --- Like uses collation (only for unicode_ci and binary) +-- uses collation. If has explicit , use it +-- instead of implicit. local like_testcases =3D { {"2.0", [[ - CREATE TABLE tx1 (s1 VARCHAR(5) PRIMARY KEY); + CREATE TABLE tx1 (s1 VARCHAR(5) PRIMARY KEY COLLATE = "unicode_ci"); INSERT INTO tx1 VALUES('aaa'); INSERT INTO tx1 VALUES('Aab'); INSERT INTO tx1 VALUES('=C4=B0ac'); @@ -490,29 +491,38 @@ local like_testcases =3D ]], {0}}, {"2.1.1", "SELECT * FROM tx1 WHERE s1 LIKE 'A%' order by s1;", - {0, {"Aab"}} }, + {0, {"aaa","Aab"}} }, {"2.1.2", "EXPLAIN QUERY PLAN SELECT * FROM tx1 WHERE s1 LIKE 'A%';", {0, {0, 0, 0, "SEARCH TABLE TX1 USING PRIMARY KEY (S1>? AND = S1> } >>=20 >> test:do_catchsql_set_test(like_testcases, prefix) >> diff --git a/test/sql-tap/gh-3251-string-pattern-comparison.test.lua = b/test/sql-tap/gh-3251-string-pattern-comparison.test.lua >> index 95e8c98d0..eb9d7c3b9 100755 >> --- a/test/sql-tap/gh-3251-string-pattern-comparison.test.lua >> +++ b/test/sql-tap/gh-3251-string-pattern-comparison.test.lua >> @@ -1,6 +1,6 @@ >> #!/usr/bin/env tarantool >> test =3D require("sqltester") >> -test:plan(128) >> +test:plan(156) >>=20 >> local prefix =3D "like-test-" >>=20 >> @@ -80,10 +80,10 @@ local like_test_cases =3D >> {0, {false}} }, >> {"1.25", >> "SELECT '=D1=91=D1=84=E2=84=AB=C5=92=D8=B4' LIKE '%=C5=93=D8=B4'= ;", >> - {0, {true}} }, >> + {0, {false}} }, >=20 > Could you add an appropriate collation to avoid fixing > test results? @@ -79,10 +79,10 @@ local like_test_cases =3D "SELECT '=D1=91=D1=84' LIKE '%=C5=93=D8=B4';", {0, {false}} }, {"1.25", - "SELECT '=D1=91=D1=84=C3=85=C5=92=D8=B4' LIKE '%=C5=93=D8=B4';", + "SELECT '=D1=91=D1=84=C3=85=C5=92=D8=B4' LIKE '%=C5=93=D8=B4' = COLLATE \"unicode_ci\";", {0, {true}} }, {"1.26", - "SELECT '=C3=85=C5=92=D8=B4' LIKE '%=C5=93=D8=B4';", + "SELECT '=C3=85=C5=92=D8=B4' LIKE '%=C5=93=D8=B4' COLLATE = \"unicode_ci\";", {0, {true}} }, >> {"1.26", >> "SELECT '=E2=84=AB=C5=92=D8=B4' LIKE '%=C5=93=D8=B4';", >> - {0, {true}} }, >> + {0, {false}} }, >> {"1.27", >> "SELECT '=D1=91=D1=84' LIKE '=D1=91_';", >> {0, {true}} }, >> @@ -120,6 +120,90 @@ local like_test_cases =3D >> {"1.38", >> "SELECT '=E2=84=AB=C5=92=D8=B4' LIKE '=D1=91_%';", >> {0, {false}} }, >> + {"1.39", >> + "SELECT 'A' LIKE 'A' COLLATE \"unicode\";", >> + {0, {true}} }, >=20 > Please, add not only tests involving constant literals, but > also columns with implicit/explicit collations. I check implicit/explicit in /test/sql-tap/collation.test.lua. + {"1.69", + "SELECT '=D0=87' COLLATE \"unicode\" LIKE '=D1=97' COLLATE = \"unicode_uk_s3\";", + {1, "Illegal mix of collations"} }, + {"1.70", + "SELECT '%a_' COLLATE \"unicode_ci\" LIKE '=E0=B6=B8%A=E0=B6=B8_'= COLLATE \"unicode\" ESCAPE '=E0=B6=B8';", + {1, "Illegal mix of collations"} }, + {"1.71", + "SELECT '%a_' COLLATE \"unicode\" LIKE '=E0=B6=B8%A=E0=B6=B8_' = ESCAPE '=E0=B6=B8' COLLATE \"unicode_ci\";", + {1, "Illegal mix of collations"} }, + {"1.72", + "SELECT '%_' LIKE 'a%a_' COLLATE \"unicode\" ESCAPE 'A' COLLATE = \"unicode_ci\";", + {1, "Illegal mix of collations"} } } commit 6701a59045c10191d7883a58a1f9ff61feb62b86 Author: Roman Khabibov Date: Tue Apr 23 02:48:26 2019 +0300 sql: make LIKE predicate dependent on collation =20 According to ANSI, LIKE should match characters taking into account passed collation. =20 ISO/IEC JTC 1/SC 32 2011, Part 2: Foundation, 8.5 =20 Closes #3589 =20 @TarantoolBot document Title: LIKE depends on collations =20 Now pattern comparison depends on arguments' collation. Highest priority has explicit collation, in other words, clause for string (first), pattern (second) or escape (third) arguments. If one of the arguments has this clause, it is used. If more than one of the arguments has it, collations must be similar. Implicit (column) collation is used when there is no explicit collations. If there are neither those nor other collation, then the comparison is binary. diff --git a/src/box/sql/func.c b/src/box/sql/func.c index 4ac11cb77..f5991ff61 100644 --- a/src/box/sql/func.c +++ b/src/box/sql/func.c @@ -44,6 +44,7 @@ #include #include #include +#include "box/coll_id_cache.h" =20 static UConverter* pUtf8conv; =20 @@ -673,6 +674,30 @@ enum pattern_match_status { INVALID_PATTERN =3D 3 }; =20 +/** + * Read an UTF-8 character from string, and move pointer to the + * next character. Save current character and its length to output + * params - they are served as arguments of coll->cmp() call. + * + * @param[out] str Ptr to string. + * @param str_end Ptr the last symbol in @str. + * @param[out] char_ptr Ptr to the UTF-8 character. + * @param[out] char_len Ptr to length of the UTF-8 character in + * bytes. + * + * @retval UTF-8 character. + */ +static UChar32 +step_utf8_char(const char **str, const char *str_end, const char = **char_ptr, + size_t *char_len) +{ + UErrorCode status =3D U_ZERO_ERROR; + *char_ptr =3D *str; + UChar32 next_utf8 =3D Utf8Read(*str, str_end); + *char_len =3D *str - *char_ptr; + return next_utf8; +} + /** * Compare two UTF-8 strings for equality where the first string * is a LIKE expression. @@ -699,7 +724,7 @@ enum pattern_match_status { * @param string String being compared. * @param pattern_end Ptr to pattern last symbol. * @param string_end Ptr to string last symbol. - * @param is_like_ci true if LIKE is case insensitive. + * @param coll Pointer to collation. * @param match_other The escape char for LIKE. * * @retval One of pattern_match_status values. @@ -709,7 +734,7 @@ sql_utf8_pattern_compare(const char *pattern, const char *string, const char *pattern_end, const char *string_end, - const int is_like_ci, + struct coll *coll, UChar32 match_other) { /* Next pattern and input string chars. */ @@ -717,9 +742,14 @@ sql_utf8_pattern_compare(const char *pattern, /* One past the last escaped input char. */ const char *zEscaped =3D 0; UErrorCode status =3D U_ZERO_ERROR; + const char *pat_char_ptr =3D NULL; + const char *str_char_ptr =3D NULL; + size_t pat_char_len =3D 0; + size_t str_char_len =3D 0; =20 while (pattern < pattern_end) { - c =3D Utf8Read(pattern, pattern_end); + c =3D step_utf8_char(&pattern, pattern_end, = &pat_char_ptr, + &pat_char_len); if (c =3D=3D SQL_INVALID_UTF8_SYMBOL) return INVALID_PATTERN; if (c =3D=3D MATCH_ALL_WILDCARD) { @@ -730,7 +760,9 @@ sql_utf8_pattern_compare(const char *pattern, * consume a single character of the * input string for each "_" skipped. */ - while ((c =3D Utf8Read(pattern, pattern_end)) !=3D= + while ((c =3D step_utf8_char(&pattern, = pattern_end, + &pat_char_ptr, + &pat_char_len)) !=3D SQL_END_OF_STRING) { if (c =3D=3D SQL_INVALID_UTF8_SYMBOL) return INVALID_PATTERN; @@ -751,7 +783,9 @@ sql_utf8_pattern_compare(const char *pattern, return MATCH; } if (c =3D=3D match_other) { - c =3D Utf8Read(pattern, pattern_end); + c =3D step_utf8_char(&pattern, = pattern_end, + &pat_char_ptr, + &pat_char_len); if (c =3D=3D SQL_INVALID_UTF8_SYMBOL) return INVALID_PATTERN; if (c =3D=3D SQL_END_OF_STRING) @@ -773,8 +807,6 @@ sql_utf8_pattern_compare(const char *pattern, */ =20 int bMatch; - if (is_like_ci) - c =3D u_tolower(c); while (string < string_end){ /* * This loop could have been @@ -786,21 +818,20 @@ sql_utf8_pattern_compare(const char *pattern, * lower works better with German * and Turkish languages. */ - c2 =3D Utf8Read(string, string_end); + c2 =3D step_utf8_char(&string, = string_end, + &str_char_ptr, + &str_char_len); if (c2 =3D=3D SQL_INVALID_UTF8_SYMBOL) return NO_MATCH; - if (!is_like_ci) { - if (c2 !=3D c) - continue; - } else { - if (c2 !=3D c && u_tolower(c2) = !=3D c) - continue; - } + if (coll->cmp(pat_char_ptr, = pat_char_len, + str_char_ptr, = str_char_len, coll) + !=3D 0) + continue; bMatch =3D = sql_utf8_pattern_compare(pattern, = string, = pattern_end, = string_end, - = is_like_ci, + coll, = match_other); if (bMatch !=3D NO_MATCH) return bMatch; @@ -808,30 +839,21 @@ sql_utf8_pattern_compare(const char *pattern, return NO_WILDCARD_MATCH; } if (c =3D=3D match_other) { - c =3D Utf8Read(pattern, pattern_end); + c =3D step_utf8_char(&pattern, pattern_end, = &pat_char_ptr, + &pat_char_len); if (c =3D=3D SQL_INVALID_UTF8_SYMBOL) return INVALID_PATTERN; if (c =3D=3D SQL_END_OF_STRING) return NO_MATCH; zEscaped =3D pattern; } - c2 =3D Utf8Read(string, string_end); + c2 =3D step_utf8_char(&string, string_end, = &str_char_ptr, + &str_char_len); if (c2 =3D=3D SQL_INVALID_UTF8_SYMBOL) return NO_MATCH; - if (c =3D=3D c2) + if (coll->cmp(pat_char_ptr, pat_char_len, str_char_ptr, + str_char_len, coll) =3D=3D 0) continue; - if (is_like_ci) { - /* - * Small optimization. Reduce number of - * calls to u_tolower function. SQL - * standards suggest use to_upper for - * symbol normalisation. However, using - * to_lower allows to respect Turkish '=C4=B0' - * in default locale. - */ - if (u_tolower(c) =3D=3D c2 || c =3D=3D = u_tolower(c2)) - continue; - } if (c =3D=3D MATCH_ONE_WILDCARD && pattern !=3D zEscaped = && c2 !=3D SQL_END_OF_STRING) continue; @@ -847,9 +869,10 @@ sql_utf8_pattern_compare(const char *pattern, int sql_strlike_cs(const char *zPattern, const char *zStr, unsigned int = esc) { + struct coll_id *p =3D coll_by_name("unicode", = strlen("unicode")); return sql_utf8_pattern_compare(zPattern, zStr, zPattern + strlen(zPattern), - zStr + strlen(zStr), 0, esc); + zStr + strlen(zStr), p->coll, = esc); } =20 /** @@ -859,9 +882,10 @@ sql_strlike_cs(const char *zPattern, const char = *zStr, unsigned int esc) int sql_strlike_ci(const char *zPattern, const char *zStr, unsigned int = esc) { + struct coll_id *p =3D coll_by_name("unicode_ci", = strlen("unicode_ci")); return sql_utf8_pattern_compare(zPattern, zStr, zPattern + strlen(zPattern), - zStr + strlen(zStr), 1, esc); + zStr + strlen(zStr), p->coll, = esc); } =20 /** @@ -883,7 +907,6 @@ likeFunc(sql_context *context, int argc, sql_value = **argv) u32 escape =3D SQL_END_OF_STRING; int nPat; sql *db =3D sql_context_db_handle(context); - int is_like_ci =3D SQL_PTR_TO_INT(sql_user_data(context)); int rhs_type =3D sql_value_type(argv[0]); int lhs_type =3D sql_value_type(argv[1]); =20 @@ -940,8 +963,10 @@ likeFunc(sql_context *context, int argc, sql_value = **argv) if (!zA || !zB) return; int res; - res =3D sql_utf8_pattern_compare(zB, zA, zB_end, zA_end, - is_like_ci, escape); + struct coll *coll =3D sqlGetFuncCollSeq(context); + assert(coll !=3D NULL); + res =3D sql_utf8_pattern_compare(zB, zA, zB_end, zA_end, coll, = escape); + if (res =3D=3D INVALID_PATTERN) { diag_set(ClientError, ER_SQL_EXECUTE, "LIKE pattern can = only "\ "contain UTF-8 characters"); diff --git a/src/box/sql/sqlInt.h b/src/box/sql/sqlInt.h index 57da8ec5d..989499abe 100644 --- a/src/box/sql/sqlInt.h +++ b/src/box/sql/sqlInt.h @@ -1356,7 +1356,7 @@ enum trim_side_mask { {nArg, SQL_FUNC_SLOCHNG|(bNC*SQL_FUNC_NEEDCOLL), \ pArg, 0, xFunc, 0, #zName, {SQL_AFF_STRING, {0}}, false} #define LIKEFUNC(zName, nArg, arg, flags, type) \ - {nArg, SQL_FUNC_CONSTANT|flags, \ + {nArg, SQL_FUNC_NEEDCOLL|SQL_FUNC_CONSTANT|flags, \ (void *)(SQL_INT_TO_PTR(arg)), 0, likeFunc, 0, #zName, {0}, type, = false } #define AGGREGATE(zName, nArg, arg, nc, xStep, xFinal, type) \ {nArg, (nc*SQL_FUNC_NEEDCOLL), \ diff --git a/src/box/sql/whereexpr.c b/src/box/sql/whereexpr.c index 0a0e900bd..19f142e6c 100644 --- a/src/box/sql/whereexpr.c +++ b/src/box/sql/whereexpr.c @@ -221,6 +221,29 @@ operatorMask(int op) return c; } =20 +/** + * Skip TK_COLLATE in @a expr and save root collation name to @a + * coll_name if any. + * + * Wrapper for sqlExprSkipCollate(). + * + * @param expr Expression. + * @param[out] coll_name Collation name. + * @retval Pointer to non-collate node. + */ +static struct Expr* +skip_coll_and_get_name(struct Expr* expr, const char **coll_name) +{ + struct Expr *ret =3D sqlExprSkipCollate(expr); + if (ret !=3D expr) { + assert(expr->op =3D=3D TK_COLLATE); + *coll_name =3D expr->u.zToken; + } else { + *coll_name =3D NULL; + } + return ret; +} + /** * Check to see if the given expression is a LIKE operator that * can be optimized using inequality constraints. @@ -228,13 +251,11 @@ operatorMask(int op) * In order for the operator to be optimizible, the RHS must be a * string literal that does not begin with a wildcard. The LHS * must be a column that may only be NULL, a string, or a BLOB, - * never a number. The collating sequence for the column on the - * LHS must be appropriate for the operator. + * never a number. * * @param pParse Parsing and code generating context. * @param pExpr Test this expression. - * @param ppPrefix Pointer to TK_STRING expression with - * pattern prefix. + * @param ppPrefix Pointer to expression with pattern prefix. * @param pisComplete True if the only wildcard is '%' in the * last character. * @retval True if the given expr is a LIKE operator & is @@ -266,7 +287,9 @@ like_optimization_is_valid(Parse *pParse, Expr = *pExpr, Expr **ppPrefix, return 0; } pList =3D pExpr->x.pList; - pLeft =3D pList->a[1].pExpr; + const char *l_coll_name =3D NULL; + pLeft =3D skip_coll_and_get_name(pList->a[1].pExpr, = &l_coll_name); + /* Value might be numeric */ if (pLeft->op !=3D TK_COLUMN || sql_expr_type(pLeft) !=3D FIELD_TYPE_STRING) { @@ -278,7 +301,34 @@ like_optimization_is_valid(Parse *pParse, Expr = *pExpr, Expr **ppPrefix, } assert(pLeft->iColumn !=3D (-1)); /* Because IPK never has = AFF_TEXT */ =20 - pRight =3D sqlExprSkipCollate(pList->a[0].pExpr); + const char *r_coll_name =3D NULL; + pRight =3D skip_coll_and_get_name(pList->a[0].pExpr, = &r_coll_name); + + /* Only for "binary" and "unicode_ci" collations. */ + if (r_coll_name !=3D NULL) { + if (strcmp(r_coll_name, "binary") !=3D 0 && + strcmp(r_coll_name, "unicode_ci") !=3D 0) + return 0; + } else if (l_coll_name !=3D NULL) { + if (strcmp(l_coll_name, "binary") !=3D 0 && + strcmp(l_coll_name, "unicode_ci") !=3D 0) + return 0; + } else { + /* + * If both arguments haven't then we + * should check implicit collation. + */ + struct field_def *field_def =3D pLeft->space_def->fields = + + pLeft->iColumn; + uint32_t none_id =3D coll_by_name("none", = strlen("none"))->id; + uint32_t u_ci_id =3D + coll_by_name("unicode_ci", = strlen("unicode_ci"))->id; + uint32_t bin_id =3D coll_by_name("binary", = strlen("binary"))->id; + if (field_def->coll_id !=3D none_id && = field_def->coll_id !=3D + u_ci_id && field_def->coll_id !=3D bin_id) + return 0; + } + op =3D pRight->op; if (op =3D=3D TK_VARIABLE) { Vdbe *pReprepare =3D pParse->pReprepare; @@ -308,6 +358,15 @@ like_optimization_is_valid(Parse *pParse, Expr = *pExpr, Expr **ppPrefix, pParse->is_aborted =3D true; else pPrefix->u.zToken[cnt] =3D 0; + /* + * If was typed to 's RHS + * add it result expression. + */ + if (r_coll_name !=3D NULL) + pPrefix =3D + sqlExprAddCollateString(pParse, + pPrefix, + = r_coll_name); *ppPrefix =3D pPrefix; if (op =3D=3D TK_VARIABLE) { Vdbe *v =3D pParse->pVdbe; @@ -977,8 +1036,6 @@ exprAnalyze(SrcList * pSrc, /* the FROM = clause */ Expr *pStr1 =3D 0; /* RHS of LIKE ends with wildcard. */ int isComplete =3D 0; - /* uppercase equivalent to lowercase. */ - int noCase =3D 0; /* Top-level operator. pExpr->op. */ int op; /* Parsing context. */ @@ -1143,18 +1200,13 @@ exprAnalyze(SrcList * pSrc, /* the FROM = clause */ * A like pattern of the form "x LIKE 'aBc%'" is changed * into constraints: * - * x>=3D'ABC' AND x<'abd' AND x LIKE 'aBc%' + * x>=3D'aBc' AND x<'abd' AND x LIKE 'aBc%' * * The last character of the prefix "abc" is incremented - * to form the termination condition "abd". If case is - * not significant (the default for LIKE) then the - * lower-bound is made all uppercase and the upper-bound - * is made all lowercase so that the bounds also work - * when comparing BLOBs. + * to form the termination condition "abd". */ if (pWC->op =3D=3D TK_AND && - like_optimization_is_valid(pParse, pExpr, &pStr1, - &isComplete)) { + like_optimization_is_valid(pParse, pExpr, &pStr1, = &isComplete)) { Expr *pLeft; /* Copy of pStr1 - RHS of LIKE operator. */ Expr *pStr2; @@ -1167,57 +1219,22 @@ exprAnalyze(SrcList * pSrc, /* the FROM = clause */ pLeft =3D pExpr->x.pList->a[1].pExpr; pStr2 =3D sqlExprDup(db, pStr1, 0); =20 - /* - * Convert the lower bound to upper-case and the - * upper bound to lower-case (upper-case is less - * than lower-case in ASCII) so that the range - * constraints also work for BLOBs. - */ - if (noCase && !pParse->db->mallocFailed) { - int i; - char c; - pTerm->wtFlags |=3D TERM_LIKE; - for (i =3D 0; (c =3D pStr1->u.zToken[i]) !=3D 0; = i++) { - pStr1->u.zToken[i] =3D sqlToupper(c); - pStr2->u.zToken[i] =3D sqlTolower(c); - } - } - if (!db->mallocFailed) { u8 c, *pC; /* Last character before the = first wildcard */ - pC =3D (u8 *) & pStr2->u. - zToken[sqlStrlen30(pStr2->u.zToken) - 1]; + /* pStr2 can contain COLLATE nodes. */ + struct Expr *real_str =3D = sqlExprSkipCollate(pStr2); + pC =3D (u8 *) & real_str->u. + zToken[sqlStrlen30(real_str->u.zToken) - 1]; c =3D *pC; - if (noCase) { - /* The point is to increment the last = character before the first - * wildcard. But if we increment '@', = that will push it into the - * alphabetic range where case = conversions will mess up the - * inequality. To avoid this, make sure = to also run the full - * LIKE on all candidate expressions by = clearing the isComplete flag - */ - if (c =3D=3D 'A' - 1) - isComplete =3D 0; - c =3D sqlUpperToLower[c]; - } *pC =3D c + 1; } pNewExpr1 =3D sqlExprDup(db, pLeft, 0); - if (noCase) { - pNewExpr1 =3D - sqlExprAddCollateString(pParse, = pNewExpr1, - = "unicode_ci"); - } pNewExpr1 =3D sqlPExpr(pParse, TK_GE, pNewExpr1, pStr1); transferJoinMarkings(pNewExpr1, pExpr); idxNew1 =3D whereClauseInsert(pWC, pNewExpr1, wtFlags); testcase(idxNew1 =3D=3D 0); exprAnalyze(pSrc, pWC, idxNew1); pNewExpr2 =3D sqlExprDup(db, pLeft, 0); - if (noCase) { - pNewExpr2 =3D - sqlExprAddCollateString(pParse, = pNewExpr2, - = "unicode_ci"); - } pNewExpr2 =3D sqlPExpr(pParse, TK_LT, pNewExpr2, pStr2); transferJoinMarkings(pNewExpr2, pExpr); idxNew2 =3D whereClauseInsert(pWC, pNewExpr2, wtFlags); diff --git a/test/sql-tap/collation.test.lua = b/test/sql-tap/collation.test.lua index e1df8750b..6c3c8ea34 100755 --- a/test/sql-tap/collation.test.lua +++ b/test/sql-tap/collation.test.lua @@ -1,6 +1,6 @@ #!/usr/bin/env tarantool test =3D require("sqltester") -test:plan(172) +test:plan(175) =20 local prefix =3D "collation-" =20 @@ -477,12 +477,13 @@ for _, data_collation in ipairs(data_collations) = do end end =20 --- Like uses collation (only for unicode_ci and binary) +-- uses collation. If has explicit , use it +-- instead of implicit. local like_testcases =3D { {"2.0", [[ - CREATE TABLE tx1 (s1 VARCHAR(5) PRIMARY KEY); + CREATE TABLE tx1 (s1 VARCHAR(5) PRIMARY KEY COLLATE = "unicode_ci"); INSERT INTO tx1 VALUES('aaa'); INSERT INTO tx1 VALUES('Aab'); INSERT INTO tx1 VALUES('=C4=B0ac'); @@ -490,29 +491,38 @@ local like_testcases =3D ]], {0}}, {"2.1.1", "SELECT * FROM tx1 WHERE s1 LIKE 'A%' order by s1;", - {0, {"Aab"}} }, + {0, {"aaa","Aab"}} }, {"2.1.2", "EXPLAIN QUERY PLAN SELECT * FROM tx1 WHERE s1 LIKE 'A%';", {0, {0, 0, 0, "SEARCH TABLE TX1 USING PRIMARY KEY (S1>? AND = S1 - 1, "abc", 4, "abc" + 1, "abc", 2, "ABX", 4, "abc", 5, "ABX" -- }) test:do_execsql_test( "like3-2.1", [[ - SELECT a, b FROM t2 WHERE +b LIKE 'ab%' ORDER BY +a; + SELECT a, b FROM t2 WHERE +b LIKE 'ab%' COLLATE "unicode_ci" = ORDER BY +a; ]], { -- 1, "abc", 2, "ABX", 4, "abc", 5, "ABX" diff --git a/test/sql-tap/whereG.test.lua b/test/sql-tap/whereG.test.lua index 177d9d14e..590027023 100755 --- a/test/sql-tap/whereG.test.lua +++ b/test/sql-tap/whereG.test.lua @@ -89,7 +89,7 @@ test:do_execsql_test( [[ SELECT DISTINCT aname FROM album, composer, track - WHERE unlikely(cname LIKE '%bach%') + WHERE unlikely(cname LIKE '%bach%' COLLATE "unicode_ci") AND composer.cid=3Dtrack.cid AND album.aid=3Dtrack.aid; ]], { @@ -118,7 +118,7 @@ test:do_execsql_test( [[ SELECT DISTINCT aname FROM album, composer, track - WHERE likelihood(cname LIKE '%bach%', 0.5) + WHERE likelihood(cname LIKE '%bach%' COLLATE "unicode_ci", = 0.5) AND composer.cid=3Dtrack.cid AND album.aid=3Dtrack.aid; ]], { @@ -146,7 +146,7 @@ test:do_execsql_test( [[ SELECT DISTINCT aname FROM album, composer, track - WHERE cname LIKE '%bach%' + WHERE cname LIKE '%bach%' COLLATE "unicode_ci" AND composer.cid=3Dtrack.cid AND album.aid=3Dtrack.aid; ]], { @@ -174,7 +174,7 @@ test:do_execsql_test( [[ SELECT DISTINCT aname FROM album, composer, track - WHERE cname LIKE '%bach%' + WHERE cname LIKE '%bach%' COLLATE "unicode_ci" AND unlikely(composer.cid=3Dtrack.cid) AND unlikely(album.aid=3Dtrack.aid); ]], {