From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id 7766024FA3 for ; Wed, 17 Jul 2019 12:19:20 -0400 (EDT) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id ZDUAr27MsrJY for ; Wed, 17 Jul 2019 12:19:20 -0400 (EDT) Received: from smtpng3.m.smailru.net (smtpng3.m.smailru.net [94.100.177.149]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id F3EB824FA2 for ; Wed, 17 Jul 2019 12:19:19 -0400 (EDT) Content-Type: text/plain; charset=utf-8 Mime-Version: 1.0 (Mac OS X Mail 12.4 \(3445.104.11\)) Subject: [tarantool-patches] Re: [PATCH 2/2] sql: make LIKE predicate dependent on collation From: "n.pettik" In-Reply-To: <67ec7b5425d16078e45571c99ba9b58859b3c7b8.1563057282.git.roman.habibov@tarantool.org> Date: Wed, 17 Jul 2019 19:19:18 +0300 Content-Transfer-Encoding: quoted-printable Message-Id: References: <67ec7b5425d16078e45571c99ba9b58859b3c7b8.1563057282.git.roman.habibov@tarantool.org> Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-Help: List-Unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-Subscribe: List-Owner: List-post: List-Archive: To: tarantool-patches@freelists.org Cc: Roman Khabibov > On 14 Jul 2019, at 01:51, Roman Khabibov = wrote: >=20 > According to ANSI, LIKE should match characters taking into > account passed collation. >=20 > ISO/IEC JTC 1/SC 32 2011, Part 2: Foundation, page 445 Nit: it=E2=80=99s likely to be meaningless to indicate exact page; I=E2=80=99d better outline number of section. Moreover, I guess this ticket deserves doc request explaining user-visible changes. I=E2=80=99ve pushed code-style fixes at the top of your branch. > 7 files changed, 187 insertions(+), 111 deletions(-) >=20 > diff --git a/src/box/sql/func.c b/src/box/sql/func.c > index 4e4d14cf7..b660d05c5 100644 > --- a/src/box/sql/func.c > +++ b/src/box/sql/func.c >=20 > @@ -699,7 +723,7 @@ enum pattern_match_status { > * @param string String being compared. > * @param pattern_end Ptr to pattern last symbol. > * @param string_end Ptr to string last symbol. > - * @param is_like_ci true if LIKE is case insensitive. > + * @param coll Pointer to collation. > * @param match_other The escape char for LIKE. > * > * @retval One of pattern_match_status values. > @@ -709,7 +733,7 @@ sql_utf8_pattern_compare(const char *pattern, > const char *string, > const char *pattern_end, > const char *string_end, > - const int is_like_ci, > + struct coll *coll, > UChar32 match_other) > { > /* Next pattern and input string chars. */ > @@ -717,9 +741,13 @@ sql_utf8_pattern_compare(const char *pattern, > /* One past the last escaped input char. */ > const char *zEscaped =3D 0; > UErrorCode status =3D U_ZERO_ERROR; > + const char *pat_char_ptr; > + const char *str_char_ptr; > + int pat_char_len; > + int str_char_len; Declaring variables without initialisation is considered to be bad practise. Also intend to use appropriate types - in this case size_t. > while (pattern < pattern_end) { > - c =3D Utf8Read(pattern, pattern_end); > + c =3D step_utf8_char(&pattern, pattern_end, &pat_char_ptr, = &pat_char_len); Broken indentation. > if (c =3D=3D SQL_INVALID_UTF8_SYMBOL) > return INVALID_PATTERN; > if (c =3D=3D MATCH_ALL_WILDCARD) { >=20 > @@ -847,9 +867,10 @@ sql_utf8_pattern_compare(const char *pattern, > int > sql_strlike_cs(const char *zPattern, const char *zStr, unsigned int = esc) > { I guess you can remove both strike_cs and strlike_ci functions. > + struct coll_id *p =3D coll_by_name("unicode", = strlen("unicode")); > return sql_utf8_pattern_compare(zPattern, zStr, > zPattern + strlen(zPattern), > - zStr + strlen(zStr), 0, esc); > + zStr + strlen(zStr), p->coll, = esc); > } >=20 > @@ -228,13 +228,11 @@ operatorMask(int op) > * In order for the operator to be optimizible, the RHS must be a > * string literal that does not begin with a wildcard. The LHS > * must be a column that may only be NULL, a string, or a BLOB, > - * never a number. The collating sequence for the column on the > - * LHS must be appropriate for the operator. > + * never a number. > * > * @param pParse Parsing and code generating context. > * @param pExpr Test this expression. > - * @param ppPrefix Pointer to TK_STRING expression with > - * pattern prefix. > + * @param ppPrefix Pointer to expression with pattern prefix. If smth has changed, please underline this in comment. > * @param pisComplete True if the only wildcard is '%' in the > * last character. > * @retval True if the given expr is a LIKE operator & is > @@ -276,9 +274,20 @@ like_optimization_is_valid(Parse *pParse, Expr = *pExpr, Expr **ppPrefix, > */ > return 0; > } > + > + /* Only for "binary" and "unicode_ci" collations. */ Why only two collations? Why not =E2=80=9Cunicode=E2=80=9D for instance? What if collation is a part of expression, not field: =E2=80=A6 a COLLATE =E2=80=9Cunicode=E2=80=9D LIKE =E2=80=A6 ? > + struct field_def *field_def =3D pLeft->space_def->fields + = pLeft->iColumn; > + if (field_def->coll_id !=3D 0 && field_def->coll_id !=3D 2 && > + field_def->coll_id !=3D 3) It=E2=80=99s definitively bad practice to rely on collation=E2=80=99s = id. > + return 0; > assert(pLeft->iColumn !=3D (-1)); /* Because IPK never has = AFF_TEXT */ >=20 > pRight =3D sqlExprSkipCollate(pList->a[0].pExpr); > + const char *coll_name =3D NULL; > + if (pRight !=3D pList->a[0].pExpr) { > + assert(pList->a[0].pExpr->op =3D=3D TK_COLLATE); > + coll_name =3D pList->a[0].pExpr->u.zToken; Please, accompany this snippet with comment. > + } > op =3D pRight->op; > if (op =3D=3D TK_VARIABLE) { > Vdbe *pReprepare =3D pParse->pReprepare; > @@ -308,6 +317,10 @@ like_optimization_is_valid(Parse *pParse, Expr = *pExpr, Expr **ppPrefix, > pParse->is_aborted =3D true; > else > pPrefix->u.zToken[cnt] =3D 0; > + if (coll_name !=3D NULL) > + pPrefix =3D = sqlExprAddCollateString(pParse, > + = pPrefix, > + = coll_name); The same: I don=E2=80=99t understand what=E2=80=99s going on here. Please, add explanation to the code. > @@ -977,8 +990,6 @@ exprAnalyze(SrcList * pSrc, /* the FROM = clause */ > Expr *pStr1 =3D 0; > /* RHS of LIKE ends with wildcard. */ > int isComplete =3D 0; > - /* uppercase equivalent to lowercase. */ > - int noCase =3D 0; > /* Top-level operator. pExpr->op. */ > int op; > /* Parsing context. */ > @@ -1165,59 +1176,22 @@ exprAnalyze(SrcList * pSrc, /* the FROM = clause */ > const u16 wtFlags =3D TERM_LIKEOPT | TERM_VIRTUAL | = TERM_DYNAMIC; >=20 > pLeft =3D pExpr->x.pList->a[1].pExpr; > - pStr2 =3D sqlExprDup(db, pStr1, 0); > - > - /* > - * Convert the lower bound to upper-case and the > - * upper bound to lower-case (upper-case is less > - * than lower-case in ASCII) so that the range > - * constraints also work for BLOBs. > - */ > - if (noCase && !pParse->db->mallocFailed) { > - int i; > - char c; > - pTerm->wtFlags |=3D TERM_LIKE; > - for (i =3D 0; (c =3D pStr1->u.zToken[i]) !=3D 0; = i++) { > - pStr1->u.zToken[i] =3D sqlToupper(c); > - pStr2->u.zToken[i] =3D sqlTolower(c); > - } > - } > + pStr2 =3D sqlExprDup(db, sqlExprSkipCollate(pStr1), 0); >=20 > if (!db->mallocFailed) { > u8 c, *pC; /* Last character before the = first wildcard */ > pC =3D (u8 *) & pStr2->u. > zToken[sqlStrlen30(pStr2->u.zToken) - 1]; > c =3D *pC; > - if (noCase) { > - /* The point is to increment the last = character before the first > - * wildcard. But if we increment '@', = that will push it into the > - * alphabetic range where case = conversions will mess up the > - * inequality. To avoid this, make sure = to also run the full > - * LIKE on all candidate expressions by = clearing the isComplete flag > - */ > - if (c =3D=3D 'A' - 1) > - isComplete =3D 0; > - c =3D sqlUpperToLower[c]; > - } Are you sure that this code removal wouldn=E2=80=99t broke smth? > *pC =3D c + 1; > } > pNewExpr1 =3D sqlExprDup(db, pLeft, 0); > - if (noCase) { > - pNewExpr1 =3D > - sqlExprAddCollateString(pParse, = pNewExpr1, > - = "unicode_ci"); > - } > pNewExpr1 =3D sqlPExpr(pParse, TK_GE, pNewExpr1, pStr1); > transferJoinMarkings(pNewExpr1, pExpr); > idxNew1 =3D whereClauseInsert(pWC, pNewExpr1, wtFlags); > testcase(idxNew1 =3D=3D 0); > exprAnalyze(pSrc, pWC, idxNew1); > pNewExpr2 =3D sqlExprDup(db, pLeft, 0); > - if (noCase) { > - pNewExpr2 =3D > - sqlExprAddCollateString(pParse, = pNewExpr2, > - = "unicode_ci"); > - } > pNewExpr2 =3D sqlPExpr(pParse, TK_LT, pNewExpr2, pStr2); > transferJoinMarkings(pNewExpr2, pExpr); > idxNew2 =3D whereClauseInsert(pWC, pNewExpr2, wtFlags); > diff --git a/test/sql-tap/collation.test.lua = b/test/sql-tap/collation.test.lua > index 79547361c..9e404282a 100755 > --- a/test/sql-tap/collation.test.lua > +++ b/test/sql-tap/collation.test.lua > @@ -1,6 +1,6 @@ > #!/usr/bin/env tarantool > test =3D require("sqltester") > -test:plan(174) > +test:plan(173) Please add test verifying that arguments of incompatible collations can=E2=80=99t participate in LIKE predicate. >=20 > local prefix =3D "collation-" >=20 > @@ -477,13 +477,13 @@ for _, data_collation in ipairs(data_collations) = do > end > end >=20 > --- Like uses collation (only for unicode_ci and binary) > +-- uses collation. If has explicit , use it > +-- instead of implicit. > local like_testcases =3D > { > {"2.0", > [[ > - CREATE TABLE tx1 (s1 VARCHAR(5) PRIMARY KEY); > - CREATE INDEX I1 on tx1(s1 collate "unicode_ci=E2=80=9D); Why did you drop index? > + CREATE TABLE tx1 (s1 VARCHAR(5) PRIMARY KEY COLLATE = "unicode_ci"); > INSERT INTO tx1 VALUES('aaa'); > INSERT INTO tx1 VALUES('Aab'); > INSERT INTO tx1 VALUES('=C4=B0ac'); > @@ -491,35 +491,32 @@ local like_testcases =3D > ]], {0}}, > {"2.1.1", > "SELECT * FROM tx1 WHERE s1 LIKE 'A%' order by s1;", > - {0, {"Aab", "aaa"}} }, > + {0, {"aaa","Aab"}} }, Why order of results has changed? > {"2.1.2", > "EXPLAIN QUERY PLAN SELECT * FROM tx1 WHERE s1 LIKE 'A%';", > - {0, {0, 0, 0, "SEARCH TABLE TX1 USING COVERING INDEX I1 (S1>? = AND S1 + {0, {0, 0, 0, "SEARCH TABLE TX1 USING PRIMARY KEY (S1>? AND = S1 {"2.2.0", > - "PRAGMA case_sensitive_like =3D true;", > - {0}}, > - {"2.2.1", > - "SELECT * FROM tx1 WHERE s1 LIKE 'A%' order by s1;", > + "SELECT * FROM tx1 WHERE s1 LIKE 'A%' COLLATE \"unicode\" = order by s1;", > {0, {"Aab"}} }, > - {"2.2.2", > + {"2.2.1", > "EXPLAIN QUERY PLAN SELECT * FROM tx1 WHERE s1 LIKE 'A%';", > {0, {0, 0, 0, "/USING PRIMARY KEY/"}} }, > {"2.3.0", > - "PRAGMA case_sensitive_like =3D false;", > - {0}}, > - {"2.3.1", > "SELECT * FROM tx1 WHERE s1 LIKE 'i%' order by s1;", > - {0, {"iad", "=C4=B0ac"}}}, > - {"2.3.2", > - "SELECT * FROM tx1 WHERE s1 LIKE '=C4=B0%'order by s1;", > - {0, {"iad", "=C4=B0ac"}} }, > + {0, {"=C4=B0ac", "iad"}}}, > + {"2.3.1", > + "SELECT * FROM tx1 WHERE s1 LIKE '=C4=B0%' COLLATE = \"unicode\" order by s1;", > + {0, {"=C4=B0ac"}} }, > {"2.4.0", > [[ > INSERT INTO tx1 VALUES('=D0=AF=D0=81=D0=AE'); > ]], {0} }, > {"2.4.1", > + "SELECT * FROM tx1 WHERE s1 LIKE '=D1=8F=D1=91=D1=8E' COLLATE = \"unicode\";", > + {0, {}} }, > + {"2.4.2", > "SELECT * FROM tx1 WHERE s1 LIKE '=D1=8F=D1=91=D1=8E';", > - {0, {"=D0=AF=D0=81=D0=AE"}} }, > + {0, {"=D0=AF=D0=81=D0=AE"}} } Stray diff. > } >=20 > test:do_catchsql_set_test(like_testcases, prefix) > diff --git a/test/sql-tap/gh-3251-string-pattern-comparison.test.lua = b/test/sql-tap/gh-3251-string-pattern-comparison.test.lua > index 95e8c98d0..eb9d7c3b9 100755 > --- a/test/sql-tap/gh-3251-string-pattern-comparison.test.lua > +++ b/test/sql-tap/gh-3251-string-pattern-comparison.test.lua > @@ -1,6 +1,6 @@ > #!/usr/bin/env tarantool > test =3D require("sqltester") > -test:plan(128) > +test:plan(156) >=20 > local prefix =3D "like-test-" >=20 > @@ -80,10 +80,10 @@ local like_test_cases =3D > {0, {false}} }, > {"1.25", > "SELECT '=D1=91=D1=84=E2=84=AB=C5=92=D8=B4' LIKE '%=C5=93=D8=B4'= ;", > - {0, {true}} }, > + {0, {false}} }, Could you add an appropriate collation to avoid fixing test results? > {"1.26", > "SELECT '=E2=84=AB=C5=92=D8=B4' LIKE '%=C5=93=D8=B4';", > - {0, {true}} }, > + {0, {false}} }, > {"1.27", > "SELECT '=D1=91=D1=84' LIKE '=D1=91_';", > {0, {true}} }, > @@ -120,6 +120,90 @@ local like_test_cases =3D > {"1.38", > "SELECT '=E2=84=AB=C5=92=D8=B4' LIKE '=D1=91_%';", > {0, {false}} }, > + {"1.39", > + "SELECT 'A' LIKE 'A' COLLATE \"unicode\";", > + {0, {true}} }, Please, add not only tests involving constant literals, but also columns with implicit/explicit collations.