From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id 018F320C13 for ; Tue, 22 May 2018 14:06:30 -0400 (EDT) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id GO4Fn3Thx1HF for ; Tue, 22 May 2018 14:06:29 -0400 (EDT) Received: from smtp60.i.mail.ru (smtp60.i.mail.ru [217.69.128.40]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id F1C762032C for ; Tue, 22 May 2018 14:06:28 -0400 (EDT) Subject: [tarantool-patches] Re: [PATCH] sql: allow any space symbols to be a white space References: From: Vladislav Shpilevoy Message-ID: Date: Tue, 22 May 2018 21:06:26 +0300 MIME-Version: 1.0 In-Reply-To: Content-Type: text/plain; charset=utf-8; format=flowed Content-Language: en-US Content-Transfer-Encoding: 7bit Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-help: List-unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-subscribe: List-owner: List-post: List-archive: To: Kirill Yukhin Cc: tarantool-patches@freelists.org Hello. Thanks for the patch! At first, no one test is passed on my machine. I see infinite errors like this: [012] Worker "012_box" cannot start tarantool server; the tasks will be ignored... [012] The raised exception is '' of type ''. [012] Worker "012_box" received the following error: [012] Traceback (most recent call last): [012] File "/Users/v.shpilevoy/Work/Repositories/tarantool/test-run/lib/worker.py", line 211, in __init__ [012] self.inspector = suite.start_server(self.server) [012] File "/Users/v.shpilevoy/Work/Repositories/tarantool/test-run/lib/test_suite.py", line 158, in start_server [012] server.deploy(silent=False) [012] File "/Users/v.shpilevoy/Work/Repositories/tarantool/test-run/lib/tarantool_server.py", line 510, in deploy [012] self.start(silent=silent, **kwargs) [012] File "/Users/v.shpilevoy/Work/Repositories/tarantool/test-run/lib/tarantool_server.py", line 578, in start [012] self.wait_until_started(wait_load) [012] File "/Users/v.shpilevoy/Work/Repositories/tarantool/test-run/lib/tarantool_server.py", line 745, in wait_until_started [012] msg, self.process if not self.gdb and not self.lldb else None) [012] File "/Users/v.shpilevoy/Work/Repositories/tarantool/test-run/lib/tarantool_server.py", line 211, in seek_wait [012] raise TarantoolStartError [012] TarantoolStartError [012] On Travis I see the same. At second, see 17 comments below. Most of them are about style, but a pair of functional ones too. On 22/05/2018 18:51, Kirill Yukhin wrote: > Branch: https://github.com/tarantool/tarantool/tree/kyukhin/gh-2371-utf8-spaces > Issue: https://github.com/tarantool/tarantool/issues/2371 > > ANSI SQL allows any of Unicode classes ZI, Zp or Zs to > act as white space symbol. Allow this in lexical analyzer. > Refactor lexical analyzer routine to follow Tarantool's > coding style. > Also, remove dead encoding: ABCDIC. 1. Maybe EBCDIC? > > Closes #2371 > --- > src/box/sql/alter.c | 13 +- > src/box/sql/complete.c | 16 -- > src/box/sql/func.c | 5 - > src/box/sql/global.c | 22 -- > src/box/sql/sqliteInt.h | 45 ++-- > src/box/sql/tokenize.c | 631 +++++++++++++++++++++--------------------------- > src/box/sql/util.c | 5 - > src/box/sql/vdbetrace.c | 2 +- > src/box/sql/whereexpr.c | 4 - > 9 files changed, 291 insertions(+), 452 deletions(-) > > diff --git a/src/box/sql/alter.c b/src/box/sql/alter.c > index c9c8f9b..f509d4e 100644 > --- a/src/box/sql/alter.c > +++ b/src/box/sql/alter.c > @@ -351,7 +351,7 @@ rename_table(sqlite3 *db, const char *sql_stmt, const char *table_name, > > int token; > Token old_name; > - unsigned char const *csr = (unsigned const char *)sql_stmt; > + char const *csr = sql_stmt; 2. const char * > int len = 0; > char *new_sql_stmt; > bool unused; > diff --git a/src/box/sql/complete.c b/src/box/sql/complete.c > index 092d4fb..047e09e 100644 > --- a/src/box/sql/complete.c > +++ b/src/box/sql/complete.c > @@ -40,19 +40,6 @@ > #include "sqliteInt.h" > #ifndef SQLITE_OMIT_COMPLETE > > -/* > - * This is defined in tokenize.c. We just have to import the definition. > - */ > -#ifndef SQLITE_AMALGAMATION > -#ifdef SQLITE_ASCII > -#define IdChar(C) ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0) > -#endif > -#ifdef SQLITE_EBCDIC 3. I still can grep SQLITE_EBCDIC in tarantool/extra/mkkeywordhash.c. It is not generated file. Same about SQLITE_ASCII. > -extern const char sqlite3IsEbcdicIdChar[]; > -#define IdChar(C) (((c=C)>=0x42 && sqlite3IsEbcdicIdChar[c-0x40])) > -#endif > -#endif /* SQLITE_AMALGAMATION */ > - > /* > * Token types used by the sqlite3_complete() routine. See the header > * comments on that procedure for additional information. > diff --git a/src/box/sql/sqliteInt.h b/src/box/sql/sqliteInt.h > index b3db468..f55c734 100644 > --- a/src/box/sql/sqliteInt.h > +++ b/src/box/sql/sqliteInt.h > @@ -36,6 +36,8 @@ > #ifndef SQLITEINT_H > #define SQLITEINT_H > > +#define IdChar(C) ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0) 4. Why can not you leave it in sql/complete.c? > @@ -4164,7 +4136,18 @@ extern int sqlite3PendingByte; > #endif > void sqlite3Reindex(Parse *, Token *, Token *); > void sqlite3AlterRenameTable(Parse *, SrcList *, Token *); > -int sqlite3GetToken(const unsigned char *, int *, bool *); > + > +/** > + * Return the length (in bytes) of the token that begins at z[0]. > + * Store the token type in *tokenType before returning. 5. No tokenType. > + * > + * @param z Input stream. > + * @param[out] type Detected type of token. > + * @param[out] is_reserved True if reserved word. > + */ > +int > +sql_token(const char *z, int *type, bool *is_reserved); > + > void sqlite3NestedParse(Parse *, const char *, ...); > void sqlite3ExpirePreparedStatements(sqlite3 *); > int sqlite3CodeSubselect(Parse *, Expr *, int); > diff --git a/src/box/sql/tokenize.c b/src/box/sql/tokenize.c > index c77aa9b..8df58e9 100644 > --- a/src/box/sql/tokenize.c > +++ b/src/box/sql/tokenize.c > @@ -77,10 +81,9 @@ > #define CC_DOT 26 /* '.' */ > #define CC_ILLEGAL 27 /* Illegal character */ > > -static const unsigned char aiClass[] = { > -#ifdef SQLITE_ASCII > +static const char sql_ascii_class[] = { > /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */ > -/* 0x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 7, 7, 27, 7, 7, 27, 27, > +/* 0x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 7, 7, 7, 7, 7, 27, 27, 6. Why did you change 27 to 7 in xb column? > /* 1x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, > /* 2x */ 7, 15, 9, 5, 4, 22, 24, 8, 17, 18, 21, 20, 23, 11, 26, 16, > /* 3x */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 19, 12, 14, 13, 6, > @@ -96,63 +99,16 @@ static const unsigned char aiClass[] = { > /* Dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, > /* Ex */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, > /* Fx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 > -#endif > -#ifdef SQLITE_EBCDIC > -/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */ > -/* 0x */ 27, 27, 27, 27, 27, 7, 27, 27, 27, 27, 27, 27, 7, 7, 27, > - 27, > -/* 1x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, > -/* 2x */ 27, 27, 27, 27, 27, 7, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, > -/* 3x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, > -/* 4x */ 7, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 26, 12, 17, 20, 10, > -/* 5x */ 24, 27, 27, 27, 27, 27, 27, 27, 27, 27, 15, 4, 21, 18, 19, 27, > -/* 6x */ 11, 16, 27, 27, 27, 27, 27, 27, 27, 27, 27, 23, 22, 1, 13, 6, > -/* 7x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 8, 5, 5, 5, 8, 14, 8, > -/* 8x */ 27, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27, > -/* 9x */ 27, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27, > -/* Ax */ 27, 25, 1, 1, 1, 1, 1, 0, 1, 1, 27, 27, 27, 27, 27, 27, > -/* Bx */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 9, 27, 27, 27, 27, 27, > -/* Cx */ 27, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27, > -/* Dx */ 27, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27, > -/* Ex */ 27, 27, 1, 1, 1, 1, 1, 0, 1, 1, 27, 27, 27, 27, 27, 27, > -/* Fx */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 27, 27, 27, 27, 27, 27, > -#endif > }; > > -/* > +/** > * The charMap() macro maps alphabetic characters (only) into their 7. Out of 66. > * lower-case ASCII equivalent. On ASCII machines, this is just > - * an upper-to-lower case map. On EBCDIC machines we also need > - * to adjust the encoding. The mapping is only valid for alphabetics > - * which are the only characters for which this feature is used. > + * an upper-to-lower case map. > * > * Used by keywordhash.h > */ > -#ifdef SQLITE_ASCII > #define charMap(X) sqlite3UpperToLower[(unsigned char)X] > -#endif > -#ifdef SQLITE_EBCDIC > -#define charMap(X) ebcdicToAscii[(unsigned char)X] > -const unsigned char ebcdicToAscii[] = { > -/* 0 1 2 3 4 5 6 7 8 9 A B C D E F */ > - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x */ > - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1x */ > - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */ > - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 3x */ > - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 4x */ > - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 5x */ > - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 95, 0, 0, /* 6x */ > - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 7x */ > - 0, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, /* 8x */ > - 0, 106, 107, 108, 109, 110, 111, 112, 113, 114, 0, 0, 0, 0, 0, 0, /* 9x */ > - 0, 0, 115, 116, 117, 118, 119, 120, 121, 122, 0, 0, 0, 0, 0, 0, /* Ax */ > - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Bx */ > - 0, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, /* Cx */ > - 0, 106, 107, 108, 109, 110, 111, 112, 113, 114, 0, 0, 0, 0, 0, 0, /* Dx */ > - 0, 0, 115, 116, 117, 118, 119, 120, 121, 122, 0, 0, 0, 0, 0, 0, /* Ex */ > - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Fx */ > -}; > -#endif > > /* > * The sqlite3KeywordCode function looks up an identifier to determine if > @@ -167,360 +123,313 @@ const unsigned char ebcdicToAscii[] = { > */ > #include "keywordhash.h" > > -/* > +/** > * If X is a character that can be used in an identifier then > - * IdChar(X) will be true. Otherwise it is false. > + * sql_id_char(X) will be true. Otherwise it is false. > * > - * For ASCII, any character with the high-order bit set is > + * Any character with the high-order bit set is > * allowed in an identifier. For 7-bit characters, > * sqlite3IsIdChar[X] must be 1. 8. No sqlite3IsIdChar. 9. Now I see two identical implementations of this macro: sql_id_char here and IdChar in sqliteInt.h. And sql_id_char looks to be unused. > * > - * For EBCDIC, the rules are more complex but have the same > - * end result. > - * > - * Ticket #1066. the SQL standard does not allow '$' in the > + * The SQL standard does not allow '$' in the > * middle of identifiers. But many SQL implementations do. > * SQLite will allow '$' in identifiers for compatibility. > * But the feature is undocumented. > + * 10. Trailing white space. > + * @param C Character to classify. > */ > -#ifdef SQLITE_ASCII > -#define IdChar(C) ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0) > -#endif > -#ifdef SQLITE_EBCDIC > -const char sqlite3IsEbcdicIdChar[] = { > -/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */ > - 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 4x */ > - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, /* 5x */ > - 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, /* 6x */ > - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* 7x */ > - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, /* 8x */ > - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, /* 9x */ > - 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, /* Ax */ > - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Bx */ > - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, /* Cx */ > - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, /* Dx */ > - 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, /* Ex */ > - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, /* Fx */ > -}; > +#define sql_id_char(C) ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0) > > -#define IdChar(C) (((c=C)>=0x42 && sqlite3IsEbcdicIdChar[c-0x40])) > -#endif > +/** > + * Return true if current symbol is space. > + * > + * @param z Input stream. > + * @retval True if current symbl space. 11. symbol. > + */ > +static inline bool > +sql_is_space_char(const char *z) > +{ > + if (sqlite3Isspace(z[0])) > + return true; > + if (sqlite3CtypeMap[*(unsigned char*)z] & 0x40) { 12. Why 0x40? > + UChar32 c; > + int unused = 0; > + U8_NEXT(z, unused, INT32_MAX, c); > + if (u_isspace(c)) > + return true; > + } > + return false; > +} > > -/* > - * Return the length (in bytes) of the token that begins at z[0]. > - * Store the token type in *tokenType before returning. > +/** > + * Calculate length of continuous sequence of > + * space symbols. > + * > + * @param z Input stream. > + * @retval Number of bytes which constitute sequence of spaces. > + * Can be 0 if first symbol in stram is not space. > */ > +static inline int > +sql_skip_spaces(const char *z) > +{ > + int idx = 0; > + while (true) { > + if (sqlite3Isspace(z[idx])) { > + idx += 1; > + } else if ((sqlite3CtypeMap[*(unsigned char *)(z + idx)] & 0x40) 13. Already second place with 0x40. What is it? Maybe you should create a separate macro like it is done here? #define sqlite3Isspace(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x01) #define sqlite3Isalnum(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x06) #define sqlite3Isalpha(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x02) #define sqlite3Isdigit(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x04) #define sqlite3Isxdigit(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x08) ... > + != 0) { > + UChar32 c; > + int prev_offset = idx; > + U8_NEXT(z, idx, INT32_MAX, c); 14. If you do not check for errors and don't know a length, you can use U8_NEXT_UNSAFE. It is much shorter. Same about U8_NEXT in sql_is_space_char I think. > + if (!u_isspace(c)) { > + idx = prev_offset; 15. In most cases you will have one white space, so maybe it is faster to make this: int new_offset = idx; U8_NEXT(z, new_offset, INT32_MAX, c); if (! u_isspace(c)) break; idx = new_offset; Here you have not to reset the offset in most cases. > + break; > + } > + } else { > + break; > + } > + } > + return idx; > +} > + > int > -sqlite3GetToken(const unsigned char *z, int *tokenType, bool *is_reserved) > +sql_token(const char *z, int *type, bool *is_reserved) 16. Please, consider my light refactoring. diff --git a/src/box/sql/tokenize.c b/src/box/sql/tokenize.c index 8df58e949..4cdd375ee 100644 --- a/src/box/sql/tokenize.c +++ b/src/box/sql/tokenize.c @@ -209,8 +209,7 @@ sql_token(const char *z, int *type, bool *is_reserved) return i; case CC_MINUS: if (z[1] == '-') { - for (i = 2; (c = z[i]) != 0 && c != '\n'; i++) { - } + for (i = 2; (c = z[i]) != 0 && c != '\n'; i++); *type = TK_SPACE; return i; } @@ -237,10 +236,8 @@ sql_token(const char *z, int *type, bool *is_reserved) return 1; } for (i = 3, c = z[2]; - (c != '*' || z[i] != '/') && (c = z[i]) != 0; - i++) { - } - if (c) + (c != '*' || z[i] != '/') && (c = z[i]) != 0; i++); + if (c != 0) i++; *type = TK_SPACE; return i; @@ -323,8 +320,7 @@ sql_token(const char *z, int *type, bool *is_reserved) } FALLTHROUGH; case CC_DOT: - if (!sqlite3Isdigit(z[1])) - { + if (! sqlite3Isdigit(z[1])) { *type = TK_DOT; return 1; } @@ -335,38 +331,32 @@ sql_token(const char *z, int *type, bool *is_reserved) FALLTHROUGH; case CC_DIGIT: *type = TK_INTEGER; - if (z[0] == '0' && (z[1] == 'x' || z[1] == 'X') - && sqlite3Isxdigit(z[2])) { - for (i = 3; sqlite3Isxdigit(z[i]); i++) { - } + if (z[0] == '0' && (z[1] == 'x' || z[1] == 'X') && + sqlite3Isxdigit(z[2])) { + for (i = 3; sqlite3Isxdigit(z[i]); i++); return i; } - for (i = 0; sqlite3Isdigit(z[i]); i++) { - } + for (i = 0; sqlite3Isdigit(z[i]); i++); if (z[i] == '.') { - i++; - while (sqlite3Isdigit(z[i])) - i++; + while (sqlite3Isdigit(z[++i])); *type = TK_FLOAT; } if ((z[i] == 'e' || z[i] == 'E') && (sqlite3Isdigit(z[i + 1]) - || ((z[i + 1] == '+' || z[i + 1] == '-') - && sqlite3Isdigit(z[i + 2])))) { - i += 2; - while (sqlite3Isdigit(z[i])) - i++; + || ((z[i + 1] == '+' || z[i + 1] == '-') && + sqlite3Isdigit(z[i + 2])))) { + ++i; + while (sqlite3Isdigit(z[++i])); *type = TK_FLOAT; } - while (IdChar(z[i])) { + if (IdChar(z[i])) { *type = TK_ILLEGAL; - i++; + while(IdChar(z[++i])); } return i; case CC_VARNUM: *type = TK_VARIABLE; - for (i = 1; sqlite3Isdigit(z[i]); i++) { - } + for (i = 1; sqlite3Isdigit(z[i]); i++); return i; case CC_DOLLAR: case CC_VARALPHA: @@ -383,9 +373,8 @@ sql_token(const char *z, int *type, bool *is_reserved) return i; case CC_KYWD: for (i = 1; sql_ascii_class[*(unsigned char*)(z+i)] <= CC_KYWD; - i++) { - } - if (!sql_is_space_char((const char *)z + i) && IdChar(z[i])) { + i++); + if (!sql_is_space_char(z + i) && IdChar(z[i])) { /* This token started out using characters * that can appear in keywords, but z[i] is * a character not allowed within keywords, @@ -395,18 +384,17 @@ sql_token(const char *z, int *type, bool *is_reserved) break; } *type = TK_ID; - return keywordCode((char *)z, i, type, is_reserved); + return keywordCode(z, i, type, is_reserved); case CC_X: if (z[1] == '\'') { *type = TK_BLOB; - for (i = 2; sqlite3Isxdigit(z[i]); i++) { - } + for (i = 2; sqlite3Isxdigit(z[i]); i++); if (z[i] != '\'' || i % 2) { *type = TK_ILLEGAL; - while (z[i] && z[i] != '\'') + while (z[i] != 0 && z[i] != '\'') i++; } - if (z[i]) + if (z[i] != 0) i++; return i; } 17. And the last comment. I see, that you use sql_is_space_char to detect if a char is a white space. But the caller code propagates the offset to 1 byte always. Regardless of the fact the white space symbol can occupy more than 1 byte. > if (!sql_is_space_char(z + i) && IdChar(z[i])) { > /* This token started out using characters > * that can appear in keywords, but z[i] is > * a character not allowed within keywords, > * so this must be an identifier instead. > */ > i++; ^^^^^^^^^ \|/ here --------+ > break; > } I propose to inline it and propagate 'i' on the value got from U8_NEXT_UNSAFE as offset.