[tarantool-patches] Re: [PATCH] sql: allow any space symbols to be a white space

Tue May 22 21:06:26 MSK 2018

Hello. Thanks for the patch! At first, no one test is passed on my
machine. I see infinite errors like this:

[012] Worker "012_box" cannot start tarantool server; the tasks will be ignored...
[012] The raised exception is '' of type '<class 'lib.tarantool_server.TarantoolStartError'>'.
[012] Worker "012_box" received the following error:
[012] Traceback (most recent call last):
[012]   File "/Users/v.shpilevoy/Work/Repositories/tarantool/test-run/lib/worker.py", line 211, in __init__
[012]     self.inspector = suite.start_server(self.server)
[012]   File "/Users/v.shpilevoy/Work/Repositories/tarantool/test-run/lib/test_suite.py", line 158, in start_server
[012]     server.deploy(silent=False)
[012]   File "/Users/v.shpilevoy/Work/Repositories/tarantool/test-run/lib/tarantool_server.py", line 510, in deploy
[012]     self.start(silent=silent, **kwargs)
[012]   File "/Users/v.shpilevoy/Work/Repositories/tarantool/test-run/lib/tarantool_server.py", line 578, in start
[012]     self.wait_until_started(wait_load)
[012]   File "/Users/v.shpilevoy/Work/Repositories/tarantool/test-run/lib/tarantool_server.py", line 745, in wait_until_started
[012]     msg, self.process if not self.gdb and not self.lldb else None)
[012]   File "/Users/v.shpilevoy/Work/Repositories/tarantool/test-run/lib/tarantool_server.py", line 211, in seek_wait
[012]     raise TarantoolStartError
[012] TarantoolStartError
[012]

On Travis I see the same.

At second, see 17 comments below. Most of them are about style, but a pair
of functional ones too.

On 22/05/2018 18:51, Kirill Yukhin wrote:
> Branch: https://github.com/tarantool/tarantool/tree/kyukhin/gh-2371-utf8-spaces
> Issue: https://github.com/tarantool/tarantool/issues/2371
> 
> ANSI SQL allows any of Unicode classes ZI, Zp or Zs to
> act as white space symbol. Allow this in lexical analyzer.
> Refactor lexical analyzer routine to follow Tarantool's
> coding style.
> Also, remove dead encoding: ABCDIC.

1. Maybe EBCDIC?

> 
> Closes #2371
> ---
>   src/box/sql/alter.c     |  13 +-
>   src/box/sql/complete.c  |  16 --
>   src/box/sql/func.c      |   5 -
>   src/box/sql/global.c    |  22 --
>   src/box/sql/sqliteInt.h |  45 ++--
>   src/box/sql/tokenize.c  | 631 +++++++++++++++++++++---------------------------
>   src/box/sql/util.c      |   5 -
>   src/box/sql/vdbetrace.c |   2 +-
>   src/box/sql/whereexpr.c |   4 -
>   9 files changed, 291 insertions(+), 452 deletions(-)
> 
> diff --git a/src/box/sql/alter.c b/src/box/sql/alter.c
> index c9c8f9b..f509d4e 100644
> --- a/src/box/sql/alter.c
> +++ b/src/box/sql/alter.c
> @@ -351,7 +351,7 @@ rename_table(sqlite3 *db, const char *sql_stmt, const char *table_name,
>   
>   	int token;
>   	Token old_name;
> -	unsigned char const *csr = (unsigned const char *)sql_stmt;
> +	char const *csr = sql_stmt;

2. const char *

>   	int len = 0;
>   	char *new_sql_stmt;
>   	bool unused;
> diff --git a/src/box/sql/complete.c b/src/box/sql/complete.c
> index 092d4fb..047e09e 100644
> --- a/src/box/sql/complete.c
> +++ b/src/box/sql/complete.c
> @@ -40,19 +40,6 @@
>   #include "sqliteInt.h"
>   #ifndef SQLITE_OMIT_COMPLETE
>   
> -/*
> - * This is defined in tokenize.c.  We just have to import the definition.
> - */
> -#ifndef SQLITE_AMALGAMATION
> -#ifdef SQLITE_ASCII
> -#define IdChar(C)  ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0)
> -#endif
> -#ifdef SQLITE_EBCDIC

3. I still can grep SQLITE_EBCDIC in tarantool/extra/mkkeywordhash.c. It is not
generated file. Same about SQLITE_ASCII.

> -extern const char sqlite3IsEbcdicIdChar[];
> -#define IdChar(C)  (((c=C)>=0x42 && sqlite3IsEbcdicIdChar[c-0x40]))
> -#endif
> -#endif				/* SQLITE_AMALGAMATION */
> -
>   /*
>    * Token types used by the sqlite3_complete() routine.  See the header
>    * comments on that procedure for additional information.
> diff --git a/src/box/sql/sqliteInt.h b/src/box/sql/sqliteInt.h
> index b3db468..f55c734 100644
> --- a/src/box/sql/sqliteInt.h
> +++ b/src/box/sql/sqliteInt.h
> @@ -36,6 +36,8 @@
>   #ifndef SQLITEINT_H
>   #define SQLITEINT_H
>   
> +#define IdChar(C)  ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0)

4. Why can not you leave it in sql/complete.c?
> @@ -4164,7 +4136,18 @@ extern int sqlite3PendingByte;
>   #endif
>   void sqlite3Reindex(Parse *, Token *, Token *);
>   void sqlite3AlterRenameTable(Parse *, SrcList *, Token *);
> -int sqlite3GetToken(const unsigned char *, int *, bool *);
> +
> +/**
> + * Return the length (in bytes) of the token that begins at z[0].
> + * Store the token type in *tokenType before returning.

5. No tokenType.

> + *
> + * @param z Input stream.
> + * @param[out] type Detected type of token.
> + * @param[out] is_reserved True if reserved word.
> + */
> +int
> +sql_token(const char *z, int *type, bool *is_reserved);
> +
>   void sqlite3NestedParse(Parse *, const char *, ...);
>   void sqlite3ExpirePreparedStatements(sqlite3 *);
>   int sqlite3CodeSubselect(Parse *, Expr *, int);
> diff --git a/src/box/sql/tokenize.c b/src/box/sql/tokenize.c
> index c77aa9b..8df58e9 100644
> --- a/src/box/sql/tokenize.c
> +++ b/src/box/sql/tokenize.c
> @@ -77,10 +81,9 @@
>   #define CC_DOT       26		/* '.' */
>   #define CC_ILLEGAL   27		/* Illegal character */
>   
> -static const unsigned char aiClass[] = {
> -#ifdef SQLITE_ASCII
> +static const char sql_ascii_class[] = {
>   /*       x0  x1  x2  x3  x4  x5  x6  x7  x8 x9  xa xb  xc xd xe  xf */
> -/* 0x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 7,  7, 27, 7, 7, 27, 27,
> +/* 0x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 7,  7, 7, 7, 7, 27, 27,

6. Why did you change 27 to 7 in xb column?

>   /* 1x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
>   /* 2x */ 7, 15, 9, 5, 4, 22, 24, 8, 17, 18, 21, 20, 23, 11, 26, 16,
>   /* 3x */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 19, 12, 14, 13, 6,
> @@ -96,63 +99,16 @@ static const unsigned char aiClass[] = {
>   /* Dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
>   /* Ex */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
>   /* Fx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
> -#endif
> -#ifdef SQLITE_EBCDIC
> -/*         x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xa  xb  xc  xd  xe  xf */
> -/* 0x */ 27, 27, 27, 27, 27, 7, 27, 27, 27, 27, 27, 27, 7, 7, 27,
> -	    27,
> -/* 1x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
> -/* 2x */ 27, 27, 27, 27, 27, 7, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
> -/* 3x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
> -/* 4x */ 7, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 26, 12, 17, 20, 10,
> -/* 5x */ 24, 27, 27, 27, 27, 27, 27, 27, 27, 27, 15, 4, 21, 18, 19, 27,
> -/* 6x */ 11, 16, 27, 27, 27, 27, 27, 27, 27, 27, 27, 23, 22, 1, 13, 6,
> -/* 7x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 8, 5, 5, 5, 8, 14, 8,
> -/* 8x */ 27, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27,
> -/* 9x */ 27, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27,
> -/* Ax */ 27, 25, 1, 1, 1, 1, 1, 0, 1, 1, 27, 27, 27, 27, 27, 27,
> -/* Bx */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 9, 27, 27, 27, 27, 27,
> -/* Cx */ 27, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27,
> -/* Dx */ 27, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 27, 27, 27, 27, 27,
> -/* Ex */ 27, 27, 1, 1, 1, 1, 1, 0, 1, 1, 27, 27, 27, 27, 27, 27,
> -/* Fx */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 27, 27, 27, 27, 27, 27,
> -#endif
>   };
>   
> -/*
> +/**
>    * The charMap() macro maps alphabetic characters (only) into their

7. Out of 66.

>    * lower-case ASCII equivalent.  On ASCII machines, this is just
> - * an upper-to-lower case map.  On EBCDIC machines we also need
> - * to adjust the encoding.  The mapping is only valid for alphabetics
> - * which are the only characters for which this feature is used.
> + * an upper-to-lower case map.
>    *
>    * Used by keywordhash.h
>    */
> -#ifdef SQLITE_ASCII
>   #define charMap(X) sqlite3UpperToLower[(unsigned char)X]
> -#endif
> -#ifdef SQLITE_EBCDIC
> -#define charMap(X) ebcdicToAscii[(unsigned char)X]
> -const unsigned char ebcdicToAscii[] = {
> -/* 0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F */
> -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x */
> -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 1x */
> -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 2x */
> -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 3x */
> -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 4x */
> -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 5x */
> -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 95, 0, 0,	/* 6x */
> -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 7x */
> -	0, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0,	/* 8x */
> -	0, 106, 107, 108, 109, 110, 111, 112, 113, 114, 0, 0, 0, 0, 0, 0,	/* 9x */
> -	0, 0, 115, 116, 117, 118, 119, 120, 121, 122, 0, 0, 0, 0, 0, 0,	/* Ax */
> -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* Bx */
> -	0, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0,	/* Cx */
> -	0, 106, 107, 108, 109, 110, 111, 112, 113, 114, 0, 0, 0, 0, 0, 0,	/* Dx */
> -	0, 0, 115, 116, 117, 118, 119, 120, 121, 122, 0, 0, 0, 0, 0, 0,	/* Ex */
> -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* Fx */
> -};
> -#endif
>   
>   /*
>    * The sqlite3KeywordCode function looks up an identifier to determine if
> @@ -167,360 +123,313 @@ const unsigned char ebcdicToAscii[] = {
>    */
>   #include "keywordhash.h"
>   
> -/*
> +/**
>    * If X is a character that can be used in an identifier then
> - * IdChar(X) will be true.  Otherwise it is false.
> + * sql_id_char(X) will be true.  Otherwise it is false.
>    *
> - * For ASCII, any character with the high-order bit set is
> + * Any character with the high-order bit set is
>    * allowed in an identifier.  For 7-bit characters,
>    * sqlite3IsIdChar[X] must be 1.

8. No sqlite3IsIdChar.

9. Now I see two identical implementations of this macro:
sql_id_char here and IdChar in sqliteInt.h. And sql_id_char looks
to be unused.

>    *
> - * For EBCDIC, the rules are more complex but have the same
> - * end result.
> - *
> - * Ticket #1066.  the SQL standard does not allow '$' in the
> + * The SQL standard does not allow '$' in the
>    * middle of identifiers.  But many SQL implementations do.
>    * SQLite will allow '$' in identifiers for compatibility.
>    * But the feature is undocumented.
> + *

10. Trailing white space.

> + * @param C Character to classify.
>    */
> -#ifdef SQLITE_ASCII
> -#define IdChar(C)  ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0)
> -#endif
> -#ifdef SQLITE_EBCDIC
> -const char sqlite3IsEbcdicIdChar[] = {
> -/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
> -	0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,	/* 4x */
> -	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,	/* 5x */
> -	0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,	/* 6x */
> -	0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,	/* 7x */
> -	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,	/* 8x */
> -	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,	/* 9x */
> -	1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,	/* Ax */
> -	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* Bx */
> -	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,	/* Cx */
> -	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,	/* Dx */
> -	0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,	/* Ex */
> -	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,	/* Fx */
> -};
> +#define sql_id_char(C)  ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0)
>   
> -#define IdChar(C)  (((c=C)>=0x42 && sqlite3IsEbcdicIdChar[c-0x40]))
> -#endif
> +/**
> + * Return true if current symbol is space.
> + *
> + * @param z Input stream.
> + * @retval True if current symbl space.

11. symbol.

> + */
> +static inline bool
> +sql_is_space_char(const char *z)
> +{
> +	if (sqlite3Isspace(z[0]))
> +		return true;
> +	if (sqlite3CtypeMap[*(unsigned char*)z] & 0x40) {

12. Why 0x40?

> +		UChar32 c;
> +		int unused = 0;
> +		U8_NEXT(z, unused, INT32_MAX, c);
> +		if (u_isspace(c))
> +			return true;
> +	}
> +	return false;
> +}
>   
> -/*
> - * Return the length (in bytes) of the token that begins at z[0].
> - * Store the token type in *tokenType before returning.
> +/**
> + * Calculate length of continuous sequence of
> + * space symbols.
> + *
> + * @param z Input stream.
> + * @retval Number of bytes which constitute sequence of spaces.
> + *         Can be 0 if first symbol in stram is not space.
>    */
> +static inline int
> +sql_skip_spaces(const char *z)
> +{
> +	int idx = 0;
> +	while (true) {
> +		if (sqlite3Isspace(z[idx])) {
> +			idx += 1;
> +		} else if ((sqlite3CtypeMap[*(unsigned char *)(z + idx)] & 0x40)

13. Already second place with 0x40. What is it? Maybe you should create a separate
macro like it is done here?

#define sqlite3Isspace(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x01)
#define sqlite3Isalnum(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x06)
#define sqlite3Isalpha(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x02)
#define sqlite3Isdigit(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x04)
#define sqlite3Isxdigit(x)  (sqlite3CtypeMap[(unsigned char)(x)]&0x08)
...

> +			   != 0) {
> +			UChar32 c;
> +			int prev_offset = idx;
> +			U8_NEXT(z, idx, INT32_MAX, c);

14. If you do not check for errors and don't know a length,
you can use U8_NEXT_UNSAFE. It is much shorter. Same about U8_NEXT
in sql_is_space_char I think.

> +			if (!u_isspace(c)) {
> +				idx = prev_offset;

15. In most cases you will have one white space, so maybe it is faster
to make this:
int new_offset = idx;
U8_NEXT(z, new_offset, INT32_MAX, c);
if (! u_isspace(c))
	break;
idx = new_offset;

Here you have not to reset the offset in most cases.

> +				break;
> +			}
> +		} else {
> +			break;
> +		}
> +	}
> +	return idx;
> +}
> +
>   int
> -sqlite3GetToken(const unsigned char *z, int *tokenType, bool *is_reserved)
> +sql_token(const char *z, int *type, bool *is_reserved)

16. Please, consider my light refactoring.

diff --git a/src/box/sql/tokenize.c b/src/box/sql/tokenize.c
index 8df58e949..4cdd375ee 100644
--- a/src/box/sql/tokenize.c
+++ b/src/box/sql/tokenize.c
@@ -209,8 +209,7 @@ sql_token(const char *z, int *type, bool *is_reserved)
  		return i;
  	case CC_MINUS:
  		if (z[1] == '-') {
-			for (i = 2; (c = z[i]) != 0 && c != '\n'; i++) {
-			}
+			for (i = 2; (c = z[i]) != 0 && c != '\n'; i++);
  			*type = TK_SPACE;
  			return i;
  		}
@@ -237,10 +236,8 @@ sql_token(const char *z, int *type, bool *is_reserved)
  			return 1;
  		}
  		for (i = 3, c = z[2];
-		     (c != '*' || z[i] != '/') && (c = z[i]) != 0;
-		     i++) {
-		}
-		if (c)
+		     (c != '*' || z[i] != '/') && (c = z[i]) != 0; i++);
+		if (c != 0)
  			i++;
  		*type = TK_SPACE;
  		return i;
@@ -323,8 +320,7 @@ sql_token(const char *z, int *type, bool *is_reserved)
  		}
  		FALLTHROUGH;
  	case CC_DOT:
-		if (!sqlite3Isdigit(z[1]))
-		{
+		if (! sqlite3Isdigit(z[1])) {
  			*type = TK_DOT;
  			return 1;
  		}
@@ -335,38 +331,32 @@ sql_token(const char *z, int *type, bool *is_reserved)
  		FALLTHROUGH;
  	case CC_DIGIT:
  		*type = TK_INTEGER;
-		if (z[0] == '0' && (z[1] == 'x' || z[1] == 'X')
-		    && sqlite3Isxdigit(z[2])) {
-			for (i = 3; sqlite3Isxdigit(z[i]); i++) {
-			}
+		if (z[0] == '0' && (z[1] == 'x' || z[1] == 'X') &&
+		    sqlite3Isxdigit(z[2])) {
+			for (i = 3; sqlite3Isxdigit(z[i]); i++);
  			return i;
  		}
-		for (i = 0; sqlite3Isdigit(z[i]); i++) {
-		}
+		for (i = 0; sqlite3Isdigit(z[i]); i++);
  		if (z[i] == '.') {
-			i++;
-			while (sqlite3Isdigit(z[i]))
-				i++;
+			while (sqlite3Isdigit(z[++i]));
  			*type = TK_FLOAT;
  		}
  		if ((z[i] == 'e' || z[i] == 'E') &&
  		    (sqlite3Isdigit(z[i + 1])
-		     || ((z[i + 1] == '+' || z[i + 1] == '-')
-			 && sqlite3Isdigit(z[i + 2])))) {
-			i += 2;
-			while (sqlite3Isdigit(z[i]))
-				i++;
+		     || ((z[i + 1] == '+' || z[i + 1] == '-') &&
+			 sqlite3Isdigit(z[i + 2])))) {
+			++i;
+			while (sqlite3Isdigit(z[++i]));
  			*type = TK_FLOAT;
  		}
-		while (IdChar(z[i])) {
+		if (IdChar(z[i])) {
  			*type = TK_ILLEGAL;
-			i++;
+			while(IdChar(z[++i]));
  		}
  		return i;
  	case CC_VARNUM:
  		*type = TK_VARIABLE;
-		for (i = 1; sqlite3Isdigit(z[i]); i++) {
-		}
+		for (i = 1; sqlite3Isdigit(z[i]); i++);
  		return i;
  	case CC_DOLLAR:
  	case CC_VARALPHA:
@@ -383,9 +373,8 @@ sql_token(const char *z, int *type, bool *is_reserved)
  		return i;
  	case CC_KYWD:
  		for (i = 1; sql_ascii_class[*(unsigned char*)(z+i)] <= CC_KYWD;
-		     i++) {
-		}
-		if (!sql_is_space_char((const char *)z + i) && IdChar(z[i])) {
+		     i++);
+		if (!sql_is_space_char(z + i) && IdChar(z[i])) {
  			/* This token started out using characters
  			 * that can appear in keywords, but z[i] is
  			 * a character not allowed within keywords,
@@ -395,18 +384,17 @@ sql_token(const char *z, int *type, bool *is_reserved)
  			break;
  		}
  		*type = TK_ID;
-		return keywordCode((char *)z, i, type, is_reserved);
+		return keywordCode(z, i, type, is_reserved);
  	case CC_X:
  		if (z[1] == '\'') {
  			*type = TK_BLOB;
-			for (i = 2; sqlite3Isxdigit(z[i]); i++) {
-			}
+			for (i = 2; sqlite3Isxdigit(z[i]); i++);
  			if (z[i] != '\'' || i % 2) {
  				*type = TK_ILLEGAL;
-				while (z[i] && z[i] != '\'')
+				while (z[i] != 0 && z[i] != '\'')
  					i++;
  			}
-			if (z[i])
+			if (z[i] != 0)
  				i++;
  			return i;
  		}


17. And the last comment. I see, that you use sql_is_space_char to
detect if a char is a white space. But the caller code propagates the
offset to 1 byte always. Regardless of the fact the white space symbol
can occupy more than 1 byte.

> 		if (!sql_is_space_char(z + i) && IdChar(z[i])) {
> 			/* This token started out using characters
> 			 * that can appear in keywords, but z[i] is
> 			 * a character not allowed within keywords,
> 			 * so this must be an identifier instead.
> 			 */
> 			i++;
                       ^^^^^^^^^
                          \|/
              here --------+

> 			break;
> 		}
I propose to inline it and propagate 'i' on the value got from
U8_NEXT_UNSAFE as offset.