[tarantool-patches] Re: [PATCH] sql: LIKE/LENGTH process '\0'

Tue Feb 26 16:33:01 MSK 2019

> On 25 Feb 2019, at 18:10, n.pettik <korablev at tarantool.org> wrote:
> 
> That’s I was talking about. But using the macros with the same
> name as in utf library doesn’t look like a good pattern. Yep, you
> can use define guards like:
> 
> #ifdef  U8_COUNT_TRAIL_BYTES_UNSAFE
> #undef U8_COUNT_TRAIL_BYTES_UNSAFE
> #endif
> #define U8_COUNT_TRAIL_BYTES_UNSAFE
> 
> But I’d rather just give it another name.
> Hence, taking into account comment below,
> we are going to substitute SQL_SKIP_UTF8() with
> implementation borrowed from icu library.
I changed the names to SQL_UTF8_FWD_1_UNSAFE and
SQL_UTF8_COUNT_TRAIL_BYTES_UNSAFE.
> 
>>>>> Furthermore, description says that it “assumes well-formed UTF-8”,
>>>>> which in our case is not true. So who knows what may happen if we pass
>>>>> malformed byte sequence. I am not even saying that behaviour of
>>>>> this function on invalid inputs may change later.
>>>> 
>>>> In it's current implementation U8_FWD_1_UNSAFE satisfy our needs safely. Returned
>>>> symbol length would never exceed byte_len.
>>>> 
>>>> static int
>>>> utf8_char_count(const unsigned char *str, int byte_len)
>>>> {
>>>> 	int symbol_count = 0;
>>>> 	for (int i = 0; i < byte_len;) {
>>>> 		U8_FWD_1_UNSAFE(str, i);
>>>> 		symbol_count++;
>>>> 	}
>>>> 	return symbol_count;
>>>> }
>>>> 
>>>> I agree that it is a bad idea to relay on lib behaviour which may
>>>> change lately. So maybe I would just inline these one line macros?
>>>> Or use my own implementation, since it’s more efficient (but less beautiful)
>>> 
>>> Nevermind, let's keep it as is.
>>> I really worry only about the fact that in other places SQL_SKIP_UTF8
>>> is used instead. It handles only two-bytes utf8 symbols, meanwhile
>>> U8_FWD_1_UNSAFE() accounts three and four bytes length symbols.
>>> Can we use everywhere the same pattern?
>> Yes, I think, we can.
> 
> Ok, then will be waiting for updates.
So here are the updates:

sqlUtf8CharLen is removed with utf8_char_count (which is
just moved to utf.c). sqlUtf8CharLen used SQL_SKIP_UTF8
and itself was used only in one place.

diff --git a/src/box/sql/utf.c b/src/box/sql/utf.c
index 22e0de809..5a1862a52 100644
--- a/src/box/sql/utf.c
+++ b/src/box/sql/utf.c
@@ -157,30 +157,31 @@ sqlUtf8Read(const unsigned char **pz	/* Pointer to string from which to read cha
  */
 /* #define TRANSLATE_TRACE 1 */
 
-/*
- * pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
- * return the number of unicode characters in pZ up to (but not including)
- * the first 0x00 byte. If nByte is not less than zero, return the
- * number of unicode characters in the first nByte of pZ (or up to
- * the first 0x00, whichever comes first).
+/**
+ * Return number of symbols in the given string.
+ *
+ * Number of symbols != byte size of string because some symbols
+ * are encoded with more than one byte. Also note that all
+ * symbols from 'str' to 'str + byte_len' would be counted,
+ * even if there is a '\0' somewhere between them.
+ *
+ * This function is implemented to be fast and indifferent to
+ * correctness of string being processed. If input string has
+ * even one invalid utf-8 sequence, then the resulting length
+ * could be arbitary in these boundaries (0 < len < byte_len).
+ * @param str String to be counted.
+ * @param byte_len Byte length of given string.
+ * @return number of symbols in the given string.
  */
 int
-sqlUtf8CharLen(const char *zIn, int nByte)
+utf8_char_count(const unsigned char *str, int byte_len)
 {
-	int r = 0;
-	const u8 *z = (const u8 *)zIn;
-	const u8 *zTerm;
-	if (nByte >= 0) {
-		zTerm = &z[nByte];
-	} else {
-		zTerm = (const u8 *)(-1);
-	}
-	assert(z <= zTerm);
-	while (*z != 0 && z < zTerm) {
-		SQL_SKIP_UTF8(z);
-		r++;
+	int symbol_count = 0;
+	for (int i = 0; i < byte_len;) {
+		SQL_UTF8_FWD_1_UNSAFE(str, i, byte_len);
+		symbol_count++;
 	}
-	return r;
+	return symbol_count;
 }


I inlined SQL_UTF8_FWD_1_UNSAFE and
SQL_UTF8_COUNT_TRAIL_BYTES_UNSAFE from icu
library, because different versions of
the library provide slightly different
behavior of UTF8_FWD_1_UNSAFE. It caused
sql-tap/badutf test to fail on some platforms,
while on the others it was working.

diff --git a/src/box/sql/sqlInt.h b/src/box/sql/sqlInt.h
index 2830ab639..859450e88 100644
--- a/src/box/sql/sqlInt.h
+++ b/src/box/sql/sqlInt.h
@@ -3071,16 +3071,6 @@ struct TreeView {
 };
 #endif				/* SQL_DEBUG */
 
-/*
- * Assuming zIn points to the first byte of a UTF-8 character,
- * advance zIn to point to the first byte of the next UTF-8 character.
- */
-#define SQL_SKIP_UTF8(zIn) {                        \
-  if( (*(zIn++))>=0xc0 ){                              \
-    while( (*zIn & 0xc0)==0x80 ){ zIn++; }             \
-  }                                                    \
-}
-
 /*
  * The sql_*_BKPT macros are substitutes for the error codes with
  * the same name but without the _BKPT suffix.  These macros invoke
@@ -4142,11 +4132,51 @@ void
 sql_drop_foreign_key(struct Parse *parse_context, struct SrcList *table,
 		     struct Token *constraint);
 
+/**
+ * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8
+ * sequence. Returns 0 for 0..0xc1. Undefined for 0xf5..0xff.
+ * leadByte might be evaluated multiple times.
+ *
+ * @param leadByte The first byte of a UTF-8 sequence.
+ *                 Must be 0..0xff.
+ */
+#define SQL_UTF8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
+    (((uint8_t)(leadByte) >= 0xc2) + ((uint8_t)(leadByte) >= 0xe0) + \
+    ((uint8_t)(leadByte) >= 0xf0))
+
+/**
+ * Advance the string offset from one code point boundary to the
+ * next. (Post-incrementing iteration.)
+ * "Unsafe" macro, assumes well-formed UTF-8.
+ *
+ * After the whole string is traversed, (str + i) points to the
+ * position right after the last element of the string (*).
+ *
+ * If resulting offset > byte_size then resulting offset is set
+ * to byte_size. This is to provide (*) in cases where it might
+ * be violated.
+ *
+ * SQL_UTF8_FWD_1_UNSAFE sometimes is used get the size of utf-8
+ * character subsequence and we don't want to get summary size
+ * which exceeds total string size (in bytes). Consider example:
+ *
+ * '0xE0' - this is invalid utf-8 string because it consists only
+ * of first byte of 3 byte sequence. After traverse, the
+ * offset == 2 and we set it to 1, to keep (*).
+ *
+ * @param s const uint8_t * string.
+ * @param i string offset.
+ * @param byte_size byte size of the string.
+ */
+#define SQL_UTF8_FWD_1_UNSAFE(str, i, byte_size) \
+    (i) += 1 + SQL_UTF8_COUNT_TRAIL_BYTES_UNSAFE((str)[i]); \
+    (i) = (i) <= (byte_size) ? (i) : (byte_size);
+
 void sqlDetach(Parse *, Expr *);
 int sqlAtoF(const char *z, double *, int);
 int sqlGetInt32(const char *, int *);
 int sqlAtoi(const char *);
-int sqlUtf8CharLen(const char *pData, int nByte);
+int utf8_char_count(const unsigned char *str, int byte_len);
 u32 sqlUtf8Read(const u8 **);
 LogEst sqlLogEst(u64);
 LogEst sqlLogEstAdd(LogEst, LogEst);



Just removed SQL_SKIP_UTF8 with SQL_UTF8_FWD_1_UNSAFE.

diff --git a/src/box/sql/func.c b/src/box/sql/func.c
index 8ddb9780f..5352ca6b8 100644
--- a/src/box/sql/func.c
+++ b/src/box/sql/func.c
@@ -128,33 +128,6 @@ typeofFunc(sql_context * context, int NotUsed, sql_value ** argv)
 	sql_result_text(context, z, -1, SQL_STATIC);
 }
 
-/**
- * Return number of symbols in the given string.
- *
- * Number of symbols != byte size of string because some symbols
- * are encoded with more than one byte. Also note that all
- * symbols from 'str' to 'str + byte_len' would be counted,
- * even if there is a '\0' somewhere between them.
- *
- * This function is implemented to be fast and indifferent to
- * correctness of string being processed. If input string has
- * even one invalid utf-8 sequence, then the resulting length
- * could be arbitary in these boundaries (0 < len < byte_len).
- * @param str String to be counted.
- * @param byte_len Byte length of given string.
- * @return number of symbols in the given string.
- */
-static int
-utf8_char_count(const unsigned char *str, int byte_len)
-{
-	int symbol_count = 0;
-	for (int i = 0; i < byte_len;) {
-		U8_FWD_1_UNSAFE(str, i);
-		symbol_count++;
-	}
-	return symbol_count;
-}
-
 /*
  * Implementation of the length() function
  */
@@ -413,17 +386,22 @@ substrFunc(sql_context * context, int argc, sql_value ** argv)
 		 * used because '\0' is not supposed to be
 		 * end-of-string symbol.
 		 */
-		int n_chars = utf8_char_count(z, sql_value_bytes(argv[0]));
+		int byte_size = sql_value_bytes(argv[0]);
+		int n_chars = utf8_char_count(z, byte_size);
 		int cnt = 0;
+		int i = 0;
 		while (cnt < n_chars && p1) {
-			SQL_SKIP_UTF8(z);
+			SQL_UTF8_FWD_1_UNSAFE(z, i, byte_size);
 			cnt++;
 			p1--;
 		}
+		z += i;
+		i = 0;
 		for (z2 = z; cnt < n_chars && p2; p2--) {
-			SQL_SKIP_UTF8(z2);
+			SQL_UTF8_FWD_1_UNSAFE(z2, i, byte_size);
 			cnt++;
 		}
+		z2 += i;
 		sql_result_text64(context, (char *)z, z2 - z,
 				      SQL_TRANSIENT);
 	} else {
@@ -890,7 +868,7 @@ likeFunc(sql_context *context, int argc, sql_value **argv)
 			return;
 		const char *const err_msg =
 			"ESCAPE expression must be a single character";
-		if (sqlUtf8CharLen((char *)zEsc, -1) != 1) {
+		if (utf8_char_count(zEsc, sql_value_bytes(argv[2])) != 1) {
 			sql_result_error(context, err_msg, -1);
 			return;
 		}
@@ -1268,8 +1246,6 @@ trimFunc(sql_context * context, int argc, sql_value ** argv)
 	} else {
 		const unsigned char *z = zCharSet;
 		int trim_set_sz = sql_value_bytes(argv[1]);
-		int handled_bytes_cnt = trim_set_sz;
-		nChar = 0;
 		/*
 		* Count the number of UTF-8 characters passing
 		* through the entire char set, but not up
@@ -1277,12 +1253,7 @@ trimFunc(sql_context * context, int argc, sql_value ** argv)
 		* to handle trimming set containing such
 		* characters.
 		*/
-		while(handled_bytes_cnt > 0) {
-		const unsigned char *prev_byte = z;
-			SQL_SKIP_UTF8(z);
-			handled_bytes_cnt -= (z - prev_byte);
-			nChar++;
-		}
+		nChar = utf8_char_count(z, trim_set_sz);
 		if (nChar > 0) {
 			azChar =
 			    contextMalloc(context,
@@ -1292,12 +1263,13 @@ trimFunc(sql_context * context, int argc, sql_value ** argv)
 			}
 			aLen = (unsigned char *)&azChar[nChar];
 			z = zCharSet;
+			i = 0;
 			nChar = 0;
-			handled_bytes_cnt = trim_set_sz;
+			int handled_bytes_cnt = trim_set_sz;
 			while(handled_bytes_cnt > 0) {
-				azChar[nChar] = (unsigned char *)z;
-				SQL_SKIP_UTF8(z);
-				aLen[nChar] = (u8) (z - azChar[nChar]);
+				azChar[nChar] = (unsigned char *)(z + i);
+				SQL_UTF8_FWD_1_UNSAFE(z, i, trim_set_sz);
+				aLen[nChar] = (u8) (z + i - azChar[nChar]);
 				handled_bytes_cnt -= aLen[nChar];
 				nChar++;
 			}