[tarantool-patches] Re: [PATCH] sql: LIKE/LENGTH process '\0'
i.koptelov
ivan.koptelov at tarantool.org
Tue Feb 26 16:33:01 MSK 2019
> On 25 Feb 2019, at 18:10, n.pettik <korablev at tarantool.org> wrote:
>
> That’s I was talking about. But using the macros with the same
> name as in utf library doesn’t look like a good pattern. Yep, you
> can use define guards like:
>
> #ifdef U8_COUNT_TRAIL_BYTES_UNSAFE
> #undef U8_COUNT_TRAIL_BYTES_UNSAFE
> #endif
> #define U8_COUNT_TRAIL_BYTES_UNSAFE
>
> But I’d rather just give it another name.
> Hence, taking into account comment below,
> we are going to substitute SQL_SKIP_UTF8() with
> implementation borrowed from icu library.
I changed the names to SQL_UTF8_FWD_1_UNSAFE and
SQL_UTF8_COUNT_TRAIL_BYTES_UNSAFE.
>
>>>>> Furthermore, description says that it “assumes well-formed UTF-8”,
>>>>> which in our case is not true. So who knows what may happen if we pass
>>>>> malformed byte sequence. I am not even saying that behaviour of
>>>>> this function on invalid inputs may change later.
>>>>
>>>> In it's current implementation U8_FWD_1_UNSAFE satisfy our needs safely. Returned
>>>> symbol length would never exceed byte_len.
>>>>
>>>> static int
>>>> utf8_char_count(const unsigned char *str, int byte_len)
>>>> {
>>>> int symbol_count = 0;
>>>> for (int i = 0; i < byte_len;) {
>>>> U8_FWD_1_UNSAFE(str, i);
>>>> symbol_count++;
>>>> }
>>>> return symbol_count;
>>>> }
>>>>
>>>> I agree that it is a bad idea to relay on lib behaviour which may
>>>> change lately. So maybe I would just inline these one line macros?
>>>> Or use my own implementation, since it’s more efficient (but less beautiful)
>>>
>>> Nevermind, let's keep it as is.
>>> I really worry only about the fact that in other places SQL_SKIP_UTF8
>>> is used instead. It handles only two-bytes utf8 symbols, meanwhile
>>> U8_FWD_1_UNSAFE() accounts three and four bytes length symbols.
>>> Can we use everywhere the same pattern?
>> Yes, I think, we can.
>
> Ok, then will be waiting for updates.
So here are the updates:
sqlUtf8CharLen is removed with utf8_char_count (which is
just moved to utf.c). sqlUtf8CharLen used SQL_SKIP_UTF8
and itself was used only in one place.
diff --git a/src/box/sql/utf.c b/src/box/sql/utf.c
index 22e0de809..5a1862a52 100644
--- a/src/box/sql/utf.c
+++ b/src/box/sql/utf.c
@@ -157,30 +157,31 @@ sqlUtf8Read(const unsigned char **pz /* Pointer to string from which to read cha
*/
/* #define TRANSLATE_TRACE 1 */
-/*
- * pZ is a UTF-8 encoded unicode string. If nByte is less than zero,
- * return the number of unicode characters in pZ up to (but not including)
- * the first 0x00 byte. If nByte is not less than zero, return the
- * number of unicode characters in the first nByte of pZ (or up to
- * the first 0x00, whichever comes first).
+/**
+ * Return number of symbols in the given string.
+ *
+ * Number of symbols != byte size of string because some symbols
+ * are encoded with more than one byte. Also note that all
+ * symbols from 'str' to 'str + byte_len' would be counted,
+ * even if there is a '\0' somewhere between them.
+ *
+ * This function is implemented to be fast and indifferent to
+ * correctness of string being processed. If input string has
+ * even one invalid utf-8 sequence, then the resulting length
+ * could be arbitary in these boundaries (0 < len < byte_len).
+ * @param str String to be counted.
+ * @param byte_len Byte length of given string.
+ * @return number of symbols in the given string.
*/
int
-sqlUtf8CharLen(const char *zIn, int nByte)
+utf8_char_count(const unsigned char *str, int byte_len)
{
- int r = 0;
- const u8 *z = (const u8 *)zIn;
- const u8 *zTerm;
- if (nByte >= 0) {
- zTerm = &z[nByte];
- } else {
- zTerm = (const u8 *)(-1);
- }
- assert(z <= zTerm);
- while (*z != 0 && z < zTerm) {
- SQL_SKIP_UTF8(z);
- r++;
+ int symbol_count = 0;
+ for (int i = 0; i < byte_len;) {
+ SQL_UTF8_FWD_1_UNSAFE(str, i, byte_len);
+ symbol_count++;
}
- return r;
+ return symbol_count;
}
I inlined SQL_UTF8_FWD_1_UNSAFE and
SQL_UTF8_COUNT_TRAIL_BYTES_UNSAFE from icu
library, because different versions of
the library provide slightly different
behavior of UTF8_FWD_1_UNSAFE. It caused
sql-tap/badutf test to fail on some platforms,
while on the others it was working.
diff --git a/src/box/sql/sqlInt.h b/src/box/sql/sqlInt.h
index 2830ab639..859450e88 100644
--- a/src/box/sql/sqlInt.h
+++ b/src/box/sql/sqlInt.h
@@ -3071,16 +3071,6 @@ struct TreeView {
};
#endif /* SQL_DEBUG */
-/*
- * Assuming zIn points to the first byte of a UTF-8 character,
- * advance zIn to point to the first byte of the next UTF-8 character.
- */
-#define SQL_SKIP_UTF8(zIn) { \
- if( (*(zIn++))>=0xc0 ){ \
- while( (*zIn & 0xc0)==0x80 ){ zIn++; } \
- } \
-}
-
/*
* The sql_*_BKPT macros are substitutes for the error codes with
* the same name but without the _BKPT suffix. These macros invoke
@@ -4142,11 +4132,51 @@ void
sql_drop_foreign_key(struct Parse *parse_context, struct SrcList *table,
struct Token *constraint);
+/**
+ * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8
+ * sequence. Returns 0 for 0..0xc1. Undefined for 0xf5..0xff.
+ * leadByte might be evaluated multiple times.
+ *
+ * @param leadByte The first byte of a UTF-8 sequence.
+ * Must be 0..0xff.
+ */
+#define SQL_UTF8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
+ (((uint8_t)(leadByte) >= 0xc2) + ((uint8_t)(leadByte) >= 0xe0) + \
+ ((uint8_t)(leadByte) >= 0xf0))
+
+/**
+ * Advance the string offset from one code point boundary to the
+ * next. (Post-incrementing iteration.)
+ * "Unsafe" macro, assumes well-formed UTF-8.
+ *
+ * After the whole string is traversed, (str + i) points to the
+ * position right after the last element of the string (*).
+ *
+ * If resulting offset > byte_size then resulting offset is set
+ * to byte_size. This is to provide (*) in cases where it might
+ * be violated.
+ *
+ * SQL_UTF8_FWD_1_UNSAFE sometimes is used get the size of utf-8
+ * character subsequence and we don't want to get summary size
+ * which exceeds total string size (in bytes). Consider example:
+ *
+ * '0xE0' - this is invalid utf-8 string because it consists only
+ * of first byte of 3 byte sequence. After traverse, the
+ * offset == 2 and we set it to 1, to keep (*).
+ *
+ * @param s const uint8_t * string.
+ * @param i string offset.
+ * @param byte_size byte size of the string.
+ */
+#define SQL_UTF8_FWD_1_UNSAFE(str, i, byte_size) \
+ (i) += 1 + SQL_UTF8_COUNT_TRAIL_BYTES_UNSAFE((str)[i]); \
+ (i) = (i) <= (byte_size) ? (i) : (byte_size);
+
void sqlDetach(Parse *, Expr *);
int sqlAtoF(const char *z, double *, int);
int sqlGetInt32(const char *, int *);
int sqlAtoi(const char *);
-int sqlUtf8CharLen(const char *pData, int nByte);
+int utf8_char_count(const unsigned char *str, int byte_len);
u32 sqlUtf8Read(const u8 **);
LogEst sqlLogEst(u64);
LogEst sqlLogEstAdd(LogEst, LogEst);
Just removed SQL_SKIP_UTF8 with SQL_UTF8_FWD_1_UNSAFE.
diff --git a/src/box/sql/func.c b/src/box/sql/func.c
index 8ddb9780f..5352ca6b8 100644
--- a/src/box/sql/func.c
+++ b/src/box/sql/func.c
@@ -128,33 +128,6 @@ typeofFunc(sql_context * context, int NotUsed, sql_value ** argv)
sql_result_text(context, z, -1, SQL_STATIC);
}
-/**
- * Return number of symbols in the given string.
- *
- * Number of symbols != byte size of string because some symbols
- * are encoded with more than one byte. Also note that all
- * symbols from 'str' to 'str + byte_len' would be counted,
- * even if there is a '\0' somewhere between them.
- *
- * This function is implemented to be fast and indifferent to
- * correctness of string being processed. If input string has
- * even one invalid utf-8 sequence, then the resulting length
- * could be arbitary in these boundaries (0 < len < byte_len).
- * @param str String to be counted.
- * @param byte_len Byte length of given string.
- * @return number of symbols in the given string.
- */
-static int
-utf8_char_count(const unsigned char *str, int byte_len)
-{
- int symbol_count = 0;
- for (int i = 0; i < byte_len;) {
- U8_FWD_1_UNSAFE(str, i);
- symbol_count++;
- }
- return symbol_count;
-}
-
/*
* Implementation of the length() function
*/
@@ -413,17 +386,22 @@ substrFunc(sql_context * context, int argc, sql_value ** argv)
* used because '\0' is not supposed to be
* end-of-string symbol.
*/
- int n_chars = utf8_char_count(z, sql_value_bytes(argv[0]));
+ int byte_size = sql_value_bytes(argv[0]);
+ int n_chars = utf8_char_count(z, byte_size);
int cnt = 0;
+ int i = 0;
while (cnt < n_chars && p1) {
- SQL_SKIP_UTF8(z);
+ SQL_UTF8_FWD_1_UNSAFE(z, i, byte_size);
cnt++;
p1--;
}
+ z += i;
+ i = 0;
for (z2 = z; cnt < n_chars && p2; p2--) {
- SQL_SKIP_UTF8(z2);
+ SQL_UTF8_FWD_1_UNSAFE(z2, i, byte_size);
cnt++;
}
+ z2 += i;
sql_result_text64(context, (char *)z, z2 - z,
SQL_TRANSIENT);
} else {
@@ -890,7 +868,7 @@ likeFunc(sql_context *context, int argc, sql_value **argv)
return;
const char *const err_msg =
"ESCAPE expression must be a single character";
- if (sqlUtf8CharLen((char *)zEsc, -1) != 1) {
+ if (utf8_char_count(zEsc, sql_value_bytes(argv[2])) != 1) {
sql_result_error(context, err_msg, -1);
return;
}
@@ -1268,8 +1246,6 @@ trimFunc(sql_context * context, int argc, sql_value ** argv)
} else {
const unsigned char *z = zCharSet;
int trim_set_sz = sql_value_bytes(argv[1]);
- int handled_bytes_cnt = trim_set_sz;
- nChar = 0;
/*
* Count the number of UTF-8 characters passing
* through the entire char set, but not up
@@ -1277,12 +1253,7 @@ trimFunc(sql_context * context, int argc, sql_value ** argv)
* to handle trimming set containing such
* characters.
*/
- while(handled_bytes_cnt > 0) {
- const unsigned char *prev_byte = z;
- SQL_SKIP_UTF8(z);
- handled_bytes_cnt -= (z - prev_byte);
- nChar++;
- }
+ nChar = utf8_char_count(z, trim_set_sz);
if (nChar > 0) {
azChar =
contextMalloc(context,
@@ -1292,12 +1263,13 @@ trimFunc(sql_context * context, int argc, sql_value ** argv)
}
aLen = (unsigned char *)&azChar[nChar];
z = zCharSet;
+ i = 0;
nChar = 0;
- handled_bytes_cnt = trim_set_sz;
+ int handled_bytes_cnt = trim_set_sz;
while(handled_bytes_cnt > 0) {
- azChar[nChar] = (unsigned char *)z;
- SQL_SKIP_UTF8(z);
- aLen[nChar] = (u8) (z - azChar[nChar]);
+ azChar[nChar] = (unsigned char *)(z + i);
+ SQL_UTF8_FWD_1_UNSAFE(z, i, trim_set_sz);
+ aLen[nChar] = (u8) (z + i - azChar[nChar]);
handled_bytes_cnt -= aLen[nChar];
nChar++;
}
More information about the Tarantool-patches
mailing list