[Tarantool-patches] [PATCH v1 6/8] sql: rework POSITION() function

Mergen Imeev imeevma at tarantool.org
Mon Nov 1 13:41:11 MSK 2021


Thank you for the review! I replaced ucnv_getNextUChar() by U8_NEXT(). Diff and
new patch below.

On Wed, Oct 20, 2021 at 08:08:24PM +0300, Mergen Imeev via Tarantool-patches wrote:
<cut>

Diff:

diff --git a/src/box/sql/func.c b/src/box/sql/func.c
index 600366b36..7914d2ec7 100644
--- a/src/box/sql/func.c
+++ b/src/box/sql/func.c
@@ -545,28 +545,31 @@ func_position_characters(struct sql_context *ctx, int argc, struct Mem *argv)
 	if (key_size <= 0)
 		return mem_set_uint(ctx->pOut, 1);
 
-	UErrorCode err = U_ZERO_ERROR;
-	const char *pos = str;
-	const char *cur = str;
-	const char *end = str + str_size;
-	const char *tmp_pos = key;
-	const char *tmp_end = key + key_size;
-	assert(icu_utf8_conv != NULL);
-	while (tmp_pos < tmp_end && err == U_ZERO_ERROR) {
-		ucnv_getNextUChar(icu_utf8_conv, &tmp_pos, tmp_end, &err);
-		ucnv_getNextUChar(icu_utf8_conv, &cur, end, &err);
-	}
-
-	int i = 0;
-	while (err == U_ZERO_ERROR) {
-		struct coll *coll = ctx->coll;
-		if (coll->cmp(key, key_size, pos, cur - pos, coll) == 0)
-			return mem_set_uint(ctx->pOut, i + 1);
-		ucnv_getNextUChar(icu_utf8_conv, &pos, end, &err);
-		ucnv_getNextUChar(icu_utf8_conv, &cur, end, &err);
+	int key_end = 0;
+	int str_end = 0;
+	while (key_end < key_size && str_end < str_size) {
+		UChar32 c;
+		U8_NEXT((uint8_t *)key, key_end, key_size, c);
+		U8_NEXT((uint8_t *)str, str_end, str_size, c);
+	}
+	if (key_end < key_size)
+		return mem_set_uint(ctx->pOut, 0);
+
+	struct coll *coll = ctx->coll;
+	if (coll->cmp(key, key_size, str, str_end, coll) == 0)
+		return mem_set_uint(ctx->pOut, 1);
+
+	int i = 2;
+	int str_pos = 0;
+	while (str_end < str_size) {
+		UChar32 c;
+		U8_NEXT((uint8_t *)str, str_pos, str_size, c);
+		U8_NEXT((uint8_t *)str, str_end, str_size, c);
+		const char *s = str + str_pos;
+		if (coll->cmp(key, key_size, s, str_end - str_pos, coll) == 0)
+			return mem_set_uint(ctx->pOut, i);
 		++i;
 	}
-	assert(err == U_INDEX_OUTOFBOUNDS_ERROR && cur == end);
 	return mem_set_uint(ctx->pOut, 0);
 }
 


New patch:

commit cb4bcb4bc7da6a80c040cf0864b7bf538af02aaa
Author: Mergen Imeev <imeevma at gmail.com>
Date:   Wed Sep 22 14:36:40 2021 +0300

    sql: rework POSITION() function
    
    This patch is a refactoring of POSITION(). In addition, VARBINARY
    arguments can now be used in this function. In addition, POSITION() now
    uses ICU functions instead of self-created.
    
    Part of #4145

diff --git a/src/box/sql/func.c b/src/box/sql/func.c
index ba6b9246d..7914d2ec7 100644
--- a/src/box/sql/func.c
+++ b/src/box/sql/func.c
@@ -509,6 +509,70 @@ func_trim_str(struct sql_context *ctx, int argc, struct Mem *argv)
 		ctx->is_aborted = true;
 }
 
+/** Implementation of the POSITION() function. */
+static void
+func_position_octets(struct sql_context *ctx, int argc, struct Mem *argv)
+{
+	assert(argc == 2);
+	(void)argc;
+	if (mem_is_any_null(&argv[0], &argv[1]))
+		return;
+	assert(mem_is_bytes(&argv[0]) && mem_is_bytes(&argv[1]));
+
+	const char *key = argv[0].z;
+	const char *str = argv[1].z;
+	int key_size = argv[0].n;
+	int str_size = argv[1].n;
+	if (key_size <= 0)
+		return mem_set_uint(ctx->pOut, 1);
+	const char *pos = memmem(str, str_size, key, key_size);
+	return mem_set_uint(ctx->pOut, pos == NULL ? 0 : pos - str + 1);
+}
+
+static void
+func_position_characters(struct sql_context *ctx, int argc, struct Mem *argv)
+{
+	assert(argc == 2);
+	(void)argc;
+	if (mem_is_any_null(&argv[0], &argv[1]))
+		return;
+	assert(mem_is_str(&argv[0]) && mem_is_str(&argv[1]));
+
+	const char *key = argv[0].z;
+	const char *str = argv[1].z;
+	int key_size = argv[0].n;
+	int str_size = argv[1].n;
+	if (key_size <= 0)
+		return mem_set_uint(ctx->pOut, 1);
+
+	int key_end = 0;
+	int str_end = 0;
+	while (key_end < key_size && str_end < str_size) {
+		UChar32 c;
+		U8_NEXT((uint8_t *)key, key_end, key_size, c);
+		U8_NEXT((uint8_t *)str, str_end, str_size, c);
+	}
+	if (key_end < key_size)
+		return mem_set_uint(ctx->pOut, 0);
+
+	struct coll *coll = ctx->coll;
+	if (coll->cmp(key, key_size, str, str_end, coll) == 0)
+		return mem_set_uint(ctx->pOut, 1);
+
+	int i = 2;
+	int str_pos = 0;
+	while (str_end < str_size) {
+		UChar32 c;
+		U8_NEXT((uint8_t *)str, str_pos, str_size, c);
+		U8_NEXT((uint8_t *)str, str_end, str_size, c);
+		const char *s = str + str_pos;
+		if (coll->cmp(key, key_size, s, str_end - str_pos, coll) == 0)
+			return mem_set_uint(ctx->pOut, i);
+		++i;
+	}
+	return mem_set_uint(ctx->pOut, 0);
+}
+
 static const unsigned char *
 mem_as_ustr(struct Mem *mem)
 {
@@ -672,141 +736,6 @@ lengthFunc(struct sql_context *context, int argc, struct Mem *argv)
 	}
 }
 
-/**
- * Implementation of the position() function.
- *
- * position(needle, haystack) finds the first occurrence of needle
- * in haystack and returns the number of previous characters
- * plus 1, or 0 if needle does not occur within haystack.
- *
- * If both haystack and needle are BLOBs, then the result is one
- * more than the number of bytes in haystack prior to the first
- * occurrence of needle, or 0 if needle never occurs in haystack.
- */
-static void
-position_func(struct sql_context *context, int argc, struct Mem *argv)
-{
-	UNUSED_PARAMETER(argc);
-	struct Mem *needle = &argv[0];
-	struct Mem *haystack = &argv[1];
-	enum mp_type needle_type = sql_value_type(needle);
-	enum mp_type haystack_type = sql_value_type(haystack);
-
-	if (haystack_type == MP_NIL || needle_type == MP_NIL)
-		return;
-	/*
-	 * Position function can be called only with string
-	 * or blob params.
-	 */
-	struct Mem *inconsistent_type_arg = NULL;
-	if (needle_type != MP_STR && needle_type != MP_BIN)
-		inconsistent_type_arg = needle;
-	if (haystack_type != MP_STR && haystack_type != MP_BIN)
-		inconsistent_type_arg = haystack;
-	if (inconsistent_type_arg != NULL) {
-		diag_set(ClientError, ER_INCONSISTENT_TYPES,
-			 "string or varbinary", mem_str(inconsistent_type_arg));
-		context->is_aborted = true;
-		return;
-	}
-	/*
-	 * Both params of Position function must be of the same
-	 * type.
-	 */
-	if (haystack_type != needle_type) {
-		diag_set(ClientError, ER_INCONSISTENT_TYPES,
-			 mem_type_to_str(needle), mem_str(haystack));
-		context->is_aborted = true;
-		return;
-	}
-
-	int n_needle_bytes = mem_len_unsafe(needle);
-	int n_haystack_bytes = mem_len_unsafe(haystack);
-	int position = 1;
-	if (n_needle_bytes > 0) {
-		const unsigned char *haystack_str;
-		const unsigned char *needle_str;
-		if (haystack_type == MP_BIN) {
-			needle_str = mem_as_bin(needle);
-			haystack_str = mem_as_bin(haystack);
-			assert(needle_str != NULL);
-			assert(haystack_str != NULL || n_haystack_bytes == 0);
-			/*
-			 * Naive implementation of substring
-			 * searching: matching time O(n * m).
-			 * Can be improved.
-			 */
-			while (n_needle_bytes <= n_haystack_bytes &&
-			       memcmp(haystack_str, needle_str, n_needle_bytes) != 0) {
-				position++;
-				n_haystack_bytes--;
-				haystack_str++;
-			}
-			if (n_needle_bytes > n_haystack_bytes)
-				position = 0;
-		} else {
-			/*
-			 * Code below handles not only simple
-			 * cases like position('a', 'bca'), but
-			 * also more complex ones:
-			 * position('a', 'bcá' COLLATE "unicode_ci")
-			 * To do so, we need to use comparison
-			 * window, which has constant character
-			 * size, but variable byte size.
-			 * Character size is equal to
-			 * needle char size.
-			 */
-			haystack_str = mem_as_ustr(haystack);
-			needle_str = mem_as_ustr(needle);
-
-			int n_needle_chars =
-				sql_utf8_char_count(needle_str, n_needle_bytes);
-			int n_haystack_chars =
-				sql_utf8_char_count(haystack_str,
-						    n_haystack_bytes);
-
-			if (n_haystack_chars < n_needle_chars) {
-				position = 0;
-				goto finish;
-			}
-			/*
-			 * Comparison window is determined by
-			 * beg_offset and end_offset. beg_offset
-			 * is offset in bytes from haystack
-			 * beginning to window beginning.
-			 * end_offset is offset in bytes from
-			 * haystack beginning to window end.
-			 */
-			int end_offset = 0;
-			for (int c = 0; c < n_needle_chars; c++) {
-				SQL_UTF8_FWD_1(haystack_str, end_offset,
-					       n_haystack_bytes);
-			}
-			int beg_offset = 0;
-			struct coll *coll = context->coll;
-			int c;
-			for (c = 0; c + n_needle_chars <= n_haystack_chars; c++) {
-				if (coll->cmp((const char *) haystack_str + beg_offset,
-					      end_offset - beg_offset,
-					      (const char *) needle_str,
-					      n_needle_bytes, coll) == 0)
-					goto finish;
-				position++;
-				/* Update offsets. */
-				SQL_UTF8_FWD_1(haystack_str, beg_offset,
-					       n_haystack_bytes);
-				SQL_UTF8_FWD_1(haystack_str, end_offset,
-					       n_haystack_bytes);
-			}
-			/* Needle was not found in the haystack. */
-			position = 0;
-		}
-	}
-finish:
-	assert(position >= 0);
-	sql_result_uint(context, position);
-}
-
 /*
  * Implementation of the printf() function.
  */
@@ -1982,7 +1911,9 @@ static struct sql_func_definition definitions[] = {
 	{"NULLIF", 2, {FIELD_TYPE_ANY, FIELD_TYPE_ANY}, FIELD_TYPE_SCALAR,
 	 func_nullif, NULL},
 	{"POSITION", 2, {FIELD_TYPE_STRING, FIELD_TYPE_STRING},
-	 FIELD_TYPE_INTEGER, position_func, NULL},
+	 FIELD_TYPE_INTEGER, func_position_characters, NULL},
+	{"POSITION", 2, {FIELD_TYPE_VARBINARY, FIELD_TYPE_VARBINARY},
+	 FIELD_TYPE_INTEGER, func_position_octets, NULL},
 	{"PRINTF", -1, {FIELD_TYPE_ANY}, FIELD_TYPE_STRING, printfFunc, 
 	 NULL},
 	{"QUOTE", 1, {FIELD_TYPE_ANY}, FIELD_TYPE_STRING, quoteFunc, NULL},
diff --git a/test/sql-tap/position.test.lua b/test/sql-tap/position.test.lua
index 6a96ed9bc..e49f4665a 100755
--- a/test/sql-tap/position.test.lua
+++ b/test/sql-tap/position.test.lua
@@ -1,6 +1,6 @@
 #!/usr/bin/env tarantool
 local test = require("sqltester")
-test:plan(80)
+test:plan(81)
 
 test:do_test(
     "position-1.1",
@@ -305,130 +305,130 @@ test:do_test(
 test:do_test(
     "position-1.31",
     function()
-        return test:catchsql "SELECT position(x'01', x'0102030405');"
+        return test:execsql "SELECT position(x'01', x'0102030405');"
     end, {
         -- <position-1.31>
-        1, "Failed to execute SQL statement: wrong arguments for function POSITION()"
+        1
         -- </position-1.31>
     })
 
 test:do_test(
     "position-1.32",
     function()
-        return test:catchsql "SELECT position(x'02', x'0102030405');"
+        return test:execsql "SELECT position(x'02', x'0102030405');"
     end, {
         -- <position-1.32>
-        1, "Failed to execute SQL statement: wrong arguments for function POSITION()"
+        2
         -- </position-1.32>
     })
 
 test:do_test(
     "position-1.33",
     function()
-        return test:catchsql "SELECT position(x'03', x'0102030405');"
+        return test:execsql "SELECT position(x'03', x'0102030405');"
     end, {
         -- <position-1.33>
-        1, "Failed to execute SQL statement: wrong arguments for function POSITION()"
+        3
         -- </position-1.33>
     })
 
 test:do_test(
     "position-1.34",
     function()
-        return test:catchsql "SELECT position(x'04', x'0102030405');"
+        return test:execsql "SELECT position(x'04', x'0102030405');"
     end, {
         -- <position-1.34>
-        1, "Failed to execute SQL statement: wrong arguments for function POSITION()"
+        4
         -- </position-1.34>
     })
 
 test:do_test(
     "position-1.35",
     function()
-        return test:catchsql "SELECT position(x'05', x'0102030405');"
+        return test:execsql "SELECT position(x'05', x'0102030405');"
     end, {
         -- <position-1.35>
-        1, "Failed to execute SQL statement: wrong arguments for function POSITION()"
+        5
         -- </position-1.35>
     })
 
 test:do_test(
     "position-1.36",
     function()
-        return test:catchsql "SELECT position(x'06', x'0102030405');"
+        return test:execsql "SELECT position(x'06', x'0102030405');"
     end, {
         -- <position-1.36>
-        1, "Failed to execute SQL statement: wrong arguments for function POSITION()"
+        0
         -- </position-1.36>
     })
 
 test:do_test(
     "position-1.37",
     function()
-        return test:catchsql "SELECT position(x'0102030405', x'0102030405');"
+        return test:execsql "SELECT position(x'0102030405', x'0102030405');"
     end, {
         -- <position-1.37>
-        1, "Failed to execute SQL statement: wrong arguments for function POSITION()"
+        1
         -- </position-1.37>
     })
 
 test:do_test(
     "position-1.38",
     function()
-        return test:catchsql "SELECT position(x'02030405', x'0102030405');"
+        return test:execsql "SELECT position(x'02030405', x'0102030405');"
     end, {
         -- <position-1.38>
-        1, "Failed to execute SQL statement: wrong arguments for function POSITION()"
+        2
         -- </position-1.38>
     })
 
 test:do_test(
     "position-1.39",
     function()
-        return test:catchsql "SELECT position(x'030405', x'0102030405');"
+        return test:execsql "SELECT position(x'030405', x'0102030405');"
     end, {
         -- <position-1.39>
-        1, "Failed to execute SQL statement: wrong arguments for function POSITION()"
+        3
         -- </position-1.39>
     })
 
 test:do_test(
     "position-1.40",
     function()
-        return test:catchsql "SELECT position(x'0405', x'0102030405');"
+        return test:execsql "SELECT position(x'0405', x'0102030405');"
     end, {
         -- <position-1.40>
-        1, "Failed to execute SQL statement: wrong arguments for function POSITION()"
+        4
         -- </position-1.40>
     })
 
 test:do_test(
     "position-1.41",
     function()
-        return test:catchsql "SELECT position(x'0506', x'0102030405');"
+        return test:execsql "SELECT position(x'0506', x'0102030405');"
     end, {
         -- <position-1.41>
-        1, "Failed to execute SQL statement: wrong arguments for function POSITION()"
+        0
         -- </position-1.41>
     })
 
 test:do_test(
     "position-1.42",
     function()
-        return test:catchsql "SELECT position(x'', x'0102030405');"
+        return test:execsql "SELECT position(x'', x'0102030405');"
     end, {
         -- <position-1.42>
-        1, "Failed to execute SQL statement: wrong arguments for function POSITION()"
+        1
         -- </position-1.42>
     })
 
 test:do_test(
     "position-1.43",
     function()
-        return test:catchsql "SELECT position(x'', x'');"
+        return test:execsql "SELECT position(x'', x'');"
     end, {
         -- <position-1.43>
-        1, "Failed to execute SQL statement: wrong arguments for function POSITION()"
+        1
         -- </position-1.43>
     })
 
@@ -571,40 +571,40 @@ test:do_test(
 test:do_test(
     "position-1.56.1",
     function()
-        return test:catchsql "SELECT position(x'79', x'78c3a4e282ac79');"
+        return test:execsql "SELECT position(x'79', x'78c3a4e282ac79');"
     end, {
         -- <position-1.56.1>
-        1, "Failed to execute SQL statement: wrong arguments for function POSITION()"
+        7
         -- </position-1.56.1>
     })
 
 test:do_test(
     "position-1.56.2",
     function()
-        return test:catchsql "SELECT position(x'7a', x'78c3a4e282ac79');"
+        return test:execsql "SELECT position(x'7a', x'78c3a4e282ac79');"
     end, {
         -- <position-1.56.2>
-        1, "Failed to execute SQL statement: wrong arguments for function POSITION()"
+        0
         -- </position-1.56.2>
     })
 
 test:do_test(
     "position-1.56.3",
     function()
-        return test:catchsql "SELECT position(x'78', x'78c3a4e282ac79');"
+        return test:execsql "SELECT position(x'78', x'78c3a4e282ac79');"
     end, {
         -- <position-1.56.3>
-        1, "Failed to execute SQL statement: wrong arguments for function POSITION()"
+        1
         -- </position-1.56.3>
     })
 
 test:do_test(
     "position-1.56.3",
     function()
-        return test:catchsql "SELECT position(x'a4', x'78c3a4e282ac79');"
+        return test:execsql "SELECT position(x'a4', x'78c3a4e282ac79');"
     end, {
         -- <position-1.56.3>
-        1, "Failed to execute SQL statement: wrong arguments for function POSITION()"
+        3
         -- </position-1.56.3>
     })
 
@@ -858,4 +858,14 @@ test:do_catchsql_test(
     }
 )
 
+-- gh-4145: Make sure POSITION() can work with VARBINARY.
+test:do_execsql_test(
+    "position-2",
+    [[
+        SELECT POSITION(x'313233', x'30313231323334353132333435');
+    ]], {
+        4
+    }
+)
+
 test:finish_test()


More information about the Tarantool-patches mailing list