Hi! I was wrong about indentation: in C code it should be 8-symbol wide tabs. Sorry, my bad.   Best regards, Maxim Kokryashkin >Среда, 20 октября 2021, 14:47 +03:00 от Максим Корякшин via Tarantool-patches : >  >Hi! Thanks for the patch! >Please consider my comments below. >  >Use the imperative mood in the subject line as it is stated in the developer >guideline[1]. A properly formed Git commit subject line should always be >able to complete the following sentence: “If applied, this commit will   >/your subject line here/ ”. >>Среда, 20 октября 2021, 11:37 +03:00 от Lord via Tarantool-patches < tarantool-patches@dev.tarantool.org >: >>  >>From: Gleb < kashkin.nsaa@gmail.com > >> >>Replaced positioning logic in POSITION() and REPLACE() sql functions >Typo: s/sql/SQL >>with Knuth-Morris-Pratt algorithm by adding two different versions >>of latter for STRING and VARBINARY arguments and corresponding >>prefix table generators. >>Slightly adapted tests to fit new collation rules that differ >>decorated letters. >> >>Improved asymptotics of positioning logic to: >>time O(haystack_len + needle_len), mem O(needle_len) >>As one of collating types is a unicode string, >Typo: s/unicode/Unicode >>ICU UCharIterators had to be used and thus unicode >Typo: s/unicode/Unicode >>letters are compared as UChar32 values and normalization >>is not done ('a' and 'ä' are treated as different) >> >>Closes #6492 >>--- >Here is your notes section, where you can provide any additional >information such as the link to the issue, the link to the >GitHub branch, or any other additional comments. It would be >much more convinient if you could provide that information here. >Check this[2] as an example. >> src/box/sql/func.c | 402 ++++++++++++++++++++++++++------- >> test/sql-tap/position.test.lua | 9 +- >> 2 files changed, 321 insertions(+), 90 deletions(-) >> >>diff --git a/src/box/sql/func.c b/src/box/sql/func.c >>index 29d713fd0..fe3c16828 100644 >>--- a/src/box/sql/func.c >>+++ b/src/box/sql/func.c >>@@ -291,6 +291,147 @@ absFunc(sql_context * context, int argc, sql_value ** argv) >>  } >> } >>  >>+/** >>+ * Implementation of prefix table generator >>+ * prefix_table_gen() is a part of Knuth-Morris-Pratt algorithm >>+ * that is used in position() and replaceFunc() to find >>+ * instances of `needle` in `haystack` bin data >>+ */ >>+inline static void >>+prefix_table_gen_bin(const unsigned char * needle, >>+ const size_t n_needle_bytes, >>+ size_t * table) >When declaring pointer data or a function that returns a pointer >type, the preferred use of * is adjacent to the data name or function name. >>+{ >The indentation step should be four whitespaces, so please fix the >indentation here and below. >>+ size_t i = 1, j = 0; >>+ table[0] = 0; >>+ while (i < n_needle_bytes) { >>+ if (needle[i] != needle[j]) { >>+ if (j == 0) { >>+ table[i] = 0; >>+ ++i; >>+ } else >>+ j = table[j - 1]; >>+ } >>+ else { >>+ table[i] = j + 1; >>+ ++i; >>+ ++j; >>+ } >>+ } >>+} >Use braces in both branches if only one branch of a conditional >statement is a single statement. Also, the braces code style >should be the following[3]. >>+ >>+/** >>+ * Implementation of prefix table generator using UCharIterators >>+ * prefix_table_gen() is a part of Knuth-Morris-Pratt algorithm >>+ * that is used in position() to find instances of `needle` >>+ * in `haystack` unicode string >>+ */ >>+inline static void >>+prefix_table_gen_ustr(const UCharIterator * needle_begin, >>+ const unsigned char * needle_str, >See the comment about the pointer declaration above. >>+ size_t n_needle_bytes, >>+ size_t n_needle_chars, >>+ size_t table[]) >>+{ >>+ table[0] = 0; >>+ UCharIterator needle_iter; >>+ UCharIterator second_iter; >>+ uiter_setUTF8(&needle_iter, needle_str, n_needle_bytes); >>+ uiter_setUTF8(&second_iter, needle_str, n_needle_bytes); >>+ needle_iter.next(&needle_iter); >>+ while (needle_iter.getIndex(&needle_iter, UITER_CURRENT) < n_needle_chars) { >Please, follow the indentation rules here and below. A loop’s body should be indented. >>+ if (needle_iter.current(&needle_iter) != second_iter.current(&second_iter)) { >>+ if (second_iter.getIndex(&second_iter, UITER_CURRENT) == 0) { >>+ table[needle_iter.getIndex(&needle_iter, UITER_CURRENT)] = 0; >>+ needle_iter.next(&needle_iter); >>+ } else >>+ second_iter.move(&second_iter, table[second_iter.getIndex(&second_iter, UITER_CURRENT) - 1], UITER_START); >>+ } >See the comment about braces above. >>+ else { >>+ table[needle_iter.getIndex(&needle_iter, UITER_CURRENT)] = second_iter.getIndex(&second_iter, UITER_CURRENT) + 1; >>+ needle_iter.next(&needle_iter); >>+ second_iter.next(&second_iter); >>+ } >>+ } >>+} >>+ >>+/** >>+ * Implementation of KMP algorithm to find substring >>+ * in a bin array starting from given position. >>+ * >>+ * Requieres index for starting position in `haystack`. >>+ * >Type: s/Requieres/Requires >>+ * Returns found substring position + 1 or 0 if none. >Minor: I guess, it should be ‘Returns found substring position + 1 or 0 >if there is no such substring.’ >>+ */ >>+static int >>+kmp_bin(const unsigned char * haystack_str, >>+ const unsigned char * needle_str, >>+ size_t n_haystack_bytes, >>+ size_t n_needle_bytes, >As I see no changes to these args in the function body, I think they >should be declared as a constant. >>+ size_t i, >>+ const size_t * prefix_table) >See the comment about the pointer declaration above. >>+{ >>+ size_t j = 0; >>+ int position = 0; >>+ while (i < n_haystack_bytes) { >>+ if (needle_str[j] == haystack_str[i]) { >>+ ++i; >>+ ++j; >>+ } >>+ else { >>+ if (j == 0) { >>+ ++i; >>+ } else { >>+ j = prefix_table[j - 1]; >>+ } >>+ } >>+ if (j >= n_needle_bytes) { >>+ position = i - j + 1; >>+ return position; >>+ } >>+ } >>+ return position; >>+} >See the comment about the indentation above.  >>+ >>+/** >>+ * Implementation of KMP algorithm to find substring >>+ * in an array of unicode chars starting from given iterator. >>+ * >>+ * Requieres UCharIterator [haystack_iter] of starting position >>+ * in `haystack`, iterator gets set to the first item after found >>+ * `needle` or reaches the end of haystack if none. >>+ * >>+ * Returns found substring position + 1 or 0 if none. >>+ */ >>+inline static int >>+kmp_ustr(struct sql_context * context, >>+ UCharIterator * haystack_iter, >>+ UCharIterator * needle_iter, >>+ size_t n_haystack_chars, >I think that arg should be declared as a const. >>+ const size_t * prefix_table) >See the comment about the pointer declaration above. >>+{ >>+ struct coll *coll = sqlGetFuncCollSeq(context); >>+ int position = 0; >>+ while (haystack_iter->getIndex(haystack_iter, UITER_CURRENT) < n_haystack_chars) { >>+ if (needle_iter->current(needle_iter) == haystack_iter->current(haystack_iter)) { >>+ needle_iter->next(needle_iter); >>+ haystack_iter->next(haystack_iter); >>+ } else { >>+ if (needle_iter->getIndex(needle_iter, UITER_CURRENT) == 0) { >>+ haystack_iter->next(haystack_iter); >>+ } else { >>+ needle_iter->move(needle_iter, prefix_table[needle_iter->getIndex(needle_iter, UITER_CURRENT) - 1], UITER_START); >>+ } >>+ } >>+ if (!needle_iter->hasNext(needle_iter)) { >>+ position = haystack_iter->getIndex(haystack_iter, UITER_CURRENT) - needle_iter->getIndex(needle_iter, UITER_CURRENT) + 1; >>+ haystack_iter->next(haystack_iter); >>+ return position; >>+ } >>+ } >>+ return position; >>+} >>+ >See the comment about the indentation above. >> /** >>  * Implementation of the position() function. >>  * >>@@ -303,7 +444,7 @@ absFunc(sql_context * context, int argc, sql_value ** argv) >>  * occurrence of needle, or 0 if needle never occurs in haystack. >>  */ >> static void >>-position_func(struct sql_context *context, int argc, struct Mem **argv) >>+position_func(struct sql_context * context, int argc, struct Mem ** argv) >> { >>  UNUSED_PARAMETER(argc); >>  struct Mem *needle = argv[0]; >>@@ -339,9 +480,10 @@ position_func(struct sql_context *context, int argc, struct Mem **argv) >>  return; >>  } >>  >>- int n_needle_bytes = mem_len_unsafe(needle); >>- int n_haystack_bytes = mem_len_unsafe(haystack); >>+ size_t n_needle_bytes = mem_len_unsafe(needle); >>+ size_t n_haystack_bytes = mem_len_unsafe(haystack); >>  int position = 1; >>+ size_t *prefix_table = NULL; >>  if (n_needle_bytes > 0) { >>  const unsigned char *haystack_str; >>  const unsigned char *needle_str; >>@@ -351,18 +493,25 @@ position_func(struct sql_context *context, int argc, struct Mem **argv) >>  assert(needle_str != NULL); >>  assert(haystack_str != NULL || n_haystack_bytes == 0); >>  /* >>- * Naive implementation of substring >>- * searching: matching time O(n * m). >>+ * Basic version of KMP alg >Typo: s/of KMP/of the KMP >>+ * searching: matching time O(n + m), requires mem O(m). >>  * Can be improved. >That comment about improvement can be >removed because it relates to the original naive >implementation. >>  */ >>- while (n_needle_bytes <= n_haystack_bytes && >>- memcmp(haystack_str, needle_str, n_needle_bytes) != 0) { >>- position++; >>- n_haystack_bytes--; >>- haystack_str++; >>+ prefix_table = calloc(n_needle_bytes, sizeof(size_t)); >>+ >>+ if (prefix_table != NULL) { >>+ prefix_table_gen_bin(needle_str, n_needle_bytes, prefix_table); >>+ position = kmp_bin(haystack_str, needle_str, n_haystack_bytes, n_needle_bytes, 0, prefix_table); >>+ } else { >>+ while (n_needle_bytes <= n_haystack_bytes && >>+ memcmp(haystack_str, needle_str, n_needle_bytes) != 0) { >>+ position++; >>+ n_haystack_bytes--; >>+ haystack_str++; >>+ } >>+ if (n_needle_bytes > n_haystack_bytes) >>+ position = 0; >>  } >See the comment about the indentation above. >>+ /* >>+ * KMP algorithm with unicode support >Type s/unicode/Unicode. >> /* >>- * The replace() function. Three arguments are all strings: call >>+ * The replace() function. Three arguments are all strings: call >This change is excess. >>@@ -1309,7 +1487,7 @@ replaceFunc(sql_context * context, int argc, sql_value ** argv) >>  zPattern = mem_as_ustr(argv[1]); >>  if (zPattern == 0) { >>  assert(mem_is_null(argv[1]) >>- || sql_context_db_handle(context)->mallocFailed); >>+ || sql_context_db_handle(context)->mallocFailed); >This change is excess. >>  return; >>  } >>  nPattern = mem_len_unsafe(argv[1]); >>@@ -1331,40 +1509,90 @@ replaceFunc(sql_context * context, int argc, sql_value ** argv) >>  return; >>  } >>  loopLimit = nStr - nPattern; >>- for (i = j = 0; i <= loopLimit; i++) { >>- if (zStr[i] != zPattern[0] >>- || memcmp(&zStr[i], zPattern, nPattern)) { >>- zOut[j++] = zStr[i]; >>- } else { >>- u8 *zOld; >>- sql *db = sql_context_db_handle(context); >>- nOut += nRep - nPattern; >>- testcase(nOut - 1 == db->aLimit[SQL_LIMIT_LENGTH]); >>- testcase(nOut - 2 == db->aLimit[SQL_LIMIT_LENGTH]); >>- if (nOut - 1 > db->aLimit[SQL_LIMIT_LENGTH]) { >>- diag_set(ClientError, ER_SQL_EXECUTE, "string "\ >>- "or binary string is too big"); >>- context->is_aborted = true; >>- sql_free(zOut); >>- return; >>- } >>- zOld = zOut; >>- zOut = sql_realloc64(zOut, (int)nOut); >>- if (zOut == 0) { >>- context->is_aborted = true; >>- sql_free(zOld); >>- return; >>+ >>+ if (prefix_table != NULL) { >>+ /** >>+ * optimized replacement logic using KMP algorithm >>+ * requires additional O(nPattern) memory for prefix table >>+ * replacing: time O(n + m) >>+ */ >In C comments out of functions and inside of functions should >be different in how they are started. Everything else is wrong. >Below are correct examples: >  /** comes for documentation comments >  /* for local not documented comments. >  >However the difference is vague already, so the rule is simple — >out of function = /**, inside = /*.  >  >This is described in more detail here[4]. >>+ >>+ /* precalculating prefix table */ >>+ prefix_table = calloc(nPattern, sizeof(size_t)); >>+ prefix_table_gen_bin(zPattern, nPattern, prefix_table); >>+ /* counting the number of Pattern occurrences */ >>+ for (nCnt = iStr = nPatternPos = 0;;) { >>+ nPatternPos = kmp_bin(zStr, zPattern, nStr, nPattern, iStr, prefix_table); >>+ iStr = nPatternPos + nPattern; >>+ if (nPatternPos == 0) >>+ break; >>+ ++nCnt; >>+ } >>+ /* reallocationg mem for zOut */ >>+ nOut += (nRep - nPattern) * nCnt; >>+ assert(nOut < SQL_MAX_LENGTH); >>+ zOut = contextMalloc(context, (i64) nOut); >>+ if (zOut == 0) { >>+ return; >>+ } >Brackets are excess. >>+ /* coping and replacing in-place */ >>+ for (iStr = nCopiedStr = nCopiedOut = nPatternPos = 0;;) { >>+ nPatternPos = kmp_bin(zStr, zPattern, nStr, nPattern, iStr, prefix_table); >>+ iStr = nPatternPos + nPattern; >>+ if (nPatternPos == 0) >>+ break; >>+ assert(nPatternPos >= nCopiedStr); >>+ memcpy(zOut + nCopiedOut, zStr + nCopiedStr, nPatternPos - nCopiedStr); >>+ nCopiedOut += nPatternPos - nCopiedStr; >>+ memcpy(zOut + nCopiedOut, zRep, nRep); >>+ nCopiedOut += nRep; >>+ nCopiedStr = nPatternPos + nPattern; >>+ } >>+ memcpy(zOut + nCopiedOut, zStr + nCopiedStr, nStr - nCopiedStr); >>+ zOut[nOut - 1] = 0; >>+ free(prefix_table); >>+ } >>+ else { >>+ /** >>+ * Naive replacing substring algorithm >>+ * replacing: time O(n * m), mem O(1) >>+ * used if cannot allocate mem for prefix table >>+ */ >See the comment about in-function comments above. >>diff --git a/test/sql-tap/position.test.lua b/test/sql-tap/position.test.lua >>index 6a96ed9bc..b9244baf6 100755 >>--- a/test/sql-tap/position.test.lua >>+++ b/test/sql-tap/position.test.lua >>@@ -679,12 +679,15 @@ test:do_execsql_test( >>  >> -- Collation is set in space format. >>  >>+-- Tests 63-65 are altered because new version of position() that uses KMP alg >>+-- intentionally does not normalize given strings (in which case 'a' differs from 'à' etc.) >>+ >> test:do_execsql_test( >>     "position-1.63", >>     [[ >>         CREATE TABLE test1 (s1 VARCHAR(5) PRIMARY KEY COLLATE "unicode_ci"); >>         INSERT INTO test1 VALUES('à'); >>- SELECT POSITION('a', s1) FROM test1; >>+ SELECT POSITION('à', s1) FROM test1; >>         DELETE FROM test1; >>     ]], { >>         1 >>@@ -695,7 +698,7 @@ test:do_execsql_test( >>     "position-1.64", >>     [[ >>         INSERT INTO test1 VALUES('qwèrty'); >>- SELECT POSITION('er', s1) FROM test1; >>+ SELECT POSITION('èr', s1) FROM test1; >>         DELETE FROM test1; >>     ]], { >>         3 >>@@ -706,7 +709,7 @@ test:do_execsql_test( >>     "position-1.65", >>     [[ >>         INSERT INTO test1 VALUES('qwèrtÿ'); >>- SELECT POSITION('y', s1) FROM test1; >>+ SELECT POSITION('ÿ', s1) FROM test1; >>         DELETE FROM test1; >>     ]], { >>         6 >>-- >>2.33.0 >I propose to introduce a separate patch for tests modification. >Changes in tests have nothing to do with the KMP algorithm >implementation itself.  >  >You can read a guide on how to send a patch set here[5]. >  >[1]:  https://www.tarantool.io/en/doc/latest/dev_guide/developer_guidelines/ >[2]:  https://lists.tarantool.org/tarantool-patches/20210731133648.25660-1-m.kokryashkin@tarantool.org/ >[3]:  https://www.tarantool.io/en/doc/latest/dev_guide/c_style_guide/#chapter-3-placing-braces-and-spaces >[4]:  https://www.tarantool.io/en/doc/latest/dev_guide/c_style_guide/#chapter-8-commenting >[5]:  https://www.tarantool.io/en/doc/latest/dev_guide/developer_guidelines/#how-to-submit-a-patch-for-review >  >Best regards, >Maxim Kokryashkin