[tarantool-patches] [PATCH] sql: Duplicate key error for a non-unique index

Tarantool development patches archive
 help / color / mirror / Atom feed

* [tarantool-patches] [PATCH] sql: Duplicate key error for a non-unique index
@ 2019-02-18 19:25 Stanislav Zudin
  2019-02-19  8:04 ` [tarantool-patches] " Konstantin Osipov
  0 siblings, 1 reply; 2+ messages in thread
From: Stanislav Zudin @ 2019-02-18 19:25 UTC (permalink / raw)
  To: tarantool-patches, kostja; +Cc: Stanislav Zudin

Adds collation analysis into creating of a composite key for
index tuples.
The keys of secondary index consist of parts defined for index itself
combined with parts defined for primary key.
The duplicate parts are ignored. But the search of duplicates didn't
take the collation into consideration.
If non-unique secondary index contained primary key columns their
parts from the primary key were omitted. This fact caused an issue.

Closes #3537
---
Branch: https://github.com/tarantool/tarantool/tree/stanztt/gh-3537-nonunique-index-dup-key-error
Issue: https://github.com/tarantool/tarantool/issues/3537

 src/box/key_def.c           | 44 +++++++++++++++++++++++++++++++++++--
 src/coll.c                  |  1 +
 src/coll.h                  |  2 ++
 test/sql/collation.result   | 14 ++++++++++++
 test/sql/collation.test.lua |  8 +++++++
 5 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/src/box/key_def.c b/src/box/key_def.c
index 9411ade39..b5b4c6edf 100644
--- a/src/box/key_def.c
+++ b/src/box/key_def.c
@@ -37,6 +37,7 @@
 #include "schema_def.h"
 #include "coll_id_cache.h"
 #include "small/region.h"
+#include "coll.h"
 
 const char *sort_order_strs[] = { "asc", "desc", "undef" };
 
@@ -596,6 +597,45 @@ key_def_find_by_fieldno(const struct key_def *key_def, uint32_t fieldno)
 	return key_def_find(key_def, &part);
 }
 
+static bool
+key_part_coll_equal(const struct coll* sec_coll, const struct coll* pk_coll)
+{
+	if (sec_coll == pk_coll)
+		return true;
+	/* If they're not the same object
+	 * compare their strength */
+	enum coll_icu_strength sec_strength = sec_coll
+		? sec_coll->strength
+		: COLL_ICU_STRENGTH_DEFAULT;
+	enum coll_icu_strength pk_strength = pk_coll
+		? pk_coll->strength
+		: COLL_ICU_STRENGTH_DEFAULT;
+	/* If collation of the secondary key beats
+	 * the PK key's collation then they're not
+	 * equal and the secondary key must be
+	 * combined with the primary one. */
+	if (sec_strength > pk_strength)
+		return false;
+	else
+		return true;
+}
+
+static const struct key_part *
+key_def_find_with_coll(const struct key_def *sec_key_def, const struct key_part *pk_key_part)
+{
+	const struct key_part *part = sec_key_def->parts;
+	const struct key_part *end = part + sec_key_def->part_count;
+	for (; part != end; part++) {
+		if (part->fieldno == pk_key_part->fieldno &&
+		    key_part_coll_equal(part->coll, pk_key_part->coll) &&
+		    json_path_cmp(part->path, part->path_len,
+				  pk_key_part->path, pk_key_part->path_len,
+				  TUPLE_INDEX_BASE) == 0)
+			return part;
+	}
+	return NULL;
+}
+
 const struct key_part *
 key_def_find(const struct key_def *key_def, const struct key_part *to_find)
 {
@@ -639,7 +679,7 @@ key_def_merge(const struct key_def *first, const struct key_def *second)
 	part = second->parts;
 	end = part + second->part_count;
 	for (; part != end; part++) {
-		if (key_def_find(first, part) != NULL)
+		if (key_def_find_with_coll(first, part) != NULL)
 			--new_part_count;
 		else
 			sz += part->path_len;
@@ -677,7 +717,7 @@ key_def_merge(const struct key_def *first, const struct key_def *second)
 	part = second->parts;
 	end = part + second->part_count;
 	for (; part != end; part++) {
-		if (key_def_find(first, part) != NULL)
+		if (key_def_find_with_coll(first, part) != NULL)
 			continue;
 		key_def_set_part(new_def, pos++, part->fieldno, part->type,
 				 part->nullable_action, part->coll,
diff --git a/src/coll.c b/src/coll.c
index 6d9c44dbf..8f9dcbac2 100644
--- a/src/coll.c
+++ b/src/coll.c
@@ -320,6 +320,7 @@ coll_new(const struct coll_def *def)
 	}
 	memcpy((char *) coll->fingerprint, fingerprint, fingerprint_len + 1);
 	coll->refs = 1;
+	coll->strength = def->icu.strength;
 	coll->type = def->type;
 	switch (coll->type) {
 	case COLL_TYPE_ICU:
diff --git a/src/coll.h b/src/coll.h
index 9c725712a..2412f8032 100644
--- a/src/coll.h
+++ b/src/coll.h
@@ -56,6 +56,8 @@ struct UCollator;
 struct coll {
 	/** Collation type. */
 	enum coll_type type;
+	/** Strength ICU settings */
+	enum coll_icu_strength strength;
 	/** ICU collation specific data. */
 	struct UCollator *collator;
 	/** String comparator. */
diff --git a/test/sql/collation.result b/test/sql/collation.result
index 5721ef854..94e03d624 100644
--- a/test/sql/collation.result
+++ b/test/sql/collation.result
@@ -325,3 +325,17 @@ box.sql.execute("DROP TABLE t1;")
 box.sql.execute("DROP TABLE t0;")
 ---
 ...
+-- gh-3537 Duplicate key error for an index that is not unique
+--
+box.sql.execute('CREATE TABLE t3 (s1 CHAR(5) PRIMARY KEY);')
+---
+...
+box.sql.execute('CREATE INDEX i3 ON t3 (s1 collate "unicode_ci");')
+---
+...
+box.sql.execute("INSERT INTO t3 VALUES ('a');")
+---
+...
+box.sql.execute("INSERT INTO t3 VALUES ('A');")
+---
+...
diff --git a/test/sql/collation.test.lua b/test/sql/collation.test.lua
index 4c649a444..7210c8e9e 100644
--- a/test/sql/collation.test.lua
+++ b/test/sql/collation.test.lua
@@ -126,3 +126,11 @@ box.sql.execute("SELECT * FROM t1;")
 box.sql.execute("SELECT s1 FROM t0;")
 box.sql.execute("DROP TABLE t1;")
 box.sql.execute("DROP TABLE t0;")
+
+-- gh-3537 Duplicate key error for an index that is not unique
+--
+box.sql.execute('CREATE TABLE t3 (s1 CHAR(5) PRIMARY KEY);')
+box.sql.execute('CREATE INDEX i3 ON t3 (s1 collate "unicode_ci");')
+box.sql.execute("INSERT INTO t3 VALUES ('a');")
+box.sql.execute("INSERT INTO t3 VALUES ('A');")
+
-- 
2.17.1

^ permalink raw reply	[flat|nested] 2+ messages in thread

* [tarantool-patches] Re: [PATCH] sql: Duplicate key error for a non-unique index
  2019-02-18 19:25 [tarantool-patches] [PATCH] sql: Duplicate key error for a non-unique index Stanislav Zudin
@ 2019-02-19  8:04 ` Konstantin Osipov
  0 siblings, 0 replies; 2+ messages in thread
From: Konstantin Osipov @ 2019-02-19  8:04 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Stanislav Zudin

* Stanislav Zudin <szudin@tarantool.org> [19/02/19 00:19]:

Please add more tests. Let's have tests for the following
combinations:

- binary and unicode
- binary and unicode_ci
- unicode_ci and unicode
- multipart index

Second, as follows from PeterG's explanation, even for collations
with the same strength this may not work. For example, unicode_ci
and unicode_ci_ky (please see in the chat how to create this
collation), the strength is the same, but "selectivity" is
different. 

So I think we need to split all collations, after all, to two
classes: {"binary", s0} and {everything else}. Two collations from
"everything else" world can't be matched in strength. We need a
test case highlighting that.

Finally, I would rename key_def_find_with_coll() to
key_def_can_merge(), this would be a more specific name.

It's a pity we can't use key_def_find() in key_def_can_merge(). I
take the reason is that you believe that key_def can contain the
same field no multiple times. I see no reason why a user-defined
key def would have it - I think we could easily ban such
scenarios, at least for the primary key def. The trouble is that
there is no is_primary flag in key_def - please try adding this,
but I assume it's going to be a bit messy. If you want to follow
this track, please ask with Vlad or Vova for help.

Alternatively, you could rename key_def_find() to
key_def_find_next() which would key_def->parts offset as a
starting point for linear search, and have multiple invocations of
key_def_find_next in key_def_can_merge(). key_def_find() can stay
around as a thin wrapper around key_def_find_next().

> Adds collation analysis into creating of a composite key for
> index tuples.
> The keys of secondary index consist of parts defined for index itself
> combined with parts defined for primary key.
> The duplicate parts are ignored. But the search of duplicates didn't
> take the collation into consideration.
> If non-unique secondary index contained primary key columns their
> parts from the primary key were omitted. This fact caused an issue.
> 
> Closes #3537
> ---
> Branch: https://github.com/tarantool/tarantool/tree/stanztt/gh-3537-nonunique-index-dup-key-error
> Issue: https://github.com/tarantool/tarantool/issues/3537
> 
>  src/box/key_def.c           | 44 +++++++++++++++++++++++++++++++++++--
>  src/coll.c                  |  1 +
>  src/coll.h                  |  2 ++
>  test/sql/collation.result   | 14 ++++++++++++
>  test/sql/collation.test.lua |  8 +++++++
>  5 files changed, 67 insertions(+), 2 deletions(-)
> 
> diff --git a/src/box/key_def.c b/src/box/key_def.c
> index 9411ade39..b5b4c6edf 100644
> --- a/src/box/key_def.c
> +++ b/src/box/key_def.c
> @@ -37,6 +37,7 @@
>  #include "schema_def.h"
>  #include "coll_id_cache.h"
>  #include "small/region.h"
> +#include "coll.h"
>  
>  const char *sort_order_strs[] = { "asc", "desc", "undef" };
>  
> @@ -596,6 +597,45 @@ key_def_find_by_fieldno(const struct key_def *key_def, uint32_t fieldno)
>  	return key_def_find(key_def, &part);
>  }
>  
> +static bool
> +key_part_coll_equal(const struct coll* sec_coll, const struct coll* pk_coll)
> +{
> +	if (sec_coll == pk_coll)
> +		return true;
> +	/* If they're not the same object
> +	 * compare their strength */
> +	enum coll_icu_strength sec_strength = sec_coll
> +		? sec_coll->strength
> +		: COLL_ICU_STRENGTH_DEFAULT;
> +	enum coll_icu_strength pk_strength = pk_coll
> +		? pk_coll->strength
> +		: COLL_ICU_STRENGTH_DEFAULT;
> +	/* If collation of the secondary key beats
> +	 * the PK key's collation then they're not
> +	 * equal and the secondary key must be
> +	 * combined with the primary one. */
> +	if (sec_strength > pk_strength)
> +		return false;
> +	else
> +		return true;
> +}
> +
> +static const struct key_part *
> +key_def_find_with_coll(const struct key_def *sec_key_def, const struct key_part *pk_key_part)
> +{
> +	const struct key_part *part = sec_key_def->parts;
> +	const struct key_part *end = part + sec_key_def->part_count;
> +	for (; part != end; part++) {
> +		if (part->fieldno == pk_key_part->fieldno &&
> +		    key_part_coll_equal(part->coll, pk_key_part->coll) &&
> +		    json_path_cmp(part->path, part->path_len,
> +				  pk_key_part->path, pk_key_part->path_len,
> +				  TUPLE_INDEX_BASE) == 0)
> +			return part;
> +	}
> +	return NULL;
> +}
> +
>  const struct key_part *
>  key_def_find(const struct key_def *key_def, const struct key_part *to_find)
>  {
> @@ -639,7 +679,7 @@ key_def_merge(const struct key_def *first, const struct key_def *second)
>  	part = second->parts;
>  	end = part + second->part_count;
>  	for (; part != end; part++) {
> -		if (key_def_find(first, part) != NULL)
> +		if (key_def_find_with_coll(first, part) != NULL)
>  			--new_part_count;
>  		else
>  			sz += part->path_len;
> @@ -677,7 +717,7 @@ key_def_merge(const struct key_def *first, const struct key_def *second)
>  	part = second->parts;
>  	end = part + second->part_count;
>  	for (; part != end; part++) {
> -		if (key_def_find(first, part) != NULL)
> +		if (key_def_find_with_coll(first, part) != NULL)
>  			continue;
>  		key_def_set_part(new_def, pos++, part->fieldno, part->type,
>  				 part->nullable_action, part->coll,
> diff --git a/src/coll.c b/src/coll.c
> index 6d9c44dbf..8f9dcbac2 100644
> --- a/src/coll.c
> +++ b/src/coll.c
> @@ -320,6 +320,7 @@ coll_new(const struct coll_def *def)
>  	}
>  	memcpy((char *) coll->fingerprint, fingerprint, fingerprint_len + 1);
>  	coll->refs = 1;
> +	coll->strength = def->icu.strength;
>  	coll->type = def->type;
>  	switch (coll->type) {
>  	case COLL_TYPE_ICU:
> diff --git a/src/coll.h b/src/coll.h
> index 9c725712a..2412f8032 100644
> --- a/src/coll.h
> +++ b/src/coll.h
> @@ -56,6 +56,8 @@ struct UCollator;
>  struct coll {
>  	/** Collation type. */
>  	enum coll_type type;
> +	/** Strength ICU settings */
> +	enum coll_icu_strength strength;
>  	/** ICU collation specific data. */
>  	struct UCollator *collator;
>  	/** String comparator. */
> diff --git a/test/sql/collation.result b/test/sql/collation.result
> index 5721ef854..94e03d624 100644
> --- a/test/sql/collation.result
> +++ b/test/sql/collation.result
> @@ -325,3 +325,17 @@ box.sql.execute("DROP TABLE t1;")
>  box.sql.execute("DROP TABLE t0;")
>  ---
>  ...
> +-- gh-3537 Duplicate key error for an index that is not unique
> +--
> +box.sql.execute('CREATE TABLE t3 (s1 CHAR(5) PRIMARY KEY);')
> +---
> +...
> +box.sql.execute('CREATE INDEX i3 ON t3 (s1 collate "unicode_ci");')
> +---
> +...
> +box.sql.execute("INSERT INTO t3 VALUES ('a');")
> +---
> +...
> +box.sql.execute("INSERT INTO t3 VALUES ('A');")
> +---
> +...
> diff --git a/test/sql/collation.test.lua b/test/sql/collation.test.lua
> index 4c649a444..7210c8e9e 100644
> --- a/test/sql/collation.test.lua
> +++ b/test/sql/collation.test.lua
> @@ -126,3 +126,11 @@ box.sql.execute("SELECT * FROM t1;")
>  box.sql.execute("SELECT s1 FROM t0;")
>  box.sql.execute("DROP TABLE t1;")
>  box.sql.execute("DROP TABLE t0;")
> +
> +-- gh-3537 Duplicate key error for an index that is not unique
> +--
> +box.sql.execute('CREATE TABLE t3 (s1 CHAR(5) PRIMARY KEY);')
> +box.sql.execute('CREATE INDEX i3 ON t3 (s1 collate "unicode_ci");')
> +box.sql.execute("INSERT INTO t3 VALUES ('a');")
> +box.sql.execute("INSERT INTO t3 VALUES ('A');")
> +
> -- 
> 2.17.1
> 

-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2019-02-19  8:04 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-18 19:25 [tarantool-patches] [PATCH] sql: Duplicate key error for a non-unique index Stanislav Zudin
2019-02-19  8:04 ` [tarantool-patches] " Konstantin Osipov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox