From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <vdavydov.dev@gmail.com>
Date: Tue, 26 Feb 2019 16:26:25 +0300
From: Vladimir Davydov <vdavydov.dev@gmail.com>
Subject: Re: [tarantool-patches] [PATCH v2] sql: Duplicate key error for a
 non-unique index
Message-ID: <20190226132625.4bsl5p27m4rgoizi@esperanza>
References: <20190220100610.29771-1-szudin@tarantool.org>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20190220100610.29771-1-szudin@tarantool.org>
To: Stanislav Zudin <szudin@tarantool.org>
Cc: tarantool-patches@freelists.org, kostja@tarantool.org
List-ID: <tarantool-patches.dev.tarantool.org>

On Wed, Feb 20, 2019 at 01:06:10PM +0300, Stanislav Zudin wrote:
> Adds collation analysis into creating of a composite key for
> index tuples.
> The keys of secondary index consist of parts defined for index itself
> combined with parts defined for primary key.
> The duplicate parts are ignored. But the search of duplicates didn't
> take the collation into consideration.
> If non-unique secondary index contained primary key columns their
> parts from the primary key were omitted. This fact caused an issue.
> 
> Closes #3537
> ---
> Branch: https://github.com/tarantool/tarantool/tree/stanztt/gh-3537-nonunique-index-dup-key-error
> Issue: https://github.com/tarantool/tarantool/issues/3537

Whenever you resubmit a patch, please write a brief change log here.
For more details see

https://tarantool.io/en/doc/1.10/dev_guide/developer_guidelines/#how-to-submit-a-patch-for-review

> 
>  src/box/key_def.c           |  51 +++++-
>  src/coll.c                  |   1 +
>  src/coll.h                  |   2 +
>  test/sql/collation.result   | 330 ++++++++++++++++++++++++++++++++++++
>  test/sql/collation.test.lua | 123 ++++++++++++++
>  5 files changed, 500 insertions(+), 7 deletions(-)
> 
> diff --git a/src/box/key_def.c b/src/box/key_def.c
> index 9411ade39..9814d6df0 100644
> --- a/src/box/key_def.c
> +++ b/src/box/key_def.c
> @@ -37,6 +37,7 @@
>  #include "schema_def.h"
>  #include "coll_id_cache.h"
>  #include "small/region.h"
> +#include "coll.h"
>  
>  const char *sort_order_strs[] = { "asc", "desc", "undef" };
>  
> @@ -596,12 +597,11 @@ key_def_find_by_fieldno(const struct key_def *key_def, uint32_t fieldno)
>  	return key_def_find(key_def, &part);
>  }
>  
> -const struct key_part *
> -key_def_find(const struct key_def *key_def, const struct key_part *to_find)
> +static const struct key_part *
> +key_def_find_next(const struct key_part *part, const struct key_part *end,
> +		  const struct key_part *to_find)
>  {
> -	const struct key_part *part = key_def->parts;
> -	const struct key_part *end = part + key_def->part_count;
> -	for (; part != end; part++) {
> +	for(; part != end; part++) {
>  		if (part->fieldno == to_find->fieldno &&
>  		    json_path_cmp(part->path, part->path_len,
>  				  to_find->path, to_find->path_len,
> @@ -611,6 +611,43 @@ key_def_find(const struct key_def *key_def, const struct key_part *to_find)
>  	return NULL;
>  }
>  
> +static bool
> +key_def_need_merge(const struct key_def *sec_key_def,
> +		   const struct key_part *pk_key_part)

Please don't use secondary/primary in argument names/comments,
because key_def_merge doesn't. This creates inconsistency leading
to confusion. Let's just assume that this function checks whether
an arbitrary key part can be merged into an arbitrary key def.

> +{
> +	const struct key_part* end = sec_key_def->parts +

Nit: we always put * closer to the variable name, not type. Please fix.

> +				     sec_key_def->part_count;
> +	const struct key_part* part = key_def_find_next(sec_key_def->parts,
> +							end,
> +							pk_key_part);
> +	if (part == NULL)
> +		return true;

The function name sounds like it should return true if the given key
part needs to be merged into the given key def, but actually it return
false in this case. Also, IMO key_def_can_merge would be a better name.

> +
> +	/* The duplicate key_part is found,
> +	 * compare collation */

Nit: we always format multline comments like

/*
 * This is a very long
 * comment.
 */

Please fix.

> +	if (part->coll == pk_key_part->coll)
> +		return false;
> +
> +	if (part->coll == NULL ||
> +	    part->coll->strength == COLL_ICU_STRENGTH_DEFAULT) {
> +		return false;
> +		/* If collation of the sec. key part

'sec' doesn't look like a common abbreviation for secondary.

> +		 * is binary then the sec. key
> +		 * doesn't require a composite key.
> +		 * */

This comment is pretty much useless, because it follows directly from
the code surrounding it. Please explain *why* you're doing it rather
than *what* you're doing.

Perhaps, it's worth encapsulating this logic in a helper function
defined in coll.c. Something like coll_is_compatible.

> +	} else
> +		return true;
> +}
> +
> +const struct key_part *
> +key_def_find(const struct key_def *key_def, const struct key_part *to_find)
> +{
> +	/* find the first match */
> +	return key_def_find_next(key_def->parts,
> +				 key_def->parts + key_def->part_count,
> +				 to_find);

Why did you factor key_def_find_next out of key_def_find? AFAIU you
could as well use key_def_find directly in key_def_need_merge, no?

> +}
> +
>  bool
>  key_def_contains(const struct key_def *first, const struct key_def *second)
>  {
> @@ -639,7 +676,7 @@ key_def_merge(const struct key_def *first, const struct key_def *second)
>  	part = second->parts;
>  	end = part + second->part_count;
>  	for (; part != end; part++) {
> -		if (key_def_find(first, part) != NULL)
> +		if (!key_def_need_merge(first, part))
>  			--new_part_count;
>  		else
>  			sz += part->path_len;
> @@ -677,7 +714,7 @@ key_def_merge(const struct key_def *first, const struct key_def *second)
>  	part = second->parts;
>  	end = part + second->part_count;
>  	for (; part != end; part++) {
> -		if (key_def_find(first, part) != NULL)
> +		if (!key_def_need_merge(first, part))
>  			continue;
>  		key_def_set_part(new_def, pos++, part->fieldno, part->type,
>  				 part->nullable_action, part->coll,
> diff --git a/src/coll.c b/src/coll.c
> index 6d9c44dbf..8f9dcbac2 100644
> --- a/src/coll.c
> +++ b/src/coll.c
> @@ -320,6 +320,7 @@ coll_new(const struct coll_def *def)
>  	}
>  	memcpy((char *) coll->fingerprint, fingerprint, fingerprint_len + 1);
>  	coll->refs = 1;
> +	coll->strength = def->icu.strength;
>  	coll->type = def->type;
>  	switch (coll->type) {
>  	case COLL_TYPE_ICU:
> diff --git a/src/coll.h b/src/coll.h
> index 9c725712a..2412f8032 100644
> --- a/src/coll.h
> +++ b/src/coll.h
> @@ -56,6 +56,8 @@ struct UCollator;
>  struct coll {
>  	/** Collation type. */
>  	enum coll_type type;
> +	/** Strength ICU settings */
> +	enum coll_icu_strength strength;

Do we really need it to add strengh here? Can't we extract it from the
collator?

>  	/** ICU collation specific data. */
>  	struct UCollator *collator;
>  	/** String comparator. */