From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <vdavydov.dev@gmail.com>
Date: Thu, 28 Feb 2019 19:19:45 +0300
From: Vladimir Davydov <vdavydov.dev@gmail.com>
Subject: Re: [tarantool-patches] Re: [PATCH v2] sql: Duplicate key error for
 a non-unique index
Message-ID: <20190228161945.o3jdmus4tehyoo53@esperanza>
References: <20190220100610.29771-1-szudin@tarantool.org>
 <20190226132625.4bsl5p27m4rgoizi@esperanza>
 <3e9f77a3-62d7-1619-fc43-7d6d9b300031@tarantool.org>
MIME-Version: 1.0
Content-Type: text/plain; charset="iso-8859-1"
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable
In-Reply-To: <3e9f77a3-62d7-1619-fc43-7d6d9b300031@tarantool.org>
To: Stanislav Zudin <szudin@tarantool.org>
Cc: tarantool-patches@freelists.org, Konstantin Osipov <kostja@tarantool.org>
List-ID: <tarantool-patches.dev.tarantool.org>

[ Cc +=3D Kostja - AFAIR he had something against using STRENGTH in
  collation comparison, see my comment inline ]

On Thu, Feb 28, 2019 at 05:17:13PM +0300, Stanislav Zudin wrote:
> > > +static bool
> > > +key_def_need_merge(const struct key_def *sec_key_def,
> > > +		   const struct key_part *pk_key_part)
> > > +{
> > > +	const struct key_part* end =3D sec_key_def->parts +
> > > +				     sec_key_def->part_count;
> > > +	const struct key_part* part =3D key_def_find_next(sec_key_def->part=
s,
> > > +							end,
> > > +							pk_key_part);
> > > +	if (part =3D=3D NULL)
> > > +		return true;
> > > +
> > > +	/* The duplicate key_part is found,
> > > +	 * compare collation */
> > > +	if (part->coll =3D=3D pk_key_part->coll)
> > > +		return false;
> > > +
> > > +	if (part->coll =3D=3D NULL ||
> > > +	    part->coll->strength =3D=3D COLL_ICU_STRENGTH_DEFAULT) {
> > > +		return false;
> > > +		/* If collation of the sec. key part
> > > +		 * is binary then the sec. key
> > > +		 * doesn't require a composite key.
> > > +		 * */
> >
> > Perhaps, it's worth encapsulating this logic in a helper function
> > defined in coll.c. Something like coll_is_compatible.
>
> This case is not a general one, so it doesn't deserve to be implemented a=
s a
> generic helper function.

Well, coll isn't an independent library. IMO it'd be perfectly okay to
add a helper there, which is only relevant to Tarantool internals.

> The updated patch is below:
>=20
> ---
> Branch: https://github.com/tarantool/tarantool/tree/stanztt/gh-3537-nonun=
ique-index-dup-key-error
> Issue: https://github.com/tarantool/tarantool/issues/3537
>=20
> =A0src/CMakeLists.txt=A0=A0=A0=A0=A0=A0=A0=A0=A0 |=A0=A0 3 +-
> =A0src/box/key_def.c=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0 |=A0 29 +++-
> =A0src/coll.c=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0 |=A0 38 =
+++++
> =A0src/coll.h=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0=A0 |=A0=A0 =
4 +
> =A0test/sql/collation.result=A0=A0 | 330 ++++++++++++++++++++++++++++++++=
++++
> =A0test/sql/collation.test.lua | 123 ++++++++++++++
> =A06 files changed, 524 insertions(+), 3 deletions(-)
>=20
> diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
> index 04de5ad04..7346e7ba7 100644
> --- a/src/CMakeLists.txt
> +++ b/src/CMakeLists.txt
> @@ -125,7 +125,8 @@ target_link_libraries(core
> =A0=A0=A0=A0 ${LIBEIO_LIBRARIES}
> =A0=A0=A0=A0 ${LIBCORO_LIBRARIES}
> =A0=A0=A0=A0 ${MSGPUCK_LIBRARIES}
> -=A0=A0=A0 ${generic_libraries})
> +=A0=A0=A0 ${generic_libraries}
> +=A0=A0=A0 ${ICU_LIBRARIES})
>=20
> =A0add_library(stat STATIC rmean.c latency.c histogram.c)
> =A0target_link_libraries(stat core)
> diff --git a/src/box/key_def.c b/src/box/key_def.c
> index 9411ade39..61b0258b2 100644
> --- a/src/box/key_def.c
> +++ b/src/box/key_def.c
> @@ -37,6 +37,7 @@
> =A0#include "schema_def.h"
> =A0#include "coll_id_cache.h"
> =A0#include "small/region.h"
> +#include "coll.h"
>=20
> =A0const char *sort_order_strs[] =3D { "asc", "desc", "undef" };
>=20
> @@ -611,6 +612,30 @@ key_def_find(const struct key_def *key_def, const
> struct key_part *to_find)
> =A0=A0=A0=A0 return NULL;
> =A0}
>=20
> +static bool
> +key_def_can_merge(const struct key_def *first_key_def,
> +=A0=A0=A0 const struct key_part *second_key_part)

Your MUA mangles patches. Please fix it - it makes the patches difficult
to review.

> +{
> +=A0=A0=A0 const struct key_part *part =3D key_def_find(first_key_def,
> +=A0=A0=A0 =A0=A0=A0 =A0=A0=A0 =A0=A0=A0 =A0=A0=A0 =A0=A0=A0 =A0=A0 secon=
d_key_part);
> +=A0=A0=A0 if (part =3D=3D NULL)
> +=A0=A0=A0 =A0=A0=A0 return true;

I asked you to return false in this case, because this means we can't
merge the given key part into the given key def.

> +
> +=A0=A0=A0 /*
> +=A0=A0=A0 =A0* The duplicate key_part is found,
> +=A0=A0=A0 =A0* compare collation
> +=A0=A0=A0 =A0*/
> +=A0=A0=A0 if (part->coll =3D=3D second_key_part->coll)
> +=A0=A0=A0 =A0=A0=A0 return false;
> +
> +=A0=A0=A0 if (part->coll =3D=3D NULL ||
> +=A0=A0=A0 =A0=A0=A0 /*part->coll->strength =3D=3D COLL_ICU_STRENGTH_DEFA=
ULT)*/

Stray hunk, pleaase remove.

> +=A0=A0=A0 =A0=A0=A0 coll_get_strength(part->coll) =3D=3D COLL_ICU_STRENG=
TH_DEFAULT)

If my memory doesn't fail, on our last daily meeting, Kostja mentioned
that using STRENGTH here isn't quite correct...

> +=A0=A0=A0 =A0=A0=A0 return false;
> +=A0=A0=A0 else
> +=A0=A0=A0 =A0=A0=A0 return true;
> +}
> +
> =A0bool
> =A0key_def_contains(const struct key_def *first, const struct key_def *se=
cond)
> =A0{
> @@ -639,7 +664,7 @@ key_def_merge(const struct key_def *first, const stru=
ct
> key_def *second)
> =A0=A0=A0=A0 part =3D second->parts;
> =A0=A0=A0=A0 end =3D part + second->part_count;
> =A0=A0=A0=A0 for (; part !=3D end; part++) {
> -=A0=A0=A0 =A0=A0=A0 if (key_def_find(first, part) !=3D NULL)
> +=A0=A0=A0 =A0=A0=A0 if (!key_def_can_merge(first, part))
> =A0=A0=A0=A0 =A0=A0=A0 =A0=A0=A0 --new_part_count;
> =A0=A0=A0=A0 =A0=A0=A0 else
> =A0=A0=A0=A0 =A0=A0=A0 =A0=A0=A0 sz +=3D part->path_len;
> @@ -677,7 +702,7 @@ key_def_merge(const struct key_def *first, const stru=
ct
> key_def *second)
> =A0=A0=A0=A0 part =3D second->parts;
> =A0=A0=A0=A0 end =3D part + second->part_count;
> =A0=A0=A0=A0 for (; part !=3D end; part++) {
> -=A0=A0=A0 =A0=A0=A0 if (key_def_find(first, part) !=3D NULL)
> +=A0=A0=A0 =A0=A0=A0 if (!key_def_can_merge(first, part))
> =A0=A0=A0=A0 =A0=A0=A0 =A0=A0=A0 continue;
> =A0=A0=A0=A0 =A0=A0=A0 key_def_set_part(new_def, pos++, part->fieldno, pa=
rt->type,
> =A0=A0=A0=A0 =A0=A0=A0 =A0=A0=A0 =A0=A0=A0 =A0part->nullable_action, part=
->coll,
> diff --git a/src/coll.c b/src/coll.c
> index 6d9c44dbf..7d7267d8a 100644
> --- a/src/coll.c
> +++ b/src/coll.c
> @@ -295,6 +295,44 @@ coll_def_snfingerprint(char *buffer, int size, const
> struct coll_def *def)
> =A0=A0=A0=A0 return total;
> =A0}
>=20
> +static enum coll_icu_strength
> +cast_coll_strength(UCollationStrength s)
> +{
> +=A0=A0=A0 enum coll_icu_strength res =3D COLL_ICU_STRENGTH_DEFAULT;
> +
> +=A0=A0=A0 switch(s) {
> +=A0=A0=A0 case UCOL_PRIMARY:
> +=A0=A0=A0 =A0=A0=A0 res =3D COLL_ICU_STRENGTH_PRIMARY;
> +=A0=A0=A0 =A0=A0=A0 break;
> +=A0=A0=A0 case UCOL_SECONDARY:
> +=A0=A0=A0 =A0=A0=A0 res =3D COLL_ICU_STRENGTH_SECONDARY;
> +=A0=A0=A0 =A0=A0=A0 break;
> +=A0=A0=A0 case UCOL_TERTIARY:
> +=A0=A0=A0 =A0=A0=A0 res =3D COLL_ICU_STRENGTH_TERTIARY;
> +=A0=A0=A0 =A0=A0=A0 break;
> +=A0=A0=A0 case UCOL_QUATERNARY:
> +=A0=A0=A0 =A0=A0=A0 res =3D COLL_ICU_STRENGTH_QUATERNARY;
> +=A0=A0=A0 =A0=A0=A0 break;
> +=A0=A0=A0 case UCOL_IDENTICAL:
> +=A0=A0=A0 =A0=A0=A0 res =3D COLL_ICU_STRENGTH_IDENTICAL;
> +=A0=A0=A0 =A0=A0=A0 break;
> +=A0=A0=A0 default:
> +=A0=A0=A0 =A0=A0=A0 break;

Ouch. This function looks cumbersome. I'd really prefer if you rather
factored out the whole collation check into a helper function defined
in coll.c.

> +=A0=A0=A0 }
> +
> +=A0=A0=A0 return res;
> +}
> +
> +/*enum coll_icu_strength*/
> +enum coll_icu_strength
> +coll_get_strength(const struct coll *coll)
> +{
> +=A0=A0=A0 if (coll->collator =3D=3D NULL)
> +=A0=A0=A0 =A0=A0=A0 return COLL_ICU_STRENGTH_DEFAULT;
> +=A0=A0=A0 else
> +=A0=A0=A0 =A0=A0=A0 return cast_coll_strength(ucol_getStrength(coll->col=
lator));
> +}
> +
> =A0struct coll *
> =A0coll_new(const struct coll_def *def)
> =A0{