[tarantool-patches] [PATCH] Feature request for a new collation

Stanislav Zudin szudin at tarantool.org
Fri Feb 22 14:49:39 MSK 2019


Adds a new default collation 'unicode_s2' that distinguishes the
Cyrillic letters 'Е' and 'Ё' while still ignoring letter case. The
standard case-insensitive collation ('unicode_ci') treats these
letters as equal.

Closes #4007
---
Branch: https://github.com/tarantool/tarantool/tree/stanztt/gh-4007-new-default-collation
Issue: https://github.com/tarantool/tarantool/issues/4007

 src/box/bootstrap.snap      | Bin 1527 -> 1561 bytes
 src/box/lua/upgrade.lua     |   6 ++
 test/box/collation.result   | 160 ++++++++++++++++++++++++++++++++++++
 test/box/collation.test.lua |  49 +++++++++++
 4 files changed, 215 insertions(+)
 create mode 100644 test/box/collation.result
 create mode 100644 test/box/collation.test.lua

diff --git a/src/box/bootstrap.snap b/src/box/bootstrap.snap
index ba2af079571bab8f073d56223a4028e34068c3f6..190eb63d57990a95e0d1d8b3b588c02ad98ff1bc 100644
GIT binary patch
delta 1524
zcmV<Q1q=H33z-a%9Dg<~XJ#>GW-wwkFgFTGZgX^DZewLSAT~5JGB;&5WGyl at Vlyo?
zG&eUbIXE*jEj2MVWHn|nWjHf7HVRflY;R+0Iv{&}3JMAe3JMAe)w&D1%>|eM&V=|9
zZ>0bL0000ewJ-euP<<=__Po<a;Mk}DFbosKFvEnw+!zrV%YVYz*tSA4=C!uVU@)Qb
zUu0!Ula`WLm+t*Vn)Fnz*E(8<>$Uw#zBO|*OD$y at m6g|-Ba$hly8yTV#Q?M&o*^@b
zx4u>0*0a2AYt$vc)%unXik+8vgR<}Q3h&%)FS_{b!}h%^ZritRp6w8!AGTr-^~$C*
z9}HPRybIpko_|01pZ#9zSN&%}SH`1#LMinz(uDwgIrM(tY<AeLOAwoD;p!w^`nIqZ
zYb?IXulcs=V9zS4u at 1y>b&^An*tD1-qyL2-&b>xs9f+%wtOaNe at zy`qicqdj@`w6-
z*yh|8?bW%^Ijy+ttjI!T3Kj`hW~#K*xf8qFlA0D2 at j7x>jh04BE?k{t=f8`1xvOkR
zNeW`4t<f%lzG8`Gv7G-hwloTZ;K*o|(9r>|PO^h{EMNS7mI<9;lg|Mle~#quLVnvD
zeJOzRc-*dC8*Pn>2=Lp|p^CP;psM7#ho1ew$Eu!;SNUm1ERBW+aCMU6ZOJwl!`byh
zkS4mg{)`irNn$y~{DNTUStS%C;Dg9k8Ed`+T%9EMB7#DlN}NcXMwmpHLYP35P9+mh
z=>sW-<bkV`<RNwFIJ~+Ie=wyN3~8iclwpK{wJ&SJg%=Ai6<#_KjEI70RBLDgeep!0
zp_h;ZN+WP}k|5E1 at kseD1m{lE`PY0lU8qm2g9&}lum1ciyYZ49B28!w1g=i9e15&$
zHMciP^MI?9?6Vi&9rUb?$~YLY7E>UuPIC1i9<dg07rfr4-7NPWf5=VP^B?kj^{O^v
zE!vWz8o>1LeqzAYNwRpqF#Q1(e9#^;oK2;vN%1)CDCy{OjBrrV4G{Gys|}h%%)Q)I
zwn||gs>aXHj}=OTtCI}P-}*<KyN!mclWeJz!aCFp{bJuGr69w at p!m9naxoqZ^~}TI
zO&4jj!a5U-j9cF=e<^*z)k&sUsaT?GgEKT>cpP;6cpTRaZW3wW{Ky)ioGxp75HHKS
z>FD2m#`pJCNhho`x!~#~XKZT7XsF6KM*r?HF*)Il6G~wnsfCT!u&9}#k)bjxWK^V5
zq)?<zyl53=Vxm$3fhwrMWMrsDMi77i1V9h~5e7vuXGISZe}LdHjDt7|U=Rpo0184I
z1^^&9F%%R*)Pey75~g&!U!_PkN1mudexbuXk$LV!V?fT3C at Ucb7MDC*Bhz|hqzedb
zNA}jRkR!XbB%bl#LLt|#YHu|z;QHl~R2xEL%hjt6V{N5hininV5RZc8<Ln at kir{Pt
zp|>eG>C{?He-*@85$*wJPIJOmr8X!(3OzXx^#)n|42M=|i1tPw8`W7XazNt<b|Ii~
z1iNs+wsFpPZppX<9gQdWuVT^QjSxbx!oqX%pB#%haKqoZWrL0uoje}IcEba-c)9@@
zfIK6 at 3_zX{XsQRd<>)?CSQT>wYXJT5`^*3Z6U^Zle;LyKM at 8|KefuqG>KOc|*5nZa
z+M$MYfU|o{yy)-a2n9n73^woJ{Ntew)m6WQueR}vpjHAN_zGff&1;kymV==X!6A;N
zulC?UAt)pv3#v$STgU$SG8#cc9qDg06A^NMl;@fqt?FQ-s6W&<i~q1T0Lz)~)stTc
z;94%oe;XO&cwpOz>dd6)c<&idIxlfY<2pl5Yw7Z&3BXx#ic0{V1d9sDFMA82Bkb|s
zgkhjLqwQ9x at GwT5K8p)`G0m3@{E|0y2)Xf+&5`1Xh+f5$%%EC7N%RkP1XC6P6RzEc
zTnf at gQfXdkud(eY+i}*a_-2FU`BSPv634iVF3Ld^QhePtqa!xh%|ewD_-##$hD0#p
afA;>!*~QQbXX1#4Lj<_D_`^HZ5UuSwvD(=H

delta 1517
zcmV<J1rqw14EGC=9Dgt^XJR-xWi(+lV+u)bb97;DV`VxZF=a3?WHvK3Ei__fVJ$Re
zVmK{0He@(0H8MCbH)dmGFkxh33RXjGZ)0mZAbWiZ3e~y`y3GZB0M0v-;7g?d00000
zD77#B08phX0Oq^TNZ=N!05Hrj!whqVus$f|h(j78$yOO6N`HVV(u5GnqiM7<rIL`6
zsGsirMM`(>k|{VlgSRsQg{`)!<1)oS+mEf$3Fi#Slv1|<vjD;XM<^)3_mHhJ)_l`;
zcz(<r-uhN~ThH>gt<e<#SL<6oD0W`v4a&aHE4*{Jz3Aez58L;yxNYCMdA0+De%OjV
z)GM3Ld at yA7@P96Nb9?^afA)K=U-h2_T^Wy-38mCWNEcG!>LgwIwy+j!EWXOG`L^j`
z&mO6<4#aVFl0S;rtcW3_|AijTy+&glh^v#V1!xZO)<4#QP_9n$hx&Zj=G+$T)veGu
zrL61_fx_ccBAOx-MWxQ2lVtIJVfuHsB{i)C at p4y<mVZV|3b;DS&VL8-a#z`sk`ly5
zTcaHUeZ>;XVmbe1Y-#l6z>v`%p(6uaon!~^SiboEY!cd1fUA?t_rWl{0X^5Vyz7-*
zU%tznn{i8~fAu`$n)rOrhUD))e%l*mA%OFE+^$_4ZH<Zs at Y~X%inh9-s^qzcp8deb
zs-BEj`G09bERBW)aCMU6ZOJwl!`byhkS4mg{)`irMq)X{{DNTU*{IY+p}0Cp?ga!j
z;w0h};snC<!Q{boDw#?hcuFuN4x|lSog@!QLzm&zm0n1%7Yw5dBMYMntbJK66j&y(
zbSy6-7tu)6Qfr8XzIdF_kc3bKN=00qBq%grJbzEV`@p%=bpAD;%@yhs>s}yy&#(Uc
zE4wk09w16+%>%AZvV4BM+%>m1I^%$=lkBq>-yQU<jmCgRti at CVu1<3GARe(6Zx_7Y
zrrj*}p2tnt^B?kj^{O^vE!vWz7{K)Jep<lQE;^`93|CWUa-?vaZj@|vHAXb3Xa<P-
zlz&wQ%^~Jq?kZcQuntA@^YbHx&Tw^-q4`_?h;z5a;OZn*>ZGs^1w+5scS&7dco-C4
z_fXEogQ1>z7`*8sjaFD^YLRj4yCtP8xH`!bD-}z0ZBT~Bg{R at hj>mD);3km<=SSAq
z<aAk^gLqlqO-KLkGrqsCN;+YkX$4m&DSu-lLq<bY#xVMKhk?lnZ<bIB>qshWv__^T
zW+nxVK at oEzWg=A~MdC%P0f8i_!E9rwL`D#R00ck~01*a7DaeW*5`f at 1jDt7|U=Rpo
z0E*%o1^^&9F%=X+)Pey71#G&fJr9@!MIH75s?cK(BpG&MF?&oPAS)09>+57YV}EpR
z<EUhYRwHX`IFut^q{-4gZ#kB0Z_QUF7XtNqDN!40W6LF}4r709TZ(}3w24P~5pi=M
zmIe|<3Q_S?!0lA4W0iw6i3W*dQ<7k-(-;i)gdPqM^?7oCg+mxMM2Mr05mkRo(Mm)y
zMPyM?L at 8{u=}5sqA*>0Y;q8fY?|&Ja7zXj|u8lf}e at Xis4mV<`5Z44+zMu1O2 at TDf
z;>caHiAY41EK&qhO`ub0Bg=lfS at jbMWc2jfd!)h1CYa-6#H#C$7ZF!>?{suhjo?2`
zlSc*6T18TPZujtg(fdc-3x-%1l)NJ%#`7ywSL8WgPth-eS~$=7+7#WeRDV$pTFz}D
z+(I0qANIj)A&4g-6&;Y~mJYA8Wu$^EJJR21B|<d(D72GuwAH~*qIQknEdHw607z%L
z7puS6;%Ga`^J~VL?KV7f#`Dh^i8COyynLr=c!r$T`uXH);R1Jxbb#aqH=)Q|f(ynY
z%=BMKhBwV at Zn%OfhcWH+U1l8Jt0|piV3)j!t;dZ|mR5>+Bf5)E8gn%>CDE?$4y%lV
zFwK^Nv^i0lN!ndxKa}aX)~fwx#qyR)`Qn7hl7agmJSqM*n0X#G=x1R~353;u(Ck9c
T3OaEP#UVJ^Tm1GN)ex=iFYwYr

diff --git a/src/box/lua/upgrade.lua b/src/box/lua/upgrade.lua
index ab705e978..a28b93ada 100644
--- a/src/box/lua/upgrade.lua
+++ b/src/box/lua/upgrade.lua
@@ -998,9 +998,15 @@ local function create_vinyl_deferred_delete_space()
                   'blackhole', 0, {group_id = 1}, format}
 end
 
+local function create_default_collation_s2() -- gh-4007: adds collation id 3 'unicode_s2' (ICU, ru_RU, secondary strength)
+    log.info("create predefined collation")
+    box.space._collation:replace{3, "unicode_s2", ADMIN, "ICU", "ru_RU", {strength='secondary'}}
+end
+
 local function upgrade_to_1_10_2()
     upgrade_priv_to_1_10_2()
     create_vinyl_deferred_delete_space()
+    create_default_collation_s2()
 end
 
 local function get_version()
diff --git a/test/box/collation.result b/test/box/collation.result
new file mode 100644
index 000000000..2dbb43c31
--- /dev/null
+++ b/test/box/collation.result
@@ -0,0 +1,160 @@
+env = require('test_run')
+---
+...
+test_run = env.new()
+---
+...
+--
+-- gh-4007 Feature request for a new collation
+--
+-- Ensure all default collations exist
+box.space._collation.index.name:get{'unicode'};
+---
+- [1, 'unicode', 1, 'ICU', '', {}]
+...
+box.space._collation.index.name:get{'unicode_ci'};
+---
+- [2, 'unicode_ci', 1, 'ICU', '', {'strength': 'primary'}]
+...
+box.space._collation.index.name:get{'unicode_s2'};
+---
+- [3, 'unicode_s2', 1, 'ICU', 'ru_RU', {'strength': 'secondary'}]
+...
+-- Default unicode collation deals with russian letters
+s = box.schema.space.create('t1');
+---
+...
+s:format({{name='s1', type='string', collation = 'unicode'}});
+---
+...
+s:create_index('pk', {unique = true, type='tree', parts={{'s1', collation = 'unicode'}}});
+---
+- unique: true
+  parts:
+  - type: string
+    is_nullable: false
+    collation: unicode
+    fieldno: 1
+  id: 0
+  space_id: 512
+  name: pk
+  type: TREE
+...
+s:insert{'Ё'};
+---
+- ['Ё']
+...
+s:insert{'Е'};
+---
+- ['Е']
+...
+s:insert{'ё'};
+---
+- ['ё']
+...
+s:insert{'е'};
+---
+- ['е']
+...
+-- all 4 letters are in the table
+s:select{};
+---
+- - ['е']
+  - ['Е']
+  - ['ё']
+  - ['Ё']
+...
+s:drop();
+---
+...
+-- unicode_ci collation doesn't distinguish russian letters 'Е' and 'Ё'
+s = box.schema.space.create('t1');
+---
+...
+s:format({{name='s1', type='string', collation = 'unicode_ci'}});
+---
+...
+s:create_index('pk', {unique = true, type='tree', parts={{'s1', collation = 'unicode_ci'}}});
+---
+- unique: true
+  parts:
+  - type: string
+    is_nullable: false
+    collation: unicode_ci
+    fieldno: 1
+  id: 0
+  space_id: 513
+  name: pk
+  type: TREE
+...
+s:insert{'Ё'};
+---
+- ['Ё']
+...
+-- the following calls should fail
+s:insert{'е'};
+---
+- error: Duplicate key exists in unique index 'pk' in space 't1'
+...
+s:insert{'Е'};
+---
+- error: Duplicate key exists in unique index 'pk' in space 't1'
+...
+s:insert{'ё'};
+---
+- error: Duplicate key exists in unique index 'pk' in space 't1'
+...
+-- return single 'Ё'
+s:select{};
+---
+- - ['Ё']
+...
+s:drop();
+---
+...
+-- unicode_s2 collation does distinguish russian letters 'Е' and 'Ё'
+s = box.schema.space.create('t1');
+---
+...
+s:format({{name='s1', type='string', collation = 'unicode_s2'}});
+---
+...
+s:create_index('pk', {unique = true, type='tree', parts={{'s1', collation = 'unicode_s2'}}});
+---
+- unique: true
+  parts:
+  - type: string
+    is_nullable: false
+    collation: unicode_s2
+    fieldno: 1
+  id: 0
+  space_id: 514
+  name: pk
+  type: TREE
+...
+s:insert{'Ё'};
+---
+- ['Ё']
+...
+s:insert{'е'};
+---
+- ['е']
+...
+-- the following calls should fail
+s:insert{'Е'};
+---
+- error: Duplicate key exists in unique index 'pk' in space 't1'
+...
+s:insert{'ё'};
+---
+- error: Duplicate key exists in unique index 'pk' in space 't1'
+...
+-- return two: 'Ё' and 'е'
+s:select{};
+---
+- - ['е']
+  - ['Ё']
+...
+s:drop();
+---
+...
diff --git a/test/box/collation.test.lua b/test/box/collation.test.lua
new file mode 100644
index 000000000..4cd24e64c
--- /dev/null
+++ b/test/box/collation.test.lua
@@ -0,0 +1,49 @@
+env = require('test_run')
+test_run = env.new()
+
+--
+-- gh-4007 Feature request for a new collation
+--
+-- Ensure all default collations exist
+box.space._collation.index.name:get{'unicode'};
+box.space._collation.index.name:get{'unicode_ci'};
+box.space._collation.index.name:get{'unicode_s2'};
+
+-- Default unicode collation deals with russian letters
+s = box.schema.space.create('t1');
+s:format({{name='s1', type='string', collation = 'unicode'}});
+s:create_index('pk', {unique = true, type='tree', parts={{'s1', collation = 'unicode'}}});
+s:insert{'Ё'};
+s:insert{'Е'};
+s:insert{'ё'};
+s:insert{'е'};
+-- all 4 letters are in the table
+s:select{};
+s:drop();
+
+-- unicode_ci collation doesn't distinguish russian letters 'Е' and 'Ё'
+s = box.schema.space.create('t1');
+s:format({{name='s1', type='string', collation = 'unicode_ci'}});
+s:create_index('pk', {unique = true, type='tree', parts={{'s1', collation = 'unicode_ci'}}});
+s:insert{'Ё'};
+-- the following calls should fail
+s:insert{'е'};
+s:insert{'Е'};
+s:insert{'ё'};
+-- return single 'Ё'
+s:select{};
+s:drop();
+
+-- unicode_s2 collation does distinguish russian letters 'Е' and 'Ё'
+s = box.schema.space.create('t1');
+s:format({{name='s1', type='string', collation = 'unicode_s2'}});
+s:create_index('pk', {unique = true, type='tree', parts={{'s1', collation = 'unicode_s2'}}});
+s:insert{'Ё'};
+s:insert{'е'};
+-- the following calls should fail
+s:insert{'Е'};
+s:insert{'ё'};
+-- return two: 'Ё' and 'е'
+s:select{};
+s:drop();
+
-- 
2.17.1





More information about the Tarantool-patches mailing list