[Tarantool-patches] [PATCH] Trigger on vclock change

Tarantool development patches archive
 help / color / mirror / Atom feed

* [Tarantool-patches] [PATCH] Trigger on vclock change
@ 2019-11-14 12:57 Maria
  2019-11-14 13:44 ` Konstantin Osipov
  0 siblings, 1 reply; 21+ messages in thread
From: Maria @ 2019-11-14 12:57 UTC (permalink / raw)
  To: tarantool-patches, georgy

This patch implements replication.on_vclock
trigger that can be  useful for programming
shard-systems with redundancy.

Closes #3808
Issue:
https://github.com/tarantool/tarantool/issues/3808
Branch:
https://github.com/tarantool/tarantool/tree/eljashm/gh-3808-wait-for-lsn
---
 src/box/lua/info.c             | 24 ++++++++++
 src/box/relay.cc               |  1 +
 src/box/replication.cc         |  1 +
 src/box/replication.h          |  4 ++
 test/replication/misc.result   | 87 ++++++++++++++++++++++++++++++++++
 test/replication/misc.test.lua | 35 ++++++++++++++
 6 files changed, 152 insertions(+)

diff --git a/src/box/lua/info.c b/src/box/lua/info.c
index 55382fd77..607c3084e 100644
--- a/src/box/lua/info.c
+++ b/src/box/lua/info.c
@@ -38,6 +38,7 @@
 
 #include "box/applier.h"
 #include "box/relay.h"
+#include "lua/trigger.h"
 #include "box/iproto.h"
 #include "box/wal.h"
 #include "box/replication.h"
@@ -146,6 +147,25 @@ lbox_pushrelay(lua_State *L, struct relay *relay)
 	}
 }
 
+static int
+lbox_push_replica_vclock(struct lua_State *L, void *event)
+{
+	struct replica *replica = (struct replica *) event;
+	lbox_pushvclock(L, relay_vclock(replica->relay));
+	lua_pushinteger(L, replica->id);
+	return 2;
+}
+
+/**
+ * Set/Reset/Get replication.on_vclock trigger
+ */
+static int
+lbox_replication_on_vclock(struct lua_State *L)
+{
+	return lbox_trigger_reset(L, 2, &replicaset.on_vclock,
+      lbox_push_replica_vclock, NULL);
+}
+
 static void
 lbox_pushreplica(lua_State *L, struct replica *replica)
 {
@@ -191,6 +211,10 @@ lbox_info_replication(struct lua_State *L)
 	lua_setfield(L, -2, "__serialize");
 	lua_setmetatable(L, -2);
 
+	lua_pushstring(L, "on_vclock");
+	lua_pushcfunction(L, lbox_replication_on_vclock);
+	lua_settable(L, -3);
+
 	replicaset_foreach(replica) {
 		/* Applier hasn't received replica id yet */
 		if (replica->id == REPLICA_ID_NIL)
diff --git a/src/box/relay.cc b/src/box/relay.cc
index 74588cba7..9484b4ea5 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -393,6 +393,7 @@ tx_status_update(struct cmsg *msg)
 {
 	struct relay_status_msg *status = (struct relay_status_msg *)msg;
 	vclock_copy(&status->relay->tx.vclock, &status->vclock);
+	trigger_run(&replicaset.on_vclock, status->relay->replica);
 	static const struct cmsg_hop route[] = {
 		{relay_status_update, NULL}
 	};
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 6fcc56fe3..e743ec972 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -95,6 +95,7 @@ replication_init(void)
 	vclock_copy(&replicaset.applier.vclock, &replicaset.vclock);
 	rlist_create(&replicaset.applier.on_rollback);
 	rlist_create(&replicaset.applier.on_commit);
+	rlist_create(&replicaset.on_vclock);
 
 	diag_create(&replicaset.applier.diag);
 }
diff --git a/src/box/replication.h b/src/box/replication.h
index 19f283c7d..c916a5711 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -187,6 +187,10 @@ struct replicaset {
 	 * to connect and those that failed to connect.
 	 */
 	struct rlist anon;
+	/**
+	 * List of triggers invoked on lsn changed.
+	 */
+	struct rlist on_vclock;
 	/**
 	 * TX thread local vclock reflecting the state
 	 * of the cluster as maintained by appliers.
diff --git a/test/replication/misc.result b/test/replication/misc.result
index ae72ce3e4..6066ff905 100644
--- a/test/replication/misc.result
+++ b/test/replication/misc.result
@@ -827,3 +827,90 @@ box.cfg{replication_connect_timeout=replication_connect_timeout}
 box.cfg{replication_connect_quorum=replication_connect_quorum}
 ---
 ...
+--
+-- gh-3808 Trigger on_vclock_changed added
+--
+engine = test_run:get_cfg('engine')
+---
+...
+test_run:cleanup_cluster()
+---
+...
+box.schema.user.grant('guest', 'replication')
+---
+...
+_ = box.schema.space.create('test', {engine = engine})
+---
+...
+_ = box.space.test:create_index('pk')
+---
+...
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
+---
+- true
+...
+test_run:cmd("start server replica")
+---
+- true
+...
+replication_events = {}
+---
+...
+-- trigger setup
+trigger = box.info.replication.on_vclock(function(vclock, replica_id) replication_events[replica_id] = vclock end)
+---
+...
+for i = 0, 99 do box.space["test"]:insert({i}) end
+---
+...
+-- wait until replica catches up the local one
+test_run:wait_cond(function() return box.info.replication[2].downstream.vclock ~= box.info.vclock end)
+---
+- true
+...
+-- check that the trigger caught the last transaction
+replication_events[box.info.replication[2].id][box.info.id] == box.info.lsn
+---
+- true
+...
+-- trigger reset
+_ = box.info.replication.on_vclock(nil, trigger)
+---
+...
+for i = 100, 199 do box.space["test"]:insert({i}) end
+---
+...
+-- wait until replica catches up the local one
+test_run:wait_cond(function() return box.info.replication[2].downstream.vclock ~= box.info.vclock end)
+---
+- true
+...
+-- check that the trigger didn't catch the last transaction
+replication_events[box.info.replication[2].id][box.info.id] < box.info.lsn
+---
+- true
+...
+replication_events = nil
+---
+...
+test_run:cmd("stop server replica")
+---
+- true
+...
+test_run:cmd("cleanup server replica")
+---
+- true
+...
+test_run:cmd("delete server replica")
+---
+- true
+...
+box.space.test:drop()
+---
+...
+test_run:cleanup_cluster()
+---
+...
+box.schema.user.revoke('guest', 'replication')
+---
+...
diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua
index 16e7e9e42..b8f9c1e9e 100644
--- a/test/replication/misc.test.lua
+++ b/test/replication/misc.test.lua
@@ -332,3 +332,38 @@ box.cfg{replication=""}
 
 box.cfg{replication_connect_timeout=replication_connect_timeout}
 box.cfg{replication_connect_quorum=replication_connect_quorum}
+
+--
+-- gh-3808 Trigger on_vclock_changed added
+--
+engine = test_run:get_cfg('engine')
+test_run:cleanup_cluster()
+box.schema.user.grant('guest', 'replication')
+_ = box.schema.space.create('test', {engine = engine})
+_ = box.space.test:create_index('pk')
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
+test_run:cmd("start server replica")
+replication_events = {}
+-- trigger setup
+trigger = box.info.replication.on_vclock(function(vclock, replica_id) replication_events[replica_id] = vclock end)
+for i = 0, 99 do box.space["test"]:insert({i}) end
+-- wait until replica catches up the local one
+test_run:wait_cond(function() return box.info.replication[2].downstream.vclock ~= box.info.vclock end)
+-- check that the trigger caught the last transaction
+replication_events[box.info.replication[2].id][box.info.id] == box.info.lsn
+-- trigger reset
+_ = box.info.replication.on_vclock(nil, trigger)
+for i = 100, 199 do box.space["test"]:insert({i}) end
+-- wait until replica catches up the local one
+test_run:wait_cond(function() return box.info.replication[2].downstream.vclock ~= box.info.vclock end)
+-- check that the trigger didn't catch the last transaction
+replication_events[box.info.replication[2].id][box.info.id] < box.info.lsn
+
+replication_events = nil
+test_run:cmd("stop server replica")
+test_run:cmd("cleanup server replica")
+test_run:cmd("delete server replica")
+box.space.test:drop()
+
+test_run:cleanup_cluster()
+box.schema.user.revoke('guest', 'replication')
-- 
2.21.0 (Apple Git-122.2)

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-14 12:57 [Tarantool-patches] [PATCH] Trigger on vclock change Maria
@ 2019-11-14 13:44 ` Konstantin Osipov
  2019-11-14 14:06   ` Georgy Kirichenko
  0 siblings, 1 reply; 21+ messages in thread
From: Konstantin Osipov @ 2019-11-14 13:44 UTC (permalink / raw)
  To: Maria; +Cc: tarantool-patches

* Maria <maria.khaydich@tarantool.org> [19/11/14 15:59]:
> This patch implements replication.on_vclock
> trigger that can be  useful for programming
> shard-systems with redundancy.

3808 is about being able to wait for an lsn.

Using a trigger for *waiting* is called busy waiting, and is a cpu
hog, especially at a performance critical space like update of
replica vclock, which can happen a hundred times a second.

Why not implement a way to wait for an lsn instead?

-- 
Konstantin Osipov, Moscow, Russia

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-14 13:44 ` Konstantin Osipov
@ 2019-11-14 14:06   ` Georgy Kirichenko
  2019-11-14 15:26     ` Konstantin Osipov
  0 siblings, 1 reply; 21+ messages in thread
From: Georgy Kirichenko @ 2019-11-14 14:06 UTC (permalink / raw)
  To: Konstantin Osipov; +Cc: tarantool-patches

On Thursday, November 14, 2019 4:44:22 PM MSK Konstantin Osipov wrote:
> * Maria <maria.khaydich@tarantool.org> [19/11/14 15:59]:
> > This patch implements replication.on_vclock
> > trigger that can be  useful for programming
> > shard-systems with redundancy.
> 
> 3808 is about being able to wait for an lsn.
> 
> Using a trigger for *waiting* is called busy waiting, and is a cpu
> hog, especially at a performance critical space like update of
> replica vclock, which can happen a hundred times a second.
> 
> Why not implement a way to wait for an lsn instead?
Please explain your proposal in a more detailed way.
Do you wish to implement a hard-coded `handler` and each time when a replica 
vclock is updated this handler will compare the updated vclock against members 
of set of replica_id:lsn pairs organized in a list, tree or something else? 
And if a compare matches to true then a corresponding handler will be called?

Anyway, we will need to have such trigger in order to make applier able to 
report local replica wal and commited vclock in scope of synchronous 
replication issue.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-14 14:06   ` Georgy Kirichenko
@ 2019-11-14 15:26     ` Konstantin Osipov
  2019-11-14 17:13       ` Georgy Kirichenko
  0 siblings, 1 reply; 21+ messages in thread
From: Konstantin Osipov @ 2019-11-14 15:26 UTC (permalink / raw)
  To: Georgy Kirichenko; +Cc: tarantool-patches

* Georgy Kirichenko <kirichenkoga@gmail.com> [19/11/14 17:11]:
> On Thursday, November 14, 2019 4:44:22 PM MSK Konstantin Osipov wrote:
> > * Maria <maria.khaydich@tarantool.org> [19/11/14 15:59]:
> > > This patch implements replication.on_vclock
> > > trigger that can be  useful for programming
> > > shard-systems with redundancy.
> > 
> > 3808 is about being able to wait for an lsn.
> > 
> > Using a trigger for *waiting* is called busy waiting, and is a cpu
> > hog, especially at a performance critical space like update of
> > replica vclock, which can happen a hundred times a second.
> > 
> > Why not implement a way to wait for an lsn instead?
> Please explain your proposal in a more detailed way.
> Do you wish to implement a hard-coded `handler` and each time when a replica 
> vclock is updated this handler will compare the updated vclock against members 
> of set of replica_id:lsn pairs organized in a list, tree or something else? 
> And if a compare matches to true then a corresponding handler will be called?

Yes, quite simply wait_lsn() could add the server_id, lsn that is
being waited for to a sorted list, and whenever we update
replicaset vclock for this lsn we also look at top of the list, if
it is not empty, and if the current lsn is greater than the top,
we could pop the value from the list and send a notification to
the waiter.

I also think it it's a pair of server_id, lsn, rather than entire
vclock - usually you know what you're waiting for, and it's only
one component of vclock, not all of them.

Going forward I think one is better off using synchronous
replication, not wait-lsn, since wait_lsn doesn't roll back the
transaction on failure.

Why did you decide to do this ticket at all?

> Anyway, we will need to have such trigger in order to make applier able to 
> report local replica wal and commited vclock in scope of synchronous 
> replication issue.

This has to happen in WAL thread, not in main thread, and has to
watch relay-from-memory vclock, not async-replication vclock. And
it also needs to roll back the transaction locally on failure,
i.e. write some sort of undo records to the WAL.

-- 
Konstantin Osipov, Moscow, Russia

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-14 15:26     ` Konstantin Osipov
@ 2019-11-14 17:13       ` Georgy Kirichenko
  2019-11-14 17:33         ` Konstantin Osipov
  0 siblings, 1 reply; 21+ messages in thread
From: Georgy Kirichenko @ 2019-11-14 17:13 UTC (permalink / raw)
  To: tarantool-patches

[-- Attachment #1: Type: text/plain, Size: 3341 bytes --]

On Thursday, November 14, 2019 6:26:56 PM MSK Konstantin Osipov wrote:
> * Georgy Kirichenko <kirichenkoga@gmail.com> [19/11/14 17:11]:
> > On Thursday, November 14, 2019 4:44:22 PM MSK Konstantin Osipov wrote:
> > > * Maria <maria.khaydich@tarantool.org> [19/11/14 15:59]:
> > > > This patch implements replication.on_vclock
> > > > trigger that can be  useful for programming
> > > > shard-systems with redundancy.
> > > 
> > > 3808 is about being able to wait for an lsn.
> > > 
> > > Using a trigger for *waiting* is called busy waiting, and is a cpu
> > > hog, especially at a performance critical space like update of
> > > replica vclock, which can happen a hundred times a second.
> > > 
> > > Why not implement a way to wait for an lsn instead?
> > 
> > Please explain your proposal in a more detailed way.
> > Do you wish to implement a hard-coded `handler` and each time when a
> > replica vclock is updated this handler will compare the updated vclock
> > against members of set of replica_id:lsn pairs organized in a list, tree
> > or something else? And if a compare matches to true then a corresponding
> > handler will be called?
> Yes, quite simply wait_lsn() could add the server_id, lsn that is
> being waited for to a sorted list, and whenever we update
> replicaset vclock for this lsn we also look at top of the list, if
> it is not empty, and if the current lsn is greater than the top,
> we could pop the value from the list and send a notification to
> the waiter.
> 
> I also think it it's a pair of server_id, lsn, rather than entire
> vclock - usually you know what you're waiting for, and it's only
> one component of vclock, not all of them.
But there are some issues
1. what if we wish to have a timeout
2. what if lsn are waited in non-strictly increasing order
3. what if awaiting fiber is canceled
The approach you suggested looks for me like a bike-shed trigger 
implementation but the implementation is limited to use only for wait for lsn. 
So I would like to propose to ask Alexander Tikhonov to provide us with a 
benchmark result first and then make a conclusion about performance impact.

> 
> Going forward I think one is better off using synchronous
> replication, not wait-lsn, since wait_lsn doesn't roll back the
> transaction on failure.
> 
> Why did you decide to do this ticket at all?
Because some applications (like sls) do not require for any transaction to be 
replicated synchronously. Also this allows to be more-flexible.
> 
> > Anyway, we will need to have such trigger in order to make applier able to
> > report local replica wal and commited vclock in scope of synchronous
> > replication issue.
> 
> This has to happen in WAL thread, not in main thread, and has to
> watch relay-from-memory vclock, not async-replication vclock. And
> it also needs to roll back the transaction locally on failure,
> i.e. write some sort of undo records to the WAL.
This will work in an applier which lives in the TX cord, as an applier 
processes incoming transactions through the TX. And an applier should be able 
to answer with two vclocks - committed and written ones. Yes, WAL will batch 
such vclocks updates but this is still of hundreds of events per second. 
Unfortunately there is no point to move an applier to the WAL thread because a 
transaction could not be validated without TX.


[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-14 17:13       ` Georgy Kirichenko
@ 2019-11-14 17:33         ` Konstantin Osipov
  2019-11-14 19:16           ` Georgy Kirichenko
  0 siblings, 1 reply; 21+ messages in thread
From: Konstantin Osipov @ 2019-11-14 17:33 UTC (permalink / raw)
  To: Georgy Kirichenko; +Cc: tarantool-patches

* Georgy Kirichenko <georgy@tarantool.org> [19/11/14 20:14]:
> > I also think it it's a pair of server_id, lsn, rather than entire
> > vclock - usually you know what you're waiting for, and it's only
> > one component of vclock, not all of them.
> But there are some issues
> 1. what if we wish to have a timeout
> 2. what if lsn are waited in non-strictly increasing order
> 3. what if awaiting fiber is canceled
> The approach you suggested looks for me like a bike-shed trigger 
> implementation but the implementation is limited to use only for wait for lsn. 
> So I would like to propose to ask Alexander Tikhonov to provide us with a 
> benchmark result first and then make a conclusion about performance impact.

Maybe you're right. But isn't the entire idea of wait_lsn()
bike-shed, as you put it, because we don't have sync replication?

> > > Anyway, we will need to have such trigger in order to make applier able to
> > > report local replica wal and commited vclock in scope of synchronous
> > > replication issue.
> > 
> > This has to happen in WAL thread, not in main thread, and has to
> > watch relay-from-memory vclock, not async-replication vclock. And
> > it also needs to roll back the transaction locally on failure,
> > i.e. write some sort of undo records to the WAL.
> This will work in an applier which lives in the TX cord, as an applier 
> processes incoming transactions through the TX. And an applier should be able 
> to answer with two vclocks - committed and written ones. Yes, WAL will batch 
> such vclocks updates but this is still of hundreds of events per second. 
> Unfortunately there is no point to move an applier to the WAL thread because a 
> transaction could not be validated without TX.

OK, now I get it where you're heading. I think sending acks from
tx thread has the following disadvantages:
- we mix up "committed" event and "written to the commit log"
  event. They become indistinguishable in tx thread. Per RAFT, we
  should send back acks as soon as we write to the local commit
  log, and when the leader gets enough 'acks' from enough commit
  logs it sends another message which makes the local transaction
  commit. If you 'ack' when you commit the local transaction, how
  would you be able to roll it back on leader change or majority 
  failure?

  So the event you need to be acknowledging is not the event this 
  trigger in question is capturing. 

- the second issue is latency. tx/wal scheduling delay can be in
  hundreds of microseconds, and this is close to  networking
  delays on fast networks within the same rack/data center.
  So it acknowledging commit log writes from WAL thread will
  also speed up the leader quite a bit, since the round trip
  will be shorter.

To sum up, I still think you should not use this trigger to
acknowledge commit log writes. Better have a separate socket for
this altogether, or move the write end of the existing socket to
the wal, while keeping the read end where it is now, in
tx/applier.

-- 
Konstantin Osipov, Moscow, Russia

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-14 17:33         ` Konstantin Osipov
@ 2019-11-14 19:16           ` Georgy Kirichenko
  2019-11-14 19:48             ` Konstantin Osipov
  0 siblings, 1 reply; 21+ messages in thread
From: Georgy Kirichenko @ 2019-11-14 19:16 UTC (permalink / raw)
  To: Konstantin Osipov; +Cc: tarantool-patches

On Thursday, November 14, 2019 8:33:38 PM MSK Konstantin Osipov wrote:
> * Georgy Kirichenko <georgy@tarantool.org> [19/11/14 20:14]:
> > > I also think it it's a pair of server_id, lsn, rather than entire
> > > vclock - usually you know what you're waiting for, and it's only
> > > one component of vclock, not all of them.
> > 
> > But there are some issues
> > 1. what if we wish to have a timeout
> > 2. what if lsn are waited in non-strictly increasing order
> > 3. what if awaiting fiber is canceled
> > The approach you suggested looks for me like a bike-shed trigger
> > implementation but the implementation is limited to use only for wait for
> > lsn. So I would like to propose to ask Alexander Tikhonov to provide us
> > with a benchmark result first and then make a conclusion about
> > performance impact.
> Maybe you're right. But isn't the entire idea of wait_lsn()
> bike-shed, as you put it, because we don't have sync replication?
> 
> > > > Anyway, we will need to have such trigger in order to make applier
> > > > able to
> > > > report local replica wal and commited vclock in scope of synchronous
> > > > replication issue.
> > > 
> > > This has to happen in WAL thread, not in main thread, and has to
> > > watch relay-from-memory vclock, not async-replication vclock. And
> > > it also needs to roll back the transaction locally on failure,
> > > i.e. write some sort of undo records to the WAL.
> > 
> > This will work in an applier which lives in the TX cord, as an applier
> > processes incoming transactions through the TX. And an applier should be
> > able to answer with two vclocks - committed and written ones. Yes, WAL
> > will batch such vclocks updates but this is still of hundreds of events
> > per second. Unfortunately there is no point to move an applier to the WAL
> > thread because a transaction could not be validated without TX.
> 
> OK, now I get it where you're heading. I think sending acks from
> tx thread has the following disadvantages:
> - we mix up "committed" event and "written to the commit log"
>   event. They become indistinguishable in tx thread. Per RAFT, we
>   should send back acks as soon as we write to the local commit
>   log, and when the leader gets enough 'acks' from enough commit
>   logs it sends another message which makes the local transaction
>   commit. If you 'ack' when you commit the local transaction, how
>   would you be able to roll it back on leader change or majority
>   failure?
> 
>   So the event you need to be acknowledging is not the event this
>   trigger in question is capturing.
Sorry, I think you have outdated information about synchronous replication 
design. At this moment we do not implement the RAFT protocol (I did some 
attempts to discuss it some month before you but you ignored them all). So let 
me give some technical details.
A replica state is described by 2 vclocks - written and committed ones. Right 
now it is not an issue to report them both as an applier submits transaction 
asynchronously. In addition to these two vclocks (yes, the both could be 
transferred from the WAL thread) applier will report a reject vclock - the 
vclock where applying breaks, and this could be done from TX. I do not like 
the idea to split transmission between 2 threads. The write and reject vclocks 
are used to evaluate majority whereas commit vclock instructs a whole cluster 
that majority was already reached. The main point is that any replica member 
could commit a transaction - this relaxes RAFT limitations and increases the 
whole cluster durability (and it is simpler in design and implementation, 
really). Also the new synchronous replication design has a lot of advantages 
in comparison with RAFT but let us discuss it in another thread. If you 
interested please ask for details as I have not enough time to write public 
document right now.
Returning to the subject, I would like to conclude that wal on_commit and 
on_write triggers are good source to initiate status transmission. And the 
trigger implemented by Maria will be replaced by replica on_commit which 
allows us not to change anything at higher levels.

> 
> - the second issue is latency. tx/wal scheduling delay can be in
>   hundreds of microseconds, and this is close to  networking
>   delays on fast networks within the same rack/data center.
>   So it acknowledging commit log writes from WAL thread will
>   also speed up the leader quite a bit, since the round trip
>   will be shorter.
> 
> To sum up, I still think you should not use this trigger to
> acknowledge commit log writes. Better have a separate socket for
> this altogether, or move the write end of the existing socket to
> the wal, while keeping the read end where it is now, in
> tx/applier.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-14 19:16           ` Georgy Kirichenko
@ 2019-11-14 19:48             ` Konstantin Osipov
  2019-11-14 20:01               ` Georgy Kirichenko
  0 siblings, 1 reply; 21+ messages in thread
From: Konstantin Osipov @ 2019-11-14 19:48 UTC (permalink / raw)
  To: Georgy Kirichenko; +Cc: tarantool-patches

* Georgy Kirichenko <georgy@tarantool.org> [19/11/14 22:42]:
> A replica state is described by 2 vclocks - written and committed ones. Right 
> now it is not an issue to report them both as an applier submits transaction 
> asynchronously. In addition to these two vclocks (yes, the both could be 
> transferred from the WAL thread) applier will report a reject vclock - the 
> vclock where applying breaks, and this could be done from TX. I do not like 
> the idea to split transmission between 2 threads. The write and reject vclocks 
> are used to evaluate majority whereas commit vclock instructs a whole cluster 
> that majority was already reached. The main point is that any replica member 
> could commit a transaction - this relaxes RAFT limitations and increases the 
> whole cluster durability (and it is simpler in design and implementation, 
> really). Also the new synchronous replication design has a lot of advantages 
> in comparison with RAFT but let us discuss it in another thread. If you 
> interested please ask for details as I have not enough time to write public 
> document right now.
> Returning to the subject, I would like to conclude that wal on_commit and 
> on_write triggers are good source to initiate status transmission. And the 
> trigger implemented by Maria will be replaced by replica on_commit which 
> allows us not to change anything at higher levels.

Congratulations, Georgy, maybe you even get a Turing award for
inventing a new protocol.

Wait... they don't give a Turing award for "protocols" which have
no proof and yield inconsistent results, or do they?

Meanwhile, if you have a design in mind, you could send an RFC. I
will respond to the RFC.

PS What a shame...

-- 
Konstantin Osipov, Moscow, Russia

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-14 19:48             ` Konstantin Osipov
@ 2019-11-14 20:01               ` Georgy Kirichenko
  2019-11-15  1:57                 ` Konstantin Osipov
  0 siblings, 1 reply; 21+ messages in thread
From: Georgy Kirichenko @ 2019-11-14 20:01 UTC (permalink / raw)
  To: Konstantin Osipov; +Cc: tarantool-patches

On Thursday, November 14, 2019 10:48:06 PM MSK Konstantin Osipov wrote:
> * Georgy Kirichenko <georgy@tarantool.org> [19/11/14 22:42]:
> > A replica state is described by 2 vclocks - written and committed ones.
> > Right now it is not an issue to report them both as an applier submits
> > transaction asynchronously. In addition to these two vclocks (yes, the
> > both could be transferred from the WAL thread) applier will report a
> > reject vclock - the vclock where applying breaks, and this could be done
> > from TX. I do not like the idea to split transmission between 2 threads.
> > The write and reject vclocks are used to evaluate majority whereas commit
> > vclock instructs a whole cluster that majority was already reached. The
> > main point is that any replica member could commit a transaction - this
> > relaxes RAFT limitations and increases the whole cluster durability (and
> > it is simpler in design and implementation, really). Also the new
> > synchronous replication design has a lot of advantages in comparison with
> > RAFT but let us discuss it in another thread. If you interested please
> > ask for details as I have not enough time to write public document right
> > now.
> > Returning to the subject, I would like to conclude that wal on_commit and
> > on_write triggers are good source to initiate status transmission. And the
> > trigger implemented by Maria will be replaced by replica on_commit which
> > allows us not to change anything at higher levels.
> 
> Congratulations, Georgy, maybe you even get a Turing award for
> inventing a new protocol.
> 
> Wait... they don't give a Turing award for "protocols" which have
> no proof and yield inconsistent results, or do they?
You do not even know details of the protocol but make such suggestion, so I 
could only repeat your last statement: "what a shame", seriously.
Please, remember all my attempts to discuss it with you or, for instance, our 
one-per-2-week meetings which all (except the first one) were skipped by you.
> 
> Meanwhile, if you have a design in mind, you could send an RFC. I
> will respond to the RFC.
Anybody could see the design document after this protocol research will be 
done. Yes, the research requires to be implemented first.
> 
> PS What a shame...

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-14 20:01               ` Georgy Kirichenko
@ 2019-11-15  1:57                 ` Konstantin Osipov
  2019-11-15  6:02                   ` Georgy Kirichenko
  0 siblings, 1 reply; 21+ messages in thread
From: Konstantin Osipov @ 2019-11-15  1:57 UTC (permalink / raw)
  To: Georgy Kirichenko; +Cc: tarantool-patches

* Georgy Kirichenko <georgy@tarantool.org> [19/11/15 04:33]:
> On Thursday, November 14, 2019 10:48:06 PM MSK Konstantin Osipov wrote:
> > * Georgy Kirichenko <georgy@tarantool.org> [19/11/14 22:42]:
> > > A replica state is described by 2 vclocks - written and committed ones.
> > > Right now it is not an issue to report them both as an applier submits
> > > transaction asynchronously. In addition to these two vclocks (yes, the
> > > both could be transferred from the WAL thread) applier will report a
> > > reject vclock - the vclock where applying breaks, and this could be done
> > > from TX. I do not like the idea to split transmission between 2 threads.
> > > The write and reject vclocks are used to evaluate majority whereas commit
> > > vclock instructs a whole cluster that majority was already reached. The
> > > main point is that any replica member could commit a transaction - this
> > > relaxes RAFT limitations and increases the whole cluster durability (and
> > > it is simpler in design and implementation, really). Also the new
> > > synchronous replication design has a lot of advantages in comparison with
> > > RAFT but let us discuss it in another thread. If you interested please
> > > ask for details as I have not enough time to write public document right
> > > now.
> > > Returning to the subject, I would like to conclude that wal on_commit and
> > > on_write triggers are good source to initiate status transmission. And the
> > > trigger implemented by Maria will be replaced by replica on_commit which
> > > allows us not to change anything at higher levels.
> > 
> > Congratulations, Georgy, maybe you even get a Turing award for
> > inventing a new protocol.
> > 
> > Wait... they don't give a Turing award for "protocols" which have
> > no proof and yield inconsistent results, or do they?
> You do not even know details of the protocol but make such suggestion, so I 
> could only repeat your last statement: "what a shame", seriously.
> Please, remember all my attempts to discuss it with you or, for instance, our 
> one-per-2-week meetings which all (except the first one) were skipped by you.

If you want to discuss anything with me, feel free to reach out.

I am following the process as I believe it should work in a
distributed open source project: before there is a change, there
is a design document on which everyone can equally comment.

> > Meanwhile, if you have a design in mind, you could send an RFC. I
> > will respond to the RFC.
> Anybody could see the design document after this protocol research will be 
> done. Yes, the research requires to be implemented first.

You don't need to waste time on implementation. Your approach,
just by the description of it, is neither consistent, nor durable:

- if you allow active-active, you can have lost writes.

Here's a simple example:
    box.begin() local a = box.space.t.select{1}[2] box.space.t:replace{1, a+1} box.commit()

By running this transaction concurrently on two masters, you will
get lost writes. RAFT would not let that happen.

But let's imagine for a second that this is not an issue.
Your proposal is missing the critical parts of RAFT: neutralizing
old leaders and completing transactions upon leader failure - i.e.
when the new leader commits writes accepting by the majority and
rolls back the rest, on behalf of the deceased. 

Imagine one of your active replica fails midway: 
- it can fail after a record is written to wal by one of the peers
- it can fail after a record is written to wal by the majority
  of the peers, bu
- it can fail after a record is committed by one of the peers, but
  not all.

Who and how is going to repair these replicas upon master failure?
You just threw RAFT "longest log wins" principle into a garbage 
bin, so you would never be able to identify which specific
transactions need repair, on which replica, and what this repair
should do. Needless to say that you haven't stopped transaction
processing on these replicas, so even if you knew which specific
transactions needed completion and on which replica, the data they
modify could be easily overwritten by the time you get to finish these
transactions.

As to your suggestion to track commit/wal write vclock in tx
thread, well, this has fortunately nothing to do with correctness,
but has all to do with efficiency and performance. There was a
plan to move out the applier to iproto thread from tx, and you
even wrote a patch, which wasn't finished like many others,
because you never addressed the review comments. Now you chose to
go in the opposite direction by throwing more logic to the tx
thread, adding to the scalability bottleneck of the
single-threaded architecture. We discussed that before - but
somehow it slips from your mind each time.

Meanwhile, Vlad's review of your patch for in-memory WAL is not
addressed. You could complain that my reviews are too harsh
and asking too much, but this is Vlad's...

-- 
Konstantin Osipov, Moscow, Russia

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-15  1:57                 ` Konstantin Osipov
@ 2019-11-15  6:02                   ` Georgy Kirichenko
  2019-11-15 13:57                     ` Konstantin Osipov
  0 siblings, 1 reply; 21+ messages in thread
From: Georgy Kirichenko @ 2019-11-15  6:02 UTC (permalink / raw)
  To: Konstantin Osipov; +Cc: tarantool-patches

On Friday, November 15, 2019 4:57:45 AM MSK Konstantin Osipov wrote:
> * Georgy Kirichenko <georgy@tarantool.org> [19/11/15 04:33]:
> > On Thursday, November 14, 2019 10:48:06 PM MSK Konstantin Osipov wrote:
> > > * Georgy Kirichenko <georgy@tarantool.org> [19/11/14 22:42]:
> > > > A replica state is described by 2 vclocks - written and committed
> > > > ones.
> > > > Right now it is not an issue to report them both as an applier submits
> > > > transaction asynchronously. In addition to these two vclocks (yes, the
> > > > both could be transferred from the WAL thread) applier will report a
> > > > reject vclock - the vclock where applying breaks, and this could be
> > > > done
> > > > from TX. I do not like the idea to split transmission between 2
> > > > threads.
> > > > The write and reject vclocks are used to evaluate majority whereas
> > > > commit
> > > > vclock instructs a whole cluster that majority was already reached.
> > > > The
> > > > main point is that any replica member could commit a transaction -
> > > > this
> > > > relaxes RAFT limitations and increases the whole cluster durability
> > > > (and
> > > > it is simpler in design and implementation, really). Also the new
> > > > synchronous replication design has a lot of advantages in comparison
> > > > with
> > > > RAFT but let us discuss it in another thread. If you interested please
> > > > ask for details as I have not enough time to write public document
> > > > right
> > > > now.
> > > > Returning to the subject, I would like to conclude that wal on_commit
> > > > and
> > > > on_write triggers are good source to initiate status transmission. And
> > > > the
> > > > trigger implemented by Maria will be replaced by replica on_commit
> > > > which
> > > > allows us not to change anything at higher levels.
> > > 
> > > Congratulations, Georgy, maybe you even get a Turing award for
> > > inventing a new protocol.
> > > 
> > > Wait... they don't give a Turing award for "protocols" which have
> > > no proof and yield inconsistent results, or do they?
> > 
> > You do not even know details of the protocol but make such suggestion, so
> > I
> > could only repeat your last statement: "what a shame", seriously.
> > Please, remember all my attempts to discuss it with you or, for instance,
> > our one-per-2-week meetings which all (except the first one) were skipped
> > by you.
> If you want to discuss anything with me, feel free to reach out.
> 
> I am following the process as I believe it should work in a
> distributed open source project: before there is a change, there
> is a design document on which everyone can equally comment.
90% of tarantool core developers are sitting together in one room or just 
call-available during the day. Also, please, tell us how many RFC responds you 
saw from a somebody who is not a part of tarantool core team. So, you wish to 
force the whole team to use only this inconvenient and unproductive (because 
of long-latency responds) communication way because of your beliefs.
So I have the another look: each thing to be discussed should be discussed (or 
brainstormed) verbally (because we are the tarantol TEAM members) first and 
only then a well-designed RFC could be formed (or, maybe, you wish to have 
lots of worthless RFCs but I no see any point here).
> 
> > > Meanwhile, if you have a design in mind, you could send an RFC. I
> > > will respond to the RFC.
> > 
> > Anybody could see the design document after this protocol research will be
> > done. Yes, the research requires to be implemented first.
> 
> You don't need to waste time on implementation. Your approach,
> just by the description of it, is neither consistent, nor durable:
I think you just did not understand the approach because all of your further 
considerations are not related to protocol I am implementing. Also I think you 
have not got even base points why RAFT is not well applicable in case of 
tarantool.
> 
> - if you allow active-active, you can have lost writes.
> 
> Here's a simple example:
>     box.begin() local a = box.space.t.select{1}[2] box.space.t:replace{1,
> a+1} box.commit()
> 
> By running this transaction concurrently on two masters, you will
> get lost writes. RAFT would not let that happen.
> 
> But let's imagine for a second that this is not an issue.
> Your proposal is missing the critical parts of RAFT: neutralizing
> old leaders and completing transactions upon leader failure - i.e.
> when the new leader commits writes accepting by the majority and
> rolls back the rest, on behalf of the deceased.
> 
> Imagine one of your active replica fails midway:
> - it can fail after a record is written to wal by one of the peers
> - it can fail after a record is written to wal by the majority
>   of the peers, bu
> - it can fail after a record is committed by one of the peers, but
>   not all.
> 
> Who and how is going to repair these replicas upon master failure?
> You just threw RAFT "longest log wins" principle into a garbage
> bin, so you would never be able to identify which specific
> transactions need repair, on which replica, and what this repair
> should do. Needless to say that you haven't stopped transaction
> processing on these replicas, so even if you knew which specific
> transactions needed completion and on which replica, the data they
> modify could be easily overwritten by the time you get to finish these
> transactions.
> 
> As to your suggestion to track commit/wal write vclock in tx
> thread, well, this has fortunately nothing to do with correctness,
> but has all to do with efficiency and performance. There was a
> plan to move out the applier to iproto thread from tx, and you
> even wrote a patch, which wasn't finished like many others,
> because you never addressed the review comments.
Wrong, this task was not finished because of schema latch and yielding ddl 
while you rejected strict applier ordering. But after 2 years you changed you 
mind and I was able implement the parallel applier. The second issue why 
applier in iproto was not implemented is the limited capacity oftx fiber pool 
with you still want to preserve. So I wonder why did this slip from your mind.
> Now you chose to
> go in the opposite direction by throwing more logic to the tx
> thread, adding to the scalability bottleneck of the
> single-threaded architecture. We discussed that before - but
> somehow it slips from your mind each time.
The only thing which I wish to add to the tx cord is just around 10 trigger 
calls per batch and each ones just wakes an applier fiber up. And this costs 
nothing in terms of increased WAL latency because of synchronous replication. 
And now you still suggest to do the transaction way longer (iproto->tx->wal-
>tx->iproto), well well.
> 
> Meanwhile, Vlad's review of your patch for in-memory WAL is not
> addressed. You could complain that my reviews are too harsh
> and asking too much, but this is Vlad's...
I have no clue why you remembered my in-memory replication patch and Vlad.
This patch is on-hold because I wish to fix the by-design broken GC first.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-15  6:02                   ` Georgy Kirichenko
@ 2019-11-15 13:57                     ` Konstantin Osipov
  2019-11-15 19:57                       ` Georgy Kirichenko
  0 siblings, 1 reply; 21+ messages in thread
From: Konstantin Osipov @ 2019-11-15 13:57 UTC (permalink / raw)
  To: Georgy Kirichenko; +Cc: tarantool-patches

* Georgy Kirichenko <georgy@tarantool.org> [19/11/15 09:43]:
> 90% of tarantool core developers are sitting together in one room or just 
> call-available during the day. Also, please, tell us how many RFC responds you 
> saw from a somebody who is not a part of tarantool core team. So, you wish to 
> force the whole team to use only this inconvenient and unproductive (because 
> of long-latency responds) communication way because of your beliefs.
> So I have the another look: each thing to be discussed should be discussed (or 
> brainstormed) verbally (because we are the tarantol TEAM members) first and 
> only then a well-designed RFC could be formed (or, maybe, you wish to have 
> lots of worthless RFCs but I no see any point here).

I gave earlier in this thread concrete examples how active-active
won't work. It didn't take me long. You chose to respond back with
some vague claims and promises of magic.

If you have a miracle design, and you happen to not want to send
an RFC, you still can prove it by sending a patch. Last time it
didn't work: your refused to send an RFC for in-memory WAL - and
the patch can't pass the code review for over 3 months. 

All this suggests that the patch by Maria is simply not worth it. 
Whatever it is needed for may never happen - and even if it
happens, it is most likely the wrong thing to do.

-- 
Konstantin Osipov, Moscow, Russia

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-15 13:57                     ` Konstantin Osipov
@ 2019-11-15 19:57                       ` Georgy Kirichenko
  2019-11-16 10:37                         ` Konstantin Osipov
  2019-11-16 11:56                         ` Konstantin Osipov
  0 siblings, 2 replies; 21+ messages in thread
From: Georgy Kirichenko @ 2019-11-15 19:57 UTC (permalink / raw)
  To: Konstantin Osipov; +Cc: tarantool-patches

On Friday, November 15, 2019 4:57:04 PM MSK Konstantin Osipov wrote:
> * Georgy Kirichenko <georgy@tarantool.org> [19/11/15 09:43]:
> > 90% of tarantool core developers are sitting together in one room or just
> > call-available during the day. Also, please, tell us how many RFC responds
> > you saw from a somebody who is not a part of tarantool core team. So, you
> > wish to force the whole team to use only this inconvenient and
> > unproductive (because of long-latency responds) communication way because
> > of your beliefs. So I have the another look: each thing to be discussed
> > should be discussed (or brainstormed) verbally (because we are the
> > tarantol TEAM members) first and only then a well-designed RFC could be
> > formed (or, maybe, you wish to have lots of worthless RFCs but I no see
> > any point here).
> 
> I gave earlier in this thread concrete examples how active-active
You made a mistake again. My approach is not about 
active-active. I did not ever claim that my patch will allow active-active 
(because we do not still have a transaction manager). When I said that any 
instance is able to commit I mean that any replica, which sees a majority, 
able to finalize a transaction (commit it) even if the transaction initiator is 
dead. 
> won't work. It didn't take me long. You chose to respond back with
> some vague claims and promises of magic.
Please, point me out first how your claims related to my approach. Because you 
made no effort to understand the approach. Even did not ask for very brief 
explanation.
> 
> If you have a miracle design, and you happen to not want to send
> an RFC, you still can prove it by sending a patch.
The next wrong suggestion. I have a concrete design which was shared and 
discussed.
 > Last time it didn't work: your refused to send an RFC for in-memory WAL - 
and the patch can't pass the code review for over 3 months.
Please read my previous message and find out why this patchset is on hold.
To be concrete, the patch is not passed the review because of:
1. Bad gc design which I want to fix first, and I already answered why your 
approach to fix it is not even working. Yes, you could not / did not want to 
object.
2. Vlad's comment about comments and naming. Please tell me how a miracle RFC 
could fix this issue (Yes, I am not very accurate with comments and texts)
3. Vlad comment about dynamic array allocation which I want to respond in the 
next version. I would like to repeat, I do not want to sent it until the first 
point will not be fixed.
4. Vlad's comments about some mess in my code (xlog_buf_begin and friends). 
They are already fixed but not shared because of the first point. And there is 
no way how a RFC could prevent it.

> 
> All this suggests that the patch by Maria is simply not worth it.
All this suggest that you have no clue how the patch would work in the future, 
seriously.
> Whatever it is needed for may never happen - and even if it
> happens, it is most likely the wrong thing to do.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-15 19:57                       ` Georgy Kirichenko
@ 2019-11-16 10:37                         ` Konstantin Osipov
  2019-11-16 20:43                           ` Georgy Kirichenko
  2019-11-16 11:56                         ` Konstantin Osipov
  1 sibling, 1 reply; 21+ messages in thread
From: Konstantin Osipov @ 2019-11-16 10:37 UTC (permalink / raw)
  To: Georgy Kirichenko; +Cc: tarantool-patches

* Georgy Kirichenko <georgy@tarantool.org> [19/11/15 22:59]:
> > I gave earlier in this thread concrete examples how active-active
> You made a mistake again. My approach is not about 
> active-active. I did not ever claim that my patch will allow active-active 
> (because we do not still have a transaction manager). When I said that any 
> instance is able to commit I mean that any replica, which sees a majority, 
> able to finalize a transaction (commit it) even if the transaction initiator is 
> dead. 

Fine, that might work - given the vector clock this how it will
naturally work in most cases. Too bad you kept it a secret until now.

> > won't work. It didn't take me long. You chose to respond back with
> > some vague claims and promises of magic.
> Please, point me out first how your claims related to my approach. Because you 
> made no effort to understand the approach. Even did not ask for very brief 
> explanation.
> > 
> > If you have a miracle design, and you happen to not want to send
> > an RFC, you still can prove it by sending a patch.
> The next wrong suggestion. I have a concrete design which was shared and 
> discussed.

Ehm, where's the link? Or tarantool is now closed source? Sharing
it publicly would have allowed me collaborate  - which you seems to
intentionally want to avoid.

>  > Last time it didn't work: your refused to send an RFC for in-memory WAL - 
> and the patch can't pass the code review for over 3 months.
> Please read my previous message and find out why this patchset is on hold.
> To be concrete, the patch is not passed the review because of:
> 1. Bad gc design which I want to fix first, and I already answered why your 
> approach to fix it is not even working. Yes, you could not / did not want to 
> object.
> 2. Vlad's comment about comments and naming. Please tell me how a miracle RFC 
> could fix this issue (Yes, I am not very accurate with comments and texts)
> 3. Vlad comment about dynamic array allocation which I want to respond in the 
> next version. I would like to repeat, I do not want to sent it until the first 
> point will not be fixed.
> 4. Vlad's comments about some mess in my code (xlog_buf_begin and friends). 
> They are already fixed but not shared because of the first point. And there is 
> no way how a RFC could prevent it.
> 
> > 
> > All this suggests that the patch by Maria is simply not worth it.
> All this suggest that you have no clue how the patch would work in the future, 
> seriously.

Well, this is not because of any lack of intent on my part. Going
back to the patch, it doesn't look good so far. If you wanted to
change my opinion, the best course of action is to use technical
arguments (and RFC is the best way for it), not some ungrounded 
claims about better processes or how to best contribute to an open
source project.

> > Whatever it is needed for may never happen - and even if it
> > happens, it is most likely the wrong thing to do.

-- 
Konstantin Osipov, Moscow, Russia

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-15 19:57                       ` Georgy Kirichenko
  2019-11-16 10:37                         ` Konstantin Osipov
@ 2019-11-16 11:56                         ` Konstantin Osipov
  2019-11-16 20:34                           ` Georgy Kirichenko
  1 sibling, 1 reply; 21+ messages in thread
From: Konstantin Osipov @ 2019-11-16 11:56 UTC (permalink / raw)
  To: Georgy Kirichenko; +Cc: tarantool-patches

* Georgy Kirichenko <georgy@tarantool.org> [19/11/15 22:59]:
> Please, point me out first how your claims related to my approach. Because you 
> made no effort to understand the approach. Even did not ask for very brief 
> explanation.

Alright, you claim I am not asking for a brief explanation. I am
asking for an RFC, which is the best way to explain ideas we have.

But fine, I am asking you:
> > 
> > If you have a miracle design, and you happen to not want to send
> > an RFC, you still can prove it by sending a patch.
> The next wrong suggestion. I have a concrete design which was shared and 
> discussed.
>  > Last time it didn't work: your refused to send an RFC for in-memory WAL - 
> and the patch can't pass the code review for over 3 months.
> Please read my previous message and find out why this patchset is on hold.
> To be concrete, the patch is not passed the review because of:
> 1. Bad gc design which I want to fix first, and I already answered why your 
> approach to fix it is not even working. Yes, you could not / did not want to 
> object.

What is wrong with GC and how exactly do you want to "fix" it?

-- 
Konstantin Osipov, Moscow, Russia

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-16 11:56                         ` Konstantin Osipov
@ 2019-11-16 20:34                           ` Georgy Kirichenko
  2019-11-18  9:31                             ` Konstantin Osipov
  0 siblings, 1 reply; 21+ messages in thread
From: Georgy Kirichenko @ 2019-11-16 20:34 UTC (permalink / raw)
  To: Konstantin Osipov; +Cc: tarantool-patches

On Saturday, November 16, 2019 2:56:07 PM MSK Konstantin Osipov wrote:
> * Georgy Kirichenko <georgy@tarantool.org> [19/11/15 22:59]:
> > Please, point me out first how your claims related to my approach. Because
> > you made no effort to understand the approach. Even did not ask for very
> > brief explanation.
> 
> Alright, you claim I am not asking for a brief explanation. I am
> asking for an RFC, which is the best way to explain ideas we have.
Yes, of course, the RFC is the best way to explain my ideas, but today I have 
only a proof of concepts which requires some further investigation to 
understand how can I implement it (as I pretty sure the RFC should contain an 
implementation plan). Just after еhe raw patch would be working I definitely 
will write a RFC and any other documentation. The only thing I asked you to 
discuss this proof of concept with you as the research is not done yet.
> 
> But fine, I am asking you:
> > > If you have a miracle design, and you happen to not want to send
> > > an RFC, you still can prove it by sending a patch.
> > 
> > The next wrong suggestion. I have a concrete design which was shared and
> > discussed.
> > 
> >  > Last time it didn't work: your refused to send an RFC for in-memory WAL
> >  > -
> > 
> > and the patch can't pass the code review for over 3 months.
> > Please read my previous message and find out why this patchset is on hold.
> > To be concrete, the patch is not passed the review because of:
> > 1. Bad gc design which I want to fix first, and I already answered why
> > your
> > approach to fix it is not even working. Yes, you could not / did not want
> > to object.
> 
> What is wrong with GC and how exactly do you want to "fix" it?
We have discussed some points with you verbally (about 3-4 month ago).
The main point is: the way of information processing is weird:
1. WAL has the full information about the wal directory (xlogs and their 
boundaries)
2. WAL process the wal directory cleanup
3. We filter out all this information while relaying (as a relay has only a 
stream of rows)
4. We try to restore some of this information using on_close_log recovery 
trigger.
5. We send recovered boundaries to TX and tx tread reconstruct the relay order 
loosing really relay vclocks (as they mapped to the local xlog history)
6. TX sends the oldest vclock back to wal
7. There is some issues with making a consumer inactive. For instance if we 
deactivated a consumer could survive, for instance if deleted xlog was already 
send by an applier but not reported yet (I do not even know how it could be 
fixed in the current design).
Maybe it is working, but I afraid, this was done without any thinking about 
the future development (I mean the synchronous replication). Let me explain 
why.
1. WAL should receive all relay states as soon as possible.
2. The set of relay vclocks is enough to perform garbage collection (as we 
could form a vclock with is the lower bound of the set)
So I wish the garbage collection would be implemented using direct relay to 
wal reporting. In this circumstances I was in need to implement a structure (I 
named it as matrix clock - mclcok) which able to contain relay vclocks and 
evaluate a vclock which is lower bound of n-members of the mclcock.
The mclock could be used to get the n-majority vclock as wel as the lowest 
boundary of all vclock alive.
The mclock is already implemented as well as new gc design (wal knows about 
all relay vclock and the first vclock locked by TX - checkpoint or join read 
view).

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-16 10:37                         ` Konstantin Osipov
@ 2019-11-16 20:43                           ` Georgy Kirichenko
  0 siblings, 0 replies; 21+ messages in thread
From: Georgy Kirichenko @ 2019-11-16 20:43 UTC (permalink / raw)
  To: Konstantin Osipov; +Cc: tarantool-patches

On Saturday, November 16, 2019 1:37:55 PM MSK Konstantin Osipov wrote:
> * Georgy Kirichenko <georgy@tarantool.org> [19/11/15 22:59]:
> > > I gave earlier in this thread concrete examples how active-active
> > 
> > You made a mistake again. My approach is not about
> > active-active. I did not ever claim that my patch will allow active-active
> > (because we do not still have a transaction manager). When I said that any
> > instance is able to commit I mean that any replica, which sees a majority,
> > able to finalize a transaction (commit it) even if the transaction
> > initiator is dead.
> 
> Fine, that might work - given the vector clock this how it will
> naturally work in most cases. Too bad you kept it a secret until now.
> 
> > > won't work. It didn't take me long. You chose to respond back with
> > > some vague claims and promises of magic.
> > 
> > Please, point me out first how your claims related to my approach. Because
> > you made no effort to understand the approach. Even did not ask for very
> > brief explanation.
> > 
> > > If you have a miracle design, and you happen to not want to send
> > > an RFC, you still can prove it by sending a patch.
> > 
> > The next wrong suggestion. I have a concrete design which was shared and
> > discussed.
> 
> Ehm, where's the link? Or tarantool is now closed source? Sharing
> it publicly would have allowed me collaborate  - which you seems to
> intentionally want to avoid.
There is no link because this is just a research which would be shared when it 
have some proofs. I definitely do not want to pollute the public mail list. And 
having many-turns responded RFCs is not good for communication as it erodes 
the initial topic.
> 
> >  > Last time it didn't work: your refused to send an RFC for in-memory WAL
> >  > -
> > 
> > and the patch can't pass the code review for over 3 months.
> > Please read my previous message and find out why this patchset is on hold.
> > To be concrete, the patch is not passed the review because of:
> > 1. Bad gc design which I want to fix first, and I already answered why
> > your
> > approach to fix it is not even working. Yes, you could not / did not want
> > to object.
> > 2. Vlad's comment about comments and naming. Please tell me how a miracle
> > RFC could fix this issue (Yes, I am not very accurate with comments and
> > texts) 3. Vlad comment about dynamic array allocation which I want to
> > respond in the next version. I would like to repeat, I do not want to
> > sent it until the first point will not be fixed.
> > 4. Vlad's comments about some mess in my code (xlog_buf_begin and
> > friends).
> > They are already fixed but not shared because of the first point. And
> > there is no way how a RFC could prevent it.
> > 
> > > All this suggests that the patch by Maria is simply not worth it.
> > 
> > All this suggest that you have no clue how the patch would work in the
> > future, seriously.
> 
> Well, this is not because of any lack of intent on my part. Going
> back to the patch, it doesn't look good so far.
Sorry, but you have  written: "This is well designed in my view" which I 
accepted with proud. The only you complain was about the garbage collection. 
And you did not respond anything to my last patch.
> If you wanted to
> change my opinion, the best course of action is to use technical
> arguments (and RFC is the best way for it), not some ungrounded
> claims about better processes or how to best contribute to an open
> source project.
> 
> > > Whatever it is needed for may never happen - and even if it
> > > happens, it is most likely the wrong thing to do.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-16 20:34                           ` Georgy Kirichenko
@ 2019-11-18  9:31                             ` Konstantin Osipov
  2020-06-02 12:22                               ` Maria Khaydich
  0 siblings, 1 reply; 21+ messages in thread
From: Konstantin Osipov @ 2019-11-18  9:31 UTC (permalink / raw)
  To: Georgy Kirichenko; +Cc: tarantool-patches

* Georgy Kirichenko <georgy@tarantool.org> [19/11/16 23:37]:

> > What is wrong with GC and how exactly do you want to "fix" it?
> We have discussed some points with you verbally (about 3-4 month
> ago).  The main point is: the way of information processing is
> weird:

> 1. WAL has the full information about the wal directory (xlogs
>    and their boundaries)

This is not strictly necessary. It saves us one xdir_scan() in
xdir_collect_garbage(), this is perhaps the main historical
reason it's there.

We even have to make an effort to maintain this state in WAL:
- we call xdir_scan() in wal_enable()
- we call xdir_add_vclock() whenever we open/close the next xlog.

The second reason it was done in WAL was to not block tx
thread, but later we had latency spikes in WAL thread as well, so
we added XDIR_GC_ASYNC to fix these, and this second reason is a
non-reason any more.

Finally, the third reason WAL does it is wal_fallocate() function,
which removes files if we're out of space. Instead of going back
to GC subsystem and asking it to remove a file, the implementation
went the short route and removes the file directly in WAL
susbystem and notifies GC as a matter of fact.

As you can see, all these reasons are accidental. Technically any
subsystem (WAL, GC) can remove these files if we add xdir_scan()
to xdir_collect_garbage().

GC subsystem is responsible for all the old files, so it should be
dealing with them.

The fix is to add xdir_scan() to xdir_collect_garbage(), and
change wal_fallocate() to send a message to GC asking it to remove
some data, rather than kick the chair out of GC butt by calling
xdir_collect_garbage(XDIR_GC_REMOVE_ONE). One issue with fixing it
this way, is what would you do in wal_fallocate() after you send
the message? You will have to have wal_fallocate_soft(), which
sends the message asynchronously, to not stall WAL, and
wal_fallocate_hard(), which would stall WAL until there is
response from TX about extra space. A lot more work.

Even though WAL contains some of GC state, it's neither an owner
of it nor a consumer: it is only a producer of GC state, and
it updates GC state by sending notifications about the files that
it creates and closes. The consumers are engines, checkpoints,
backups, relays.

BTW, I don't think in-memory replication is a new consumer of GC state
- it doesn't act like a standard consumer:

 * a usual consumer may need multiple xlog files, because it can
   be at a position way behind the current xlog; in-memory
   replication is almost always pointing to the current xlog, 
   there may be rare cases when it depends on the previous xlogs
   when xlog size is small or there was a recent rotation.

 * in case of standard consumers, each consumer is at its own
   position, while for in-memory replication, all relays are more
   or less on the same position - at least it doesn't make any
   logical sense to advance each relay's position independently

I remember having suggested that, and I don't remember why using a
single consumer for all in-memory relays did not work out for you. 
The idea is that whenever a relay switches to the memory mode it
unsubscribes from GC, and whenever it is back to file mode, it is
subscribes to GC again. In order to avoid any races, in-memory-WAL
as a consumer keeps a reference to a few WALs.

The alternative is to move GC subsystem entirely to WAL. This
could perhaps also work and even be cleaner than centralizing GC
in TX. Either way I don't see it as a blocker for in-memory WAL -
I think in-memory WAL can work with GC being either in WAL or in
TX, it's just the messages that threads exchange become a bit more
complicated. 

> 2. WAL process the wal directory cleanup

As I wrote above, there are two reasons for this, both historical:
- we wanted to avoid TX stalls
- we have wal_fallocate(), a feature which was implemented
  "lazily" so it just removes the files under GCs feet and
  notifies GC after the fact.

GC, logically, controls the WAL dir, and WAL is only a producer of
WAL files.

> 3. We filter out all this information while relaying (as a relay
>    has only a stream of rows)

GC subscription is not interested in the stream of rows.
It is interested in a stream files. A file is represented in GC as a
vclock, and a row is identified by a vclock, but it doesn't mean
it's the same thing. 

This is why I really dislike your idea of calling gc_advance on
row events. 

> 4. We try to restore some of this information using on_close_log
>    recovery trigger.

No, it's not "restore" the information. It's pass the right event
about the consumer - the file event - to the GC.

> 5. We send recovered boundaries to TX and tx tread reconstruct
>    the relay order loosing really relay vclocks (as they mapped
>    to the local xlog history)

I don't quite get what you mean here? Could you elaborate?
I think there is no "reconstruction". There are two types of
events: the events updating replicaset_vclock, are needed for
replication monitoring, and they happen often. The action upon 
this event is very cheap - you simply
vclock_advance(replicaset_vclock).

The second type of event is when relay or backup or engine stops
using an xlog file. It is also represented by a vclock but it is
not as cheap to process as the first kind, because gc_advance() is
not cheap, it's rbtree search.

You keep trying to merge the two streams into a single stream, I
keep asking to keep the two streams separate. There is of course
the standard pluses and minuses of using a centralized "event bus"
for all these events - with a single bus, as you suggest, things
become simpler for the producer, but the consumers have to do more
work to filter out the unnecessary events. 

> 6. TX sends the oldest vclock back to wal

> 7. There is some issues with making a consumer inactive. For
>    instance if we deactivated a consumer could survive, for
>    instance if deleted xlog was already send by an applier but
>    not reported yet (I do not even know how it could be fixed in
>    the current design).

I don't want to argue whether it's weird or not, it's subjective. 
I agree GC state is distributed now, and it's better if it is
centralized.

This could be achieved by either moving WAL xdir state to tx, 
and making sure tx is controlling it, or by moving entire GC 
to WAL. Moving GC state to WAL seems a cleaner approach, but I'm
fine either way.

> Maybe it is working, but I afraid, this was done without any thinking about 
> the future development (I mean the synchronous replication). Let me explain 
> why.
> 1. WAL should receive all relay states as soon as possible.

Agree, but it is a different stream of events - it's sync
replication events. File events are routed to GC subsystem, sync
replication events are routed to RAFT subsystem in WAL. 

> 2. The set of relay vclocks is enough to perform garbage
>    collection (as we could form a vclock with is the lower bound
>    of the set)

This is thanks to the fact that each file is unequivocally defined
by its vclock boundaries, which is accidental.

> So I wish the garbage collection would be implemented using direct relay to 
> wal reporting. In this circumstances I was in need to implement a structure (I 
> named it as matrix clock - mclcok) which able to contain relay vclocks and 
> evaluate a vclock which is lower bound of n-members of the mclcock.
> The mclock could be used to get the n-majority vclock as wel as the lowest 
> boundary of all vclock alive.
> The mclock is already implemented as well as new gc design (wal knows about 
> all relay vclock and the first vclock locked by TX - checkpoint or join read 
> view).

The idea of vclock matrix is totally fine & dandy for bsync. Using it for
GC seems like a huge overkill.

As to the fact that you have more patches on branches, I think
it's better to finish in-memory-replication first - it's a huge
performance boost for replicated set-ups, and reduces the latency,
too.

-- 
Konstantin Osipov, Moscow, Russia
https://scylladb.com

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2019-11-18  9:31                             ` Konstantin Osipov
@ 2020-06-02 12:22                               ` Maria Khaydich
  2020-06-03 10:12                                 ` Sergey Ostanevich
  0 siblings, 1 reply; 21+ messages in thread
From: Maria Khaydich @ 2020-06-02 12:22 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Alexander Turenko

[-- Attachment #1: Type: text/plain, Size: 9866 bytes --]


Using trigger on vclock change to determine the state would be cpu consuming, so I’m currently remaking previous patch so that we could yield from fiber and wait for a specific lsn from a specific replica. A possible use-case: committing a transaction and waiting for it to apply on all replicas. The way I am going to implement it is pretty much how Kostja suggested:  «..wait_lsn() could add the server_id, lsn that is  being waited for to a sorted list, and whenever we update  replicaset vclock for this lsn we also look at top of the list, if  it is not empty, and if the current lsn is greater than the top,  we could pop the value from the list and send a notification to  the waiter».
 
Anyway, there are still some questions to discuss. 1. Do we need wait_lsn_any() method mentioned here  https://github.com/tarantool/tarantool/issues/3808  ? I don’t see how this one can be useful. 2. What should be done in case of fail (reaching the timeout)? Simply returning an error seems like the best choice to me, so that user can later decide to do as he pleases with this information. 
Another issue is that during the last discussion in the mailing list it was mentioned that we wouldn’t need this feature altogether if we had synchronous replication. Any thoughts on this matter?
> 
>>
>>Понедельник, 18 ноября 2019, 12:31 +03:00 от Konstantin Osipov <kostja.osipov@gmail.com>:
>> 
>>* Georgy Kirichenko < georgy@tarantool.org > [19/11/16 23:37]:
>>
>>> > What is wrong with GC and how exactly do you want to "fix" it?
>>> We have discussed some points with you verbally (about 3-4 month
>>> ago). The main point is: the way of information processing is
>>> weird:
>>
>>> 1. WAL has the full information about the wal directory (xlogs
>>> and their boundaries)
>>
>>This is not strictly necessary. It saves us one xdir_scan() in
>>xdir_collect_garbage(), this is perhaps the main historical
>>reason it's there.
>>
>>We even have to make an effort to maintain this state in WAL:
>>- we call xdir_scan() in wal_enable()
>>- we call xdir_add_vclock() whenever we open/close the next xlog.
>>
>>The second reason it was done in WAL was to not block tx
>>thread, but later we had latency spikes in WAL thread as well, so
>>we added XDIR_GC_ASYNC to fix these, and this second reason is a
>>non-reason any more.
>>
>>Finally, the third reason WAL does it is wal_fallocate() function,
>>which removes files if we're out of space. Instead of going back
>>to GC subsystem and asking it to remove a file, the implementation
>>went the short route and removes the file directly in WAL
>>susbystem and notifies GC as a matter of fact.
>>
>>As you can see, all these reasons are accidental. Technically any
>>subsystem (WAL, GC) can remove these files if we add xdir_scan()
>>to xdir_collect_garbage().
>>
>>GC subsystem is responsible for all the old files, so it should be
>>dealing with them.
>>
>>The fix is to add xdir_scan() to xdir_collect_garbage(), and
>>change wal_fallocate() to send a message to GC asking it to remove
>>some data, rather than kick the chair out of GC butt by calling
>>xdir_collect_garbage(XDIR_GC_REMOVE_ONE). One issue with fixing it
>>this way, is what would you do in wal_fallocate() after you send
>>the message? You will have to have wal_fallocate_soft(), which
>>sends the message asynchronously, to not stall WAL, and
>>wal_fallocate_hard(), which would stall WAL until there is
>>response from TX about extra space. A lot more work.
>>
>>Even though WAL contains some of GC state, it's neither an owner
>>of it nor a consumer: it is only a producer of GC state, and
>>it updates GC state by sending notifications about the files that
>>it creates and closes. The consumers are engines, checkpoints,
>>backups, relays.
>>
>>BTW, I don't think in-memory replication is a new consumer of GC state
>>- it doesn't act like a standard consumer:
>> 
>> * a usual consumer may need multiple xlog files, because it can
>>   be at a position way behind the current xlog; in-memory
>>   replication is almost always pointing to the current xlog,
>>   there may be rare cases when it depends on the previous xlogs
>>   when xlog size is small or there was a recent rotation.
>>
>> * in case of standard consumers, each consumer is at its own
>>   position, while for in-memory replication, all relays are more
>>   or less on the same position - at least it doesn't make any
>>   logical sense to advance each relay's position independently
>>
>>I remember having suggested that, and I don't remember why using a
>>single consumer for all in-memory relays did not work out for you.
>>The idea is that whenever a relay switches to the memory mode it
>>unsubscribes from GC, and whenever it is back to file mode, it is
>>subscribes to GC again. In order to avoid any races, in-memory-WAL
>>as a consumer keeps a reference to a few WALs.
>>
>>The alternative is to move GC subsystem entirely to WAL. This
>>could perhaps also work and even be cleaner than centralizing GC
>>in TX. Either way I don't see it as a blocker for in-memory WAL -
>>I think in-memory WAL can work with GC being either in WAL or in
>>TX, it's just the messages that threads exchange become a bit more
>>complicated.
>>
>>> 2. WAL process the wal directory cleanup
>>
>>As I wrote above, there are two reasons for this, both historical:
>>- we wanted to avoid TX stalls
>>- we have wal_fallocate(), a feature which was implemented
>>  "lazily" so it just removes the files under GCs feet and
>>  notifies GC after the fact.
>>
>>GC, logically, controls the WAL dir, and WAL is only a producer of
>>WAL files.
>>  
>>> 3. We filter out all this information while relaying (as a relay
>>> has only a stream of rows)
>>
>>GC subscription is not interested in the stream of rows.
>>It is interested in a stream files. A file is represented in GC as a
>>vclock, and a row is identified by a vclock, but it doesn't mean
>>it's the same thing.
>>
>>This is why I really dislike your idea of calling gc_advance on
>>row events.
>>
>>> 4. We try to restore some of this information using on_close_log
>>> recovery trigger.
>>
>>No, it's not "restore" the information. It's pass the right event
>>about the consumer - the file event - to the GC.
>>
>>> 5. We send recovered boundaries to TX and tx tread reconstruct
>>> the relay order loosing really relay vclocks (as they mapped
>>> to the local xlog history)
>>
>>I don't quite get what you mean here? Could you elaborate?
>>I think there is no "reconstruction". There are two types of
>>events: the events updating replicaset_vclock, are needed for
>>replication monitoring, and they happen often. The action upon
>>this event is very cheap - you simply
>>vclock_advance(replicaset_vclock).
>>
>>The second type of event is when relay or backup or engine stops
>>using an xlog file. It is also represented by a vclock but it is
>>not as cheap to process as the first kind, because gc_advance() is
>>not cheap, it's rbtree search.
>>
>>You keep trying to merge the two streams into a single stream, I
>>keep asking to keep the two streams separate. There is of course
>>the standard pluses and minuses of using a centralized "event bus"
>>for all these events - with a single bus, as you suggest, things
>>become simpler for the producer, but the consumers have to do more
>>work to filter out the unnecessary events.
>>
>>> 6. TX sends the oldest vclock back to wal
>>
>>
>>
>>> 7. There is some issues with making a consumer inactive. For
>>> instance if we deactivated a consumer could survive, for
>>> instance if deleted xlog was already send by an applier but
>>> not reported yet (I do not even know how it could be fixed in
>>> the current design).
>>
>>I don't want to argue whether it's weird or not, it's subjective.
>>I agree GC state is distributed now, and it's better if it is
>>centralized.
>>
>>This could be achieved by either moving WAL xdir state to tx,
>>and making sure tx is controlling it, or by moving entire GC
>>to WAL. Moving GC state to WAL seems a cleaner approach, but I'm
>>fine either way.
>>
>>> Maybe it is working, but I afraid, this was done without any thinking about
>>> the future development (I mean the synchronous replication). Let me explain
>>> why.
>>> 1. WAL should receive all relay states as soon as possible.
>>
>>Agree, but it is a different stream of events - it's sync
>>replication events. File events are routed to GC subsystem, sync
>>replication events are routed to RAFT subsystem in WAL.
>>
>>> 2. The set of relay vclocks is enough to perform garbage
>>> collection (as we could form a vclock with is the lower bound
>>> of the set)
>>
>>This is thanks to the fact that each file is unequivocally defined
>>by its vclock boundaries, which is accidental.
>>
>>> So I wish the garbage collection would be implemented using direct relay to
>>> wal reporting. In this circumstances I was in need to implement a structure (I
>>> named it as matrix clock - mclcok) which able to contain relay vclocks and
>>> evaluate a vclock which is lower bound of n-members of the mclcock.
>>> The mclock could be used to get the n-majority vclock as wel as the lowest
>>> boundary of all vclock alive.
>>> The mclock is already implemented as well as new gc design (wal knows about
>>> all relay vclock and the first vclock locked by TX - checkpoint or join read
>>> view).
>>
>>The idea of vclock matrix is totally fine & dandy for bsync. Using it for
>>GC seems like a huge overkill.
>>
>>As to the fact that you have more patches on branches, I think
>>it's better to finish in-memory-replication first - it's a huge
>>performance boost for replicated set-ups, and reduces the latency,
>>too.
>>
>>--
>>Konstantin Osipov, Moscow, Russia
>>https://scylladb.com 
> 
> 
>--
>Maria Khaydich
> 

[-- Attachment #2: Type: text/html, Size: 14395 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2020-06-02 12:22                               ` Maria Khaydich
@ 2020-06-03 10:12                                 ` Sergey Ostanevich
  2020-06-03 12:08                                   ` Alexander Turenko
  0 siblings, 1 reply; 21+ messages in thread
From: Sergey Ostanevich @ 2020-06-03 10:12 UTC (permalink / raw)
  To: Maria Khaydich; +Cc: tarantool-patches, Alexander Turenko

Hi!

Thank you for bringing this up!

It was discussed back in 2019
https://lists.tarantool.org/pipermail/tarantool-patches/2019-November/012402.html
and before Kostja and Georgy got personal in their discussion,
they [kind of] agreed that the single-leader sync replication will
implement the required functionality.

As you know, we have first version of synchro planned this quarter
and I believe this tracker will be resolved as part of it - so no need
to implement it for the second time.

Please, link it to the qsync tracker #4842 so we update/close it as
soon as we're done.

Regards,
Sergos


On 02 июн 15:22, Maria Khaydich wrote:
> 
> Using trigger on vclock change to determine the state would be cpu consuming, so I’m currently remaking previous patch so that we could yield from fiber and wait for a specific lsn from a specific replica. A possible use-case: committing a transaction and waiting for it to apply on all replicas. The way I am going to implement it is pretty much how Kostja suggested:  «..wait_lsn() could add the server_id, lsn that is  being waited for to a sorted list, and whenever we update  replicaset vclock for this lsn we also look at top of the list, if  it is not empty, and if the current lsn is greater than the top,  we could pop the value from the list and send a notification to  the waiter».
>  
> Anyway, there are still some questions to discuss. 1. Do we need wait_lsn_any() method mentioned here  https://github.com/tarantool/tarantool/issues/3808  ? I don’t see how this one can be useful. 2. What should be done in case of fail (reaching the timeout)? Simply returning an error seems like the best choice to me, so that user can later decide to do as he pleases with this information. 
> Another issue is that during the last discussion in the mailing list it was mentioned that we wouldn’t need this feature altogether if we had synchronous replication. Any thoughts on this matter?
> > 
> >>
> >>Понедельник, 18 ноября 2019, 12:31 +03:00 от Konstantin Osipov <kostja.osipov@gmail.com>:
> >> 
> >>* Georgy Kirichenko < georgy@tarantool.org > [19/11/16 23:37]:
> >>
> >>> > What is wrong with GC and how exactly do you want to "fix" it?
> >>> We have discussed some points with you verbally (about 3-4 month
> >>> ago). The main point is: the way of information processing is
> >>> weird:
> >>
> >>> 1. WAL has the full information about the wal directory (xlogs
> >>> and their boundaries)
> >>
> >>This is not strictly necessary. It saves us one xdir_scan() in
> >>xdir_collect_garbage(), this is perhaps the main historical
> >>reason it's there.
> >>
> >>We even have to make an effort to maintain this state in WAL:
> >>- we call xdir_scan() in wal_enable()
> >>- we call xdir_add_vclock() whenever we open/close the next xlog.
> >>
> >>The second reason it was done in WAL was to not block tx
> >>thread, but later we had latency spikes in WAL thread as well, so
> >>we added XDIR_GC_ASYNC to fix these, and this second reason is a
> >>non-reason any more.
> >>
> >>Finally, the third reason WAL does it is wal_fallocate() function,
> >>which removes files if we're out of space. Instead of going back
> >>to GC subsystem and asking it to remove a file, the implementation
> >>went the short route and removes the file directly in WAL
> >>susbystem and notifies GC as a matter of fact.
> >>
> >>As you can see, all these reasons are accidental. Technically any
> >>subsystem (WAL, GC) can remove these files if we add xdir_scan()
> >>to xdir_collect_garbage().
> >>
> >>GC subsystem is responsible for all the old files, so it should be
> >>dealing with them.
> >>
> >>The fix is to add xdir_scan() to xdir_collect_garbage(), and
> >>change wal_fallocate() to send a message to GC asking it to remove
> >>some data, rather than kick the chair out of GC butt by calling
> >>xdir_collect_garbage(XDIR_GC_REMOVE_ONE). One issue with fixing it
> >>this way, is what would you do in wal_fallocate() after you send
> >>the message? You will have to have wal_fallocate_soft(), which
> >>sends the message asynchronously, to not stall WAL, and
> >>wal_fallocate_hard(), which would stall WAL until there is
> >>response from TX about extra space. A lot more work.
> >>
> >>Even though WAL contains some of GC state, it's neither an owner
> >>of it nor a consumer: it is only a producer of GC state, and
> >>it updates GC state by sending notifications about the files that
> >>it creates and closes. The consumers are engines, checkpoints,
> >>backups, relays.
> >>
> >>BTW, I don't think in-memory replication is a new consumer of GC state
> >>- it doesn't act like a standard consumer:
> >> 
> >> * a usual consumer may need multiple xlog files, because it can
> >>   be at a position way behind the current xlog; in-memory
> >>   replication is almost always pointing to the current xlog,
> >>   there may be rare cases when it depends on the previous xlogs
> >>   when xlog size is small or there was a recent rotation.
> >>
> >> * in case of standard consumers, each consumer is at its own
> >>   position, while for in-memory replication, all relays are more
> >>   or less on the same position - at least it doesn't make any
> >>   logical sense to advance each relay's position independently
> >>
> >>I remember having suggested that, and I don't remember why using a
> >>single consumer for all in-memory relays did not work out for you.
> >>The idea is that whenever a relay switches to the memory mode it
> >>unsubscribes from GC, and whenever it is back to file mode, it is
> >>subscribes to GC again. In order to avoid any races, in-memory-WAL
> >>as a consumer keeps a reference to a few WALs.
> >>
> >>The alternative is to move GC subsystem entirely to WAL. This
> >>could perhaps also work and even be cleaner than centralizing GC
> >>in TX. Either way I don't see it as a blocker for in-memory WAL -
> >>I think in-memory WAL can work with GC being either in WAL or in
> >>TX, it's just the messages that threads exchange become a bit more
> >>complicated.
> >>
> >>> 2. WAL process the wal directory cleanup
> >>
> >>As I wrote above, there are two reasons for this, both historical:
> >>- we wanted to avoid TX stalls
> >>- we have wal_fallocate(), a feature which was implemented
> >>  "lazily" so it just removes the files under GCs feet and
> >>  notifies GC after the fact.
> >>
> >>GC, logically, controls the WAL dir, and WAL is only a producer of
> >>WAL files.
> >>  
> >>> 3. We filter out all this information while relaying (as a relay
> >>> has only a stream of rows)
> >>
> >>GC subscription is not interested in the stream of rows.
> >>It is interested in a stream files. A file is represented in GC as a
> >>vclock, and a row is identified by a vclock, but it doesn't mean
> >>it's the same thing.
> >>
> >>This is why I really dislike your idea of calling gc_advance on
> >>row events.
> >>
> >>> 4. We try to restore some of this information using on_close_log
> >>> recovery trigger.
> >>
> >>No, it's not "restore" the information. It's pass the right event
> >>about the consumer - the file event - to the GC.
> >>
> >>> 5. We send recovered boundaries to TX and tx tread reconstruct
> >>> the relay order loosing really relay vclocks (as they mapped
> >>> to the local xlog history)
> >>
> >>I don't quite get what you mean here? Could you elaborate?
> >>I think there is no "reconstruction". There are two types of
> >>events: the events updating replicaset_vclock, are needed for
> >>replication monitoring, and they happen often. The action upon
> >>this event is very cheap - you simply
> >>vclock_advance(replicaset_vclock).
> >>
> >>The second type of event is when relay or backup or engine stops
> >>using an xlog file. It is also represented by a vclock but it is
> >>not as cheap to process as the first kind, because gc_advance() is
> >>not cheap, it's rbtree search.
> >>
> >>You keep trying to merge the two streams into a single stream, I
> >>keep asking to keep the two streams separate. There is of course
> >>the standard pluses and minuses of using a centralized "event bus"
> >>for all these events - with a single bus, as you suggest, things
> >>become simpler for the producer, but the consumers have to do more
> >>work to filter out the unnecessary events.
> >>
> >>> 6. TX sends the oldest vclock back to wal
> >>
> >>
> >>
> >>> 7. There is some issues with making a consumer inactive. For
> >>> instance if we deactivated a consumer could survive, for
> >>> instance if deleted xlog was already send by an applier but
> >>> not reported yet (I do not even know how it could be fixed in
> >>> the current design).
> >>
> >>I don't want to argue whether it's weird or not, it's subjective.
> >>I agree GC state is distributed now, and it's better if it is
> >>centralized.
> >>
> >>This could be achieved by either moving WAL xdir state to tx,
> >>and making sure tx is controlling it, or by moving entire GC
> >>to WAL. Moving GC state to WAL seems a cleaner approach, but I'm
> >>fine either way.
> >>
> >>> Maybe it is working, but I afraid, this was done without any thinking about
> >>> the future development (I mean the synchronous replication). Let me explain
> >>> why.
> >>> 1. WAL should receive all relay states as soon as possible.
> >>
> >>Agree, but it is a different stream of events - it's sync
> >>replication events. File events are routed to GC subsystem, sync
> >>replication events are routed to RAFT subsystem in WAL.
> >>
> >>> 2. The set of relay vclocks is enough to perform garbage
> >>> collection (as we could form a vclock with is the lower bound
> >>> of the set)
> >>
> >>This is thanks to the fact that each file is unequivocally defined
> >>by its vclock boundaries, which is accidental.
> >>
> >>> So I wish the garbage collection would be implemented using direct relay to
> >>> wal reporting. In this circumstances I was in need to implement a structure (I
> >>> named it as matrix clock - mclcok) which able to contain relay vclocks and
> >>> evaluate a vclock which is lower bound of n-members of the mclcock.
> >>> The mclock could be used to get the n-majority vclock as wel as the lowest
> >>> boundary of all vclock alive.
> >>> The mclock is already implemented as well as new gc design (wal knows about
> >>> all relay vclock and the first vclock locked by TX - checkpoint or join read
> >>> view).
> >>
> >>The idea of vclock matrix is totally fine & dandy for bsync. Using it for
> >>GC seems like a huge overkill.
> >>
> >>As to the fact that you have more patches on branches, I think
> >>it's better to finish in-memory-replication first - it's a huge
> >>performance boost for replicated set-ups, and reduces the latency,
> >>too.
> >>
> >>--
> >>Konstantin Osipov, Moscow, Russia
> >>https://scylladb.com 
> > 
> > 
> >--
> >Maria Khaydich
> > 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Tarantool-patches] [PATCH] Trigger on vclock change
  2020-06-03 10:12                                 ` Sergey Ostanevich
@ 2020-06-03 12:08                                   ` Alexander Turenko
  0 siblings, 0 replies; 21+ messages in thread
From: Alexander Turenko @ 2020-06-03 12:08 UTC (permalink / raw)
  To: Sergey Ostanevich; +Cc: tarantool-patches

Commented the issue:

https://github.com/tarantool/tarantool/issues/3808#issuecomment-638152861

I would let you, Sergey, to decide whether it worth to close the issue
or implement something within it.

WBR, Alexander Turenko.

On Wed, Jun 03, 2020 at 01:12:38PM +0300, Sergey Ostanevich wrote:
> Hi!
> 
> Thank you for bringing this up!
> 
> It was discussed back in 2019
> https://lists.tarantool.org/pipermail/tarantool-patches/2019-November/012402.html
> and before Kostja and Georgy got personal in their discussion,
> they [kind of] agreed that the single-leader sync replication will
> implement the required functionality.
> 
> As you know, we have first version of synchro planned this quarter
> and I believe this tracker will be resolved as part of it - so no need
> to implement it for the second time.
> 
> Please, link it to the qsync tracker #4842 so we update/close it as
> soon as we're done.
> 
> Regards,
> Sergos
> 
> 
> On 02 июн 15:22, Maria Khaydich wrote:
> > 
> > Using trigger on vclock change to determine the state would be cpu consuming, so I’m currently remaking previous patch so that we could yield from fiber and wait for a specific lsn from a specific replica. A possible use-case: committing a transaction and waiting for it to apply on all replicas. The way I am going to implement it is pretty much how Kostja suggested:  «..wait_lsn() could add the server_id, lsn that is  being waited for to a sorted list, and whenever we update  replicaset vclock for this lsn we also look at top of the list, if  it is not empty, and if the current lsn is greater than the top,  we could pop the value from the list and send a notification to  the waiter».
> >  
> > Anyway, there are still some questions to discuss. 1. Do we need wait_lsn_any() method mentioned here  https://github.com/tarantool/tarantool/issues/3808  ? I don’t see how this one can be useful. 2. What should be done in case of fail (reaching the timeout)? Simply returning an error seems like the best choice to me, so that user can later decide to do as he pleases with this information. 
> > Another issue is that during the last discussion in the mailing list it was mentioned that we wouldn’t need this feature altogether if we had synchronous replication. Any thoughts on this matter?
> > > 
> > >>
> > >>Понедельник, 18 ноября 2019, 12:31 +03:00 от Konstantin Osipov <kostja.osipov@gmail.com>:
> > >> 
> > >>* Georgy Kirichenko < georgy@tarantool.org > [19/11/16 23:37]:
> > >>
> > >>> > What is wrong with GC and how exactly do you want to "fix" it?
> > >>> We have discussed some points with you verbally (about 3-4 month
> > >>> ago). The main point is: the way of information processing is
> > >>> weird:
> > >>
> > >>> 1. WAL has the full information about the wal directory (xlogs
> > >>> and their boundaries)
> > >>
> > >>This is not strictly necessary. It saves us one xdir_scan() in
> > >>xdir_collect_garbage(), this is perhaps the main historical
> > >>reason it's there.
> > >>
> > >>We even have to make an effort to maintain this state in WAL:
> > >>- we call xdir_scan() in wal_enable()
> > >>- we call xdir_add_vclock() whenever we open/close the next xlog.
> > >>
> > >>The second reason it was done in WAL was to not block tx
> > >>thread, but later we had latency spikes in WAL thread as well, so
> > >>we added XDIR_GC_ASYNC to fix these, and this second reason is a
> > >>non-reason any more.
> > >>
> > >>Finally, the third reason WAL does it is wal_fallocate() function,
> > >>which removes files if we're out of space. Instead of going back
> > >>to GC subsystem and asking it to remove a file, the implementation
> > >>went the short route and removes the file directly in WAL
> > >>susbystem and notifies GC as a matter of fact.
> > >>
> > >>As you can see, all these reasons are accidental. Technically any
> > >>subsystem (WAL, GC) can remove these files if we add xdir_scan()
> > >>to xdir_collect_garbage().
> > >>
> > >>GC subsystem is responsible for all the old files, so it should be
> > >>dealing with them.
> > >>
> > >>The fix is to add xdir_scan() to xdir_collect_garbage(), and
> > >>change wal_fallocate() to send a message to GC asking it to remove
> > >>some data, rather than kick the chair out of GC butt by calling
> > >>xdir_collect_garbage(XDIR_GC_REMOVE_ONE). One issue with fixing it
> > >>this way, is what would you do in wal_fallocate() after you send
> > >>the message? You will have to have wal_fallocate_soft(), which
> > >>sends the message asynchronously, to not stall WAL, and
> > >>wal_fallocate_hard(), which would stall WAL until there is
> > >>response from TX about extra space. A lot more work.
> > >>
> > >>Even though WAL contains some of GC state, it's neither an owner
> > >>of it nor a consumer: it is only a producer of GC state, and
> > >>it updates GC state by sending notifications about the files that
> > >>it creates and closes. The consumers are engines, checkpoints,
> > >>backups, relays.
> > >>
> > >>BTW, I don't think in-memory replication is a new consumer of GC state
> > >>- it doesn't act like a standard consumer:
> > >> 
> > >> * a usual consumer may need multiple xlog files, because it can
> > >>   be at a position way behind the current xlog; in-memory
> > >>   replication is almost always pointing to the current xlog,
> > >>   there may be rare cases when it depends on the previous xlogs
> > >>   when xlog size is small or there was a recent rotation.
> > >>
> > >> * in case of standard consumers, each consumer is at its own
> > >>   position, while for in-memory replication, all relays are more
> > >>   or less on the same position - at least it doesn't make any
> > >>   logical sense to advance each relay's position independently
> > >>
> > >>I remember having suggested that, and I don't remember why using a
> > >>single consumer for all in-memory relays did not work out for you.
> > >>The idea is that whenever a relay switches to the memory mode it
> > >>unsubscribes from GC, and whenever it is back to file mode, it is
> > >>subscribes to GC again. In order to avoid any races, in-memory-WAL
> > >>as a consumer keeps a reference to a few WALs.
> > >>
> > >>The alternative is to move GC subsystem entirely to WAL. This
> > >>could perhaps also work and even be cleaner than centralizing GC
> > >>in TX. Either way I don't see it as a blocker for in-memory WAL -
> > >>I think in-memory WAL can work with GC being either in WAL or in
> > >>TX, it's just the messages that threads exchange become a bit more
> > >>complicated.
> > >>
> > >>> 2. WAL process the wal directory cleanup
> > >>
> > >>As I wrote above, there are two reasons for this, both historical:
> > >>- we wanted to avoid TX stalls
> > >>- we have wal_fallocate(), a feature which was implemented
> > >>  "lazily" so it just removes the files under GCs feet and
> > >>  notifies GC after the fact.
> > >>
> > >>GC, logically, controls the WAL dir, and WAL is only a producer of
> > >>WAL files.
> > >>  
> > >>> 3. We filter out all this information while relaying (as a relay
> > >>> has only a stream of rows)
> > >>
> > >>GC subscription is not interested in the stream of rows.
> > >>It is interested in a stream files. A file is represented in GC as a
> > >>vclock, and a row is identified by a vclock, but it doesn't mean
> > >>it's the same thing.
> > >>
> > >>This is why I really dislike your idea of calling gc_advance on
> > >>row events.
> > >>
> > >>> 4. We try to restore some of this information using on_close_log
> > >>> recovery trigger.
> > >>
> > >>No, it's not "restore" the information. It's pass the right event
> > >>about the consumer - the file event - to the GC.
> > >>
> > >>> 5. We send recovered boundaries to TX and tx tread reconstruct
> > >>> the relay order loosing really relay vclocks (as they mapped
> > >>> to the local xlog history)
> > >>
> > >>I don't quite get what you mean here? Could you elaborate?
> > >>I think there is no "reconstruction". There are two types of
> > >>events: the events updating replicaset_vclock, are needed for
> > >>replication monitoring, and they happen often. The action upon
> > >>this event is very cheap - you simply
> > >>vclock_advance(replicaset_vclock).
> > >>
> > >>The second type of event is when relay or backup or engine stops
> > >>using an xlog file. It is also represented by a vclock but it is
> > >>not as cheap to process as the first kind, because gc_advance() is
> > >>not cheap, it's rbtree search.
> > >>
> > >>You keep trying to merge the two streams into a single stream, I
> > >>keep asking to keep the two streams separate. There is of course
> > >>the standard pluses and minuses of using a centralized "event bus"
> > >>for all these events - with a single bus, as you suggest, things
> > >>become simpler for the producer, but the consumers have to do more
> > >>work to filter out the unnecessary events.
> > >>
> > >>> 6. TX sends the oldest vclock back to wal
> > >>
> > >>
> > >>
> > >>> 7. There is some issues with making a consumer inactive. For
> > >>> instance if we deactivated a consumer could survive, for
> > >>> instance if deleted xlog was already send by an applier but
> > >>> not reported yet (I do not even know how it could be fixed in
> > >>> the current design).
> > >>
> > >>I don't want to argue whether it's weird or not, it's subjective.
> > >>I agree GC state is distributed now, and it's better if it is
> > >>centralized.
> > >>
> > >>This could be achieved by either moving WAL xdir state to tx,
> > >>and making sure tx is controlling it, or by moving entire GC
> > >>to WAL. Moving GC state to WAL seems a cleaner approach, but I'm
> > >>fine either way.
> > >>
> > >>> Maybe it is working, but I afraid, this was done without any thinking about
> > >>> the future development (I mean the synchronous replication). Let me explain
> > >>> why.
> > >>> 1. WAL should receive all relay states as soon as possible.
> > >>
> > >>Agree, but it is a different stream of events - it's sync
> > >>replication events. File events are routed to GC subsystem, sync
> > >>replication events are routed to RAFT subsystem in WAL.
> > >>
> > >>> 2. The set of relay vclocks is enough to perform garbage
> > >>> collection (as we could form a vclock with is the lower bound
> > >>> of the set)
> > >>
> > >>This is thanks to the fact that each file is unequivocally defined
> > >>by its vclock boundaries, which is accidental.
> > >>
> > >>> So I wish the garbage collection would be implemented using direct relay to
> > >>> wal reporting. In this circumstances I was in need to implement a structure (I
> > >>> named it as matrix clock - mclcok) which able to contain relay vclocks and
> > >>> evaluate a vclock which is lower bound of n-members of the mclcock.
> > >>> The mclock could be used to get the n-majority vclock as wel as the lowest
> > >>> boundary of all vclock alive.
> > >>> The mclock is already implemented as well as new gc design (wal knows about
> > >>> all relay vclock and the first vclock locked by TX - checkpoint or join read
> > >>> view).
> > >>
> > >>The idea of vclock matrix is totally fine & dandy for bsync. Using it for
> > >>GC seems like a huge overkill.
> > >>
> > >>As to the fact that you have more patches on branches, I think
> > >>it's better to finish in-memory-replication first - it's a huge
> > >>performance boost for replicated set-ups, and reduces the latency,
> > >>too.
> > >>
> > >>--
> > >>Konstantin Osipov, Moscow, Russia
> > >>https://scylladb.com 
> > > 
> > > 
> > >--
> > >Maria Khaydich
> > > 

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2020-06-03 12:08 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-14 12:57 [Tarantool-patches] [PATCH] Trigger on vclock change Maria
2019-11-14 13:44 ` Konstantin Osipov
2019-11-14 14:06   ` Georgy Kirichenko
2019-11-14 15:26     ` Konstantin Osipov
2019-11-14 17:13       ` Georgy Kirichenko
2019-11-14 17:33         ` Konstantin Osipov
2019-11-14 19:16           ` Georgy Kirichenko
2019-11-14 19:48             ` Konstantin Osipov
2019-11-14 20:01               ` Georgy Kirichenko
2019-11-15  1:57                 ` Konstantin Osipov
2019-11-15  6:02                   ` Georgy Kirichenko
2019-11-15 13:57                     ` Konstantin Osipov
2019-11-15 19:57                       ` Georgy Kirichenko
2019-11-16 10:37                         ` Konstantin Osipov
2019-11-16 20:43                           ` Georgy Kirichenko
2019-11-16 11:56                         ` Konstantin Osipov
2019-11-16 20:34                           ` Georgy Kirichenko
2019-11-18  9:31                             ` Konstantin Osipov
2020-06-02 12:22                               ` Maria Khaydich
2020-06-03 10:12                                 ` Sergey Ostanevich
2020-06-03 12:08                                   ` Alexander Turenko

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox