Tarantool development patches archive
 help / color / mirror / Atom feed
* [Tarantool-patches] [PATCH 1/1] qsync: remove polling from box_promote()
@ 2021-07-12 22:20 Vladislav Shpilevoy via Tarantool-patches
  2021-07-13 10:01 ` Serge Petrenko via Tarantool-patches
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Vladislav Shpilevoy via Tarantool-patches @ 2021-07-12 22:20 UTC (permalink / raw)
  To: tarantool-patches, gorcunov, sergepetrenko

box_promote(), when called manually, used to wait for the existing
transactions from a foreign limbo to end within a timeout, giving
them a chance to end on their own terms.

The waiting was done via polling, like this:

    while (!done)
        sleep(small_timeout);

Polling is almost always super bad both for execution time and
for CPU usage. The patch replaces it with proper waiting based on
events happening in the limbo.

Closes #5190
---
Branch: http://github.com/tarantool/tarantool/tree/gerold103/gh-5190-qsync-polling
Issue: https://github.com/tarantool/tarantool/issues/5190

 src/box/box.cc      |  7 +----
 src/box/txn_limbo.c | 74 ++++++++++++++++++++++++++++++++++-----------
 src/box/txn_limbo.h |  4 +++
 3 files changed, 62 insertions(+), 23 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index ab7d983c9..eeb57b04e 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1627,12 +1627,7 @@ box_promote(void)
 	if (try_wait) {
 		/* Wait until pending confirmations/rollbacks reach us. */
 		double timeout = 2 * replication_synchro_timeout;
-		double start_tm = fiber_clock();
-		while (!txn_limbo_is_empty(&txn_limbo)) {
-			if (fiber_clock() - start_tm > timeout)
-				break;
-			fiber_sleep(0.001);
-		}
+		txn_limbo_wait_empty(&txn_limbo, timeout);
 		/*
 		 * Our mission was to clear the limbo from former leader's
 		 * transactions. Exit in case someone did that for us.
diff --git a/src/box/txn_limbo.c b/src/box/txn_limbo.c
index 51dc2a186..fdea287c7 100644
--- a/src/box/txn_limbo.c
+++ b/src/box/txn_limbo.c
@@ -612,11 +612,14 @@ txn_rollback_cb(struct trigger *trigger, void *event)
 	return 0;
 }
 
-int
-txn_limbo_wait_confirm(struct txn_limbo *limbo)
+/**
+ * Wait until the last transaction in the limbo is finished and get its result.
+ */
+static int
+txn_limbo_wait_last_txn(struct txn_limbo *limbo, bool *is_rollback,
+			double timeout)
 {
-	if (txn_limbo_is_empty(limbo))
-		return 0;
+	assert(!txn_limbo_is_empty(limbo));
 
 	/* initialization of a waitpoint. */
 	struct confirm_waitpoint cwp;
@@ -632,27 +635,42 @@ txn_limbo_wait_confirm(struct txn_limbo *limbo)
 	struct txn_limbo_entry *tle = txn_limbo_last_entry(limbo);
 	txn_on_commit(tle->txn, &on_complete);
 	txn_on_rollback(tle->txn, &on_rollback);
-	double start_time = fiber_clock();
+	double deadline = fiber_clock() + timeout;
+	int rc;
 	while (true) {
-		double deadline = start_time + replication_synchro_timeout;
+		if (timeout < 0) {
+			rc = -1;
+			break;
+		}
 		bool cancellable = fiber_set_cancellable(false);
-		double timeout = deadline - fiber_clock();
-		int rc = fiber_cond_wait_timeout(&limbo->wait_cond, timeout);
+		rc = fiber_cond_wait_timeout(&limbo->wait_cond, timeout);
 		fiber_set_cancellable(cancellable);
-		if (cwp.is_confirm || cwp.is_rollback)
-			goto complete;
+		if (cwp.is_confirm || cwp.is_rollback) {
+			*is_rollback = cwp.is_rollback;
+			rc = 0;
+			break;
+		}
 		if (rc != 0)
-			goto timed_out;
+			break;
+		timeout = deadline - fiber_clock();
 	}
-timed_out:
-	/* Clear the triggers if the timeout has been reached. */
 	trigger_clear(&on_complete);
 	trigger_clear(&on_rollback);
-	diag_set(ClientError, ER_SYNC_QUORUM_TIMEOUT);
-	return -1;
+	return rc;
+}
 
-complete:
-	if (!cwp.is_confirm) {
+int
+txn_limbo_wait_confirm(struct txn_limbo *limbo)
+{
+	if (txn_limbo_is_empty(limbo))
+		return 0;
+	bool is_rollback;
+	if (txn_limbo_wait_last_txn(limbo, &is_rollback,
+				    replication_synchro_timeout) != 0) {
+		diag_set(ClientError, ER_SYNC_QUORUM_TIMEOUT);
+		return -1;
+	}
+	if (is_rollback) {
 		/* The transaction has been rolled back. */
 		diag_set(ClientError, ER_SYNC_ROLLBACK);
 		return -1;
@@ -660,6 +678,28 @@ complete:
 	return 0;
 }
 
+int
+txn_limbo_wait_empty(struct txn_limbo *limbo, double timeout)
+{
+	if (txn_limbo_is_empty(limbo))
+		return 0;
+	bool is_rollback;
+	double deadline = fiber_clock() + timeout;
+	/*
+	 * Retry in the loop. More transactions might be added while waiting for
+	 * the last one.
+	 */
+	do {
+		if (txn_limbo_wait_last_txn(limbo, &is_rollback,
+					    timeout) != 0) {
+			diag_set(ClientError, ER_TIMEOUT);
+			return -1;
+		}
+		timeout = deadline - fiber_clock();
+	} while (!txn_limbo_is_empty(limbo));
+	return 0;
+}
+
 void
 txn_limbo_process(struct txn_limbo *limbo, const struct synchro_request *req)
 {
diff --git a/src/box/txn_limbo.h b/src/box/txn_limbo.h
index e409ac657..7debbc0b9 100644
--- a/src/box/txn_limbo.h
+++ b/src/box/txn_limbo.h
@@ -311,6 +311,10 @@ txn_limbo_process(struct txn_limbo *limbo, const struct synchro_request *req);
 int
 txn_limbo_wait_confirm(struct txn_limbo *limbo);
 
+/** Wait until the limbo is empty. Regardless of how its transactions end. */
+int
+txn_limbo_wait_empty(struct txn_limbo *limbo, double timeout);
+
 /**
  * Write a PROMOTE request, which has the same effect as CONFIRM(@a lsn) and
  * ROLLBACK(@a lsn + 1) combined.
-- 
2.24.3 (Apple Git-128)


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Tarantool-patches] [PATCH 1/1] qsync: remove polling from box_promote()
  2021-07-12 22:20 [Tarantool-patches] [PATCH 1/1] qsync: remove polling from box_promote() Vladislav Shpilevoy via Tarantool-patches
@ 2021-07-13 10:01 ` Serge Petrenko via Tarantool-patches
  2021-07-13 10:02 ` Serge Petrenko via Tarantool-patches
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Serge Petrenko via Tarantool-patches @ 2021-07-13 10:01 UTC (permalink / raw)
  To: Vladislav Shpilevoy, tarantool-patches, gorcunov



13.07.2021 01:20, Vladislav Shpilevoy пишет:
> box_promote() when called manually used to wait for the existing
> transactions from a foreign limbo to end during a timeout. Giving
> them a chance to end on their terms.
>
> The waiting was done via polling like
>
>      while (!done)
>          sleep(small_timeout);
>
> Polling is almost always super bad both for execution time and
> for CPU usage. The patch replaces it with proper waiting based on
> events happening in the limbo.
>
> Closes #5190
> ---
> Branch: http://github.com/tarantool/tarantool/tree/gerold103/gh-5190-qsync-polling
> Issue: https://github.com/tarantool/tarantool/issues/5190
>
>   src/box/box.cc      |  7 +----
>   src/box/txn_limbo.c | 74 ++++++++++++++++++++++++++++++++++-----------
>   src/box/txn_limbo.h |  4 +++
>   3 files changed, 62 insertions(+), 23 deletions(-)
>
> diff --git a/src/box/box.cc b/src/box/box.cc
> index ab7d983c9..eeb57b04e 100644
> --- a/src/box/box.cc
> +++ b/src/box/box.cc
> @@ -1627,12 +1627,7 @@ box_promote(void)
>   	if (try_wait) {
>   		/* Wait until pending confirmations/rollbacks reach us. */
>   		double timeout = 2 * replication_synchro_timeout;
> -		double start_tm = fiber_clock();
> -		while (!txn_limbo_is_empty(&txn_limbo)) {
> -			if (fiber_clock() - start_tm > timeout)
> -				break;
> -			fiber_sleep(0.001);
> -		}
> +		txn_limbo_wait_empty(&txn_limbo, timeout);
>   		/*
>   		 * Our mission was to clear the limbo from former leader's
>   		 * transactions. Exit in case someone did that for us.
> diff --git a/src/box/txn_limbo.c b/src/box/txn_limbo.c
> index 51dc2a186..fdea287c7 100644
> --- a/src/box/txn_limbo.c
> +++ b/src/box/txn_limbo.c
> @@ -612,11 +612,14 @@ txn_rollback_cb(struct trigger *trigger, void *event)
>   	return 0;
>   }
>   
> -int
> -txn_limbo_wait_confirm(struct txn_limbo *limbo)
> +/**
> + * Wait until the last transaction in the limbo is finished and get its result.
> + */
> +static int
> +txn_limbo_wait_last_txn(struct txn_limbo *limbo, bool *is_rollback,
> +			double timeout)
>   {
> -	if (txn_limbo_is_empty(limbo))
> -		return 0;
> +	assert(!txn_limbo_is_empty(limbo));
>   
>   	/* initialization of a waitpoint. */
>   	struct confirm_waitpoint cwp;
> @@ -632,27 +635,42 @@ txn_limbo_wait_confirm(struct txn_limbo *limbo)
>   	struct txn_limbo_entry *tle = txn_limbo_last_entry(limbo);
>   	txn_on_commit(tle->txn, &on_complete);
>   	txn_on_rollback(tle->txn, &on_rollback);
> -	double start_time = fiber_clock();
> +	double deadline = fiber_clock() + timeout;
> +	int rc;
>   	while (true) {
> -		double deadline = start_time + replication_synchro_timeout;
> +		if (timeout < 0) {
> +			rc = -1;
> +			break;
> +		}
>   		bool cancellable = fiber_set_cancellable(false);
> -		double timeout = deadline - fiber_clock();
> -		int rc = fiber_cond_wait_timeout(&limbo->wait_cond, timeout);
> +		rc = fiber_cond_wait_timeout(&limbo->wait_cond, timeout);
>   		fiber_set_cancellable(cancellable);
> -		if (cwp.is_confirm || cwp.is_rollback)
> -			goto complete;
> +		if (cwp.is_confirm || cwp.is_rollback) {
> +			*is_rollback = cwp.is_rollback;
> +			rc = 0;
> +			break;
> +		}
>   		if (rc != 0)
> -			goto timed_out;
> +			break;
> +		timeout = deadline - fiber_clock();
>   	}
> -timed_out:
> -	/* Clear the triggers if the timeout has been reached. */
>   	trigger_clear(&on_complete);
>   	trigger_clear(&on_rollback);
> -	diag_set(ClientError, ER_SYNC_QUORUM_TIMEOUT);
> -	return -1;
> +	return rc;
> +}
>   
> -complete:
> -	if (!cwp.is_confirm) {
> +int
> +txn_limbo_wait_confirm(struct txn_limbo *limbo)
> +{
> +	if (txn_limbo_is_empty(limbo))
> +		return 0;
> +	bool is_rollback;
> +	if (txn_limbo_wait_last_txn(limbo, &is_rollback,
> +				    replication_synchro_timeout) != 0) {
> +		diag_set(ClientError, ER_SYNC_QUORUM_TIMEOUT);
> +		return -1;
> +	}
> +	if (is_rollback) {
>   		/* The transaction has been rolled back. */
>   		diag_set(ClientError, ER_SYNC_ROLLBACK);
>   		return -1;
> @@ -660,6 +678,28 @@ complete:
>   	return 0;
>   }
>   
> +int
> +txn_limbo_wait_empty(struct txn_limbo *limbo, double timeout)
> +{
> +	if (txn_limbo_is_empty(limbo))
> +		return 0;
> +	bool is_rollback;
> +	double deadline = fiber_clock() + timeout;
> +	/*
> +	 * Retry in the loop. More transactions might be added while waiting for
> +	 * the last one.
> +	 */
> +	do {
> +		if (txn_limbo_wait_last_txn(limbo, &is_rollback,
> +					    timeout) != 0) {
> +			diag_set(ClientError, ER_TIMEOUT);
> +			return -1;
> +		}
> +		timeout = deadline - fiber_clock();
> +	} while (!txn_limbo_is_empty(limbo));
> +	return 0;
> +}
> +
>   void
>   txn_limbo_process(struct txn_limbo *limbo, const struct synchro_request *req)
>   {
> diff --git a/src/box/txn_limbo.h b/src/box/txn_limbo.h
> index e409ac657..7debbc0b9 100644
> --- a/src/box/txn_limbo.h
> +++ b/src/box/txn_limbo.h
> @@ -311,6 +311,10 @@ txn_limbo_process(struct txn_limbo *limbo, const struct synchro_request *req);
>   int
>   txn_limbo_wait_confirm(struct txn_limbo *limbo);
>   
> +/** Wait until the limbo is empty. Regardless of how its transactions end. */
> +int
> +txn_limbo_wait_empty(struct txn_limbo *limbo, double timeout);
> +
>   /**
>    * Write a PROMOTE request, which has the same effect as CONFIRM(@a lsn) and
>    * ROLLBACK(@a lsn + 1) combined.

-- 
Serge Petrenko


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Tarantool-patches] [PATCH 1/1] qsync: remove polling from box_promote()
  2021-07-12 22:20 [Tarantool-patches] [PATCH 1/1] qsync: remove polling from box_promote() Vladislav Shpilevoy via Tarantool-patches
  2021-07-13 10:01 ` Serge Petrenko via Tarantool-patches
@ 2021-07-13 10:02 ` Serge Petrenko via Tarantool-patches
  2021-07-13 12:19 ` Cyrill Gorcunov via Tarantool-patches
  2021-07-13 20:13 ` Vladislav Shpilevoy via Tarantool-patches
  3 siblings, 0 replies; 5+ messages in thread
From: Serge Petrenko via Tarantool-patches @ 2021-07-13 10:02 UTC (permalink / raw)
  To: Vladislav Shpilevoy, tarantool-patches, gorcunov



13.07.2021 01:20, Vladislav Shpilevoy пишет:
> box_promote() when called manually used to wait for the existing
> transactions from a foreign limbo to end during a timeout. Giving
> them a chance to end on their terms.
>
> The waiting was done via polling like
>
>      while (!done)
>          sleep(small_timeout);
>
> Polling is almost always super bad both for execution time and
> for CPU usage. The patch replaces it with proper waiting based on
> events happening in the limbo.
>
> Closes #5190
> ---
> Branch: http://github.com/tarantool/tarantool/tree/gerold103/gh-5190-qsync-polling
> Issue: https://github.com/tarantool/tarantool/issues/5190
>
>   src/box/box.cc      |  7 +----
>   src/box/txn_limbo.c | 74 ++++++++++++++++++++++++++++++++++-----------
>   src/box/txn_limbo.h |  4 +++
>   3 files changed, 62 insertions(+), 23 deletions(-)
>

  Hi! Thanks for the fix! LGTM.

-- 
Serge Petrenko


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Tarantool-patches] [PATCH 1/1] qsync: remove polling from box_promote()
  2021-07-12 22:20 [Tarantool-patches] [PATCH 1/1] qsync: remove polling from box_promote() Vladislav Shpilevoy via Tarantool-patches
  2021-07-13 10:01 ` Serge Petrenko via Tarantool-patches
  2021-07-13 10:02 ` Serge Petrenko via Tarantool-patches
@ 2021-07-13 12:19 ` Cyrill Gorcunov via Tarantool-patches
  2021-07-13 20:13 ` Vladislav Shpilevoy via Tarantool-patches
  3 siblings, 0 replies; 5+ messages in thread
From: Cyrill Gorcunov via Tarantool-patches @ 2021-07-13 12:19 UTC (permalink / raw)
  To: Vladislav Shpilevoy; +Cc: tarantool-patches

On Tue, Jul 13, 2021 at 12:20:07AM +0200, Vladislav Shpilevoy wrote:
> box_promote() when called manually used to wait for the existing
> transactions from a foreign limbo to end during a timeout. Giving
> them a chance to end on their terms.
Ack

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Tarantool-patches] [PATCH 1/1] qsync: remove polling from box_promote()
  2021-07-12 22:20 [Tarantool-patches] [PATCH 1/1] qsync: remove polling from box_promote() Vladislav Shpilevoy via Tarantool-patches
                   ` (2 preceding siblings ...)
  2021-07-13 12:19 ` Cyrill Gorcunov via Tarantool-patches
@ 2021-07-13 20:13 ` Vladislav Shpilevoy via Tarantool-patches
  3 siblings, 0 replies; 5+ messages in thread
From: Vladislav Shpilevoy via Tarantool-patches @ 2021-07-13 20:13 UTC (permalink / raw)
  To: tarantool-patches, gorcunov, sergepetrenko

Pushed to master, 2.8, 2.7.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-07-13 20:13 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-12 22:20 [Tarantool-patches] [PATCH 1/1] qsync: remove polling from box_promote() Vladislav Shpilevoy via Tarantool-patches
2021-07-13 10:01 ` Serge Petrenko via Tarantool-patches
2021-07-13 10:02 ` Serge Petrenko via Tarantool-patches
2021-07-13 12:19 ` Cyrill Gorcunov via Tarantool-patches
2021-07-13 20:13 ` Vladislav Shpilevoy via Tarantool-patches

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox