Tarantool development patches archive
 help / color / mirror / Atom feed
* [Tarantool-patches] [PATCH] recovery: make it yield when positioning in a WAL
@ 2021-04-26 16:59 Serge Petrenko via Tarantool-patches
  2021-04-26 21:20 ` Vladislav Shpilevoy via Tarantool-patches
  0 siblings, 1 reply; 7+ messages in thread
From: Serge Petrenko via Tarantool-patches @ 2021-04-26 16:59 UTC (permalink / raw)
  To: v.shpilevoy, gorcunov; +Cc: tarantool-patches

We had various places in box.cc and relay.cc which counted processed
rows and yielded every now and then. These yields didn't cover cases,
when recovery has to position inside a long WAL file:

For example, when tarantool exits without leaving an empty WAL file
which'll be used to recover instance vclock on restart. In this case
the instance freezes while processing the last available WAL in order
to recover the vclock.

Another issue is with replication. If a replica connects and needs data
from the end of a really long WAL, recovery will read up to the needed
position without yields, making relay disconnect by timeout.

In order to fix the issue, make recovery decide when a yield should
happen. Introduce a new callback: schedule_yield, which is called by
recovery once it processes (no matter how, either simply skips or calls
xstream_write) enough rows (WAL_ROWS_PER_YIELD).

schedule_yield either yields right away, in case of relay, or saves the
yield for later, in case of local recovery, because it might be in the
middle of a transaction.

The only place with explicit row counting and manual yielding is now in
relay_initial_join, since its row sources are engines rather than recovery
with its WAL files.

Closes #5979
---
https://github.com/tarantool/tarantool/tree/sp/gh-5979-recovery-yield
https://github.com/tarantool/tarantool/issues/5979

 src/box/box.cc      | 32 +++++++++++++++-----------------
 src/box/recovery.cc | 26 +++++++++++++++++---------
 src/box/recovery.h  | 15 ++++++++++++++-
 src/box/relay.cc    | 28 ++++++++++++++++++----------
 4 files changed, 64 insertions(+), 37 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index 59925962d..69a8f87eb 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -335,8 +335,6 @@ struct wal_stream {
 	 * matches LSN of a first global row.
 	 */
 	bool has_global_row;
-	/** How many rows have been recovered so far. */
-	size_t rows;
 };
 
 /**
@@ -570,12 +568,6 @@ end_diag_request:
 static void
 wal_stream_try_yield(struct wal_stream *stream)
 {
-	/*
-	 * Save the yield. Otherwise it would happen only on rows which
-	 * are a multiple of WAL_ROWS_PER_YIELD and are last in their
-	 * transaction, which is probably a very rare coincidence.
-	 */
-	stream->has_yield |= (stream->rows % WAL_ROWS_PER_YIELD == 0);
 	if (wal_stream_has_tx(stream) || !stream->has_yield)
 		return;
 	stream->has_yield = false;
@@ -587,11 +579,6 @@ wal_stream_apply_row(struct xstream *base, struct xrow_header *row)
 {
 	struct wal_stream *stream =
 		container_of(base, struct wal_stream, base);
-	/*
-	 * Account all rows, even non-DML, and even leading to an error. Because
-	 * still need to yield sometimes.
-	 */
-	++stream->rows;
 	if (iproto_type_is_synchro_request(row->type)) {
 		if (wal_stream_apply_synchro_row(stream, row) != 0)
 			goto end_error;
@@ -618,7 +605,6 @@ wal_stream_create(struct wal_stream *ctx)
 	ctx->first_row_lsn = 0;
 	ctx->has_yield = false;
 	ctx->has_global_row = false;
-	ctx->rows = 0;
 }
 
 /* {{{ configuration bindings */
@@ -3101,6 +3087,19 @@ bootstrap(const struct tt_uuid *instance_uuid,
 	}
 }
 
+struct wal_stream wal_stream;
+
+/**
+ * Plan a yield in recovery stream. Wal stream will execute it as soon as it's
+ * ready.
+ */
+static void
+wal_stream_schedule_yield(void)
+{
+	wal_stream.has_yield = true;
+	wal_stream_try_yield(&wal_stream);
+}
+
 /**
  * Recover the instance from the local directory.
  * Enter hot standby if the directory is locked.
@@ -3124,7 +3123,6 @@ local_recovery(const struct tt_uuid *instance_uuid,
 
 	say_info("instance uuid %s", tt_uuid_str(&INSTANCE_UUID));
 
-	struct wal_stream wal_stream;
 	wal_stream_create(&wal_stream);
 	auto stream_guard = make_scoped_guard([&]{
 		wal_stream_abort(&wal_stream);
@@ -3132,8 +3130,8 @@ local_recovery(const struct tt_uuid *instance_uuid,
 
 	struct recovery *recovery;
 	bool is_force_recovery = cfg_geti("force_recovery");
-	recovery = recovery_new(wal_dir(), is_force_recovery,
-				checkpoint_vclock);
+	recovery = recovery_new(wal_dir(), is_force_recovery, checkpoint_vclock,
+				wal_stream_schedule_yield);
 
 	/*
 	 * Make sure we report the actual recovery position
diff --git a/src/box/recovery.cc b/src/box/recovery.cc
index cd33e7635..5351d8524 100644
--- a/src/box/recovery.cc
+++ b/src/box/recovery.cc
@@ -81,7 +81,7 @@
  */
 struct recovery *
 recovery_new(const char *wal_dirname, bool force_recovery,
-	     const struct vclock *vclock)
+	     const struct vclock *vclock, schedule_yield_f schedule_yield)
 {
 	struct recovery *r = (struct recovery *)
 			calloc(1, sizeof(*r));
@@ -113,6 +113,9 @@ recovery_new(const char *wal_dirname, bool force_recovery,
 	r->watcher = NULL;
 	rlist_create(&r->on_close_log);
 
+	r->row_count = 0;
+	r->schedule_yield = schedule_yield;
+
 	guard.is_active = false;
 	return r;
 }
@@ -139,8 +142,12 @@ recovery_scan(struct recovery *r, struct vclock *end_vclock,
 	if (xdir_open_cursor(&r->wal_dir, vclock_sum(end_vclock), &cursor) != 0)
 		return;
 	struct xrow_header row;
-	while (xlog_cursor_next(&cursor, &row, true) == 0)
+	while (xlog_cursor_next(&cursor, &row, true) == 0) {
 		vclock_follow_xrow(end_vclock, &row);
+		if (++r->row_count % WAL_ROWS_PER_YIELD == 0) {
+			r->schedule_yield();
+		}
+	}
 	xlog_cursor_close(&cursor, false);
 }
 
@@ -241,10 +248,16 @@ static void
 recover_xlog(struct recovery *r, struct xstream *stream,
 	     const struct vclock *stop_vclock)
 {
+	/* Imitate old behaviour. Rows are counted separately for each xlog. */
+	r->row_count = 0;
 	struct xrow_header row;
-	uint64_t row_count = 0;
 	while (xlog_cursor_next_xc(&r->cursor, &row,
 				   r->wal_dir.force_recovery) == 0) {
+		if (++r->row_count % WAL_ROWS_PER_YIELD == 0) {
+			r->schedule_yield();
+		}
+		if (r->row_count % 100000 == 0)
+			say_info("%.1fM rows processed", r->row_count / 1000000.);
 		/*
 		 * Read the next row from xlog file.
 		 *
@@ -273,12 +286,7 @@ recover_xlog(struct recovery *r, struct xstream *stream,
 		 * failed row anyway.
 		 */
 		vclock_follow_xrow(&r->vclock, &row);
-		if (xstream_write(stream, &row) == 0) {
-			++row_count;
-			if (row_count % 100000 == 0)
-				say_info("%.1fM rows processed",
-					 row_count / 1000000.);
-		} else {
+		if (xstream_write(stream, &row) != 0) {
 			if (!r->wal_dir.force_recovery)
 				diag_raise();
 
diff --git a/src/box/recovery.h b/src/box/recovery.h
index c8ccaa553..4212fd192 100644
--- a/src/box/recovery.h
+++ b/src/box/recovery.h
@@ -43,6 +43,12 @@ extern "C" {
 struct xrow_header;
 struct xstream;
 
+/**
+ * A type for a callback invoked by recovery after some batch of rows is
+ * processed. Is used mostly to unblock the event loop every now and then.
+ */
+typedef void (*schedule_yield_f)(void);
+
 struct recovery {
 	struct vclock vclock;
 	/** The WAL cursor we're currently reading/writing from/to. */
@@ -56,11 +62,18 @@ struct recovery {
 	struct fiber *watcher;
 	/** List of triggers invoked when the current WAL is closed. */
 	struct rlist on_close_log;
+	uint64_t row_count;
+	/**
+	 * A callback recovery calls every WAL_ROWS_PER_YIELD processed rows.
+	 * This includes both applied rows and the rows which were skipped to
+	 * position recovery at the right position in xlog.
+	 */
+	schedule_yield_f schedule_yield;
 };
 
 struct recovery *
 recovery_new(const char *wal_dirname, bool force_recovery,
-	     const struct vclock *vclock);
+	     const struct vclock *vclock, schedule_yield_f schedule_yield);
 
 void
 recovery_delete(struct recovery *r);
diff --git a/src/box/relay.cc b/src/box/relay.cc
index ff43c2fc7..c7002bd46 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -418,6 +418,12 @@ relay_final_join_f(va_list ap)
 	return 0;
 }
 
+static void
+relay_yield(void)
+{
+	fiber_sleep(0);
+}
+
 void
 relay_final_join(int fd, uint64_t sync, struct vclock *start_vclock,
 		 struct vclock *stop_vclock)
@@ -438,7 +444,7 @@ relay_final_join(int fd, uint64_t sync, struct vclock *start_vclock,
 	 * always be valid.
 	 */
 	vclock_copy(&relay->recv_vclock, start_vclock);
-	relay->r = recovery_new(wal_dir(), false, start_vclock);
+	relay->r = recovery_new(wal_dir(), false, start_vclock, relay_yield);
 	vclock_copy(&relay->stop_vclock, stop_vclock);
 
 	int rc = cord_costart(&relay->cord, "final_join",
@@ -904,7 +910,7 @@ relay_subscribe(struct replica *replica, int fd, uint64_t sync,
 	 * always be valid.
 	 */
 	vclock_copy(&relay->recv_vclock, replica_clock);
-	relay->r = recovery_new(wal_dir(), false, replica_clock);
+	relay->r = recovery_new(wal_dir(), false, replica_clock, relay_yield);
 	vclock_copy(&relay->tx.vclock, replica_clock);
 	relay->version_id = replica_version_id;
 
@@ -928,13 +934,6 @@ relay_send(struct relay *relay, struct xrow_header *packet)
 	coio_write_xrow(&relay->io, packet);
 	fiber_gc();
 
-	/*
-	 * It may happen that the socket is always ready for write, so yield
-	 * explicitly every now and then to not block the event loop.
-	 */
-	if (++relay->row_count % WAL_ROWS_PER_YIELD == 0)
-		fiber_sleep(0);
-
 	struct errinj *inj = errinj(ERRINJ_RELAY_TIMEOUT, ERRINJ_DOUBLE);
 	if (inj != NULL && inj->dparam > 0)
 		fiber_sleep(inj->dparam);
@@ -944,6 +943,15 @@ static void
 relay_send_initial_join_row(struct xstream *stream, struct xrow_header *row)
 {
 	struct relay *relay = container_of(stream, struct relay, stream);
+	/*
+	 * It may happen that the socket is always ready for write, so yield
+	 * explicitly every now and then to not block the event loop.
+	 * This is not part of relay_yield(), because here the engines are the
+	 * source of rows, and not recovery.
+	 */
+	if (++relay->row_count % WAL_ROWS_PER_YIELD == 0)
+		fiber_sleep(0);
+
 	/*
 	 * Ignore replica local requests as we don't need to promote
 	 * vclock while sending a snapshot.
@@ -982,7 +990,7 @@ relay_restart_recovery(struct relay *relay)
 	struct vclock restart_vclock;
 	vclock_copy(&restart_vclock, &relay->recv_vclock);
 	vclock_reset(&restart_vclock, 0, vclock_get(&relay->r->vclock, 0));
-	struct recovery *r = recovery_new(wal_dir(), false, &restart_vclock);
+	struct recovery *r = recovery_new(wal_dir(), false, &restart_vclock, relay_yield);
 	rlist_swap(&relay->r->on_close_log, &r->on_close_log);
 	recovery_delete(relay->r);
 	relay->r = r;
-- 
2.24.3 (Apple Git-128)


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Tarantool-patches] [PATCH] recovery: make it yield when positioning in a WAL
  2021-04-26 16:59 [Tarantool-patches] [PATCH] recovery: make it yield when positioning in a WAL Serge Petrenko via Tarantool-patches
@ 2021-04-26 21:20 ` Vladislav Shpilevoy via Tarantool-patches
  2021-04-28 15:34   ` Serge Petrenko via Tarantool-patches
  0 siblings, 1 reply; 7+ messages in thread
From: Vladislav Shpilevoy via Tarantool-patches @ 2021-04-26 21:20 UTC (permalink / raw)
  To: Serge Petrenko, gorcunov; +Cc: tarantool-patches

Hi! Thanks for the patch!

See 2 questions, 1 comment.

On 26.04.2021 18:59, Serge Petrenko wrote:
> We had various places in box.cc and relay.cc which counted processed
> rows and yielded every now and then. These yields didn't cover cases,
> when recovery has to position inside a long WAL file:
> 
> For example, when tarantool exits without leaving an empty WAL file
> which'll be used to recover instance vclock on restart. In this case
> the instance freezes while processing the last available WAL in order
> to recover the vclock.
> 
> Another issue is with replication. If a replica connects and needs data
> from the end of a really long WAL, recovery will read up to the needed
> position without yields, making relay disconnect by timeout.
> 
> In order to fix the issue, make recovery decide when a yield should
> happen. Introduce a new callback: schedule_yield, which is called by
> recovery once it processes (no matter how, either simply skips or calls
> xstream_write) enough rows (WAL_ROWS_PER_YIELD).
> 
> schedule_yield either yields right away, in case of relay, or saves the
> yield for later, in case of local recovery, because it might be in the
> middle of a transaction.

1. Did you consider an option to yield explicitly in recovery code when
it skips rows? If they are being skipped, it does not matter what are
their transaction borders.

Then the whole patch would be to add the yield once per WAL_ROWS_PER_YIELD
to recovery_scan(), correct?

> The only place with explicit row counting and manual yielding is now in
> relay_initial_join, since its row sources are engines rather than recovery
> with its WAL files.
> 
> Closes #5979
> ---
> https://github.com/tarantool/tarantool/tree/sp/gh-5979-recovery-yield
> https://github.com/tarantool/tarantool/issues/5979
> 
> diff --git a/src/box/box.cc b/src/box/box.cc
> index 59925962d..69a8f87eb 100644
> --- a/src/box/box.cc
> +++ b/src/box/box.cc
> @@ -3101,6 +3087,19 @@ bootstrap(const struct tt_uuid *instance_uuid,
>  	}
>  }
>  
> +struct wal_stream wal_stream;

2. This must be static.

> +
> +/**
> + * Plan a yield in recovery stream. Wal stream will execute it as soon as it's
> + * ready.
> + */
> +static void
> +wal_stream_schedule_yield(void)
> +{
> +	wal_stream.has_yield = true;
> +	wal_stream_try_yield(&wal_stream);
> +}
> diff --git a/src/box/recovery.cc b/src/box/recovery.cc
> index cd33e7635..5351d8524 100644
> --- a/src/box/recovery.cc
> +++ b/src/box/recovery.cc
> @@ -241,10 +248,16 @@ static void
>  recover_xlog(struct recovery *r, struct xstream *stream,
>  	     const struct vclock *stop_vclock)
>  {
> +	/* Imitate old behaviour. Rows are counted separately for each xlog. */
> +	r->row_count = 0;

3. But why do you need to imitate it? Does it mean if the files are
too small to yield even once in each, but in total their number is
huge, there won't be yields?

Also does it mean "1M rows processed" was not ever printed in that
case?

>  	struct xrow_header row;
> -	uint64_t row_count = 0;
>  	while (xlog_cursor_next_xc(&r->cursor, &row,
>  				   r->wal_dir.force_recovery) == 0) {
> +		if (++r->row_count % WAL_ROWS_PER_YIELD == 0) {
> +			r->schedule_yield();
> +		}
> +		if (r->row_count % 100000 == 0)
> +			say_info("%.1fM rows processed", r->row_count / 1000000.);
>  		/*
>  		 * Read the next row from xlog file.
>  		 *
> @@ -273,12 +286,7 @@ recover_xlog(struct recovery *r, struct xstream *stream,
>  		 * failed row anyway.
>  		 */
>  		vclock_follow_xrow(&r->vclock, &row);
> -		if (xstream_write(stream, &row) == 0) {
> -			++row_count;
> -			if (row_count % 100000 == 0)
> -				say_info("%.1fM rows processed",
> -					 row_count / 1000000.);
> -		} else {
> +		if (xstream_write(stream, &row) != 0) {
>  			if (!r->wal_dir.force_recovery)
>  				diag_raise();
>  

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Tarantool-patches] [PATCH] recovery: make it yield when positioning in a WAL
  2021-04-26 21:20 ` Vladislav Shpilevoy via Tarantool-patches
@ 2021-04-28 15:34   ` Serge Petrenko via Tarantool-patches
  2021-04-28 20:50     ` Vladislav Shpilevoy via Tarantool-patches
  0 siblings, 1 reply; 7+ messages in thread
From: Serge Petrenko via Tarantool-patches @ 2021-04-28 15:34 UTC (permalink / raw)
  To: Vladislav Shpilevoy, gorcunov; +Cc: tarantool-patches



27.04.2021 00:20, Vladislav Shpilevoy пишет:
> Hi! Thanks for the patch!
>
> See 2 questions, 1 comment.
>
> On 26.04.2021 18:59, Serge Petrenko wrote:

Thanks for the review!
>> We had various places in box.cc and relay.cc which counted processed
>> rows and yielded every now and then. These yields didn't cover cases,
>> when recovery has to position inside a long WAL file:
>>
>> For example, when tarantool exits without leaving an empty WAL file
>> which'll be used to recover instance vclock on restart. In this case
>> the instance freezes while processing the last available WAL in order
>> to recover the vclock.
>>
>> Another issue is with replication. If a replica connects and needs data
>> from the end of a really long WAL, recovery will read up to the needed
>> position without yields, making relay disconnect by timeout.
>>
>> In order to fix the issue, make recovery decide when a yield should
>> happen. Introduce a new callback: schedule_yield, which is called by
>> recovery once it processes (no matter how, either simply skips or calls
>> xstream_write) enough rows (WAL_ROWS_PER_YIELD).
>>
>> schedule_yield either yields right away, in case of relay, or saves the
>> yield for later, in case of local recovery, because it might be in the
>> middle of a transaction.
> 1. Did you consider an option to yield explicitly in recovery code when
> it skips rows? If they are being skipped, it does not matter what are
> their transaction borders.

I did consider that. It is possible to do so, but then we'll have yet 
another
place (in addition to relay and wal_stream) which counts rows and yields
every now and then.

I thought it would be better to unify all these places. Actually, this 
could be
done this way from the very beginning.
I think it's not recovery's business whether to yield or not once
some rows are processed.

Anyway, I can make it this way, if you insist.

>
> Then the whole patch would be to add the yield once per WAL_ROWS_PER_YIELD
> to recovery_scan(), correct?

True. One place in recovery_scan() and one place in recover_xlog(), when
the rows are skipped.

>
>> The only place with explicit row counting and manual yielding is now in
>> relay_initial_join, since its row sources are engines rather than recovery
>> with its WAL files.
>>
>> Closes #5979
>> ---
>> https://github.com/tarantool/tarantool/tree/sp/gh-5979-recovery-yield
>> https://github.com/tarantool/tarantool/issues/5979
>>
>> diff --git a/src/box/box.cc b/src/box/box.cc
>> index 59925962d..69a8f87eb 100644
>> --- a/src/box/box.cc
>> +++ b/src/box/box.cc
>> @@ -3101,6 +3087,19 @@ bootstrap(const struct tt_uuid *instance_uuid,
>>   	}
>>   }
>>   
>> +struct wal_stream wal_stream;
> 2. This must be static.

Sure.

===================================================

diff --git a/src/box/box.cc b/src/box/box.cc
index 69a8f87eb..62b55352e 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -3087,7 +3087,7 @@ bootstrap(const struct tt_uuid *instance_uuid,
         }
  }

-struct wal_stream wal_stream;
+static struct wal_stream wal_stream;

  /**
   * Plan a yield in recovery stream. Wal stream will execute it as soon 
as it's


===================================================
>
>> +
>> +/**
>> + * Plan a yield in recovery stream. Wal stream will execute it as soon as it's
>> + * ready.
>> + */
>> +static void
>> +wal_stream_schedule_yield(void)
>> +{
>> +	wal_stream.has_yield = true;
>> +	wal_stream_try_yield(&wal_stream);
>> +}
>> diff --git a/src/box/recovery.cc b/src/box/recovery.cc
>> index cd33e7635..5351d8524 100644
>> --- a/src/box/recovery.cc
>> +++ b/src/box/recovery.cc
>> @@ -241,10 +248,16 @@ static void
>>   recover_xlog(struct recovery *r, struct xstream *stream,
>>   	     const struct vclock *stop_vclock)
>>   {
>> +	/* Imitate old behaviour. Rows are counted separately for each xlog. */
>> +	r->row_count = 0;
> 3. But why do you need to imitate it? Does it mean if the files are
> too small to yield even once in each, but in total their number is
> huge, there won't be yields?

Yes, that's true.

>
> Also does it mean "1M rows processed" was not ever printed in that
> case?

Yes, when WALs are not big enough.
Recovery starts over with '0.1M rows processed' on every new WAL file.

>
>>   	struct xrow_header row;
>> -	uint64_t row_count = 0;
>>   	while (xlog_cursor_next_xc(&r->cursor, &row,
>>   				   r->wal_dir.force_recovery) == 0) {
>> +		if (++r->row_count % WAL_ROWS_PER_YIELD == 0) {
>> +			r->schedule_yield();
>> +		}
>> +		if (r->row_count % 100000 == 0)
>> +			say_info("%.1fM rows processed", r->row_count / 1000000.);
>>   		/*
>>   		 * Read the next row from xlog file.
>>   		 *
>> @@ -273,12 +286,7 @@ recover_xlog(struct recovery *r, struct xstream *stream,
>>   		 * failed row anyway.
>>   		 */
>>   		vclock_follow_xrow(&r->vclock, &row);
>> -		if (xstream_write(stream, &row) == 0) {
>> -			++row_count;
>> -			if (row_count % 100000 == 0)
>> -				say_info("%.1fM rows processed",
>> -					 row_count / 1000000.);
>> -		} else {
>> +		if (xstream_write(stream, &row) != 0) {
>>   			if (!r->wal_dir.force_recovery)
>>   				diag_raise();
>>   

-- 
Serge Petrenko


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Tarantool-patches] [PATCH] recovery: make it yield when positioning in a WAL
  2021-04-28 15:34   ` Serge Petrenko via Tarantool-patches
@ 2021-04-28 20:50     ` Vladislav Shpilevoy via Tarantool-patches
  2021-04-29  8:55       ` Serge Petrenko via Tarantool-patches
  0 siblings, 1 reply; 7+ messages in thread
From: Vladislav Shpilevoy via Tarantool-patches @ 2021-04-28 20:50 UTC (permalink / raw)
  To: Serge Petrenko, gorcunov; +Cc: tarantool-patches

>>> We had various places in box.cc and relay.cc which counted processed
>>> rows and yielded every now and then. These yields didn't cover cases,
>>> when recovery has to position inside a long WAL file:
>>>
>>> For example, when tarantool exits without leaving an empty WAL file
>>> which'll be used to recover instance vclock on restart. In this case
>>> the instance freezes while processing the last available WAL in order
>>> to recover the vclock.
>>>
>>> Another issue is with replication. If a replica connects and needs data
>>> from the end of a really long WAL, recovery will read up to the needed
>>> position without yields, making relay disconnect by timeout.
>>>
>>> In order to fix the issue, make recovery decide when a yield should
>>> happen. Introduce a new callback: schedule_yield, which is called by
>>> recovery once it processes (no matter how, either simply skips or calls
>>> xstream_write) enough rows (WAL_ROWS_PER_YIELD).
>>>
>>> schedule_yield either yields right away, in case of relay, or saves the
>>> yield for later, in case of local recovery, because it might be in the
>>> middle of a transaction.
>> 1. Did you consider an option to yield explicitly in recovery code when
>> it skips rows? If they are being skipped, it does not matter what are
>> their transaction borders.
> 
> I did consider that. It is possible to do so, but then we'll have yet another
> place (in addition to relay and wal_stream) which counts rows and yields
> every now and then.
> 
> I thought it would be better to unify all these places. Actually, this could be
> done this way from the very beginning.
> I think it's not recovery's business whether to yield or not once
> some rows are processed.
> 
> Anyway, I can make it this way, if you insist.

The current solution is also fine.

>>> +
>>> +/**
>>> + * Plan a yield in recovery stream. Wal stream will execute it as soon as it's
>>> + * ready.
>>> + */
>>> +static void
>>> +wal_stream_schedule_yield(void)
>>> +{
>>> +    wal_stream.has_yield = true;
>>> +    wal_stream_try_yield(&wal_stream);
>>> +}
>>> diff --git a/src/box/recovery.cc b/src/box/recovery.cc
>>> index cd33e7635..5351d8524 100644
>>> --- a/src/box/recovery.cc
>>> +++ b/src/box/recovery.cc
>>> @@ -241,10 +248,16 @@ static void
>>>   recover_xlog(struct recovery *r, struct xstream *stream,
>>>            const struct vclock *stop_vclock)
>>>   {
>>> +    /* Imitate old behaviour. Rows are counted separately for each xlog. */
>>> +    r->row_count = 0;
>> 3. But why do you need to imitate it? Does it mean if the files are
>> too small to yield even once in each, but in total their number is
>> huge, there won't be yields?
> 
> Yes, that's true.

Does not this look wrong to you? The xlog files might not contain enough
rows if wal_max_size is small enough, and then the same issue still
exists - no yields.

>> Also does it mean "1M rows processed" was not ever printed in that
>> case?
> 
> Yes, when WALs are not big enough.
> Recovery starts over with '0.1M rows processed' on every new WAL file.

Does not this look wrong to you too? That at least the number of
rows should not drop to 0 on each next xlog file.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Tarantool-patches] [PATCH] recovery: make it yield when positioning in a WAL
  2021-04-28 20:50     ` Vladislav Shpilevoy via Tarantool-patches
@ 2021-04-29  8:55       ` Serge Petrenko via Tarantool-patches
  2021-04-29 20:03         ` Vladislav Shpilevoy via Tarantool-patches
  0 siblings, 1 reply; 7+ messages in thread
From: Serge Petrenko via Tarantool-patches @ 2021-04-29  8:55 UTC (permalink / raw)
  To: Vladislav Shpilevoy, gorcunov; +Cc: tarantool-patches



28.04.2021 23:50, Vladislav Shpilevoy пишет:
>>>> We had various places in box.cc and relay.cc which counted processed
>>>> rows and yielded every now and then. These yields didn't cover cases,
>>>> when recovery has to position inside a long WAL file:
>>>>
>>>> For example, when tarantool exits without leaving an empty WAL file
>>>> which'll be used to recover instance vclock on restart. In this case
>>>> the instance freezes while processing the last available WAL in order
>>>> to recover the vclock.
>>>>
>>>> Another issue is with replication. If a replica connects and needs data
>>>> from the end of a really long WAL, recovery will read up to the needed
>>>> position without yields, making relay disconnect by timeout.
>>>>
>>>> In order to fix the issue, make recovery decide when a yield should
>>>> happen. Introduce a new callback: schedule_yield, which is called by
>>>> recovery once it processes (no matter how, either simply skips or calls
>>>> xstream_write) enough rows (WAL_ROWS_PER_YIELD).
>>>>
>>>> schedule_yield either yields right away, in case of relay, or saves the
>>>> yield for later, in case of local recovery, because it might be in the
>>>> middle of a transaction.
>>> 1. Did you consider an option to yield explicitly in recovery code when
>>> it skips rows? If they are being skipped, it does not matter what are
>>> their transaction borders.
>> I did consider that. It is possible to do so, but then we'll have yet another
>> place (in addition to relay and wal_stream) which counts rows and yields
>> every now and then.
>>
>> I thought it would be better to unify all these places. Actually, this could be
>> done this way from the very beginning.
>> I think it's not recovery's business whether to yield or not once
>> some rows are processed.
>>
>> Anyway, I can make it this way, if you insist.
> The current solution is also fine.
>
>>>> +
>>>> +/**
>>>> + * Plan a yield in recovery stream. Wal stream will execute it as soon as it's
>>>> + * ready.
>>>> + */
>>>> +static void
>>>> +wal_stream_schedule_yield(void)
>>>> +{
>>>> +    wal_stream.has_yield = true;
>>>> +    wal_stream_try_yield(&wal_stream);
>>>> +}
>>>> diff --git a/src/box/recovery.cc b/src/box/recovery.cc
>>>> index cd33e7635..5351d8524 100644
>>>> --- a/src/box/recovery.cc
>>>> +++ b/src/box/recovery.cc
>>>> @@ -241,10 +248,16 @@ static void
>>>>    recover_xlog(struct recovery *r, struct xstream *stream,
>>>>             const struct vclock *stop_vclock)
>>>>    {
>>>> +    /* Imitate old behaviour. Rows are counted separately for each xlog. */
>>>> +    r->row_count = 0;
>>> 3. But why do you need to imitate it? Does it mean if the files are
>>> too small to yield even once in each, but in total their number is
>>> huge, there won't be yields?
>> Yes, that's true.
> Does not this look wrong to you? The xlog files might not contain enough
> rows if wal_max_size is small enough, and then the same issue still
> exists - no yields.
>
>>> Also does it mean "1M rows processed" was not ever printed in that
>>> case?
>> Yes, when WALs are not big enough.
>> Recovery starts over with '0.1M rows processed' on every new WAL file.
> Does not this look wrong to you too? That at least the number of
> rows should not drop to 0 on each next xlog file.

Yep, let's change it then. I thought we had to preserve log output.
Fixed and added a changelog entry.

=================================

diff --git a/changelogs/unreleased/gh-5979-recovery-ligs.md 
b/changelogs/unreleased/gh-5979-recovery-ligs.md
new file mode 100644
index 000000000..86abfd66a
--- /dev/null
+++ b/changelogs/unreleased/gh-5979-recovery-ligs.md
@@ -0,0 +1,11 @@
+# bugfix/core
+
+* Now tarantool yields when scanning `.xlog` files for the latest 
applied vclock
+  and when finding the right place in `.xlog`s to start recovering. 
This means
+  that the instance is responsive right after `box.cfg` call even when 
an empty
+  `.xlog` was not created on previous exit.
+  Also this prevents relay from timing out when a freshly subscribed 
replica
+  needs rows from the end of a relatively long (hundreds of MBs) `.xlog`
+  (gh-5979).
+* The counter in `x.yM rows processed` log messages does not reset on 
each new
+  recovered `xlog` anymore.
diff --git a/src/box/recovery.cc b/src/box/recovery.cc
index 5351d8524..8359f216d 100644
--- a/src/box/recovery.cc
+++ b/src/box/recovery.cc
@@ -149,6 +149,13 @@ recovery_scan(struct recovery *r, struct vclock 
*end_vclock,
                 }
         }
         xlog_cursor_close(&cursor, false);
+
+       /*
+        * Do not show scanned rows in log output and yield just in case
+        * row_count was less than WAL_ROWS_PER_YIELD when we reset it.
+        */
+       r->row_count = 0;
+       r->schedule_yield();
  }

  static inline void
@@ -248,8 +255,6 @@ static void
  recover_xlog(struct recovery *r, struct xstream *stream,
              const struct vclock *stop_vclock)
  {
-       /* Imitate old behaviour. Rows are counted separately for each 
xlog. */
-       r->row_count = 0;
         struct xrow_header row;
         while (xlog_cursor_next_xc(&r->cursor, &row,
                                    r->wal_dir.force_recovery) == 0) {

=================================

-- 
Serge Petrenko


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Tarantool-patches] [PATCH] recovery: make it yield when positioning in a WAL
  2021-04-29  8:55       ` Serge Petrenko via Tarantool-patches
@ 2021-04-29 20:03         ` Vladislav Shpilevoy via Tarantool-patches
  2021-05-12 11:30           ` Serge Petrenko via Tarantool-patches
  0 siblings, 1 reply; 7+ messages in thread
From: Vladislav Shpilevoy via Tarantool-patches @ 2021-04-29 20:03 UTC (permalink / raw)
  To: Serge Petrenko, gorcunov; +Cc: tarantool-patches

Hi! Thanks for the patch!

Technically looks good now, can be pushed.

But in case you would find it interesting: now relay->row_count looks odd.
Because its comment is not very correct, most of that is moved to the
recovery layer. And because it is used only by initial join. And because we
still have a single place where we use xstream/recovery, but the yields
are not managed by them.

Did you think about moving your code to xstream instead of recovery? Then
it would be truly not related to recovery. It would have 2 methods:

- write(struct xstream *, struct xrow_header *)
- skip(struct xstream *, struct xrow_header *)

Recovery calls skip() while establishes a position, for each skipped row.
Relaying from memory never calls skip, but calls only write() like now.

In order to yield you inherit xstream, add a member row_count, and both in
skip() and write() increment it like you do. And then yield periodically.

It would work for the initial join too.

Or add row_count right to the xstream struct, and replace skip() with your
function schedule_yield(). This way row_count would be visible to recovery
and available for reporting in the logs like '... rows recovered'. And we
wouldn't add too many virtual functions for the skipped rows.

All that up to you.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [Tarantool-patches] [PATCH] recovery: make it yield when positioning in a WAL
  2021-04-29 20:03         ` Vladislav Shpilevoy via Tarantool-patches
@ 2021-05-12 11:30           ` Serge Petrenko via Tarantool-patches
  0 siblings, 0 replies; 7+ messages in thread
From: Serge Petrenko via Tarantool-patches @ 2021-05-12 11:30 UTC (permalink / raw)
  To: Vladislav Shpilevoy, gorcunov; +Cc: tarantool-patches



29.04.2021 23:03, Vladislav Shpilevoy пишет:
> Hi! Thanks for the patch!
>
> Technically looks good now, can be pushed.
>
> But in case you would find it interesting: now relay->row_count looks odd.
> Because its comment is not very correct, most of that is moved to the
> recovery layer. And because it is used only by initial join. And because we
> still have a single place where we use xstream/recovery, but the yields
> are not managed by them.
>
> Did you think about moving your code to xstream instead of recovery? Then
> it would be truly not related to recovery. It would have 2 methods:
>
> - write(struct xstream *, struct xrow_header *)
> - skip(struct xstream *, struct xrow_header *)
>
> Recovery calls skip() while establishes a position, for each skipped row.
> Relaying from memory never calls skip, but calls only write() like now.
>
> In order to yield you inherit xtream, add a member row_count, and both in
> skip() and write() increment it like you do. And then yield periodically.
>
> It would work for the initial join too.
>
> Or add row_count right to the xtream struct, and replace skip() with your
> function schedule_yield(). This way row_count would be visible to recovery
> and available for reporting in the logs like '... rows recovered'. And we
> wouldn't all too many virtual functions for the skipped rows.
>
> All that up to you.

Thanks for your input!

I took the approach with moving schedule_yield to struct xstream.
Please, check it out in v2.

-- 
Serge Petrenko


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-05-12 11:30 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-26 16:59 [Tarantool-patches] [PATCH] recovery: make it yield when positioning in a WAL Serge Petrenko via Tarantool-patches
2021-04-26 21:20 ` Vladislav Shpilevoy via Tarantool-patches
2021-04-28 15:34   ` Serge Petrenko via Tarantool-patches
2021-04-28 20:50     ` Vladislav Shpilevoy via Tarantool-patches
2021-04-29  8:55       ` Serge Petrenko via Tarantool-patches
2021-04-29 20:03         ` Vladislav Shpilevoy via Tarantool-patches
2021-05-12 11:30           ` Serge Petrenko via Tarantool-patches

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox