From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <sergepetrenko@tarantool.org>
Content-Type: text/plain;
	charset=utf-8
Mime-Version: 1.0 (Mac OS X Mail 12.2 \(3445.102.3\))
Subject: Re: [tarantool-patches] [PATCH] evio: fix timeout calculations
From: Serge Petrenko <sergepetrenko@tarantool.org>
In-Reply-To: <20190314151641.26876-1-sergepetrenko@tarantool.org>
Date: Thu, 14 Mar 2019 18:57:12 +0300
Content-Transfer-Encoding: quoted-printable
Message-Id: <D50D4CB9-4CDC-4483-85D7-CC0C9C9D3261@tarantool.org>
References: <20190314151641.26876-1-sergepetrenko@tarantool.org>
To: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: tarantool-patches@freelists.org
List-ID: <tarantool-patches.dev.tarantool.org>


> 14 =D0=BC=D0=B0=D1=80=D1=82=D0=B0 2019 =D0=B3., =D0=B2 18:16, Serge =
Petrenko <sergepetrenko@tarantool.org> =D0=BD=D0=B0=D0=BF=D0=B8=D1=81=D0=B0=
=D0=BB(=D0=B0):
>=20
> The function evio_timeout_update() failed to update the starting time
> point, which lead to timeouts happening much faster than they should =
if
> there were consecutive calls to the function.
> This lead, for example, to applier timing out while reading a several
> megabyte-size row in 0.2 seconds even if replication_timeout was set =
to
> 15 seconds.
>=20
> Closes #4042
> ---
> https://github.com/tarantool/tarantool/tree/sp/gh-4042-applier-timeout
> https://github.com/tarantool/tarantool/issues/4042
>=20
> src/box/xrow_io.cc                         |  4 +-
> src/lib/core/coio.cc                       | 18 ++--
> src/lib/core/coio.h                        |  2 +-
> src/lib/core/evio.h                        |  5 +-
> test/replication/long_row_timeout.result   | 98 ++++++++++++++++++++++
> test/replication/long_row_timeout.test.lua | 43 ++++++++++
> test/replication/replica_big.lua           | 12 +++
> test/replication/suite.cfg                 |  1 +
> 8 files changed, 169 insertions(+), 14 deletions(-)
> create mode 100644 test/replication/long_row_timeout.result
> create mode 100644 test/replication/long_row_timeout.test.lua
> create mode 100644 test/replication/replica_big.lua
>=20
> diff --git a/src/box/xrow_io.cc b/src/box/xrow_io.cc
> index 4b6e68bc1..48707982b 100644
> --- a/src/box/xrow_io.cc
> +++ b/src/box/xrow_io.cc
> @@ -71,7 +71,7 @@ coio_read_xrow_timeout_xc(struct ev_io *coio, struct =
ibuf *in,
> 	/* Read fixed header */
> 	if (ibuf_used(in) < 1)
> 		coio_breadn_timeout(coio, in, 1, delay);
> -	coio_timeout_update(start, &delay);
> +	coio_timeout_update(&start, &delay);
>=20
> 	/* Read length */
> 	if (mp_typeof(*in->rpos) !=3D MP_UINT) {
> @@ -81,7 +81,7 @@ coio_read_xrow_timeout_xc(struct ev_io *coio, struct =
ibuf *in,
> 	ssize_t to_read =3D mp_check_uint(in->rpos, in->wpos);
> 	if (to_read > 0)
> 		coio_breadn_timeout(coio, in, to_read, delay);
> -	coio_timeout_update(start, &delay);
> +	coio_timeout_update(&start, &delay);
>=20
> 	uint32_t len =3D mp_decode_uint((const char **) &in->rpos);
>=20
> diff --git a/src/lib/core/coio.cc b/src/lib/core/coio.cc
> index 6b94cf2a6..e88d724d5 100644
> --- a/src/lib/core/coio.cc
> +++ b/src/lib/core/coio.cc
> @@ -210,7 +210,7 @@ coio_connect_timeout(struct ev_io *coio, struct =
uri *uri, struct sockaddr *addr,
> 		if (!uri->host_hint) freeaddrinfo(ai);
> 		else free(ai_local.ai_addr);
> 	});
> -	evio_timeout_update(loop(), start, &delay);
> +	evio_timeout_update(loop(), &start, &delay);
>=20
> 	coio_timeout_init(&start, &delay, timeout);
> 	assert(! evio_has_fd(coio));
> @@ -232,7 +232,7 @@ coio_connect_timeout(struct ev_io *coio, struct =
uri *uri, struct sockaddr *addr,
> 		}
> 		ai =3D ai->ai_next;
> 		ev_now_update(loop);
> -		coio_timeout_update(start, &delay);
> +		coio_timeout_update(&start, &delay);
> 	}
>=20
> 	tnt_raise(SocketError, sio_socketname(coio->fd), "connection =
failed");
> @@ -278,7 +278,7 @@ coio_accept(struct ev_io *coio, struct sockaddr =
*addr,
> 		fiber_testcancel();
> 		if (is_timedout)
> 			tnt_raise(TimedOut);
> -		coio_timeout_update(start, &delay);
> +		coio_timeout_update(&start, &delay);
> 	}
> }
>=20
> @@ -338,7 +338,7 @@ coio_read_ahead_timeout(struct ev_io *coio, void =
*buf, size_t sz,
> 		fiber_testcancel();
> 		if (is_timedout)
> 			tnt_raise(TimedOut);
> -		coio_timeout_update(start, &delay);
> +		coio_timeout_update(&start, &delay);
> 	}
> }
>=20
> @@ -433,7 +433,7 @@ coio_write_timeout(struct ev_io *coio, const void =
*buf, size_t sz,
>=20
> 		if (is_timedout)
> 			tnt_raise(TimedOut);
> -		coio_timeout_update(start, &delay);
> +		coio_timeout_update(&start, &delay);
> 	}
> }
>=20
> @@ -499,7 +499,7 @@ coio_writev_timeout(struct ev_io *coio, struct =
iovec *iov, int iovcnt,
>=20
> 		if (is_timedout)
> 			tnt_raise(TimedOut);
> -		coio_timeout_update(start, &delay);
> +		coio_timeout_update(&start, &delay);
> 	}
> 	return total;
> }
> @@ -545,7 +545,7 @@ coio_sendto_timeout(struct ev_io *coio, const void =
*buf, size_t sz, int flags,
> 		fiber_testcancel();
> 		if (is_timedout)
> 			tnt_raise(TimedOut);
> -		coio_timeout_update(start, &delay);
> +		coio_timeout_update(&start, &delay);
> 	}
> }
>=20
> @@ -590,7 +590,7 @@ coio_recvfrom_timeout(struct ev_io *coio, void =
*buf, size_t sz, int flags,
> 		fiber_testcancel();
> 		if (is_timedout)
> 			tnt_raise(TimedOut);
> -		coio_timeout_update(start, &delay);
> +		coio_timeout_update(&start, &delay);
> 	}
> }
>=20
> @@ -759,7 +759,7 @@ coio_write_fd_timeout(int fd, const void *data, =
size_t size, ev_tstamp timeout)
> 		}
> 		if (errno =3D=3D EAGAIN || errno =3D=3D EWOULDBLOCK) {
> 			coio_wait(fd, COIO_WRITE, delay);
> -			evio_timeout_update(loop, start, &delay);
> +			evio_timeout_update(loop, &start, &delay);
> 		} else if (errno !=3D EINTR) {
> 			diag_set(SocketError, sio_socketname(fd), =
"write");
> 			return -1;
> diff --git a/src/lib/core/coio.h b/src/lib/core/coio.h
> index 104abf720..6a2337689 100644
> --- a/src/lib/core/coio.h
> +++ b/src/lib/core/coio.h
> @@ -88,7 +88,7 @@ coio_timeout_init(ev_tstamp *start, ev_tstamp =
*delay,
> }
>=20
> static inline void
> -coio_timeout_update(ev_tstamp start, ev_tstamp *delay)
> +coio_timeout_update(ev_tstamp *start, ev_tstamp *delay)
> {
> 	return evio_timeout_update(loop(), start, delay);
> }
> diff --git a/src/lib/core/evio.h b/src/lib/core/evio.h
> index bd48235bc..c86be9e6a 100644
> --- a/src/lib/core/evio.h
> +++ b/src/lib/core/evio.h
> @@ -148,9 +148,10 @@ evio_timeout_init(ev_loop *loop, ev_tstamp =
*start, ev_tstamp *delay,
> }
>=20
> static inline void
> -evio_timeout_update(ev_loop *loop, ev_tstamp start, ev_tstamp *delay)
> +evio_timeout_update(ev_loop *loop, ev_tstamp *start, ev_tstamp =
*delay)
> {
> -	ev_tstamp elapsed =3D ev_monotonic_now(loop) - start;
> +	ev_tstamp elapsed =3D ev_monotonic_now(loop) - *start;
> +	*start +=3D elapsed;
> 	*delay =3D (elapsed >=3D *delay) ? 0 : *delay - elapsed;
> }
>=20
> diff --git a/test/replication/long_row_timeout.result =
b/test/replication/long_row_timeout.result
> new file mode 100644
> index 000000000..690fdcbb4
> --- /dev/null
> +++ b/test/replication/long_row_timeout.result
> @@ -0,0 +1,98 @@
> +fiber =3D require('fiber')
> +---
> +...
> +digest =3D require('digest')
> +---
> +...
> +test_run =3D require('test_run').new()
> +---
> +...
> +--
> +-- gh-4042 applier read times out too fast when reading large rows.
> +--
> +box.schema.user.grant('guest', 'replication')
> +---
> +...
> +test_run:cmd('create server replica with rpl_master=3Ddefault, =
script=3D"replication/replica_big.lua"')
> +---
> +- true
> +...
> +test_run:cmd('start server replica')
> +---
> +- true
> +...
> +box.info.replication[2].downstream.status
> +---
> +- follow
> +...
> +tup_sz =3D box.cfg.memtx_max_tuple_size
> +---
> +...
> +box.cfg{memtx_max_tuple_size =3D 21 * 1024 * 1024, memtx_memory =3D =
1024 * 1024 * 1024}
> +---
> +...
> +-- insert some big rows which cannot be read in one go, so applier =
yields
> +-- on read a couple of times.
> +s =3D box.schema.space.create('test')
> +---
> +...
> +_ =3D s:create_index('pk')
> +---
> +...
> +for i =3D 1,5 do box.space.test:insert{i, =
require('digest').urandom(20 * 1024 * 1024)} end
> +---
> +...
> +-- replication_disconnect_timeout is 4 * replication_timeout, check =
that
> +-- replica doesn't time out too early.
> +test_run:cmd('setopt delimiter ";"')
> +---
> +- true
> +...
> +ok =3D true;
> +---
> +...
> +start =3D fiber.time();
> +---
> +...
> +while fiber.time() - start < 3 * box.cfg.replication_timeout do
> +    if box.info.replication[2].downstream.status ~=3D 'follow' then
> +        ok =3D false
> +        break
> +    end
> +    fiber.sleep(0.001)
> +end;
> +---
> +...
> +test_run:cmd('setopt delimiter ""');
> +---
> +- true
> +...
> +ok
> +---
> +- true
> +...
> +s:drop()
> +---
> +...
> +test_run:cmd('stop server replica')
> +---
> +- true
> +...
> +test_run:cmd('cleanup server replica')
> +---
> +- true
> +...
> +test_run:cmd('delete server replica')
> +---
> +- true
> +...
> +test_run:cleanup_cluster()
> +---
> +...
> +-- cannot decrease memtx_memory to its original value.
> +box.cfg{memtx_max_tuple_size =3D tup_sz}
> +---
> +...
> +box.schema.user.revoke('guest', 'replication')
> +---
> +...
> diff --git a/test/replication/long_row_timeout.test.lua =
b/test/replication/long_row_timeout.test.lua
> new file mode 100644
> index 000000000..21a522018
> --- /dev/null
> +++ b/test/replication/long_row_timeout.test.lua
> @@ -0,0 +1,43 @@
> +fiber =3D require('fiber')
> +digest =3D require('digest')
> +test_run =3D require('test_run').new()
> +
> +--
> +-- gh-4042 applier read times out too fast when reading large rows.
> +--
> +box.schema.user.grant('guest', 'replication')
> +test_run:cmd('create server replica with rpl_master=3Ddefault, =
script=3D"replication/replica_big.lua"')
> +test_run:cmd('start server replica')
> +box.info.replication[2].downstream.status
> +tup_sz =3D box.cfg.memtx_max_tuple_size
> +box.cfg{memtx_max_tuple_size =3D 21 * 1024 * 1024, memtx_memory =3D =
1024 * 1024 * 1024}
> +
> +-- insert some big rows which cannot be read in one go, so applier =
yields
> +-- on read a couple of times.
> +s =3D box.schema.space.create('test')
> +_ =3D s:create_index('pk')
> +for i =3D 1,5 do box.space.test:insert{i, =
require('digest').urandom(20 * 1024 * 1024)} end
> +-- replication_disconnect_timeout is 4 * replication_timeout, check =
that
> +-- replica doesn't time out too early.
> +test_run:cmd('setopt delimiter ";"')
> +ok =3D true;
> +start =3D fiber.time();
> +while fiber.time() - start < 3 * box.cfg.replication_timeout do
> +    if box.info.replication[2].downstream.status ~=3D 'follow' then
> +        ok =3D false
> +        break
> +    end
> +    fiber.sleep(0.001)
> +end;
> +test_run:cmd('setopt delimiter ""');
> +
> +ok
> +
> +s:drop()
> +test_run:cmd('stop server replica')
> +test_run:cmd('cleanup server replica')
> +test_run:cmd('delete server replica')
> +test_run:cleanup_cluster()
> +-- cannot decrease memtx_memory to its original value.
> +box.cfg{memtx_max_tuple_size =3D tup_sz}
> +box.schema.user.revoke('guest', 'replication')
> diff --git a/test/replication/replica_big.lua =
b/test/replication/replica_big.lua
> new file mode 100644
> index 000000000..b768420fd
> --- /dev/null
> +++ b/test/replication/replica_big.lua
> @@ -0,0 +1,12 @@
> +#!/usr/bin/env tarantool
> +
> +box.cfg({
> +    listen                      =3D os.getenv("LISTEN"),
> +    replication                 =3D os.getenv("MASTER"),
> +    memtx_memory                =3D 1024 * 1024 * 1024,
> +    memtx_max_tuple_size        =3D 21 * 1024 * 1024,
> +    replication_timeout         =3D 0.1,
> +    replication_connect_timeout =3D 0.5,
> +})
> +
> +require('console').listen(os.getenv('ADMIN'))
> diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg
> index 5e8809731..91e884ece 100644
> --- a/test/replication/suite.cfg
> +++ b/test/replication/suite.cfg
> @@ -9,6 +9,7 @@
>     "wal_rw_stress.test.lua": {},
>     "force_recovery.test.lua": {},
>     "on_schema_init.test.lua": {},
> +    "long_row_timeout.test.lua": {},
>     "*": {
>         "memtx": {"engine": "memtx"},
>         "vinyl": {"engine": "vinyl"}
> --=20
> 2.17.2 (Apple Git-113)
>=20
>=20

Rewrote the test a little to use less memtx_memory in main testing =
instance.
Here=E2=80=99s the diff.

diff --git a/test/replication/long_row_timeout.result =
b/test/replication/long_row_timeout.result
index 690fdcbb4..3f93f1913 100644
--- a/test/replication/long_row_timeout.result
+++ b/test/replication/long_row_timeout.result
@@ -28,7 +28,7 @@ box.info.replication[2].downstream.status
 tup_sz =3D box.cfg.memtx_max_tuple_size
 ---
 ...
-box.cfg{memtx_max_tuple_size =3D 21 * 1024 * 1024, memtx_memory =3D =
1024 * 1024 * 1024}
+box.cfg{memtx_max_tuple_size =3D 21 * 1024 * 1024}
 ---
 ...
 -- insert some big rows which cannot be read in one go, so applier =
yields
@@ -39,7 +39,7 @@ s =3D box.schema.space.create('test')
 _ =3D s:create_index('pk')
 ---
 ...
-for i =3D 1,5 do box.space.test:insert{i, require('digest').urandom(20 =
* 1024 * 1024)} end
+for i =3D 1,5 do box.space.test:replace{1, digest.urandom(20 * 1024 * =
1024)} collectgarbage('collect') end
 ---
 ...
 -- replication_disconnect_timeout is 4 * replication_timeout, check =
that
@@ -89,7 +89,6 @@ test_run:cmd('delete server replica')
 test_run:cleanup_cluster()
 ---
 ...
--- cannot decrease memtx_memory to its original value.
 box.cfg{memtx_max_tuple_size =3D tup_sz}
 ---
 ...
diff --git a/test/replication/long_row_timeout.test.lua =
b/test/replication/long_row_timeout.test.lua
index 21a522018..3993f1657 100644
--- a/test/replication/long_row_timeout.test.lua
+++ b/test/replication/long_row_timeout.test.lua
@@ -10,13 +10,13 @@ test_run:cmd('create server replica with =
rpl_master=3Ddefault, script=3D"replication
 test_run:cmd('start server replica')
 box.info.replication[2].downstream.status
 tup_sz =3D box.cfg.memtx_max_tuple_size
-box.cfg{memtx_max_tuple_size =3D 21 * 1024 * 1024, memtx_memory =3D =
1024 * 1024 * 1024}
+box.cfg{memtx_max_tuple_size =3D 21 * 1024 * 1024}
=20
 -- insert some big rows which cannot be read in one go, so applier =
yields
 -- on read a couple of times.
 s =3D box.schema.space.create('test')
 _ =3D s:create_index('pk')
-for i =3D 1,5 do box.space.test:insert{i, require('digest').urandom(20 =
* 1024 * 1024)} end
+for i =3D 1,5 do box.space.test:replace{1, digest.urandom(20 * 1024 * =
1024)} collectgarbage('collect') end
 -- replication_disconnect_timeout is 4 * replication_timeout, check =
that
 -- replica doesn't time out too early.
 test_run:cmd('setopt delimiter ";"')
@@ -38,6 +38,5 @@ test_run:cmd('stop server replica')
 test_run:cmd('cleanup server replica')
 test_run:cmd('delete server replica')
 test_run:cleanup_cluster()
--- cannot decrease memtx_memory to its original value.
 box.cfg{memtx_max_tuple_size =3D tup_sz}
 box.schema.user.revoke('guest', 'replication')=