Tarantool development patches archive
 help / color / mirror / Atom feed
* [PATCH] test: fix long_row_timeout.test.lua failure in parallel mode
@ 2019-03-25 17:52 Serge Petrenko
  2019-03-26 15:55 ` Vladimir Davydov
  0 siblings, 1 reply; 2+ messages in thread
From: Serge Petrenko @ 2019-03-25 17:52 UTC (permalink / raw)
  To: vdavydov.dev; +Cc: avtikhon, tarantool-patches, Serge Petrenko

The test used to write big rows (20 mb in size), so when run in parallel
mode, it put high load on the disk and processor, which made appliers
time out multiple times during read, and caused the test to fail
occasionally.
So, instead of writing huge rows in test, introduce a new error
injection restricting sio from reading more than a couple of bytes per
request. This ensures that the test is still relevant and makes it a lot
more lightweight.

Closes #4062
---
https://github.com/tarantool/tarantool/tree/sp/gh-4062-early-timeout
https://github.com/tarantool/tarantool/issues/4062

 src/lib/core/errinj.h                      |  1 +
 src/lib/core/sio.c                         | 10 +++++++++-
 test/box/errinj.result                     |  2 ++
 test/replication/long_row_timeout.result   | 18 +++++-------------
 test/replication/long_row_timeout.test.lua | 13 +++++--------
 5 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/lib/core/errinj.h b/src/lib/core/errinj.h
index 41783cc74..e4f06751d 100644
--- a/src/lib/core/errinj.h
+++ b/src/lib/core/errinj.h
@@ -125,6 +125,7 @@ struct errinj {
 	_(ERRINJ_VY_COMPACTION_DELAY, ERRINJ_BOOL, {.bparam = false}) \
 	_(ERRINJ_TUPLE_FORMAT_COUNT, ERRINJ_INT, {.iparam = -1}) \
 	_(ERRINJ_MEMTX_DELAY_GC, ERRINJ_BOOL, {.bparam = false}) \
+	_(ERRINJ_SOCKET_MAX_READ_SIZE, ERRINJ_INT, {.iparam = -1}) \
 
 ENUM0(errinj_id, ERRINJ_LIST);
 extern struct errinj errinjs[];
diff --git a/src/lib/core/sio.c b/src/lib/core/sio.c
index 8f25b8159..27ea1f0d1 100644
--- a/src/lib/core/sio.c
+++ b/src/lib/core/sio.c
@@ -41,6 +41,7 @@
 #include "trivia/util.h"
 #include "exception.h"
 #include "uri/uri.h"
+#include "errinj.h"
 
 const char *
 sio_socketname(int fd)
@@ -222,7 +223,14 @@ sio_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
 ssize_t
 sio_read(int fd, void *buf, size_t count)
 {
-	ssize_t n = read(fd, buf, count);
+	struct errinj *inj = errinj(ERRINJ_SOCKET_MAX_READ_SIZE, ERRINJ_INT);
+	ssize_t n;
+	if (inj != NULL && inj->iparam >= 0) {
+		n = read(fd, buf, inj->iparam);
+	} else {
+		n = read(fd, buf, count);
+	}
+
 	if (n < 0 && !sio_wouldblock(errno)) {
 		/*
 		 * Happens typically when the client closes
diff --git a/test/box/errinj.result b/test/box/errinj.result
index 8e76b21b3..8734b1282 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -42,6 +42,8 @@ errinj.info()
     state: false
   ERRINJ_PORT_DUMP:
     state: false
+  ERRINJ_SOCKET_MAX_READ_SIZE:
+    state: -1
   ERRINJ_WAL_IO:
     state: false
   ERRINJ_WAL_FALLOCATE:
diff --git a/test/replication/long_row_timeout.result b/test/replication/long_row_timeout.result
index 5b5a46d51..571db7d9e 100644
--- a/test/replication/long_row_timeout.result
+++ b/test/replication/long_row_timeout.result
@@ -25,32 +25,27 @@ box.info.replication[2].downstream.status
 ---
 - follow
 ...
-default_memtx_max_tuple_size = box.cfg.memtx_max_tuple_size
----
-...
+-- make applier incapable of reading rows in one go, so that it
+-- yields a couple of times.
 test_run:cmd('switch replica')
 ---
 - true
 ...
-box.cfg{memtx_max_tuple_size = 21 * 1024 * 1024}
+box.error.injection.set("ERRINJ_SOCKET_MAX_READ_SIZE", 1)
 ---
+- ok
 ...
 test_run:cmd('switch default')
 ---
 - true
 ...
-box.cfg{memtx_max_tuple_size = 21 * 1024 * 1024}
----
-...
--- insert some big rows which cannot be read in one go, so applier yields
--- on read a couple of times.
 s = box.schema.space.create('test')
 ---
 ...
 _ = s:create_index('pk')
 ---
 ...
-for i = 1,5 do box.space.test:replace{1, digest.urandom(20 * 1024 * 1024)} collectgarbage('collect') end
+for i = 1,5 do box.space.test:replace{1, digest.urandom(1024)} collectgarbage('collect') end
 ---
 ...
 -- replication_disconnect_timeout is 4 * replication_timeout, check that
@@ -100,9 +95,6 @@ test_run:cmd('delete server replica')
 test_run:cleanup_cluster()
 ---
 ...
-box.cfg{memtx_max_tuple_size = default_memtx_max_tuple_size}
----
-...
 box.schema.user.revoke('guest', 'replication')
 ---
 ...
diff --git a/test/replication/long_row_timeout.test.lua b/test/replication/long_row_timeout.test.lua
index 6e1d38b11..65def3ba1 100644
--- a/test/replication/long_row_timeout.test.lua
+++ b/test/replication/long_row_timeout.test.lua
@@ -10,17 +10,15 @@ test_run:cmd('create server replica with rpl_master=default, script="replication
 test_run:cmd('start server replica')
 box.info.replication[2].downstream.status
 
-default_memtx_max_tuple_size = box.cfg.memtx_max_tuple_size
+
+-- make applier incapable of reading rows in one go, so that it
+-- yields a couple of times.
 test_run:cmd('switch replica')
-box.cfg{memtx_max_tuple_size = 21 * 1024 * 1024}
+box.error.injection.set("ERRINJ_SOCKET_MAX_READ_SIZE", 1)
 test_run:cmd('switch default')
-box.cfg{memtx_max_tuple_size = 21 * 1024 * 1024}
-
--- insert some big rows which cannot be read in one go, so applier yields
--- on read a couple of times.
 s = box.schema.space.create('test')
 _ = s:create_index('pk')
-for i = 1,5 do box.space.test:replace{1, digest.urandom(20 * 1024 * 1024)} collectgarbage('collect') end
+for i = 1,5 do box.space.test:replace{1, digest.urandom(1024)} collectgarbage('collect') end
 -- replication_disconnect_timeout is 4 * replication_timeout, check that
 -- replica doesn't time out too early.
 test_run:cmd('setopt delimiter ";"')
@@ -42,7 +40,6 @@ test_run:cmd('stop server replica')
 test_run:cmd('cleanup server replica')
 test_run:cmd('delete server replica')
 test_run:cleanup_cluster()
-box.cfg{memtx_max_tuple_size = default_memtx_max_tuple_size}
 box.schema.user.revoke('guest', 'replication')
 
 -- Rotate xlogs so as not to replicate the huge rows in
-- 
2.17.2 (Apple Git-113)

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] test: fix long_row_timeout.test.lua failure in parallel mode
  2019-03-25 17:52 [PATCH] test: fix long_row_timeout.test.lua failure in parallel mode Serge Petrenko
@ 2019-03-26 15:55 ` Vladimir Davydov
  0 siblings, 0 replies; 2+ messages in thread
From: Vladimir Davydov @ 2019-03-26 15:55 UTC (permalink / raw)
  To: Serge Petrenko; +Cc: avtikhon, tarantool-patches

On Mon, Mar 25, 2019 at 08:52:30PM +0300, Serge Petrenko wrote:
> The test used to write big rows (20 mb in size), so when run in parallel
> mode, it put high load on the disk and processor, which made appliers
> time out multiple times during read, and caused the test to fail
> occasionally.
> So, instead of writing huge rows in test, introduce a new error
> injection restricting sio from reading more than a couple of bytes per
> request. This ensures that the test is still relevant and makes it a lot
> more lightweight.
> 
> Closes #4062
> ---
> https://github.com/tarantool/tarantool/tree/sp/gh-4062-early-timeout
> https://github.com/tarantool/tarantool/issues/4062
> 
>  src/lib/core/errinj.h                      |  1 +
>  src/lib/core/sio.c                         | 10 +++++++++-
>  test/box/errinj.result                     |  2 ++
>  test/replication/long_row_timeout.result   | 18 +++++-------------
>  test/replication/long_row_timeout.test.lua | 13 +++++--------
>  5 files changed, 22 insertions(+), 22 deletions(-)

All tests using error injection should be disabled in release builds,
even if they pass. I fixed test/repliation/suite.ini accordingly.

> 
> diff --git a/src/lib/core/errinj.h b/src/lib/core/errinj.h
> index 41783cc74..e4f06751d 100644
> --- a/src/lib/core/errinj.h
> +++ b/src/lib/core/errinj.h
> @@ -125,6 +125,7 @@ struct errinj {
>  	_(ERRINJ_VY_COMPACTION_DELAY, ERRINJ_BOOL, {.bparam = false}) \
>  	_(ERRINJ_TUPLE_FORMAT_COUNT, ERRINJ_INT, {.iparam = -1}) \
>  	_(ERRINJ_MEMTX_DELAY_GC, ERRINJ_BOOL, {.bparam = false}) \
> +	_(ERRINJ_SOCKET_MAX_READ_SIZE, ERRINJ_INT, {.iparam = -1}) \

Too long name :)

I renamed it to SIO_READ_MAX

>  
>  ENUM0(errinj_id, ERRINJ_LIST);
>  extern struct errinj errinjs[];
> diff --git a/src/lib/core/sio.c b/src/lib/core/sio.c
> index 8f25b8159..27ea1f0d1 100644
> --- a/src/lib/core/sio.c
> +++ b/src/lib/core/sio.c
> @@ -41,6 +41,7 @@
>  #include "trivia/util.h"
>  #include "exception.h"
>  #include "uri/uri.h"
> +#include "errinj.h"
>  
>  const char *
>  sio_socketname(int fd)
> @@ -222,7 +223,14 @@ sio_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
>  ssize_t
>  sio_read(int fd, void *buf, size_t count)
>  {
> -	ssize_t n = read(fd, buf, count);
> +	struct errinj *inj = errinj(ERRINJ_SOCKET_MAX_READ_SIZE, ERRINJ_INT);
> +	ssize_t n;
> +	if (inj != NULL && inj->iparam >= 0) {
> +		n = read(fd, buf, inj->iparam);

Even though it's just an error injection, we shouldn't allow writing
beyond the buffer so it should be MIN(count, inj->iparam).

Also, allowing 0 isn't very helpful - the application would just hang.

> +	} else {
> +		n = read(fd, buf, count);
> +	}
> +
>  	if (n < 0 && !sio_wouldblock(errno)) {
>  		/*
>  		 * Happens typically when the client closes

Committed the patch to the master branch with the following changes:

diff --git a/src/lib/core/errinj.h b/src/lib/core/errinj.h
index e4f06751..99891c5b 100644
--- a/src/lib/core/errinj.h
+++ b/src/lib/core/errinj.h
@@ -125,7 +125,7 @@ struct errinj {
 	_(ERRINJ_VY_COMPACTION_DELAY, ERRINJ_BOOL, {.bparam = false}) \
 	_(ERRINJ_TUPLE_FORMAT_COUNT, ERRINJ_INT, {.iparam = -1}) \
 	_(ERRINJ_MEMTX_DELAY_GC, ERRINJ_BOOL, {.bparam = false}) \
-	_(ERRINJ_SOCKET_MAX_READ_SIZE, ERRINJ_INT, {.iparam = -1}) \
+	_(ERRINJ_SIO_READ_MAX, ERRINJ_INT, {.iparam = -1}) \
 
 ENUM0(errinj_id, ERRINJ_LIST);
 extern struct errinj errinjs[];
diff --git a/src/lib/core/sio.c b/src/lib/core/sio.c
index 27ea1f0d..ecd6d412 100644
--- a/src/lib/core/sio.c
+++ b/src/lib/core/sio.c
@@ -223,14 +223,11 @@ sio_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
 ssize_t
 sio_read(int fd, void *buf, size_t count)
 {
-	struct errinj *inj = errinj(ERRINJ_SOCKET_MAX_READ_SIZE, ERRINJ_INT);
-	ssize_t n;
-	if (inj != NULL && inj->iparam >= 0) {
-		n = read(fd, buf, inj->iparam);
-	} else {
-		n = read(fd, buf, count);
-	}
+	struct errinj *inj = errinj(ERRINJ_SIO_READ_MAX, ERRINJ_INT);
+	if (inj != NULL && inj->iparam > 0)
+		count = MIN(count, (size_t)inj->iparam);
 
+	ssize_t n = read(fd, buf, count);
 	if (n < 0 && !sio_wouldblock(errno)) {
 		/*
 		 * Happens typically when the client closes
diff --git a/test/box/errinj.result b/test/box/errinj.result
index 8734b128..2bc41ac5 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -42,7 +42,7 @@ errinj.info()
     state: false
   ERRINJ_PORT_DUMP:
     state: false
-  ERRINJ_SOCKET_MAX_READ_SIZE:
+  ERRINJ_RELAY_BREAK_LSN:
     state: -1
   ERRINJ_WAL_IO:
     state: false
@@ -68,7 +68,7 @@ errinj.info()
     state: 0
   ERRINJ_XLOG_META:
     state: false
-  ERRINJ_RELAY_BREAK_LSN:
+  ERRINJ_SIO_READ_MAX:
     state: -1
   ERRINJ_VY_INDEX_FILE_RENAME:
     state: false
diff --git a/test/replication/long_row_timeout.result b/test/replication/long_row_timeout.result
index 571db7d9..9284d9c2 100644
--- a/test/replication/long_row_timeout.result
+++ b/test/replication/long_row_timeout.result
@@ -31,7 +31,7 @@ test_run:cmd('switch replica')
 ---
 - true
 ...
-box.error.injection.set("ERRINJ_SOCKET_MAX_READ_SIZE", 1)
+box.error.injection.set("ERRINJ_SIO_READ_MAX", 1)
 ---
 - ok
 ...
diff --git a/test/replication/long_row_timeout.test.lua b/test/replication/long_row_timeout.test.lua
index 65def3ba..7fc52e3e 100644
--- a/test/replication/long_row_timeout.test.lua
+++ b/test/replication/long_row_timeout.test.lua
@@ -14,7 +14,7 @@ box.info.replication[2].downstream.status
 -- make applier incapable of reading rows in one go, so that it
 -- yields a couple of times.
 test_run:cmd('switch replica')
-box.error.injection.set("ERRINJ_SOCKET_MAX_READ_SIZE", 1)
+box.error.injection.set("ERRINJ_SIO_READ_MAX", 1)
 test_run:cmd('switch default')
 s = box.schema.space.create('test')
 _ = s:create_index('pk')
diff --git a/test/replication/suite.ini b/test/replication/suite.ini
index 6e9e3edd..dd5b0140 100644
--- a/test/replication/suite.ini
+++ b/test/replication/suite.ini
@@ -3,7 +3,7 @@ core = tarantool
 script =  master.lua
 description = tarantool/box, replication
 disabled = consistent.test.lua
-release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua
+release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua
 config = suite.cfg
 lua_libs = lua/fast_replica.lua lua/rlimit.lua
 use_unix_sockets = True

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2019-03-26 15:55 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-25 17:52 [PATCH] test: fix long_row_timeout.test.lua failure in parallel mode Serge Petrenko
2019-03-26 15:55 ` Vladimir Davydov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox