From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtp33.i.mail.ru (smtp33.i.mail.ru [94.100.177.93]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id AA35245C304 for ; Wed, 2 Dec 2020 21:09:09 +0300 (MSK) From: "Alexander V. Tikhonov" Date: Wed, 2 Dec 2020 21:09:06 +0300 Message-Id: MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [Tarantool-patches] [PATCH v2] test: fix hang of vinyl/select_consistency.* test List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Kirill Yukhin , Nikita Pettik Cc: tarantool-patches@dev.tarantool.org During testing process: https://gitlab.com/tarantool/tarantool/-/jobs/800863457#L5303 found that test vinyl/select_consistency.test.lua hanged: --- vinyl/select_consistency.result Thu Jun 11 11:59:48 2020 +++ var/152_vinyl/select_consistency.result Tue Oct 20 09:13:02 2020 @@ -137,16 +137,3 @@ for i = 1, ch:size() do ch:get() end; ---- -... -test_run:cmd("setopt delimiter ''"); ---- -- true -... This issue stops the testing till the complete timeout of the test-run will stop all the testing process. Also found that real hang occured at box.snapshot command in snap_loop loop, like: [001] Last 15 lines of Tarantool Log file [Instance "vinyl"][/source/test/var/001_vinyl/vinyl.log]: [001] 2020-11-15 18:13:49.928 [78887] main/105/gc I> removed /source/test/var/001_vinyl/vinyl/519/2/00000000000000011145.run [001] 2020-11-15 18:13:49.928 [78887] vinyl.dump.0/640/task I> writing `/source/test/var/001_vinyl/vinyl/519/0/00000000000000011538.index' [001] 2020-11-15 18:13:49.928 [78887] snapshot/101/main I> done [001] 2020-11-15 18:13:49.929 [78887] main/105/gc I> removed /source/test/var/001_vinyl/vinyl/519/2/00000000000000011157.index [001] 2020-11-15 18:13:49.929 [78887] main/108/vinyl.scheduler I> 519/0: dump completed [001] 2020-11-15 18:13:49.929 [78887] main/108/vinyl.scheduler I> dumped 0 bytes in 0.0 s, rate 0.0 MB/s [001] 2020-11-15 18:13:49.929 [78887] main/481/lua I> vinyl checkpoint completed [001] 2020-11-15 18:13:49.929 [78887] main/105/gc I> removed /source/test/var/001_vinyl/vinyl/519/2/00000000000000011157.run [001] 2020-11-15 18:13:49.929 [78887] main/108/vinyl.scheduler I> 519/0: started compacting range ([94]..inf), runs 2/2 [001] 2020-11-15 18:13:49.930 [78887] wal I> removed /source/test/var/001_vinyl/vinyl/00000000000000093070.xlog [001] 2020-11-15 18:13:49.930 [78887] main/105/gc I> removed /source/test/var/001_vinyl/vinyl/00000000000000094030.snap.inprogress [001] 2020-11-15 18:13:49.930 [78887] main I> removed /source/test/var/001_vinyl/vinyl/00000000000000093306.snap [001] 2020-11-15 18:13:49.930 [78887] main I> removed /source/test/var/001_vinyl/vinyl/00000000000000093070.vylog [001] 2020-11-15 18:13:49.931 [78887] main/481/lua F> can't rename .snap.inprogress [001] 2020-11-15 18:13:49.931 [78887] main/481/lua F> can't rename .snap.inprogress It happened because on heavy loaded hosts may occure the situation when the previous snapshot was inprogress when the new snapshot came with the same file name *.snap.inprogress. It happens before the current snapshot completed and printed "dump completed" in log file. Also this file *.snap.inprogress was seen left on manual debug, when the test hanged before this patch. To resolve the test issue fiber sleep delay after it can be increased, but we want to save the issue reproducable. The current patch corrects the test to avoid of hang on box snapshot, to be able to continue testing after it failed. Needed for #4385 --- Github: https://github.com/tarantool/tarantool/tree/avtikhon/gh-4385_hang_select_consistency Issue: https://github.com/tarantool/tarantool/issues/4385 test/vinyl/select_consistency.result | 67 ++++++++++++++++++++------ test/vinyl/select_consistency.test.lua | 61 ++++++++++++++++++----- test/vinyl/suite.ini | 3 +- 3 files changed, 103 insertions(+), 28 deletions(-) diff --git a/test/vinyl/select_consistency.result b/test/vinyl/select_consistency.result index f27d0ec68..f6d96473d 100644 --- a/test/vinyl/select_consistency.result +++ b/test/vinyl/select_consistency.result @@ -4,6 +4,9 @@ test_run = require('test_run').new() fiber = require 'fiber' --- ... +log = require 'log' +--- +... math.randomseed(os.time()) --- ... @@ -52,6 +55,8 @@ function gen_update() end; --- ... +failed = false + function dml_loop() local insert = true while not stop do @@ -75,8 +80,18 @@ end; ... function snap_loop() while not stop do - box.snapshot() - fiber.sleep(0.1) + local ok = fiber.create(function() + local ok, err = pcall(box.snapshot) + if ok == false then + log.error("error: box.snapshot failed with error " .. err) + end + return ok + end) + if ok == false then + failed = true + break + end + fiber.sleep(0.5) end ch:put(true) end; @@ -97,10 +112,7 @@ _ = fiber.create(dml_loop); _ = fiber.create(snap_loop); --- ... -failed = {}; ---- -... -for i = 1, 10000 do +function select_loop() local val = math.random(MAX_VAL) box.begin() local res1 = s.index.i1:select({val}) @@ -117,25 +129,48 @@ for i = 1, 10000 do end end if not found then + log.error("error: equal not found for res1 and res2:") + local ts1 = '' + local ts2 = '' + for _, t1 in ipairs(res1) do + ts1 = ts1 .. " " .. t1[1] + end + log.info("error: equal not found for res1" .. ts1) + for _, t2 in ipairs(res2) do + ts2 = ts2 .. " " .. t2[1] + end + log.info("error: equal not found for res2" .. ts2) equal = false break end end - else - equal = false - end - if not equal then - table.insert(failed, {res1, res2}) end fiber.sleep(0) + return equal +end; +--- +... +for i = 1, 10000 do + if failed or not select_loop(i) then + log.error("error: failed on iteration " .. i) + failed = true + break + end end; --- ... stop = true; --- ... -for i = 1, ch:size() do - ch:get() +CHANNEL_GET_TIMEOUT = 10 +function check_get() + for i = 1, ch:size() do + if ch:get(CHANNEL_GET_TIMEOUT) == nil then + log.error("error: hanged on ch:get() on iteration " .. i) + return false + end + end + return true end; --- ... @@ -143,10 +178,14 @@ test_run:cmd("setopt delimiter ''"); --- - true ... -#failed == 0 or failed +test_run:wait_cond(function() return check_get() end) --- - true ... +failed +--- +- false +... s:drop() --- ... diff --git a/test/vinyl/select_consistency.test.lua b/test/vinyl/select_consistency.test.lua index 73b443e89..644f68f5f 100644 --- a/test/vinyl/select_consistency.test.lua +++ b/test/vinyl/select_consistency.test.lua @@ -1,6 +1,7 @@ test_run = require('test_run').new() fiber = require 'fiber' +log = require 'log' math.randomseed(os.time()) @@ -33,6 +34,8 @@ function gen_update() pcall(s.update, s, math.random(MAX_KEY), {{'+', 5, 1}}) end; +failed = false + function dml_loop() local insert = true while not stop do @@ -55,8 +58,18 @@ end; function snap_loop() while not stop do - box.snapshot() - fiber.sleep(0.1) + local ok = fiber.create(function() + local ok, err = pcall(box.snapshot) + if ok == false then + log.error("error: box.snapshot failed with error " .. err) + end + return ok + end) + if ok == false then + failed = true + break + end + fiber.sleep(0.5) end ch:put(true) end; @@ -68,9 +81,7 @@ _ = fiber.create(dml_loop); _ = fiber.create(dml_loop); _ = fiber.create(snap_loop); -failed = {}; - -for i = 1, 10000 do +function select_loop() local val = math.random(MAX_VAL) box.begin() local res1 = s.index.i1:select({val}) @@ -87,26 +98,50 @@ for i = 1, 10000 do end end if not found then + log.error("error: equal not found for res1 and res2:") + local ts1 = '' + local ts2 = '' + for _, t1 in ipairs(res1) do + ts1 = ts1 .. " " .. t1[1] + end + log.info("error: equal not found for res1" .. ts1) + for _, t2 in ipairs(res2) do + ts2 = ts2 .. " " .. t2[1] + end + log.info("error: equal not found for res2" .. ts2) equal = false break end end - else - equal = false - end - if not equal then - table.insert(failed, {res1, res2}) end fiber.sleep(0) + return equal +end; + +for i = 1, 10000 do + if failed or not select_loop(i) then + log.error("error: failed on iteration " .. i) + failed = true + break + end end; stop = true; -for i = 1, ch:size() do - ch:get() + +CHANNEL_GET_TIMEOUT = 10 +function check_get() + for i = 1, ch:size() do + if ch:get(CHANNEL_GET_TIMEOUT) == nil then + log.error("error: hanged on ch:get() on iteration " .. i) + return false + end + end + return true end; test_run:cmd("setopt delimiter ''"); -#failed == 0 or failed +test_run:wait_cond(function() return check_get() end) +failed s:drop() diff --git a/test/vinyl/suite.ini b/test/vinyl/suite.ini index 93d1b3cf6..9f1856be0 100644 --- a/test/vinyl/suite.ini +++ b/test/vinyl/suite.ini @@ -23,7 +23,8 @@ fragile = { "issues": [ "gh-4346" ] }, "select_consistency.test.lua": { - "issues": [ "gh-4385" ] + "issues": [ "gh-4385" ], + "checksums": [ "94d5359d451ee7bfdb35a037f80b14ff", "f7e8ebc72f4d59941b4ca10fde474963" ] }, "throttle.test.lua": { "issues": [ "gh-4168" ] -- 2.25.1