[Tarantool-patches] [PATCH vshard 1/5] router: backoff on some box errors

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Fri Dec 17 03:25:27 MSK 2021


Storage configuration takes time. Firstly, box.cfg{} which can be
called before vshard.storage.cfg(). Secondly, vshard.storage.cfg()
is not immediate as well.

During that time accessing the storage is not safe. Attempts to
call vshard.storage functions can return weird errors, or the
functions can even be not available yet. They need to be created
in _func and get access rights in _priv before becoming public.

Routers used to forward errors like 'access denied error' and
'no such function' to users as is, treating them as critical.

Not only was it confusing for users, but it could also make an entire
replicaset unavailable for requests - the connection to it is
alive, so the router would send all requests to it and they would
all fail. Even if the replicaset has another instance which is
perfectly functional.

This patch handles such specific errors inside of the router. The
faulty replicas are put into a 'backoff' state. They remain in it
for some fixed time (5 seconds for now), new requests won't be
sent to them until the time passes. Router will use other
instances.

Backoff is activated only for vshard.* functions. If the errors
are about some user's function, it is considered a regular error.
Because the router can't tell whether any side effects were applied
on the remote instance before the error happened, it can't safely
retry the call on another node.

For example, if access was denied to 'vshard.storage.call', then
it is backoff. If inside of vshard.storage.call the access was
denied to 'user_test_func', then it is not backoff.

It all works for read-only requests exclusively of course. Because
for read-write requests the instance is just one - master. Router
does not have other options so backoff here wouldn't help.

Part of #298
---
 test/router/router2.result   | 237 +++++++++++++++++++++++++++++++++++
 test/router/router2.test.lua |  97 ++++++++++++++
 vshard/consts.lua            |   1 +
 vshard/error.lua             |   8 +-
 vshard/replicaset.lua        | 100 +++++++++++++--
 vshard/router/init.lua       |   3 +-
 6 files changed, 432 insertions(+), 14 deletions(-)

diff --git a/test/router/router2.result b/test/router/router2.result
index 492421b..a501dbf 100644
--- a/test/router/router2.result
+++ b/test/router/router2.result
@@ -311,6 +311,243 @@ vshard.router.info().bucket
  |   available_rw: 3000
  | ...
 
+--
+-- gh-298: backoff replicas when the storage raises errors meaning its
+-- configuration is not finished or even didn't start. Immediately failover to
+-- other instances then.
+--
+test_run:switch('storage_2_b')
+ | ---
+ | - true
+ | ...
+-- Turn off replication so as _func manipulations on the master wouldn't reach
+-- the replica.
+old_replication = box.cfg.replication
+ | ---
+ | ...
+box.cfg{replication = {}}
+ | ---
+ | ...
+
+test_run:switch('storage_2_a')
+ | ---
+ | - true
+ | ...
+box.schema.user.revoke('storage', 'execute', 'function', 'vshard.storage.call')
+ | ---
+ | ...
+
+test_run:switch('router_1')
+ | ---
+ | - true
+ | ...
+vshard.consts.REPLICA_BACKOFF_INTERVAL = 0.1
+ | ---
+ | ...
+
+-- Indeed fails when called directly via netbox.
+conn = vshard.router.route(1).master.conn
+ | ---
+ | ...
+ok, err = pcall(conn.call, conn, 'vshard.storage.call',                         \
+                {1, 'read', 'echo', {1}})
+ | ---
+ | ...
+assert(not ok and err.code == box.error.ACCESS_DENIED)
+ | ---
+ | - true
+ | ...
+
+-- Works when called via vshard - it goes to another replica transparently.
+long_timeout = {timeout = 1000000}
+ | ---
+ | ...
+res = vshard.router.callro(1, 'echo', {100}, long_timeout)
+ | ---
+ | ...
+assert(res == 100)
+ | ---
+ | - true
+ | ...
+
+--
+-- When all replicas are in backoff due to lack of access, raise an error.
+--
+test_run:switch('storage_2_b')
+ | ---
+ | - true
+ | ...
+assert(echo_count == 1)
+ | ---
+ | - true
+ | ...
+echo_count = 0
+ | ---
+ | ...
+-- Restore the replication so the replica gets the _func change from master.
+box.cfg{replication = old_replication}
+ | ---
+ | ...
+test_run:wait_vclock('storage_2_b', test_run:get_vclock('storage_1_a'))
+ | ---
+ | ...
+
+test_run:switch('router_1')
+ | ---
+ | - true
+ | ...
+ok, err = vshard.router.callro(1, 'echo', {100}, long_timeout)
+ | ---
+ | ...
+assert(not ok and err.code == vshard.error.code.REPLICASET_IN_BACKOFF)
+ | ---
+ | - true
+ | ...
+assert(err.error.code == box.error.ACCESS_DENIED)
+ | ---
+ | - true
+ | ...
+
+test_run:switch('storage_2_a')
+ | ---
+ | - true
+ | ...
+assert(echo_count == 0)
+ | ---
+ | - true
+ | ...
+box.schema.user.grant('storage', 'execute', 'function', 'vshard.storage.call')
+ | ---
+ | ...
+test_run:wait_vclock('storage_2_b', test_run:get_vclock('storage_1_a'))
+ | ---
+ | ...
+
+--
+-- No vshard function = backoff.
+--
+test_run:switch('router_1')
+ | ---
+ | - true
+ | ...
+-- Drop all backoffs to check all works fine now.
+fiber.sleep(vshard.consts.REPLICA_BACKOFF_INTERVAL)
+ | ---
+ | ...
+res = vshard.router.callrw(1, 'echo', {100}, long_timeout)
+ | ---
+ | ...
+assert(res == 100)
+ | ---
+ | - true
+ | ...
+
+test_run:switch('storage_2_a')
+ | ---
+ | - true
+ | ...
+assert(echo_count == 1)
+ | ---
+ | - true
+ | ...
+echo_count = 0
+ | ---
+ | ...
+old_storage_call = vshard.storage.call
+ | ---
+ | ...
+vshard.storage.call = nil
+ | ---
+ | ...
+
+-- Indeed fails when called directly via netbox.
+test_run:switch('router_1')
+ | ---
+ | - true
+ | ...
+conn = vshard.router.route(1).master.conn
+ | ---
+ | ...
+ok, err = pcall(conn.call, conn, 'vshard.storage.call',                         \
+                {1, 'read', 'echo', {1}})
+ | ---
+ | ...
+assert(not ok and err.code == box.error.NO_SUCH_PROC)
+ | ---
+ | - true
+ | ...
+
+-- Works when called via vshard - it goes to another replica.
+res = vshard.router.callro(1, 'echo', {100}, long_timeout)
+ | ---
+ | ...
+assert(res == 100)
+ | ---
+ | - true
+ | ...
+
+--
+-- When all replicas are in backoff due to not having the function, raise
+-- an error.
+--
+test_run:switch('storage_2_b')
+ | ---
+ | - true
+ | ...
+assert(echo_count == 1)
+ | ---
+ | - true
+ | ...
+echo_count = 0
+ | ---
+ | ...
+old_storage_call = vshard.storage.call
+ | ---
+ | ...
+vshard.storage.call = nil
+ | ---
+ | ...
+
+test_run:switch('router_1')
+ | ---
+ | - true
+ | ...
+ok, err = vshard.router.callro(1, 'echo', {100}, long_timeout)
+ | ---
+ | ...
+assert(not ok and err.code == vshard.error.code.REPLICASET_IN_BACKOFF)
+ | ---
+ | - true
+ | ...
+assert(err.error.code == box.error.NO_SUCH_PROC)
+ | ---
+ | - true
+ | ...
+
+test_run:switch('storage_2_b')
+ | ---
+ | - true
+ | ...
+assert(echo_count == 0)
+ | ---
+ | - true
+ | ...
+vshard.storage.call = old_storage_call
+ | ---
+ | ...
+
+test_run:switch('storage_2_a')
+ | ---
+ | - true
+ | ...
+assert(echo_count == 0)
+ | ---
+ | - true
+ | ...
+vshard.storage.call = old_storage_call
+ | ---
+ | ...
+
 _ = test_run:switch("default")
  | ---
  | ...
diff --git a/test/router/router2.test.lua b/test/router/router2.test.lua
index 72c99e6..fb0c3b2 100644
--- a/test/router/router2.test.lua
+++ b/test/router/router2.test.lua
@@ -119,6 +119,103 @@ end
 test_run:wait_cond(wait_all_rw)
 vshard.router.info().bucket
 
+--
+-- gh-298: backoff replicas when the storage raises errors meaning its
+-- configuration is not finished or even didn't start. Immediately failover to
+-- other instances then.
+--
+test_run:switch('storage_2_b')
+-- Turn off replication so as _func manipulations on the master wouldn't reach
+-- the replica.
+old_replication = box.cfg.replication
+box.cfg{replication = {}}
+
+test_run:switch('storage_2_a')
+box.schema.user.revoke('storage', 'execute', 'function', 'vshard.storage.call')
+
+test_run:switch('router_1')
+vshard.consts.REPLICA_BACKOFF_INTERVAL = 0.1
+
+-- Indeed fails when called directly via netbox.
+conn = vshard.router.route(1).master.conn
+ok, err = pcall(conn.call, conn, 'vshard.storage.call',                         \
+                {1, 'read', 'echo', {1}})
+assert(not ok and err.code == box.error.ACCESS_DENIED)
+
+-- Works when called via vshard - it goes to another replica transparently.
+long_timeout = {timeout = 1000000}
+res = vshard.router.callro(1, 'echo', {100}, long_timeout)
+assert(res == 100)
+
+--
+-- When all replicas are in backoff due to lack of access, raise an error.
+--
+test_run:switch('storage_2_b')
+assert(echo_count == 1)
+echo_count = 0
+-- Restore the replication so the replica gets the _func change from master.
+box.cfg{replication = old_replication}
+test_run:wait_vclock('storage_2_b', test_run:get_vclock('storage_1_a'))
+
+test_run:switch('router_1')
+ok, err = vshard.router.callro(1, 'echo', {100}, long_timeout)
+assert(not ok and err.code == vshard.error.code.REPLICASET_IN_BACKOFF)
+assert(err.error.code == box.error.ACCESS_DENIED)
+
+test_run:switch('storage_2_a')
+assert(echo_count == 0)
+box.schema.user.grant('storage', 'execute', 'function', 'vshard.storage.call')
+test_run:wait_vclock('storage_2_b', test_run:get_vclock('storage_1_a'))
+
+--
+-- No vshard function = backoff.
+--
+test_run:switch('router_1')
+-- Drop all backoffs to check all works fine now.
+fiber.sleep(vshard.consts.REPLICA_BACKOFF_INTERVAL)
+res = vshard.router.callrw(1, 'echo', {100}, long_timeout)
+assert(res == 100)
+
+test_run:switch('storage_2_a')
+assert(echo_count == 1)
+echo_count = 0
+old_storage_call = vshard.storage.call
+vshard.storage.call = nil
+
+-- Indeed fails when called directly via netbox.
+test_run:switch('router_1')
+conn = vshard.router.route(1).master.conn
+ok, err = pcall(conn.call, conn, 'vshard.storage.call',                         \
+                {1, 'read', 'echo', {1}})
+assert(not ok and err.code == box.error.NO_SUCH_PROC)
+
+-- Works when called via vshard - it goes to another replica.
+res = vshard.router.callro(1, 'echo', {100}, long_timeout)
+assert(res == 100)
+
+--
+-- When all replicas are in backoff due to not having the function, raise
+-- an error.
+--
+test_run:switch('storage_2_b')
+assert(echo_count == 1)
+echo_count = 0
+old_storage_call = vshard.storage.call
+vshard.storage.call = nil
+
+test_run:switch('router_1')
+ok, err = vshard.router.callro(1, 'echo', {100}, long_timeout)
+assert(not ok and err.code == vshard.error.code.REPLICASET_IN_BACKOFF)
+assert(err.error.code == box.error.NO_SUCH_PROC)
+
+test_run:switch('storage_2_b')
+assert(echo_count == 0)
+vshard.storage.call = old_storage_call
+
+test_run:switch('storage_2_a')
+assert(echo_count == 0)
+vshard.storage.call = old_storage_call
+
 _ = test_run:switch("default")
 _ = test_run:cmd("stop server router_1")
 _ = test_run:cmd("cleanup server router_1")
diff --git a/vshard/consts.lua b/vshard/consts.lua
index 66a09ae..64cb0ae 100644
--- a/vshard/consts.lua
+++ b/vshard/consts.lua
@@ -40,6 +40,7 @@ return {
     RECONNECT_TIMEOUT = 0.5;
     GC_BACKOFF_INTERVAL = 5,
     RECOVERY_BACKOFF_INTERVAL = 5,
+    REPLICA_BACKOFF_INTERVAL = 5,
     COLLECT_LUA_GARBAGE_INTERVAL = 100;
     DEFAULT_BUCKET_SEND_TIMEOUT = 10,
     DEFAULT_BUCKET_RECV_TIMEOUT = 10,
diff --git a/vshard/error.lua b/vshard/error.lua
index fa7bdaa..0ce208a 100644
--- a/vshard/error.lua
+++ b/vshard/error.lua
@@ -158,7 +158,13 @@ local error_message_template = {
         name = 'MULTIPLE_MASTERS_FOUND',
         msg = 'Found more than one master in replicaset %s on nodes %s and %s',
         args = {'replicaset_uuid', 'master1', 'master2'},
-    }
+    },
+    [32] = {
+        name = 'REPLICASET_IN_BACKOFF',
+        msg = 'Replicaset %s is in backoff, can\'t take requests right now. '..
+              'Last error was %s',
+        args = {'replicaset_uuid', 'error'}
+    },
 }
 
 --
diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua
index a47a7b9..573a555 100644
--- a/vshard/replicaset.lua
+++ b/vshard/replicaset.lua
@@ -13,6 +13,8 @@
 --             weight = number,
 --             down_ts = <timestamp of disconnect from the
 --                        replica>,
+--             backoff_ts = <timestamp when was sent into backoff state>,
+--             backoff_err = <error object caused the backoff>,
 --             net_timeout = <current network timeout for calls,
 --                            doubled on each network fail until
 --                            max value, and reset to minimal
@@ -142,6 +144,25 @@ local function netbox_wait_connected(conn, timeout)
     return timeout
 end
 
+--
+-- Check if the replica is not in backoff. It also serves as an update - if the
+-- replica still has an old backoff timestamp, it is cleared. This way of
+-- backoff update does not require any fibers to perform background updates.
+-- Hence works not only on the router.
+--
+local function replica_check_backoff(replica, now)
+    if not replica.backoff_ts then
+        return true
+    end
+    if replica.backoff_ts + consts.REPLICA_BACKOFF_INTERVAL > now then
+        return false
+    end
+    log.warn('Replica %s returns from backoff', replica.uuid)
+    replica.backoff_ts = nil
+    replica.backoff_err = nil
+    return true
+end
+
 --
 -- Connect to a specified replica and remember a new connection
 -- in the replica object. Note, that the function does not wait
@@ -421,6 +442,39 @@ local function can_retry_after_error(e)
     return e.type == 'ClientError' and e.code == box.error.TIMEOUT
 end
 
+--
+-- True if after the given error on call of the given function the connection
+-- must go into backoff.
+--
+local function can_backoff_after_error(e, func)
+    if not e then
+        return false
+    end
+    if type(e) ~= 'table' and
+       (type(e) ~= 'cdata' or not ffi.istype('struct error', e)) then
+        return false
+    end
+    -- So far it is enabled only for vshard's own functions. Including
+    -- vshard.storage.call(). Otherwise it is not possible to retry safely -
+    -- user's function could have side effects before raising that error.
+    -- For instance, 'access denied' could be raised by user's function
+    -- internally after it already changed some data on the storage.
+    if not func:startswith('vshard.') then
+        return false
+    end
+    -- ClientError is sent for all errors by old Tarantool versions which didn't
+    -- keep error type. New versions preserve the original error type.
+    if e.type == 'ClientError' or e.type == 'AccessDeniedError' then
+        if e.code == box.error.ACCESS_DENIED then
+            return e.message:startswith("Execute access to function 'vshard.")
+        end
+        if e.code == box.error.NO_SUCH_PROC then
+            return e.message:startswith("Procedure 'vshard.")
+        end
+    end
+    return false
+end
+
 --
 -- Pick a next replica according to round-robin load balancing
 -- policy.
@@ -443,7 +497,7 @@ end
 -- until failover fiber will repair the nearest connection.
 --
 local function replicaset_template_multicallro(prefer_replica, balance)
-    local function pick_next_replica(replicaset)
+    local function pick_next_replica(replicaset, now)
         local r
         local master = replicaset.master
         if balance then
@@ -451,7 +505,8 @@ local function replicaset_template_multicallro(prefer_replica, balance)
             while i > 0 do
                 r = replicaset_balance_replica(replicaset)
                 i = i - 1
-                if r:is_connected() and (not prefer_replica or r ~= master) then
+                if r:is_connected() and (not prefer_replica or r ~= master) and
+                   replica_check_backoff(r, now) then
                     return r
                 end
             end
@@ -459,7 +514,8 @@ local function replicaset_template_multicallro(prefer_replica, balance)
             local start_r = replicaset.replica
             r = start_r
             while r do
-                if r:is_connected() and (not prefer_replica or r ~= master) then
+                if r:is_connected() and (not prefer_replica or r ~= master) and
+                   replica_check_backoff(r, now) then
                     return r
                 end
                 r = r.next_by_priority
@@ -471,7 +527,8 @@ local function replicaset_template_multicallro(prefer_replica, balance)
                     -- Reached already checked part.
                     break
                 end
-                if r:is_connected() and (not prefer_replica or r ~= master) then
+                if r:is_connected() and (not prefer_replica or r ~= master) and
+                   replica_check_backoff(r, now) then
                     return r
                 end
             end
@@ -488,26 +545,43 @@ local function replicaset_template_multicallro(prefer_replica, balance)
         if timeout <= 0 then
             return nil, lerror.timeout()
         end
-        local end_time = fiber_clock() + timeout
+        local now = fiber_clock()
+        local end_time = now + timeout
         while not net_status and timeout > 0 do
-            replica = pick_next_replica(replicaset)
+            replica = pick_next_replica(replicaset, now)
             if not replica then
                 replica, timeout = replicaset_wait_master(replicaset, timeout)
                 if not replica then
                     return nil, timeout
                 end
                 replicaset_connect_to_replica(replicaset, replica)
+                if replica.backoff_ts then
+                    return nil, lerror.vshard(
+                        lerror.code.REPLICASET_IN_BACKOFF, replicaset.uuid,
+                        replica.backoff_err)
+                end
             end
             opts.timeout = timeout
             net_status, storage_status, retval, err =
                 replica_call(replica, func, args, opts)
-            timeout = end_time - fiber_clock()
+            now = fiber_clock()
+            timeout = end_time - now
             if not net_status and not storage_status and
                not can_retry_after_error(retval) then
-                -- There is no sense to retry LuaJit errors, such as
-                -- assetions, not defined variables etc.
-                net_status = true
-                break
+                if can_backoff_after_error(retval, func) then
+                    if not replica.backoff_ts then
+                        log.warn('Replica %s goes into backoff for %s sec '..
+                                 'after error %s', replica.uuid,
+                                 consts.REPLICA_BACKOFF_INTERVAL, retval)
+                        replica.backoff_ts = now
+                        replica.backoff_err = retval
+                    end
+                else
+                    -- There is no sense to retry LuaJit errors, such as
+                    -- assertions, undefined variables etc.
+                    net_status = true
+                    break
+                end
             end
         end
         if not net_status then
@@ -549,6 +623,8 @@ local function rebind_replicasets(replicasets, old_replicasets)
                 local conn = old_replica.conn
                 replica.conn = conn
                 replica.down_ts = old_replica.down_ts
+                replica.backoff_ts = old_replica.backoff_ts
+                replica.backoff_err = old_replica.backoff_err
                 replica.net_timeout = old_replica.net_timeout
                 replica.net_sequential_ok = old_replica.net_sequential_ok
                 replica.net_sequential_fail = old_replica.net_sequential_fail
@@ -986,7 +1062,7 @@ local function buildall(sharding_cfg)
                 uri = replica.uri, name = replica.name, uuid = replica_uuid,
                 zone = replica.zone, net_timeout = consts.CALL_TIMEOUT_MIN,
                 net_sequential_ok = 0, net_sequential_fail = 0,
-                down_ts = curr_ts,
+                down_ts = curr_ts, backoff_ts = nil, backoff_err = nil,
             }, replica_mt)
             new_replicaset.replicas[replica_uuid] = new_replica
             if replica.master then
diff --git a/vshard/router/init.lua b/vshard/router/init.lua
index 4748398..116f905 100644
--- a/vshard/router/init.lua
+++ b/vshard/router/init.lua
@@ -172,7 +172,8 @@ local function bucket_discovery(router, bucket_id)
             replicaset:callrw('vshard.storage.bucket_stat', {bucket_id})
         if err == nil then
             return bucket_set(router, bucket_id, replicaset.uuid)
-        elseif err.code ~= lerror.code.WRONG_BUCKET then
+        elseif err.code ~= lerror.code.WRONG_BUCKET and
+               err.code ~= lerror.code.REPLICASET_IN_BACKOFF then
             last_err = err
             unreachable_uuid = uuid
         end
-- 
2.24.3 (Apple Git-128)



More information about the Tarantool-patches mailing list