[patches] [PATCH vshard 4/5] router: connect to all replicas including slaves

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Fri Mar 2 00:39:40 MSK 2018


There are two reasons. First: if the master of a replicaset
changes, a router must immediately switch all 'write' requests
to the new master. Currently, to do that, the router creates a
new connection to the new master.

Second reason: #75. To quickly detect a master role change on a
replica, a router must be connected to all replicas of a
replicaset. Otherwise the router will spend too much time
establishing connections to slaves.

Closes #76

Signed-off-by: Vladislav Shpilevoy <v.shpilevoy at tarantool.org>
---
 test/router/router.result   | 11 ++++++++--
 test/router/router.test.lua |  5 ++++-
 vshard/replicaset.lua       | 51 ++++++++++++++++-----------------------------
 vshard/router/init.lua      |  5 ++++-
 4 files changed, 35 insertions(+), 37 deletions(-)

diff --git a/test/router/router.result b/test/router/router.result
index c7b3437..4bdb3bf 100644
--- a/test/router/router.result
+++ b/test/router/router.result
@@ -101,6 +101,9 @@ old_replicasets = vshard.router.internal.replicasets
 old_connections = {}
 ---
 ...
+connection_count = 0
+---
+...
 test_run:cmd("setopt delimiter ';'")
 ---
 - true
@@ -108,6 +111,7 @@ test_run:cmd("setopt delimiter ';'")
 for _, old_rs in pairs(old_replicasets) do
     for uuid, old_replica in pairs(old_rs.replicas) do
         old_connections[uuid] = old_replica.conn
+        connection_count = connection_count + 1
     end
 end;
 ---
@@ -116,6 +120,10 @@ test_run:cmd("setopt delimiter ''");
 ---
 - true
 ...
+connection_count == 4
+---
+- true
+...
 vshard.router.cfg(cfg)
 ---
 ...
@@ -389,9 +397,8 @@ rs.master = master
 ---
 ...
 -- Test reconnect on bucker_route().
-rs:disconnect()
+master.conn:close()
 ---
-- true
 ...
 conn = vshard.router.route(1):connect()
 ---
diff --git a/test/router/router.test.lua b/test/router/router.test.lua
index 5a51e2d..fcd54e1 100644
--- a/test/router/router.test.lua
+++ b/test/router/router.test.lua
@@ -41,13 +41,16 @@ rs2.replica == rs2.master
 --
 old_replicasets = vshard.router.internal.replicasets
 old_connections = {}
+connection_count = 0
 test_run:cmd("setopt delimiter ';'")
 for _, old_rs in pairs(old_replicasets) do
     for uuid, old_replica in pairs(old_rs.replicas) do
         old_connections[uuid] = old_replica.conn
+        connection_count = connection_count + 1
     end
 end;
 test_run:cmd("setopt delimiter ''");
+connection_count == 4
 vshard.router.cfg(cfg)
 new_replicasets = vshard.router.internal.replicasets
 old_replicasets ~= new_replicasets
@@ -143,7 +146,7 @@ rs.master = nil
 vshard.router.route(1).master
 rs.master = master
 -- Test reconnect on bucker_route().
-rs:disconnect()
+master.conn:close()
 conn = vshard.router.route(1):connect()
 conn:wait_connected()
 conn.state
diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua
index 84799df..56687be 100644
--- a/vshard/replicaset.lua
+++ b/vshard/replicaset.lua
@@ -113,7 +113,7 @@ end
 --
 -- Create net.box connection to master.
 --
-local function replicaset_connect(replicaset)
+local function replicaset_connect_master(replicaset)
     local master = replicaset.master
     if master == nil then
         return nil, lerror.vshard(lerror.code.MASTER_IS_MISSING,
@@ -122,6 +122,15 @@ local function replicaset_connect(replicaset)
     return replicaset_connect_to_replica(replicaset, master)
 end
 
+--
+-- Create net.box connections to all replicas and master.
+--
+local function replicaset_connect_all(replicaset)
+    for _, replica in pairs(replicaset.replicas) do
+        replicaset_connect_to_replica(replicaset, replica)
+    end
+end
+
 --
 -- Make a replica be used for read requests or be candidate.
 -- @param replicaset Replicaset for which a replica is set.
@@ -132,17 +141,9 @@ end
 --
 local function replicaset_make_replica_read(replicaset, replica, read_name)
     assert(read_name == 'replica' or read_name == 'candidate')
-    local old_replica = replicaset[read_name]
-    assert(old_replica ~= replica)
-    local conn = replicaset_connect_to_replica(replicaset, replica)
+    assert(replicaset[read_name] ~= replica)
+    replicaset_connect_to_replica(replicaset, replica)
     replicaset[read_name] = replica
-    if old_replica and old_replica ~= replicaset.master then
-        assert(conn ~= old_replica.conn)
-        -- Each unused connection holds a worker fiber. Close them
-        -- to return fibers in pool now. Do not wait lua gc - it
-        -- is slow as fuck.
-        old_replica.conn:close()
-    end
 end
 
 --
@@ -214,24 +215,6 @@ local function replicaset_set_candidate_as_replica(replicaset)
            old_replica.weight >= replicaset.replica.weight and
            old_replica ~= replicaset.replica)
     replicaset.candidate = nil
-    if old_replica and old_replica.conn and
-       old_replica ~= replicaset.master then
-        old_replica.conn:close()
-    end
-end
-
---
--- Destroy net.box connection to master
---
-local function replicaset_disconnect(replicaset)
-    local master = replicaset.master
-    if master == nil then
-       return true
-    end
-    local conn = replicaset.master.conn
-    replicaset.master.conn = nil
-    conn:close()
-    return true
 end
 
 --
@@ -320,7 +303,7 @@ local function replicaset_master_call(replicaset, func, args, opts)
     assert(opts == nil or type(opts) == 'table')
     assert(type(func) == 'string', 'function name')
     assert(args == nil or type(args) == 'table', 'function arguments')
-    replicaset_connect(replicaset)
+    replicaset_connect_master(replicaset)
     local timeout = opts and opts.timeout or replicaset.master.net_timeout
     local net_status, storage_status, retval, error_object =
         replica_call(replicaset.master, func, args, timeout)
@@ -363,7 +346,7 @@ local function replicaset_nearest_call(replicaset, func, args, opts)
         if replica and replica:is_connected() then
             conn = replica.conn
         else
-            conn = replicaset_connect(replicaset)
+            conn = replicaset_connect_master(replicaset)
             replica = replicaset.master
         end
         net_status, storage_status, retval, error_object =
@@ -426,12 +409,14 @@ end
 --
 local replicaset_mt = {
     __index = {
-        connect = replicaset_connect;
+        connect = replicaset_connect_master;
+        connect_master = replicaset_connect_master;
+        connect_all = replicaset_connect_all;
+        connect_replica = replicaset_connect_to_replica;
         rebind_connections = replicaset_rebind_connections;
         update_candidate = replicaset_update_candidate;
         down_replica_priority = replicaset_down_replica_priority;
         set_candidate_as_replica = replicaset_set_candidate_as_replica;
-        disconnect = replicaset_disconnect;
         call = replicaset_master_call;
         callrw = replicaset_master_call;
         callro = replicaset_nearest_call;
diff --git a/vshard/router/init.lua b/vshard/router/init.lua
index 6814637..d4d5aed 100644
--- a/vshard/router/init.lua
+++ b/vshard/router/init.lua
@@ -311,7 +311,10 @@ local function failover_ping_round()
             if not replica.conn:ping({timeout = 5}) then
                 log.info('Ping error from %s: perhaps a connection is down',
                          replica)
+                -- Connection hangs. Recreate it to be able to
+                -- fail over to a replica next by priority.
                 replica.conn:close()
+                replicaset:connect_replica(replica)
             end
         end
     end
@@ -515,7 +518,7 @@ local function router_cfg(cfg)
     -- Now the new replicasets are fully built. Can establish
     -- connections and yield.
     for _, replicaset in pairs(new_replicasets) do
-        replicaset:connect()
+        replicaset:connect_all()
         replicaset:update_candidate()
     end
     lreplicaset.wait_masters_connect(new_replicasets)
-- 
2.14.3 (Apple Git-98)




More information about the Tarantool-patches mailing list