From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id 724CB29D1C for ; Wed, 22 Aug 2018 11:59:39 -0400 (EDT) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id 0MD0DV25lP6a for ; Wed, 22 Aug 2018 11:59:39 -0400 (EDT) Received: from mail-lj1-f171.google.com (mail-lj1-f171.google.com [209.85.208.171]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id E17A129A15 for ; Wed, 22 Aug 2018 11:59:38 -0400 (EDT) Received: by mail-lj1-f171.google.com with SMTP id m84-v6so1815493lje.10 for ; Wed, 22 Aug 2018 08:59:38 -0700 (PDT) From: Olga Arkhangelskaia Subject: [tarantool-patches] [PATCH v2] replication: adds replication sync after cfg. update Date: Wed, 22 Aug 2018 18:59:30 +0300 Message-Id: <20180822155930.34980-1-krishtal.olja@gmail.com> Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-help: List-unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-subscribe: List-owner: List-post: List-archive: To: tarantool-patches@freelists.org Cc: Olga Arkhangelskaia When a replica reconnects to a replica set after its initial connection (for example, after the replication configuration is updated), it does not synchronize with the replica set. As a result, the replica may give away outdated data. 
Closes #3427 --- https://github.com/tarantool/tarantool/issues/3427 https://github.com/tarantool/tarantool/tree/OKriw/replication_no_sync-1.9 v1: https://www.freelists.org/post/tarantool-patches/PATCH-replication-adds-replication-sync-after-cfg-update Changes in v2: - fixed test - changed replicaset_sync src/box/box.cc | 7 ++- src/box/replication.cc | 34 ++++++++------ src/box/replication.h | 7 ++- test/replication/orphan.result | 92 +++++++++++++++++++++++++++++++++++++ test/replication/orphan.test.lua | 41 +++++++++++++++++ test/replication/replica_orphan.lua | 12 +++++ 6 files changed, 177 insertions(+), 16 deletions(-) create mode 100644 test/replication/orphan.result create mode 100644 test/replication/orphan.test.lua create mode 100644 test/replication/replica_orphan.lua diff --git a/src/box/box.cc b/src/box/box.cc index 8d7454d1f..8c67c79e8 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -634,6 +634,9 @@ box_set_replication(void) box_sync_replication(true); /* Follow replica */ replicaset_follow(); + /* Sync replica up to quorum */ + replicaset_sync(); + say_verbose("synchronization complete"); } void @@ -1941,8 +1944,10 @@ box_cfg_xc(void) fiber_gc(); is_box_configured = true; - if (!is_bootstrap_leader) + if (!is_bootstrap_leader) { replicaset_sync(); + replicaset_is_synced(); + } say_info("ready to accept requests"); } diff --git a/src/box/replication.cc b/src/box/replication.cc index 861ce34ea..c7b11a5d6 100644 --- a/src/box/replication.cc +++ b/src/box/replication.cc @@ -662,24 +662,12 @@ replicaset_follow(void) } void -replicaset_sync(void) +replicaset_is_synced(void) { int quorum = replicaset_quorum(); if (quorum == 0) return; - - say_verbose("synchronizing with %d replicas", quorum); - - /* - * Wait until all connected replicas synchronize up to - * replication_sync_lag - */ - while (replicaset.applier.synced < quorum && - replicaset.applier.connected + - replicaset.applier.loading >= quorum) - fiber_cond_wait(&replicaset.applier.cond); - if 
(replicaset.applier.synced < quorum) { /* * Not enough replicas connected to form a quorum. @@ -694,6 +682,26 @@ replicaset_sync(void) "replicas formed", quorum); } +void +replicaset_sync(void) +{ + int quorum = replicaset_quorum(); + + if (quorum == 0) + return; + + say_verbose("synchronizing with %d replicas", quorum); + + /* + * Wait until all connected replicas synchronize up to + * replication_sync_lag + */ + while (replicaset.applier.synced < quorum && + replicaset.applier.connected + + replicaset.applier.loading >= quorum) + fiber_cond_wait(&replicaset.applier.cond); +} + void replicaset_check_quorum(void) { diff --git a/src/box/replication.h b/src/box/replication.h index 06a2867b6..728a73704 100644 --- a/src/box/replication.h +++ b/src/box/replication.h @@ -371,10 +371,13 @@ replicaset_connect(struct applier **appliers, int count, void replicaset_follow(void); +/** + * Check if replicaset is synced with quorum + */ +void +replicaset_is_synced(void); /** * Wait until a replication quorum is formed. - * Return immediately if a quorum cannot be - * formed because of errors. */ void replicaset_sync(void); diff --git a/test/replication/orphan.result b/test/replication/orphan.result new file mode 100644 index 000000000..818ea75de --- /dev/null +++ b/test/replication/orphan.result @@ -0,0 +1,92 @@ +-- +-- gh-3427: no sync after configuration update +-- +env = require('test_run') +--- +... +test_run = env.new() +--- +... +engine = test_run:get_cfg('engine') +--- +... +box.schema.user.grant('guest', 'read,write,execute', 'universe') +--- +... +box.schema.user.grant('guest', 'replication') +--- +... +test_run:cmd("create server replica with rpl_master=default, script='replication/replica_orphan.lua'") +--- +- true +... +test_run:cmd("start server replica") +--- +- true +... +test_run:cmd("switch replica") +--- +- true +... +test_run:cmd("switch default") +--- +- true +... +s = box.schema.space.create('test', {engine = engine}) +--- +... 
+index = s:create_index('primary') +--- +... +-- change replica configuration +test_run:cmd("switch replica") +--- +- true +... +box.cfg{replication={}} +--- +... +test_run:cmd("switch default") +--- +- true +... +-- insert values on the master while replica is unconfigured +a = 100000 box.begin() while a > 0 do a = a-1 box.space.test:insert{a,a} end box.commit() +--- +... +test_run:cmd("switch replica") +--- +- true +... +require'fiber'.sleep(0.1) +--- +... +box.cfg{replication = os.getenv("MASTER")} +--- +... +box.info.replication[1].upstream.lag > 0.1 +--- +- false +... +test_run:cmd("switch default") +--- +- true +... +-- cleanup +test_run:cmd("stop server replica") +--- +- true +... +test_run:cmd("cleanup server replica") +--- +- true +... +box.space.test:drop() +--- +... +box.schema.user.revoke('guest', 'replication') +--- +... +box.schema.user.revoke('guest', 'read,write,execute', 'universe') +--- +... diff --git a/test/replication/orphan.test.lua b/test/replication/orphan.test.lua new file mode 100644 index 000000000..862fd69af --- /dev/null +++ b/test/replication/orphan.test.lua @@ -0,0 +1,41 @@ +-- +-- gh-3427: no sync after configuration update +-- + +env = require('test_run') +test_run = env.new() +engine = test_run:get_cfg('engine') + +box.schema.user.grant('guest', 'read,write,execute', 'universe') + +box.schema.user.grant('guest', 'replication') +test_run:cmd("create server replica with rpl_master=default, script='replication/replica_orphan.lua'") +test_run:cmd("start server replica") + +test_run:cmd("switch replica") +test_run:cmd("switch default") + +s = box.schema.space.create('test', {engine = engine}) +index = s:create_index('primary') + +-- change replica configuration +test_run:cmd("switch replica") +box.cfg{replication={}} + +test_run:cmd("switch default") +-- insert values on the master while replica is unconfigured +a = 100000 box.begin() while a > 0 do a = a-1 box.space.test:insert{a,a} end box.commit() + +test_run:cmd("switch replica") 
+require'fiber'.sleep(0.1) +box.cfg{replication = os.getenv("MASTER")} + +box.info.replication[1].upstream.lag > 0.1 +test_run:cmd("switch default") + +-- cleanup +test_run:cmd("stop server replica") +test_run:cmd("cleanup server replica") +box.space.test:drop() +box.schema.user.revoke('guest', 'replication') +box.schema.user.revoke('guest', 'read,write,execute', 'universe') diff --git a/test/replication/replica_orphan.lua b/test/replication/replica_orphan.lua new file mode 100644 index 000000000..97740d69a --- /dev/null +++ b/test/replication/replica_orphan.lua @@ -0,0 +1,12 @@ +#!/usr/bin/env tarantool + +local TIMEOUT = tonumber(arg[1]) + +box.cfg({ + listen = os.getenv("LISTEN"), + replication = os.getenv("MASTER"), + replication_connect_timeout = 0.5, + replication_sync_lag = 0.01, +}) + +require('console').listen(os.getenv('ADMIN')) -- 2.14.3 (Apple Git-98)