[patches] [PATCH] [replication] [recovery] recover missing data

Georgy Kirichenko georgy at tarantool.org
Thu Mar 15 15:52:48 MSK 2018


Seems to be OK.

On Thursday, March 15, 2018 11:32:55 AM MSK you wrote:
> Recover missing local data from replica.
> If the master was restarted and some data was missing from its xlog,
> that data was not recovered under the old behaviour. Fix it.
> Based on @GeorgyKirichenko proposal
> 
> Closes #3210
> ---
>  branch: gh-3210-recover-missing-local-data-master-master
>  src/box/relay.cc                          |  6 ++-
>  src/box/wal.cc                            |  8 +++-
>  test/replication/master1.lua              |  1 +
>  test/replication/master2.lua              |  1 +
>  test/replication/master_master.lua        | 30 ++++++++++++
>  test/replication/recover_missing.result   | 78 +++++++++++++++++++++++++++++++
>  test/replication/recover_missing.test.lua | 30 ++++++++++++
>  7 files changed, 152 insertions(+), 2 deletions(-)
>  create mode 120000 test/replication/master1.lua
>  create mode 120000 test/replication/master2.lua
>  create mode 100644 test/replication/master_master.lua
>  create mode 100644 test/replication/recover_missing.result
>  create mode 100644 test/replication/recover_missing.test.lua
> 
> diff --git a/src/box/relay.cc b/src/box/relay.cc
> index 2bd05ad5f..87975b05a 100644
> --- a/src/box/relay.cc
> +++ b/src/box/relay.cc
> @@ -110,6 +110,8 @@ struct relay {
>  	struct vclock recv_vclock;
>  	/** Replicatoin slave version. */
>  	uint32_t version_id;
> +	/** lsn on subscribe, see #3210 */
> +	int64_t lsn_on_subscribe;
> 
>  	/** Relay endpoint */
>  	struct cbus_endpoint endpoint;
> @@ -541,6 +543,7 @@ relay_subscribe(int fd, uint64_t sync, struct replica *replica,
>  	relay.version_id = replica_version_id;
>  	relay.replica = replica;
>  	replica_set_relay(replica, &relay);
> +	relay.lsn_on_subscribe = vclock_get(&replicaset.vclock, replica->id);
> 
>  	int rc = cord_costart(&relay.cord, tt_sprintf("relay_%p", &relay),
>  			      relay_subscribe_f, &relay);
> @@ -586,7 +589,8 @@ relay_send_row(struct xstream *stream, struct xrow_header *packet)
>  	 * (i.e. don't send replica's own rows back).
>  	 */
>  	if (relay->replica == NULL ||
> -	    packet->replica_id != relay->replica->id) {
> +	    packet->replica_id != relay->replica->id ||
> +	    packet->lsn <= relay->lsn_on_subscribe) {
>  		relay_send(relay, packet);
>  	}
>  }
> diff --git a/src/box/wal.cc b/src/box/wal.cc
> index 4576cfe09..5c87f08df 100644
> --- a/src/box/wal.cc
> +++ b/src/box/wal.cc
> @@ -768,8 +768,14 @@ wal_write(struct journal *journal, struct journal_entry *entry)
>  			/*
>  			 * Find last row from local instance id
>  			 * and promote vclock.
> +			 * In master-master configuration, if some data
> +			 * are missing on local master, others will send
> +			 * them back. In this case replicaset vclock is
> +			 * already set, so we should not update it.
> +			 * See #3210 for details.
>  			 */
> -			if ((*last)->replica_id == instance_id) {
> +			if ((*last)->replica_id == instance_id &&
> +			    replicaset.vclock.lsn[instance_id] < (*last)->lsn) {
>  				vclock_follow(&replicaset.vclock, instance_id,
>  					      (*last)->lsn);
>  				break;
> diff --git a/test/replication/master1.lua b/test/replication/master1.lua
> new file mode 120000
> index 000000000..16399ae47
> --- /dev/null
> +++ b/test/replication/master1.lua
> @@ -0,0 +1 @@
> +master_master.lua
> \ No newline at end of file
> diff --git a/test/replication/master2.lua b/test/replication/master2.lua
> new file mode 120000
> index 000000000..16399ae47
> --- /dev/null
> +++ b/test/replication/master2.lua
> @@ -0,0 +1 @@
> +master_master.lua
> \ No newline at end of file
> diff --git a/test/replication/master_master.lua b/test/replication/master_master.lua
> new file mode 100644
> index 000000000..ff165c367
> --- /dev/null
> +++ b/test/replication/master_master.lua
> @@ -0,0 +1,30 @@
> +#!/usr/bin/env tarantool
> +
> +-- get instance name from filename (master1.lua => master1)
> +local INSTANCE_ID = string.match(arg[0], "%d")
> +local USER = 'cluster'
> +local PASSWORD = 'somepassword'
> +local SOCKET_DIR = require('fio').cwd()
> +local function instance_uri(instance_id)
> +    --return 'localhost:'..(3310 + instance_id)
> +    return SOCKET_DIR..'/master'..instance_id..'.sock';
> +end
> +
> +-- start console first
> +require('console').listen(os.getenv('ADMIN'))
> +
> +box.cfg({
> +    listen = instance_uri(INSTANCE_ID);
> +    replication = {
> +        USER..':'..PASSWORD..'@'..instance_uri(1);
> +        USER..':'..PASSWORD..'@'..instance_uri(2);
> +    };
> +})
> +
> +box.once("bootstrap", function()
> +    local test_run = require('test_run').new()
> +    box.schema.user.create(USER, { password = PASSWORD })
> +    box.schema.user.grant(USER, 'replication')
> +    box.schema.space.create('test', {engine = test_run:get_cfg('engine')})
> +    box.space.test:create_index('primary')
> +end)
> diff --git a/test/replication/recover_missing.result b/test/replication/recover_missing.result
> new file mode 100644
> index 000000000..605232be4
> --- /dev/null
> +++ b/test/replication/recover_missing.result
> @@ -0,0 +1,78 @@
> +env = require('test_run')
> +---
> +...
> +test_run = env.new()
> +---
> +...
> +SERVERS = { 'master1', 'master2' }
> +---
> +...
> +-- Start servers
> +test_run:create_cluster(SERVERS)
> +---
> +...
> +-- Wait for full mesh
> +test_run:wait_fullmesh(SERVERS)
> +---
> +...
> +test_run:cmd("switch master1")
> +---
> +- true
> +...
> +box.space._schema:insert({'1'})
> +---
> +- ['1']
> +...
> +box.space._schema:select('1')
> +---
> +- - ['1']
> +...
> +fiber = require('fiber')
> +---
> +...
> +fiber.sleep(0.1)
> +---
> +...
> +test_run:cmd("switch master2")
> +---
> +- true
> +...
> +box.space._schema:select('1')
> +---
> +- - ['1']
> +...
> +test_run:cmd("stop server master1")
> +---
> +- true
> +...
> +fio = require('fio')
> +---
> +...
> +fio.unlink(fio.pathjoin(fio.abspath("."), string.format('master1/%020d.xlog', 8)))
> +---
> +- true
> +...
> +test_run:cmd("start server master1")
> +---
> +- true
> +...
> +test_run:cmd("switch master1")
> +---
> +- true
> +...
> +box.space._schema:select('1')
> +---
> +- - ['1']
> +...
> +test_run:cmd("switch default")
> +---
> +- true
> +...
> +test_run:cmd("stop server master1")
> +---
> +- true
> +...
> +test_run:cmd("stop server master2")
> +---
> +- true
> +...
> diff --git a/test/replication/recover_missing.test.lua b/test/replication/recover_missing.test.lua
> new file mode 100644
> index 000000000..2f36fc329
> --- /dev/null
> +++ b/test/replication/recover_missing.test.lua
> @@ -0,0 +1,30 @@
> +env = require('test_run')
> +test_run = env.new()
> +
> +SERVERS = { 'master1', 'master2' }
> +-- Start servers
> +test_run:create_cluster(SERVERS)
> +-- Wait for full mesh
> +test_run:wait_fullmesh(SERVERS)
> +
> +test_run:cmd("switch master1")
> +box.space._schema:insert({'1'})
> +box.space._schema:select('1')
> +
> +fiber = require('fiber')
> +fiber.sleep(0.1)
> +
> +test_run:cmd("switch master2")
> +box.space._schema:select('1')
> +test_run:cmd("stop server master1")
> +fio = require('fio')
> +fio.unlink(fio.pathjoin(fio.abspath("."), string.format('master1/%020d.xlog', 8)))
> +test_run:cmd("start server master1")
> +
> +test_run:cmd("switch master1")
> +box.space._schema:select('1')
> +
> +test_run:cmd("switch default")
> +test_run:cmd("stop server master1")
> +test_run:cmd("stop server master2")
> +

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 488 bytes
Desc: This is a digitally signed message part.
URL: <https://lists.tarantool.org/pipermail/tarantool-patches/attachments/20180315/92a06749/attachment.sig>


More information about the Tarantool-patches mailing list