[Tarantool-patches] [PATCH 1/1] applier: process synchro rows after WAL write

Serge Petrenko sergepetrenko at tarantool.org
Thu Apr 8 11:39:03 MSK 2021



On 08.04.2021 01:47, Vladislav Shpilevoy wrote:
> Applier used to process synchronous rows CONFIRM and ROLLBACK
> right after receipt before they are written to WAL.
>
> That led to a bug that the confirmed data became visible, might be
> accessed by user requests, then the node restarted before CONFIRM
> finished its WAL write, and the data was not visible again. That
> is just like if it would be rolled back, which is not acceptable.
>
> Another case - CONFIRM WAL write could simply fail due to any
> reason (no disk space, OOM), but the transactions would remain
> confirmed anyway.
>
> Also that produced some hacks in the limbo's code to support the
> confirmation and rollback of transactions not yet written to WAL.
>
> The patch makes the synchro rows processed only after they are
> written to WAL. Although the 'rollback' case above might still
> happen if the xlogs were in the kernel caches, and the machine was
> powered off before they were flushed to disk. But that is not
> related to qsync specifically.
>
> To handle the synchro rows after WAL write the patch makes them go
> to WAL in a blocking way (journal_write() instead of
> journal_write_try_async()). Otherwise it could happen that a
> CONFIRM/ROLLBACK is being written to WAL and would clear the limbo
> afterwards, but a new transaction arrives with a different owner,
> and it conflicts with the current limbo owner.


Thanks for the patch!

I'm a bit worried about two different synchro rows coming from two
appliers. Is everything ok in this case?
Or even normal rows coming from other appliers. Say some other replica
has already applied this synchro row and has even written some rows on
top of it. Its applier won't block on the replica_id latch, so it may
fail to apply some transactions following this synchro row, because the
synchro row is not yet written to our WAL and thus not applied here
(the limbo is still not empty, or belongs to another instance).

Looks like this won't be a problem once synchro rows start pinning the
limbo to some specific replica: in that case only the replica that has
issued the CONFIRM will be able to generate new rows, and those rows
will be ordered by the replica_id latch.

But still, maybe this is worth fixing?
Am I missing something?


Please find two more comments below.

>
> Closes #5213
> ---
> Branch: http://github.com/tarantool/tarantool/tree/gerold103/gh-5213-applier-qsync-write-before
> Issue: https://github.com/tarantool/tarantool/issues/5213
>
>   .../unreleased/applier-qsync-write-order.md   |   6 +
>   src/box/applier.cc                            | 107 +++---
>   src/box/txn_limbo.c                           |  48 +--
>   .../gh-5213-qsync-applier-order.result        | 329 ++++++++++++++++++
>   .../gh-5213-qsync-applier-order.test.lua      | 129 +++++++
>   test/replication/gh-5213-replica.lua          |  10 +
>   test/replication/suite.cfg                    |   1 +
>   test/replication/suite.ini                    |   2 +-
>   8 files changed, 529 insertions(+), 103 deletions(-)
>   create mode 100644 changelogs/unreleased/applier-qsync-write-order.md
>   create mode 100644 test/replication/gh-5213-qsync-applier-order.result
>   create mode 100644 test/replication/gh-5213-qsync-applier-order.test.lua
>   create mode 100644 test/replication/gh-5213-replica.lua
>
> diff --git a/changelogs/unreleased/applier-qsync-write-order.md b/changelogs/unreleased/applier-qsync-write-order.md
> new file mode 100644
> index 000000000..10b9ade80
> --- /dev/null
> +++ b/changelogs/unreleased/applier-qsync-write-order.md
> @@ -0,0 +1,6 @@
> +## bugfix/replication
> +
> +* Fix the bug when a synchronous transaction could be confirmed and visible on
> +  a replica, but then not confirmed / invisible again after restart. Could
> +  happen more likely on memtx spaces with `memtx_use_mvcc_engine` enabled
> +  (gh-5213).
> diff --git a/src/box/applier.cc b/src/box/applier.cc
> index 971b2e64c..f7c82548c 100644
> --- a/src/box/applier.cc
> +++ b/src/box/applier.cc
> @@ -762,26 +762,21 @@ applier_txn_wal_write_cb(struct trigger *trigger, void *event)
>   }
>   
>   struct synchro_entry {
> -	/** Encoded form of a synchro record. */
> -	struct synchro_body_bin	body_bin;
> -
> -	/** xrow to write, used by the journal engine. */
> -	struct xrow_header row;
> -
> +	/** Request to process when WAL write is done. */
> +	struct synchro_request *req;
> +	/** Fiber created the entry. To wakeup when WAL write is done. */
> +	struct fiber *owner;
>   	/**
> -	 * The journal entry itself. Note since
> -	 * it has unsized array it must be the
> -	 * last entry in the structure.
> +	 * The base journal entry. It has unsized array and then must be the
> +	 * last entry in the structure. But can workaround it via a union
> +	 * adding the needed tail as char[].
>   	 */
> -	struct journal_entry journal_entry;
> +	union {
> +		struct journal_entry base;
> +		char base_buf[sizeof(base) + sizeof(base.rows[0])];
> +	};
>   };

I don't understand this union stuff.
The journal_entry is the last member of synchro_entry anyway.
Is this a hack to allow allocating synchro_entry on the stack?
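
For my own understanding, here is a stripped-down illustration of what
I think the union is for (names are made up, this is not the real
journal_entry; it also relies on the compiler accepting a flexible
array member inside a nested struct, which the real code relies on
anyway):

#include <stddef.h>

/* Simplified stand-in for struct journal_entry: the flexible array
 * member adds nothing to sizeof(struct entry). */
struct entry {
	int res;
	void *rows[];
};

struct wrapper {
	/*
	 * Overlay a buffer large enough for the entry plus one row
	 * pointer, so a plain automatic variable of struct wrapper
	 * really owns the memory that rows[0] touches.
	 */
	union {
		struct entry base;
		char buf[sizeof(struct entry) + sizeof(void *)];
	};
};

int main(void)
{
	struct wrapper w;      /* lives on the stack, no malloc */
	w.base.rows[0] = NULL; /* stays inside w thanks to buf */
	return 0;
}

If that's the intent, I'm fine with it, just wanted to check I read it
right.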

>   
> -static void
> -synchro_entry_delete(struct synchro_entry *entry)
> -{
> -	free(entry);
> -}
> -
>   /**
>    * Async write journal completion.
>    */
> @@ -791,50 +786,15 @@ apply_synchro_row_cb(struct journal_entry *entry)
>   	assert(entry->complete_data != NULL);
>   	struct synchro_entry *synchro_entry =
>   		(struct synchro_entry *)entry->complete_data;
> -	if (entry->res < 0)
> +	if (entry->res < 0) {
>   		applier_rollback_by_wal_io();
> -	else
> +	} else {
> +		txn_limbo_process(&txn_limbo, synchro_entry->req);
>   		trigger_run(&replicaset.applier.on_wal_write, NULL);
> -
> -	synchro_entry_delete(synchro_entry);
> -}
> -
> -/**
> - * Allocate a new synchro_entry to be passed to
> - * the journal engine in async write way.
> - */
> -static struct synchro_entry *
> -synchro_entry_new(struct xrow_header *applier_row,
> -		  struct synchro_request *req)
> -{
> -	struct synchro_entry *entry;
> -	size_t size = sizeof(*entry) + sizeof(struct xrow_header *);
> -
> -	/*
> -	 * For simplicity we use malloc here but
> -	 * probably should provide some cache similar
> -	 * to txn cache.
> -	 */
> -	entry = (struct synchro_entry *)malloc(size);
> -	if (entry == NULL) {
> -		diag_set(OutOfMemory, size, "malloc", "synchro_entry");
> -		return NULL;
>   	}
> -
> -	struct journal_entry *journal_entry = &entry->journal_entry;
> -	struct synchro_body_bin *body_bin = &entry->body_bin;
> -	struct xrow_header *row = &entry->row;
> -
> -	journal_entry->rows[0] = row;
> -
> -	xrow_encode_synchro(row, body_bin, req);
> -
> -	row->lsn = applier_row->lsn;
> -	row->replica_id = applier_row->replica_id;
> -
> -	journal_entry_create(journal_entry, 1, xrow_approx_len(row),
> -			     apply_synchro_row_cb, entry);
> -	return entry;
> +	/* The fiber is the same on final join. */
> +	if (synchro_entry->owner != fiber())
> +		fiber_wakeup(synchro_entry->owner);
>   }
>   
>   /** Process a synchro request. */
> @@ -847,14 +807,31 @@ apply_synchro_row(struct xrow_header *row)
>   	if (xrow_decode_synchro(row, &req) != 0)
>   		goto err;
>   
> -	txn_limbo_process(&txn_limbo, &req);
> -
> -	struct synchro_entry *entry;
> -	entry = synchro_entry_new(row, &req);
> -	if (entry == NULL)
> -		goto err;
> -
> -	if (journal_write_try_async(&entry->journal_entry) != 0) {
> +	struct synchro_entry entry;
> +	entry.base.rows[0] = row;
> +	journal_entry_create(&entry.base, 1, xrow_approx_len(row),
> +			     apply_synchro_row_cb, &entry);
> +	entry.req = &req;
> +	entry.owner = fiber();
> +	/*
> +	 * The WAL write is blocking. Otherwise it might happen that a CONFIRM
> +	 * or ROLLBACK is sent to WAL, and it would empty the limbo, but before
> +	 * it is written, more transactions arrive with a different owner. They
> +	 * won't be able to enter the limbo due to owner ID mismatch. Hence the
> +	 * synchro rows must block receipt of new transactions.
> +	 *
> +	 * Don't forget to return -1 both if the journal write failed right
> +	 * away, and if it failed inside of WAL thread (res < 0). Otherwise the
> +	 * caller would propagate committed vclock to this row thinking it was
> +	 * a success.
> +	 *
> +	 * XXX: in theory it could be done vice-versa. The write could be made
> +	 * non-blocking, and instead the potentially conflicting transactions
> +	 * could try to wait for all the current synchro WAL writes to end
> +	 * before trying to commit. But that requires extra steps from the
> +	 * transactions side, including the async ones.
> +	 */
> +	if (journal_write(&entry.base) != 0 || entry.base.res < 0) {
>   		diag_set(ClientError, ER_WAL_IO);
>   		goto err;
>   	}
...
>
> diff --git a/test/replication/gh-5213-qsync-applier-order.test.lua b/test/replication/gh-5213-qsync-applier-order.test.lua
> new file mode 100644
> index 000000000..3a32626f2
> --- /dev/null
> +++ b/test/replication/gh-5213-qsync-applier-order.test.lua
> @@ -0,0 +1,129 @@
> +--
> +-- gh-5213: applier used to process CONFIRM/ROLLBACK before writing them to WAL.
> +-- As a result it could happen that the transactions became visible on CONFIRM,
> +-- then somehow weren't written to WAL, and on restart the data might not be
> +-- visible again. Which means rollback of confirmed data and is not acceptable
> +-- (on the contrary with commit after rollback).
> +--
> +test_run = require('test_run').new()
> +fiber = require('fiber')
> +old_synchro_quorum = box.cfg.replication_synchro_quorum
> +old_synchro_timeout = box.cfg.replication_synchro_timeout
> +
> +box.schema.user.grant('guest', 'super')
> +
> +s = box.schema.space.create('test', {is_sync = true})
> +_ = s:create_index('pk')
> +
> +test_run:cmd('create server replica with rpl_master=default,\
> +              script="replication/gh-5213-replica.lua"')
> +test_run:cmd('start server replica')
> +
> +test_run:switch('replica')
> +assert(box.info.id == 2)
> +lsn = box.info.vclock[1]
> +
> +test_run:switch('default')
> +fiber = require('fiber')
> +box.cfg{                                                                        \
> +    replication_synchro_quorum = 3,                                             \
> +    replication_synchro_timeout = 1000,                                         \
> +}
> +f = fiber.new(function() s:replace{1} end)
> +
> +test_run:switch('replica')
> +-- Wait when the transaction is written to WAL.
> +test_run:wait_cond(function() return box.info.vclock[1] == lsn + 1 end)

This shouldn't go wrong, but I find
test_run:wait_lsn('replica', 'default') more robust.
And then you wouldn't need to save lsn above.

Same for the other usages of lsn = ... followed by
wait_cond(box.info.vclock[1] == lsn + ...).

Up to you though.

> +s = box.space.test
> +-- But not visible yet. Because MVCC is on, and no CONFIRM yet.
> +assert(s:get({1}) == nil)
> +-- Block the incoming CONFIRM to be able to ensure the data is not visible until
> +-- WAL write ends.
> +box.error.injection.set('ERRINJ_WAL_DELAY_COUNTDOWN', 0)
> +
> +test_run:switch('default')
> +box.cfg{replication_synchro_quorum = 2}
> +test_run:wait_cond(function() return f:status() == 'dead' end)
> +
> +test_run:switch('replica')
> +-- Wait when CONFIRM is in the WAL thread.
> +test_run:wait_cond(function() return box.error.injection.get('ERRINJ_WAL_DELAY') end)
> +assert(s:get({1}) == nil)
> +box.error.injection.set("ERRINJ_WAL_DELAY", false)
> +-- After CONFIRM is in WAL, the transaction is committed and its data is
> +-- visible.
> +test_run:wait_cond(function() return s:get({1}) ~= nil end)
> +
> +--
> +-- Ensure CONFIRM WAL write fail also works fine when couldn't even start a WAL
> +-- write. The data remains not confirmed until WAL write succeeds.
> +--
> +lsn = box.info.vclock[1]
> +
> +test_run:switch('default')
> +box.cfg{replication_synchro_quorum = 3}
> +f = fiber.new(function() s:replace{2} end)
> +
> +test_run:switch('replica')
> +test_run:wait_cond(function() return box.info.vclock[1] == lsn + 1 end)
> +assert(s:get({2}) == nil)
> +-- Make journal write fail immediately.
> +box.error.injection.set('ERRINJ_WAL_IO', true)
> +
> +test_run:switch('default')
> +box.cfg{replication_synchro_quorum = 2}
> +test_run:wait_cond(function() return f:status() == 'dead' end)
> +
> +test_run:switch('replica')
> +test_run:wait_upstream(1, {status = 'stopped'})
> +box.error.injection.set('ERRINJ_WAL_IO', false)
> +assert(s:get({2}) == nil)
> +-- Re-subscribe.
> +replication = box.cfg.replication
> +box.cfg{replication = {}}
> +box.cfg{replication = replication}
> +test_run:wait_cond(function() return box.info.vclock[1] == lsn + 2 end)
> +test_run:wait_upstream(1, {status = 'follow'})
> +assert(s:get({2}) ~= nil)
> +
> +--
> +-- Ensure CONFIRM WAL write fail works fine when an error happens inside WAL
> +-- thread. The data remains not confirmed until WAL write succeeds.
> +--
> +lsn = box.info.vclock[1]
> +
> +test_run:switch('default')
> +box.cfg{replication_synchro_quorum = 3}
> +f = fiber.new(function() s:replace{3} end)
> +
> +test_run:switch('replica')
> +test_run:wait_cond(function() return box.info.vclock[1] == lsn + 1 end)
> +assert(s:get({3}) == nil)
> +-- Journal write start is going to succeed, but it will fail later on return
> +-- from the WAL thread.
> +box.error.injection.set('ERRINJ_WAL_ROTATE', true)
> +
> +test_run:switch('default')
> +box.cfg{replication_synchro_quorum = 2}
> +test_run:wait_cond(function() return f:status() == 'dead' end)
> +
> +test_run:switch('replica')
> +test_run:wait_upstream(1, {status = 'stopped'})
> +box.error.injection.set('ERRINJ_WAL_ROTATE', false)
> +assert(s:get({3}) == nil)
> +box.cfg{replication = {}}
> +box.cfg{replication = replication}
> +test_run:wait_cond(function() return box.info.vclock[1] == lsn + 2 end)
> +test_run:wait_upstream(1, {status = 'follow'})
> +assert(s:get({3}) ~= nil)
> +
> +test_run:switch('default')
> +test_run:cmd('stop server replica')
> +test_run:cmd('delete server replica')
> +
> +s:drop()
> +box.schema.user.revoke('guest', 'super')
> +box.cfg{                                                                        \
> +    replication_synchro_quorum = old_synchro_quorum,                            \
> +    replication_synchro_timeout = old_synchro_timeout,                          \
> +}
> diff --git a/test/replication/gh-5213-replica.lua b/test/replication/gh-5213-replica.lua
> new file mode 100644
> index 000000000..3ba90d7ab
> --- /dev/null
> +++ b/test/replication/gh-5213-replica.lua
> @@ -0,0 +1,10 @@
> +#!/usr/bin/env tarantool
> +
> +box.cfg({
> +    listen = os.getenv("LISTEN"),
> +    replication = os.getenv("MASTER"),
> +    read_only = true,
> +    memtx_use_mvcc_engine = true,
> +})
> +
> +require('console').listen(os.getenv('ADMIN'))
> diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg
> index aff5fda26..ab25fa331 100644
> --- a/test/replication/suite.cfg
> +++ b/test/replication/suite.cfg
> @@ -15,6 +15,7 @@
>       "gh-3760-misc-return-on-quorum-0.test.lua": {},
>       "gh-4399-misc-no-failure-on-error-reading-wal.test.lua": {},
>       "gh-4424-misc-orphan-on-reconfiguration-error.test.lua": {},
> +    "gh-5213-qsync-applier-order.test.lua": {},
>       "gh-5426-election-on-off.test.lua": {},
>       "gh-5433-election-restart-recovery.test.lua": {},
>       "gh-5506-election-on-off.test.lua": {},
> diff --git a/test/replication/suite.ini b/test/replication/suite.ini
> index a9e44e8cf..a7d5f6129 100644
> --- a/test/replication/suite.ini
> +++ b/test/replication/suite.ini
> @@ -3,7 +3,7 @@ core = tarantool
>   script =  master.lua
>   description = tarantool/box, replication
>   disabled = consistent.test.lua
> -release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua qsync_advanced.test.lua qsync_errinj.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua gh-4739-vclock-assert.test.lua gh-4730-applier-rollback.test.lua gh-5140-qsync-casc-rollback.test.lua gh-5144-qsync-dup-confirm.test.lua gh-5167-qsync-rollback-snap.test.lua gh-5506-election-on-off.test.lua gh-5536-wal-limit.test.lua hang_on_synchro_fail.test.lua anon_register_gap.test.lua
> +release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua qsync_advanced.test.lua qsync_errinj.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua gh-4739-vclock-assert.test.lua gh-4730-applier-rollback.test.lua gh-5140-qsync-casc-rollback.test.lua gh-5144-qsync-dup-confirm.test.lua gh-5167-qsync-rollback-snap.test.lua gh-5506-election-on-off.test.lua gh-5536-wal-limit.test.lua hang_on_synchro_fail.test.lua anon_register_gap.test.lua gh-5213-qsync-applier-order.test.lua
>   config = suite.cfg
>   lua_libs = lua/fast_replica.lua lua/rlimit.lua
>   use_unix_sockets = True

-- 
Serge Petrenko
