[Tarantool-patches] [PATCH v4 15/16] replication: forbid implicit limbo owner transition
Serge Petrenko
sergepetrenko at tarantool.org
Wed Jul 14 21:25:43 MSK 2021
Forbid limbo ownership transition without an explicit promote.
Make it so that synchronous transactions may be committed only after it
is claimed by some instance via a PROMOTE request.
Make everyone but the limbo owner read-only even when the limbo is
empty.
Part-of #6034
@TarantoolBot document
Title: synchronous replication changes
`box.info.synchro.queue` receives a new field: `owner`. It's a replica
id of the instance owning the synchronous transaction queue.
Once some instance owns the queue, every other instance becomes
read-only. When the queue is unclaimed, e.g.
`box.info.synchro.queue.owner` is `0`, everyone may be writeable, but
cannot create synchronous transactions.
In order to claim or re-claim the queue, you have to issue
`box.ctl.promote()` on the instance you wish to promote.
When elections are enabled, the instance issues `box.ctl.promote()`
automatically once it wins the elections, no additional actions are
required.
---
src/box/errcode.h | 2 +
src/box/lua/info.c | 4 +-
src/box/txn_limbo.c | 32 ++---
test/box/alter.result | 2 +-
test/box/error.result | 2 +
test/replication/gh-5440-qsync-ro.result | 133 ---------------------
test/replication/gh-5440-qsync-ro.test.lua | 53 --------
test/replication/suite.cfg | 1 -
8 files changed, 19 insertions(+), 210 deletions(-)
delete mode 100644 test/replication/gh-5440-qsync-ro.result
delete mode 100644 test/replication/gh-5440-qsync-ro.test.lua
diff --git a/src/box/errcode.h b/src/box/errcode.h
index 49aec4bf6..d42b64ef4 100644
--- a/src/box/errcode.h
+++ b/src/box/errcode.h
@@ -278,6 +278,8 @@ struct errcode_record {
/*223 */_(ER_INTERFERING_PROMOTE, "Instance with replica id %u was promoted first") \
/*224 */_(ER_RAFT_DISABLED, "Elections were turned off while running box.ctl.promote()")\
/*225 */_(ER_TXN_ROLLBACK, "Transaction was rolled back") \
+ /*226 */_(ER_SYNC_QUEUE_UNCLAIMED, "The synchronous transaction queue doesn't belong to any instance")\
+ /*227 */_(ER_SYNC_QUEUE_FOREIGN, "The synchronous transaction queue belongs to other instance with id %u")\
/*
* !IMPORTANT! Please follow instructions at start of the file
diff --git a/src/box/lua/info.c b/src/box/lua/info.c
index 1d8fe7938..ef3d6f5b2 100644
--- a/src/box/lua/info.c
+++ b/src/box/lua/info.c
@@ -615,9 +615,11 @@ lbox_info_synchro(struct lua_State *L)
/* Queue information. */
struct txn_limbo *queue = &txn_limbo;
- lua_createtable(L, 0, 1);
+ lua_createtable(L, 0, 2);
lua_pushnumber(L, queue->len);
lua_setfield(L, -2, "len");
+ lua_pushnumber(L, queue->owner_id);
+ lua_setfield(L, -2, "owner");
lua_setfield(L, -2, "queue");
return 1;
diff --git a/src/box/txn_limbo.c b/src/box/txn_limbo.c
index 991c47698..c8c4f587c 100644
--- a/src/box/txn_limbo.c
+++ b/src/box/txn_limbo.c
@@ -55,7 +55,8 @@ txn_limbo_create(struct txn_limbo *limbo)
bool
txn_limbo_is_ro(struct txn_limbo *limbo)
{
- return limbo->owner_id != instance_id && !txn_limbo_is_empty(limbo);
+ return limbo->owner_id != REPLICA_ID_NIL &&
+ limbo->owner_id != instance_id;
}
struct txn_limbo_entry *
@@ -95,18 +96,18 @@ txn_limbo_append(struct txn_limbo *limbo, uint32_t id, struct txn *txn)
}
if (id == 0)
id = instance_id;
- bool make_ro = false;
- if (limbo->owner_id != id) {
- if (rlist_empty(&limbo->queue)) {
- limbo->owner_id = id;
- limbo->confirmed_lsn = 0;
- if (id != instance_id)
- make_ro = true;
+ if (limbo->owner_id == REPLICA_ID_NIL) {
+ diag_set(ClientError, ER_SYNC_QUEUE_UNCLAIMED);
+ return NULL;
+ } else if (limbo->owner_id != id) {
+ if (txn_limbo_is_empty(limbo)) {
+ diag_set(ClientError, ER_SYNC_QUEUE_FOREIGN,
+ limbo->owner_id);
} else {
diag_set(ClientError, ER_UNCOMMITTED_FOREIGN_SYNC_TXNS,
limbo->owner_id);
- return NULL;
}
+ return NULL;
}
size_t size;
struct txn_limbo_entry *e = region_alloc_object(&txn->region,
@@ -122,12 +123,6 @@ txn_limbo_append(struct txn_limbo *limbo, uint32_t id, struct txn *txn)
e->is_rollback = false;
rlist_add_tail_entry(&limbo->queue, e, in_queue);
limbo->len++;
- /*
- * We added new entries from a remote instance to an empty limbo.
- * Time to make this instance read-only.
- */
- if (make_ro)
- box_update_ro_summary();
return e;
}
@@ -442,9 +437,6 @@ txn_limbo_read_confirm(struct txn_limbo *limbo, int64_t lsn)
assert(e->txn->signature >= 0);
txn_complete_success(e->txn);
}
- /* Update is_ro once the limbo is clear. */
- if (txn_limbo_is_empty(limbo))
- box_update_ro_summary();
}
/**
@@ -492,9 +484,6 @@ txn_limbo_read_rollback(struct txn_limbo *limbo, int64_t lsn)
if (e == last_rollback)
break;
}
- /* Update is_ro once the limbo is clear. */
- if (txn_limbo_is_empty(limbo))
- box_update_ro_summary();
}
void
@@ -525,6 +514,7 @@ txn_limbo_read_promote(struct txn_limbo *limbo, uint32_t replica_id,
txn_limbo_read_rollback(limbo, lsn + 1);
assert(txn_limbo_is_empty(&txn_limbo));
limbo->owner_id = replica_id;
+ box_update_ro_summary();
limbo->confirmed_lsn = 0;
}
diff --git a/test/box/alter.result b/test/box/alter.result
index a7bffce10..6a64f6b84 100644
--- a/test/box/alter.result
+++ b/test/box/alter.result
@@ -1464,7 +1464,7 @@ assert(s.is_sync)
...
s:replace{1}
---
-- error: Quorum collection for a synchronous transaction is timed out
+- error: The synchronous transaction queue doesn't belong to any instance
...
-- When not specified or nil - ignored.
s:alter({is_sync = nil})
diff --git a/test/box/error.result b/test/box/error.result
index 062a90399..8f783f927 100644
--- a/test/box/error.result
+++ b/test/box/error.result
@@ -444,6 +444,8 @@ t;
| 223: box.error.INTERFERING_PROMOTE
| 224: box.error.RAFT_DISABLED
| 225: box.error.TXN_ROLLBACK
+ | 226: box.error.SYNC_QUEUE_UNCLAIMED
+ | 227: box.error.SYNC_QUEUE_FOREIGN
| ...
test_run:cmd("setopt delimiter ''");
diff --git a/test/replication/gh-5440-qsync-ro.result b/test/replication/gh-5440-qsync-ro.result
deleted file mode 100644
index 1ece26a42..000000000
--- a/test/replication/gh-5440-qsync-ro.result
+++ /dev/null
@@ -1,133 +0,0 @@
--- test-run result file version 2
---
--- gh-5440 everyone but the limbo owner is read-only on non-empty limbo.
---
-env = require('test_run')
- | ---
- | ...
-test_run = env.new()
- | ---
- | ...
-fiber = require('fiber')
- | ---
- | ...
-
-box.schema.user.grant('guest', 'replication')
- | ---
- | ...
-test_run:cmd('create server replica with rpl_master=default, script="replication/replica.lua"')
- | ---
- | - true
- | ...
-test_run:cmd('start server replica with wait=True, wait_load=True')
- | ---
- | - true
- | ...
-
-_ = box.schema.space.create('test', {is_sync=true})
- | ---
- | ...
-_ = box.space.test:create_index('pk')
- | ---
- | ...
-
-old_synchro_quorum = box.cfg.replication_synchro_quorum
- | ---
- | ...
-old_synchro_timeout = box.cfg.replication_synchro_timeout
- | ---
- | ...
-
--- Make sure that the master stalls on commit leaving the limbo non-empty.
-box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=1000}
- | ---
- | ...
-
-f = fiber.new(function() box.space.test:insert{1} end)
- | ---
- | ...
-f:status()
- | ---
- | - suspended
- | ...
-
--- Wait till replica's limbo is non-empty.
-test_run:wait_lsn('replica', 'default')
- | ---
- | ...
-test_run:cmd('switch replica')
- | ---
- | - true
- | ...
-
-box.info.ro
- | ---
- | - true
- | ...
-box.space.test:insert{2}
- | ---
- | - error: Can't modify data because this instance is in read-only mode.
- | ...
-success = false
- | ---
- | ...
-f = require('fiber').new(function() box.ctl.wait_rw() success = true end)
- | ---
- | ...
-f:status()
- | ---
- | - suspended
- | ...
-
-test_run:cmd('switch default')
- | ---
- | - true
- | ...
-
--- Empty the limbo.
-box.cfg{replication_synchro_quorum=2}
- | ---
- | ...
-
-test_run:cmd('switch replica')
- | ---
- | - true
- | ...
-
-test_run:wait_cond(function() return success end)
- | ---
- | - true
- | ...
-box.info.ro
- | ---
- | - false
- | ...
--- Should succeed now.
-box.space.test:insert{2}
- | ---
- | - [2]
- | ...
-
--- Cleanup.
-test_run:cmd('switch default')
- | ---
- | - true
- | ...
-box.cfg{replication_synchro_quorum=old_synchro_quorum,\
- replication_synchro_timeout=old_synchro_timeout}
- | ---
- | ...
-box.space.test:drop()
- | ---
- | ...
-test_run:cmd('stop server replica')
- | ---
- | - true
- | ...
-test_run:cmd('delete server replica')
- | ---
- | - true
- | ...
-box.schema.user.revoke('guest', 'replication')
- | ---
- | ...
diff --git a/test/replication/gh-5440-qsync-ro.test.lua b/test/replication/gh-5440-qsync-ro.test.lua
deleted file mode 100644
index d63ec9c1e..000000000
--- a/test/replication/gh-5440-qsync-ro.test.lua
+++ /dev/null
@@ -1,53 +0,0 @@
---
--- gh-5440 everyone but the limbo owner is read-only on non-empty limbo.
---
-env = require('test_run')
-test_run = env.new()
-fiber = require('fiber')
-
-box.schema.user.grant('guest', 'replication')
-test_run:cmd('create server replica with rpl_master=default, script="replication/replica.lua"')
-test_run:cmd('start server replica with wait=True, wait_load=True')
-
-_ = box.schema.space.create('test', {is_sync=true})
-_ = box.space.test:create_index('pk')
-
-old_synchro_quorum = box.cfg.replication_synchro_quorum
-old_synchro_timeout = box.cfg.replication_synchro_timeout
-
--- Make sure that the master stalls on commit leaving the limbo non-empty.
-box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=1000}
-
-f = fiber.new(function() box.space.test:insert{1} end)
-f:status()
-
--- Wait till replica's limbo is non-empty.
-test_run:wait_lsn('replica', 'default')
-test_run:cmd('switch replica')
-
-box.info.ro
-box.space.test:insert{2}
-success = false
-f = require('fiber').new(function() box.ctl.wait_rw() success = true end)
-f:status()
-
-test_run:cmd('switch default')
-
--- Empty the limbo.
-box.cfg{replication_synchro_quorum=2}
-
-test_run:cmd('switch replica')
-
-test_run:wait_cond(function() return success end)
-box.info.ro
--- Should succeed now.
-box.space.test:insert{2}
-
--- Cleanup.
-test_run:cmd('switch default')
-box.cfg{replication_synchro_quorum=old_synchro_quorum,\
- replication_synchro_timeout=old_synchro_timeout}
-box.space.test:drop()
-test_run:cmd('stop server replica')
-test_run:cmd('delete server replica')
-box.schema.user.revoke('guest', 'replication')
diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg
index 6f42db081..7a6fd7052 100644
--- a/test/replication/suite.cfg
+++ b/test/replication/suite.cfg
@@ -42,7 +42,6 @@
"gh-4739-vclock-assert.test.lua": {},
"gh-4730-applier-rollback.test.lua": {},
"gh-4928-tx-boundaries.test.lua": {},
- "gh-5440-qsync-ro.test.lua": {},
"gh-5435-qsync-clear-synchro-queue-commit-all.test.lua": {},
"gh-5536-wal-limit.test.lua": {},
"gh-5566-final-join-synchro.test.lua": {},
--
2.30.1 (Apple Git-130)
More information about the Tarantool-patches
mailing list