[tarantool-patches] [PATCH v2] Emit NOPs in case of a replication conflict

Georgy Kirichenko georgy at tarantool.org
Wed Feb 13 10:37:02 MSK 2019


The applier writes a NOP if it is unable to apply a master's row
because of a conflict and the replication_skip_conflict option is set.
This prevents the applier from reapplying already skipped rows after
a restart.

Closes: #3977

Changes in v2:
 - Comment fixed according to review

Issue: https://github.com/tarantool/tarantool/issues/3977
Branch: https://github.com/tarantool/tarantool/tree/g.kirichenko/gh-3977-emit-nop-for-applier-conflict
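
For reference, the conflict-skipping behaviour is gated by the
replication_skip_conflict option on the replica. A minimal sketch of how
it is enabled and observed (illustration only, not part of the patch;
assumes an already configured replica whose master has replica id 1):

    box.cfg{replication_skip_conflict = true}
    -- a conflicting master row is replaced with a NOP in the local WAL,
    -- so the master's vclock component still advances on the replica
    box.info.vclock[1]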
---
 src/box/applier.cc                          | 25 ++++--
 test/replication/skip_conflict_row.result   | 98 +++++++++++++++++++++
 test/replication/skip_conflict_row.test.lua | 34 +++++++
 3 files changed, 150 insertions(+), 7 deletions(-)

diff --git a/src/box/applier.cc b/src/box/applier.cc
index 7f37fe2ee..00da43553 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -544,19 +544,30 @@ applier_subscribe(struct applier *applier)
 			int res = xstream_write(applier->subscribe_stream, &row);
 			if (res != 0) {
 				struct error *e = diag_last_error(diag_get());
-				/**
-				 * Silently skip ER_TUPLE_FOUND error if such
-				 * option is set in config.
+				/*
+				 * In case of ER_TUPLE_FOUND error and enabled
+				 * replication_skip_conflict configuration
+				 * option, skip applying the foreign row and
+				 * replace it with NOP in the local write ahead
+				 * log.
 				 */
 				if (e->type == &type_ClientError &&
 				    box_error_code(e) == ER_TUPLE_FOUND &&
-				    replication_skip_conflict)
+				    replication_skip_conflict) {
 					diag_clear(diag_get());
-				else {
-					latch_unlock(latch);
-					diag_raise();
+					res = 0;
+					struct xrow_header nop;
+					nop.type = IPROTO_NOP;
+					nop.bodycnt = 0;
+					nop.replica_id = row.replica_id;
+					nop.lsn = row.lsn;
+					res = xstream_write(applier->subscribe_stream, &nop);
 				}
 			}
+			if (res != 0) {
+				latch_unlock(latch);
+				diag_raise();
+			}
 		}
 		latch_unlock(latch);
 
diff --git a/test/replication/skip_conflict_row.result b/test/replication/skip_conflict_row.result
index bcbbbcc34..34be807eb 100644
--- a/test/replication/skip_conflict_row.result
+++ b/test/replication/skip_conflict_row.result
@@ -141,6 +141,104 @@ box.info.replication[1].upstream.message
 ---
 - Duplicate key exists in unique index 'primary' in space 'test'
 ...
+replication = box.cfg.replication
+---
+...
+box.cfg{replication_skip_conflict = true, replication = {}}
+---
+...
+box.cfg{replication = replication}
+---
+...
+test_run:cmd("switch default")
+---
+- true
+...
+-- test that NOPs were really written
+box.space.test:truncate()
+---
+...
+test_run:cmd("restart server replica")
+---
+- true
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+box.info.replication[1].upstream.status
+---
+- follow
+...
+-- write some conflicting records on slave
+for i = 1, 10 do box.space.test:insert({i, 'r'}) end
+---
+...
+box.cfg{replication_skip_conflict = true}
+---
+...
+v1 = box.info.vclock[1]
+---
+...
+-- write some conflicting records on master
+test_run:cmd("switch default")
+---
+- true
+...
+for i = 1, 10 do box.space.test:insert({i, 'm'}) end
+---
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+-- lsn should be incremented
+v1 == box.info.vclock[1] - 10
+---
+- true
+...
+-- and state is follow
+box.info.replication[1].upstream.status
+---
+- follow
+...
+-- restart server and check replication continues from nop-ed vclock
+test_run:cmd("switch default")
+---
+- true
+...
+test_run:cmd("stop server replica")
+---
+- true
+...
+for i = 11, 20 do box.space.test:insert({i, 'm'}) end
+---
+...
+test_run:cmd("start server replica")
+---
+- true
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+box.info.replication[1].upstream.status
+---
+- follow
+...
+box.space.test:select({11}, {iterator = "GE"})
+---
+- - [11, 'm']
+  - [12, 'm']
+  - [13, 'm']
+  - [14, 'm']
+  - [15, 'm']
+  - [16, 'm']
+  - [17, 'm']
+  - [18, 'm']
+  - [19, 'm']
+  - [20, 'm']
+...
 test_run:cmd("switch default")
 ---
 - true
diff --git a/test/replication/skip_conflict_row.test.lua b/test/replication/skip_conflict_row.test.lua
index 3a9076b39..b7fabd012 100644
--- a/test/replication/skip_conflict_row.test.lua
+++ b/test/replication/skip_conflict_row.test.lua
@@ -46,8 +46,42 @@ test_run:cmd("switch default")
 test_run:cmd("restart server replica")
 -- applier is not in follow state
 box.info.replication[1].upstream.message
+
+replication = box.cfg.replication
+box.cfg{replication_skip_conflict = true, replication = {}}
+box.cfg{replication = replication}
+test_run:cmd("switch default")
+
+-- test that NOPs were really written
+box.space.test:truncate()
+test_run:cmd("restart server replica")
+test_run:cmd("switch replica")
+box.info.replication[1].upstream.status
+-- write some conflicting records on slave
+for i = 1, 10 do box.space.test:insert({i, 'r'}) end
+box.cfg{replication_skip_conflict = true}
+v1 = box.info.vclock[1]
+
+-- write some conflicting records on master
+test_run:cmd("switch default")
+for i = 1, 10 do box.space.test:insert({i, 'm'}) end
+
+test_run:cmd("switch replica")
+-- lsn should be incremented
+v1 == box.info.vclock[1] - 10
+-- and state is follow
+box.info.replication[1].upstream.status
+
+-- restart server and check replication continues from nop-ed vclock
 test_run:cmd("switch default")
+test_run:cmd("stop server replica")
+for i = 11, 20 do box.space.test:insert({i, 'm'}) end
+test_run:cmd("start server replica")
+test_run:cmd("switch replica")
+box.info.replication[1].upstream.status
+box.space.test:select({11}, {iterator = "GE"})
 
+test_run:cmd("switch default")
 -- cleanup
 test_run:cmd("stop server replica")
 test_run:cmd("cleanup server replica")
-- 
2.20.1
