Tarantool development patches archive
* [tarantool-patches] [PATCH 0/3] Promote vclock only for successful writes
@ 2019-02-06  8:29 Georgy Kirichenko
  2019-02-06  8:29 ` [tarantool-patches] [PATCH 1/3] Do not promote wal vclock for failed writes Georgy Kirichenko
                   ` (3 more replies)
  0 siblings, 4 replies; 8+ messages in thread
From: Georgy Kirichenko @ 2019-02-06  8:29 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Georgy Kirichenko

Promote the tx and WAL vclocks only if the corresponding row was
successfully written to the WAL. This prevents Tarantool from creating
LSN gaps in case of a WAL error, as well as from skipping rows in case
of replication errors.

Branch: https://github.com/tarantool/tarantool/tree/g.kirichenko/gh-2283-dont-skip-rows-after-errors
Issue: https://github.com/tarantool/tarantool/issues/2283

Georgy Kirichenko (3):
  Do not promote wal vclock for failed writes
  Enforce applier out of order protection
  Promote replicaset.vclock only after wal

 src/box/applier.cc                          |  46 +--
 src/box/wal.c                               |  57 ++-
 test/box/errinj.result                      |  23 ++
 test/box/errinj.test.lua                    |   8 +
 test/replication/skip_conflict_row.test.lua |  19 +
 test/xlog-py/dup_key.result                 |  12 +-
 test/xlog-py/dup_key.test.py                |  23 +-
 test/xlog/errinj.result                     |   1 -
 test/xlog/panic_on_lsn_gap.result           | 377 --------------------
 test/xlog/panic_on_lsn_gap.test.lua         | 136 -------
 10 files changed, 116 insertions(+), 586 deletions(-)
 delete mode 100644 test/xlog/panic_on_lsn_gap.result
 delete mode 100644 test/xlog/panic_on_lsn_gap.test.lua

-- 
2.20.1

* [tarantool-patches] [PATCH 1/3] Do not promote wal vclock for failed writes
  2019-02-06  8:29 [tarantool-patches] [PATCH 0/3] Promote vclock only for successful writes Georgy Kirichenko
@ 2019-02-06  8:29 ` Georgy Kirichenko
  2019-02-06 13:56   ` Vladimir Davydov
  2019-02-06  8:29 ` [tarantool-patches] [PATCH 2/3] Enforce applier out of order protection Georgy Kirichenko
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 8+ messages in thread
From: Georgy Kirichenko @ 2019-02-06  8:29 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Georgy Kirichenko

Increase the replica LSN only if the row was successfully written to
disk. This prevents the WAL from having LSN gaps in case of IO errors.

Obsoletes xlog/panic_on_lsn_gap.test.

Needs for #2283
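
In outline, the change amounts to the following. This is a simplified,
self-contained sketch rather than the actual wal.c code: the
single-component vclock type and the write_row() helper below are
stand-ins for the real WAL machinery.

#include <stdint.h>

struct vclock { int64_t lsn; };		/* single-component stand-in */

static int64_t vclock_inc(struct vclock *v) { return ++v->lsn; }

/* Pretend disk write; returns 0 on success, -1 on an injected error. */
static int write_row(int64_t lsn, int fail) { (void)lsn; return fail ? -1 : 0; }

static int64_t
wal_write_sketch(struct vclock *writer_vclock, int fail)
{
	/* Assign the LSN against a local copy, not the writer vclock. */
	struct vclock vclock = *writer_vclock;
	int64_t lsn = vclock_inc(&vclock);
	if (write_row(lsn, fail) != 0)
		return -1;	/* failed write: no LSN gap is left behind */
	/* Publish the new vclock only after the row has reached disk. */
	*writer_vclock = vclock;
	return lsn;
}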
---
 src/box/wal.c                       |  19 +-
 test/box/errinj.result              |  23 ++
 test/box/errinj.test.lua            |   8 +
 test/xlog/errinj.result             |   1 -
 test/xlog/panic_on_lsn_gap.result   | 377 ----------------------------
 test/xlog/panic_on_lsn_gap.test.lua | 136 ----------
 6 files changed, 44 insertions(+), 520 deletions(-)
 delete mode 100644 test/xlog/panic_on_lsn_gap.result
 delete mode 100644 test/xlog/panic_on_lsn_gap.test.lua

diff --git a/src/box/wal.c b/src/box/wal.c
index 3b50d3629..a55b544aa 100644
--- a/src/box/wal.c
+++ b/src/box/wal.c
@@ -901,16 +901,16 @@ wal_writer_begin_rollback(struct wal_writer *writer)
 }
 
 static void
-wal_assign_lsn(struct wal_writer *writer, struct xrow_header **row,
+wal_assign_lsn(struct vclock *vclock, struct xrow_header **row,
 	       struct xrow_header **end)
 {
 	/** Assign LSN to all local rows. */
 	for ( ; row < end; row++) {
 		if ((*row)->replica_id == 0) {
-			(*row)->lsn = vclock_inc(&writer->vclock, instance_id);
+			(*row)->lsn = vclock_inc(vclock, instance_id);
 			(*row)->replica_id = instance_id;
 		} else {
-			vclock_follow_xrow(&writer->vclock, *row);
+			vclock_follow_xrow(vclock, *row);
 		}
 	}
 }
@@ -922,6 +922,11 @@ wal_write_to_disk(struct cmsg *msg)
 	struct wal_msg *wal_msg = (struct wal_msg *) msg;
 	struct error *error;
 
+	/* Local vclock copy. */
+	struct vclock vclock;
+	vclock_create(&vclock);
+	vclock_copy(&vclock, &writer->vclock);
+
 	struct errinj *inj = errinj(ERRINJ_WAL_DELAY, ERRINJ_BOOL);
 	while (inj != NULL && inj->bparam)
 		usleep(10);
@@ -974,14 +979,15 @@ wal_write_to_disk(struct cmsg *msg)
 	struct journal_entry *entry;
 	struct stailq_entry *last_committed = NULL;
 	stailq_foreach_entry(entry, &wal_msg->commit, fifo) {
-		wal_assign_lsn(writer, entry->rows, entry->rows + entry->n_rows);
-		entry->res = vclock_sum(&writer->vclock);
+		wal_assign_lsn(&vclock, entry->rows, entry->rows + entry->n_rows);
+		entry->res = vclock_sum(&vclock);
 		rc = xlog_write_entry(l, entry);
 		if (rc < 0)
 			goto done;
 		if (rc > 0) {
 			writer->checkpoint_wal_size += rc;
 			last_committed = &entry->fifo;
+			vclock_copy(&writer->vclock, &vclock);
 		}
 		/* rc == 0: the write is buffered in xlog_tx */
 	}
@@ -991,6 +997,7 @@ wal_write_to_disk(struct cmsg *msg)
 
 	writer->checkpoint_wal_size += rc;
 	last_committed = stailq_last(&wal_msg->commit);
+	vclock_copy(&writer->vclock, &vclock);
 
 	/*
 	 * Notify TX if the checkpoint threshold has been exceeded.
@@ -1185,7 +1192,7 @@ wal_write_in_wal_mode_none(struct journal *journal,
 			   struct journal_entry *entry)
 {
 	struct wal_writer *writer = (struct wal_writer *) journal;
-	wal_assign_lsn(writer, entry->rows, entry->rows + entry->n_rows);
+	wal_assign_lsn(&writer->vclock, entry->rows, entry->rows + entry->n_rows);
 	int64_t old_lsn = vclock_get(&replicaset.vclock, instance_id);
 	int64_t new_lsn = vclock_get(&writer->vclock, instance_id);
 	if (new_lsn > old_lsn) {
diff --git a/test/box/errinj.result b/test/box/errinj.result
index 12303670e..1d9a16d8d 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -141,6 +141,15 @@ errinj.set("ERRINJ_TESTING", false)
 ---
 - ok
 ...
+env = require('test_run')
+---
+...
+test_run = env.new()
+---
+...
+lsn1 = box.info.vclock[box.info.id]
+---
+...
 -- Check how well we handle a failed log write
 errinj.set("ERRINJ_WAL_IO", true)
 ---
@@ -161,6 +170,11 @@ space:insert{1}
 ---
 - [1]
 ...
+-- Check vclock was promoted only one time
+box.info.vclock[box.info.id] == lsn1 + 1
+---
+- true
+...
 errinj.set("ERRINJ_WAL_IO", true)
 ---
 - ok
@@ -180,6 +194,15 @@ errinj.set("ERRINJ_WAL_IO", false)
 ---
 - ok
 ...
+space:update(1, {{'=', 2, 2}})
+---
+- [1, 2]
+...
+-- Check vclock was promoted only two times
+box.info.vclock[box.info.id] == lsn1 + 2
+---
+- true
+...
 space:truncate()
 ---
 ...
diff --git a/test/box/errinj.test.lua b/test/box/errinj.test.lua
index 6f491d623..566630f67 100644
--- a/test/box/errinj.test.lua
+++ b/test/box/errinj.test.lua
@@ -12,17 +12,25 @@ errinj.set("ERRINJ_TESTING", true)
 space:select{222444}
 errinj.set("ERRINJ_TESTING", false)
 
+env = require('test_run')
+test_run = env.new()
+lsn1 = box.info.vclock[box.info.id]
 -- Check how well we handle a failed log write
 errinj.set("ERRINJ_WAL_IO", true)
 space:insert{1}
 space:get{1}
 errinj.set("ERRINJ_WAL_IO", false)
 space:insert{1}
+-- Check vclock was promoted only one time
+box.info.vclock[box.info.id] == lsn1 + 1
 errinj.set("ERRINJ_WAL_IO", true)
 space:update(1, {{'=', 2, 2}})
 space:get{1}
 space:get{2}
 errinj.set("ERRINJ_WAL_IO", false)
+space:update(1, {{'=', 2, 2}})
+-- Check vclock was promoted only two times
+box.info.vclock[box.info.id] == lsn1 + 2
 space:truncate()
 
 -- Check a failed log rotation
diff --git a/test/xlog/errinj.result b/test/xlog/errinj.result
index 390404b47..7f15bef35 100644
--- a/test/xlog/errinj.result
+++ b/test/xlog/errinj.result
@@ -43,7 +43,6 @@ require('fio').glob(name .. "/*.xlog")
 ---
 - - xlog/00000000000000000000.xlog
   - xlog/00000000000000000001.xlog
-  - xlog/00000000000000000002.xlog
 ...
 test_run:cmd('restart server default with cleanup=1')
 -- gh-881 iproto request with wal IO error
diff --git a/test/xlog/panic_on_lsn_gap.result b/test/xlog/panic_on_lsn_gap.result
deleted file mode 100644
index 4dd1291f8..000000000
--- a/test/xlog/panic_on_lsn_gap.result
+++ /dev/null
@@ -1,377 +0,0 @@
---
--- we actually need to know what xlogs the server creates,
--- so start from a clean state
---
---
--- Check how the server is able to find the next
--- xlog if there are failed writes (lsn gaps).
---
-env = require('test_run')
----
-...
-test_run = env.new()
----
-...
-test_run:cmd("create server panic with script='xlog/panic.lua'")
----
-- true
-...
-test_run:cmd("start server panic")
----
-- true
-...
-test_run:cmd("switch panic")
----
-- true
-...
-box.info.vclock
----
-- {}
-...
-s = box.space._schema
----
-...
--- we need to have at least one record in the
--- xlog otherwise the server believes that there
--- is an lsn gap during recovery.
---
-s:replace{"key", 'test 1'}
----
-- ['key', 'test 1']
-...
-box.info.vclock
----
-- {1: 1}
-...
-box.error.injection.set("ERRINJ_WAL_WRITE", true)
----
-- ok
-...
-t = {}
----
-...
---
--- Try to insert rows, so that it's time to
--- switch WALs. No switch will happen though,
--- since no writes were made.
---
-test_run:cmd("setopt delimiter ';'")
----
-- true
-...
-for i=1,box.cfg.rows_per_wal do
-    status, msg = pcall(s.replace, s, {"key"})
-    table.insert(t, msg)
-end;
----
-...
-test_run:cmd("setopt delimiter ''");
----
-- true
-...
-t
----
-- - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-...
---
--- Before restart: our LSN is 1, because
--- LSN is promoted in tx only on successful
--- WAL write.
---
-name = string.match(arg[0], "([^,]+)%.lua")
----
-...
-box.info.vclock
----
-- {1: 1}
-...
-require('fio').glob(name .. "/*.xlog")
----
-- - panic/00000000000000000000.xlog
-...
-test_run:cmd("restart server panic")
---
--- After restart: our LSN is the LSN of the
--- last empty WAL created on shutdown, i.e. 11.
---
-box.info.vclock
----
-- {1: 11}
-...
-box.space._schema:select{'key'}
----
-- - ['key', 'test 1']
-...
-box.error.injection.set("ERRINJ_WAL_WRITE", true)
----
-- ok
-...
-t = {}
----
-...
-s = box.space._schema
----
-...
---
--- now do the same
---
-test_run:cmd("setopt delimiter ';'")
----
-- true
-...
-for i=1,box.cfg.rows_per_wal do
-    status, msg = pcall(s.replace, s, {"key"})
-    table.insert(t, msg)
-end;
----
-...
-test_run:cmd("setopt delimiter ''");
----
-- true
-...
-t
----
-- - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-...
-box.info.vclock
----
-- {1: 11}
-...
-box.error.injection.set("ERRINJ_WAL_WRITE", false)
----
-- ok
-...
---
--- Write a good row after a series of failed
--- rows. There is a gap in LSN, correct,
--- but it's *inside* a single WAL, so doesn't
--- affect WAL search in recover_remaining_wals()
---
-s:replace{'key', 'test 2'}
----
-- ['key', 'test 2']
-...
---
--- notice that vclock before and after
--- server stop is the same -- because it's
--- recorded in the last row
---
-box.info.vclock
----
-- {1: 22}
-...
-test_run:cmd("restart server panic")
-box.info.vclock
----
-- {1: 22}
-...
-box.space._schema:select{'key'}
----
-- - ['key', 'test 2']
-...
--- list all the logs
-name = string.match(arg[0], "([^,]+)%.lua")
----
-...
-require('fio').glob(name .. "/*.xlog")
----
-- - panic/00000000000000000000.xlog
-  - panic/00000000000000000011.xlog
-  - panic/00000000000000000022.xlog
-...
--- now insert 10 rows - so that the next
--- row will need to switch the WAL
-test_run:cmd("setopt delimiter ';'")
----
-- true
-...
-for i=1,box.cfg.rows_per_wal do
-    box.space._schema:replace{"key", 'test 3'}
-end;
----
-...
-test_run:cmd("setopt delimiter ''");
----
-- true
-...
--- the next insert should switch xlog, but aha - it fails
--- a new xlog file is created but has 0 rows
-require('fio').glob(name .. "/*.xlog")
----
-- - panic/00000000000000000000.xlog
-  - panic/00000000000000000011.xlog
-  - panic/00000000000000000022.xlog
-...
-box.error.injection.set("ERRINJ_WAL_WRITE", true)
----
-- ok
-...
-box.space._schema:replace{"key", 'test 3'}
----
-- error: Failed to write to disk
-...
-box.info.vclock
----
-- {1: 32}
-...
-require('fio').glob(name .. "/*.xlog")
----
-- - panic/00000000000000000000.xlog
-  - panic/00000000000000000011.xlog
-  - panic/00000000000000000022.xlog
-  - panic/00000000000000000032.xlog
-...
--- and the next one (just to be sure
-box.space._schema:replace{"key", 'test 3'}
----
-- error: Failed to write to disk
-...
-box.info.vclock
----
-- {1: 32}
-...
-require('fio').glob(name .. "/*.xlog")
----
-- - panic/00000000000000000000.xlog
-  - panic/00000000000000000011.xlog
-  - panic/00000000000000000022.xlog
-  - panic/00000000000000000032.xlog
-...
-box.error.injection.set("ERRINJ_WAL_WRITE", false)
----
-- ok
-...
--- then a success
-box.space._schema:replace{"key", 'test 4'}
----
-- ['key', 'test 4']
-...
-box.info.vclock
----
-- {1: 35}
-...
-require('fio').glob(name .. "/*.xlog")
----
-- - panic/00000000000000000000.xlog
-  - panic/00000000000000000011.xlog
-  - panic/00000000000000000022.xlog
-  - panic/00000000000000000032.xlog
-...
--- restart is ok
-test_run:cmd("restart server panic")
-box.space._schema:select{'key'}
----
-- - ['key', 'test 4']
-...
---
--- Check that if there's an LSN gap between two WALs
--- that appeared due to a disk error and no files is
--- actually missing, we won't panic on recovery.
---
-box.space._schema:replace{'key', 'test 4'} -- creates new WAL
----
-- ['key', 'test 4']
-...
-box.error.injection.set("ERRINJ_WAL_WRITE_DISK", true)
----
-- ok
-...
-box.space._schema:replace{'key', 'test 5'} -- fails, makes gap
----
-- error: Failed to write to disk
-...
-box.snapshot() -- fails, rotates WAL
----
-- error: Error injection 'xlog write injection'
-...
-box.error.injection.set("ERRINJ_WAL_WRITE_DISK", false)
----
-- ok
-...
-box.space._schema:replace{'key', 'test 5'} -- creates new WAL
----
-- ['key', 'test 5']
-...
-box.error.injection.set("ERRINJ_WAL_WRITE_DISK", true)
----
-- ok
-...
-box.space._schema:replace{'key', 'test 6'} -- fails, makes gap
----
-- error: Failed to write to disk
-...
-box.snapshot() -- fails, rotates WAL
----
-- error: Error injection 'xlog write injection'
-...
-box.space._schema:replace{'key', 'test 6'} -- fails, creates empty WAL
----
-- error: Failed to write to disk
-...
-name = string.match(arg[0], "([^,]+)%.lua")
----
-...
-require('fio').glob(name .. "/*.xlog")
----
-- - panic/00000000000000000000.xlog
-  - panic/00000000000000000011.xlog
-  - panic/00000000000000000022.xlog
-  - panic/00000000000000000032.xlog
-  - panic/00000000000000000035.xlog
-  - panic/00000000000000000037.xlog
-  - panic/00000000000000000039.xlog
-...
-test_run:cmd("restart server panic")
-box.space._schema:select{'key'}
----
-- - ['key', 'test 5']
-...
--- Check that we don't create a WAL in the gap between the last two.
-box.space._schema:replace{'key', 'test 6'}
----
-- ['key', 'test 6']
-...
-name = string.match(arg[0], "([^,]+)%.lua")
----
-...
-require('fio').glob(name .. "/*.xlog")
----
-- - panic/00000000000000000000.xlog
-  - panic/00000000000000000011.xlog
-  - panic/00000000000000000022.xlog
-  - panic/00000000000000000032.xlog
-  - panic/00000000000000000035.xlog
-  - panic/00000000000000000037.xlog
-  - panic/00000000000000000039.xlog
-  - panic/00000000000000000040.xlog
-...
-test_run:cmd('switch default')
----
-- true
-...
-test_run:cmd("stop server panic")
----
-- true
-...
-test_run:cmd("cleanup server panic")
----
-- true
-...
diff --git a/test/xlog/panic_on_lsn_gap.test.lua b/test/xlog/panic_on_lsn_gap.test.lua
deleted file mode 100644
index 6221261a7..000000000
--- a/test/xlog/panic_on_lsn_gap.test.lua
+++ /dev/null
@@ -1,136 +0,0 @@
---
--- we actually need to know what xlogs the server creates,
--- so start from a clean state
---
---
--- Check how the server is able to find the next
--- xlog if there are failed writes (lsn gaps).
---
-env = require('test_run')
-test_run = env.new()
-test_run:cmd("create server panic with script='xlog/panic.lua'")
-test_run:cmd("start server panic")
-test_run:cmd("switch panic")
-box.info.vclock
-s = box.space._schema
--- we need to have at least one record in the
--- xlog otherwise the server believes that there
--- is an lsn gap during recovery.
---
-s:replace{"key", 'test 1'}
-box.info.vclock
-box.error.injection.set("ERRINJ_WAL_WRITE", true)
-t = {}
---
--- Try to insert rows, so that it's time to
--- switch WALs. No switch will happen though,
--- since no writes were made.
---
-test_run:cmd("setopt delimiter ';'")
-for i=1,box.cfg.rows_per_wal do
-    status, msg = pcall(s.replace, s, {"key"})
-    table.insert(t, msg)
-end;
-test_run:cmd("setopt delimiter ''");
-t
---
--- Before restart: our LSN is 1, because
--- LSN is promoted in tx only on successful
--- WAL write.
---
-name = string.match(arg[0], "([^,]+)%.lua")
-box.info.vclock
-require('fio').glob(name .. "/*.xlog")
-test_run:cmd("restart server panic")
---
--- After restart: our LSN is the LSN of the
--- last empty WAL created on shutdown, i.e. 11.
---
-box.info.vclock
-box.space._schema:select{'key'}
-box.error.injection.set("ERRINJ_WAL_WRITE", true)
-t = {}
-s = box.space._schema
---
--- now do the same
---
-test_run:cmd("setopt delimiter ';'")
-for i=1,box.cfg.rows_per_wal do
-    status, msg = pcall(s.replace, s, {"key"})
-    table.insert(t, msg)
-end;
-test_run:cmd("setopt delimiter ''");
-t
-box.info.vclock
-box.error.injection.set("ERRINJ_WAL_WRITE", false)
---
--- Write a good row after a series of failed
--- rows. There is a gap in LSN, correct,
--- but it's *inside* a single WAL, so doesn't
--- affect WAL search in recover_remaining_wals()
---
-s:replace{'key', 'test 2'}
---
--- notice that vclock before and after
--- server stop is the same -- because it's
--- recorded in the last row
---
-box.info.vclock
-test_run:cmd("restart server panic")
-box.info.vclock
-box.space._schema:select{'key'}
--- list all the logs
-name = string.match(arg[0], "([^,]+)%.lua")
-require('fio').glob(name .. "/*.xlog")
--- now insert 10 rows - so that the next
--- row will need to switch the WAL
-test_run:cmd("setopt delimiter ';'")
-for i=1,box.cfg.rows_per_wal do
-    box.space._schema:replace{"key", 'test 3'}
-end;
-test_run:cmd("setopt delimiter ''");
--- the next insert should switch xlog, but aha - it fails
--- a new xlog file is created but has 0 rows
-require('fio').glob(name .. "/*.xlog")
-box.error.injection.set("ERRINJ_WAL_WRITE", true)
-box.space._schema:replace{"key", 'test 3'}
-box.info.vclock
-require('fio').glob(name .. "/*.xlog")
--- and the next one (just to be sure
-box.space._schema:replace{"key", 'test 3'}
-box.info.vclock
-require('fio').glob(name .. "/*.xlog")
-box.error.injection.set("ERRINJ_WAL_WRITE", false)
--- then a success
-box.space._schema:replace{"key", 'test 4'}
-box.info.vclock
-require('fio').glob(name .. "/*.xlog")
--- restart is ok
-test_run:cmd("restart server panic")
-box.space._schema:select{'key'}
---
--- Check that if there's an LSN gap between two WALs
--- that appeared due to a disk error and no files is
--- actually missing, we won't panic on recovery.
---
-box.space._schema:replace{'key', 'test 4'} -- creates new WAL
-box.error.injection.set("ERRINJ_WAL_WRITE_DISK", true)
-box.space._schema:replace{'key', 'test 5'} -- fails, makes gap
-box.snapshot() -- fails, rotates WAL
-box.error.injection.set("ERRINJ_WAL_WRITE_DISK", false)
-box.space._schema:replace{'key', 'test 5'} -- creates new WAL
-box.error.injection.set("ERRINJ_WAL_WRITE_DISK", true)
-box.space._schema:replace{'key', 'test 6'} -- fails, makes gap
-box.snapshot() -- fails, rotates WAL
-box.space._schema:replace{'key', 'test 6'} -- fails, creates empty WAL
-name = string.match(arg[0], "([^,]+)%.lua")
-require('fio').glob(name .. "/*.xlog")
-test_run:cmd("restart server panic")
-box.space._schema:select{'key'}
--- Check that we don't create a WAL in the gap between the last two.
-box.space._schema:replace{'key', 'test 6'}
-name = string.match(arg[0], "([^,]+)%.lua")
-require('fio').glob(name .. "/*.xlog")
-test_run:cmd('switch default')
-test_run:cmd("stop server panic")
-test_run:cmd("cleanup server panic")
-- 
2.20.1

* [tarantool-patches] [PATCH 2/3] Enforce applier out of order protection
  2019-02-06  8:29 [tarantool-patches] [PATCH 0/3] Promote vclock only for successful writes Georgy Kirichenko
  2019-02-06  8:29 ` [tarantool-patches] [PATCH 1/3] Do not promote wal vclock for failed writes Georgy Kirichenko
@ 2019-02-06  8:29 ` Georgy Kirichenko
  2019-02-06 14:13   ` Vladimir Davydov
  2019-02-06  8:29 ` [tarantool-patches] [PATCH 3/3] Promote replicaset.vclock only after wal Georgy Kirichenko
  2019-02-06 13:50 ` [tarantool-patches] [PATCH 0/3] Promote vclock only for successful writes Vladimir Davydov
  3 siblings, 1 reply; 8+ messages in thread
From: Georgy Kirichenko @ 2019-02-06  8:29 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Georgy Kirichenko

Do not skip row until the row is not processed.

Prerequisite #2283
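
The ordering issue can be sketched with an ordinary mutex standing in
for the fiber latch. This is a simplified model, not the applier code
itself: applied_lsn and apply_row() below are stand-ins for
replicaset.vclock and xstream_write().

#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t order_latch = PTHREAD_MUTEX_INITIALIZER;
static int64_t applied_lsn;		/* replicaset.vclock stand-in */

static void apply_row(int64_t lsn) { applied_lsn = lsn; }

static void
applier_apply_sketch(int64_t row_lsn)
{
	/*
	 * Take the latch before the "already applied?" check, so that
	 * two appliers receiving the same rows cannot both pass the
	 * check and apply a row twice or out of order.
	 */
	pthread_mutex_lock(&order_latch);
	if (applied_lsn < row_lsn)
		apply_row(row_lsn);
	pthread_mutex_unlock(&order_latch);
}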
---
 src/box/applier.cc | 48 ++++++++++++++++++++++------------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/src/box/applier.cc b/src/box/applier.cc
index 21d2e6bcb..d87b247e2 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -512,31 +512,25 @@ applier_subscribe(struct applier *applier)
 
 		applier->lag = ev_now(loop()) - row.tm;
 		applier->last_row_time = ev_monotonic_now(loop());
-
-		if (vclock_get(&replicaset.vclock, row.replica_id) < row.lsn) {
-			/**
-			 * Promote the replica set vclock before
-			 * applying the row. If there is an
-			 * exception (conflict) applying the row,
-			 * the row is skipped when the replication
-			 * is resumed.
-			 */
+		struct replica *replica = replica_by_id(row.replica_id);
+		struct latch *latch = (replica ? &replica->order_latch :
+				       &replicaset.applier.order_latch);
+		/*
+		 * In a full mesh topology, the same set
+		 * of changes may arrive via two
+		 * concurrently running appliers. Thanks
+		 * to vclock_follow() above, the first row
+		 * in the set will be skipped - but the
+		 * remaining may execute out of order,
+		 * when the following xstream_write()
+		 * yields on WAL. Hence we need a latch to
+		 * strictly order all changes which belong
+		 * to the same server id.
+		 */
+		latch_lock(latch);
+		if (vclock_get(&replicaset.vclock,
+			       row.replica_id) < row.lsn) {
 			vclock_follow_xrow(&replicaset.vclock, &row);
-			struct replica *replica = replica_by_id(row.replica_id);
-			struct latch *latch = (replica ? &replica->order_latch :
-					       &replicaset.applier.order_latch);
-			/*
-			 * In a full mesh topology, the same set
-			 * of changes may arrive via two
-			 * concurrently running appliers. Thanks
-			 * to vclock_follow() above, the first row
-			 * in the set will be skipped - but the
-			 * remaining may execute out of order,
-			 * when the following xstream_write()
-			 * yields on WAL. Hence we need a latch to
-			 * strictly order all changes which belong
-			 * to the same server id.
-			 */
 			latch_lock(latch);
 			int res = xstream_write(applier->subscribe_stream, &row);
 			latch_unlock(latch);
@@ -550,10 +544,14 @@ applier_subscribe(struct applier *applier)
 				    box_error_code(e) == ER_TUPLE_FOUND &&
 				    replication_skip_conflict)
 					diag_clear(diag_get());
-				else
+				else {
+					latch_unlock(latch);
 					diag_raise();
+				}
 			}
 		}
+		latch_unlock(latch);
+
 		if (applier->state == APPLIER_SYNC ||
 		    applier->state == APPLIER_FOLLOW)
 			fiber_cond_signal(&applier->writer_cond);
-- 
2.20.1

* [tarantool-patches] [PATCH 3/3] Promote replicaset.vclock only after wal
  2019-02-06  8:29 [tarantool-patches] [PATCH 0/3] Promote vclock only for successful writes Georgy Kirichenko
  2019-02-06  8:29 ` [tarantool-patches] [PATCH 1/3] Do not promote wal vclock for failed writes Georgy Kirichenko
  2019-02-06  8:29 ` [tarantool-patches] [PATCH 2/3] Enforce applier out of order protection Georgy Kirichenko
@ 2019-02-06  8:29 ` Georgy Kirichenko
  2019-02-06 14:45   ` Vladimir Davydov
  2019-02-06 13:50 ` [tarantool-patches] [PATCH 0/3] Promote vclock only for successful writes Vladimir Davydov
  3 siblings, 1 reply; 8+ messages in thread
From: Georgy Kirichenko @ 2019-02-06  8:29 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Georgy Kirichenko

Get rid of appliers' vclock_follow and promote the vclock only after
the WAL write.

Closes #2283
Prerequisite #980
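
The handoff can be pictured with the following simplified,
self-contained sketch; it is not the real wal.c/cbus code, and
wal_msg_sketch, writer_vclock and replicaset_vclock below are
stand-ins for the corresponding structures.

#include <stdint.h>

struct vclock_sketch { int64_t lsn; };

struct wal_msg_sketch {
	struct vclock_sketch vclock;	/* vclock after the batch is written */
};

static struct vclock_sketch writer_vclock;	/* owned by the WAL thread */
static struct vclock_sketch replicaset_vclock;	/* owned by the tx thread */

/* WAL thread: record the vclock it ended up with in the outgoing message. */
static void
wal_write_to_disk_sketch(struct wal_msg_sketch *msg)
{
	msg->vclock = writer_vclock;
}

/* tx thread: promote replicaset.vclock only once the batch is durable. */
static void
tx_schedule_commit_sketch(struct wal_msg_sketch *msg)
{
	replicaset_vclock = msg->vclock;
}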
---
 src/box/applier.cc                          | 14 ++------
 src/box/wal.c                               | 38 ++++-----------------
 test/replication/skip_conflict_row.test.lua | 19 +++++++++++
 test/xlog-py/dup_key.result                 | 12 +++++--
 test/xlog-py/dup_key.test.py                | 23 ++++++++++---
 5 files changed, 57 insertions(+), 49 deletions(-)

diff --git a/src/box/applier.cc b/src/box/applier.cc
index d87b247e2..cae71ec1c 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -518,22 +518,14 @@ applier_subscribe(struct applier *applier)
 		/*
 		 * In a full mesh topology, the same set
 		 * of changes may arrive via two
-		 * concurrently running appliers. Thanks
-		 * to vclock_follow() above, the first row
-		 * in the set will be skipped - but the
-		 * remaining may execute out of order,
-		 * when the following xstream_write()
-		 * yields on WAL. Hence we need a latch to
-		 * strictly order all changes which belong
-		 * to the same server id.
+		 * concurrently running appliers.
+		 * Hence we need a latch to strictly order all
+		 * changes which belong to the same server id.
 		 */
 		latch_lock(latch);
 		if (vclock_get(&replicaset.vclock,
 			       row.replica_id) < row.lsn) {
-			vclock_follow_xrow(&replicaset.vclock, &row);
-			latch_lock(latch);
 			int res = xstream_write(applier->subscribe_stream, &row);
-			latch_unlock(latch);
 			if (res != 0) {
 				struct error *e = diag_last_error(diag_get());
 				/**
diff --git a/src/box/wal.c b/src/box/wal.c
index a55b544aa..4c3537672 100644
--- a/src/box/wal.c
+++ b/src/box/wal.c
@@ -171,6 +171,8 @@ struct wal_msg {
 	 * be rolled back.
 	 */
 	struct stailq rollback;
+	/** vclock after the batch processed. */
+	struct vclock vclock;
 };
 
 /**
@@ -209,6 +211,7 @@ wal_msg_create(struct wal_msg *batch)
 	batch->approx_len = 0;
 	stailq_create(&batch->commit);
 	stailq_create(&batch->rollback);
+	vclock_create(&batch->vclock);
 }
 
 static struct wal_msg *
@@ -280,6 +283,7 @@ tx_schedule_commit(struct cmsg *msg)
 	 * wal_msg memory disappears after the first
 	 * iteration of tx_schedule_queue loop.
 	 */
+	vclock_copy(&replicaset.vclock, &batch->vclock);
 	if (! stailq_empty(&batch->rollback)) {
 		/* Closes the input valve. */
 		stailq_concat(&writer->rollback, &batch->rollback);
@@ -1028,6 +1032,8 @@ done:
 		error_log(error);
 		diag_clear(diag_get());
 	}
+	/* Set resulting vclock. */
+	vclock_copy(&wal_msg->vclock, &writer->vclock);
 	/*
 	 * We need to start rollback from the first request
 	 * following the last committed request. If
@@ -1159,31 +1165,6 @@ wal_write(struct journal *journal, struct journal_entry *entry)
 	bool cancellable = fiber_set_cancellable(false);
 	fiber_yield(); /* Request was inserted. */
 	fiber_set_cancellable(cancellable);
-	if (entry->res > 0) {
-		struct xrow_header **last = entry->rows + entry->n_rows - 1;
-		while (last >= entry->rows) {
-			/*
-			 * Find last row from local instance id
-			 * and promote vclock.
-			 */
-			if ((*last)->replica_id == instance_id) {
-				/*
-				 * In master-master configuration, during sudden
-				 * power-loss, if the data have not been written
-				 * to WAL but have already been sent to others,
-				 * they will send the data back. In this case
-				 * vclock has already been promoted by applier.
-				 */
-				if (vclock_get(&replicaset.vclock,
-					       instance_id) < (*last)->lsn) {
-					vclock_follow_xrow(&replicaset.vclock,
-							   *last);
-				}
-				break;
-			}
-			--last;
-		}
-	}
 	return entry->res;
 }
 
@@ -1193,12 +1174,7 @@ wal_write_in_wal_mode_none(struct journal *journal,
 {
 	struct wal_writer *writer = (struct wal_writer *) journal;
 	wal_assign_lsn(&writer->vclock, entry->rows, entry->rows + entry->n_rows);
-	int64_t old_lsn = vclock_get(&replicaset.vclock, instance_id);
-	int64_t new_lsn = vclock_get(&writer->vclock, instance_id);
-	if (new_lsn > old_lsn) {
-		/* There were local writes, promote vclock. */
-		vclock_follow(&replicaset.vclock, instance_id, new_lsn);
-	}
+	vclock_copy(&replicaset.vclock, &writer->vclock);
 	return vclock_sum(&writer->vclock);
 }
 
diff --git a/test/replication/skip_conflict_row.test.lua b/test/replication/skip_conflict_row.test.lua
index 4406ced95..c60999b9b 100644
--- a/test/replication/skip_conflict_row.test.lua
+++ b/test/replication/skip_conflict_row.test.lua
@@ -1,5 +1,6 @@
 env = require('test_run')
 test_run = env.new()
+test_run:cmd("restart server default with cleanup=1")
 engine = test_run:get_cfg('engine')
 
 box.schema.user.grant('guest', 'replication')
@@ -28,6 +29,24 @@ box.space.test:select()
 test_run:cmd("switch default")
 box.info.status
 
+-- test that if replication_skip_conflict is off vclock
+-- is not advanced on errors.
+test_run:cmd("restart server replica")
+test_run:cmd("switch replica")
+box.cfg{replication_skip_conflict=false}
+box.space.test:insert{3}
+box.info.vclock
+test_run:cmd("switch default")
+box.space.test:insert{3, 3}
+box.space.test:insert{4}
+box.info.vclock
+test_run:cmd("switch replica")
+box.info.vclock
+box.info.replication[1].upstream.message
+box.info.replication[1].upstream.status
+box.space.test:select()
+test_run:cmd("switch default")
+
 -- cleanup
 test_run:cmd("stop server replica")
 test_run:cmd("cleanup server replica")
diff --git a/test/xlog-py/dup_key.result b/test/xlog-py/dup_key.result
index f387e8e89..966fa1f4a 100644
--- a/test/xlog-py/dup_key.result
+++ b/test/xlog-py/dup_key.result
@@ -16,7 +16,16 @@ box.space.test:insert{2, 'second tuple'}
 ---
 - [2, 'second tuple']
 ...
-.xlog exists
+.xlog#1 exists
+box.space.test:insert{3, 'third tuple'}
+---
+- [3, 'third tuple']
+...
+box.space.test:insert{4, 'fourth tuple'}
+---
+- [4, 'fourth tuple']
+...
+.xlog#2 exists
 box.space.test:insert{1, 'third tuple'}
 ---
 - [1, 'third tuple']
@@ -25,7 +34,6 @@ box.space.test:insert{2, 'fourth tuple'}
 ---
 - [2, 'fourth tuple']
 ...
-.xlog does not exist
 check log line for 'Duplicate key'
 
 'Duplicate key' exists in server log
diff --git a/test/xlog-py/dup_key.test.py b/test/xlog-py/dup_key.test.py
index 1c033da40..e25b1d477 100644
--- a/test/xlog-py/dup_key.test.py
+++ b/test/xlog-py/dup_key.test.py
@@ -22,23 +22,36 @@ wal = os.path.join(vardir, filename)
 # Create wal#1
 server.admin("box.space.test:insert{1, 'first tuple'}")
 server.admin("box.space.test:insert{2, 'second tuple'}")
+lsn2 = int(yaml.load(server.admin("box.info.lsn", silent=True))[0])
 server.stop()
 
 # Save wal#1
 if os.access(wal, os.F_OK):
-    print ".xlog exists"
+    print ".xlog#1 exists"
     os.rename(wal, wal_old)
+# drop empty log created on shutdown
+filename2 = str(lsn2).zfill(20) + ".xlog"
+wal2 = os.path.join(vardir, filename2)
+os.unlink(wal2)
 
-# Write wal#2
+# Write wal#2 to bump lsn
+server.start()
+server.admin("box.space.test:insert{3, 'third tuple'}")
+server.admin("box.space.test:insert{4, 'fourth tuple'}")
+server.stop()
+
+if os.access(wal, os.F_OK):
+    print ".xlog#2 exists"
+
+# Write wal#3 - confliction with wal#1
 server.start()
 server.admin("box.space.test:insert{1, 'third tuple'}")
 server.admin("box.space.test:insert{2, 'fourth tuple'}")
 server.stop()
 
 # Restore wal#1
-if not os.access(wal, os.F_OK):
-    print ".xlog does not exist"
-    os.rename(wal_old, wal)
+os.unlink(wal)
+os.rename(wal_old, wal)
 
 server.start()
 line = 'Duplicate key'
-- 
2.20.1

* Re: [tarantool-patches] [PATCH 0/3] Promote vclock only for successful writes
  2019-02-06  8:29 [tarantool-patches] [PATCH 0/3] Promote vclock only for successful writes Georgy Kirichenko
                   ` (2 preceding siblings ...)
  2019-02-06  8:29 ` [tarantool-patches] [PATCH 3/3] Promote replicaset.vclock only after wal Georgy Kirichenko
@ 2019-02-06 13:50 ` Vladimir Davydov
  3 siblings, 0 replies; 8+ messages in thread
From: Vladimir Davydov @ 2019-02-06 13:50 UTC (permalink / raw)
  To: Georgy Kirichenko; +Cc: tarantool-patches

This is v2 so you should've added v2 to the subject and a brief change
log to the body.

On Wed, Feb 06, 2019 at 11:29:56AM +0300, Georgy Kirichenko wrote:
> Promote the tx and WAL vclocks only if the corresponding row was
> successfully written to the WAL. This prevents Tarantool from creating
> LSN gaps in case of a WAL error, as well as from skipping rows in case
> of replication errors.
> 
> Branch: https://github.com/tarantool/tarantool/tree/g.kirichenko/gh-2283-dont-skip-rows-after-errors
> Issue: https://github.com/tarantool/tarantool/issues/2283
> 
> Georgy Kirichenko (3):
>   Do not promote wal vclock for failed writes
>   Enforce applier out of order protection
>   Promote replicaset.vclock only after wal
> 
>  src/box/applier.cc                          |  46 +--
>  src/box/wal.c                               |  57 ++-
>  test/box/errinj.result                      |  23 ++
>  test/box/errinj.test.lua                    |   8 +
>  test/replication/skip_conflict_row.test.lua |  19 +
>  test/xlog-py/dup_key.result                 |  12 +-
>  test/xlog-py/dup_key.test.py                |  23 +-
>  test/xlog/errinj.result                     |   1 -
>  test/xlog/panic_on_lsn_gap.result           | 377 --------------------
>  test/xlog/panic_on_lsn_gap.test.lua         | 136 -------
>  10 files changed, 116 insertions(+), 586 deletions(-)
>  delete mode 100644 test/xlog/panic_on_lsn_gap.result
>  delete mode 100644 test/xlog/panic_on_lsn_gap.test.lua

* Re: [tarantool-patches] [PATCH 1/3] Do not promote wal vclock for failed writes
  2019-02-06  8:29 ` [tarantool-patches] [PATCH 1/3] Do not promote wal vclock for failed writes Georgy Kirichenko
@ 2019-02-06 13:56   ` Vladimir Davydov
  0 siblings, 0 replies; 8+ messages in thread
From: Vladimir Davydov @ 2019-02-06 13:56 UTC (permalink / raw)
  To: Georgy Kirichenko; +Cc: tarantool-patches

On Wed, Feb 06, 2019 at 11:29:57AM +0300, Georgy Kirichenko wrote:
> Increase the replica LSN only if the row was successfully written to
> disk. This prevents the WAL from having LSN gaps in case of IO errors.

Why are gaps that bad? Please say a few words of explanation in the
commit message.

> 
> Obsoletes xlog/panic_on_lsn_gap.test.
> 
> Needs for #2283

Needed for

> ---
>  src/box/wal.c                       |  19 +-
>  test/box/errinj.result              |  23 ++
>  test/box/errinj.test.lua            |   8 +
>  test/xlog/errinj.result             |   1 -
>  test/xlog/panic_on_lsn_gap.result   | 377 ----------------------------
>  test/xlog/panic_on_lsn_gap.test.lua | 136 ----------
>  6 files changed, 44 insertions(+), 520 deletions(-)
>  delete mode 100644 test/xlog/panic_on_lsn_gap.result
>  delete mode 100644 test/xlog/panic_on_lsn_gap.test.lua
> 
> diff --git a/src/box/wal.c b/src/box/wal.c
> index 3b50d3629..a55b544aa 100644
> --- a/src/box/wal.c
> +++ b/src/box/wal.c
> @@ -901,16 +901,16 @@ wal_writer_begin_rollback(struct wal_writer *writer)
>  }
>  
>  static void
> -wal_assign_lsn(struct wal_writer *writer, struct xrow_header **row,
> +wal_assign_lsn(struct vclock *vclock, struct xrow_header **row,
>  	       struct xrow_header **end)
>  {
>  	/** Assign LSN to all local rows. */
>  	for ( ; row < end; row++) {
>  		if ((*row)->replica_id == 0) {
> -			(*row)->lsn = vclock_inc(&writer->vclock, instance_id);
> +			(*row)->lsn = vclock_inc(vclock, instance_id);
>  			(*row)->replica_id = instance_id;
>  		} else {
> -			vclock_follow_xrow(&writer->vclock, *row);
> +			vclock_follow_xrow(vclock, *row);
>  		}
>  	}
>  }
> @@ -922,6 +922,11 @@ wal_write_to_disk(struct cmsg *msg)
>  	struct wal_msg *wal_msg = (struct wal_msg *) msg;
>  	struct error *error;
>  
> +	/* Local vclock copy. */

Bad comment. Can you elaborate why you need it, please? E.g.

	We don't want to promote the WAL vclock if write fails,
	because that would result in LSN gaps, which are difficult
	to handle in relay. So we promote a local copy instead
	and sync it only if entries have been successfully flushed
	to disk.

Or something like that.

> +	struct vclock vclock;
> +	vclock_create(&vclock);
> +	vclock_copy(&vclock, &writer->vclock);
> +
>  	struct errinj *inj = errinj(ERRINJ_WAL_DELAY, ERRINJ_BOOL);
>  	while (inj != NULL && inj->bparam)
>  		usleep(10);
> @@ -974,14 +979,15 @@ wal_write_to_disk(struct cmsg *msg)
>  	struct journal_entry *entry;
>  	struct stailq_entry *last_committed = NULL;
>  	stailq_foreach_entry(entry, &wal_msg->commit, fifo) {
> -		wal_assign_lsn(writer, entry->rows, entry->rows + entry->n_rows);
> -		entry->res = vclock_sum(&writer->vclock);
> +		wal_assign_lsn(&vclock, entry->rows, entry->rows + entry->n_rows);
> +		entry->res = vclock_sum(&vclock);
>  		rc = xlog_write_entry(l, entry);
>  		if (rc < 0)
>  			goto done;
>  		if (rc > 0) {
>  			writer->checkpoint_wal_size += rc;
>  			last_committed = &entry->fifo;
> +			vclock_copy(&writer->vclock, &vclock);
>  		}
>  		/* rc == 0: the write is buffered in xlog_tx */
>  	}
> @@ -991,6 +997,7 @@ wal_write_to_disk(struct cmsg *msg)
>  
>  	writer->checkpoint_wal_size += rc;
>  	last_committed = stailq_last(&wal_msg->commit);
> +	vclock_copy(&writer->vclock, &vclock);
>  
>  	/*
>  	 * Notify TX if the checkpoint threshold has been exceeded.
> @@ -1185,7 +1192,7 @@ wal_write_in_wal_mode_none(struct journal *journal,
>  			   struct journal_entry *entry)
>  {
>  	struct wal_writer *writer = (struct wal_writer *) journal;
> -	wal_assign_lsn(writer, entry->rows, entry->rows + entry->n_rows);
> +	wal_assign_lsn(&writer->vclock, entry->rows, entry->rows + entry->n_rows);
>  	int64_t old_lsn = vclock_get(&replicaset.vclock, instance_id);
>  	int64_t new_lsn = vclock_get(&writer->vclock, instance_id);
>  	if (new_lsn > old_lsn) {
> diff --git a/test/box/errinj.result b/test/box/errinj.result
> index 12303670e..1d9a16d8d 100644
> --- a/test/box/errinj.result
> +++ b/test/box/errinj.result
> @@ -141,6 +141,15 @@ errinj.set("ERRINJ_TESTING", false)
>  ---
>  - ok
>  ...
> +env = require('test_run')
> +---
> +...
> +test_run = env.new()
> +---
> +...

Why? You don't seem to need it.

> +lsn1 = box.info.vclock[box.info.id]
> +---
> +...
>  -- Check how well we handle a failed log write
>  errinj.set("ERRINJ_WAL_IO", true)
>  ---
> @@ -161,6 +170,11 @@ space:insert{1}
>  ---
>  - [1]
>  ...
> +-- Check vclock was promoted only one time
> +box.info.vclock[box.info.id] == lsn1 + 1
> +---
> +- true
> +...
>  errinj.set("ERRINJ_WAL_IO", true)
>  ---
>  - ok
> @@ -180,6 +194,15 @@ errinj.set("ERRINJ_WAL_IO", false)
>  ---
>  - ok
>  ...
> +space:update(1, {{'=', 2, 2}})
> +---
> +- [1, 2]
> +...
> +-- Check vclock was promoted only two times
> +box.info.vclock[box.info.id] == lsn1 + 2
> +---
> +- true
> +...

This test will pass with and without your patch, because we already
don't promote box.info.lsn on failed WAL write. I think you need to
restart the server to check this properly.

Other than that, the patch is fine by me.

* Re: [tarantool-patches] [PATCH 2/3] Enforce applier out of order protection
  2019-02-06  8:29 ` [tarantool-patches] [PATCH 2/3] Enforce applier out of order protection Georgy Kirichenko
@ 2019-02-06 14:13   ` Vladimir Davydov
  0 siblings, 0 replies; 8+ messages in thread
From: Vladimir Davydov @ 2019-02-06 14:13 UTC (permalink / raw)
  To: Georgy Kirichenko; +Cc: tarantool-patches

On Wed, Feb 06, 2019 at 11:29:58AM +0300, Georgy Kirichenko wrote:
> Do not skip row until the row is not processed.
                                   ^^^
Redundant 'not'.

I think that this patch should be squashed with patch 3, because it
doesn't seem to make much sense on its own to me.

> 
> Prerequisite #2283
> ---
>  src/box/applier.cc | 48 ++++++++++++++++++++++------------------------
>  1 file changed, 23 insertions(+), 25 deletions(-)
> 
> diff --git a/src/box/applier.cc b/src/box/applier.cc
> index 21d2e6bcb..d87b247e2 100644
> --- a/src/box/applier.cc
> +++ b/src/box/applier.cc
> @@ -512,31 +512,25 @@ applier_subscribe(struct applier *applier)
>  
>  		applier->lag = ev_now(loop()) - row.tm;
>  		applier->last_row_time = ev_monotonic_now(loop());
> -
> -		if (vclock_get(&replicaset.vclock, row.replica_id) < row.lsn) {
> -			/**
> -			 * Promote the replica set vclock before
> -			 * applying the row. If there is an
> -			 * exception (conflict) applying the row,
> -			 * the row is skipped when the replication
> -			 * is resumed.
> -			 */
> +		struct replica *replica = replica_by_id(row.replica_id);
> +		struct latch *latch = (replica ? &replica->order_latch :
> +				       &replicaset.applier.order_latch);
> +		/*
> +		 * In a full mesh topology, the same set
> +		 * of changes may arrive via two
> +		 * concurrently running appliers. Thanks
> +		 * to vclock_follow() above, the first row
                                      ^^^^^
Above? It's below now.

> +		 * in the set will be skipped - but the
> +		 * remaining may execute out of order,
> +		 * when the following xstream_write()
> +		 * yields on WAL. Hence we need a latch to
> +		 * strictly order all changes which belong
> +		 * to the same server id.
> +		 */
> +		latch_lock(latch);
> +		if (vclock_get(&replicaset.vclock,
> +			       row.replica_id) < row.lsn) {
>  			vclock_follow_xrow(&replicaset.vclock, &row);
> -			struct replica *replica = replica_by_id(row.replica_id);
> -			struct latch *latch = (replica ? &replica->order_latch :
> -					       &replicaset.applier.order_latch);
> -			/*
> -			 * In a full mesh topology, the same set
> -			 * of changes may arrive via two
> -			 * concurrently running appliers. Thanks
> -			 * to vclock_follow() above, the first row
> -			 * in the set will be skipped - but the
> -			 * remaining may execute out of order,
> -			 * when the following xstream_write()
> -			 * yields on WAL. Hence we need a latch to
> -			 * strictly order all changes which belong
> -			 * to the same server id.
> -			 */
>  			latch_lock(latch);

Double lock...

>  			int res = xstream_write(applier->subscribe_stream, &row);
>  			latch_unlock(latch);
> @@ -550,10 +544,14 @@ applier_subscribe(struct applier *applier)
>  				    box_error_code(e) == ER_TUPLE_FOUND &&
>  				    replication_skip_conflict)
>  					diag_clear(diag_get());
> -				else
> +				else {
> +					latch_unlock(latch);
>  					diag_raise();
> +				}
>  			}
>  		}
> +		latch_unlock(latch);
> +
>  		if (applier->state == APPLIER_SYNC ||
>  		    applier->state == APPLIER_FOLLOW)
>  			fiber_cond_signal(&applier->writer_cond);


* Re: [tarantool-patches] [PATCH 3/3] Promote replicaset.vclock only after wal
  2019-02-06  8:29 ` [tarantool-patches] [PATCH 3/3] Promote replicaset.vclock only after wal Georgy Kirichenko
@ 2019-02-06 14:45   ` Vladimir Davydov
  0 siblings, 0 replies; 8+ messages in thread
From: Vladimir Davydov @ 2019-02-06 14:45 UTC (permalink / raw)
  To: Georgy Kirichenko; +Cc: tarantool-patches

On Wed, Feb 06, 2019 at 11:29:59AM +0300, Georgy Kirichenko wrote:
> Get rid of appliers' vclock_follow and promote the vclock only after
> the WAL write.

Gosh, what an ascetic commit message, really. Please instead take the
commit message Serge wrote when he tried to fix issue #2283.

> 
> Closes #2283
> Prerequisite #980
> ---
>  src/box/applier.cc                          | 14 ++------
>  src/box/wal.c                               | 38 ++++-----------------
>  test/replication/skip_conflict_row.test.lua | 19 +++++++++++
>  test/xlog-py/dup_key.result                 | 12 +++++--
>  test/xlog-py/dup_key.test.py                | 23 ++++++++++---
>  5 files changed, 57 insertions(+), 49 deletions(-)
> 
> diff --git a/src/box/applier.cc b/src/box/applier.cc
> index d87b247e2..cae71ec1c 100644
> --- a/src/box/applier.cc
> +++ b/src/box/applier.cc
> @@ -518,22 +518,14 @@ applier_subscribe(struct applier *applier)
>  		/*
>  		 * In a full mesh topology, the same set
>  		 * of changes may arrive via two
> -		 * concurrently running appliers. Thanks
> -		 * to vclock_follow() above, the first row
> -		 * in the set will be skipped - but the
> -		 * remaining may execute out of order,
> -		 * when the following xstream_write()
> -		 * yields on WAL. Hence we need a latch to
> -		 * strictly order all changes which belong
> -		 * to the same server id.
> +		 * concurrently running appliers.
> +		 * Hence we need a latch to strictly order all
> +		 * changes which belong to the same server id.
>  		 */
>  		latch_lock(latch);
>  		if (vclock_get(&replicaset.vclock,
>  			       row.replica_id) < row.lsn) {
> -			vclock_follow_xrow(&replicaset.vclock, &row);
> -			latch_lock(latch);
>  			int res = xstream_write(applier->subscribe_stream, &row);
> -			latch_unlock(latch);

Yep, this should definitely be squashed in patch 2.

>  			if (res != 0) {
>  				struct error *e = diag_last_error(diag_get());
>  				/**
> diff --git a/src/box/wal.c b/src/box/wal.c
> index a55b544aa..4c3537672 100644
> --- a/src/box/wal.c
> +++ b/src/box/wal.c
> @@ -171,6 +171,8 @@ struct wal_msg {
>  	 * be rolled back.
>  	 */
>  	struct stailq rollback;
> +	/** vclock after the batch processed. */
> +	struct vclock vclock;
>  };
>  
>  /**
> @@ -209,6 +211,7 @@ wal_msg_create(struct wal_msg *batch)
>  	batch->approx_len = 0;
>  	stailq_create(&batch->commit);
>  	stailq_create(&batch->rollback);
> +	vclock_create(&batch->vclock);
>  }
>  
>  static struct wal_msg *
> @@ -280,6 +283,7 @@ tx_schedule_commit(struct cmsg *msg)
>  	 * wal_msg memory disappears after the first
>  	 * iteration of tx_schedule_queue loop.
>  	 */
> +	vclock_copy(&replicaset.vclock, &batch->vclock);

An unfortunate place to add this. Now it looks like the comment above
relates to this vclock_copy although it doesn't. Please move it below
to avoid confusion.

>  	if (! stailq_empty(&batch->rollback)) {
>  		/* Closes the input valve. */
>  		stailq_concat(&writer->rollback, &batch->rollback);
> @@ -1028,6 +1032,8 @@ done:
>  		error_log(error);
>  		diag_clear(diag_get());
>  	}
> +	/* Set resulting vclock. */

Again, what a pointless comment... Would be more useful if you
elaborated a bit:

	Remember the vclock of the last successfully written row so
	that we can update replicaset.vclock once this message gets
	back to tx.

> +	vclock_copy(&wal_msg->vclock, &writer->vclock);
>  	/*
>  	 * We need to start rollback from the first request
>  	 * following the last committed request. If
> @@ -1159,31 +1165,6 @@ wal_write(struct journal *journal, struct journal_entry *entry)
>  	bool cancellable = fiber_set_cancellable(false);
>  	fiber_yield(); /* Request was inserted. */
>  	fiber_set_cancellable(cancellable);
> -	if (entry->res > 0) {
> -		struct xrow_header **last = entry->rows + entry->n_rows - 1;
> -		while (last >= entry->rows) {
> -			/*
> -			 * Find last row from local instance id
> -			 * and promote vclock.
> -			 */
> -			if ((*last)->replica_id == instance_id) {
> -				/*
> -				 * In master-master configuration, during sudden
> -				 * power-loss, if the data have not been written
> -				 * to WAL but have already been sent to others,
> -				 * they will send the data back. In this case
> -				 * vclock has already been promoted by applier.
> -				 */
> -				if (vclock_get(&replicaset.vclock,
> -					       instance_id) < (*last)->lsn) {
> -					vclock_follow_xrow(&replicaset.vclock,
> -							   *last);
> -				}
> -				break;
> -			}
> -			--last;
> -		}
> -	}
>  	return entry->res;
>  }
>  
> @@ -1193,12 +1174,7 @@ wal_write_in_wal_mode_none(struct journal *journal,
>  {
>  	struct wal_writer *writer = (struct wal_writer *) journal;
>  	wal_assign_lsn(&writer->vclock, entry->rows, entry->rows + entry->n_rows);
> -	int64_t old_lsn = vclock_get(&replicaset.vclock, instance_id);
> -	int64_t new_lsn = vclock_get(&writer->vclock, instance_id);
> -	if (new_lsn > old_lsn) {
> -		/* There were local writes, promote vclock. */
> -		vclock_follow(&replicaset.vclock, instance_id, new_lsn);
> -	}
> +	vclock_copy(&replicaset.vclock, &writer->vclock);
>  	return vclock_sum(&writer->vclock);
>  }
>  
> diff --git a/test/replication/skip_conflict_row.test.lua b/test/replication/skip_conflict_row.test.lua
> index 4406ced95..c60999b9b 100644
> --- a/test/replication/skip_conflict_row.test.lua
> +++ b/test/replication/skip_conflict_row.test.lua
> @@ -1,5 +1,6 @@
>  env = require('test_run')
>  test_run = env.new()
> +test_run:cmd("restart server default with cleanup=1")

Let's please try to rewrite the test without this. E.g. we could
restart the replica and check that replication doesn't resume
instead of checking vclocks.

>  engine = test_run:get_cfg('engine')
>  
>  box.schema.user.grant('guest', 'replication')
> @@ -28,6 +29,24 @@ box.space.test:select()
>  test_run:cmd("switch default")
>  box.info.status
>  
> +-- test that if replication_skip_conflict is off vclock
> +-- is not advanced on errors.
> +test_run:cmd("restart server replica")
> +test_run:cmd("switch replica")

> +box.cfg{replication_skip_conflict=false}

It's already unset after restart.

> +box.space.test:insert{3}
> +box.info.vclock
> +test_run:cmd("switch default")
> +box.space.test:insert{3, 3}
> +box.space.test:insert{4}
> +box.info.vclock
> +test_run:cmd("switch replica")
> +box.info.vclock
> +box.info.replication[1].upstream.message
> +box.info.replication[1].upstream.status
> +box.space.test:select()
> +test_run:cmd("switch default")
> +

We also want to check that replication isn't restarted.

>  -- cleanup
>  test_run:cmd("stop server replica")
>  test_run:cmd("cleanup server replica")
> diff --git a/test/xlog-py/dup_key.test.py b/test/xlog-py/dup_key.test.py
> index 1c033da40..e25b1d477 100644
> --- a/test/xlog-py/dup_key.test.py
> +++ b/test/xlog-py/dup_key.test.py
> @@ -22,23 +22,36 @@ wal = os.path.join(vardir, filename)

What happened to this test? (BTW I've already asked you that)

Last time I checked it passed with and without this patch.

>  # Create wal#1
>  server.admin("box.space.test:insert{1, 'first tuple'}")
>  server.admin("box.space.test:insert{2, 'second tuple'}")
> +lsn2 = int(yaml.load(server.admin("box.info.lsn", silent=True))[0])
>  server.stop()
>  
>  # Save wal#1
>  if os.access(wal, os.F_OK):
> -    print ".xlog exists"
> +    print ".xlog#1 exists"
>      os.rename(wal, wal_old)
> +# drop empty log created on shutdown
> +filename2 = str(lsn2).zfill(20) + ".xlog"
> +wal2 = os.path.join(vardir, filename2)
> +os.unlink(wal2)
>  
> -# Write wal#2
> +# Write wal#2 to bump lsn
> +server.start()
> +server.admin("box.space.test:insert{3, 'third tuple'}")
> +server.admin("box.space.test:insert{4, 'fourth tuple'}")
> +server.stop()
> +
> +if os.access(wal, os.F_OK):
> +    print ".xlog#2 exists"
> +
> +# Write wal#3 - confliction with wal#1
>  server.start()
>  server.admin("box.space.test:insert{1, 'third tuple'}")
>  server.admin("box.space.test:insert{2, 'fourth tuple'}")
>  server.stop()
>  
>  # Restore wal#1
> -if not os.access(wal, os.F_OK):
> -    print ".xlog does not exist"
> -    os.rename(wal_old, wal)
> +os.unlink(wal)
> +os.rename(wal_old, wal)
>  
>  server.start()
>  line = 'Duplicate key'


end of thread

Thread overview: 8+ messages
2019-02-06  8:29 [tarantool-patches] [PATCH 0/3] Promote vclock only for successful writes Georgy Kirichenko
2019-02-06  8:29 ` [tarantool-patches] [PATCH 1/3] Do not promote wal vclock for failed writes Georgy Kirichenko
2019-02-06 13:56   ` Vladimir Davydov
2019-02-06  8:29 ` [tarantool-patches] [PATCH 2/3] Enforce applier out of order protection Georgy Kirichenko
2019-02-06 14:13   ` Vladimir Davydov
2019-02-06  8:29 ` [tarantool-patches] [PATCH 3/3] Promote replicaset.vclock only after wal Georgy Kirichenko
2019-02-06 14:45   ` Vladimir Davydov
2019-02-06 13:50 ` [tarantool-patches] [PATCH 0/3] Promote vclock only for successful writes Vladimir Davydov
