[tarantool-patches] [PATCH] replication: force gc to clean xdir on ENOSPC err

Konstantin Belyavskiy k.belyavskiy at tarantool.org
Wed Jun 6 12:27:52 MSK 2018


The garbage collector does not delete xlog files until the replica
notifies the master with a newer vclock. This can lead to an
out-of-disk-space error, which is not the right behaviour since it
will stop the master.

List of changes:
- Promoting error from wal_thread via extended journal entry.
- Introduce states for gc_consumers (OK and OFF). GC should not
take into account consumers with state OFF.
- Add an error injection and a test.

WARNING:
Several issues left:
1. This approach calls gc if a write fails because of ENOSPC. This
aborts the last transaction. But the initial proposal was to use
fallocate to force a clean-up in advance.
2. In case of multiple errors we could shoot down all appliers, so
some simple check (e.g. the size of the LSN gap) could be used.

Proposal for #3397
---
ticket: https://github.com/tarantool/tarantool/issues/3397
branch: https://github.com/tarantool/tarantool/tree/kbelyavs/gh-3397-force-del-logs-on-no-disk-space
 src/box/box.cc                                    |   7 +-
 src/box/errcode.h                                 |   1 +
 src/box/gc.c                                      |  31 ++++++-
 src/box/gc.h                                      |  13 +++
 src/box/journal.h                                 |   2 +
 src/box/txn.c                                     |   5 +-
 src/box/wal.cc                                    |   7 +-
 src/errinj.h                                      |   1 +
 src/fio.c                                         |   7 ++
 test/box/errinj.result                            |   2 +
 test/box/misc.result                              |   1 +
 test/replication/force_gc_on_err_nospace.result   | 103 ++++++++++++++++++++++
 test/replication/force_gc_on_err_nospace.test.lua |  36 ++++++++
 13 files changed, 210 insertions(+), 6 deletions(-)
 create mode 100644 test/replication/force_gc_on_err_nospace.result
 create mode 100644 test/replication/force_gc_on_err_nospace.test.lua

diff --git a/src/box/box.cc b/src/box/box.cc
index e3eb2738f..0d25fb502 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -180,8 +180,13 @@ process_rw(struct request *request, struct space *space, struct tuple **result)
 	 * when WAL is written in autocommit mode.
 	 */
 	TupleRefNil ref(tuple);
-	if (txn_commit_stmt(txn, request) != 0)
+	if (txn_commit_stmt(txn, request) != 0) {
+		struct error *e = diag_last_error(diag_get());
+		if (e != NULL && e->type == &type_ClientError &&
+		    box_error_code(e) == ER_NO_DISK_SPACE)
+			gc_leftmost_mark_dead();
 		return -1;
+	}
 	if (result != NULL) {
 		if (tuple != NULL && tuple_bless(tuple) == NULL)
 			return -1;
diff --git a/src/box/errcode.h b/src/box/errcode.h
index a0759f8f4..a5cf99825 100644
--- a/src/box/errcode.h
+++ b/src/box/errcode.h
@@ -206,6 +206,7 @@ struct errcode_record {
 	/*151 */_(ER_WRONG_COLLATION_OPTIONS,	"Wrong collation options (field %u): %s") \
 	/*152 */_(ER_NULLABLE_PRIMARY,		"Primary index of the space '%s' can not contain nullable parts") \
 	/*153 */_(ER_NULLABLE_MISMATCH,		"Field %d is %s in space format, but %s in index parts") \
+	/*154 */_(ER_NO_DISK_SPACE,		"Failed to write to disk: no space left") \
 
 /*
  * !IMPORTANT! Please follow instructions at start of the file
diff --git a/src/box/gc.c b/src/box/gc.c
index 12e68f3dc..8f3dfa368 100644
--- a/src/box/gc.c
+++ b/src/box/gc.c
@@ -59,6 +59,8 @@ struct gc_consumer {
 	gc_node_t node;
 	/** Human-readable name. */
 	char *name;
+	/** State; can be marked as dead if the replica does not respond. */
+	enum gc_consumer_state state;
 	/** The vclock signature tracked by this consumer. */
 	int64_t signature;
 };
@@ -132,6 +134,15 @@ gc_consumer_delete(struct gc_consumer *consumer)
 	free(consumer);
 }
 
+static struct gc_consumer *
+gc_tree_first_alive(gc_tree_t *consumers)
+{
+	struct gc_consumer *consumer = gc_tree_first(consumers);
+	while (consumer != NULL && consumer->state != CONSUMER_OK)
+		consumer = gc_tree_next(&gc.consumers, consumer);
+	return consumer;
+}
+
 void
 gc_init(void)
 {
@@ -162,7 +173,7 @@ gc_run(void)
 	assert(checkpoint_count > 0);
 
 	/* Look up the consumer that uses the oldest snapshot. */
-	struct gc_consumer *leftmost = gc_tree_first(&gc.consumers);
+	struct gc_consumer *leftmost = gc_tree_first_alive(&gc.consumers);
 
 	/*
 	 * Find the oldest checkpoint that must be preserved.
@@ -237,9 +248,23 @@ gc_consumer_unregister(struct gc_consumer *consumer)
 	 * Rerun garbage collection after removing the consumer
 	 * if it referenced the oldest vclock.
 	 */
-	struct gc_consumer *leftmost = gc_tree_first(&gc.consumers);
+	struct gc_consumer *leftmost = gc_tree_first_alive(&gc.consumers);
+	if (leftmost == NULL || leftmost->signature > signature)
+		gc_run();
+}
+
+void
+gc_leftmost_mark_dead(void)
+{
+	struct gc_consumer *leftmost = gc_tree_first_alive(&gc.consumers);
+	if (leftmost == NULL)
+		return; // do nothing
+	leftmost->state = CONSUMER_OFF;
+	int64_t signature = leftmost->signature;
+	leftmost = gc_tree_first_alive(&gc.consumers);
 	if (leftmost == NULL || leftmost->signature > signature)
 		gc_run();
+
 }
 
 void
@@ -270,7 +295,7 @@ gc_consumer_advance(struct gc_consumer *consumer, int64_t signature)
 	 * Rerun garbage collection after advancing the consumer
 	 * if it referenced the oldest vclock.
 	 */
-	struct gc_consumer *leftmost = gc_tree_first(&gc.consumers);
+	struct gc_consumer *leftmost = gc_tree_first_alive(&gc.consumers);
 	if (leftmost == NULL || leftmost->signature > prev_signature)
 		gc_run();
 }
diff --git a/src/box/gc.h b/src/box/gc.h
index 634ce6d38..74aa7ce83 100644
--- a/src/box/gc.h
+++ b/src/box/gc.h
@@ -40,6 +40,11 @@ extern "C" {
 
 struct gc_consumer;
 
+enum gc_consumer_state {
+	CONSUMER_OK = 0,
+	CONSUMER_OFF = 1,
+};
+
 /**
  * Initialize the garbage collection state.
  */
@@ -88,6 +93,14 @@ gc_consumer_register(const char *name, int64_t signature);
 void
 gc_consumer_unregister(struct gc_consumer *consumer);
 
+/**
+ * Mark the alive consumer with the least recent vclock as dead and
+ * invoke garbage collection if needed. Originally created for cases
+ * of running out of disk space because of a disconnected replica.
+ */
+void
+gc_leftmost_mark_dead(void);
+
 /**
  * Advance the vclock signature tracked by a consumer and
  * invoke garbage collection if needed.
diff --git a/src/box/journal.h b/src/box/journal.h
index 1d64a7bd1..e0ca240a2 100644
--- a/src/box/journal.h
+++ b/src/box/journal.h
@@ -62,6 +62,8 @@ struct journal_entry {
 	 * The number of rows in the request.
 	 */
 	int n_rows;
+	/** Contains last error if any */
+	int errcode;
 	/**
 	 * The rows.
 	 */
diff --git a/src/box/txn.c b/src/box/txn.c
index e25c0e0e0..fd54cfb4a 100644
--- a/src/box/txn.c
+++ b/src/box/txn.c
@@ -270,7 +270,10 @@ txn_write_to_wal(struct txn *txn)
 		 * pending rollbacks are processed.
 		 */
 		fiber_reschedule();
-		diag_set(ClientError, ER_WAL_IO);
+		struct error *e = diag_last_error(diag_get());
+		if (e == NULL || e->type != &type_ClientError ||
+		    box_error_code(e) != ER_NO_DISK_SPACE)
+			diag_set(ClientError, ER_WAL_IO);
 		diag_log();
 	} else if (stop - start > too_long_threshold) {
 		say_warn("too long WAL write: %d rows at LSN %lld: %.3f sec",
diff --git a/src/box/wal.cc b/src/box/wal.cc
index 099c70caa..647fbd6b1 100644
--- a/src/box/wal.cc
+++ b/src/box/wal.cc
@@ -34,6 +34,7 @@
 #include "fiber.h"
 #include "fio.h"
 #include "errinj.h"
+#include "error.h"
 
 #include "xlog.h"
 #include "xrow.h"
@@ -629,12 +630,13 @@ wal_write_to_disk(struct cmsg *msg)
 	/*
 	 * Iterate over requests (transactions)
 	 */
-	struct journal_entry *entry;
+	struct journal_entry *entry, *prev_entry = NULL;
 	struct stailq_entry *last_committed = NULL;
 	stailq_foreach_entry(entry, &wal_msg->commit, fifo) {
 		wal_assign_lsn(writer, entry->rows, entry->rows + entry->n_rows);
 		entry->res = vclock_sum(&writer->vclock);
 		int rc = xlog_write_entry(l, entry);
+		prev_entry = entry;
 		if (rc < 0)
 			goto done;
 		if (rc > 0)
@@ -652,6 +654,7 @@ done:
 		/* Until we can pass the error to tx, log it and clear. */
 		error_log(error);
 		diag_clear(diag_get());
+		prev_entry->errcode = errno;
 	}
 	/*
 	 * We need to start rollback from the first request
@@ -788,6 +791,8 @@ wal_write(struct journal *journal, struct journal_entry *entry)
 			--last;
 		}
 	}
+	if (entry->errcode == ENOSPC)
+		diag_set(ClientError, ER_NO_DISK_SPACE);
 	return entry->res;
 }
 
diff --git a/src/errinj.h b/src/errinj.h
index ed69b6cb0..438095294 100644
--- a/src/errinj.h
+++ b/src/errinj.h
@@ -109,6 +109,7 @@ struct errinj {
 	_(ERRINJ_VY_RUN_WRITE_STMT_TIMEOUT, ERRINJ_DOUBLE, {.dparam = 0}) \
 	_(ERRINJ_IPROTO_TX_DELAY, ERRINJ_BOOL, {.bparam = false}) \
 	_(ERRINJ_LOG_ROTATE, ERRINJ_BOOL, {.bparam = false}) \
+	_(ERRINJ_NO_DISK_SPACE, ERRINJ_BOOL, {.bparam = false}) \
 
 ENUM0(errinj_id, ERRINJ_LIST);
 extern struct errinj errinjs[];
diff --git a/src/fio.c b/src/fio.c
index b79d3d058..cdea11e87 100644
--- a/src/fio.c
+++ b/src/fio.c
@@ -29,6 +29,7 @@
  * SUCH DAMAGE.
  */
 #include "fio.h"
+#include "errinj.h"
 
 #include <sys/types.h>
 
@@ -141,6 +142,12 @@ fio_writev(int fd, struct iovec *iov, int iovcnt)
 	ssize_t nwr;
 restart:
 	nwr = writev(fd, iov, iovcnt);
+	/* Simulate running out of disk space to force the gc to clean logs. */
+	struct errinj *inj = errinj(ERRINJ_NO_DISK_SPACE, ERRINJ_BOOL);
+	if (inj != NULL && inj->bparam) {
+		errno = ENOSPC;
+		nwr = -1;
+	}
 	if (nwr < 0) {
 		if (errno == EINTR) {
 			errno = 0;
diff --git a/test/box/errinj.result b/test/box/errinj.result
index 9e3a0bfab..4d85156a9 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -54,6 +54,8 @@ errinj.info()
     state: false
   ERRINJ_VY_RUN_WRITE:
     state: false
+  ERRINJ_NO_DISK_SPACE:
+    state: false
   ERRINJ_RELAY_FINAL_SLEEP:
     state: false
   ERRINJ_VY_RUN_DISCARD:
diff --git a/test/box/misc.result b/test/box/misc.result
index cd3af7f9e..0253a0cde 100644
--- a/test/box/misc.result
+++ b/test/box/misc.result
@@ -353,6 +353,7 @@ t;
   - 'box.error.NONMASTER : 6'
   - 'box.error.MEMTX_MAX_TUPLE_SIZE : 110'
   - 'box.error.DROP_FUNCTION : 71'
+  - 'box.error.NO_DISK_SPACE : 154'
   - 'box.error.CFG : 59'
   - 'box.error.NO_SUCH_FIELD : 37'
   - 'box.error.CONNECTION_TO_SELF : 117'
diff --git a/test/replication/force_gc_on_err_nospace.result b/test/replication/force_gc_on_err_nospace.result
new file mode 100644
index 000000000..ddddd55fa
--- /dev/null
+++ b/test/replication/force_gc_on_err_nospace.result
@@ -0,0 +1,103 @@
+env = require('test_run')
+---
+...
+test_run = env.new()
+---
+...
+engine = test_run:get_cfg('engine')
+---
+...
+box.schema.user.grant('guest', 'read,write,execute', 'universe')
+---
+...
+s = box.schema.space.create('test', {engine = engine});
+---
+...
+index = s:create_index('primary', {type = 'tree'})
+---
+...
+errinj = box.error.injection
+---
+...
+fio = require('fio')
+---
+...
+box.schema.user.grant('guest', 'replication')
+---
+...
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
+---
+- true
+...
+test_run:cmd("start server replica")
+---
+- true
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+repl = box.cfg.replication
+---
+...
+box.cfg{replication = ""}
+---
+...
+test_run:cmd("switch default")
+---
+- true
+...
+for i = 1, 5 do s:insert{i} box.snapshot() end
+---
+...
+s:select()
+---
+- - [1]
+  - [2]
+  - [3]
+  - [4]
+  - [5]
+...
+#fio.glob(fio.pathjoin(fio.abspath("."), 'master/*.xlog'))
+---
+- 6
+...
+errinj.set("ERRINJ_NO_DISK_SPACE", true)
+---
+- ok
+...
+function insert(a) s:insert(a) end
+---
+...
+_, err = pcall(insert, {6})
+---
+...
+err:match("ailed to write")
+---
+- ailed to write
+...
+#fio.glob(fio.pathjoin(fio.abspath("."), 'master/*.xlog'))
+---
+- 2
+...
+-- cleanup
+test_run:cmd("switch replica")
+---
+- true
+...
+test_run:cmd("restart server default with cleanup=1")
+---
+- true
+...
+test_run:cmd("switch default")
+---
+- true
+...
+test_run:cmd("stop server replica")
+---
+- true
+...
+test_run:cmd("cleanup server replica")
+---
+- true
+...
diff --git a/test/replication/force_gc_on_err_nospace.test.lua b/test/replication/force_gc_on_err_nospace.test.lua
new file mode 100644
index 000000000..2e1e3e75d
--- /dev/null
+++ b/test/replication/force_gc_on_err_nospace.test.lua
@@ -0,0 +1,36 @@
+env = require('test_run')
+test_run = env.new()
+engine = test_run:get_cfg('engine')
+
+box.schema.user.grant('guest', 'read,write,execute', 'universe')
+
+s = box.schema.space.create('test', {engine = engine});
+index = s:create_index('primary', {type = 'tree'})
+
+errinj = box.error.injection
+fio = require('fio')
+
+box.schema.user.grant('guest', 'replication')
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
+test_run:cmd("start server replica")
+
+test_run:cmd("switch replica")
+repl = box.cfg.replication
+box.cfg{replication = ""}
+test_run:cmd("switch default")
+
+for i = 1, 5 do s:insert{i} box.snapshot() end
+s:select()
+#fio.glob(fio.pathjoin(fio.abspath("."), 'master/*.xlog'))
+errinj.set("ERRINJ_NO_DISK_SPACE", true)
+function insert(a) s:insert(a) end
+_, err = pcall(insert, {6})
+err:match("ailed to write")
+#fio.glob(fio.pathjoin(fio.abspath("."), 'master/*.xlog'))
+
+-- cleanup
+test_run:cmd("switch replica")
+test_run:cmd("restart server default with cleanup=1")
+test_run:cmd("switch default")
+test_run:cmd("stop server replica")
+test_run:cmd("cleanup server replica")
-- 
2.14.3 (Apple Git-98)





More information about the Tarantool-patches mailing list