From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id 9A96B26F9E for ; Tue, 3 Jul 2018 09:03:54 -0400 (EDT) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id kV5uVKeoeK7w for ; Tue, 3 Jul 2018 09:03:54 -0400 (EDT) Received: from smtp37.i.mail.ru (smtp37.i.mail.ru [94.100.177.97]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id 341D926FA0 for ; Tue, 3 Jul 2018 09:03:54 -0400 (EDT) Received: by smtp37.i.mail.ru with esmtpa (envelope-from ) id 1faKyG-0005b1-Mg for tarantool-patches@freelists.org; Tue, 03 Jul 2018 16:03:53 +0300 From: Konstantin Belyavskiy Subject: [tarantool-patches] [PATCH v4 2/2] replication: force gc to clean xdir on ENOSPC err Date: Tue, 3 Jul 2018 16:03:47 +0300 Message-Id: <20180703130347.26296-3-k.belyavskiy@tarantool.org> In-Reply-To: <20180703130347.26296-1-k.belyavskiy@tarantool.org> References: <20180703130347.26296-1-k.belyavskiy@tarantool.org> Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-help: List-unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-subscribe: List-owner: List-post: List-archive: To: tarantool-patches@freelists.org The garbage collector does not delete an xlog until the replica notifies the master with a newer vclock. This can lead to a running-out-of-disk-space error, which is not the right behaviour since it will stop the master. Fix it by forcing gc to clean xlogs for the replica with the highest lag. Add an error injection and a test. Changes in V2: - Promote the error from wal_thread to tx via cpipe. Changes in V3: - Delete consumers, and only those of replicas (but not of backups). 
Changes in V4: - Bug fix and small changes according to review. Closes #3397 --- src/box/box.cc | 1 + src/box/gc.c | 49 +++++++++ src/box/gc.h | 16 +++ src/box/relay.cc | 1 + src/box/wal.cc | 25 +++++ src/errinj.h | 1 + src/fio.c | 7 ++ test/box/errinj.result | 2 + test/replication/kick_dead_replica_on_enspc.result | 121 +++++++++++++++++++++ .../kick_dead_replica_on_enspc.test.lua | 56 ++++++++++ test/replication/suite.ini | 2 +- 11 files changed, 280 insertions(+), 1 deletion(-) create mode 100644 test/replication/kick_dead_replica_on_enspc.result create mode 100644 test/replication/kick_dead_replica_on_enspc.test.lua diff --git a/src/box/box.cc b/src/box/box.cc index e3eb2738f..ba894c33a 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -1370,6 +1370,7 @@ box_process_join(struct ev_io *io, struct xrow_header *header) replica = replica_by_uuid(&instance_uuid); assert(replica != NULL); replica->gc = gc; + gc_consumer_set_replica(gc, replica); gc_guard.is_active = false; /* Remember master's vclock after the last request */ diff --git a/src/box/gc.c b/src/box/gc.c index 12e68f3dc..ba77d8124 100644 --- a/src/box/gc.c +++ b/src/box/gc.c @@ -61,6 +61,8 @@ struct gc_consumer { char *name; /** The vclock signature tracked by this consumer. */ int64_t signature; + /** Replica associated with consumer (if any). */ + struct replica *replica; }; typedef rb_tree(struct gc_consumer) gc_tree_t; @@ -123,10 +125,18 @@ gc_consumer_new(const char *name, int64_t signature) return consumer; } +void +gc_consumer_set_replica(struct gc_consumer *gc, struct replica *replica) +{ + gc->replica = replica; +} + /** Free a consumer object. 
*/ static void gc_consumer_delete(struct gc_consumer *consumer) { + if (consumer->replica != NULL) + consumer->replica->gc = NULL; free(consumer->name); TRASH(consumer); free(consumer); @@ -216,6 +226,45 @@ gc_set_checkpoint_count(int checkpoint_count) gc.checkpoint_count = checkpoint_count; } +void +gc_xdir_clean_notify() +{ + /* + * Compare the current time with the time of the last run. + * This is needed in case of multiple failures to prevent + * from deleting all replicas. + */ + static double prev_time = 0.; + double cur_time = ev_monotonic_time(); + if (cur_time - prev_time < 1.) + return; + prev_time = cur_time; + /** + * Delete consumer with the least recent vclock and start + * garbage collection. If nothing to delete find next + * consumer etc. Originally created for cases with running + * out of disk space because of disconnected replica. + */ + struct gc_consumer *leftmost = + gc_tree_first(&gc.consumers); + /* + * Exit if no consumers left or if this consumer is + * not associated with replica (backup for example). + */ + if (leftmost == NULL || leftmost->replica == NULL) + return; + int64_t signature = leftmost->signature; + while (true) { + gc_consumer_unregister(leftmost); + leftmost = gc_tree_first(&gc.consumers); + if (leftmost == NULL || leftmost->replica == NULL || + leftmost->signature > signature) { + gc_run(); + return; + } + } +} + struct gc_consumer * gc_consumer_register(const char *name, int64_t signature) { diff --git a/src/box/gc.h b/src/box/gc.h index 634ce6d38..15d966f54 100644 --- a/src/box/gc.h +++ b/src/box/gc.h @@ -31,9 +31,12 @@ * SUCH DAMAGE. */ +#include #include #include +#include "replication.h" + #if defined(__cplusplus) extern "C" { #endif /* defined(__cplusplus) */ @@ -81,6 +84,12 @@ gc_set_checkpoint_count(int checkpoint_count); struct gc_consumer * gc_consumer_register(const char *name, int64_t signature); +/** + * Bind consumer with associated replica (if any). 
+ */ +void +gc_consumer_set_replica(struct gc_consumer *gc, struct replica *replica); + /** * Unregister a consumer and invoke garbage collection * if needed. @@ -88,6 +97,13 @@ gc_consumer_register(const char *name, int64_t signature); void gc_consumer_unregister(struct gc_consumer *consumer); +/** + * Notify gc to clean xdir because of running out + * of disk space. + */ +void +gc_xdir_clean_notify(); + /** * Advance the vclock signature tracked by a consumer and * invoke garbage collection if needed. diff --git a/src/box/relay.cc b/src/box/relay.cc index d2ceaf110..c317775a4 100644 --- a/src/box/relay.cc +++ b/src/box/relay.cc @@ -535,6 +535,7 @@ relay_subscribe(int fd, uint64_t sync, struct replica *replica, vclock_sum(replica_clock)); if (replica->gc == NULL) diag_raise(); + gc_consumer_set_replica(replica->gc, replica); } struct relay relay; diff --git a/src/box/wal.cc b/src/box/wal.cc index 93c350e1f..f6de97cef 100644 --- a/src/box/wal.cc +++ b/src/box/wal.cc @@ -41,6 +41,7 @@ #include "cbus.h" #include "coio_task.h" #include "replication.h" +#include "gc.h" const char *wal_mode_STRS[] = { "none", "write", "fsync", NULL }; @@ -64,6 +65,8 @@ struct wal_thread { * priority pipe and DOES NOT support yield. */ struct cpipe tx_prio_pipe; + /** Return pipe from 'wal' to tx' */ + struct cpipe tx_pipe; }; /* @@ -584,6 +587,13 @@ wal_assign_lsn(struct wal_writer *writer, struct xrow_header **row, } } +static void +gc_status_update(struct cmsg *msg) +{ + gc_xdir_clean_notify(); + free(msg); +} + static void wal_write_to_disk(struct cmsg *msg) { @@ -655,6 +665,19 @@ done: /* Until we can pass the error to tx, log it and clear. 
*/ error_log(error); diag_clear(diag_get()); + if (errno == ENOSPC) { + struct cmsg *msg = + (struct cmsg*)calloc(1, sizeof(struct cmsg)); + if (msg == NULL) { + say_error("failed to allocate cmsg"); + } else { + static const struct cmsg_hop route[] = { + {gc_status_update, NULL} + }; + cmsg_init(msg, route); + cpipe_push(&wal_thread.tx_pipe, msg); + } + } } /* * We need to start rollback from the first request @@ -695,6 +718,7 @@ wal_thread_f(va_list ap) * even when tx fiber pool is used up by net messages. */ cpipe_create(&wal_thread.tx_prio_pipe, "tx_prio"); + cpipe_create(&wal_thread.tx_pipe, "tx"); cbus_loop(&endpoint); @@ -707,6 +731,7 @@ wal_thread_f(va_list ap) xlog_close(&vy_log_writer.xlog, false); cpipe_destroy(&wal_thread.tx_prio_pipe); + cpipe_destroy(&wal_thread.tx_pipe); return 0; } diff --git a/src/errinj.h b/src/errinj.h index 895d938d5..11f1b7fdc 100644 --- a/src/errinj.h +++ b/src/errinj.h @@ -112,6 +112,7 @@ struct errinj { _(ERRINJ_LOG_ROTATE, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_SNAP_COMMIT_DELAY, ERRINJ_BOOL, {.bparam = 0}) \ _(ERRINJ_SNAP_WRITE_ROW_TIMEOUT, ERRINJ_DOUBLE, {.dparam = 0}) \ + _(ERRINJ_NO_DISK_SPACE, ERRINJ_BOOL, {.bparam = false}) \ ENUM0(errinj_id, ERRINJ_LIST); extern struct errinj errinjs[]; diff --git a/src/fio.c b/src/fio.c index b79d3d058..cdea11e87 100644 --- a/src/fio.c +++ b/src/fio.c @@ -29,6 +29,7 @@ * SUCH DAMAGE. */ #include "fio.h" +#include "errinj.h" #include @@ -141,6 +142,12 @@ fio_writev(int fd, struct iovec *iov, int iovcnt) ssize_t nwr; restart: nwr = writev(fd, iov, iovcnt); + /* Simulate running out of disk space to force the gc to clean logs. 
*/ + struct errinj *inj = errinj(ERRINJ_NO_DISK_SPACE, ERRINJ_BOOL); + if (inj != NULL && inj->bparam) { + errno = ENOSPC; + nwr = -1; + } if (nwr < 0) { if (errno == EINTR) { errno = 0; diff --git a/test/box/errinj.result b/test/box/errinj.result index 21a949965..a28688436 100644 --- a/test/box/errinj.result +++ b/test/box/errinj.result @@ -56,6 +56,8 @@ errinj.info() state: false ERRINJ_VY_RUN_WRITE: state: false + ERRINJ_NO_DISK_SPACE: + state: false ERRINJ_VY_LOG_FLUSH_DELAY: state: false ERRINJ_SNAP_COMMIT_DELAY: diff --git a/test/replication/kick_dead_replica_on_enspc.result b/test/replication/kick_dead_replica_on_enspc.result new file mode 100644 index 000000000..53ecc86a8 --- /dev/null +++ b/test/replication/kick_dead_replica_on_enspc.result @@ -0,0 +1,121 @@ +env = require('test_run') +--- +... +vclock_diff = require('fast_replica').vclock_diff +--- +... +test_run = env.new() +--- +... +SERVERS = { 'autobootstrap1', 'autobootstrap2', 'autobootstrap3' } +--- +... +-- +-- Start servers +-- +test_run:create_cluster(SERVERS) +--- +... +-- +-- Wait for full mesh +-- +test_run:wait_fullmesh(SERVERS) +--- +... +-- +-- Check vclock +-- +vclock1 = test_run:get_vclock('autobootstrap1') +--- +... +vclock_diff(vclock1, test_run:get_vclock('autobootstrap2')) +--- +- 0 +... +vclock_diff(vclock1, test_run:get_vclock('autobootstrap3')) +--- +- 0 +... +-- +-- Switch off third replica +-- +test_run:cmd("switch autobootstrap3") +--- +- true +... +repl = box.cfg.replication +--- +... +box.cfg{replication = ""} +--- +... +-- +-- Insert rows +-- +test_run:cmd("switch autobootstrap1") +--- +- true +... +s = box.space.test +--- +... +for i = 1, 5 do s:insert{i} box.snapshot() end +--- +... +s:select() +--- +- - [1] + - [2] + - [3] + - [4] + - [5] +... +fio = require('fio') +--- +... +path = fio.pathjoin(fio.abspath("."), 'autobootstrap1/*.xlog') +--- +... +-- Depend on first master is a leader or not it should be 5 or 6. +#fio.glob(path) >= 5 +--- +- true +... 
+errinj = box.error.injection +--- +... +errinj.set("ERRINJ_NO_DISK_SPACE", true) +--- +- ok +... +function insert(a) s:insert(a) end +--- +... +_, err = pcall(insert, {6}) +--- +... +err:match("ailed to write") +--- +- ailed to write +... +-- add a little timeout so gc could finish job +fiber = require('fiber') +--- +... +while #fio.glob(path) ~= 2 do fiber.sleep(0.01) end +--- +... +#fio.glob(path) +--- +- 2 +... +test_run:cmd("switch default") +--- +- true +... +-- +-- Stop servers +-- +test_run:drop_cluster(SERVERS) +--- +... diff --git a/test/replication/kick_dead_replica_on_enspc.test.lua b/test/replication/kick_dead_replica_on_enspc.test.lua new file mode 100644 index 000000000..88cb9df63 --- /dev/null +++ b/test/replication/kick_dead_replica_on_enspc.test.lua @@ -0,0 +1,56 @@ +env = require('test_run') +vclock_diff = require('fast_replica').vclock_diff +test_run = env.new() + + +SERVERS = { 'autobootstrap1', 'autobootstrap2', 'autobootstrap3' } + +-- +-- Start servers +-- +test_run:create_cluster(SERVERS) + +-- +-- Wait for full mesh +-- +test_run:wait_fullmesh(SERVERS) + +-- +-- Check vclock +-- +vclock1 = test_run:get_vclock('autobootstrap1') +vclock_diff(vclock1, test_run:get_vclock('autobootstrap2')) +vclock_diff(vclock1, test_run:get_vclock('autobootstrap3')) + +-- +-- Switch off third replica +-- +test_run:cmd("switch autobootstrap3") +repl = box.cfg.replication +box.cfg{replication = ""} + +-- +-- Insert rows +-- +test_run:cmd("switch autobootstrap1") +s = box.space.test +for i = 1, 5 do s:insert{i} box.snapshot() end +s:select() +fio = require('fio') +path = fio.pathjoin(fio.abspath("."), 'autobootstrap1/*.xlog') +-- Depend on first master is a leader or not it should be 5 or 6. 
+#fio.glob(path) >= 5 +errinj = box.error.injection +errinj.set("ERRINJ_NO_DISK_SPACE", true) +function insert(a) s:insert(a) end +_, err = pcall(insert, {6}) +err:match("ailed to write") +-- add a little timeout so gc could finish job +fiber = require('fiber') +while #fio.glob(path) ~= 2 do fiber.sleep(0.01) end +#fio.glob(path) +test_run:cmd("switch default") +-- +-- Stop servers +-- +test_run:drop_cluster(SERVERS) diff --git a/test/replication/suite.ini b/test/replication/suite.ini index b489add58..27815acb6 100644 --- a/test/replication/suite.ini +++ b/test/replication/suite.ini @@ -3,7 +3,7 @@ core = tarantool script = master.lua description = tarantool/box, replication disabled = consistent.test.lua -release_disabled = catch.test.lua errinj.test.lua gc.test.lua before_replace.test.lua quorum.test.lua recover_missing_xlog.test.lua +release_disabled = catch.test.lua errinj.test.lua gc.test.lua before_replace.test.lua kick_dead_replica_on_enspc.test.lua quorum.test.lua recover_missing_xlog.test.lua config = suite.cfg lua_libs = lua/fast_replica.lua long_run = prune.test.lua -- 2.14.3 (Apple Git-98)