From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtpng2.m.smailru.net (smtpng2.m.smailru.net [94.100.179.3]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id 37F8D45C304 for ; Fri, 18 Dec 2020 01:44:54 +0300 (MSK) References: <51c5f48d04df7d43e4b773e8d9c2cdad76137dc1.1607879643.git.v.shpilevoy@tarantool.org> <1f433ed9-b8c8-f67a-fb7c-3da7cc504110@tarantool.org> From: Vladislav Shpilevoy Message-ID: Date: Thu, 17 Dec 2020 23:44:51 +0100 MIME-Version: 1.0 In-Reply-To: <1f433ed9-b8c8-f67a-fb7c-3da7cc504110@tarantool.org> Content-Type: text/plain; charset="utf-8" Content-Language: en-US Content-Transfer-Encoding: 8bit Subject: Re: [Tarantool-patches] [PATCH 4/8] test: introduce raft unit tests List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Serge Petrenko , tarantool-patches@dev.tarantool.org Hi! Thanks for the review! >> diff --git a/test/unit/raft.c b/test/unit/raft.c >> new file mode 100644 >> index 000000000..dfb5f8e43 >> --- /dev/null >> +++ b/test/unit/raft.c >> + >> +static void >> +raft_test_vote_skip(void) >> +{ >> +    raft_start_test(33); >> +    struct raft_node node; >> +    raft_node_create(&node); >> + >> +    /* Everything is skipped if the term is outdated. */ > > > 1. Let's also test a case when vote response has greater term than the candidate itself. Good idea. Below is the diff (without .result file diff, to be short). ==================== @@ -346,7 +346,7 @@ raft_test_vote(void) static void raft_test_vote_skip(void) { - raft_start_test(33); + raft_start_test(37); struct raft_node node; raft_node_create(&node); @@ -507,6 +507,35 @@ raft_test_vote_skip(void) raft_node_cfg_is_candidate(&node, true); + /* + * Vote response with a bigger term must be skipped, but it will bump + * the term. + */ + + /* Re-create the node so as not to write the vclock each time. */ + raft_node_destroy(&node); + raft_node_create(&node); + + raft_run_next_event(); + is(node.raft.state, RAFT_STATE_CANDIDATE, "became candidate"); + is(node.raft.term, 2, "term is bumped"); + + is(raft_node_send_vote_response(&node, + 3 /* Term. */, + 1 /* Vote. */, + 2 /* Source. */ + ), 0, "message is accepted"); + + ok(raft_node_check_full_state(&node, + RAFT_STATE_CANDIDATE /* State. */, + 0 /* Leader. */, + 3 /* Term. */, + 1 /* Vote. */, + 3 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 3}" /* Vclock. */ + ), "term is bumped and became candidate"); + raft_node_destroy(&node); raft_finish_test(); } ==================== >> + >> +    raft_run_next_event(); >> +    is(node.raft.state, RAFT_STATE_CANDIDATE, "became candidate"); >> +    is(node.raft.term, 2, "term is bumped"); >> + >> +    is(raft_node_send_vote_response(&node, >> +        1 /* Term. */, >> +        1 /* Vote. */, >> +        2 /* Source. */ >> +    ), 0, "message is accepted"); >> +    is(node.raft.vote_count, 1, "but ignored - too old term"); >> + >> +    /* Competing vote requests are skipped. */ >> + >> +    is(raft_node_send_vote_response(&node, >> +        2 /* Term. */, >> +        3 /* Vote. */, >> +        2 /* Source. */ >> +    ), 0, "message is accepted"); >> +    is(node.raft.vote_count, 1, "but ignored - vote not for this node"); >> +    is(node.raft.state, RAFT_STATE_CANDIDATE, "this node does not give up"); >> + >> +    /* Vote requests are ignored when node is disabled. */ >> + >> +    raft_node_cfg_is_enabled(&node, false); >> + >> +    is(raft_node_send_follower(&node, >> +        3 /* Term. */, >> +        2 /* Source. */ >> +    ), 0, "message is accepted"); >> +    ok(raft_node_check_full_state(&node, >> +        RAFT_STATE_FOLLOWER /* State. */, >> +        0 /* Leader. */, >> +        3 /* Term. */, >> +        0 /* Vote. */, >> +        3 /* Volatile term. */, >> +        0 /* Volatile vote. */, >> +        "{0: 2}" /* Vclock. */ >> +    ), "term bump to be able to vote again"); >> +    is(raft_node_send_vote_request(&node, >> +        3 /* Term. */, >> +        "{}" /* Vclock. */, >> +        2 /* Source. */ >> +    ), 0, "message is accepted"); >> +    is(node.raft.vote, 0, "but ignored - node is disabled"); >> + >> +    /* Disabled node still takes term from the vote request. */ >> + >> +    is(raft_node_send_vote_request(&node, >> +        4 /* Term. */, >> +        "{}" /* Vclock. */, >> +        2 /* Source. */ >> +    ), 0, "message is accepted"); >> +    ok(raft_node_check_full_state(&node, >> +        RAFT_STATE_FOLLOWER /* State. */, >> +        0 /* Leader. */, >> +        4 /* Term. */, >> +        0 /* Vote. */, >> +        4 /* Volatile term. */, >> +        0 /* Volatile vote. */, >> +        "{0: 3}" /* Vclock. */ >> +    ), "term is bumped, but vote request is ignored"); >> + >> +    raft_node_cfg_is_enabled(&node, true); >> + >> +    /* Not a candidate won't accept vote request for self. */ >> + >> +    is(raft_node_send_vote_response(&node, >> +        4 /* Term. */, >> +        1 /* Vote. */, >> +        2 /* Source. */ >> +    ), 0, "message is accepted"); >> +    is(node.raft.vote, 0, "but ignored - vote works only on a candidate"); >> + >> +    /* Ignore vote response for some third node. */ >> + >> +    is(raft_node_send_vote_response(&node, >> +        4 /* Term. */, >> +        3 /* Vote. */, >> +        2 /* Source. */ >> +    ), 0, "message is accepted"); >> +    is(node.raft.vote, 0, "but ignored - sender != vote, so it is not a " >> +       "request"); >> + >> +    /* Ignore if leader is already known. */ >> + >> +    is(raft_node_send_leader(&node, >> +        4 /* Term. */, >> +        2 /* Source. */ >> +    ), 0, "message is accepted"); >> +    is(node.raft.leader, 2, "leader is accepted"); >> + >> +    is(raft_node_send_vote_request(&node, >> +        4 /* Term. */, >> +        "{}" /* Vclock. */, >> +        3 /* Source. */ >> +    ), 0, "message is accepted"); >> +    is(node.raft.vote, 0, "but ignored - leader is already known"); >> +    is(node.raft.leader, 2, "leader is not changed"); >> + >> +    /* Ignore too small vclock. */ >> + >> +    /* >> +     * Need to turn off the candidate role to bump the term and not become >> +     * a candidate. >> +     */ >> +    raft_node_cfg_is_candidate(&node, false); >> + >> +    raft_node_journal_follow(&node, 1, 5); >> +    raft_node_journal_follow(&node, 2, 5); >> +    ok(raft_node_check_full_state(&node, >> +        RAFT_STATE_FOLLOWER /* State. */, >> +        2 /* Leader. */, >> +        4 /* Term. */, >> +        0 /* Vote. */, >> +        4 /* Volatile term. */, >> +        0 /* Volatile vote. */, >> +        "{0: 3, 1: 5, 2: 5}" /* Vclock. */ >> +    ), "vclock is bumped"); >> + >> +    is(raft_node_send_vote_request(&node, >> +        5 /* Term. */, >> +        "{1: 4}" /* Vclock. */, >> +        3 /* Source. */ >> +    ), 0, "message is accepted"); >> +    is(node.raft.vote, 0, "but ignored - vclock is too small"); >> +    is(node.raft.term, 5, "new term"); >> +    is(node.raft.leader, 0, "leader is dropped in the new term"); >> + >> +    /* Ignore incomparable vclock. */ >> + >> +    is(raft_node_send_vote_request(&node, >> +        5 /* Term. */, >> +        "{1: 4, 2: 6}" /* Vclock. */, >> +        3 /* Source. */ >> +    ), 0, "message is accepted"); >> +    is(node.raft.vote, 0, "but ignored - vclock is incomparable"); >> + >> +    /* Ignore if voted in the current term. */ >> + >> +    is(raft_node_send_vote_request(&node, >> +        6 /* Term. */, >> +        "{1: 5, 2: 5}" /* Vclock. */, >> +        2 /* Source. */ >> +    ), 0, "message is accepted"); >> +    is(node.raft.vote, 2, "voted"); >> + >> +    is(raft_node_send_vote_request(&node, >> +        6 /* Term. */, >> +        "{1: 5, 2: 5}" /* Vclock. */, >> +        3 /* Source. */ >> +    ), 0, "message is accepted"); >> +    is(node.raft.vote, 2, "but ignored - already voted in the term"); > > > 2. I'd also check the case that the node ignores vote request after restart. Hm. What do you mean? Why should it? Do you mean if it voted in the current term and then restarted? I added this: ==================== @@ -346,7 +346,7 @@ raft_test_vote(void) static void raft_test_vote_skip(void) { - raft_start_test(37); + raft_start_test(39); struct raft_node node; raft_node_create(&node); @@ -505,6 +505,16 @@ raft_test_vote_skip(void) ), 0, "message is accepted"); is(node.raft.vote, 2, "but ignored - already voted in the term"); + /* After restart it still will ignore requests in the current term. */ + + raft_node_restart(&node); + is(raft_node_send_vote_request(&node, + 6 /* Term. */, + "{1: 5, 2: 5}" /* Vclock. */, + 3 /* Source. */ + ), 0, "message is accepted"); + is(node.raft.vote, 2, "but ignored - already voted in the term"); + raft_node_cfg_is_candidate(&node, true); /* ==================== >> + >> +static void >> +raft_test_heartbeat(void) >> +{ >> +    raft_start_test(12); >> +    struct raft_node node; >> +    raft_node_create(&node); >> + >> +    /* Let the node know there is a leader somewhere. */ >> + >> +    is(raft_node_send_leader(&node, >> +        2 /* Term. */, >> +        2 /* Source. */ >> +    ), 0, "leader notification"); >> +    ok(raft_node_check_full_state(&node, >> +        RAFT_STATE_FOLLOWER /* State. */, >> +        2 /* Leader. */, >> +        2 /* Term. */, >> +        0 /* Vote. */, >> +        2 /* Volatile term. */, >> +        0 /* Volatile vote. */, >> +        "{0: 1}" /* Vclock. */ >> +    ), "follow the leader after notification"); >> + >> +    /* Leader can send the same message many times. */ >> + >> +    is(raft_node_send_leader(&node, >> +        2 /* Term. */, >> +        2 /* Source. */ >> +    ), 0, "leader notification"); >> + >> +    /* The node won't do anything if it is not a candidate. */ >> + >> +    raft_node_cfg_is_candidate(&node, false); >> +    raft_run_for(node.cfg_death_timeout * 2); >> +    ok(raft_node_check_full_state(&node, >> +        RAFT_STATE_FOLLOWER /* State. */, >> +        2 /* Leader. */, >> +        2 /* Term. */, >> +        0 /* Vote. */, >> +        2 /* Volatile term. */, >> +        0 /* Volatile vote. */, >> +        "{0: 1}" /* Vclock. */ >> +    ), "follow the leader because no candidate"); > > > 3. Shouldn't the node reset the leader to 0 even if if it isn't a candidate itself? I am not sure I parsed the sentence correctly. Did you said that the node should reset the leader if raft_node_cfg_is_candidate(false) is called? But why? If it is not a leader, it is a voter. It still must know who is the leader to decide whether to accept rows from its peers. Or do you mean we should still reset the leader after the timeout, but don't start new election? >> +    raft_node_cfg_is_candidate(&node, true); >> + >> +    /* Heartbeats from the leader are accepted. */ >> + >> +    for (int i = 0; i < 5; ++i) { >> +        raft_run_for(node.cfg_death_timeout / 2); >> +        raft_node_send_heartbeat(&node, 2); >> +    } >> +    ok(raft_node_check_full_state(&node, >> +        RAFT_STATE_FOLLOWER /* State. */, >> +        2 /* Leader. */, >> +        2 /* Term. */, >> +        0 /* Vote. */, >> +        2 /* Volatile term. */, >> +        0 /* Volatile vote. */, >> +        "{0: 1}" /* Vclock. */ >> +    ), "follow the leader because had heartbeats"); >> + >> +    /* Heartbeats not from the leader won't do anything. */ >> + >> +    double start = raft_time(); >> +    raft_run_for(node.cfg_death_timeout / 3); >> +    raft_node_send_heartbeat(&node, 3); >> +    raft_run_for(node.cfg_death_timeout / 3); >> +    raft_node_send_heartbeat(&node, 0); >> +    raft_run_next_event(); >> +    ok(raft_time() >= start + node.cfg_death_timeout, "death timeout " >> +       "passed"); > > > 4. Looks like this test  would succeed even if heartbeats from other >    instances were counted as leader heartbeats. You simply say >    `raft_run_next_event()` which'll make the death timeout fire sooner >    or later anyway. I think it's better to add another >    `raft_run_for(node.cfg_death_timeout / 2)` to make sure the node >    times out exactly when it should. I am again not 100% sure I understood correctly, but hopefully you meant this, and here I agree: ==================== @@ -745,8 +755,13 @@ raft_test_heartbeat(void) raft_run_for(node.cfg_death_timeout / 3); raft_node_send_heartbeat(&node, 0); raft_run_next_event(); - ok(raft_time() >= start + node.cfg_death_timeout, "death timeout " - "passed"); + double deadline = start + node.cfg_death_timeout; + /* + * Compare == with 0.1 precision. Because '/ 3' operations above will + * make the doubles contain some small garbage. + */ + ok(raft_time() + 0.1 >= deadline && raft_time() - 0.1 <= deadline, + "death timeout passed"); ok(raft_node_check_full_state(&node, RAFT_STATE_CANDIDATE /* State. */, 0 /* Leader. */, ==================== >> diff --git a/test/unit/raft_test_utils.c b/test/unit/raft_test_utils.c >> new file mode 100644 >> index 000000000..4acd74e8f >> --- /dev/null >> +++ b/test/unit/raft_test_utils.c >> + >> +static void >> +raft_method_node_broadcast(struct raft *raft, const struct raft_msg *msg); >> + >> +static void >> +raft_method_node_write(struct raft *raft, const struct raft_msg *msg); >> + >> +static void >> +raft_method_node_schedule_async(struct raft *raft); >> + > > > 5. Why such a long prefix? Why not simply raft_node_broadcast? I wanted to emphasize that these are 'methods' of 'raft'. Not of raft_node. >    Ok, raft_node_broadcast may sound too general and referring more >    to  a raft test node rather than the raft core itself. >    What about raft_node_broadcast_f then? I don't mind to change it if my version is confusing. ==================== @@ -60,18 +60,18 @@ raft_loop(void) } static void -raft_method_node_broadcast(struct raft *raft, const struct raft_msg *msg); +raft_node_broadcast_f(struct raft *raft, const struct raft_msg *msg); static void -raft_method_node_write(struct raft *raft, const struct raft_msg *msg); +raft_node_write_f(struct raft *raft, const struct raft_msg *msg); static void -raft_method_node_schedule_async(struct raft *raft); +raft_node_schedule_async_f(struct raft *raft); static struct raft_vtab raft_vtab = { - .broadcast = raft_method_node_broadcast, - .write = raft_method_node_write, - .schedule_async = raft_method_node_schedule_async, + .broadcast = raft_node_broadcast_f, + .write = raft_node_write_f, + .schedule_async = raft_node_schedule_async_f, }; static int @@ -485,21 +485,21 @@ raft_node_destroy(struct raft_node *node) } static void -raft_method_node_broadcast(struct raft *raft, const struct raft_msg *msg) +raft_node_broadcast_f(struct raft *raft, const struct raft_msg *msg) { struct raft_node *node = container_of(raft, struct raft_node, raft); raft_net_send(&node->net, msg); } static void -raft_method_node_write(struct raft *raft, const struct raft_msg *msg) +raft_node_write_f(struct raft *raft, const struct raft_msg *msg) { struct raft_node *node = container_of(raft, struct raft_node, raft); raft_journal_write(&node->journal, msg); } static void -raft_method_node_schedule_async(struct raft *raft) +raft_node_schedule_async_f(struct raft *raft) { struct raft_node *node = container_of(raft, struct raft_node, raft); node->has_work = true; ==================== >> +static struct raft_vtab raft_vtab = { >> +    .broadcast = raft_method_node_broadcast, >> +    .write = raft_method_node_write, >> +    .schedule_async = raft_method_node_schedule_async, >> +}; >> + >> +static int >> +raft_node_on_update(struct trigger *t, void *event) >> +{ >> +    struct raft_node *n = t->data; >> +    assert(&n->on_update == t); >> +    assert(&n->raft == event); > > > 6.             `(void) event` for building in release? Yes, need to add. ==================== @@ -80,6 +80,7 @@ raft_node_on_update(struct trigger *t, void *event) struct raft_node *n = t->data; assert(&n->on_update == t); assert(&n->raft == event); + (void)event; ++n->update_count; return 0; } ==================== Full new patch below: ==================== test: introduce raft unit tests Raft algorithm was tested only by functional Lua tests, as a part of the Tarantool executable. Functional testing of something like raft algorithm has drawbacks: - Not possible or too hard to cover some cases without error injections and/or good stability. Such as invalid messages, or small time durations, or a complex case which needs multiple steps to be reproduced. For instance, start WAL write, receive a message, finish the WAL write, and see if an expected thing happens. - Too long time to run when need to test not tiny timeouts. On the other hand, with tiny timeouts the test would become unstable. - Poor reproducibility due to random used in raft, due to system load, and number of other tests running in parallel. - Hard to debug, because for raft it is necessary to start one Tarantool process per raft instance. - Involves too much other systems, such as threads, real journal, relays, appliers, and so on. They can affect the process on the whole and reduce reproducibility and debug simplicity even more. Exactly the same problem existed for SWIM algorithm implemented as a module 'swim'. In order to test it, swim was moved to a separate library, refactored to be able to start many swims in the process (no global variables), its functions talking to other systems were virtualized (swim_ev, swim_transport), and substituted in the unit tests with fake analogue systems. In the unit tests these virtual functions were implemented differently, but the core swim algorithm was left intact and properly tested. The same is done for raft. This patch implements a set of helper functions and objects to unit test raft in raft_test_utils.c/.h files, and uses it to cover almost all the raft algorithm code. During implementation of the tests some bugs were found, which are not covered here, but fixed and covered in next commits. Part of #5303 diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index e6a79e911..e6517de0f 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -259,6 +259,9 @@ target_link_libraries(merger.test unit core box) add_executable(snap_quorum_delay.test snap_quorum_delay.cc) target_link_libraries(snap_quorum_delay.test box core unit) +add_executable(raft.test raft.c raft_test_utils.c core_test_utils.c) +target_link_libraries(raft.test vclock unit fakesys raft_algo) + # # Client for popen.test add_executable(popen-child popen-child.c) diff --git a/test/unit/raft.c b/test/unit/raft.c new file mode 100644 index 000000000..5b026a9c4 --- /dev/null +++ b/test/unit/raft.c @@ -0,0 +1,1243 @@ +/* + * Copyright 2010-2020, Tarantool AUTHORS, please see AUTHORS file. + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include "raft_test_utils.h" + +/** + * Test result is a real returned value of main_f. Fiber_join can not be used, + * because it expects if a returned value < 0 then diag is not empty. But in + * unit tests it can be violated - check_plan() does not set diag. + */ +static int test_result; + +static void +raft_test_leader_election(void) +{ + raft_start_test(24); + struct raft_node node; + raft_node_create(&node); + + is(node.net.count, 1, "1 pending message at start"); + ok(node.update_count > 0, "trigger worked"); + node.update_count = 0; + ok(raft_node_net_check_msg(&node, + 0 /* Index. */, + RAFT_STATE_FOLLOWER /* State. */, + 1 /* Term. */, + 0 /* Vote. */, + NULL /* Vclock. */ + ), "broadcast at start"); + raft_node_net_drop(&node); + + double death_timeout = node.cfg_death_timeout; + raft_run_next_event(); + ok(raft_time() >= death_timeout, "next event is leader death"); + + /* Elections are started with a new term, which is persisted. */ + ok(raft_node_check_full_state(&node, + RAFT_STATE_CANDIDATE /* State. */, + 0 /* Leader. */, + 2 /* Term. */, + 1 /* Vote. */, + 2 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "elections with a new term"); + is(node.raft.vote_count, 1, "single vote for self"); + ok(node.update_count > 0, "trigger worked"); + node.update_count = 0; + + /* Check if all async work is done properly. */ + + is(node.journal.size, 1, "1 record in the journal"); + ok(raft_node_journal_check_row(&node, + 0 /* Index. */, + 2 /* Term. */, + 1 /* Vote. */ + ), "term and vote are on disk"); + + is(node.net.count, 1, "1 pending message"); + ok(raft_node_net_check_msg(&node, + 0 /* Index. */, + RAFT_STATE_CANDIDATE /* State. */, + 2 /* Term. */, + 1 /* Vote. */, + "{0: 1}" /* Vclock. */ + ), "vote request is sent"); + raft_node_net_drop(&node); + + /* Simulate first response. Nothing should happen, quorum is 3. */ + + is(raft_node_send_vote_response(&node, + 2 /* Term. */, + 1 /* Vote. */, + 2 /* Source. */ + ), 0, "vote response from 2"); + is(node.raft.vote_count, 2, "2 votes - 1 self and 1 foreign"); + ok(!node.has_work, "no work to do - not enough votes yet"); + + raft_run_for(node.cfg_election_timeout / 2); + is(node.raft.state, RAFT_STATE_CANDIDATE, "still candidate, waiting " + "for elections"); + is(node.update_count, 0, "trigger is the same"); + + /* Simulate second response. Quorum is reached. */ + + is(raft_node_send_vote_response(&node, + 2 /* Term. */, + 1 /* Vote. */, + 3 /* Source. */ + ), 0, "vote response from 3"); + is(node.raft.vote_count, 3, "2 votes - 1 self and 2 foreign"); + is(node.raft.state, RAFT_STATE_LEADER, "became leader"); + ok(node.update_count > 0, "trigger worked"); + node.update_count = 0; + + /* New leader should do a broadcast when elected. */ + + ok(!node.has_work, "no work - broadcast should be done"); + is(node.journal.size, 1, "no new rows in the journal - state change " + "is not persisted"); + is(node.net.count, 1, "1 pending message"); + ok(raft_node_net_check_msg(&node, + 0 /* Index. */, + RAFT_STATE_LEADER /* State. */, + 2 /* Term. */, + 1 /* Vote. */, + NULL /* Vclock. */ + ), "sent new-leader notification"); + raft_node_net_drop(&node); + + raft_node_destroy(&node); + raft_finish_test(); +} + +static void +raft_test_recovery(void) +{ + raft_start_test(12); + struct raft_msg msg; + struct raft_node node; + raft_node_create(&node); + + raft_run_next_event(); + is(node.raft.state, RAFT_STATE_CANDIDATE, "became candidate"); + + /* Candidate's checkpoint. */ + + raft_checkpoint_remote(&node.raft, &msg); + ok(raft_msg_check(&msg, + RAFT_STATE_CANDIDATE /* State. */, + 2 /* Term. */, + 1 /* Vote. */, + "{0: 1}" /* Vclock. */ + ), "remote checkpoint of a candidate"); + + raft_checkpoint_local(&node.raft, &msg); + /* State and vclock are not persisted in a local checkpoint. */ + ok(raft_msg_check(&msg, + 0 /* State. */, + 2 /* Term. */, + 1 /* Vote. */, + NULL /* Vclock. */ + ), "local checkpoint of a candidate"); + + is(raft_node_send_vote_response(&node, + 2 /* Term. */, + 1 /* Vote. */, + 2 /* Source. */ + ), 0, "vote response from 2"); + is(raft_node_send_vote_response(&node, + 2 /* Term. */, + 1 /* Vote. */, + 3 /* Source. */ + ), 0, "vote response from 3"); + is(node.raft.state, RAFT_STATE_LEADER, "became leader"); + + ok(raft_node_check_full_state(&node, + RAFT_STATE_LEADER /* State. */, + 1 /* Leader. */, + 2 /* Term. */, + 1 /* Vote. */, + 2 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "election is finished"); + + /* Leader's checkpoint. */ + + raft_checkpoint_remote(&node.raft, &msg); + /* Leader does not send vclock. */ + ok(raft_msg_check(&msg, + RAFT_STATE_LEADER /* State. */, + 2 /* Term. */, + 1 /* Vote. */, + NULL /* Vclock. */ + ), "remote checkpoint of a leader"); + + raft_checkpoint_local(&node.raft, &msg); + /* State and vclock are not persisted in a local checkpoint. */ + ok(raft_msg_check(&msg, + 0 /* State. */, + 2 /* Term. */, + 1 /* Vote. */, + NULL /* Vclock. */ + ), "local checkpoint of a leader"); + + /* Restart leads to state loss. Look at follower's checkpoint. */ + + raft_node_restart(&node); + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 0 /* Leader. */, + 2 /* Term. */, + 1 /* Vote. */, + 2 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "restart always as a follower"); + + raft_checkpoint_remote(&node.raft, &msg); + ok(raft_msg_check(&msg, + RAFT_STATE_FOLLOWER /* State. */, + 2 /* Term. */, + 1 /* Vote. */, + NULL /* Vclock. */ + ), "remote checkpoint of a leader"); + + raft_checkpoint_local(&node.raft, &msg); + ok(raft_msg_check(&msg, + 0 /* State. */, + 2 /* Term. */, + 1 /* Vote. */, + NULL /* Vclock. */ + ), "local checkpoint of a leader"); + + raft_node_destroy(&node); + raft_finish_test(); +} + +static void +raft_test_bad_msg(void) +{ + raft_start_test(6); + struct raft_msg msg; + struct raft_node node; + struct vclock vclock; + raft_node_create(&node); + + msg = (struct raft_msg){ + .state = 0, + .term = 10, + }; + is(raft_node_process_msg(&node, &msg, 2), -1, "state can't be 0"); + is(node.raft.term, 1, "term from the bad message wasn't used"); + + raft_vclock_from_string(&vclock, "{2: 1}"); + msg = (struct raft_msg){ + .state = RAFT_STATE_CANDIDATE, + .term = 10, + .vote = 3, + .vclock = &vclock, + }; + is(raft_node_process_msg(&node, &msg, 2), -1, "node can't be a " + "candidate but vote for another node"); + is(node.raft.term, 1, "term from the bad message wasn't used"); + + msg = (struct raft_msg){ + .state = RAFT_STATE_CANDIDATE, + .term = 10, + .vote = 2, + }; + is(raft_node_process_msg(&node, &msg, 2), -1, "node can't be a " + "candidate without vclock"); + is(node.raft.term, 1, "term from the bad message wasn't used"); + + raft_node_destroy(&node); + raft_finish_test(); +} + +static void +raft_test_vote(void) +{ + raft_start_test(6); + struct raft_node node; + raft_node_create(&node); + + /* Vote for other node. */ + + is(raft_node_send_vote_request(&node, + 2 /* Term. */, + "{}" /* Vclock. */, + 2 /* Source. */ + ), 0, "vote request from 2"); + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 0 /* Leader. */, + 2 /* Term. */, + 2 /* Vote. */, + 2 /* Volatile term. */, + 2 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "voted for 2"); + + is(raft_node_send_vote_request(&node, + 2 /* Term. */, + "{}" /* Vclock. */, + 3 /* Source. */ + ), 0, "vote request from 3"); + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 0 /* Leader. */, + 2 /* Term. */, + 2 /* Vote. */, + 2 /* Volatile term. */, + 2 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "still kept vote for 2"); + + /* If the candidate didn't become a leader, start own election. */ + + double ts = raft_time(); + raft_run_next_event(); + ok(raft_time() - ts >= node.cfg_election_timeout, "election timeout " + "passed"); + ok(raft_node_check_full_state(&node, + RAFT_STATE_CANDIDATE /* State. */, + 0 /* Leader. */, + 3 /* Term. */, + 1 /* Vote. */, + 3 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 2}" /* Vclock. */ + ), "became candidate"); + + raft_node_destroy(&node); + raft_finish_test(); +} + +static void +raft_test_vote_skip(void) +{ + raft_start_test(39); + struct raft_node node; + raft_node_create(&node); + + /* Everything is skipped if the term is outdated. */ + + raft_run_next_event(); + is(node.raft.state, RAFT_STATE_CANDIDATE, "became candidate"); + is(node.raft.term, 2, "term is bumped"); + + is(raft_node_send_vote_response(&node, + 1 /* Term. */, + 1 /* Vote. */, + 2 /* Source. */ + ), 0, "message is accepted"); + is(node.raft.vote_count, 1, "but ignored - too old term"); + + /* Competing vote requests are skipped. */ + + is(raft_node_send_vote_response(&node, + 2 /* Term. */, + 3 /* Vote. */, + 2 /* Source. */ + ), 0, "message is accepted"); + is(node.raft.vote_count, 1, "but ignored - vote not for this node"); + is(node.raft.state, RAFT_STATE_CANDIDATE, "this node does not give up"); + + /* Vote requests are ignored when node is disabled. */ + + raft_node_cfg_is_enabled(&node, false); + + is(raft_node_send_follower(&node, + 3 /* Term. */, + 2 /* Source. */ + ), 0, "message is accepted"); + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 0 /* Leader. */, + 3 /* Term. */, + 0 /* Vote. */, + 3 /* Volatile term. */, + 0 /* Volatile vote. */, + "{0: 2}" /* Vclock. */ + ), "term bump to be able to vote again"); + is(raft_node_send_vote_request(&node, + 3 /* Term. */, + "{}" /* Vclock. */, + 2 /* Source. */ + ), 0, "message is accepted"); + is(node.raft.vote, 0, "but ignored - node is disabled"); + + /* Disabled node still takes term from the vote request. */ + + is(raft_node_send_vote_request(&node, + 4 /* Term. */, + "{}" /* Vclock. */, + 2 /* Source. */ + ), 0, "message is accepted"); + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 0 /* Leader. */, + 4 /* Term. */, + 0 /* Vote. */, + 4 /* Volatile term. */, + 0 /* Volatile vote. */, + "{0: 3}" /* Vclock. */ + ), "term is bumped, but vote request is ignored"); + + raft_node_cfg_is_enabled(&node, true); + + /* Not a candidate won't accept vote request for self. */ + + is(raft_node_send_vote_response(&node, + 4 /* Term. */, + 1 /* Vote. */, + 2 /* Source. */ + ), 0, "message is accepted"); + is(node.raft.vote, 0, "but ignored - vote works only on a candidate"); + + /* Ignore vote response for some third node. */ + + is(raft_node_send_vote_response(&node, + 4 /* Term. */, + 3 /* Vote. */, + 2 /* Source. */ + ), 0, "message is accepted"); + is(node.raft.vote, 0, "but ignored - sender != vote, so it is not a " + "request"); + + /* Ignore if leader is already known. */ + + is(raft_node_send_leader(&node, + 4 /* Term. */, + 2 /* Source. */ + ), 0, "message is accepted"); + is(node.raft.leader, 2, "leader is accepted"); + + is(raft_node_send_vote_request(&node, + 4 /* Term. */, + "{}" /* Vclock. */, + 3 /* Source. */ + ), 0, "message is accepted"); + is(node.raft.vote, 0, "but ignored - leader is already known"); + is(node.raft.leader, 2, "leader is not changed"); + + /* Ignore too small vclock. */ + + /* + * Need to turn off the candidate role to bump the term and not become + * a candidate. + */ + raft_node_cfg_is_candidate(&node, false); + + raft_node_journal_follow(&node, 1, 5); + raft_node_journal_follow(&node, 2, 5); + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 2 /* Leader. */, + 4 /* Term. */, + 0 /* Vote. */, + 4 /* Volatile term. */, + 0 /* Volatile vote. */, + "{0: 3, 1: 5, 2: 5}" /* Vclock. */ + ), "vclock is bumped"); + + is(raft_node_send_vote_request(&node, + 5 /* Term. */, + "{1: 4}" /* Vclock. */, + 3 /* Source. */ + ), 0, "message is accepted"); + is(node.raft.vote, 0, "but ignored - vclock is too small"); + is(node.raft.term, 5, "new term"); + is(node.raft.leader, 0, "leader is dropped in the new term"); + + /* Ignore incomparable vclock. */ + + is(raft_node_send_vote_request(&node, + 5 /* Term. */, + "{1: 4, 2: 6}" /* Vclock. */, + 3 /* Source. */ + ), 0, "message is accepted"); + is(node.raft.vote, 0, "but ignored - vclock is incomparable"); + + /* Ignore if voted in the current term. */ + + is(raft_node_send_vote_request(&node, + 6 /* Term. */, + "{1: 5, 2: 5}" /* Vclock. */, + 2 /* Source. */ + ), 0, "message is accepted"); + is(node.raft.vote, 2, "voted"); + + is(raft_node_send_vote_request(&node, + 6 /* Term. */, + "{1: 5, 2: 5}" /* Vclock. */, + 3 /* Source. */ + ), 0, "message is accepted"); + is(node.raft.vote, 2, "but ignored - already voted in the term"); + + /* After restart it still will ignore requests in the current term. */ + + raft_node_restart(&node); + is(raft_node_send_vote_request(&node, + 6 /* Term. */, + "{1: 5, 2: 5}" /* Vclock. */, + 3 /* Source. */ + ), 0, "message is accepted"); + is(node.raft.vote, 2, "but ignored - already voted in the term"); + + raft_node_cfg_is_candidate(&node, true); + + /* + * Vote response with a bigger term must be skipped, but it will bump + * the term. + */ + + /* Re-create the node so as not to write the vclock each time. */ + raft_node_destroy(&node); + raft_node_create(&node); + + raft_run_next_event(); + is(node.raft.state, RAFT_STATE_CANDIDATE, "became candidate"); + is(node.raft.term, 2, "term is bumped"); + + is(raft_node_send_vote_response(&node, + 3 /* Term. */, + 1 /* Vote. */, + 2 /* Source. */ + ), 0, "message is accepted"); + + ok(raft_node_check_full_state(&node, + RAFT_STATE_CANDIDATE /* State. */, + 0 /* Leader. */, + 3 /* Term. */, + 1 /* Vote. */, + 3 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 3}" /* Vclock. */ + ), "term is bumped and became candidate"); + + raft_node_destroy(&node); + raft_finish_test(); +} + +static void +raft_test_leader_resign(void) +{ + raft_start_test(15); + struct raft_node node; + + /* + * When a node resignes from leader role voluntarily, the other nodes + * will start next election. + */ + + raft_node_create(&node); + + is(raft_node_send_leader(&node, 1, 2), 0, "message is accepted"); + is(node.raft.leader, 2, "leader is elected"); + + is(raft_node_send_follower(&node, 1, 2), 0, "message is accepted"); + is(node.raft.leader, 0, "leader has resigned"); + + ok(raft_node_check_full_state(&node, + RAFT_STATE_CANDIDATE /* State. */, + 0 /* Leader. */, + 2 /* Term. */, + 1 /* Vote. */, + 2 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "became candidate"); + + raft_node_destroy(&node); + + /* Resign does not do anything if the node is not a candidate. */ + + raft_node_create(&node); + + is(raft_node_send_leader(&node, 1, 2), 0, "message is accepted"); + is(node.raft.leader, 2, "leader is elected"); + + raft_node_cfg_is_candidate(&node, false); + /* Multiple candidate reset won't break anything. */ + raft_node_cfg_is_candidate(&node, false); + + is(raft_node_send_follower(&node, 1, 2), 0, "message is accepted"); + is(node.raft.leader, 0, "leader has resigned"); + + raft_run_for(node.cfg_death_timeout * 2); + + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 0 /* Leader. */, + 1 /* Term. */, + 0 /* Vote. */, + 1 /* Volatile term. */, + 0 /* Volatile vote. */, + "{}" /* Vclock. */ + ), "still follower"); + + raft_node_destroy(&node); + + /* Resign by refusing to be a candidate. */ + + raft_node_create(&node); + + raft_run_next_event(); + is(raft_node_send_vote_response(&node, + 2 /* Term. */, + 1 /* Vote. */, + 2 /* Source. */ + ), 0, "vote from 2"); + is(raft_node_send_vote_response(&node, + 2 /* Term. */, + 1 /* Vote. */, + 3 /* Source. */ + ), 0, "vote from 3"); + ok(raft_node_check_full_state(&node, + RAFT_STATE_LEADER /* State. */, + 1 /* Leader. */, + 2 /* Term. */, + 1 /* Vote. */, + 2 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "became leader"); + + raft_node_net_drop(&node); + raft_node_cfg_is_candidate(&node, false); + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 0 /* Leader. */, + 2 /* Term. */, + 1 /* Vote. */, + 2 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "the leader has resigned"); + ok(raft_node_net_check_msg(&node, + 0 /* Index. */, + RAFT_STATE_FOLLOWER /* State. */, + 2 /* Term. */, + 1 /* Vote. */, + NULL /* Vclock. */ + ), "resign notification is sent"); + + raft_node_destroy(&node); + + raft_finish_test(); +} + +static void +raft_test_split_brain(void) +{ + raft_start_test(4); + struct raft_node node; + raft_node_create(&node); + + /* + * Split brain is ignored, as there is nothing to do with it + * automatically. + */ + + is(raft_node_send_leader(&node, + 2 /* Term. */, + 2 /* Source. */ + ), 0, "first leader notification"); + is(node.raft.leader, 2, "leader is found"); + + is(raft_node_send_leader(&node, + 2 /* Term. */, + 3 /* Source. */ + ), 0, "second leader notification"); + is(node.raft.leader, 2, "split brain, the old leader is kept"); + + raft_node_destroy(&node); + raft_finish_test(); +} + +static void +raft_test_heartbeat(void) +{ + raft_start_test(12); + struct raft_node node; + raft_node_create(&node); + + /* Let the node know there is a leader somewhere. */ + + is(raft_node_send_leader(&node, + 2 /* Term. */, + 2 /* Source. */ + ), 0, "leader notification"); + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 2 /* Leader. */, + 2 /* Term. */, + 0 /* Vote. */, + 2 /* Volatile term. */, + 0 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "follow the leader after notification"); + + /* Leader can send the same message many times. */ + + is(raft_node_send_leader(&node, + 2 /* Term. */, + 2 /* Source. */ + ), 0, "leader notification"); + + /* The node won't do anything if it is not a candidate. */ + + raft_node_cfg_is_candidate(&node, false); + raft_run_for(node.cfg_death_timeout * 2); + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 2 /* Leader. */, + 2 /* Term. */, + 0 /* Vote. */, + 2 /* Volatile term. */, + 0 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "follow the leader because no candidate"); + raft_node_cfg_is_candidate(&node, true); + + /* Heartbeats from the leader are accepted. */ + + for (int i = 0; i < 5; ++i) { + raft_run_for(node.cfg_death_timeout / 2); + raft_node_send_heartbeat(&node, 2); + } + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 2 /* Leader. */, + 2 /* Term. */, + 0 /* Vote. */, + 2 /* Volatile term. */, + 0 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "follow the leader because had heartbeats"); + + /* Heartbeats not from the leader won't do anything. */ + + double start = raft_time(); + raft_run_for(node.cfg_death_timeout / 3); + raft_node_send_heartbeat(&node, 3); + raft_run_for(node.cfg_death_timeout / 3); + raft_node_send_heartbeat(&node, 0); + raft_run_next_event(); + double deadline = start + node.cfg_death_timeout; + /* + * Compare == with 0.1 precision. Because '/ 3' operations above will + * make the doubles contain some small garbage. + */ + ok(raft_time() + 0.1 >= deadline && raft_time() - 0.1 <= deadline, + "death timeout passed"); + ok(raft_node_check_full_state(&node, + RAFT_STATE_CANDIDATE /* State. */, + 0 /* Leader. */, + 3 /* Term. */, + 1 /* Vote. */, + 3 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 2}" /* Vclock. */ + ), "enter candidate state when no heartbeats from the leader"); + + /* Non-candidate ignores heartbeats. */ + + raft_node_cfg_is_candidate(&node, false); + raft_node_send_heartbeat(&node, 2); + raft_node_cfg_is_candidate(&node, true); + + /* Leader ignores all heartbeats - nothing to wait for. */ + + raft_node_new_term(&node); + is(raft_node_send_vote_response(&node, + 4 /* Term. */, + 1 /* Vote. */, + 2 /* Source. */ + ), 0, "vote from 2"); + is(raft_node_send_vote_response(&node, + 4 /* Term. */, + 1 /* Vote. */, + 3 /* Source. */ + ), 0, "vote from 3"); + is(node.raft.state, RAFT_STATE_LEADER, "became leader"); + /* From self. */ + raft_node_send_heartbeat(&node, 1); + /* From somebody else. */ + raft_node_send_heartbeat(&node, 2); + + /* Heartbeats are ignored during WAL write. */ + + raft_node_block(&node); + is(raft_node_send_leader(&node, + 5 /* Term. */, + 2 /* Source. */ + ), 0, "message from leader"); + raft_node_send_heartbeat(&node, 2); + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 2 /* Leader. */, + 4 /* Term. */, + 1 /* Vote. */, + 5 /* Volatile term. */, + 0 /* Volatile vote. */, + "{0: 4}" /* Vclock. */ + ), "nothing changed - waiting for WAL write"); + raft_node_unblock(&node); + + raft_node_destroy(&node); + raft_finish_test(); +} + +static void +raft_test_election_timeout(void) +{ + raft_start_test(11); + struct raft_node node; + raft_node_create(&node); + + /* Configuration works when done before election. */ + + double election_timeout = node.cfg_election_timeout; + double death_timeout = node.cfg_death_timeout; + double ts = raft_time(); + raft_run_next_event(); + ok(raft_time() == ts + death_timeout, "election is started"); + ok(raft_node_check_full_state(&node, + RAFT_STATE_CANDIDATE /* State. */, + 0 /* Leader. */, + 2 /* Term. */, + 1 /* Vote. */, + 2 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "enter candidate state"); + + ts = raft_time(); + raft_run_next_event(); + ok(raft_time() >= ts + election_timeout, "new election is started"); + ok(raft_time() <= ts + election_timeout * 1.1, "but not too late"); + ok(raft_node_check_full_state(&node, + RAFT_STATE_CANDIDATE /* State. */, + 0 /* Leader. */, + 3 /* Term. */, + 1 /* Vote. */, + 3 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 2}" /* Vclock. */ + ), "re-enter candidate state"); + + /* Reconfiguration works when done during election. */ + + ts = raft_time(); + raft_run_for(election_timeout / 2); + raft_node_cfg_election_timeout(&node, election_timeout * 2); + raft_run_for(election_timeout); + election_timeout = node.cfg_election_timeout; + + ok(raft_node_check_full_state(&node, + RAFT_STATE_CANDIDATE /* State. */, + 0 /* Leader. */, + 3 /* Term. */, + 1 /* Vote. */, + 3 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 2}" /* Vclock. */ + ), "still in the same term - new election timeout didn't expire"); + + raft_run_next_event(); + /* + * 0.1 precision is used because random double numbers sometimes loose + * tiny values. + */ + ok(raft_time() + 0.1 >= ts + election_timeout, "new election timeout is " + "respected"); + ok(raft_time() - 0.1 <= ts + election_timeout * 1.1, "but not too " + "late"); + ok(raft_node_check_full_state(&node, + RAFT_STATE_CANDIDATE /* State. */, + 0 /* Leader. */, + 4 /* Term. */, + 1 /* Vote. */, + 4 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 3}" /* Vclock. */ + ), "re-enter candidate state"); + + /* + * Timeout smaller than a millisecond. Election random shift has + * millisecond precision. When timeout is smaller, maximal shift is + * rounded up to 1 ms. + */ + election_timeout = 0.000001; + raft_node_cfg_election_timeout(&node, election_timeout); + uint64_t term = node.raft.term; + do { + ts = raft_time(); + raft_run_next_event(); + ++term; + /* If random part is 0, the loop would become infinite. */ + } while (raft_time() - ts == election_timeout); + is(node.raft.term, term, "term is bumped, timeout was truly random"); + is(node.raft.state, RAFT_STATE_CANDIDATE, "still candidate"); + + raft_node_destroy(&node); + raft_finish_test(); +} + +static void +raft_test_election_quorum(void) +{ + raft_start_test(7); + struct raft_node node; + raft_node_create(&node); + + /* + * Quorum decrease during election leads to immediate win if vote count + * is already sufficient. + */ + + raft_node_cfg_election_quorum(&node, 5); + raft_run_next_event(); + ok(raft_node_check_full_state(&node, + RAFT_STATE_CANDIDATE /* State. */, + 0 /* Leader. */, + 2 /* Term. */, + 1 /* Vote. */, + 2 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "enter candidate state"); + + raft_node_cfg_election_quorum(&node, 3); + is(node.raft.state, RAFT_STATE_CANDIDATE, "still candidate"); + + is(raft_node_send_vote_response(&node, + 2 /* Term. */, + 1 /* Vote. */, + 2 /* Source. */ + ), 0, "send vote response from second node"); + is(node.raft.vote_count, 2, "vote is accepted"); + is(node.raft.state, RAFT_STATE_CANDIDATE, "but still candidate"); + + raft_node_cfg_election_quorum(&node, 2); + ok(raft_node_check_full_state(&node, + RAFT_STATE_LEADER /* State. */, + 1 /* Leader. */, + 2 /* Term. */, + 1 /* Vote. */, + 2 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "enter leader state after another quorum lowering"); + + /* Quorum 1 allows to become leader right after WAL write. */ + + raft_node_cfg_election_quorum(&node, 1); + raft_node_new_term(&node); + ok(raft_node_check_full_state(&node, + RAFT_STATE_LEADER /* State. */, + 1 /* Leader. */, + 3 /* Term. */, + 1 /* Vote. */, + 3 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 3}" /* Vclock. */ + ), "became leader again immediately with 1 self vote"); + + raft_node_destroy(&node); + raft_finish_test(); +} + +static void +raft_test_death_timeout(void) +{ + raft_start_test(4); + struct raft_node node; + raft_node_create(&node); + + /* Change death timeout during leader death wait. */ + + is(raft_node_send_leader(&node, + 2 /* Term. */, + 2 /* Source. */ + ), 0, "leader notification"); + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 2 /* Leader. */, + 2 /* Term. */, + 0 /* Vote. */, + 2 /* Volatile term. */, + 0 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "follow the leader"); + + double timeout = node.cfg_death_timeout; + raft_run_for(timeout / 2); + raft_node_cfg_death_timeout(&node, timeout * 2); + raft_run_for(timeout); + timeout = node.cfg_death_timeout; + + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 2 /* Leader. */, + 2 /* Term. */, + 0 /* Vote. */, + 2 /* Volatile term. */, + 0 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "the leader still is considered alive"); + + raft_run_for(timeout / 2); + ok(raft_node_check_full_state(&node, + RAFT_STATE_CANDIDATE /* State. */, + 0 /* Leader. */, + 3 /* Term. */, + 1 /* Vote. */, + 3 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 2}" /* Vclock. */ + ), "enter candidate state when the new death timeout expires"); + + raft_node_destroy(&node); + raft_finish_test(); +} + +static void +raft_test_enable_disable(void) +{ + raft_start_test(10); + struct raft_node node; + raft_node_create(&node); + + /* Disabled node can track a leader. */ + + raft_node_cfg_is_enabled(&node, false); + is(raft_node_send_leader(&node, + 2 /* Term. */, + 2 /* Source. */ + ), 0, "accepted a leader notification"); + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 2 /* Leader. */, + 2 /* Term. */, + 0 /* Vote. */, + 2 /* Volatile term. */, + 0 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "leader is seen"); + + /* When re-enabled, the leader death timer is started. */ + + raft_node_cfg_is_enabled(&node, true); + double ts = raft_time(); + raft_run_next_event(); + ok(raft_time() - ts == node.cfg_death_timeout, "death timeout passed"); + ok(raft_node_check_full_state(&node, + RAFT_STATE_CANDIDATE /* State. */, + 0 /* Leader. */, + 3 /* Term. */, + 1 /* Vote. */, + 3 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 2}" /* Vclock. */ + ), "became candidate"); + + /* Multiple enabling does not break anything. */ + + raft_node_cfg_is_enabled(&node, true); + raft_node_cfg_is_enabled(&node, true); + ok(raft_node_check_full_state(&node, + RAFT_STATE_CANDIDATE /* State. */, + 0 /* Leader. */, + 3 /* Term. */, + 1 /* Vote. */, + 3 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 2}" /* Vclock. */ + ), "nothing changed"); + + /* Leader disable makes it forget he was a leader. */ + + is(raft_node_send_vote_response(&node, + 3 /* Term. */, + 1 /* Vote. */, + 2 /* Source. */ + ), 0, "vote from 2"); + is(raft_node_send_vote_response(&node, + 3 /* Term. */, + 1 /* Vote. */, + 3 /* Source. */ + ), 0, "vote from 3"); + is(node.raft.state, RAFT_STATE_LEADER, "became leader"); + + raft_node_cfg_is_enabled(&node, false); + + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 0 /* Leader. */, + 3 /* Term. */, + 1 /* Vote. */, + 3 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 2}" /* Vclock. */ + ), "resigned from leader state"); + + /* Multiple disabling does not break anything. */ + + raft_node_cfg_is_enabled(&node, false); + raft_node_cfg_is_enabled(&node, false); + + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 0 /* Leader. */, + 3 /* Term. */, + 1 /* Vote. */, + 3 /* Volatile term. */, + 1 /* Volatile vote. */, + "{0: 2}" /* Vclock. */ + ), "nothing changed"); + + raft_node_destroy(&node); + raft_finish_test(); +} + +static void +raft_test_too_long_wal_write(void) +{ + raft_start_test(8); + struct raft_node node; + raft_node_create(&node); + + /* During WAL write the node does not wait for leader death. */ + + raft_node_block(&node); + is(raft_node_send_vote_request(&node, + 2 /* Term. */, + "{2: 1}" /* Vclock. */, + 2 /* Source. */ + ), 0, "vote for 2"); + + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 0 /* Leader. */, + 1 /* Term. */, + 0 /* Vote. */, + 2 /* Volatile term. */, + 2 /* Volatile vote. */, + "{}" /* Vclock. */ + ), "vote is volatile"); + + is(raft_node_send_leader(&node, + 2 /* Term. */, + 2 /* Source. */ + ), 0, "message from leader"); + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 2 /* Leader. */, + 1 /* Term. */, + 0 /* Vote. */, + 2 /* Volatile term. */, + 2 /* Volatile vote. */, + "{}" /* Vclock. */ + ), "leader is known"); + + raft_run_for(node.cfg_death_timeout * 2); + + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 2 /* Leader. */, + 1 /* Term. */, + 0 /* Vote. */, + 2 /* Volatile term. */, + 2 /* Volatile vote. */, + "{}" /* Vclock. */ + ), "nothing changed"); + + raft_node_unblock(&node); + + ok(raft_node_check_full_state(&node, + RAFT_STATE_FOLLOWER /* State. */, + 2 /* Leader. */, + 2 /* Term. */, + 2 /* Vote. */, + 2 /* Volatile term. */, + 2 /* Volatile vote. */, + "{0: 1}" /* Vclock. */ + ), "wal write is finished"); + + double ts = raft_time(); + raft_run_next_event(); + ok(raft_time() - ts == node.cfg_death_timeout, "timer works again"); + is(node.raft.state, RAFT_STATE_CANDIDATE, "became candidate"); + + raft_node_destroy(&node); + raft_finish_test(); +} + +static int +main_f(va_list ap) +{ + raft_start_test(13); + + (void) ap; + fakeev_init(); + + raft_test_leader_election(); + raft_test_recovery(); + raft_test_bad_msg(); + raft_test_vote(); + raft_test_vote_skip(); + raft_test_leader_resign(); + raft_test_split_brain(); + raft_test_heartbeat(); + raft_test_election_timeout(); + raft_test_election_quorum(); + raft_test_death_timeout(); + raft_test_enable_disable(); + raft_test_too_long_wal_write(); + + fakeev_free(); + + test_result = check_plan(); + footer(); + return 0; +} + +int +main() +{ + raft_run_test("raft.txt", main_f); + return test_result; +} diff --git a/test/unit/raft.result b/test/unit/raft.result new file mode 100644 index 000000000..093047843 --- /dev/null +++ b/test/unit/raft.result @@ -0,0 +1,213 @@ + *** main_f *** +1..13 + *** raft_test_leader_election *** + 1..24 + ok 1 - 1 pending message at start + ok 2 - trigger worked + ok 3 - broadcast at start + ok 4 - next event is leader death + ok 5 - elections with a new term + ok 6 - single vote for self + ok 7 - trigger worked + ok 8 - 1 record in the journal + ok 9 - term and vote are on disk + ok 10 - 1 pending message + ok 11 - vote request is sent + ok 12 - vote response from 2 + ok 13 - 2 votes - 1 self and 1 foreign + ok 14 - no work to do - not enough votes yet + ok 15 - still candidate, waiting for elections + ok 16 - trigger is the same + ok 17 - vote response from 3 + ok 18 - 2 votes - 1 self and 2 foreign + ok 19 - became leader + ok 20 - trigger worked + ok 21 - no work - broadcast should be done + ok 22 - no new rows in the journal - state change is not persisted + ok 23 - 1 pending message + ok 24 - sent new-leader notification +ok 1 - subtests + *** raft_test_leader_election: done *** + *** raft_test_recovery *** + 1..12 + ok 1 - became candidate + ok 2 - remote checkpoint of a candidate + ok 3 - local checkpoint of a candidate + ok 4 - vote response from 2 + ok 5 - vote response from 3 + ok 6 - became leader + ok 7 - election is finished + ok 8 - remote checkpoint of a leader + ok 9 - local checkpoint of a leader + ok 10 - restart always as a follower + ok 11 - remote checkpoint of a leader + ok 12 - local checkpoint of a leader +ok 2 - subtests + *** raft_test_recovery: done *** + *** raft_test_bad_msg *** + 1..6 + ok 1 - state can't be 0 + ok 2 - term from the bad message wasn't used + ok 3 - node can't be a candidate but vote for another node + ok 4 - term from the bad message wasn't used + ok 5 - node can't be a candidate without vclock + ok 6 - term from the bad message wasn't used +ok 3 - subtests + *** raft_test_bad_msg: done *** + *** raft_test_vote *** + 1..6 + ok 1 - vote request from 2 + ok 2 - voted for 2 + ok 3 - vote request from 3 + ok 4 - still kept vote for 2 + ok 5 - election timeout passed + ok 6 - became candidate +ok 4 - subtests + *** raft_test_vote: done *** + *** raft_test_vote_skip *** + 1..39 + ok 1 - became candidate + ok 2 - term is bumped + ok 3 - message is accepted + ok 4 - but ignored - too old term + ok 5 - message is accepted + ok 6 - but ignored - vote not for this node + ok 7 - this node does not give up + ok 8 - message is accepted + ok 9 - term bump to be able to vote again + ok 10 - message is accepted + ok 11 - but ignored - node is disabled + ok 12 - message is accepted + ok 13 - term is bumped, but vote request is ignored + ok 14 - message is accepted + ok 15 - but ignored - vote works only on a candidate + ok 16 - message is accepted + ok 17 - but ignored - sender != vote, so it is not a request + ok 18 - message is accepted + ok 19 - leader is accepted + ok 20 - message is accepted + ok 21 - but ignored - leader is already known + ok 22 - leader is not changed + ok 23 - vclock is bumped + ok 24 - message is accepted + ok 25 - but ignored - vclock is too small + ok 26 - new term + ok 27 - leader is dropped in the new term + ok 28 - message is accepted + ok 29 - but ignored - vclock is incomparable + ok 30 - message is accepted + ok 31 - voted + ok 32 - message is accepted + ok 33 - but ignored - already voted in the term + ok 34 - message is accepted + ok 35 - but ignored - already voted in the term + ok 36 - became candidate + ok 37 - term is bumped + ok 38 - message is accepted + ok 39 - term is bumped and became candidate +ok 5 - subtests + *** raft_test_vote_skip: done *** + *** raft_test_leader_resign *** + 1..15 + ok 1 - message is accepted + ok 2 - leader is elected + ok 3 - message is accepted + ok 4 - leader has resigned + ok 5 - became candidate + ok 6 - message is accepted + ok 7 - leader is elected + ok 8 - message is accepted + ok 9 - leader has resigned + ok 10 - still follower + ok 11 - vote from 2 + ok 12 - vote from 3 + ok 13 - became leader + ok 14 - the leader has resigned + ok 15 - resign notification is sent +ok 6 - subtests + *** raft_test_leader_resign: done *** + *** raft_test_split_brain *** + 1..4 + ok 1 - first leader notification + ok 2 - leader is found + ok 3 - second leader notification + ok 4 - split brain, the old leader is kept +ok 7 - subtests + *** raft_test_split_brain: done *** + *** raft_test_heartbeat *** + 1..12 + ok 1 - leader notification + ok 2 - follow the leader after notification + ok 3 - leader notification + ok 4 - follow the leader because no candidate + ok 5 - follow the leader because had heartbeats + ok 6 - death timeout passed + ok 7 - enter candidate state when no heartbeats from the leader + ok 8 - vote from 2 + ok 9 - vote from 3 + ok 10 - became leader + ok 11 - message from leader + ok 12 - nothing changed - waiting for WAL write +ok 8 - subtests + *** raft_test_heartbeat: done *** + *** raft_test_election_timeout *** + 1..11 + ok 1 - election is started + ok 2 - enter candidate state + ok 3 - new election is started + ok 4 - but not too late + ok 5 - re-enter candidate state + ok 6 - still in the same term - new election timeout didn't expire + ok 7 - new election timeout is respected + ok 8 - but not too late + ok 9 - re-enter candidate state + ok 10 - term is bumped, timeout was truly random + ok 11 - still candidate +ok 9 - subtests + *** raft_test_election_timeout: done *** + *** raft_test_election_quorum *** + 1..7 + ok 1 - enter candidate state + ok 2 - still candidate + ok 3 - send vote response from second node + ok 4 - vote is accepted + ok 5 - but still candidate + ok 6 - enter leader state after another quorum lowering + ok 7 - became leader again immediately with 1 self vote +ok 10 - subtests + *** raft_test_election_quorum: done *** + *** raft_test_death_timeout *** + 1..4 + ok 1 - leader notification + ok 2 - follow the leader + ok 3 - the leader still is considered alive + ok 4 - enter candidate state when the new death timeout expires +ok 11 - subtests + *** raft_test_death_timeout: done *** + *** raft_test_enable_disable *** + 1..10 + ok 1 - accepted a leader notification + ok 2 - leader is seen + ok 3 - death timeout passed + ok 4 - became candidate + ok 5 - nothing changed + ok 6 - vote from 2 + ok 7 - vote from 3 + ok 8 - became leader + ok 9 - resigned from leader state + ok 10 - nothing changed +ok 12 - subtests + *** raft_test_enable_disable: done *** + *** raft_test_too_long_wal_write *** + 1..8 + ok 1 - vote for 2 + ok 2 - vote is volatile + ok 3 - message from leader + ok 4 - leader is known + ok 5 - nothing changed + ok 6 - wal write is finished + ok 7 - timer works again + ok 8 - became candidate +ok 13 - subtests + *** raft_test_too_long_wal_write: done *** + *** main_f: done *** diff --git a/test/unit/raft_test_utils.c b/test/unit/raft_test_utils.c new file mode 100644 index 000000000..b8735f373 --- /dev/null +++ b/test/unit/raft_test_utils.c @@ -0,0 +1,560 @@ +/* + * Copyright 2010-2020, Tarantool AUTHORS, please see AUTHORS file. + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include "memory.h" +#include "raft/raft_ev.h" +#include "raft_test_utils.h" +#include "random.h" + +#include + +void +raft_ev_timer_start(struct ev_loop *loop, struct ev_timer *watcher) +{ + fakeev_timer_start(loop, watcher); +} + +double +raft_ev_timer_remaining(struct ev_loop *loop, struct ev_timer *watcher) +{ + return fakeev_timer_remaining(loop, watcher); +} + +void +raft_ev_timer_stop(struct ev_loop *loop, struct ev_timer *watcher) +{ + fakeev_timer_stop(loop, watcher); +} + +struct ev_loop * +raft_loop(void) +{ + return fakeev_loop(); +} + +static void +raft_node_broadcast_f(struct raft *raft, const struct raft_msg *msg); + +static void +raft_node_write_f(struct raft *raft, const struct raft_msg *msg); + +static void +raft_node_schedule_async_f(struct raft *raft); + +static struct raft_vtab raft_vtab = { + .broadcast = raft_node_broadcast_f, + .write = raft_node_write_f, + .schedule_async = raft_node_schedule_async_f, +}; + +static int +raft_node_on_update(struct trigger *t, void *event) +{ + struct raft_node *n = t->data; + assert(&n->on_update == t); + assert(&n->raft == event); + (void)event; + ++n->update_count; + return 0; +} + +static void +raft_node_on_destroy(struct trigger *t) +{ + struct raft_node *n = t->data; + assert(&n->on_update == t); + n->update_count = 0; +} + +static inline bool +raft_node_is_started(const struct raft_node *node) +{ + return node->worker != NULL; +} + +static int +raft_node_worker_f(va_list va); + +static void +raft_journal_create(struct raft_journal *journal, uint32_t instance_id) +{ + memset(journal, 0, sizeof(*journal)); + vclock_create(&journal->vclock); + journal->instance_id = instance_id; +} + +static void +raft_journal_write(struct raft_journal *journal, const struct raft_msg *msg) +{ + assert(msg->vclock == NULL); + int index = journal->size; + int new_size = index + 1; + journal->rows = realloc(journal->rows, + sizeof(journal->rows[0]) * new_size); + assert(journal->rows != NULL); + journal->rows[index] = *msg; + journal->size = new_size; + vclock_inc(&journal->vclock, 0); +} + +static void +raft_journal_follow(struct raft_journal *journal, uint32_t replica_id, + int64_t count) +{ + int64_t lsn = vclock_get(&journal->vclock, replica_id); + lsn += count; + vclock_follow(&journal->vclock, replica_id, lsn); +} + +static void +raft_journal_destroy(struct raft_journal *journal) +{ + free(journal->rows); +} + +static void +raft_net_create(struct raft_net *net) +{ + memset(net, 0, sizeof(*net)); +} + +static void +raft_net_send(struct raft_net *net, const struct raft_msg *msg) +{ + int index = net->count; + int new_count = index + 1; + net->msgs = realloc(net->msgs, sizeof(net->msgs[0]) * new_count); + assert(net->msgs != NULL); + net->msgs[index] = *msg; + struct raft_msg *new_msg = &net->msgs[index]; + if (new_msg->vclock != NULL) { + /* + * Network messages can contain vclock, which references the + * original raft vclock. Must copy it, otherwise all net + * messages will point at the same vclock. + */ + struct vclock *v = malloc(sizeof(*v)); + assert(v != NULL); + vclock_copy(v, new_msg->vclock); + new_msg->vclock = v; + } + net->count = new_count; +} + +static void +raft_net_drop(struct raft_net *net) +{ + for (int i = 0; i < net->count; ++i) + free((struct vclock *)net->msgs[i].vclock); + free(net->msgs); + net->msgs = NULL; + net->count = 0; +} + +static void +raft_net_destroy(struct raft_net *net) +{ + raft_net_drop(net); +} + +void +raft_node_create(struct raft_node *node) +{ + memset(node, 0, sizeof(*node)); + node->cfg_is_enabled = true; + node->cfg_is_candidate = true; + node->cfg_election_timeout = 5; + node->cfg_election_quorum = 3; + node->cfg_death_timeout = 5; + node->cfg_instance_id = 1; + node->cfg_vclock = &node->journal.vclock; + raft_journal_create(&node->journal, node->cfg_instance_id); + raft_node_start(node); +} + +void +raft_node_net_drop(struct raft_node *node) +{ + assert(raft_node_is_started(node)); + raft_net_drop(&node->net); +} + +bool +raft_node_net_check_msg(const struct raft_node *node, int i, + enum raft_state state, uint64_t term, uint32_t vote, + const char *vclock) +{ + assert(raft_node_is_started(node)); + assert(node->net.count > i); + return raft_msg_check(&node->net.msgs[i], state, term, vote, vclock); +} + +bool +raft_node_check_full_state(const struct raft_node *node, enum raft_state state, + uint32_t leader, uint64_t term, uint32_t vote, + uint64_t volatile_term, uint32_t volatile_vote, + const char *vclock) +{ + assert(raft_node_is_started(node)); + const struct raft *raft = &node->raft; + struct vclock v; + raft_vclock_from_string(&v, vclock); + return raft->state == state && raft->leader == leader && + raft->term == term && raft->vote == vote && + raft->volatile_term == volatile_term && + raft->volatile_vote == volatile_vote && + vclock_compare(&v, raft->vclock) == 0; +} + +bool +raft_node_journal_check_row(const struct raft_node *node, int i, uint64_t term, + uint32_t vote) +{ + assert(raft_node_is_started(node)); + assert(node->journal.size > i); + return raft_msg_check(&node->journal.rows[i], 0, term, vote, NULL); +} + +void +raft_node_journal_follow(struct raft_node *node, uint32_t replica_id, + int64_t count) +{ + raft_journal_follow(&node->journal, replica_id, count); +} + +void +raft_node_new_term(struct raft_node *node) +{ + raft_new_term(&node->raft); + raft_run_async_work(); +} + +int +raft_node_process_msg(struct raft_node *node, const struct raft_msg *msg, + uint32_t source) +{ + int rc = raft_process_msg(&node->raft, msg, source); + raft_run_async_work(); + return rc; +} + +int +raft_node_send_vote_response(struct raft_node *node, uint64_t term, + uint32_t vote, uint32_t source) +{ + struct raft_msg msg = { + .state = RAFT_STATE_FOLLOWER, + .term = term, + .vote = vote, + }; + return raft_node_process_msg(node, &msg, source); +} + +int +raft_node_send_vote_request(struct raft_node *node, uint64_t term, + const char *vclock, uint32_t source) +{ + struct vclock v; + raft_vclock_from_string(&v, vclock); + struct raft_msg msg = { + .state = RAFT_STATE_CANDIDATE, + .term = term, + .vote = source, + .vclock = &v, + }; + return raft_node_process_msg(node, &msg, source); +} + +int +raft_node_send_leader(struct raft_node *node, uint64_t term, uint32_t source) +{ + struct raft_msg msg = { + .state = RAFT_STATE_LEADER, + .term = term, + }; + return raft_node_process_msg(node, &msg, source); +} + +int +raft_node_send_follower(struct raft_node *node, uint64_t term, uint32_t source) +{ + struct raft_msg msg = { + .state = RAFT_STATE_FOLLOWER, + .term = term, + }; + return raft_node_process_msg(node, &msg, source); +} + +void +raft_node_send_heartbeat(struct raft_node *node, uint32_t source) +{ + assert(raft_node_is_started(node)); + raft_process_heartbeat(&node->raft, source); +} + +void +raft_node_restart(struct raft_node *node) +{ + assert(raft_node_is_started(node)); + raft_node_stop(node); + raft_node_start(node); +} + +void +raft_node_stop(struct raft_node *node) +{ + assert(raft_node_is_started(node)); + fiber_cancel(node->worker); + fiber_join(node->worker); + raft_destroy(&node->raft); + assert(node->update_count == 0); + raft_net_destroy(&node->net); + node->worker = NULL; + node->has_work = false; +} + +void +raft_node_start(struct raft_node *node) +{ + assert(!raft_node_is_started(node)); + + raft_net_create(&node->net); + + node->worker = fiber_new("raft_node_worker", raft_node_worker_f); + node->worker->f_arg = node; + fiber_set_joinable(node->worker, true); + fiber_wakeup(node->worker); + trigger_create(&node->on_update, raft_node_on_update, node, + raft_node_on_destroy); + raft_create(&node->raft, &raft_vtab); + raft_on_update(&node->raft, &node->on_update); + + for (int i = 0; i < node->journal.size; ++i) + raft_process_recovery(&node->raft, &node->journal.rows[i]); + + raft_cfg_is_enabled(&node->raft, node->cfg_is_enabled); + raft_cfg_is_candidate(&node->raft, node->cfg_is_candidate); + raft_cfg_election_timeout(&node->raft, node->cfg_election_timeout); + raft_cfg_election_quorum(&node->raft, node->cfg_election_quorum); + raft_cfg_death_timeout(&node->raft, node->cfg_death_timeout); + raft_cfg_instance_id(&node->raft, node->cfg_instance_id); + raft_cfg_vclock(&node->raft, node->cfg_vclock); + raft_run_async_work(); +} + +void +raft_node_block(struct raft_node *node) +{ + assert(!node->is_work_blocked); + node->is_work_blocked = true; +} + +void +raft_node_unblock(struct raft_node *node) +{ + assert(node->is_work_blocked); + node->is_work_blocked = false; + if (raft_node_is_started(node)) { + fiber_wakeup(node->worker); + raft_run_async_work(); + } +} + +void +raft_node_cfg_is_enabled(struct raft_node *node, bool value) +{ + node->cfg_is_enabled = value; + if (raft_node_is_started(node)) { + raft_cfg_is_enabled(&node->raft, value); + raft_run_async_work(); + } +} + +void +raft_node_cfg_is_candidate(struct raft_node *node, bool value) +{ + node->cfg_is_candidate = value; + if (raft_node_is_started(node)) { + raft_cfg_is_candidate(&node->raft, value); + raft_run_async_work(); + } +} + +void +raft_node_cfg_election_timeout(struct raft_node *node, double value) +{ + node->cfg_election_timeout = value; + if (raft_node_is_started(node)) { + raft_cfg_election_timeout(&node->raft, value); + raft_run_async_work(); + } +} + +void +raft_node_cfg_election_quorum(struct raft_node *node, int value) +{ + node->cfg_election_quorum = value; + if (raft_node_is_started(node)) { + raft_cfg_election_quorum(&node->raft, value); + raft_run_async_work(); + } +} + +void +raft_node_cfg_death_timeout(struct raft_node *node, double value) +{ + node->cfg_death_timeout = value; + if (raft_node_is_started(node)) { + raft_cfg_death_timeout(&node->raft, value); + raft_run_async_work(); + } +} + +bool +raft_msg_check(const struct raft_msg *msg, enum raft_state state, uint64_t term, + uint32_t vote, const char *vclock) +{ + if (vclock != NULL) { + if (msg->vclock == NULL) + return false; + struct vclock v; + raft_vclock_from_string(&v, vclock); + if (vclock_compare(&v, msg->vclock) != 0) + return false; + } else if (msg->vclock != NULL) { + return false; + } + return msg->state == state && msg->term == term && msg->vote == vote; +} + +void +raft_run_next_event(void) +{ + fakeev_loop_update(fakeev_loop()); + raft_run_async_work(); +} + +void +raft_run_async_work(void) +{ + fiber_sleep(0); +} + +void +raft_run_for(double duration) +{ + assert(duration > 0); + fakeev_set_brk(duration); + double deadline = fakeev_time() + duration; + while (fakeev_time() < deadline) + raft_run_next_event(); +} + +void +raft_node_destroy(struct raft_node *node) +{ + if (raft_node_is_started(node)) + raft_node_stop(node); + raft_journal_destroy(&node->journal); +} + +static void +raft_node_broadcast_f(struct raft *raft, const struct raft_msg *msg) +{ + struct raft_node *node = container_of(raft, struct raft_node, raft); + raft_net_send(&node->net, msg); +} + +static void +raft_node_write_f(struct raft *raft, const struct raft_msg *msg) +{ + struct raft_node *node = container_of(raft, struct raft_node, raft); + raft_journal_write(&node->journal, msg); +} + +static void +raft_node_schedule_async_f(struct raft *raft) +{ + struct raft_node *node = container_of(raft, struct raft_node, raft); + node->has_work = true; + fiber_wakeup(node->worker); +} + +static int +raft_node_worker_f(va_list va) +{ + (void)va; + struct raft_node *node = fiber()->f_arg; + while (!fiber_is_cancelled()) { + node->has_work = false; + + while (node->is_work_blocked) { + if (fiber_is_cancelled()) + return 0; + fiber_yield(); + } + raft_process_async(&node->raft); + + if (!node->has_work) { + if (fiber_is_cancelled()) + return 0; + fiber_yield(); + } + } + return 0; +} + +void +raft_run_test(const char *log_file, fiber_func test) +{ + random_init(); + time_t seed = time(NULL); + srand(seed); + memory_init(); + fiber_init(fiber_c_invoke); + int fd = open(log_file, O_TRUNC); + if (fd != -1) + close(fd); + say_logger_init(log_file, 5, 1, "plain", 0); + /* Print the seed to be able to reproduce a bug with the same seed. */ + say_info("Random seed = %llu", (unsigned long long) seed); + + struct fiber *main_fiber = fiber_new("main", test); + fiber_set_joinable(main_fiber, true); + assert(main_fiber != NULL); + fiber_wakeup(main_fiber); + ev_run(loop(), 0); + fiber_join(main_fiber); + + say_logger_free(); + fiber_free(); + memory_free(); + random_free(); +} diff --git a/test/unit/raft_test_utils.h b/test/unit/raft_test_utils.h new file mode 100644 index 000000000..bc3db0c2a --- /dev/null +++ b/test/unit/raft_test_utils.h @@ -0,0 +1,287 @@ +#pragma once +/* + * Copyright 2010-2020, Tarantool AUTHORS, please see AUTHORS file. + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include "fakesys/fakeev.h" +#include "fiber.h" +#include "raft/raft.h" +#include "unit.h" + +/** WAL simulation. It stores a list of rows which raft wanted to persist. */ +struct raft_journal { + /** Instance ID to propagate the needed vclock component. */ + uint32_t instance_id; + /** Journal vclock, belongs to the journal, not to the core raft. */ + struct vclock vclock; + /** An array of rows collected from raft. */ + struct raft_msg *rows; + /** Number of rows in the journal. */ + int size; +}; + +/** + * Network simulation. There is no real sending of anything. Instead, all + * messages are saved into a list, where they can be checked on their + * correctness. All messages provided by raft are copied and saved here. + */ +struct raft_net { + /** Array of messages. */ + struct raft_msg *msgs; + /** Number of messages. */ + int count; +}; + +/** + * Raft node + all its environment. Journal, network, configuration. The node + * provides helper methods to work with the raft instance. + */ +struct raft_node { + /** Raft instance. Everything else is the environment. */ + struct raft raft; + /** Journal. Survives restart. */ + struct raft_journal journal; + /** Network. Does not survive restart. */ + struct raft_net net; + /** + * Worker fiber for async work. It can be blocked in order to test what + * happens when async work is not dispatched too long. + */ + struct fiber *worker; + /** + * Trigger installed by the node automatically, to increment update + * counter. + */ + struct trigger on_update; + /** + * Update counter helps to check if the triggers are called when + * expected. Each trigger invocation increments it. + */ + int update_count; + /** + * True if async work was scheduled by raft, but it wasn't dispatched + * yet. + */ + bool has_work; + /** + * True if the worker fiber should stop executing async work and should + * wait for an explicit unblock. + */ + bool is_work_blocked; + + /** + * Configuration options. Saved here for the sake of being able to + * survive a restart. + */ + bool cfg_is_enabled; + bool cfg_is_candidate; + double cfg_election_timeout; + int cfg_election_quorum; + double cfg_death_timeout; + uint32_t cfg_instance_id; + struct vclock *cfg_vclock; +}; + +/** Create a raft node from the scratch. */ +void +raft_node_create(struct raft_node *node); + +/** Remove all network messages. To simplify testing. */ +void +raft_node_net_drop(struct raft_node *node); + +/** Check if a network message with index @a i matches the given parameters. */ +bool +raft_node_net_check_msg(const struct raft_node *node, int i, + enum raft_state state, uint64_t term, uint32_t vote, + const char *vclock); + +/** Check full state of the raft instance to match the given parameters. */ +bool +raft_node_check_full_state(const struct raft_node *node, enum raft_state state, + uint32_t leader, uint64_t term, uint32_t vote, + uint64_t volatile_term, uint32_t volatile_vote, + const char *vclock); + +/** Check if a journal message with index @a i matches the given parameters. */ +bool +raft_node_journal_check_row(const struct raft_node *node, int i, uint64_t term, + uint32_t vote); + +/** Simulate @a count of WAL rows from a given replica, to propagate vclock. */ +void +raft_node_journal_follow(struct raft_node *node, uint32_t replica_id, + int64_t count); + +/** Bump term of the instance */ +void +raft_node_new_term(struct raft_node *node); + +/** Deliver @a msg message from @a source instance to the given node. */ +int +raft_node_process_msg(struct raft_node *node, const struct raft_msg *msg, + uint32_t source); + +/** + * Deliver a vote response message from @a source instance to the given node. + * It says @a source voted for @a vote in the specified @a term, and it is in + * 'follower' state. + */ +int +raft_node_send_vote_response(struct raft_node *node, uint64_t term, + uint32_t vote, uint32_t source); + +/** + * Deliver a vote request message from @a source instance to the given node. + * It says the sender has the specified vclock in @a term, and it state is + * 'candidate'. + */ +int +raft_node_send_vote_request(struct raft_node *node, uint64_t term, + const char *vclock, uint32_t source); + +/** + * Deliver a message from a leader @a source just saying that it is a leader. + * It says the sender is in 'leader' state, has the specified @a term. + */ +int +raft_node_send_leader(struct raft_node *node, uint64_t term, uint32_t source); + +/** + * Deliver a message from a follower @a source just saying that it is a + * follower. It says the sender is in 'follower' state, has the specified + * @a term. + */ +int +raft_node_send_follower(struct raft_node *node, uint64_t term, uint32_t source); + +/** Deliver a heartbeat message from @a source instance. */ +void +raft_node_send_heartbeat(struct raft_node *node, uint32_t source); + +/** Restart the node. The same as stop + start. */ +void +raft_node_restart(struct raft_node *node); + +/** + * Stop the node. The raft instance is destroyed, the worker is stopped, network + * messages are lost. + */ +void +raft_node_stop(struct raft_node *node); + +/** Start the node. Raft instance is created and recovered from the journal. */ +void +raft_node_start(struct raft_node *node); + +/** Block async work execution. */ +void +raft_node_block(struct raft_node *node); + +/** Unblock async work execution. */ +void +raft_node_unblock(struct raft_node *node); + +/** Configuration methods. */ + +void +raft_node_cfg_is_enabled(struct raft_node *node, bool value); + +void +raft_node_cfg_is_candidate(struct raft_node *node, bool value); + +void +raft_node_cfg_election_timeout(struct raft_node *node, double value); + +void +raft_node_cfg_election_quorum(struct raft_node *node, int value); + +void +raft_node_cfg_death_timeout(struct raft_node *node, double value); + +/** Check that @a msg message matches the given arguments. */ +bool +raft_msg_check(const struct raft_msg *msg, enum raft_state state, uint64_t term, + uint32_t vote, const char *vclock); + +/** Propagate event loop to a next event and handle it. */ +void +raft_run_next_event(void); + +/** Give worker fibers time to finish their work. */ +void +raft_run_async_work(void); + +/** Run event loop for @a duration number of seconds. */ +void +raft_run_for(double duration); + +/** Destroy the raft instance and its environment. */ +void +raft_node_destroy(struct raft_node *node); + +/** Global monotonic time used by the raft instance. */ +static inline double +raft_time(void) +{ + return fakeev_time(); +} + +/** + * A helper to simplify transformation of a vclock string to an object. Without + * caring about errors. + */ +static void +raft_vclock_from_string(struct vclock *vclock, const char *str) +{ + vclock_create(vclock); + size_t rc = vclock_from_string(vclock, str); + assert(rc == 0); + (void)rc; +} + +/** + * A helper to initialize all the necessary subsystems before @a test, and free + * them afterwards. + */ +void +raft_run_test(const char *log_file, fiber_func test); + +#define raft_start_test(n) { \ + header(); \ + say_verbose("-------- RAFT start test %s --------", __func__); \ + plan(n); \ +} + +#define raft_finish_test() { \ + say_verbose("-------- RAFT end test %s --------", __func__); \ + fakeev_reset(); \ + check_plan(); \ + footer(); \ +}