[tarantool-patches] Re: [PATCH v3/3] replication: handle replication shutdown correctly.
Serge Petrenko
sergepetrenko at tarantool.org
Thu Aug 16 09:05:03 MSK 2018
> 15 авг. 2018 г., в 21:47, Vladimir Davydov <vdavydov.dev at gmail.com> написал(а):
>
> On Wed, Aug 15, 2018 at 07:13:28PM +0300, Serge Petrenko wrote:
>> diff --git a/src/box/replication.cc b/src/box/replication.cc
>> index 48956d2ed..083ae6407 100644
>> --- a/src/box/replication.cc
>> +++ b/src/box/replication.cc
>> @@ -91,13 +91,6 @@ replication_init(void)
>> latch_create(&replicaset.applier.order_latch);
>> }
>>
>> -void
>> -replication_free(void)
>> -{
>> - free(replicaset.replica_by_id);
>> - fiber_cond_destroy(&replicaset.applier.cond);
>> -}
>> -
>> void
>> replica_check_id(uint32_t replica_id)
>> {
>> @@ -242,6 +235,42 @@ replica_clear_applier(struct replica *replica)
>> trigger_clear(&replica->on_applier_state);
>> }
>>
>> +void
>> +replication_free(void)
>> +{
>> + struct replica *replica, *next;
>> +
>> + replica_hash_foreach_safe(&replicaset.hash, replica, next) {
>> + if (replica->id == instance_id) {
>> + replica_hash_remove(&replicaset.hash, replica);
>> + /*
>> + * Local replica doesn't have neither applier
>> + * nor relay, so ignore it.
>> + */
>> + continue;
>> + }
>> + if (replica->applier != NULL) {
>> + replica_clear_applier(replica);
>> + /*
>> + * We're exiting, so control won't be passed
>> + * to appliers and we don't need to stop them.
>> + */
>> + }
>
> You don't need this code either. I want this loop to be as simple as
>
> /*
> * <explain why>
> */
> replicaset_foreach(replica)
> relay_cancel(replica->relay);
>
> Then you wouldn't even need to move the definition of replication_free.
Done as requested. The new diff is below. The branch remains
https://github.com/tarantool/tarantool/tree/sp/alt2-gh-3485-replication-shutdown
>
>> + if (replica->id != REPLICA_ID_NIL) {
>> + /*
>> + * Relay threads keep sending messages
>> + * to tx via cbus upon shutdown, which
>> + * could lead to segfaults. So cancel
>> + * them.
>> + */
>> + relay_cancel(replica->relay);
>> + }
>> + }
>> +
>> + free(replicaset.replica_by_id);
>> + fiber_cond_destroy(&replicaset.applier.cond);
>> +}
>
---
src/box/box.cc | 2 +-
src/box/relay.cc | 16 ++++++++++++++++
src/box/relay.h | 4 ++++
src/box/replication.cc | 10 ++++++++++
src/tt_pthread.h | 5 +++++
5 files changed, 36 insertions(+), 1 deletion(-)
diff --git a/src/box/box.cc b/src/box/box.cc
index ae4959d6f..28bfdd5fb 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1599,13 +1599,13 @@ box_free(void)
if (is_box_configured) {
#if 0
session_free();
- replication_free();
user_cache_free();
schema_free();
module_free();
tuple_free();
port_free();
#endif
+ replication_free();
sequence_free();
gc_free();
engine_shutdown();
diff --git a/src/box/relay.cc b/src/box/relay.cc
index 4cacbc840..edd1c80b0 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -198,6 +198,16 @@ relay_start(struct relay *relay, int fd, uint64_t sync,
relay->state = RELAY_FOLLOW;
}
+void
+relay_cancel(struct relay *relay)
+{
+ /* Check that the thread is running first. */
+ if (relay->cord.id != 0) {
+ tt_pthread_cancel(relay->cord.id);
+ tt_pthread_join(relay->cord.id, NULL);
+ }
+}
+
static void
relay_stop(struct relay *relay)
{
@@ -211,6 +221,12 @@ relay_stop(struct relay *relay)
recovery_delete(relay->r);
relay->r = NULL;
relay->state = RELAY_STOPPED;
+ /*
+ * Needed to track whether relay thread is running or not
+ * for relay_cancel(). Id is reset to a positive value
+ * upon cord_create().
+ */
+ relay->cord.id = 0;
}
void
diff --git a/src/box/relay.h b/src/box/relay.h
index 2988e6b0d..53bf68eb8 100644
--- a/src/box/relay.h
+++ b/src/box/relay.h
@@ -61,6 +61,10 @@ enum relay_state {
struct relay *
relay_new(struct replica *replica);
+/** Cancel a running relay. Called on shutdown. */
+void
+relay_cancel(struct relay *relay);
+
/** Destroy and delete the relay */
void
relay_delete(struct relay *relay);
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 48956d2ed..4b0700f90 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -94,6 +94,16 @@ replication_init(void)
void
replication_free(void)
{
+ /*
+ * Relay threads keep sending messages
+ * to tx via cbus upon shutdown, which
+ * could lead to segfaults. So cancel
+ * them.
+ */
+ replicaset_foreach(replica) {
+ relay_cancel(replica->relay);
+ }
+
free(replicaset.replica_by_id);
fiber_cond_destroy(&replicaset.applier.cond);
}
diff --git a/src/tt_pthread.h b/src/tt_pthread.h
index 60ade50ae..d83694460 100644
--- a/src/tt_pthread.h
+++ b/src/tt_pthread.h
@@ -277,6 +277,11 @@
tt_pthread_error(e__); \
})
+#define tt_pthread_cancel(thread) \
+({ int e__ = pthread_cancel(thread); \
+ tt_pthread_error(e__); \
+})
+
#define tt_pthread_key_create(key, dtor) \
({ int e__ = pthread_key_create(key, dtor); \
tt_pthread_error(e__); \
--
2.15.2 (Apple Git-101.1)
More information about the Tarantool-patches
mailing list