[tarantool-patches] Re: [PATCH v3/3] replication: handle replication shutdown correctly.

Serge Petrenko sergepetrenko at tarantool.org
Thu Aug 16 09:05:03 MSK 2018



> 15 авг. 2018 г., в 21:47, Vladimir Davydov <vdavydov.dev at gmail.com> написал(а):
> 
> On Wed, Aug 15, 2018 at 07:13:28PM +0300, Serge Petrenko wrote:
>> diff --git a/src/box/replication.cc b/src/box/replication.cc
>> index 48956d2ed..083ae6407 100644
>> --- a/src/box/replication.cc
>> +++ b/src/box/replication.cc
>> @@ -91,13 +91,6 @@ replication_init(void)
>> 	latch_create(&replicaset.applier.order_latch);
>> }
>> 
>> -void
>> -replication_free(void)
>> -{
>> -	free(replicaset.replica_by_id);
>> -	fiber_cond_destroy(&replicaset.applier.cond);
>> -}
>> -
>> void
>> replica_check_id(uint32_t replica_id)
>> {
>> @@ -242,6 +235,42 @@ replica_clear_applier(struct replica *replica)
>> 	trigger_clear(&replica->on_applier_state);
>> }
>> 
>> +void
>> +replication_free(void)
>> +{
>> +	struct replica *replica, *next;
>> +
>> +	replica_hash_foreach_safe(&replicaset.hash, replica, next) {
>> +		if (replica->id == instance_id) {
>> +			replica_hash_remove(&replicaset.hash, replica);
>> +			/*
>> +			 * Local replica doesn't have neither applier
>> +			 * nor relay, so ignore it.
>> +			 */
>> +			continue;
>> +		}
>> +		if (replica->applier != NULL) {
>> +			replica_clear_applier(replica);
>> +			/*
>> +			 * We're exiting, so control won't be passed
>> +			 * to appliers and we don't need to stop them.
>> +			 */
>> +		}
> 
> You don't need this code either. I want this loop to be as simple as
> 
> 	/*
> 	 * <explain why>
> 	 */
> 	replicaset_foreach(replica)
> 		relay_cancel(replica->relay);
> 
> Then you wouldn't even need to move the definition of replication_free.

Done as requested. The new diff is below. The branch remains
https://github.com/tarantool/tarantool/tree/sp/alt2-gh-3485-replication-shutdown

> 
>> +		if (replica->id != REPLICA_ID_NIL) {
>> +			/*
>> +			 * Relay threads keep sending messages
>> +			 * to tx via cbus upon shutdown, which
>> +			 * could lead to segfaults. So cancel
>> +			 * them.
>> +			 */
>> +			relay_cancel(replica->relay);
>> +		}
>> +	}
>> +
>> +	free(replicaset.replica_by_id);
>> +	fiber_cond_destroy(&replicaset.applier.cond);
>> +}
> 

---
 src/box/box.cc         |  2 +-
 src/box/relay.cc       | 16 ++++++++++++++++
 src/box/relay.h        |  4 ++++
 src/box/replication.cc | 10 ++++++++++
 src/tt_pthread.h       |  5 +++++
 5 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index ae4959d6f..28bfdd5fb 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1599,13 +1599,13 @@ box_free(void)
 	if (is_box_configured) {
 #if 0
 		session_free();
-		replication_free();
 		user_cache_free();
 		schema_free();
 		module_free();
 		tuple_free();
 		port_free();
 #endif
+		replication_free();
 		sequence_free();
 		gc_free();
 		engine_shutdown();
diff --git a/src/box/relay.cc b/src/box/relay.cc
index 4cacbc840..edd1c80b0 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -198,6 +198,16 @@ relay_start(struct relay *relay, int fd, uint64_t sync,
 	relay->state = RELAY_FOLLOW;
 }
 
+void
+relay_cancel(struct relay *relay)
+{
+	/* Check that the thread is running first. */
+	if (relay->cord.id != 0) {
+		tt_pthread_cancel(relay->cord.id);
+		tt_pthread_join(relay->cord.id, NULL);
+	}
+}
+
 static void
 relay_stop(struct relay *relay)
 {
@@ -211,6 +221,12 @@ relay_stop(struct relay *relay)
 		recovery_delete(relay->r);
 	relay->r = NULL;
 	relay->state = RELAY_STOPPED;
+	/*
+	 * Needed to track whether relay thread is running or not
+	 * for relay_cancel(). Id is reset to a positive value
+	 * upon cord_create().
+	 */
+	relay->cord.id = 0;
 }
 
 void
diff --git a/src/box/relay.h b/src/box/relay.h
index 2988e6b0d..53bf68eb8 100644
--- a/src/box/relay.h
+++ b/src/box/relay.h
@@ -61,6 +61,10 @@ enum relay_state {
 struct relay *
 relay_new(struct replica *replica);
 
+/** Cancel a running relay. Called on shutdown. */
+void
+relay_cancel(struct relay *relay);
+
 /** Destroy and delete the relay */
 void
 relay_delete(struct relay *relay);
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 48956d2ed..4b0700f90 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -94,6 +94,16 @@ replication_init(void)
 void
 replication_free(void)
 {
+	/*
+	 * Relay threads keep sending messages
+	 * to tx via cbus upon shutdown, which
+	 * could lead to segfaults. So cancel
+	 * them.
+	 */
+	replicaset_foreach(replica) {
+		relay_cancel(replica->relay);
+	}
+
 	free(replicaset.replica_by_id);
 	fiber_cond_destroy(&replicaset.applier.cond);
 }
diff --git a/src/tt_pthread.h b/src/tt_pthread.h
index 60ade50ae..d83694460 100644
--- a/src/tt_pthread.h
+++ b/src/tt_pthread.h
@@ -277,6 +277,11 @@
 	tt_pthread_error(e__);				\
 })
 
+#define tt_pthread_cancel(thread)			\
+({	int e__ = pthread_cancel(thread);		\
+	tt_pthread_error(e__);				\
+})
+
 #define tt_pthread_key_create(key, dtor)		\
 ({	int e__ = pthread_key_create(key, dtor);	\
 	tt_pthread_error(e__);				\
-- 
2.15.2 (Apple Git-101.1)




More information about the Tarantool-patches mailing list