[Tarantool-patches] [PATCH 1/2] fiber: make wakeup in Lua and C nop on self

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Sat Apr 24 02:15:53 MSK 2021


fiber.wakeup() in Lua and fiber_wakeup() in C could lead to a
crash or undefined behaviour when called on the currently running
fiber.

In particular, if after wakeup was started a new fiber in a
blocking way (fiber.create() and fiber_start()) it would crash in
debug build, and lead to unknown results in release.

If after wakeup was made a sleep with non-zero timeout or an
infinite yield (fiber_yield()), the fiber woke up in the same
event loop iteration regardless of any timeout or other wakeups.
It was a spurious wakeup, which is not expected in most of the
places internally.

The patch makes the wakeup nop on the current fiber making it safe
to use anywhere.

Closes #5292
Closes #6043

@TarantoolBot document
Title: fiber.wakeup() in Lua and fiber_wakeup() in C are nop on self
In Lua `fiber.wakeup()` being called on the current fiber does not
do anything, safe to use. The same for `fiber_wakeup()` in C.
---
 .../unreleased/gh-6043-fiber-wakeup-self.md   | 33 +++++++
 src/lib/core/fiber.c                          | 85 +++++++++++--------
 src/lib/core/fiber.h                          |  6 +-
 .../gh-6043-fiber-wakeup-self.test.lua        | 50 +++++++++++
 test/unit/fiber.cc                            | 33 ++++++-
 test/unit/fiber.result                        |  2 +
 test/unit/fiber_stress.cc                     |  6 +-
 7 files changed, 171 insertions(+), 44 deletions(-)
 create mode 100644 changelogs/unreleased/gh-6043-fiber-wakeup-self.md
 create mode 100755 test/app-tap/gh-6043-fiber-wakeup-self.test.lua

diff --git a/changelogs/unreleased/gh-6043-fiber-wakeup-self.md b/changelogs/unreleased/gh-6043-fiber-wakeup-self.md
new file mode 100644
index 000000000..5869e2696
--- /dev/null
+++ b/changelogs/unreleased/gh-6043-fiber-wakeup-self.md
@@ -0,0 +1,33 @@
+## bugfix/core
+
+* **[Breaking change]** `fiber.wakeup()` in Lua and `fiber_wakeup()` in C became
+  NOP on the currently running fiber. Previously they allowed to "ignore" the
+  next yield or sleep leading to unexpected spurious wakeups. Could lead to a
+  crash (in debug build) or undefined behaviour (in release build) if called
+  right before `fiber.create()` in Lua or `fiber_start()` in C (gh-6043).
+
+  There was a single usecase for that - reschedule in the same event loop
+  iteration which is not the same as `fiber.sleep(0)` in Lua and
+  `fiber_sleep(0)` in C. Could be done in C like that:
+  ```C
+  fiber_wakeup(fiber_self());
+  fiber_yield();
+  ```
+  and in Lua like that:
+  ```Lua
+  fiber.self():wakeup()
+  fiber.yield()
+  ```
+  Now to get the same effect in C use `fiber_reschedule()`. In Lua it is now
+  simply impossible to reschedule the current fiber in the same event loop
+  iteration directly. But still can reschedule self through a second fiber like
+  this (**never use it, please**):
+  ```Lua
+  local self = fiber.self()
+  fiber.new(function() self:wakeup() end)
+  fiber.sleep(0)
+  ```
+
+----
+Breaking change: `fiber.wakeup()` in Lua and `fiber_wakeup()` in C became NOP on
+the currently running fiber.
diff --git a/src/lib/core/fiber.c b/src/lib/core/fiber.c
index f8b85d99d..cb6295171 100644
--- a/src/lib/core/fiber.c
+++ b/src/lib/core/fiber.c
@@ -396,10 +396,13 @@ fiber_call_impl(struct fiber *callee)
 	assert(rlist_empty(&callee->state));
 	assert(caller);
 	assert(caller != callee);
+	assert((caller->flags & FIBER_IS_RUNNING) != 0);
+	assert((callee->flags & FIBER_IS_RUNNING) == 0);
 
+	caller->flags &= ~FIBER_IS_RUNNING;
 	cord->fiber = callee;
+	callee->flags = (callee->flags & ~FIBER_IS_READY) | FIBER_IS_RUNNING;
 
-	callee->flags &= ~FIBER_IS_READY;
 	ASAN_START_SWITCH_FIBER(asan_state, 1,
 				callee->stack,
 				callee->stack_size);
@@ -443,19 +446,9 @@ fiber_checkstack(void)
 	return false;
 }
 
-/**
- * Interrupt a synchronous wait of a fiber inside the event loop.
- * We do so by keeping an "async" event in every fiber, solely
- * for this purpose, and raising this event here.
- *
- * @note: if this is sent to self, followed by a fiber_yield()
- * call, it simply reschedules the fiber after other ready
- * fibers in the same event loop iteration.
- */
-void
-fiber_wakeup(struct fiber *f)
+static void
+fiber_make_ready(struct fiber *f)
 {
-	assert(! (f->flags & FIBER_IS_DEAD));
 	/**
 	 * Do nothing if the fiber is already in cord->ready
 	 * list *or* is in the call chain created by
@@ -464,21 +457,12 @@ fiber_wakeup(struct fiber *f)
 	 * but the same game is deadly when the fiber is in
 	 * the callee list created by fiber_schedule_list().
 	 *
-	 * To put it another way, fiber_wakeup() is a 'request' to
+	 * To put it another way, fiber_make_ready() is a 'request' to
 	 * schedule the fiber for execution, and once it is executing
-	 * a wakeup request is considered complete and it must be
+	 * the 'make ready' request is considered complete and it must be
 	 * removed.
-	 *
-	 * A dead fiber can be lingering in the cord fiber list
-	 * if it is joinable. This makes it technically possible
-	 * to schedule it. We would never make such a mistake
-	 * in our own code, hence the assert above. But as long
-	 * as fiber.wakeup() is a part of public Lua API, an
-	 * external rock can mess things up. Ignore such attempts
-	 * as well.
 	 */
-	if (f->flags & (FIBER_IS_READY | FIBER_IS_DEAD))
-		return;
+	assert((f->flags & (FIBER_IS_DEAD | FIBER_IS_READY)) == 0);
 	struct cord *cord = cord();
 	if (rlist_empty(&cord->ready)) {
 		/*
@@ -503,6 +487,23 @@ fiber_wakeup(struct fiber *f)
 	f->flags |= FIBER_IS_READY;
 }
 
+void
+fiber_wakeup(struct fiber *f)
+{
+	/*
+	 * DEAD is checked both in the assertion and in the release build
+	 * because it should not ever happen, at least internally. But in some
+	 * user modules it might happen, and better ignore such fibers.
+	 * Especially since this was allowed for quite some time in the public
+	 * API and need to keep it if it costs nothing, for backward
+	 * compatibility.
+	 */
+	assert((f->flags & FIBER_IS_DEAD) == 0);
+	const int no_flags = FIBER_IS_READY | FIBER_IS_DEAD | FIBER_IS_RUNNING;
+	if ((f->flags & no_flags) == 0)
+		fiber_make_ready(f);
+}
+
 /** Cancel the subject fiber.
 *
  * Note: cancelation is asynchronous. Use fiber_join() to wait for the
@@ -521,7 +522,6 @@ void
 fiber_cancel(struct fiber *f)
 {
 	assert(f->fid != 0);
-	struct fiber *self = fiber();
 	/**
 	 * Do nothing if the fiber is dead, since cancelling
 	 * the fiber would clear the diagnostics area and
@@ -535,10 +535,8 @@ fiber_cancel(struct fiber *f)
 	/**
 	 * Don't wake self and zombies.
 	 */
-	if (f != self) {
-		if (f->flags & FIBER_IS_CANCELLABLE)
-			fiber_wakeup(f);
-	}
+	if ((f->flags & FIBER_IS_CANCELLABLE) != 0)
+		fiber_wakeup(f);
 }
 
 /**
@@ -602,7 +600,14 @@ fiber_clock64(void)
 void
 fiber_reschedule(void)
 {
-	fiber_wakeup(fiber());
+	struct fiber *f = fiber();
+	/*
+	 * The current fiber can't be dead, the flag is set when the fiber
+	 * function returns. Can't be ready, because such status is assigned
+	 * only to the queued fibers in the ready-list.
+	 */
+	assert((f->flags & (FIBER_IS_READY | FIBER_IS_DEAD)) == 0);
+	fiber_make_ready(f);
 	fiber_yield();
 }
 
@@ -686,8 +691,13 @@ fiber_yield(void)
 
 	assert(callee->flags & FIBER_IS_READY || callee == &cord->sched);
 	assert(! (callee->flags & FIBER_IS_DEAD));
+	assert((caller->flags & FIBER_IS_RUNNING) != 0);
+	assert((callee->flags & FIBER_IS_RUNNING) == 0);
+
+	caller->flags &= ~FIBER_IS_RUNNING;
 	cord->fiber = callee;
-	callee->flags &= ~FIBER_IS_READY;
+	callee->flags = (callee->flags & ~FIBER_IS_READY) | FIBER_IS_RUNNING;
+
 	ASAN_START_SWITCH_FIBER(asan_state,
 				(caller->flags & FIBER_IS_DEAD) == 0,
 				callee->stack,
@@ -748,10 +758,6 @@ fiber_sleep(double delay)
 	if (delay == 0) {
 		ev_idle_start(loop(), &cord()->idle_event);
 	}
-	/*
-	 * We don't use fiber_wakeup() here to ensure there is
-	 * no infinite wakeup loop in case of fiber_sleep(0).
-	 */
 	fiber_yield_timeout(delay);
 
 	if (delay == 0) {
@@ -864,7 +870,11 @@ fiber_reset(struct fiber *fiber)
 {
 	rlist_create(&fiber->on_yield);
 	rlist_create(&fiber->on_stop);
-	fiber->flags = FIBER_DEFAULT_FLAGS;
+	/*
+	 * Preserve the running flag if set. Reset might be called on the
+	 * current fiber when it is recycled.
+	 */
+	fiber->flags = FIBER_DEFAULT_FLAGS | (fiber->flags & FIBER_IS_RUNNING);
 #if ENABLE_FIBER_TOP
 	clock_stat_reset(&fiber->clock_stat);
 #endif /* ENABLE_FIBER_TOP */
@@ -1449,6 +1459,7 @@ cord_create(struct cord *cord, const char *name)
 	cord->sched.name = NULL;
 	fiber_set_name(&cord->sched, "sched");
 	cord->fiber = &cord->sched;
+	cord->sched.flags |= FIBER_IS_RUNNING;
 
 	cord->max_fid = FIBER_ID_MAX_RESERVED;
 	/*
diff --git a/src/lib/core/fiber.h b/src/lib/core/fiber.h
index b9eeef697..586bc9433 100644
--- a/src/lib/core/fiber.h
+++ b/src/lib/core/fiber.h
@@ -162,6 +162,10 @@ enum {
 	 * This flag is set when fiber uses custom stack size.
 	 */
 	FIBER_CUSTOM_STACK	= 1 << 5,
+	/**
+	 * The flag is set for the fiber currently being executed by the cord.
+	 */
+	FIBER_IS_RUNNING	= 1 << 6,
 	FIBER_DEFAULT_FLAGS = FIBER_IS_CANCELLABLE
 };
 
@@ -279,7 +283,7 @@ API_EXPORT void
 fiber_start(struct fiber *callee, ...);
 
 /**
- * Interrupt a synchronous wait of a fiber
+ * Interrupt a synchronous wait of a fiber. Nop for the currently running fiber.
  *
  * \param f fiber to be woken up
  */
diff --git a/test/app-tap/gh-6043-fiber-wakeup-self.test.lua b/test/app-tap/gh-6043-fiber-wakeup-self.test.lua
new file mode 100755
index 000000000..082c1228d
--- /dev/null
+++ b/test/app-tap/gh-6043-fiber-wakeup-self.test.lua
@@ -0,0 +1,50 @@
+#!/usr/bin/env tarantool
+
+local tap = require('tap')
+local fiber = require('fiber')
+
+--
+-- gh-6043: fiber.wakeup() on self could lead to a crash if followed by
+-- fiber.create(). Also it could lead to spurious wakeup if followed by a sleep
+-- with non-zero timeout, or by, for example, an infinite yield such as WAL
+-- write.
+--
+
+local function test_wakeup_self_and_call(test)
+    test:plan(3)
+
+    local self = fiber.self()
+    fiber.wakeup(self)
+    local count = 0
+    local f = fiber.create(function()
+        count = count + 1
+        fiber.yield()
+        count = count + 1
+    end)
+    test:is(count, 1, 'created a fiber')
+    fiber.yield()
+    test:is(count, 2, 'it yielded')
+    test:is(f:status(), 'dead', 'and ended')
+end
+
+local function test_wakeup_self_and_wal_write(test)
+    test:plan(1)
+
+    local s = box.schema.create_space('test')
+    s:create_index('pk')
+
+    fiber.wakeup(fiber.self())
+    local lsn = box.info.lsn
+    s:replace{1}
+    test:is(box.info.lsn, lsn + 1, 'written to WAL')
+    s:drop()
+end
+
+box.cfg{}
+
+local test = tap.test('gh-6043-fiber-wakeup-self')
+test:plan(2)
+test:test('wakeup self + call', test_wakeup_self_and_call)
+test:test('wakeup self + wal write', test_wakeup_self_and_wal_write)
+
+os.exit(test:check() and 0 or 1)
diff --git a/test/unit/fiber.cc b/test/unit/fiber.cc
index 9c1a23bdd..a1e3ded87 100644
--- a/test/unit/fiber.cc
+++ b/test/unit/fiber.cc
@@ -138,8 +138,7 @@ fiber_join_test()
 	fiber_set_joinable(fiber, true);
 	fiber_wakeup(fiber);
 	/** Let the fiber schedule */
-	fiber_wakeup(fiber());
-	fiber_yield();
+	fiber_reschedule();
 	note("by this time the fiber should be dead already");
 	fiber_cancel(fiber);
 	fiber_join(fiber);
@@ -201,12 +200,42 @@ fiber_name_test()
 	footer();
 }
 
+static void
+fiber_wakeup_self_test()
+{
+	header();
+
+	struct fiber *f = fiber();
+
+	fiber_wakeup(f);
+	double duration = 0.001;
+	uint64_t t1 = fiber_clock64();
+	fiber_sleep(duration);
+	uint64_t t2 = fiber_clock64();
+	/*
+	 * It was a real sleep, not 0 duration. Wakeup is nop on the running
+	 * fiber.
+	 */
+	assert(t2 - t1 >= duration);
+
+	/*
+	 * Wakeup + start of a new fiber. This is different from yield but
+	 * works without crashes too.
+	 */
+	struct fiber *newf = fiber_new_xc("nop", noop_f);
+	fiber_wakeup(f);
+	fiber_start(newf);
+
+	footer();
+}
+
 static int
 main_f(va_list ap)
 {
 	fiber_name_test();
 	fiber_join_test();
 	fiber_stack_test();
+	fiber_wakeup_self_test();
 	ev_break(loop(), EVBREAK_ALL);
 	return 0;
 }
diff --git a/test/unit/fiber.result b/test/unit/fiber.result
index a61e0a2b8..36bf2a599 100644
--- a/test/unit/fiber.result
+++ b/test/unit/fiber.result
@@ -17,3 +17,5 @@ SystemError Failed to allocate 42 bytes in allocator for exception: Cannot alloc
 # normal-stack fiber not crashed
 # big-stack fiber not crashed
 	*** fiber_stack_test: done ***
+	*** fiber_wakeup_self_test ***
+	*** fiber_wakeup_self_test: done ***
diff --git a/test/unit/fiber_stress.cc b/test/unit/fiber_stress.cc
index 44ab12487..aa892d0a0 100644
--- a/test/unit/fiber_stress.cc
+++ b/test/unit/fiber_stress.cc
@@ -9,10 +9,8 @@ enum {
 static int
 yield_f(va_list ap)
 {
-	for (int i = 0; i < ITERATIONS; i++) {
-		fiber_wakeup(fiber());
-		fiber_yield();
-	}
+	for (int i = 0; i < ITERATIONS; i++)
+		fiber_reschedule();
 	return 0;
 }
 
-- 
2.24.3 (Apple Git-128)



More information about the Tarantool-patches mailing list