[PATCH 3/3] lib/core/fiber: Put watermarks into stack to track its usage

Vladimir Davydov vdavydov.dev at gmail.com
Wed Mar 13 16:52:04 MSK 2019


On Wed, Mar 13, 2019 at 01:47:21AM +0300, Cyrill Gorcunov wrote:
> We want to detect a situation where task in fiber is too eager for
> stack and close to its exhausting. For this sake upon stack creation
> we put 8 marks on last stack page with step of 128 bytes. Such params
> allows us to fill ~1/4 of a page, which does seem reasonable but
> we might change this params with time.
> 
> Since the watermark position is permanent and some task is close
> to stack limit we print about the situation once to not spam
> a user much and stop putting the mark on recycling.
> 
> Still this techique doesn't guarantee to handle all possible
> situations so to increas probability of catching eager fibers
> we put marks *not* at page start address but withing some offset,
> randomly generated.
> 
> To minimize a pressue on memory system we try to relax stack
> usage with that named "shink" watermark. Basically it is the
> same mark as for overflow detection but placed at 64K bound
> and on every recycle we try to shrink stack usage dropping
> pages if the watermark has been erased by a task.
> 
> Closes #3418
> 
> Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>
> ---
>  src/lib/core/fiber.c | 167 +++++++++++++++++++++++++++++++++++++++++++
>  src/lib/core/fiber.h |  15 ++++
>  2 files changed, 182 insertions(+)
> 
> diff --git a/src/lib/core/fiber.c b/src/lib/core/fiber.c
> index a1b261ad4..cae291d1a 100644
> --- a/src/lib/core/fiber.c
> +++ b/src/lib/core/fiber.c
> @@ -104,6 +104,27 @@ static const struct fiber_attr fiber_attr_default = {
>         .flags = FIBER_DEFAULT_FLAGS
>  };
>  
> +/*
> + * Random values generated with uuid.
> + */
> +static const uint64_t poison_pool[] = {
> +	0x74f31d37285c4c37, 0xb10269a05bf10c29,
> +	0x0994d845bd284e0f, 0x9ffd4f7129c184df,
> +	0x357151e6711c4415, 0x8c5e5f41aafe6f28,
> +	0x6917dd79e78049d5, 0xba61957c65ca2465,
> +};
> +
> +/*
> + * We poison by 8 bytes as it natural for stack
> + * step on x86-64. Also 128 byte gap between
> + * poison values should cover a common cases.
> + */
> +#define POISON_SIZE	(sizeof(poison_pool) / sizeof(poison_pool[0]))
> +#define POISON_GAP	(128 + sizeof(poison_pool[0]))
> +#define POISON_OFF	(POISON_GAP / sizeof(poison_pool[0]))
> +
> +static void fiber_wmark_recycle(struct fiber *fiber);
> +
>  void
>  fiber_attr_create(struct fiber_attr *fiber_attr)
>  {
> @@ -624,6 +645,7 @@ fiber_recycle(struct fiber *fiber)
>  	/* no pending wakeup */
>  	assert(rlist_empty(&fiber->state));
>  	bool has_custom_stack = fiber->flags & FIBER_CUSTOM_STACK;
> +	fiber_wmark_recycle(fiber);
>  	fiber_reset(fiber);
>  	fiber->name[0] = '\0';
>  	fiber->f = NULL;
> @@ -710,6 +732,146 @@ page_align_up(void *ptr)
>  	return page_align_down(ptr + page_size - 1);
>  }
>  
> +static bool
> +stack_has_wmark(void *addr)

Please add brief comments to all the functions you add.

> +{
> +	const uint64_t *src = poison_pool;
> +	const uint64_t *dst = addr;
> +	size_t i;
> +
> +	for (i = 0; i < POISON_SIZE; i++) {
> +		if (*dst != src[i])
> +			return false;
> +		dst += POISON_OFF;
> +	}
> +
> +	return true;
> +}
> +
> +static void
> +stack_put_wmark(void *addr)
> +{
> +	const uint64_t *src = poison_pool;
> +	uint64_t *dst = addr;
> +	size_t i;
> +
> +	for (i = 0; i < POISON_SIZE; i++) {
> +		*dst = src[i];
> +		dst += POISON_OFF;
> +	}
> +}
> +
> +#ifndef TARGET_OS_DARWIN
> +static void
> +stack_shrink(struct fiber *fiber)
> +{
> +	void *start, *end;
> +
> +	/*
> +	 * When dropping pages make sure the page
> +	 * containing overflow mark is untouched.
> +	 * Same time no need to unmap the page which
> +	 * carries "shrink" wmark, since we're updating
> +	 * this page anyway.
> +	 */
> +	if (stack_direction < 0) {
> +		end = page_align_down(fiber->stack_shrink_wmark);
> +		start = page_align_up(fiber->stack_overflow_wmark);
> +		madvise(start, end - start, MADV_DONTNEED);
> +	} else {
> +		start = page_align_up(fiber->stack_shrink_wmark);
> +		end = page_align_down(fiber->stack_overflow_wmark);
> +		madvise(start, end - start, MADV_DONTNEED);
> +	}

Nit: you could move madvise() out of the if-else block.

> +	stack_put_wmark(fiber->stack_shrink_wmark);
> +}
> +#endif
> +
> +static void
> +fiber_wmark_recycle(struct fiber *fiber)
> +{
> +	static bool overflow_warned = false;
> +
> +	if (fiber->stack == NULL || fiber->flags & FIBER_CUSTOM_STACK)
> +		return;
> +
> +#ifndef TARGET_OS_DARWIN
> +	/*
> +	 * On recycle we're trying to shrink stack
> +	 * to release memory pressure but if only
> +	 * a fiber has been using too much memory.
> +	 */
> +	if (!stack_has_wmark(fiber->stack_shrink_wmark))
> +		stack_shrink(fiber);
> +#endif
> +
> +	/*
> +	 * We are watching for stack overflow in one shot way:
> +	 * simply to not spam a user with messages, if someone
> +	 * triggered the problem it is highly likely that
> +	 * an another fiber hit the same soon.
> +	 */
> +	if (overflow_warned)
> +		return;
> +
> +	if (!stack_has_wmark(fiber->stack_overflow_wmark)) {
> +		say_warn("stack usage is close to the limit of %zu bytes",
> +			 (size_t)FIBER_STACK_SIZE_DEFAULT);

IMO warning only when we are close to the limit isn't very useful.
Let's keep track of the max stack size instead and print it whenever
it's increased.

> +		overflow_warned = true;
> +	}
> +}
> +
> +static void
> +fiber_wmark_init(struct fiber *fiber)
> +{
> +	/*
> +	 * No tracking on custom stacks
> +	 * in a sake of simplicity.
> +	 */
> +	if (fiber->flags & FIBER_CUSTOM_STACK) {
> +		fiber->stack_overflow_wmark = NULL;
> +		fiber->stack_shrink_wmark = NULL;
> +		fiber->wmark_inpage_offset = 0;
> +		return;
> +	}
> +
> +	/*
> +	 * To increase probability of the stack overflow
> +	 * detection we put first mark at random position
> +	 * of the first 128 bytes range. The rest of the marks
> +	 * are put with constant step simply to not carry
> +	 * offsets in memory.
> +	 */
> +	fiber->wmark_inpage_offset = ((rand() % 128) + 8) & ~7;

Please define a constant for this.

> +
> +	/*
> +	 * Initially we arm the last page of the stack
> +	 * to catch if we're getting close to its exhausting.
> +	 *
> +	 * The shrink watermark is put at 64K limit which is
> +	 * known value to not cause much memory pressue even
> +	 * with large number of fibers.
> +	 */
> +	if (stack_direction < 0) {
> +		fiber->stack_overflow_wmark  = fiber->stack;
> +		fiber->stack_overflow_wmark += fiber->wmark_inpage_offset;
> +
> +		fiber->stack_shrink_wmark  = fiber->stack + fiber->stack_size;
> +		fiber->stack_shrink_wmark -= 16 << 12;

Ditto.

> +		fiber->stack_shrink_wmark += fiber->wmark_inpage_offset;
> +	} else {
> +		fiber->stack_overflow_wmark  = fiber->stack + fiber->stack_size;
> +		fiber->stack_overflow_wmark -= page_size;
> +		fiber->stack_overflow_wmark += fiber->wmark_inpage_offset;
> +
> +		fiber->stack_shrink_wmark  = fiber->stack;
> +		fiber->stack_shrink_wmark += (16 << 12) - page_size;
> +		fiber->stack_shrink_wmark += fiber->wmark_inpage_offset;
> +	}
> +	stack_put_wmark(fiber->stack_overflow_wmark);
> +	stack_put_wmark(fiber->stack_shrink_wmark);
> +}
> +
>  static int
>  fiber_stack_create(struct fiber *fiber, size_t stack_size)
>  {
> @@ -758,6 +920,7 @@ fiber_stack_create(struct fiber *fiber, size_t stack_size)
>  	madvise(fiber->stack, fiber->stack_size, MADV_DONTNEED);
>  #endif
>  
> +	fiber_wmark_init(fiber);
>  	mprotect(guard, page_size, PROT_NONE);
>  	return 0;
>  }
> @@ -927,8 +1090,12 @@ cord_create(struct cord *cord, const char *name)
>  	/* Record stack extents */
>  	tt_pthread_attr_getstack(cord->id, &cord->sched.stack,
>  				 &cord->sched.stack_size);
> +	cord->sched.stack_overflow_wmark = cord->sched.stack;
> +	cord->sched.stack_shrink_wmark = cord->sched.stack;

I don't think we need to set the watermarks for the sched fiber,
even if ASAN is on.

>  #else
>  	cord->sched.stack = NULL;
> +	cord->sched.stack_overflow_wmark = NULL;
> +	cord->sched.stack_shrink_wmark = NULL;
>  	cord->sched.stack_size = 0;
>  #endif
>  }
> diff --git a/src/lib/core/fiber.h b/src/lib/core/fiber.h
> index f1f5a0555..431e3da09 100644
> --- a/src/lib/core/fiber.h
> +++ b/src/lib/core/fiber.h
> @@ -348,6 +348,21 @@ struct fiber {
>  	struct slab *stack_slab;
>  	/** Coro stack addr. */
>  	void *stack;
> +	/**
> +	 * Stack watermark addr to detect
> +	 * if we need shrink stack on reuse.
> +	 */
> +	void *stack_shrink_wmark;
> +	/**
> +	 * Stack watermark addr for overflow detection. To warn
> +	 * a user about stack eager fibers.
> +	 */
> +	void *stack_overflow_wmark;
> +	/**
> +	 * An offset to watermark position in stack
> +	 * since page bound address.
> +	 */
> +	unsigned int wmark_inpage_offset;
>  	/** Coro stack size. */
>  	size_t stack_size;
>  	/** Valgrind stack id. */



More information about the Tarantool-patches mailing list