Tarantool development patches archive
 help / color / mirror / Atom feed
From: Vladimir Davydov <vdavydov.dev@gmail.com>
To: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: tml <tarantool-patches@freelists.org>,
	Alexander Turenko <alexander.turenko@tarantool.org>
Subject: Re: [RFC v2] slab_arena: Enhance slab_arena_create to support madvise hints
Date: Mon, 13 May 2019 13:06:42 +0300	[thread overview]
Message-ID: <20190513100642.h3ye4sphinkcemv4@esperanza> (raw)
In-Reply-To: <20190506174321.11614-1-gorcunov@gmail.com>

[Cc += Alexander]

Alexander, please take a look at how Cyrill handled the backward
compatibility problem.

Cyrill, please see a few comments inline.

On Mon, May 06, 2019 at 08:43:21PM +0300, Cyrill Gorcunov wrote:
> The purpose of this extension is to be able to eliminate
> some of slab arenas from dumping, especially when huge
> amount of memory involves.
> 
> To keep backward compatibility we map flags passed into
> new SLAB_ARENA_x flags. To be able to distinguish old
> and new interfaces we reserve high bit of the integer
> value and remap flags internally to syscall compatible
> form.
> 
> Same time I have to modify build script to define
> _GNU_SOURCE symbol otherwise madvise bits won't be
> found.
> 
> Part-of https://github.com/tarantool/tarantool/issues/3509
> ---
> v2:
>   - Encode madvise hint inside slab_arena_create flags, for backward compatibility
>   - Enhance test to address dynamic slabs
>   - Enhance build procedure to test all falgs and functions we're interested in
> 
> Please review very carefully. I still don't modify tarantool code, I'm pretty
> sure it should be a separate patch since this one is big enough already and
> more importantly it *must not* break existing code thus better testing on
> its own.
> 
>  .gitignore           |   1 +
>  CMakeLists.txt       |  19 +++++++
>  small/config.h.cmake |  33 +++++++++++
>  small/slab_arena.c   |  52 +++++++++++++++---
>  small/slab_arena.h   |  25 ++++++++-
>  test/slab_arena.c    | 128 +++++++++++++++++++++++++++++++++++++++++++
>  6 files changed, 248 insertions(+), 10 deletions(-)
>  create mode 100644 small/config.h.cmake
> 
> diff --git a/.gitignore b/.gitignore
> index e2339c3..c4e4ade 100644
> --- a/.gitignore
> +++ b/.gitignore
> @@ -17,3 +17,4 @@ build/
>  CTestTestfile.cmake
>  Testing
>  CTestTestfile.cmake
> +small/config.h
> diff --git a/CMakeLists.txt b/CMakeLists.txt
> index ad27423..208528e 100644
> --- a/CMakeLists.txt
> +++ b/CMakeLists.txt
> @@ -1,6 +1,9 @@
>  project(small C CXX)
>  cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
>  
> +include(CheckFunctionExists)
> +include(CheckSymbolExists)
> +
>  if(NOT CMAKE_BUILD_TYPE)
>      set(CMAKE_BUILD_TYPE Debug)
>  endif()
> @@ -15,10 +18,26 @@ endif()
>  # Enable GNU glibc extentions.
>  add_definitions("-D_GNU_SOURCE")
>  
> +set(CMAKE_REQUIRED_DEFINITIONS "-D_GNU_SOURCE")
> +
> +check_symbol_exists(MAP_ANON sys/mman.h TARANTOOL_SMALL_HAVE_MAP_ANON)
> +check_symbol_exists(MAP_ANONYMOUS sys/mman.h TARANTOOL_SMALL_HAVE_MAP_ANONYMOUS)
> +
> +check_function_exists(madvise TARANTOOL_SMALL_HAS_MADVISE)

HAS/HAVE - please be consistent in naming.

> +check_symbol_exists(MADV_NORMAL sys/mman.h TARANTOOL_SMALL_HAS_MADV_NORMAL)
> +check_symbol_exists(MADV_DONTDUMP sys/mman.h TARANTOOL_SMALL_HAS_MADV_DONTDUMP)

Do we really need to check all of that? Is a check for MADV_DONTDUMP not
enough?

> +
> +configure_file(
> +    "small/config.h.cmake"
> +    "small/config.h"
> +    )
> +message (STATUS "")
> +
>  # Valgrind
>  include_directories(third_party)
>  
>  set(lib_headers
> +    small/config.h
>      small/ibuf.h
>      small/lf_lifo.h
>      small/lifo.h
> diff --git a/small/config.h.cmake b/small/config.h.cmake
> new file mode 100644
> index 0000000..cbb0429
> --- /dev/null
> +++ b/small/config.h.cmake
> @@ -0,0 +1,33 @@
> +#ifndef TARANTOOL_SMALL_CONFIG_H_INCLUDED
> +#define TARANTOOL_SMALL_CONFIG_H_INCLUDED
> +
> +/*
> + * Check for deprecated MAP_ANON.
> + */
> +#cmakedefine TARANTOOL_SMALL_HAS_MAP_ANON 1
> +#cmakedefine TARANTOOL_SMALL_HAS_MAP_ANONYMOUS 1
> +
> +#if !defined(TARANTOOL_SMALL_HAS_MAP_ANONYMOUS) && defined(TARANTOOL_SMALL_HAS_MAP_ANON)
> +/*
> + * MAP_ANON is deprecated, MAP_ANONYMOUS should be used instead.
> + * Unfortunately, it's not universally present (e.g. not present
> + * on FreeBSD.
> + */
> +# define MAP_ANONYMOUS MAP_ANON
> +#endif
> +
> +/*
> + * Defined if this platform has madvise(..)
> + * and flags we're interested in.
> + */
> +#cmakedefine TARANTOOL_SMALL_HAS_MADVISE 1
> +#cmakedefine TARANTOOL_SMALL_HAS_MADV_NORMAL 1
> +#cmakedefine TARANTOOL_SMALL_HAS_MADV_DONTDUMP 1
> +
> +#if defined(TARANTOOL_SMALL_HAS_MADVISE)	&& \
> +    defined(TARANTOOL_SMALL_HAS_MADV_NORMAL)	&& \
> +    defined(TARANTOOL_SMALL_HAS_MADV_DONTDUMP)
> +# define TARANTOOL_SMALL_USE_MADVISE 1
> +#endif
> +
> +#endif /* TARANTOOL_SMALL_CONFIG_H_INCLUDED */
> diff --git a/small/slab_arena.c b/small/slab_arena.c
> index 26e2620..0b6f2f5 100644
> --- a/small/slab_arena.c
> +++ b/small/slab_arena.c
> @@ -41,17 +41,12 @@
>  #include <valgrind/valgrind.h>
>  #include <valgrind/memcheck.h>
>  
> -#if !defined(MAP_ANONYMOUS)
> -#define MAP_ANONYMOUS MAP_ANON
> -#endif
> -
>  static void
>  munmap_checked(void *addr, size_t size)
>  {
>  	if (munmap(addr, size)) {
>  		char buf[64];
> -		intptr_t ignore_it = (intptr_t)strerror_r(errno, buf,
> -							  sizeof(buf));
> +		char *ignore_it = strerror_r(errno, buf, sizeof(buf));
>  		(void)ignore_it;
>  		fprintf(stderr, "Error in munmap(%p, %zu): %s\n",
>  			addr, size, buf);
> @@ -123,11 +118,38 @@ pow2round(size_t size)
>  #define MAX(a, b) ((a) > (b) ? (a) : (b))
>  #define MIN(a, b) ((a) < (b) ? (a) : (b))
>  
> +static void
> +slab_arena_init_flags(struct slab_arena *arena, int slab_flags)
> +{
> +#ifdef TARANTOOL_SMALL_USE_MADVISE
> +	arena->madv_flag = MADV_NORMAL;
> +#endif
> +
> +	/*
> +	 * Old interface for backward compatibility, MAP_
> +	 * flags are passed directly without SLAB_ARENA_FLAG_MARK.
> +	 */
> +	if (!(slab_flags & SLAB_ARENA_FLAG_MARK)) {
> +		assert(slab_flags & (MAP_PRIVATE | MAP_SHARED));
> +		arena->flags = slab_flags;
> +		return;
> +	}
> +
> +	if (!(slab_flags & SLAB_ARENA_SHARED))
> +		arena->flags = MAP_PRIVATE;
> +	else
> +		arena->flags = MAP_SHARED;
> +
> +#ifdef TARANTOOL_SMALL_USE_MADVISE
> +	if (slab_flags & SLAB_ARENA_DONTDUMP)
> +		arena->madv_flag = MADV_DONTDUMP;
> +#endif

May be, better store slab_flags in arena and convert them to
mmap/madvise flags when necessary? Would be one variable instead
of two.

> +}
> +
>  int
>  slab_arena_create(struct slab_arena *arena, struct quota *quota,
> -		  size_t prealloc, uint32_t slab_size, int flags)
> +		  size_t prealloc, uint32_t slab_size, int slab_flags)
>  {
> -	assert(flags & (MAP_PRIVATE | MAP_SHARED));
>  	lf_lifo_init(&arena->cache);
>  	VALGRIND_MAKE_MEM_DEFINED(&arena->cache, sizeof(struct lf_lifo));
>  
> @@ -148,7 +170,7 @@ slab_arena_create(struct slab_arena *arena, struct quota *quota,
>  
>  	arena->used = 0;
>  
> -	arena->flags = flags;
> +	slab_arena_init_flags(arena, slab_flags);
>  
>  	if (arena->prealloc) {
>  		arena->arena = mmap_checked(arena->prealloc,
> @@ -157,6 +179,12 @@ slab_arena_create(struct slab_arena *arena, struct quota *quota,
>  	} else {
>  		arena->arena = NULL;
>  	}
> +
> +#ifdef TARANTOOL_SMALL_USE_MADVISE
> +	if (arena->arena && arena->madv_flag != MADV_NORMAL)
> +		madvise(arena->arena, arena->slab_size, arena->madv_flag);
> +#endif
> +
>  	return arena->prealloc && !arena->arena ? -1 : 0;
>  }
>  
> @@ -205,6 +233,12 @@ slab_map(struct slab_arena *arena)
>  		__sync_sub_and_fetch(&arena->used, arena->slab_size);
>  		quota_release(arena->quota, arena->slab_size);
>  	}
> +
> +#ifdef TARANTOOL_SMALL_USE_MADVISE
> +	if (ptr && arena->madv_flag != MADV_NORMAL)
> +		madvise(ptr, arena->slab_size, arena->madv_flag);
> +#endif
> +

I'd move this code to a helper function to avoid code duplication in
slab_arena_create/slab_map.

>  	VALGRIND_MAKE_MEM_UNDEFINED(ptr, arena->slab_size);
>  	return ptr;
>  }
> diff --git a/small/slab_arena.h b/small/slab_arena.h
> index 364252f..133878b 100644
> --- a/small/slab_arena.h
> +++ b/small/slab_arena.h
> @@ -33,6 +33,7 @@
>  #include "lf_lifo.h"
>  #include <sys/mman.h>
>  #include <limits.h>
> +#include "config.h"
>  
>  #if defined(__cplusplus)
>  extern "C" {
> @@ -45,6 +46,22 @@ enum {
>  	SMALL_UNLIMITED = SIZE_MAX/2 + 1
>  };
>  
> +/**
> + * Backward compatible flags to be used with slab_arena_create().

Please elaborate the comment: explain how this backward compatibility
hack works, what it's needed for.

> + */
> +#define SLAB_ARENA_FLAG_MARK	(0x80000000)
> +#define SLAB_ARENA_FLAG(x)	((x) | SLAB_ARENA_FLAG_MARK)
> +
> +enum {
> +	/* mmap() flags */
> +	SLAB_ARENA_PRIVATE	= SLAB_ARENA_FLAG(1 << 0),
> +	SLAB_ARENA_SHARED	= SLAB_ARENA_FLAG(1 << 1),
> +
> +	/* madvise() flags */
> +	SLAB_ARENA_NORMAL	= SLAB_ARENA_FLAG(1 << 2),
> +	SLAB_ARENA_DONTDUMP	= SLAB_ARENA_FLAG(1 << 3)
> +};
> +
>  /**
>   * slab_arena -- a source of large aligned blocks of memory.
>   * MT-safe.
> @@ -97,12 +114,18 @@ struct slab_arena {
>  	 * mmap() flags: MAP_SHARED or MAP_PRIVATE
>  	 */
>  	int flags;
> +#ifdef TARANTOOL_SMALL_USE_MADVISE
> +	/**
> +	 * madvise() flag: MADV_DONTDUMP or MADV_NORMAL.
> +	 */
> +	int madv_flag;
> +#endif
>  };
>  
>  /** Initialize an arena.  */
>  int
>  slab_arena_create(struct slab_arena *arena, struct quota *quota,
> -		  size_t prealloc, uint32_t slab_size, int flags);
> +		  size_t prealloc, uint32_t slab_size, int slab_flags);
>  
>  /** Destroy an arena. */
>  void
> diff --git a/test/slab_arena.c b/test/slab_arena.c
> index d0a78d1..a9ea838 100644
> --- a/test/slab_arena.c
> +++ b/test/slab_arena.c
> @@ -2,7 +2,9 @@
>  #include <small/quota.h>
>  #include <stdio.h>
>  #include <limits.h>
> +#include <unistd.h>
>  #include <stdlib.h>
> +#include <string.h>
>  #include <time.h>
>  #include "unit.h"
>  
> @@ -15,6 +17,130 @@ slab_arena_print(struct slab_arena *arena)
>  	       arena->used, arena->slab_size);
>  }
>  
> +#define is_hex_digit(c)				\
> +	(((c) >= '0' && (c) <= '9')	||	\
> +	 ((c) >= 'a' && (c) <= 'f')	||	\
> +	 ((c) >= 'A' && (c) <= 'F'))
> +
> +static bool
> +is_vma_range_fmt(char *line, unsigned long *start, unsigned long *end)
> +{
> +	char *p = line;
> +	while (*line && is_hex_digit(*line))
> +		line++;
> +
> +	if (*line++ != '-')
> +		return false;
> +
> +	while (*line && is_hex_digit(*line))
> +		line++;
> +
> +	if (*line++ != ' ')
> +		return false;
> +
> +	sscanf(p, "%lx-%lx", start, end);
> +	return true;
> +}
> +
> +static bool
> +vma_has_flag(unsigned long vm_start, const char *flag)
> +{
> +	unsigned long start = 0, end = 0;
> +	char buf[1024], *tok = NULL;
> +	bool found = false;
> +	FILE *f = NULL;
> +
> +	f = fopen("/proc/self/smaps", "r");
> +	if (!f) {
> +		printf("ERROR: Can't open smaps for %lx\n", vm_start);
> +		return false;
> +	}
> +
> +	while (fgets(buf, sizeof(buf), f)) {
> +		if (is_vma_range_fmt(buf, &start, &end))
> +			continue;
> +		if (strncmp(buf, "VmFlags: ", 9) || start != vm_start)
> +			continue;
> +		tok = buf;
> +		break;
> +	}
> +
> +	if (tok) {
> +		for (tok = strtok(tok, " \n"); tok;
> +		     tok = strtok(NULL, " \n")) {
> +			if (strcmp(tok, flag))
> +				continue;
> +			found = true;
> +			break;
> +		}
> +	}
> +
> +	fclose(f);
> +	return found;
> +}
> +
> +static void
> +slab_test_madvise(void)
> +{
> +	struct slab_arena arena;
> +	struct quota quota;
> +	void *ptr;
> +
> +	/*
> +	 * The system doesn't support flags fetching.
> +	 */
> +	if (access("/proc/self/smaps", F_OK))
> +		return;
> +
> +	/*
> +	 * If system supports madvise call, test that
> +	 * preallocated area has been madvised.
> +	 */
> +	quota_init(&quota, 2000000);
> +	slab_arena_create(&arena, &quota, 3000000, 1, SLAB_ARENA_DONTDUMP);
> +
> +	/*
> +	 * Will fetch from preallocated area.
> +	 */
> +	ptr = slab_map(&arena);
> +	if (!ptr) {
> +		printf("ERROR: can't obtain preallocated slab\n");
> +		goto out;
> +	}
> +
> +	if (!vma_has_flag((unsigned long)ptr, "dd"))
> +		goto no_dd;
> +
> +	slab_unmap(&arena, ptr);
> +	slab_arena_destroy(&arena);
> +
> +	/*
> +	 * A new slab for dynamic allocation.
> +	 */
> +	quota_init(&quota, 2000000);
> +	slab_arena_create(&arena, &quota, 0, 0x10000, SLAB_ARENA_DONTDUMP);
> +
> +	/*
> +	 * Will fetch newly allocated area.
> +	 */
> +	ptr = slab_map(&arena);
> +	if (!ptr) {
> +		printf("ERROR: can't obtain dynamic slab\n");
> +		goto out;
> +	}
> +
> +	if (!vma_has_flag((unsigned long)ptr, "dd"))
> +		goto no_dd;
> +
> +out:
> +	slab_unmap(&arena, ptr);
> +	slab_arena_destroy(&arena);
> +	return;
> +no_dd:
> +	printf("ERROR: Expected dd flag on VMA address %p\n", ptr);
> +	goto out;
> +}
> +
>  int main()
>  {
>  	struct quota quota;
> @@ -42,4 +168,6 @@ int main()
>  	slab_arena_create(&arena, &quota, 3000000, 1, MAP_PRIVATE);
>  	slab_arena_print(&arena);
>  	slab_arena_destroy(&arena);
> +
> +	slab_test_madvise();
>  }

  reply	other threads:[~2019-05-13 10:06 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-05-06 17:43 Cyrill Gorcunov
2019-05-13 10:06 ` Vladimir Davydov [this message]
2019-05-13 10:30   ` Cyrill Gorcunov
2019-05-13 11:21     ` Vladimir Davydov
2019-05-13 11:41       ` Cyrill Gorcunov
2019-05-13 11:56         ` Vladimir Davydov
2019-05-13 12:02           ` Cyrill Gorcunov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190513100642.h3ye4sphinkcemv4@esperanza \
    --to=vdavydov.dev@gmail.com \
    --cc=alexander.turenko@tarantool.org \
    --cc=gorcunov@gmail.com \
    --cc=tarantool-patches@freelists.org \
    --subject='Re: [RFC v2] slab_arena: Enhance slab_arena_create to support madvise hints' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox