[Tarantool-patches] [PATCH] Add new 'allocator' option in box.cfg

mechanik20051988 mechanik20.05.1988 at gmail.com
Fri Dec 18 16:58:04 MSK 2020


The slab allocator, which is used for tuple allocation, has a certain
disadvantage: it tends toward unresolvable fragmentation on certain
workloads (size migration). The new option allows selecting
the appropriate allocator if necessary.

@TarantoolBot document
Title: Add new 'allocator' option
Add a new 'allocator' option that allows selecting
the appropriate allocator for memtx tuples if necessary.
Use box.cfg{allocator="small"} or omit the option to use the default
small allocator; use box.cfg{allocator="system"} to use
libc malloc.

Closes #5419
---
 CMakeLists.txt                           |  11 ++
 perf/allocator_perf.test.lua             |  34 ++++
 src/box/allocator.h                      | 200 ++++++++++++++++++++
 src/box/box.cc                           |   1 +
 src/box/lua/load_cfg.lua                 |   2 +
 src/box/lua/slab.c                       |  39 +++-
 src/box/memtx_engine.c                   |  53 ++++--
 src/box/memtx_engine.h                   |  41 +++-
 src/box/system_allocator.h               | 226 +++++++++++++++++++++++
 src/trivia/config.h.cmake                |   3 +
 test/app-tap/init_script.result          |   1 +
 test/box/admin.result                    |   4 +-
 test/box/cfg.result                      |   8 +-
 test/box/choose_memtx_allocator.lua      |   9 +
 test/box/choose_memtx_allocator.result   | 135 ++++++++++++++
 test/box/choose_memtx_allocator.test.lua |  43 +++++
 16 files changed, 776 insertions(+), 34 deletions(-)
 create mode 100755 perf/allocator_perf.test.lua
 create mode 100644 src/box/allocator.h
 create mode 100644 src/box/system_allocator.h
 create mode 100644 test/box/choose_memtx_allocator.lua
 create mode 100644 test/box/choose_memtx_allocator.result
 create mode 100644 test/box/choose_memtx_allocator.test.lua

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fa6818f8e..290cd535a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -92,6 +92,17 @@ check_symbol_exists(posix_fadvise fcntl.h HAVE_POSIX_FADVISE)
 check_symbol_exists(fallocate fcntl.h HAVE_FALLOCATE)
 check_symbol_exists(mremap sys/mman.h HAVE_MREMAP)
 
+check_function_exists(malloc_usable_size HAVE_MALLOC_USABLE_SIZE)
+check_symbol_exists(malloc_size malloc/malloc.h HAVE_MALLOC_SIZE_DARWIN)
+
+if (HAVE_MALLOC_USABLE_SIZE)
+    if (TARGET_OS_LINUX)
+        set(HAVE_MALLOC_USABLE_SIZE_LINUX 1)
+    else ()
+        set(HAVE_MALLOC_USABLE_SIZE_BSD 1)
+    endif ()
+endif ()
+
 check_function_exists(sync_file_range HAVE_SYNC_FILE_RANGE)
 check_function_exists(memmem HAVE_MEMMEM)
 check_function_exists(memrchr HAVE_MEMRCHR)
diff --git a/perf/allocator_perf.test.lua b/perf/allocator_perf.test.lua
new file mode 100755
index 000000000..be270379b
--- /dev/null
+++ b/perf/allocator_perf.test.lua
@@ -0,0 +1,34 @@
+#!/usr/bin/env ../src/tarantool
+os.execute('rm -rf *.snap *.xlog *.vylog ./512 ./513 ./514 ./515 ./516 ./517 ./518 ./519 ./520 ./521')
+local clock = require('clock')
+box.cfg{listen = 3301, wal_mode='none', allocator=arg[1]}
+local space = box.schema.space.create('test')
+space:format({ {name = 'id', type = 'unsigned'}, {name = 'year', type = 'unsigned'} })
+space:create_index('primary', { parts = {'id'} })
+local time_insert = 0
+local time_replace = 0
+local time_delete = 0
+local cnt = 0
+local cnt_max = 20
+local op_max = 2500000
+local nanosec = 1.0e9
+while cnt < cnt_max do
+    cnt = cnt + 1
+    local time_before = clock.monotonic64()
+    for key = 1, op_max do space:insert({key, key + 1000}) end
+    local time_after = clock.monotonic64()
+    time_insert = time_insert + (time_after - time_before)
+    time_before = clock.monotonic64()
+    for key = 1, op_max do space:replace({key, key + 5000}) end
+    time_after = clock.monotonic64()
+    time_replace = time_replace + (time_after - time_before)
+    time_before = clock.monotonic64()
+    for key = 1, op_max do space:delete(key) end
+    time_after = clock.monotonic64()
+    time_delete = time_delete + (time_after - time_before)
+end
+-- Emit the per-round average of each phase, in seconds, as a JSON object.
+io.write(string.format("{\n  \"alloc time\": \"%.3f\",\n", tonumber(time_insert) / (nanosec * cnt_max)))
+io.write(string.format("  \"replace time\": \"%.3f\",\n", tonumber(time_replace) / (nanosec * cnt_max)))
+io.write(string.format("  \"delete time\": \"%.3f\"\n}\n", tonumber(time_delete) / (nanosec * cnt_max)))
+os.exit()
diff --git a/src/box/allocator.h b/src/box/allocator.h
new file mode 100644
index 000000000..3bea67f50
--- /dev/null
+++ b/src/box/allocator.h
@@ -0,0 +1,200 @@
+#pragma once
+/*
+ * Copyright 2010-2020, Tarantool AUTHORS, please see AUTHORS file.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the
+ *    following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <small/small.h>
+#include <trivia/util.h>
+#include <stdarg.h>
+
+#include "memtx_engine.h"
+#include "system_allocator.h"
+#include "tuple.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* defined(__cplusplus) */
+
+#define noop_one_arg(a)
+#define noop_two_arg(a, b)	
+
+struct allocator_stats {
+	size_t used;
+	size_t total;
+};
+
+/* Varargs: uint32_t objsize_min, then double alloc_factor. */
+static inline void
+_small_allocator_create(struct memtx_engine *memtx, va_list argptr)
+{
+	uint32_t objsize_min = va_arg(argptr, uint32_t);
+	small_alloc_create(memtx->alloc, &memtx->slab_cache, objsize_min,
+			   (float)va_arg(argptr, double));
+}
+
+static inline void
+_system_allocator_create(struct memtx_engine *memtx, MAYBE_UNUSED va_list argptr)
+{
+	system_alloc_create(memtx->alloc, &memtx->quota); /* no varargs read */
+}
+
+/* Varargs: mempool_stats_cb callback, then its void * context. */
+static inline void
+_small_allocator_stats(struct small_alloc *alloc, struct small_stats *stats,
+		       va_list argptr)
+{
+	mempool_stats_cb stats_cb = va_arg(argptr, mempool_stats_cb);
+	void *cb_ctx = va_arg(argptr, void *);
+	small_stats(alloc, stats, stats_cb, cb_ctx);
+}
+
+static inline void
+_system_allocator_stats(struct system_alloc *alloc, struct system_stats *stats,
+			MAYBE_UNUSED va_list argptr)
+{
+	system_stats(alloc, stats); /* no varargs read */
+}
+
+#define MEM_CHECK_FUNC(prefix, func, param)					\
+static inline void								\
+prefix##_mem_check(MAYBE_UNUSED struct prefix##_alloc *alloc)			\
+{										\
+	func(alloc->param);							\
+}
+MEM_CHECK_FUNC(small, slab_cache_check, cache)
+MEM_CHECK_FUNC(system, noop_one_arg, noop)
+
+/**
+ * Global abstract method to memory alloc
+ */
+typedef void *(*global_alloc)(void *alloc, size_t bytes);
+static global_alloc memtx_global_alloc;
+
+/**
+ * Global abstract method to memory free
+ */
+typedef void (*global_free)(void *alloc, void *ptr, size_t bytes);
+static global_free memtx_global_free;
+
+/**
+ * Global abstract method to delayed memory free
+ */
+typedef void (*global_free_delayed)(void *alloc, void *ptr, size_t bytes);
+static global_free_delayed memtx_global_free_delayed;
+
+#define DECLARE_MEMTX_ALLOCATOR_DESTROY(prefix) 				\
+static inline void								\
+prefix##_allocator_destroy(struct memtx_engine *memtx)  			\
+{										\
+	prefix##_alloc_destroy(memtx->alloc);					\
+}
+DECLARE_MEMTX_ALLOCATOR_DESTROY(small)
+DECLARE_MEMTX_ALLOCATOR_DESTROY(system)
+
+#define DECLARE_MEMTX_ALLOCATOR_CREATE(prefix)  				\
+static inline void								\
+prefix##_allocator_create(struct memtx_engine *memtx, ...)  			\
+{										\
+	va_list argptr;								\
+	va_start(argptr, memtx);						\
+	_##prefix##_allocator_create(memtx, argptr);				\
+	va_end(argptr);								\
+}
+DECLARE_MEMTX_ALLOCATOR_CREATE(small)
+DECLARE_MEMTX_ALLOCATOR_CREATE(system)
+
+#define DECLARE_MEMTX_ALLOCATOR_ENTER_DELAYED_FREE_MODE(prefix, PREFIX) 	\
+static inline void								\
+prefix##_allocator_enter_delayed_free_mode(struct memtx_engine *memtx)  	\
+{										\
+	return prefix##_##alloc_setopt(memtx->alloc,				\
+		PREFIX##_##DELAYED_FREE_MODE, true);				\
+}
+DECLARE_MEMTX_ALLOCATOR_ENTER_DELAYED_FREE_MODE(small, SMALL)
+DECLARE_MEMTX_ALLOCATOR_ENTER_DELAYED_FREE_MODE(system, SYSTEM)
+
+#define DECLARE_MEMTX_ALLOCATOR_LEAVE_DELAYED_FREE_MODE(prefix, PREFIX) 	\
+static inline void								\
+prefix##_allocator_leave_delayed_free_mode(struct memtx_engine *memtx)  	\
+{										\
+	return prefix##_##alloc_setopt(memtx->alloc,				\
+		PREFIX##_##DELAYED_FREE_MODE, false);				\
+}
+DECLARE_MEMTX_ALLOCATOR_LEAVE_DELAYED_FREE_MODE(small, SMALL)
+DECLARE_MEMTX_ALLOCATOR_LEAVE_DELAYED_FREE_MODE(system, SYSTEM)
+
+#define DECLARE_MEMTX_ALLOCATOR_STATS(prefix)   				\
+static inline void								\
+prefix##_allocator_stats(struct memtx_engine *memtx,				\
+		struct allocator_stats *stats, ...)				\
+{										\
+	va_list argptr;								\
+	va_start(argptr, stats);						\
+	struct prefix##_stats data_stats;					\
+	_##prefix##_allocator_stats(memtx->alloc, &data_stats, argptr);		\
+	va_end(argptr);								\
+	stats->used = data_stats.used;						\
+	stats->total = data_stats.total;					\
+}
+DECLARE_MEMTX_ALLOCATOR_STATS(small)
+DECLARE_MEMTX_ALLOCATOR_STATS(system)
+
+#define DECLARE_MEMTX_MEM_CHECK(prefix)  					\
+static inline void								\
+prefix##_allocator_mem_check(struct memtx_engine *memtx)			\
+{										\
+	prefix##_mem_check((struct prefix##_alloc *)(memtx->alloc));    	\
+}
+DECLARE_MEMTX_MEM_CHECK(small)
+DECLARE_MEMTX_MEM_CHECK(system)
+
+#define DECLARE_MEMTX_ALLOCATOR_CHOICE(prefix, alloc_func, free_func,   	\
+			free_dealyed_func)					\
+static inline void								\
+prefix##_memtx_allocator_choice(struct memtx_engine *memtx)			\
+{										\
+	memtx_global_alloc = (void *)alloc_func;				\
+	memtx_global_free = (void *)free_func;					\
+	memtx_global_free_delayed = (void *)free_dealyed_func;  		\
+	memtx->alloc = &memtx->prefix##_alloc; 					\
+	memtx->memtx_allocator_create = prefix##_allocator_create;  		\
+	memtx->memtx_allocator_destroy = prefix##_allocator_destroy;		\
+	memtx->memtx_enter_delayed_free_mode =  				\
+		prefix##_allocator_enter_delayed_free_mode; 			\
+	memtx->memtx_leave_delayed_free_mode =  				\
+		prefix##_allocator_leave_delayed_free_mode; 			\
+	memtx->memtx_allocator_stats = prefix##_allocator_stats;		\
+	memtx->memtx_mem_check = prefix##_allocator_mem_check;  		\
+}
+DECLARE_MEMTX_ALLOCATOR_CHOICE(small, smalloc, smfree, smfree_delayed)
+DECLARE_MEMTX_ALLOCATOR_CHOICE(system, sysalloc, sysfree, sysfree_delayed)
+
+#if defined(__cplusplus)
+} /* extern "C" */
+#endif /* defined(__cplusplus) */
diff --git a/src/box/box.cc b/src/box/box.cc
index a8bc3471d..66f6030df 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -2250,6 +2250,7 @@ engine_init()
 				    cfg_getd("memtx_memory"),
 				    cfg_geti("memtx_min_tuple_size"),
 				    cfg_geti("strip_core"),
+				    cfg_gets("allocator"),
 				    cfg_getd("slab_alloc_factor"));
 	engine_register((struct engine *)memtx);
 	box_set_memtx_max_tuple_size();
diff --git a/src/box/lua/load_cfg.lua b/src/box/lua/load_cfg.lua
index 770442052..817b8dbd5 100644
--- a/src/box/lua/load_cfg.lua
+++ b/src/box/lua/load_cfg.lua
@@ -43,6 +43,7 @@ local default_cfg = {
     memtx_min_tuple_size = 16,
     memtx_max_tuple_size = 1024 * 1024,
     slab_alloc_factor   = 1.05,
+    allocator           = "small",
     work_dir            = nil,
     memtx_dir           = ".",
     wal_dir             = ".",
@@ -123,6 +124,7 @@ local template_cfg = {
     memtx_min_tuple_size  = 'number',
     memtx_max_tuple_size  = 'number',
     slab_alloc_factor   = 'number',
+    allocator           = 'string',
     work_dir            = 'string',
     memtx_dir            = 'string',
     wal_dir             = 'string',
diff --git a/src/box/lua/slab.c b/src/box/lua/slab.c
index 9f5e7e95c..e3f140570 100644
--- a/src/box/lua/slab.c
+++ b/src/box/lua/slab.c
@@ -43,6 +43,7 @@
 #include "memory.h"
 #include "box/engine.h"
 #include "box/memtx_engine.h"
+#include "box/allocator.h"
 
 static int
 small_stats_noop_cb(const struct mempool_stats *stats, void *cb_ctx)
@@ -108,17 +109,31 @@ lbox_slab_stats(struct lua_State *L)
 	struct memtx_engine *memtx;
 	memtx = (struct memtx_engine *)engine_by_name("memtx");
 
-	struct small_stats totals;
+	struct allocator_stats totals;
 	lua_newtable(L);
 	/*
 	 * List all slabs used for tuples and slabs used for
 	 * indexes, with their stats.
 	 */
-	small_stats(&memtx->alloc, &totals, small_stats_lua_cb, L);
+	if (memtx->alloc == &memtx->small_alloc) {
+		memtx->memtx_allocator_stats(memtx, &totals, small_stats_lua_cb, L);
+	} else {
+		memtx->memtx_allocator_stats(memtx, &totals);
+		lua_pushnumber(L, lua_objlen(L, -1) + 1);
+		lua_newtable(L);
+		luaL_setmaphint(L, -1);
+		lua_pushstring(L, "mem_used");
+		luaL_pushuint64(L, totals.used);
+		lua_settable(L, -3);
+
+		lua_pushstring(L, "mem_free");
+		luaL_pushuint64(L, totals.total - totals.used);
+		lua_settable(L, -3);
+		lua_settable(L, -3);
+	}
 	struct mempool_stats index_stats;
 	mempool_stats(&memtx->index_extent_pool, &index_stats);
 	small_stats_lua_cb(&index_stats, L);
-
 	return 1;
 }
 
@@ -128,14 +143,21 @@ lbox_slab_info(struct lua_State *L)
 	struct memtx_engine *memtx;
 	memtx = (struct memtx_engine *)engine_by_name("memtx");
 
-	struct small_stats totals;
+	struct allocator_stats totals;
+	bool is_small_alloc;
 
 	/*
 	 * List all slabs used for tuples and slabs used for
 	 * indexes, with their stats.
 	 */
 	lua_newtable(L);
-	small_stats(&memtx->alloc, &totals, small_stats_noop_cb, L);
+	if (memtx->alloc == &memtx->small_alloc) {
+		memtx->memtx_allocator_stats(memtx, &totals, small_stats_noop_cb, L);
+		is_small_alloc = true;
+	} else {
+		memtx->memtx_allocator_stats(memtx, &totals);
+		is_small_alloc = false;
+	}
 	struct mempool_stats index_stats;
 	mempool_stats(&memtx->index_extent_pool, &index_stats);
 
@@ -187,10 +209,10 @@ lbox_slab_info(struct lua_State *L)
 	 * data (tuples and indexes).
 	 */
 	lua_pushstring(L, "arena_used");
-	luaL_pushuint64(L, totals.used + index_stats.totals.used);
+	luaL_pushuint64(L, (is_small_alloc ? totals.used : 0) + index_stats.totals.used);
 	lua_settable(L, -3);
 
-	ratio = 100 * ((double) (totals.used + index_stats.totals.used)
+	ratio = 100 * ((double) ((is_small_alloc ? totals.used : 0) + index_stats.totals.used)
 		       / (double) arena_size);
 	snprintf(ratio_buf, sizeof(ratio_buf), "%0.1lf%%", ratio);
 
@@ -227,7 +249,6 @@ lbox_slab_info(struct lua_State *L)
 	lua_pushstring(L, "quota_used_ratio");
 	lua_pushstring(L, ratio_buf);
 	lua_settable(L, -3);
-
 	return 1;
 }
 
@@ -259,7 +280,7 @@ lbox_slab_check(MAYBE_UNUSED struct lua_State *L)
 {
 	struct memtx_engine *memtx;
 	memtx = (struct memtx_engine *)engine_by_name("memtx");
-	slab_cache_check(memtx->alloc.cache);
+	memtx->memtx_mem_check(memtx);
 	return 0;
 }
 
diff --git a/src/box/memtx_engine.c b/src/box/memtx_engine.c
index db2bb2333..e70cfc35a 100644
--- a/src/box/memtx_engine.c
+++ b/src/box/memtx_engine.c
@@ -50,6 +50,7 @@
 #include "schema.h"
 #include "gc.h"
 #include "raft.h"
+#include "allocator.h"
 
 /* sync snapshot every 16MB */
 #define SNAP_SYNC_INTERVAL	(1 << 24)
@@ -141,8 +142,9 @@ memtx_engine_shutdown(struct engine *engine)
 		mempool_destroy(&memtx->rtree_iterator_pool);
 	mempool_destroy(&memtx->index_extent_pool);
 	slab_cache_destroy(&memtx->index_slab_cache);
-	small_alloc_destroy(&memtx->alloc);
-	slab_cache_destroy(&memtx->slab_cache);
+	memtx->memtx_allocator_destroy(memtx);
+	if (memtx->alloc == &memtx->small_alloc)
+		slab_cache_destroy(&memtx->slab_cache);	
 	tuple_arena_destroy(&memtx->arena);
 	xdir_destroy(&memtx->snap_dir);
 	free(memtx);
@@ -971,10 +973,13 @@ static void
 memtx_engine_memory_stat(struct engine *engine, struct engine_memory_stat *stat)
 {
 	struct memtx_engine *memtx = (struct memtx_engine *)engine;
-	struct small_stats data_stats;
+	struct allocator_stats data_stats;
 	struct mempool_stats index_stats;
 	mempool_stats(&memtx->index_extent_pool, &index_stats);
-	small_stats(&memtx->alloc, &data_stats, small_stats_noop_cb, NULL);
+	if (memtx->alloc == &memtx->small_alloc)
+		memtx->memtx_allocator_stats(memtx, &data_stats, small_stats_noop_cb, NULL);
+	else
+		memtx->memtx_allocator_stats(memtx, &data_stats);
 	stat->data += data_stats.used;
 	stat->index += index_stats.totals.used;
 }
@@ -1052,7 +1057,7 @@ memtx_engine_gc_f(va_list va)
 struct memtx_engine *
 memtx_engine_new(const char *snap_dirname, bool force_recovery,
 		 uint64_t tuple_arena_max_size, uint32_t objsize_min,
-		 bool dontdump, float alloc_factor)
+		 bool dontdump, const char *allocator, float alloc_factor)
 {
 	struct memtx_engine *memtx = calloc(1, sizeof(*memtx));
 	if (memtx == NULL) {
@@ -1061,6 +1066,18 @@ memtx_engine_new(const char *snap_dirname, bool force_recovery,
 		return NULL;
 	}
 
+	assert(allocator != NULL);
+	/* Default allocator */
+	if(!strcmp(allocator, "small")) {
+		small_memtx_allocator_choice(memtx);
+	} else if (!strcmp(allocator, "system")) {
+		system_memtx_allocator_choice(memtx);
+	} else {
+		diag_set(IllegalParams, "Bad memory allocator name");
+		free(memtx);
+		return NULL;
+	}
+
 	xdir_create(&memtx->snap_dir, snap_dirname, SNAP, &INSTANCE_UUID,
 		    &xlog_opts_default);
 	memtx->snap_dir.force_recovery = force_recovery;
@@ -1108,9 +1125,12 @@ memtx_engine_new(const char *snap_dirname, bool force_recovery,
 	quota_init(&memtx->quota, tuple_arena_max_size);
 	tuple_arena_create(&memtx->arena, &memtx->quota, tuple_arena_max_size,
 			   SLAB_SIZE, dontdump, "memtx");
-	slab_cache_create(&memtx->slab_cache, &memtx->arena);
-	small_alloc_create(&memtx->alloc, &memtx->slab_cache,
-			   objsize_min, alloc_factor);
+	if (memtx->alloc == &memtx->small_alloc) {
+		slab_cache_create(&memtx->slab_cache, &memtx->arena);
+		memtx->memtx_allocator_create(memtx, objsize_min, (double)alloc_factor);
+	} else {
+		memtx->memtx_allocator_create(memtx);
+	}
 
 	/* Initialize index extent allocator. */
 	slab_cache_create(&memtx->index_slab_cache, &memtx->arena);
@@ -1175,7 +1195,7 @@ memtx_enter_delayed_free_mode(struct memtx_engine *memtx)
 {
 	memtx->snapshot_version++;
 	if (memtx->delayed_free_mode++ == 0)
-		small_alloc_setopt(&memtx->alloc, SMALL_DELAYED_FREE_MODE, true);
+		memtx->memtx_enter_delayed_free_mode(memtx);
 }
 
 void
@@ -1183,7 +1203,7 @@ memtx_leave_delayed_free_mode(struct memtx_engine *memtx)
 {
 	assert(memtx->delayed_free_mode > 0);
 	if (--memtx->delayed_free_mode == 0)
-		small_alloc_setopt(&memtx->alloc, SMALL_DELAYED_FREE_MODE, false);
+		memtx->memtx_leave_delayed_free_mode(memtx);
 }
 
 struct tuple *
@@ -1225,7 +1245,7 @@ memtx_tuple_new(struct tuple_format *format, const char *data, const char *end)
 	}
 
 	struct memtx_tuple *memtx_tuple;
-	while ((memtx_tuple = smalloc(&memtx->alloc, total)) == NULL) {
+	while ((memtx_tuple = memtx_global_alloc(memtx->alloc, total)) == NULL) {
 		bool stop;
 		memtx_engine_run_gc(memtx, &stop);
 		if (stop)
@@ -1262,12 +1282,11 @@ memtx_tuple_delete(struct tuple_format *format, struct tuple *tuple)
 	struct memtx_tuple *memtx_tuple =
 		container_of(tuple, struct memtx_tuple, base);
 	size_t total = tuple_size(tuple) + offsetof(struct memtx_tuple, base);
-	if (memtx->alloc.free_mode != SMALL_DELAYED_FREE ||
-	    memtx_tuple->version == memtx->snapshot_version ||
+	if (memtx_tuple->version == memtx->snapshot_version ||
 	    format->is_temporary)
-		smfree(&memtx->alloc, memtx_tuple, total);
+		memtx_global_free(memtx->alloc, memtx_tuple, total);
 	else
-		smfree_delayed(&memtx->alloc, memtx_tuple, total);
+		memtx_global_free_delayed(memtx->alloc, memtx_tuple, total);
 	tuple_format_unref(format);
 }
 
@@ -1279,7 +1298,7 @@ metmx_tuple_chunk_delete(struct tuple_format *format, const char *data)
 		container_of((const char (*)[0])data,
 			     struct tuple_chunk, data);
 	uint32_t sz = tuple_chunk_sz(tuple_chunk->data_sz);
-	smfree(&memtx->alloc, tuple_chunk, sz);
+	memtx_global_free(memtx->alloc, tuple_chunk, sz);
 }
 
 const char *
@@ -1289,7 +1308,7 @@ memtx_tuple_chunk_new(struct tuple_format *format, struct tuple *tuple,
 	struct memtx_engine *memtx = (struct memtx_engine *)format->engine;
 	uint32_t sz = tuple_chunk_sz(data_sz);
 	struct tuple_chunk *tuple_chunk =
-		(struct tuple_chunk *) smalloc(&memtx->alloc, sz);
+		(struct tuple_chunk *) memtx_global_alloc(memtx->alloc, sz);
 	if (tuple == NULL) {
 		diag_set(OutOfMemory, sz, "smalloc", "tuple");
 		return NULL;
diff --git a/src/box/memtx_engine.h b/src/box/memtx_engine.h
index 8b380bf3c..cb7d4c1ce 100644
--- a/src/box/memtx_engine.h
+++ b/src/box/memtx_engine.h
@@ -40,6 +40,7 @@
 #include "engine.h"
 #include "xlog.h"
 #include "salad/stailq.h"
+#include "system_allocator.h"
 
 #if defined(__cplusplus)
 extern "C" {
@@ -49,6 +50,7 @@ struct index;
 struct fiber;
 struct tuple;
 struct tuple_format;
+struct allocator_stats;
 
 /**
  * The state of memtx recovery process.
@@ -135,8 +137,12 @@ struct memtx_engine {
 	struct slab_arena arena;
 	/** Slab cache for allocating tuples. */
 	struct slab_cache slab_cache;
-	/** Tuple allocator. */
-	struct small_alloc alloc;
+	/** Small tuple allocator. */
+	struct small_alloc small_alloc;
+	/** System tuple allocator */
+	struct system_alloc system_alloc;
+	/** Tuple allocator currently used */
+	void *alloc;
 	/** Slab cache for allocating index extents. */
 	struct slab_cache index_slab_cache;
 	/** Index extent allocator. */
@@ -178,6 +184,31 @@ struct memtx_engine {
 	 * memtx_gc_task::link.
 	 */
 	struct stailq gc_queue;
+	/**
+	  * Method to create memtx allocator
+	  */
+	void (*memtx_allocator_create)(struct memtx_engine *memtx, ...);
+	/**
+	  * Method to destroy memtx allocator
+	  */
+	void (*memtx_allocator_destroy)(struct memtx_engine *memtx);
+	/**
+	  * Method to enter delayed free mode
+	  */
+	void (*memtx_enter_delayed_free_mode)(struct memtx_engine *memtx);
+	/**
+	  * Method to leave delayed free mode
+	  */
+	void (*memtx_leave_delayed_free_mode)(struct memtx_engine *memtx);
+	/**
+	  * Method to get allocation statistic
+	  */
+	void (*memtx_allocator_stats)(struct memtx_engine *memtx, 
+		struct allocator_stats *stats, ...);
+	/**
+	  * Method to memtx memory check
+	  */
+	void (*memtx_mem_check)(struct memtx_engine *memtx);
 };
 
 struct memtx_gc_task;
@@ -213,7 +244,7 @@ struct memtx_engine *
 memtx_engine_new(const char *snap_dirname, bool force_recovery,
 		 uint64_t tuple_arena_max_size,
 		 uint32_t objsize_min, bool dontdump,
-		 float alloc_factor);
+		 const char *allocator, float alloc_factor);
 
 int
 memtx_engine_recover_snapshot(struct memtx_engine *memtx,
@@ -299,13 +330,13 @@ static inline struct memtx_engine *
 memtx_engine_new_xc(const char *snap_dirname, bool force_recovery,
 		    uint64_t tuple_arena_max_size,
 		    uint32_t objsize_min, bool dontdump,
-		    float alloc_factor)
+		    const char *allocator, float alloc_factor)
 {
 	struct memtx_engine *memtx;
 	memtx = memtx_engine_new(snap_dirname, force_recovery,
 				 tuple_arena_max_size,
 				 objsize_min, dontdump,
-				 alloc_factor);
+				 allocator, alloc_factor);
 	if (memtx == NULL)
 		diag_raise();
 	return memtx;
diff --git a/src/box/system_allocator.h b/src/box/system_allocator.h
new file mode 100644
index 000000000..8e039f8e4
--- /dev/null
+++ b/src/box/system_allocator.h
@@ -0,0 +1,226 @@
+#pragma once
+/*
+ * Copyright 2010-2020, Tarantool AUTHORS, please see AUTHORS file.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the
+ *    following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <stdlib.h>
+#include <trivia/util.h>
+#include <trivia/config.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* defined(__cplusplus) */
+
+#if HAVE_MALLOC_SIZE_DARWIN
+#include <malloc/malloc.h>
+static inline size_t
+portable_malloc_usable_size(void *p)
+{
+	return malloc_size(p);
+}
+#elif HAVE_MALLOC_USABLE_SIZE_BSD
+#include <malloc_np.h>
+static inline size_t
+portable_malloc_usable_size(void *p)
+{
+	return malloc_usable_size(p);
+}
+#elif HAVE_MALLOC_USABLE_SIZE_LINUX
+#include <malloc.h>
+static inline size_t
+portable_malloc_usable_size(void *p)
+{
+	return malloc_usable_size(p);
+}
+#else
+#error "Undefined system type"
+#endif
+
+/**
+ * Free mode
+ */
+enum system_free_mode {
+	/** Free objects immediately. */
+	SYSTEM_FREE,
+	/** Collect garbage after delayed free. */
+	SYSTEM_COLLECT_GARBAGE,
+	/** Postpone deletion of objects. */
+	SYSTEM_DELAYED_FREE,
+};
+
+struct system_alloc {
+	/**
+	 * Bytes allocated by system allocator
+	 */
+	uint64_t used_bytes;
+	/**
+	 * Allocator quota
+	 */
+	struct quota *quota;
+	/**
+	 * Free mode.
+	 */
+	enum system_free_mode free_mode;
+	/**
+	  * List of pointers for delayed free.
+	  */
+	struct lifo delayed;
+	bool init;
+};
+
+struct system_stats {
+	size_t used;
+	size_t total;
+};
+
+enum system_opt {
+	SYSTEM_DELAYED_FREE_MODE
+};
+
+static inline void
+sysfree(struct system_alloc *alloc, void *ptr, MAYBE_UNUSED size_t bytes)
+{
+	assert(alloc->init == true);
+	size_t size = portable_malloc_usable_size(ptr);
+	uint32_t s = size % QUOTA_UNIT_SIZE, units = size / QUOTA_UNIT_SIZE;
+	size_t used_bytes =  pm_atomic_fetch_sub(&alloc->used_bytes, size);
+	if (small_align(used_bytes, QUOTA_UNIT_SIZE) >
+	    small_align(used_bytes - s, QUOTA_UNIT_SIZE))
+		units++;
+	if (units > 0)
+		quota_release(alloc->quota, units * QUOTA_UNIT_SIZE);
+	free(ptr);
+}
+
+static inline void
+system_collect_garbage(struct system_alloc *alloc)
+{
+	assert(alloc->init == true);
+	if (alloc->free_mode != SYSTEM_COLLECT_GARBAGE)
+		return;
+
+	const int BATCH = 100;
+	if (!lifo_is_empty(&alloc->delayed)) {
+		for (int i = 0; i < BATCH; i++) {
+			void *item = lifo_pop(&alloc->delayed);
+			if (item == NULL)
+				break;
+			sysfree(alloc, item, 0 /* unused parameter */);
+		}
+	} else {
+		/* Finish garbage collection and switch to regular mode */
+		alloc->free_mode = SYSTEM_FREE;
+	}
+}
+
+static inline void
+system_alloc_setopt(struct system_alloc *alloc, enum system_opt opt, bool val)
+{
+	assert(alloc->init == true);
+	switch (opt) {
+	case SYSTEM_DELAYED_FREE_MODE:
+		alloc->free_mode = val ? SYSTEM_DELAYED_FREE :
+			SYSTEM_COLLECT_GARBAGE;
+		break;
+	default:
+		assert(false);
+		break;
+	}
+}
+
+static inline void
+system_stats(struct system_alloc *alloc, struct system_stats *totals)
+{
+	assert(alloc->init == true);
+	totals->used = pm_atomic_load_explicit(&alloc->used_bytes,
+		pm_memory_order_relaxed);
+	totals->total = quota_total(alloc->quota);
+}
+
+static inline void
+system_alloc_create(struct system_alloc *alloc, struct quota *quota)
+{
+	alloc->used_bytes = 0;
+	alloc->quota = quota;
+	lifo_init(&alloc->delayed);
+	alloc->init = true;
+}
+
+static inline void
+system_alloc_destroy(MAYBE_UNUSED struct system_alloc *alloc)
+{
+	alloc->init = false;
+}
+
+static inline void
+sysfree_delayed(struct system_alloc *alloc, void *ptr, size_t bytes)
+{
+	assert(alloc->init == true);
+	if (alloc->free_mode == SYSTEM_DELAYED_FREE && ptr) {
+		lifo_push(&alloc->delayed, ptr);
+	} else {
+		sysfree(alloc, ptr, bytes);
+	}
+}
+
+/** malloc() @a bytes and charge the block's usable size to the quota. */
+static inline void *
+sysalloc(struct system_alloc *alloc, size_t bytes)
+{
+	assert(alloc->init == true);
+	system_collect_garbage(alloc);
+	void *ptr = malloc(bytes);
+	if (!ptr)
+		return NULL;
+	size_t size = portable_malloc_usable_size(ptr);
+	uint32_t s = size % QUOTA_UNIT_SIZE, whole = size / QUOTA_UNIT_SIZE;
+	while (1) {
+		/* Reset on each retry: the extra unit must be added once. */
+		uint32_t units = whole;
+		size_t used_bytes =  pm_atomic_load(&alloc->used_bytes);
+		if (small_align(used_bytes, QUOTA_UNIT_SIZE) <
+		    small_align(used_bytes + s, QUOTA_UNIT_SIZE))
+			units++;
+		if (units > 0 &&
+		    quota_use(alloc->quota, units * QUOTA_UNIT_SIZE) < 0) {
+			free(ptr);
+			return NULL;
+		}
+		if (pm_atomic_compare_exchange_strong(&alloc->used_bytes,
+			&used_bytes, used_bytes + size))
+			break;
+		if (units > 0)
+			quota_release(alloc->quota, units * QUOTA_UNIT_SIZE);
+	}
+	return ptr;
+}
+
+#if defined(__cplusplus)
+} /* extern "C" */
+#endif /* defined(__cplusplus) */
diff --git a/src/trivia/config.h.cmake b/src/trivia/config.h.cmake
index 89e0d39c6..107cd8049 100644
--- a/src/trivia/config.h.cmake
+++ b/src/trivia/config.h.cmake
@@ -169,6 +169,9 @@
 #cmakedefine HAVE_POSIX_FADVISE 1
 #cmakedefine HAVE_FALLOCATE 1
 #cmakedefine HAVE_MREMAP 1
+#cmakedefine HAVE_MALLOC_USABLE_SIZE_LINUX 1
+#cmakedefine HAVE_MALLOC_USABLE_SIZE_BSD 1
+#cmakedefine HAVE_MALLOC_SIZE_DARWIN 1
 #cmakedefine HAVE_SYNC_FILE_RANGE 1
 
 #cmakedefine HAVE_MSG_NOSIGNAL 1
diff --git a/test/app-tap/init_script.result b/test/app-tap/init_script.result
index 72aa67db2..3b5211a90 100644
--- a/test/app-tap/init_script.result
+++ b/test/app-tap/init_script.result
@@ -3,6 +3,7 @@
 --
 
 box.cfg
+allocator:small
 background:false
 checkpoint_count:2
 checkpoint_interval:3600
diff --git a/test/box/admin.result b/test/box/admin.result
index e05440f66..9e4813133 100644
--- a/test/box/admin.result
+++ b/test/box/admin.result
@@ -27,7 +27,9 @@ help()
 ...
 cfg_filter(box.cfg)
 ---
-- - - background
+- - - allocator
+    - small
+  - - background
     - false
   - - checkpoint_count
     - 2
diff --git a/test/box/cfg.result b/test/box/cfg.result
index 10fef006c..d23255872 100644
--- a/test/box/cfg.result
+++ b/test/box/cfg.result
@@ -15,7 +15,9 @@ box.cfg.nosuchoption = 1
  | ...
 cfg_filter(box.cfg)
  | ---
- | - - - background
+ | - - - allocator
+ |     - small
+ |   - - background
  |     - false
  |   - - checkpoint_count
  |     - 2
@@ -128,7 +130,9 @@ box.cfg()
  | ...
 cfg_filter(box.cfg)
  | ---
- | - - - background
+ | - - - allocator
+ |     - small
+ |   - - background
  |     - false
  |   - - checkpoint_count
  |     - 2
diff --git a/test/box/choose_memtx_allocator.lua b/test/box/choose_memtx_allocator.lua
new file mode 100644
index 000000000..77a0ec638
--- /dev/null
+++ b/test/box/choose_memtx_allocator.lua
@@ -0,0 +1,9 @@
+#!/usr/bin/env tarantool
+
+require('console').listen(os.getenv('ADMIN'))
+
+box.cfg({
+    listen = os.getenv("LISTEN"),
+    allocator=arg[1],
+    checkpoint_interval=10
+})
diff --git a/test/box/choose_memtx_allocator.result b/test/box/choose_memtx_allocator.result
new file mode 100644
index 000000000..dab316b93
--- /dev/null
+++ b/test/box/choose_memtx_allocator.result
@@ -0,0 +1,135 @@
+-- test-run result file version 2
+
+-- write data recover from latest snapshot
+env = require('test_run')
+ | ---
+ | ...
+test_run = env.new()
+ | ---
+ | ...
+test_run:cmd('create server test with script="box/choose_memtx_allocator.lua"')
+ | ---
+ | - true
+ | ...
+--test small allocator
+test_run:cmd('start server test with args="small"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('switch test')
+ | ---
+ | - true
+ | ...
+space = box.schema.space.create('test')
+ | ---
+ | ...
+space:format({ {name = 'id', type = 'unsigned'}, {name = 'year', type = 'unsigned'} })
+ | ---
+ | ...
+s = space:create_index('primary', { parts = {'id'} })
+ | ---
+ | ...
+for key = 1, 1000 do space:insert({key, key + 1000}) end
+ | ---
+ | ...
+for key = 1, 1000 do space:replace({key, key + 5000}) end
+ | ---
+ | ...
+for key = 1, 1000 do space:delete(key) end
+ | ---
+ | ...
+space:drop()
+ | ---
+ | ...
+test_run:cmd('switch default')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server test')
+ | ---
+ | - true
+ | ...
+--test system(malloc) allocator
+test_run:cmd('start server test with args="system"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('switch test')
+ | ---
+ | - true
+ | ...
+space = box.schema.space.create('test')
+ | ---
+ | ...
+space:format({ {name = 'id', type = 'unsigned'}, {name = 'year', type = 'unsigned'} })
+ | ---
+ | ...
+s = space:create_index('primary', { parts = {'id'} })
+ | ---
+ | ...
+for key = 1, 500000 do space:insert({key, key + 1000}) end
+ | ---
+ | ...
+for key = 1, 500000 do space:replace({key, key + 5000}) end
+ | ---
+ | ...
+for key = 1, 500000 do space:delete(key) end
+ | ---
+ | ...
+space:drop()
+ | ---
+ | ...
+test_run:cmd('switch default')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server test')
+ | ---
+ | - true
+ | ...
+--test default (small) allocator
+test_run:cmd('start server test')
+ | ---
+ | - true
+ | ...
+test_run:cmd('switch test')
+ | ---
+ | - true
+ | ...
+space = box.schema.space.create('test')
+ | ---
+ | ...
+space:format({ {name = 'id', type = 'unsigned'}, {name = 'year', type = 'unsigned'} })
+ | ---
+ | ...
+s = space:create_index('primary', { parts = {'id'} })
+ | ---
+ | ...
+for key = 1, 1000 do space:insert({key, key + 1000}) end
+ | ---
+ | ...
+for key = 1, 1000 do space:replace({key, key + 5000}) end
+ | ---
+ | ...
+for key = 1, 1000 do space:delete(key) end
+ | ---
+ | ...
+space:drop()
+ | ---
+ | ...
+test_run:cmd('switch default')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server test')
+ | ---
+ | - true
+ | ...
+test_run:cmd('cleanup server test')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server test')
+ | ---
+ | - true
+ | ...
diff --git a/test/box/choose_memtx_allocator.test.lua b/test/box/choose_memtx_allocator.test.lua
new file mode 100644
index 000000000..007b01d80
--- /dev/null
+++ b/test/box/choose_memtx_allocator.test.lua
@@ -0,0 +1,43 @@
+
+-- write data recover from latest snapshot
+env = require('test_run')
+test_run = env.new()
+test_run:cmd('create server test with script="box/choose_memtx_allocator.lua"')
+--test small allocator
+test_run:cmd('start server test with args="small"')
+test_run:cmd('switch test')
+space = box.schema.space.create('test')
+space:format({ {name = 'id', type = 'unsigned'}, {name = 'year', type = 'unsigned'} })
+s = space:create_index('primary', { parts = {'id'} })
+for key = 1, 1000 do space:insert({key, key + 1000}) end
+for key = 1, 1000 do space:replace({key, key + 5000}) end
+for key = 1, 1000 do space:delete(key) end
+space:drop()
+test_run:cmd('switch default')
+test_run:cmd('stop server test')
+--test system(malloc) allocator
+test_run:cmd('start server test with args="system"')
+test_run:cmd('switch test')
+space = box.schema.space.create('test')
+space:format({ {name = 'id', type = 'unsigned'}, {name = 'year', type = 'unsigned'} })
+s = space:create_index('primary', { parts = {'id'} })
+for key = 1, 500000 do space:insert({key, key + 1000}) end
+for key = 1, 500000 do space:replace({key, key + 5000}) end
+for key = 1, 500000 do space:delete(key) end
+space:drop()
+test_run:cmd('switch default')
+test_run:cmd('stop server test')
+--test default (small) allocator
+test_run:cmd('start server test')
+test_run:cmd('switch test')
+space = box.schema.space.create('test')
+space:format({ {name = 'id', type = 'unsigned'}, {name = 'year', type = 'unsigned'} })
+s = space:create_index('primary', { parts = {'id'} })
+for key = 1, 1000 do space:insert({key, key + 1000}) end
+for key = 1, 1000 do space:replace({key, key + 5000}) end
+for key = 1, 1000 do space:delete(key) end
+space:drop()
+test_run:cmd('switch default')
+test_run:cmd('stop server test')
+test_run:cmd('cleanup server test')
+test_run:cmd('delete server test')
-- 
2.20.1



More information about the Tarantool-patches mailing list