From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-lf1-f41.google.com (mail-lf1-f41.google.com [209.85.167.41]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id D2AB745C304 for ; Fri, 18 Dec 2020 16:58:09 +0300 (MSK) Received: by mail-lf1-f41.google.com with SMTP id o17so5674622lfg.4 for ; Fri, 18 Dec 2020 05:58:09 -0800 (PST) From: mechanik20051988 Date: Fri, 18 Dec 2020 16:58:04 +0300 Message-Id: <20201218135804.24777-1-mechanik20051988@gmail.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [Tarantool-patches] [PATCH] Add new 'allocator' option in box.cfg List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: tarantool-patches@dev.tarantool.org, Vladislav Shpilevoy Slab allocator, which is used for tuples allocation, has a certain disadvantage - it tends to unresolvable fragmentation on certain workloads (size migration). New option allows to select the appropriate allocator if necessary. @TarantoolBot document Title: Add new 'allocator' option Add new 'allocator' option which allows to select the appropriate allocator for memtx tuples if necessary. Use box.cfg{allocator="small"} or no option to use default small allocator, use box.cfg{allocator="system"} to use libc malloc. 
Closes #5419 --- CMakeLists.txt | 11 ++ perf/allocator_perf.test.lua | 34 ++++ src/box/allocator.h | 200 ++++++++++++++++++++ src/box/box.cc | 1 + src/box/lua/load_cfg.lua | 2 + src/box/lua/slab.c | 39 +++- src/box/memtx_engine.c | 53 ++++-- src/box/memtx_engine.h | 41 +++- src/box/system_allocator.h | 226 +++++++++++++++++++++++ src/trivia/config.h.cmake | 3 + test/app-tap/init_script.result | 1 + test/box/admin.result | 4 +- test/box/cfg.result | 8 +- test/box/choose_memtx_allocator.lua | 9 + test/box/choose_memtx_allocator.result | 135 ++++++++++++++ test/box/choose_memtx_allocator.test.lua | 43 +++++ 16 files changed, 776 insertions(+), 34 deletions(-) create mode 100755 perf/allocator_perf.test.lua create mode 100644 src/box/allocator.h create mode 100644 src/box/system_allocator.h create mode 100644 test/box/choose_memtx_allocator.lua create mode 100644 test/box/choose_memtx_allocator.result create mode 100644 test/box/choose_memtx_allocator.test.lua diff --git a/CMakeLists.txt b/CMakeLists.txt index fa6818f8e..290cd535a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,6 +92,17 @@ check_symbol_exists(posix_fadvise fcntl.h HAVE_POSIX_FADVISE) check_symbol_exists(fallocate fcntl.h HAVE_FALLOCATE) check_symbol_exists(mremap sys/mman.h HAVE_MREMAP) +check_function_exists(malloc_usable_size HAVE_MALLOC_USABLE_SIZE) +check_symbol_exists(malloc_size malloc/malloc.h HAVE_MALLOC_SIZE_DARWIN) + +if (HAVE_MALLOC_USABLE_SIZE) + if (TARGET_OS_LINUX) + set(HAVE_MALLOC_USABLE_SIZE_LINUX 1) + else () + set(HAVE_MALLOC_USABLE_SIZE_BSD 1) + endif () +endif () + check_function_exists(sync_file_range HAVE_SYNC_FILE_RANGE) check_function_exists(memmem HAVE_MEMMEM) check_function_exists(memrchr HAVE_MEMRCHR) diff --git a/perf/allocator_perf.test.lua b/perf/allocator_perf.test.lua new file mode 100755 index 000000000..be270379b --- /dev/null +++ b/perf/allocator_perf.test.lua @@ -0,0 +1,34 @@ +#!/usr/bin/env ../src/tarantool +os.execute('rm -rf *.snap *.xlog *.vylog ./512 
./513 ./514 ./515 ./516 ./517 ./518 ./519 ./520 ./521') +local clock = require('clock') +box.cfg{listen = 3301, wal_mode='none', allocator=arg[1]} +local space = box.schema.space.create('test') +space:format({ {name = 'id', type = 'unsigned'}, {name = 'year', type = 'unsigned'} }) +space:create_index('primary', { parts = {'id'} }) +local time_insert = 0 +local time_replace = 0 +local time_delete = 0 +local cnt = 0 +local cnt_max = 20 +local op_max = 2500000 +local nanosec = 1.0e9 +while cnt < cnt_max do + cnt = cnt + 1 + local time_before = clock.monotonic64() + for key = 1, op_max do space:insert({key, key + 1000}) end + local time_after = clock.monotonic64() + time_insert = time_insert + (time_after - time_before) + time_before = clock.monotonic64() + for key = 1, op_max do space:replace({key, key + 5000}) end + time_after = clock.monotonic64() + time_replace = time_replace + (time_after - time_before) + time_before = clock.monotonic64() + for key = 1, op_max do space:delete(key) end + time_after = clock.monotonic64() + time_delete = time_delete + (time_after - time_before) +end +io.write("{\n") +io.write(string.format(" \"alloc time\": \"%.3f\"\n", tonumber(time_insert) / (nanosec * cnt_max))) +io.write(string.format(" \"replace time\": \"%.3f\"\n", tonumber(time_replace) / (nanosec * cnt_max))) +io.write(string.format(" \"delete time\": \"%.3f\"\n}\n", tonumber(time_delete) / (nanosec * cnt_max))) +os.exit() diff --git a/src/box/allocator.h b/src/box/allocator.h new file mode 100644 index 000000000..3bea67f50 --- /dev/null +++ b/src/box/allocator.h @@ -0,0 +1,200 @@ +#pragma once +/* + * Copyright 2010-2020, Tarantool AUTHORS, please see AUTHORS file. + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include +#include +#include + +#include "memtx_engine.h" +#include "system_allocator.h" +#include "tuple.h" + +#if defined(__cplusplus) +extern "C" { +#endif /* defined(__cplusplus) */ + +#define noop_one_arg(a) +#define noop_two_arg(a, b) + +struct allocator_stats { + size_t used; + size_t total; +}; + +static inline void +_small_allocator_create(struct memtx_engine *memtx, va_list argptr) +{ + uint32_t objsize_min = va_arg(argptr, uint32_t); + double alloc_factor = va_arg(argptr, double); + return small_alloc_create(memtx->alloc, &memtx->slab_cache, + objsize_min, (float)alloc_factor); +} + +static inline void +_system_allocator_create(struct memtx_engine *memtx, MAYBE_UNUSED va_list argptr) +{ + return system_alloc_create(memtx->alloc, &memtx->quota); +} + +static inline void +_small_allocator_stats(struct small_alloc *alloc, struct small_stats *stats, + va_list argptr) +{ + mempool_stats_cb stats_cb = + va_arg(argptr, mempool_stats_cb); + void *cb_ctx = va_arg(argptr, void *); + return small_stats(alloc, 
stats, stats_cb, cb_ctx); +} + +static inline void +_system_allocator_stats(struct system_alloc *alloc, struct system_stats *stats, + MAYBE_UNUSED va_list argptr) +{ + return system_stats(alloc, stats); +} + +#define MEM_CHECK_FUNC(prefix, func, param) \ +static inline void \ +prefix##_mem_check(MAYBE_UNUSED struct prefix##_alloc *alloc) \ +{ \ + func(alloc->param); \ +} +MEM_CHECK_FUNC(small, slab_cache_check, cache) +MEM_CHECK_FUNC(system, noop_one_arg, noop) + +/** + * Global abstract method to memory alloc + */ +typedef void *(*global_alloc)(void *alloc, size_t bytes); +static global_alloc memtx_global_alloc; + +/** + * Global abstract method to memory free + */ +typedef void (*global_free)(void *alloc, void *ptr, size_t bytes); +static global_free memtx_global_free; + +/** + * Global abstract method to delayed memory free + */ +typedef void (*global_free_delayed)(void *alloc, void *ptr, size_t bytes); +static global_free_delayed memtx_global_free_delayed; + +#define DECLARE_MEMTX_ALLOCATOR_DESTROY(prefix) \ +static inline void \ +prefix##_allocator_destroy(struct memtx_engine *memtx) \ +{ \ + prefix##_alloc_destroy(memtx->alloc); \ +} +DECLARE_MEMTX_ALLOCATOR_DESTROY(small) +DECLARE_MEMTX_ALLOCATOR_DESTROY(system) + +#define DECLARE_MEMTX_ALLOCATOR_CREATE(prefix) \ +static inline void \ +prefix##_allocator_create(struct memtx_engine *memtx, ...) 
\ +{ \ + va_list argptr; \ + va_start(argptr, memtx); \ + _##prefix##_allocator_create(memtx, argptr); \ + va_end(argptr); \ +} +DECLARE_MEMTX_ALLOCATOR_CREATE(small) +DECLARE_MEMTX_ALLOCATOR_CREATE(system) + +#define DECLARE_MEMTX_ALLOCATOR_ENTER_DELAYED_FREE_MODE(prefix, PREFIX) \ +static inline void \ +prefix##_allocator_enter_delayed_free_mode(struct memtx_engine *memtx) \ +{ \ + return prefix##_##alloc_setopt(memtx->alloc, \ + PREFIX##_##DELAYED_FREE_MODE, true); \ +} +DECLARE_MEMTX_ALLOCATOR_ENTER_DELAYED_FREE_MODE(small, SMALL) +DECLARE_MEMTX_ALLOCATOR_ENTER_DELAYED_FREE_MODE(system, SYSTEM) + +#define DECLARE_MEMTX_ALLOCATOR_LEAVE_DELAYED_FREE_MODE(prefix, PREFIX) \ +static inline void \ +prefix##_allocator_leave_delayed_free_mode(struct memtx_engine *memtx) \ +{ \ + return prefix##_##alloc_setopt(memtx->alloc, \ + PREFIX##_##DELAYED_FREE_MODE, false); \ +} +DECLARE_MEMTX_ALLOCATOR_LEAVE_DELAYED_FREE_MODE(small, SMALL) +DECLARE_MEMTX_ALLOCATOR_LEAVE_DELAYED_FREE_MODE(system, SYSTEM) + +#define DECLARE_MEMTX_ALLOCATOR_STATS(prefix) \ +static inline void \ +prefix##_allocator_stats(struct memtx_engine *memtx, \ + struct allocator_stats *stats, ...) 
\ +{ \ + va_list argptr; \ + va_start(argptr, stats); \ + struct prefix##_stats data_stats; \ + _##prefix##_allocator_stats(memtx->alloc, &data_stats, argptr); \ + va_end(argptr); \ + stats->used = data_stats.used; \ + stats->total = data_stats.total; \ +} +DECLARE_MEMTX_ALLOCATOR_STATS(small) +DECLARE_MEMTX_ALLOCATOR_STATS(system) + +#define DECLARE_MEMTX_MEM_CHECK(prefix) \ +static inline void \ +prefix##_allocator_mem_check(struct memtx_engine *memtx) \ +{ \ + prefix##_mem_check((struct prefix##_alloc *)(memtx->alloc)); \ +} +DECLARE_MEMTX_MEM_CHECK(small) +DECLARE_MEMTX_MEM_CHECK(system) + +#define DECLARE_MEMTX_ALLOCATOR_CHOICE(prefix, alloc_func, free_func, \ + free_dealyed_func) \ +static inline void \ +prefix##_memtx_allocator_choice(struct memtx_engine *memtx) \ +{ \ + memtx_global_alloc = (void *)alloc_func; \ + memtx_global_free = (void *)free_func; \ + memtx_global_free_delayed = (void *)free_dealyed_func; \ + memtx->alloc = &memtx->prefix##_alloc; \ + memtx->memtx_allocator_create = prefix##_allocator_create; \ + memtx->memtx_allocator_destroy = prefix##_allocator_destroy; \ + memtx->memtx_enter_delayed_free_mode = \ + prefix##_allocator_enter_delayed_free_mode; \ + memtx->memtx_leave_delayed_free_mode = \ + prefix##_allocator_leave_delayed_free_mode; \ + memtx->memtx_allocator_stats = prefix##_allocator_stats; \ + memtx->memtx_mem_check = prefix##_allocator_mem_check; \ +} +DECLARE_MEMTX_ALLOCATOR_CHOICE(small, smalloc, smfree, smfree_delayed) +DECLARE_MEMTX_ALLOCATOR_CHOICE(system, sysalloc, sysfree, sysfree_delayed) + +#if defined(__cplusplus) +} /* extern "C" */ +#endif /* defined(__cplusplus) */ diff --git a/src/box/box.cc b/src/box/box.cc index a8bc3471d..66f6030df 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -2250,6 +2250,7 @@ engine_init() cfg_getd("memtx_memory"), cfg_geti("memtx_min_tuple_size"), cfg_geti("strip_core"), + cfg_gets("allocator"), cfg_getd("slab_alloc_factor")); engine_register((struct engine *)memtx); 
box_set_memtx_max_tuple_size(); diff --git a/src/box/lua/load_cfg.lua b/src/box/lua/load_cfg.lua index 770442052..817b8dbd5 100644 --- a/src/box/lua/load_cfg.lua +++ b/src/box/lua/load_cfg.lua @@ -43,6 +43,7 @@ local default_cfg = { memtx_min_tuple_size = 16, memtx_max_tuple_size = 1024 * 1024, slab_alloc_factor = 1.05, + allocator = "small", work_dir = nil, memtx_dir = ".", wal_dir = ".", @@ -123,6 +124,7 @@ local template_cfg = { memtx_min_tuple_size = 'number', memtx_max_tuple_size = 'number', slab_alloc_factor = 'number', + allocator = 'string', work_dir = 'string', memtx_dir = 'string', wal_dir = 'string', diff --git a/src/box/lua/slab.c b/src/box/lua/slab.c index 9f5e7e95c..e3f140570 100644 --- a/src/box/lua/slab.c +++ b/src/box/lua/slab.c @@ -43,6 +43,7 @@ #include "memory.h" #include "box/engine.h" #include "box/memtx_engine.h" +#include "box/allocator.h" static int small_stats_noop_cb(const struct mempool_stats *stats, void *cb_ctx) @@ -108,17 +109,31 @@ lbox_slab_stats(struct lua_State *L) struct memtx_engine *memtx; memtx = (struct memtx_engine *)engine_by_name("memtx"); - struct small_stats totals; + struct allocator_stats totals; lua_newtable(L); /* * List all slabs used for tuples and slabs used for * indexes, with their stats. 
*/ - small_stats(&memtx->alloc, &totals, small_stats_lua_cb, L); + if (memtx->alloc == &memtx->small_alloc) { + memtx->memtx_allocator_stats(memtx, &totals, small_stats_lua_cb, L); + } else { + memtx->memtx_allocator_stats(memtx, &totals); + lua_pushnumber(L, lua_objlen(L, -1) + 1); + lua_newtable(L); + luaL_setmaphint(L, -1); + lua_pushstring(L, "mem_used"); + luaL_pushuint64(L, totals.used); + lua_settable(L, -3); + + lua_pushstring(L, "mem_free"); + luaL_pushuint64(L, totals.total - totals.used); + lua_settable(L, -3); + lua_settable(L, -3); + } struct mempool_stats index_stats; mempool_stats(&memtx->index_extent_pool, &index_stats); small_stats_lua_cb(&index_stats, L); - return 1; } @@ -128,14 +143,21 @@ lbox_slab_info(struct lua_State *L) struct memtx_engine *memtx; memtx = (struct memtx_engine *)engine_by_name("memtx"); - struct small_stats totals; + struct allocator_stats totals; + bool is_small_alloc; /* * List all slabs used for tuples and slabs used for * indexes, with their stats. */ lua_newtable(L); - small_stats(&memtx->alloc, &totals, small_stats_noop_cb, L); + if (memtx->alloc == &memtx->small_alloc) { + memtx->memtx_allocator_stats(memtx, &totals, small_stats_noop_cb, L); + is_small_alloc = true; + } else { + memtx->memtx_allocator_stats(memtx, &totals); + is_small_alloc = false; + } struct mempool_stats index_stats; mempool_stats(&memtx->index_extent_pool, &index_stats); @@ -187,10 +209,10 @@ lbox_slab_info(struct lua_State *L) * data (tuples and indexes). */ lua_pushstring(L, "arena_used"); - luaL_pushuint64(L, totals.used + index_stats.totals.used); + luaL_pushuint64(L, (is_small_alloc ? totals.used : 0) + index_stats.totals.used); lua_settable(L, -3); - ratio = 100 * ((double) (totals.used + index_stats.totals.used) + ratio = 100 * ((double) ((is_small_alloc ? 
totals.used : 0) + index_stats.totals.used) / (double) arena_size); snprintf(ratio_buf, sizeof(ratio_buf), "%0.1lf%%", ratio); @@ -227,7 +249,6 @@ lbox_slab_info(struct lua_State *L) lua_pushstring(L, "quota_used_ratio"); lua_pushstring(L, ratio_buf); lua_settable(L, -3); - return 1; } @@ -259,7 +280,7 @@ lbox_slab_check(MAYBE_UNUSED struct lua_State *L) { struct memtx_engine *memtx; memtx = (struct memtx_engine *)engine_by_name("memtx"); - slab_cache_check(memtx->alloc.cache); + memtx->memtx_mem_check(memtx); return 0; } diff --git a/src/box/memtx_engine.c b/src/box/memtx_engine.c index db2bb2333..e70cfc35a 100644 --- a/src/box/memtx_engine.c +++ b/src/box/memtx_engine.c @@ -50,6 +50,7 @@ #include "schema.h" #include "gc.h" #include "raft.h" +#include "allocator.h" /* sync snapshot every 16MB */ #define SNAP_SYNC_INTERVAL (1 << 24) @@ -141,8 +142,9 @@ memtx_engine_shutdown(struct engine *engine) mempool_destroy(&memtx->rtree_iterator_pool); mempool_destroy(&memtx->index_extent_pool); slab_cache_destroy(&memtx->index_slab_cache); - small_alloc_destroy(&memtx->alloc); - slab_cache_destroy(&memtx->slab_cache); + memtx->memtx_allocator_destroy(memtx); + if (memtx->alloc == &memtx->small_alloc) + slab_cache_destroy(&memtx->slab_cache); tuple_arena_destroy(&memtx->arena); xdir_destroy(&memtx->snap_dir); free(memtx); @@ -971,10 +973,13 @@ static void memtx_engine_memory_stat(struct engine *engine, struct engine_memory_stat *stat) { struct memtx_engine *memtx = (struct memtx_engine *)engine; - struct small_stats data_stats; + struct allocator_stats data_stats; struct mempool_stats index_stats; mempool_stats(&memtx->index_extent_pool, &index_stats); - small_stats(&memtx->alloc, &data_stats, small_stats_noop_cb, NULL); + if (memtx->alloc == &memtx->small_alloc) + memtx->memtx_allocator_stats(memtx, &data_stats, small_stats_noop_cb, NULL); + else + memtx->memtx_allocator_stats(memtx, &data_stats); stat->data += data_stats.used; stat->index += index_stats.totals.used; } @@ 
-1052,7 +1057,7 @@ memtx_engine_gc_f(va_list va) struct memtx_engine * memtx_engine_new(const char *snap_dirname, bool force_recovery, uint64_t tuple_arena_max_size, uint32_t objsize_min, - bool dontdump, float alloc_factor) + bool dontdump, const char *allocator, float alloc_factor) { struct memtx_engine *memtx = calloc(1, sizeof(*memtx)); if (memtx == NULL) { @@ -1061,6 +1066,18 @@ memtx_engine_new(const char *snap_dirname, bool force_recovery, return NULL; } + assert(allocator != NULL); + /* Default allocator */ + if(!strcmp(allocator, "small")) { + small_memtx_allocator_choice(memtx); + } else if (!strcmp(allocator, "system")) { + system_memtx_allocator_choice(memtx); + } else { + diag_set(IllegalParams, "Bad memory allocator name"); + free(memtx); + return NULL; + } + xdir_create(&memtx->snap_dir, snap_dirname, SNAP, &INSTANCE_UUID, &xlog_opts_default); memtx->snap_dir.force_recovery = force_recovery; @@ -1108,9 +1125,12 @@ memtx_engine_new(const char *snap_dirname, bool force_recovery, quota_init(&memtx->quota, tuple_arena_max_size); tuple_arena_create(&memtx->arena, &memtx->quota, tuple_arena_max_size, SLAB_SIZE, dontdump, "memtx"); - slab_cache_create(&memtx->slab_cache, &memtx->arena); - small_alloc_create(&memtx->alloc, &memtx->slab_cache, - objsize_min, alloc_factor); + if (memtx->alloc == &memtx->small_alloc) { + slab_cache_create(&memtx->slab_cache, &memtx->arena); + memtx->memtx_allocator_create(memtx, objsize_min, (double)alloc_factor); + } else { + memtx->memtx_allocator_create(memtx); + } /* Initialize index extent allocator. 
*/ slab_cache_create(&memtx->index_slab_cache, &memtx->arena); @@ -1175,7 +1195,7 @@ memtx_enter_delayed_free_mode(struct memtx_engine *memtx) { memtx->snapshot_version++; if (memtx->delayed_free_mode++ == 0) - small_alloc_setopt(&memtx->alloc, SMALL_DELAYED_FREE_MODE, true); + memtx->memtx_enter_delayed_free_mode(memtx); } void @@ -1183,7 +1203,7 @@ memtx_leave_delayed_free_mode(struct memtx_engine *memtx) { assert(memtx->delayed_free_mode > 0); if (--memtx->delayed_free_mode == 0) - small_alloc_setopt(&memtx->alloc, SMALL_DELAYED_FREE_MODE, false); + memtx->memtx_leave_delayed_free_mode(memtx); } struct tuple * @@ -1225,7 +1245,7 @@ memtx_tuple_new(struct tuple_format *format, const char *data, const char *end) } struct memtx_tuple *memtx_tuple; - while ((memtx_tuple = smalloc(&memtx->alloc, total)) == NULL) { + while ((memtx_tuple = memtx_global_alloc(memtx->alloc, total)) == NULL) { bool stop; memtx_engine_run_gc(memtx, &stop); if (stop) @@ -1262,12 +1282,11 @@ memtx_tuple_delete(struct tuple_format *format, struct tuple *tuple) struct memtx_tuple *memtx_tuple = container_of(tuple, struct memtx_tuple, base); size_t total = tuple_size(tuple) + offsetof(struct memtx_tuple, base); - if (memtx->alloc.free_mode != SMALL_DELAYED_FREE || - memtx_tuple->version == memtx->snapshot_version || + if (memtx_tuple->version == memtx->snapshot_version || format->is_temporary) - smfree(&memtx->alloc, memtx_tuple, total); + memtx_global_free(memtx->alloc, memtx_tuple, total); else - smfree_delayed(&memtx->alloc, memtx_tuple, total); + memtx_global_free_delayed(memtx->alloc, memtx_tuple, total); tuple_format_unref(format); } @@ -1279,7 +1298,7 @@ metmx_tuple_chunk_delete(struct tuple_format *format, const char *data) container_of((const char (*)[0])data, struct tuple_chunk, data); uint32_t sz = tuple_chunk_sz(tuple_chunk->data_sz); - smfree(&memtx->alloc, tuple_chunk, sz); + memtx_global_free(memtx->alloc, tuple_chunk, sz); } const char * @@ -1289,7 +1308,7 @@ 
memtx_tuple_chunk_new(struct tuple_format *format, struct tuple *tuple, struct memtx_engine *memtx = (struct memtx_engine *)format->engine; uint32_t sz = tuple_chunk_sz(data_sz); struct tuple_chunk *tuple_chunk = - (struct tuple_chunk *) smalloc(&memtx->alloc, sz); + (struct tuple_chunk *) memtx_global_alloc(memtx->alloc, sz); if (tuple == NULL) { diag_set(OutOfMemory, sz, "smalloc", "tuple"); return NULL; diff --git a/src/box/memtx_engine.h b/src/box/memtx_engine.h index 8b380bf3c..cb7d4c1ce 100644 --- a/src/box/memtx_engine.h +++ b/src/box/memtx_engine.h @@ -40,6 +40,7 @@ #include "engine.h" #include "xlog.h" #include "salad/stailq.h" +#include "system_allocator.h" #if defined(__cplusplus) extern "C" { @@ -49,6 +50,7 @@ struct index; struct fiber; struct tuple; struct tuple_format; +struct allocator_stats; /** * The state of memtx recovery process. @@ -135,8 +137,12 @@ struct memtx_engine { struct slab_arena arena; /** Slab cache for allocating tuples. */ struct slab_cache slab_cache; - /** Tuple allocator. */ - struct small_alloc alloc; + /** Small tuple allocator. */ + struct small_alloc small_alloc; + /** System tuple allocator */ + struct system_alloc system_alloc; + /** Tuple allocator currently used */ + void *alloc; /** Slab cache for allocating index extents. */ struct slab_cache index_slab_cache; /** Index extent allocator. */ @@ -178,6 +184,31 @@ struct memtx_engine { * memtx_gc_task::link. 
*/ struct stailq gc_queue; + /** + * Method to create memtx allocator + */ + void (*memtx_allocator_create)(struct memtx_engine *memtx, ...); + /** + * Method to destroy memtx allocator + */ + void (*memtx_allocator_destroy)(struct memtx_engine *memtx); + /** + * Method to enter delayed free mode + */ + void (*memtx_enter_delayed_free_mode)(struct memtx_engine *memtx); + /** + * Method to leave delayed free mode + */ + void (*memtx_leave_delayed_free_mode)(struct memtx_engine *memtx); + /** + * Method to get allocation statistic + */ + void (*memtx_allocator_stats)(struct memtx_engine *memtx, + struct allocator_stats *stats, ...); + /** + * Method to memtx memory check + */ + void (*memtx_mem_check)(struct memtx_engine *memtx); }; struct memtx_gc_task; @@ -213,7 +244,7 @@ struct memtx_engine * memtx_engine_new(const char *snap_dirname, bool force_recovery, uint64_t tuple_arena_max_size, uint32_t objsize_min, bool dontdump, - float alloc_factor); + const char *allocator, float alloc_factor); int memtx_engine_recover_snapshot(struct memtx_engine *memtx, @@ -299,13 +330,13 @@ static inline struct memtx_engine * memtx_engine_new_xc(const char *snap_dirname, bool force_recovery, uint64_t tuple_arena_max_size, uint32_t objsize_min, bool dontdump, - float alloc_factor) + const char *allocator, float alloc_factor) { struct memtx_engine *memtx; memtx = memtx_engine_new(snap_dirname, force_recovery, tuple_arena_max_size, objsize_min, dontdump, - alloc_factor); + allocator, alloc_factor); if (memtx == NULL) diag_raise(); return memtx; diff --git a/src/box/system_allocator.h b/src/box/system_allocator.h new file mode 100644 index 000000000..8e039f8e4 --- /dev/null +++ b/src/box/system_allocator.h @@ -0,0 +1,226 @@ +#pragma once +/* + * Copyright 2010-2020, Tarantool AUTHORS, please see AUTHORS file. + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. 
Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif /* defined(__cplusplus) */ + +#if HAVE_MALLOC_SIZE_DARWIN +#include +static inline size_t +portable_malloc_usable_size(void *p) +{ + return malloc_size(p); +} +#elif HAVE_MALLOC_USABLE_SIZE_BSD +#include +static inline size_t +portable_malloc_usable_size(void *p) +{ + return malloc_usable_size(p); +} +#elif HAVE_MALLOC_USABLE_SIZE_LINUX +#include +static inline size_t +portable_malloc_usable_size(void *p) +{ + return malloc_usable_size(p); +} +#else +#error "Undefined system type" +#endif + +/** + * Free mode + */ +enum system_free_mode { + /** Free objects immediately. */ + SYSTEM_FREE, + /** Collect garbage after delayed free. */ + SYSTEM_COLLECT_GARBAGE, + /** Postpone deletion of objects. 
*/ + SYSTEM_DELAYED_FREE, +}; + +struct system_alloc { + /** + * Bytes allocated by system allocator + */ + uint64_t used_bytes; + /** + * Allocator quota + */ + struct quota *quota; + /** + * Free mode. + */ + enum system_free_mode free_mode; + /** + * List of pointers for delayed free. + */ + struct lifo delayed; + bool init; +}; + +struct system_stats { + size_t used; + size_t total; +}; + +enum system_opt { + SYSTEM_DELAYED_FREE_MODE +}; + +static inline void +sysfree(struct system_alloc *alloc, void *ptr, MAYBE_UNUSED size_t bytes) +{ + assert(alloc->init == true); + size_t size = portable_malloc_usable_size(ptr); + uint32_t s = size % QUOTA_UNIT_SIZE, units = size / QUOTA_UNIT_SIZE; + size_t used_bytes = pm_atomic_fetch_sub(&alloc->used_bytes, size); + if (small_align(used_bytes, QUOTA_UNIT_SIZE) > + small_align(used_bytes - s, QUOTA_UNIT_SIZE)) + units++; + if (units > 0) + quota_release(alloc->quota, units * QUOTA_UNIT_SIZE); + free(ptr); +} + +static inline void +system_collect_garbage(struct system_alloc *alloc) +{ + assert(alloc->init == true); + if (alloc->free_mode != SYSTEM_COLLECT_GARBAGE) + return; + + const int BATCH = 100; + if (!lifo_is_empty(&alloc->delayed)) { + for (int i = 0; i < BATCH; i++) { + void *item = lifo_pop(&alloc->delayed); + if (item == NULL) + break; + sysfree(alloc, item, 0 /* unused parameter */); + } + } else { + /* Finish garbage collection and switch to regular mode */ + alloc->free_mode = SYSTEM_FREE; + } +} + +static inline void +system_alloc_setopt(struct system_alloc *alloc, enum system_opt opt, bool val) +{ + assert(alloc->init == true); + switch (opt) { + case SYSTEM_DELAYED_FREE_MODE: + alloc->free_mode = val ? 
SYSTEM_DELAYED_FREE : + SYSTEM_COLLECT_GARBAGE; + break; + default: + assert(false); + break; + } +} + +static inline void +system_stats(struct system_alloc *alloc, struct system_stats *totals) +{ + assert(alloc->init == true); + totals->used = pm_atomic_load_explicit(&alloc->used_bytes, + pm_memory_order_relaxed); + totals->total = quota_total(alloc->quota); +} + +static inline void +system_alloc_create(struct system_alloc *alloc, struct quota *quota) +{ + alloc->used_bytes = 0; + alloc->quota = quota; + lifo_init(&alloc->delayed); + alloc->init = true; +} + +static inline void +system_alloc_destroy(MAYBE_UNUSED struct system_alloc *alloc) +{ + alloc->init = false; +} + +static inline void +sysfree_delayed(struct system_alloc *alloc, void *ptr, size_t bytes) +{ + assert(alloc->init == true); + if (alloc->free_mode == SYSTEM_DELAYED_FREE && ptr) { + lifo_push(&alloc->delayed, ptr); + } else { + sysfree(alloc, ptr, bytes); + } +} + +static inline void * +sysalloc(struct system_alloc *alloc, size_t bytes) +{ + assert(alloc->init == true); + system_collect_garbage(alloc); + + void *ptr = malloc(bytes); + if (!ptr) + return NULL; + size_t size = portable_malloc_usable_size(ptr); + uint32_t s = size % QUOTA_UNIT_SIZE, units = size / QUOTA_UNIT_SIZE; + while (1) { + size_t used_bytes = pm_atomic_load(&alloc->used_bytes); + if (small_align(used_bytes, QUOTA_UNIT_SIZE) < + small_align(used_bytes + s, QUOTA_UNIT_SIZE)) + units++; + if (units > 0) { + if (quota_use(alloc->quota, + units * QUOTA_UNIT_SIZE) < 0) { + free(ptr); + return NULL; + } + } + if (pm_atomic_compare_exchange_strong(&alloc->used_bytes, + &used_bytes, used_bytes + size)) + break; + if (units > 0) + quota_release(alloc->quota, units * QUOTA_UNIT_SIZE); + } + return ptr; +} + +#if defined(__cplusplus) +} /* extern "C" */ +#endif /* defined(__cplusplus) */ diff --git a/src/trivia/config.h.cmake b/src/trivia/config.h.cmake index 89e0d39c6..107cd8049 100644 --- a/src/trivia/config.h.cmake +++ 
b/src/trivia/config.h.cmake @@ -169,6 +169,9 @@ #cmakedefine HAVE_POSIX_FADVISE 1 #cmakedefine HAVE_FALLOCATE 1 #cmakedefine HAVE_MREMAP 1 +#cmakedefine HAVE_MALLOC_USABLE_SIZE_LINUX 1 +#cmakedefine HAVE_MALLOC_USABLE_SIZE_BSD 1 +#cmakedefine HAVE_MALLOC_SIZE_DARWIN 1 #cmakedefine HAVE_SYNC_FILE_RANGE 1 #cmakedefine HAVE_MSG_NOSIGNAL 1 diff --git a/test/app-tap/init_script.result b/test/app-tap/init_script.result index 72aa67db2..3b5211a90 100644 --- a/test/app-tap/init_script.result +++ b/test/app-tap/init_script.result @@ -3,6 +3,7 @@ -- box.cfg +allocator:small background:false checkpoint_count:2 checkpoint_interval:3600 diff --git a/test/box/admin.result b/test/box/admin.result index e05440f66..9e4813133 100644 --- a/test/box/admin.result +++ b/test/box/admin.result @@ -27,7 +27,9 @@ help() ... cfg_filter(box.cfg) --- -- - - background +- - - allocator + - small + - - background - false - - checkpoint_count - 2 diff --git a/test/box/cfg.result b/test/box/cfg.result index 10fef006c..d23255872 100644 --- a/test/box/cfg.result +++ b/test/box/cfg.result @@ -15,7 +15,9 @@ box.cfg.nosuchoption = 1 | ... cfg_filter(box.cfg) | --- - | - - - background + | - - - allocator + | - small + | - - background | - false | - - checkpoint_count | - 2 @@ -128,7 +130,9 @@ box.cfg() | ... 
cfg_filter(box.cfg) | --- - | - - - background + | - - - allocator + | - small + | - - background | - false | - - checkpoint_count | - 2 diff --git a/test/box/choose_memtx_allocator.lua b/test/box/choose_memtx_allocator.lua new file mode 100644 index 000000000..77a0ec638 --- /dev/null +++ b/test/box/choose_memtx_allocator.lua @@ -0,0 +1,9 @@ +#!/usr/bin/env tarantool + +require('console').listen(os.getenv('ADMIN')) + +box.cfg({ + listen = os.getenv("LISTEN"), + allocator=arg[1], + checkpoint_interval=10 +}) diff --git a/test/box/choose_memtx_allocator.result b/test/box/choose_memtx_allocator.result new file mode 100644 index 000000000..dab316b93 --- /dev/null +++ b/test/box/choose_memtx_allocator.result @@ -0,0 +1,135 @@ +-- test-run result file version 2 + +-- write data recover from latest snapshot +env = require('test_run') + | --- + | ... +test_run = env.new() + | --- + | ... +test_run:cmd('create server test with script="box/choose_memtx_allocator.lua"') + | --- + | - true + | ... +--test small allocator +test_run:cmd('start server test with args="small"') + | --- + | - true + | ... +test_run:cmd('switch test') + | --- + | - true + | ... +space = box.schema.space.create('test') + | --- + | ... +space:format({ {name = 'id', type = 'unsigned'}, {name = 'year', type = 'unsigned'} }) + | --- + | ... +s = space:create_index('primary', { parts = {'id'} }) + | --- + | ... +for key = 1, 1000 do space:insert({key, key + 1000}) end + | --- + | ... +for key = 1, 1000 do space:replace({key, key + 5000}) end + | --- + | ... +for key = 1, 1000 do space:delete(key) end + | --- + | ... +space:drop() + | --- + | ... +test_run:cmd('switch default') + | --- + | - true + | ... +test_run:cmd('stop server test') + | --- + | - true + | ... +--test system(malloc) allocator +test_run:cmd('start server test with args="system"') + | --- + | - true + | ... +test_run:cmd('switch test') + | --- + | - true + | ... +space = box.schema.space.create('test') + | --- + | ... 
+space:format({ {name = 'id', type = 'unsigned'}, {name = 'year', type = 'unsigned'} }) + | --- + | ... +s = space:create_index('primary', { parts = {'id'} }) + | --- + | ... +for key = 1, 500000 do space:insert({key, key + 1000}) end + | --- + | ... +for key = 1, 500000 do space:replace({key, key + 5000}) end + | --- + | ... +for key = 1, 500000 do space:delete(key) end + | --- + | ... +space:drop() + | --- + | ... +test_run:cmd('switch default') + | --- + | - true + | ... +test_run:cmd('stop server test') + | --- + | - true + | ... +--test default (small) allocator +test_run:cmd('start server test') + | --- + | - true + | ... +test_run:cmd('switch test') + | --- + | - true + | ... +space = box.schema.space.create('test') + | --- + | ... +space:format({ {name = 'id', type = 'unsigned'}, {name = 'year', type = 'unsigned'} }) + | --- + | ... +s = space:create_index('primary', { parts = {'id'} }) + | --- + | ... +for key = 1, 1000 do space:insert({key, key + 1000}) end + | --- + | ... +for key = 1, 1000 do space:replace({key, key + 5000}) end + | --- + | ... +for key = 1, 1000 do space:delete(key) end + | --- + | ... +space:drop() + | --- + | ... +test_run:cmd('switch default') + | --- + | - true + | ... +test_run:cmd('stop server test') + | --- + | - true + | ... +test_run:cmd('cleanup server test') + | --- + | - true + | ... +test_run:cmd('delete server test') + | --- + | - true + | ... 
diff --git a/test/box/choose_memtx_allocator.test.lua b/test/box/choose_memtx_allocator.test.lua new file mode 100644 index 000000000..007b01d80 --- /dev/null +++ b/test/box/choose_memtx_allocator.test.lua @@ -0,0 +1,43 @@ + +-- write data recover from latest snapshot +env = require('test_run') +test_run = env.new() +test_run:cmd('create server test with script="box/choose_memtx_allocator.lua"') +--test small allocator +test_run:cmd('start server test with args="small"') +test_run:cmd('switch test') +space = box.schema.space.create('test') +space:format({ {name = 'id', type = 'unsigned'}, {name = 'year', type = 'unsigned'} }) +s = space:create_index('primary', { parts = {'id'} }) +for key = 1, 1000 do space:insert({key, key + 1000}) end +for key = 1, 1000 do space:replace({key, key + 5000}) end +for key = 1, 1000 do space:delete(key) end +space:drop() +test_run:cmd('switch default') +test_run:cmd('stop server test') +--test system(malloc) allocator +test_run:cmd('start server test with args="system"') +test_run:cmd('switch test') +space = box.schema.space.create('test') +space:format({ {name = 'id', type = 'unsigned'}, {name = 'year', type = 'unsigned'} }) +s = space:create_index('primary', { parts = {'id'} }) +for key = 1, 500000 do space:insert({key, key + 1000}) end +for key = 1, 500000 do space:replace({key, key + 5000}) end +for key = 1, 500000 do space:delete(key) end +space:drop() +test_run:cmd('switch default') +test_run:cmd('stop server test') +--test default (small) allocator +test_run:cmd('start server test') +test_run:cmd('switch test') +space = box.schema.space.create('test') +space:format({ {name = 'id', type = 'unsigned'}, {name = 'year', type = 'unsigned'} }) +s = space:create_index('primary', { parts = {'id'} }) +for key = 1, 1000 do space:insert({key, key + 1000}) end +for key = 1, 1000 do space:replace({key, key + 5000}) end +for key = 1, 1000 do space:delete(key) end +space:drop() +test_run:cmd('switch default') +test_run:cmd('stop server test') 
+test_run:cmd('cleanup server test') +test_run:cmd('delete server test') -- 2.20.1