From: "m.kokryashkin" <m.kokryashkin@m-kokryashkin.local> It is impossible to run gdb on M1 devices, the only available debugger is lldb. The luajit-gdb extension doesn't work with lldb, so this patch introduces the luajit-lldb extension, which re-implements exactly the same functionality. Part of tarantool/tarantool#4808 --- Issue: https://github.com/tarantool/tarantool/issues/4808 Branch: https://github.com/tarantool/luajit/tree/gh-fckxorg/luajit-lldb src/luajit_lldb.py | 1034 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1034 insertions(+) create mode 100644 src/luajit_lldb.py diff --git a/src/luajit_lldb.py b/src/luajit_lldb.py new file mode 100644 index 00000000..69b50232 --- /dev/null +++ b/src/luajit_lldb.py @@ -0,0 +1,1034 @@ +import abc +import argparse +import re +import shlex + +import lldb + +LJ_64 = None +LJ_GC64 = None +LJ_FR2 = None +LJ_DUALNUM = None +PADDING = None + +# Constants +IRT_P64 = 9 +LJ_GCVMASK = ((1 << 47) - 1) +LJ_TISNUM = None + +# Global +target = None + +class Ptr: + def __init__(self, value, normal_type): + self.value = value + self.normal_type = normal_type + + @property + def deref(self): + return self.normal_type(self.value.Dereference()) + + def __add__(self, other): + assert isinstance(other, int) + return self.__class__(cast(self.normal_type.__name__ + ' *', cast('uintptr_t', self.value.unsigned + other * self.value.deref.size))) + + def __sub__(self, other): + assert isinstance(other, int) or isinstance(other, Ptr) + if isinstance(other, int): + return self.__add__(-other) + else: + return self.value.unsigned - other.value.unsigned + + + def __eq__(self, other): + assert isinstance(other, Ptr) or (isinstance(other, int) and other >= 0) + if isinstance(other, Ptr): + return self.value.unsigned == other.value.unsigned + else: + return self.value.unsigned == other + + def __ne__(self, other): + return not self == other + + def __gt__(self, other): + assert isinstance(other, Ptr) + return self.value.unsigned > 
other.value.unsigned + + def __bool__(self): + return self.value.unsigned != 0 + + def __int__(self): + return self.value.unsigned + + def __str__(self): + return self.value.value + +class MetaStruct(type): + def __init__(cls, name, bases, nmspc): + super(MetaStruct, cls).__init__(name, bases, nmspc) + + def make_general(field, tp): + extras = { + 'raw' : None, + 'uints': 'unsigned', + 'ints': 'signed', + 'strings': 'value', + } + if tp in extras.keys(): + tp = extras[tp] + if tp is None: + return lambda self: self[field] + else: + return lambda self: getattr(self[field], tp) + else: + return lambda self: globals()[tp](self[field]) + + if hasattr(cls, 'metainfo'): + for tp in cls.metainfo.keys(): + if tp == 'custom': + for field, value in cls.metainfo['custom'].items(): + setattr(cls, field, value) + else: + for field in cls.metainfo[tp]: + setattr(cls, field, property(make_general(field, tp))) + +class Struct(metaclass=MetaStruct): + def __init__(self, value): + self.value = value + + def __getitem__(self, name): + return self.value.GetChildMemberWithName(name) + + @property + def addr(self): + return self.value.address_of + +c_structs = { + 'MRef': { + 'custom': { + 'ptr': property(lambda self: self['ptr64'] if LJ_GC64 else self['ptr32']) + } + }, + 'GCRef': { + 'custom': { + 'gcptr': property(lambda self: self['gcptr64'] if LJ_GC64 else self['gcptr32']) + } + }, + 'TValue': { + 'GCRef': ['gcr'], + 'uints': ['it', 'i'], + 'ints': ['it64'], + 'strings': ['n'], + 'custom': { + 'ftsz': property(lambda self: self['ftsz'].signed if LJ_GC64 else None), + 'fr': property(lambda self: FR(self['fr']) if not LJ_GC64 else None) + } + }, + 'GCState': { + 'GCRef': ['root', 'gray', 'grayagain', 'weak', 'mmudata'], + 'uints': ['state', 'total', 'threshold', 'debt', 'estimate', + 'stepmul', 'pause', 'sweepstr'] + }, + 'lua_State': { + 'MRef': ['glref', 'stack', 'maxstack'], + 'TValuePtr': ['top', 'base'] + }, + 'global_State': { + 'GCState': ['gc'], + 'uints': ['vmstate', 
'strmask'] + }, + 'jit_State': { + 'uints': ['state'] + }, + 'GChead': { + 'GCRef': ['nextgc'] + }, + 'GCobj': { + 'GChead': ['gch'] + }, + 'GCstr': { + 'uints': ['hash', 'len'] + }, + 'FrameLink': { + 'MRef': ['pcr'], + 'ints': ['ftsz'] + }, + 'FR': { + 'FrameLink': ['tp'] + }, + 'GCfuncC': { + 'MRef': ['pc'], + 'uints': ['ffid', 'nupvalues'], + 'raw': ['f'] + }, + 'GCtab': { + 'MRef': ['array', 'node'], + 'GCRef': ['metatable'], + 'uints': ['asize', 'hmask'] + }, + 'GCproto': { + 'GCRef': ['chunkname'], + 'raw': ['firstline'] + }, + 'GCtrace': { + 'uints': ['traceno'] + }, + 'Node': { + 'TValue': ['key', 'val'], + 'MRef': ['next'] + }, + 'BCIns': {} +} + +for cls in c_structs.keys(): + globals()[cls] = type(cls, (Struct, ), {'metainfo': c_structs[cls]} ) + +for cls in Struct.__subclasses__(): + ptr_name = cls.__name__ + 'Ptr' + def make_ptr_init(nm, cls): + return type( + nm, + (Ptr,), + { + '__init__': lambda self, value: super(type(self), self).__init__(value, cls) + } + ) + + globals()[ptr_name] = make_ptr_init(ptr_name, cls) + + +class Command(object): + def __init__(self, debugger, unused): + pass + + def get_short_help(self): + return self.__doc__.splitlines()[0] + + def get_long_help(self): + return self.__doc__ + + def __call__(self, debugger, command, exe_ctx, result): + try: + args = self.argument_parser.parse_args(shlex.split(command)) + self.execute(debugger, args, result) + except Exception as e: + msg = u'Failed to execute command `{}`: {}'.format(self.command, e) + result.SetError(msg) + + @property + def argument_parser(self): + return argparse.ArgumentParser( + prog=self.command, + description=self.get_long_help(), + formatter_class=argparse.RawDescriptionHelpFormatter + ) + + @abc.abstractproperty + def command(self): + """Command name. + This name will be used by LLDB in order to unique/ly identify an + implementation that should be executed when a command is run + in the REPL. 
+ """ + + @abc.abstractmethod + def execute(self, debugger, args, result): + """Implementation of the command. + Subclasses override this method to implement the logic of a given + command, e.g. printing a stacktrace. The command output should be + communicated back via the provided result object, so that it's + properly routed to LLDB frontend. Any unhandled exception will be + automatically transformed into proper errors. + Args: + debugger: lldb.SBDebugger: the primary interface to LLDB scripting + args: argparse.Namespace: an object holding parsed command arguments + result: lldb.SBCommandReturnObject: a container which holds the + result from command execution + """ + +def gcval(obj): + return cast(GCobjPtr, cast('uintptr_t', obj.gcptr.unsigned & LJ_GCVMASK) if LJ_GC64 + else cast('uintptr_t', obj.gcptr)) + +def gcref(obj): + return cast(GCobjPtr, obj.gcptr if LJ_GC64 + else cast('uintptr_t', obj.gcptr)) + +def gcnext(obj): + return gcref(obj).deref.gch.nextgc + +def gclistlen(root, end=0x0): + count = 0 + while(gcref(root) != end): + count += 1 + root = gcnext(root) + return count + +def gcringlen(root): + if not gcref(root): + return 0 + elif gcref(root) == gcref(gcnext(root)): + return 1 + else: + return 1 + gclistlen(gcnext(root), gcref(root).deref) + +gclen = { + 'root': gclistlen, + 'gray': gclistlen, + 'grayagain': gclistlen, + 'weak': gclistlen, + # XXX: gc.mmudata is a ring-list. + 'mmudata': gcringlen, +} + +def dump_gc(g): + gc = g.gc + stats = [ '{key}: {value}'.format(key = f, value = getattr(gc, f)) for f in ( + 'total', 'threshold', 'debt', 'estimate', 'stepmul', 'pause' + ) ] + + stats += [ 'sweepstr: {sweepstr}/{strmask}'.format( + sweepstr = gc.sweepstr, + # String hash mask (size of hash table - 1). 
+ strmask = g.strmask + 1, + ) ] + + stats += [ '{key}: {number} objects'.format( + key = stat, + number = handler(getattr(gc, stat)) + ) for stat, handler in gclen.items() ] + return '\n'.join(map(lambda s: '\t' + s, stats)) + +def cast(typename, value): + pointer_type = False + name = None + if isinstance(value, Struct) or isinstance(value, Ptr): + value = value.value + if isinstance(typename, type): + name = typename.__name__ + if name.endswith('Ptr'): + pointer_type = True + name = name[:-3] + else: + name = typename + if name[-1] == '*': + name = name[:-1].strip() + pointer_type = True + t = target.FindFirstType(name) + if pointer_type: + t = t.GetPointerType() + + if isinstance(value, int): + if pointer_type: + return target.CreateValueFromAddress('value', lldb.SBAddress(value, target), t.GetPointeeType()).address_of + else: + return target.CreateValueFromData(name = 'value', data = lldb.SBData.CreateDataFromInt(value), type=t) + + if isinstance(typename, type): + return typename(value.Cast(t)) + else: + return value.Cast(t) + +def lookup_global(name): + global target + return target.FindFirstGlobalVariable(name) + +def mref(typename, obj): + return cast(typename, obj.ptr) + +def type_member(type_obj, name): + return next((x for x in type_obj.members if x.name == name), None) + +def J(g): + global target + typeGG = target.FindFirstType('GG_State') + g_member = type_member(typeGG, 'g') + j_member = type_member(typeGG, 'J') + jtype = target.FindFirstType('jit_State').GetPointerType() + + return jit_State(lldb.SBValue().CreateValueFromData(name = 'jit_state_ptr', data = lldb.SBData.CreateDataFromInt(cast('char *', g).unsigned + - g_member.GetOffsetInBytes() + + j_member.GetOffsetInBytes()), type=jtype)) + +def G(L): + return mref(global_StatePtr, L.glref).deref + +def L(L=None): + # lookup a symbol for the main coroutine considering the host app + # XXX Fragile: though the loop initialization looks like a crap but it + # respects both Python 2 and Python 3. 
+ for l in [ L ] + list(map(lambda l: lookup_global(l), ( + # LuaJIT main coro (see luajit/src/luajit.c) + 'globalL', + # Tarantool main coro (see tarantool/src/lua/init.h) + 'tarantool_L', + # TODO: Add more + ))): + if l: + return lua_State(l) + +def tou32(val): + return val & 0xFFFFFFFF + +def i2notu32(val): + return ~int(val) & 0xFFFFFFFF + +def vm_state(g): + return { + i2notu32(0): 'INTERP', + i2notu32(1): 'LFUNC', + i2notu32(2): 'FFUNC', + i2notu32(3): 'CFUNC', + i2notu32(4): 'GC', + i2notu32(5): 'EXIT', + i2notu32(6): 'RECORD', + i2notu32(7): 'OPT', + i2notu32(8): 'ASM', + }.get(int(tou32(g.vmstate)), 'TRACE') + +def gc_state(g): + return { + 0: 'PAUSE', + 1: 'PROPAGATE', + 2: 'ATOMIC', + 3: 'SWEEPSTRING', + 4: 'SWEEP', + 5: 'FINALIZE', + 6: 'LAST', + }.get(g.gc.state, 'INVALID') + +def jit_state(g): + return { + 0: 'IDLE', + 0x10: 'ACTIVE', + 0x11: 'RECORD', + 0x12: 'START', + 0x13: 'END', + 0x14: 'ASM', + 0x15: 'ERR', + }.get(J(g).state, 'INVALID') + +def strx64(val): + return re.sub('L?$', '', + hex(int(val) & 0xFFFFFFFFFFFFFFFF)) + +def funcproto(func): + assert(func.ffid == 0) + + type_proto = target.FindFirstType('GCproto') + type_proto_size = type_proto.GetByteSize() + value = cast('uintptr_t', mref('char *', func.pc).unsigned - type_proto_size) + return cast(GCprotoPtr, value) + +def strdata(obj): + process = target.GetProcess() + thread = process.GetSelectedThread() + frame = thread.GetSelectedFrame() + try: + ptr = cast('char *', obj + 1) + return ptr.summary + except UnicodeEncodeError: + return "<luajit-lldb: error occured while rendering non-ascii slot>" + +def itype(o): + return tou32(o.it64 >> 47) if LJ_GC64 else o.it + +def tvisint(o): + return LJ_DUALNUM and itype(o) == LJ_TISNUM + +def tvislightud(o): + if LJ_64 and not LJ_GC64: + return (cast('int32_t', itype(o)) >> 15) == -2 + else: + return itype(o) == LJ_T['LIGHTUD'] + +def tvisnumber(o): + return itype(o) <= LJ_TISNUM + +def dump_lj_tnil(tv): + return 'nil' + +def dump_lj_tfalse(tv): 
+ return 'false' + +def dump_lj_ttrue(tv): + return 'true' + +def dump_lj_tlightud(tv): + return 'light userdata @ {}'.format(strx64(gcval(tv.deref.gcr))) + +def dump_lj_tstr(tv): + return 'string {body} @ {address}'.format( + body = strdata(cast(GCstrPtr, gcval(tv.deref.gcr))), + address = strx64(gcval(tv.deref.gcr)) + ) + +def dump_lj_tupval(tv): + return 'upvalue @ {}'.format(strx64(gcval(tv.deref.gcr))) + +def dump_lj_tthread(tv): + return 'thread @ {}'.format(strx64(gcval(tv.deref.gcr))) + +def dump_lj_tproto(tv): + return 'proto @ {}'.format(strx64(gcval(tv.deref.gcr))) + +def dump_lj_tfunc(tv): + func = cast(GCfuncCPtr, gcval(tv.deref.gcr)) + ffid = func.deref.ffid + + if ffid == 0: + pt = funcproto(func.deref).deref + return 'Lua function @ {addr}, {nupvals} upvalues, {chunk}:{line}'.format( + addr = strx64(func), + nupvals = func.deref.nupvalues, + chunk = strdata(cast(GCstrPtr, gcval(pt.chunkname))), + line = pt.firstline + ) + elif ffid == 1: + return 'C function @ {}'.format(strx64(func.deref.f.unsigned)) + else: + return 'fast function #{}'.format(ffid) + +def dump_lj_ttrace(tv): + trace = cast(GCtracePtr, gcval(tv.deref.gcr)) + return 'trace {traceno} @ {addr}'.format( + traceno = strx64(trace.deref.traceno), + addr = strx64(trace) + ) + +def dump_lj_tcdata(tv): + return 'cdata @ {}'.format(strx64(gcval(tv.deref.gcr))) + +def dump_lj_ttab(tv): + table = cast(GCtabPtr, gcval(tv.deref.gcr)) + return 'table @ {gcr} (asize: {asize}, hmask: {hmask})'.format( + gcr = strx64(table), + asize = table.deref.asize, + hmask = strx64(table.deref.hmask), + ) + +def dump_lj_tudata(tv): + return 'userdata @ {}'.format(strx64(gcval(tv.deref.gcr))) + +def dump_lj_tnumx(tv): + if tvisint(tv.deref): + return 'integer {}'.format(cast('int32_t', tv.deref.i)) + else: + return 'number {}'.format(tv.deref.n) + +def dump_lj_invalid(tv): + return 'not valid type @ {}'.format(strx64(gcval(tv.deref.gcr))) + +dumpers = { + 'LJ_TNIL': dump_lj_tnil, + 'LJ_TFALSE': dump_lj_tfalse, + 
'LJ_TTRUE': dump_lj_ttrue, + 'LJ_TLIGHTUD': dump_lj_tlightud, + 'LJ_TSTR': dump_lj_tstr, + 'LJ_TUPVAL': dump_lj_tupval, + 'LJ_TTHREAD': dump_lj_tthread, + 'LJ_TPROTO': dump_lj_tproto, + 'LJ_TFUNC': dump_lj_tfunc, + 'LJ_TTRACE': dump_lj_ttrace, + 'LJ_TCDATA': dump_lj_tcdata, + 'LJ_TTAB': dump_lj_ttab, + 'LJ_TUDATA': dump_lj_tudata, + 'LJ_TNUMX': dump_lj_tnumx, +} + +LJ_T = { + 'NIL' : i2notu32(0), + 'FALSE' : i2notu32(1), + 'TRUE' : i2notu32(2), + 'LIGHTUD' : i2notu32(3), + 'STR' : i2notu32(4), + 'UPVAL' : i2notu32(5), + 'THREAD' : i2notu32(6), + 'PROTO' : i2notu32(7), + 'FUNC' : i2notu32(8), + 'TRACE' : i2notu32(9), + 'CDATA' : i2notu32(10), + 'TAB' : i2notu32(11), + 'UDATA' : i2notu32(12), + 'NUMX' : i2notu32(13), +} + +def itypemap(o): + if LJ_64 and not LJ_GC64: + return LJ_T['NUMX'] if tvisnumber(o) \ + else LJ_T['LIGHTUD'] if tvislightud(o) else itype(o) + else: + return LJ_T['NUMX'] if tvisnumber(o) else itype(o) + +def typenames(value): + return { + LJ_T[k]: 'LJ_T' + k for k in LJ_T.keys() + }.get(int(value), 'LJ_TINVALID') + +def dump_tvalue(tvptr): + return dumpers.get(typenames(itypemap(tvptr.deref)), dump_lj_invalid)(tvptr) + +FRAME_TYPE = 0x3 +FRAME_P = 0x4 +FRAME_TYPEP = FRAME_TYPE | FRAME_P + +FRAME = { + 'LUA': 0x0, + 'C': 0x1, + 'CONT': 0x2, + 'VARG': 0x3, + 'LUAP': 0x4, + 'CP': 0x5, + 'PCALL': 0x6, + 'PCALLH': 0x7, +} + +def frametypes(ft): + return { + FRAME['LUA'] : 'L', + FRAME['C'] : 'C', + FRAME['CONT'] : 'M', + FRAME['VARG'] : 'V', + }.get(ft, '?') + +def bc_a(ins): + return (ins >> 8) & 0xff + +def frame_ftsz(framelink): + return cast('ptrdiff_t', framelink.ftsz if LJ_FR2 \ + else framelink.fr.tp.ftsz) + +def frame_pc(framelink): + return cast(BCInsPtr, frame_ftsz(framelink.deref)) if LJ_FR2 \ + else mref(BCInsPtr, framelink.fr.tp.pcr) + +def frame_prevl(framelink): + process = target.GetProcess() + thread = process.GetSelectedThread() + frame = thread.GetSelectedFrame() + return framelink - (1 + LJ_FR2 + 
bc_a(frame.EvaluateExpression('((BCIns *)' + str(frame_pc(framelink)) + ')[-1]').unsigned)) + +def frame_ispcall(framelink): + return (frame_ftsz(framelink).unsigned & FRAME['PCALL']) == FRAME['PCALL'] + +def frame_sized(framelink): + return (frame_ftsz(framelink).unsigned & ~FRAME_TYPEP) + +def frame_prevd(framelink): + return framelink - frame_sized(framelink.deref) + +def frame_type(framelink): + return frame_ftsz(framelink).unsigned & FRAME_TYPE + +def frame_typep(framelink): + return frame_ftsz(framelink).unsigned & FRAME_TYPEP + +def frame_islua(framelink): + return frametypes(frame_type(framelink)) == 'L' \ + and frame_ftsz(framelink).unsigned > 0 + +def frame_prev(framelink): + return frame_prevl(framelink) if frame_islua(framelink.deref) \ + else frame_prevd(framelink) + +def dump_framelink(L, fr): + fr2 = fr + LJ_FR2 + + return '{fr}{padding} [ ] FRAME: [{pp}] delta={d}, {f}\n'.format( + fr = strx64(fr), + padding = ':{fr2: <{width}}'.format(fr2 = strx64(fr2), width=len(PADDING) - 1) if LJ_FR2 else PADDING, + pp = 'PP' if frame_ispcall(fr2.deref) else '{frname}{p}'.format( + frname = frametypes(frame_type(fr2.deref)), + p = 'P' if frame_typep(fr2.deref) & FRAME_P else '' + ), + d = fr2 - frame_prev(fr2), + f = dump_lj_tfunc(fr), + ) + +def dump_stack_slot(L, slot, base=None, top=None, eol='\n'): + base = base or L.base + top = top or L.top + + return '{addr}{padding} [ {B}{T}{M}] VALUE: {value}{eol}'.format( + addr = strx64(slot), + padding = PADDING, + B = 'B' if slot == base else ' ', + T = 'T' if slot == top else ' ', + M = 'M' if slot == mref(TValuePtr, L.maxstack) else ' ', + value = dump_tvalue(slot), + eol = eol, + ) + +def dump_stack(L, base=None, top=None): + base = base or L.base + top = top or L.top + stack = mref(TValuePtr, L.stack) + maxstack = mref(TValuePtr, L.maxstack) + red = 5 + 2 * LJ_FR2 + + dump = '\n'.join([ + '{padding} Red zone: {nredslots: >2} slots {padding}'.format( + padding = '-' * len(PADDING), + nredslots = red, + ), + *( + 
dump_stack_slot(L, maxstack + offset, base, top, '') + for offset in range(red, 0, -1) + ), + '{padding} Stack: {nstackslots: >5} slots {padding}'.format( + padding = '-' * len(PADDING), + nstackslots = int((maxstack - stack) >> 3), + ), + dump_stack_slot(L, maxstack, base, top, ''), + '{start}:{end: <{width}} [ ] {nfreeslots} slots: Free stack slots'.format( + start = strx64(top + 1), + end = strx64(maxstack - 1), + width = len(PADDING) - 1, + nfreeslots = int((maxstack - top - 8) >> 3), + ), + ]) + '\n' + + slot = top + framelink = base - (1 + LJ_FR2) + + # XXX: Lua stack unwinding algorithm consists of the following steps: + # 1. dump all data slots in the (framelink, top) interval + # 2. check whether there are remaining frames + # 3. if there are no slots further, stop the unwinding loop + # 4. otherwise, resolve the next framelink and top and go to (1) + # + # Postcondition (i.e. do-while) loops is the most fitting idiom for such + # case, but Python doesn't provide such lexical construction. Hence step (1) + # is unrolled for the topmost stack frame. + while slot > framelink + LJ_FR2: + dump += dump_stack_slot(L, slot, base, top) + slot -= 1 + + while framelink > stack: + assert slot == framelink + LJ_FR2, "Invalid slot during frame unwind" + dump += dump_framelink(L, framelink) + framelink = frame_prev(framelink + LJ_FR2) - LJ_FR2 + slot -= 1 + LJ_FR2 + while slot > framelink + LJ_FR2: + dump += dump_stack_slot(L, slot, base, top) + slot -= 1 + + assert slot == framelink + LJ_FR2, "Invalid slot after frame unwind" + # Skip a nil slot for the last frame for 2-slot frames. 
+ slot -= LJ_FR2 + + dump += '{fr}{padding} [S ] FRAME: dummy L'.format( + fr = strx64(slot), + padding = ':{nilslot: <{offset}}'.format(nilslot = strx64(slot + 1), offset=len(PADDING) - 1) if LJ_FR2 else PADDING + ) + + return dump + + + +class LJDumpTValue(Command): + ''' +lj-tv <TValue *> + +The command receives a pointer to <tv> (TValue address) and dumps +the type and some info related to it. + +* LJ_TNIL: nil +* LJ_TFALSE: false +* LJ_TTRUE: true +* LJ_TLIGHTUD: light userdata @ <gcr> +* LJ_TSTR: string <string payload> @ <gcr> +* LJ_TUPVAL: upvalue @ <gcr> +* LJ_TTHREAD: thread @ <gcr> +* LJ_TPROTO: proto @ <gcr> +* LJ_TFUNC: <LFUNC|CFUNC|FFUNC> + <LFUNC>: Lua function @ <gcr>, <nupvals> upvalues, <chunk:line> + <CFUNC>: C function <mcode address> + <FFUNC>: fast function #<ffid> +* LJ_TTRACE: trace <traceno> @ <gcr> +* LJ_TCDATA: cdata @ <gcr> +* LJ_TTAB: table @ <gcr> (asize: <asize>, hmask: <hmask>) +* LJ_TUDATA: userdata @ <gcr> +* LJ_TNUMX: number <numeric payload> + +Whether the type of the given address differs from the listed above, then +error message occurs. 
+ ''' + command = 'lj-tv' + + @property + def argument_parser(self): + parser = super(LJDumpTValue, self).argument_parser + + parser.add_argument('tv', nargs=1, type=str, default=None) + + return parser + + + def execute(self, debugger, args, result): + global target + expr = args.tv[0] + target = debugger.GetSelectedTarget() + process = target.GetProcess() + thread = process.GetSelectedThread() + frame = thread.GetSelectedFrame() + + tvptr = TValuePtr(frame.EvaluateExpression(expr)) + print('{}\n'.format(dump_tvalue(tvptr))) + + + +class LJState(Command): + ''' +lj-state +The command requires no args and dumps current VM and GC states +* VM state: <INTERP|C|GC|EXIT|RECORD|OPT|ASM|TRACE> +* GC state: <PAUSE|PROPAGATE|ATOMIC|SWEEPSTRING|SWEEP|FINALIZE|LAST> +* JIT state: <IDLE|ACTIVE|RECORD|START|END|ASM|ERR> + ''' + + command = 'lj-state' + + def execute(self, debugger, args, result): + global target + target = debugger.GetSelectedTarget() + g = G(L(None)) + print('{}\n'.format('\n'.join( + map(lambda t: '{} state: {}'.format(*t), { + 'VM': vm_state(g), + 'GC': gc_state(g), + 'JIT': jit_state(g), + }.items()) + ))) + +class LJDumpArch(Command): + ''' +lj-arch + +The command requires no args and dumps values of LJ_64 and LJ_GC64 +compile-time flags. These values define the sizes of host and GC +pointers respectively. 
+ ''' + command = 'lj-arch' + + def execute(self, debugger, args, result): + print( + 'LJ_64: {LJ_64}, LJ_GC64: {LJ_GC64}, LJ_DUALNUM: {LJ_DUALNUM}\n' + .format( + LJ_64 = LJ_64, + LJ_GC64 = LJ_GC64, + LJ_DUALNUM = LJ_DUALNUM + ) + ) + +class LJGC(Command): + ''' +lj-gc + +The command requires no args and dumps current GC stats: +* total: <total number of allocated bytes in GC area> +* threshold: <limit when gc step is triggered> +* debt: <how much GC is behind schedule> +* estimate: <estimate of memory actually in use> +* stepmul: <incremental GC step granularity> +* pause: <pause between successive GC cycles> +* sweepstr: <sweep position in string table> +* root: <number of all collectable objects> +* gray: <number of gray objects> +* grayagain: <number of objects for atomic traversal> +* weak: <number of weak tables (to be cleared)> +* mmudata: <number of udata|cdata to be finalized> + ''' + command = 'lj-gc' + + def execute(self, debugger, args, result): + global target + target = debugger.GetSelectedTarget() + g = G(L(None)) + print('GC stats: {state}\n{stats}\n'.format( + state = gc_state(g), + stats = dump_gc(g) + )) + +class LJDumpString(Command): + ''' +lj-str <GCstr *> + +The command receives a <gcr> of the corresponding GCstr object and dumps +the payload, size in bytes and hash. + +*Caveat*: Since Python 2 provides no native Unicode support, the payload +is replaced with the corresponding error when decoding fails. 
+ ''' + command = 'lj-str' + + @property + def argument_parser(self): + parser = super(LJDumpString, self).argument_parser + parser.add_argument('gcr', nargs=1, type=str, default=None) + return parser + + def execute(self, debugger, args, result): + global target + expr = args.gcr[0] + target = debugger.GetSelectedTarget() + process = target.GetProcess() + thread = process.GetSelectedThread() + frame = thread.GetSelectedFrame() + + string_ptr = GCstrPtr(frame.EvaluateExpression(expr)) + print("String: {body} [{len} bytes] with hash {hash}\n".format( + body = strdata(string_ptr), + hash = strx64(string_ptr.deref.hash), + len = string_ptr.deref.len, + )) + +class LJDumpTable(Command): + ''' +lj-tab <GCtab *> + +The command receives a GCtab adress and dumps the table contents: +* Metatable address whether the one is set +* Array part <asize> slots: + <aslot ptr>: [<index>]: <tv> +* Hash part <hsize> nodes: + <hnode ptr>: { <tv> } => { <tv> }; next = <next hnode ptr> + ''' + command = 'lj-tab' + + @property + def argument_parser(self): + parser = super(LJDumpTable, self).argument_parser + parser.add_argument('gctab', nargs=1, type=str, default=None) + return parser + + def execute(self, debugger, args, result): + global target + expr = args.gctab[0] + target = debugger.GetSelectedTarget() + process = target.GetProcess() + thread = process.GetSelectedThread() + frame = thread.GetSelectedFrame() + + t = GCtabPtr(frame.EvaluateExpression(expr)) + array = mref(TValuePtr, t.deref.array) + nodes = mref(NodePtr, t.deref.node) + mt = gcval(t.deref.metatable) + capacity = { + 'apart': int(t.deref.asize), + 'hpart': int(t.deref.hmask + 1) if t.deref.hmask > 0 else 0 + } + + if mt: + print('Metatable detected: {}\n'.format(strx64(mt))) + + print('Array part: {} slots\n'.format(capacity['apart'])) + for i in range(capacity['apart']): + slot = array + i + print('{ptr}: [{index}]: {value}\n'.format( + ptr = strx64(slot), + index = i, + value = dump_tvalue(slot) + )) + + print('Hash 
part: {} nodes\n'.format(capacity['hpart'])) + # See hmask comment in lj_obj.h + for i in range(capacity['hpart']): + node = nodes + i + print('{ptr}: {{ {key} }} => {{ {val} }}; next = {n}\n'.format( + ptr = strx64(node), + key = dump_tvalue(TValuePtr(node.deref.key.addr)), + val= dump_tvalue(TValuePtr(node.deref.val.addr)), + n = strx64(mref(NodePtr, node.deref.next)) + )) + +class LJDumpStack(Command): + ''' +lj-stack [<lua_State *>] + +The command receives a lua_State address and dumps the given Lua +coroutine guest stack: + +<slot ptr> [<slot attributes>] <VALUE|FRAME> + +* <slot ptr>: guest stack slot address +* <slot attributes>: + - S: Bottom of the stack (the slot L->stack points to) + - B: Base of the current guest frame (the slot L->base points to) + - T: Top of the current guest frame (the slot L->top points to) + - M: Last slot of the stack (the slot L->maxstack points to) +* <VALUE>: see help lj-tv for more info +* <FRAME>: framelink slot differs from the value slot: it contains info + related to the function being executed within this guest frame, its + type and link to the parent guest frame + [<frame type>] delta=<slots in frame>, <lj-tv for LJ_TFUNC slot> + - <frame type>: + + L: VM performs a call as a result of bytecode execution + + C: VM performs a call as a result of lj_vm_call + + M: VM performs a call to a metamethod as a result of bytecode + execution + + V: Variable-length frame for storing arguments of a variadic + function + + CP: Protected C frame + + PP: VM performs a call as a result of executinig pcall or xpcall + +If L is ommited the main coroutine is used. 
+ ''' + command = 'lj-stack' + + @property + def argument_parser(self): + parser = super(LJDumpStack, self).argument_parser + parser.add_argument('lua_State', nargs='?', type=str, default=[None]) + return parser + + def execute(self, debugger, args, result): + global target + expr = args.lua_State[0] + target = debugger.GetSelectedTarget() + process = target.GetProcess() + thread = process.GetSelectedThread() + frame = thread.GetSelectedFrame() + + l = frame.EvaluateExpression(expr) if expr else None + + print('{}\n'.format(dump_stack(L(l)))) + + + + + +def register_commands(debugger): + for cls in Command.__subclasses__(): + debugger.HandleCommand( + 'command script add -c luajit_lldb.{cls} {command}'.format( + cls=cls.__name__, + command=cls.command, + ) + ) + +def configure(debugger): + global LJ_64, LJ_GC64, LJ_FR2, LJ_DUALNUM, PADDING, LJ_TISNUM + target = debugger.GetSelectedTarget() + module = target.modules[0] + LJ_DUALNUM = module.FindSymbol('lj_lib_checknumber') != None + + try: + irtype_enum = target.FindFirstType('IRType').enum_members + for member in irtype_enum: + if member.name == 'IRT_PTR': + LJ_64 = member.unsigned & 0x1f == IRT_P64 + if member.name == 'IRT_PGC': + LJ_FR2 = LJ_GC64 = member.unsigned & 0x1f == IRT_P64 + except: + print('luajit-lldb.py failed to load: ' + 'no debugging symbols found for libluajit\n') + + PADDING = ' ' * len(':' + hex((1 << (47 if LJ_GC64 else 32)) - 1)) + LJ_TISNUM = 0xfffeffff if LJ_64 and not LJ_GC64 else LJ_T['NUMX'] + + + +def __lldb_init_module(debugger, internal_dict): + configure(debugger) + register_commands(debugger) -- 2.32.1 (Apple Git-133)
* Дмитрий Обухов via Tarantool-discussions <tarantool-discussions@dev.tarantool.org> [21/08/15 03:02]:
Сначала надо реализовать изменение конфигурации как часть
протокола Рафт, и реализовать raft learner расширение протокола,
описанное в PhD.
Проблему чётного числа узлов нужно решать именно с помощью raft
learners (non-voting nodes).
Режим автоматического уменьшения размера кворума описан в PhD и
был реализован, по сути это автоматическая инициация configuration
change после длительной недоступности одного узла.
Репликационный фактор, как и placement/locality - это свойство
данных, а не свойство топологии. Для одной таблицы он может быть
3, для других 7, при этом дата-центров может быть хоть 2, хоть 10. И
менять его должен DBA, а не СУБД автоматически. То, что в тарантуле
это "слито" воедино - просто наследственность.
> А вот давайте попробуем пообсуждать здесь. Может такой формат больше народу подойдёт.
>
> В этом релизе у нас появляется автоматический фейловер «на борту» Тарантула — RAFT.
>
> Это прекрасное событие, однако у него есть некоторые недостатки:
>
> - Беспроблемные гарантированные выборы возможны только если число участников выборов нечётное. Или число кворума больше половины числа участников на 1. Для 2 — это 2. Для 3 это 2. Для 4 это 3.
>
> Кроме того ещё несколько вводных в виде F.A.Q:
>
> Q: Для чего пользователи ставят репликасет в нескольких ДЦ?
> A: Чтобы при недоступности одного (или нескольких) ДЦ сервис продолжал работу.
>
> Q: Если сервис располагается в X датацентрах, умерли все кроме одного последнего. Хочет ли пользователь чтоб его сервис был доступен клиентам?
> A: Безусловно
>
> Q: Какая инсталляция по нескольким ДЦ самая популярная?
> A: Инсталляция на 2 независимых ДЦ (минимальный случай резервирования, экономически самый дешёвый)
>
>
> Если порефлексировать над этими вводными, то мы можем сформулировать требования к «идеальному» фейловеру:
>
> - Работоспособность сервиса должна сохраняться «до последнего ДЦ»
> - Из предыдущего пункта следует необходимость поддержки «режима деградации» — по аналогии с режимом деградации в RAID: отключили винчестер, избыточность исчезла, но RAID продолжает работу
> - Работоспособность сервиса не должна «предпочитать» чётные/нечётные числа, а должна сохраняться при снижении числа работоспособных узлов от N до 1.
>
>
> Исходя из перечисленного, я вижу RAFT — это только подузел такого механизма, а над ним действуют какие-то правила, которые плавно снижают кворум, выводя из игры недоступные узлы, вплоть до 1 (кворума нет, остался последний боец).
>
> Очевидно (мне очевидно, я могу ошибаться), что такой фейловер невозможен, если только сами узлы будут решать кто главный: рано или поздно ситуация что кластер разделился на две независимые половины, каждая со своим главным — произойдёт.
>
> Если взглянуть на многие пользовательские сервисы, то увидим, что пользователи заходят на них через одну точку входа: на mail.ru — через адрес mail.ru. На сервис банка — через адрес банка. И так далее. Возможно, если разместить stateful мониторы в этих точках, то подобный фейловер можно реализовать?
>
> Есть у кого-то мысли как построить подобный фейловер?
>
> --
> Дмитрий Обухов
--
Konstantin Osipov, Moscow, Russia
[-- Attachment #1: Type: text/plain, Size: 4142 bytes --] А вот давайте попробуем пообсуждать здесь. Может такой формат больше народу подойдёт. В этом релизе у нас появляется автоматический фейловер «на борту» Тарантула — RAFT. Это прекрасное событие, однако у него есть некоторые недостатки: - Беспроблемные гарантированные выборы возможны только если число участников выборов нечётное. Или число кворума больше половины числа участников на 1. Для 2 — это 2. Для 3 это 2. Для 4 это 3. Кроме того ещё несколько вводных в виде F.A.Q: Q: Для чего пользователи ставят репликасет в нескольких ДЦ? A: Чтобы при недоступности одного (или нескольких) ДЦ сервис продолжал работу. Q: Если сервис располагается в X датацентрах, умерли все кроме одного последнего. Хочет ли пользователь чтоб его сервис был доступен клиентам? A: Безусловно Q: Какая инсталляция по нескольким ДЦ самая популярная? A: Инсталляция на 2 независимых ДЦ (минимальный случай резервирования, экономически самый дешёвый) Если порефлексировать над этими вводными, то мы можем сформулировать требования к «идеальному» фейловеру: - Работоспособность сервиса должна сохраняться «до последнего ДЦ» - Из предыдущего пункта следует необходимость поддержки «режима деградации» — по аналогии с режимом деградации в RAID: отключили винчестер, избыточность исчезла, но RAID продолжает работу - Работоспособность сервиса не должна «предпочитать» чётные/нечётные числа, а должна сохраняться при снижении числа работоспособных узлов от N до 1. Исходя из перечисленного, я вижу RAFT — это только подузел такого механизма, а над ним действуют какие-то правила, которые плавно снижают кворум, выводя из игры недоступные узлы, вплоть до 1 (кворума нет, остался последний боец). Очевидно (мне очевидно, я могу ошибаться), что такой фейловер невозможен, если только сами узлы будут решать кто главный: рано или поздно ситуация что кластер разделился на две независимые половины, каждая со своим главным — произойдёт. 
Если взглянуть на многие пользовательские сервисы, то увидим, что пользователи заходят на них через одну точку входа: на mail.ru — через адрес mail.ru. На сервис банка — через адрес банка. И так далее. Возможно, если разместить stateful мониторы в этих точках, то подобный фейловер можно реализовать? Есть у кого-то мысли как построить подобный фейловер? -- Дмитрий Обухов [-- Attachment #2: Type: text/html, Size: 4667 bytes --]
[-- Attachment #1: Type: text/plain, Size: 2352 bytes --] I forgot to provide github link to branch: https://github.com/tarantool/tarantool/tree/fckxorg/rfc-platform-profiler пн, 12 июл. 2021 г. в 15:25, Maxim Kokryashkin <max.kokryashkin@gmail.com>: > From: Maxim Kokryashkin <m.kokryashkin@tarantool.org> > > --- > doc/rfc/781-luajit-platform-profiler.md | 13 ++++++++++++- > 1 file changed, 12 insertions(+), 1 deletion(-) > > diff --git a/doc/rfc/781-luajit-platform-profiler.md > b/doc/rfc/781-luajit-platform-profiler.md > index fda3d535b..74132c2d4 100644 > --- a/doc/rfc/781-luajit-platform-profiler.md > +++ b/doc/rfc/781-luajit-platform-profiler.md > @@ -14,6 +14,17 @@ Currently, available options for profiling LuaJIT are > not fine enough to get an > > To get a detailed perspective of platform performance, a more advanced > profiler is needed. The desired profiler must be able to capture both guest > and host stacks simultaneously, along with virtual machine states. > > +To get the difference, you can take a look at flamegraphs generated by > pref, jit.p, and PoC for the proposed profiler below. > +### jit.p > +![jit.p](https://i.imgur.com/sDZZDZx.png) > + > +### perf > +![perf](https://i.imgur.com/DlKbFpo.png) > + > +### sysprof > +![sysprof](https://i.imgur.com/Yf80MDE.png) > + > + > ## Detailed design > > The proposed approach is to extend existing profiler embedded into > LuaJIT, so it will be able to capture host stack too. > @@ -69,4 +80,4 @@ Another way to implement such a thing is to make perf to > see guest stack. To do > Stack unwinding from outside of the LuaJIT is the problem we didn’t > manage to solve for today. There are different approaches to do this: > - *Save rsp register value to rbp and preserve rbp.* However, LuaJIT uses > rbp as a general-purpose register, and it is hard not to break everything > trying to use it only for stack frames. 
> - *Coordinated work of `jit.p` and perf.* This approach requires > modifying perf the way it will send LuaJIT suspension signal, and after > getting info about the host stack, it will receive information about the > guest stack and join them. This solution is quite possible, but modified > perf doesn't seem like a production-ready solution. > -- *Dwarf unwinding* > \ No newline at end of file > +- *Dwarf unwinding* > -- > 2.32.0 > > [-- Attachment #2: Type: text/html, Size: 3093 bytes --]
From: Maxim Kokryashkin <m.kokryashkin@tarantool.org> --- doc/rfc/781-luajit-platform-profiler.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/rfc/781-luajit-platform-profiler.md b/doc/rfc/781-luajit-platform-profiler.md index fda3d535b..74132c2d4 100644 --- a/doc/rfc/781-luajit-platform-profiler.md +++ b/doc/rfc/781-luajit-platform-profiler.md @@ -14,6 +14,17 @@ Currently, available options for profiling LuaJIT are not fine enough to get an To get a detailed perspective of platform performance, a more advanced profiler is needed. The desired profiler must be able to capture both guest and host stacks simultaneously, along with virtual machine states. +To see the difference, you can take a look at the flamegraphs generated by perf, jit.p, and the PoC of the proposed profiler below. +### jit.p +![jit.p](https://i.imgur.com/sDZZDZx.png) + +### perf +![perf](https://i.imgur.com/DlKbFpo.png) + +### sysprof +![sysprof](https://i.imgur.com/Yf80MDE.png) + + ## Detailed design The proposed approach is to extend existing profiler embedded into LuaJIT, so it will be able to capture host stack too. @@ -69,4 +80,4 @@ Another way to implement such a thing is to make perf to see guest stack. To do Stack unwinding from outside of the LuaJIT is the problem we didn’t manage to solve for today. There are different approaches to do this: - *Save rsp register value to rbp and preserve rbp.* However, LuaJIT uses rbp as a general-purpose register, and it is hard not to break everything trying to use it only for stack frames. - *Coordinated work of `jit.p` and perf.* This approach requires modifying perf the way it will send LuaJIT suspension signal, and after getting info about the host stack, it will receive information about the guest stack and join them. This solution is quite possible, but modified perf doesn't seem like a production-ready solution. -- *Dwarf unwinding* \ No newline at end of file +- *Dwarf unwinding* -- 2.32.0
From: Maxim Kokryashkin <m.kokryashkin@tarantool.org> It has been proposed to implement a platform performance profiler several times by now, so this commit adds the document, which describes one of the possible implementations. Github branch: https://github.com/tarantool/tarantool/tree/fckxorg/rfc-platform-profiler Needed for: #781 See also: #4001 --- doc/rfc/781-luajit-platform-profiler.md | 72 +++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 doc/rfc/781-luajit-platform-profiler.md diff --git a/doc/rfc/781-luajit-platform-profiler.md b/doc/rfc/781-luajit-platform-profiler.md new file mode 100644 index 000000000..fda3d535b --- /dev/null +++ b/doc/rfc/781-luajit-platform-profiler.md @@ -0,0 +1,72 @@ +# lua: system-wide profiler + +* **Status**: In progress +* **Start date**: 02-07-2021 +* **Authors**: Mikhail Shishatskiy @Shishqa m.shishatskiy@tarantool.org, Maxim Kokryashkin @fckxorg m.kokryashkin@tarantool.org +* **Issues**: [#781](https://github.com/tarantool/tarantool/issues/781) + +## Summary +The document describes the platform profiler for LuaJIT. It is needed to obtain a complete view of platform performance. The existing LuaJIT profiler is only able to give you information about virtual machine states and the guest stack. Hence, the document proposes to extend the existing LuaJIT profiler, so it will be able to gather stack traces from both C and Lua. + +## Background and motivation + +Currently, available options for profiling LuaJIT are not fine enough to get an understanding of performance. For example, perf is only able to show the host stack, so all the Lua calls are seen as a single pcall. Conversely, the jit.p module provided with LuaJIT is not able to give any information about the host stack. + +To get a detailed perspective of platform performance, a more advanced profiler is needed. The desired profiler must be able to capture both guest and host stacks simultaneously, along with virtual machine states. 
+ +## Detailed design + +The proposed approach is to extend existing profiler embedded into LuaJIT, so it will be able to capture host stack too. + +### Host stack + +The default sampling profiler implementation in LuaJIT, which can be seen [here](https://github.com/tarantool/luajit/blob/tarantool/src/lj_profile.c), follows this flow: +``` +luaJIT_profile_start --> profile_timer_start + +... + |lock VM state +[signal emitted] --> profile_signal_trigger: __|prepare args for a callback + |schedule callback execution + |unlock VM state +... + +luaJIT_profile_stop --> profile_timer_stop +``` + +The callback scheduled by `profile_signal_trigger` can be used to dump the needed information, including the VM stack. However, even though the guest stack is still the same by the time the callback is executed, the host stack has already changed, so the final stack dump cannot be considered valid. + +Hence, to get a valid final snapshot of both stacks, a dump should be done right at the signal, like [there](https://github.com/Shishqa/luajit/blob/c0da971640512696f5c166e8f2dc1ed982a8f451/src/profile/sysprof.c#L63). + +The host stack can be dumped with `backtrace(void**, int)`. + +### VM stack +We are using an implementation similar to the one used in [lj_debug_dumpstack](https://github.com/tarantool/luajit/blob/af889e4608e6eca495dd85e6161d8bcd7d3628e6/src/lj_debug.c#L580) to dump the guest stack. But there is a problem with that because sometimes the VM stack can be invalid, thanks to this [bug](https://github.com/tarantool/luajit/blob/af889e4608e6eca495dd85e6161d8bcd7d3628e6/src/vm_x64.dasc#L4594). As you can see by following the link, the VM state changes to LFUNC, and after that stack reallocation takes place. So if our signal takes place in between, we will get a segmentation fault. Anyway, that issue is easy to fix, so this approach is suitable. 
+ +### Symbol table + +It is a heavy task to dump names of functions every time, so instead, we will dump a symbol table in the beginning. Later on, it will be sufficient to dump only a function's address. However, some functions can be loaded and unloaded several times, and their addresses will be different each time. Hence, we will update the symbol table accordingly. To carry out the symtab update, we will drop in new symtab record into the file, where the profiler stores data. + +A symbol table looks like this (the same format as symtab in memprof): +``` + 1 byte 8 bytes 8 bytes + _______________________________________________________________ +| type | address of function | function name | first line number| + --------------------------------------------------------------- +``` + + + +### Traces + +Traces are the real problem here because there is no mechanism in LuaJIT to unwind them. Consequently, we need to introduce our own. The basic idea is to place some markers into the bytecode of a trace to indicate the start and the end of each function call and use them to unwind the whole call stack of a trace. + +<span style="color:red">A more specific description is needed</span>. + +## Rationale and alternatives + +Another way to implement such a thing is to make perf to see guest stack. To do so, we need to map virtual machine symbols (and that functionality is present in LuaJIT ([link](https://github.com/tarantool/luajit/blob/d4e12d7ac28e3bc857d30971dd77deec66a67297/src/lj_trace.c#L96))) and do something so perf could unwind the virtual machine stack. +Stack unwinding from outside of the LuaJIT is the problem we didn’t manage to solve for today. There are different approaches to do this: +- *Save rsp register value to rbp and preserve rbp.* However, LuaJIT uses rbp as a general-purpose register, and it is hard not to break everything trying to use it only for stack frames. 
+- *Coordinated work of `jit.p` and perf.* This approach requires modifying perf the way it will send LuaJIT suspension signal, and after getting info about the host stack, it will receive information about the guest stack and join them. This solution is quite possible, but modified perf doesn't seem like a production-ready solution. +- *Dwarf unwinding* \ No newline at end of file -- 2.31.1
Sergos, I've checked the RFC (with some tweaks here and there, that we discussed offline) into master. On 27.02.21, Sergey Ostanevich via Tarantool-discussions wrote: > Subject: > An RFC on bringing debugger facility into Tarantool. > > Part of #5857 > --- > doc/rfc/inter-fiber-debugger.md | 204 ++++++++++++++++++++++++++++++++ > 1 file changed, 204 insertions(+) > create mode 100644 doc/rfc/inter-fiber-debugger.md > <snipped> > -- > 2.24.3 (Apple Git-128) -- Best regards, IM
[-- Attachment #1: Type: text/plain, Size: 10150 bytes --] Subject: An RFC on bringing debugger facility into Tarantool. Part of #5857 --- doc/rfc/inter-fiber-debugger.md | 204 ++++++++++++++++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 doc/rfc/inter-fiber-debugger.md diff --git a/doc/rfc/inter-fiber-debugger.md b/doc/rfc/inter-fiber-debugger.md new file mode 100644 index 000000000..e4b64490c --- /dev/null +++ b/doc/rfc/inter-fiber-debugger.md @@ -0,0 +1,204 @@ +# Inter-fiber Debugger for Tarantool +* **Status**: In progress +* **Start date**: 20-01-2021 +* **Authors**: Sergey Ostanevich @sergos sergos@tarantool.org <mailto:sergos@tarantool.org>, + Igor Munkin @imun imun@tarantool.org <mailto:imun@tarantool.org> +* **Discussion**: https://github.com/tarantool/tarantool/discussions/5857 <https://github.com/tarantool/tarantool/discussions/5857> + +[TOC] + +### Rationale + +To make Tarantool platform developer-friendly we should provide a set of basic +developer tools. One of such tool is debugger. There are number of debuggers +available for the Lua environments, although all of them are missing the +critical feature needed for the Tarantool platform: they should not cause a +full-stop of the debugged program during the debug session. + +In this RFC I propose to overcome the problem with a solution that will stop +only the fiber to be debugged. It will allow developers to debug their +application, while Tarantool can keep processing requests, perform replication +and so on. + +### Approach + +To do not reinvent the debugger techniques we may borrow the already existent +Lua debugger, put the rules about fiber use, data manipulation tweaks and so +on. + +Every fiber can be considered as a 'debuggee' or a regular fiber, switching +from one state to the other. To control the status we can either patch fiber +machinery - which seems excessive as fibers can serve pure C tasks - or tweak +the breakpoint hook to employ the fiber yield. 
The fiber will appear in a state where +it waits for commands from the debugger and sets the LuaJIT machinery hooks to +be prepared for the next fiber to be scheduled. + +### Debug techniques + +Regular debuggers provide interruption for all threads at once hence they don't +distinguish breakpoints appearance across the threads - they just stop +execution. For our case we have to introduce some specifics so that debugger +will align with the fiber nature of the server behavior. Let's consider some +techniques we can propose to the user. + +#### 1) Break first fiber met + +User puts a breakpoint that triggers once, stopping the first fiber the break +happens in. After breakpoint is met the fiber reports its status to the +debugger server, puts itself in a wait state, clears the breakpoint and yields. +As soon as the server issues a command, the debuggee will reset the breakpoint, +handle the command and proceed with execution or yield again. + +#### 2) Regular breakpoint + +This mode will start the same way as previous mode, but keep the breakpoint +before yield, so that the breakpoint still can trigger in another fiber. As the +server may deliver huge number of fibers during its performance, we have to set +up a user-configurable limit for the number of debuggee fibers that can be set at +once. As soon as limit is reached the debuggee fiber starts to behave exactly as +in previous mode, clearing the breakpoint before the yield from the debuggee. + +#### 3) Run a function under debug session + +This is the most straightforward way to debug a function: perform a call +through the debug interface. A new fiber will be created and break will appear +at the function entrance. The limit of debuggee fibers should be increased and +the fiber will behave similar to the modes above. + +#### 4) Attach debugger to a fiber by ID + +Every fiber has its numerical ID, so debugger can provide interface to start +debugging for a particular fiber. 
The fiber will be put in a wait state as soon +as it starts execution after the debugger is attached. + +### Basic mechanism + +The Tarantool side of the debugger will consist of a dedicated fiber named +DebugSRV that will handle requests from the developer and do the bookkeeping of +debuggee fibers and their breakpoints and a Lua function DebugHook set as a +hook in Lua debug [https://www.lua.org/pil/23.html <https://www.lua.org/pil/23.html>] library. Users should not +use this hook for the period of debugging to avoid interference. The external +interface can be organized over arbitrary protocol, be it a socket connection, +console or even IPROTO (using IPROTO_CALL). + +Debuggee fiber will be controlled by a debug hook function named DebugHook. It +is the responsibility of the DebugHook to set the debuggee fiber status, check the +breakpoints appearance, its condition including the ignore count and update +hit_count. As soon as a breakpoint is met, the DebugHook has to put its state to +pending and wait for a command from the DebugSRV. + +Communication between DebugSRV and the debuggee fiber can be done via +fiber.channel mechanism. It will simplify the wait-for semantics. + +#### Data structure + +Every debuggee fiber is present in the corresponding table in the DebugSRV +fiber. 
The table has the following format: + +``` +debuggees = { + max_debuggee = number, + preserved_hook = { + [1] = function, + [2] = type, + [3] = number + } + fibers = { + [<fiber_id>] = { + state = ['pending'|'operation'], + current_breakpoint = <breakpoint_id>, + channel = fiber.channel, + breakpoints = { + [<breakpoint_id>] = { + type = ['l'|'c'|'r'|'i'], + value = [number|string] + condition = function, + hit_count = number, + ignore_count = number + } + } + } + } + global_breakpoints = { + [<breakpoint_id>] = { + type = ['l'|'c'|'r'|'i'], + value = [number|string] + condition = function, + hit_count = number, + ignore_count = number + } +} +``` +As DebugSRV receives commands it updates the structure of the debuggees and +forces the fiber wakeup to reset its hook state. The state of the debuggee is +one of the following: + +- 'operation': the fiber is already in the debuggees list, but it issued yield + without any breakpoint met +- 'pending': DebugHook waits for a new command from the channel in the + debuggees.fibers of its own ID + + +#### DebugHook behavior + +For the techniques 3) and 4) fiber appears in the list of debuggees.fibers +first, with its status set as 'operation' with a list of breakpoints set. + +For the techniques 1) and 2) there is a list of global_breakpoints that should +be checked by every fiber. + +In case a fiber receives control from the debug machinery it should check if it +is present in ```debuggees.fibers[ID]```. If it is - it should check if its +current position meets any breakpoint from the +```debuggees.fibers[ID].breakpoints``` or ```debuggees.global_breakpoints```. If +breakpoint is met, the fiber sets its state into 'pending' and waits for a +command from the ```debuggees.fibers[ID].channel```. + +In case a fiber is not present in the ```debuggees.fibers[ID]``` it should +check that the number of fibers entries in the debuggees structure is less than +max_debuggee. 
In such a case it checks if it met any of the +```global_breakpoints``` and puts itself into the fibers list, updating the +array size [https://www.lua.org/pil/19.1.html <https://www.lua.org/pil/19.1.html>]. Also it should open a channel +to the DebugSRV and put itself into the 'pending' state. + +#### DebugSRV behavior + +DebugSRV handles the input from the user and supports the following list of +commands (as mentioned, it can be used from any interface, so commands are +function calls for general case): + +- ```break_info([fiber ID])``` - list all breakpoints with counts and + conditions, limits output for the fiber with ID +- ```break_cond(<breakpoint id>, <condition>)``` - set a condition for the + breakpoint, condition should be Lua code evaluating into a boolean value +- ```break_ignore(<breakpoint id>, <count>)``` - ignore the number of + breakpoint executions +- ```break_delete(<breakpoint id>)``` - removes a breakpoint +- ```step(<fiber ID>)``` - continue execution, stepping into the call +- ```step_over(<fiber ID>)``` - continue execution until the next source line, + skip calls +- ```step_out(<fiber ID>)``` - continue execution until return from the current + function + +The functions above are common for many debuggers, with just some tweaks to adapt them to +fibers. Functions below are more specific, so let's get into some details: + +- ```set_max_debuggee(number)``` - set the number of fibers that can be debugged + simultaneously. It modifies the ```debuggees.max_debuggee``` so that new fibers + will respect the amount of debuggees. For example, if at some point of + debugging there were 5 debuggee fibers user can set this value to 3 - it will + not cause any problem, just a new fiber will not become a debuggee if it meets + some global breakpoint. +- ```debug_eval(<fiber ID>, <code>)``` - allows evaluating the code in the + context of the debuggee fiber if it is in 'pending' mode. 
User can issue a + ```debug_eval(113, function() return fiber.id() end)``` to receive 113 as a + result +- ```break(<breakpoint description>, [fiber ID])``` - add a new breakpoint in + the fiber's breakpoint list or in the global list if no fiber ID provided +- ```debug_start()``` - starts debug session: creates debuggees structure, + preserves the current debug hook in ```debuggees.preserved_hook``` and sets + DebugHook as the current hook +- ```debug_stop()``` - quits debug session: resets the debug hook, clears + debuggees structure + + -- 2.24.3 (Apple Git-128) [-- Attachment #2: Type: text/html, Size: 35746 bytes --]
Sergey, On 25.12.20, Sergey Kaplun wrote: > Part of #5442 > --- > > RFC on branch: https://github.com/tarantool/tarantool/blob/skaplun/gh-5442-luajit-memory-profiler/doc/rfc/5442-luajit-memory-profiler.md > > Changes in v3: > * More comments in example. > * More verbose benchmark information. > * Grammar and spelling fixes. > > Changes in v2: > * Removed C API, Tarantool integration and description of additional > features -- they will be added in another RFC if necessary. > * Removed checking profile is running from the public API. > * Added benchmarks and more meaningful example. > * Grammar fixes. > > doc/rfc/5442-luajit-memory-profiler.md | 314 +++++++++++++++++++++++++ > 1 file changed, 314 insertions(+) > create mode 100644 doc/rfc/5442-luajit-memory-profiler.md I've checked your patch into 2.7 and master. > <snipped> > -- > 2.28.0 > -- Best regards, IM
Sergey, Thanks for the fixes, LGTM. On 20.01.21, Sergey Kaplun wrote: > Hi, Igor! > > Thanks for the review! > > On 15.01.21, Igor Munkin wrote: > > Sergey, > > > > Thanks for the changes. There is a bit of nitpicking below and I > > believe we'll push the next version doc to the trunk. > > I've fixed all your comments, plus added some insignificant fixes. > See two iterative patches below. Branch is force pushed. Great, thanks! I also changed the commit subject to the following: | rfc: describe a LuaJIT memory profiler toolchain > > > > > On 25.12.20, Sergey Kaplun wrote: > > > Part of #5442 > > > --- > > > > > > RFC on branch: https://github.com/tarantool/tarantool/blob/skaplun/gh-5442-luajit-memory-profiler/doc/rfc/5442-luajit-memory-profiler.md > > Side note: branch name is updated. > New RFC version: https://github.com/tarantool/tarantool/blob/skaplun/gh-5442-luajit-memory-profiler-rfc/doc/rfc/5442-luajit-memory-profiler.md > > > > > > > Changes in v3: > > > * More comments in example. > > > * More verbose benchmark information. > > > * Grammar and spelling fixes. > > > > > > Changes in v2: > > > * Removed C API, Tarantool integration and description of additional > > > features -- they will be added in another RFC if necessary. > > > * Removed checking profile is running from the public API. > > > * Added benchmarks and more meaningful example. > > > * Grammar fixes. > > > > > > doc/rfc/5442-luajit-memory-profiler.md | 314 +++++++++++++++++++++++++ > > > 1 file changed, 314 insertions(+) > > > create mode 100644 doc/rfc/5442-luajit-memory-profiler.md > > > <snipped> > -- > Best regards, > Sergey Kaplun -- Best regards, IM
Hi, Sergos! Thanks, for the review! On 20.01.21, Sergey Ostanevich wrote: > Hi! > > Thanks for the patch, I've looked into > https://github.com/tarantool/tarantool/blob/skaplun/gh-5442-luajit-memory-profiler-rfc/doc/rfc/5442-luajit-memory-profiler.md <https://github.com/tarantool/tarantool/blob/skaplun/gh-5442-luajit-memory-profiler-rfc/doc/rfc/5442-luajit-memory-profiler.md> > > in ‘Prerequisites’: > > Also all, deallocations are reported as internal too. > > the comma is not needed Indeed! Fixed! > > > Lua developers can do nothing with allocations made inside the built-ins except reducing its usage. > > ‘its’ doesn’t explain exact matter. I would rephrase: "As for allocations made inside the built-ins user can do nothing but reduce use of these built-ins." Thanks, applied! > > > Currently VM state identifies C function execution only, so Fast and Lua functions states are added. > > ‘Currently’ -> ‘Originally’ Fixed, thanks! See the iterative patch below. Branch is force pushed. > > Otherwise LGTM. > Sergos > <snipped> > =================================================================== diff --git a/doc/rfc/5442-luajit-memory-profiler.md b/doc/rfc/5442-luajit-memory-profiler.md index f9c43f91f..cb8adab79 100644 --- a/doc/rfc/5442-luajit-memory-profiler.md +++ b/doc/rfc/5442-luajit-memory-profiler.md @@ -32,7 +32,7 @@ This section describes additional changes in LuaJIT required for the feature implementation. This version of LuaJIT memory profiler does not support verbose reporting for allocations made on traces. All allocation from traces are reported as internal. But trace code semantics should be totally the same as -for the Lua interpreter (excluding sink optimizations). Also all, deallocations +for the Lua interpreter (excluding sink optimizations). Also all deallocations are reported as internal too. 
There are two different representations of functions in LuaJIT: the function's @@ -44,8 +44,8 @@ that is used for LuaJIT built-ins Tail call optimization does not create a new call frame, so all allocations inside the function called via `CALLT`/`CALLMT` are attributed to its caller. -Lua developers can do nothing with allocations made inside the built-ins except -reducing its usage. So if fast function is called from a Lua function all +As for allocations made inside the built-ins user can do nothing but reduce use +of these built-ins. So if fast function is called from a Lua function all allocations made in its scope are attributed to this Lua function (i.e. the built-in caller). Otherwise, this event is attributed to a C function. @@ -98,7 +98,7 @@ INTERNAL: 20 0 1481 ``` So we need to know a type of function being executed by the virtual machine -(VM). Currently VM state identifies C function execution only, so Fast and Lua +(VM). Originally VM state identifies C function execution only, so Fast and Lua functions states are added. To determine currently allocating coroutine (that may not be equal to currently =================================================================== -- Best regards, Sergey Kaplun
[-- Attachment #1: Type: text/plain, Size: 18266 bytes --] Hi! Thanks for the patch, I've looked into https://github.com/tarantool/tarantool/blob/skaplun/gh-5442-luajit-memory-profiler-rfc/doc/rfc/5442-luajit-memory-profiler.md <https://github.com/tarantool/tarantool/blob/skaplun/gh-5442-luajit-memory-profiler-rfc/doc/rfc/5442-luajit-memory-profiler.md> in ‘Prerequisites’: > Also all, deallocations are reported as internal too. the comma is not needed > Lua developers can do nothing with allocations made inside the built-ins except reducing its usage. ‘its’ doesn’t explain exact matter. I would rephrase: "As for allocations made inside the built-ins user can do nothing but reduce use of these built-ins." > Currently VM state identifies C function execution only, so Fast and Lua functions states are added. ‘Currently’ -> ‘Originally’ Otherwise LGTM. Sergos > On 20 Jan 2021, at 11:19, Sergey Kaplun <skaplun@tarantool.org> wrote: > > Hi, Igor! > > Thanks for the review! > > On 15.01.21, Igor Munkin wrote: >> Sergey, >> >> Thanks for the changes. There is a bit of nitpicking below and I >> believe we'll push the next version doc to the trunk. > > I've fixed all your comments, plus added some insignificant fixes. > See two iterative patches below. Branch is force pushed. > >> >> On 25.12.20, Sergey Kaplun wrote: >>> Part of #5442 >>> --- >>> >>> RFC on branch: https://github.com/tarantool/tarantool/blob/skaplun/gh-5442-luajit-memory-profiler/doc/rfc/5442-luajit-memory-profiler.md <https://github.com/tarantool/tarantool/blob/skaplun/gh-5442-luajit-memory-profiler/doc/rfc/5442-luajit-memory-profiler.md> > > Side note: branch name is updated. > New RFC version: https://github.com/tarantool/tarantool/blob/skaplun/gh-5442-luajit-memory-profiler-rfc/doc/rfc/5442-luajit-memory-profiler.md <https://github.com/tarantool/tarantool/blob/skaplun/gh-5442-luajit-memory-profiler-rfc/doc/rfc/5442-luajit-memory-profiler.md> > >>> >>> Changes in v3: >>> * More comments in example. 
>>> * More verbose benchmark information. >>> * Grammar and spelling fixes. >>> >>> Changes in v2: >>> * Removed C API, Tarantool integration and description of additional >>> features -- they will be added in another RFC if necessary. >>> * Removed checking profile is running from the public API. >>> * Added benchmarks and more meaningful example. >>> * Grammar fixes. >>> >>> doc/rfc/5442-luajit-memory-profiler.md | 314 +++++++++++++++++++++++++ >>> 1 file changed, 314 insertions(+) >>> create mode 100644 doc/rfc/5442-luajit-memory-profiler.md >>> >>> diff --git a/doc/rfc/5442-luajit-memory-profiler.md b/doc/rfc/5442-luajit-memory-profiler.md >>> new file mode 100644 >>> index 000000000..85a61462a >>> --- /dev/null >>> +++ b/doc/rfc/5442-luajit-memory-profiler.md >>> @@ -0,0 +1,314 @@ >> >> <snipped> >> >>> +### Prerequisites >>> + >>> +This section describes additional changes in LuaJIT required for the feature >>> +implementation. This version of LuaJIT memory profiler does not support verbose >>> +reporting allocations from traces. All allocation from traces are reported as >> >> Typo: s/reporting allocations from/reporting for allocations made on/. > > Fixed, thanks! > >> >>> +internal. But trace code semantics should be totally the same as for the Lua >>> +interpreter (excluding sink optimizations). Also all deallocations reported as >> >> Typo: s/deallocations reported/deallocation are reported/. > > Fixed, thanks! > >> >>> +internal too. >>> + >>> +There are two different representations of functions in LuaJIT: the function's >>> +prototype (`GCproto`) and the function object so called closure (`GCfunc`). >>> +The closures are represented as `GCfuncL` and `GCfuncC` for Lua and C closures >>> +correspondingly. Also LuaJIT has a special function's type aka Fast Function. >> >> Typo: s/correspondingly/respectively/. >> >>> +It is used for LuaJIT builtins. >> >> It's better to not split this sentence. 
Consider the rewording: >> | Besides LuaJIT has a special function type a.k.a. Fast Function that >> | is used for LuaJIT builtins. > > Applied! Thanks! > >> >>> + >> >> <snipped> >> >>> +Usually developers are not interested in information about allocations inside >>> +builtins. So if fast function was called from a Lua function all >>> +allocations are attributed to this Lua function. Otherwise attribute this event >>> +to a C function. >> >> I propose the following rewording: >> | Lua developers can do nothing with allocations made inside the >> | builtins except reducing its usage. So if fast function is called from >> | a Lua function all allocations made in its scope are attributed to this >> | Lua function (i.e. the builtin caller). Otherwise this event is >> | attributed to a C function. >> > > Applied, thanks! > >>> + >> >> <snipped> >> >>> +If one run the chunk above the profiler reports approximately the following >> >> Typo: s/run/runs/. > > Fixed. > >> >>> +(see legend [here](#reading-and-displaying-saved-data)): >> >> <snipped> >> >>> +So we need to know a type of function being executed by the virtual machine >>> +(VM). Currently VM state identifies C function execution only, so Fast and Lua >>> +functions states will be added. >> >> Typo: s/will be/are/. > > Sure, thanks! > >> >>> + >>> +To determine currently allocating coroutine (that may not be equal to currently >>> +executed one) a new field called `mem_L` is added to `global_State` structure >>> +to keep the coroutine address. This field is set at each reallocation to >> >> Typo: /at each reallocation to/on each reallocation to the/. > > Fixed. > >> >>> +corresponding `L` with which it was called. >> >> Typo: s/it was/it is/. > > Thanks, fixed! > >> >>> + >> >> <snipped> >> >>> +When the profiling is stopped the `fclose()` is called. If it is impossible to >> >> Typo: s/the `fclose()`/`fclose()`/. > > Fixed. 
> >> >>> +open a file for writing or profiler fails to start, returns `nil` on failure >> >> Typo: s/returns `nil`/`nil` is returned/. > > Fixed. > >> >>> +(plus an error message as a second result and a system-dependent error code as >>> +a third result). Otherwise returns some true value. >> >> It would be nice to mention that the function contract is similar to >> other standart io.* interfaces. >> >> I glanced the source code: it's not "some" true value; it is exactly the >> *true* value. > > All right! Fixed. > >> >>> + >> >> <snipped> >> >>> +Memory profiler is expected to be thread safe, so it has a corresponding >>> +lock/unlock at internal mutex whenever you call corresponding memprof >>> +functions. If you want to build LuaJIT without thread safety use >>> +`-DLUAJIT_DISABLE_THREAD_SAFE`. >> >> This is not implemented in scope of the MVP, so drop this part. > > Done. > >> >>> + >>> +### Reading and displaying saved data >>> + >>> +Binary data can be read by `lj-parse-memprof` utility. It parses the binary >> >> Typo: s/lj-parse-memprof/luajit-parse-memprof/. > > Fixed, thanks! > >> >>> +format provided by memory profiler and render it on human-readable format. >> >> Typo: s/it on/it to/. > > Fixed, thanks! > >> >>> + >> >> <snipped> >> >>> +This table shows performance deviation in relation to REFerence value (before >>> +commit) with stopped and running profiler. The table shows the average value >>> +for 11 runs. The first field of the column indicates the change in the average >>> +time in seconds (less is better). The second field is the standard deviation >>> +for the found difference. 
>>> + >>> +``` >>> + Name | REF | AFTER, memprof off | AFTER, memprof on >>> +----------------+------+--------------------+------------------ >>> +array3d | 0.21 | +0.00 (0.01) | +0.00 (0.01) >>> +binary-trees | 3.25 | -0.01 (0.06) | +0.53 (0.10) >>> +chameneos | 2.97 | +0.14 (0.04) | +0.13 (0.06) >>> +coroutine-ring | 1.00 | +0.01 (0.04) | +0.01 (0.04) >>> +euler14-bit | 1.03 | +0.01 (0.02) | +0.00 (0.02) >>> +fannkuch | 6.81 | -0.21 (0.06) | -0.20 (0.06) >>> +fasta | 8.20 | -0.07 (0.05) | -0.08 (0.03) >> >> Side note: Still curious how this can happen. It looks OK when this is >> negative difference in within its deviation. But this is sorta magic. > > Yes, me too. Unfortunately, we have neither any benchmark tests nor > performance analisis for LuaJIT for now. > >> >>> +life | 0.46 | +0.00 (0.01) | +0.35 (0.01) >>> +mandelbrot | 2.65 | +0.00 (0.01) | +0.01 (0.01) >>> +mandelbrot-bit | 1.97 | +0.00 (0.01) | +0.01 (0.02) >>> +md5 | 1.58 | -0.01 (0.04) | -0.04 (0.04) >>> +nbody | 1.34 | +0.00 (0.01) | -0.02 (0.01) >>> +nsieve | 2.07 | -0.03 (0.03) | -0.01 (0.04) >>> +nsieve-bit | 1.50 | -0.02 (0.04) | +0.00 (0.04) >>> +nsieve-bit-fp | 4.44 | -0.03 (0.07) | -0.01 (0.07) >>> +partialsums | 0.54 | +0.00 (0.01) | +0.00 (0.01) >>> +pidigits-nogmp | 3.47 | -0.01 (0.02) | -0.10 (0.02) >>> +ray | 1.62 | -0.02 (0.03) | +0.00 (0.02) >>> +recursive-ack | 0.20 | +0.00 (0.01) | +0.00 (0.01) >>> +recursive-fib | 1.63 | +0.00 (0.01) | +0.01 (0.02) >>> +scimark-fft | 5.72 | +0.06 (0.09) | -0.01 (0.10) >>> +scimark-lu | 3.47 | +0.02 (0.27) | -0.03 (0.26) >>> +scimark-sor | 2.34 | +0.00 (0.01) | -0.01 (0.01) >>> +scimark-sparse | 4.95 | -0.02 (0.04) | -0.02 (0.04) >>> +series | 0.95 | +0.00 (0.02) | +0.00 (0.01) >>> +spectral-norm | 0.96 | +0.00 (0.02) | -0.01 (0.02) >>> +``` >>> -- >>> 2.28.0 >>> >> >> -- >> Best regards, >> IM > > =================================================================== > diff --git a/doc/rfc/5442-luajit-memory-profiler.md 
b/doc/rfc/5442-luajit-memory-profiler.md > index 85a61462a..2721f1cc1 100644 > --- a/doc/rfc/5442-luajit-memory-profiler.md > +++ b/doc/rfc/5442-luajit-memory-profiler.md > @@ -30,39 +30,39 @@ The whole toolchain of memory profiling will be divided into several parts: > > This section describes additional changes in LuaJIT required for the feature > implementation. This version of LuaJIT memory profiler does not support verbose > -reporting allocations from traces. All allocation from traces are reported as > -internal. But trace code semantics should be totally the same as for the Lua > -interpreter (excluding sink optimizations). Also all deallocations reported as > -internal too. > +reporting for allocations made on traces. All allocation from traces are > +reported as internal. But trace code semantics should be totally the same as > +for the Lua interpreter (excluding sink optimizations). Also all, deallocations > +are reported as internal too. > > There are two different representations of functions in LuaJIT: the function's > prototype (`GCproto`) and the function object so called closure (`GCfunc`). > The closures are represented as `GCfuncL` and `GCfuncC` for Lua and C closures > -correspondingly. Also LuaJIT has a special function's type aka Fast Function. > -It is used for LuaJIT builtins. > +respectively. Besides LuaJIT has a special function type, a.k.a. Fast Function > +that is used for LuaJIT built-ins > > Tail call optimization does not create a new call frame, so all allocations > inside the function called via `CALLT`/`CALLMT` are attributed to its caller. > > -Usually developers are not interested in information about allocations inside > -builtins. So if fast function was called from a Lua function all > -allocations are attributed to this Lua function. Otherwise attribute this event > -to a C function. > +Lua developers can do nothing with allocations made inside the built-ins except > +reducing its usage. 
So if fast function is called from a Lua function all > +allocations made in its scope are attributed to this Lua function (i.e. the > +built-in caller). Otherwise, this event is attributed to a C function. > > Assume we have the following Lua chunk named <test.lua>: > > -``` > +```lua > 1 jit.off() > 2 misc.memprof.start("memprof_new.bin") > -3 -- Lua does not create a new frame to call string.rep and all allocations are > -4 -- attributed not to `append()` function but to the parent scope. > +3 -- Lua does not create a new frame to call string.rep() and all allocations > +4 -- are attributed not to append() function but to the parent scope. > 5 local function append(str, rep) > 6 return string.rep(str, rep) > 7 end > 8 > 9 local t = {} > 10 for _ = 1, 1e5 do > -11 -- table.insert is a builtin and all corresponding allocations > +11 -- table.insert() is a built-in and all corresponding allocations > 12 -- are reported in the scope of main chunk > 13 table.insert(t, > 14 append('q', _) > @@ -71,7 +71,7 @@ Assume we have the following Lua chunk named <test.lua>: > 17 misc.memprof.stop() > ``` > > -If one run the chunk above the profiler reports approximately the following > +If one runs the chunk above the profiler reports approximately the following > (see legend [here](#reading-and-displaying-saved-data)): > ``` > ALLOCATIONS > @@ -99,15 +99,15 @@ INTERNAL: 20 0 1481 > > So we need to know a type of function being executed by the virtual machine > (VM). Currently VM state identifies C function execution only, so Fast and Lua > -functions states will be added. > +functions states are added. > > To determine currently allocating coroutine (that may not be equal to currently > executed one) a new field called `mem_L` is added to `global_State` structure > -to keep the coroutine address. This field is set at each reallocation to > -corresponding `L` with which it was called. > +to keep the coroutine address. 
This field is set on each reallocation to the > +corresponding `L` with which it is called. > > There is a static function (`lj_debug_getframeline`) that returns line number > -for current `BCPos` in `lj_debug.c` already. It will be added to the debug > +for current `BCPos` in `lj_debug.c` already. It is added to the debug > module API to be used in memory profiler. > > ### Information recording > @@ -211,10 +211,11 @@ local started, err, errno = misc.memprof.start(fname) > ``` > where `fname` is name of the file where profile events are written. Writer for > this function perform `fwrite()` for each call retrying in case of `EINTR`. > -When the profiling is stopped the `fclose()` is called. If it is impossible to > -open a file for writing or profiler fails to start, returns `nil` on failure > +When the profiling is stopped `fclose()` is called. The profiler's function's > +contract is similar to standard `io.*` interfaces. If it is impossible to open > +a file for writing or profiler fails to start, `nil` is returned on failure > (plus an error message as a second result and a system-dependent error code as > -a third result). Otherwise returns some true value. > +a third result). Otherwise, returns `true` value. > > Stopping profiler from Lua is simple too: > ```lua > @@ -230,17 +231,12 @@ If you want to build LuaJIT without memory profiler, you should build it with > `-DLUAJIT_DISABLE_MEMPROF`. If it is disabled `misc.memprof.start()` and > `misc.memprof.stop()` always return `false`. > > -Memory profiler is expected to be thread safe, so it has a corresponding > -lock/unlock at internal mutex whenever you call corresponding memprof > -functions. If you want to build LuaJIT without thread safety use > -`-DLUAJIT_DISABLE_THREAD_SAFE`. > - > ### Reading and displaying saved data > > -Binary data can be read by `lj-parse-memprof` utility. It parses the binary > -format provided by memory profiler and render it on human-readable format. 
> +Binary data can be read by `luajit-parse-memprof` utility. It parses the binary > +format provided by memory profiler and render it to human-readable format. > > -The usage is very simple: > +The usage for LuaJIT itself is very simple: > ``` > $ ./luajit-parse-memprof --help > luajit-parse-memprof - parser of the memory usage profile collected > @@ -266,6 +262,12 @@ structures. Note that events are sorted from the most often to the least. > > `Overrides` means what allocation this reallocation overrides. > > +If you want to parse binary data via Tarantool only, use the following > +command (dash is important): > +```bash > +$ tarantool -e 'require("memprof")(arg[1])' - memprof.bin > +``` > + > ## Benchmarks > > Benchmarks were taken from repo: > =================================================================== > > And one more iterative patch (over the previous one). Branch is > force pushed. > =================================================================== > diff --git a/doc/rfc/5442-luajit-memory-profiler.md b/doc/rfc/5442-luajit-memory-profiler.md > index 2721f1cc1..f9c43f91f 100644 > --- a/doc/rfc/5442-luajit-memory-profiler.md > +++ b/doc/rfc/5442-luajit-memory-profiler.md > @@ -5,7 +5,7 @@ > * **Authors**: Sergey Kaplun @Buristan skaplun@tarantool.org <mailto:skaplun@tarantool.org>, > Igor Munkin @igormunkin imun@tarantool.org <mailto:imun@tarantool.org>, > Sergey Ostanevich @sergos sergos@tarantool.org <mailto:sergos@tarantool.org> > -* **Issues**: [#5442](https://github.com/tarantool/tarantool/issues/5442 <https://github.com/tarantool/tarantool/issues/5442>) > +* **Issues**: [#5442](https://github.com/tarantool/tarantool/issues/5442 <https://github.com/tarantool/tarantool/issues/5442>), [#5490](https://github.com/tarantool/tarantool/issues/5490 <https://github.com/tarantool/tarantool/issues/5490>) > > ## Summary > =================================================================== > -- > Best regards, > Sergey Kaplun [-- Attachment #2: Type: 
text/html, Size: 174277 bytes --]
Hi, Igor! Thanks for the review! On 15.01.21, Igor Munkin wrote: > Sergey, > > Thanks for the changes. There is a bit of nitpicking below and I > believe we'll push the next version doc to the trunk. I've fixed all your comments, plus added some insignificant fixes. See two iterative patches below. Branch is force pushed. > > On 25.12.20, Sergey Kaplun wrote: > > Part of #5442 > > --- > > > > RFC on branch: https://github.com/tarantool/tarantool/blob/skaplun/gh-5442-luajit-memory-profiler/doc/rfc/5442-luajit-memory-profiler.md Side note: branch name is updated. New RFC version: https://github.com/tarantool/tarantool/blob/skaplun/gh-5442-luajit-memory-profiler-rfc/doc/rfc/5442-luajit-memory-profiler.md > > > > Changes in v3: > > * More comments in example. > > * More verbose benchmark information. > > * Grammar and spelling fixes. > > > > Changes in v2: > > * Removed C API, Tarantool integration and description of additional > > features -- they will be added in another RFC if necessary. > > * Removed checking profile is running from the public API. > > * Added benchmarks and more meaningful example. > > * Grammar fixes. > > > > doc/rfc/5442-luajit-memory-profiler.md | 314 +++++++++++++++++++++++++ > > 1 file changed, 314 insertions(+) > > create mode 100644 doc/rfc/5442-luajit-memory-profiler.md > > > > diff --git a/doc/rfc/5442-luajit-memory-profiler.md b/doc/rfc/5442-luajit-memory-profiler.md > > new file mode 100644 > > index 000000000..85a61462a > > --- /dev/null > > +++ b/doc/rfc/5442-luajit-memory-profiler.md > > @@ -0,0 +1,314 @@ > > <snipped> > > > +### Prerequisites > > + > > +This section describes additional changes in LuaJIT required for the feature > > +implementation. This version of LuaJIT memory profiler does not support verbose > > +reporting allocations from traces. All allocation from traces are reported as > > Typo: s/reporting allocations from/reporting for allocations made on/. Fixed, thanks! > > > +internal. 
But trace code semantics should be totally the same as for the Lua > > +interpreter (excluding sink optimizations). Also all deallocations reported as > > Typo: s/deallocations reported/deallocation are reported/. Fixed, thanks! > > > +internal too. > > + > > +There are two different representations of functions in LuaJIT: the function's > > +prototype (`GCproto`) and the function object so called closure (`GCfunc`). > > +The closures are represented as `GCfuncL` and `GCfuncC` for Lua and C closures > > +correspondingly. Also LuaJIT has a special function's type aka Fast Function. > > Typo: s/correspondingly/respectively/. > > > +It is used for LuaJIT builtins. > > It's better to not split this sentence. Consider the rewording: > | Besides LuaJIT has a special function type a.k.a. Fast Function that > | is used for LuaJIT builtins. Applied! Thanks! > > > + > > <snipped> > > > +Usually developers are not interested in information about allocations inside > > +builtins. So if fast function was called from a Lua function all > > +allocations are attributed to this Lua function. Otherwise attribute this event > > +to a C function. > > I propose the following rewording: > | Lua developers can do nothing with allocations made inside the > | builtins except reducing its usage. So if fast function is called from > | a Lua function all allocations made in its scope are attributed to this > | Lua function (i.e. the builtin caller). Otherwise this event is > | attributed to a C function. > Applied, thanks! > > + > > <snipped> > > > +If one run the chunk above the profiler reports approximately the following > > Typo: s/run/runs/. Fixed. > > > +(see legend [here](#reading-and-displaying-saved-data)): > > <snipped> > > > +So we need to know a type of function being executed by the virtual machine > > +(VM). Currently VM state identifies C function execution only, so Fast and Lua > > +functions states will be added. > > Typo: s/will be/are/. Sure, thanks! 
> > > + > > +To determine currently allocating coroutine (that may not be equal to currently > > +executed one) a new field called `mem_L` is added to `global_State` structure > > +to keep the coroutine address. This field is set at each reallocation to > > Typo: /at each reallocation to/on each reallocation to the/. Fixed. > > > +corresponding `L` with which it was called. > > Typo: s/it was/it is/. Thanks, fixed! > > > + > > <snipped> > > > +When the profiling is stopped the `fclose()` is called. If it is impossible to > > Typo: s/the `fclose()`/`fclose()`/. Fixed. > > > +open a file for writing or profiler fails to start, returns `nil` on failure > > Typo: s/returns `nil`/`nil` is returned/. Fixed. > > > +(plus an error message as a second result and a system-dependent error code as > > +a third result). Otherwise returns some true value. > > It would be nice to mention that the function contract is similar to > other standart io.* interfaces. > > I glanced the source code: it's not "some" true value; it is exactly the > *true* value. All right! Fixed. > > > + > > <snipped> > > > +Memory profiler is expected to be thread safe, so it has a corresponding > > +lock/unlock at internal mutex whenever you call corresponding memprof > > +functions. If you want to build LuaJIT without thread safety use > > +`-DLUAJIT_DISABLE_THREAD_SAFE`. > > This is not implemented in scope of the MVP, so drop this part. Done. > > > + > > +### Reading and displaying saved data > > + > > +Binary data can be read by `lj-parse-memprof` utility. It parses the binary > > Typo: s/lj-parse-memprof/luajit-parse-memprof/. Fixed, thanks! > > > +format provided by memory profiler and render it on human-readable format. > > Typo: s/it on/it to/. Fixed, thanks! > > > + > > <snipped> > > > +This table shows performance deviation in relation to REFerence value (before > > +commit) with stopped and running profiler. The table shows the average value > > +for 11 runs. 
The first field of the column indicates the change in the average > > +time in seconds (less is better). The second field is the standard deviation > > +for the found difference. > > + > > +``` > > + Name | REF | AFTER, memprof off | AFTER, memprof on > > +----------------+------+--------------------+------------------ > > +array3d | 0.21 | +0.00 (0.01) | +0.00 (0.01) > > +binary-trees | 3.25 | -0.01 (0.06) | +0.53 (0.10) > > +chameneos | 2.97 | +0.14 (0.04) | +0.13 (0.06) > > +coroutine-ring | 1.00 | +0.01 (0.04) | +0.01 (0.04) > > +euler14-bit | 1.03 | +0.01 (0.02) | +0.00 (0.02) > > +fannkuch | 6.81 | -0.21 (0.06) | -0.20 (0.06) > > +fasta | 8.20 | -0.07 (0.05) | -0.08 (0.03) > > Side note: Still curious how this can happen. It looks OK when this is > negative difference in within its deviation. But this is sorta magic. Yes, me too. Unfortunately, we have neither any benchmark tests nor performance analisis for LuaJIT for now. > > > +life | 0.46 | +0.00 (0.01) | +0.35 (0.01) > > +mandelbrot | 2.65 | +0.00 (0.01) | +0.01 (0.01) > > +mandelbrot-bit | 1.97 | +0.00 (0.01) | +0.01 (0.02) > > +md5 | 1.58 | -0.01 (0.04) | -0.04 (0.04) > > +nbody | 1.34 | +0.00 (0.01) | -0.02 (0.01) > > +nsieve | 2.07 | -0.03 (0.03) | -0.01 (0.04) > > +nsieve-bit | 1.50 | -0.02 (0.04) | +0.00 (0.04) > > +nsieve-bit-fp | 4.44 | -0.03 (0.07) | -0.01 (0.07) > > +partialsums | 0.54 | +0.00 (0.01) | +0.00 (0.01) > > +pidigits-nogmp | 3.47 | -0.01 (0.02) | -0.10 (0.02) > > +ray | 1.62 | -0.02 (0.03) | +0.00 (0.02) > > +recursive-ack | 0.20 | +0.00 (0.01) | +0.00 (0.01) > > +recursive-fib | 1.63 | +0.00 (0.01) | +0.01 (0.02) > > +scimark-fft | 5.72 | +0.06 (0.09) | -0.01 (0.10) > > +scimark-lu | 3.47 | +0.02 (0.27) | -0.03 (0.26) > > +scimark-sor | 2.34 | +0.00 (0.01) | -0.01 (0.01) > > +scimark-sparse | 4.95 | -0.02 (0.04) | -0.02 (0.04) > > +series | 0.95 | +0.00 (0.02) | +0.00 (0.01) > > +spectral-norm | 0.96 | +0.00 (0.02) | -0.01 (0.02) > > +``` > > -- > > 2.28.0 > > > > -- > Best 
regards, > IM =================================================================== diff --git a/doc/rfc/5442-luajit-memory-profiler.md b/doc/rfc/5442-luajit-memory-profiler.md index 85a61462a..2721f1cc1 100644 --- a/doc/rfc/5442-luajit-memory-profiler.md +++ b/doc/rfc/5442-luajit-memory-profiler.md @@ -30,39 +30,39 @@ The whole toolchain of memory profiling will be divided into several parts: This section describes additional changes in LuaJIT required for the feature implementation. This version of LuaJIT memory profiler does not support verbose -reporting allocations from traces. All allocation from traces are reported as -internal. But trace code semantics should be totally the same as for the Lua -interpreter (excluding sink optimizations). Also all deallocations reported as -internal too. +reporting for allocations made on traces. All allocation from traces are +reported as internal. But trace code semantics should be totally the same as +for the Lua interpreter (excluding sink optimizations). Also all, deallocations +are reported as internal too. There are two different representations of functions in LuaJIT: the function's prototype (`GCproto`) and the function object so called closure (`GCfunc`). The closures are represented as `GCfuncL` and `GCfuncC` for Lua and C closures -correspondingly. Also LuaJIT has a special function's type aka Fast Function. -It is used for LuaJIT builtins. +respectively. Besides LuaJIT has a special function type, a.k.a. Fast Function +that is used for LuaJIT built-ins Tail call optimization does not create a new call frame, so all allocations inside the function called via `CALLT`/`CALLMT` are attributed to its caller. -Usually developers are not interested in information about allocations inside -builtins. So if fast function was called from a Lua function all -allocations are attributed to this Lua function. Otherwise attribute this event -to a C function. 
+Lua developers can do nothing with allocations made inside the built-ins except +reducing its usage. So if fast function is called from a Lua function all +allocations made in its scope are attributed to this Lua function (i.e. the +built-in caller). Otherwise, this event is attributed to a C function. Assume we have the following Lua chunk named <test.lua>: -``` +```lua 1 jit.off() 2 misc.memprof.start("memprof_new.bin") -3 -- Lua does not create a new frame to call string.rep and all allocations are -4 -- attributed not to `append()` function but to the parent scope. +3 -- Lua does not create a new frame to call string.rep() and all allocations +4 -- are attributed not to append() function but to the parent scope. 5 local function append(str, rep) 6 return string.rep(str, rep) 7 end 8 9 local t = {} 10 for _ = 1, 1e5 do -11 -- table.insert is a builtin and all corresponding allocations +11 -- table.insert() is a built-in and all corresponding allocations 12 -- are reported in the scope of main chunk 13 table.insert(t, 14 append('q', _) @@ -71,7 +71,7 @@ Assume we have the following Lua chunk named <test.lua>: 17 misc.memprof.stop() ``` -If one run the chunk above the profiler reports approximately the following +If one runs the chunk above the profiler reports approximately the following (see legend [here](#reading-and-displaying-saved-data)): ``` ALLOCATIONS @@ -99,15 +99,15 @@ INTERNAL: 20 0 1481 So we need to know a type of function being executed by the virtual machine (VM). Currently VM state identifies C function execution only, so Fast and Lua -functions states will be added. +functions states are added. To determine currently allocating coroutine (that may not be equal to currently executed one) a new field called `mem_L` is added to `global_State` structure -to keep the coroutine address. This field is set at each reallocation to -corresponding `L` with which it was called. +to keep the coroutine address. 
This field is set on each reallocation to the +corresponding `L` with which it is called. There is a static function (`lj_debug_getframeline`) that returns line number -for current `BCPos` in `lj_debug.c` already. It will be added to the debug +for current `BCPos` in `lj_debug.c` already. It is added to the debug module API to be used in memory profiler. ### Information recording @@ -211,10 +211,11 @@ local started, err, errno = misc.memprof.start(fname) ``` where `fname` is name of the file where profile events are written. Writer for this function perform `fwrite()` for each call retrying in case of `EINTR`. -When the profiling is stopped the `fclose()` is called. If it is impossible to -open a file for writing or profiler fails to start, returns `nil` on failure +When the profiling is stopped `fclose()` is called. The profiler's function's +contract is similar to standard `io.*` interfaces. If it is impossible to open +a file for writing or profiler fails to start, `nil` is returned on failure (plus an error message as a second result and a system-dependent error code as -a third result). Otherwise returns some true value. +a third result). Otherwise, returns `true` value. Stopping profiler from Lua is simple too: ```lua @@ -230,17 +231,12 @@ If you want to build LuaJIT without memory profiler, you should build it with `-DLUAJIT_DISABLE_MEMPROF`. If it is disabled `misc.memprof.start()` and `misc.memprof.stop()` always return `false`. -Memory profiler is expected to be thread safe, so it has a corresponding -lock/unlock at internal mutex whenever you call corresponding memprof -functions. If you want to build LuaJIT without thread safety use -`-DLUAJIT_DISABLE_THREAD_SAFE`. - ### Reading and displaying saved data -Binary data can be read by `lj-parse-memprof` utility. It parses the binary -format provided by memory profiler and render it on human-readable format. +Binary data can be read by `luajit-parse-memprof` utility. 
It parses the binary +format provided by memory profiler and render it to human-readable format. -The usage is very simple: +The usage for LuaJIT itself is very simple: ``` $ ./luajit-parse-memprof --help luajit-parse-memprof - parser of the memory usage profile collected @@ -266,6 +262,12 @@ structures. Note that events are sorted from the most often to the least. `Overrides` means what allocation this reallocation overrides. +If you want to parse binary data via Tarantool only, use the following +command (dash is important): +```bash +$ tarantool -e 'require("memprof")(arg[1])' - memprof.bin +``` + ## Benchmarks Benchmarks were taken from repo: =================================================================== And one more iterative patch (over the previous one). Branch is force pushed. =================================================================== diff --git a/doc/rfc/5442-luajit-memory-profiler.md b/doc/rfc/5442-luajit-memory-profiler.md index 2721f1cc1..f9c43f91f 100644 --- a/doc/rfc/5442-luajit-memory-profiler.md +++ b/doc/rfc/5442-luajit-memory-profiler.md @@ -5,7 +5,7 @@ * **Authors**: Sergey Kaplun @Buristan skaplun@tarantool.org, Igor Munkin @igormunkin imun@tarantool.org, Sergey Ostanevich @sergos sergos@tarantool.org -* **Issues**: [#5442](https://github.com/tarantool/tarantool/issues/5442) +* **Issues**: [#5442](https://github.com/tarantool/tarantool/issues/5442), [#5490](https://github.com/tarantool/tarantool/issues/5490) ## Summary =================================================================== -- Best regards, Sergey Kaplun
Sergey, Thanks for the changes. There is a bit of nitpicking below and I believe we'll push the next version doc to the trunk. On 25.12.20, Sergey Kaplun wrote: > Part of #5442 > --- > > RFC on branch: https://github.com/tarantool/tarantool/blob/skaplun/gh-5442-luajit-memory-profiler/doc/rfc/5442-luajit-memory-profiler.md > > Changes in v3: > * More comments in example. > * More verbose benchmark information. > * Grammar and spelling fixes. > > Changes in v2: > * Removed C API, Tarantool integration and description of additional > features -- they will be added in another RFC if necessary. > * Removed checking profile is running from the public API. > * Added benchmarks and more meaningful example. > * Grammar fixes. > > doc/rfc/5442-luajit-memory-profiler.md | 314 +++++++++++++++++++++++++ > 1 file changed, 314 insertions(+) > create mode 100644 doc/rfc/5442-luajit-memory-profiler.md > > diff --git a/doc/rfc/5442-luajit-memory-profiler.md b/doc/rfc/5442-luajit-memory-profiler.md > new file mode 100644 > index 000000000..85a61462a > --- /dev/null > +++ b/doc/rfc/5442-luajit-memory-profiler.md > @@ -0,0 +1,314 @@ <snipped> > +### Prerequisites > + > +This section describes additional changes in LuaJIT required for the feature > +implementation. This version of LuaJIT memory profiler does not support verbose > +reporting allocations from traces. All allocation from traces are reported as Typo: s/reporting allocations from/reporting for allocations made on/. > +internal. But trace code semantics should be totally the same as for the Lua > +interpreter (excluding sink optimizations). Also all deallocations reported as Typo: s/deallocations reported/deallocation are reported/. > +internal too. > + > +There are two different representations of functions in LuaJIT: the function's > +prototype (`GCproto`) and the function object so called closure (`GCfunc`). > +The closures are represented as `GCfuncL` and `GCfuncC` for Lua and C closures > +correspondingly. 
Also LuaJIT has a special function's type aka Fast Function. Typo: s/correspondingly/respectively/. > +It is used for LuaJIT builtins. It's better to not split this sentence. Consider the rewording: | Besides LuaJIT has a special function type a.k.a. Fast Function that | is used for LuaJIT builtins. > + <snipped> > +Usually developers are not interested in information about allocations inside > +builtins. So if fast function was called from a Lua function all > +allocations are attributed to this Lua function. Otherwise attribute this event > +to a C function. I propose the following rewording: | Lua developers can do nothing with allocations made inside the | builtins except reducing its usage. So if fast function is called from | a Lua function all allocations made in its scope are attributed to this | Lua function (i.e. the builtin caller). Otherwise this event is | attributed to a C function. > + <snipped> > +If one run the chunk above the profiler reports approximately the following Typo: s/run/runs/. > +(see legend [here](#reading-and-displaying-saved-data)): <snipped> > +So we need to know a type of function being executed by the virtual machine > +(VM). Currently VM state identifies C function execution only, so Fast and Lua > +functions states will be added. Typo: s/will be/are/. > + > +To determine currently allocating coroutine (that may not be equal to currently > +executed one) a new field called `mem_L` is added to `global_State` structure > +to keep the coroutine address. This field is set at each reallocation to Typo: /at each reallocation to/on each reallocation to the/. > +corresponding `L` with which it was called. Typo: s/it was/it is/. > + <snipped> > +When the profiling is stopped the `fclose()` is called. If it is impossible to Typo: s/the `fclose()`/`fclose()`/. > +open a file for writing or profiler fails to start, returns `nil` on failure Typo: s/returns `nil`/`nil` is returned/. 
> +(plus an error message as a second result and a system-dependent error code as > +a third result). Otherwise returns some true value. It would be nice to mention that the function contract is similar to other standart io.* interfaces. I glanced the source code: it's not "some" true value; it is exactly the *true* value. > + <snipped> > +Memory profiler is expected to be thread safe, so it has a corresponding > +lock/unlock at internal mutex whenever you call corresponding memprof > +functions. If you want to build LuaJIT without thread safety use > +`-DLUAJIT_DISABLE_THREAD_SAFE`. This is not implemented in scope of the MVP, so drop this part. > + > +### Reading and displaying saved data > + > +Binary data can be read by `lj-parse-memprof` utility. It parses the binary Typo: s/lj-parse-memprof/luajit-parse-memprof/. > +format provided by memory profiler and render it on human-readable format. Typo: s/it on/it to/. > + <snipped> > +This table shows performance deviation in relation to REFerence value (before > +commit) with stopped and running profiler. The table shows the average value > +for 11 runs. The first field of the column indicates the change in the average > +time in seconds (less is better). The second field is the standard deviation > +for the found difference. > + > +``` > + Name | REF | AFTER, memprof off | AFTER, memprof on > +----------------+------+--------------------+------------------ > +array3d | 0.21 | +0.00 (0.01) | +0.00 (0.01) > +binary-trees | 3.25 | -0.01 (0.06) | +0.53 (0.10) > +chameneos | 2.97 | +0.14 (0.04) | +0.13 (0.06) > +coroutine-ring | 1.00 | +0.01 (0.04) | +0.01 (0.04) > +euler14-bit | 1.03 | +0.01 (0.02) | +0.00 (0.02) > +fannkuch | 6.81 | -0.21 (0.06) | -0.20 (0.06) > +fasta | 8.20 | -0.07 (0.05) | -0.08 (0.03) Side note: Still curious how this can happen. It looks OK when this is negative difference in within its deviation. But this is sorta magic. 
> +life | 0.46 | +0.00 (0.01) | +0.35 (0.01) > +mandelbrot | 2.65 | +0.00 (0.01) | +0.01 (0.01) > +mandelbrot-bit | 1.97 | +0.00 (0.01) | +0.01 (0.02) > +md5 | 1.58 | -0.01 (0.04) | -0.04 (0.04) > +nbody | 1.34 | +0.00 (0.01) | -0.02 (0.01) > +nsieve | 2.07 | -0.03 (0.03) | -0.01 (0.04) > +nsieve-bit | 1.50 | -0.02 (0.04) | +0.00 (0.04) > +nsieve-bit-fp | 4.44 | -0.03 (0.07) | -0.01 (0.07) > +partialsums | 0.54 | +0.00 (0.01) | +0.00 (0.01) > +pidigits-nogmp | 3.47 | -0.01 (0.02) | -0.10 (0.02) > +ray | 1.62 | -0.02 (0.03) | +0.00 (0.02) > +recursive-ack | 0.20 | +0.00 (0.01) | +0.00 (0.01) > +recursive-fib | 1.63 | +0.00 (0.01) | +0.01 (0.02) > +scimark-fft | 5.72 | +0.06 (0.09) | -0.01 (0.10) > +scimark-lu | 3.47 | +0.02 (0.27) | -0.03 (0.26) > +scimark-sor | 2.34 | +0.00 (0.01) | -0.01 (0.01) > +scimark-sparse | 4.95 | -0.02 (0.04) | -0.02 (0.04) > +series | 0.95 | +0.00 (0.02) | +0.00 (0.01) > +spectral-norm | 0.96 | +0.00 (0.02) | -0.01 (0.02) > +``` > -- > 2.28.0 > -- Best regards, IM
On Sat, Jan 09, 2021 at 09:59:34PM +0300, Cyrill Gorcunov wrote:
> Please ignore this message. There are going to be a number of such
> messages while we're moving to a new server.
Another one.
Cyrill
Please ignore this message. There are going to be a number of such messages while we're moving to a new server. Cyrill
Sorry. Please, drop this message. It was sent by mistake. -- Best regards, Sergey Kaplun
This patch embeds a parser for binary data dumped via the memory profiler to Tarantool binary. It is a set of the following Lua modules: * utils/bufread.lua: read binary data from a file. * utils/symtab.lua: symbol table decode functions * memprof/parse.lua: decode the memory profiler event stream * memprof/humanize.lua: display decoded data in human readable format * memprof.lua: Lua script and module to display data It is launched with the following command: $ tarantool -e 'require("memprof")(arg[1])' - filename.bin Closed #5490 --- Issue: https://github.com/tarantool/tarantool/issues/5490 Branch: https://github.com/tarantool/tarantool/tree/skaplun/gh-5442-luajit-memory-profiler CI: https://gitlab.com/tarantool/tarantool/-/commits/skaplun/gh-5442-luajit-memory-profiler src/CMakeLists.txt | 6 ++++++ src/lua/init.c | 15 ++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b95688c1a..9a712bc29 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -61,6 +61,12 @@ lua_source(lua_sources "${CMAKE_BINARY_DIR}/third_party/luajit/src/jit/vmdef.lua lua_source(lua_sources "${CMAKE_BINARY_DIR}/third_party/luajit/src/jit/v.lua") lua_source(lua_sources "${CMAKE_BINARY_DIR}/third_party/luajit/src/jit/p.lua") lua_source(lua_sources "${CMAKE_BINARY_DIR}/third_party/luajit/src/jit/zone.lua") +# LuaJIT tools.* library +lua_source(lua_sources "${CMAKE_BINARY_DIR}/third_party/luajit/tools/memprof.lua") +lua_source(lua_sources "${CMAKE_BINARY_DIR}/third_party/luajit/tools/memprof/humanize.lua") +lua_source(lua_sources "${CMAKE_BINARY_DIR}/third_party/luajit/tools/memprof/parse.lua") +lua_source(lua_sources "${CMAKE_BINARY_DIR}/third_party/luajit/tools/utils/bufread.lua") +lua_source(lua_sources "${CMAKE_BINARY_DIR}/third_party/luajit/tools/utils/symtab.lua") add_custom_target(generate_lua_sources WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/src/box diff --git a/src/lua/init.c b/src/lua/init.c index a0b2fc775..82075c595 
100644 --- a/src/lua/init.c +++ b/src/lua/init.c @@ -121,7 +121,14 @@ extern char strict_lua[], string_lua[], swim_lua[], p_lua[], /* LuaJIT 2.1 profiler */ - zone_lua[] /* LuaJIT 2.1 profiler */; + zone_lua[], /* LuaJIT 2.1 profiler */ + /* tools.* libraries. */ + bufread_lua[], + symtab_lua[], + parse_lua[], + humanize_lua[], + memprof_lua[] +; static const char *lua_modules[] = { /* Make it first to affect load of all other modules */ @@ -167,6 +174,12 @@ static const char *lua_modules[] = { /* Profiler */ "jit.p", p_lua, "jit.zone", zone_lua, + /* tools.* libraries. Order is important. */ + "utils.bufread", bufread_lua, + "utils.symtab", symtab_lua, + "memprof.parse", parse_lua, + "memprof.humanize", humanize_lua, + "memprof", memprof_lua, NULL }; -- 2.28.0
Hi, Igor! Thanks for the review! On 21.12.20, Igor Munkin wrote: > Sergey, > > Thanks for the new version! The design is fine, but please consider my > minor comments regarding the document itself. Please, see new version here [1]. > > On 16.12.20, Sergey Kaplun wrote: > > Part of #5442 > > --- > > > > Issues: https://github.com/tarantool/tarantool/issues/5442 > > Branch: https://github.com/tarantool/tarantool/tree/skaplun/gh-5442-luajit-memory-profiler > > > > Changes in v2: > > * Removed C API, Tarantool integration and description of additional > > features -- they will be added in another RFC if necessary. > > * Removed checking profile is running from the public API. > > * Added benchmarks and more meaningful example. > > * Grammar fixes. > > > > doc/rfc/5442-luajit-memory-profiler.md | 306 +++++++++++++++++++++++++ > > 1 file changed, 306 insertions(+) > > create mode 100644 doc/rfc/5442-luajit-memory-profiler.md > > > > diff --git a/doc/rfc/5442-luajit-memory-profiler.md b/doc/rfc/5442-luajit-memory-profiler.md > > new file mode 100644 > > index 000000000..720105009 > > --- /dev/null > > +++ b/doc/rfc/5442-luajit-memory-profiler.md > > @@ -0,0 +1,306 @@ > > <snipped> > > > +## Detailed design > > + > > +The whole toolchain of memory profiling will be divided by several parts: > > +1) Prerequisites. > > +2) Recording information about memory usage and saving it. > > +3) Reading saved data and display it in human-readable format. > > I believe the links would also be great here. Done in the new version. > > > + > > +### Prerequisites > > + > > +This section describes additional changes in LuaJIT required to feature > > +implementation. This version of LuaJIT memory profiler does not support > > +reporting allocations from traces. But trace code semantics should be totally > > +the same as for Lua interpreter. So profiling with `jit.off()` should be > > +enough. > > From this part I guess nothing works with enabled JIT. 
As a result of > the offline discussion it occurs everything works but allocation on > traces are attributed as internal ones. I think this is not a big deal > since JIT is a black internal box. Please mention this explicitly above. > > Minor: It's worth to mention that there might be less GC pressure while > trace execution (considering allocation sinking optimization impact), > but the traces itself are allocated in GC area. Added, thanks! > > > + > > <snipped> > > > + > > +Fast function allocation events always belong to the previous frame with > > +considering of tail call optimizations (TCO). > > Still don't get it. In the previous version I asked to explicitly > mention that allocations in builtins are attributed with their caller. > And another case is a tail call optimization, when allocations are also > attributed with the caller. > > So, the example is fine (but comments are desired, IMHO), but I suggest > to separate these cases in the passage above. Reformulated in the new version as we discussed offline. 
> > > + > > +Assume we have the following Lua chunk named <test.lua>: > > + > > +``` > > +1 jit.off() > > +2 misc.memprof.start("memprof_new.bin") > > +3 local function append(str, rep) > > +4 return string.rep(str, rep) > > +5 end > > +6 > > +7 local t = {} > > +8 for _ = 1, 1e5 do > > +9 table.insert(t, > > +10 append('q', _) > > +11 ) > > +12 end > > +13 misc.memprof.stop() > > +``` > > + > > +Profilers output is like the follows: > > +``` > > +ALLOCATIONS > > +@test.lua:0, line 10: 100007 5004638934 0 > > +@test.lua:0, line 5: 1 40 0 > > +@test.lua:0, line 7: 1 72 0 > > +@test.lua:0, line 9: 1 48 0 > > + > > +REALLOCATIONS > > +@test.lua:0, line 9: 16 4194496 2097376 > > + Overrides: > > + @test.lua:0, line 9 > > + > > +@test.lua:0, line 10: 12 262080 131040 > > + Overrides: > > + @test.lua:0, line 10 > > + > > + > > +DEALLOCATIONS > > +INTERNAL: 21 0 2463 > > +@test.lua:0, line 10: 8 0 1044480 > > + Overrides: > > + @test.lua:0, line 10 > > +``` > > I have no idea what is dumped here. There is a legend below (in reading > and displaying saved data section), so leave a link to it right here. > Otherwise this is some kind of elvish. Added link. > > > + > > +In Lua functions for profile events, we had to determine the line number of the > > +function definition and corresponding `GCproto` address. For C functions only > > +address will be enough. If Fast function is called from Lua function we had to > > +report the Lua function for more meaningful output. Otherwise report the C > > +function. > > Still don't understand two last sentences. In the previous reply you > told this relates to the part above (about the fast functions). You > reworded that part, but this is still left unclear. Reformulated in the new verison as we discussed offline. > > > + > > +So we need to know in what type of function CALL/RETURN virtual machine (VM) > > AFAICS the function type is enough, isn't it? 
As you mentioned above > allocation are attributed to the callers in case of tail call > optimization. Reformulated in the new verison as we discussed offline. > > > +is. LuaJIT has already determined C function execution VM state but neither > > +Fast functions nor Lua function. So corresponding VM states will be added. > > <snipped> > > > +### Information recording > > + > > +Each allocate/reallocate/free is considered as a type of event that are > > +reported. Event stream has the following format: > > + > > +```c > > +/* > > +** Event stream format: > > +** > > +** stream := symtab memprof > > +** symtab := see <ljp_symtab.h> > > +** memprof := prologue event* epilogue > > +** prologue := 'l' 'j' 'm' version reserved > > +** version := <BYTE> > > +** reserved := <BYTE> <BYTE> <BYTE> > > +** event := event-alloc | event-realloc | event-free > > +** event-alloc := event-header loc? naddr nsize > > +** event-realloc := event-header loc? oaddr osize naddr nsize > > +** event-free := event-header loc? oaddr osize > > +** event-header := <BYTE> > > +** loc := loc-lua | loc-c > > +** loc-lua := sym-addr line-no > > +** loc-c := sym-addr > > +** sym-addr := <ULEB128> > > +** line-no := <ULEB128> > > +** oaddr := <ULEB128> > > +** naddr := <ULEB128> > > +** osize := <ULEB128> > > +** nsize := <ULEB128> > > +** epilogue := event-header > > +** > > +** <BYTE> : A single byte (no surprises here) > > +** <ULEB128>: Unsigned integer represented in ULEB128 encoding > > +** > > +** (Order of bits below is hi -> lo) > > +** > > +** version: [VVVVVVVV] > > +** * VVVVVVVV: Byte interpreted as a plain integer version number > > +** > > +** event-header: [FTUUSSEE] > > +** * EE : 2 bits for representing allocation event type (AEVENT_*) > > +** * SS : 2 bits for representing allocation source type (ASOURCE_*) > > +** * UU : 2 unused bits > > +** * T : Reserved. 0 for regular events, 1 for the events marked with > > +** the timestamp mark. 
It is assumed that the time distance between > > +** two marked events is approximately the same and is equal > > +** to 1 second. Always zero for now. > > It looks this is zero always in our case, so we have 3 unused bits, > right? Then just drop this part. I've dropped it according your and Sergos comments. > <snipped> > > +Starting profiler from Lua is quite simple: > > +```lua > > +local started, err = misc.memprof.start(fname) > > +``` > > +Where `fname` is name of the file where profile events are written. Writer for > > +this function perform `fwrite()` for each call retrying in case of `EINTR`. > > +Final callback calls `fclose()` at the end of profiling. If it is impossible to > > +open a file for writing or profiler fails to start, returns `nil` on failure > > +(plus an error message as a second result and a system-dependent error code as > > +a third result). Otherwise returns some true value. > > What the heck is the third result and why is it ignored in the example > above? I think this error should be incorporated in the second argument > kinda <perror> does. I've used the same formulation as in Lua Reference Manual [2]. It's errno, in fact, but IINM it is system dependent. > > > + > > +Stopping profiler from Lua is simple too: > > +```lua > > +local stopped, err = misc.memprof.stop() > > +``` > > + > > +If there is any error occurred at profiling stopping (an error when file > > +descriptor was closed) `memprof.stop()` returns `nil` (plus an error message as > > +a second result and a system-dependent error code as a third result). Returns > > +`true` otherwise. > > Ditto. Same meaning as above. > > > + > > <snipped> > > > +Memory profiler is expected to be thread safe, so it has a corresponding > > +lock/unlock at internal mutex whenever you call `luaM_memprof_*`. If you want > > +to build LuaJIT without thread safety use `-DLUAJIT_DISABLE_THREAD_SAFE`. > > I see no luaM_memprof_* interfaces. So what is this thread safety for? 
It is used if you've created different VMs and try to start profiling from different threads. But unfortunately for now we have one static structure for all threads. It is necessary to avoid pollution from other profiled VMs. > > > + > > <snipped> > > > +## Benchmarks > > + > > +Benchmarks were taken from repo: > > +[LuaJIT-test-cleanup](https://github.com/LuaJIT/LuaJIT-test-cleanup). > > + > > +Example of usage: > > +```bash > > +/usr/bin/time -f"array3d %U" ./luajit $BENCH_DIR/array3d.lua 300 >/dev/null > > +``` > > + > > +Benchmark results before and after the patch (less is better): > > There are some considerable differences in benchmark results, so I have > several questions: > * Mention whether you tested with enabled JIT or not Enabled of course. For each test I've added this preamble: | local start = misc.memprof.start("/tmp/memprof_tmp.bin") | assert(start) and this postamble: | local stop = misc.memprof.stop() | assert(stop) > * How many iterations did you make? What is the dispersion/noise for > these runs? In the new version I've run 11 iterations for each test. And I've added standard deviation for each test. > * How come the values with the *enabled* memprof are less than values > with disabled memprof and even the vanilla LuaJIT? E.g. fasta or fast > fourier transform. Good question. I've no answer for now. My wild guess: garbage collector starts earlier (when amount of objects is less) and so runs faster. 
> > > + > > +``` > > + | BEFORE | AFTER,memprof off | AFTER,memprof on > > +---------------+--------+-------------------+----------------- > > +array3d | 0.22 | 0.20 | 0.21 > > +binary-trees | 3.32 | 3.33 | 3.94 > > +chameneos | 2.92 | 3.18 | 3.12 > > +coroutine-ring | 0.99 | 1.00 | 0.99 > > +euler14-bit | 1.04 | 1.05 | 1.03 > > +fannkuch | 6.77 | 6.69 | 6.64 > > +fasta | 8.27 | 8.30 | 8.25 > > +life | 0.48 | 0.48 | 1.03 > > +mandelbrot | 2.69 | 2.70 | 2.75 > > +mandelbrot-bit | 1.99 | 2.00 | 2.08 > > +md5 | 1.57 | 1.61 | 1.56 > > +nbody | 1.35 | 1.38 | 1.33 > > +nsieve | 2.11 | 2.19 | 2.09 > > +nsieve-bit | 1.50 | 1.55 | 1.47 > > +nsieve-bit-fp | 4.40 | 4.63 | 4.44 > > +partialsums | 0.54 | 0.58 | 0.55 > > +pidigits-nogmp | 3.48 | 3.50 | 3.47 > > +ray | 1.63 | 1.68 | 1.64 > > +recursive-ack | 0.19 | 0.22 | 0.20 > > +recursive-fib | 1.62 | 1.71 | 1.63 > > +scimark-fft | 5.78 | 5.94 | 5.69 > > +scimark-lu | 3.26 | 3.57 | 3.59 > > +scimark-sor | 2.34 | 2.35 | 2.33 > > +scimark-sparse | 5.03 | 4.92 | 4.91 > > +series | 0.94 | 0.96 | 0.95 > > +spectral-norm | 0.96 | 0.96 | 0.95 > > +``` > > -- > > 2.28.0 > > > > -- > Best regards, > IM [1]: https://lists.tarantool.org/pipermail/tarantool-discussions/2020-December/000147.html [2]: https://www.lua.org/manual/5.1/manual.html#5.7 -- Best regards, Sergey Kaplun
Hi! Thanks for the review! On 22.12.20, Sergey Ostanevich wrote: > Hi! > > Thanks for the patch! > > See some comments below, after being applied is LGTM. Please, see new verison here [1]. > > Sergos > > > On 16 Dec 2020, at 22:09, Sergey Kaplun <skaplun@tarantool.org> wrote: > > > > Part of #5442 > > --- > > > > Issues: https://github.com/tarantool/tarantool/issues/5442 > > Branch: https://github.com/tarantool/tarantool/tree/skaplun/gh-5442-luajit-memory-profiler > > > > Changes in v2: > > * Removed C API, Tarantool integration and description of additional > > features -- they will be added in another RFC if necessary. > > * Removed checking profile is running from the public API. > > * Added benchmarks and more meaningful example. > > * Grammar fixes. > > > > doc/rfc/5442-luajit-memory-profiler.md | 306 +++++++++++++++++++++++++ > > 1 file changed, 306 insertions(+) > > create mode 100644 doc/rfc/5442-luajit-memory-profiler.md > > > > diff --git a/doc/rfc/5442-luajit-memory-profiler.md b/doc/rfc/5442-luajit-memory-profiler.md > > new file mode 100644 > > index 000000000..720105009 > > --- /dev/null > > +++ b/doc/rfc/5442-luajit-memory-profiler.md > > @@ -0,0 +1,306 @@ > > +# LuaJIT memory profiler > > + > > +* **Status**: In progress > > +* **Start date**: 24-10-2020 > > +* **Authors**: Sergey Kaplun @Buristan skaplun@tarantool.org, > > + Igor Munkin @igormunkin imun@tarantool.org, > > + Sergey Ostanevich @sergos sergos@tarantool.org > > +* **Issues**: [#5442](https://github.com/tarantool/tarantool/issues/5442) > > + > > +## Summary > > + > > +LuaJIT memory profiler is a toolchain for analysis of memory usage by user's > > +application. > > + > > +## Background and motivation > > + > > +Garbage collector (GC) is a curse of performance for most of Lua applications. > > This is something that is very questionable. Do you have any data on performance > of Lua code against the GC code in real life? Is there any about Tarantool > applications? 
> > I’m not questioning the mem profiler itself, rather trying to weigh the GC problem. I can give benchmarks for vshard [2] for now. There are approximately 20% of time spent in `lj_gc_steps()`, IINM. > > > +Memory usage of Lua application should be profiled to find out various > ^^^^^^^ locate Fixed. > > > +memory-unoptimized code blocks. If the application has memory leaks they can be > > +found with the profiler. > ^ also. Fixed. > > + > > +## Detailed design > > + > > +The whole toolchain of memory profiling will be divided by several parts: > ^^^ into Thank you! Fixed. > > +1) Prerequisites. > > +2) Recording information about memory usage and saving it. > > +3) Reading saved data and display it in human-readable format. > > + > > +### Prerequisites > > + > > +This section describes additional changes in LuaJIT required to feature > ^^ for the Fixed. > > +implementation. This version of LuaJIT memory profiler does not support > > +reporting allocations from traces. But trace code semantics should be totally > > +the same as for Lua interpreter. So profiling with `jit.off()` should be > > +enough. > > + > > +There are two different representations of functions in LuaJIT: the function's > > +prototype (`GCproto`) and the function object so called closure (`GCfunc`). > > +The closures are represented as `GCfuncL` and `GCfuncC` for Lua and C closures > > +correspondingly. Also LuaJIT has special function's type aka Fast Function. It > ^ a Fixed. > > +is used for LuaJIT builtins. > > + > > +Fast function allocation events always belong to the previous frame with > > +considering of tail call optimizations (TCO). 
> > + > > +Assume we have the following Lua chunk named <test.lua>: > > + > > +``` > > +1 jit.off() > > +2 misc.memprof.start("memprof_new.bin") > > +3 local function append(str, rep) > > +4 return string.rep(str, rep) > > +5 end > > +6 > > +7 local t = {} > > +8 for _ = 1, 1e5 do > > +9 table.insert(t, > > +10 append('q', _) > > +11 ) > > +12 end > > +13 misc.memprof.stop() > > +``` > > + > > +Profilers output is like the follows: > > +``` > > +ALLOCATIONS > > +@test.lua:0, line 10: 100007 5004638934 0 > > +@test.lua:0, line 5: 1 40 0 > > +@test.lua:0, line 7: 1 72 0 > > +@test.lua:0, line 9: 1 48 0 > > + > > +REALLOCATIONS > > +@test.lua:0, line 9: 16 4194496 2097376 > > + Overrides: > > + @test.lua:0, line 9 > > + > > +@test.lua:0, line 10: 12 262080 131040 > > + Overrides: > > + @test.lua:0, line 10 > > + > > + > > +DEALLOCATIONS > > +INTERNAL: 21 0 2463 > > +@test.lua:0, line 10: 8 0 1044480 > > + Overrides: > > + @test.lua:0, line 10 > > +``` > > + > > +In Lua functions for profile events, we had to determine the line number of the > > +function definition and corresponding `GCproto` address. For C functions only > > +address will be enough. If Fast function is called from Lua function we had to > > +report the Lua function for more meaningful output. Otherwise report the C > > +function. > > + > > +So we need to know in what type of function CALL/RETURN virtual machine (VM) > ^^^^^^^ a ^ the > > +is. LuaJIT has already determined C function execution VM state but neither > ^ currently in. > > > +is. LuaJIT has already determined C function execution VM state but neither > Currently VM state identifies C function execution only, so Fast and Lua > functions states will be added. Reformulated in the new version. > > > +Fast functions nor Lua function. So corresponding VM states will be added. 
> > + > > +To determine currently allocating coroutine (that may not be equal to currently > > +executed) new field will be added to `global_State` structure named `mem_L` > ^ a Fixed. > > > +kept coroutine address. This field sets at each reallocation to corresponding > ^^^ to keep the ^^^ is set Fixed. > > > +`L` with which it was called. > > + > > +There is the static function (`lj_debug_getframeline`) returned line number for > ^^^ a ^^^ that returns Fixed. > > +current `BCPos` in `lj_debug.c` already. It will be added to the debug module > > +API to be used in memory profiler. > > + > > +### Information recording > > + > > +Each allocate/reallocate/free is considered as a type of event that are > > +reported. Event stream has the following format: > > + > > +```c > > +/* > > +** Event stream format: > > +** > > +** stream := symtab memprof > > +** symtab := see <ljp_symtab.h> > > +** memprof := prologue event* epilogue > > +** prologue := 'l' 'j' 'm' version reserved > > +** version := <BYTE> > > +** reserved := <BYTE> <BYTE> <BYTE> > > +** event := event-alloc | event-realloc | event-free > > +** event-alloc := event-header loc? naddr nsize > > +** event-realloc := event-header loc? oaddr osize naddr nsize > > +** event-free := event-header loc? 
oaddr osize > > +** event-header := <BYTE> > > +** loc := loc-lua | loc-c > > +** loc-lua := sym-addr line-no > > +** loc-c := sym-addr > > +** sym-addr := <ULEB128> > > +** line-no := <ULEB128> > > +** oaddr := <ULEB128> > > +** naddr := <ULEB128> > > +** osize := <ULEB128> > > +** nsize := <ULEB128> > > +** epilogue := event-header > > +** > > +** <BYTE> : A single byte (no surprises here) > > +** <ULEB128>: Unsigned integer represented in ULEB128 encoding > > +** > > +** (Order of bits below is hi -> lo) > > +** > > +** version: [VVVVVVVV] > > +** * VVVVVVVV: Byte interpreted as a plain integer version number > > +** > > +** event-header: [FTUUSSEE] > > +** * EE : 2 bits for representing allocation event type (AEVENT_*) > > +** * SS : 2 bits for representing allocation source type (ASOURCE_*) > > +** * UU : 2 unused bits > > +** * T : Reserved. 0 for regular events, 1 for the events marked with > > +** the timestamp mark. It is assumed that the time distance between > > No representation of a timestamp so far? If so - maybe remove excessive description > and put ‘reserved for timestamp’. I've dropped it according your and Igors comments. > > > +** two marked events is approximately the same and is equal > > +** to 1 second. Always zero for now. > > +** * F : 0 for regular events, 1 for epilogue's *F*inal header > > +** (if F is set to 1, all other bits are currently ignored) > > +*/ > > +``` > > + > > +It is enough to know the address of LUA/C function to determine it. Symbolic > > +table (symtab) dumps at start of profiling to avoid determine and write line > > +number of Lua code and corresponding chunk of code each time, when memory event > > +happens. Each line contains the address, Lua chunk definition as the filename > > +and line number of the function's declaration. 
This table of symbols has the > > +following format described at <ljp_symtab.h>: > > + > > +```c > > +/* > > +** symtab format: > > +** > > +** symtab := prologue sym* > > +** prologue := 'l' 'j' 's' version reserved > > +** version := <BYTE> > > +** reserved := <BYTE> <BYTE> <BYTE> > > +** sym := sym-lua | sym-final > > +** sym-lua := sym-header sym-addr sym-chunk sym-line > > +** sym-header := <BYTE> > > +** sym-addr := <ULEB128> > > +** sym-chunk := string > > +** sym-line := <ULEB128> > > +** sym-final := sym-header > > +** string := string-len string-payload > > +** string-len := <ULEB128> > > +** string-payload := <BYTE> {string-len} > > +** > > +** <BYTE> : A single byte (no surprises here) > > +** <ULEB128>: Unsigned integer represented in ULEB128 encoding > > +** > > +** (Order of bits below is hi -> lo) > > +** > > +** version: [VVVVVVVV] > > +** * VVVVVVVV: Byte interpreted as a plain numeric version number > > +** > > +** sym-header: [FUUUUUTT] > > +** * TT : 2 bits for representing symbol type > > +** * UUUUU : 5 unused bits > > +** * F : 1 bit marking the end of the symtab (final symbol) > > +*/ > > +``` > > + > > +So when memory profiling starts default allocation function is replaced by the > > +new allocation function as additional wrapper to write inspected profiling > > +events. When profiler stops old allocation function is substituted back. > > + > > +Starting profiler from Lua is quite simple: > > +```lua > > +local started, err = misc.memprof.start(fname) > > +``` > > +Where `fname` is name of the file where profile events are written. Writer for > > +this function perform `fwrite()` for each call retrying in case of `EINTR`. > > +Final callback calls `fclose()` at the end of profiling. If it is impossible to > > +open a file for writing or profiler fails to start, returns `nil` on failure > > +(plus an error message as a second result and a system-dependent error code as > > +a third result). Otherwise returns some true value. 
> > + > > +Stopping profiler from Lua is simple too: > > +```lua > > +local stopped, err = misc.memprof.stop() > > +``` > > + > > +If there is any error occurred at profiling stopping (an error when file > > +descriptor was closed) `memprof.stop()` returns `nil` (plus an error message as > > +a second result and a system-dependent error code as a third result). Returns > > Why do you need to separate an ‘error’ from a ’system error’? I've used the same formulation as in Lua Reference Manual [3]. It's errno, in fact, but IINM it is system dependent. > > > +`true` otherwise. > > + > > +If you want to build LuaJIT without memory profiler, you should build it with > > +`-DLUAJIT_DISABLE_MEMPROF`. If it is disabled `misc.memprof.start()` and > > +`misc.memprof.stop()` always return `false`. > > + > > +Memory profiler is expected to be thread safe, so it has a corresponding > > +lock/unlock at internal mutex whenever you call `luaM_memprof_*`. If you want > > +to build LuaJIT without thread safety use `-DLUAJIT_DISABLE_THREAD_SAFE`. > > + > > +### Reading and displaying saved data > > + > > +Binary data can be read by `lj-parse-memprof` utility. It parses the binary > > +format provided from memory profiler and render it in human-readable format. > > + > > +The usage is very simple: > > +``` > > +$ ./luajit-parse-memprof --help > > +luajit-parse-memprof - parser of the memory usage profile collected > > + with LuaJIT's memprof. > > + > > +SYNOPSIS > > + > > +luajit-parse-memprof [options] memprof.bin > > + > > +Supported options are: > > + > > + --help Show this help and exit > > +``` > > + > > +Plain text of profiled info has the following format: > > +``` > > +@<filename>:<function_line>, line <line where event was detected>: <number of events> <allocated> <freed> > > +``` > > +See example above. > > + > > +`INTERNAL` means that this allocations are caused by internal LuaJIT > > +structures. Note that events are sorted from the most often to the least. 
> > + > > +`Overrides` means what allocation this reallocation overrides. > > + > > +## Benchmarks > > + > > +Benchmarks were taken from repo: > > +[LuaJIT-test-cleanup](https://github.com/LuaJIT/LuaJIT-test-cleanup). > > + > > +Example of usage: > Either ‘usage’ or ‘example of use’ Fixed, thank you! > > > > +```bash > > +/usr/bin/time -f"array3d %U" ./luajit $BENCH_DIR/array3d.lua 300 >/dev/null > > +``` > > + > > +Benchmark results before and after the patch (less is better): > > Comparative to ‘BEFORE’ is more reafable, dispersion should also help with > reading of results. Done in the new version. > > > + > > +``` > > + | BEFORE | AFTER,memprof off | AFTER,memprof on > > +---------------+--------+-------------------+----------------- > > +array3d | 0.22 | 0.20 | 0.21 > > +binary-trees | 3.32 | 3.33 | 3.94 > > +chameneos | 2.92 | 3.18 | 3.12 > > +coroutine-ring | 0.99 | 1.00 | 0.99 > > +euler14-bit | 1.04 | 1.05 | 1.03 > > +fannkuch | 6.77 | 6.69 | 6.64 > > +fasta | 8.27 | 8.30 | 8.25 > > +life | 0.48 | 0.48 | 1.03 > > +mandelbrot | 2.69 | 2.70 | 2.75 > > +mandelbrot-bit | 1.99 | 2.00 | 2.08 > > +md5 | 1.57 | 1.61 | 1.56 > > +nbody | 1.35 | 1.38 | 1.33 > > +nsieve | 2.11 | 2.19 | 2.09 > > +nsieve-bit | 1.50 | 1.55 | 1.47 > > +nsieve-bit-fp | 4.40 | 4.63 | 4.44 > > +partialsums | 0.54 | 0.58 | 0.55 > > +pidigits-nogmp | 3.48 | 3.50 | 3.47 > > +ray | 1.63 | 1.68 | 1.64 > > +recursive-ack | 0.19 | 0.22 | 0.20 > > +recursive-fib | 1.62 | 1.71 | 1.63 > > +scimark-fft | 5.78 | 5.94 | 5.69 > > +scimark-lu | 3.26 | 3.57 | 3.59 > > +scimark-sor | 2.34 | 2.35 | 2.33 > > +scimark-sparse | 5.03 | 4.92 | 4.91 > > +series | 0.94 | 0.96 | 0.95 > > +spectral-norm | 0.96 | 0.96 | 0.95 > > +``` > > -- > > 2.28.0 > > > [1]: https://lists.tarantool.org/pipermail/tarantool-discussions/2020-December/000147.html [2]: https://github.com/tarantool/vshard/issues/224#issuecomment-637632349 [3]: https://www.lua.org/manual/5.1/manual.html#5.7 -- Best regards, Sergey Kaplun
Part of #5442 --- RFC on branch: https://github.com/tarantool/tarantool/blob/skaplun/gh-5442-luajit-memory-profiler/doc/rfc/5442-luajit-memory-profiler.md Changes in v3: * More comments in example. * More verbose benchmark information. * Grammar and spelling fixes. Changes in v2: * Removed C API, Tarantool integration and description of additional features -- they will be added in another RFC if necessary. * Removed checking profile is running from the public API. * Added benchmarks and more meaningful example. * Grammar fixes. doc/rfc/5442-luajit-memory-profiler.md | 314 +++++++++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 doc/rfc/5442-luajit-memory-profiler.md diff --git a/doc/rfc/5442-luajit-memory-profiler.md b/doc/rfc/5442-luajit-memory-profiler.md new file mode 100644 index 000000000..85a61462a --- /dev/null +++ b/doc/rfc/5442-luajit-memory-profiler.md @@ -0,0 +1,314 @@ +# LuaJIT memory profiler + +* **Status**: In progress +* **Start date**: 24-10-2020 +* **Authors**: Sergey Kaplun @Buristan skaplun@tarantool.org, + Igor Munkin @igormunkin imun@tarantool.org, + Sergey Ostanevich @sergos sergos@tarantool.org +* **Issues**: [#5442](https://github.com/tarantool/tarantool/issues/5442) + +## Summary + +LuaJIT memory profiler is a toolchain for analysis of memory usage by user's +application. + +## Background and motivation + +Garbage collector (GC) is a curse of performance for most of Lua applications. +Memory usage of Lua application should be profiled to locate various +memory-unoptimized code blocks. If the application has memory leaks they can be +found with the profiler also. + +## Detailed design + +The whole toolchain of memory profiling will be divided into several parts: +1) [Prerequisites](#prerequisites). +2) [Recording information about memory usage and saving it](#information-recording). +3) [Reading saved data and display it in human-readable format](#reading-and-displaying-saved-data). 
+ +### Prerequisites + +This section describes additional changes in LuaJIT required for the feature +implementation. This version of the LuaJIT memory profiler does not support verbose +reporting of allocations from traces. All allocations from traces are reported as +internal. But trace code semantics should be totally the same as for the Lua +interpreter (excluding sink optimizations). All deallocations are reported as +internal too. + +There are two different representations of functions in LuaJIT: the function's +prototype (`GCproto`) and the function object, the so-called closure (`GCfunc`). +The closures are represented as `GCfuncL` and `GCfuncC` for Lua and C closures +correspondingly. LuaJIT also has a special function type, aka Fast Function. +It is used for LuaJIT builtins. + +Tail call optimization does not create a new call frame, so all allocations +inside the function called via `CALLT`/`CALLMT` are attributed to its caller. + +Usually developers are not interested in information about allocations inside +builtins. So if a fast function is called from a Lua function, all +allocations are attributed to this Lua function. Otherwise, the event is attributed +to a C function. + +Assume we have the following Lua chunk named <test.lua>: + +``` +1 jit.off() +2 misc.memprof.start("memprof_new.bin") +3 -- Lua does not create a new frame to call string.rep and all allocations are +4 -- attributed not to `append()` function but to the parent scope.
+5 local function append(str, rep) +6 return string.rep(str, rep) +7 end +8 +9 local t = {} +10 for _ = 1, 1e5 do +11 -- table.insert is a builtin and all corresponding allocations +12 -- are reported in the scope of main chunk +13 table.insert(t, +14 append('q', _) +15 ) +16 end +17 misc.memprof.stop() +``` + +If one runs the chunk above, the profiler reports approximately the following +(see legend [here](#reading-and-displaying-saved-data)): +``` +ALLOCATIONS +@test.lua:0, line 14: 1002 531818 0 +@test.lua:0, line 13: 1 24 0 +@test.lua:0, line 9: 1 32 0 +@test.lua:0, line 7: 1 20 0 + +REALLOCATIONS +@test.lua:0, line 13: 9 16424 8248 + Overrides: + @test.lua:0, line 13 + +@test.lua:0, line 14: 5 1984 992 + Overrides: + @test.lua:0, line 14 + + +DEALLOCATIONS +INTERNAL: 20 0 1481 +@test.lua:0, line 14: 3 0 7168 + Overrides: + @test.lua:0, line 14 +``` + +So we need to know the type of the function being executed by the virtual machine +(VM). Currently the VM state identifies C function execution only, so states for +Fast and Lua functions will be added. + +To determine the currently allocating coroutine (which may differ from the currently +executing one), a new field called `mem_L` is added to the `global_State` structure +to keep the coroutine address. This field is set at each reallocation to the +corresponding `L` with which it was called. + +There is a static function (`lj_debug_getframeline`) that returns the line number +for the current `BCPos` in `lj_debug.c` already. It will be added to the debug +module API to be used in the memory profiler. + +### Information recording + +Each allocate/reallocate/free is considered a type of event that is +reported.
Event stream has the following format: + +```c +/* +** Event stream format: +** +** stream := symtab memprof +** symtab := see symtab description +** memprof := prologue event* epilogue +** prologue := 'l' 'j' 'm' version reserved +** version := <BYTE> +** reserved := <BYTE> <BYTE> <BYTE> +** event := event-alloc | event-realloc | event-free +** event-alloc := event-header loc? naddr nsize +** event-realloc := event-header loc? oaddr osize naddr nsize +** event-free := event-header loc? oaddr osize +** event-header := <BYTE> +** loc := loc-lua | loc-c +** loc-lua := sym-addr line-no +** loc-c := sym-addr +** sym-addr := <ULEB128> +** line-no := <ULEB128> +** oaddr := <ULEB128> +** naddr := <ULEB128> +** osize := <ULEB128> +** nsize := <ULEB128> +** epilogue := event-header +** +** <BYTE> : A single byte (no surprises here) +** <ULEB128>: Unsigned integer represented in ULEB128 encoding +** +** (Order of bits below is hi -> lo) +** +** version: [VVVVVVVV] +** * VVVVVVVV: Byte interpreted as a plain integer version number +** +** event-header: [FUUUSSEE] +** * EE : 2 bits for representing allocation event type (AEVENT_*) +** * SS : 2 bits for representing allocation source type (ASOURCE_*) +** * UUU : 3 unused bits +** * F : 0 for regular events, 1 for epilogue's *F*inal header +** (if F is set to 1, all other bits are currently ignored) +*/ +``` + +It is enough to know the address of LUA/C function to determine it. Symbolic +table (symtab) is dumped at the start of profiling to avoid dumping function +location on each memory event for saving both CPU usage and binary profile +size. + +Each line contains the address, Lua chunk definition as the filename and line +number of the function's declaration. 
This table of symbols has the following +format, described in <lj_memprof.h>: + +```c +/* +** symtab format: +** +** symtab := prologue sym* +** prologue := 'l' 'j' 's' version reserved +** version := <BYTE> +** reserved := <BYTE> <BYTE> <BYTE> +** sym := sym-lua | sym-final +** sym-lua := sym-header sym-addr sym-chunk sym-line +** sym-header := <BYTE> +** sym-addr := <ULEB128> +** sym-chunk := string +** sym-line := <ULEB128> +** sym-final := sym-header +** string := string-len string-payload +** string-len := <ULEB128> +** string-payload := <BYTE> {string-len} +** +** <BYTE> : A single byte (no surprises here) +** <ULEB128>: Unsigned integer represented in ULEB128 encoding +** +** (Order of bits below is hi -> lo) +** +** version: [VVVVVVVV] +** * VVVVVVVV: Byte interpreted as a plain numeric version number +** +** sym-header: [FUUUUUTT] +** * TT : 2 bits for representing symbol type +** * UUUUU : 5 unused bits +** * F : 1 bit marking the end of the symtab (final symbol) +*/ +``` + +So when memory profiling starts, the current allocation function is replaced by +a new allocation function that wraps it to additionally write the profiling events. +When the profiler stops, the previous allocation function is restored. + +Starting the profiler from Lua is quite simple: +```lua +local started, err, errno = misc.memprof.start(fname) +``` +where `fname` is the name of the file where profile events are written. The writer for +this function performs `fwrite()` for each call, retrying in case of `EINTR`. +When profiling is stopped, `fclose()` is called. If it is impossible to +open a file for writing or the profiler fails to start, `misc.memprof.start()` returns `nil` +(plus an error message as a second result and a system-dependent error code as +a third result). Otherwise, it returns some true value.
+ +Stopping the profiler from Lua is simple too: +```lua +local stopped, err, errno = misc.memprof.stop() +``` + +If any error occurs while stopping the profiler (an error when the file +descriptor was closed), `memprof.stop()` returns `nil` (plus an error message as +a second result and a system-dependent error code as a third result). It returns +`true` otherwise. + +If you want to build LuaJIT without the memory profiler, you should build it with +`-DLUAJIT_DISABLE_MEMPROF`. If it is disabled, `misc.memprof.start()` and +`misc.memprof.stop()` always return `false`. + +The memory profiler is expected to be thread safe, so it locks and unlocks +an internal mutex whenever you call the corresponding memprof +functions. If you want to build LuaJIT without thread safety, use +`-DLUAJIT_DISABLE_THREAD_SAFE`. + +### Reading and displaying saved data + +Binary data can be read by the `luajit-parse-memprof` utility. It parses the binary +format provided by the memory profiler and renders it in a human-readable format. + +The usage is very simple: +``` +$ ./luajit-parse-memprof --help +luajit-parse-memprof - parser of the memory usage profile collected + with LuaJIT's memprof. + +SYNOPSIS + +luajit-parse-memprof [options] memprof.bin + +Supported options are: + + --help Show this help and exit +``` + +Plain text of profiled info has the following format: +``` +@<filename>:<function_line>, line <line where event was detected>: <number of events> <allocated> <freed> +``` +See the example [above](#prerequisites). + +`INTERNAL` means that these allocations are caused by internal LuaJIT +structures. Note that events are sorted from the most frequent to the least. + +`Overrides` shows which allocation this reallocation overrides. + +## Benchmarks + +Benchmarks were taken from the repo: +[LuaJIT-test-cleanup](https://github.com/LuaJIT/LuaJIT-test-cleanup).
+ +Example of measuring: +```bash +/usr/bin/time -f"array3d %U" ./luajit $BENCH_DIR/array3d.lua 300 >/dev/null +``` + +This table shows performance deviation in relation to REFerence value (before +commit) with stopped and running profiler. The table shows the average value +for 11 runs. The first field of the column indicates the change in the average +time in seconds (less is better). The second field is the standard deviation +for the found difference. + +``` + Name | REF | AFTER, memprof off | AFTER, memprof on +----------------+------+--------------------+------------------ +array3d | 0.21 | +0.00 (0.01) | +0.00 (0.01) +binary-trees | 3.25 | -0.01 (0.06) | +0.53 (0.10) +chameneos | 2.97 | +0.14 (0.04) | +0.13 (0.06) +coroutine-ring | 1.00 | +0.01 (0.04) | +0.01 (0.04) +euler14-bit | 1.03 | +0.01 (0.02) | +0.00 (0.02) +fannkuch | 6.81 | -0.21 (0.06) | -0.20 (0.06) +fasta | 8.20 | -0.07 (0.05) | -0.08 (0.03) +life | 0.46 | +0.00 (0.01) | +0.35 (0.01) +mandelbrot | 2.65 | +0.00 (0.01) | +0.01 (0.01) +mandelbrot-bit | 1.97 | +0.00 (0.01) | +0.01 (0.02) +md5 | 1.58 | -0.01 (0.04) | -0.04 (0.04) +nbody | 1.34 | +0.00 (0.01) | -0.02 (0.01) +nsieve | 2.07 | -0.03 (0.03) | -0.01 (0.04) +nsieve-bit | 1.50 | -0.02 (0.04) | +0.00 (0.04) +nsieve-bit-fp | 4.44 | -0.03 (0.07) | -0.01 (0.07) +partialsums | 0.54 | +0.00 (0.01) | +0.00 (0.01) +pidigits-nogmp | 3.47 | -0.01 (0.02) | -0.10 (0.02) +ray | 1.62 | -0.02 (0.03) | +0.00 (0.02) +recursive-ack | 0.20 | +0.00 (0.01) | +0.00 (0.01) +recursive-fib | 1.63 | +0.00 (0.01) | +0.01 (0.02) +scimark-fft | 5.72 | +0.06 (0.09) | -0.01 (0.10) +scimark-lu | 3.47 | +0.02 (0.27) | -0.03 (0.26) +scimark-sor | 2.34 | +0.00 (0.01) | -0.01 (0.01) +scimark-sparse | 4.95 | -0.02 (0.04) | -0.02 (0.04) +series | 0.95 | +0.00 (0.02) | +0.00 (0.01) +spectral-norm | 0.96 | +0.00 (0.02) | -0.01 (0.02) +``` -- 2.28.0
Hi! Thanks for the patch! See some comments below, after being applied is LGTM. Sergos > On 16 Dec 2020, at 22:09, Sergey Kaplun <skaplun@tarantool.org> wrote: > > Part of #5442 > --- > > Issues: https://github.com/tarantool/tarantool/issues/5442 > Branch: https://github.com/tarantool/tarantool/tree/skaplun/gh-5442-luajit-memory-profiler > > Changes in v2: > * Removed C API, Tarantool integration and description of additional > features -- they will be added in another RFC if necessary. > * Removed checking profile is running from the public API. > * Added benchmarks and more meaningful example. > * Grammar fixes. > > doc/rfc/5442-luajit-memory-profiler.md | 306 +++++++++++++++++++++++++ > 1 file changed, 306 insertions(+) > create mode 100644 doc/rfc/5442-luajit-memory-profiler.md > > diff --git a/doc/rfc/5442-luajit-memory-profiler.md b/doc/rfc/5442-luajit-memory-profiler.md > new file mode 100644 > index 000000000..720105009 > --- /dev/null > +++ b/doc/rfc/5442-luajit-memory-profiler.md > @@ -0,0 +1,306 @@ > +# LuaJIT memory profiler > + > +* **Status**: In progress > +* **Start date**: 24-10-2020 > +* **Authors**: Sergey Kaplun @Buristan skaplun@tarantool.org, > + Igor Munkin @igormunkin imun@tarantool.org, > + Sergey Ostanevich @sergos sergos@tarantool.org > +* **Issues**: [#5442](https://github.com/tarantool/tarantool/issues/5442) > + > +## Summary > + > +LuaJIT memory profiler is a toolchain for analysis of memory usage by user's > +application. > + > +## Background and motivation > + > +Garbage collector (GC) is a curse of performance for most of Lua applications. This is something that is very questionable. Do you have any data on performance of Lua code against the GC code in real life? Is there any about Tarantool applications? I’m not questioning the mem profiler itself, rather trying to weigh the GC problem. > +Memory usage of Lua application should be profiled to find out various ^^^^^^^ locate > +memory-unoptimized code blocks. 
If the application has memory leaks they can be > +found with the profiler. ^ also. > + > +## Detailed design > + > +The whole toolchain of memory profiling will be divided by several parts: ^^^ into > +1) Prerequisites. > +2) Recording information about memory usage and saving it. > +3) Reading saved data and display it in human-readable format. > + > +### Prerequisites > + > +This section describes additional changes in LuaJIT required to feature ^^ for the > +implementation. This version of LuaJIT memory profiler does not support > +reporting allocations from traces. But trace code semantics should be totally > +the same as for Lua interpreter. So profiling with `jit.off()` should be > +enough. > + > +There are two different representations of functions in LuaJIT: the function's > +prototype (`GCproto`) and the function object so called closure (`GCfunc`). > +The closures are represented as `GCfuncL` and `GCfuncC` for Lua and C closures > +correspondingly. Also LuaJIT has special function's type aka Fast Function. It ^ a > +is used for LuaJIT builtins. > + > +Fast function allocation events always belong to the previous frame with > +considering of tail call optimizations (TCO). 
> + > +Assume we have the following Lua chunk named <test.lua>: > + > +``` > +1 jit.off() > +2 misc.memprof.start("memprof_new.bin") > +3 local function append(str, rep) > +4 return string.rep(str, rep) > +5 end > +6 > +7 local t = {} > +8 for _ = 1, 1e5 do > +9 table.insert(t, > +10 append('q', _) > +11 ) > +12 end > +13 misc.memprof.stop() > +``` > + > +Profilers output is like the follows: > +``` > +ALLOCATIONS > +@test.lua:0, line 10: 100007 5004638934 0 > +@test.lua:0, line 5: 1 40 0 > +@test.lua:0, line 7: 1 72 0 > +@test.lua:0, line 9: 1 48 0 > + > +REALLOCATIONS > +@test.lua:0, line 9: 16 4194496 2097376 > + Overrides: > + @test.lua:0, line 9 > + > +@test.lua:0, line 10: 12 262080 131040 > + Overrides: > + @test.lua:0, line 10 > + > + > +DEALLOCATIONS > +INTERNAL: 21 0 2463 > +@test.lua:0, line 10: 8 0 1044480 > + Overrides: > + @test.lua:0, line 10 > +``` > + > +In Lua functions for profile events, we had to determine the line number of the > +function definition and corresponding `GCproto` address. For C functions only > +address will be enough. If Fast function is called from Lua function we had to > +report the Lua function for more meaningful output. Otherwise report the C > +function. > + > +So we need to know in what type of function CALL/RETURN virtual machine (VM) ^^^^^^^ a ^ the > +is. LuaJIT has already determined C function execution VM state but neither ^ currently in. > +is. LuaJIT has already determined C function execution VM state but neither Currently VM state identifies C function execution only, so Fast and Lua functions states will be added. > +Fast functions nor Lua function. So corresponding VM states will be added. > + > +To determine currently allocating coroutine (that may not be equal to currently > +executed) new field will be added to `global_State` structure named `mem_L` ^ a > +kept coroutine address. This field sets at each reallocation to corresponding ^^^ to keep the ^^^ is set > +`L` with which it was called. 
> + > +There is the static function (`lj_debug_getframeline`) returned line number for ^^^ a ^^^ that returns > +current `BCPos` in `lj_debug.c` already. It will be added to the debug module > +API to be used in memory profiler. > + > +### Information recording > + > +Each allocate/reallocate/free is considered as a type of event that are > +reported. Event stream has the following format: > + > +```c > +/* > +** Event stream format: > +** > +** stream := symtab memprof > +** symtab := see <ljp_symtab.h> > +** memprof := prologue event* epilogue > +** prologue := 'l' 'j' 'm' version reserved > +** version := <BYTE> > +** reserved := <BYTE> <BYTE> <BYTE> > +** event := event-alloc | event-realloc | event-free > +** event-alloc := event-header loc? naddr nsize > +** event-realloc := event-header loc? oaddr osize naddr nsize > +** event-free := event-header loc? oaddr osize > +** event-header := <BYTE> > +** loc := loc-lua | loc-c > +** loc-lua := sym-addr line-no > +** loc-c := sym-addr > +** sym-addr := <ULEB128> > +** line-no := <ULEB128> > +** oaddr := <ULEB128> > +** naddr := <ULEB128> > +** osize := <ULEB128> > +** nsize := <ULEB128> > +** epilogue := event-header > +** > +** <BYTE> : A single byte (no surprises here) > +** <ULEB128>: Unsigned integer represented in ULEB128 encoding > +** > +** (Order of bits below is hi -> lo) > +** > +** version: [VVVVVVVV] > +** * VVVVVVVV: Byte interpreted as a plain integer version number > +** > +** event-header: [FTUUSSEE] > +** * EE : 2 bits for representing allocation event type (AEVENT_*) > +** * SS : 2 bits for representing allocation source type (ASOURCE_*) > +** * UU : 2 unused bits > +** * T : Reserved. 0 for regular events, 1 for the events marked with > +** the timestamp mark. It is assumed that the time distance between No representation of a timestamp so far? If so - maybe remove excessive description and put ‘reserved for timestamp’. 
> +** two marked events is approximately the same and is equal > +** to 1 second. Always zero for now. > +** * F : 0 for regular events, 1 for epilogue's *F*inal header > +** (if F is set to 1, all other bits are currently ignored) > +*/ > +``` > + > +It is enough to know the address of LUA/C function to determine it. Symbolic > +table (symtab) dumps at start of profiling to avoid determine and write line > +number of Lua code and corresponding chunk of code each time, when memory event > +happens. Each line contains the address, Lua chunk definition as the filename > +and line number of the function's declaration. This table of symbols has the > +following format described at <ljp_symtab.h>: > + > +```c > +/* > +** symtab format: > +** > +** symtab := prologue sym* > +** prologue := 'l' 'j' 's' version reserved > +** version := <BYTE> > +** reserved := <BYTE> <BYTE> <BYTE> > +** sym := sym-lua | sym-final > +** sym-lua := sym-header sym-addr sym-chunk sym-line > +** sym-header := <BYTE> > +** sym-addr := <ULEB128> > +** sym-chunk := string > +** sym-line := <ULEB128> > +** sym-final := sym-header > +** string := string-len string-payload > +** string-len := <ULEB128> > +** string-payload := <BYTE> {string-len} > +** > +** <BYTE> : A single byte (no surprises here) > +** <ULEB128>: Unsigned integer represented in ULEB128 encoding > +** > +** (Order of bits below is hi -> lo) > +** > +** version: [VVVVVVVV] > +** * VVVVVVVV: Byte interpreted as a plain numeric version number > +** > +** sym-header: [FUUUUUTT] > +** * TT : 2 bits for representing symbol type > +** * UUUUU : 5 unused bits > +** * F : 1 bit marking the end of the symtab (final symbol) > +*/ > +``` > + > +So when memory profiling starts default allocation function is replaced by the > +new allocation function as additional wrapper to write inspected profiling > +events. When profiler stops old allocation function is substituted back. 
> + > +Starting profiler from Lua is quite simple: > +```lua > +local started, err = misc.memprof.start(fname) > +``` > +Where `fname` is name of the file where profile events are written. Writer for > +this function perform `fwrite()` for each call retrying in case of `EINTR`. > +Final callback calls `fclose()` at the end of profiling. If it is impossible to > +open a file for writing or profiler fails to start, returns `nil` on failure > +(plus an error message as a second result and a system-dependent error code as > +a third result). Otherwise returns some true value. > + > +Stopping profiler from Lua is simple too: > +```lua > +local stopped, err = misc.memprof.stop() > +``` > + > +If there is any error occurred at profiling stopping (an error when file > +descriptor was closed) `memprof.stop()` returns `nil` (plus an error message as > +a second result and a system-dependent error code as a third result). Returns Why do you need to separate an ‘error’ from a ’system error’? > +`true` otherwise. > + > +If you want to build LuaJIT without memory profiler, you should build it with > +`-DLUAJIT_DISABLE_MEMPROF`. If it is disabled `misc.memprof.start()` and > +`misc.memprof.stop()` always return `false`. > + > +Memory profiler is expected to be thread safe, so it has a corresponding > +lock/unlock at internal mutex whenever you call `luaM_memprof_*`. If you want > +to build LuaJIT without thread safety use `-DLUAJIT_DISABLE_THREAD_SAFE`. > + > +### Reading and displaying saved data > + > +Binary data can be read by `lj-parse-memprof` utility. It parses the binary > +format provided from memory profiler and render it in human-readable format. > + > +The usage is very simple: > +``` > +$ ./luajit-parse-memprof --help > +luajit-parse-memprof - parser of the memory usage profile collected > + with LuaJIT's memprof. 
> + > +SYNOPSIS > + > +luajit-parse-memprof [options] memprof.bin > + > +Supported options are: > + > + --help Show this help and exit > +``` > + > +Plain text of profiled info has the following format: > +``` > +@<filename>:<function_line>, line <line where event was detected>: <number of events> <allocated> <freed> > +``` > +See example above. > + > +`INTERNAL` means that this allocations are caused by internal LuaJIT > +structures. Note that events are sorted from the most often to the least. > + > +`Overrides` means what allocation this reallocation overrides. > + > +## Benchmarks > + > +Benchmarks were taken from repo: > +[LuaJIT-test-cleanup](https://github.com/LuaJIT/LuaJIT-test-cleanup). > + > +Example of usage: Either ‘usage’ or ‘example of use’ > +```bash > +/usr/bin/time -f"array3d %U" ./luajit $BENCH_DIR/array3d.lua 300 >/dev/null > +``` > + > +Benchmark results before and after the patch (less is better): A comparison relative to ‘BEFORE’ would be more readable; dispersion should also help with reading the results.
> + > +``` > + | BEFORE | AFTER,memprof off | AFTER,memprof on > +---------------+--------+-------------------+----------------- > +array3d | 0.22 | 0.20 | 0.21 > +binary-trees | 3.32 | 3.33 | 3.94 > +chameneos | 2.92 | 3.18 | 3.12 > +coroutine-ring | 0.99 | 1.00 | 0.99 > +euler14-bit | 1.04 | 1.05 | 1.03 > +fannkuch | 6.77 | 6.69 | 6.64 > +fasta | 8.27 | 8.30 | 8.25 > +life | 0.48 | 0.48 | 1.03 > +mandelbrot | 2.69 | 2.70 | 2.75 > +mandelbrot-bit | 1.99 | 2.00 | 2.08 > +md5 | 1.57 | 1.61 | 1.56 > +nbody | 1.35 | 1.38 | 1.33 > +nsieve | 2.11 | 2.19 | 2.09 > +nsieve-bit | 1.50 | 1.55 | 1.47 > +nsieve-bit-fp | 4.40 | 4.63 | 4.44 > +partialsums | 0.54 | 0.58 | 0.55 > +pidigits-nogmp | 3.48 | 3.50 | 3.47 > +ray | 1.63 | 1.68 | 1.64 > +recursive-ack | 0.19 | 0.22 | 0.20 > +recursive-fib | 1.62 | 1.71 | 1.63 > +scimark-fft | 5.78 | 5.94 | 5.69 > +scimark-lu | 3.26 | 3.57 | 3.59 > +scimark-sor | 2.34 | 2.35 | 2.33 > +scimark-sparse | 5.03 | 4.92 | 4.91 > +series | 0.94 | 0.96 | 0.95 > +spectral-norm | 0.96 | 0.96 | 0.95 > +``` > -- > 2.28.0 >
Sergey, Thanks for the new version! The design is fine, but please consider my minor comments regarding the document itself. On 16.12.20, Sergey Kaplun wrote: > Part of #5442 > --- > > Issues: https://github.com/tarantool/tarantool/issues/5442 > Branch: https://github.com/tarantool/tarantool/tree/skaplun/gh-5442-luajit-memory-profiler > > Changes in v2: > * Removed C API, Tarantool integration and description of additional > features -- they will be added in another RFC if necessary. > * Removed checking profile is running from the public API. > * Added benchmarks and more meaningful example. > * Grammar fixes. > > doc/rfc/5442-luajit-memory-profiler.md | 306 +++++++++++++++++++++++++ > 1 file changed, 306 insertions(+) > create mode 100644 doc/rfc/5442-luajit-memory-profiler.md > > diff --git a/doc/rfc/5442-luajit-memory-profiler.md b/doc/rfc/5442-luajit-memory-profiler.md > new file mode 100644 > index 000000000..720105009 > --- /dev/null > +++ b/doc/rfc/5442-luajit-memory-profiler.md > @@ -0,0 +1,306 @@ <snipped> > +## Detailed design > + > +The whole toolchain of memory profiling will be divided by several parts: > +1) Prerequisites. > +2) Recording information about memory usage and saving it. > +3) Reading saved data and display it in human-readable format. I believe the links would also be great here. > + > +### Prerequisites > + > +This section describes additional changes in LuaJIT required to feature > +implementation. This version of LuaJIT memory profiler does not support > +reporting allocations from traces. But trace code semantics should be totally > +the same as for Lua interpreter. So profiling with `jit.off()` should be > +enough. From this part I guess nothing works with JIT enabled. As a result of the offline discussion, it turns out everything works, but allocations on traces are attributed as internal ones. I think this is not a big deal since JIT is a black internal box. Please mention this explicitly above.
Minor: It's worth mentioning that there might be less GC pressure during trace execution (considering the impact of the allocation sinking optimization), but the traces themselves are allocated in the GC area. > + <snipped> > + > +Fast function allocation events always belong to the previous frame with > +considering of tail call optimizations (TCO). Still don't get it. In the previous version I asked to explicitly mention that allocations in builtins are attributed to their caller. Another case is tail call optimization, when allocations are also attributed to the caller. So, the example is fine (but comments are desired, IMHO), but I suggest separating these cases in the passage above. > + > +Assume we have the following Lua chunk named <test.lua>: > + > +``` > +1 jit.off() > +2 misc.memprof.start("memprof_new.bin") > +3 local function append(str, rep) > +4 return string.rep(str, rep) > +5 end > +6 > +7 local t = {} > +8 for _ = 1, 1e5 do > +9 table.insert(t, > +10 append('q', _) > +11 ) > +12 end > +13 misc.memprof.stop() > +``` > + > +Profilers output is like the follows: > +``` > +ALLOCATIONS > +@test.lua:0, line 10: 100007 5004638934 0 > +@test.lua:0, line 5: 1 40 0 > +@test.lua:0, line 7: 1 72 0 > +@test.lua:0, line 9: 1 48 0 > + > +REALLOCATIONS > +@test.lua:0, line 9: 16 4194496 2097376 > + Overrides: > + @test.lua:0, line 9 > + > +@test.lua:0, line 10: 12 262080 131040 > + Overrides: > + @test.lua:0, line 10 > + > + > +DEALLOCATIONS > +INTERNAL: 21 0 2463 > +@test.lua:0, line 10: 8 0 1044480 > + Overrides: > + @test.lua:0, line 10 > +``` I have no idea what is dumped here. There is a legend below (in the reading and displaying saved data section), so leave a link to it right here. Otherwise this is some kind of elvish. > + > +In Lua functions for profile events, we had to determine the line number of the > +function definition and corresponding `GCproto` address. For C functions only > +address will be enough.
If Fast function is called from Lua function we had to > +report the Lua function for more meaningful output. Otherwise report the C > +function. Still don't understand two last sentences. In the previous reply you told this relates to the part above (about the fast functions). You reworded that part, but this is still left unclear. > + > +So we need to know in what type of function CALL/RETURN virtual machine (VM) AFAICS the function type is enough, isn't it? As you mentioned above allocation are attributed to the callers in case of tail call optimization. > +is. LuaJIT has already determined C function execution VM state but neither > +Fast functions nor Lua function. So corresponding VM states will be added. <snipped> > +### Information recording > + > +Each allocate/reallocate/free is considered as a type of event that are > +reported. Event stream has the following format: > + > +```c > +/* > +** Event stream format: > +** > +** stream := symtab memprof > +** symtab := see <ljp_symtab.h> > +** memprof := prologue event* epilogue > +** prologue := 'l' 'j' 'm' version reserved > +** version := <BYTE> > +** reserved := <BYTE> <BYTE> <BYTE> > +** event := event-alloc | event-realloc | event-free > +** event-alloc := event-header loc? naddr nsize > +** event-realloc := event-header loc? oaddr osize naddr nsize > +** event-free := event-header loc? 
oaddr osize > +** event-header := <BYTE> > +** loc := loc-lua | loc-c > +** loc-lua := sym-addr line-no > +** loc-c := sym-addr > +** sym-addr := <ULEB128> > +** line-no := <ULEB128> > +** oaddr := <ULEB128> > +** naddr := <ULEB128> > +** osize := <ULEB128> > +** nsize := <ULEB128> > +** epilogue := event-header > +** > +** <BYTE> : A single byte (no surprises here) > +** <ULEB128>: Unsigned integer represented in ULEB128 encoding > +** > +** (Order of bits below is hi -> lo) > +** > +** version: [VVVVVVVV] > +** * VVVVVVVV: Byte interpreted as a plain integer version number > +** > +** event-header: [FTUUSSEE] > +** * EE : 2 bits for representing allocation event type (AEVENT_*) > +** * SS : 2 bits for representing allocation source type (ASOURCE_*) > +** * UU : 2 unused bits > +** * T : Reserved. 0 for regular events, 1 for the events marked with > +** the timestamp mark. It is assumed that the time distance between > +** two marked events is approximately the same and is equal > +** to 1 second. Always zero for now. It looks this is zero always in our case, so we have 3 unused bits, right? Then just drop this part. > +** * F : 0 for regular events, 1 for epilogue's *F*inal header > +** (if F is set to 1, all other bits are currently ignored) > +*/ > +``` > + > +It is enough to know the address of LUA/C function to determine it. Symbolic > +table (symtab) dumps at start of profiling to avoid determine and write line > +number of Lua code and corresponding chunk of code each time, when memory event > +happens. Each line contains the address, Lua chunk definition as the filename > +and line number of the function's declaration. 
This table of symbols has the > +following format described at <ljp_symtab.h>: > + > +```c > +/* > +** symtab format: > +** > +** symtab := prologue sym* > +** prologue := 'l' 'j' 's' version reserved > +** version := <BYTE> > +** reserved := <BYTE> <BYTE> <BYTE> > +** sym := sym-lua | sym-final > +** sym-lua := sym-header sym-addr sym-chunk sym-line > +** sym-header := <BYTE> > +** sym-addr := <ULEB128> > +** sym-chunk := string > +** sym-line := <ULEB128> > +** sym-final := sym-header > +** string := string-len string-payload > +** string-len := <ULEB128> > +** string-payload := <BYTE> {string-len} > +** > +** <BYTE> : A single byte (no surprises here) > +** <ULEB128>: Unsigned integer represented in ULEB128 encoding > +** > +** (Order of bits below is hi -> lo) > +** > +** version: [VVVVVVVV] > +** * VVVVVVVV: Byte interpreted as a plain numeric version number > +** > +** sym-header: [FUUUUUTT] > +** * TT : 2 bits for representing symbol type > +** * UUUUU : 5 unused bits > +** * F : 1 bit marking the end of the symtab (final symbol) > +*/ > +``` <snipped> > +Starting profiler from Lua is quite simple: > +```lua > +local started, err = misc.memprof.start(fname) > +``` > +Where `fname` is name of the file where profile events are written. Writer for > +this function perform `fwrite()` for each call retrying in case of `EINTR`. > +Final callback calls `fclose()` at the end of profiling. If it is impossible to > +open a file for writing or profiler fails to start, returns `nil` on failure > +(plus an error message as a second result and a system-dependent error code as > +a third result). Otherwise returns some true value. What the heck is the third result and why is it ignored in the example above? I think this error should be incorporated in the second argument kinda <perror> does. 
> + > +Stopping profiler from Lua is simple too: > +```lua > +local stopped, err = misc.memprof.stop() > +``` > + > +If there is any error occurred at profiling stopping (an error when file > +descriptor was closed) `memprof.stop()` returns `nil` (plus an error message as > +a second result and a system-dependent error code as a third result). Returns > +`true` otherwise. Ditto. > + <snipped> > +Memory profiler is expected to be thread safe, so it has a corresponding > +lock/unlock at internal mutex whenever you call `luaM_memprof_*`. If you want > +to build LuaJIT without thread safety use `-DLUAJIT_DISABLE_THREAD_SAFE`. I see no luaM_memprof_* interfaces. So what is this thread safety for? > + <snipped> > +## Benchmarks > + > +Benchmarks were taken from repo: > +[LuaJIT-test-cleanup](https://github.com/LuaJIT/LuaJIT-test-cleanup). > + > +Example of usage: > +```bash > +/usr/bin/time -f"array3d %U" ./luajit $BENCH_DIR/array3d.lua 300 >/dev/null > +``` > + > +Benchmark results before and after the patch (less is better): There are some considerable differences in benchmark results, so I have several questions: * Mention whether you tested with enabled JIT or not * How many iterations did you made? What is the dispersion/noise for these runs? * How come the values with the *enabled* memprof are less than values with disabled memprof and even the vanilla LuaJIT? E.g. fasta or fast fourier transform. 
> + > +``` > + | BEFORE | AFTER,memprof off | AFTER,memprof on > +---------------+--------+-------------------+----------------- > +array3d | 0.22 | 0.20 | 0.21 > +binary-trees | 3.32 | 3.33 | 3.94 > +chameneos | 2.92 | 3.18 | 3.12 > +coroutine-ring | 0.99 | 1.00 | 0.99 > +euler14-bit | 1.04 | 1.05 | 1.03 > +fannkuch | 6.77 | 6.69 | 6.64 > +fasta | 8.27 | 8.30 | 8.25 > +life | 0.48 | 0.48 | 1.03 > +mandelbrot | 2.69 | 2.70 | 2.75 > +mandelbrot-bit | 1.99 | 2.00 | 2.08 > +md5 | 1.57 | 1.61 | 1.56 > +nbody | 1.35 | 1.38 | 1.33 > +nsieve | 2.11 | 2.19 | 2.09 > +nsieve-bit | 1.50 | 1.55 | 1.47 > +nsieve-bit-fp | 4.40 | 4.63 | 4.44 > +partialsums | 0.54 | 0.58 | 0.55 > +pidigits-nogmp | 3.48 | 3.50 | 3.47 > +ray | 1.63 | 1.68 | 1.64 > +recursive-ack | 0.19 | 0.22 | 0.20 > +recursive-fib | 1.62 | 1.71 | 1.63 > +scimark-fft | 5.78 | 5.94 | 5.69 > +scimark-lu | 3.26 | 3.57 | 3.59 > +scimark-sor | 2.34 | 2.35 | 2.33 > +scimark-sparse | 5.03 | 4.92 | 4.91 > +series | 0.94 | 0.96 | 0.95 > +spectral-norm | 0.96 | 0.96 | 0.95 > +``` > -- > 2.28.0 > -- Best regards, IM
Part of #5442 --- Issues: https://github.com/tarantool/tarantool/issues/5442 Branch: https://github.com/tarantool/tarantool/tree/skaplun/gh-5442-luajit-memory-profiler Changes in v2: * Removed C API, Tarantool integration and description of additional features -- they will be added in another RFC if necessary. * Removed checking profile is running from the public API. * Added benchmarks and more meaningful example. * Grammar fixes. doc/rfc/5442-luajit-memory-profiler.md | 306 +++++++++++++++++++++++++ 1 file changed, 306 insertions(+) create mode 100644 doc/rfc/5442-luajit-memory-profiler.md diff --git a/doc/rfc/5442-luajit-memory-profiler.md b/doc/rfc/5442-luajit-memory-profiler.md new file mode 100644 index 000000000..720105009 --- /dev/null +++ b/doc/rfc/5442-luajit-memory-profiler.md @@ -0,0 +1,306 @@ +# LuaJIT memory profiler + +* **Status**: In progress +* **Start date**: 24-10-2020 +* **Authors**: Sergey Kaplun @Buristan skaplun@tarantool.org, + Igor Munkin @igormunkin imun@tarantool.org, + Sergey Ostanevich @sergos sergos@tarantool.org +* **Issues**: [#5442](https://github.com/tarantool/tarantool/issues/5442) + +## Summary + +The LuaJIT memory profiler is a toolchain for analyzing the memory usage of a user's +application. + +## Background and motivation + +The garbage collector (GC) is a performance curse for most Lua applications. +The memory usage of a Lua application should be profiled to find various +memory-unoptimized code blocks. If the application has memory leaks, they can be +found with the profiler. + +## Detailed design + +The whole toolchain of memory profiling will be divided into several parts: +1) Prerequisites. +2) Recording information about memory usage and saving it. +3) Reading saved data and displaying it in a human-readable format. + +### Prerequisites + +This section describes additional changes in LuaJIT required for the feature +implementation. This version of the LuaJIT memory profiler does not support +reporting allocations from traces. 
But trace code semantics should be exactly +the same as for the Lua interpreter. So profiling with `jit.off()` should be +enough. + +There are two different representations of functions in LuaJIT: the function's +prototype (`GCproto`) and the function object, the so-called closure (`GCfunc`). +The closures are represented as `GCfuncL` and `GCfuncC` for Lua and C closures +respectively. LuaJIT also has a special function type, the so-called Fast Function. It +is used for LuaJIT builtins. + +Fast function allocation events always belong to the previous frame, taking +tail call optimizations (TCO) into account. + +Assume we have the following Lua chunk named <test.lua>: + +``` +1 jit.off() +2 misc.memprof.start("memprof_new.bin") +3 local function append(str, rep) +4 return string.rep(str, rep) +5 end +6 +7 local t = {} +8 for _ = 1, 1e5 do +9 table.insert(t, +10 append('q', _) +11 ) +12 end +13 misc.memprof.stop() +``` + +The profiler's output looks as follows: +``` +ALLOCATIONS +@test.lua:0, line 10: 100007 5004638934 0 +@test.lua:0, line 5: 1 40 0 +@test.lua:0, line 7: 1 72 0 +@test.lua:0, line 9: 1 48 0 + +REALLOCATIONS +@test.lua:0, line 9: 16 4194496 2097376 + Overrides: + @test.lua:0, line 9 + +@test.lua:0, line 10: 12 262080 131040 + Overrides: + @test.lua:0, line 10 + + +DEALLOCATIONS +INTERNAL: 21 0 2463 +@test.lua:0, line 10: 8 0 1044480 + Overrides: + @test.lua:0, line 10 +``` + +For Lua functions, profile events require determining the line number of the +function definition and the corresponding `GCproto` address. For C functions, the +address alone is enough. If a Fast function is called from a Lua function, we have to +report the Lua function for more meaningful output. Otherwise, we report the C +function. + +So we need to know what type of function the virtual machine (VM) is in at +CALL/RETURN. LuaJIT already has a VM state for C function execution, but none for +Fast functions or Lua functions. So the corresponding VM states will be added. 
+ +To determine the currently allocating coroutine (which may differ from the currently +executing one), a new field named `mem_L` will be added to the `global_State` structure +to keep the coroutine address. This field is set at each reallocation to the corresponding +`L` with which it was called. + +There is already a static function (`lj_debug_getframeline`) in `lj_debug.c` that returns the line number for +the current `BCPos`. It will be added to the debug module +API to be used in the memory profiler. + +### Information recording + +Each allocation/reallocation/deallocation is considered a type of event that is +reported. The event stream has the following format: + +```c +/* +** Event stream format: +** +** stream := symtab memprof +** symtab := see <ljp_symtab.h> +** memprof := prologue event* epilogue +** prologue := 'l' 'j' 'm' version reserved +** version := <BYTE> +** reserved := <BYTE> <BYTE> <BYTE> +** event := event-alloc | event-realloc | event-free +** event-alloc := event-header loc? naddr nsize +** event-realloc := event-header loc? oaddr osize naddr nsize +** event-free := event-header loc? oaddr osize +** event-header := <BYTE> +** loc := loc-lua | loc-c +** loc-lua := sym-addr line-no +** loc-c := sym-addr +** sym-addr := <ULEB128> +** line-no := <ULEB128> +** oaddr := <ULEB128> +** naddr := <ULEB128> +** osize := <ULEB128> +** nsize := <ULEB128> +** epilogue := event-header +** +** <BYTE> : A single byte (no surprises here) +** <ULEB128>: Unsigned integer represented in ULEB128 encoding +** +** (Order of bits below is hi -> lo) +** +** version: [VVVVVVVV] +** * VVVVVVVV: Byte interpreted as a plain integer version number +** +** event-header: [FTUUSSEE] +** * EE : 2 bits for representing allocation event type (AEVENT_*) +** * SS : 2 bits for representing allocation source type (ASOURCE_*) +** * UU : 2 unused bits +** * T : Reserved. 0 for regular events, 1 for the events marked with +** the timestamp mark. 
It is assumed that the time distance between +** two marked events is approximately the same and is equal +** to 1 second. Always zero for now. +** * F : 0 for regular events, 1 for epilogue's *F*inal header +** (if F is set to 1, all other bits are currently ignored) +*/ +``` + +It is enough to know the address of a Lua/C function to identify it. The symbol +table (symtab) is dumped at the start of profiling to avoid determining and writing the line +number of Lua code and the corresponding chunk of code each time a memory event +happens. Each entry contains the address, the Lua chunk definition as the filename, +and the line number of the function's declaration. This table of symbols has the +following format, described in <ljp_symtab.h>: + +```c +/* +** symtab format: +** +** symtab := prologue sym* +** prologue := 'l' 'j' 's' version reserved +** version := <BYTE> +** reserved := <BYTE> <BYTE> <BYTE> +** sym := sym-lua | sym-final +** sym-lua := sym-header sym-addr sym-chunk sym-line +** sym-header := <BYTE> +** sym-addr := <ULEB128> +** sym-chunk := string +** sym-line := <ULEB128> +** sym-final := sym-header +** string := string-len string-payload +** string-len := <ULEB128> +** string-payload := <BYTE> {string-len} +** +** <BYTE> : A single byte (no surprises here) +** <ULEB128>: Unsigned integer represented in ULEB128 encoding +** +** (Order of bits below is hi -> lo) +** +** version: [VVVVVVVV] +** * VVVVVVVV: Byte interpreted as a plain numeric version number +** +** sym-header: [FUUUUUTT] +** * TT : 2 bits for representing symbol type +** * UUUUU : 5 unused bits +** * F : 1 bit marking the end of the symtab (final symbol) +*/ +``` + +So when memory profiling starts, the default allocation function is replaced by a +new allocation function, an additional wrapper that writes the inspected profiling +events. When the profiler stops, the old allocation function is substituted back. 
+ +Starting the profiler from Lua is quite simple: +```lua +local started, err = misc.memprof.start(fname) +``` +Where `fname` is the name of the file where profile events are written. The writer for +this function performs `fwrite()` for each call, retrying in case of `EINTR`. +The final callback calls `fclose()` at the end of profiling. If it is impossible to +open the file for writing or the profiler fails to start, the function returns `nil` +(plus an error message as a second result and a system-dependent error code as +a third result). Otherwise, it returns some true value. + +Stopping the profiler from Lua is simple too: +```lua +local stopped, err = misc.memprof.stop() +``` + +If any error occurs while stopping the profiler (an error when the file +descriptor was closed), `memprof.stop()` returns `nil` (plus an error message as +a second result and a system-dependent error code as a third result). It returns +`true` otherwise. + +If you want to build LuaJIT without the memory profiler, you should build it with +`-DLUAJIT_DISABLE_MEMPROF`. If it is disabled, `misc.memprof.start()` and +`misc.memprof.stop()` always return `false`. + +The memory profiler is expected to be thread safe, so it has a corresponding +lock/unlock of an internal mutex whenever you call `luaM_memprof_*`. If you want +to build LuaJIT without thread safety, use `-DLUAJIT_DISABLE_THREAD_SAFE`. + +### Reading and displaying saved data + +Binary data can be read by the `luajit-parse-memprof` utility. It parses the binary +format provided by the memory profiler and renders it in a human-readable format. + +The usage is very simple: +``` +$ ./luajit-parse-memprof --help +luajit-parse-memprof - parser of the memory usage profile collected + with LuaJIT's memprof. 
+ +SYNOPSIS + +luajit-parse-memprof [options] memprof.bin + +Supported options are: + + --help Show this help and exit +``` + +The plain text of the profiled info has the following format: +``` +@<filename>:<function_line>, line <line where event was detected>: <number of events> <allocated> <freed> +``` +See the example above. + +`INTERNAL` means that these allocations are caused by internal LuaJIT +structures. Note that events are sorted from the most frequent to the least. + +`Overrides` shows which allocation this reallocation overrides. + +## Benchmarks + +Benchmarks were taken from the repo: +[LuaJIT-test-cleanup](https://github.com/LuaJIT/LuaJIT-test-cleanup). + +Example of usage: +```bash +/usr/bin/time -f"array3d %U" ./luajit $BENCH_DIR/array3d.lua 300 >/dev/null +``` + +Benchmark results before and after the patch (lower is better): + +``` + | BEFORE | AFTER,memprof off | AFTER,memprof on +---------------+--------+-------------------+----------------- +array3d | 0.22 | 0.20 | 0.21 +binary-trees | 3.32 | 3.33 | 3.94 +chameneos | 2.92 | 3.18 | 3.12 +coroutine-ring | 0.99 | 1.00 | 0.99 +euler14-bit | 1.04 | 1.05 | 1.03 +fannkuch | 6.77 | 6.69 | 6.64 +fasta | 8.27 | 8.30 | 8.25 +life | 0.48 | 0.48 | 1.03 +mandelbrot | 2.69 | 2.70 | 2.75 +mandelbrot-bit | 1.99 | 2.00 | 2.08 +md5 | 1.57 | 1.61 | 1.56 +nbody | 1.35 | 1.38 | 1.33 +nsieve | 2.11 | 2.19 | 2.09 +nsieve-bit | 1.50 | 1.55 | 1.47 +nsieve-bit-fp | 4.40 | 4.63 | 4.44 +partialsums | 0.54 | 0.58 | 0.55 +pidigits-nogmp | 3.48 | 3.50 | 3.47 +ray | 1.63 | 1.68 | 1.64 +recursive-ack | 0.19 | 0.22 | 0.20 +recursive-fib | 1.62 | 1.71 | 1.63 +scimark-fft | 5.78 | 5.94 | 5.69 +scimark-lu | 3.26 | 3.57 | 3.59 +scimark-sor | 2.34 | 2.35 | 2.33 +scimark-sparse | 5.03 | 4.92 | 4.91 +series | 0.94 | 0.96 | 0.95 +spectral-norm | 0.96 | 0.96 | 0.95 +``` -- 2.28.0
Igor, Thanks for your feedback, I'll send RFC v2 with the patch for the profiler in LuaJIT for more detailed review. On 11.12.20, Igor Munkin wrote: > Sergey, > > Thanks for the clarification! I read the doc once more and answered the > remaining questions below. I guess we have resolved the major points so > I wait for the second version of the RFC. > > On 16.11.20, Sergey Kaplun wrote: <snipped> > > > > + > > > > +Extended functions to control profiler are added to <lmisclib.h>. > > > > +Profiler is configured by this options structure: > > > > + > > > > +```c > > > > +/* Profiler options. */ > > > > +struct luam_Prof_options { > > > > + /* Options for the profile writer and final callback. */ > > > > + void *arg; > > > > + /* > > > > + ** Writer function for profile events. > > > > + ** Should return amount of written bytes on success or zero in case of error. > > > > + */ > > > > + size_t (*writer)(const void *data, size_t len, void *arg); > > > > + /* > > > > + ** Callback on profiler stopping. Required for correctly cleaning > > > > + ** at vm shoutdown when profiler still running. > > > > + ** Returns zero on success. > > > > + */ > > > > + int (*on_stop)(void *arg); > > > > +}; > > > > +``` > > > > > > Well, maybe it's better to introduce a special interface to fill this > > > struct? Something similar to luaE_coveragestart_cb[1]. As a result the > > > structure is encapsulated in LuaJIT, that looks more convenient for the > > > further maintenance. > > > > Yes, but on the other side for each profiler we should create N > > additional interfaces how to start it. > > As well as N additional structs for profiler options. So what? Why? One structure for *all* profilers. It's the point. > <snipped> > > > > > > What does this "Overrides" attribute mean? > > > > What allocation this reallocation overrides. > > Well, I guess I get it, but doubt that "overrides" fits this definition. > It can be named "overwrites". 
> > > > <snipped> > > > > > +#### Dump of Lua universe > > Let's sort the needy from the greedy right here. While reading the RFC > once more, I bethought to move this part to a separate document to not > spoil this one. Thoughts? Totally agree, insofar as it will be provided in the next version of the profiler. > <snipped> > > -- > Best regards, > IM -- Best regards, Sergey Kaplun