From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Date: Tue, 23 Apr 2019 15:21:46 +0300 From: Konstantin Osipov Subject: Re: [tarantool-patches] [PATCH] Add MsgPack ext types handling. Message-ID: <20190423122146.GA5668@chai> References: <20190423113732.20786-1-sergepetrenko@tarantool.org> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20190423113732.20786-1-sergepetrenko@tarantool.org> To: tarantool-patches@freelists.org Cc: vdavydov.dev@gmail.com, Serge Petrenko List-ID: * Serge Petrenko [19/04/23 15:09]: I mentioned before that this would break all clients. It's high time to reconsider our in-memory storage format and make it different from serialization format: we're seeing significant performance penalty with serialization to and from msgpack in SQL, and DECIMAL is a yet another blow on the single-format-approach: our clients are simply not prepared to handle non-standard extensions. Let's investigate what it takes to store decimal as an mp ext, and serialize as mp string. > Add the ability to encode/decode MsgPack ext types, > introduce the first extension type: MP_EXT_DECIMAL: it will be used to > store decimals in tarantool. > > Needed-for tarantool/tarantool#692 > --- > https://github.com/tarantool/msgpuck/tree/sp/add-extension-types > https://github.com/tarantool/tarantool/issues/692 > > hints.c | 24 +++++++++ > msgpuck.c | 20 +++++++- > msgpuck.h | 133 +++++++++++++++++++++++++++++++++++++++++++++++++ > test/msgpuck.c | 7 ++- > 4 files changed, 178 insertions(+), 6 deletions(-) > > diff --git a/hints.c b/hints.c > index be859e9..0ab4f4d 100644 > --- a/hints.c > +++ b/hints.c > @@ -672,3 +672,27 @@ const char *mp_char2escape[128] = { > NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, > NULL, NULL, NULL, NULL, NULL, NULL, NULL, "\\u007f" > }; > + > +/* > + * This lookup table is used by mp_encode_ext() to > + * determine ext code (fixext 1, fixext 2, fixext 4, fixext 8, > + * fixext 16) to encode using size. > + */ > +const uint8_t mp_ext_hint[16] = { > + 0xd4, /* 1 */ > + 0xd5, /* 2 */ > + 0xd6, /* 3 */ > + 0xd6, /* 4 */ > + 0xd7, /* 5 */ > + 0xd7, /* 6 */ > + 0xd7, /* 7 */ > + 0xd7, /* 8 */ > + 0xd8, /* 9 */ > + 0xd8, /* 10 */ > + 0xd8, /* 11 */ > + 0xd8, /* 12 */ > + 0xd8, /* 13 */ > + 0xd8, /* 14 */ > + 0xd8, /* 15 */ > + 0xd8 /* 16 */ > +}; > diff --git a/msgpuck.c b/msgpuck.c > index d0ffb83..04aeb0f 100644 > --- a/msgpuck.c > +++ b/msgpuck.c > @@ -33,6 +33,17 @@ > #define MP_LIBRARY 1 > #include "msgpuck.h" > > +const char * > +mp_ext_type_str(enum mp_ext_type type) > +{ > + switch(type) { > + case MP_EXT_DECIMAL: > + return "decimal"; > + default: > + return "undefined"; > + } > +} > + > size_t > mp_vformat(char *data, size_t data_size, const char *format, va_list vl) > { > @@ -298,9 +309,14 @@ next: \ > PRINTF("%lg", mp_decode_double(&data)); \ > break; \ > case MP_EXT: \ > - mp_next(&data); \ > - PRINTF("undefined"); \ > + { \ > + uint8_t type; \ > + uint32_t len; \ > + len = mp_decode_ext(&data, &len, &type); \ > + PRINTF("%s", mp_ext_type_str(type)); \ > + data += len; \ > break; \ > + } \ > default: \ > mp_unreachable(); \ > return -1; \ > diff --git a/msgpuck.h b/msgpuck.h > index 6b29cd6..0a7d230 100644 > --- a/msgpuck.h > +++ b/msgpuck.h > @@ -382,6 +382,13 @@ enum mp_type { > MP_EXT > }; > > +/** > + * \brief MsgPack extension data types. > + */ > +enum mp_ext_type { > + MP_EXT_DECIMAL = 0 > +}; > + > /** > * \brief Determine MsgPack type by a first byte \a c of encoded data. > * > @@ -532,6 +539,49 @@ mp_check_map(const char *cur, const char *end); > MP_PROTO uint32_t > mp_decode_map(const char **data); > > +/** > + * \brief calculate exact buffer size needed to store > + * ext value of length \a len. > + * \param len value length in bytes. > + * \retval buffer size in bytes > + */ > +MP_PROTO uint32_t > +mp_sizeof_ext(uint32_t len); > + > +/** > + * \brief Encode extension header with \a type and > + * value length \a len. > + * The value must be encoded after the header. > + * \return \a data + \link mp_sizeof_ext() mp_sizeof_ext(size)\endlink > + */ > +MP_PROTO char * > +mp_encode_ext(char *data, uint8_t type, uint32_t len); > + > +/** > + * \brief Check that \a cur buffer has enough bytes to decode a map header. > + * \param cur buffer > + * \param end end of the buffer > + * \retval 0 - buffer has enough bytes > + * \retval > 0 - the numbeer of remaining bytes to read > + * \pre cur < end > + * \pre mp_typeof(*cur) == MP_EXT > + */ > +MP_PROTO MP_PURE ptrdiff_t > +mp_check_ext(const char *cur, const char *end); > + > +/** > + * \brief Decode an extension header from MsgPack \a data. > + * > + * The extension type value must be decoded after the header. > + * \param data - the pointer to a buffer. > + * \param len - decoded length of the following value. > + * \param type - decoded type of the following value. > + * \retval - the length of the following ext value. > + * \post *data = *data + mp_sizeof_ext(length) > + */ > +MP_PROTO uint32_t > +mp_decode_ext(const char **data, uint32_t *len, uint8_t *type); > + > /** > * \brief Calculate exact buffer size needed to store an integer \a num. > * Maximum return value is 9. For performance reasons you can preallocate > @@ -1326,6 +1376,7 @@ mp_frame_advance(struct mp_frame *frame); > extern const enum mp_type mp_type_hint[]; > extern const int8_t mp_parser_hint[]; > extern const char *mp_char2escape[]; > +extern const uint8_t mp_ext_hint[]; > > MP_IMPL MP_ALWAYSINLINE enum mp_type > mp_typeof(const char c) > @@ -1462,6 +1513,88 @@ mp_decode_map(const char **data) > } > } > > +MP_IMPL uint32_t > +mp_sizeof_ext(uint32_t len) > +{ > + if (len <= 16) return 2; > + if (len <= UINT8_MAX) return 3; > + if (len <= UINT16_MAX) return 4; > + else return 5; > +} > + > +MP_IMPL char * > +mp_encode_ext(char *data, uint8_t type, uint32_t len) > +{ > + /* > + * zero-length should be encoded in EXT 8, because > + * fixext always assumes exact length (1, 2, 4, 8 and 16). > + */ > + if (len <= 16 && len > 0) { > + data = mp_store_u8(data, mp_ext_hint[len-1]); > + } else if (len <= UINT8_MAX) { > + data = mp_store_u8(data, 0xc7); > + data = mp_store_u8(data, (uint8_t) len); > + } else if (len <= UINT16_MAX) { > + data = mp_store_u8(data, 0xc8); > + data = mp_store_u16(data, (uint16_t) len); > + } else { > + data = mp_store_u8(data, 0xc9); > + data = mp_store_u32(data, (uint32_t) len); > + } > + data = mp_store_u8(data, type); > + return data; > +} > + > +MP_IMPL ptrdiff_t > +mp_check_ext(const char *cur, const char *end) > +{ > + assert(cur < end); > + assert(mp_typeof(*cur) == MP_EXT); > + uint8_t c = mp_load_u8(&cur); > + switch (c) { > + case 0xd4: > + case 0xd5: > + case 0xd6: > + case 0xd7: > + case 0xd8: > + return 1 - (end - cur); > + case 0xc7: > + case 0xc8: > + return c - 0xc4 - (end - cur); > + case 0xc9: > + return 6 - (end - cur); > + default: > + mp_unreachable(); > + } > +} > + > +MP_IMPL uint32_t > +mp_decode_ext(const char **data, uint32_t *len, uint8_t *type) { > + uint8_t c = mp_load_u8(data); > + switch (c) { > + case 0xd4: > + case 0xd5: > + case 0xd6: > + case 0xd7: > + case 0xd8: > + *len = 1u << (c - 0xd4); > + break; > + case 0xc7: > + *len = mp_load_u8(data); > + break; > + case 0xc8: > + *len = mp_load_u16(data); > + break; > + case 0xc9: > + *len = mp_load_u32(data); > + break; > + default: > + mp_unreachable(); > + } > + *type = mp_load_u8(data); > + return *len; > +} > + > MP_IMPL uint32_t > mp_sizeof_uint(uint64_t num) > { > diff --git a/test/msgpuck.c b/test/msgpuck.c > index 3b31bfc..5ddaba7 100644 > --- a/test/msgpuck.c > +++ b/test/msgpuck.c > @@ -789,9 +789,8 @@ test_mp_print() > d = mp_encode_double(d, 3.14); > d = mp_encode_uint(d, 100); > d = mp_encode_uint(d, 500); > - *d++ = 0xd4; /* let's pack smallest fixed ext */ > - *d++ = 0; > - *d++ = 0; > + /* let's pack zero-length ext */ > + d = mp_encode_ext(d, MP_EXT_DECIMAL, 0); > char bin[] = "\x12test\x34\b\t\n\"bla\\-bla\"\f\r"; > d = mp_encode_bin(d, bin, sizeof(bin)); > d = mp_encode_map(d, 0); > @@ -800,7 +799,7 @@ test_mp_print() > const char *expected = > "[-5, 42, \"kill bill\", [], " > "{\"bool true\": true, \"bool false\": false, \"null\": null, " > - "\"float\": 3.14, \"double\": 3.14, 100: 500}, undefined, " > + "\"float\": 3.14, \"double\": 3.14, 100: 500}, decimal, " > "\"\\u0012test4\\b\\t\\n\\\"bla\\\\-bla\\\"\\f\\r\\u0000\", {}]"; > int esize = strlen(expected); > > -- > 2.20.1 (Apple Git-117) > -- Konstantin Osipov, Moscow, Russia, +7 903 626 22 32 http://tarantool.io - www.twitter.com/kostja_osipov