[tarantool-patches] Re: [PATCH v1 1/1] implement mp_stack class

Vladimir Davydov vdavydov.dev at gmail.com
Tue Jan 22 23:21:36 MSK 2019


On Tue, Jan 22, 2019 at 03:28:03PM +0300, Kirill Shcherbatov wrote:
> Hi! Thank you for review. I like your idea with rewriting mp_fprint and mp_snprint routines.
> 
> =================================================
> 
> Introduced a new mp_stack class to make complex msgpack parsing.
> The structure is needed in order to facilitate the decoding
> nested MP_ARRAY and MP_MAP msgpack containers without recursion.
> This allows you to determine that the parsing of the current
> container is complete.
> Reworked mp_snprint and mp_fprint routines to use mp_stack
> instead of recursion.
> 
> Needed for #1012
> ---
>  CMakeLists.txt |   1 +
>  msgpuck.c      | 170 +++++++++++++++++++++++++++----------------------
>  msgpuck.h      | 168 ++++++++++++++++++++++++++++++++++++++++++++++++
>  test/msgpuck.c |  24 ++++++-
>  4 files changed, 287 insertions(+), 76 deletions(-)
> 
> diff --git a/CMakeLists.txt b/CMakeLists.txt
> index c129b77..6f62f54 100644
> --- a/CMakeLists.txt
> +++ b/CMakeLists.txt
> @@ -15,6 +15,7 @@ check_c_compiler_flag("-mno-unaligned-access" CC_HAS_MNO_UNALIGNED_ACCESS)
>  add_library(msgpuck STATIC msgpuck.c hints.c)
>  set_target_properties(msgpuck PROPERTIES VERSION 1.0 SOVERSION 1)
>  set_target_properties(msgpuck PROPERTIES OUTPUT_NAME "msgpuck")
> +add_definitions(-DMP_PRINT_MAX_DEPTH=3)

This won't allow us to override this macro in Tarantool cmake file
AFAIU. Come to think of it, I guess we can the define the macro without
touching CMakeLists.txt at all, by simply adding

#ifndef MP_PRINT_MAX_DEPTH
# define MP_RPINT_MAX_DEPTH 32
#endif

to msgpuck.h instead.

>  
>  if (NOT ${PROJECT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR})
>      # Embedded mode, skip tests, documentation and the install targets
> diff --git a/msgpuck.c b/msgpuck.c
> index 72b800c..84b9d63 100644
> --- a/msgpuck.c
> +++ b/msgpuck.c
> @@ -232,84 +232,106 @@ mp_format(char *data, size_t data_size, const char *format, ...)
>  	return res;
>  }
>  
> -#define MP_PRINT(SELF, PRINTF) \
> +#define MP_PRINT(PRINTF) \
>  {										\
> -	switch (mp_typeof(**data)) {						\
> -	case MP_NIL:								\
> -		mp_decode_nil(data);						\
> -		PRINTF("null");							\
> -		break;								\
> -	case MP_UINT:								\
> -		PRINTF("%llu", (unsigned long long) mp_decode_uint(data));	\
> -		break;								\
> -	case MP_INT:								\
> -		PRINTF("%lld", (long long) mp_decode_int(data));		\
> -		break;								\
> -	case MP_STR:								\
> -	case MP_BIN:								\
> -	{									\
> -		uint32_t len = mp_typeof(**data) == MP_STR ?			\
> -			mp_decode_strl(data) : mp_decode_binl(data);		\
> -		PRINTF("\"");							\
> -		const char *s;							\
> -		for (s = *data; s < *data + len; s++) {				\
> -			unsigned char c = (unsigned char ) *s;			\
> -			if (c < 128 && mp_char2escape[c] != NULL) {		\
> -				/* Escape character */				\
> -				PRINTF("%s", mp_char2escape[c]);		\
> +	struct mp_frame mp_frames[MP_PRINT_MAX_DEPTH];				\
> +	struct mp_stack mp_stack;						\

We usually try to shorten local variable names provided it doesn't
hurt readability so I'd rename 'mp_stack' to 'stack' and 'mp_frames'
to 'frames'.

> +	mp_stack_create(&mp_stack, MP_PRINT_MAX_DEPTH, mp_frames);		\
> +	do {									\
> +		bool is_map_value = !mp_stack_is_empty(&mp_stack) &&		\
> +				    mp_stack_top(&mp_stack)->type == MP_MAP &&	\
> +				    mp_stack_top(&mp_stack)->curr % 2 == 0;	\
> +		if (is_map_value)						\
> +			PRINTF(": ");						\

I'd like all map/array formatting to be gathered in one place - the code
would be easier to follow then. I don't see why you need to print ':'
here, '[' and '{' in switch-case, while ',', '}' and ']' in the end of
the loop. Can't you print all those elements (of course, except "[...]"
and "{...}") in the end of the loop, after emitting a token?

> +		switch (mp_typeof(**data)) {					\
> +		case MP_NIL:							\
> +			mp_decode_nil(data);					\
> +			PRINTF("null");						\
> +			break;							\
> +		case MP_UINT:							\
> +			PRINTF("%llu",						\
> +			       (unsigned long long)mp_decode_uint(data));	\
> +			break;							\
> +		case MP_INT:							\
> +			PRINTF("%lld", (long long) mp_decode_int(data));	\
> +			break;							\
> +		case MP_STR:							\
> +		case MP_BIN: {							\
> +			uint32_t len = mp_typeof(**data) == MP_STR ?		\
> +				mp_decode_strl(data) : mp_decode_binl(data);	\
> +			PRINTF("\"");						\
> +			const char *s;						\
> +			for (s = *data; s < *data + len; s++) {			\
> +				unsigned char c = (unsigned char ) *s;		\
> +				if (c < 128 && mp_char2escape[c] != NULL) {	\
> +					/* Escape character */			\
> +					PRINTF("%s", mp_char2escape[c]);	\
> +				} else {					\
> +					PRINTF("%c", c);			\
> +				}						\
> +			}							\
> +			PRINTF("\"");						\
> +			*data += len;						\
> +			break;							\
> +		}								\
> +		case MP_ARRAY: {						\
> +			PRINTF("[");						\
> +			if (!mp_stack_is_full(&mp_stack)) {			\
> +				mp_stack_push(&mp_stack, MP_ARRAY,		\
> +					      mp_decode_array(data));		\
>  			} else {						\
> -				PRINTF("%c", c);				\
> +				/*						\
> +				 * Skip msgpack items that do not		\
> +				 * fit onto the stack.				\
> +				 */						\
> +				PRINTF("...]");					\
> +				mp_next(data);					\
>  			}							\
> +			break;							\
>  		}								\
> -		PRINTF("\"");							\
> -		*data += len;							\
> -		break;								\
> -	}									\
> -	case MP_ARRAY:								\
> -	{									\
> -		uint32_t count = mp_decode_array(data);				\
> -		PRINTF("[");							\
> -		uint32_t i;							\
> -		for (i = 0; i < count; i++) {					\
> -			if (i)							\
> -				PRINTF(", ");					\
> -			SELF(data);						\
> +		case MP_MAP: {							\
> +			PRINTF("{");						\
> +			if (!mp_stack_is_full(&mp_stack)) {			\
> +				mp_stack_push(&mp_stack, MP_MAP,		\
> +					      2 * mp_decode_map(data));		\
> +			} else {						\
> +				/*						\
> +				 * Skip msgpack items that do not		\
> +				 * fit onto the stack.				\
> +				 */						\

I don't like that you copied-and-pasted the comment. IMO it would be
better to make MP_MAP and MP_ARRAY share the same case.

> +				PRINTF("...}");					\
> +				mp_next(data);					\
> +			}							\
> +			break;							\
>  		}								\
> -		PRINTF("]");							\
> -		break;								\
> -	}									\
> -	case MP_MAP:								\
> -	{									\
> -		uint32_t count = mp_decode_map(data);				\
> -		PRINTF("{");							\
> -		uint32_t i;							\
> -		for (i = 0; i < count; i++) {					\
> -			if (i)							\
> +		case MP_BOOL:							\
> +			PRINTF(mp_decode_bool(data) ? "true" : "false");	\
> +			break;							\
> +		case MP_FLOAT:							\
> +			PRINTF("%g", mp_decode_float(data));			\
> +			break;							\
> +		case MP_DOUBLE:							\
> +			PRINTF("%lg", mp_decode_double(data));			\
> +			break;							\
> +		case MP_EXT:							\
> +			mp_next(data);						\
> +			PRINTF("undefined");					\
> +			break;							\
> +		default:							\
> +			mp_unreachable();					\
> +			return -1;						\
> +		}								\
> +		while (!mp_stack_is_empty(&mp_stack)) {				\
> +			struct mp_frame *frame = mp_stack_top(&mp_stack);	\
> +			if (frame->curr > 0 && frame->curr < frame->size &&	\

You check (frame->curr < frame->size) explicitly here and then
implicitly in mp_stack_advance() below. This looks ugly. Please
rearrange the code so as not to access frame->size directly.

> +			    (frame->type == MP_ARRAY || is_map_value))		\
>  				PRINTF(", ");					\
> -			SELF(data);						\
> -			PRINTF(": ");						\
> -			SELF(data);						\
> +			if (!mp_stack_advance(&mp_stack))			\
> +				break;						\

Let's please rewrite this function without the outer loop - it would
shrink the diff making the code easier for review. All you need to do
is replace this 'break' with 'goto next', place 'next' label at the
beginning of this function, and remove the outer loop altogether.

> +			PRINTF(frame->type == MP_MAP ? "}" : "]");		\
> +			mp_stack_pop(&mp_stack);				\
>  		}								\
> -		PRINTF("}");							\
> -		break;								\
> -	}									\
> -	case MP_BOOL:								\
> -		PRINTF(mp_decode_bool(data) ? "true" : "false");		\
> -		break;								\
> -	case MP_FLOAT:								\
> -		PRINTF("%g", mp_decode_float(data));				\
> -		break;								\
> -	case MP_DOUBLE:								\
> -		PRINTF("%lg", mp_decode_double(data));				\
> -		break;								\
> -	case MP_EXT:								\
> -		mp_next(data);							\
> -		PRINTF("undefined");						\
> -		break;								\
> -	default:								\
> -		mp_unreachable();						\
> -		return -1;							\
> -	}									\
> +	} while (!mp_stack_is_empty(&mp_stack));									\

Bad formatting: too many spaces before \

>  }
>  
>  static inline int
> @@ -323,8 +345,7 @@ mp_fprint_internal(FILE *file, const char **data)
>  	total_bytes += bytes;							\
>  } while (0)
>  #define PRINT(...) HANDLE(fprintf, __VA_ARGS__)
> -#define SELF(...) HANDLE(mp_fprint_internal, __VA_ARGS__)
> -MP_PRINT(SELF, PRINT)
> +MP_PRINT(PRINT)

You don't need HANDLE anymore - you can define PRINT directly.
Also, you don't seem to need mp_fprint_internal.

>  #undef HANDLE
>  #undef SELF

Pointless undef - SELF isn't defined anymore.

>  #undef PRINT
> @@ -359,8 +380,7 @@ mp_snprint_internal(char *buf, int size, const char **data)
>  	}									\
>  } while (0)
>  #define PRINT(...) HANDLE(snprintf, __VA_ARGS__)
> -#define SELF(...) HANDLE(mp_snprint_internal, __VA_ARGS__)
> -MP_PRINT(SELF, PRINT)
> +MP_PRINT(PRINT)
>  #undef HANDLE
>  #undef SELF

Ditto.

>  #undef PRINT
> diff --git a/msgpuck.h b/msgpuck.h
> index b0d4967..de8b2d0 100644
> --- a/msgpuck.h
> +++ b/msgpuck.h
> @@ -1176,6 +1176,123 @@ mp_next(const char **data);
>  MP_PROTO int
>  mp_check(const char **data, const char *end);
>  
> +/**
> + * The maximum depth of the message pack nesting maps and arrays
> + * supported by mp_fprint and mp_snprintf routines. Upon reaching
> + * the maximum depth, the parsing will not be carried out.

The maximum msgpack nesting depth supported by mp_snprint().
Everything beyond that will be omitted (replaced with "...").

> + */
> +#ifndef MP_PRINT_MAX_DEPTH
> +#error "MP_PRINT_MAX_DEPTH must be defined"
> +#endif /* MP_PRINT_MAX_DEPTH */
> +
> +/** Message pack MP_MAP or MP_ARRAY container descriptor. */
> +struct mp_frame {
> +	/** MP frame type calculated with mp_typeof(). */
> +	enum mp_type type;
> +	/**
> +	 * Total number of items in MP_MAP or MP_ARRAY container
> +	 * calculated with mp_decode_map() or mp_decode_array().
> +	 */
> +	uint32_t size;
> +	/**
> +	 * Count of items already processed. Must be less than or
> +	 * equal to mp_frame::size member.
> +	 */
> +	uint32_t curr;
> +};
> +
> +/**
> + * Stack of map/array descriptors mp_frame to do complex msgpack
> + * parse. The structure is needed in order to facilitate
> + * decoding nested MP_ARRAY and MP_MAP msgpack containers without
> + * recursion. This allows to determine that the parsing of the
> + * current container is complete.
> +*/
> +struct mp_stack {
> +	/** The maximum stack depth. */
> +	uint32_t size;
> +	/**
> +	 * Count of used stack frames. Corresponds to the index
> +	 * in the array to perform the push operation. Must be
> +	 * less or equal to mp_stack::size member.
> +	 */
> +	uint32_t used;
> +	/**
> +	 * Array of size mp_stack::size of mp_frames.
> +	 */
> +	struct mp_frame *frames;
> +};
> +
> +/**
> + * \brief Initialize mp_stack \a stack of specified size \a size
> + * and user-allocated array \a frames.
> + * The \a frames allocation must have at least \a size mp_frame
> + * items.
> + * \param stack - the pointer to a mp_stack to initialize.
> + * \param size - stack size, count of stack::frames to use.
> + * \param frames - mp_frame preallocated array of size \a size
> + *                 of struct mp_frame items
> + */
> +MP_PROTO void
> +mp_stack_create(struct mp_stack *stack, uint32_t size, struct mp_frame *frames);
> +
> +/**
> + * \brief Test if mp_stack \a stack is empty.
> + * \param stack - the pointer to a mp_stack to a stack to test.
> + * \retval true if mp_stack is empty, false otherwise.
> + */
> +MP_PROTO bool
> +mp_stack_is_empty(struct mp_stack *stack);
> +
> +/**
> + * \brief Test if mp_stack \a stack is full.
> + * \param stack - the pointer to a mp_stack to a stack to test.
> + * \retval true if mp_stack is full, false otherwise.
> + */
> +MP_PROTO bool
> +mp_stack_is_full(struct mp_stack *stack);
> +
> +/**
> + * \brief Pop the top mp_stack \a stack frame.
> + * \param stack - the pointer to a mp_stack to operate with.
> + * \pre mp_stack_is_empty(stack) == false
> + */
> +MP_PROTO void
> +mp_stack_pop(struct mp_stack *stack);
> +
> +/**
> + * \brief Get a pointer to a top mp_stack \a stack frame.
> + * \param stack - the pointer to a mp_stack to operate with.
> + * \retval a pointer to a top stack frame.
> + * \pre mp_stack_is_empty(stack) == false
> + */
> +MP_PROTO struct mp_frame *
> +mp_stack_top(struct mp_stack *stack);
> +
> +/**
> + * \brief Construct a new mp_frame and push it on to mp_stack
> + * \a stack.
> + * \param stack - the pointer to a stack to operate with.
> + * \param type - the type of mp_frame to create.
> + * \param size - size size of mp_frame to create.
> + * \pre mp_stack_is_full(stack) == false
> + */
> +MP_PROTO void
> +mp_stack_push(struct mp_stack *stack, enum mp_type type, uint32_t size);
> +
> +/**
> + * \brief Advance a mp_stack_top()::\a curr attribute of the
> + * \a stack. The function also signals when it moves over the
> + * last item.
> + * \param stack - the pointer to a stack to operate with
> + * \retval true if the top frame has been already parsed:

'has already been parsed' would be grammatically correct.

Anyway, let's not use 'already' at all here - it's confusing. Let's just
say, 'if the top stack frame has been fully processed'?

Also, I'd return 'false' in this case, because they typically return
false on the end of input.

> + *         mp_stack_top()::\a curr > mp_stack_top()::\a size;
> + *         false otherwise.
> + * \pre mp_stack_is_empty(stack) == false
> + */
> +MP_PROTO bool
> +mp_stack_advance(struct mp_stack *stack);
> +
>  /*
>   * }}}
>   */
> @@ -1184,6 +1301,57 @@ mp_check(const char **data, const char *end);
>   * {{{ Implementation
>   */
>  
> +MP_IMPL void
> +mp_stack_create(struct mp_stack *stack, uint32_t size, struct mp_frame *frames)
> +{
> +	stack->frames = frames;
> +	stack->size = size;
> +	stack->used = 0;
> +}
> +
> +MP_IMPL bool
> +mp_stack_is_empty(struct mp_stack *stack)
> +{
> +	return stack->used == 0;
> +}
> +
> +MP_IMPL bool
> +mp_stack_is_full(struct mp_stack *stack)
> +{
> +	return stack->used >= stack->size;
> +}
> +
> +MP_IMPL void
> +mp_stack_pop(struct mp_stack *stack)
> +{
> +	assert(!mp_stack_is_empty(stack));
> +	--stack->used;
> +}
> +
> +MP_IMPL struct mp_frame *
> +mp_stack_top(struct mp_stack *stack)
> +{
> +	assert(!mp_stack_is_empty(stack));
> +	return &stack->frames[stack->used - 1];
> +}
> +
> +MP_IMPL void
> +mp_stack_push(struct mp_stack *stack, enum mp_type type, uint32_t size)
> +{
> +	assert(!mp_stack_is_full(stack));
> +	uint32_t idx = stack->used++;
> +	stack->frames[idx].type = type;
> +	stack->frames[idx].size = size;
> +	stack->frames[idx].curr = 0;
> +}
> +
> +MP_IMPL bool
> +mp_stack_advance(struct mp_stack *stack)
> +{
> +	struct mp_frame *frame = mp_stack_top(stack);
> +	return ++frame->curr > frame->size;
> +}
> +

These functions go before '\cond false' while the rest of MP_IMPL goes
after it. I don't know why it is like that, but I'm pretty sure mp_stack
implementation should go after '\cond false' too :) Let's please move
these functions to the end of the implementation section - I'd like
MP_IMPL to be defined in the same order as MP_PROTO.

>  /** \cond false */
>  extern const enum mp_type mp_type_hint[];
>  extern const int8_t mp_parser_hint[];
> diff --git a/test/msgpuck.c b/test/msgpuck.c
> index 9265453..46c9eed 100644
> --- a/test/msgpuck.c
> +++ b/test/msgpuck.c
> @@ -764,7 +764,7 @@ test_format(void)
>  int
>  test_mp_print()
>  {
> -	plan(10);
> +	plan(11);
>  	header();
>  
>  	char msgpack[128];
> @@ -838,6 +838,28 @@ test_mp_print()
>  	int rc = mp_fprint(stdin, msgpack);
>  	is(rc, -1, "mp_fprint I/O error");
>  
> +	/* Test mp_snprint parsing stack overflow. */
> +	char mp_buf[MP_PRINT_MAX_DEPTH * mp_sizeof_array(1) + mp_sizeof_uint(1)];
> +	int json_sz = 2 * (MP_PRINT_MAX_DEPTH + 1) + 3 + 1;

Let's please not use 'json' in variable names - we don't mention json
anywhere else in the library.

> +	char json[json_sz];

Better allocate the buffers with malloc().

> +	char *buf_wptr = mp_buf;
> +	char *json_wptr = json;
> +	for (int i = 0; i <= 2 * (MP_PRINT_MAX_DEPTH + 1); i++) {
> +		if (i < MP_PRINT_MAX_DEPTH + 1) {
> +			buf_wptr = mp_encode_array(buf_wptr, 1);
> +			*(json_wptr++) = '[';
> +		} else if (i == MP_PRINT_MAX_DEPTH + 1) {
> +			buf_wptr = mp_encode_uint(buf_wptr, 1);
> +			json_wptr += sprintf(json_wptr, "...");
> +		} else {
> +			*(json_wptr++) = ']';
> +		}
> +	}
> +	*(json_wptr++) = '\0';
> +	char decoded[json_sz];
> +	rc = mp_snprint(decoded, json_sz, mp_buf);
> +	ok(rc == json_sz - 1 && memcmp(decoded, json, json_sz - 1) == 0,
> +	   "mp_snprint parsing stack overflow test");

"mp_snprint max nesting depth"

>  	footer();
>  	return check_plan();
>  }



More information about the Tarantool-patches mailing list