[PATCH v8 2/5] box: refactor tuple_field_raw_by_path routine

Wed Jan 23 16:15:50 MSK 2019

On Wed, Jan 16, 2019 at 04:44:40PM +0300, Kirill Shcherbatov wrote:
> diff --git a/src/box/tuple.h b/src/box/tuple.h
> index 83e5b7013..4368dac4e 100644
> --- a/src/box/tuple.h
> +++ b/src/box/tuple.h
> @@ -513,6 +513,23 @@ tuple_field(const struct tuple *tuple, uint32_t fieldno)
>  			       tuple_field_map(tuple), fieldno);
>  }
>  
> +/**
> + * Get a field refereed by index @part in tuple.
> + * @param tuple Tuple to get the field from.
> + * @param part Index part to use.

There's no 'part' in this function. Please fix the comment.

> + * @param path Relative JSON path to field data.
> + * @param path_len The length of the @path.
> + * @retval Field data if the field exists or NULL.
> + */
> +static inline const char *
> +tuple_field_by_path(const struct tuple *tuple, uint32_t fieldno,
> +		    const char *path, uint32_t path_len)
> +{
> +	return tuple_field_raw_by_path(tuple_format(tuple), tuple_data(tuple),
> +				       tuple_field_map(tuple), fieldno,
> +				       path, path_len);
> +}
> +
>  /**
>   * Get a field refereed by index @part in tuple.
>   * @param tuple Tuple to get the field from.
> @@ -532,38 +549,16 @@ tuple_field_by_part(const struct tuple *tuple, struct key_part *part)
>   * @param path Field JSON path.
>   * @param path_len Length of @a path.
>   * @param path_hash Hash of @a path.
> - * @param[out] field Found field, or NULL, if not found.
> - *
> - * @retval  0 Success.
> - * @retval -1 Error in JSON path.
> - */
> -static inline int
> -tuple_field_by_path(const struct tuple *tuple, const char *path,
> -                    uint32_t path_len, uint32_t path_hash,
> -                    const char **field)
> -{
> -	return tuple_field_raw_by_path(tuple_format(tuple), tuple_data(tuple),
> -	                               tuple_field_map(tuple), path, path_len,
> -	                               path_hash, field);
> -}
> -
> -/**
> - * Get tuple field by its name.
> - * @param tuple Tuple to get field from.
> - * @param name Field name.
> - * @param name_len Length of @a name.
> - * @param name_hash Hash of @a name.
>   *
> - * @retval not NULL MessagePack field.
> - * @retval     NULL No field with @a name.
> + * @retval field data if field exists or NULL

Missing dot (.)

Please update the comment so that it's clear what's the difference
between tuple_field_by_full_path and tuple_field_path. Don't forget
about tuple_field_raw_by_full_path.

>   */
>  static inline const char *
> -tuple_field_by_name(const struct tuple *tuple, const char *name,
> -		    uint32_t name_len, uint32_t name_hash)
> +tuple_field_by_full_path(const struct tuple *tuple, const char *path,
> +			 uint32_t path_len, uint32_t path_hash)
>  {
> -	return tuple_field_raw_by_name(tuple_format(tuple), tuple_data(tuple),
> -				       tuple_field_map(tuple), name, name_len,
> -				       name_hash);
> +	return tuple_field_raw_by_full_path(tuple_format(tuple), tuple_data(tuple),
> +					    tuple_field_map(tuple),
> +					    path, path_len, path_hash);
>  }
>  
>  /**
> diff --git a/src/box/tuple_format.h b/src/box/tuple_format.h
> index 30b93b610..d32bb91f2 100644
> --- a/src/box/tuple_format.h
> +++ b/src/box/tuple_format.h
> @@ -397,27 +397,33 @@ tuple_field_raw(struct tuple_format *format, const char *tuple,
>  }
>  
>  /**
> - * Get tuple field by its name.
> - * @param format Tuple format.
> - * @param tuple MessagePack tuple's body.
> - * @param field_map Tuple field map.
> - * @param name Field name.
> - * @param name_len Length of @a name.
> - * @param name_hash Hash of @a name.
> - *
> - * @retval not NULL MessagePack field.
> - * @retval     NULL No field with @a name.
> + * Retrieve msgpack data by JSON path.
> + * @param data Pointer to msgpack with data.
> + * @param path The path to process.
> + * @param path_len The length of the @path.
> + * @retval 0 On success.
> + * @retval >0 On path parsing error, invalid character position.
> + */
> +int
> +tuple_field_go_to_path(const char **data, const char *path, uint32_t path_len);

The error position returned by this function isn't used anywhere.
Let's make it return -1 on any error in the scope of this patch.

> +
> +/**
> + * Get a field at the specific position in this MessagePack
> + * array by fieldno and path.

All related functions use doxygen comments: see tuple_field_raw,
tuple_field_raw_by_full_path, tuple_field_by_part. Let's write
a proper comment for this one too for the sake of consistency.

>   */
>  static inline const char *
> -tuple_field_raw_by_name(struct tuple_format *format, const char *tuple,
> -			const uint32_t *field_map, const char *name,
> -			uint32_t name_len, uint32_t name_hash)
> +tuple_field_raw_by_path(struct tuple_format *format, const char *tuple,
> +			const uint32_t *field_map, uint32_t fieldno,
> +			const char *path, uint32_t path_len)
>  {
> -	uint32_t fieldno;
> -	if (tuple_fieldno_by_name(format->dict, name, name_len, name_hash,
> -				  &fieldno) != 0)
> +	tuple = tuple_field_raw(format, tuple, field_map, fieldno);
> +	if (path == NULL)
> +		return tuple;
> +	if (unlikely(tuple == NULL))
>  		return NULL;
> -	return tuple_field_raw(format, tuple, field_map, fieldno);
> +	if (unlikely(tuple_field_go_to_path(&tuple, path, path_len) != 0))
> +		return NULL;
> +	return tuple;
>  }
>  
>  /**

------------------------------------------------------------------------

To explain what I don't like about the refactoring done by this and
following patches, let me paste the resulting code here and comment
on it.

> /**
>  * Get a field at the specific position in this MessagePack array.
>  * Returns a pointer to MessagePack data.
>  * @param format tuple format
>  * @param tuple a pointer to MessagePack array
>  * @param field_map a pointer to the LAST element of field map
>  * @param field_no the index of field to return
>  *
>  * @returns field data if field exists or NULL
>  * @sa tuple_init_field_map()
>  */
> static inline const char *
> tuple_field_raw(struct tuple_format *format, const char *tuple,
> 		const uint32_t *field_map, uint32_t field_no)
> {
> 	if (likely(field_no < format->index_field_count)) {
> 		/* Indexed field */
> 
> 		if (field_no == 0) {
> 			mp_decode_array(&tuple);
> 			return tuple;
> 		}
> 
> 		int32_t offset_slot = tuple_format_field(format,
> 					field_no)->offset_slot;
> 		if (offset_slot != TUPLE_OFFSET_SLOT_NIL) {
> 			if (field_map[offset_slot] != 0)
> 				return tuple + field_map[offset_slot];
> 			else
> 				return NULL;
> 		}

Note field_map manipulation.

> 	}
> 	ERROR_INJECT(ERRINJ_TUPLE_FIELD, return NULL);
> 	uint32_t field_count = mp_decode_array(&tuple);
> 	if (unlikely(field_no >= field_count))
> 		return NULL;
> 	for (uint32_t k = 0; k < field_no; k++)
> 		mp_next(&tuple);
> 	return tuple;
> }
> 
> /**
>  * Retrieve msgpack data by JSON path.
>  * @param data Pointer to msgpack with data.
>  * @param path The path to process.
>  * @param path_len The length of the @path.
>  * @retval 0 On success.
>  * @retval >0 On path parsing error, invalid character position.
>  */
> int
> tuple_field_go_to_path(const char **data, const char *path, uint32_t path_len);
> 
> /**
>  * Get a field at the specific position in this MessagePack
>  * array by fieldno and path.
>  * The offset_slot[out] may be specified to save it on use,
>  * set NULL otherwise.
>  */
> static inline const char *
> tuple_field_raw_by_path(struct tuple_format *format, const char *tuple,
> 			const uint32_t *field_map, uint32_t fieldno,
> 			const char *path, uint32_t path_len,
> 			int32_t *offset_slot)
> {
> 	if (offset_slot != NULL)
> 		*offset_slot = TUPLE_OFFSET_SLOT_NIL;
> 	if (likely(path != NULL && fieldno < format->index_field_count)) {
> 		/* Indexed field */
> 		struct tuple_field *field =
> 			tuple_format_field_by_path(format, fieldno,
> 						   path, path_len);
> 		if (field == NULL)
> 			goto parse;
> 		assert(field != NULL);
> 		if (offset_slot != NULL)
> 			*offset_slot = field->offset_slot;
> 		if (field->offset_slot != TUPLE_OFFSET_SLOT_NIL) {
> 			assert(-field->offset_slot * sizeof(uint32_t) <=
> 			       format->field_map_size);
> 			return field_map[field->offset_slot] == 0 ?
> 			       NULL : tuple + field_map[field->offset_slot];
> 		}

Note another field_map manipulation, copied from tuple_field_raw.

> 	}
> parse:
> 	tuple = tuple_field_raw(format, tuple, field_map, fieldno);

And here we'll get exactly the same piece of code inlined in this
function...

> 	if (path == NULL)
> 		return tuple;
> 	if (unlikely(tuple == NULL))
> 		return NULL;
> 	if (unlikely(tuple_field_go_to_path(&tuple, path, path_len) != 0))
> 		return NULL;
> 	return tuple;
> }
> 
> /**
>  * Get tuple field by its path.
>  * @param format Tuple format.
>  * @param tuple MessagePack tuple's body.
>  * @param field_map Tuple field map.
>  * @param path Field path.
>  * @param path_len Length of @a path.
>  * @param path_hash Hash of @a path.
>  *
>  * @retval field data if field exists or NULL
>  */
> const char *
> tuple_field_raw_by_full_path(struct tuple_format *format, const char *tuple,
> 			     const uint32_t *field_map, const char *path,
> 			     uint32_t path_len, uint32_t path_hash);
> 
> /**
>  * Get a tuple field pointed to by an index part.
>  * @param format Tuple format.
>  * @param tuple A pointer to MessagePack array.
>  * @param field_map A pointer to the LAST element of field map.
>  * @param part Index part to use.
>  * @retval Field data if the field exists or NULL.
>  */
> static inline const char *
> tuple_field_by_part_raw(struct tuple_format *format, const char *data,
> 			const uint32_t *field_map, struct key_part *part)
> {
> 	if (likely(part->format_epoch == format->epoch)) {
> 		int32_t offset_slot = part->offset_slot_cache;
> 		assert(-offset_slot * sizeof(uint32_t) <=
> 		       format->field_map_size);
> 		return field_map[offset_slot] != 0 ?
> 		       data + field_map[offset_slot] : NULL;

Note yet another field_map manipulation.

It is ugly that we have three places in the code where we use
field_map+offset_slot in exactly the same fashion. What's especially
ugly that this code will be multiplied by the compiler: two times for
tuple_field_raw_by_path and three times for tuple_field_by_part_raw.

I think that

 1. We should implement tuple_field_raw_by_path without tuple_field_raw.
 2. tuple_field_raw should be an alias for tuple_field_raw_by_path with
    path = NULL. That's OK - the compiler will remove unused code then
    so it shouldn't result in any performance penalty.
 3. tuple_field_by_part_raw shouldn't check field_map directly. Instead
    it should pass the cached offset_slot to tuple_field_raw_by_path to
    do the job. I mean offset_slot ptr should become an in-out parameter
    of tuple_field_raw_by_path: if it's not NULL/OFFSET_SLOT_NIL then it
    should use it, otherwise it should try to look up offset_slot in
    tuple_format; if found and offset_slot is not NULL it should update
    the offset_slot with it.

> 	} else {
> 		assert(format->epoch != 0);
> 		int32_t offset_slot;
> 		const char *field =
> 			tuple_field_raw_by_path(format, data, field_map,
> 						part->fieldno, part->path,
> 						part->path_len, &offset_slot);
> 		if (offset_slot != TUPLE_OFFSET_SLOT_NIL) {
> 			part->format_epoch = format->epoch;
> 			part->offset_slot_cache = offset_slot;
> 		}
> 		return field;
> 	}
> }