[tarantool-patches] [PATCH] json: clarify bad syntax error messages

Tue Sep 24 10:41:12 MSK 2019

Count lines in the json parsing structure. It is needed to print
the number of line and column where a mistake was made.

Closes #3316
---
Branch: https://github.com/tarantool/tarantool/tree/romanhabibov/gh-3316-json-decode
Issue: https://github.com/tarantool/tarantool/issues/3316

 test/app-tap/json.test.lua        | 26 ++++++++++++++++-
 third_party/lua-cjson/lua_cjson.c | 47 ++++++++++++++++---------------
 2 files changed, 50 insertions(+), 23 deletions(-)

diff --git a/test/app-tap/json.test.lua b/test/app-tap/json.test.lua
index 10f6f4ab2..f583ea653 100755
--- a/test/app-tap/json.test.lua
+++ b/test/app-tap/json.test.lua
@@ -22,7 +22,7 @@ end
 
 tap.test("json", function(test)
     local serializer = require('json')
-    test:plan(32)
+    test:plan(37)
 
     test:test("unsigned", common.test_unsigned, serializer)
     test:test("signed", common.test_signed, serializer)
@@ -121,4 +121,28 @@ tap.test("json", function(test)
     test:is(serializer.encode(rec4),
             '{"a":{"a":null,"b":null},"b":{"a":null,"b":null}}')
     serializer.cfg({encode_max_depth = orig_encode_max_depth})
+
+    --
+    -- gh-3316: Make sure that line number is printed in the error
+    -- message.
+    --
+    local _, err_msg
+    _, err_msg = pcall(serializer.decode, 'a{"hello": \n"world"}')
+    test:ok(string.find(err_msg, 'line 1 at character 1') ~= nil,
+            'mistake on first line')
+    _, err_msg = pcall(serializer.decode, '{"hello": \n"world"a}')
+    test:ok(string.find(err_msg, 'line 2 at character 8') ~= nil,
+            'mistake on second line')
+    _, err_msg = pcall(serializer.decode, '\n\n\n\n{"hello": "world"a}')
+    test:ok(string.find(err_msg, 'line 5 at character 18') ~= nil,
+            'mistake on fifth line')
+    serializer.cfg{decode_max_depth = 1}
+    _, err_msg = pcall(serializer.decode,
+                       '{"hello": {"world": {"hello": "world"}}}')
+    test:ok(string.find(err_msg, 'line 1 at character 11') ~= nil,
+            'mistake on first line')
+    _, err_msg = pcall(serializer.decode,
+                       '{"hello": \n{"world": {"hello": "world"}}}')
+    test:ok(string.find(err_msg, 'line 2 at character 1') ~= nil,
+            'mistake on second line')
 end)
diff --git a/third_party/lua-cjson/lua_cjson.c b/third_party/lua-cjson/lua_cjson.c
index 6b03e391a..0dee2deb9 100644
--- a/third_party/lua-cjson/lua_cjson.c
+++ b/third_party/lua-cjson/lua_cjson.c
@@ -66,6 +66,7 @@ typedef enum {
     T_COMMA,
     T_END,
     T_WHITESPACE,
+    T_LINEFEED,
     T_ERROR,
     T_UNKNOWN
 } json_token_type_t;
@@ -85,6 +86,7 @@ static const char *json_token_type_name[] = {
     "T_COMMA",
     "T_END",
     "T_WHITESPACE",
+    "T_LINEFEED",
     "T_ERROR",
     "T_UNKNOWN",
     NULL
@@ -105,11 +107,12 @@ typedef struct {
     strbuf_t *tmp;    /* Temporary storage for strings */
     struct luaL_serializer *cfg;
     int current_depth;
+    int line_count;
+    const char *cur_line_ptr;
 } json_parse_t;
 
 typedef struct {
     json_token_type_t type;
-    int index;
     union {
         const char *string;
         double number;
@@ -195,7 +198,7 @@ static void json_create_tokens()
     ch2token['\0'] = T_END;
     ch2token[' '] = T_WHITESPACE;
     ch2token['\t'] = T_WHITESPACE;
-    ch2token['\n'] = T_WHITESPACE;
+    ch2token['\n'] = T_LINEFEED;
     ch2token['\r'] = T_WHITESPACE;
 
     /* Update characters that require further processing */
@@ -588,11 +591,9 @@ static int json_append_unicode_escape(json_parse_t *json)
     return 0;
 }
 
-static void json_set_token_error(json_token_t *token, json_parse_t *json,
-                                 const char *errtype)
+static void json_set_token_error(json_token_t *token, const char *errtype)
 {
     token->type = T_ERROR;
-    token->index = json->ptr - json->data;
     token->value.string = errtype;
 }
 
@@ -615,7 +616,7 @@ static void json_next_string_token(json_parse_t *json, json_token_t *token)
     while ((ch = *json->ptr) != '"') {
         if (!ch) {
             /* Premature end of the string */
-            json_set_token_error(token, json, "unexpected end of string");
+            json_set_token_error(token, "unexpected end of string");
             return;
         }
 
@@ -630,12 +631,11 @@ static void json_next_string_token(json_parse_t *json, json_token_t *token)
                 if (json_append_unicode_escape(json) == 0)
                     continue;
 
-                json_set_token_error(token, json,
-                                     "invalid unicode escape code");
+                json_set_token_error(token, "invalid unicode escape code");
                 return;
             }
             if (!ch) {
-                json_set_token_error(token, json, "invalid escape code");
+                json_set_token_error(token, "invalid escape code");
                 return;
             }
 
@@ -715,7 +715,7 @@ static void json_next_number_token(json_parse_t *json, json_token_t *token)
     }
 
     if (json->ptr == endptr)
-        json_set_token_error(token, json, "invalid number");
+        json_set_token_error(token, "invalid number");
     else
         json->ptr = endptr;     /* Skip the processed number */
 }
@@ -732,18 +732,18 @@ static void json_next_token(json_parse_t *json, json_token_t *token)
     while (1) {
         ch = (unsigned char)*(json->ptr);
         token->type = ch2token[ch];
-        if (token->type != T_WHITESPACE)
+        if (token->type == T_LINEFEED) {
+            json->line_count++;
+            json->cur_line_ptr = json->ptr + 1;
+        } else if (token->type != T_WHITESPACE) {
             break;
+        }
         json->ptr++;
     }
 
-    /* Store location of new token. Required when throwing errors
-     * for unexpected tokens (syntax errors). */
-    token->index = json->ptr - json->data;
-
     /* Don't advance the pointer for an error or the end */
     if (token->type == T_ERROR) {
-        json_set_token_error(token, json, "invalid token");
+        json_set_token_error(token, "invalid token");
         return;
     }
 
@@ -808,7 +808,7 @@ static void json_next_token(json_parse_t *json, json_token_t *token)
     }
 
     /* Token starts with t/f/n but isn't recognised above. */
-    json_set_token_error(token, json, "invalid token");
+    json_set_token_error(token, "invalid token");
 }
 
 /* This function does not return.
@@ -829,9 +829,9 @@ static void json_throw_parse_error(lua_State *l, json_parse_t *json,
     else
         found = json_token_type_name[token->type];
 
-    /* Note: token->index is 0 based, display starting from 1 */
-    luaL_error(l, "Expected %s but found %s at character %d",
-               exp, found, token->index + 1);
+    luaL_error(l, "Expected %s but found %s on line %d at character %d",
+               exp, found, json->line_count,
+               json->ptr - json->cur_line_ptr + 1);
 }
 
 static inline void json_decode_ascend(json_parse_t *json)
@@ -849,8 +849,9 @@ static void json_decode_descend(lua_State *l, json_parse_t *json, int slots)
     }
 
     strbuf_free(json->tmp);
-    luaL_error(l, "Found too many nested data structures (%d) at character %d",
-        json->current_depth, json->ptr - json->data);
+    luaL_error(l, "Found too many nested data structures (%d) on line %d at "
+               "character %d", json->current_depth, json->line_count,
+               json->ptr - json->cur_line_ptr);
 }
 
 static void json_parse_object_context(lua_State *l, json_parse_t *json)
@@ -1001,6 +1002,8 @@ static int json_decode(lua_State *l)
     json.data = luaL_checklstring(l, 1, &json_len);
     json.current_depth = 0;
     json.ptr = json.data;
+    json.line_count = 1;
+    json.cur_line_ptr = json.data;
 
     /* Detect Unicode other than UTF-8 (see RFC 4627, Sec 3)
      *
-- 
2.20.1 (Apple Git-117)