[patches] [AVRO 3/3] Allow to preserve extra fields in AST and fingerprint
Kirill Yukhin
kyukhin at tarantool.org
Wed Feb 21 15:22:13 MSK 2018
Hello,
There're few nits, oterwise patch is OK.
On 20 фев 11:26, AKhatskevich wrote:
> From: "AKhatskevich avkhatskevich at tarantool.org" <avkhatskevich at gmail.com>
>
> Add two options (tables of attr names) to `avro.create`:
> - preserve_in_ast: do not delete from AST
Delete what? Re-phrase it please
> - preserve_in_fingerprint: do not delete while calculating fingerprint
Ditto.
> This feature may be useful in case of creating frameworks which works
> over AVRO.
>
> Closes #31
> ---
> CMakeLists.txt | 2 +-
> avro_schema/fingerprint.lua | 29 ++++++++-----
> avro_schema/frontend.lua | 55 ++++++++++++++++---------
> avro_schema/init.lua | 57 ++++++++++++++++++++++----
> avro_schema/utils.lua | 12 ++++++
> test/api_tests.lua | 99 ++++++++++++++++++++++++++++++++++++++++++++-
> 6 files changed, 217 insertions(+), 37 deletions(-)
> create mode 100644 avro_schema/utils.lua
>
> diff --git a/CMakeLists.txt b/CMakeLists.txt
> index b7a80da..5633fab 100644
> --- a/CMakeLists.txt
> +++ b/CMakeLists.txt
> @@ -85,7 +85,7 @@ add_custom_target(postprocess_lua ALL DEPENDS
> # Install module
> install(FILES avro_schema/init.lua avro_schema/compiler.lua
> avro_schema/frontend.lua avro_schema/runtime.lua
> - avro_schema/fingerprint.lua
> + avro_schema/fingerprint.lua avro_schema/utils.lua
> DESTINATION ${TARANTOOL_INSTALL_LUADIR}/avro_schema)
>
> install(FILES ${CMAKE_BINARY_DIR}/il.lua
> diff --git a/avro_schema/fingerprint.lua b/avro_schema/fingerprint.lua
> index 0391835..4c8ab4e 100644
> --- a/avro_schema/fingerprint.lua
> +++ b/avro_schema/fingerprint.lua
> @@ -20,21 +20,29 @@ local function is_primitive_type(xtype)
> return false
> end
>
> -local function avro_json_array(data)
> +local function avro_json_array(data, extra_fields)
> local res = {}
> for _,item in ipairs(data) do
> - table.insert(res,avro_json(item))
> + table.insert(res,avro_json(item, extra_fields))
> end
> return string.format("[%s]", table.concat(res, ","))
> end
>
> -local function avro_json_object(data)
> +local function avro_json_object(data, extra_fields)
> local res = {}
> local necessary_order = {"name", "type", "fields", "symbols", "items", "values", "size"}
> + --
> + -- There are a cases in which it is necessary to extend a schema.
> + -- The source below provides method to add those attrs in sustainable way.
Please, make line length less than 80 chars.
Also, please, make cure that new testcases follow this rule.
> + --
> + for _, val in ipairs(extra_fields) do
> + table.insert(necessary_order, val)
> + end
> +
> for _,name in ipairs(necessary_order) do
> local item = data[name]
> if item ~= nil then
> - local inner = avro_json(item)
> + local inner = avro_json(item, extra_fields)
> inner = string.format([[%s:%s]], json.encode(name), inner)
> table.insert(res, inner)
> end
> @@ -44,7 +52,10 @@ end
>
> -- Takes normalized avro schema and produces normalized schema representation
> -- encoded in json format.
> -avro_json = function (data)
> +avro_json = function (data, extra_fields)
> + extra_fields = extra_fields or {}
> + -- should be sorted for consistency
> + table.sort(extra_fields)
> local xtype = type(data)
> if is_primitive_type(xtype) then
> return json.encode(data)
> @@ -54,17 +65,17 @@ avro_json = function (data)
> end
> -- array
> if #data > 0 then
> - return avro_json_array(data)
> + return avro_json_array(data, extra_fields)
> end
> -- object (dict)
> - return avro_json_object(data)
> + return avro_json_object(data, extra_fields)
> end
>
> -local function get_fingerprint(schema, algo, size)
> +local function get_fingerprint(schema, algo, size, options)
> if digest[algo] == nil or type(digest[algo]) ~= "function" then
> raise_error("The hash function %s is not supported", algo)
> end
> - local fp = digest[algo](avro_json(schema))
> + local fp = digest[algo](avro_json(schema, options.preserve_in_fingerprint))
> return fp:sub(1, size)
> end
>
> diff --git a/avro_schema/frontend.lua b/avro_schema/frontend.lua
> index 646b448..2a6cc34 100644
> --- a/avro_schema/frontend.lua
> +++ b/avro_schema/frontend.lua
> @@ -67,7 +67,7 @@ local floor = math.floor
> local clear = require('table.clear')
> local next, type = next, type
>
> -function deepcopy(orig)
> +local function deepcopy(orig)
> local orig_type = type(orig)
> local copy
> if orig_type == 'table' then
> @@ -131,6 +131,7 @@ local function type_tag(t)
> return (type(t) == 'string' and t) or t.name or t.type
> end
>
> +local copy_schema
> local copy_schema_error
> local copy_schema_location_info
>
> @@ -200,13 +201,22 @@ local dcache = setmetatable({}, { __mode = 'k' })
>
> local copy_field_default
>
> +local function copy_fields(from, to, fields)
> + for _,field in ipairs(fields) do
> + if from[field] ~= nil then
> + to[field] = deepcopy(from[field])
> + end
> + end
> +end
> -- create a private copy and sanitize recursively;
> -- [ns] current ns (or nil)
> -- [scope] a dictionary of named types (ocasionally used for unnamed too)
> -- [open_rec] a set consisting of the current record + parent records;
> -- it is used to reject records containing themselves
> -copy_schema = function(schema, ns, scope, open_rec)
> - local res, ptr -- we depend on these being locals #5 and #6
> +-- [options] options table, contains:
> +-- - preserve_in_ast: names of attrs which should not be deleted
> +copy_schema = function(schema, ns, scope, open_rec, options)
> + local res, ptr -- we depend on these being locals #6 and #7
> if type(schema) == 'table' then
> if scope[schema] then
> -- this check is necessary for unnamed complex types (union, array map)
> @@ -219,7 +229,7 @@ copy_schema = function(schema, ns, scope, open_rec)
> res = {}
> for branchno, xbranch in ipairs(schema) do
> ptr = branchno
> - local branch = copy_schema(xbranch, ns, scope)
> + local branch = copy_schema(xbranch, ns, scope, nil, options)
> local bxtype, bxname
> if type(branch) == 'table' and not branch.type then
> copy_schema_error('Union may not immediately contain other unions')
> @@ -248,13 +258,21 @@ copy_schema = function(schema, ns, scope, open_rec)
> nullable, xtype = extract_nullable(xtype)
>
> if primitive_type[xtype] then
> + -- Preserve fields which are asked to be in ast.
ast->AST.
> + res = {}
> + copy_fields(schema, res, options.preserve_in_ast)
> -- primitive type normalization
> - if nullable == nil then
> + if nullable == nil and not next(res) then
> return xtype
> end
> - return {type = xtype, nullable = nullable}
> + res.type = xtype
> + res.nullable = nullable
> + return res
> elseif xtype == 'record' then
> - res = { type = 'record' }
> + -- Preserve fields which are asked to be in ast.
> + res = {}
> + copy_fields(schema, res, options.preserve_in_ast)
> + res.type = 'record'
> res.nullable = nullable
> local name, ns = checkname(schema, ns, scope)
> scope[name] = res
> @@ -298,19 +316,19 @@ copy_schema = function(schema, ns, scope, open_rec)
> if not xtype then
> copy_schema_error('Record field must have a "type"')
> end
> - field.type = copy_schema(xtype, ns, scope, open_rec)
> + field.type = copy_schema(xtype, ns, scope, open_rec, options)
> if open_rec[field.type] then
> local path, n = {}
> for i = 1, 1000000 do
> - local _, res = debug.getlocal(i, 5)
> + local _, res = debug.getlocal(i, 6)
> if res == field.type then
> n = i
> break
> end
> end
> for i = n, 1, -1 do
> - local _, res = debug.getlocal(i, 5)
> - local _, ptr = debug.getlocal(i, 6)
> + local _, res = debug.getlocal(i, 6)
> + local _, ptr = debug.getlocal(i, 7)
> insert(path, res.fields[ptr].name)
> end
> error(format('Record %s contains itself via %s',
> @@ -389,7 +407,7 @@ copy_schema = function(schema, ns, scope, open_rec)
> if not xitems then
> copy_schema_error('Array type must have "items"')
> end
> - res.items = copy_schema(xitems, ns, scope)
> + res.items = copy_schema(xitems, ns, scope, nil, options)
> scope[schema] = nil
> return res
> elseif xtype == 'map' then
> @@ -399,7 +417,7 @@ copy_schema = function(schema, ns, scope, open_rec)
> if not xvalues then
> copy_schema_error('Map type must have "values"')
> end
> - res.values = copy_schema(xvalues, ns, scope)
> + res.values = copy_schema(xvalues, ns, scope, nil, options)
> scope[schema] = nil
> return res
> elseif xtype == 'fixed' then
> @@ -478,13 +496,14 @@ copy_schema_location_info = function()
> local top, bottom = find_frames(copy_schema)
> local res = {}
> for i = bottom, top, -1 do
> - local _, node = debug.getlocal(i, 5)
> - local _, ptr = debug.getlocal(i, 6)
> + -- 6 and 7 are res and ptr vars from copy func
> + local _, node = debug.getlocal(i, 6)
> + local _, ptr = debug.getlocal(i, 7)
> if type(node) == 'table' then
> if node.type == nil then -- union
> insert(res, '<union>')
> if i <= top + 1 then
> - local _, next_node = debug.getlocal(i - 1, 6)
> + local _, next_node = debug.getlocal(i - 1, 7)
> if i == top or (i == top + 1 and
> not (next_node and next_node.name)) then
> insert(res, format('<branch-%d>', ptr))
> @@ -525,8 +544,8 @@ copy_schema_error = function(fmt, ...)
> end
>
> -- validate schema definition (creates a copy)
> -local function create_schema(schema)
> - return copy_schema(schema, nil, {})
> +local function create_schema(schema, options)
> + return copy_schema(schema, nil, {}, nil, options)
> end
>
> -- get a mapping from a (string) type tag -> union branch id
> diff --git a/avro_schema/init.lua b/avro_schema/init.lua
> index 621030d..c3989de 100644
> --- a/avro_schema/init.lua
> +++ b/avro_schema/init.lua
> @@ -5,6 +5,7 @@ local il = require('avro_schema.il')
> local backend_lua = require('avro_schema.backend')
> local rt = require('avro_schema.runtime')
> local fingerprint = require('avro_schema.fingerprint')
> +local utils = require('avro_schema.utils')
>
> local format, find, sub = string.format, string.find, string.sub
> local insert, remove, concat = table.insert, table.remove, table.concat
> @@ -22,6 +23,7 @@ local rt_universal_decode = rt.universal_decode
> local install_lua_backend = backend_lua.install
>
> -- We give away a handle but we never expose schema data.
> +-- {schema=schema, options=options}
> local schema_by_handle = setmetatable( {}, { __mode = 'k' } )
>
> local function get_schema(handle)
> @@ -29,7 +31,7 @@ local function get_schema(handle)
> if not schema then
> error(format('Not a schema: %s', handle), 0)
> end
> - return schema
> + return schema.schema
> end
>
> local function is_schema(schema_handle)
> @@ -62,7 +64,7 @@ local function get_ir(from_schema, to_schema, inverse)
> end
>
> local function schema_to_string(handle)
> - local schema = schema_by_handle[handle]
> + local schema = get_schema(handle)
> return format('Schema (%s)',
> handle[1] or (type(schema) ~= 'table' and schema) or
> schema.name or schema.type or 'union')
> @@ -119,16 +121,53 @@ augment_defaults = function(schema, visited)
> end
> end
>
> +local function create_options_validate(options)
> + options = options or {}
> + options = table.deepcopy(options)
> + if type(options) ~= 'table' then
> + return false, "Options should be a table"
> + end
> + if type(options.preserve_in_ast) ~= 'table' then
> + options.preserve_in_ast = {}
> + end
> + for _, f_ast in ipairs(options.preserve_in_ast) do
> + if type(f_ast) ~= 'string' then
> + return false, "preserve fields should be of string type"
> + end
> + end
> + if type(options.preserve_in_fingerprint) ~= 'table' then
> + options.preserve_in_fingerprint = {}
> + end
> + -- preserve_in_fingerprint should not contain fields which are not
> + -- presented in preserve_in_ast
> + for _, f_f in ipairs(options.preserve_in_fingerprint) do
> + if type(f_f) ~= 'string' then
> + return false, "preserve fields should be of string type"
> + end
> + if not utils.table_contains(options.preserve_in_ast, f_f) then
> + return false, "fingerprint should contain only fields from AST"
> + end
> + end
> + return true, options
> +end
> +
> local function create(raw_schema, options)
> - local ok, schema = pcall(f_create_schema, raw_schema)
> + local ok
> + ok, options = create_options_validate(options)
> + if ok == false then
> + return false, options
> + end
> + local schema
> + ok, schema = pcall(f_create_schema, raw_schema, options)
> if not ok then
> return false, schema
> end
> - if type(options) == 'table' and options.defaults == 'auto' then
> + if options.defaults == 'auto' then
> augment_defaults(schema, {})
> end
> local schema_handle = setmetatable({}, schema_handle_mt)
> - schema_by_handle[schema_handle] = schema
> + schema_by_handle[schema_handle] = {schema = schema,
> + options = options}
> return true, schema_handle
> end
>
> @@ -511,10 +550,12 @@ end
> local function export(schema_h)
> return export_helper(get_schema(schema_h), {})
> end
> -local function get_fingerprint(schema_h, algo, size)
> - if algo == nil then algo = "sha256" end
> +local function get_fingerprint(schema_h, hash, size)
> + if hash == nil then hash = "sha256" end
> if size == nil then size = 8 end
> - return fingerprint.get_fingerprint(get_schema(schema_h), algo, size)
> + local schema = schema_by_handle[schema_h]
> + return fingerprint.get_fingerprint(schema.schema, hash,
> + size, schema.options)
> end
> local function to_json(schema_h)
> return fingerprint.avro_json(get_schema(schema_h))
> diff --git a/avro_schema/utils.lua b/avro_schema/utils.lua
> new file mode 100644
> index 0000000..da10c25
> --- /dev/null
> +++ b/avro_schema/utils.lua
> @@ -0,0 +1,12 @@
> +local function table_contains(t, xval)
> + for k, val in ipairs(t) do
> + if type(k) == "number" and val == xval then
> + return true
> + end
> + end
> + return false
> +end
> +
> +return {
> + table_contains = table_contains
> +}
> \ No newline at end of file
> diff --git a/test/api_tests.lua b/test/api_tests.lua
> index 42fc7f4..02be147 100644
> --- a/test/api_tests.lua
> +++ b/test/api_tests.lua
> @@ -5,7 +5,7 @@ local msgpack = require('msgpack')
>
> local test = tap.test('api-tests')
>
> -test:plan(54)
> +test:plan(64)
>
> test:is_deeply({schema.create()}, {false, 'Unknown Avro type: nil'},
> 'error unknown type')
> @@ -280,5 +280,102 @@ for i, testcase in ipairs(fingerprint_testcases) do
> test:is(string.lower(string.tohex(fingerprint)), testcase.fingerprint, "Fingerprint testcase "..i)
> end
>
> +local schema_preserve_fields_testcases = {
> + {
> + name = "1",
> + schema = {
> + type="int",
> + extra_field="extra_field"
> + },
> + options = {},
> + ast = "int"
> + },
> + {
> + name = "2",
> + schema = {
> + type="int",
> + extra_field="extra_field"
> + },
> + options = {preserve_in_ast={"extra_field"}},
> + ast = {
> + type="int",
> + extra_field="extra_field"
> + }
> + },
> + {
> + name = "3-complex",
> + schema = {
> + type="int",
> + extra_field={extra_field={"extra_field"}}
> + },
> + options = {preserve_in_ast={"extra_field"}},
> + ast = {
> + type="int",
> + extra_field={extra_field={"extra_field"}}
> + }
> + }
> +}
> +
> +for _, testcase in ipairs(schema_preserve_fields_testcases) do
> + res = {schema.create(testcase.schema, testcase.options)}
> + test:is_deeply(schema.export(res[2]), testcase.ast, 'schema extra fields '..testcase.name)
> +end
> +
> +test:is_deeply(
> + {schema.create("int", {
> + preserve_in_ast={},
> + preserve_in_fingerprint={"extra_field"},
> + })},
> + {false, "fingerprint should contain only fields from AST"},
> + 'preserve_in_fingerprint contains more fields than AST')
> +
> +local fingerprint
> +res = {schema.create(
> + {
> + type = "record",
> + name = "test",
> + extra_field = "extra_field",
> + fields = {
> + { name = "bar", type = "null", default = msgpack.NULL, extra_field = "extra" },
> + { name = "foo", type = {"null", "int"}, default = msgpack.NULL },
> + }
> + }, nil)}
> +fingerprint = schema.fingerprint(res[2], "sha256", 32)
> +test:is(string.lower(string.tohex(fingerprint)),
> + "a64098ee437e9020923c6005db88f37a234ed60daae23b26e33d8ae1bf643356",
> + "Fingerprint extra fields 1")
> +
> +res = {schema.create(
> + {
> + type = "record",
> + name = "test",
> + extra_field = "extra_field",
> + fields = {
> + { name = "bar", type = "null", default = msgpack.NULL, extra_field = "extra" },
> + { name = "foo", type = {"null", "int"}, default = msgpack.NULL },
> + }
> + }, {preserve_in_ast={"extra_field"}, preserve_in_fingerprint={"extra_field"}})}
> +fingerprint = schema.fingerprint(res[2], "sha256", 32)
> +test:is(string.lower(string.tohex(fingerprint)),
> + "70bd295335daafff0a4512cadc39a4298cd81c460defec530c7372bdd1ec6f44",
> + "Fingerprint extra fields 2")
> +
> +res = {schema.create(
> + {
> + type = "int",
> + extra_field = "extra_field",
> + }, {preserve_in_ast={"extra_field"}})}
> +fingerprint = schema.fingerprint(res[2], "sha256", 32)
> +test:is_deeply(schema.export(res[2]), {type = "int", extra_field = "extra_field"},
> + "Prevent primitive type collapse by extra field")
> +
> +-- avro_json is used for fingerprint
> +fingerprint = require("avro_schema.fingerprint")
> +test:is(fingerprint.avro_json({field1="1"}), "{}", "avro_json 1")
> +test:is(fingerprint.avro_json({field1="1"}, {"field1"}), '{"field1":"1"}', "avro_json 2")
> +test:is(fingerprint.avro_json({field2="1", field1="1"}, {"field2", "field1"}),
> + '{"field1":"1","field2":"1"}', "avro_json 3 order")
> +
> +
> test:check()
> os.exit(test.planned == test.total and test.failed == 0 and 0 or -1)
> --
> 2.14.1
>
More information about the Tarantool-patches
mailing list