[patches] [V2 AVRO 2/3] Add fingerprint function

AKhatskevich avkhatskevich at tarantool.org
Thu Feb 22 14:20:33 MSK 2018


From: "AKhatskevich avkhatskevich at tarantool.org" <avkhatskevich at gmail.com>

Fingerprinting helps to control schema version.

Fingerprint function is compatible with any hash from Tarantool's digest
hashing library.

This function produces the same value as the Apache implementations in
simple cases.
Known cases in which the fingerprint differs from the Apache one:
  - type references (this Avro implementation copies the type into each
    place that uses it)
  - nullable types (this Avro implementation handles nullable type syntax
    and behavior a little differently from the Apache one)

The fingerprint is stable: different representations of the same schema
produce the same fingerprint.

Closes #30
---
 CMakeLists.txt              |  1 +
 avro_schema/fingerprint.lua | 76 +++++++++++++++++++++++++++++++++++++++
 avro_schema/init.lua        | 15 ++++++--
 test/api_tests.lua          | 86 ++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 174 insertions(+), 4 deletions(-)
 create mode 100644 avro_schema/fingerprint.lua

diff --git a/CMakeLists.txt b/CMakeLists.txt
index acd35d0..b7a80da 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -85,6 +85,7 @@ add_custom_target(postprocess_lua ALL DEPENDS
 # Install module
 install(FILES avro_schema/init.lua avro_schema/compiler.lua
               avro_schema/frontend.lua avro_schema/runtime.lua
+              avro_schema/fingerprint.lua
         DESTINATION ${TARANTOOL_INSTALL_LUADIR}/avro_schema)
 
 install(FILES ${CMAKE_BINARY_DIR}/il.lua
diff --git a/avro_schema/fingerprint.lua b/avro_schema/fingerprint.lua
new file mode 100644
index 0000000..caa6d22
--- /dev/null
+++ b/avro_schema/fingerprint.lua
@@ -0,0 +1,76 @@
+-- This module implements the fingerprinting mechanism for Avro schemas.
+-- A custom JSON encoder is needed here because of the special rules for
+-- Avro fingerprint generation and Parsing Canonical Form generation.
+
+local json = require "json"
+-- Tarantool specific module
+local digest = require "digest"
+
+local avro_json
+
+local function raise_error(message, ...)
+    error(string.format("avro-fingerprint: "..message, ...))
+end
+
+local function is_primitive_type(xtype)
+    local ptypes = {"string", "number", "boolean"}
+    for _, t in ipairs(ptypes) do
+        if xtype == t then return true end
+    end
+    return false
+end
+
+local function avro_json_array(data)
+    local res = {}
+    for _, item in ipairs(data) do
+        table.insert(res,avro_json(item))
+    end
+    return string.format("[%s]", table.concat(res, ","))
+end
+
+local function avro_json_object(data)
+    local res = {}
+    local necessary_order = {
+        "name", "type", "fields", "symbols", "items", "values", "size"
+    }
+    for _, name in ipairs(necessary_order) do
+        local item = data[name]
+        if item ~= nil then
+            local inner = avro_json(item)
+            inner = string.format([[%s:%s]], json.encode(name), inner)
+            table.insert(res, inner)
+        end
+    end
+    return string.format("{%s}", table.concat(res, ","))
+end
+
+-- Takes a normalized Avro schema and produces its normalized representation
+-- encoded in JSON format.
+avro_json = function (data)
+    local xtype = type(data)
+    if is_primitive_type(xtype) then
+        return json.encode(data)
+    end
+    if xtype ~= "table" then
+        raise_error("data type is not supported: %s", xtype)
+    end
+    -- array
+    if #data > 0 then
+        return avro_json_array(data)
+    end
+    -- object (dict)
+    return avro_json_object(data)
+end
+
+local function get_fingerprint(schema, algo, size)
+    if digest[algo] == nil or type(digest[algo]) ~= "function" then
+        raise_error("The hash function %s is not supported", algo)
+    end
+    local fp = digest[algo](avro_json(schema))
+    return fp:sub(1, size)
+end
+
+return {
+    avro_json = avro_json,
+    get_fingerprint = get_fingerprint,
+}
diff --git a/avro_schema/init.lua b/avro_schema/init.lua
index efd361d..621030d 100644
--- a/avro_schema/init.lua
+++ b/avro_schema/init.lua
@@ -4,6 +4,7 @@ local c           = require('avro_schema.compiler')
 local il          = require('avro_schema.il')
 local backend_lua = require('avro_schema.backend')
 local rt          = require('avro_schema.runtime')
+local fingerprint = require('avro_schema.fingerprint')
 
 local format, find, sub = string.format, string.find, string.sub
 local insert, remove, concat = table.insert, table.remove, table.concat
@@ -510,7 +511,14 @@ end
 local function export(schema_h)
     return export_helper(get_schema(schema_h), {})
 end
-
+local function get_fingerprint(schema_h, algo, size)
+    if algo == nil then algo = "sha256" end
+    if size == nil then size = 8 end
+    return fingerprint.get_fingerprint(get_schema(schema_h), algo, size)
+end
+local function to_json(schema_h)
+    return fingerprint.avro_json(get_schema(schema_h))
+end
 return {
     are_compatible = are_compatible,
     create         = create,
@@ -519,5 +527,6 @@ return {
     get_types      = get_types,
     is             = is_schema,
     validate       = validate,
-    export         = export
-}
+    export         = export,
+    fingerprint    = get_fingerprint,
+}
\ No newline at end of file
diff --git a/test/api_tests.lua b/test/api_tests.lua
index 8efd005..404768b 100644
--- a/test/api_tests.lua
+++ b/test/api_tests.lua
@@ -5,7 +5,7 @@ local msgpack = require('msgpack')
 
 local test = tap.test('api-tests')
 
-test:plan(50)
+test:plan(54)
 
 test:is_deeply({schema.create()}, {false, 'Unknown Avro type: nil'},
                'error unknown type')
@@ -211,5 +211,89 @@ for _, type in ipairs({
     test:is_deeply(schema.export(res[2]), type, 'schema normalization '..type)
 end
 
+-- fingerprint tests
+local fingerprint_testcases = {
+    {
+        schema = [[
+            {
+              "name": "Pet",
+              "type": "record",
+              "fields": [
+                {"name": "kind", "type": {"name": "Kind", "type": "enum", 
+                    "symbols": ["CAT", "DOG"]}},
+                {"name": "name", "type": "string"}
+              ]
+            }
+        ]],
+        fingerprint =
+        "42620f01b34833f1e70cf2a9567fc4d3b9cf8b74afba64af0e9dce9a148b1e90"
+    },
+    {
+        schema = [[{"type": "fixed", "name": "Id", "size": 4}]],
+        fingerprint =
+        "ecd9e5c6039fe40543f95176d664e1b9b56dddf1e8b1e3a6d87a6402b12e305d"
+    },
+    {
+        schema = [[
+            {
+              "type": "record",
+              "name": "HandshakeResponse", "namespace": "org.apache.avro.ipc",
+              "fields": [
+                {"name": "match",
+                 "type": {"type": "enum", "name": "HandshakeMatch",
+                          "symbols": ["BOTH", "CLIENT", "NONE"]}},
+                {"name": "serverProtocol",
+                 "type": ["null", "string"]},
+                {"name": "serverHash",
+                 "type": ["null", {"type": "fixed", "name": "MD5",
+                    "size": 16}]},
+                {"name": "meta",
+                 "type": ["null", {"type": "map", "values": "bytes"}]}
+              ]
+            }
+        ]],
+        fingerprint =
+        "a303cbbfe13958f880605d70c521a4b7be34d9265ac5a848f25916a67b11d889"
+    },
+-- In case of type reuse, the type should not be copied; only the type name
+-- should appear.
+-- Example: `{"name": "serverHash", "type": "MD5"}` becomes
+-- `{"name":"serverHash","type":{"name":"org.apache.avro.ipc.MD5",
+--      "type":"fixed","size":16}}`
+-- The correct fingerprint is
+-- "2b2f7a9b22991fe0df9134cb6b5ff7355343e797aaea337e0150e20f3a35800e"
+    {
+        schema = [[
+            {
+              "type": "record",
+              "name": "HandshakeRequest", "namespace":"org.apache.avro.ipc",
+              "fields": [
+                {"name": "clientHash",
+                 "type": {"type": "fixed", "name": "MD5", "size": 16}},
+                {"name": "clientProtocol", "type": ["null", "string"]},
+                {"name": "meta", "type": ["null", {"type": "map", 
+                    "values": "bytes"}]}
+              ]
+            }
+        ]],
+        fingerprint =
+        "ef17a5460289684db839c86a0c2cdcfe69da9dd0a3047e6a91f6d6bc37f76314"
+
+    },
+}
+
+function string.tohex(str)
+    return (str:gsub('.', function (c)
+        return string.format('%02X', string.byte(c))
+    end))
+end
+
+for i, testcase in ipairs(fingerprint_testcases) do
+    local _, schema_handler = schema.create(json.decode(testcase.schema))
+    local fingerprint = schema.fingerprint(schema_handler, "sha256", 32)
+    test:is(string.lower(string.tohex(fingerprint)), testcase.fingerprint,
+            "Fingerprint testcase ".. i)
+end
+
 test:check()
 os.exit(test.planned == test.total and test.failed == 0 and 0 or -1)
-- 
2.14.1




More information about the Tarantool-patches mailing list