[patches] [AVRO 2/3] Add fingerprint function

AKhatskevich avkhatskevich at tarantool.org
Tue Feb 20 11:26:32 MSK 2018


From: "AKhatskevich avkhatskevich at tarantool.org" <avkhatskevich at gmail.com>

The function helps to control schema version.

The fingerprint function is compatible with any hash from Tarantool's digest
hashing library.

This function produces the same value as the Apache implementations in simple
cases, but may sometimes differ (e.g. in cases with type references, which
this implementation expands into a copy of the referenced type).

The fingerprint is stable: it is the same for different representations
of the same schema.

Closes #30
---
 CMakeLists.txt              |  1 +
 avro_schema/fingerprint.lua | 74 +++++++++++++++++++++++++++++++++++++++++++++
 avro_schema/init.lua        | 15 +++++++--
 test/api_tests.lua          | 74 ++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 160 insertions(+), 4 deletions(-)
 create mode 100644 avro_schema/fingerprint.lua

diff --git a/CMakeLists.txt b/CMakeLists.txt
index acd35d0..b7a80da 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -85,6 +85,7 @@ add_custom_target(postprocess_lua ALL DEPENDS
 # Install module
 install(FILES avro_schema/init.lua avro_schema/compiler.lua
               avro_schema/frontend.lua avro_schema/runtime.lua
+              avro_schema/fingerprint.lua
         DESTINATION ${TARANTOOL_INSTALL_LUADIR}/avro_schema)
 
 install(FILES ${CMAKE_BINARY_DIR}/il.lua
diff --git a/avro_schema/fingerprint.lua b/avro_schema/fingerprint.lua
new file mode 100644
index 0000000..0391835
--- /dev/null
+++ b/avro_schema/fingerprint.lua
@@ -0,0 +1,74 @@
+-- This file implements the fingerprinting mechanism for Avro schemas.
+-- It was necessary to implement our own JSON encoder because of some special
+-- rules for Avro fingerprint generation and Parsing Canonical Form generation.
+
+local json = require "json"
+-- Tarantool specific module
+local digest = require "digest"
+
+local avro_json
+
+local function raise_error(message, ...) -- message: string.format template for the varargs
+    error(("avro-fingerprint: " .. message):format(...))
+end
+
+-- Tell whether `xtype` (a name as returned by type()) is one of the Lua
+-- scalar types: "string", "number" or "boolean".
+-- Returns true for those three names and false for anything else.
+local function is_primitive_type(xtype)
+    return xtype == "string" or xtype == "number"
+        or xtype == "boolean"
+end
+
+-- Encode the sequence part of `data` (as walked by ipairs) as a JSON array;
+-- each element is encoded recursively with avro_json.
+local function avro_json_array(data)
+    local parts = {}
+    for idx, element in ipairs(data) do parts[idx] = avro_json(element) end
+    return ("[%s]"):format(table.concat(parts, ","))
+end
+
+-- Emit only the keys in key_order, in that order; keys holding nil are skipped.
+local function avro_json_object(data)
+    local key_order = {"name", "type", "fields", "symbols", "items", "values", "size"}
+    local members = {}
+    for _, key in ipairs(key_order) do
+        local value = data[key]
+        if value ~= nil then
+            local encoded = avro_json(value)
+            members[#members + 1] = json.encode(key) .. ":" .. encoded
+        end
+    end
+    return ("{%s}"):format(table.concat(members, ","))
+end
+
+-- Takes a normalized Avro schema and returns its normalized JSON encoding;
+-- raises an avro-fingerprint error for unsupported Lua types.
+avro_json = function (data)
+    local xtype = type(data)
+    if is_primitive_type(xtype) then
+        return json.encode(data)
+    end
+    if xtype ~= "table" then
+        raise_error("data type is not supported: %s", xtype)
+    end
+    -- An empty table is an empty array (e.g. "fields": []), not an object:
+    -- the object encoder would emit "{}", which lacks the required "type" key.
+    if #data > 0 or next(data) == nil then
+        return avro_json_array(data)
+    end
+    return avro_json_object(data) -- object (dict)
+end
+
+local function get_fingerprint(schema, algo, size) -- first `size` chars of the digest
+    local hash_fn = digest[algo]
+    if type(hash_fn) ~= "function" then
+        raise_error("The hash function %s is not supported", algo)
+    end
+    return hash_fn(avro_json(schema)):sub(1, size)
+end
+
+return { -- public interface of the fingerprint module
+    avro_json = avro_json, -- schema -> normalized JSON string
+    get_fingerprint = get_fingerprint, -- schema, algo, size -> truncated digest string
+}
diff --git a/avro_schema/init.lua b/avro_schema/init.lua
index efd361d..621030d 100644
--- a/avro_schema/init.lua
+++ b/avro_schema/init.lua
@@ -4,6 +4,7 @@ local c           = require('avro_schema.compiler')
 local il          = require('avro_schema.il')
 local backend_lua = require('avro_schema.backend')
 local rt          = require('avro_schema.runtime')
+local fingerprint = require('avro_schema.fingerprint')
 
 local format, find, sub = string.format, string.find, string.sub
 local insert, remove, concat = table.insert, table.remove, table.concat
@@ -510,7 +511,14 @@ end
 local function export(schema_h)
     return export_helper(get_schema(schema_h), {})
 end
-
+local function get_fingerprint(schema_h, algo, size) -- nil defaults: "sha256", 8
+    local hash_name = algo == nil and "sha256" or algo
+    local length = size == nil and 8 or size
+    return fingerprint.get_fingerprint(get_schema(schema_h), hash_name, length)
+end
+local function to_json(schema_h) -- normalized JSON text of the schema behind schema_h
+    return fingerprint.avro_json(get_schema(schema_h))
+end -- NOTE(review): not among the exports visible in this patch; confirm intent
 return {
     are_compatible = are_compatible,
     create         = create,
@@ -519,5 +527,6 @@ return {
     get_types      = get_types,
     is             = is_schema,
     validate       = validate,
-    export         = export
-}
+    export         = export,
+    fingerprint    = get_fingerprint,
+}
\ No newline at end of file
diff --git a/test/api_tests.lua b/test/api_tests.lua
index b1628bb..42fc7f4 100644
--- a/test/api_tests.lua
+++ b/test/api_tests.lua
@@ -5,7 +5,7 @@ local msgpack = require('msgpack')
 
 local test = tap.test('api-tests')
 
-test:plan(50)
+test:plan(54)
 
 test:is_deeply({schema.create()}, {false, 'Unknown Avro type: nil'},
                'error unknown type')
@@ -208,5 +208,77 @@ for _, type in ipairs({"int", "string", "null", "boolean", "long", "float", "dou
     test:is_deeply(schema.export(res[2]), type, 'schema normalization '..type)
 end
 
+-- Fingerprint tests: each case pairs a JSON schema with its expected hex fingerprint.
+local fingerprint_testcases = {
+    {
+        schema = [[
+            {
+              "name": "Pet",
+              "type": "record",
+              "fields": [
+                {"name": "kind", "type": {"name": "Kind", "type": "enum", "symbols": ["CAT", "DOG"]}},
+                {"name": "name", "type": "string"}
+              ]
+            }
+        ]],
+        fingerprint = "42620f01b34833f1e70cf2a9567fc4d3b9cf8b74afba64af0e9dce9a148b1e90"
+    },
+    {
+        schema = [[{"type": "fixed", "name": "Id", "size": 4}]],
+        fingerprint = "ecd9e5c6039fe40543f95176d664e1b9b56dddf1e8b1e3a6d87a6402b12e305d"
+    },
+    {
+        schema = [[
+            {
+              "type": "record",
+              "name": "HandshakeResponse", "namespace": "org.apache.avro.ipc",
+              "fields": [
+                {"name": "match",
+                 "type": {"type": "enum", "name": "HandshakeMatch",
+                          "symbols": ["BOTH", "CLIENT", "NONE"]}},
+                {"name": "serverProtocol",
+                 "type": ["null", "string"]},
+                {"name": "serverHash",
+                 "type": ["null", {"type": "fixed", "name": "MD5", "size": 16}]},
+                {"name": "meta",
+                 "type": ["null", {"type": "map", "values": "bytes"}]}
+              ]
+            }
+        ]],
+        fingerprint = "a303cbbfe13958f880605d70c521a4b7be34d9265ac5a848f25916a67b11d889"
+    },
+    -- Known limitation: a reused type should not be copied; it should appear by name only. Example:
+    -- {"name": "serverHash", "type": "MD5"} becomes {"name":"serverHash","type":{"name":"org.apache.avro.ipc.MD5","type":"fixed","size":16}}
+    -- The correct fingerprint in that case is "2b2f7a9b22991fe0df9134cb6b5ff7355343e797aaea337e0150e20f3a35800e"
+    {
+        schema = [[
+            {
+              "type": "record",
+              "name": "HandshakeRequest", "namespace":"org.apache.avro.ipc",
+              "fields": [
+                {"name": "clientHash",
+                 "type": {"type": "fixed", "name": "MD5", "size": 16}},
+                {"name": "clientProtocol", "type": ["null", "string"]},
+                {"name": "meta", "type": ["null", {"type": "map", "values": "bytes"}]}
+              ]
+            }
+        ]],
+        fingerprint = "ef17a5460289684db839c86a0c2cdcfe69da9dd0a3047e6a91f6d6bc37f76314"
+
+    },
+}
+
+-- Lowercase hex of a binary string; local, so `string` is not monkey-patched.
+local function tohex(str)
+    return (str:gsub('.', function (c) return ('%02x'):format(c:byte()) end))
+end
+
+for i, testcase in ipairs(fingerprint_testcases) do
+    local ok, handle = schema.create(json.decode(testcase.schema))
+    -- If create() failed, `handle` is its error message: report it via test:is.
+    local actual = ok and tohex(schema.fingerprint(handle, "sha256", 32)) or handle
+    test:is(actual, testcase.fingerprint, "Fingerprint testcase "..i)
+end
+
 test:check()
 os.exit(test.planned == test.total and test.failed == 0 and 0 or -1)
-- 
2.14.1




More information about the Tarantool-patches mailing list