[tarantool-patches] Re: [PATCH v2 2/2] buffer: replace all ffi.new(type[1]) with cached union

Mon May 20 17:59:17 MSK 2019

Kostja asked to replace buffer.scalar_array and buffer.scalar
with buffer.reg1 and buffer.reg2. I did:

diff --git a/src/lua/buffer.lua b/src/lua/buffer.lua
index 7fd9f46ed..7467cb938 100644
--- a/src/lua/buffer.lua
+++ b/src/lua/buffer.lua
@@ -38,11 +38,11 @@ void *
 lua_static_aligned_alloc(size_t size, size_t alignment);
 
 /**
- * Scalar is a buffer to use with FFI functions, which usually
+ * Register is a buffer to use with FFI functions, which usually
  * operate with pointers to scalar values like int, char, size_t,
  * void *. To avoid doing 'ffi.new(<type>[1])' on each such FFI
  * function invocation, a module can use one of attributes of the
- * scalar union.
+ * register union.
  *
  * Naming policy of the attributes is easy to remember:
  * 'a' for array type + type name first letters + 'p' for pointer.
@@ -52,7 +52,7 @@ lua_static_aligned_alloc(size_t size, size_t alignment);
  * - const unsigned char *[1] -
  *       <a>rray of <c>onst <u>nsigned <c>har <p> pointer - acucp.
  */
-union scalar {
+union c_register {
     size_t as[1];
     void *ap[1];
     int ai[1];
@@ -225,18 +225,20 @@ local function static_alloc(type, size)
 end
 
 --
--- Sometimes it is wanted to use several temporary scalar cdata
--- values. Then one union object is not enough - its attributes
--- share memory.
+-- Sometimes it is wanted to use several temporary registers at
+-- once. For example, when a C function takes 2 C pointers. Then
+-- one register is not enough - its attributes share memory. Note,
+-- registers are not allocated with separate ffi.new calls
+-- deliberately. With a single allocation they fit into 1 cache
+-- line, and reduce the heap fragmentation.
 --
-local scalar_array = ffi.new('union scalar[?]', 2)
+local reg_array = ffi.new('union c_register[?]', 2)
 
 return {
     ibuf = ibuf_new;
     IBUF_SHARED = ffi.C.tarantool_lua_ibuf;
     READAHEAD = READAHEAD;
     static_alloc = static_alloc,
-    scalar_array = scalar_array,
-    -- Fast access when only one variable is needed.
-    scalar = scalar_array[0],
+    reg1 = reg_array[0],
+    reg2 = reg_array[1]
 }
diff --git a/src/lua/crypto.lua b/src/lua/crypto.lua
index 32107c0ce..d23ba8e3a 100644
--- a/src/lua/crypto.lua
+++ b/src/lua/crypto.lua
@@ -2,7 +2,7 @@
 
 local ffi = require('ffi')
 local buffer = require('buffer')
-local scalar = buffer.scalar
+local reg = buffer.reg1
 
 ffi.cdef[[
     /* from openssl/err.h */
@@ -132,10 +132,10 @@ local function digest_final(self)
         return error('Digest not initialized')
     end
     self.initialized = false
-    if ffi.C.EVP_DigestFinal_ex(self.ctx, self.buf.wpos, scalar.ai) ~= 1 then
+    if ffi.C.EVP_DigestFinal_ex(self.ctx, self.buf.wpos, reg.ai) ~= 1 then
         return error('Can\'t finalize digest: ' .. openssl_err_str())
     end
-    return ffi.string(self.buf.wpos, scalar.ai[0])
+    return ffi.string(self.buf.wpos, reg.ai[0])
 end
 
 local function digest_free(self)
@@ -205,10 +205,10 @@ local function hmac_final(self)
     end
     self.initialized = false
     local buf = buffer.static_alloc('char', 64)
-    if ffi.C.HMAC_Final(self.ctx, buf, scalar.ai) ~= 1 then
+    if ffi.C.HMAC_Final(self.ctx, buf, reg.ai) ~= 1 then
         return error('Can\'t finalize HMAC: ' .. openssl_err_str())
     end
-    return ffi.string(buf, scalar.ai[0])
+    return ffi.string(buf, reg.ai[0])
 end
 
 local function hmac_free(self)
diff --git a/src/lua/msgpackffi.lua b/src/lua/msgpackffi.lua
index 1e4786707..bfeedbc4b 100644
--- a/src/lua/msgpackffi.lua
+++ b/src/lua/msgpackffi.lua
@@ -24,7 +24,7 @@ mp_decode_double(const char **data);
 ]])
 
 local strict_alignment = (jit.arch == 'arm')
-local scalar = buffer.scalar
+local reg = buffer.reg1
 
 local function bswap_u16(num)
     return bit.rshift(bit.bswap(tonumber(num)), 16)
@@ -61,10 +61,10 @@ end
 local encode_u16
 if strict_alignment then
     encode_u16 = function(buf, code, num)
-        scalar.u16 = bswap_u16(num)
+        reg.u16 = bswap_u16(num)
         local p = buf:alloc(3)
         p[0] = code
-        ffi.copy(p + 1, scalar, 2)
+        ffi.copy(p + 1, reg, 2)
     end
 else
     encode_u16 = function(buf, code, num)
@@ -77,10 +77,10 @@ end
 local encode_u32
 if strict_alignment then
     encode_u32 = function(buf, code, num)
-        scalar.u32 = ffi.cast('uint32_t', bit.bswap(tonumber(num)))
+        reg.u32 = ffi.cast('uint32_t', bit.bswap(tonumber(num)))
         local p = buf:alloc(5)
         p[0] = code
-        ffi.copy(p + 1, scalar, 4)
+        ffi.copy(p + 1, reg, 4)
     end
 else
     encode_u32 = function(buf, code, num)
@@ -94,10 +94,10 @@ end
 local encode_u64
 if strict_alignment then
     encode_u64 = function(buf, code, num)
-        scalar.u64 = bit.bswap(ffi.cast('uint64_t', num))
+        reg.u64 = bit.bswap(ffi.cast('uint64_t', num))
         local p = buf:alloc(9)
         p[0] = code
-        ffi.copy(p + 1, scalar, 8)
+        ffi.copy(p + 1, reg, 8)
     end
 else
     encode_u64 = function(buf, code, num)
@@ -314,9 +314,9 @@ end
 local decode_u16
 if strict_alignment then
     decode_u16 = function(data)
-        ffi.copy(scalar, data[0], 2)
+        ffi.copy(reg, data[0], 2)
         data[0] = data[0] + 2
-        return tonumber(bswap_u16(scalar.u16))
+        return tonumber(bswap_u16(reg.u16))
     end
 else
     decode_u16 = function(data)
@@ -329,10 +329,10 @@ end
 local decode_u32
 if strict_alignment then
     decode_u32 = function(data)
-        ffi.copy(scalar, data[0], 4)
+        ffi.copy(reg, data[0], 4)
         data[0] = data[0] + 4
         return tonumber(
-            ffi.cast('uint32_t', bit.bswap(tonumber(scalar.u32))))
+            ffi.cast('uint32_t', bit.bswap(tonumber(reg.u32))))
     end
 else
     decode_u32 = function(data)
@@ -346,9 +346,9 @@ end
 local decode_u64
 if strict_alignment then
     decode_u64 = function(data)
-        ffi.copy(scalar, data[0], 8);
+        ffi.copy(reg, data[0], 8);
         data[0] = data[0] + 8
-        local num = bit.bswap(scalar.u64)
+        local num = bit.bswap(reg.u64)
         if num <= DBL_INT_MAX then
             return tonumber(num) -- return as 'number'
         end
@@ -375,8 +375,8 @@ end
 local decode_i16
 if strict_alignment then
     decode_i16 = function(data)
-        ffi.copy(scalar, data[0], 2)
-        local num = bswap_u16(scalar.u16)
+        ffi.copy(reg, data[0], 2)
+        local num = bswap_u16(reg.u16)
         data[0] = data[0] + 2
         -- note: this double cast is actually necessary
         return tonumber(ffi.cast('int16_t', ffi.cast('uint16_t', num)))
@@ -393,8 +393,8 @@ end
 local decode_i32
 if strict_alignment then
     decode_i32 = function(data)
-        ffi.copy(scalar, data[0], 4)
-        local num = bit.bswap(tonumber(scalar.u32))
+        ffi.copy(reg, data[0], 4)
+        local num = bit.bswap(tonumber(reg.u32))
         data[0] = data[0] + 4
         return num
     end
@@ -409,9 +409,9 @@ end
 local decode_i64
 if strict_alignment then
     decode_i64 = function(data)
-        ffi.copy(scalar, data[0], 8)
+        ffi.copy(reg, data[0], 8)
         data[0] = data[0] + 8
-        local num = bit.bswap(scalar.i64)
+        local num = bit.bswap(reg.i64)
         if num >= -DBL_INT_MAX and num <= DBL_INT_MAX then
             return tonumber(num) -- return as 'number'
         end
@@ -542,7 +542,7 @@ end
 -- element. It is significally faster on LuaJIT to use double pointer than
 -- return result, newpos.
 --
-local bufp = scalar.acucp;
+local bufp = reg.acucp;
 
 local function check_offset(offset, len)
     if offset == nil then
diff --git a/src/lua/socket.lua b/src/lua/socket.lua
index 1ddafee2f..ad894c0fb 100644
--- a/src/lua/socket.lua
+++ b/src/lua/socket.lua
@@ -10,8 +10,8 @@ local fiber = require('fiber')
 local fio = require('fio')
 local log = require('log')
 local buffer = require('buffer')
-local scalar = buffer.scalar
-local scalar_array = buffer.scalar_array
+local reg1 = buffer.reg1
+local reg2 = buffer.reg2
 local static_alloc = buffer.static_alloc
 
 local format = string.format
@@ -475,9 +475,9 @@ local function socket_setsockopt(self, level, name, value)
     end
 
     if info.type == 1 then
-        scalar.ai[0] = value
+        reg1.ai[0] = value
         local res = ffi.C.setsockopt(fd,
-            level, info.iname, scalar.ai, ffi.sizeof('int'))
+            level, info.iname, reg1.ai, ffi.sizeof('int'))
 
         if res < 0 then
             self._errno = boxerrno()
@@ -519,9 +519,9 @@ local function socket_getsockopt(self, level, name)
     self._errno = nil
 
     if info.type == 1 then
-        local value = scalar_array[0].ai
+        local value = reg1.ai
         value[0] = 0
-        local len = scalar_array[1].as
+        local len = reg2.as
         len[0] = ffi.sizeof('int')
         local res = ffi.C.getsockopt(fd, level, info.iname, value, len)
 
@@ -538,7 +538,7 @@ local function socket_getsockopt(self, level, name)
 
     if info.type == 2 then
         local value = static_alloc('char', 256)
-        local len = scalar.as
+        local len = reg1.as
         len[0] = 256
         local res = ffi.C.getsockopt(fd, level, info.iname, value, len)
         if res < 0 then
@@ -562,8 +562,8 @@ local function socket_linger(self, active, timeout)
     self._errno = nil
     if active == nil then
         local value = static_alloc('linger_t')
-        local len = scalar.as
-        len[0] = 2 * ffi.sizeof('linger_t')
+        local len = reg1.as
+        len[0] = ffi.sizeof('linger_t')
         local res = ffi.C.getsockopt(fd, level, info.iname, value, len)
         if res < 0 then
             self._errno = boxerrno()
@@ -806,7 +806,7 @@ local function get_recv_size(self, size)
             -- them using message peek.
             local iflags = get_iflags(internal.SEND_FLAGS, {'MSG_PEEK'})
             assert(iflags ~= nil)
-            size = tonumber(ffi.C.recv(fd, scalar.ac, 1, iflags))
+            size = tonumber(ffi.C.recv(fd, reg1.ac, 1, iflags))
             -- Prevent race condition: proceed with the case when
             -- a datagram of length > 0 has been arrived after the
             -- getsockopt call above.
diff --git a/src/lua/string.lua b/src/lua/string.lua
index 6ed6be8bf..6e12c59ae 100644
--- a/src/lua/string.lua
+++ b/src/lua/string.lua
@@ -15,8 +15,8 @@ ffi.cdef[[
 ]]
 
 local c_char_ptr     = ffi.typeof('const char *')
-local strip_newstart = buffer.scalar_array[0].aul
-local strip_newlen   = buffer.scalar_array[1].aul
+local strip_newstart = buffer.reg1.aul
+local strip_newlen   = buffer.reg2.aul
 
 local memcmp  = ffi.C.memcmp
 local memmem  = ffi.C.memmem
diff --git a/test/app/buffer.result b/test/app/buffer.result
index 10494b812..beccc5a87 100644
--- a/test/app/buffer.result
+++ b/test/app/buffer.result
@@ -7,17 +7,17 @@ buffer = require('buffer')
 ffi = require('ffi')
 ---
 ...
--- Scalar.
-scalar = buffer.scalar
+-- Registers.
+reg1 = buffer.reg1
 ---
 ...
-scalar.u16 = 100
+reg1.u16 = 100
 ---
 ...
 u16 = ffi.new('uint16_t[1]')
 ---
 ...
-ffi.copy(u16, scalar, 2)
+ffi.copy(u16, reg1, 2)
 ---
 ...
 u16[0]
@@ -27,10 +27,10 @@ u16[0]
 u16[0] = 200
 ---
 ...
-ffi.copy(scalar, u16, 2)
+ffi.copy(reg1, u16, 2)
 ---
 ...
-scalar.u16
+reg1.u16
 ---
 - 200
 ...
diff --git a/test/app/buffer.test.lua b/test/app/buffer.test.lua
index 6b1a974ca..a1c380680 100644
--- a/test/app/buffer.test.lua
+++ b/test/app/buffer.test.lua
@@ -2,16 +2,16 @@ test_run = require('test_run').new()
 buffer = require('buffer')
 ffi = require('ffi')
 
--- Scalar.
-scalar = buffer.scalar
-scalar.u16 = 100
+-- Registers.
+reg1 = buffer.reg1
+reg1.u16 = 100
 u16 = ffi.new('uint16_t[1]')
-ffi.copy(u16, scalar, 2)
+ffi.copy(u16, reg1, 2)
 u16[0]
 
 u16[0] = 200
-ffi.copy(scalar, u16, 2)
-scalar.u16
+ffi.copy(reg1, u16, 2)
+reg1.u16
 
 -- Alignment.
 _ = buffer.static_alloc('char') -- This makes buffer pos unaligned.