* [tarantool-patches] Re: [PATCH] lua: add optional 'chars' param to string.strip functions
2018-12-20 18:02 [tarantool-patches] [PATCH] lua: add optional 'chars' param to string.strip functions Michał Durak
@ 2018-12-23 9:23 ` Alexander Turenko
0 siblings, 0 replies; 2+ messages in thread
From: Alexander Turenko @ 2018-12-23 9:23 UTC (permalink / raw)
To: tarantool-patches
Hi!
Thanks for the effort.
Michał, your email is hidden (shown as 'Michał Durak
<dmarc-noreply@freelists.org>'), so I cannot reply you directly. I think
it is something in settings on freelists. Check it, please.
I think searching for a set of certain characters using regular
expressions is a kind of overkill. I created simple benchmark:
```
#!/usr/bin/env tarantool
local clock = require('clock')
local str = '###dfdfweweds'
local start = clock.monotonic()
for i = 1, 1000000 do
local res = string.lstrip(str, '#')
end
local duration = clock.monotonic() - start
print(('%.6f'):format(duration))
```
And tried to reimplement string.lstrip() with straight O(M*N) alrorithm:
```
@@ -373,10 +373,36 @@ local function string_lstrip(inp, chars)
if chars == '' then
return inp
end
-
- chars = chars:gsub('[%^%$%(%)%%%.%[%]%*%+%-%?]', "%%%0"):gsub('%z+', '%%z')
- local pattern = string.format("^[%s]*(.-)", chars)
- return (string.gsub(inp, pattern, "%1"))
+
+ local casted_inp = ffi.cast('const char *', inp)
+ local len = inp:len()
+ local casted_chars = ffi.cast('const char *', chars)
+ local chars_len = chars:len()
+ local skipped = 0
+
+ for i = 0, len - 1 do
+ local c = casted_inp[i]
+ local matched = false
+ for j = 0, chars_len - 1 do
+ if c == casted_chars[j] then
+ skipped = skipped + 1
+ matched = true
+ break
+ end
+ end
+ if not matched then
+ break
+ end
+ end
+
+ local res_len = len - skipped
+ local res = ffi.new('char[?]', res_len)
+
+ for i = 0, res_len - 1 do
+ res[i] = casted_inp[i + skipped]
+ end
+
+ return ffi.string(res, res_len)
end
local function string_rstrip(inp, chars)
```
That give me 3x gain on the benchmark above:
// use regexps
0.373055
0.371193
0.377918
// use pointers
0.115181
0.112625
0.124399
So I propose to use simple algorithm with pointers. It is straight and
quite fast.
Note: This is just sketchy code to proof we can do it w/o regexps, I
likely missed some corner cases here.
BTW, using memchr() is a bit slower, but looks more clear:
```
diff --git a/src/lua/string.lua b/src/lua/string.lua
index 9eab33423..dcfe404f3 100644
--- a/src/lua/string.lua
+++ b/src/lua/string.lua
@@ -6,6 +6,8 @@ ffi.cdef[[
const char *needle, size_t needle_len);
int memcmp(const char *mem1, const char *mem2, size_t num);
int isspace(int c);
+ const char *
+ memchr(const char *s, int c, size_t n);
]]
local c_char_ptr = ffi.typeof('const char *')
@@ -13,6 +15,7 @@ local c_char_ptr = ffi.typeof('const char *')
local memcmp = ffi.C.memcmp
local memmem = ffi.C.memmem
local isspace = ffi.C.isspace
+local memchr = ffi.C.memchr
local err_string_arg = "bad argument #%d to '%s' (%s expected, got %s)"
@@ -373,10 +376,31 @@ local function string_lstrip(inp, chars)
if chars == '' then
return inp
end
-
- chars = chars:gsub('[%^%$%(%)%%%.%[%]%*%+%-%?]', "%%%0"):gsub('%z+', '%%z')
- local pattern = string.format("^[%s]*(.-)", chars)
- return (string.gsub(inp, pattern, "%1"))
+
+ local casted_inp = ffi.cast('const char *', inp)
+ local len = inp:len()
+ local casted_chars = ffi.cast('const char *', chars)
+ local chars_len = chars:len()
+ local skipped = 0
+
+ for i = 0, len - 1 do
+ local c = casted_inp[i]
+ local matched = memchr(casted_chars, c, chars_len) ~= nil;
+ if matched then
+ skipped = skipped + 1
+ else
+ break
+ end
+ end
+
+ local res_len = len - skipped
+ local res = ffi.new('char[?]', res_len)
+
+ for i = 0, res_len - 1 do
+ res[i] = casted_inp[i + skipped]
+ end
+
+ return ffi.string(res, res_len)
end
```
// use memchr
0.127543
0.126210
0.118646
WBR, Alexander Turenko.
On Thu, Dec 20, 2018 at 06:02:51PM +0000, Michał Durak wrote:
> Add optional 'chars' parameter to string.strip, string.lstrip
> and string.rstrip for specifying the unwanted characters.
> Behavior modeled after the equivalent Python built-ins.
>
> Closes: #2977
> ---
>
> branch: https://github.com/gdrbyKo1/tarantool/tree/gh-2977
> issue: https://github.com/tarantool/tarantool/issues/2977
>
> src/lua/string.lua | 51 ++++++++++++++++++++++++++++++++++++++------
> test/app-tap/string.test.lua | 44 +++++++++++++++++++++++++++++++-------
> 2 files changed, 81 insertions(+), 14 deletions(-)
>
> diff --git a/src/lua/string.lua b/src/lua/string.lua
> index cbce26b35..9eab33423 100644
> --- a/src/lua/string.lua
> +++ b/src/lua/string.lua
> @@ -339,25 +339,64 @@ local function string_fromhex(inp)
> return ffi.string(res, len)
> end
>
> -local function string_strip(inp)
> +local function string_strip(inp, chars)
> if type(inp) ~= 'string' then
> error(err_string_arg:format(1, "string.strip", 'string', type(inp)), 2)
> end
> - return (string.gsub(inp, "^%s*(.-)%s*$", "%1"))
> + if chars == nil then
> + return (string.gsub(inp, "^%s*(.-)%s*$", "%1"))
> + end
> +
> + if type(chars) ~= 'string' then
> + error(err_string_arg:format(2, "string.strip", 'string', type(chars)), 2)
> + end
> + if chars == '' then
> + return inp
> + end
> +
> + chars = chars:gsub('[%^%$%(%)%%%.%[%]%*%+%-%?]', "%%%0"):gsub('%z+', '%%z')
> + local pattern = string.format("^[%s]*(.-)[%s]*$", chars, chars)
> + return (string.gsub(inp, pattern, "%1"))
> end
>
> -local function string_lstrip(inp)
> +local function string_lstrip(inp, chars)
> if type(inp) ~= 'string' then
> error(err_string_arg:format(1, "string.lstrip", 'string', type(inp)), 2)
> end
> - return (string.gsub(inp, "^%s*(.-)", "%1"))
> + if chars == nil then
> + return (string.gsub(inp, "^%s*(.-)", "%1"))
> + end
> +
> + if type(chars) ~= 'string' then
> + error(err_string_arg:format(2, "string.lstrip", 'string', type(chars)), 2)
> + end
> + if chars == '' then
> + return inp
> + end
> +
> + chars = chars:gsub('[%^%$%(%)%%%.%[%]%*%+%-%?]', "%%%0"):gsub('%z+', '%%z')
> + local pattern = string.format("^[%s]*(.-)", chars)
> + return (string.gsub(inp, pattern, "%1"))
> end
>
> -local function string_rstrip(inp)
> +local function string_rstrip(inp, chars)
> if type(inp) ~= 'string' then
> error(err_string_arg:format(1, "string.rstrip", 'string', type(inp)), 2)
> end
> - return (string.gsub(inp, "(.-)%s*$", "%1"))
> + if chars == nil then
> + return (string.gsub(inp, "(.-)%s*$", "%1"))
> + end
> +
> + if type(chars) ~= 'string' then
> + error(err_string_arg:format(2, "string.rstrip", 'string', type(chars)), 2)
> + end
> + if chars == '' then
> + return inp
> + end
> +
> + chars = chars:gsub('[%^%$%(%)%%%.%[%]%*%+%-%?]', "%%%0"):gsub('%z+', '%%z')
> + local pattern = string.format("(.-)[%s]*$", chars)
> + return (string.gsub(inp, pattern, "%1"))
> end
>
>
> diff --git a/test/app-tap/string.test.lua b/test/app-tap/string.test.lua
> index 7203fcd36..6e87d3285 100755
> --- a/test/app-tap/string.test.lua
> +++ b/test/app-tap/string.test.lua
> @@ -134,18 +134,46 @@ test:test("fromhex", function(test)
> end)
>
> test:test("strip", function(test)
> - test:plan(6)
> + test:plan(21)
> local str = " hello hello "
> - test:is(string.len(string.strip(str)), 11, "strip")
> - test:is(string.len(string.lstrip(str)), 12, "lstrip")
> - test:is(string.len(string.rstrip(str)), 13, "rstrip")
> + test:is(string.len(string.strip(str)), 11, "strip (no chars)")
> + test:is(string.len(string.lstrip(str)), 12, "lstrip (no chars)")
> + test:is(string.len(string.rstrip(str)), 13, "rstrip (no chars)")
> local _, err = pcall(string.strip, 12)
> - test:ok(err and err:match("%(string expected, got number%)"))
> + test:ok(err and err:match("#1 to '.-%.strip' %(string expected, got number%)"))
> _, err = pcall(string.lstrip, 12)
> - test:ok(err and err:match("%(string expected, got number%)"))
> + test:ok(err and err:match("#1 to '.-%.lstrip' %(string expected, got number%)"))
> _, err = pcall(string.rstrip, 12)
> - test:ok(err and err:match("%(string expected, got number%)"))
> -end )
> + test:ok(err and err:match("#1 to '.-%.rstrip' %(string expected, got number%)"))
> +
> + str = "www.example.com/foo"
> + local chars = "w./fomc"
> + test:is(string.len(string.strip(str, chars)), 7, "strip (chars)")
> + test:is(string.len(string.lstrip(str, chars)), 15, "lstrip (chars)")
> + test:is(string.len(string.rstrip(str, chars)), 11, "rstrip (chars)")
> +
> + str, chars = "^$()%.[]*+-?BEEP^$()%.[]*+-?%", "^$()%.[]*+-?"
> + test:is(string.len(string.strip(str, chars)), 4, "strip (magic chars)")
> + test:is(string.len(string.lstrip(str, chars)), 17, "lstrip (magic chars)")
> + test:is(string.len(string.rstrip(str, chars)), 16, "rstrip (magic chars)")
> +
> + str, chars = "\0\00\000HELLO\000\00\0\0", "\0"
> + test:is(string.len(string.strip(str, chars)), 5, "strip (chars with embedded 0s)")
> + test:is(string.len(string.lstrip(str, chars)), 9, "lstrip (chars with embedded 0s)")
> + test:is(string.len(string.rstrip(str, chars)), 8, "rstrip (chars with embedded 0s)")
> +
> + str, chars = " test ", ""
> + test:is(string.strip(str, chars), str, "strip (0-length chars)")
> + test:is(string.lstrip(str, chars), str, "lstrip (0-length chars)")
> + test:is(string.rstrip(str, chars), str, "rstrip (0-length chars)")
> +
> + _, err = pcall(string.strip, 'foo', 12)
> + test:ok(err and err:match("#2 to '.-%.strip' %(string expected, got number%)"))
> + _, err = pcall(string.lstrip, 'bar', 12)
> + test:ok(err and err:match("#2 to '.-%.lstrip' %(string expected, got number%)"))
> + _, err = pcall(string.rstrip, 'baz', 12)
> + test:ok(err and err:match("#2 to '.-%.rstrip' %(string expected, got number%)"))
> +end)
>
> test:test("unicode", function(test)
> test:plan(104)
> --
> 2.11.0
>
^ permalink raw reply [flat|nested] 2+ messages in thread