From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id E9445222A1 for ; Sun, 23 Dec 2018 04:23:40 -0500 (EST) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id JSIfQwAMh6wR for ; Sun, 23 Dec 2018 04:23:40 -0500 (EST) Received: from smtpng3.m.smailru.net (smtpng3.m.smailru.net [94.100.177.149]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id 172C22229F for ; Sun, 23 Dec 2018 04:23:40 -0500 (EST) Received: by smtpng3.m.smailru.net with esmtpa (envelope-from ) id 1gazyz-0002x7-IZ for tarantool-patches@freelists.org; Sun, 23 Dec 2018 12:23:38 +0300 Date: Sun, 23 Dec 2018 12:23:39 +0300 From: Alexander Turenko Subject: [tarantool-patches] Re: [PATCH] lua: add optional 'chars' param to string.strip functions Message-ID: <20181223092339.2fv2bbf5kbkdu24a@tkn_work_nb> References: MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Disposition: inline Content-Transfer-Encoding: 8bit In-Reply-To: Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-help: List-unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-subscribe: List-owner: List-post: List-archive: To: tarantool-patches@freelists.org Hi! Thanks for the effort. Michał, your email is hidden (shown as 'Michał Durak '), so I cannot reply you directly. I think it is something in settings on freelists. Check it, please. I think searching for a set of certain characters using regular expressions is a kind of overkill. I created simple benchmark: ``` #!/usr/bin/env tarantool local clock = require('clock') local str = '###dfdfweweds' local start = clock.monotonic() for i = 1, 1000000 do local res = string.lstrip(str, '#') end local duration = clock.monotonic() - start print(('%.6f'):format(duration)) ``` And tried to reimplement string.lstrip() with straight O(M*N) alrorithm: ``` @@ -373,10 +373,36 @@ local function string_lstrip(inp, chars) if chars == '' then return inp end - - chars = chars:gsub('[%^%$%(%)%%%.%[%]%*%+%-%?]', "%%%0"):gsub('%z+', '%%z') - local pattern = string.format("^[%s]*(.-)", chars) - return (string.gsub(inp, pattern, "%1")) + + local casted_inp = ffi.cast('const char *', inp) + local len = inp:len() + local casted_chars = ffi.cast('const char *', chars) + local chars_len = chars:len() + local skipped = 0 + + for i = 0, len - 1 do + local c = casted_inp[i] + local matched = false + for j = 0, chars_len - 1 do + if c == casted_chars[j] then + skipped = skipped + 1 + matched = true + break + end + end + if not matched then + break + end + end + + local res_len = len - skipped + local res = ffi.new('char[?]', res_len) + + for i = 0, res_len - 1 do + res[i] = casted_inp[i + skipped] + end + + return ffi.string(res, res_len) end local function string_rstrip(inp, chars) ``` That give me 3x gain on the benchmark above: // use regexps 0.373055 0.371193 0.377918 // use pointers 0.115181 0.112625 0.124399 So I propose to use simple algorithm with pointers. It is straight and quite fast. Note: This is just sketchy code to proof we can do it w/o regexps, I likely missed some corner cases here. BTW, using memchr() is a bit slower, but looks more clear: ``` diff --git a/src/lua/string.lua b/src/lua/string.lua index 9eab33423..dcfe404f3 100644 --- a/src/lua/string.lua +++ b/src/lua/string.lua @@ -6,6 +6,8 @@ ffi.cdef[[ const char *needle, size_t needle_len); int memcmp(const char *mem1, const char *mem2, size_t num); int isspace(int c); + const char * + memchr(const char *s, int c, size_t n); ]] local c_char_ptr = ffi.typeof('const char *') @@ -13,6 +15,7 @@ local c_char_ptr = ffi.typeof('const char *') local memcmp = ffi.C.memcmp local memmem = ffi.C.memmem local isspace = ffi.C.isspace +local memchr = ffi.C.memchr local err_string_arg = "bad argument #%d to '%s' (%s expected, got %s)" @@ -373,10 +376,31 @@ local function string_lstrip(inp, chars) if chars == '' then return inp end - - chars = chars:gsub('[%^%$%(%)%%%.%[%]%*%+%-%?]', "%%%0"):gsub('%z+', '%%z') - local pattern = string.format("^[%s]*(.-)", chars) - return (string.gsub(inp, pattern, "%1")) + + local casted_inp = ffi.cast('const char *', inp) + local len = inp:len() + local casted_chars = ffi.cast('const char *', chars) + local chars_len = chars:len() + local skipped = 0 + + for i = 0, len - 1 do + local c = casted_inp[i] + local matched = memchr(casted_chars, c, chars_len) ~= nil; + if matched then + skipped = skipped + 1 + else + break + end + end + + local res_len = len - skipped + local res = ffi.new('char[?]', res_len) + + for i = 0, res_len - 1 do + res[i] = casted_inp[i + skipped] + end + + return ffi.string(res, res_len) end ``` // use memchr 0.127543 0.126210 0.118646 WBR, Alexander Turenko. On Thu, Dec 20, 2018 at 06:02:51PM +0000, Michał Durak wrote: > Add optional 'chars' parameter to string.strip, string.lstrip > and string.rstrip for specifying the unwanted characters. > Behavior modeled after the equivalent Python built-ins. > > Closes: #2977 > --- > > branch: https://github.com/gdrbyKo1/tarantool/tree/gh-2977 > issue: https://github.com/tarantool/tarantool/issues/2977 > > src/lua/string.lua | 51 ++++++++++++++++++++++++++++++++++++++------ > test/app-tap/string.test.lua | 44 +++++++++++++++++++++++++++++++------- > 2 files changed, 81 insertions(+), 14 deletions(-) > > diff --git a/src/lua/string.lua b/src/lua/string.lua > index cbce26b35..9eab33423 100644 > --- a/src/lua/string.lua > +++ b/src/lua/string.lua > @@ -339,25 +339,64 @@ local function string_fromhex(inp) > return ffi.string(res, len) > end > > -local function string_strip(inp) > +local function string_strip(inp, chars) > if type(inp) ~= 'string' then > error(err_string_arg:format(1, "string.strip", 'string', type(inp)), 2) > end > - return (string.gsub(inp, "^%s*(.-)%s*$", "%1")) > + if chars == nil then > + return (string.gsub(inp, "^%s*(.-)%s*$", "%1")) > + end > + > + if type(chars) ~= 'string' then > + error(err_string_arg:format(2, "string.strip", 'string', type(chars)), 2) > + end > + if chars == '' then > + return inp > + end > + > + chars = chars:gsub('[%^%$%(%)%%%.%[%]%*%+%-%?]', "%%%0"):gsub('%z+', '%%z') > + local pattern = string.format("^[%s]*(.-)[%s]*$", chars, chars) > + return (string.gsub(inp, pattern, "%1")) > end > > -local function string_lstrip(inp) > +local function string_lstrip(inp, chars) > if type(inp) ~= 'string' then > error(err_string_arg:format(1, "string.lstrip", 'string', type(inp)), 2) > end > - return (string.gsub(inp, "^%s*(.-)", "%1")) > + if chars == nil then > + return (string.gsub(inp, "^%s*(.-)", "%1")) > + end > + > + if type(chars) ~= 'string' then > + error(err_string_arg:format(2, "string.lstrip", 'string', type(chars)), 2) > + end > + if chars == '' then > + return inp > + end > + > + chars = chars:gsub('[%^%$%(%)%%%.%[%]%*%+%-%?]', "%%%0"):gsub('%z+', '%%z') > + local pattern = string.format("^[%s]*(.-)", chars) > + return (string.gsub(inp, pattern, "%1")) > end > > -local function string_rstrip(inp) > +local function string_rstrip(inp, chars) > if type(inp) ~= 'string' then > error(err_string_arg:format(1, "string.rstrip", 'string', type(inp)), 2) > end > - return (string.gsub(inp, "(.-)%s*$", "%1")) > + if chars == nil then > + return (string.gsub(inp, "(.-)%s*$", "%1")) > + end > + > + if type(chars) ~= 'string' then > + error(err_string_arg:format(2, "string.rstrip", 'string', type(chars)), 2) > + end > + if chars == '' then > + return inp > + end > + > + chars = chars:gsub('[%^%$%(%)%%%.%[%]%*%+%-%?]', "%%%0"):gsub('%z+', '%%z') > + local pattern = string.format("(.-)[%s]*$", chars) > + return (string.gsub(inp, pattern, "%1")) > end > > > diff --git a/test/app-tap/string.test.lua b/test/app-tap/string.test.lua > index 7203fcd36..6e87d3285 100755 > --- a/test/app-tap/string.test.lua > +++ b/test/app-tap/string.test.lua > @@ -134,18 +134,46 @@ test:test("fromhex", function(test) > end) > > test:test("strip", function(test) > - test:plan(6) > + test:plan(21) > local str = " hello hello " > - test:is(string.len(string.strip(str)), 11, "strip") > - test:is(string.len(string.lstrip(str)), 12, "lstrip") > - test:is(string.len(string.rstrip(str)), 13, "rstrip") > + test:is(string.len(string.strip(str)), 11, "strip (no chars)") > + test:is(string.len(string.lstrip(str)), 12, "lstrip (no chars)") > + test:is(string.len(string.rstrip(str)), 13, "rstrip (no chars)") > local _, err = pcall(string.strip, 12) > - test:ok(err and err:match("%(string expected, got number%)")) > + test:ok(err and err:match("#1 to '.-%.strip' %(string expected, got number%)")) > _, err = pcall(string.lstrip, 12) > - test:ok(err and err:match("%(string expected, got number%)")) > + test:ok(err and err:match("#1 to '.-%.lstrip' %(string expected, got number%)")) > _, err = pcall(string.rstrip, 12) > - test:ok(err and err:match("%(string expected, got number%)")) > -end ) > + test:ok(err and err:match("#1 to '.-%.rstrip' %(string expected, got number%)")) > + > + str = "www.example.com/foo" > + local chars = "w./fomc" > + test:is(string.len(string.strip(str, chars)), 7, "strip (chars)") > + test:is(string.len(string.lstrip(str, chars)), 15, "lstrip (chars)") > + test:is(string.len(string.rstrip(str, chars)), 11, "rstrip (chars)") > + > + str, chars = "^$()%.[]*+-?BEEP^$()%.[]*+-?%", "^$()%.[]*+-?" > + test:is(string.len(string.strip(str, chars)), 4, "strip (magic chars)") > + test:is(string.len(string.lstrip(str, chars)), 17, "lstrip (magic chars)") > + test:is(string.len(string.rstrip(str, chars)), 16, "rstrip (magic chars)") > + > + str, chars = "\0\00\000HELLO\000\00\0\0", "\0" > + test:is(string.len(string.strip(str, chars)), 5, "strip (chars with embedded 0s)") > + test:is(string.len(string.lstrip(str, chars)), 9, "lstrip (chars with embedded 0s)") > + test:is(string.len(string.rstrip(str, chars)), 8, "rstrip (chars with embedded 0s)") > + > + str, chars = " test ", "" > + test:is(string.strip(str, chars), str, "strip (0-length chars)") > + test:is(string.lstrip(str, chars), str, "lstrip (0-length chars)") > + test:is(string.rstrip(str, chars), str, "rstrip (0-length chars)") > + > + _, err = pcall(string.strip, 'foo', 12) > + test:ok(err and err:match("#2 to '.-%.strip' %(string expected, got number%)")) > + _, err = pcall(string.lstrip, 'bar', 12) > + test:ok(err and err:match("#2 to '.-%.lstrip' %(string expected, got number%)")) > + _, err = pcall(string.rstrip, 'baz', 12) > + test:ok(err and err:match("#2 to '.-%.rstrip' %(string expected, got number%)")) > +end) > > test:test("unicode", function(test) > test:plan(104) > -- > 2.11.0 >