From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtp37.i.mail.ru (smtp37.i.mail.ru [94.100.177.97]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id 50F2145C310 for ; Thu, 28 May 2020 02:32:36 +0300 (MSK) From: Vladislav Shpilevoy Date: Thu, 28 May 2020 01:32:23 +0200 Message-Id: In-Reply-To: References: MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [Tarantool-patches] [PATCH v2 04/10] crc32: align memory access List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: tarantool-patches@dev.tarantool.org, alyapunov@tarantool.org, korablev@tarantool.org, tsafin@tarantool.org There is some assembly working with a byte array like with an array of unsigned long values. That is incorrect, because the byte array may be not aligned by 'unsigned long'. The patch makes crc32 calculate the hash on the prefix byte-by-byte, then word-by-word on aligned addresses, and again byte-by-byte on a tail which is less than word. Assuming that the word-by-word part is the longest, this should reduce number of memory/cache loads in ~x2 times. Because in case of a not aligned word load it was necessary to load 2 words, and then merge them into one. When addresses are aligned, this is only 1 load. Part of #4609 --- src/cpu_feature.c | 31 ++++++++++++----- test/unit/CMakeLists.txt | 3 ++ test/unit/crc32.c | 73 ++++++++++++++++++++++++++++++++++++++++ test/unit/crc32.result | 11 ++++++ 4 files changed, 110 insertions(+), 8 deletions(-) create mode 100644 test/unit/crc32.c create mode 100644 test/unit/crc32.result diff --git a/src/cpu_feature.c b/src/cpu_feature.c index 98567ccb3..9bf6223de 100644 --- a/src/cpu_feature.c +++ b/src/cpu_feature.c @@ -29,6 +29,7 @@ * SUCH DAMAGE. */ #include "trivia/config.h" +#include "trivia/util.h" #include #include #include @@ -50,7 +51,7 @@ static uint32_t -crc32c_hw_byte(uint32_t crc, unsigned char const *data, unsigned int length) +crc32c_hw_byte(uint32_t crc, char const *data, unsigned int length) { while (length--) { __asm__ __volatile__( @@ -68,6 +69,26 @@ crc32c_hw_byte(uint32_t crc, unsigned char const *data, unsigned int length) uint32_t crc32c_hw(uint32_t crc, const char *buf, unsigned int len) { + const int align = alignof(unsigned long); + unsigned long addr = (unsigned long)buf; + unsigned int not_aligned_prefix = + ((addr - 1 + align) & ~(align - 1)) - addr; + /* + * Calculate CRC32 for the prefix byte-by-byte so as to + * then use aligned words to calculate the rest. This is + * twice less loads, because every load takes exactly one + * word from memory. Not 2 words, which would need to be + * partially merged then. + * But the main reason is that unaligned loads are just + * unsafe, because this is an undefined behaviour. + */ + if (not_aligned_prefix < len) { + crc = crc32c_hw_byte(crc, buf, not_aligned_prefix); + buf += not_aligned_prefix; + len -= not_aligned_prefix; + } else { + return crc32c_hw_byte(crc, buf, len); + } unsigned int iquotient = len / SCALE_F; unsigned int iremainder = len % SCALE_F; unsigned long *ptmp = (unsigned long *)buf; @@ -80,13 +101,7 @@ crc32c_hw(uint32_t crc, const char *buf, unsigned int len) ); ptmp++; } - - if (iremainder) { - crc = crc32c_hw_byte(crc, (unsigned char const*)ptmp, - iremainder); - } - - return crc; + return crc32c_hw_byte(crc, (const char *)ptmp, iremainder); } bool diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 3bdfc6bb1..672122118 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -138,6 +138,9 @@ target_link_libraries(scramble.test scramble) add_executable(guava.test guava.c) target_link_libraries(guava.test salad small) +add_executable(crc32.test crc32.c) +target_link_libraries(crc32.test unit crc32) + add_executable(find_path.test find_path.c ${CMAKE_SOURCE_DIR}/src/find_path.c ) diff --git a/test/unit/crc32.c b/test/unit/crc32.c new file mode 100644 index 000000000..7493bc0db --- /dev/null +++ b/test/unit/crc32.c @@ -0,0 +1,73 @@ +/* + * Copyright 2010-2020, Tarantool AUTHORS, please see AUTHORS file. + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include "unit.h" +#include "crc32.h" + +static void +test_alignment(void) +{ + header(); + plan(4); + + unsigned long buf[1024]; + char *str = (char *)buf; + + strcpy(str, "1234567891234567"); + uint32_t crc = crc32_calc(0, str, strlen(str)); + is(crc, 3333896965, "aligned crc32 buffer without a tail"); + + strcpy(str, "12345678912345678"); + crc = crc32_calc(0, str, strlen(str)); + is(crc, 2400039513, "aligned crc32 buffer with a tail"); + + crc = crc32_calc(0, str + 2, strlen(str) - 2); + is(crc, 984331636, "not aligned crc32 buffer with a tail"); + + strcpy(str, "1234"); + crc = crc32_calc(0, str + 2, strlen(str) - 2); + is(crc, 2211472564, "not aligned buffer less than a word"); + + check_plan(); + footer(); +} + +int +main(void) +{ + crc32_init(); + + header(); + plan(1); + test_alignment(); + int rc = check_plan(); + footer(); + return rc; +} diff --git a/test/unit/crc32.result b/test/unit/crc32.result new file mode 100644 index 000000000..99b62c22a --- /dev/null +++ b/test/unit/crc32.result @@ -0,0 +1,11 @@ + *** main *** +1..1 + *** test_alignment *** + 1..4 + ok 1 - aligned crc32 buffer without a tail + ok 2 - aligned crc32 buffer with a tail + ok 3 - not aligned crc32 buffer with a tail + ok 4 - not aligned buffer less than a word +ok 1 - subtests + *** test_alignment: done *** + *** main: done *** -- 2.21.1 (Apple Git-122.3)