From f3628725ddb2c3f3bef24952b1f5ae879d1d6094 Mon Sep 17 00:00:00 2001 From: Roman Gershman Date: Sun, 18 Dec 2022 18:37:52 +0200 Subject: [PATCH] chore: Improve the implementation of simd based packing Signed-off-by: Roman Gershman --- src/core/compact_object.cc | 2 +- src/core/compact_object_test.cc | 12 ++++++++++- src/core/detail/bitpacking.cc | 38 +++++++++++++++++++++++++++++++++ src/core/detail/bitpacking.h | 2 ++ 4 files changed, 52 insertions(+), 2 deletions(-) diff --git a/src/core/compact_object.cc b/src/core/compact_object.cc index 8b971b5444db..3930282b04ba 100644 --- a/src/core/compact_object.cc +++ b/src/core/compact_object.cc @@ -674,7 +674,7 @@ void CompactObj::SetString(std::string_view str) { } tl.tmp_buf.resize(encode_len); - detail::ascii_pack_simd(str.data(), str.size(), tl.tmp_buf.data()); + detail::ascii_pack_simd2(str.data(), str.size(), tl.tmp_buf.data()); encoded = string_view{reinterpret_cast(tl.tmp_buf.data()), encode_len}; if (encoded.size() <= kInlineLen) { diff --git a/src/core/compact_object_test.cc b/src/core/compact_object_test.cc index a550ff7cc461..06003f2c94b4 100644 --- a/src/core/compact_object_test.cc +++ b/src/core/compact_object_test.cc @@ -204,7 +204,7 @@ TEST_F(CompactObjectTest, AsciiUtil) { } string act_str(data3.size(), 'y'); std::vector binvec(detail::binpacked_len(data3.size())); - detail::ascii_pack_simd(data3.data(), data3.size(), binvec.data()); + detail::ascii_pack_simd2(data3.data(), data3.size(), binvec.data()); detail::ascii_unpack_simd(binvec.data(), data3.size(), act_str.data()); ASSERT_EQ(data3, act_str); @@ -546,6 +546,16 @@ static void BM_PackSimd(benchmark::State& state) { } BENCHMARK(BM_PackSimd); +static void BM_PackSimd2(benchmark::State& state) { + string val(1024, 'a'); + uint8_t buf[1024]; + + while (state.KeepRunning()) { + detail::ascii_pack_simd2(val.data(), val.size(), buf); + } +} +BENCHMARK(BM_PackSimd2); + static void BM_UnpackNaive(benchmark::State& state) { string val(1024, 'a'); uint8_t 
buf[1024]; diff --git a/src/core/detail/bitpacking.cc b/src/core/detail/bitpacking.cc index 738fe82cebcf..ea882d5c0bdf 100644 --- a/src/core/detail/bitpacking.cc +++ b/src/core/detail/bitpacking.cc @@ -141,6 +141,44 @@ void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin) { ascii_pack(ascii, end - ascii, bin); } +void ascii_pack_simd2(const char* ascii, size_t len, uint8_t* bin) { + // I leave out 16 bytes in addition to 16 that we load in the loop + // because we store into bin full 16 bytes instead of 14. To prevent data + // overwrite we finish loop one iteration earlier. + const char* end = ascii + len - 32; + + // Skips 8th byte (index 7) in the lower 8-byte part. + const __m128i control = _mm_set_epi8(-1, -1, 14, 13, 12, 11, 10, 9, 8, 6, 5, 4, 3, 2, 1, 0); + + __m128i val, rpart, lpart; + + // Based on the question I asked here: https://stackoverflow.com/q/74831843/2280111 + while (ascii <= end) { + val = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ascii)); + + /* + x = ((x & 0x7F007F007F007F00) >> 1) | (x & 0x007F007F007F007F); + x = ((x & 0x3FFF00003FFF0000) >> 2) | (x & 0x00003FFF00003FFF); + x = ((x & 0x0FFFFFFF00000000) >> 4) | (x & 0x000000000FFFFFFF); + */ + val = _mm_maddubs_epi16(_mm_set1_epi16(0x8001), val); + val = _mm_madd_epi16(_mm_set1_epi32(0x40000001), val); + + rpart = _mm_and_si128(val, _mm_set1_epi64x(0x000000000FFFFFFF)); + lpart = _mm_and_si128(val, _mm_set1_epi64x(0x0FFFFFFF00000000)); + val = _mm_or_si128(_mm_srli_epi64(lpart, 4), rpart); + + val = _mm_shuffle_epi8(val, control); + _mm_storeu_si128(reinterpret_cast<__m128i*>(bin), val); + bin += 14; + ascii += 16; + } + + end += 32; // Bring back end. + DCHECK(ascii < end); + ascii_pack(ascii, end - ascii, bin); +} + // unpacks 8->7 encoded blob back to ascii. // generally, we can not unpack inplace because ascii (dest) buffer is 8/7 bigger than // the source buffer. 
diff --git a/src/core/detail/bitpacking.h b/src/core/detail/bitpacking.h index a55c69c34ae5..ada8107a0a82 100644 --- a/src/core/detail/bitpacking.h +++ b/src/core/detail/bitpacking.h @@ -26,6 +26,8 @@ void ascii_pack(const char* ascii, size_t len, uint8_t* bin); void ascii_pack2(const char* ascii, size_t len, uint8_t* bin); void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin); +void ascii_pack_simd2(const char* ascii, size_t len, uint8_t* bin); + bool compare_packed(const uint8_t* packed, const char* ascii, size_t ascii_len); // maps ascii len to 7-bit packed length. Each 8 bytes are converted to 7 bytes.