Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize reciprocal #129

Merged
merged 2 commits into from
Mar 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 12 additions & 21 deletions include/intx/int128.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// intx: extended precision integer library.
// Copyright 2019 Pawel Bylica.
// Copyright 2019-2020 Pawel Bylica.
// Licensed under the Apache License, Version 2.0.

#pragma once
Expand Down Expand Up @@ -506,29 +506,20 @@ constexpr uint16_t reciprocal_table[] = {REPEAT256()};
/// Based on Algorithm 2 from "Improved division by invariant integers".
inline uint64_t reciprocal_2by1(uint64_t d) noexcept
{
auto d9 = uint8_t(d >> 55);
auto v0 = uint64_t{internal::reciprocal_table[d9]};
const uint64_t d9 = d >> 55;
const uint32_t v0 = internal::reciprocal_table[d9 - 256];

auto d40 = (d >> 24) + 1;
auto v1 = (v0 << 11) - (v0 * v0 * d40 >> 40) - 1;
const uint64_t d40 = (d >> 24) + 1;
const uint64_t v1 = (v0 << 11) - uint32_t(v0 * v0 * d40 >> 40) - 1;

auto v2 = (v1 << 13) + (v1 * (0x1000000000000000 - v1 * d40) >> 47);
const uint64_t v2 = (v1 << 13) + (v1 * (0x1000000000000000 - v1 * d40) >> 47);

auto d0 = d % 2;
auto d63 = d / 2 + d0; // ceil(d/2)
auto nd0 = uint64_t(-int64_t(d0));
auto e = ((v2 / 2) & nd0) - v2 * d63;
auto mh = umul(v2, e).hi;
auto v3 = (v2 << 31) + (mh >> 1);

// OPT: The compiler tries a bit too much with 128 + 64 addition and ends up using subtraction.
// Compare with __int128.
auto mf = umul(v3, d);
auto m = fast_add(mf, d);
auto v3a = m.hi + d;

auto v4 = v3 - v3a;
const uint64_t d0 = d & 1;
const uint64_t d63 = (d >> 1) + d0; // ceil(d/2)
const uint64_t e = ((v2 >> 1) & (0 - d0)) - v2 * d63;
const uint64_t v3 = (umul(v2, e).hi >> 1) + (v2 << 31);

const uint64_t v4 = v3 - (umul(v3, d) + d).hi - d;
return v4;
}

Expand All @@ -548,7 +539,7 @@ inline uint64_t reciprocal_3by2(uint128 d) noexcept
p -= d.hi;
}

auto t = umul(v, d.lo);
const auto t = umul(v, d.lo);

p += t.hi;
if (p < t.hi)
Expand Down
3 changes: 3 additions & 0 deletions test/benchmarks/bench_div.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ uint64_t nop(uint64_t x, uint64_t y) noexcept;
uint64_t soft_div_unr_unrolled(uint64_t x, uint64_t y) noexcept;
uint64_t soft_div_unr(uint64_t x, uint64_t y) noexcept;

uint64_t reciprocal_2by1_noinline(uint64_t d) noexcept;

using namespace intx;

inline uint64_t udiv_by_reciprocal(uint64_t uu, uint64_t du) noexcept
Expand Down Expand Up @@ -80,6 +82,7 @@ static void div_unary(benchmark::State& state)
}
BENCHMARK_TEMPLATE(div_unary, neg);
BENCHMARK_TEMPLATE(div_unary, reciprocal_2by1);
BENCHMARK_TEMPLATE(div_unary, reciprocal_2by1_noinline);
BENCHMARK_TEMPLATE(div_unary, reciprocal_naive);

template <uint64_t DivFn(uint64_t, uint64_t)>
Expand Down
7 changes: 6 additions & 1 deletion test/benchmarks/noinline.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// intx: extended precision integer library.
// Copyright 2019 Pawel Bylica.
// Copyright 2019-2020 Pawel Bylica.
// Licensed under the Apache License, Version 2.0.

#include <intx/intx.hpp>
Expand Down Expand Up @@ -30,3 +30,8 @@ auto exp(const uint256& x, const uint256& y) noexcept
{
return intx::exp(x, y);
}

auto reciprocal_2by1_noinline(uint64_t d) noexcept
{
return reciprocal_2by1(d);
}