Skip to content

Commit

Permalink
lib/os/prf.c: alternate implementation for _ldiv5()
Browse files Browse the repository at this point in the history
The _ldiv5() is an optimized divide-by-5 function that is smaller and
faster than the generic libgcc implementation.

Yet it can be made even smaller and faster with this replacement
implementation based on a reciprocal multiplication plus some tricks.

For example, here's the assembly from the original code on ARM:

_ldiv5:
        ldr     r3, [r0]
        movw    ip, zephyrproject-rtos#52429
        ldr     r1, [r0, zephyrproject-rtos#4]
        movt    ip, 52428
        adds    r3, r3, zephyrproject-rtos#2
        push    {r4, r5, r6, r7, lr}
        mov     lr, #0
        adc     r1, r1, lr
        adds    r2, lr, lr
        umull   r7, r6, ip, r1
        lsr     r6, r6, zephyrproject-rtos#2
        adc     r7, r6, r6
        adds    r2, r2, r2
        adc     r7, r7, r7
        adds    r2, r2, lr
        adc     r7, r7, r6
        subs    r3, r3, r2
        sbc     r7, r1, r7
        lsr     r2, r3, zephyrproject-rtos#3
        orr     r2, r2, r7, lsl zephyrproject-rtos#29
        umull   r2, r1, ip, r2
        lsr     r2, r1, zephyrproject-rtos#2
        lsr     r7, r1, zephyrproject-rtos#31
        lsl     r1, r2, zephyrproject-rtos#3
        adds    r4, lr, r1
        adc     r5, r6, r7
        adds    r2, r1, r1
        adds    r2, r2, r2
        adds    r2, r2, r1
        subs    r2, r3, r2
        umull   r3, r2, ip, r2
        lsr     r2, r2, zephyrproject-rtos#2
        adds    r4, r4, r2
        adc     r5, r5, #0
        strd    r4, [r0]
        pop     {r4, r5, r6, r7, pc}

And here's the resulting assembly with this commit applied:

_ldiv5:
        push    {r4, r5, r6, r7}
        movw    r4, zephyrproject-rtos#13107
        ldr     r6, [r0]
        movt    r4, 13107
        ldr     r1, [r0, zephyrproject-rtos#4]
        mov     r3, #0
        umull   r6, r7, r6, r4
        add     r2, r4, r4, lsl zephyrproject-rtos#1
        umull   r4, r5, r1, r4
        adds    r1, r6, r2
        adc     r2, r7, r2
        adds    ip, r6, r4
        adc     r1, r7, r5
        adds    r2, ip, r2
        adc     r2, r1, r3
        adds    r2, r4, r2
        adc     r3, r5, r3
        strd    r2, [r0]
        pop     {r4, r5, r6, r7}
        bx      lr

So we're down to 20 instructions from 36 initially, with only 2 umull
instructions instead of 3, and slightly smaller stack footprint.

Signed-off-by: Nicolas Pitre <[email protected]>
  • Loading branch information
Nicolas Pitre committed Oct 31, 2020
1 parent f4a4583 commit 135b070
Showing 1 changed file with 41 additions and 22 deletions.
63 changes: 41 additions & 22 deletions lib/os/prf.c
Original file line number Diff line number Diff line change
Expand Up @@ -130,37 +130,56 @@ static void _rlrshift(uint64_t *v)
* sense to define this much smaller special case here to avoid
* including it just for printf.
*
* It works by iteratively dividing the most significant 32 bits of
* the 64 bit value by 5. This will leave a remainder of 0-4
* (i.e. three significant bits), ensuring that the top 29 bits of the
* remainder are zero for the next iteration. Thus in the second
* iteration only 35 significant bits remain, and in the third only
* six. This was tested exhaustively through the first ~10B values in
* the input space, and for ~2e12 (4 hours runtime) random inputs
* taken from the full 64 bit space.
* It works by multiplying v by the reciprocal of 5 i.e.:
*
* result = v * ((1 << 64) / 5) / (1 << 64)
*
* This produces a 128-bit result, but we drop the bottom 64 bits which
* accounts for the division by (1 << 64). The product is kept to 64 bits
* by summing partial multiplications and shifting right by 32 which on
* most 32-bit architectures means only a register drop.
*
* Here the multiplier is: (1 << 64) / 5 = 0x3333333333333333
* i.e. a 62 bits value. To compensate for the reduced precision, we
* add an initial bias of 1 to v. Enlarging the multiplier to 64 bits
* would also work but a final right shift would be needed, and carry
* handling on the summing of partial mults would be necessary, requiring
* more instructions. Given that we already want to add bias of 2 for
* the result to be rounded to nearest and not truncated, we might as well
* combine those together into a bias of 3. This also conveniently allows
* for keeping the multiplier in a single 32-bit register given its pattern.
*/
static void _ldiv5(uint64_t *v)
{
uint32_t hi;
uint64_t rem = *v, quot = 0U, q;
int i;
uint32_t v_lo = *v;
uint32_t v_hi = *v >> 32;
uint32_t m = 0x33333333;
uint64_t result;

static const char shifts[] = { 32, 3, 0 };
/*
* Force the multiplier constant into a register and make it
* opaque to the compiler, otherwise gcc tries to be too smart
* for its own good with a large expansion of adds and shifts.
*/
__asm__ ("" : "+r" (m));

/*
* Usage in this file wants rounded behavior, not truncation. So add
* two to get the threshold right.
* Apply the bias of 3. We can't add it to v as this would overflow
* it when at max range. Factor it out with the multiplier upfront.
* Here we multiply the low and high parts separately to avoid an
* unnecessary 64-bit add-with-carry.
*/
rem += 2U;
result = ((uint64_t)(m * 3U) << 32) | (m * 3U);

for (i = 0; i < 3; i++) {
hi = rem >> shifts[i];
q = (uint64_t)(hi / 5U) << shifts[i];
rem -= q * 5U;
quot += q;
}
/* The actual multiplication. */
result += (uint64_t)v_lo * m;
result >>= 32;
result += (uint64_t)v_lo * m;
result += (uint64_t)v_hi * m;
result >>= 32;
result += (uint64_t)v_hi * m;

*v = quot;
*v = result;
}

static char _get_digit(uint64_t *fr, int *digit_count)
Expand Down

0 comments on commit 135b070

Please sign in to comment.