HP Forums - Fast remainder

This thread is inspired by thread: Puzzle - RPL and others

We extended the search not just to decimal, but base n, and allowed diigts 1 .. n-1
One of the hot spot is the expensive (mod k) operation.

Updated code unroll loops to reduce expensive modulo.
(this cut down modulo to 1/6th, with reduce n domain, 0 < n ≤ 190)

d = ((((((d*n + X[i])*n + X[i+1])*n + X[i+2])*n + X[i+3])*n + X[i+4])*n + X[i+5]) % k

I just learned another way, by replacing modulo with 2 multiply and shift.
(Python infinite precision integer need masking M step; C code get this for free.)

It go straight for remainder, and not waste time computing quotient.
Below setup is for 0 ≤ x ≤ 2**16 = 65536

>>> M = 0xffffffff
>>> def mod17(x, c = M//17+1): return (x*c & M) * k >> 32
...
>>> print mod17(12345), 12345 % 17
3 3

Code:

#include <stdio.h>

#include <stdlib.h>

#include <stdint.h>

// note: with 32-bits c, x and k really limited to 16-bits

static uint32_t fastmod(uint32_t x, uint64_t k, const uint32_t c)

{

    return (x * c * k) >> 32;   // = x % k

}

uint32_t gcd(uint32_t a, uint32_t b)

{

    for(;;) {

        if (b == 0) return a;

        a %= b;

        if (a == 0) return b;

        b %= a;

    }

}

void printx(uint32_t *x, uint32_t n)

{

    printf("{ ");

    for(uint32_t i=1; i<n; i++) printf("%d ", x[i]);

    printf("}\n");

}

void recurse(uint32_t *a, uint32_t n, uint32_t k, uint32_t *x)

{

    if (k==n) return printx(x, k);

    uint32_t c = a[2*n+k], d = 0, step = k+k;

    for(uint32_t i=1; i < k; i++) {

        d = fastmod((d + x[i])*n, k, c);

    }

    d = k - d;                          // valid (mod k)

    if (k%2) {  // odd k

        if (d%2 == 0) d += k;           // k = d = 1 (mod 2)

    } else {    // even k

        if (k%4 == 0) step = k;         // step  = 0 (mod 4)

        else if ((n+d)%4 == 0) d += k;  // n+d+k = 0 (mod 4)

    }

    for(uint32_t bucket = a[n+k]; d < n; d += step) {

        if (a[d] && a[n+d] == bucket) {

            x[k] = d;

            a[d] = 0;                   // mark taken

            recurse(a, n, k+1, x);

            a[d] = d;                   // restore

        }

    }

}

void puzzle(uint32_t n) {       // polydivisible number, base n

    if (n%2) return;            // n must be even

    uint32_t a[3*n], x[n], i;

    for(i=1; i < n; i++) {

        a[i] = i;               //   +1 to  n-1 = duplicate check

        a[n+i] = gcd(n,i);      //  n+1 to 2n-1 = gcd buckets

        a[2*n+i] = (~0U)/i + 1; // 2n+1 to 3n-1 = fastmod factor

    }

    recurse(a, n, 1, x-1);      // arrays all 1-based

}

int main(int argc, char **argv)

{

    uint32_t n = 0;

    if (argc > 1) n = atoi(argv[1]);

    if (n-1 < 181) {puzzle(n); return 0;}

    return printf("Usage %s n, 0 < n <= 181\n", argv[0]);

}

>polydivisible 2
{ 1 }

>polydivisible 4
{ 1 2 3 }
{ 3 2 1 }

>polydivisible 6
{ 1 4 3 2 5 }
{ 5 4 3 2 1 }

>polydivisible 8
{ 3 2 5 4 1 6 7 }
{ 5 2 3 4 7 6 1 }
{ 5 6 7 4 3 2 1 }

>polydivisible 10
{ 3 8 1 6 5 4 7 2 9 }

>polydivisible 12

>polydivisible 14
{ 9 12 3 10 5 4 7 6 11 8 1 2 13 }

For n=60 (no solution): % take 114 sec., fastmod take 58 sec, 114/58 = 2.0X

Reference:
Faster remainders when the divisor is a constant: beating compilers and libdivide
And, for finer points, More fun with fast remainders when the divisor is a constant

Update: I messed up the code (now fixed)
I had forgotten fastmod required very wide bits to work.
For 64-bits machine, fastmod arguments is limited to 16-bits.

This prevented any unrolling (and still use 1 fastmod per loop)
2*(n-1)*n < 2^16 → Base n ≤ 181

With better digit buckets, we improved n=60 timing, from 58 to 16 sec. Smile

Here is the reason the better bucket work.

Time savings is directly related to the much reduced search cases.
Original version, using gcd(n, d) buckets, required 529,631,194 recursive calls.
Updated version, with gcd(2n,d) buckets, required 148,843,222 recursive calls.

148843222 / 529631194 ≈ 28% → 58 * 28% = 16 sec (as expected)

Code:

#include <stdio.h>

#include <stdlib.h>

#include <stdint.h>

// note: with 32-bits c, x and k really limited to 16-bits

static uint32_t fastmod(uint32_t x, uint64_t k, const uint32_t c)

{

    return (x * c * k) >> 32;   // = x % k

}

uint32_t gcd(uint32_t a, uint32_t b)

{

    for(;;) {

        if (b == 0) return a;

        a %= b;

        if (a == 0) return b;

        b %= a;

    }

}

void printx(uint32_t *x, uint32_t n)

{

    printf("{ ");

    for(uint32_t i=1; i<n; i++) printf("%d ", x[i]);

    printf("}\n");

}

void recurse(uint32_t *a, uint32_t n, uint32_t k, uint32_t *x)

{

    if (k==n) return printx(x, k);

    uint32_t c = a[3*n+k], d = 0, step = k+k;

    for(uint32_t i=2; i < k; i+=2) {

        d = fastmod((d + x[i-1])*n, k, c);

        d = fastmod((d + x[i])*n, k, c);

    }

    if (k%2) {  // odd k

        d = k - d;                      // valid (mod k)

        if (d%2 == 0) d += k;           // k = d = 1 (mod 2)

    } else {    // even k

        d = (d + x[k-1])*n;             // missed iteration

        d = k - fastmod(d, k, c);       // valid (mod k)

        if (k%4 == 0) step = k;         // step  = 0 (mod 4)

        else if ((n+d)%4 == 0) d += k;  // n+d+k = 0 (mod 4)

    }

    for(uint32_t bucket = a[2*n+k]; d < n; d += step) {

        if (a[d] && a[n+d] == bucket) {

            x[k] = d;

            a[d] = 0;                   // mark taken

            recurse(a, n, k+1, x);

            a[d] = d;                   // restore

        }

    }

}

void puzzle(uint32_t n) {       // polydivisible number, base n

    if (n%2) return;            // n must be even

    uint32_t a[4*n], x[n], i, j, g, bad = 0;

    for(i=1; i < n; i++) {

        a[i] = i;               //   +1 to  n-1 = duplicate check

        a[3*n+i] = (~0U)/i + 1; // 3n+1 to 4n-1 = fastmod factor

        if ((g = a[n+i] = gcd(n,i)) != (a[2*n+i] = gcd(2*n,i))) {

            for (x[bad] = g, j=0; x[j] != g; j++) ;

            if (j == bad) bad++;    // bad buckets

        }

    }

    for(i=0; i < bad; i++) {        // fix bad buckets

        for(g = x[i], j=2*n+1; j < 3*n; j++)

            if (g == a[j]) a[j-n] = g+g;

    }

    recurse(a, n, 1, x-1);          // arrays all 1-based

}

int main(int argc, char **argv)

{

    uint32_t n = 0;

    if (argc > 1) n = atoi(argv[1]);

    if (n-1 < 181) {puzzle(n); return 0;}

    return printf("Usage %s n, 0 < n <= 181\n", argv[0]);

}

Update1: fixed fastmod domain problem (see previous post)
Update2: unroll loop 1 time (but, 2 fastmod per loop), n=60 timing at 14.5 sec (1.5 sec saving)

(05-10-2021 02:44 PM)Albert Chan Wrote: [ -> ]Update2: unroll loop 1 time (but, 2 fastmod per loop), n=60 timing at 14.5 sec (1.5 sec saving)

The reason for the update is to compare effect of replacing 2 16-bits fastmod with 1 32-bits mod

First experiment, for the inner loop:

< d = fastmod((d + x[i-1])*n, k, c);
< d = fastmod((d + x[i])*n, k, c);

> d = (d + x[i-1])*n;
> d = (d + x[i])*n % k

For n=60, timings at 19.5 sec. (slower, by 5 sec)
This confirmed Lemire's chart. fastmod is about 3 times as fast (blue vs red)

[Image: hashbenches-skylake-clang-300x180.png]

[Image: hashbenches-skylake-clang-300x180.png]

---

We could pre-compute multiply constants and shifts, to speed-up quotient (x%k = x-q*k)
Unfortunately, code is more involved ... I use libdivide.h, for fast divide.
(It is just a header file, very easy to use; for C++, it is even easier)

libdivide version is good. n=60 timings at 14.8 sec (slower, by only 0.3 sec)
For this tiny cost, we have both quotient and remainder !

And, it cover more bases, 2n^3 ≤ 2^32 → Base n ≤ 1290

Code:

#include <stdio.h>

#include <stdlib.h>

#include <stdint.h>

#include <libdivide.h>

// 32-bits mod, x % k = x-q*k, where q is the quotient

static uint32_t fastmod(uint32_t x, uint32_t k, struct libdivide_u32_t *c)

{

    return x - libdivide_u32_do(x, c) * k;  // = x % k

}

uint32_t gcd(uint32_t a, uint32_t b)

{

    for(;;) {

        if (b == 0) return a;

        a %= b;

        if (a == 0) return b;

        b %= a;

    }

}

void printx(uint32_t *x, uint32_t n)

{

    printf("{ ");

    for(uint32_t i=1; i<n; i++) printf("%d ", x[i]);

    printf("}\n");

}

void recurse(uint32_t *a, uint32_t n, uint32_t k,

             struct libdivide_u32_t *fast_d, uint32_t *x)

{

    if (k==n) return printx(x, n);

    uint32_t d = 0, step = k+k;

    struct libdivide_u32_t *c = &fast_d[k];

    for(uint32_t i=2; i < k; i+=2) {    // assumed n <= 1290

        d = (d + x[i-1])*n;

        d = fastmod((d + x[i])*n, k, c);

    }

    if (k%2) {  // odd k

        d = k - d;                      // valid (mod k)

        if (d%2 == 0) d += k;           // k = d = 1 (mod 2)

    } else {    // even k

        d = (d + x[k-1])*n;             // missed iteration

        d = k - fastmod(d, k, c);       // valid (mod k)

        if (k%4 == 0) step = k;         // step  = 0 (mod 4)

        else if ((n+d)%4 == 0) d += k;  // n+d+k = 0 (mod 4)

    }

    for(uint32_t bucket = a[2*n+k]; d < n; d += step) {

        if (a[d] && a[n+d] == bucket) {

            x[k] = d;

            a[d] = 0;                   // mark taken

            recurse(a, n, k+1, fast_d, x);

            a[d] = d;                   // restore

        }

    }

}

void puzzle(uint32_t n) {       // polydivisible number, base n

    if (n%2) return;            // n must be even

    uint32_t a[3*n], x[n], i, j, g, bad = 0;

    struct libdivide_u32_t fast_d[n];

    for(i=1; i < n; i++) {

        a[i] = i;               //   +1 to  n-1 = duplicate check

        fast_d[i] = libdivide_u32_gen(i);

        if ((g = a[n+i] = gcd(n,i)) != (a[2*n+i] = gcd(2*n,i))) {

            for (x[bad] = g, j=0; x[j] != g; j++) ;

            if (j == bad) bad++;    // bad buckets

        }

    }

    for(i=0; i < bad; i++) {        // fix bad buckets

        for(g = x[i], j=2*n+1; j < 3*n; j++)

            if (g == a[j]) a[j-n] = g+g;

    }

    recurse(a, n, 1, fast_d, x-1);  // arrays all 1-based

}

int main(int argc, char **argv)

{

    uint32_t n = 0;

    if (argc > 1) n = atoi(argv[1]);

    if (n-1 < 1290) {puzzle(n); return 0;}

    return printf("Usage %s n, 0 < n <= 1290\n", argv[0]);

}

Perhaps of interest:
How do compilers optimize divisions?
http://homepage.divms.uiowa.edu/~jones/bcd/

(05-10-2021 02:44 PM)Albert Chan Wrote: [ -> ]Update2: unroll loop 1 time (but, 2 fastmod per loop), n=60 timing at 14.5 sec (1.5 sec saving)

My estimate is too conservative. Base is small, much less than 16 bits.
If base n ≤ 128, we may reduce 2 fastmod to 1, and still maintain correctness.

Or, keep the code as is, but pushed up base, n ≤ 1024.

Faster Remainder by Direct Computation: Applications to Compilers and Software Libraries

Conclusion part of Algorithm 1 Wrote:On this basis, Algorithm 2 gives the minimal number of fractional bits F. It is sufficient to pick F ≥ N + log2(d)

x % d = fastmod(x, d, c) // 0 < x < 2^N
c = ceil(2^F/d) = floor((2^F-1)/d)+1 // pre-computed

Our code is using 32-bits unsigned, F = 32

d = 1 to n-1 ⇒ d < n.
If n = 128 = 2^7, then log2(d) < 7

F ≥ N + log2(d)
max(N) = 32 - 7 = 25

If we combined 2 fastmod into 1, we have:

x = ((d + x[i-1])*n + x[i])*n < 2*n^3 = 2^(1 + 3*log2(n))

N ≤ 1+3*7 = 22 ≤ 25, thus we are safe.

---

Without combining 2 fastmod into 1, but pushed up base to maximum of 1024 = 2^10

max(N) = 32 - 10 = 22

x = (d + x[i-1])*n < 2*n^2 = 2^(1 + 2*log2(n))

N ≤ 1+2*10 = 21 ≤ 22, thus we are safe.

Conclusion part of Algorithm 1 Wrote:On this basis, Algorithm 2 gives the minimal number of fractional bits F. It is sufficient to pick F ≥ N + log2(d)

We can intuitively show above is true, by thinking floating point.

Let M = 2^F - 1, c = ⌊M/d⌋ + 1, s = "significant bits" of c = F - log2(d)

x % d == (x*c & M) * d >> 32 == ⌊ (x*c & M) * d / 2**32 ⌋

(x*c & M) removed quotient of x/d, leaving "fractional part" of s significant bits.

If x has less than s significnat bits, result is accurate.
If x is too big, returning s significant bits is not enough.

Example:

>>> M = (1<<32) - 1 # F = 32
>>> fastmod = lambda x: int((x*c & M) * d >> 32)
>>> test = lambda x: (fastmod(x), x%d, len(bin(x))-2)

>>> d = 17
>>> c = M//d + 1 # = 0xf0f0f10, 4*7 = 28 bits
>>> test((1<<28) - 1) # 28 bits x, ok
(15, 15, 28)
>>> test((1<<28)) # 29 bits x, failed
(0, 16, 29)

>>> d = 29
>>> c = M//d + 1 # = 0x8d3dcb1, 4*7 = 28 bits
>>> test(0x13b13b13) # 29 bits x, ok
(5, 5, 29)
>>> test(0x13b13b14) # 29 bits x, failed
(7, 6, 29)

(05-12-2021 03:10 PM)Albert Chan Wrote: [ -> ]If n = 128 = 2^7, then log2(d) < 7

F ≥ N + log2(d)
max(N) = 32 - 7 = 25

If we combined 2 fastmod into 1, we have:

x = ((d + x[i-1])*n + x[i])*n < 2*n^3 = 2^(1 + 3*log2(n))

N ≤ 1+3*7 = 22 ≤ 25, thus we are safe.

Instead of using general rule, we can get more closely to the fastmod limit.

M = 2^F - 1
c = floor(M/d) + 1 = (M+1)/d + (d-1-M%d)/d = 2^F/d + cerr

fastmod(x, d, c) = floor((x*c & M) * d / (M+1))

For fastmod to work, x must not be too big to cause error ≥ 1, where floor cannot correct.

x * cerr * d / (M+1) < 1
x * (d-1 - M%d) ≤ M

max(x) = floor(M / (d-1 - M%d))

Note that max(x) goes infinite when RHS denominator goes 0, when d = pow-of-2.
To confirm the equation, lets redo previous post examples, by direct calculation.

>>> M = (1<<32) - 1 # F = 32
>>> maxx = lambda d: int(M // (d-1 - M%d))
>>> print hex(maxx(17)), hex(maxx(29))

0xfffffff 0x13b13b13

Now, we are ready to get closer fastmod limit.

Code:

for n in range(128, 256):

    try: limit = maxx(n-1)

    except ZeroDivisionError: continue

    if 2*n*n*n > limit: print 'max base =',n-1; break

max base = 217

Actually, looping it all is unnecessary.
We maximize denominator (assume M%d=0), replace integer divide with float divide.

2*n^3 = M / (n-2)
n ≈ ⁴√(M/2) ≈ 215.3

Consider effect of n-2 ≈ n, this is below solution for actual n.
Start from n=216, we hit the sweet spot in 3 tries (n=218 failed max(x) test)

---

This is the patch to code (post #2)
For n=60, timing goes from 14.5 sec, down to 12.3 sec.

Code:

33c33

<         d = fastmod((d + x[i-1])*n, k, c);

---

>         d = (d + x[i-1])*n;

79,80c79,80

<     if (n-1 < 181) {puzzle(n); return 0;}

<     return printf("Usage %s n, 0 < n <= 181\n", argv[0]);

---

>     if (n-1 < 217) {puzzle(n); return 0;}

>     return printf("Usage %s n, 0 < n <= 217\n", argv[0]);

Added another version, similar to fmod optimized version, with cached intermediates.
Just like fmod version, code just return raw data, we have to work out the digits.

> polydivisible 10
{ 3 38 1 16 5 54 7 72 9 } → {3,8,1,6,5,4,7,2,9}

> polydivisible 14
{ 9 138 3 52 5 74 7 104 11 162 1 16 13 } → {9,12,3,10,5,4,7,6,11,8,1,2,13}

Code:

#include <stdio.h>

#include <stdlib.h>

#include <stdint.h>

// note: with 32-bits c, x and k is limited

static uint32_t fastmod(uint32_t x, uint64_t k, const uint32_t c)

{

    return (x * c * k) >> 32;   // = x % k

}

uint32_t gcd(uint32_t a, uint32_t b)

{

    for(;;) {

        if (b == 0) return a;

        a %= b;

        if (a == 0) return b;

        b %= a;

    }

}

void printx(uint32_t *x, uint32_t n)

{

    printf("{ ");

    for(uint32_t i=1; i<n; i++) printf("%d ", x[i]);

    printf("}\n");

}

void recurse(uint32_t *a, uint32_t n, uint32_t k, uint32_t *x)

{

    if (k==n) return printx(x, k);

    uint32_t c = a[3*n+k], step = k+k, i=2, d=k-1, D=0;

    for(; i < d; i += 2) D = fastmod(D*n*n + x[i], k, c);

    if (--i != d && d) x[d] += x[i]*n;  // add last digit

    D = k%2 ? fastmod(D*n*n + x[d], k, c) : D*n + x[d];

    d = k - fastmod(D*n, k, c);         // valid (mod k)

    if (k%2) {  // odd k

        if (d%2 == 0) d += k;           // k = d = 1 (mod 2)

    } else {    // even k

        if (k%4 == 0) step = k;         // step  = 0 (mod 4)

        else if ((n+d)%4 == 0) d += k;  // n+d+k = 0 (mod 4)

    }

    for(c = a[2*n+k]; d < n; d += step) {

        if (a[d] && a[n+d] == c) {

            x[k] = d;

            a[d] = 0;                   // mark taken

            recurse(a, n, k+1, x);

            a[d] = d;                   // restore

        }

    }

}

void puzzle(uint32_t n) {       // polydivisible number, base n

    if (n%2) return;            // n must be even

    uint32_t a[4*n], x[n], i, j, g, bad = 0;

    for(i=1; i < n; i++) {

        a[i] = i;               //   +1 to  n-1 = duplicate check

        a[3*n+i] = (~0U)/i + 1; // 3n+1 to 4n-1 = fastmod factor

        if ((g = a[n+i] = gcd(n,i)) != (a[2*n+i] = gcd(2*n,i))) {

            for (x[bad] = g, j=0; x[j] != g; j++) ;

            if (j == bad) bad++;    // bad buckets

        }

    }

    for(i=0; i < bad; i++) {        // fix bad buckets

        for(g = x[i], j=2*n+1; j < 3*n; j++)

            if (g == a[j]) a[j-n] = g+g;

    }

    recurse(a, n, 1, x);            // arrays all 1-based

}

int main(int argc, char **argv)

{

    int n = 0;

    if (argc > 1) n = atoi(argv[1]);

    if (0 < n && n <= 256) {puzzle(n); return 0;}

    return printf("Usage %s n, 0 < n <= 256\n", argv[0]);

}

For n=60, timing reduced to 10.5 sec Smile

Also, base covered more cases, 0 < n ≤ 256
(technically 0 < n ≤ 257, but odd n has no solution).

Lemire's general rule work well here:

If n = 256 = 2^8, n^3 = 2^24 → N = 24
F = N - log2(d) → max(N) = 32 - 8 = 24, confirmed OK.

Code:

    recurse(a, n, 1, x);            // arrays all 1-based

There is a subtle issue here, which turns out harmless.

First call to recurse(), k=1, x[0] is accessed.
I was going to add x[0] = 0, before recurse(), but turns out not needed.

x%1 = fastmod(x, d=1, c=0) = 0, does not matter what x is.

All d that is pow-of-2 had this property, fastmod without error (see previous post).