Post
Topic
Board Mining (Altcoins)
Re: An (even more) optimized version of cpuminer (pooler's cpuminer, CPU-only)
by
ag1233
on 15/09/2023, 05:37:25 UTC

It looks like you're comparing the original code with -ftree-vectorize to the "hand coded" version with -ftree-vectorize. That doesn't prove anything about -ftree-vectorize.
You need to test the same code with and without vectorization. Did your hand coded version actually use parallel SIMD Salsa on the data "arranged in lanes"?


OK, the original driver code looks like this:
Code: ("main.c")
#include <time.h>
#include "salsa.h"
void salsa(uint *X, uint rounds);

/*
 * Benchmark driver: fills a 16-word state with 0..15, prints it through
 * abin2hex(), then measures the processor time of one salsa() call that is
 * asked for 1024*1024 rounds and reports it in seconds.
 *
 * abin2hex(), salsa() and the uint type are declared outside this function
 * (abin2hex presumably returns a hex string of the given bytes -- confirm).
 */
int main(int argc, char **argv) {
    const int rounds = 1024*1024;
    uint state[16];
    clock_t t_begin, t_end;
    double elapsed;
    int idx = 0;

    /* Seed each word with its own index so every run starts identically. */
    while (idx < 16) {
        state[idx] = idx;
        idx++;
    }

    /* Show the initial state: 16 words of 4 bytes each. */
    puts(abin2hex((unsigned char *) state, 4*16));

    t_begin = clock();
    salsa(state, rounds);
    t_end = clock();

    /* clock() returns ticks; divide by CLOCKS_PER_SEC to get seconds. */
    elapsed = ((double) (t_end - t_begin)) / CLOCKS_PER_SEC;
    printf("cputime %g\n", elapsed);

    return 0;
}

/*
 * Salsa20 core permutation with feed-forward, applied in place to X.
 *
 * X      - 16-word state. It is read once on entry; on exit every word has
 *          had its permuted value added to it (X[i] += x_i).
 * rounds - number of rounds. They are consumed two at a time: each loop
 *          iteration performs one column round followed by one row round.
 *          rounds should be a multiple of 2; an odd count is rounded down to
 *          the previous even number (the loop stops once fewer than two
 *          rounds remain) instead of wrapping the unsigned counter past zero.
 *
 * uint and ROTL32 are defined outside this function (presumably in salsa.h).
 * NOTE(review): the arithmetic assumes uint is a 32-bit unsigned type --
 * confirm against its typedef.
 */
void __attribute__ ((noinline)) salsa(uint *X, uint rounds) {
    uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;

    x0 = X[0];   x1 = X[1];   x2 = X[2];   x3 = X[3];
    x4 = X[4];   x5 = X[5];   x6 = X[6];   x7 = X[7];
    x8 = X[8];   x9 = X[9];  x10 = X[10]; x11 = X[11];
   x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15];

/*
 * One quarter-round on the four words (a, b, c, d): add, rotate left by
 * 7/9/13/18, xor into the next word. When `verbose` is non-zero every
 * intermediate value is printed. Wrapped in do/while(0) so it acts as a
 * single statement, with its scratch word kept local to the expansion.
 * Values are cast to unsigned so they match the %u conversion.
 */
#define quarter(a, b, c, d, verbose) do { \
        uint t = (a) + (d); if (verbose) printf("t: %u\n", (unsigned) t); \
        t = ROTL32(t,  7);  if (verbose) printf("t: %u\n", (unsigned) t); \
        (b) ^= t;           if (verbose) printf("b: %u\n", (unsigned) (b)); \
        t = (b) + (a);      if (verbose) printf("t: %u\n", (unsigned) t); \
        t = ROTL32(t,  9);  if (verbose) printf("t: %u\n", (unsigned) t); \
        (c) ^= t;           if (verbose) printf("c: %u\n", (unsigned) (c)); \
        t = (c) + (b);      if (verbose) printf("t: %u\n", (unsigned) t); \
        t = ROTL32(t, 13);  if (verbose) printf("t: %u\n", (unsigned) t); \
        (d) ^= t;           if (verbose) printf("d: %u\n", (unsigned) (d)); \
        t = (d) + (c);      if (verbose) printf("t: %u\n", (unsigned) t); \
        t = ROTL32(t, 18);  if (verbose) printf("t: %u\n", (unsigned) t); \
        (a) ^= t;           if (verbose) printf("a: %u\n", (unsigned) (a)); \
    } while (0)

    /* Set to non-zero to trace every intermediate value of every quarter-round. */
    int trace = 0;

    /* `>= 2` rather than `!= 0`: an odd count must not wrap around zero. */
    for (; rounds >= 2; rounds -= 2) {
        /* Column round. */
        quarter( x0,  x4,  x8, x12, trace);
        quarter( x5,  x9, x13,  x1, trace);
        quarter(x10, x14,  x2,  x6, trace);
        quarter(x15,  x3,  x7, x11, trace);
        /* Row round. */
        quarter( x0,  x1,  x2,  x3, trace);
        quarter( x5,  x6,  x7,  x4, trace);
        quarter(x10, x11,  x8,  x9, trace);
        quarter(x15, x12, x13, x14, trace);
    }

    /* Feed-forward: add the permuted words back onto the input state. */
    X[0] += x0;   X[1] += x1;   X[2] += x2;   X[3] += x3;
    X[4] += x4;   X[5] += x5;   X[6] += x6;   X[7] += x7;
    X[8] += x8;   X[9] += x9;  X[10] += x10; X[11] += x11;
   X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15;

#undef quarter
}

That will run 1024 x 1024 = 1048576 rounds.
Without optimization, i.e. no -O flags:
Code:
cputime 0.187971
cputime 0.231245
cputime 0.187873

~ 202.363 ms on average for those 1048576 rounds

With optimization -O2 but no -ftree-vectorize:
Code:
cputime 0.011749
cputime 0.011733
cputime 0.025701
~ 16.394 ms on average for those 1048576 rounds