Re: An (even more) optimized version of cpuminer (pooler's cpuminer, CPU-only)

Quote from: JayDDee on September 14, 2023, 07:55:40 PM

It looks like you're comparing the original code with -ftree-vectorize to the "hand coded" version with -ftree-vectorize. That doesn't prove anything about -ftree-vectorize.
You need to test the same code with and without vectorization. Did your hand-coded version actually use parallel SIMD Salsa on the data "arranged in lanes"?

OK, the original driver code looks like this:

Code: ("main.c")

#include <time.h>
#include "salsa.h"
void salsa(uint *X, uint rounds);

/*
 * Benchmark driver: fills a 16-word state with 0..15, prints it via
 * abin2hex(), then times a single salsa() call of 1024*1024 rounds using
 * clock() and reports the elapsed CPU time in seconds.
 *
 * argc and argv are not used.
 */
int main(int argc, char **argv) {
    uint state[16];
    const int total_rounds = 1024 * 1024;
    clock_t t_begin, t_end;
    double elapsed_sec;

    /* Deterministic input: word k holds the value k. */
    int word = 0;
    while (word < 16) {
        state[word] = word;
        word++;
    }

    /* Show the 64 input bytes before the timed section.
     * NOTE(review): abin2hex is declared outside this listing (presumably
     * in salsa.h); confirm whether the returned string must be freed. */
    puts(abin2hex((unsigned char *) state, 4 * 16));

    /* Only the salsa() call itself sits between the two clock() samples. */
    t_begin = clock();
    salsa(state, total_rounds);
    t_end = clock();

    elapsed_sec = ((double) (t_end - t_begin)) / CLOCKS_PER_SEC;
    printf("cputime %g\n", elapsed_sec);

    return 0;
}

/*
 * Salsa20 core, operating in place on the 16-word state X.
 *
 * X      - 16 words of state; read on entry, and on exit each word has the
 *          mixed value added to it (X[i] += x_i).
 * rounds - number of rounds to run. Rounds are executed in pairs (one loop
 *          iteration = 8 quarter-rounds), so this should be a multiple
 *          of 2; if it is odd, the trailing single round is not executed.
 *
 * ROTL32 is expected to come from salsa.h (presumably a 32-bit rotate-left;
 * confirm against that header).
 */
void __attribute__ ((noinline)) salsa(uint *X, uint rounds) {
    uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t;

    /* Work on local copies so the original X is still available for the
     * final addition below. */
    x0 = X[0];   x1 = X[1];   x2 = X[2];    x3 = X[3];
    x4 = X[4];   x5 = X[5];   x6 = X[6];    x7 = X[7];
    x8 = X[8];   x9 = X[9];   x10 = X[10];  x11 = X[11];
    x12 = X[12]; x13 = X[13]; x14 = X[14];  x15 = X[15];

/*
 * One quarter-round on (a, b, c, d) using the scratch word t, with
 * rotation amounts 7, 9, 13 and 18. When v is non-zero every intermediate
 * value is printed. Wrapped in do { } while (0) so that each use is a
 * single statement. Values are unsigned, hence %u.
 */
#define quarter(a, b, c, d, v) do { \
        t = a + d;          if (v) printf("t: %u\n", t); \
        t = ROTL32(t, 7);   if (v) printf("t: %u\n", t); \
        b ^= t;             if (v) printf("b: %u\n", b); \
        t = b + a;          if (v) printf("t: %u\n", t); \
        t = ROTL32(t, 9);   if (v) printf("t: %u\n", t); \
        c ^= t;             if (v) printf("c: %u\n", c); \
        t = c + b;          if (v) printf("t: %u\n", t); \
        t = ROTL32(t, 13);  if (v) printf("t: %u\n", t); \
        d ^= t;             if (v) printf("d: %u\n", d); \
        t = d + c;          if (v) printf("t: %u\n", t); \
        t = ROTL32(t, 18);  if (v) printf("t: %u\n", t); \
        a ^= t;             if (v) printf("a: %u\n", a); \
    } while (0)

    /* Debug trace switch; leave at 0 for benchmarking. */
    int v = 0;

    /* Test "rounds >= 2" rather than "rounds != 0": rounds is unsigned, so
     * with an odd count "rounds -= 2" would wrap past zero and the loop
     * would never terminate. */
    for (; rounds >= 2; rounds -= 2) {
        /* First round of the pair: the four columns of the state when it
         * is viewed as a row-major 4x4 matrix. */
        quarter( x0,  x4,  x8, x12, v);
        quarter( x5,  x9, x13,  x1, v);
        quarter(x10, x14,  x2,  x6, v);
        quarter(x15,  x3,  x7, x11, v);
        /* Second round of the pair: the four rows of the same matrix. */
        quarter( x0,  x1,  x2,  x3, v);
        quarter( x5,  x6,  x7,  x4, v);
        quarter(x10, x11,  x8,  x9, v);
        quarter(x15, x12, x13, x14, v);
    }

    /* Add the mixed words back onto the input state. */
    X[0] += x0;   X[1] += x1;   X[2] += x2;    X[3] += x3;
    X[4] += x4;   X[5] += x5;   X[6] += x6;    X[7] += x7;
    X[8] += x8;   X[9] += x9;   X[10] += x10;  X[11] += x11;
    X[12] += x12; X[13] += x13; X[14] += x14;  X[15] += x15;

#undef quarter
}

That runs 1024 x 1024 = 1048576 rounds.
Without optimization (i.e. no -O flags):

Code:

cputime 0.187971
cputime 0.231245
cputime 0.187873

~ 202.363 ms on average for those 1048576 rounds

With -O2 optimization but without -ftree-vectorize:

Code:

cputime 0.011749
cputime 0.011733
cputime 0.025701

~ 16.394 ms on average for those 1048576 rounds