It looks like you're comparing the original code with -ftree-vectorize to the "hand coded" version with -ftree-vectorize. That doesn't prove anything about -ftree-vectorize.
You need to test the same code with and without vectorization. Did your hand coded version actually use parallel SIMD Salsa on the data "arranged in lanes"?
OK, the original driver code looks like this:
#include <time.h>
#include "salsa.h"
void salsa(uint *X, uint rounds);
/*
 * Benchmark driver: fills a 16-word state with 0..15, prints the string
 * returned by abin2hex() for those 64 bytes, then times a single call to
 * salsa() with 1024*1024 rounds and reports the CPU time in seconds.
 */
int main(int argc, char **argv) {
    uint state[16];
    const int round_count = 1024 * 1024;

    /* Initial state: word k holds the value k. */
    int k = 0;
    while (k < 16) {
        state[k] = k;
        ++k;
    }

    /* Show the state before it is permuted (presumably as a hex dump). */
    puts(abin2hex((unsigned char *) state, 4 * 16));

    /* Only the salsa() call itself sits between the two clock() samples. */
    const clock_t t_begin = clock();
    salsa(state, round_count);
    const clock_t t_end = clock();

    const double elapsed_sec = ((double) (t_end - t_begin)) / CLOCKS_PER_SEC;
    printf("cputime %g\n", elapsed_sec);
    return 0;
}
/*
 * Salsa20 core permutation, applied in place to the 16-word state X.
 *
 * X      - 16 words. They are read on entry; on exit each word has had the
 *          permuted value added to it (X[i] += x_i).
 * rounds - number of single rounds. Every loop iteration performs a double
 *          round (four column quarter-rounds followed by four row
 *          quarter-rounds), so this should be a multiple of 2. An odd value
 *          is rounded down to the nearest even value; previously an odd
 *          value made the unsigned counter wrap past zero and the loop
 *          never terminated.
 *
 * NOTE(review): `uint` and ROTL32 are not defined in this block (presumably
 * they come from salsa.h); this assumes a 32-bit unsigned type and a 32-bit
 * rotate-left - confirm against the header.
 */
void __attribute__ ((noinline)) salsa(uint *X, uint rounds) {
    uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t;

    /* Work on local copies so the state can stay in registers. */
    x0 = X[0];   x1 = X[1];   x2 = X[2];   x3 = X[3];
    x4 = X[4];   x5 = X[5];   x6 = X[6];   x7 = X[7];
    x8 = X[8];   x9 = X[9];   x10 = X[10]; x11 = X[11];
    x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15];

/* Print one intermediate value when tracing is on; %u matches the unsigned
 * argument (the original used %d). */
#define salsa_trace(on, name, val) \
    do { if (on) printf(name ": %u\n", (unsigned) (val)); } while (0)

/* One quarter-round: b, c, d and a are updated in that order, each XORed
 * with a rotated sum of the two words updated or read just before it.
 * Uses the enclosing function's scratch variable t. */
#define quarter(a, b, c, d, trace) \
    do { \
        t = (a) + (d);      salsa_trace(trace, "t", t); \
        t = ROTL32(t, 7);   salsa_trace(trace, "t", t); \
        (b) ^= t;           salsa_trace(trace, "b", b); \
        t = (b) + (a);      salsa_trace(trace, "t", t); \
        t = ROTL32(t, 9);   salsa_trace(trace, "t", t); \
        (c) ^= t;           salsa_trace(trace, "c", c); \
        t = (c) + (b);      salsa_trace(trace, "t", t); \
        t = ROTL32(t, 13);  salsa_trace(trace, "t", t); \
        (d) ^= t;           salsa_trace(trace, "d", d); \
        t = (d) + (c);      salsa_trace(trace, "t", t); \
        t = ROTL32(t, 18);  salsa_trace(trace, "t", t); \
        (a) ^= t;           salsa_trace(trace, "a", a); \
    } while (0)

    /* Set to non-zero to print every intermediate value. */
    int v = 0;

    /* `>= 2` rather than `!= 0` so an odd count cannot wrap around. */
    for (; rounds >= 2; rounds -= 2) {
        /* Column round. */
        quarter( x0,  x4,  x8, x12, v);
        quarter( x5,  x9, x13,  x1, v);
        quarter(x10, x14,  x2,  x6, v);
        quarter(x15,  x3,  x7, x11, v);
        /* Row round. */
        quarter( x0,  x1,  x2,  x3, v);
        quarter( x5,  x6,  x7,  x4, v);
        quarter(x10, x11,  x8,  x9, v);
        quarter(x15, x12, x13, x14, v);
    }

    /* Feed-forward: add the permuted words back into the caller's state. */
    X[0] += x0;   X[1] += x1;   X[2] += x2;   X[3] += x3;
    X[4] += x4;   X[5] += x5;   X[6] += x6;   X[7] += x7;
    X[8] += x8;   X[9] += x9;   X[10] += x10; X[11] += x11;
    X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15;

#undef quarter
#undef salsa_trace
}
That will run 1024 x 1024 = 1048576 rounds.
without optimization i.e. no -O flags
cputime 0.187971
cputime 0.231245
cputime 0.187873
~ 202.363 ms for that 1048576 rounds
with optimization: -O2 but no -ftree-vectorize
cputime 0.011749
cputime 0.011733
cputime 0.025701
~ 16.394 ms for that 1048576 rounds