Re: GPUCompute.h => ComputeKeysSEARCH_MODE_SA Rewritten for better performance

Quote from: DonaldTrumpMk on Today at 02:21:02 AM

something like this would be better!

Code:

// Proposed variant of the SEARCH_MODE_SA key-computation routine that stages
// the delta-x table and copies of the Gx/Gy tables in __shared__ arrays.
//
// startx / starty : loaded into the local sx / sy arrays via Load256A.
// mode, hash160, maxFound, out : not referenced by name anywhere in this body;
//   presumably consumed by the CHECK_HASH_SEARCH_MODE_SA macro — TODO confirm
//   against the macro definition (not visible here).
//
// NOTE(review): GRP_SIZE, HSIZE, Gx, Gy, _2Gnx and every helper called below
// are defined outside this snippet; comments about what they do are
// assumptions drawn from their names and call shapes.
__device__ void ComputeKeysSEARCH_MODE_SA(uint32_t mode, uint64_t* startx, uint64_t* starty,
uint32_t* hash160, uint32_t maxFound, uint32_t* out)
{
const int group_size = GRP_SIZE / 2 + 1;
// NOTE(review): dx is __shared__, i.e. one copy per thread block, but the
// loops below let every thread write dx[i] computed from its own local sx
// with no thread-index guard. Unless all threads of a block hold identical
// sx this looks like a data race — confirm.
__shared__ uint64_t dx[group_size][4];
// Block-shared staging copies of the Gx / Gy table entries. They are written
// in the first loop and gx_local is read once there; neither array is read
// again afterwards (the second loop re-loads from Gx / Gy directly).
__shared__ uint64_t gx_local[HSIZE][4], gy_local[HSIZE][4];
// NOTE(review): dy, _s and _p2 are declared but never referenced by name in
// this body.
uint64_t px[4], py[4], pyn[4], sx[4], sy[4], dy[4], _s[4], _p2[4];

__syncthreads();

// Load the starting point into sx/sy and keep a working copy in px/py.
Load256A(sx, startx);
Load256A(sy, starty);
Load256(px, sx);
Load256(py, sy);

__syncthreads();

// For each of the HSIZE table entries: copy the entry into the shared
// staging arrays and store (presumably) gx_local[i] - sx into dx[i].
// Every thread executes this loop in full, so all threads write the same
// shared slots.
for (uint32_t i = 0; i < HSIZE; ++i) {
Load256(gx_local[i], Gx + 4 * i);
Load256(gy_local[i], Gy + 4 * i);
ModSub256(dx[i], gx_local[i], sx);
}
// One extra delta built from _2Gnx goes into slot HSIZE.
// NOTE(review): assumes HSIZE < group_size so this index is in bounds — confirm.
ModSub256(dx[HSIZE], _2Gnx, sx);

__syncthreads();
// Presumably replaces each dx entry with its modular inverse in one batch —
// TODO confirm against _ModInvGrouped.
_ModInvGrouped(dx);

__syncthreads();
// Check the starting point itself (index GRP_SIZE / 2).
CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2);

// pyn presumably holds the negated starting y, used by the sign == -1 branch.
ModNeg256(pyn, py);

#pragma unroll
for (uint32_t i = 0; i < HSIZE; ++i) {
// Per-iteration local copies of table entry i, re-read from Gx / Gy rather
// than from the gx_local / gy_local arrays filled above.
uint64_t gx_local_cache[4], gy_local_cache[4];
Load256(gx_local_cache, Gx + 4 * i);
Load256(gy_local_cache, Gy + 4 * i);

// Copies of the starting point; sx / sy do not change inside this loop.
uint64_t px_local[4], py_local[4];
Load256(px_local, sx);
Load256(py_local, sy);

uint64_t dy_cache[4], s_cache[4], p2_cache[4];
// Two passes per entry: sign == 1 first, then sign == -1.
for (int sign = 1; sign >= -1; sign -= 2) {
// Reset the working point to the starting point for this pass.
Load256(px, px_local);
Load256(py, py_local);

if (sign == 1) {
ModSub256(dy_cache, gy_local_cache, py);
} else {
ModSub256(dy_cache, pyn, gy_local_cache);
}

// NOTE(review): __umul64hi yields only the high 64 bits of a 64x64-bit
// product, and only element [0] of s_cache / p2_cache is ever assigned.
// Elements [1..3] are still uninitialized when these arrays are passed to
// ModSub256 / _ModMult below; if those helpers read all four elements the
// result is undefined. This does not look like a 256-bit modular
// multiply/square — confirm intent.
s_cache[0] = __umul64hi(dy_cache[0], dx[i][0]);
p2_cache[0] = __umul64hi(s_cache[0], s_cache[0]);

// NOTE(review): ModSub256 is called with three arguments elsewhere and with
// two arguments on the second and fourth update lines below; this assumes
// a two-argument form exists — confirm.
ModSub256(px, p2_cache, px);
ModSub256(px, gx_local_cache);

ModSub256(py, gx_local_cache, px);
_ModMult(py, s_cache);
ModSub256(py, gy_local_cache);

// Check the result at the slot i + 1 above (sign == 1) or below
// (sign == -1) the centre index GRP_SIZE / 2.
if (sign == 1) {
CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2 + (i + 1));
} else {
CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2 - (i + 1));
}
}
}
}

Using shared memory is undoubtedly a substantial advantage, but you can't use it that way: doing so requires a complete refactoring of all the GPU libraries. (In fact, your code does not even compile.) Anyway, you understood from the outset that shared memory is the final goal. :D

I published this first version because it works. However, I am working on an enhanced version of these libraries using Shared Memory.