Post
Topic
Board Development & Technical Discussion
Re: GPUCompute.h => ComputeKeysSEARCH_MODE_SA Rewrited for better performance
by
Fibonacci_Dev
on 03/12/2024, 10:31:22 UTC
something like this would be better!

Code:
// Device-side routine that walks a group of candidate points around a starting
// point (startx, starty) and invokes CHECK_HASH_SEARCH_MODE_SA for the centre
// index GRP_SIZE / 2 and for the indices GRP_SIZE / 2 +/- (i + 1), i in [0, HSIZE).
//
// Parameters:
//   mode      - not referenced directly in this body.
//   startx    - source of the 4-limb starting x value (read via Load256A).
//   starty    - source of the 4-limb starting y value (read via Load256A).
//   hash160   - not referenced directly in this body.
//   maxFound  - not referenced directly in this body.
//   out       - not referenced directly in this body.
//
// NOTE(review): mode/hash160/maxFound/out are presumably consumed by the
// CHECK_HASH_SEARCH_MODE_SA macro (defined elsewhere), which likely also reads
// px/py by name -- confirm against the macro definition before renaming any local.
// NOTE(review): Gx, Gy, _2Gnx, HSIZE, GRP_SIZE, Load256/Load256A, ModSub256,
// ModNeg256, _ModMult and _ModInvGrouped are all defined outside this snippet;
// their exact semantics are assumed from their names -- TODO confirm.
__device__ void ComputeKeysSEARCH_MODE_SA(uint32_t mode, uint64_t* startx, uint64_t* starty,
    uint32_t* hash160, uint32_t maxFound, uint32_t* out)
{
    // dx has GRP_SIZE / 2 + 1 rows; the code below writes rows 0..HSIZE, so this
    // requires HSIZE < group_size (HSIZE is defined elsewhere -- TODO confirm).
    const int group_size = GRP_SIZE / 2 + 1;
    // NOTE(review): dx is declared __shared__, i.e. one copy per thread block, yet
    // every thread fills it below from its own sx. If startx/starty differ per
    // thread (presumably they do), threads overwrite each other's rows -- verify.
    __shared__ uint64_t dx[group_size][4];
    // NOTE(review): gx_local/gy_local are written in the first loop but never read
    // afterwards in this body (the second loop reloads from Gx/Gy directly).
    __shared__ uint64_t gx_local[HSIZE][4], gy_local[HSIZE][4];
    // NOTE(review): dy, _s and _p2 are not used in the visible body (unless the
    // CHECK_HASH_SEARCH_MODE_SA macro references them -- confirm).
    uint64_t px[4], py[4], pyn[4], sx[4], sy[4], dy[4], _s[4], _p2[4];

    __syncthreads();

    // Load the starting point into sx/sy and keep a working copy in px/py.
    Load256A(sx, startx);
    Load256A(sy, starty);
    Load256(px, sx);
    Load256(py, sy);

    __syncthreads();

    // Copy table entry i (4 limbs, taken at offset 4 * i from Gx / Gy) into the
    // shared tables and store ModSub256(gx_local[i], sx) in dx[i]
    // (presumably the x-difference Gx[i] - sx mod p -- TODO confirm).
    for (uint32_t i = 0; i < HSIZE; ++i) {
        Load256(gx_local[i], Gx + 4 * i);
        Load256(gy_local[i], Gy + 4 * i);
        ModSub256(dx[i], gx_local[i], sx);
    }
    // Extra row built from _2Gnx; it is not read again in this body.
    ModSub256(dx[HSIZE], _2Gnx, sx);

    __syncthreads();
    // Presumably replaces every dx row with its modular inverse in one batch --
    // TODO confirm against _ModInvGrouped.
    _ModInvGrouped(dx);

    __syncthreads();
    // Check for the centre of the group; px/py still hold the starting point here.
    CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2);

    // pyn = ModNeg256(py), taken while py still equals sy (presumably -sy mod p).
    ModNeg256(pyn, py);

    // For each table entry, handle the "+(i + 1)" and "-(i + 1)" neighbours.
    #pragma unroll
    for (uint32_t i = 0; i < HSIZE; ++i) {
uint64_t gx_local_cache[4], gy_local_cache[4];
        // Per-thread copies of table entry i, reloaded from Gx/Gy.
        Load256(gx_local_cache, Gx + 4 * i);
        Load256(gy_local_cache, Gy + 4 * i);

        // Per-iteration copies of the starting point.
        uint64_t px_local[4], py_local[4];
        Load256(px_local, sx);
        Load256(py_local, sy);

        uint64_t dy_cache[4], s_cache[4], p2_cache[4];
        // Runs twice: sign == 1 first, then sign == -1.
        for (int sign = 1; sign >= -1; sign -= 2) {
            // Reset the working point to the starting point for each direction.
            Load256(px, px_local);
            Load256(py, py_local);

            // dy_cache = gy - py for the positive direction, pyn - gy for the
            // negative one (operand order as passed to ModSub256).
            if (sign == 1) {
                ModSub256(dy_cache, gy_local_cache, py);
            } else {
                ModSub256(dy_cache, pyn, gy_local_cache);
            }

            // NOTE(review): only limb 0 of s_cache and p2_cache is assigned here;
            // limbs 1..3 are never written, yet the full 4-limb arrays are passed
            // to ModSub256 / _ModMult below. __umul64hi yields just the upper
            // 64 bits of a 64x64-bit product, so this is not a 256-bit modular
            // multiply or square -- looks like a placeholder for _ModMult/_ModSqr;
            // verify before use.
            s_cache[0] = __umul64hi(dy_cache[0], dx[i][0]);
            p2_cache[0] = __umul64hi(s_cache[0], s_cache[0]);

            // px = p2_cache - px, then px is further reduced by gx_local_cache
            // (two-argument ModSub256 presumably subtracts in place -- TODO confirm).
            ModSub256(px, p2_cache, px);
            ModSub256(px, gx_local_cache);

            // py = gx_local_cache - px, combined with s_cache via _ModMult, then
            // reduced by gy_local_cache (in-place forms assumed -- TODO confirm).
            // NOTE(review): both directions subtract the same gy_local_cache here;
            // confirm that is intended for the sign == -1 pass.
            ModSub256(py, gx_local_cache, px);
            _ModMult(py, s_cache);
            ModSub256(py, gy_local_cache);

            // Report the candidate at the index that mirrors the direction taken.
            if (sign == 1) {
                CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2 + (i + 1));
            } else {
                CHECK_HASH_SEARCH_MODE_SA(GRP_SIZE / 2 - (i + 1));
            }
        }
    }
}

Using shared memory is undoubtedly a substantial advantage, but you can't use it that way without a complete refactoring of all the GPU libraries. (In fact, your code does not even compile.) Anyway, you understood from the start that shared memory is the final goal. Grin

I published this first version because it works. However, I am working on an enhanced version of these libraries using Shared Memory.