Three accounts so far. Sadly, it's not very funny. It's driving people away, and we lose the purpose of this forum.
Yes, we don't lose anything. Here is a GPU accelerated version of the main function. If I made a mistake in the code, please suggest a correction.
I added a loop:
// -----------------------------------------------------------------------------------------
// Walks NB_SPIN consecutive groups of points starting at the point stored in
// (startx, starty). For every point computed, CHECK_PREFIX is invoked with the
// point's offset from the beginning of the walk. On exit the center of the
// next, not yet processed, group is written back to (startx, starty).
//
// Parameters:
//   mode      - not referenced directly in this body; presumably consumed by the
//               CHECK_PREFIX macro (defined elsewhere) - TODO confirm.
//   startx    - in/out: x coordinate of the starting point (4 x 64-bit limbs).
//   starty    - in/out: y coordinate of the starting point (4 x 64-bit limbs).
//   sPrefix   - when NULL, the first 48 bytes of lookup32 are copied into a
//               local buffer and lookup32 is redirected to that copy.
//   lookup32  - lookup data, or the 48-byte pattern when sPrefix is NULL.
//   maxFound  - not referenced directly here; presumably used by CHECK_PREFIX.
//   out       - not referenced directly here; presumably used by CHECK_PREFIX.
//
// NOTE(review): __syncthreads() is called unconditionally throughout, so every
// thread of the block must enter this function and run the same number of
// iterations.
// NOTE(review): the offset passed to CHECK_PREFIX already contains j * GRP_SIZE.
// If the CHECK_PREFIX macro in this code base adds j * GRP_SIZE on its own, the
// offset is counted twice - confirm against the macro definition.
__device__ void ComputeKeys(uint32_t mode, uint64_t *startx, uint64_t *starty,
                            prefix_t *sPrefix, uint32_t *lookup32, uint32_t maxFound, uint32_t *out) {

  uint64_t dx[GRP_SIZE/2+1][4];   // delta x values, replaced by their inverses below
  uint64_t px[4];                 // current point, x
  uint64_t py[4];                 // current point, y
  uint64_t pyn[4];                // negated y of the group center
  uint64_t sx[4];                 // group center, x
  uint64_t sy[4];                 // group center, y
  uint64_t dy[4];
  uint64_t _s[4];
  uint64_t _p2[4];
  char pattern[48];

  // Load starting key
  __syncthreads();
  Load256A(sx, startx);
  Load256A(sy, starty);
  Load256(px, sx);
  Load256(py, sy);

  if (sPrefix == NULL) {
    // Work on a thread-local copy of the 48-byte pattern
    memcpy(pattern, lookup32, 48);
    lookup32 = (uint32_t *)pattern;
  }

  uint32_t index = 0;

  // NB_SPIN is expected to come from GPUEngine.h (the original post uses 32);
  // it replaces the former STEP_SIZE / GRP_SIZE bound.
  // The loop counter is advanced ONLY by the for-header: an additional j++ in
  // the body would halve the number of groups processed and make the offsets
  // given to CHECK_PREFIX disagree with the points actually being checked.
  for (uint32_t j = 0; j < NB_SPIN; j++) {

    // Fill group with delta x
    uint32_t i;
    for (i = 0; i < HSIZE; i++) {
      ModSub256(dx[i], Gx[i], sx);
    }
    ModSub256(dx[i], Gx[i], sx);      // For the first point
    ModSub256(dx[i+1], _2Gnx, sx);    // For the next center point

    // Compute modular inverse
    _ModInvGrouped(dx);

    // We use the fact that P + i*G and P - i*G have the same delta x, so the same inverse.
    // We compute keys in the positive and negative direction from the center of the group.

    // Check starting point (center of the group)
    index = (j * GRP_SIZE) + GRP_SIZE_DIV2;
    CHECK_PREFIX(index);

    ModNeg256(pyn, py);

    for (i = 0; i < HSIZE; i++) {

      __syncthreads();
      // P = StartPoint + i*G
      Load256(px, sx);
      Load256(py, sy);
      ModSub256(dy, Gy[i], py);

      _ModMult(_s, dy, dx[i]);        // s = (p2.y-p1.y)*inverse(p2.x-p1.x)
      _ModSqr(_p2, _s);               // _p2 = pow2(s)

      ModSub256(px, _p2, px);
      ModSub256(px, Gx[i]);           // px = pow2(s) - p1.x - p2.x;

      ModSub256(py, Gx[i], px);
      _ModMult(py, _s);               // py = - s*(ret.x-p2.x)
      ModSub256(py, Gy[i]);           // py = - p2.y - s*(ret.x-p2.x);

      index = (j * GRP_SIZE) + (GRP_SIZE_DIV2 + (i + 1));
      CHECK_PREFIX(index);

      __syncthreads();
      // P = StartPoint - i*G, if (x,y) = i*G then (x,-y) = -i*G
      Load256(px, sx);
      ModSub256(dy, pyn, Gy[i]);

      _ModMult(_s, dy, dx[i]);        // s = (p2.y-p1.y)*inverse(p2.x-p1.x)
      _ModSqr(_p2, _s);               // _p2 = pow2(s)

      ModSub256(px, _p2, px);
      ModSub256(px, Gx[i]);           // px = pow2(s) - p1.x - p2.x;

      ModSub256(py, px, Gx[i]);
      _ModMult(py, _s);               // py = s*(ret.x-p2.x)
      ModSub256(py, Gy[i], py);       // py = - p2.y - s*(ret.x-p2.x);

      index = (j * GRP_SIZE) + (GRP_SIZE_DIV2 - (i + 1));
      CHECK_PREFIX(index);

    }

    __syncthreads();
    // First point (startP - (GRP_SIZE/2)*G); here i == HSIZE
    Load256(px, sx);
    Load256(py, sy);
    ModNeg256(dy, Gy[i]);
    ModSub256(dy, py);

    _ModMult(_s, dy, dx[i]);          // s = (p2.y-p1.y)*inverse(p2.x-p1.x)
    _ModSqr(_p2, _s);                 // _p2 = pow2(s)

    ModSub256(px, _p2, px);
    ModSub256(px, Gx[i]);             // px = pow2(s) - p1.x - p2.x;

    ModSub256(py, px, Gx[i]);
    _ModMult(py, _s);                 // py = s*(ret.x-p2.x)
    ModSub256(py, Gy[i], py);         // py = - p2.y - s*(ret.x-p2.x);

    index = (j * GRP_SIZE);
    CHECK_PREFIX(index);

    i++;                              // dx[i] now refers to the slot filled from _2Gnx

    __syncthreads();
    // Next start point (startP + GRP_SIZE*G)
    Load256(px, sx);
    Load256(py, sy);
    ModSub256(dy, _2Gny, py);

    _ModMult(_s, dy, dx[i]);          // s = (p2.y-p1.y)*inverse(p2.x-p1.x)
    _ModSqr(_p2, _s);                 // _p2 = pow2(s)

    ModSub256(px, _p2, px);
    ModSub256(px, _2Gnx);             // px = pow2(s) - p1.x - p2.x;

    ModSub256(py, _2Gnx, px);
    _ModMult(py, _s);                 // py = - s*(ret.x-p2.x)
    ModSub256(py, _2Gny);             // py = - p2.y - s*(ret.x-p2.x);

    // The freshly computed point becomes the center of the next group
    Load256(sx, px);
    Load256(sy, py);

  }

  // Update starting point
  __syncthreads();
  Store256A(startx, px);
  Store256A(starty, py);

}
// END
What do you think?