I can't understand why this code doesn't want to work correctly. The DPs accumulate but there is no collision...
__device__ void ComputeKangaroos(uint64_t* kangaroos, uint32_t maxFound, uint32_t* out, uint64_t dpMask) {
//GPU_GRP_SIZE =4
//NUM_OF_JUMPS = 64
uint64_t px[GPU_GRP_SIZE][4];
uint64_t py[GPU_GRP_SIZE][4];
uint64_t dist[GPU_GRP_SIZE][4];
uint64_t lastJump[GPU_GRP_SIZE];
uint64_t newpx[NUM_OF_JUMPS][4];
uint64_t newpy[NUM_OF_JUMPS][4];
uint64_t newpz[NUM_OF_JUMPS][4];
uint64_t jmp;
int stride = threadIdx.x & 31;
Load256(SjPx[stride], jPx[stride]);
Load256(SjPy[stride], jPy[stride]);
Load256(SjD[stride], jD[stride]);
__syncthreads();
LoadKangaroos(kangaroos, px, py, dist, lastJump);
for (int run = 0; run < NB_RUN; run++) {
for (int grp = 0; grp < GPU_GRP_SIZE; grp++){
newpz[0][3] = 0;
newpz[0][2] = 0;
newpz[0][1] = 0;
newpz[0][0] = 1;
jmp = (unsigned int)px[grp][0] & (NB_JUMP - 1);
ADDz1(newpx[0], newpy[0], newpz[0], px[grp], py[grp], newpz[0], SjPx[jmp], SjPy[jmp]);
for (int g = 1; g < NUM_OF_JUMPS; g++) {
jmp = (unsigned int)newpx[g - 1][0] & (NB_JUMP - 1);
ADDz1(newpx[g], newpy[g], newpz[g], newpx[g - 1], newpy[g - 1], newpz[g - 1], SjPx[jmp], SjPy[jmp]);
}
__syncthreads();
_InvGrouped(newpz);
jmp = (unsigned int)px[grp][0] & (NB_JUMP - 1);
_ModMult(px[grp], newpx[0], newpz[0]);
_ModMult(py[grp], newpy[0], newpz[0]);
Add128(dist[grp], SjD[jmp]);
if ((px[grp][3] & dpMask) == 0) {
// Distinguished point
uint32_t pos = atomicAdd(out, 1);
if (pos < maxFound) {
uint64_t kIdx = (uint64_t)IDX + (uint64_t)grp * (uint64_t)blockDim.x + (uint64_t)blockIdx.x * ((uint64_t)blockDim.x * GPU_GRP_SIZE);
OutputDP(px[grp], dist[grp], &kIdx);
}
}
for (int g = 1; g < NUM_OF_JUMPS; g++) {
jmp = (unsigned int)newpx[g - 1][0] & (NB_JUMP - 1);
_ModMult(px[grp], newpx[g], newpz[g]);
_ModMult(py[grp], newpy[g], newpz[g]);
Add128(dist[grp], SjD[jmp]);
if ((px[grp][3] & dpMask) == 0) {
// Distinguished point
uint32_t pos = atomicAdd(out, 1);
if (pos < maxFound) {
uint64_t kIdx = (uint64_t)IDX + (uint64_t)grp * (uint64_t)blockDim.x + (uint64_t)blockIdx.x * ((uint64_t)blockDim.x * GPU_GRP_SIZE);
OutputDP(px[grp], dist[grp], &kIdx);
}
}
}
}
}
__syncthreads();
StoreKangaroos(kangaroos, px, py, dist, lastJump);
}