Lucky hit after less than 3 hours. It's even a 53-bit PoW lol.
[9845 s] nLT012 = 4818944; 18211.71 M/s
[+] Key ID: 13949
S = 00000000000005a2844f65e43e868b000e27f653d19722b3da630bb59a57e34d
Z = 80000000000002d14227b2b6a674ef3fdaca7c092b7a374f200d1e4abbe775e4
Found! nLockTime = 498af3fc
Now... the next puzzle costs around $150 to solve, so I'll stop for now.
Here's the CUDA kernel.
// Clock rate, in Hz, used to convert clock64() cycle deltas into seconds for
// the progress report. The real clock varies at runtime, so derived timings
// are only approximate.
#define GPU_CLOCK_FREQUENCY 2.712e9 // RTX 4090 boost mode
// Brute-forces the 32-bit nLockTime for one script hash per thread.
//
// Each thread loads its own script hash, then walks all 2^24 values of the
// three nLockTime bytes that live in the 3rd SHA-256 chunk (outer loop) and,
// for each, all 256 values of the 4th byte (inner loop). A hit is printed and
// the kernel is aborted via PTX_TRAP.
//
// Thread 0 of block 0 prints a progress/throughput line every 2048 outer
// iterations, using clock64() and GPU_CLOCK_FREQUENCY for timing.
//
// Parameters:
//   pgSH  script hashes in global memory, reinterpreted as U128 words; the
//         thread's first word is at index blockIdx.x * BLOCK_SIZE * 2 + threadIdx.x.
//
// NOTE(review): presumably launched with blockDim.x == BLOCK_SIZE, with LOAD_SH
// reading a second U128 at stride BLOCK_SIZE (hence the "* 2") - confirm against
// the LOAD_SH definition and the launch site.
__maxnreg__(128)
__global__
void kernelMineS(
    limb_t * __restrict__ pgSH  // one script hash / thread
) {
    const U32 basePos = blockIdx.x * BLOCK_SIZE * 2 + threadIdx.x;
    const U128 * pSH = (U128 *) pgSH + basePos;

    U32 scriptHash[8];  // output script hash (32 bytes)
    U32 md[8];          // SHA256 [0:128] state (32 bytes)

    // load script hash from gmem to registers; coalesced
    LOAD_SH(scriptHash, pSH, 0, BLOCK_SIZE);

    // hash the first 128 bytes (same result for all threads and all scriptHashes)
    // (the message data is fixed all the way up to byte 157 - start of scriptHash)
    prepare_hash_12(md);

    const U64 start_time = clock64();

    // iterate for all possible 3 last bytes of the 3rd message chunk
    // these bytes are part of the nLockTime
    for (U32 nLT012 = 0; nLT012 < (1u << 24); nLT012++) {
        U32 md2[8];  // SHA256 [0:192] state

        // periodic progress report from a single thread of the whole grid
        if (nLT012 % 2048 == 0
            && blockIdx.x == 0 && threadIdx.x == 0
        ) {
            const U64 now = clock64();
            // Keep the 64-bit cycle delta in double: narrowing it to float
            // first would throw away low bits before the division.
            const double elapsed_sec = double(now - start_time) / GPU_CLOCK_FREQUENCY;
            // 256 inner iterations per outer step, for every thread in the grid
            const double hashes = double(nLT012 * 256ULL * gridDim.x * BLOCK_SIZE);
            // guard the very first report, where elapsed time may still be zero
            const double keys_per_sec = elapsed_sec > 0.0 ? hashes / elapsed_sec : 0.0;
            printf("[%.2f] nLT012 = %u, %.2f M/s\n",
                   elapsed_sec, nLT012, keys_per_sec / 1e6);
        }

        // hash the 3rd chunk, by reusing the SHA state of first 2 chunks
        // this chunk contains the scriptHash of our thread, so we basically just
        // change the 3 bytes of the nLockTime and hash the new 64 bytes of data
        prepare_hash_3(md2, md, scriptHash, nLT012);

        // iterate through the 4th nLockTime byte
        for (U16 nLT3 = 0; nLT3 < 256; nLT3++) {
            // this does a single SHA256 for the last 5 bytes (4th data chunk)
            // by reusing the SHA state of the first 192 bytes hashed so far
            // Then, it hashes the final SHA to obtain the Z value
            // It then checks whether S = (Z + R) + (Z + R) mod N is a solution
            // or whether N - S is a solution
            // Another speed up is that Z must start with either 0x00000000,
            // 0xffffffff, or 0x7fffffff in order for S to ever be a solution
            // NOTE(review): the solution posted alongside this kernel has a Z
            // starting with 0x80000000, so the prefix list above looks
            // incomplete - confirm against check_s.
            if (check_s(md2, nLT3)) {
                const U32 nLockTime = nLT012 << 8 | nLT3;
                printf("Found! nLockTime = %08x\n", nLockTime);
                PTX_TRAP;
            }
        }
    }
}