That's it, basically.
In the end, we came up with a sort of Frankenstein code together, and this is it:
#include <stdexcept>

#include <secp256k1.h>
#include "sha256_avx2.h"
#include "ripemd160_avx2.h"
// Number of messages handed to one call of the 8-way SHA-256 / RIPEMD-160 routines.
static constexpr int HASH_BATCH_SIZE{8};
// Number of entries in each per-thread table of precomputed curve points.
static constexpr int POINTS_BATCH_SIZE{256};
// Builds one fully padded 64-byte SHA-256 message block from a short message.
//
// Layout written to outBlock:
//   [0, dataLen)   message bytes copied from dataSrc
//   [dataLen]      the 0x80 end-of-message marker
//   [56, 64)       message length in bits, big-endian (only the low 32 bits can
//                  be non-zero here; the upper bytes stay zero from the fill)
//   everything else is zero.
//
// dataSrc  - message bytes; may be null only when dataLen is 0.
// dataLen  - message length in bytes; must be at most 55 so that the message,
//            the marker byte and the 8-byte length field fit in a single block.
// outBlock - destination buffer of at least 64 bytes.
//
// Throws std::length_error if dataLen exceeds 55. Previously such lengths
// silently corrupted the length field (56..63) or wrote past the block (>= 64).
inline void prepareShaBlock(const uint8_t* dataSrc, __uint128_t dataLen, uint8_t* outBlock) {
    constexpr std::size_t kBlockSize = 64;
    constexpr std::size_t kLengthFieldSize = 8;
    constexpr std::size_t kMaxMessageSize = kBlockSize - 1 - kLengthFieldSize;  // 55

    if (dataLen > kMaxMessageSize) {
        throw std::length_error("prepareShaBlock: message does not fit in one SHA-256 block");
    }
    // Safe narrowing: the value was just checked to be <= 55.
    const std::size_t len = static_cast<std::size_t>(dataLen);

    std::fill_n(outBlock, kBlockSize, uint8_t{0});
    if (len != 0) {
        std::memcpy(outBlock, dataSrc, len);
    }
    outBlock[len] = 0x80;

    // len <= 55, so the bit count is at most 440 and fits easily in 32 bits.
    const uint32_t bitLen = static_cast<uint32_t>(len * 8);
    outBlock[60] = static_cast<uint8_t>((bitLen >> 24) & 0xFF);
    outBlock[61] = static_cast<uint8_t>((bitLen >> 16) & 0xFF);
    outBlock[62] = static_cast<uint8_t>((bitLen >> 8) & 0xFF);
    outBlock[63] = static_cast<uint8_t>(bitLen & 0xFF);
}
// Builds one fully padded 64-byte RIPEMD-160 message block from a 32-byte input
// (here: a SHA-256 digest).
//
// Layout written to outBlock:
//   [0, 32)    the 32 input bytes copied from dataSrc
//   [32]       the 0x80 end-of-message marker
//   [56, 64)   message length in bits (256) as a 64-bit little-endian integer
//   everything else is zero.
//
// RIPEMD-160 uses MD4-style padding, i.e. a little-endian length field. The
// earlier version stored the length big-endian in bytes 60..63, which is the
// SHA-256 convention and does not form a valid RIPEMD-160 block.
// NOTE(review): if the batched RIPEMD kernel re-applies its own padding this
// change has no visible effect; confirm against the kernel implementation.
//
// dataSrc  - exactly 32 readable bytes.
// outBlock - destination buffer of at least 64 bytes.
inline void prepareRipemdBlock(const uint8_t* dataSrc, uint8_t* outBlock) {
    constexpr std::size_t kBlockSize = 64;
    constexpr std::size_t kInputSize = 32;
    constexpr std::size_t kLengthOffset = 56;
    constexpr std::size_t kLengthFieldSize = 8;

    std::fill_n(outBlock, kBlockSize, uint8_t{0});
    std::memcpy(outBlock, dataSrc, kInputSize);
    outBlock[kInputSize] = 0x80;

    const uint64_t bitLen = static_cast<uint64_t>(kInputSize) * 8;  // 256
    for (std::size_t i = 0; i < kLengthFieldSize; ++i) {
        // Least significant byte first.
        outBlock[kLengthOffset + i] = static_cast<uint8_t>((bitLen >> (8 * i)) & 0xFF);
    }
}
static void computeHash160BatchBinSingle(int numKeys,
uint8_t pubKeys[][33],
uint8_t hashResults[][20])
{
alignas(32) std::array<std::array<uint8_t, 64>, HASH_BATCH_SIZE> shaInputs;
alignas(32) std::array<std::array<uint8_t, 32>, HASH_BATCH_SIZE> shaOutputs;
alignas(32) std::array<std::array<uint8_t, 64>, HASH_BATCH_SIZE> ripemdInputs;
alignas(32) std::array<std::array<uint8_t, 20>, HASH_BATCH_SIZE> ripemdOutputs;
const __uint128_t totalBatches = (numKeys + (HASH_BATCH_SIZE - 1)) / HASH_BATCH_SIZE;
for (__uint128_t batch = 0; batch < totalBatches; batch++) {
const __uint128_t batchCount = std::min<__uint128_t>(HASH_BATCH_SIZE, numKeys - batch * HASH_BATCH_SIZE);
for (__uint128_t i = 0; i < batchCount; i++) {
prepareShaBlock(pubKeys[batch * HASH_BATCH_SIZE + i], 33, shaInputs[i].data());
}
if (batchCount < HASH_BATCH_SIZE) {
static std::array<uint8_t, 64> shaPadding = {};
prepareShaBlock(pubKeys[0], 33, shaPadding.data());
for (__uint128_t i = batchCount; i < HASH_BATCH_SIZE; i++) {
std::memcpy(shaInputs[i].data(), shaPadding.data(), 64);
}
}
const uint8_t* inPtr[HASH_BATCH_SIZE];
uint8_t* outPtr[HASH_BATCH_SIZE];
for (int i = 0; i < HASH_BATCH_SIZE; i++) {
inPtr[i] = shaInputs[i].data();
outPtr[i] = shaOutputs[i].data();
}
sha256avx2_8B(inPtr[0], inPtr[1], inPtr[2], inPtr[3],
inPtr[4], inPtr[5], inPtr[6], inPtr[7],
outPtr[0], outPtr[1], outPtr[2], outPtr[3],
outPtr[4], outPtr[5], outPtr[6], outPtr[7]);
for (__uint128_t i = 0; i < batchCount; i++) {
prepareRipemdBlock(shaOutputs[i].data(), ripemdInputs[i].data());
}
if (batchCount < HASH_BATCH_SIZE) {
static std::array<uint8_t, 64> ripemdPadding = {};
prepareRipemdBlock(shaOutputs[0].data(), ripemdPadding.data());
for (__uint128_t i = batchCount; i < HASH_BATCH_SIZE; i++) {
std::memcpy(ripemdInputs[i].data(), ripemdPadding.data(), 64);
}
}
for (int i = 0; i < HASH_BATCH_SIZE; i++) {
inPtr[i] = ripemdInputs[i].data();
outPtr[i] = ripemdOutputs[i].data();
}
ripemd160avx2::ripemd160avx2_32(
(unsigned char*)inPtr[0], (unsigned char*)inPtr[1],
(unsigned char*)inPtr[2], (unsigned char*)inPtr[3],
(unsigned char*)inPtr[4], (unsigned char*)inPtr[5],
(unsigned char*)inPtr[6], (unsigned char*)inPtr[7],
outPtr[0], outPtr[1], outPtr[2], outPtr[3],
outPtr[4], outPtr[5],
outPtr[6], outPtr[7]
);
for (__uint128_t i = 0; i < batchCount; i++) {
std::memcpy(hashResults[batch * HASH_BATCH_SIZE + i], ripemdOutputs[i].data(), 20);
}
}
}
void worker(int threadId, __uint128_t threadRangeStart, __uint128_t threadRangeEnd) {
alignas(32) uint8_t localPubKeys[HASH_BATCH_SIZE][33];
alignas(32) uint8_t localHashResults[HASH_BATCH_SIZE][20];
__m256i target16 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(TARGET_HASH160_RAW.data()));
secp256k1_context* ctx = secp256k1_context_create(SECP256K1_CONTEXT_SIGN);
// Precompute points for batch processing
alignas(32) secp256k1_fe plusPointsX[POINTS_BATCH_SIZE];
alignas(32) secp256k1_fe plusPointsY[POINTS_BATCH_SIZE];
alignas(32) secp256k1_fe minusPointsY[POINTS_BATCH_SIZE];
for (int i = 0; i < POINTS_BATCH_SIZE; i++) {
secp256k1_scalar scalar;
secp256k1_scalar_set_int(&scalar, i);
secp256k1_gej pointJ;
secp256k1_ecmult_gen(ctx->ecmult_gen_ctx, &pointJ, &scalar);
secp256k1_ge point;
secp256k1_ge_set_gej(&point, &pointJ);
secp256k1_fe_normalize_var(&point.x);
secp256k1_fe_normalize_var(&point.y);
plusPointsX[i] = point.x;
plusPointsY[i] = point.y;
secp256k1_fe_negate(&minusPointsY[i], &point.y, 1);
}
__uint128_t currentKey = threadRangeStart;
while (currentKey <= threadRangeEnd) {
int localBatchCount = 0;
// Generate public keys in batches
for (; localBatchCount < HASH_BATCH_SIZE && currentKey <= threadRangeEnd; ++localBatchCount, ++currentKey) {
uint8_t priv[32];
uint128ToPrivKey(currentKey, priv);
secp256k1_pubkey pubkey;
if (!secp256k1_ec_pubkey_create(ctx, &pubkey, priv)) {
std::cerr << "Failed to derive public key.\n";
continue;
}
size_t len = 33;
secp256k1_ec_pubkey_serialize(ctx, localPubKeys[localBatchCount], &len, &pubkey, SECP256K1_EC_COMPRESSED);
}
// Compute HASH160 for the batch
computeHash160BatchBinSingle(localBatchCount, localPubKeys, localHashResults);
// Compare HASH160 results with the target
for (int j = 0; j < localBatchCount; ++j) {
__m256i cand = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(localHashResults[j]));
__m256i cmp = _mm256_cmpeq_epi8(cand, target16);
int mask = _mm256_movemask_epi8(cmp);
if ((mask & 0x0F) == 0x0F) {
bool fullMatch = true;
for (int k = 0; k < 20; k++) {
if (localHashResults[j][k] != TARGET_HASH160_RAW[k]) {
fullMatch = false;
break;
}
}
if (fullMatch) {
auto tEndTime = std::chrono::high_resolution_clock::now();
double globalElapsedTime = std::chrono::duration<double>(tEndTime - tStart).count();
std::lock_guard<std::mutex> lock(progress_mutex);
globalComparedCount += actual_work_done;
mkeysPerSec = (double)globalComparedCount / globalElapsedTime / 1e6;
__uint128_t foundKey = currentKey - (localBatchCount - j);
std::string hexKey = uint128ToHex(foundKey);
std::lock_guard<std::mutex> resultLock(result_mutex);
results.push(std::make_tuple(hexKey, total_checked_avx.load(), flip_count));
stop_event.store(true);
return;
}
}
}
}
secp256k1_context_destroy(ctx);
}