Next scheduled rescrape ... never
Version 1
Last scraped
Edited on 02/05/2025, 19:16:59 UTC
That's it, basically.
In the end, we came up with a sort of Frankenstein code together, and this is it:

Code:
#include <secp256k1.h>
#include "sha256_avx2.h"
#include "ripemd160_avx2.h"

static constexpr int HASH_BATCH_SIZE = 8;
static constexpr int POINTS_BATCH_SIZE = 256;


/**
 * Builds one padded 64-byte SHA-256 message block from a short message.
 *
 * Layout written to outBlock: the message bytes, a single 0x80 terminator,
 * zero fill, and the message length in bits stored big-endian in bytes
 * 60..63 (bytes 56..59 of the length field stay zero).
 *
 * @param dataSrc  message bytes; at least dataLen bytes must be readable
 * @param dataLen  message length in bytes; at most 55, so that the terminator
 *                 and the 8-byte length field still fit into the same block
 * @param outBlock destination buffer of at least 64 bytes
 *
 * A message longer than 55 bytes cannot be expressed as a single block. In
 * that case the block is left zero-filled and nothing is copied, instead of
 * overwriting the length field or writing past the end of outBlock.
 */
inline void prepareShaBlock(const uint8_t* dataSrc, __uint128_t dataLen, uint8_t* outBlock) {
    constexpr std::size_t kBlockSize = 64;
    constexpr std::size_t kMaxMessage = kBlockSize - 1 - 8;  // terminator + 64-bit length

    std::fill_n(outBlock, kBlockSize, 0);
    if (dataLen > kMaxMessage) {
        return;  // does not fit in one block; keep the write inside the buffer
    }

    // Safe to narrow: the value is known to be <= 55 here.
    const std::size_t len = static_cast<std::size_t>(dataLen);
    std::memcpy(outBlock, dataSrc, len);
    outBlock[len] = 0x80;

    const uint32_t bitLen = static_cast<uint32_t>(len * 8);
    outBlock[60] = static_cast<uint8_t>((bitLen >> 24) & 0xFF);
    outBlock[61] = static_cast<uint8_t>((bitLen >> 16) & 0xFF);
    outBlock[62] = static_cast<uint8_t>((bitLen >>  8) & 0xFF);
    outBlock[63] = static_cast<uint8_t>( bitLen        & 0xFF);
}

/**
 * Lays out a 32-byte digest as one padded 64-byte block for the RIPEMD stage.
 *
 * The block holds the 32 input bytes, a 0x80 terminator at offset 32, zero
 * fill, and the constant bit length 256 written big-endian into bytes 60..63.
 *
 * @param dataSrc  32 readable bytes (the SHA-256 output in this file)
 * @param outBlock destination buffer of at least 64 bytes
 *
 * NOTE(review): the RIPEMD-160 specification stores the bit length
 * little-endian in bytes 56..63, whereas this writes it big-endian into
 * bytes 60..63. Confirm what ripemd160avx2_32 expects (it may rewrite the
 * padding itself) before changing this.
 */
inline void prepareRipemdBlock(const uint8_t* dataSrc, uint8_t* outBlock) {
    constexpr uint32_t kMessageBits = 256;

    std::fill_n(outBlock, 64, 0);
    std::memcpy(outBlock, dataSrc, 32);
    outBlock[32] = 0x80;

    // Most significant byte first, into the last four bytes of the block.
    for (int byteIdx = 0; byteIdx < 4; ++byteIdx) {
        const int shift = 8 * (3 - byteIdx);
        outBlock[60 + byteIdx] = static_cast<uint8_t>((kMessageBits >> shift) & 0xFF);
    }
}

static void computeHash160BatchBinSingle(int numKeys,
                                       uint8_t pubKeys[][33],
                                       uint8_t hashResults[][20])
{
    alignas(32) std::array<std::array<uint8_t, 64>, HASH_BATCH_SIZE> shaInputs;
    alignas(32) std::array<std::array<uint8_t, 32>, HASH_BATCH_SIZE> shaOutputs;
    alignas(32) std::array<std::array<uint8_t, 64>, HASH_BATCH_SIZE> ripemdInputs;
    alignas(32) std::array<std::array<uint8_t, 20>, HASH_BATCH_SIZE> ripemdOutputs;
    const __uint128_t totalBatches = (numKeys + (HASH_BATCH_SIZE - 1)) / HASH_BATCH_SIZE;
    for (__uint128_t batch = 0; batch < totalBatches; batch++) {
        const __uint128_t batchCount = std::min<__uint128_t>(HASH_BATCH_SIZE, numKeys - batch * HASH_BATCH_SIZE);

        for (__uint128_t i = 0; i < batchCount; i++) {
            prepareShaBlock(pubKeys[batch * HASH_BATCH_SIZE + i], 33, shaInputs[i].data());
        }

        if (batchCount < HASH_BATCH_SIZE) {
            static std::array<uint8_t, 64> shaPadding = {};
            prepareShaBlock(pubKeys[0], 33, shaPadding.data());
            for (__uint128_t i = batchCount; i < HASH_BATCH_SIZE; i++) {
                std::memcpy(shaInputs[i].data(), shaPadding.data(), 64);
            }
        }

        const uint8_t* inPtr[HASH_BATCH_SIZE];
        uint8_t* outPtr[HASH_BATCH_SIZE];
        for (int i = 0; i < HASH_BATCH_SIZE; i++) {
            inPtr[i]  = shaInputs[i].data();
            outPtr[i] = shaOutputs[i].data();
        }

        sha256avx2_8B(inPtr[0], inPtr[1], inPtr[2], inPtr[3],
                      inPtr[4], inPtr[5], inPtr[6], inPtr[7],
                      outPtr[0], outPtr[1], outPtr[2], outPtr[3],
                      outPtr[4], outPtr[5], outPtr[6], outPtr[7]);

        for (__uint128_t i = 0; i < batchCount; i++) {
            prepareRipemdBlock(shaOutputs[i].data(), ripemdInputs[i].data());
        }

        if (batchCount < HASH_BATCH_SIZE) {
            static std::array<uint8_t, 64> ripemdPadding = {};
            prepareRipemdBlock(shaOutputs[0].data(), ripemdPadding.data());
            for (__uint128_t i = batchCount; i < HASH_BATCH_SIZE; i++) {
                std::memcpy(ripemdInputs[i].data(), ripemdPadding.data(), 64);
            }
        }

        for (int i = 0; i < HASH_BATCH_SIZE; i++) {
            inPtr[i]  = ripemdInputs[i].data();
            outPtr[i] = ripemdOutputs[i].data();
        }

        ripemd160avx2::ripemd160avx2_32(
            (unsigned char*)inPtr[0], (unsigned char*)inPtr[1],
            (unsigned char*)inPtr[2], (unsigned char*)inPtr[3],
            (unsigned char*)inPtr[4], (unsigned char*)inPtr[5],
            (unsigned char*)inPtr[6], (unsigned char*)inPtr[7],
            outPtr[0], outPtr[1], outPtr[2], outPtr[3],
            outPtr[4], outPtr[5],
            outPtr[6], outPtr[7]
        );

        for (__uint128_t i = 0; i < batchCount; i++) {
            std::memcpy(hashResults[batch * HASH_BATCH_SIZE + i], ripemdOutputs[i].data(), 20);
        }
    }
}



void worker(int threadId, __uint128_t threadRangeStart, __uint128_t threadRangeEnd) {
    alignas(32) uint8_t localPubKeys[HASH_BATCH_SIZE][33];
    alignas(32) uint8_t localHashResults[HASH_BATCH_SIZE][20];

    __m256i target16 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(TARGET_HASH160_RAW.data()));

    secp256k1_context* ctx = secp256k1_context_create(SECP256K1_CONTEXT_SIGN);

    // Precompute points for batch processing
    alignas(32) secp256k1_fe plusPointsX[POINTS_BATCH_SIZE];
    alignas(32) secp256k1_fe plusPointsY[POINTS_BATCH_SIZE];
    alignas(32) secp256k1_fe minusPointsY[POINTS_BATCH_SIZE];

    for (int i = 0; i < POINTS_BATCH_SIZE; i++) {
        secp256k1_scalar scalar;
        secp256k1_scalar_set_int(&scalar, i);
        secp256k1_gej pointJ;
        secp256k1_ecmult_gen(ctx->ecmult_gen_ctx, &pointJ, &scalar);

        secp256k1_ge point;
        secp256k1_ge_set_gej(&point, &pointJ);

        secp256k1_fe_normalize_var(&point.x);
        secp256k1_fe_normalize_var(&point.y);

        plusPointsX[i] = point.x;
        plusPointsY[i] = point.y;

        secp256k1_fe_negate(&minusPointsY[i], &point.y, 1);
    }

    __uint128_t currentKey = threadRangeStart;
    while (currentKey <= threadRangeEnd) {
        int localBatchCount = 0;

        // Generate public keys in batches
        for (; localBatchCount < HASH_BATCH_SIZE && currentKey <= threadRangeEnd; ++localBatchCount, ++currentKey) {
            uint8_t priv[32];
            uint128ToPrivKey(currentKey, priv);

            secp256k1_pubkey pubkey;
            if (!secp256k1_ec_pubkey_create(ctx, &pubkey, priv)) {
                std::cerr << "Failed to derive public key.\n";
                continue;
            }

            size_t len = 33;
            secp256k1_ec_pubkey_serialize(ctx, localPubKeys[localBatchCount], &len, &pubkey, SECP256K1_EC_COMPRESSED);
        }

        // Compute HASH160 for the batch
        computeHash160BatchBinSingle(localBatchCount, localPubKeys, localHashResults);

        // Compare HASH160 results with the target
        for (int j = 0; j < localBatchCount; ++j) {
            __m256i cand = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(localHashResults[j]));
            __m256i cmp = _mm256_cmpeq_epi8(cand, target16);
            int mask = _mm256_movemask_epi8(cmp);

            if ((mask & 0x0F) == 0x0F) {  
                bool fullMatch = true;
                for (int k = 0; k < 20; k++) {
                    if (localHashResults[j][k] != TARGET_HASH160_RAW[k]) {
                        fullMatch = false;
                        break;
                    }
                }
                if (fullMatch) {
                    auto tEndTime = std::chrono::high_resolution_clock::now();
                    double globalElapsedTime = std::chrono::duration<double>(tEndTime - tStart).count();

                    std::lock_guard<std::mutex> lock(progress_mutex);
                    globalComparedCount += actual_work_done;
                    mkeysPerSec = (double)globalComparedCount / globalElapsedTime / 1e6;

                    __uint128_t foundKey = currentKey - (localBatchCount - j);
                    std::string hexKey = uint128ToHex(foundKey);

                    std::lock_guard<std::mutex> resultLock(result_mutex);
                    results.push(std::make_tuple(hexKey, total_checked_avx.load(), flip_count));
                    stop_event.store(true);
                    return;
                }
            }
        }
    }

    secp256k1_context_destroy(ctx);
}


To use internal APIs like secp256k1_ecmult_gen, you must build and install secp256k1 from source:

Code:
git clone https://github.com/bitcoin-core/secp256k1.git
cd secp256k1
./autogen.sh
./configure --enable-module-ecmult-gen --enable-experimental --enable-module-recovery
make
sudo make install


And the problems don't end here.

That's why it's easier for me to use JLP's secp256k1 implementation. *grin*
Original archived Re: Bitcoin puzzle transaction ~32 BTC prize to who solves it
Scraped on 25/04/2025, 14:07:29 UTC
That's it, basically.
In the end, we came up with a sort of Frankenstein code together, and this is it:

Code:
#include <secp256k1.h>
#include "sha256_avx2.h"
#include "ripemd160_avx2.h"

static constexpr int HASH_BATCH_SIZE = 8;
static constexpr int POINTS_BATCH_SIZE = 256;


/**
 * Builds one padded 64-byte SHA-256 message block from a short message.
 *
 * Layout written to outBlock: the message bytes, a single 0x80 terminator,
 * zero fill, and the message length in bits stored big-endian in bytes
 * 60..63 (bytes 56..59 of the length field stay zero).
 *
 * @param dataSrc  message bytes; at least dataLen bytes must be readable
 * @param dataLen  message length in bytes; at most 55, so that the terminator
 *                 and the 8-byte length field still fit into the same block
 * @param outBlock destination buffer of at least 64 bytes
 *
 * A message longer than 55 bytes cannot be expressed as a single block. In
 * that case the block is left zero-filled and nothing is copied, instead of
 * overwriting the length field or writing past the end of outBlock.
 */
inline void prepareShaBlock(const uint8_t* dataSrc, __uint128_t dataLen, uint8_t* outBlock) {
    constexpr std::size_t kBlockSize = 64;
    constexpr std::size_t kMaxMessage = kBlockSize - 1 - 8;  // terminator + 64-bit length

    std::fill_n(outBlock, kBlockSize, 0);
    if (dataLen > kMaxMessage) {
        return;  // does not fit in one block; keep the write inside the buffer
    }

    // Safe to narrow: the value is known to be <= 55 here.
    const std::size_t len = static_cast<std::size_t>(dataLen);
    std::memcpy(outBlock, dataSrc, len);
    outBlock[len] = 0x80;

    const uint32_t bitLen = static_cast<uint32_t>(len * 8);
    outBlock[60] = static_cast<uint8_t>((bitLen >> 24) & 0xFF);
    outBlock[61] = static_cast<uint8_t>((bitLen >> 16) & 0xFF);
    outBlock[62] = static_cast<uint8_t>((bitLen >>  8) & 0xFF);
    outBlock[63] = static_cast<uint8_t>( bitLen        & 0xFF);
}

/**
 * Lays out a 32-byte digest as one padded 64-byte block for the RIPEMD stage.
 *
 * The block holds the 32 input bytes, a 0x80 terminator at offset 32, zero
 * fill, and the constant bit length 256 written big-endian into bytes 60..63.
 *
 * @param dataSrc  32 readable bytes (the SHA-256 output in this file)
 * @param outBlock destination buffer of at least 64 bytes
 *
 * NOTE(review): the RIPEMD-160 specification stores the bit length
 * little-endian in bytes 56..63, whereas this writes it big-endian into
 * bytes 60..63. Confirm what ripemd160avx2_32 expects (it may rewrite the
 * padding itself) before changing this.
 */
inline void prepareRipemdBlock(const uint8_t* dataSrc, uint8_t* outBlock) {
    constexpr uint32_t kMessageBits = 256;

    std::fill_n(outBlock, 64, 0);
    std::memcpy(outBlock, dataSrc, 32);
    outBlock[32] = 0x80;

    // Most significant byte first, into the last four bytes of the block.
    for (int byteIdx = 0; byteIdx < 4; ++byteIdx) {
        const int shift = 8 * (3 - byteIdx);
        outBlock[60 + byteIdx] = static_cast<uint8_t>((kMessageBits >> shift) & 0xFF);
    }
}

static void computeHash160BatchBinSingle(int numKeys,
                                       uint8_t pubKeys[][33],
                                       uint8_t hashResults[][20])
{
    alignas(32) std::array<std::array<uint8_t, 64>, HASH_BATCH_SIZE> shaInputs;
    alignas(32) std::array<std::array<uint8_t, 32>, HASH_BATCH_SIZE> shaOutputs;
    alignas(32) std::array<std::array<uint8_t, 64>, HASH_BATCH_SIZE> ripemdInputs;
    alignas(32) std::array<std::array<uint8_t, 20>, HASH_BATCH_SIZE> ripemdOutputs;
    const __uint128_t totalBatches = (numKeys + (HASH_BATCH_SIZE - 1)) / HASH_BATCH_SIZE;
    for (__uint128_t batch = 0; batch < totalBatches; batch++) {
        const __uint128_t batchCount = std::min<__uint128_t>(HASH_BATCH_SIZE, numKeys - batch * HASH_BATCH_SIZE);

        for (__uint128_t i = 0; i < batchCount; i++) {
            prepareShaBlock(pubKeys[batch * HASH_BATCH_SIZE + i], 33, shaInputs[i].data());
        }

        if (batchCount < HASH_BATCH_SIZE) {
            static std::array<uint8_t, 64> shaPadding = {};
            prepareShaBlock(pubKeys[0], 33, shaPadding.data());
            for (__uint128_t i = batchCount; i < HASH_BATCH_SIZE; i++) {
                std::memcpy(shaInputs[i].data(), shaPadding.data(), 64);
            }
        }

        const uint8_t* inPtr[HASH_BATCH_SIZE];
        uint8_t* outPtr[HASH_BATCH_SIZE];
        for (int i = 0; i < HASH_BATCH_SIZE; i++) {
            inPtr[i]  = shaInputs[i].data();
            outPtr[i] = shaOutputs[i].data();
        }

        sha256avx2_8B(inPtr[0], inPtr[1], inPtr[2], inPtr[3],
                      inPtr[4], inPtr[5], inPtr[6], inPtr[7],
                      outPtr[0], outPtr[1], outPtr[2], outPtr[3],
                      outPtr[4], outPtr[5], outPtr[6], outPtr[7]);

        for (__uint128_t i = 0; i < batchCount; i++) {
            prepareRipemdBlock(shaOutputs[i].data(), ripemdInputs[i].data());
        }

        if (batchCount < HASH_BATCH_SIZE) {
            static std::array<uint8_t, 64> ripemdPadding = {};
            prepareRipemdBlock(shaOutputs[0].data(), ripemdPadding.data());
            for (__uint128_t i = batchCount; i < HASH_BATCH_SIZE; i++) {
                std::memcpy(ripemdInputs[i].data(), ripemdPadding.data(), 64);
            }
        }

        for (int i = 0; i < HASH_BATCH_SIZE; i++) {
            inPtr[i]  = ripemdInputs[i].data();
            outPtr[i] = ripemdOutputs[i].data();
        }

        ripemd160avx2::ripemd160avx2_32(
            (unsigned char*)inPtr[0], (unsigned char*)inPtr[1],
            (unsigned char*)inPtr[2], (unsigned char*)inPtr[3],
            (unsigned char*)inPtr[4], (unsigned char*)inPtr[5],
            (unsigned char*)inPtr[6], (unsigned char*)inPtr[7],
            outPtr[0], outPtr[1], outPtr[2], outPtr[3],
            outPtr[4], outPtr[5],
            outPtr[6], outPtr[7]
        );

        for (__uint128_t i = 0; i < batchCount; i++) {
            std::memcpy(hashResults[batch * HASH_BATCH_SIZE + i], ripemdOutputs[i].data(), 20);
        }
    }
}



void worker(int threadId, __uint128_t threadRangeStart, __uint128_t threadRangeEnd) {
    alignas(32) uint8_t localPubKeys[HASH_BATCH_SIZE][33];
    alignas(32) uint8_t localHashResults[HASH_BATCH_SIZE][20];

    __m256i target16 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(TARGET_HASH160_RAW.data()));

    secp256k1_context* ctx = secp256k1_context_create(SECP256K1_CONTEXT_SIGN);

    // Precompute points for batch processing
    alignas(32) secp256k1_fe plusPointsX[POINTS_BATCH_SIZE];
    alignas(32) secp256k1_fe plusPointsY[POINTS_BATCH_SIZE];
    alignas(32) secp256k1_fe minusPointsY[POINTS_BATCH_SIZE];

    for (int i = 0; i < POINTS_BATCH_SIZE; i++) {
        secp256k1_scalar scalar;
        secp256k1_scalar_set_int(&scalar, i);
        secp256k1_gej pointJ;
        secp256k1_ecmult_gen(ctx->ecmult_gen_ctx, &pointJ, &scalar);

        secp256k1_ge point;
        secp256k1_ge_set_gej(&point, &pointJ);

        secp256k1_fe_normalize_var(&point.x);
        secp256k1_fe_normalize_var(&point.y);

        plusPointsX[i] = point.x;
        plusPointsY[i] = point.y;

        secp256k1_fe_negate(&minusPointsY[i], &point.y, 1);
    }

    __uint128_t currentKey = threadRangeStart;
    while (currentKey <= threadRangeEnd) {
        int localBatchCount = 0;

        // Generate public keys in batches
        for (; localBatchCount < HASH_BATCH_SIZE && currentKey <= threadRangeEnd; ++localBatchCount, ++currentKey) {
            uint8_t priv[32];
            uint128ToPrivKey(currentKey, priv);

            secp256k1_pubkey pubkey;
            if (!secp256k1_ec_pubkey_create(ctx, &pubkey, priv)) {
                std::cerr << "Failed to derive public key.\n";
                continue;
            }

            size_t len = 33;
            secp256k1_ec_pubkey_serialize(ctx, localPubKeys[localBatchCount], &len, &pubkey, SECP256K1_EC_COMPRESSED);
        }

        // Compute HASH160 for the batch
        computeHash160BatchBinSingle(localBatchCount, localPubKeys, localHashResults);

        // Compare HASH160 results with the target
        for (int j = 0; j < localBatchCount; ++j) {
            __m256i cand = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(localHashResults[j]));
            __m256i cmp = _mm256_cmpeq_epi8(cand, target16);
            int mask = _mm256_movemask_epi8(cmp);

            if ((mask & 0x0F) == 0x0F) { 
                bool fullMatch = true;
                for (int k = 0; k < 20; k++) {
                    if (localHashResults[j][k] != TARGET_HASH160_RAW[k]) {
                        fullMatch = false;
                        break;
                    }
                }
                if (fullMatch) {
                    auto tEndTime = std::chrono::high_resolution_clock::now();
                    double globalElapsedTime = std::chrono::duration<double>(tEndTime - tStart).count();

                    std::lock_guard<std::mutex> lock(progress_mutex);
                    globalComparedCount += actual_work_done;
                    mkeysPerSec = (double)globalComparedCount / globalElapsedTime / 1e6;

                    __uint128_t foundKey = currentKey - (localBatchCount - j);
                    std::string hexKey = uint128ToHex(foundKey);

                    std::lock_guard<std::mutex> resultLock(result_mutex);
                    results.push(std::make_tuple(hexKey, total_checked_avx.load(), flip_count));
                    stop_event.store(true);
                    return;
                }
            }
        }
    }

    secp256k1_context_destroy(ctx);
}