Bro, you don't even know what you're talking about. libsecp256k1 wasn't built for low-level batched optimizations like that. You have to use JeanLucPons' SECP256K1 if you ever want to hit 50M keys per second on any CPU.
Actually, JLP's library is the slow one here, by at least 50%.
libsecp256k1's primitives can easily be used to implement batched addition, or anything else. I already posted code for this a while ago.
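To illustrate with nothing but the public API, here is a minimal sketch: it steps one key at a time with secp256k1_ec_pubkey_tweak_add (adding 1*G per step) instead of doing batched affine additions, so it shows the idea, not the speed. A real batched version needs direct access to the field arithmetic.

#include <secp256k1.h>
#include <cstdio>

int main() {
secp256k1_context* ctx = secp256k1_context_create(SECP256K1_CONTEXT_SIGN);
unsigned char seckey[32] = {0};
seckey[31] = 1; // start at private key 1 (illustration only)
secp256k1_pubkey pub;
if (!secp256k1_ec_pubkey_create(ctx, &pub, seckey)) return 1;
unsigned char one[32] = {0};
one[31] = 1; // tweak of 1: each tweak_add steps P -> P + G
for (int i = 0; i < 5; i++) {
unsigned char out[33];
size_t outlen = sizeof(out);
secp256k1_ec_pubkey_serialize(ctx, out, &outlen, &pub, SECP256K1_EC_COMPRESSED);
printf("key %d starts with %02x%02x\n", i + 1, out[0], out[1]);
if (!secp256k1_ec_pubkey_tweak_add(ctx, &pub, one)) return 1; // advance to the next consecutive key
}
secp256k1_context_destroy(ctx);
return 0;
}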
Ok. Make this part for me with libsecp256k1
// NOTE: Int, Point, IntGroup and secp are the JLP-style big-integer/EC helpers
// already used in this project; libsecp256k1 only derives the start points.
#include <secp256k1.h>
#include <immintrin.h>
#include <cstdint>
#include <iostream>

// Worker function
void worker(int threadId, __uint128_t threadRangeStart, __uint128_t threadRangeEnd) {
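// Per-thread scratch: compressed pubkeys (33 bytes), HASH160 results (20 bytes),
// and the offset index each stored key came from.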
alignas(32) uint8_t localPubKeys[HASH_BATCH_SIZE][33];
alignas(32) uint8_t localHashResults[HASH_BATCH_SIZE][20];
alignas(32) int pointIndices[HASH_BATCH_SIZE];
// Load the first 16 bytes of the target HASH160 for a fast SIMD prefix compare
// (TARGET_HASH160_RAW is only 20 bytes, so a 32-byte load would read past the end).
__m128i target16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(TARGET_HASH160_RAW.data()));
alignas(32) Point plusPoints[POINTS_BATCH_SIZE];
alignas(32) Point minusPoints[POINTS_BATCH_SIZE];
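// Precompute the fixed offsets once per worker. Negating a point only flips Y,
// so plusPoints[i] and minusPoints[i] share an X coordinate; each derived start
// point P later yields 2*POINTS_BATCH_SIZE neighbors from a single batch inversion.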
for (int i = 0; i < POINTS_BATCH_SIZE; i++) {
Int tmp;
// Offsets start at 1: offset 0 would be the point at infinity, which the
// affine-addition formulas below cannot handle.
tmp.SetInt32(i + 1);
plusPoints[i] = secp->ComputePublicKey(&tmp); // plusPoints[i] = (i+1)*G
minusPoints[i] = plusPoints[i];
minusPoints[i].y.ModNeg(); // minusPoints[i] = -(i+1)*G
}
alignas(32) Int deltaX[POINTS_BATCH_SIZE];
IntGroup modGroup(POINTS_BATCH_SIZE);
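// fullBatchSize is assumed to be 2 * POINTS_BATCH_SIZE elsewhere in the file:
// the first half of these arrays holds P + offset, the second half P - offset.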
alignas(32) Int pointBatchX[fullBatchSize];
alignas(32) Int pointBatchY[fullBatchSize];
// SECP256K1_CONTEXT_SIGN is what older libsecp256k1 releases require for key
// derivation; current releases deprecate the flags and accept SECP256K1_CONTEXT_NONE.
secp256k1_context* ctx = secp256k1_context_create(SECP256K1_CONTEXT_SIGN);
__uint128_t currentKey = threadRangeStart;
while (currentKey <= threadRangeEnd) {
int localBatchCount = 0;
// Generate public keys in batches
// Derive fresh start points until the hash batch is full; localBatchCount is
// advanced only when a computed point is actually stored below.
for (; localBatchCount < HASH_BATCH_SIZE && currentKey <= threadRangeEnd; ++currentKey) {
uint8_t priv[32];
uint128ToPrivKey(currentKey, priv);
uint8_t startPoint[65];
// Derive the start point uncompressed (65 bytes: 0x04 | X | Y); both coordinates
// are needed below. (Assuming derivePubkey's last argument selects compression.)
if (!derivePubkey(ctx, priv, startPoint, false)) {
std::cerr << "Failed to derive public key.\n";
continue;
}
// Use custom arithmetic for batch processing
Point startPointObj;
startPointObj.x.SetBytes(startPoint + 1, 32);
startPointObj.y.SetBytes(startPoint + 33, 32); // Y lives at bytes 33..64, not 1..32
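// Denominators for the batch inversion: deltaX[i] = x(offset_i) - x(P).
// Unrolled by 4 for instruction-level parallelism (POINTS_BATCH_SIZE is
// assumed to be a multiple of 4).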
for (int i = 0; i < POINTS_BATCH_SIZE; i += 4) {
deltaX[i].ModSub(&plusPoints[i].x, &startPointObj.x);
deltaX[i+1].ModSub(&plusPoints[i+1].x, &startPointObj.x);
deltaX[i+2].ModSub(&plusPoints[i+2].x, &startPointObj.x);
deltaX[i+3].ModSub(&plusPoints[i+3].x, &startPointObj.x);
}
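// Montgomery batch inversion: IntGroup inverts all POINTS_BATCH_SIZE deltas with
// one field inversion plus O(N) multiplications. Afterwards deltaX[i] holds
// 1/(x(offset_i) - x(P)).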
modGroup.Set(deltaX);
modGroup.ModInv();
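// Affine addition of each positive offset: lambda = (y_i - y) / (x_i - x),
// x3 = lambda^2 - x - x_i, y3 = lambda * (x - x3) - y. deltaX[i] already holds
// the inverse, so the division below is a single ModMulK1.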
for (int i = 0; i < POINTS_BATCH_SIZE; i++) {
Int deltaY;
deltaY.ModSub(&plusPoints[i].y, &startPointObj.y);
Int slope;
slope.ModMulK1(&deltaY, &deltaX[i]);
Int slopeSq;
slopeSq.ModSquareK1(&slope);
// x3 = lambda^2 - x - x_i
pointBatchX[i].Set(&slopeSq);
pointBatchX[i].ModSub(&startPointObj.x);
pointBatchX[i].ModSub(&plusPoints[i].x);
Int diffX;
diffX.Set(&startPointObj.x);
diffX.ModSub(&pointBatchX[i]);
diffX.ModMulK1(&slope);
pointBatchY[i].Set(&startPointObj.y);
pointBatchY[i].ModNeg();
pointBatchY[i].ModAdd(&diffX);
}
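// Second half: the same additions with the negated offsets (P - offset). The
// minus points share the plus points' X coordinates, so the inverses already
// sitting in deltaX[] are reused unchanged; only deltaY differs.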
for (int i = 0; i < POINTS_BATCH_SIZE; i++) {
Int deltaY;
deltaY.ModSub(&minusPoints[i].y, &startPointObj.y);
Int slope;
slope.ModMulK1(&deltaY, &deltaX[i]);
Int slopeSq;
slopeSq.ModSquareK1(&slope);
// x3 = lambda^2 - x - x_i
pointBatchX[POINTS_BATCH_SIZE + i].Set(&slopeSq);
pointBatchX[POINTS_BATCH_SIZE + i].ModSub(&startPointObj.x);
pointBatchX[POINTS_BATCH_SIZE + i].ModSub(&minusPoints[i].x);
Int diffX;
diffX.Set(&startPointObj.x);
diffX.ModSub(&pointBatchX[POINTS_BATCH_SIZE + i]);
diffX.ModMulK1(&slope);
pointBatchY[POINTS_BATCH_SIZE + i].Set(&startPointObj.y);
pointBatchY[POINTS_BATCH_SIZE + i].ModNeg();
pointBatchY[POINTS_BATCH_SIZE + i].ModAdd(&diffX);
}
// Serialize each computed point in compressed form: parity byte, then big-endian X.
for (int i = 0; i < fullBatchSize && localBatchCount < HASH_BATCH_SIZE; i++) {
localPubKeys[localBatchCount][0] = pointBatchY[i].IsEven() ? 0x02 : 0x03;
for (int j = 0; j < 32; j++) {
localPubKeys[localBatchCount][1 + j] = pointBatchX[i].GetByte(31 - j);
}
pointIndices[localBatchCount] = i; // maps this hash slot back to its offset index
localBatchCount++;
}
}