Re: secp256k1 library in pure assembly

Quote from: NotATether on Today at 04:56:50 PM

it appears to be that the multiplication has the secp256k1 characteristic modulus hardcoded into the algorithm, making it suitable only for public key multiplication.

I was thinking about making an adapted version of the multiplication algorithm that uses the curve order (subtracted from 2^256) in its place, so that private key multiplication is covered as well.

I haven't explored the 10x26 legs imply too deep, but I'm assuming it's a similar case.

Since you're here though, let me ask: Was the multiplication assembly (and possibly the C version of it in another file using int128) only intended for public keys?

You have to distinguish between:

1) field operations (mod p, space of coordinates x and y):

https://github.com/bitcoin-core/secp256k1/tree/master/src/ all files with name : field*

and

2) scalar operations (mod n, space of private keys):

https://github.com/bitcoin-core/secp256k1/tree/master/src/ all files with name : scalar*

If you want to multiply 2 private keys (mod n):

you could use secp256k1_scalar_mul_512 (256bit * 256 bit -> 512 bit) (assembly multiplication)

https://github.com/bitcoin-core/secp256k1/blob/master/src/scalar_4x64_impl.h#L587

Code:

inline void secp256k1_scalar_mul_512(uint64_t* r, const uint64_t *a, const uint64_t *b) {

const uint64_t *pb = b;

uint64_t l[8];

__asm__ __volatile__ (
/* Preload */
"movq 0(%%rdi), %%r15\n"
"movq 8(%%rdi), %%rbx\n"
"movq 16(%%rdi), %%rcx\n"
"movq 0(%%rdx), %%r11\n"
"movq 8(%%rdx), %%r12\n"
"movq 16(%%rdx), %%r13\n"
"movq 24(%%rdx), %%r14\n"
/* (rax,rdx) = a0 * b0 */
"movq %%r15, %%rax\n"
"mulq %%r11\n"
/* Extract l0 */
"movq %%rax, 0(%%rsi)\n"
/* (r8,r9,r10) = (rdx) */
"movq %%rdx, %%r8\n"
"xorq %%r9, %%r9\n"
"xorq %%r10, %%r10\n"
/* (r8,r9,r10) += a0 * b1 */
"movq %%r15, %%rax\n"
"mulq %%r12\n"
"addq %%rax, %%r8\n"
"adcq %%rdx, %%r9\n"
"adcq $0, %%r10\n"
/* (r8,r9,r10) += a1 * b0 */
"movq %%rbx, %%rax\n"
"mulq %%r11\n"
"addq %%rax, %%r8\n"
"adcq %%rdx, %%r9\n"
"adcq $0, %%r10\n"
/* Extract l1 */
"movq %%r8, 8(%%rsi)\n"
"xorq %%r8, %%r8\n"
/* (r9,r10,r8) += a0 * b2 */
"movq %%r15, %%rax\n"
"mulq %%r13\n"
"addq %%rax, %%r9\n"
"adcq %%rdx, %%r10\n"
"adcq $0, %%r8\n"
/* (r9,r10,r8) += a1 * b1 */
"movq %%rbx, %%rax\n"
"mulq %%r12\n"
"addq %%rax, %%r9\n"
"adcq %%rdx, %%r10\n"
"adcq $0, %%r8\n"
/* (r9,r10,r8) += a2 * b0 */
"movq %%rcx, %%rax\n"
"mulq %%r11\n"
"addq %%rax, %%r9\n"
"adcq %%rdx, %%r10\n"
"adcq $0, %%r8\n"
/* Extract l2 */
"movq %%r9, 16(%%rsi)\n"
"xorq %%r9, %%r9\n"
/* (r10,r8,r9) += a0 * b3 */
"movq %%r15, %%rax\n"
"mulq %%r14\n"
"addq %%rax, %%r10\n"
"adcq %%rdx, %%r8\n"
"adcq $0, %%r9\n"
/* Preload a3 */
"movq 24(%%rdi), %%r15\n"
/* (r10,r8,r9) += a1 * b2 */
"movq %%rbx, %%rax\n"
"mulq %%r13\n"
"addq %%rax, %%r10\n"
"adcq %%rdx, %%r8\n"
"adcq $0, %%r9\n"
/* (r10,r8,r9) += a2 * b1 */
"movq %%rcx, %%rax\n"
"mulq %%r12\n"
"addq %%rax, %%r10\n"
"adcq %%rdx, %%r8\n"
"adcq $0, %%r9\n"
/* (r10,r8,r9) += a3 * b0 */
"movq %%r15, %%rax\n"
"mulq %%r11\n"
"addq %%rax, %%r10\n"
"adcq %%rdx, %%r8\n"
"adcq $0, %%r9\n"
/* Extract l3 */
"movq %%r10, 24(%%rsi)\n"
"xorq %%r10, %%r10\n"
/* (r8,r9,r10) += a1 * b3 */
"movq %%rbx, %%rax\n"
"mulq %%r14\n"
"addq %%rax, %%r8\n"
"adcq %%rdx, %%r9\n"
"adcq $0, %%r10\n"
/* (r8,r9,r10) += a2 * b2 */
"movq %%rcx, %%rax\n"
"mulq %%r13\n"
"addq %%rax, %%r8\n"
"adcq %%rdx, %%r9\n"
"adcq $0, %%r10\n"
/* (r8,r9,r10) += a3 * b1 */
"movq %%r15, %%rax\n"
"mulq %%r12\n"
"addq %%rax, %%r8\n"
"adcq %%rdx, %%r9\n"
"adcq $0, %%r10\n"
/* Extract l4 */
"movq %%r8, 32(%%rsi)\n"
"xorq %%r8, %%r8\n"
/* (r9,r10,r8) += a2 * b3 */
"movq %%rcx, %%rax\n"
"mulq %%r14\n"
"addq %%rax, %%r9\n"
"adcq %%rdx, %%r10\n"
"adcq $0, %%r8\n"
/* (r9,r10,r8) += a3 * b2 */
"movq %%r15, %%rax\n"
"mulq %%r13\n"
"addq %%rax, %%r9\n"
"adcq %%rdx, %%r10\n"
"adcq $0, %%r8\n"
/* Extract l5 */
"movq %%r9, 40(%%rsi)\n"
/* (r10,r8) += a3 * b3 */
"movq %%r15, %%rax\n"
"mulq %%r14\n"
"addq %%rax, %%r10\n"
"adcq %%rdx, %%r8\n"
/* Extract l6 */
"movq %%r10, 48(%%rsi)\n"
/* Extract l7 */
"movq %%r8, 56(%%rsi)\n"
: "+d"(b)
: "S"(l), "D"(a)
: "rax", "rbx", "rcx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "cc", "memory");
}

where for example

Code:

const uint64_t a[4] = {0x59f2815b16f81798, 0x029bfcdb2dce28d9, 0x55a06295ce870b07, 0x79be667ef9dcbbac};
const uint64_t b[4] = {0x59f2815b16f81798, 0x029bfcdb2dce28d9, 0x55a06295ce870b07, 0x79be667ef9dcbbac};

uint64_t c[8] = {0};

secp256k1_scalar_mul_512(&c[0],&a[0],&b[0]);

and then apply the function secp256k1_scalar_reduce_512 (from 512 bit to 256 bit mod n)

https://github.com/bitcoin-core/secp256k1/blob/master/src/scalar_4x64_impl.h#L274

to get the result mod n

https://github.com/bitcoin-core/secp256k1/blob/master/src/scalar_4x64_impl.h#L761-L765

I get the same results with my function and with these functions.