One more piece of science! 
Everybody knows that when we calculate "NextPoint = PreviousPoint + JumpPoint", we can also quickly calculate "PreviousPoint - JumpPoint" because the inversion is the same.
Therefore, if the inversion calculation takes a lot of time, this second point is cheap for us, and we can use it to improve K.
I updated Part #1:
Added "SOTA+" method with K = 1.02. I think JLP has already applied this trick in his BSGS/kangaroo.
https://github.com/JeanLucPons/BSGS/blob/master/BSGS.cpp#L347...
// We use the fact that P + i*G and P - i*G has the same deltax, so the same inverse
// We compute key in the positive and negative way from the center of the group
// center point
pts[CPU_GRP_SIZE / 2] = startP;
for(i = 0; i<hLength; i++) {
pp = startP;
pn = startP;
// P = startP + i*G
dy.ModSub(&GSn[i].y,&pp.y);
_s.ModMulK1(&dy,&dx[i]); // s = (p2.y-p1.y)*inverse(p2.x-p1.x);
_p.ModSquareK1(&_s); // _p = pow2(s)
pp.x.ModNeg();
pp.x.ModAdd(&_p);
pp.x.ModSub(&GSn[i].x); // rx = pow2(s) - p1.x - p2.x;
#if 0
pp.y.ModSub(&GSn[i].x,&pp.x);
pp.y.ModMulK1(&_s);
pp.y.ModSub(&GSn[i].y); // ry = - p2.y - s*(ret.x-p2.x);
#endif
// P = startP - i*G , if (x,y) = i*G then (x,-y) = -i*G
dyn.Set(&GSn[i].y);
dyn.ModNeg();
dyn.ModSub(&pn.y);
_s.ModMulK1(&dyn,&dx[i]); // s = (p2.y-p1.y)*inverse(p2.x-p1.x);
_p.ModSquareK1(&_s); // _p = pow2(s)
pn.x.ModNeg();
pn.x.ModAdd(&_p);
pn.x.ModSub(&GSn[i].x); // rx = pow2(s) - p1.x - p2.x;
#if 0
pn.y.ModSub(&GSn[i].x,&pn.x);
pn.y.ModMulK1(&_s);
pn.y.ModAdd(&GSn[i].y); // ry = - p2.y - s*(ret.x-p2.x);
#endif
pts[CPU_GRP_SIZE / 2 + (i + 1)] = pp;
pts[CPU_GRP_SIZE / 2 - (i + 1)] = pn;
}
// First point (startP - (GRP_SZIE/2)*G)
pn = startP;
dyn.Set(&GSn[i].y);
dyn.ModNeg();
dyn.ModSub(&pn.y);
_s.ModMulK1(&dyn,&dx[i]);
_p.ModSquareK1(&_s);
pn.x.ModNeg();
pn.x.ModAdd(&_p);
pn.x.ModSub(&GSn[i].x);
#if 0
pn.y.ModSub(&GSn[i].x,&pn.x);
pn.y.ModMulK1(&_s);
pn.y.ModAdd(&GSn[i].y);
#endif
pts[0] = pn;
...