Board: Mining software (miners)
Topic: Re: Official CGMINER thread - CPU/GPU miner in C for linux/windows/osx
Post by d3m0n1q_733rz on 31/07/2011, 12:02:13 UTC
Quote: "d3m0n1q_733rz, why wouldn't you create a fork on github? It would be easier than copy-pasting and less error-prone."
A) I can't actually program from scratch, and most of my changes are just logic-based.
B) Almost nobody gives me input on how my changes affect their hashing anyway.
C) People might end up sending me incessant requests for changes that I couldn't keep up with.

Besides, this is more of a hobby for me than an outright project, and I want to be able to drop it like one without people getting caught in the wake. ;)

In related news:
Code:
;; SHA-256 for X86-64 for Linux, based off of:

; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
; Version 2011
; This software is Public Domain

; Significant re-write/optimisation and reordering by,
; Neil Kettle
; Small modifications played around with by,
; Erick Couts II
; ~18% performance improvement

; SHA-256 CPU SSE cruncher for Bitcoin Miner

ALIGN 32
BITS 64

%define hash rdi
%define data rsi
%define init rdx

; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
%define LAB_CALC_PARA 2
%define LAB_CALC_UNROLL 8

%define LAB_LOOP_UNROLL 8

extern g_4sha256_k

global CalcSha256_x64_sse4
; CalcSha256 hash(rdi), data(rsi), init(rdx)
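; Four-way SIMD: each dword lane of the xmm registers works on one of four nonces.
; data and hash are 4-way interleaved (one 16-byte xmm word per W/state value);
; the 8 init words are broadcast across all four lanes below.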
CalcSha256_x64_sse4:

push rbx

LAB_NEXT_NONCE:

mov rcx, 256 ; 256 - rcx is # of SHA-2 rounds
; mov rax, 64 ; 64 - rax is where we expand to

LAB_SHA:
push rcx
lea rcx, qword [data+1024] ; + 1024
lea r11, qword [data+256] ; + 256

LAB_CALC:
%macro lab_calc_blk 1
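; Message-schedule expansion, two groups of four W values per macro call:
;   W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16]
;   s0(x) = ROTR(x,7)  ^ ROTR(x,18) ^ (x >> 3)
;   s1(x) = ROTR(x,17) ^ ROTR(x,19) ^ (x >> 10)
; SSE has no packed rotate, so each ROTR(x,n) is split into (x >> n) and
; (x << (32-n)) and the pieces are folded together with pxor below.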
; prefetcht0 [r11-(15-%1)*16]
; prefetcht0 [r11-(15-(%1+1))*16]

movntdqa xmm0, [r11-(15-%1)*16] ; xmm0 = W[I-15]
movdqa xmm1, xmm0 ; xmm1 = W[I-15]
movdqa xmm2, xmm0 ; xmm2 = W[I-15]
movntdqa xmm3, [r11-(2-%1)*16] ; xmm3 = W[I-2]
movntdqa xmm4, [r11-(15-(%1+1))*16] ; xmm4 = W[I-15+1]
movdqa xmm5, xmm4 ; xmm5 = W[I-15+1]
movdqa xmm6, xmm4 ; xmm6 = W[I-15+1]
movntdqa xmm7, [r11-(2-(%1+1))*16] ; xmm7 = W[I-2+1]

; movdqa xmm2, xmm0 ; xmm2 = W[I-15]
; movdqa xmm6, xmm4 ; xmm6 = W[I-15+1]

psrld xmm0, 3 ; xmm0 = W[I-15] >> 3
psrld xmm1, 7 ; xmm1 = W[I-15] >> 7 (Moved and made it independent of xmm0)
psrld xmm4, 3 ; xmm4 = W[I-15+1] >> 3
psrld xmm5, 7 ; xmm5 = W[I-15+1] >> 7
pslld xmm2, 14 ; xmm2 = W[I-15] << 14

; movdqa xmm5, xmm4 ; xmm5 = W[I-15+1] >> 3
; movdqa xmm1, xmm0 ; xmm1 = W[I-15] >> 3

pxor xmm4, xmm5 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7)
pslld xmm6, 14 ; xmm6 = W[I-15+1] << 14

pxor xmm0, xmm1 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7)
pxor xmm0, xmm2 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14)
psrld xmm1, 11 ; xmm1 = W[I-15] >> 18
psrld xmm5, 11 ; xmm5 = W[I-15+1] >> 18
pxor xmm4, xmm6 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14)
pxor xmm4, xmm5 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18)
pslld xmm2, 11 ; xmm2 = W[I-15] << 25
pslld xmm6, 11 ; xmm6 = W[I-15+1] << 25
pxor xmm4, xmm6 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18) ^ (W[I-15+1] << 25)
pxor xmm0, xmm1 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18)
pxor xmm0, xmm2 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18) ^ (W[I-15] << 25)
paddd xmm0, [r11-(16-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16]
paddd xmm4, [r11-(16-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1]


;;;;;;;;;;;;;;;;;;

movdqa xmm2, xmm3 ; xmm2 = W[I-2]
movdqa xmm1, xmm3 ; xmm1 = W[I-2]
movdqa xmm6, xmm7 ; xmm6 = W[I-2+1]
movdqa xmm5, xmm7 ; xmm5 = W[I-2+1]


paddd xmm0, [r11-(7-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16] + W[I-7]
paddd xmm4, [r11-(7-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1]

psrld xmm1, 17 ; xmm1 = W[I-2] >> 17
pslld xmm2, 13 ; xmm2 = W[I-2] << 13
psrld xmm3, 10 ; xmm3 = W[I-2] >> 10
psrld xmm5, 17 ; xmm5 = W[I-2+1] >> 17
pslld xmm6, 13 ; xmm6 = W[I-2+1] << 13
psrld xmm7, 10 ; xmm7 = W[I-2+1] >> 10

pxor xmm3, xmm1 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17)
pxor xmm3, xmm2 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13)
pxor xmm7, xmm5 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17)
pxor xmm7, xmm6 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13)

psrld xmm1, 2 ; xmm1 = W[I-2] >> 19
psrld xmm5, 2 ; xmm5 = W[I-2+1] >> 19
pslld xmm2, 2 ; xmm2 = W[I-2] << 15
pslld xmm6, 2 ; xmm6 = W[I-2+1] << 15

pxor xmm3, xmm1 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19)
pxor xmm3, xmm2 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19) ^ (W[I-2] << 15)
pxor xmm7, xmm5 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19)
pxor xmm7, xmm6 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19) ^ (W[I-2+1] << 15)
paddd xmm0, xmm3 ; xmm0 = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
paddd xmm4, xmm7 ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + s1(W[I-2+1]) + W[I-7+1]

movdqa [r11+((%1+1)*16)], xmm4
movdqa [r11+(%1*16)], xmm0
%endmacro

%assign i 0
%rep    LAB_CALC_UNROLL
        lab_calc_blk i
%assign i i+LAB_CALC_PARA
%endrep
; prefetchnta [rcx]

add r11, LAB_CALC_UNROLL*LAB_CALC_PARA*16
cmp r11, rcx
jb LAB_CALC
prefetchnta [init+16]
pop rcx
mov rax, 0

; Load the init values of the message into the hash.

movntdqa xmm7, [init]
movntdqa xmm0, [init+16]
pshufd xmm5, xmm7, 0x55 ; xmm5 == b
pshufd xmm4, xmm7, 0xAA ; xmm4 == c
pshufd xmm3, xmm7, 0xFF ; xmm3 == d
pshufd xmm7, xmm7, 0 ; xmm7 == a
pshufd xmm8, xmm0, 0x55 ; xmm8 == f
pshufd xmm9, xmm0, 0xAA ; xmm9 == g
pshufd xmm10, xmm0, 0xFF ; xmm10 == h
pshufd xmm0, xmm0, 0 ; xmm0 == e

LAB_LOOP:

;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32(g_sha256_k[j]) + w[j]
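;; Rotr32(x, n) has no packed SSE equivalent, so each rotate below is built from
;; the shift pair (x >> n) ^ (x << (32 - n)); the psrld/pslld counts are applied
;; incrementally so the partial results can stay in the same registers.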

%macro lab_loop_blk 0
; prefetchnta [rax*4]
movntdqa xmm6, [data+rax*4]
paddd xmm6, g_4sha256_k[rax*4]
add rax, 4

paddd xmm6, xmm10 ; +h

movdqa xmm1, xmm0
; movdqa xmm2, xmm9 ; It's redundant unless xmm9 becomes a destination
movdqa xmm10, xmm9 ; h = g  Changed from xmm2 to xmm9
movdqa xmm9, xmm8 ; f
movdqa xmm2, xmm8 ; g = f xmm9 became a destination but not until xmm2 was already used and replaced

pand xmm2, xmm0 ; e & f
pandn xmm1, xmm10 ; ~e & g Changed from xmm2 to xmm9 (see above reason) then xmm10 to combine writes
pxor xmm1, xmm2 ; (e & f) ^ (~e & g)
paddd xmm6, xmm1 ; Ch + h + w[i] + k[i]

movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm8, xmm0 ; f = e Combining these three moves for processor hardware optimization
psrld xmm0, 6 ; The xmm2 from xmm0 movdqa used to be after this taking advantage of the r-rotate 6
psrld xmm2, 11 ; Changed from 5 to 11 after shoving the movdqa commands together
pslld xmm1, 7
pxor xmm0, xmm1
pxor xmm0, xmm2
pslld xmm1, 14
psrld xmm2, 14
pxor xmm0, xmm1
pxor xmm0, xmm2
pslld xmm1, 5
pxor xmm0, xmm1 ; Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)
paddd xmm6, xmm0 ; xmm6 = t1
paddd xmm3, xmm6 ; e = d+t1

movdqa xmm0, xmm3 ; d
movdqa xmm1, xmm5 ; =b
movdqa xmm2, xmm4 ; c
movdqa xmm3, xmm2 ; d = c
pand xmm2, xmm5 ; b & c
pand xmm4, xmm7 ; a & c
pand xmm1, xmm7 ; a & b
pxor xmm1, xmm4
pxor xmm1, xmm2 ; (a & b) ^ (a & c) ^ (b & c)
paddd xmm6, xmm1 ; t1 + ((a & b) ^ (a & c) ^ (b & c))

movdqa xmm4, xmm5 ; c = b
movdqa xmm5, xmm7 ; b = a
movdqa xmm2, xmm7
movdqa xmm1, xmm7
psrld xmm1, 13
psrld xmm7, 2
pslld xmm2, 10
pxor xmm7, xmm2
pxor xmm7, xmm1
pslld xmm2, 9
psrld xmm1, 9
pxor xmm7, xmm2
pxor xmm7, xmm1
pslld xmm2, 11
pxor xmm7, xmm2
paddd xmm7, xmm6 ; a = t1 + (Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)) + ((a & b) ^ (a & c) ^ (b & c));
%endmacro

%assign i 0
%rep    LAB_LOOP_UNROLL
        lab_loop_blk
%assign i i+1
%endrep

cmp rax, rcx
jb LAB_LOOP

; Finished the 64 rounds, calculate hash and save

movntdqa xmm1, [rdx]
pshufd xmm2, xmm1, 0x55
paddd xmm5, xmm2
pshufd xmm6, xmm1, 0xAA
paddd xmm4, xmm6
pshufd xmm11, xmm1, 0xFF
paddd xmm3, xmm11
pshufd xmm1, xmm1, 0
paddd xmm7, xmm1

movntdqa xmm1, [rdx+16]
pshufd xmm2, xmm1, 0x55
paddd xmm8, xmm2
pshufd xmm6, xmm1, 0xAA
paddd xmm9, xmm6
pshufd xmm11, xmm1, 0xFF
paddd xmm10, xmm11
pshufd xmm1, xmm1, 0
paddd xmm0, xmm1

movdqa [hash], xmm7
movdqa [hash+16], xmm5
movdqa [hash+32], xmm4
movdqa [hash+48], xmm3
movdqa [hash+64], xmm0
movdqa [hash+80], xmm8
movdqa [hash+96], xmm9
movdqa [hash+112], xmm10

LAB_RET:
pop rbx
ret
I've commented out some of the optimizations I've been playing around with so you can see what I've been trying.  The prefetches actually seemed to slow the code down for me; AMD users might see different results.  I've also taken the liberty of supplying AMD users with the SSE2 code for ease of use (it's essentially the same listing with the non-temporal movntdqa loads swapped for plain movdqa).  I ended up leaving in the loop modifications I made because, honestly, I couldn't tell much difference.  But I'm going to bed.
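Also, because it's easy to lose the actual SHA-256 math in all the register shuffling, here's a rough scalar sketch in plain C of what each LAB_LOOP iteration does per nonce, plus the s0/s1 schedule functions that LAB_CALC vectorizes. The names (rotr32, sha256_round and so on) are just mine for illustration and aren't anything from the miner source; the SSE2 listing follows right after it.
Code:
#include <stdint.h>

/* Hypothetical scalar reference, one nonce; names are illustrative only. */
static inline uint32_t rotr32(uint32_t x, unsigned n)
{
    return (x >> n) | (x << (32 - n));
}

/* Message-schedule sigmas that LAB_CALC computes four-wide:
   W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16] */
static inline uint32_t s0(uint32_t x) { return rotr32(x, 7)  ^ rotr32(x, 18) ^ (x >> 3);  }
static inline uint32_t s1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }

/* One compression round, matching the LAB_LOOP comment:
   t1 = h + Sigma1(e) + Ch(e,f,g) + k + w,  t2 = Sigma0(a) + Maj(a,b,c). */
static void sha256_round(uint32_t s[8], uint32_t k, uint32_t w)
{
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

    uint32_t Sig1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
    uint32_t ch   = (e & f) ^ (~e & g);                 /* pand / pandn / pxor */
    uint32_t t1   = h + Sig1 + ch + k + w;

    uint32_t Sig0 = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
    uint32_t maj  = (a & b) ^ (a & c) ^ (b & c);
    uint32_t t2   = Sig0 + maj;

    s[7] = g;       /* h = g      */
    s[6] = f;       /* g = f      */
    s[5] = e;       /* f = e      */
    s[4] = d + t1;  /* e = d + t1 */
    s[3] = c;       /* d = c      */
    s[2] = b;       /* c = b      */
    s[1] = a;       /* b = a      */
    s[0] = t1 + t2; /* a = t1 + t2 */
}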
Code:
;; SHA-256 for X86-64 for Linux, based off of:

; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
; Version 2011
; This software is Public Domain

; Significant re-write/optimisation and reordering by,
; Neil Kettle
; Small modifications played around with by,
; Erick Couts II
; ~18% performance improvement

; SHA-256 CPU SSE cruncher for Bitcoin Miner

ALIGN 32
BITS 64

%define hash rdi
%define data rsi
%define init rdx

; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
%define LAB_CALC_PARA 2
%define LAB_CALC_UNROLL 8

%define LAB_LOOP_UNROLL 8

extern g_4sha256_k

global CalcSha256_x64
; CalcSha256 hash(rdi), data(rsi), init(rdx)
CalcSha256_x64:

push rbx

LAB_NEXT_NONCE:

mov rcx, 256 ; 256 - rcx is # of SHA-2 rounds
; mov rax, 64 ; 64 - rax is where we expand to

LAB_SHA:
push rcx
lea rcx, qword [data+1024] ; + 1024
lea r11, qword [data+256] ; + 256

LAB_CALC:
%macro lab_calc_blk 1
; prefetcht0 [r11-(15-%1)*16]
; prefetcht0 [r11-(15-(%1+1))*16]

movdqa xmm0, [r11-(15-%1)*16] ; xmm0 = W[I-15]
movdqa xmm1, xmm0 ; xmm1 = W[I-15]
movdqa xmm2, xmm0 ; xmm2 = W[I-15]
movdqa xmm3, [r11-(2-%1)*16] ; xmm3 = W[I-2]
movdqa xmm4, [r11-(15-(%1+1))*16] ; xmm4 = W[I-15+1]
movdqa xmm5, xmm4 ; xmm5 = W[I-15+1]
movdqa xmm6, xmm4 ; xmm6 = W[I-15+1]
movdqa xmm7, [r11-(2-(%1+1))*16] ; xmm7 = W[I-2+1]

; movdqa xmm2, xmm0 ; xmm2 = W[I-15]
; movdqa xmm6, xmm4 ; xmm6 = W[I-15+1]

psrld xmm0, 3 ; xmm0 = W[I-15] >> 3
psrld xmm1, 7 ; xmm1 = W[I-15] >> 7 (Moved and made it independent of xmm0)
psrld xmm4, 3 ; xmm4 = W[I-15+1] >> 3
psrld xmm5, 7 ; xmm5 = W[I-15+1] >> 7
pslld xmm2, 14 ; xmm2 = W[I-15] << 14

; movdqa xmm5, xmm4 ; xmm5 = W[I-15+1] >> 3
; movdqa xmm1, xmm0 ; xmm1 = W[I-15] >> 3

pxor xmm4, xmm5 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7)
pslld xmm6, 14 ; xmm6 = W[I-15+1] << 14

pxor xmm0, xmm1 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7)
pxor xmm0, xmm2 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14)
psrld xmm1, 11 ; xmm1 = W[I-15] >> 18
psrld xmm5, 11 ; xmm5 = W[I-15+1] >> 18
pxor xmm4, xmm6 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14)
pxor xmm4, xmm5 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18)
pslld xmm2, 11 ; xmm2 = W[I-15] << 25
pslld xmm6, 11 ; xmm6 = W[I-15+1] << 25
pxor xmm4, xmm6 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18) ^ (W[I-15+1] << 25)
pxor xmm0, xmm1 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18)
pxor xmm0, xmm2 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18) ^ (W[I-15] << 25)
paddd xmm0, [r11-(16-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16]
paddd xmm4, [r11-(16-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1]


;;;;;;;;;;;;;;;;;;

movdqa xmm2, xmm3 ; xmm2 = W[I-2]
movdqa xmm1, xmm3 ; xmm1 = W[I-2]
movdqa xmm6, xmm7 ; xmm6 = W[I-2+1]
movdqa xmm5, xmm7 ; xmm5 = W[I-2+1]


paddd xmm0, [r11-(7-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16] + W[I-7]
paddd xmm4, [r11-(7-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1]

psrld xmm1, 17 ; xmm1 = W[I-2] >> 17
psrld xmm3, 10 ; xmm3 = W[I-2] >> 10
psrld xmm5, 17 ; xmm5 = W[I-2+1] >> 17
psrld xmm7, 10 ; xmm7 = W[I-2+1] >> 10
pslld xmm2, 13 ; xmm2 = W[I-2] << 13
pslld xmm6, 13 ; xmm6 = W[I-2+1] << 13

pxor xmm3, xmm1 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17)
pxor xmm3, xmm2 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13)
pxor xmm7, xmm5 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17)
pxor xmm7, xmm6 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13)

psrld xmm1, 2 ; xmm1 = W[I-2] >> 19
psrld xmm5, 2 ; xmm5 = W[I-2+1] >> 19
pslld xmm2, 2 ; xmm2 = W[I-2] << 15
pslld xmm6, 2 ; xmm6 = W[I-2+1] << 15

pxor xmm3, xmm1 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19)
pxor xmm3, xmm2 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19) ^ (W[I-2] << 15)
pxor xmm7, xmm5 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19)
pxor xmm7, xmm6 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19) ^ (W[I-2+1] << 15)
paddd xmm0, xmm3 ; xmm0 = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
paddd xmm4, xmm7 ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + s1(W[I-2+1]) + W[I-7+1]

movdqa [r11+((%1+1)*16)], xmm4
movdqa [r11+(%1*16)], xmm0
%endmacro

%assign i 0
%rep    LAB_CALC_UNROLL
        lab_calc_blk i
%assign i i+LAB_CALC_PARA
%endrep
; prefetchnta [rcx]

add r11, LAB_CALC_UNROLL*LAB_CALC_PARA*16
cmp r11, rcx
jb LAB_CALC
prefetchnta [init+16]
pop rcx
mov rax, 0

; Load the init values of the message into the hash.

movdqa xmm7, [init]
movdqa xmm0, [init+16]
pshufd xmm5, xmm7, 0x55 ; xmm5 == b
pshufd xmm4, xmm7, 0xAA ; xmm4 == c
pshufd xmm3, xmm7, 0xFF ; xmm3 == d
pshufd xmm7, xmm7, 0 ; xmm7 == a
pshufd xmm8, xmm0, 0x55 ; xmm8 == f
pshufd xmm9, xmm0, 0xAA ; xmm9 == g
pshufd xmm10, xmm0, 0xFF ; xmm10 == h
pshufd xmm0, xmm0, 0 ; xmm0 == e

LAB_LOOP:

;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32(g_sha256_k[j]) + w[j]

%macro lab_loop_blk 0
; prefetchnta [rax*4]
movdqa xmm6, [data+rax*4]
paddd xmm6, g_4sha256_k[rax*4]
add rax, 4

paddd xmm6, xmm10 ; +h

movdqa xmm1, xmm0
; movdqa xmm2, xmm9 ; It's redundant unless xmm9 becomes a destination
movdqa xmm10, xmm9 ; h = g  Changed from xmm2 to xmm9
movdqa xmm9, xmm8 ; f
movdqa xmm2, xmm8 ; g = f xmm9 became a destination but not until xmm2 was already used and replaced

pand xmm2, xmm0 ; e & f
pandn xmm1, xmm10 ; ~e & g Changed from xmm2 to xmm9 (see above reason) then xmm10 to combine writes
pxor xmm1, xmm2 ; (e & f) ^ (~e & g)
paddd xmm6, xmm1 ; Ch + h + w[i] + k[i]

movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm8, xmm0 ; f = e Combining these three moves for processor hardware optimization
psrld xmm0, 6 ; The xmm2 from xmm0 movdqa used to be after this taking advantage of the r-rotate 6
psrld xmm2, 11 ; Changed from 5 to 11 after shoving the movdqa commands together
pslld xmm1, 7
pxor xmm0, xmm1
pxor xmm0, xmm2
pslld xmm1, 14
psrld xmm2, 14
pxor xmm0, xmm1
pxor xmm0, xmm2
pslld xmm1, 5
pxor xmm0, xmm1 ; Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)
paddd xmm6, xmm0 ; xmm6 = t1
paddd xmm3, xmm6 ; e = d+t1

movdqa xmm0, xmm3 ; d
movdqa xmm1, xmm5 ; =b
movdqa xmm2, xmm4 ; c
movdqa xmm3, xmm2 ; d = c
pand xmm2, xmm5 ; b & c
pand xmm4, xmm7 ; a & c
pand xmm1, xmm7 ; a & b
pxor xmm1, xmm4
pxor xmm1, xmm2 ; (a & b) ^ (a & c) ^ (b & c)
paddd xmm6, xmm1 ; t1 + ((a & b) ^ (a & c) ^ (b & c))

movdqa xmm4, xmm5 ; c = b
movdqa xmm5, xmm7 ; b = a
movdqa xmm2, xmm7
movdqa xmm1, xmm7
psrld xmm1, 13
psrld xmm7, 2
pslld xmm2, 10
pxor xmm7, xmm2
pxor xmm7, xmm1
pslld xmm2, 9
psrld xmm1, 9
pxor xmm7, xmm2
pxor xmm7, xmm1
pslld xmm2, 11
pxor xmm7, xmm2
paddd xmm7, xmm6 ; a = t1 + (Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)) + ((a & b) ^ (a & c) ^ (b & c));
%endmacro

%assign i 0
%rep    LAB_LOOP_UNROLL
        lab_loop_blk
%assign i i+1
%endrep

cmp rax, rcx
jb LAB_LOOP

; Finished the 64 rounds, calculate hash and save

movdqa xmm1, [rdx]
pshufd xmm2, xmm1, 0x55
paddd xmm5, xmm2
pshufd xmm6, xmm1, 0xAA
paddd xmm4, xmm6
pshufd xmm11, xmm1, 0xFF
paddd xmm3, xmm11
pshufd xmm1, xmm1, 0
paddd xmm7, xmm1

movdqa xmm1, [rdx+16]
pshufd xmm2, xmm1, 0x55
paddd xmm8, xmm2
pshufd xmm6, xmm1, 0xAA
paddd xmm9, xmm6
pshufd xmm11, xmm1, 0xFF
paddd xmm10, xmm11
pshufd xmm1, xmm1, 0
paddd xmm0, xmm1

movdqa [hash], xmm7
movdqa [hash+16], xmm5
movdqa [hash+32], xmm4
movdqa [hash+48], xmm3
movdqa [hash+64], xmm0
movdqa [hash+80], xmm8
movdqa [hash+96], xmm9
movdqa [hash+112], xmm10

LAB_RET:
pop rbx
ret