    Author Topic: OFFICIAL CGMINER mining software thread for linux/win/osx/mips/arm/r-pi 4.11.1  (Read 5806593 times)
    d3m0n1q_733rz
    Sr. Member
    ****
    Offline

    Activity: 378
    Merit: 250



    July 31, 2011, 06:54:14 AM
     #621

    Code:
    ;; SHA-256 for X86-64 for Linux, based off of:

    ; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
    ; Version 2011
    ; This software is Public Domain

    ; Significant re-write/optimisation and reordering by,
    ; Neil Kettle <mu-b@digit-labs.org>
    ; ~18% performance improvement

    ; SHA-256 CPU SSE cruncher for Bitcoin Miner

    ALIGN 32
    BITS 64

    %define hash rdi
    %define data rsi
    %define init rdx

    ; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
    %define LAB_CALC_PARA 2
    %define LAB_CALC_UNROLL 8

    %define LAB_LOOP_UNROLL 8

    extern g_4sha256_k

    global CalcSha256_x64_sse4
    ; CalcSha256 hash(rdi), data(rsi), init(rdx)
    CalcSha256_x64_sse4:

    push rbx

    LAB_NEXT_NONCE:

    mov rcx, 256 ; LAB_LOOP bound: 64 rounds, rax advances 4 dwords per round
    ; mov rax, 64 ; 64 - rax is where we expand to

    LAB_SHA:
    push rcx
    lea rcx, qword [data+1024] ; + 1024
    lea r11, qword [data+256] ; + 256

    LAB_CALC:
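    ; lab_calc_blk: message-schedule expansion, two W[] entries per invocation,
    ; four hashes wide (one 32-bit lane each):
    ;   W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16]
    ; where s0(x) = Rotr32(x, 7) ^ Rotr32(x, 18) ^ (x >> 3)
    ;       s1(x) = Rotr32(x, 17) ^ Rotr32(x, 19) ^ (x >> 10)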
    %macro lab_calc_blk 1

    movntdqa xmm0, [r11-(15-%1)*16] ; xmm0 = W[I-15]
    movntdqa xmm1, [r11-(15-%1)*16] ; xmm1 = W[I-15]
    movntdqa xmm2, [r11-(15-%1)*16] ; xmm2 = W[I-15]
    movntdqa xmm3, [r11-(2-%1)*16] ; xmm3 = W[I-2]
    movntdqa xmm4, [r11-(15-(%1+1))*16] ; xmm4 = W[I-15+1]
    movntdqa xmm5, [r11-(15-(%1+1))*16] ; xmm5 = W[I-15+1]
    movntdqa xmm6, [r11-(15-(%1+1))*16] ; xmm6 = W[I-15+1]
    movntdqa xmm7, [r11-(2-(%1+1))*16] ; xmm7 = W[I-2+1]

    ; movdqa xmm2, xmm0 ; xmm2 = W[I-15]
    ; movdqa xmm6, xmm4 ; xmm6 = W[I-15+1]

    psrld xmm0, 3 ; xmm0 = W[I-15] >> 3
    psrld xmm1, 7 ; xmm1 = W[I-15] >> 7 Moved and made it independent of xmm0
    psrld xmm4, 3 ; xmm4 = W[I-15+1] >> 3
    psrld xmm5, 7 ; xmm5 = W[I-15+1] >> 7
    pslld xmm2, 14 ; xmm2 = W[I-15] << 14

    ; movdqa xmm5, xmm4 ; xmm5 = W[I-15+1] >> 3
    ; movdqa xmm1, xmm0 ; xmm1 = W[I-15] >> 3

    pxor xmm4, xmm5 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7)
    pslld xmm6, 14 ; xmm6 = W[I-15+1] << 14

    pxor xmm0, xmm1 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7)
    pxor xmm0, xmm2 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14)
    psrld xmm1, 11 ; xmm1 = W[I-15] >> 18
    psrld xmm5, 11 ; xmm5 = W[I-15+1] >> 18
    pxor xmm4, xmm6 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14)
    pxor xmm4, xmm5 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18)
    pslld xmm2, 11 ; xmm2 = W[I-15] << 25
    pslld xmm6, 11 ; xmm6 = W[I-15+1] << 25
    pxor xmm4, xmm6 ; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18) ^ (W[I-15+1] << 25)
    pxor xmm0, xmm1 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18)
    pxor xmm0, xmm2 ; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18) ^ (W[I-15] << 25)
    paddd xmm0, [r11-(16-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16]
    paddd xmm4, [r11-(16-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1]


    ;;;;;;;;;;;;;;;;;;

    movdqa xmm2, xmm3 ; xmm2 = W[I-2]
    psrld xmm3, 10 ; xmm3 = W[I-2] >> 10
    movdqa xmm1, xmm3 ; xmm1 = W[I-2] >> 10
    movdqa xmm6, xmm7 ; xmm6 = W[I-2+1]
    psrld xmm7, 10 ; xmm7 = W[I-2+1] >> 10
    movdqa xmm5, xmm7 ; xmm5 = W[I-2+1] >> 10

    paddd xmm0, [r11-(7-%1)*16] ; xmm0 = s0(W[I-15]) + W[I-16] + W[I-7]
    paddd xmm4, [r11-(7-(%1+1))*16] ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1]

    pslld xmm2, 13 ; xmm2 = W[I-2] << 13
    pslld xmm6, 13 ; xmm6 = W[I-2+1] << 13
    psrld xmm1, 7 ; xmm1 = W[I-2] >> 17
    psrld xmm5, 7 ; xmm5 = W[I-2+1] >> 17



    pxor xmm3, xmm1 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17)
    psrld xmm1, 2 ; xmm1 = W[I-2] >> 19
    pxor xmm3, xmm2 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13)
    pslld xmm2, 2 ; xmm2 = W[I-2] << 15
    pxor xmm7, xmm5 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17)
    psrld xmm5, 2 ; xmm5 = W[I-2+1] >> 19
    pxor xmm7, xmm6 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13)
    pslld xmm6, 2 ; xmm6 = W[I-2+1] << 15



    pxor xmm3, xmm1 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19)
    pxor xmm3, xmm2 ; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19) ^ (W[I-2] << 15)
    paddd xmm0, xmm3 ; xmm0 = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
    pxor xmm7, xmm5 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19)
    pxor xmm7, xmm6 ; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19) ^ (W[I-2+1] << 15)
    paddd xmm4, xmm7 ; xmm4 = s0(W[I-15+1]) + W[I-16+1] + s1(W[I-2+1]) + W[I-7+1]

    movdqa [r11+(%1*16)], xmm0
    movdqa [r11+((%1+1)*16)], xmm4
    %endmacro

    %assign i 0
    %rep    LAB_CALC_UNROLL
            lab_calc_blk i
    %assign i i+LAB_CALC_PARA
    %endrep

    add r11, LAB_CALC_UNROLL*LAB_CALC_PARA*16
    cmp r11, rcx
    jb LAB_CALC

    pop rcx
    mov rax, 0 ; rax = dword index into w[]; each round consumes one 16-byte (4-dword) entry

    ; Load the init values into the working state registers (a..h).

    movntdqa xmm7, [init]
    movntdqa xmm0, [init+16]
    pshufd xmm5, xmm7, 0x55 ; xmm5 == b
    pshufd xmm4, xmm7, 0xAA ; xmm4 == c
    pshufd xmm3, xmm7, 0xFF ; xmm3 == d
    pshufd xmm7, xmm7, 0 ; xmm7 == a
    pshufd xmm8, xmm0, 0x55 ; xmm8 == f
    pshufd xmm9, xmm0, 0xAA ; xmm9 == g
    pshufd xmm10, xmm0, 0xFF ; xmm10 == h
    pshufd xmm0, xmm0, 0 ; xmm0 == e

    LAB_LOOP:

    ;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j]
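    ;; T t2 = (Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)) + ((a & b) ^ (a & c) ^ (b & c))
    ;; then h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2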

    %macro lab_loop_blk 0 ; Notice the macro! rax*4 isn't redundant here.
    movntdqa xmm6, [data+rax*4]
    paddd xmm6, g_4sha256_k[rax*4]
    add rax, 4

    paddd xmm6, xmm10 ; +h

    movdqa xmm1, xmm0
    ; movdqa xmm2, xmm9 ; It's redundant unless xmm9 becomes a destination
    movdqa xmm10, xmm9 ; h = g  Changed from xmm2 to xmm9
    pandn xmm1, xmm9 ; ~e & g Changed from xmm2 to xmm9

    movdqa xmm9, xmm8 ; g = f
    movdqa xmm2, xmm8 ; scratch copy of f for (e & f); xmm9 only becomes a destination after its old value (g) was consumed above

    pand xmm2, xmm0 ; e & f
    pxor xmm1, xmm2 ; (e & f) ^ (~e & g)
    paddd xmm6, xmm1 ; Ch + h + w[i] + k[i]

    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    movdqa xmm8, xmm0 ; f = e Combining these three moves for processor hardware optimization
    psrld xmm0, 6 ; The xmm2 from xmm0 move used to be after this taking advantage of the r-rotate 6
    psrld xmm2, 11 ; Changed from 5 to 11 after shoving the movdqa commands together
    pslld xmm1, 7 ; xmm1 = e << 7
    pxor xmm0, xmm1 ; xmm0 = (e >> 6) ^ (e << 7)
    pxor xmm0, xmm2 ; xmm0 ^= (e >> 11)
    pslld xmm1, 14 ; xmm1 = e << 21
    psrld xmm2, 14 ; xmm2 = e >> 25
    pxor xmm0, xmm1 ; xmm0 ^= (e << 21)
    pxor xmm0, xmm2 ; xmm0 ^= (e >> 25)
    pslld xmm1, 5 ; xmm1 = e << 26
    pxor xmm0, xmm1 ; Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)
    paddd xmm6, xmm0 ; xmm6 = t1
    paddd xmm3, xmm6 ; e = d+t1

    movdqa xmm0, xmm3 ; xmm0 = new e (d + t1)
    movdqa xmm1, xmm5 ; b
    movdqa xmm2, xmm4 ; c
    movdqa xmm3, xmm2 ; d = c
    pand xmm2, xmm5 ; b & c
    pand xmm4, xmm7 ; a & c
    pand xmm1, xmm7 ; a & b
    pxor xmm1, xmm4 ; (a & b) ^ (a & c)
    pxor xmm1, xmm2 ; (a & b) ^ (a & c) ^ (b & c)
    paddd xmm6, xmm1 ; t1 + Maj(a, b, c) = t1 + ((a & b) ^ (a & c) ^ (b & c))

    movdqa xmm4, xmm5 ; c = b
    movdqa xmm5, xmm7 ; b = a
    movdqa xmm2, xmm7 ; a
    movdqa xmm1, xmm7 ; a
    psrld xmm7, 2 ; xmm7 = a >> 2
    pslld xmm2, 10 ; xmm2 = a << 10
    psrld xmm1, 13 ; xmm1 = a >> 13
    pxor xmm7, xmm2 ; xmm7 = (a >> 2) ^ (a << 10)
    pxor xmm7, xmm1 ; xmm7 ^= (a >> 13)
    pslld xmm2, 9 ; xmm2 = a << 19
    psrld xmm1, 9 ; xmm1 = a >> 22
    pxor xmm7, xmm2 ; xmm7 ^= (a << 19)
    pxor xmm7, xmm1 ; xmm7 ^= (a >> 22)
    pslld xmm2, 11 ; xmm2 = a << 30
    pxor xmm7, xmm2 ; xmm7 = Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)
    paddd xmm7, xmm6 ; a = t1 + (Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)) + ((a & b) ^ (a & c) ^ (b & c))
    %endmacro

    %assign i 0
    %rep    LAB_LOOP_UNROLL
            lab_loop_blk
    %assign i i+1
    %endrep

    cmp rax, rcx
    jb LAB_LOOP

    ; Finished the 64 rounds, calculate hash and save
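    ; Each register holds one state word (a..h) across four 32-bit lanes; add
    ; the matching init word and store the eight 16-byte vectors to the 128-byte hash output.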

    movntdqa xmm1, [rdx]
    pshufd xmm2, xmm1, 0x55
    paddd xmm5, xmm2
    pshufd xmm6, xmm1, 0xAA
    paddd xmm4, xmm6
    pshufd xmm11, xmm1, 0xFF
    paddd xmm3, xmm11
    pshufd xmm1, xmm1, 0
    paddd xmm7, xmm1

    movntdqa xmm1, [rdx+16]
    pshufd xmm2, xmm1, 0x55
    paddd xmm8, xmm2
    pshufd xmm6, xmm1, 0xAA
    paddd xmm9, xmm6
    pshufd xmm11, xmm1, 0xFF
    paddd xmm10, xmm11
    pshufd xmm1, xmm1, 0
    paddd xmm0, xmm1

    movdqa [hash], xmm7
    movdqa [hash+16], xmm5
    movdqa [hash+32], xmm4
    movdqa [hash+48], xmm3
    movdqa [hash+64], xmm0
    movdqa [hash+80], xmm8
    movdqa [hash+96], xmm9
    movdqa [hash+112], xmm10

    LAB_RET:
    pop rbx
    ret
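
    For reference, here's what each 32-bit lane of the code above is computing, written out as plain C straight from the SHA-256 spec (helper names are mine, this isn't cgminer's C path). The SSE version just runs four of these lanes at once, one per dword of each xmm register:
    Code:
    #include <stdint.h>

    static uint32_t rotr(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

    static uint32_t s0(uint32_t x) { return rotr(x, 7) ^ rotr(x, 18) ^ (x >> 3); }
    static uint32_t s1(uint32_t x) { return rotr(x, 17) ^ rotr(x, 19) ^ (x >> 10); }

    /* Message schedule: w[16..63] from w[0..15] (what LAB_CALC does, 4 lanes at a time). */
    static void sha256_expand(uint32_t w[64])
    {
        for (int i = 16; i < 64; i++)
            w[i] = s1(w[i - 2]) + w[i - 7] + s0(w[i - 15]) + w[i - 16];
    }

    /* One compression round (what one lab_loop_blk pass does, 4 lanes at a time). */
    static void sha256_round(uint32_t st[8], uint32_t k, uint32_t w)
    {
        uint32_t a = st[0], b = st[1], c = st[2], d = st[3];
        uint32_t e = st[4], f = st[5], g = st[6], h = st[7];

        uint32_t t1 = h + (rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25))
                        + ((e & f) ^ (~e & g)) + k + w;
        uint32_t t2 = (rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22))
                        + ((a & b) ^ (a & c) ^ (b & c));

        st[7] = g; st[6] = f; st[5] = e; st[4] = d + t1;
        st[3] = c; st[2] = b; st[1] = a; st[0] = t1 + t2;
    }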

    SSE4 so far.  I'm taking a break to watch anime.   Cheesy
    The changes take advantage of write-combining hardware. If you have it, great; if you don't, you won't notice much of a change. You probably won't notice much anyway, since the basic code structure is the same. Eh, oh well.
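    On the write-combining note: the movntdqa loads above are what you'd get from the SSE4.1 streaming-load intrinsic in C. Rough sketch of the two load flavours (intrinsics only, helper names are mine, not cgminer code); both need 16-byte-aligned pointers, and on normal write-back memory most CPUs treat the streaming load much like a plain movdqa load, which is why any gain is hardware-dependent:
    Code:
    #include <smmintrin.h>   /* SSE4.1: _mm_stream_load_si128 is the movntdqa intrinsic */

    /* Hypothetical helpers, just to show the contrast. */
    static __m128i load_cached(const void *p)
    {
        return _mm_load_si128((const __m128i *)p);   /* movdqa: normal cached load */
    }

    static __m128i load_streaming(void *p)
    {
        return _mm_stream_load_si128((__m128i *)p);  /* movntdqa: non-temporal hint */
    }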
    Edit:  Slight slow-down in the lab-loop.  I'll copy-paste the old code back in to fix it later.  O_O  Bleach is on!

    Funroll_Loops, the theoretically quicker breakfast cereal!
    Check out http://www.facebook.com/JupiterICT for all of your computing needs.  If you need it, we can get it.  We have solutions for your computing conundrums.  BTC accepted!  12HWUSguWXRCQKfkPeJygVR1ex5wbg3hAq