- ### Generated by hash_md5_sha_x86-64.S.sh ###
- #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
- #ifdef __linux__
- .section .note.GNU-stack, "", @progbits
- #endif
- .section .text.sha1_process_block64, "ax", @progbits
- .globl sha1_process_block64
- .hidden sha1_process_block64
- .type sha1_process_block64, @function
- .balign 8 # allow decoders to fetch at least the first 5 insns
- sha1_process_block64:
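- # The only argument, the context pointer, arrives in %rdi (x86-64 System V
- # ABI). The 64-byte message block is read from offset 0 of the context,
- # the five 32-bit hash words from offsets 80..96.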
- pushq %rbp # 1 byte insn
- pushq %rbx # 1 byte insn
- # pushq %r15 # 2 byte insn
- pushq %r14 # 2 byte insn
- pushq %r13 # 2 byte insn
- pushq %r12 # 2 byte insn
- pushq %rdi # we need ctx at the end
- # Register and stack use:
- # eax..edx: a..d
- # ebp: e
- # esi,edi,r8..r14: temps
- # r15: unused
- # xmm0..xmm3: W[]
- # xmm4,xmm5: temps
- # xmm6: current round constant
- # xmm7: all round constants
- # -64(%rsp): area for passing RCONST + W[] from vector to integer units
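- # All 80 rounds are fully unrolled below. Roughly, each round does
- # (C-style sketch; F() and the constant change every 20 rounds):
- #   e += rotl32(a,5) + F(b,c,d) + RCONST + W[n];
- #   b = rotl32(b,30);
- #   (a,b,c,d,e) <- (e,a,b,c,d)  # done by renaming registers, not moving data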
- movl 80(%rdi), %eax # a = ctx->hash[0]
- movl 84(%rdi), %ebx # b = ctx->hash[1]
- movl 88(%rdi), %ecx # c = ctx->hash[2]
- movl 92(%rdi), %edx # d = ctx->hash[3]
- movl 96(%rdi), %ebp # e = ctx->hash[4]
- movaps sha1const(%rip), %xmm7
- pshufd $0x00, %xmm7, %xmm6 # xmm6 = K0 = 0x5A827999 (rounds 0..19)
- # Load W[] to xmm0..3, byteswapping on the fly.
- #
- # For iterations 0..15, we pass W[] in rsi,r8..r14
- # for use in RD1As instead of spilling them to stack.
- # We lose parallelized addition of RCONST, but LEA
- # can do two additions at once, so it is probably a wash.
- # (We use rsi instead of rN because this makes two
- # LEAs in the first two RD1As shorter by one byte).
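- # Each movq below picks up two consecutive message words; bswapq converts
- # both big-endian words to host order at once, and rolq $32 swaps the
- # register halves so the even-numbered word lands in the low dword.
- # The pairs are then packed into xmm0..xmm3 for the W[] schedule.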
- movq 4*0(%rdi), %rsi
- movq 4*2(%rdi), %r8
- bswapq %rsi
- bswapq %r8
- rolq $32, %rsi # rsi = W[1]:W[0]
- rolq $32, %r8 # r8 = W[3]:W[2]
- movq %rsi, %xmm0
- movq %r8, %xmm4
- punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
- # movaps %xmm0, %xmm4 # add RCONST, spill to stack
- # paddd %xmm6, %xmm4
- # movups %xmm4, -64+16*0(%rsp)
- movq 4*4(%rdi), %r9
- movq 4*6(%rdi), %r10
- bswapq %r9
- bswapq %r10
- rolq $32, %r9 # r9 = W[5]:W[4]
- rolq $32, %r10 # r10 = W[7]:W[6]
- movq %r9, %xmm1
- movq %r10, %xmm4
- punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
- movq 4*8(%rdi), %r11
- movq 4*10(%rdi), %r12
- bswapq %r11
- bswapq %r12
- rolq $32, %r11 # r11 = W[9]:W[8]
- rolq $32, %r12 # r12 = W[11]:W[10]
- movq %r11, %xmm2
- movq %r12, %xmm4
- punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
- movq 4*12(%rdi), %r13
- movq 4*14(%rdi), %r14
- bswapq %r13
- bswapq %r14
- rolq $32, %r13 # r13 = W[13]:W[12]
- rolq $32, %r14 # r14 = W[15]:W[14]
- movq %r13, %xmm3
- movq %r14, %xmm4
- punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
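- # Rounds 0..19 use F1(b,c,d) = (b & c) | (~b & d), computed branch-free
- # as ((c ^ d) & b) ^ d: it selects c where b is 1 and d where b is 0,
- # without needing a separate NOT.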
- # 0
- leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
- shrq $32, %rsi
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- andl %ebx, %edi # &b
- xorl %edx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
- movl %eax, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
- # 1
- leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- andl %eax, %edi # &b
- xorl %ecx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %edx # e += (((c ^ d) & b) ^ d)
- movl %ebp, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- # 2
- leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n]
- shrq $32, %r8
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- andl %ebp, %edi # &b
- xorl %ebx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
- movl %edx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
- # 3
- leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n]
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- andl %edx, %edi # &b
- xorl %eax, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
- movl %ecx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
- # 4
- leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n]
- shrq $32, %r9
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- andl %ecx, %edi # &b
- xorl %ebp, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %eax # e += (((c ^ d) & b) ^ d)
- movl %ebx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
- # 5
- leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n]
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- andl %ebx, %edi # &b
- xorl %edx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
- movl %eax, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
- # 6
- leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n]
- shrq $32, %r10
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- andl %eax, %edi # &b
- xorl %ecx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %edx # e += (((c ^ d) & b) ^ d)
- movl %ebp, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- # 7
- leal 0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n]
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- andl %ebp, %edi # &b
- xorl %ebx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
- movl %edx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
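- # Each PREP block computes four message-schedule words at once:
- #   W[t] = rotl32(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1)
- # Lane 3 also depends on the just-computed lane 0 (W[t+3] needs W[t]),
- # so it is patched afterwards ("W[3] fixup") by XORing in
- # rotl32(unrotW[0],2) = rotl32(W[t],1). RCONST is pre-added and the
- # result spilled to the stack, interleaving vector work with the
- # scalar rounds.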
- # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
- movaps %xmm3, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
- # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
- # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
- # same result as above, but shorter and faster:
- # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
- # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm0, %xmm5
- shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm0 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm0, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm0, %xmm0 # shift left by 1
- psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
- # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm0, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*0(%rsp)
- # 8
- leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n]
- shrq $32, %r11
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- andl %edx, %edi # &b
- xorl %eax, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
- movl %ecx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
- # 9
- leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n]
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- andl %ecx, %edi # &b
- xorl %ebp, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %eax # e += (((c ^ d) & b) ^ d)
- movl %ebx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
- # 10
- leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n]
- shrq $32, %r12
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- andl %ebx, %edi # &b
- xorl %edx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
- movl %eax, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
- # 11
- leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n]
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- andl %eax, %edi # &b
- xorl %ecx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %edx # e += (((c ^ d) & b) ^ d)
- movl %ebp, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- pshufd $0x55, %xmm7, %xmm6 # xmm6 = K1 = 0x6ED9EBA1 (rounds 20..39)
- # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
- movaps %xmm0, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
- # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
- # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
- # same result as above, but shorter and faster:
- # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
- # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm1, %xmm5
- shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm1 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm1, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm1, %xmm1 # shift left by 1
- psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
- # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm1, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*1(%rsp)
- # 12
- leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n]
- shrq $32, %r13
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- andl %ebp, %edi # &b
- xorl %ebx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
- movl %edx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
- # 13
- leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n]
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- andl %edx, %edi # &b
- xorl %eax, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
- movl %ecx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
- # 14
- leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n]
- shrq $32, %r14
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- andl %ecx, %edi # &b
- xorl %ebp, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %eax # e += (((c ^ d) & b) ^ d)
- movl %ebx, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
- # 15
- leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n]
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- andl %ebx, %edi # &b
- xorl %edx, %edi # (((c ^ d) & b) ^ d)
- addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
- movl %eax, %edi #
- roll $5, %edi # rotl32(a,5)
- addl %edi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
- # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
- movaps %xmm1, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
- # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
- # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
- # same result as above, but shorter and faster:
- # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
- # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm2, %xmm5
- shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm2 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm2, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm2, %xmm2 # shift left by 1
- psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
- # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm2, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*2(%rsp)
- # 16
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- andl %eax, %edi # &b
- xorl %ecx, %edi # (((c ^ d) & b) ^ d)
- addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (((c ^ d) & b) ^ d)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- # 17
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- andl %ebp, %edi # &b
- xorl %ebx, %edi # (((c ^ d) & b) ^ d)
- addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
- # 18
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- andl %edx, %edi # &b
- xorl %eax, %edi # (((c ^ d) & b) ^ d)
- addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
- # 19
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- andl %ecx, %edi # &b
- xorl %ebp, %edi # (((c ^ d) & b) ^ d)
- addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (((c ^ d) & b) ^ d)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
- # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
- movaps %xmm2, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
- # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
- # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
- # same result as above, but shorter and faster:
- # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
- # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm3, %xmm5
- shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm3 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm3, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm3, %xmm3 # shift left by 1
- psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
- # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm3, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*3(%rsp)
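- # Rounds 20..39 use the parity function F2(b,c,d) = b ^ c ^ d. The round
- # constant K1 = 0x6ED9EBA1 is not added explicitly here: it is folded
- # into the W[] values as they are spilled to the stack.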
- # 20
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- xorl %ebx, %edi # ^b
- addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15]
- addl %edi, %ebp # e += (c ^ d ^ b)
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
- # 21
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- xorl %eax, %edi # ^b
- addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (c ^ d ^ b)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- # 22
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- xorl %ebp, %edi # ^b
- addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (c ^ d ^ b)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
- # 23
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- xorl %edx, %edi # ^b
- addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (c ^ d ^ b)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
- # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
- movaps %xmm3, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
- # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
- # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
- # same result as above, but shorter and faster:
- # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
- # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm0, %xmm5
- shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm0 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm0, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm0, %xmm0 # shift left by 1
- psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
- # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm0, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*0(%rsp)
- # 24
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- xorl %ecx, %edi # ^b
- addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (c ^ d ^ b)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
- # 25
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- xorl %ebx, %edi # ^b
- addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15]
- addl %edi, %ebp # e += (c ^ d ^ b)
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
- # 26
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- xorl %eax, %edi # ^b
- addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (c ^ d ^ b)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- # 27
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- xorl %ebp, %edi # ^b
- addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (c ^ d ^ b)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
- # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
- movaps %xmm0, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
- # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
- # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
- # same result as above, but shorter and faster:
- # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
- # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm1, %xmm5
- shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm1 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm1, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm1, %xmm1 # shift left by 1
- psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
- # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm1, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*1(%rsp)
- # 28
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- xorl %edx, %edi # ^b
- addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (c ^ d ^ b)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
- # 29
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- xorl %ecx, %edi # ^b
- addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (c ^ d ^ b)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
- # 30
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- xorl %ebx, %edi # ^b
- addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15]
- addl %edi, %ebp # e += (c ^ d ^ b)
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
- # 31
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- xorl %eax, %edi # ^b
- addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (c ^ d ^ b)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- pshufd $0xaa, %xmm7, %xmm6 # xmm6 = K2 = 0x8F1BBCDC (rounds 40..59)
- # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
- movaps %xmm1, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
- # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
- # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
- # same result as above, but shorter and faster:
- # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
- # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm2, %xmm5
- shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm2 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm2, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm2, %xmm2 # shift left by 1
- psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
- # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm2, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*2(%rsp)
- # 32
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- xorl %ebp, %edi # ^b
- addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (c ^ d ^ b)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
- # 33
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- xorl %edx, %edi # ^b
- addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (c ^ d ^ b)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
- # 34
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- xorl %ecx, %edi # ^b
- addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (c ^ d ^ b)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
- # 35
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- xorl %ebx, %edi # ^b
- addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15]
- addl %edi, %ebp # e += (c ^ d ^ b)
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
- # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
- movaps %xmm2, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
- # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
- # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
- # same result as above, but shorter and faster:
- # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
- # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm3, %xmm5
- shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm3 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm3, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm3, %xmm3 # shift left by 1
- psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
- # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm3, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*3(%rsp)
- # 36
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- xorl %eax, %edi # ^b
- addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (c ^ d ^ b)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- # 37
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- xorl %ebp, %edi # ^b
- addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (c ^ d ^ b)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
- # 38
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- xorl %edx, %edi # ^b
- addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (c ^ d ^ b)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
- # 39
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- xorl %ecx, %edi # ^b
- addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (c ^ d ^ b)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
- # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
- movaps %xmm3, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
- # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
- # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
- # same result as above, but shorter and faster:
- # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
- # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm0, %xmm5
- shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm0 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm0, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm0, %xmm0 # shift left by 1
- psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
- # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm0, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*0(%rsp)
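- # Rounds 40..59 use the majority function MAJ(b,c,d) = (b&c)|(b&d)|(c&d),
- # computed below as ((b | c) & d) | (b & c), with K2 = 0x8F1BBCDC
- # pre-added into the spilled W[] values.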
- # 40
- movl %ebx, %edi # di: b
- movl %ebx, %esi # si: b
- orl %ecx, %edi # di: b | c
- andl %ecx, %esi # si: b & c
- andl %edx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ebp # += ((b | c) & d) | (b & c)
- addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15]
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
- # 41
- movl %eax, %edi # di: b
- movl %eax, %esi # si: b
- orl %ebx, %edi # di: b | c
- andl %ebx, %esi # si: b & c
- andl %ecx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %edx # += ((b | c) & d) | (b & c)
- addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15]
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- # 42
- movl %ebp, %edi # di: b
- movl %ebp, %esi # si: b
- orl %eax, %edi # di: b | c
- andl %eax, %esi # si: b & c
- andl %ebx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ecx # += ((b | c) & d) | (b & c)
- addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15]
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
- # 43
- movl %edx, %edi # di: b
- movl %edx, %esi # si: b
- orl %ebp, %edi # di: b | c
- andl %ebp, %esi # si: b & c
- andl %eax, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ebx # += ((b | c) & d) | (b & c)
- addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15]
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
- # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
- movaps %xmm0, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
- # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
- # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
- # same result as above, but shorter and faster:
- # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
- # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm1, %xmm5
- shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm1 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm1, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm1, %xmm1 # shift left by 1
- psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
- # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm1, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*1(%rsp)
- # 44
- movl %ecx, %edi # di: b
- movl %ecx, %esi # si: b
- orl %edx, %edi # di: b | c
- andl %edx, %esi # si: b & c
- andl %ebp, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %eax # += ((b | c) & d) | (b & c)
- addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15]
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
- # 45
- movl %ebx, %edi # di: b
- movl %ebx, %esi # si: b
- orl %ecx, %edi # di: b | c
- andl %ecx, %esi # si: b & c
- andl %edx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ebp # += ((b | c) & d) | (b & c)
- addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15]
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
- # 46
- movl %eax, %edi # di: b
- movl %eax, %esi # si: b
- orl %ebx, %edi # di: b | c
- andl %ebx, %esi # si: b & c
- andl %ecx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %edx # += ((b | c) & d) | (b & c)
- addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15]
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- # 47
- movl %ebp, %edi # di: b
- movl %ebp, %esi # si: b
- orl %eax, %edi # di: b | c
- andl %eax, %esi # si: b & c
- andl %ebx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ecx # += ((b | c) & d) | (b & c)
- addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15]
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
- # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
- movaps %xmm1, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
- # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
- # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
- # same result as above, but shorter and faster:
- # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
- # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm2, %xmm5
- shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm2 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm2, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm2, %xmm2 # shift left by 1
- psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
- # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm2, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*2(%rsp)
- # 48
- movl %edx, %edi # di: b
- movl %edx, %esi # si: b
- orl %ebp, %edi # di: b | c
- andl %ebp, %esi # si: b & c
- andl %eax, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ebx # += ((b | c) & d) | (b & c)
- addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15]
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
- # 49
- movl %ecx, %edi # di: b
- movl %ecx, %esi # si: b
- orl %edx, %edi # di: b | c
- andl %edx, %esi # si: b & c
- andl %ebp, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %eax # += ((b | c) & d) | (b & c)
- addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15]
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
- # 50
- movl %ebx, %edi # di: b
- movl %ebx, %esi # si: b
- orl %ecx, %edi # di: b | c
- andl %ecx, %esi # si: b & c
- andl %edx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ebp # += ((b | c) & d) | (b & c)
- addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15]
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
- # 51
- movl %eax, %edi # di: b
- movl %eax, %esi # si: b
- orl %ebx, %edi # di: b | c
- andl %ebx, %esi # si: b & c
- andl %ecx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %edx # += ((b | c) & d) | (b & c)
- addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15]
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- pshufd $0xff, %xmm7, %xmm6 # xmm6 = K3 = 0xCA62C1D6 (rounds 60..79)
- # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
- movaps %xmm2, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
- # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
- # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
- # same result as above, but shorter and faster:
- # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
- # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm3, %xmm5
- shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm3 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm3, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm3, %xmm3 # shift left by 1
- psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
- # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm3, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*3(%rsp)
- # 52
- movl %ebp, %edi # di: b
- movl %ebp, %esi # si: b
- orl %eax, %edi # di: b | c
- andl %eax, %esi # si: b & c
- andl %ebx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ecx # += ((b | c) & d) | (b & c)
- addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15]
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
- # 53
- movl %edx, %edi # di: b
- movl %edx, %esi # si: b
- orl %ebp, %edi # di: b | c
- andl %ebp, %esi # si: b & c
- andl %eax, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ebx # += ((b | c) & d) | (b & c)
- addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15]
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
- # 54
- movl %ecx, %edi # di: b
- movl %ecx, %esi # si: b
- orl %edx, %edi # di: b | c
- andl %edx, %esi # si: b & c
- andl %ebp, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %eax # += ((b | c) & d) | (b & c)
- addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15]
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
- # 55
- movl %ebx, %edi # di: b
- movl %ebx, %esi # si: b
- orl %ecx, %edi # di: b | c
- andl %ecx, %esi # si: b & c
- andl %edx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ebp # += ((b | c) & d) | (b & c)
- addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15]
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
- # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
- movaps %xmm3, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
- # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
- # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
- # same result as above, but shorter and faster:
- # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
- # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm0, %xmm5
- shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm0 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm0, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm0, %xmm0 # shift left by 1
- psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
- # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm0, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*0(%rsp)
- # 56
- movl %eax, %edi # di: b
- movl %eax, %esi # si: b
- orl %ebx, %edi # di: b | c
- andl %ebx, %esi # si: b & c
- andl %ecx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %edx # += ((b | c) & d) | (b & c)
- addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15]
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- # 57
- movl %ebp, %edi # di: b
- movl %ebp, %esi # si: b
- orl %eax, %edi # di: b | c
- andl %eax, %esi # si: b & c
- andl %ebx, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ecx # += ((b | c) & d) | (b & c)
- addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15]
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
- # 58
- movl %edx, %edi # di: b
- movl %edx, %esi # si: b
- orl %ebp, %edi # di: b | c
- andl %ebp, %esi # si: b & c
- andl %eax, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %ebx # += ((b | c) & d) | (b & c)
- addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15]
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
- # 59
- movl %ecx, %edi # di: b
- movl %ecx, %esi # si: b
- orl %edx, %edi # di: b | c
- andl %edx, %esi # si: b & c
- andl %ebp, %edi # di: (b | c) & d
- orl %esi, %edi # ((b | c) & d) | (b & c)
- addl %edi, %eax # += ((b | c) & d) | (b & c)
- addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15]
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
- # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
- movaps %xmm0, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
- # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
- # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
- # same result as above, but shorter and faster:
- # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
- # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm1, %xmm5
- shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm1 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm1, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm1, %xmm1 # shift left by 1
- psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
- # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm1, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*1(%rsp)
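- # Rounds 60..79 return to the parity function b ^ c ^ d, now with
- # K3 = 0xCA62C1D6 pre-added into the spilled W[] values.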
- # 60
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- xorl %ebx, %edi # ^b
- addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15]
- addl %edi, %ebp # e += (c ^ d ^ b)
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
- # 61
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- xorl %eax, %edi # ^b
- addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (c ^ d ^ b)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- # 62
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- xorl %ebp, %edi # ^b
- addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (c ^ d ^ b)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
- # 63
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- xorl %edx, %edi # ^b
- addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (c ^ d ^ b)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
- # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
- movaps %xmm1, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
- # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
- # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
- # same result as above, but shorter and faster:
- # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
- # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm2, %xmm5
- shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm2 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm2, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm2, %xmm2 # shift left by 1
- psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
- # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm2, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*2(%rsp)
- # 64
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- xorl %ecx, %edi # ^b
- addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (c ^ d ^ b)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
- # 65
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- xorl %ebx, %edi # ^b
- addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15]
- addl %edi, %ebp # e += (c ^ d ^ b)
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
- # 66
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- xorl %eax, %edi # ^b
- addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (c ^ d ^ b)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- # 67
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- xorl %ebp, %edi # ^b
- addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (c ^ d ^ b)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
- # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
- movaps %xmm2, %xmm4
- psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
- # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
- # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
- # same result as above, but shorter and faster:
- # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
- # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
- movaps %xmm3, %xmm5
- shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
- xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
- xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
- xorps %xmm5, %xmm3 # ^
- # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
- movaps %xmm3, %xmm5
- xorps %xmm4, %xmm4 # rol(W0,1):
- pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
- paddd %xmm3, %xmm3 # shift left by 1
- psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
- # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
- pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
- movaps %xmm5, %xmm4
- pslld $2, %xmm5
- psrld $30, %xmm4
- # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
- xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
- xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
- movaps %xmm3, %xmm5
- paddd %xmm6, %xmm5
- movups %xmm5, -64+16*3(%rsp)
- # 68
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- xorl %edx, %edi # ^b
- addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (c ^ d ^ b)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
- # 69
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- xorl %ecx, %edi # ^b
- addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (c ^ d ^ b)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
- # 70
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- xorl %ebx, %edi # ^b
- addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15]
- addl %edi, %ebp # e += (c ^ d ^ b)
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
- # 71
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- xorl %eax, %edi # ^b
- addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (c ^ d ^ b)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- # 72
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- xorl %ebp, %edi # ^b
- addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (c ^ d ^ b)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
- # 73
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- xorl %edx, %edi # ^b
- addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (c ^ d ^ b)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
- # 74
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- xorl %ecx, %edi # ^b
- addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (c ^ d ^ b)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
- # 75
- movl %ecx, %edi # c
- xorl %edx, %edi # ^d
- xorl %ebx, %edi # ^b
- addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15]
- addl %edi, %ebp # e += (c ^ d ^ b)
- movl %eax, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebp # e += rotl32(a,5)
- rorl $2, %ebx # b = rotl32(b,30)
- # 76
- movl %ebx, %edi # c
- xorl %ecx, %edi # ^d
- xorl %eax, %edi # ^b
- addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15]
- addl %edi, %edx # e += (c ^ d ^ b)
- movl %ebp, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %edx # e += rotl32(a,5)
- rorl $2, %eax # b = rotl32(b,30)
- # 77
- movl %eax, %edi # c
- xorl %ebx, %edi # ^d
- xorl %ebp, %edi # ^b
- addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15]
- addl %edi, %ecx # e += (c ^ d ^ b)
- movl %edx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ecx # e += rotl32(a,5)
- rorl $2, %ebp # b = rotl32(b,30)
- # 78
- movl %ebp, %edi # c
- xorl %eax, %edi # ^d
- xorl %edx, %edi # ^b
- addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15]
- addl %edi, %ebx # e += (c ^ d ^ b)
- movl %ecx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %ebx # e += rotl32(a,5)
- rorl $2, %edx # b = rotl32(b,30)
- # 79
- movl %edx, %edi # c
- xorl %ebp, %edi # ^d
- xorl %ecx, %edi # ^b
- addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15]
- addl %edi, %eax # e += (c ^ d ^ b)
- movl %ebx, %esi #
- roll $5, %esi # rotl32(a,5)
- addl %esi, %eax # e += rotl32(a,5)
- rorl $2, %ecx # b = rotl32(b,30)
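- # Epilogue: callee-saved registers are popped in reverse order of the
- # prologue pushes; the first pop restores %rdi (the ctx pointer saved at
- # entry) so a..e can be added back into ctx->hash[0..4].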
- popq %rdi #
- popq %r12 #
- addl %eax, 80(%rdi) # ctx->hash[0] += a
- popq %r13 #
- addl %ebx, 84(%rdi) # ctx->hash[1] += b
- popq %r14 #
- addl %ecx, 88(%rdi) # ctx->hash[2] += c
- # popq %r15 #
- addl %edx, 92(%rdi) # ctx->hash[3] += d
- popq %rbx #
- addl %ebp, 96(%rdi) # ctx->hash[4] += e
- popq %rbp #
- ret
- .size sha1_process_block64, .-sha1_process_block64
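- # The four SHA-1 round constants, one per group of 20 rounds; pshufd
- # broadcasts the selected dword from xmm7 into all lanes of xmm6.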
- .section .rodata.cst16.sha1const, "aM", @progbits, 16
- .balign 16
- sha1const:
- .long 0x5A827999
- .long 0x6ED9EBA1
- .long 0x8F1BBCDC
- .long 0xCA62C1D6
- #endif