hash_sha1_x86-64.S 52 KB


  1. ### Generated by hash_sha1_x86-64.S.sh ###
  # Build guard, section selection and symbol setup for sha1_process_block64.
  # The code following the guard is assembled only when CONFIG_SHA1_SMALL is 0
  # and the compiler is GCC-compatible and targeting x86-64.
  2. #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
  3. #ifdef __linux__
  4. .section .note.GNU-stack, "", @progbits # empty marker section: this object does not require an executable stack
  5. #endif
  6. .section .text.sha1_process_block64, "ax", @progbits # per-function text section; flags: a = allocatable, x = executable
  7. .globl sha1_process_block64 # make the symbol available to other object files at link time
  8. .hidden sha1_process_block64 # ELF hidden visibility: not exported outside the linked component
  9. .type sha1_process_block64, @function # record the symbol as a function in the ELF symbol table
  10. .balign 8 # allow decoders to fetch at least 5 first insns (the first five pushes are 1+1+2+2+2 = 8 bytes)
  11. sha1_process_block64:
  12. pushq %rbp # 1 byte insn
  13. pushq %rbx # 1 byte insn
  14. # pushq %r15 # 2 byte insn
  15. pushq %r14 # 2 byte insn
  16. pushq %r13 # 2 byte insn
  17. pushq %r12 # 2 byte insn
  18. pushq %rdi # we need ctx at the end
  19. #Register and stack use:
  20. # eax..edx: a..d
  21. # ebp: e
  22. # esi,edi,r8..r14: temps
  23. # r15: unused
  24. # xmm0..xmm3: W[]
  25. # xmm4,xmm5: temps
  26. # xmm6: current round constant
  27. # xmm7: all round constants
  28. # -64(%rsp): area for passing RCONST + W[] from vector to integer units
  29. movl 80(%rdi), %eax # a = ctx->hash[0]
  30. movl 84(%rdi), %ebx # b = ctx->hash[1]
  31. movl 88(%rdi), %ecx # c = ctx->hash[2]
  32. movl 92(%rdi), %edx # d = ctx->hash[3]
  33. movl 96(%rdi), %ebp # e = ctx->hash[4]
  34. movaps sha1const(%rip), %xmm7
  35. pshufd $0x00, %xmm7, %xmm6
  36. # Load W[] to xmm0..3, byteswapping on the fly.
  37. #
  38. # For iterations 0..15, we pass W[] in rsi,r8..r14
  39. # for use in RD1As instead of spilling them to stack.
  40. # We lose parallelized addition of RCONST, but LEA
  41. # can do two additions at once, so it is probably a wash.
  42. # (We use rsi instead of rN because this makes two
  43. # LEAs in two first RD1As shorter by one byte).
  44. movq 4*0(%rdi), %rsi
  45. movq 4*2(%rdi), %r8
  46. bswapq %rsi
  47. bswapq %r8
  48. rolq $32, %rsi # rsi = W[1]:W[0]
  49. rolq $32, %r8 # r8 = W[3]:W[2]
  50. movq %rsi, %xmm0
  51. movq %r8, %xmm4
  52. punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
  53. # movaps %xmm0, %xmm4 # add RCONST, spill to stack
  54. # paddd %xmm6, %xmm4
  55. # movups %xmm4, -64+16*0(%rsp)
  56. movq 4*4(%rdi), %r9
  57. movq 4*6(%rdi), %r10
  58. bswapq %r9
  59. bswapq %r10
  60. rolq $32, %r9 # r9 = W[5]:W[4]
  61. rolq $32, %r10 # r10 = W[7]:W[6]
  62. movq %r9, %xmm1
  63. movq %r10, %xmm4
  64. punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
  65. movq 4*8(%rdi), %r11
  66. movq 4*10(%rdi), %r12
  67. bswapq %r11
  68. bswapq %r12
  69. rolq $32, %r11 # r11 = W[9]:W[8]
  70. rolq $32, %r12 # r12 = W[11]:W[10]
  71. movq %r11, %xmm2
  72. movq %r12, %xmm4
  73. punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
  74. movq 4*12(%rdi), %r13
  75. movq 4*14(%rdi), %r14
  76. bswapq %r13
  77. bswapq %r14
  78. rolq $32, %r13 # r13 = W[13]:W[12]
  79. rolq $32, %r14 # r14 = W[15]:W[14]
  80. movq %r13, %xmm3
  81. movq %r14, %xmm4
  82. punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
  83. # 0
  84. leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
  85. shrq $32, %rsi
  86. movl %ecx, %edi # c
  87. xorl %edx, %edi # ^d
  88. andl %ebx, %edi # &b
  89. xorl %edx, %edi # (((c ^ d) & b) ^ d)
  90. addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
  91. movl %eax, %edi #
  92. roll $5, %edi # rotl32(a,5)
  93. addl %edi, %ebp # e += rotl32(a,5)
  94. rorl $2, %ebx # b = rotl32(b,30)
  95. # 1
  96. leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
  97. movl %ebx, %edi # c
  98. xorl %ecx, %edi # ^d
  99. andl %eax, %edi # &b
  100. xorl %ecx, %edi # (((c ^ d) & b) ^ d)
  101. addl %edi, %edx # e += (((c ^ d) & b) ^ d)
  102. movl %ebp, %edi #
  103. roll $5, %edi # rotl32(a,5)
  104. addl %edi, %edx # e += rotl32(a,5)
  105. rorl $2, %eax # b = rotl32(b,30)
  106. # 2
  107. leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n]
  108. shrq $32, %r8
  109. movl %eax, %edi # c
  110. xorl %ebx, %edi # ^d
  111. andl %ebp, %edi # &b
  112. xorl %ebx, %edi # (((c ^ d) & b) ^ d)
  113. addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
  114. movl %edx, %edi #
  115. roll $5, %edi # rotl32(a,5)
  116. addl %edi, %ecx # e += rotl32(a,5)
  117. rorl $2, %ebp # b = rotl32(b,30)
  118. # 3
  119. leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n]
  120. movl %ebp, %edi # c
  121. xorl %eax, %edi # ^d
  122. andl %edx, %edi # &b
  123. xorl %eax, %edi # (((c ^ d) & b) ^ d)
  124. addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
  125. movl %ecx, %edi #
  126. roll $5, %edi # rotl32(a,5)
  127. addl %edi, %ebx # e += rotl32(a,5)
  128. rorl $2, %edx # b = rotl32(b,30)
  129. # 4
  130. leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n]
  131. shrq $32, %r9
  132. movl %edx, %edi # c
  133. xorl %ebp, %edi # ^d
  134. andl %ecx, %edi # &b
  135. xorl %ebp, %edi # (((c ^ d) & b) ^ d)
  136. addl %edi, %eax # e += (((c ^ d) & b) ^ d)
  137. movl %ebx, %edi #
  138. roll $5, %edi # rotl32(a,5)
  139. addl %edi, %eax # e += rotl32(a,5)
  140. rorl $2, %ecx # b = rotl32(b,30)
  141. # 5
  142. leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n]
  143. movl %ecx, %edi # c
  144. xorl %edx, %edi # ^d
  145. andl %ebx, %edi # &b
  146. xorl %edx, %edi # (((c ^ d) & b) ^ d)
  147. addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
  148. movl %eax, %edi #
  149. roll $5, %edi # rotl32(a,5)
  150. addl %edi, %ebp # e += rotl32(a,5)
  151. rorl $2, %ebx # b = rotl32(b,30)
  152. # 6
  153. leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n]
  154. shrq $32, %r10
  155. movl %ebx, %edi # c
  156. xorl %ecx, %edi # ^d
  157. andl %eax, %edi # &b
  158. xorl %ecx, %edi # (((c ^ d) & b) ^ d)
  159. addl %edi, %edx # e += (((c ^ d) & b) ^ d)
  160. movl %ebp, %edi #
  161. roll $5, %edi # rotl32(a,5)
  162. addl %edi, %edx # e += rotl32(a,5)
  163. rorl $2, %eax # b = rotl32(b,30)
  164. # 7
  165. leal 0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n]
  166. movl %eax, %edi # c
  167. xorl %ebx, %edi # ^d
  168. andl %ebp, %edi # &b
  169. xorl %ebx, %edi # (((c ^ d) & b) ^ d)
  170. addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
  171. movl %edx, %edi #
  172. roll $5, %edi # rotl32(a,5)
  173. addl %edi, %ecx # e += rotl32(a,5)
  174. rorl $2, %ebp # b = rotl32(b,30)
  175. # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
  176. movaps %xmm3, %xmm4
  177. psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
  178. # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
  179. # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
  180. # same result as above, but shorter and faster:
  181. # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
  182. # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
  183. movaps %xmm0, %xmm5
  184. shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
  185. xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
  186. xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
  187. xorps %xmm5, %xmm0 # ^
  188. # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
  189. movaps %xmm0, %xmm5
  190. xorps %xmm4, %xmm4 # rol(W0,1):
  191. pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
  192. paddd %xmm0, %xmm0 # shift left by 1
  193. psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
  194. # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
  195. pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
  196. movaps %xmm5, %xmm4
  197. pslld $2, %xmm5
  198. psrld $30, %xmm4
  199. # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
  200. xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
  201. xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
  202. movaps %xmm0, %xmm5
  203. paddd %xmm6, %xmm5
  204. movups %xmm5, -64+16*0(%rsp)
  205. # 8
  206. leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n]
  207. shrq $32, %r11
  208. movl %ebp, %edi # c
  209. xorl %eax, %edi # ^d
  210. andl %edx, %edi # &b
  211. xorl %eax, %edi # (((c ^ d) & b) ^ d)
  212. addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
  213. movl %ecx, %edi #
  214. roll $5, %edi # rotl32(a,5)
  215. addl %edi, %ebx # e += rotl32(a,5)
  216. rorl $2, %edx # b = rotl32(b,30)
  217. # 9
  218. leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n]
  219. movl %edx, %edi # c
  220. xorl %ebp, %edi # ^d
  221. andl %ecx, %edi # &b
  222. xorl %ebp, %edi # (((c ^ d) & b) ^ d)
  223. addl %edi, %eax # e += (((c ^ d) & b) ^ d)
  224. movl %ebx, %edi #
  225. roll $5, %edi # rotl32(a,5)
  226. addl %edi, %eax # e += rotl32(a,5)
  227. rorl $2, %ecx # b = rotl32(b,30)
  228. # 10
  229. leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n]
  230. shrq $32, %r12
  231. movl %ecx, %edi # c
  232. xorl %edx, %edi # ^d
  233. andl %ebx, %edi # &b
  234. xorl %edx, %edi # (((c ^ d) & b) ^ d)
  235. addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
  236. movl %eax, %edi #
  237. roll $5, %edi # rotl32(a,5)
  238. addl %edi, %ebp # e += rotl32(a,5)
  239. rorl $2, %ebx # b = rotl32(b,30)
  240. # 11
  241. leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n]
  242. movl %ebx, %edi # c
  243. xorl %ecx, %edi # ^d
  244. andl %eax, %edi # &b
  245. xorl %ecx, %edi # (((c ^ d) & b) ^ d)
  246. addl %edi, %edx # e += (((c ^ d) & b) ^ d)
  247. movl %ebp, %edi #
  248. roll $5, %edi # rotl32(a,5)
  249. addl %edi, %edx # e += rotl32(a,5)
  250. rorl $2, %eax # b = rotl32(b,30)
  251. pshufd $0x55, %xmm7, %xmm6
  252. # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
  253. movaps %xmm0, %xmm4
  254. psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
  255. # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
  256. # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
  257. # same result as above, but shorter and faster:
  258. # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
  259. # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
  260. movaps %xmm1, %xmm5
  261. shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
  262. xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
  263. xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
  264. xorps %xmm5, %xmm1 # ^
  265. # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
  266. movaps %xmm1, %xmm5
  267. xorps %xmm4, %xmm4 # rol(W0,1):
  268. pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
  269. paddd %xmm1, %xmm1 # shift left by 1
  270. psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
  271. # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
  272. pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
  273. movaps %xmm5, %xmm4
  274. pslld $2, %xmm5
  275. psrld $30, %xmm4
  276. # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
  277. xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
  278. xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
  279. movaps %xmm1, %xmm5
  280. paddd %xmm6, %xmm5
  281. movups %xmm5, -64+16*1(%rsp)
  282. # 12
  283. leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n]
  284. shrq $32, %r13
  285. movl %eax, %edi # c
  286. xorl %ebx, %edi # ^d
  287. andl %ebp, %edi # &b
  288. xorl %ebx, %edi # (((c ^ d) & b) ^ d)
  289. addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
  290. movl %edx, %edi #
  291. roll $5, %edi # rotl32(a,5)
  292. addl %edi, %ecx # e += rotl32(a,5)
  293. rorl $2, %ebp # b = rotl32(b,30)
  294. # 13
  295. leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n]
  296. movl %ebp, %edi # c
  297. xorl %eax, %edi # ^d
  298. andl %edx, %edi # &b
  299. xorl %eax, %edi # (((c ^ d) & b) ^ d)
  300. addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
  301. movl %ecx, %edi #
  302. roll $5, %edi # rotl32(a,5)
  303. addl %edi, %ebx # e += rotl32(a,5)
  304. rorl $2, %edx # b = rotl32(b,30)
  305. # 14
  306. leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n]
  307. shrq $32, %r14
  308. movl %edx, %edi # c
  309. xorl %ebp, %edi # ^d
  310. andl %ecx, %edi # &b
  311. xorl %ebp, %edi # (((c ^ d) & b) ^ d)
  312. addl %edi, %eax # e += (((c ^ d) & b) ^ d)
  313. movl %ebx, %edi #
  314. roll $5, %edi # rotl32(a,5)
  315. addl %edi, %eax # e += rotl32(a,5)
  316. rorl $2, %ecx # b = rotl32(b,30)
  317. # 15
  318. leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n]
  319. movl %ecx, %edi # c
  320. xorl %edx, %edi # ^d
  321. andl %ebx, %edi # &b
  322. xorl %edx, %edi # (((c ^ d) & b) ^ d)
  323. addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
  324. movl %eax, %edi #
  325. roll $5, %edi # rotl32(a,5)
  326. addl %edi, %ebp # e += rotl32(a,5)
  327. rorl $2, %ebx # b = rotl32(b,30)
  328. # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
  329. movaps %xmm1, %xmm4
  330. psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
  331. # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
  332. # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
  333. # same result as above, but shorter and faster:
  334. # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
  335. # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
  336. movaps %xmm2, %xmm5
  337. shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
  338. xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
  339. xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
  340. xorps %xmm5, %xmm2 # ^
  341. # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
  342. movaps %xmm2, %xmm5
  343. xorps %xmm4, %xmm4 # rol(W0,1):
  344. pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
  345. paddd %xmm2, %xmm2 # shift left by 1
  346. psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
  347. # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
  348. pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
  349. movaps %xmm5, %xmm4
  350. pslld $2, %xmm5
  351. psrld $30, %xmm4
  352. # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
  353. xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
  354. xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
  355. movaps %xmm2, %xmm5
  356. paddd %xmm6, %xmm5
  357. movups %xmm5, -64+16*2(%rsp)
  358. # 16
  359. movl %ebx, %edi # c
  360. xorl %ecx, %edi # ^d
  361. andl %eax, %edi # &b
  362. xorl %ecx, %edi # (((c ^ d) & b) ^ d)
  363. addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15]
  364. addl %edi, %edx # e += (((c ^ d) & b) ^ d)
  365. movl %ebp, %esi #
  366. roll $5, %esi # rotl32(a,5)
  367. addl %esi, %edx # e += rotl32(a,5)
  368. rorl $2, %eax # b = rotl32(b,30)
  369. # 17
  370. movl %eax, %edi # c
  371. xorl %ebx, %edi # ^d
  372. andl %ebp, %edi # &b
  373. xorl %ebx, %edi # (((c ^ d) & b) ^ d)
  374. addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15]
  375. addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
  376. movl %edx, %esi #
  377. roll $5, %esi # rotl32(a,5)
  378. addl %esi, %ecx # e += rotl32(a,5)
  379. rorl $2, %ebp # b = rotl32(b,30)
  380. # 18
  381. movl %ebp, %edi # c
  382. xorl %eax, %edi # ^d
  383. andl %edx, %edi # &b
  384. xorl %eax, %edi # (((c ^ d) & b) ^ d)
  385. addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15]
  386. addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
  387. movl %ecx, %esi #
  388. roll $5, %esi # rotl32(a,5)
  389. addl %esi, %ebx # e += rotl32(a,5)
  390. rorl $2, %edx # b = rotl32(b,30)
  391. # 19
  392. movl %edx, %edi # c
  393. xorl %ebp, %edi # ^d
  394. andl %ecx, %edi # &b
  395. xorl %ebp, %edi # (((c ^ d) & b) ^ d)
  396. addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15]
  397. addl %edi, %eax # e += (((c ^ d) & b) ^ d)
  398. movl %ebx, %esi #
  399. roll $5, %esi # rotl32(a,5)
  400. addl %esi, %eax # e += rotl32(a,5)
  401. rorl $2, %ecx # b = rotl32(b,30)
  402. # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
  403. movaps %xmm2, %xmm4
  404. psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
  405. # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
  406. # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
  407. # same result as above, but shorter and faster:
  408. # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
  409. # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
  410. movaps %xmm3, %xmm5
  411. shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
  412. xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
  413. xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
  414. xorps %xmm5, %xmm3 # ^
  415. # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
  416. movaps %xmm3, %xmm5
  417. xorps %xmm4, %xmm4 # rol(W0,1):
  418. pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
  419. paddd %xmm3, %xmm3 # shift left by 1
  420. psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
  421. # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
  422. pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
  423. movaps %xmm5, %xmm4
  424. pslld $2, %xmm5
  425. psrld $30, %xmm4
  426. # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
  427. xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
  428. xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
  429. movaps %xmm3, %xmm5
  430. paddd %xmm6, %xmm5
  431. movups %xmm5, -64+16*3(%rsp)
  432. # 20
  433. movl %ecx, %edi # c
  434. xorl %edx, %edi # ^d
  435. xorl %ebx, %edi # ^b
  436. addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15]
  437. addl %edi, %ebp # e += (c ^ d ^ b)
  438. movl %eax, %esi #
  439. roll $5, %esi # rotl32(a,5)
  440. addl %esi, %ebp # e += rotl32(a,5)
  441. rorl $2, %ebx # b = rotl32(b,30)
  442. # 21
  443. movl %ebx, %edi # c
  444. xorl %ecx, %edi # ^d
  445. xorl %eax, %edi # ^b
  446. addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15]
  447. addl %edi, %edx # e += (c ^ d ^ b)
  448. movl %ebp, %esi #
  449. roll $5, %esi # rotl32(a,5)
  450. addl %esi, %edx # e += rotl32(a,5)
  451. rorl $2, %eax # b = rotl32(b,30)
  452. # 22
  453. movl %eax, %edi # c
  454. xorl %ebx, %edi # ^d
  455. xorl %ebp, %edi # ^b
  456. addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15]
  457. addl %edi, %ecx # e += (c ^ d ^ b)
  458. movl %edx, %esi #
  459. roll $5, %esi # rotl32(a,5)
  460. addl %esi, %ecx # e += rotl32(a,5)
  461. rorl $2, %ebp # b = rotl32(b,30)
  462. # 23
  463. movl %ebp, %edi # c
  464. xorl %eax, %edi # ^d
  465. xorl %edx, %edi # ^b
  466. addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15]
  467. addl %edi, %ebx # e += (c ^ d ^ b)
  468. movl %ecx, %esi #
  469. roll $5, %esi # rotl32(a,5)
  470. addl %esi, %ebx # e += rotl32(a,5)
  471. rorl $2, %edx # b = rotl32(b,30)
  472. # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
  473. movaps %xmm3, %xmm4
  474. psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
  475. # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
  476. # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
  477. # same result as above, but shorter and faster:
  478. # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
  479. # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
  480. movaps %xmm0, %xmm5
  481. shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
  482. xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
  483. xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
  484. xorps %xmm5, %xmm0 # ^
  485. # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
  486. movaps %xmm0, %xmm5
  487. xorps %xmm4, %xmm4 # rol(W0,1):
  488. pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
  489. paddd %xmm0, %xmm0 # shift left by 1
  490. psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
  491. # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
  492. pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
  493. movaps %xmm5, %xmm4
  494. pslld $2, %xmm5
  495. psrld $30, %xmm4
  496. # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
  497. xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
  498. xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
  499. movaps %xmm0, %xmm5
  500. paddd %xmm6, %xmm5
  501. movups %xmm5, -64+16*0(%rsp)
  502. # 24
  503. movl %edx, %edi # c
  504. xorl %ebp, %edi # ^d
  505. xorl %ecx, %edi # ^b
  506. addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15]
  507. addl %edi, %eax # e += (c ^ d ^ b)
  508. movl %ebx, %esi #
  509. roll $5, %esi # rotl32(a,5)
  510. addl %esi, %eax # e += rotl32(a,5)
  511. rorl $2, %ecx # b = rotl32(b,30)
  512. # 25
  513. movl %ecx, %edi # c
  514. xorl %edx, %edi # ^d
  515. xorl %ebx, %edi # ^b
  516. addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15]
  517. addl %edi, %ebp # e += (c ^ d ^ b)
  518. movl %eax, %esi #
  519. roll $5, %esi # rotl32(a,5)
  520. addl %esi, %ebp # e += rotl32(a,5)
  521. rorl $2, %ebx # b = rotl32(b,30)
  522. # 26
  523. movl %ebx, %edi # c
  524. xorl %ecx, %edi # ^d
  525. xorl %eax, %edi # ^b
  526. addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15]
  527. addl %edi, %edx # e += (c ^ d ^ b)
  528. movl %ebp, %esi #
  529. roll $5, %esi # rotl32(a,5)
  530. addl %esi, %edx # e += rotl32(a,5)
  531. rorl $2, %eax # b = rotl32(b,30)
  532. # 27
  533. movl %eax, %edi # c
  534. xorl %ebx, %edi # ^d
  535. xorl %ebp, %edi # ^b
  536. addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15]
  537. addl %edi, %ecx # e += (c ^ d ^ b)
  538. movl %edx, %esi #
  539. roll $5, %esi # rotl32(a,5)
  540. addl %esi, %ecx # e += rotl32(a,5)
  541. rorl $2, %ebp # b = rotl32(b,30)
  542. # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
  543. movaps %xmm0, %xmm4
  544. psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
  545. # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
  546. # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
  547. # same result as above, but shorter and faster:
  548. # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
  549. # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
  550. movaps %xmm1, %xmm5
  551. shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
  552. xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
  553. xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
  554. xorps %xmm5, %xmm1 # ^
  555. # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
  556. movaps %xmm1, %xmm5
  557. xorps %xmm4, %xmm4 # rol(W0,1):
  558. pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
  559. paddd %xmm1, %xmm1 # shift left by 1
  560. psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
  561. # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
  562. pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
  563. movaps %xmm5, %xmm4
  564. pslld $2, %xmm5
  565. psrld $30, %xmm4
  566. # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
  567. xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
  568. xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
  569. movaps %xmm1, %xmm5
  570. paddd %xmm6, %xmm5
  571. movups %xmm5, -64+16*1(%rsp)
  572. # 28
  573. movl %ebp, %edi # c
  574. xorl %eax, %edi # ^d
  575. xorl %edx, %edi # ^b
  576. addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15]
  577. addl %edi, %ebx # e += (c ^ d ^ b)
  578. movl %ecx, %esi #
  579. roll $5, %esi # rotl32(a,5)
  580. addl %esi, %ebx # e += rotl32(a,5)
  581. rorl $2, %edx # b = rotl32(b,30)
  582. # 29
  583. movl %edx, %edi # c
  584. xorl %ebp, %edi # ^d
  585. xorl %ecx, %edi # ^b
  586. addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15]
  587. addl %edi, %eax # e += (c ^ d ^ b)
  588. movl %ebx, %esi #
  589. roll $5, %esi # rotl32(a,5)
  590. addl %esi, %eax # e += rotl32(a,5)
  591. rorl $2, %ecx # b = rotl32(b,30)
  592. # 30
  593. movl %ecx, %edi # c
  594. xorl %edx, %edi # ^d
  595. xorl %ebx, %edi # ^b
  596. addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15]
  597. addl %edi, %ebp # e += (c ^ d ^ b)
  598. movl %eax, %esi #
  599. roll $5, %esi # rotl32(a,5)
  600. addl %esi, %ebp # e += rotl32(a,5)
  601. rorl $2, %ebx # b = rotl32(b,30)
  602. # 31
  603. movl %ebx, %edi # c
  604. xorl %ecx, %edi # ^d
  605. xorl %eax, %edi # ^b
  606. addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15]
  607. addl %edi, %edx # e += (c ^ d ^ b)
  608. movl %ebp, %esi #
  609. roll $5, %esi # rotl32(a,5)
  610. addl %esi, %edx # e += rotl32(a,5)
  611. rorl $2, %eax # b = rotl32(b,30)
  612. pshufd $0xaa, %xmm7, %xmm6
  613. # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
  614. movaps %xmm1, %xmm4
  615. psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
  616. # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
  617. # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
  618. # same result as above, but shorter and faster:
  619. # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
  620. # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
  621. movaps %xmm2, %xmm5
  622. shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
  623. xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
  624. xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
  625. xorps %xmm5, %xmm2 # ^
  626. # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
  627. movaps %xmm2, %xmm5
  628. xorps %xmm4, %xmm4 # rol(W0,1):
  629. pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
  630. paddd %xmm2, %xmm2 # shift left by 1
  631. psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
  632. # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
  633. pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
  634. movaps %xmm5, %xmm4
  635. pslld $2, %xmm5
  636. psrld $30, %xmm4
  637. # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
  638. xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
  639. xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
  640. movaps %xmm2, %xmm5
  641. paddd %xmm6, %xmm5
  642. movups %xmm5, -64+16*2(%rsp)
  643. # 32
  644. movl %eax, %edi # c
  645. xorl %ebx, %edi # ^d
  646. xorl %ebp, %edi # ^b
  647. addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15]
  648. addl %edi, %ecx # e += (c ^ d ^ b)
  649. movl %edx, %esi #
  650. roll $5, %esi # rotl32(a,5)
  651. addl %esi, %ecx # e += rotl32(a,5)
  652. rorl $2, %ebp # b = rotl32(b,30)
  653. # 33
  654. movl %ebp, %edi # c
  655. xorl %eax, %edi # ^d
  656. xorl %edx, %edi # ^b
  657. addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15]
  658. addl %edi, %ebx # e += (c ^ d ^ b)
  659. movl %ecx, %esi #
  660. roll $5, %esi # rotl32(a,5)
  661. addl %esi, %ebx # e += rotl32(a,5)
  662. rorl $2, %edx # b = rotl32(b,30)
  663. # 34
  664. movl %edx, %edi # c
  665. xorl %ebp, %edi # ^d
  666. xorl %ecx, %edi # ^b
  667. addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15]
  668. addl %edi, %eax # e += (c ^ d ^ b)
  669. movl %ebx, %esi #
  670. roll $5, %esi # rotl32(a,5)
  671. addl %esi, %eax # e += rotl32(a,5)
  672. rorl $2, %ecx # b = rotl32(b,30)
  673. # 35
  674. movl %ecx, %edi # c
  675. xorl %edx, %edi # ^d
  676. xorl %ebx, %edi # ^b
  677. addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15]
  678. addl %edi, %ebp # e += (c ^ d ^ b)
  679. movl %eax, %esi #
  680. roll $5, %esi # rotl32(a,5)
  681. addl %esi, %ebp # e += rotl32(a,5)
  682. rorl $2, %ebx # b = rotl32(b,30)
  683. # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
  684. movaps %xmm2, %xmm4
  685. psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
  686. # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
  687. # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
  688. # same result as above, but shorter and faster:
  689. # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
  690. # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
  691. movaps %xmm3, %xmm5
  692. shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
  693. xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
  694. xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
  695. xorps %xmm5, %xmm3 # ^
  696. # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
  697. movaps %xmm3, %xmm5
  698. xorps %xmm4, %xmm4 # rol(W0,1):
  699. pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
  700. paddd %xmm3, %xmm3 # shift left by 1
  701. psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
  702. # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
  703. pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
  704. movaps %xmm5, %xmm4
  705. pslld $2, %xmm5
  706. psrld $30, %xmm4
  707. # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
  708. xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
  709. xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
  710. movaps %xmm3, %xmm5
  711. paddd %xmm6, %xmm5
  712. movups %xmm5, -64+16*3(%rsp)
  713. # 36
  714. movl %ebx, %edi # c
  715. xorl %ecx, %edi # ^d
  716. xorl %eax, %edi # ^b
  717. addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15]
  718. addl %edi, %edx # e += (c ^ d ^ b)
  719. movl %ebp, %esi #
  720. roll $5, %esi # rotl32(a,5)
  721. addl %esi, %edx # e += rotl32(a,5)
  722. rorl $2, %eax # b = rotl32(b,30)
  723. # 37
  724. movl %eax, %edi # c
  725. xorl %ebx, %edi # ^d
  726. xorl %ebp, %edi # ^b
  727. addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15]
  728. addl %edi, %ecx # e += (c ^ d ^ b)
  729. movl %edx, %esi #
  730. roll $5, %esi # rotl32(a,5)
  731. addl %esi, %ecx # e += rotl32(a,5)
  732. rorl $2, %ebp # b = rotl32(b,30)
  733. # 38
  734. movl %ebp, %edi # c
  735. xorl %eax, %edi # ^d
  736. xorl %edx, %edi # ^b
  737. addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15]
  738. addl %edi, %ebx # e += (c ^ d ^ b)
  739. movl %ecx, %esi #
  740. roll $5, %esi # rotl32(a,5)
  741. addl %esi, %ebx # e += rotl32(a,5)
  742. rorl $2, %edx # b = rotl32(b,30)
  743. # 39
  744. movl %edx, %edi # c
  745. xorl %ebp, %edi # ^d
  746. xorl %ecx, %edi # ^b
  747. addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15]
  748. addl %edi, %eax # e += (c ^ d ^ b)
  749. movl %ebx, %esi #
  750. roll $5, %esi # rotl32(a,5)
  751. addl %esi, %eax # e += rotl32(a,5)
  752. rorl $2, %ecx # b = rotl32(b,30)
  753. # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
  754. movaps %xmm3, %xmm4
  755. psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
  756. # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
  757. # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
  758. # same result as above, but shorter and faster:
  759. # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
  760. # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
  761. movaps %xmm0, %xmm5
  762. shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
  763. xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
  764. xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
  765. xorps %xmm5, %xmm0 # ^
  766. # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
  767. movaps %xmm0, %xmm5
  768. xorps %xmm4, %xmm4 # rol(W0,1):
  769. pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
  770. paddd %xmm0, %xmm0 # shift left by 1
  771. psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
  772. # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
  773. pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
  774. movaps %xmm5, %xmm4
  775. pslld $2, %xmm5
  776. psrld $30, %xmm4
  777. # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
  778. xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
  779. xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
  780. movaps %xmm0, %xmm5
  781. paddd %xmm6, %xmm5
  782. movups %xmm5, -64+16*0(%rsp)
  783. # 40
  784. movl %ebx, %edi # di: b
  785. movl %ebx, %esi # si: b
  786. orl %ecx, %edi # di: b | c
  787. andl %ecx, %esi # si: b & c
  788. andl %edx, %edi # di: (b | c) & d
  789. orl %esi, %edi # ((b | c) & d) | (b & c)
  790. addl %edi, %ebp # += ((b | c) & d) | (b & c)
  791. addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15]
  792. movl %eax, %esi #
  793. roll $5, %esi # rotl32(a,5)
  794. addl %esi, %ebp # e += rotl32(a,5)
  795. rorl $2, %ebx # b = rotl32(b,30)
  796. # 41
  797. movl %eax, %edi # di: b
  798. movl %eax, %esi # si: b
  799. orl %ebx, %edi # di: b | c
  800. andl %ebx, %esi # si: b & c
  801. andl %ecx, %edi # di: (b | c) & d
  802. orl %esi, %edi # ((b | c) & d) | (b & c)
  803. addl %edi, %edx # += ((b | c) & d) | (b & c)
  804. addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15]
  805. movl %ebp, %esi #
  806. roll $5, %esi # rotl32(a,5)
  807. addl %esi, %edx # e += rotl32(a,5)
  808. rorl $2, %eax # b = rotl32(b,30)
  809. # 42
  810. movl %ebp, %edi # di: b
  811. movl %ebp, %esi # si: b
  812. orl %eax, %edi # di: b | c
  813. andl %eax, %esi # si: b & c
  814. andl %ebx, %edi # di: (b | c) & d
  815. orl %esi, %edi # ((b | c) & d) | (b & c)
  816. addl %edi, %ecx # += ((b | c) & d) | (b & c)
  817. addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15]
  818. movl %edx, %esi #
  819. roll $5, %esi # rotl32(a,5)
  820. addl %esi, %ecx # e += rotl32(a,5)
  821. rorl $2, %ebp # b = rotl32(b,30)
  822. # 43
  823. movl %edx, %edi # di: b
  824. movl %edx, %esi # si: b
  825. orl %ebp, %edi # di: b | c
  826. andl %ebp, %esi # si: b & c
  827. andl %eax, %edi # di: (b | c) & d
  828. orl %esi, %edi # ((b | c) & d) | (b & c)
  829. addl %edi, %ebx # += ((b | c) & d) | (b & c)
  830. addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15]
  831. movl %ecx, %esi #
  832. roll $5, %esi # rotl32(a,5)
  833. addl %esi, %ebx # e += rotl32(a,5)
  834. rorl $2, %edx # b = rotl32(b,30)
  835. # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
  836. movaps %xmm0, %xmm4
  837. psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
  838. # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
  839. # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
  840. # same result as above, but shorter and faster:
  841. # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
  842. # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
  843. movaps %xmm1, %xmm5
  844. shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
  845. xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
  846. xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
  847. xorps %xmm5, %xmm1 # ^
  848. # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
  849. movaps %xmm1, %xmm5
  850. xorps %xmm4, %xmm4 # rol(W0,1):
  851. pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
  852. paddd %xmm1, %xmm1 # shift left by 1
  853. psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
  854. # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
  855. pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
  856. movaps %xmm5, %xmm4
  857. pslld $2, %xmm5
  858. psrld $30, %xmm4
  859. # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
  860. xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
  861. xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
  862. movaps %xmm1, %xmm5
  863. paddd %xmm6, %xmm5
  864. movups %xmm5, -64+16*1(%rsp)
  865. # 44
  866. movl %ecx, %edi # di: b
  867. movl %ecx, %esi # si: b
  868. orl %edx, %edi # di: b | c
  869. andl %edx, %esi # si: b & c
  870. andl %ebp, %edi # di: (b | c) & d
  871. orl %esi, %edi # ((b | c) & d) | (b & c)
  872. addl %edi, %eax # += ((b | c) & d) | (b & c)
  873. addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15]
  874. movl %ebx, %esi #
  875. roll $5, %esi # rotl32(a,5)
  876. addl %esi, %eax # e += rotl32(a,5)
  877. rorl $2, %ecx # b = rotl32(b,30)
  878. # 45
  879. movl %ebx, %edi # di: b
  880. movl %ebx, %esi # si: b
  881. orl %ecx, %edi # di: b | c
  882. andl %ecx, %esi # si: b & c
  883. andl %edx, %edi # di: (b | c) & d
  884. orl %esi, %edi # ((b | c) & d) | (b & c)
  885. addl %edi, %ebp # += ((b | c) & d) | (b & c)
  886. addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15]
  887. movl %eax, %esi #
  888. roll $5, %esi # rotl32(a,5)
  889. addl %esi, %ebp # e += rotl32(a,5)
  890. rorl $2, %ebx # b = rotl32(b,30)
  891. # 46
  892. movl %eax, %edi # di: b
  893. movl %eax, %esi # si: b
  894. orl %ebx, %edi # di: b | c
  895. andl %ebx, %esi # si: b & c
  896. andl %ecx, %edi # di: (b | c) & d
  897. orl %esi, %edi # ((b | c) & d) | (b & c)
  898. addl %edi, %edx # += ((b | c) & d) | (b & c)
  899. addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15]
  900. movl %ebp, %esi #
  901. roll $5, %esi # rotl32(a,5)
  902. addl %esi, %edx # e += rotl32(a,5)
  903. rorl $2, %eax # b = rotl32(b,30)
  904. # 47
  905. movl %ebp, %edi # di: b
  906. movl %ebp, %esi # si: b
  907. orl %eax, %edi # di: b | c
  908. andl %eax, %esi # si: b & c
  909. andl %ebx, %edi # di: (b | c) & d
  910. orl %esi, %edi # ((b | c) & d) | (b & c)
  911. addl %edi, %ecx # += ((b | c) & d) | (b & c)
  912. addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15]
  913. movl %edx, %esi #
  914. roll $5, %esi # rotl32(a,5)
  915. addl %esi, %ecx # e += rotl32(a,5)
  916. rorl $2, %ebp # b = rotl32(b,30)
  917. # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
  918. movaps %xmm1, %xmm4
  919. psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
  920. # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
  921. # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
  922. # same result as above, but shorter and faster:
  923. # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
  924. # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
  925. movaps %xmm2, %xmm5
  926. shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
  927. xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
  928. xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
  929. xorps %xmm5, %xmm2 # ^
  930. # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
  931. movaps %xmm2, %xmm5
  932. xorps %xmm4, %xmm4 # rol(W0,1):
  933. pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
  934. paddd %xmm2, %xmm2 # shift left by 1
  935. psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
  936. # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
  937. pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
  938. movaps %xmm5, %xmm4
  939. pslld $2, %xmm5
  940. psrld $30, %xmm4
  941. # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
  942. xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
  943. xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
  944. movaps %xmm2, %xmm5
  945. paddd %xmm6, %xmm5
  946. movups %xmm5, -64+16*2(%rsp)
  947. # 48
  948. movl %edx, %edi # di: b
  949. movl %edx, %esi # si: b
  950. orl %ebp, %edi # di: b | c
  951. andl %ebp, %esi # si: b & c
  952. andl %eax, %edi # di: (b | c) & d
  953. orl %esi, %edi # ((b | c) & d) | (b & c)
  954. addl %edi, %ebx # += ((b | c) & d) | (b & c)
  955. addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15]
  956. movl %ecx, %esi #
  957. roll $5, %esi # rotl32(a,5)
  958. addl %esi, %ebx # e += rotl32(a,5)
  959. rorl $2, %edx # b = rotl32(b,30)
  960. # 49
  961. movl %ecx, %edi # di: b
  962. movl %ecx, %esi # si: b
  963. orl %edx, %edi # di: b | c
  964. andl %edx, %esi # si: b & c
  965. andl %ebp, %edi # di: (b | c) & d
  966. orl %esi, %edi # ((b | c) & d) | (b & c)
  967. addl %edi, %eax # += ((b | c) & d) | (b & c)
  968. addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15]
  969. movl %ebx, %esi #
  970. roll $5, %esi # rotl32(a,5)
  971. addl %esi, %eax # e += rotl32(a,5)
  972. rorl $2, %ecx # b = rotl32(b,30)
  973. # 50
  974. movl %ebx, %edi # di: b
  975. movl %ebx, %esi # si: b
  976. orl %ecx, %edi # di: b | c
  977. andl %ecx, %esi # si: b & c
  978. andl %edx, %edi # di: (b | c) & d
  979. orl %esi, %edi # ((b | c) & d) | (b & c)
  980. addl %edi, %ebp # += ((b | c) & d) | (b & c)
  981. addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15]
  982. movl %eax, %esi #
  983. roll $5, %esi # rotl32(a,5)
  984. addl %esi, %ebp # e += rotl32(a,5)
  985. rorl $2, %ebx # b = rotl32(b,30)
  986. # 51
  987. movl %eax, %edi # di: b
  988. movl %eax, %esi # si: b
  989. orl %ebx, %edi # di: b | c
  990. andl %ebx, %esi # si: b & c
  991. andl %ecx, %edi # di: (b | c) & d
  992. orl %esi, %edi # ((b | c) & d) | (b & c)
  993. addl %edi, %edx # += ((b | c) & d) | (b & c)
  994. addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15]
  995. movl %ebp, %esi #
  996. roll $5, %esi # rotl32(a,5)
  997. addl %esi, %edx # e += rotl32(a,5)
  998. rorl $2, %eax # b = rotl32(b,30)
  999. pshufd $0xff, %xmm7, %xmm6
  1000. # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
  1001. movaps %xmm2, %xmm4
  1002. psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
  1003. # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
  1004. # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
  1005. # same result as above, but shorter and faster:
  1006. # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
  1007. # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
  1008. movaps %xmm3, %xmm5
  1009. shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
  1010. xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
  1011. xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
  1012. xorps %xmm5, %xmm3 # ^
  1013. # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
  1014. movaps %xmm3, %xmm5
  1015. xorps %xmm4, %xmm4 # rol(W0,1):
  1016. pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
  1017. paddd %xmm3, %xmm3 # shift left by 1
  1018. psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
  1019. # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
  1020. pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
  1021. movaps %xmm5, %xmm4
  1022. pslld $2, %xmm5
  1023. psrld $30, %xmm4
  1024. # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
  1025. xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
  1026. xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
  1027. movaps %xmm3, %xmm5
  1028. paddd %xmm6, %xmm5
  1029. movups %xmm5, -64+16*3(%rsp)
  1030. # 52
  1031. movl %ebp, %edi # di: b
  1032. movl %ebp, %esi # si: b
  1033. orl %eax, %edi # di: b | c
  1034. andl %eax, %esi # si: b & c
  1035. andl %ebx, %edi # di: (b | c) & d
  1036. orl %esi, %edi # ((b | c) & d) | (b & c)
  1037. addl %edi, %ecx # += ((b | c) & d) | (b & c)
  1038. addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15]
  1039. movl %edx, %esi #
  1040. roll $5, %esi # rotl32(a,5)
  1041. addl %esi, %ecx # e += rotl32(a,5)
  1042. rorl $2, %ebp # b = rotl32(b,30)
  1043. # 53
  1044. movl %edx, %edi # di: b
  1045. movl %edx, %esi # si: b
  1046. orl %ebp, %edi # di: b | c
  1047. andl %ebp, %esi # si: b & c
  1048. andl %eax, %edi # di: (b | c) & d
  1049. orl %esi, %edi # ((b | c) & d) | (b & c)
  1050. addl %edi, %ebx # += ((b | c) & d) | (b & c)
  1051. addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15]
  1052. movl %ecx, %esi #
  1053. roll $5, %esi # rotl32(a,5)
  1054. addl %esi, %ebx # e += rotl32(a,5)
  1055. rorl $2, %edx # b = rotl32(b,30)
  1056. # 54
  1057. movl %ecx, %edi # di: b
  1058. movl %ecx, %esi # si: b
  1059. orl %edx, %edi # di: b | c
  1060. andl %edx, %esi # si: b & c
  1061. andl %ebp, %edi # di: (b | c) & d
  1062. orl %esi, %edi # ((b | c) & d) | (b & c)
  1063. addl %edi, %eax # += ((b | c) & d) | (b & c)
  1064. addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15]
  1065. movl %ebx, %esi #
  1066. roll $5, %esi # rotl32(a,5)
  1067. addl %esi, %eax # e += rotl32(a,5)
  1068. rorl $2, %ecx # b = rotl32(b,30)
  1069. # 55
  1070. movl %ebx, %edi # di: b
  1071. movl %ebx, %esi # si: b
  1072. orl %ecx, %edi # di: b | c
  1073. andl %ecx, %esi # si: b & c
  1074. andl %edx, %edi # di: (b | c) & d
  1075. orl %esi, %edi # ((b | c) & d) | (b & c)
  1076. addl %edi, %ebp # += ((b | c) & d) | (b & c)
  1077. addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15]
  1078. movl %eax, %esi #
  1079. roll $5, %esi # rotl32(a,5)
  1080. addl %esi, %ebp # e += rotl32(a,5)
  1081. rorl $2, %ebx # b = rotl32(b,30)
  1082. # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
  1083. movaps %xmm3, %xmm4
  1084. psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
  1085. # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
  1086. # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
  1087. # same result as above, but shorter and faster:
  1088. # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
  1089. # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
  1090. movaps %xmm0, %xmm5
  1091. shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
  1092. xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
  1093. xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
  1094. xorps %xmm5, %xmm0 # ^
  1095. # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
  1096. movaps %xmm0, %xmm5
  1097. xorps %xmm4, %xmm4 # rol(W0,1):
  1098. pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
  1099. paddd %xmm0, %xmm0 # shift left by 1
  1100. psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
  1101. # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
  1102. pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
  1103. movaps %xmm5, %xmm4
  1104. pslld $2, %xmm5
  1105. psrld $30, %xmm4
  1106. # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
  1107. xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
  1108. xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
  1109. movaps %xmm0, %xmm5
  1110. paddd %xmm6, %xmm5
  1111. movups %xmm5, -64+16*0(%rsp)
  1112. # 56
  1113. movl %eax, %edi # di: b
  1114. movl %eax, %esi # si: b
  1115. orl %ebx, %edi # di: b | c
  1116. andl %ebx, %esi # si: b & c
  1117. andl %ecx, %edi # di: (b | c) & d
  1118. orl %esi, %edi # ((b | c) & d) | (b & c)
  1119. addl %edi, %edx # += ((b | c) & d) | (b & c)
  1120. addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15]
  1121. movl %ebp, %esi #
  1122. roll $5, %esi # rotl32(a,5)
  1123. addl %esi, %edx # e += rotl32(a,5)
  1124. rorl $2, %eax # b = rotl32(b,30)
  1125. # 57
  1126. movl %ebp, %edi # di: b
  1127. movl %ebp, %esi # si: b
  1128. orl %eax, %edi # di: b | c
  1129. andl %eax, %esi # si: b & c
  1130. andl %ebx, %edi # di: (b | c) & d
  1131. orl %esi, %edi # ((b | c) & d) | (b & c)
  1132. addl %edi, %ecx # += ((b | c) & d) | (b & c)
  1133. addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15]
  1134. movl %edx, %esi #
  1135. roll $5, %esi # rotl32(a,5)
  1136. addl %esi, %ecx # e += rotl32(a,5)
  1137. rorl $2, %ebp # b = rotl32(b,30)
  1138. # 58
  1139. movl %edx, %edi # di: b
  1140. movl %edx, %esi # si: b
  1141. orl %ebp, %edi # di: b | c
  1142. andl %ebp, %esi # si: b & c
  1143. andl %eax, %edi # di: (b | c) & d
  1144. orl %esi, %edi # ((b | c) & d) | (b & c)
  1145. addl %edi, %ebx # += ((b | c) & d) | (b & c)
  1146. addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15]
  1147. movl %ecx, %esi #
  1148. roll $5, %esi # rotl32(a,5)
  1149. addl %esi, %ebx # e += rotl32(a,5)
  1150. rorl $2, %edx # b = rotl32(b,30)
  1151. # 59
  1152. movl %ecx, %edi # di: b
  1153. movl %ecx, %esi # si: b
  1154. orl %edx, %edi # di: b | c
  1155. andl %edx, %esi # si: b & c
  1156. andl %ebp, %edi # di: (b | c) & d
  1157. orl %esi, %edi # ((b | c) & d) | (b & c)
  1158. addl %edi, %eax # += ((b | c) & d) | (b & c)
  1159. addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15]
  1160. movl %ebx, %esi #
  1161. roll $5, %esi # rotl32(a,5)
  1162. addl %esi, %eax # e += rotl32(a,5)
  1163. rorl $2, %ecx # b = rotl32(b,30)
  1164. # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
  1165. movaps %xmm0, %xmm4
  1166. psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
  1167. # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
  1168. # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
  1169. # same result as above, but shorter and faster:
  1170. # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
  1171. # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
  1172. movaps %xmm1, %xmm5
  1173. shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
  1174. xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
  1175. xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
  1176. xorps %xmm5, %xmm1 # ^
  1177. # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
  1178. movaps %xmm1, %xmm5
  1179. xorps %xmm4, %xmm4 # rol(W0,1):
  1180. pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
  1181. paddd %xmm1, %xmm1 # shift left by 1
  1182. psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
  1183. # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
  1184. pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
  1185. movaps %xmm5, %xmm4
  1186. pslld $2, %xmm5
  1187. psrld $30, %xmm4
  1188. # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
  1189. xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
  1190. xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
  1191. movaps %xmm1, %xmm5
  1192. paddd %xmm6, %xmm5
  1193. movups %xmm5, -64+16*1(%rsp)
  1194. # 60
  1195. movl %ecx, %edi # c
  1196. xorl %edx, %edi # ^d
  1197. xorl %ebx, %edi # ^b
  1198. addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15]
  1199. addl %edi, %ebp # e += (c ^ d ^ b)
  1200. movl %eax, %esi #
  1201. roll $5, %esi # rotl32(a,5)
  1202. addl %esi, %ebp # e += rotl32(a,5)
  1203. rorl $2, %ebx # b = rotl32(b,30)
  1204. # 61
  1205. movl %ebx, %edi # c
  1206. xorl %ecx, %edi # ^d
  1207. xorl %eax, %edi # ^b
  1208. addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15]
  1209. addl %edi, %edx # e += (c ^ d ^ b)
  1210. movl %ebp, %esi #
  1211. roll $5, %esi # rotl32(a,5)
  1212. addl %esi, %edx # e += rotl32(a,5)
  1213. rorl $2, %eax # b = rotl32(b,30)
  1214. # 62
  1215. movl %eax, %edi # c
  1216. xorl %ebx, %edi # ^d
  1217. xorl %ebp, %edi # ^b
  1218. addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15]
  1219. addl %edi, %ecx # e += (c ^ d ^ b)
  1220. movl %edx, %esi #
  1221. roll $5, %esi # rotl32(a,5)
  1222. addl %esi, %ecx # e += rotl32(a,5)
  1223. rorl $2, %ebp # b = rotl32(b,30)
  1224. # 63
  1225. movl %ebp, %edi # c
  1226. xorl %eax, %edi # ^d
  1227. xorl %edx, %edi # ^b
  1228. addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15]
  1229. addl %edi, %ebx # e += (c ^ d ^ b)
  1230. movl %ecx, %esi #
  1231. roll $5, %esi # rotl32(a,5)
  1232. addl %esi, %ebx # e += rotl32(a,5)
  1233. rorl $2, %edx # b = rotl32(b,30)
  1234. # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
  1235. movaps %xmm1, %xmm4
  1236. psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
  1237. # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
  1238. # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
  1239. # same result as above, but shorter and faster:
  1240. # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
  1241. # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
  1242. movaps %xmm2, %xmm5
  1243. shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
  1244. xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
  1245. xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
  1246. xorps %xmm5, %xmm2 # ^
  1247. # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
  1248. movaps %xmm2, %xmm5
  1249. xorps %xmm4, %xmm4 # rol(W0,1):
  1250. pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
  1251. paddd %xmm2, %xmm2 # shift left by 1
  1252. psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
  1253. # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
  1254. pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
  1255. movaps %xmm5, %xmm4
  1256. pslld $2, %xmm5
  1257. psrld $30, %xmm4
  1258. # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
  1259. xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
  1260. xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
  1261. movaps %xmm2, %xmm5
  1262. paddd %xmm6, %xmm5
  1263. movups %xmm5, -64+16*2(%rsp)
  1264. # 64
  1265. movl %edx, %edi # c
  1266. xorl %ebp, %edi # ^d
  1267. xorl %ecx, %edi # ^b
  1268. addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15]
  1269. addl %edi, %eax # e += (c ^ d ^ b)
  1270. movl %ebx, %esi #
  1271. roll $5, %esi # rotl32(a,5)
  1272. addl %esi, %eax # e += rotl32(a,5)
  1273. rorl $2, %ecx # b = rotl32(b,30)
  1274. # 65
  1275. movl %ecx, %edi # c
  1276. xorl %edx, %edi # ^d
  1277. xorl %ebx, %edi # ^b
  1278. addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15]
  1279. addl %edi, %ebp # e += (c ^ d ^ b)
  1280. movl %eax, %esi #
  1281. roll $5, %esi # rotl32(a,5)
  1282. addl %esi, %ebp # e += rotl32(a,5)
  1283. rorl $2, %ebx # b = rotl32(b,30)
  1284. # 66
  1285. movl %ebx, %edi # c
  1286. xorl %ecx, %edi # ^d
  1287. xorl %eax, %edi # ^b
  1288. addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15]
  1289. addl %edi, %edx # e += (c ^ d ^ b)
  1290. movl %ebp, %esi #
  1291. roll $5, %esi # rotl32(a,5)
  1292. addl %esi, %edx # e += rotl32(a,5)
  1293. rorl $2, %eax # b = rotl32(b,30)
  1294. # 67
  1295. movl %eax, %edi # c
  1296. xorl %ebx, %edi # ^d
  1297. xorl %ebp, %edi # ^b
  1298. addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15]
  1299. addl %edi, %ecx # e += (c ^ d ^ b)
  1300. movl %edx, %esi #
  1301. roll $5, %esi # rotl32(a,5)
  1302. addl %esi, %ecx # e += rotl32(a,5)
  1303. rorl $2, %ebp # b = rotl32(b,30)
  1304. # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
  1305. movaps %xmm2, %xmm4
  1306. psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
  1307. # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
  1308. # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
  1309. # same result as above, but shorter and faster:
  1310. # pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
  1311. # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
  1312. movaps %xmm3, %xmm5
  1313. shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
  1314. xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
  1315. xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
  1316. xorps %xmm5, %xmm3 # ^
  1317. # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
  1318. movaps %xmm3, %xmm5
  1319. xorps %xmm4, %xmm4 # rol(W0,1):
  1320. pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
  1321. paddd %xmm3, %xmm3 # shift left by 1
  1322. psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
  1323. # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
  1324. pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
  1325. movaps %xmm5, %xmm4
  1326. pslld $2, %xmm5
  1327. psrld $30, %xmm4
  1328. # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
  1329. xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
  1330. xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
  1331. movaps %xmm3, %xmm5
  1332. paddd %xmm6, %xmm5
  1333. movups %xmm5, -64+16*3(%rsp)
  1334. # 68
  1335. movl %ebp, %edi # c
  1336. xorl %eax, %edi # ^d
  1337. xorl %edx, %edi # ^b
  1338. addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15]
  1339. addl %edi, %ebx # e += (c ^ d ^ b)
  1340. movl %ecx, %esi #
  1341. roll $5, %esi # rotl32(a,5)
  1342. addl %esi, %ebx # e += rotl32(a,5)
  1343. rorl $2, %edx # b = rotl32(b,30)
  1344. # 69
  1345. movl %edx, %edi # c
  1346. xorl %ebp, %edi # ^d
  1347. xorl %ecx, %edi # ^b
  1348. addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15]
  1349. addl %edi, %eax # e += (c ^ d ^ b)
  1350. movl %ebx, %esi #
  1351. roll $5, %esi # rotl32(a,5)
  1352. addl %esi, %eax # e += rotl32(a,5)
  1353. rorl $2, %ecx # b = rotl32(b,30)
  1354. # 70
  1355. movl %ecx, %edi # c
  1356. xorl %edx, %edi # ^d
  1357. xorl %ebx, %edi # ^b
  1358. addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15]
  1359. addl %edi, %ebp # e += (c ^ d ^ b)
  1360. movl %eax, %esi #
  1361. roll $5, %esi # rotl32(a,5)
  1362. addl %esi, %ebp # e += rotl32(a,5)
  1363. rorl $2, %ebx # b = rotl32(b,30)
  1364. # 71
  1365. movl %ebx, %edi # c
  1366. xorl %ecx, %edi # ^d
  1367. xorl %eax, %edi # ^b
  1368. addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15]
  1369. addl %edi, %edx # e += (c ^ d ^ b)
  1370. movl %ebp, %esi #
  1371. roll $5, %esi # rotl32(a,5)
  1372. addl %esi, %edx # e += rotl32(a,5)
  1373. rorl $2, %eax # b = rotl32(b,30)
  1374. # 72
  1375. movl %eax, %edi # c
  1376. xorl %ebx, %edi # ^d
  1377. xorl %ebp, %edi # ^b
  1378. addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15]
  1379. addl %edi, %ecx # e += (c ^ d ^ b)
  1380. movl %edx, %esi #
  1381. roll $5, %esi # rotl32(a,5)
  1382. addl %esi, %ecx # e += rotl32(a,5)
  1383. rorl $2, %ebp # b = rotl32(b,30)
  1384. # 73
  1385. movl %ebp, %edi # c
  1386. xorl %eax, %edi # ^d
  1387. xorl %edx, %edi # ^b
  1388. addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15]
  1389. addl %edi, %ebx # e += (c ^ d ^ b)
  1390. movl %ecx, %esi #
  1391. roll $5, %esi # rotl32(a,5)
  1392. addl %esi, %ebx # e += rotl32(a,5)
  1393. rorl $2, %edx # b = rotl32(b,30)
  1394. # 74
  1395. movl %edx, %edi # c
  1396. xorl %ebp, %edi # ^d
  1397. xorl %ecx, %edi # ^b
  1398. addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15]
  1399. addl %edi, %eax # e += (c ^ d ^ b)
  1400. movl %ebx, %esi #
  1401. roll $5, %esi # rotl32(a,5)
  1402. addl %esi, %eax # e += rotl32(a,5)
  1403. rorl $2, %ecx # b = rotl32(b,30)
  1404. # 75
  1405. movl %ecx, %edi # c
  1406. xorl %edx, %edi # ^d
  1407. xorl %ebx, %edi # ^b
  1408. addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15]
  1409. addl %edi, %ebp # e += (c ^ d ^ b)
  1410. movl %eax, %esi #
  1411. roll $5, %esi # rotl32(a,5)
  1412. addl %esi, %ebp # e += rotl32(a,5)
  1413. rorl $2, %ebx # b = rotl32(b,30)
  1414. # 76
  1415. movl %ebx, %edi # c
  1416. xorl %ecx, %edi # ^d
  1417. xorl %eax, %edi # ^b
  1418. addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15]
  1419. addl %edi, %edx # e += (c ^ d ^ b)
  1420. movl %ebp, %esi #
  1421. roll $5, %esi # rotl32(a,5)
  1422. addl %esi, %edx # e += rotl32(a,5)
  1423. rorl $2, %eax # b = rotl32(b,30)
  1424. # 77
  1425. movl %eax, %edi # c
  1426. xorl %ebx, %edi # ^d
  1427. xorl %ebp, %edi # ^b
  1428. addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15]
  1429. addl %edi, %ecx # e += (c ^ d ^ b)
  1430. movl %edx, %esi #
  1431. roll $5, %esi # rotl32(a,5)
  1432. addl %esi, %ecx # e += rotl32(a,5)
  1433. rorl $2, %ebp # b = rotl32(b,30)
  1434. # 78
  1435. movl %ebp, %edi # c
  1436. xorl %eax, %edi # ^d
  1437. xorl %edx, %edi # ^b
  1438. addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15]
  1439. addl %edi, %ebx # e += (c ^ d ^ b)
  1440. movl %ecx, %esi #
  1441. roll $5, %esi # rotl32(a,5)
  1442. addl %esi, %ebx # e += rotl32(a,5)
  1443. rorl $2, %edx # b = rotl32(b,30)
  1444. # 79
  1445. movl %edx, %edi # c
  1446. xorl %ebp, %edi # ^d
  1447. xorl %ecx, %edi # ^b
  1448. addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15]
  1449. addl %edi, %eax # e += (c ^ d ^ b)
  1450. movl %ebx, %esi #
  1451. roll $5, %esi # rotl32(a,5)
  1452. addl %esi, %eax # e += rotl32(a,5)
  1453. rorl $2, %ecx # b = rotl32(b,30)
  1454. popq %rdi #
  1455. popq %r12 #
  1456. addl %eax, 80(%rdi) # ctx->hash[0] += a
  1457. popq %r13 #
  1458. addl %ebx, 84(%rdi) # ctx->hash[1] += b
  1459. popq %r14 #
  1460. addl %ecx, 88(%rdi) # ctx->hash[2] += c
  1461. # popq %r15 #
  1462. addl %edx, 92(%rdi) # ctx->hash[3] += d
  1463. popq %rbx #
  1464. addl %ebp, 96(%rdi) # ctx->hash[4] += e
  1465. popq %rbp #
  1466. ret
  1467. .size sha1_process_block64, .-sha1_process_block64
  1468. .section .rodata.cst16.sha1const, "aM", @progbits, 16 # read-only ("a"), mergeable ("M") constant data, 16-byte entries
  1469. .balign 16 # keep the table 16-byte aligned: it is fetched whole with movaps sha1const(%rip), %xmm7
  1470. sha1const: # the four 32-bit SHA-1 round constants, one per xmm7 dword; pshufd broadcasts the current one into xmm6
  1471. .long 0x5A827999 # dword 0: K for rounds 0..19 (broadcast by pshufd $0x00 at function entry)
  1472. .long 0x6ED9EBA1 # dword 1: K for rounds 20..39
  1473. .long 0x8F1BBCDC # dword 2: K for rounds 40..59
  1474. .long 0xCA62C1D6 # dword 3: K for rounds 60..79 (broadcast by pshufd $0xff before W[60..63] are prepared)
  1475. #endif