/* hash_md5_sha256_x86-64_shaNI.S */
  #if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
  /* The code is adapted from the Linux kernel's source. */

  /*
   * Aliases for the 128-bit move and shuffle instructions used below.
   *
   * We use the shorter insns, even though they are for the "wrong"
   * data type (fp, not int).
   * For Intel, there is no penalty for doing it at all
   * (CPUs which do have such a penalty do not support SHA insns).
   * For AMD, the penalty is one extra cycle
   * (allegedly: I failed to find a measurable difference).
   *
   * The commented-out definitions are the integer-domain equivalents;
   * swap them in to measure the difference.
   */
  //#define mova128 movdqa
  #define mova128 movaps
  //#define movu128 movdqu
  #define movu128 movups
  //#define shuf128_32 pshufd
  #define shuf128_32 shufps

  /*
   * pshufb and palignr are SSSE3 insns.
   * We do not check SSSE3 in cpuid:
   * all SHA-capable CPUs support it as well.
   */
  /*
   * An empty .note.GNU-stack section tells the GNU linker that this object
   * does not need an executable stack.
   */
  #ifdef __linux__
          .section    .note.GNU-stack, "", @progbits
  #endif

  /* The routine gets its own text section and is not exported (.hidden). */
          .section    .text.sha256_process_block64_shaNI, "ax", @progbits
          .globl      sha256_process_block64_shaNI
          .hidden     sha256_process_block64_shaNI
          .type       sha256_process_block64_shaNI, @function

  /* Register roles inside sha256_process_block64_shaNI. */
  #define DATA_PTR        %rdi    /* pointer argument: message block and hash words */
  #define SHA256CONSTANTS %rax    /* points into the K256 round-constant table */
  /* sha256rnds2 takes its W+K input from %xmm0 implicitly, so MSG must be %xmm0 */
  #define MSG             %xmm0
  #define STATE0          %xmm1   /* half of the working state */
  #define STATE1          %xmm2   /* other half of the working state */
  #define MSGTMP0         %xmm3   /* rotating window of four message-schedule rows */
  #define MSGTMP1         %xmm4
  #define MSGTMP2         %xmm5
  #define MSGTMP3         %xmm6
  #define XMMTMP          %xmm7   /* scratch; also holds the byte-swap mask while loading */
  #define SAVE0           %xmm8   /* copy of STATE0 taken before the rounds */
  #define SAVE1           %xmm9   /* copy of STATE1 taken before the rounds */

  /*
   * Build a shuffle immediate from four 2-bit dword selectors:
   * a selects result dword 0, b dword 1, c dword 2, d dword 3.
   */
  #define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))
  /*
   * sha256_process_block64_shaNI
   *
   * Runs the 64 SHA-256 rounds over one 64-byte block with the SHA-NI
   * instructions and adds the result into the hash words, in place.
   *
   * ABI:   written for the SysV AMD64 convention (pointer argument in %rdi;
   *        %xmm8 and %xmm9 are overwritten without being saved).
   * In:    %rdi (DATA_PTR) -> buffer laid out as
   *          bytes  0..63    message block; every 32-bit word is byte-swapped
   *                          after loading
   *          bytes 80..111   hash words A..H, 32 bits each, A at the lowest
   *                          address
   *        The buffer is accessed with unaligned moves only, so it needs no
   *        particular alignment.
   * Out:   hash words at bytes 80..111 updated.
   * Clobb: %rax, %xmm0-%xmm9.
   * Stack: not used.
   * NOTE(review): the C prototype is not visible in this file; presumably the
   *        argument is a pointer to the SHA-256 context structure - confirm.
   */

  /* Byte offset of hash word A inside the buffer at DATA_PTR. */
  #define HASH_OFS        80
  /*
   * Shuffle immediate that moves dwords 2,3 of MSG into dwords 0,1, so that
   * the second sha256rnds2 of each group sees the W+K values for the next
   * two rounds.
   */
  #define HI64_TO_LO64    $0x0E

          .balign     8       /* allow decoders to fetch at least 2 first insns */
  sha256_process_block64_shaNI:
          /*
           * Load the hash words and regroup them into the layout sha256rnds2
           * works on.  Register contents are written dword 0 first.
           */
          movu128     HASH_OFS+0*16(DATA_PTR), XMMTMP /* ABCD (little-endian dword order) */
          movu128     HASH_OFS+1*16(DATA_PTR), STATE1 /* EFGH */
          /*
           * shufps picks result dwords 0,1 from the destination register
           * and result dwords 2,3 from the source register.
           */
          mova128     STATE1, STATE0
          /* source = XMMTMP = ABCD, destination = EFGH */
          shufps      SHUF(1,0,1,0), XMMTMP, STATE0   /* FEBA */
          shufps      SHUF(3,2,3,2), XMMTMP, STATE1   /* HGDC */

          /* XMMTMP holds flip mask from here... */
          mova128     PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP
          /*
           * Point at the middle of the table: the displacements used below
           * then span -128..+112 and all fit in a signed byte.
           */
          leaq        K256+8*16(%rip), SHA256CONSTANTS

          /* Save hash values for addition after rounds */
          mova128     STATE0, SAVE0
          mova128     STATE1, SAVE1

          /*
           * Every group below handles four rounds: MSG = W[t..t+3] + K[t..t+3],
           * the first sha256rnds2 consumes dwords 0,1 and updates STATE1,
           * the second one consumes dwords 2,3 and updates STATE0.
           */

          /* Rounds 0-3 */
          movu128     0*16(DATA_PTR), MSG
          pshufb      XMMTMP, MSG                     /* byte-swap each dword */
          mova128     MSG, MSGTMP0                    /* W[0..3] */
          paddd       0*16-8*16(SHA256CONSTANTS), MSG
          sha256rnds2 MSG, STATE0, STATE1
          shuf128_32  HI64_TO_LO64, MSG, MSG
          sha256rnds2 MSG, STATE1, STATE0

          /* Rounds 4-7 */
          movu128     1*16(DATA_PTR), MSG
          pshufb      XMMTMP, MSG
          mova128     MSG, MSGTMP1                    /* W[4..7] */
          paddd       1*16-8*16(SHA256CONSTANTS), MSG
          sha256rnds2 MSG, STATE0, STATE1
          shuf128_32  HI64_TO_LO64, MSG, MSG
          sha256rnds2 MSG, STATE1, STATE0
          sha256msg1  MSGTMP1, MSGTMP0                /* W[0..3] + sigma0(W[1..4]) */

          /* Rounds 8-11 */
          movu128     2*16(DATA_PTR), MSG
          pshufb      XMMTMP, MSG
          mova128     MSG, MSGTMP2                    /* W[8..11] */
          paddd       2*16-8*16(SHA256CONSTANTS), MSG
          sha256rnds2 MSG, STATE0, STATE1
          shuf128_32  HI64_TO_LO64, MSG, MSG
          sha256rnds2 MSG, STATE1, STATE0
          sha256msg1  MSGTMP2, MSGTMP1

          /* Rounds 12-15 */
          movu128     3*16(DATA_PTR), MSG
          pshufb      XMMTMP, MSG
          /* ...to here */
          mova128     MSG, MSGTMP3                    /* W[12..15] */
          paddd       3*16-8*16(SHA256CONSTANTS), MSG
          sha256rnds2 MSG, STATE0, STATE1
          /*
           * Finish W[16..19] in MSGTMP0: add W[9..12] (the four dwords that
           * straddle MSGTMP2 and MSGTMP3), then let sha256msg2 add the sigma1
           * terms.  The following groups repeat this with the registers rotated.
           */
          mova128     MSGTMP3, XMMTMP
          palignr     $4, MSGTMP2, XMMTMP             /* XMMTMP = W[9..12] */
          paddd       XMMTMP, MSGTMP0
          sha256msg2  MSGTMP3, MSGTMP0                /* MSGTMP0 = W[16..19] */
          shuf128_32  HI64_TO_LO64, MSG, MSG
          sha256rnds2 MSG, STATE1, STATE0
          sha256msg1  MSGTMP3, MSGTMP2

          /* Rounds 16-19 */
          mova128     MSGTMP0, MSG
          paddd       4*16-8*16(SHA256CONSTANTS), MSG
          sha256rnds2 MSG, STATE0, STATE1
          mova128     MSGTMP0, XMMTMP
          palignr     $4, MSGTMP3, XMMTMP
          paddd       XMMTMP, MSGTMP1
          sha256msg2  MSGTMP0, MSGTMP1                /* MSGTMP1 = W[20..23] */
          shuf128_32  HI64_TO_LO64, MSG, MSG
          sha256rnds2 MSG, STATE1, STATE0
          sha256msg1  MSGTMP0, MSGTMP3

          /* Rounds 20-23 */
          mova128     MSGTMP1, MSG
          paddd       5*16-8*16(SHA256CONSTANTS), MSG
          sha256rnds2 MSG, STATE0, STATE1
          mova128     MSGTMP1, XMMTMP
          palignr     $4, MSGTMP0, XMMTMP
          paddd       XMMTMP, MSGTMP2
          sha256msg2  MSGTMP1, MSGTMP2                /* MSGTMP2 = W[24..27] */
          shuf128_32  HI64_TO_LO64, MSG, MSG
          sha256rnds2 MSG, STATE1, STATE0
          sha256msg1  MSGTMP1, MSGTMP0

          /* Rounds 24-27 */
          mova128     MSGTMP2, MSG
          paddd       6*16-8*16(SHA256CONSTANTS), MSG
          sha256rnds2 MSG, STATE0, STATE1
          mova128     MSGTMP2, XMMTMP
          palignr     $4, MSGTMP1, XMMTMP
          paddd       XMMTMP, MSGTMP3
          sha256msg2  MSGTMP2, MSGTMP3                /* MSGTMP3 = W[28..31] */
          shuf128_32  HI64_TO_LO64, MSG, MSG
          sha256rnds2 MSG, STATE1, STATE0
          sha256msg1  MSGTMP2, MSGTMP1

          /* Rounds 28-31 */
          mova128     MSGTMP3, MSG
          paddd       7*16-8*16(SHA256CONSTANTS), MSG
          sha256rnds2 MSG, STATE0, STATE1
          mova128     MSGTMP3, XMMTMP
          palignr     $4, MSGTMP2, XMMTMP
          paddd       XMMTMP, MSGTMP0
          sha256msg2  MSGTMP3, MSGTMP0                /* MSGTMP0 = W[32..35] */
          shuf128_32  HI64_TO_LO64, MSG, MSG
          sha256rnds2 MSG, STATE1, STATE0
          sha256msg1  MSGTMP3, MSGTMP2

          /* Rounds 32-35 */
          mova128     MSGTMP0, MSG
          paddd       8*16-8*16(SHA256CONSTANTS), MSG
          sha256rnds2 MSG, STATE0, STATE1
          mova128     MSGTMP0, XMMTMP
          palignr     $4, MSGTMP3, XMMTMP
          paddd       XMMTMP, MSGTMP1
          sha256msg2  MSGTMP0, MSGTMP1                /* MSGTMP1 = W[36..39] */
          shuf128_32  HI64_TO_LO64, MSG, MSG
          sha256rnds2 MSG, STATE1, STATE0
          sha256msg1  MSGTMP0, MSGTMP3

          /* Rounds 36-39 */
          mova128     MSGTMP1, MSG
          paddd       9*16-8*16(SHA256CONSTANTS), MSG
          sha256rnds2 MSG, STATE0, STATE1
          mova128     MSGTMP1, XMMTMP
          palignr     $4, MSGTMP0, XMMTMP
          paddd       XMMTMP, MSGTMP2
          sha256msg2  MSGTMP1, MSGTMP2                /* MSGTMP2 = W[40..43] */
          shuf128_32  HI64_TO_LO64, MSG, MSG
          sha256rnds2 MSG, STATE1, STATE0
          sha256msg1  MSGTMP1, MSGTMP0

          /* Rounds 40-43 */
          mova128     MSGTMP2, MSG
          paddd       10*16-8*16(SHA256CONSTANTS), MSG
          sha256rnds2 MSG, STATE0, STATE1
          mova128     MSGTMP2, XMMTMP
          palignr     $4, MSGTMP1, XMMTMP
          paddd       XMMTMP, MSGTMP3
          sha256msg2  MSGTMP2, MSGTMP3                /* MSGTMP3 = W[44..47] */
          shuf128_32  HI64_TO_LO64, MSG, MSG
          sha256rnds2 MSG, STATE1, STATE0
          sha256msg1  MSGTMP2, MSGTMP1

          /* Rounds 44-47 */
          mova128     MSGTMP3, MSG
          paddd       11*16-8*16(SHA256CONSTANTS), MSG
          sha256rnds2 MSG, STATE0, STATE1
          mova128     MSGTMP3, XMMTMP
          palignr     $4, MSGTMP2, XMMTMP
          paddd       XMMTMP, MSGTMP0
          sha256msg2  MSGTMP3, MSGTMP0                /* MSGTMP0 = W[48..51] */
          shuf128_32  HI64_TO_LO64, MSG, MSG
          sha256rnds2 MSG, STATE1, STATE0
          sha256msg1  MSGTMP3, MSGTMP2

          /* Rounds 48-51 */
          mova128     MSGTMP0, MSG
          paddd       12*16-8*16(SHA256CONSTANTS), MSG
          sha256rnds2 MSG, STATE0, STATE1
          mova128     MSGTMP0, XMMTMP
          palignr     $4, MSGTMP3, XMMTMP
          paddd       XMMTMP, MSGTMP1
          sha256msg2  MSGTMP0, MSGTMP1                /* MSGTMP1 = W[52..55] */
          shuf128_32  HI64_TO_LO64, MSG, MSG
          sha256rnds2 MSG, STATE1, STATE0
          sha256msg1  MSGTMP0, MSGTMP3

          /* Rounds 52-55 (no sha256msg1 from here on: nothing past W[63] is needed) */
          mova128     MSGTMP1, MSG
          paddd       13*16-8*16(SHA256CONSTANTS), MSG
          sha256rnds2 MSG, STATE0, STATE1
          mova128     MSGTMP1, XMMTMP
          palignr     $4, MSGTMP0, XMMTMP
          paddd       XMMTMP, MSGTMP2
          sha256msg2  MSGTMP1, MSGTMP2                /* MSGTMP2 = W[56..59] */
          shuf128_32  HI64_TO_LO64, MSG, MSG
          sha256rnds2 MSG, STATE1, STATE0

          /* Rounds 56-59 */
          mova128     MSGTMP2, MSG
          paddd       14*16-8*16(SHA256CONSTANTS), MSG
          sha256rnds2 MSG, STATE0, STATE1
          mova128     MSGTMP2, XMMTMP
          palignr     $4, MSGTMP1, XMMTMP
          paddd       XMMTMP, MSGTMP3
          sha256msg2  MSGTMP2, MSGTMP3                /* MSGTMP3 = W[60..63] */
          shuf128_32  HI64_TO_LO64, MSG, MSG
          sha256rnds2 MSG, STATE1, STATE0

          /* Rounds 60-63 */
          mova128     MSGTMP3, MSG
          paddd       15*16-8*16(SHA256CONSTANTS), MSG
          sha256rnds2 MSG, STATE0, STATE1
          shuf128_32  HI64_TO_LO64, MSG, MSG
          sha256rnds2 MSG, STATE1, STATE0

          /* Add current hash values with previously saved */
          paddd       SAVE0, STATE0
          paddd       SAVE1, STATE1

          /* Write hash values back in the correct order */
          mova128     STATE0, XMMTMP
          /*
           * shufps picks result dwords 0,1 from the destination register
           * and result dwords 2,3 from the source register.
           */
          /* source = STATE1 = HGDC, destination = FEBA */
          shufps      SHUF(3,2,3,2), STATE1, STATE0   /* ABCD */
          shufps      SHUF(1,0,1,0), STATE1, XMMTMP   /* EFGH */
          movu128     STATE0, HASH_OFS+0*16(DATA_PTR)
          movu128     XMMTMP, HASH_OFS+1*16(DATA_PTR)

          ret
          .size       sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
  /*
   * K256: the 64 SHA-256 round constants K[0..63], four per row, so row i
   * holds the constants for rounds 4*i .. 4*i+3.
   *
   * sha256_process_block64_shaNI adds whole rows with paddd memory operands,
   * which require the 16-byte alignment requested here.
   * The section is flagged mergeable ("M") with an entry size of 256 bytes,
   * i.e. the whole table is one entry.
   */
          .section    .rodata.cst256.K256, "aM", @progbits, 256
          .balign     16
  K256:
          .long       0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
          .long       0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
          .long       0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
          .long       0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
          .long       0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
          .long       0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
          .long       0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
          .long       0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
          .long       0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
          .long       0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
          .long       0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
          .long       0xd192e819,0xd6990624,0xf40e3585,0x106aa070
          .long       0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
          .long       0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
          .long       0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
          .long       0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  /*
   * pshufb control mask that reverses the four bytes inside each 32-bit dword
   * (result byte 0 <- byte 3, byte 1 <- byte 2, ...), leaving the order of the
   * dwords themselves unchanged.  It is loaded with an aligned move, hence the
   * 16-byte alignment.
   */
          .section    .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
          .balign     16
  PSHUFFLE_BSWAP32_FLIP_MASK:
          .octa       0x0c0d0e0f08090a0b0405060700010203

  #endif /* ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__) */