/* hash_sha256_hwaccel_x86-32.S */
  #if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__)
/* The code is adapted from Linux kernel's source */

/*
 * SHA-256 compression of one 64-byte block using the x86 SHA extensions.
 *
 * Syntax: GNU as, AT&T operand order (source first, destination last).
 * The file is run through the C preprocessor, so register roles and
 * instruction aliases below are plain cpp macros.
 * Target: 32-bit x86 only (guarded by __i386__ above).
 */

// We use shorter insns, even though they are for "wrong"
// data type (fp, not int).
// For Intel, there is no penalty for doing it at all
// (CPUs which do have such penalty do not support SHA insns).
// For AMD, the penalty is one extra cycle
// (allegedly: I failed to find measurable difference).

//#define mova128 movdqa
#define mova128 movaps
//#define movu128 movdqu
#define movu128 movups
//#define shuf128_32 pshufd
#define shuf128_32 shufps

// pshufb and palignr are SSSE3 insns.
// We do not check SSSE3 in cpuid,
// all SHA-capable CPUs support it as well.

#ifdef __linux__
	.section	.note.GNU-stack, "", @progbits
#endif
	.section	.text.sha256_process_block64_shaNI, "ax", @progbits
	.globl	sha256_process_block64_shaNI
	.hidden	sha256_process_block64_shaNI
	.type	sha256_process_block64_shaNI, @function

/*
 * Register roles.
 *
 * DATA_PTR        base pointer supplied by the caller; never written here.
 * SHA256CONSTANTS points 8*16 bytes past the start of K256 (see below).
 * MSG             must be %xmm0: sha256rnds2 takes its "message + round
 *                 constant" input implicitly from %xmm0.
 * STATE0/STATE1   working state, kept in the FEBA / HGDC dword layout
 *                 that sha256rnds2 operates on.
 * MSGTMP0..3      sliding window of the message schedule, 4 dwords each.
 * XMMTMP          byte-swap mask during rounds 0-15, scratch afterwards.
 */
#define DATA_PTR	%eax

#define SHA256CONSTANTS	%ecx

#define MSG		%xmm0
#define STATE0		%xmm1
#define STATE1		%xmm2
#define MSGTMP0		%xmm3
#define MSGTMP1		%xmm4
#define MSGTMP2		%xmm5
#define MSGTMP3		%xmm6

#define XMMTMP		%xmm7

/*
 * Byte offset, from DATA_PTR, of the eight 32-bit state words A..H.
 * NOTE(review): presumably the offset of the hash array inside the
 * caller's C context structure -- the structure is not visible in this
 * file, so verify the value against its definition.
 */
#define STATE_OFS	76

/* Build a shufps/pshufd immediate from four 2-bit dword selectors */
#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))

/*
 * sha256_process_block64_shaNI
 *
 * In:    DATA_PTR (%eax) = base pointer.
 *          bytes 0..63                    message block, read only
 *          bytes STATE_OFS..STATE_OFS+31  state words A..H, updated in place
 *        Both areas are accessed with unaligned moves, so no alignment
 *        is required of the pointer.
 *        NOTE(review): the argument arriving in %eax implies a
 *        register-argument C prototype declared elsewhere -- confirm
 *        at the call site.
 * Out:   state words A..H each incremented by the result of the
 *        64 rounds computed from them and the message block.
 * Clobb: %ecx, %xmm0-%xmm7. %eax is left unchanged.
 * Stack: not used; the function makes no calls.
 *
 * K256 and PSHUFFLE_BSWAP32_FLIP_MASK are referenced by absolute
 * address (no GOT/PC-relative indirection).
 */
	.balign	8	/* allow decoders to fetch at least 2 first insns */
sha256_process_block64_shaNI:

	movu128		STATE_OFS+0*16(DATA_PTR), XMMTMP /* ABCD (little-endian dword order) */
	movu128		STATE_OFS+1*16(DATA_PTR), STATE1 /* EFGH */
/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
	mova128		STATE1, STATE0
	/* at this point: XMMTMP = ABCD, STATE0 = STATE1 = EFGH */
	shufps		SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */
	shufps		SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */

/* XMMTMP holds flip mask from here... */
	/* aligned load: the mask is placed with .balign 16 below */
	mova128		PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP
	/*
	 * Bias the table pointer by 8*16 so that the sixteen displacements
	 * used below, k*16-8*16 for k = 0..15, span -128..+112 and thus all
	 * fit in a signed 8-bit displacement.
	 */
	movl		$K256+8*16, SHA256CONSTANTS

	/*
	 * Pattern of every 4-round group below:
	 *   MSG = schedule words + round constants (paddd from K256,
	 *         a 16-byte aligned memory operand);
	 *   sha256rnds2 does two rounds using the low two dwords of MSG;
	 *   shuf128_32 $0x0E brings dwords 2,3 of MSG down to dwords 0,1;
	 *   the second sha256rnds2 does the other two rounds with the
	 *   state registers swapped.
	 * Instructions indented one extra level belong to the round chain;
	 * the rest compute upcoming message schedule words.
	 */

	/* Rounds 0-3 */
	movu128		0*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG		/* reverse bytes within each dword */
	mova128		MSG, MSGTMP0
		paddd		0*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 4-7 */
	movu128		1*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP1
		paddd		1*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 8-11 */
	movu128		2*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP2
		paddd		2*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 12-15 */
	movu128		3*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
/* ...to here */
	mova128		MSG, MSGTMP3
		paddd		3*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	/* XMMTMP is reused as scratch for the schedule from now on */
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 16-19 */
	mova128		MSGTMP0, MSG
		paddd		4*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 20-23 */
	mova128		MSGTMP1, MSG
		paddd		5*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 24-27 */
	mova128		MSGTMP2, MSG
		paddd		6*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 28-31 */
	mova128		MSGTMP3, MSG
		paddd		7*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 32-35 */
	mova128		MSGTMP0, MSG
		paddd		8*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 36-39 */
	mova128		MSGTMP1, MSG
		paddd		9*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 40-43 */
	mova128		MSGTMP2, MSG
		paddd		10*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 44-47 */
	mova128		MSGTMP3, MSG
		paddd		11*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 48-51 */
	mova128		MSGTMP0, MSG
		paddd		12*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 52-55 (no sha256msg1 from here on) */
	mova128		MSGTMP1, MSG
		paddd		13*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 56-59 */
	mova128		MSGTMP2, MSG
		paddd		14*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 60-63 (last schedule words are already in MSGTMP3) */
	mova128		MSGTMP3, MSG
		paddd		15*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0

	/* Write hash values back in the correct order */
	mova128		STATE0, XMMTMP
/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
	/* at this point: STATE1 = HGDC, STATE0 = XMMTMP = FEBA */
	shufps		SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */
	shufps		SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */
	/* add current hash values to previous ones */
	/* STATE1 is free now, so it is reused to reload the old E..H */
	movu128		STATE_OFS+1*16(DATA_PTR), STATE1
	paddd		XMMTMP, STATE1
	movu128		STATE1, STATE_OFS+1*16(DATA_PTR)
	movu128		STATE_OFS+0*16(DATA_PTR), XMMTMP
	paddd		XMMTMP, STATE0
	movu128		STATE0, STATE_OFS+0*16(DATA_PTR)

	ret
	.size	sha256_process_block64_shaNI, .-sha256_process_block64_shaNI

/*
 * The 64 SHA-256 round constants, four per line in round order.
 * Kept 16-byte aligned because they are used as paddd memory operands.
 */
	.section	.rodata.cst256.K256, "aM", @progbits, 256
	.balign	16
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

/*
 * pshufb control mask: destination byte i takes source byte mask[i].
 * Stored little-endian, the bytes are 3,2,1,0, 7,6,5,4, 11,10,9,8,
 * 15,14,13,12, i.e. the byte order inside each dword is reversed.
 * 16-byte aligned because it is loaded with an aligned move.
 */
	.section	.rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
	.balign	16
PSHUFFLE_BSWAP32_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203

#endif