hash_md5_sha_x86-64_shaNI.S 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  /*
   * File-level setup for the SHA-NI accelerated SHA-1 block function.
   * Syntax: GNU as, AT&T operand order (source first, destination last),
   * passed through the C preprocessor (#if / #define are used below).
   * Everything up to the final #endif is compiled only when the guard
   * on the next line holds.
   */
  1. #if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
  2. /* The code is adapted from Linux kernel's source */
  /*
   * Instruction aliases. Each NAME128 macro picks the concrete encoding
   * used for a 128-bit operation; the commented-out #define next to it is
   * the alternative mnemonic that was considered. The rationale for the
   * choice is given in the original note below.
   */
  3. // We use shorter insns, even though they are for "wrong"
  4. // data type (fp, not int).
  5. // For Intel, there is no penalty for doing it at all
  6. // (CPUs which do have such penalty do not support SHA insns).
  7. // For AMD, the penalty is one extra cycle
  8. // (allegedly: I failed to find measurable difference).
  /* mova128: aligned 128-bit move (register copies and the aligned mask load) */
  9. //#define mova128 movdqa
  10. #define mova128 movaps
  /* movu128: unaligned 128-bit load/store (used for all accesses via %rdi) */
  11. //#define movu128 movdqu
  12. #define movu128 movups
  /* xor128: 128-bit xor (register zeroing and message-schedule xors) */
  13. //#define xor128 pxor
  14. #define xor128 xorps
  /*
   * shuf128_32: 32-bit lane shuffle. shufps takes two sources, so it only
   * acts like pshufd when both operands are the same register - which is
   * how every use in this file is written.
   */
  15. //#define shuf128_32 pshufd
  16. #define shuf128_32 shufps
  /* extr128_32: store one 32-bit lane of an xmm register to memory */
  17. #define extr128_32 pextrd
  18. //#define extr128_32 extractps # not shorter
  /* Instruction-set requirements beyond the SHA extensions themselves: */
  19. // pshufb is a SSSE3 insn.
  20. // pinsrd, pextrd, extractps are SSE4.1 insns.
  21. // We do not check SSSE3/SSE4.1 in cpuid,
  22. // all SHA-capable CPUs support them as well.
  /*
   * Emit an empty .note.GNU-stack section on Linux. By GNU toolchain
   * convention this tells the linker that the object does not need an
   * executable stack.
   */
  23. #ifdef __linux__
  24. .section .note.GNU-stack, "", @progbits
  25. #endif
  26. .section .text.sha1_process_block64_shaNI, "ax", @progbits
  27. .globl sha1_process_block64_shaNI
  28. .hidden sha1_process_block64_shaNI
  29. .type sha1_process_block64_shaNI, @function
  /*
   * Register roles inside the function:
   *   ABCD        four 32-bit state words, lanes reversed relative to memory
   *   E0, E1      alternate between "E + message words" input of sha1rnds4
   *               and "copy of ABCD kept for the next group"
   *   MSG0..MSG3  the four 16-byte quarters of the message schedule window
   *   %xmm7       first the byte-flip mask, later the saved initial E
   *   %xmm8       saved initial ABCD
   */
  30. #define ABCD %xmm0
  31. #define E0 %xmm1 /* Need two E's b/c they ping pong */
  32. #define E1 %xmm2
  33. #define MSG0 %xmm3
  34. #define MSG1 %xmm4
  35. #define MSG2 %xmm5
  36. #define MSG3 %xmm6
  /*
   * sha1_process_block64_shaNI
   *
   * Runs the 80 SHA-1 rounds over one 64-byte block using the x86 SHA
   * extensions (sha1rnds4 / sha1nexte / sha1msg1 / sha1msg2) and adds the
   * result into the running hash.
   *
   * In:   %rdi = base pointer. The code reads
   *                0..63(%rdi)   the 64-byte message block
   *                80..95(%rdi)  four 32-bit hash words
   *                96..99(%rdi)  the fifth 32-bit hash word
   *              Presumably this is the C-side SHA-1 context passed as the
   *              first SysV argument - confirm against the C prototype.
   * Out:  the five hash words at 80..99(%rdi) are updated in place;
   *       nothing is returned in a register.
   * Clobbers: %xmm0-%xmm8. No general-purpose register is written and
   *           the stack is not touched.
   * NOTE(review): %xmm6-%xmm8 are not saved. That is fine when all xmm
   *       registers are caller-saved (SysV AMD64, which the ELF directives
   *       here suggest); it would not be on an ABI that preserves them.
   * All memory accesses through %rdi use the unaligned move, so no
   * alignment of the pointed-to data is required by this code.
   */
  37. .balign 8 # allow decoders to fetch at least 2 first insns
  38. sha1_process_block64_shaNI:
  39. /* load initial hash values */
  40. movu128 80(%rdi), ABCD
  /* E0 = {E, 0, 0, 0}: clear the register, then insert the fifth hash word into lane 3 */
  41. xor128 E0, E0
  42. pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word
  /* 0x1B selects lanes 3,2,1,0: the four state words are put in reverse order */
  43. shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD
  /*
   * Load the message block. %xmm7 holds the byte-reversal mask; pshufb
   * with it reverses all 16 bytes of each quarter, which both byte-swaps
   * every 32-bit word and places the first word of the quarter in the
   * top lane.
   */
  44. mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7
  45. movu128 0*16(%rdi), MSG0
  46. pshufb %xmm7, MSG0
  47. movu128 1*16(%rdi), MSG1
  48. pshufb %xmm7, MSG1
  49. movu128 2*16(%rdi), MSG2
  50. pshufb %xmm7, MSG2
  51. movu128 3*16(%rdi), MSG3
  52. pshufb %xmm7, MSG3
  /* The mask is dead after the last pshufb, so %xmm7 is reused below */
  53. /* Save hash values for addition after rounds */
  54. mova128 E0, %xmm7
  55. mova128 ABCD, %xmm8
  /*
   * First group: E0 is {E, 0, 0, 0}, so a plain paddd produces the message
   * words with E added into the top lane - the operand form sha1rnds4
   * expects. E1 receives a copy of ABCD for use by the next group.
   */
  56. /* Rounds 0-3 */
  57. paddd MSG0, E0
  58. mova128 ABCD, E1
  59. sha1rnds4 $0, E0, ABCD
  /*
   * Every following group repeats one pattern (E0 and E1 swap roles each
   * time):
   *   sha1nexte MSGn, Ex   Ex held the ABCD copy taken one group earlier;
   *                        it becomes MSGn with the next E value added
   *                        into the top lane
   *   mova128 ABCD, Ey     keep the current ABCD for the group after this
   *   sha1rnds4 $imm, Ex, ABCD   four rounds
   * The sha1msg1 / xor128 / sha1msg2 insns interleaved with these compute
   * message-schedule words that later groups consume (semantics per the
   * Intel SHA extensions reference).
   */
  60. /* Rounds 4-7 */
  61. sha1nexte MSG1, E1
  62. mova128 ABCD, E0
  63. sha1rnds4 $0, E1, ABCD
  64. sha1msg1 MSG1, MSG0
  65. /* Rounds 8-11 */
  66. sha1nexte MSG2, E0
  67. mova128 ABCD, E1
  68. sha1rnds4 $0, E0, ABCD
  69. sha1msg1 MSG2, MSG1
  70. xor128 MSG2, MSG0
  71. /* Rounds 12-15 */
  72. sha1nexte MSG3, E1
  73. mova128 ABCD, E0
  74. sha1msg2 MSG3, MSG0
  75. sha1rnds4 $0, E1, ABCD
  76. sha1msg1 MSG3, MSG2
  77. xor128 MSG3, MSG1
  78. /* Rounds 16-19 */
  79. sha1nexte MSG0, E0
  80. mova128 ABCD, E1
  81. sha1msg2 MSG0, MSG1
  82. sha1rnds4 $0, E0, ABCD
  83. sha1msg1 MSG0, MSG3
  84. xor128 MSG0, MSG2
  /*
   * The sha1rnds4 immediate changes from 0 to 1 here; per the Intel
   * reference it selects the round function and constant, which change
   * every 20 rounds.
   */
  85. /* Rounds 20-23 */
  86. sha1nexte MSG1, E1
  87. mova128 ABCD, E0
  88. sha1msg2 MSG1, MSG2
  89. sha1rnds4 $1, E1, ABCD
  90. sha1msg1 MSG1, MSG0
  91. xor128 MSG1, MSG3
  92. /* Rounds 24-27 */
  93. sha1nexte MSG2, E0
  94. mova128 ABCD, E1
  95. sha1msg2 MSG2, MSG3
  96. sha1rnds4 $1, E0, ABCD
  97. sha1msg1 MSG2, MSG1
  98. xor128 MSG2, MSG0
  99. /* Rounds 28-31 */
  100. sha1nexte MSG3, E1
  101. mova128 ABCD, E0
  102. sha1msg2 MSG3, MSG0
  103. sha1rnds4 $1, E1, ABCD
  104. sha1msg1 MSG3, MSG2
  105. xor128 MSG3, MSG1
  106. /* Rounds 32-35 */
  107. sha1nexte MSG0, E0
  108. mova128 ABCD, E1
  109. sha1msg2 MSG0, MSG1
  110. sha1rnds4 $1, E0, ABCD
  111. sha1msg1 MSG0, MSG3
  112. xor128 MSG0, MSG2
  113. /* Rounds 36-39 */
  114. sha1nexte MSG1, E1
  115. mova128 ABCD, E0
  116. sha1msg2 MSG1, MSG2
  117. sha1rnds4 $1, E1, ABCD
  118. sha1msg1 MSG1, MSG0
  119. xor128 MSG1, MSG3
  /* sha1rnds4 immediate 2 for rounds 40-59 */
  120. /* Rounds 40-43 */
  121. sha1nexte MSG2, E0
  122. mova128 ABCD, E1
  123. sha1msg2 MSG2, MSG3
  124. sha1rnds4 $2, E0, ABCD
  125. sha1msg1 MSG2, MSG1
  126. xor128 MSG2, MSG0
  127. /* Rounds 44-47 */
  128. sha1nexte MSG3, E1
  129. mova128 ABCD, E0
  130. sha1msg2 MSG3, MSG0
  131. sha1rnds4 $2, E1, ABCD
  132. sha1msg1 MSG3, MSG2
  133. xor128 MSG3, MSG1
  134. /* Rounds 48-51 */
  135. sha1nexte MSG0, E0
  136. mova128 ABCD, E1
  137. sha1msg2 MSG0, MSG1
  138. sha1rnds4 $2, E0, ABCD
  139. sha1msg1 MSG0, MSG3
  140. xor128 MSG0, MSG2
  141. /* Rounds 52-55 */
  142. sha1nexte MSG1, E1
  143. mova128 ABCD, E0
  144. sha1msg2 MSG1, MSG2
  145. sha1rnds4 $2, E1, ABCD
  146. sha1msg1 MSG1, MSG0
  147. xor128 MSG1, MSG3
  148. /* Rounds 56-59 */
  149. sha1nexte MSG2, E0
  150. mova128 ABCD, E1
  151. sha1msg2 MSG2, MSG3
  152. sha1rnds4 $2, E0, ABCD
  153. sha1msg1 MSG2, MSG1
  154. xor128 MSG2, MSG0
  /* sha1rnds4 immediate 3 for rounds 60-79 */
  155. /* Rounds 60-63 */
  156. sha1nexte MSG3, E1
  157. mova128 ABCD, E0
  158. sha1msg2 MSG3, MSG0
  159. sha1rnds4 $3, E1, ABCD
  160. sha1msg1 MSG3, MSG2
  161. xor128 MSG3, MSG1
  162. /* Rounds 64-67 */
  163. sha1nexte MSG0, E0
  164. mova128 ABCD, E1
  165. sha1msg2 MSG0, MSG1
  166. sha1rnds4 $3, E0, ABCD
  167. sha1msg1 MSG0, MSG3
  168. xor128 MSG0, MSG2
  /*
   * Schedule wind-down: from here no new sha1msg1 is started, and the
   * last two groups issue no schedule insns at all - only the words
   * already being computed are still consumed.
   */
  169. /* Rounds 68-71 */
  170. sha1nexte MSG1, E1
  171. mova128 ABCD, E0
  172. sha1msg2 MSG1, MSG2
  173. sha1rnds4 $3, E1, ABCD
  174. xor128 MSG1, MSG3
  175. /* Rounds 72-75 */
  176. sha1nexte MSG2, E0
  177. mova128 ABCD, E1
  178. sha1msg2 MSG2, MSG3
  179. sha1rnds4 $3, E0, ABCD
  180. /* Rounds 76-79 */
  181. sha1nexte MSG3, E1
  182. mova128 ABCD, E0
  183. sha1rnds4 $3, E1, ABCD
  /*
   * %xmm7 is {initial E, 0, 0, 0}. The same sha1nexte form used in the
   * rounds therefore yields, in the top lane of E0, the final E (derived
   * from the ABCD copy held in E0) plus the initial E. paddd adds the
   * initial ABCD saved in %xmm8.
   */
  184. /* Add current hash values with previously saved */
  185. sha1nexte %xmm7, E0
  186. paddd %xmm8, ABCD
  /*
   * Reverse the four lanes again (undoing the shuffle done at load time)
   * before the 16-byte store; the fifth word is stored from lane 3 of E0.
   */
  187. /* Write hash values back in the correct order */
  188. shuf128_32 $0x1B, ABCD, ABCD
  189. movu128 ABCD, 80(%rdi)
  190. extr128_32 $3, E0, 80+4*4(%rdi)
  191. ret
  192. .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
  /*
   * PSHUFFLE_BYTE_FLIP_MASK: 16-byte pshufb control vector that reverses
   * the byte order of a whole xmm register (destination byte i is taken
   * from source byte 15-i).
   *
   * The bytes are listed in memory order, lowest address first. The
   * object is 16-byte aligned because it is loaded with an aligned move,
   * and it lives in a mergeable constant section with 16-byte entries.
   */
  .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
  .p2align 4
PSHUFFLE_BYTE_FLIP_MASK:
  .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
  .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
  197. #endif