#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's a kind of low-hanging mechanical port from C for
# the time being... Except that it has two code paths: code suitable
# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
# later. Improvement varies from one benchmark and µ-arch to another.
# Vanilla code path is at most 20% faster than compiler-generated code
# [not very impressive], while PCLMULQDQ - whole 85%-160% better on
# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
# all CPU time is burnt in it...
  # Command line: [flavour] output.  A single argument containing a dot is
# taken to be the output file name and the flavour is left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 argument registers and the SEH tables are selected either by the
# flavour (nasm/masm/mingw64) or by an output name ending in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the x86_64-xlate.pl translator next to this script first, then
# in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# Fail loudly if the pipe cannot be started instead of silently producing
# no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
  # _mul_1x1: 64x64 -> 128-bit carry-less (polynomial) multiplication.
#
# In:	$a (%rax, shares the register with $lo), $b (%rbp),
#	$mask (%r8), which the caller loads with 0xf.
# Out:	$lo:$hi (%rax:%rdx).
# Writes:	%rax %rdx %rsi %rdi %rbx %rcx %rbp %r9-%r14 %xmm0 %xmm1 and flags;
#	nothing is saved here, the caller is responsible for preservation.
# Stack:	128+8 bytes; 0..127(%rsp) holds tab[0..15], the 16 XOR combinations
#	of a1, a2=a1<<1, a4=a1<<2 and a8=a<<3, where a1 is a with its three
#	top bits cleared.  The three top bits of a are folded into the result
#	separately at the start, then b is consumed four bits at a time.
#
# The map below yields seven register names for six variables; the last
# one (%r15) is simply left unused.
($lo,$hi)=("%rax","%rdx");	$a=$lo;
($i0,$i1)=("%rsi","%rdi");
($t0,$t1)=("%rbx","%rcx");
($b,$mask)=("%rbp","%r8");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
($R,$Tx)=("%xmm0","%xmm1");

$code.=<<___;
.text

.type	_mul_1x1,\@abi-omnipotent
.align	16
_mul_1x1:
.cfi_startproc
	sub	\$128+8,%rsp
.cfi_adjust_cfa_offset	128+8
	mov	\$-1,$a1
	lea	($a,$a),$i0
	shr	\$3,$a1
	lea	(,$a,4),$i1
	and	$a,$a1			# a1=a&0x1fffffffffffffff
	lea	(,$a,8),$a8
	sar	\$63,$a			# broadcast 63rd bit
	lea	($a1,$a1),$a2
	sar	\$63,$i0		# broadcast 62nd bit
	lea	(,$a1,4),$a4
	and	$b,$a
	sar	\$63,$i1		# broadcast 61st bit
	mov	$a,$hi			# $a is $lo
	shl	\$63,$lo
	and	$b,$i0
	shr	\$1,$hi
	mov	$i0,$t1
	shl	\$62,$i0
	and	$b,$i1
	shr	\$2,$t1
	xor	$i0,$lo
	mov	$i1,$t0
	shl	\$61,$i1
	xor	$t1,$hi
	shr	\$3,$t0
	xor	$i1,$lo
	xor	$t0,$hi

	mov	$a1,$a12
	movq	\$0,0(%rsp)		# tab[0]=0
	xor	$a2,$a12		# a1^a2
	mov	$a1,8(%rsp)		# tab[1]=a1
	mov	$a4,$a48
	mov	$a2,16(%rsp)		# tab[2]=a2
	xor	$a8,$a48		# a4^a8
	mov	$a12,24(%rsp)		# tab[3]=a1^a2

	xor	$a4,$a1
	mov	$a4,32(%rsp)		# tab[4]=a4
	xor	$a4,$a2
	mov	$a1,40(%rsp)		# tab[5]=a1^a4
	xor	$a4,$a12
	mov	$a2,48(%rsp)		# tab[6]=a2^a4
	xor	$a48,$a1		# a1^a4^a4^a8=a1^a8
	mov	$a12,56(%rsp)		# tab[7]=a1^a2^a4
	xor	$a48,$a2		# a2^a4^a4^a8=a2^a8

	mov	$a8,64(%rsp)		# tab[8]=a8
	xor	$a48,$a12		# a1^a2^a4^a4^a8=a1^a2^a8
	mov	$a1,72(%rsp)		# tab[9]=a1^a8
	xor	$a4,$a1			# a1^a8^a4
	mov	$a2,80(%rsp)		# tab[10]=a2^a8
	xor	$a4,$a2			# a2^a8^a4
	mov	$a12,88(%rsp)		# tab[11]=a1^a2^a8

	xor	$a4,$a12		# a1^a2^a8^a4
	mov	$a48,96(%rsp)		# tab[12]=a4^a8
	mov	$mask,$i0
	mov	$a1,104(%rsp)		# tab[13]=a1^a4^a8
	and	$b,$i0
	mov	$a2,112(%rsp)		# tab[14]=a2^a4^a8
	shr	\$4,$b
	mov	$a12,120(%rsp)		# tab[15]=a1^a2^a4^a8
	mov	$mask,$i1

	and	$b,$i1
	shr	\$4,$b

	movq	(%rsp,$i0,8),$R		# half of calculations is done in SSE2
	mov	$mask,$i0
	and	$b,$i0
	shr	\$4,$b
___

# Each iteration consumes two 4-bit windows of b.  The window indexed by
# i1 is shifted by 8*n-4 bits and accumulated into lo:hi in integer
# registers; the window indexed by i0 is shifted by n bytes and
# accumulated in the SSE2 register R.
for ($n=1;$n<8;$n++) {
$code.=<<___;
	mov	(%rsp,$i1,8),$t1
	mov	$mask,$i1
	mov	$t1,$t0
	shl	\$`8*$n-4`,$t1
	and	$b,$i1
	movq	(%rsp,$i0,8),$Tx
	shr	\$`64-(8*$n-4)`,$t0
	xor	$t1,$lo
	pslldq	\$$n,$Tx
	mov	$mask,$i0
	shr	\$4,$b
	xor	$t0,$hi
	and	$b,$i0
	shr	\$4,$b
	pxor	$Tx,$R
___
}

# The loop leaves $n at 8, so the shift counts below evaluate to 60 and 4:
# this is the last window of b.  Afterwards the two halves of R are folded
# into lo:hi.
$code.=<<___;
	mov	(%rsp,$i1,8),$t1
	mov	$t1,$t0
	shl	\$`8*$n-4`,$t1
	movq	$R,$i0
	shr	\$`64-(8*$n-4)`,$t0
	xor	$t1,$lo
	psrldq	\$8,$R
	xor	$t0,$hi
	movq	$R,$i1
	xor	$i0,$lo
	xor	$i1,$hi

	add	\$128+8,%rsp
.cfi_adjust_cfa_offset	-128-8
	ret
.Lend_mul_1x1:
.cfi_endproc
.size	_mul_1x1,.-_mul_1x1
___
  # bn_GF2m_mul_2x2: 128x128 -> 256-bit carry-less multiplication.
#
# Operands arrive as (rp, a1, a0, b1, b0) in the registers listed below and
# the four result words are stored to 0..24(rp).  On Win64 the fifth
# argument is on the stack, so the register named for $b0 there is only a
# scratch register that the vanilla path loads from the stack.
# Note that $a1 is re-bound here; the text of _mul_1x1 was already generated
# with its previous value.
#
# Two code paths, chosen at run time by bit 33 of OPENSSL_ia32cap_P:
#  - bit set:	three PCLMULQDQ multiplications combined Karatsuba-style;
#  - bit clear:	.Lvanilla_mul_2x2, three calls to _mul_1x1 with a
#		8*17-byte frame laid out as
#		  0..31		a0*b0 (0,8) and a1*b1 (16,24) partial products
#		  32..71	saved rp, a1, a0, b1, b0
#		  8*10..8*14	saved %r14 %r13 %r12 %rbp %rbx
#		  8*15..8*16	saved %rdi %rsi (Win64 only)
($rp,$a1,$a0,$b1,$b0) = $win64?	("%rcx","%rdx","%r8", "%r9","%r10") :	# Win64 order
				("%rdi","%rsi","%rdx","%rcx","%r8");	# Unix order

$code.=<<___;
.extern	OPENSSL_ia32cap_P
.globl	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,\@abi-omnipotent
.align	16
bn_GF2m_mul_2x2:
.cfi_startproc
	mov	%rsp,%rax		# entry rsp; se_handler reads it from context->Rax
	mov	OPENSSL_ia32cap_P(%rip),%r10
	bt	\$33,%r10
	jnc	.Lvanilla_mul_2x2

	movq	$a1,%xmm0
	movq	$b1,%xmm1
	movq	$a0,%xmm2
___
$code.=<<___ if ($win64);
	movq	40(%rsp),%xmm3
___
$code.=<<___ if (!$win64);
	movq	$b0,%xmm3
___
$code.=<<___;
	movdqa	%xmm0,%xmm4
	movdqa	%xmm1,%xmm5
	pclmulqdq	\$0,%xmm1,%xmm0	# a1·b1
	pxor	%xmm2,%xmm4
	pxor	%xmm3,%xmm5
	pclmulqdq	\$0,%xmm3,%xmm2	# a0·b0
	pclmulqdq	\$0,%xmm5,%xmm4	# (a0+a1)·(b0+b1)
	xorps	%xmm0,%xmm4
	xorps	%xmm2,%xmm4		# (a0+a1)·(b0+b1)-a0·b0-a1·b1
	movdqa	%xmm4,%xmm5
	pslldq	\$8,%xmm4
	psrldq	\$8,%xmm5
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm0
	movdqu	%xmm2,0($rp)
	movdqu	%xmm0,16($rp)
	ret

.align	16
.Lvanilla_mul_2x2:
	lea	-8*17(%rsp),%rsp
.cfi_adjust_cfa_offset	8*17
___
$code.=<<___ if ($win64);
	mov	`8*17+40`(%rsp),$b0
	mov	%rdi,8*15(%rsp)
	mov	%rsi,8*16(%rsp)
___
$code.=<<___;
	mov	%r14,8*10(%rsp)
.cfi_rel_offset	%r14,8*10
	mov	%r13,8*11(%rsp)
.cfi_rel_offset	%r13,8*11
	mov	%r12,8*12(%rsp)
.cfi_rel_offset	%r12,8*12
	mov	%rbp,8*13(%rsp)
.cfi_rel_offset	%rbp,8*13
	mov	%rbx,8*14(%rsp)
.cfi_rel_offset	%rbx,8*14
.Lbody_mul_2x2:
	mov	$rp,32(%rsp)		# save the arguments
	mov	$a1,40(%rsp)
	mov	$a0,48(%rsp)
	mov	$b1,56(%rsp)
	mov	$b0,64(%rsp)

	mov	\$0xf,$mask
	mov	$a1,$a
	mov	$b1,$b
	call	_mul_1x1		# a1·b1
	mov	$lo,16(%rsp)
	mov	$hi,24(%rsp)

	mov	48(%rsp),$a
	mov	64(%rsp),$b
	call	_mul_1x1		# a0·b0
	mov	$lo,0(%rsp)
	mov	$hi,8(%rsp)

	mov	40(%rsp),$a
	mov	56(%rsp),$b
	xor	48(%rsp),$a
	xor	64(%rsp),$b
	call	_mul_1x1		# (a0+a1)·(b0+b1)
___
	# Scratch registers for the two stored partial products while the
	# three results are recombined.
	@r=("%rbx","%rcx","%rdi","%rsi");
$code.=<<___;
	mov	0(%rsp),@r[0]
	mov	8(%rsp),@r[1]
	mov	16(%rsp),@r[2]
	mov	24(%rsp),@r[3]
	mov	32(%rsp),%rbp

	xor	$hi,$lo
	xor	@r[1],$hi
	xor	@r[0],$lo
	mov	@r[0],0(%rbp)
	xor	@r[2],$hi
	mov	@r[3],24(%rbp)
	xor	@r[3],$lo
	xor	@r[3],$hi
	xor	$hi,$lo
	mov	$hi,16(%rbp)
	mov	$lo,8(%rbp)

	mov	8*10(%rsp),%r14
.cfi_restore	%r14
	mov	8*11(%rsp),%r13
.cfi_restore	%r13
	mov	8*12(%rsp),%r12
.cfi_restore	%r12
	mov	8*13(%rsp),%rbp
.cfi_restore	%rbp
	mov	8*14(%rsp),%rbx
.cfi_restore	%rbx
___
$code.=<<___ if ($win64);
	mov	8*15(%rsp),%rdi
	mov	8*16(%rsp),%rsi
___
$code.=<<___;
	lea	8*17(%rsp),%rsp
.cfi_adjust_cfa_offset	-8*17
.Lepilogue_mul_2x2:
	ret
.Lend_mul_2x2:
.cfi_endproc
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.asciz	"GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___
  # Win64 structured-exception support, emitted only for Win64 flavours.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# se_handler covers the range .Lvanilla_mul_2x2 ... .Lend_mul_2x2:
#  - if context->Rip is before .Lbody_mul_2x2, the non-volatile registers
#    are not touched by the handler and the frame pointer is taken from
#    context->Rax;
#  - if context->Rip is at or after .Lepilogue_mul_2x2, context->Rsp is
#    written back unchanged;
#  - otherwise the handler reloads the saved registers from the frame
#    exactly as the epilogue does and steps Rsp over the 8*17-byte frame.
# _mul_1x1 needs no handler: its fixed 128+8-byte allocation is described
# by plain unwind codes in .LSEH_info_1x1.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern __imp_RtlVirtualUnwind

.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lbody_mul_2x2(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<"prologue" label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue_mul_2x2(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>="epilogue" label
	jae	.Lin_prologue

	mov	8*10(%rax),%r14		# mimic epilogue
	mov	8*11(%rax),%r13
	mov	8*12(%rax),%r12
	mov	8*13(%rax),%rbp
	mov	8*14(%rax),%rbx
	mov	8*15(%rax),%rdi
	mov	8*16(%rax),%rsi

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	lea	8*17(%rax),%rax

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords, the count for rep movsq
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	_mul_1x1
	.rva	.Lend_mul_1x1
	.rva	.LSEH_info_1x1

	.rva	.Lvanilla_mul_2x2
	.rva	.Lend_mul_2x2
	.rva	.LSEH_info_2x2
.section	.xdata
.align	8
.LSEH_info_1x1:
	.byte	0x01,0x07,0x02,0x00
	.byte	0x07,0x01,0x11,0x00	# sub rsp,128+8
.LSEH_info_2x2:
	.byte	9,0,0,0
	.rva	se_handler
___
}
  # Replace every `expr` in the generated text with the value of the Perl
# expression (used for computed shift counts and stack offsets).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# STDOUT is the pipe into the translator; checking close is the only place
# a failure of the child process becomes visible to this script.
close STDOUT or die "error closing STDOUT: $!";