x86_64-gf2m.pl 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426
  1. #! /usr/bin/env perl
  2. # Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # May 2011
  17. #
  18. # The module implements bn_GF2m_mul_2x2 polynomial multiplication used
  19. # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
  20. # the time being... Except that it has two code paths: code suitable
  21. # for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
  22. # later. Improvement varies from one benchmark and µ-arch to another.
  23. # Vanilla code path is at most 20% faster than compiler-generated code
  24. # [not very impressive], while PCLMULQDQ - whole 85%-160% better on
  25. # 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
  26. # these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
  27. # all CPU time is burnt in it...
  # Command-line handling and output plumbing. Everything this script later
  # prints to STDOUT is piped through the x86_64-xlate.pl translator.
  28. # $output is the last argument if it looks like a file (it has an extension)
  29. # $flavour is the first argument if it doesn't look like a file
  30. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  31. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  # Win64 conventions are selected by a nasm/masm/mingw64 flavour or by an
  # output name ending in .asm.
  32. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  # $dir is the directory part of this script's own path ($0), with its
  # trailing separator kept.
  33. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  # Look for the translator beside this script first, then in ../../perlasm/
  # relative to it; give up if neither file exists.
  34. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  35. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  36. die "can't locate x86_64-xlate.pl";
  # Start the translator under the currently running perl ($^X), passing it
  # the flavour and the output file name.
  37. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  38. or die "can't call $xlate: $!";
  # Alias STDOUT to the pipe so the final print of $code feeds the translator.
  39. *STDOUT=*OUT;
  # Register map for _mul_1x1.
  #   $lo/$hi  %rax/%rdx  128-bit result (low/high); $a aliases $lo, so the
  #                       input operand a arrives in %rax and is overwritten
  #   $i0/$i1  %rsi/%rdi  4-bit table indices taken from b
  #   $t0/$t1  %rbx/%rcx  scratch for shifted table entries
  #   $b       %rbp       input operand b, shifted right as it is consumed
  #   $mask    %r8        nibble mask, supplied by the caller (the caller in
  #                       this file loads 0xf)
  #   $a1..$a48           %r9..%r14; the map() yields seven names for six
  #                       variables, so %r15 is dropped and never used
  #   $R/$Tx   %xmm0/%xmm1  SSE accumulator and temporary
  41. ($lo,$hi)=("%rax","%rdx"); $a=$lo;
  41. ($i0,$i1)=("%rsi","%rdi");
  42. ($t0,$t1)=("%rbx","%rcx");
  43. ($b,$mask)=("%rbp","%r8");
  44. ($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
  45. ($R,$Tx)=("%xmm0","%xmm1");
  # _mul_1x1: 64x64 -> 128-bit polynomial multiply of a (%rax) by b (%rbp),
  # result returned in %rdx:%rax. It writes every register in the map above
  # and does not save any of them.
  #
  # First part of the routine:
  #  - reserves 128+8 bytes of stack for a 16-entry table of 8-byte values;
  #  - masks off the top three bits of a (a1 = a & 0x1fffffffffffffff) for
  #    the table, and handles those three bits separately: each is broadcast
  #    with sar, ANDed with b, and the result is shifted into $lo/$hi;
  #  - fills tab[0..15] at 0..120(%rsp) with the XOR combinations of
  #    a1, a2=a1<<1, a4=a1<<2 and a8=a<<3 named in the inline comments;
  #  - extracts the first three nibbles of b and loads tab[nibble 0] into $R.
  #
  # NOTE(review): the inline comment on the `xor $a48,$a2` line below ends in
  # "=a1^a8"; the value actually produced is a2^a8 (it is stored as tab[10]).
  46. $code.=<<___;
  47. .text
  48. .type _mul_1x1,\@abi-omnipotent
  49. .align 16
  50. _mul_1x1:
  51. .cfi_startproc
  52. sub \$128+8,%rsp
  53. .cfi_adjust_cfa_offset 128+8
  54. mov \$-1,$a1
  55. lea ($a,$a),$i0
  56. shr \$3,$a1
  57. lea (,$a,4),$i1
  58. and $a,$a1 # a1=a&0x1fffffffffffffff
  59. lea (,$a,8),$a8
  60. sar \$63,$a # broadcast 63rd bit
  61. lea ($a1,$a1),$a2
  62. sar \$63,$i0 # broadcast 62nd bit
  63. lea (,$a1,4),$a4
  64. and $b,$a
  65. sar \$63,$i1 # broadcast 61st bit
  66. mov $a,$hi # $a is $lo
  67. shl \$63,$lo
  68. and $b,$i0
  69. shr \$1,$hi
  70. mov $i0,$t1
  71. shl \$62,$i0
  72. and $b,$i1
  73. shr \$2,$t1
  74. xor $i0,$lo
  75. mov $i1,$t0
  76. shl \$61,$i1
  77. xor $t1,$hi
  78. shr \$3,$t0
  79. xor $i1,$lo
  80. xor $t0,$hi
  81. mov $a1,$a12
  82. movq \$0,0(%rsp) # tab[0]=0
  83. xor $a2,$a12 # a1^a2
  84. mov $a1,8(%rsp) # tab[1]=a1
  85. mov $a4,$a48
  86. mov $a2,16(%rsp) # tab[2]=a2
  87. xor $a8,$a48 # a4^a8
  88. mov $a12,24(%rsp) # tab[3]=a1^a2
  89. xor $a4,$a1
  90. mov $a4,32(%rsp) # tab[4]=a4
  91. xor $a4,$a2
  92. mov $a1,40(%rsp) # tab[5]=a1^a4
  93. xor $a4,$a12
  94. mov $a2,48(%rsp) # tab[6]=a2^a4
  95. xor $a48,$a1 # a1^a4^a4^a8=a1^a8
  96. mov $a12,56(%rsp) # tab[7]=a1^a2^a4
  97. xor $a48,$a2 # a2^a4^a4^a8=a1^a8
  98. mov $a8,64(%rsp) # tab[8]=a8
  99. xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8
  100. mov $a1,72(%rsp) # tab[9]=a1^a8
  101. xor $a4,$a1 # a1^a8^a4
  102. mov $a2,80(%rsp) # tab[10]=a2^a8
  103. xor $a4,$a2 # a2^a8^a4
  104. mov $a12,88(%rsp) # tab[11]=a1^a2^a8
  105. xor $a4,$a12 # a1^a2^a8^a4
  106. mov $a48,96(%rsp) # tab[12]=a4^a8
  107. mov $mask,$i0
  108. mov $a1,104(%rsp) # tab[13]=a1^a4^a8
  109. and $b,$i0
  110. mov $a2,112(%rsp) # tab[14]=a2^a4^a8
  111. shr \$4,$b
  112. mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8
  113. mov $mask,$i1
  114. and $b,$i1
  115. shr \$4,$b
  116. movq (%rsp,$i0,8),$R # half of calculations is done in SSE2
  117. mov $mask,$i0
  118. and $b,$i0
  119. shr \$4,$b
  120. ___
  # Unrolled body, emitted seven times (n = 1..7). Each copy consumes two
  # nibbles of b:
  #  - the odd nibble 2n-1 (index already in $i1) is looked up into $t1 and
  #    folded into $lo/$hi with a left shift of 8n-4 bits and the matching
  #    right shift of 64-(8n-4) bits;
  #  - the even nibble 2n (index already in $i0) is looked up into $Tx,
  #    shifted left by n bytes with pslldq and XORed into $R.
  # While doing so it extracts the next two indices from b.
  121. for ($n=1;$n<8;$n++) {
  122. $code.=<<___;
  123. mov (%rsp,$i1,8),$t1
  124. mov $mask,$i1
  125. mov $t1,$t0
  126. shl \$`8*$n-4`,$t1
  127. and $b,$i1
  128. movq (%rsp,$i0,8),$Tx
  129. shr \$`64-(8*$n-4)`,$t0
  130. xor $t1,$lo
  131. pslldq \$$n,$Tx
  132. mov $mask,$i0
  133. shr \$4,$b
  134. xor $t0,$hi
  135. and $b,$i0
  136. shr \$4,$b
  137. pxor $Tx,$R
  138. ___
  139. }
  # Tail. $n is 8 here (the loop's exit value), so the last odd nibble is
  # shifted by 60/4 bits. The low and high halves of the SSE accumulator $R
  # are then XORed into $lo and $hi, the table space is released and the
  # routine returns with the product in $hi:$lo.
  140. $code.=<<___;
  141. mov (%rsp,$i1,8),$t1
  142. mov $t1,$t0
  143. shl \$`8*$n-4`,$t1
  144. movq $R,$i0
  145. shr \$`64-(8*$n-4)`,$t0
  146. xor $t1,$lo
  147. psrldq \$8,$R
  148. xor $t0,$hi
  149. movq $R,$i1
  150. xor $i0,$lo
  151. xor $i1,$hi
  152. add \$128+8,%rsp
  153. .cfi_adjust_cfa_offset -128-8
  154. ret
  155. .Lend_mul_1x1:
  156. .cfi_endproc
  157. .size _mul_1x1,.-_mul_1x1
  158. ___
  # bn_GF2m_mul_2x2: multiplies the two-word polynomial a1:a0 by b1:b0 and
  # stores the four-word product at rp[0..3], using three one-word products.
  #
  # Argument registers (note that $a1 is re-bound here; it no longer names
  # the _mul_1x1 scratch register). On Win64 the fifth argument b0 is not in
  # a register on entry: both code paths below load it from the stack, the
  # vanilla path into the %r10 named here.
  159. ($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order
  160. ("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order
  # Entry. %rax receives the entry stack pointer; the SEH handler in this
  # file treats context->Rax as the stack pointer while Rip is before
  # .Lbody_mul_2x2. Bit 33 of the first 64 bits of OPENSSL_ia32cap_P gates
  # the pclmulqdq path; when it is clear, control goes to .Lvanilla_mul_2x2.
  161. $code.=<<___;
  162. .extern OPENSSL_ia32cap_P
  163. .globl bn_GF2m_mul_2x2
  164. .type bn_GF2m_mul_2x2,\@abi-omnipotent
  165. .align 16
  166. bn_GF2m_mul_2x2:
  167. .cfi_startproc
  168. mov %rsp,%rax
  169. mov OPENSSL_ia32cap_P(%rip),%r10
  170. bt \$33,%r10
  171. jnc .Lvanilla_mul_2x2
  172. movq $a1,%xmm0
  173. movq $b1,%xmm1
  174. movq $a0,%xmm2
  175. ___
  # b0 -> %xmm3: from the stack slot at 40(%rsp) on Win64, from its argument
  # register otherwise.
  176. $code.=<<___ if ($win64);
  177. movq 40(%rsp),%xmm3
  178. ___
  179. $code.=<<___ if (!$win64);
  180. movq $b0,%xmm3
  181. ___
  # pclmulqdq path (Karatsuba): three carry-less multiplies give a1*b1,
  # a0*b0 and (a0+a1)*(b0+b1). The middle term is reduced by XORing in the
  # other two, split at 64 bits, and folded into the low product (%xmm2,
  # stored at rp+0) and the high product (%xmm0, stored at rp+16). This path
  # uses no stack and returns directly.
  #
  # Vanilla path prologue: reserves a frame of 8*17 bytes.
  182. $code.=<<___;
  183. movdqa %xmm0,%xmm4
  184. movdqa %xmm1,%xmm5
  185. pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
  186. pxor %xmm2,%xmm4
  187. pxor %xmm3,%xmm5
  188. pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
  189. pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
  190. xorps %xmm0,%xmm4
  191. xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
  192. movdqa %xmm4,%xmm5
  193. pslldq \$8,%xmm4
  194. psrldq \$8,%xmm5
  195. pxor %xmm4,%xmm2
  196. pxor %xmm5,%xmm0
  197. movdqu %xmm2,0($rp)
  198. movdqu %xmm0,16($rp)
  199. ret
  200. .align 16
  201. .Lvanilla_mul_2x2:
  202. lea -8*17(%rsp),%rsp
  203. .cfi_adjust_cfa_offset 8*17
  204. ___
  # Win64 only: fetch b0 from the caller's stack (offset adjusted for the new
  # frame) and save %rdi/%rsi in frame slots 15 and 16.
  205. $code.=<<___ if ($win64);
  206. mov `8*17+40`(%rsp),$b0
  207. mov %rdi,8*15(%rsp)
  208. mov %rsi,8*16(%rsp)
  209. ___
  # Frame layout used below (byte offsets from %rsp):
  #    0, 8     a0*b0 (low, high)
  #   16, 24    a1*b1 (low, high)
  #   32..64    saved arguments rp, a1, a0, b1, b0
  #   8*10..14  saved %r14, %r13, %r12, %rbp, %rbx
  #   8*15, 16  saved %rdi, %rsi (Win64 only)
  # _mul_1x1 is called three times with the mask 0xf in $mask; each call
  # takes its operands in $a/$b and returns the product in $hi:$lo.
  210. $code.=<<___;
  211. mov %r14,8*10(%rsp)
  212. .cfi_rel_offset %r14,8*10
  213. mov %r13,8*11(%rsp)
  214. .cfi_rel_offset %r13,8*11
  215. mov %r12,8*12(%rsp)
  216. .cfi_rel_offset %r12,8*12
  217. mov %rbp,8*13(%rsp)
  218. .cfi_rel_offset %rbp,8*13
  219. mov %rbx,8*14(%rsp)
  220. .cfi_rel_offset %rbx,8*14
  221. .Lbody_mul_2x2:
  222. mov $rp,32(%rsp) # save the arguments
  223. mov $a1,40(%rsp)
  224. mov $a0,48(%rsp)
  225. mov $b1,56(%rsp)
  226. mov $b0,64(%rsp)
  227. mov \$0xf,$mask
  228. mov $a1,$a
  229. mov $b1,$b
  230. call _mul_1x1 # a1·b1
  231. mov $lo,16(%rsp)
  232. mov $hi,24(%rsp)
  233. mov 48(%rsp),$a
  234. mov 64(%rsp),$b
  235. call _mul_1x1 # a0·b0
  236. mov $lo,0(%rsp)
  237. mov $hi,8(%rsp)
  238. mov 40(%rsp),$a
  239. mov 56(%rsp),$b
  240. xor 48(%rsp),$a
  241. xor 64(%rsp),$b
  242. call _mul_1x1 # (a0+a1)·(b0+b1)
  243. ___
  # Recombination. @r holds a0*b0 (low, high) and a1*b1 (low, high); $hi:$lo
  # holds the middle product. The XOR sequence below produces
  #   rp[0] = r0
  #   rp[1] = lo ^ r0 ^ r1 ^ r2
  #   rp[2] = hi ^ r1 ^ r2 ^ r3
  #   rp[3] = r3
  # after which the saved registers are reloaded from the frame.
  244. @r=("%rbx","%rcx","%rdi","%rsi");
  245. $code.=<<___;
  246. mov 0(%rsp),@r[0]
  247. mov 8(%rsp),@r[1]
  248. mov 16(%rsp),@r[2]
  249. mov 24(%rsp),@r[3]
  250. mov 32(%rsp),%rbp
  251. xor $hi,$lo
  252. xor @r[1],$hi
  253. xor @r[0],$lo
  254. mov @r[0],0(%rbp)
  255. xor @r[2],$hi
  256. mov @r[3],24(%rbp)
  257. xor @r[3],$lo
  258. xor @r[3],$hi
  259. xor $hi,$lo
  260. mov $hi,16(%rbp)
  261. mov $lo,8(%rbp)
  262. mov 8*10(%rsp),%r14
  263. .cfi_restore %r14
  264. mov 8*11(%rsp),%r13
  265. .cfi_restore %r13
  266. mov 8*12(%rsp),%r12
  267. .cfi_restore %r12
  268. mov 8*13(%rsp),%rbp
  269. .cfi_restore %rbp
  270. mov 8*14(%rsp),%rbx
  271. .cfi_restore %rbx
  272. ___
  # Win64 only: reload %rdi/%rsi from frame slots 15 and 16.
  273. $code.=<<___ if ($win64);
  274. mov 8*15(%rsp),%rdi
  275. mov 8*16(%rsp),%rsi
  276. ___
  # Release the 8*17-byte frame and return; also emits the identification
  # string.
  277. $code.=<<___;
  278. lea 8*17(%rsp),%rsp
  279. .cfi_adjust_cfa_offset -8*17
  280. .Lepilogue_mul_2x2:
  281. ret
  282. .Lend_mul_2x2:
  283. .cfi_endproc
  284. .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
  285. .asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  286. .align 16
  287. ___
  # Win64 structured-exception support, emitted only when $win64 is set:
  # the se_handler routine plus the .pdata/.xdata records that refer to it.
  288. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  289. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  290. if ($win64) {
  # Handler arguments, in the order given in the prototype comment above.
  291. $rec="%rcx";
  292. $frame="%rdx";
  293. $context="%r8";
  294. $disp="%r9";
  # se_handler saves the registers it uses plus the flags, then decides which
  # stack pointer to report by comparing context->Rip against two labels of
  # bn_GF2m_mul_2x2:
  #  - Rip <  .Lbody_mul_2x2     : report context->Rax as the stack pointer
  #                                (the function copies %rsp to %rax on entry);
  #  - Rip >= .Lepilogue_mul_2x2 : report context->Rsp unchanged;
  #  - otherwise                 : reload %r14, %r13, %r12, %rbp, %rbx, %rdi
  #                                and %rsi from frame slots 10..16 at
  #                                context->Rsp, write them into the context,
  #                                and report Rsp + 8*17.
  # It then copies 154 quadwords of the context to disp->ContextRecord, calls
  # RtlVirtualUnwind with the arguments annotated inline, and returns 1
  # (ExceptionContinueSearch).
  #
  # .pdata holds two entries: _mul_1x1 .. .Lend_mul_1x1 with the unwind codes
  # at .LSEH_info_1x1, and .Lvanilla_mul_2x2 .. .Lend_mul_2x2 with
  # .LSEH_info_2x2, which names se_handler. The pclmulqdq path of
  # bn_GF2m_mul_2x2 lies before .Lvanilla_mul_2x2 and is not in either range.
  295. $code.=<<___;
  296. .extern __imp_RtlVirtualUnwind
  297. .type se_handler,\@abi-omnipotent
  298. .align 16
  299. se_handler:
  300. push %rsi
  301. push %rdi
  302. push %rbx
  303. push %rbp
  304. push %r12
  305. push %r13
  306. push %r14
  307. push %r15
  308. pushfq
  309. sub \$64,%rsp
  310. mov 120($context),%rax # pull context->Rax
  311. mov 248($context),%rbx # pull context->Rip
  312. lea .Lbody_mul_2x2(%rip),%r10
  313. cmp %r10,%rbx # context->Rip<"prologue" label
  314. jb .Lin_prologue
  315. mov 152($context),%rax # pull context->Rsp
  316. lea .Lepilogue_mul_2x2(%rip),%r10
  317. cmp %r10,%rbx # context->Rip>="epilogue" label
  318. jae .Lin_prologue
  319. mov 8*10(%rax),%r14 # mimic epilogue
  320. mov 8*11(%rax),%r13
  321. mov 8*12(%rax),%r12
  322. mov 8*13(%rax),%rbp
  323. mov 8*14(%rax),%rbx
  324. mov 8*15(%rax),%rdi
  325. mov 8*16(%rax),%rsi
  326. mov %rbx,144($context) # restore context->Rbx
  327. mov %rbp,160($context) # restore context->Rbp
  328. mov %rsi,168($context) # restore context->Rsi
  329. mov %rdi,176($context) # restore context->Rdi
  330. mov %r12,216($context) # restore context->R12
  331. mov %r13,224($context) # restore context->R13
  332. mov %r14,232($context) # restore context->R14
  333. lea 8*17(%rax),%rax
  334. .Lin_prologue:
  335. mov %rax,152($context) # restore context->Rsp
  336. mov 40($disp),%rdi # disp->ContextRecord
  337. mov $context,%rsi # context
  338. mov \$154,%ecx # sizeof(CONTEXT)
  339. .long 0xa548f3fc # cld; rep movsq
  340. mov $disp,%rsi
  341. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  342. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  343. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  344. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  345. mov 40(%rsi),%r10 # disp->ContextRecord
  346. lea 56(%rsi),%r11 # &disp->HandlerData
  347. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  348. mov %r10,32(%rsp) # arg5
  349. mov %r11,40(%rsp) # arg6
  350. mov %r12,48(%rsp) # arg7
  351. mov %rcx,56(%rsp) # arg8, (NULL)
  352. call *__imp_RtlVirtualUnwind(%rip)
  353. mov \$1,%eax # ExceptionContinueSearch
  354. add \$64,%rsp
  355. popfq
  356. pop %r15
  357. pop %r14
  358. pop %r13
  359. pop %r12
  360. pop %rbp
  361. pop %rbx
  362. pop %rdi
  363. pop %rsi
  364. ret
  365. .size se_handler,.-se_handler
  366. .section .pdata
  367. .align 4
  368. .rva _mul_1x1
  369. .rva .Lend_mul_1x1
  370. .rva .LSEH_info_1x1
  371. .rva .Lvanilla_mul_2x2
  372. .rva .Lend_mul_2x2
  373. .rva .LSEH_info_2x2
  374. .section .xdata
  375. .align 8
  376. .LSEH_info_1x1:
  377. .byte 0x01,0x07,0x02,0x00
  378. .byte 0x07,0x01,0x11,0x00 # sub rsp,128+8
  379. .LSEH_info_2x2:
  380. .byte 9,0,0,0
  381. .rva se_handler
  382. ___
  383. }
  # Replace every backtick-quoted expression left in the generated text
  # (for example the shift counts and the Win64 stack offset) with the value
  # Perl computes for it.
  384. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  # Send the finished assembly to the translator pipe that STDOUT was aliased
  # to earlier in the script.
  385. print $code;
  # Closing the pipe waits for the translator; die if the close reports failure.
  386. close STDOUT or die "error closing STDOUT: $!";