#! /usr/bin/env perl
# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# September 2011
#
# Assembler helpers for the Padlock engine. See e_padlock-x86.pl for
# details.
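#
# Illustrative invocation through the perlasm framework (the flavour
# and output file name here are examples, not fixed values):
#
#	perl e_padlock-x86_64.pl elf e_padlock-x86_64.s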

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$code=".text\n";

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20
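# The engine's load unit prefetches input ahead of the bytes it is told
# to process, and on affected parts such a prefetch can fault if it
# crosses into an unmapped page. The per-mode distances above are taken
# as the safe margins; the code below diverts any tail that would come
# closer than this to a page boundary through an on-stack bounce buffer.
# $PADLOCK_CHUNK bounds how much data is staged on the stack at a time.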

$ctx="%rdx";
$out="%rdi";
$inp="%rsi";
$len="%rcx";
$chunk="%rbx";

($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
                                 ("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8
	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
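	# The CPUID vendor string comes back in %ebx:%edx:%ecx; each
	# 4-byte group is compared as a little-endian dword, so the Perl
	# constants below are byte-reversed fragments of "CentaurHauls"
	# (VIA) and, further down, "  Shanghai  " (Zhaoxin).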
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx
	jne	.Lzhaoxin
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
	jne	.Lnoluck
	jmp	.LzhaoxinEnd
.Lzhaoxin:
	cmp	\$`"0x".unpack("H*",'hS  ')`,%ebx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'hgna')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'  ia')`,%ecx
	jne	.Lnoluck
.LzhaoxinEnd:
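	# Centaur-style extended CPUID: leaf 0xC0000000 reports the
	# highest supported centaur leaf, and leaf 0xC0000001 returns
	# the PadLock feature flags in %edx (RNG, ACE, PHE, PMM bits).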
	mov	\$0xC0000000,%eax
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability

.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent
.align	16
padlock_key_bswap:
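	# The first argument points at an expanded AES key; the dword at
	# offset 240 is assumed to be the AES_KEY rounds field, used here
	# as the count of 32-bit words to byte-swap.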
	mov	240($arg1),%edx
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap

.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp
	ret
.size	padlock_verify_context,.-padlock_verify_context

.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
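	# 8(%rsp) holds the flags pushed by the caller. The working
	# assumption here is that xcrypt keeps the loaded key cached and
	# consults EFLAGS bit 30 to decide whether to reload it; when the
	# context differs from the last one used, the pushf/popf pair
	# below forces a key reload on the next xcrypt.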
	mov	8(%rsp),%r8
	bt	\$30,%r8
	jnc	.Lverified
	cmp	(%rax),$ctx
	je	.Lverified
	pushf
	popf
.Lverified:
	mov	$ctx,(%rax)
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
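	# Rewriting EFLAGS with popf is understood to invalidate the
	# engine's cached key schedule, so the next xcrypt reloads the
	# key from memory.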
	pushf
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key

.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8
	mov	\$1,$len
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
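	# PadLock instructions are emitted as raw bytes so that the code
	# assembles even with assemblers that lack the xcrypt/xsha/xstore
	# mnemonics.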
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block

.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
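	# xstore writes hardware-RNG output to (%rdi); %edx carries the
	# caller-supplied control word (second C argument), which among
	# other things selects the RNG quality factor.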
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore

.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
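	# rep xsha1 takes the state at (%rdi), input at (%rsi) and the
	# byte count in %rcx. The state is staged in a scratch area on
	# the stack, sized 128+8 bytes here on the assumption that the
	# engine may scribble beyond the 20-byte SHA-1 state. %rax=0
	# requests a complete hash, i.e. the engine appends the padding.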
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
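	# Same as padlock_sha1_oneshot except that %rax=-1 tells the
	# engine to process whole blocks without appending any padding,
	# leaving the running state usable for further updates.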
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___
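
# generate_mode() emits one padlock_${mode}_encrypt function per cipher
# mode. Each function has two major paths: an aligned path, taken when
# both buffers are 16-byte aligned (or when the control word says the
# engine copes with misalignment itself), and a slow path that bounces
# data through an on-stack buffer of at most $PADLOCK_CHUNK bytes.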
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk		# chunk=chunk?:PADLOCK_CHUNK
___
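
# The block above carves the bounce buffer out of the stack: %rbp keeps
# the original %rsp, %rsp drops by one chunk if the output is
# misaligned, and $chunk becomes the size of the first pass (len mod
# $PADLOCK_CHUNK, or a full chunk if that remainder is zero).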
$code.=<<___ if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
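
# ctr32 only: the chunking above works around the engine's counter
# handling, which is assumed (as the fixups below imply) not to
# propagate a carry out of bit 15 of the big-endian counter. Each
# rep xcrypt is therefore kept from crossing a 2^16-block counter
# boundary, and the carry into bit 16 is applied in software.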
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
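
# On the last pass the engine's prefetch could run past the end of the
# input (or of the in-place output) into an unmapped page; if fewer
# than $PADLOCK_PREFETCH{$mode} bytes remain to the page boundary, the
# pass is rounded down here and the remainder is routed through
# .L${mode}_unaligned_tail, which stages it on the stack.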
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
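
# When a pass ends exactly on a 2^16-block boundary the low 16 bits of
# the counter have wrapped to zero, so the carry into bit 16 is added
# by hand before the next pass.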
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop
___
} else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___ if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
}
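
# On leaving the misaligned path the bounce buffer is wiped (16 bytes
# at a time) so no plaintext or key material lingers on the stack.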
$code.=<<___;
	cmp	%rbp,%rsp
	je	.L${mode}_done

	pxor	%xmm0,%xmm0
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
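
# Aligned-path variant of the prefetch workaround: if the end of the
# input lies within the prefetch distance of a page boundary, the last
# few blocks are split off and re-run through the on-stack buffer via
# .L${mode}_aligned_tail.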
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}
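
# The second argument to generate_mode() is the ModR/M byte that
# selects the xcrypt variant within the F3 0F A7 opcode family:
# C8 = ecb, D0 = cbc, D8 = ctr, E0 = cfb, E8 = ofb.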
&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
	.quad	0
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;