  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  4. # project. The module is, however, dual licensed under OpenSSL and
  5. # CRYPTOGAMS licenses depending on where you obtain it. For further
  6. # details see http://www.openssl.org/~appro/cryptogams/.
  7. # ====================================================================
  8. # September 2011
  9. #
  10. # Assembler helpers for Padlock engine. See even e_padlock-x86.pl for
  11. # details.
  12. $flavour = shift;
  13. $output = shift;
  14. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  15. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  16. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  17. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  18. ( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
  19. die "can't locate x86_64-xlate.pl";
  20. open STDOUT,"| $^X $xlate $flavour $output";
  21. $code=".text\n";
  22. $PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20
  23. $ctx="%rdx";
  24. $out="%rdi";
  25. $inp="%rsi";
  26. $len="%rcx";
  27. $chunk="%rbx";
  28. ($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
  29. ("%rdi","%rsi","%rdx","%rcx"); # Unix order
  30. $code.=<<___;
  31. .globl padlock_capability
  32. .type padlock_capability,\@abi-omnipotent
  33. .align 16
  34. padlock_capability:
  35. mov %rbx,%r8
  36. xor %eax,%eax
  37. cpuid
  38. xor %eax,%eax
  39. cmp \$`"0x".unpack("H*",'tneC')`,%ebx
  40. jne .Lnoluck
  41. cmp \$`"0x".unpack("H*",'Hrua')`,%edx
  42. jne .Lnoluck
  43. cmp \$`"0x".unpack("H*",'slua')`,%ecx
  44. jne .Lnoluck
  45. mov \$0xC0000000,%eax
  46. cpuid
  47. mov %eax,%edx
  48. xor %eax,%eax
  49. cmp \$0xC0000001,%edx
  50. jb .Lnoluck
  51. mov \$0xC0000001,%eax
  52. cpuid
  53. mov %edx,%eax
  54. and \$0xffffffef,%eax
  55. or \$0x10,%eax # set Nano bit#4
  56. .Lnoluck:
  57. mov %r8,%rbx
  58. ret
  59. .size padlock_capability,.-padlock_capability
  60. .globl padlock_key_bswap
  61. .type padlock_key_bswap,\@abi-omnipotent,0
  62. .align 16
  63. padlock_key_bswap:
  64. mov 240($arg1),%edx
  65. .Lbswap_loop:
  66. mov ($arg1),%eax
  67. bswap %eax
  68. mov %eax,($arg1)
  69. lea 4($arg1),$arg1
  70. sub \$1,%edx
  71. jnz .Lbswap_loop
  72. ret
  73. .size padlock_key_bswap,.-padlock_key_bswap
  74. .globl padlock_verify_context
  75. .type padlock_verify_context,\@abi-omnipotent
  76. .align 16
  77. padlock_verify_context:
  78. mov $arg1,$ctx
  79. pushf
  80. lea .Lpadlock_saved_context(%rip),%rax
  81. call _padlock_verify_ctx
  82. lea 8(%rsp),%rsp
  83. ret
  84. .size padlock_verify_context,.-padlock_verify_context
  85. .type _padlock_verify_ctx,\@abi-omnipotent
  86. .align 16
  87. _padlock_verify_ctx:
  88. mov 8(%rsp),%r8
  89. bt \$30,%r8
  90. jnc .Lverified
  91. cmp (%rax),$ctx
  92. je .Lverified
  93. pushf
  94. popf
  95. .Lverified:
  96. mov $ctx,(%rax)
  97. ret
  98. .size _padlock_verify_ctx,.-_padlock_verify_ctx
  99. .globl padlock_reload_key
  100. .type padlock_reload_key,\@abi-omnipotent
  101. .align 16
  102. padlock_reload_key:
  103. pushf
  104. popf
  105. ret
  106. .size padlock_reload_key,.-padlock_reload_key
  107. .globl padlock_aes_block
  108. .type padlock_aes_block,\@function,3
  109. .align 16
  110. padlock_aes_block:
  111. mov %rbx,%r8
  112. mov \$1,$len
  113. lea 32($ctx),%rbx # key
  114. lea 16($ctx),$ctx # control word
  115. .byte 0xf3,0x0f,0xa7,0xc8 # rep xcryptecb
  116. mov %r8,%rbx
  117. ret
  118. .size padlock_aes_block,.-padlock_aes_block
  119. .globl padlock_xstore
  120. .type padlock_xstore,\@function,2
  121. .align 16
  122. padlock_xstore:
  123. mov %esi,%edx
  124. .byte 0x0f,0xa7,0xc0 # xstore
  125. ret
  126. .size padlock_xstore,.-padlock_xstore
  127. .globl padlock_sha1_oneshot
  128. .type padlock_sha1_oneshot,\@function,3
  129. .align 16
  130. padlock_sha1_oneshot:
  131. xor %rax,%rax
  132. mov %rdx,%rcx
  133. .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
  134. ret
  135. .size padlock_sha1_oneshot,.-padlock_sha1_oneshot
  136. .globl padlock_sha1_blocks
  137. .type padlock_sha1_blocks,\@function,3
  138. .align 16
  139. padlock_sha1_blocks:
  140. mov \$-1,%rax
  141. mov %rdx,%rcx
  142. .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
  143. ret
  144. .size padlock_sha1_blocks,.-padlock_sha1_blocks
  145. .globl padlock_sha256_oneshot
  146. .type padlock_sha256_oneshot,\@function,3
  147. .align 16
  148. padlock_sha256_oneshot:
  149. xor %rax,%rax
  150. mov %rdx,%rcx
  151. .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
  152. ret
  153. .size padlock_sha256_oneshot,.-padlock_sha256_oneshot
  154. .globl padlock_sha256_blocks
  155. .type padlock_sha256_blocks,\@function,3
  156. .align 16
  157. padlock_sha256_blocks:
  158. mov \$-1,%rax
  159. mov %rdx,%rcx
  160. .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
  161. ret
  162. .size padlock_sha256_blocks,.-padlock_sha256_blocks
  163. .globl padlock_sha512_blocks
  164. .type padlock_sha512_blocks,\@function,3
  165. .align 16
  166. padlock_sha512_blocks:
  167. mov %rdx,%rcx
  168. .byte 0xf3,0x0f,0xa6,0xe0 # rep xha512
  169. ret
  170. .size padlock_sha512_blocks,.-padlock_sha512_blocks
  171. ___
  172. sub generate_mode {
  173. my ($mode,$opcode) = @_;
  174. # int padlock_$mode_encrypt(void *out, const void *inp,
  175. # struct padlock_cipher_data *ctx, size_t len);
  176. $code.=<<___;
  177. .globl padlock_${mode}_encrypt
  178. .type padlock_${mode}_encrypt,\@function,4
  179. .align 16
  180. padlock_${mode}_encrypt:
  181. push %rbp
  182. push %rbx
  183. xor %eax,%eax
  184. test \$15,$ctx
  185. jnz .L${mode}_abort
  186. test \$15,$len
  187. jnz .L${mode}_abort
  188. lea .Lpadlock_saved_context(%rip),%rax
  189. pushf
  190. cld
  191. call _padlock_verify_ctx
  192. lea 16($ctx),$ctx # control word
  193. xor %eax,%eax
  194. xor %ebx,%ebx
  195. testl \$`1<<5`,($ctx) # align bit in control word
  196. jnz .L${mode}_aligned
  197. test \$0x0f,$out
  198. setz %al # !out_misaligned
  199. test \$0x0f,$inp
  200. setz %bl # !inp_misaligned
  201. test %ebx,%eax
  202. jnz .L${mode}_aligned
  203. neg %rax
  204. mov \$$PADLOCK_CHUNK,$chunk
  205. not %rax # out_misaligned?-1:0
  206. lea (%rsp),%rbp
  207. cmp $chunk,$len
  208. cmovc $len,$chunk # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
  209. and $chunk,%rax # out_misaligned?chunk:0
  210. mov $len,$chunk
  211. neg %rax
  212. and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK
  213. lea (%rax,%rbp),%rsp
  214. ___
  215. $code.=<<___ if ($mode eq "ctr32");
  216. mov -4($ctx),%eax # pull 32-bit counter
  217. bswap %eax
  218. neg %eax
  219. and \$`$PADLOCK_CHUNK/16-1`,%eax
  220. jz .L${mode}_loop
  221. shl \$4,%eax
  222. cmp %rax,$len
  223. cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK
  224. ___
  225. $code.=<<___;
  226. jmp .L${mode}_loop
  227. .align 16
  228. .L${mode}_loop:
  229. cmp $len,$chunk # ctr32 artefact
  230. cmova $len,$chunk # ctr32 artefact
  231. mov $out,%r8 # save parameters
  232. mov $inp,%r9
  233. mov $len,%r10
  234. mov $chunk,$len
  235. mov $chunk,%r11
  236. test \$0x0f,$out # out_misaligned
  237. cmovnz %rsp,$out
  238. test \$0x0f,$inp # inp_misaligned
  239. jz .L${mode}_inp_aligned
  240. shr \$3,$len
  241. .byte 0xf3,0x48,0xa5 # rep movsq
  242. sub $chunk,$out
  243. mov $chunk,$len
  244. mov $out,$inp
  245. .L${mode}_inp_aligned:
  246. lea -16($ctx),%rax # ivp
  247. lea 16($ctx),%rbx # key
  248. shr \$4,$len
  249. .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
  250. ___
  251. $code.=<<___ if ($mode !~ /ecb|ctr/);
  252. movdqa (%rax),%xmm0
  253. movdqa %xmm0,-16($ctx) # copy [or refresh] iv
  254. ___
  255. $code.=<<___ if ($mode eq "ctr32");
  256. mov -4($ctx),%eax # pull 32-bit counter
  257. test \$0xffff0000,%eax
  258. jnz .L${mode}_no_corr
  259. bswap %eax
  260. add \$0x10000,%eax
  261. bswap %eax
  262. mov %eax,-4($ctx)
  263. .L${mode}_no_corr:
  264. ___
  265. $code.=<<___;
  266. mov %r8,$out # restore paramters
  267. mov %r11,$chunk
  268. test \$0x0f,$out
  269. jz .L${mode}_out_aligned
  270. mov $chunk,$len
  271. shr \$3,$len
  272. lea (%rsp),$inp
  273. .byte 0xf3,0x48,0xa5 # rep movsq
  274. sub $chunk,$out
  275. .L${mode}_out_aligned:
  276. mov %r9,$inp
  277. mov %r10,$len
  278. add $chunk,$out
  279. add $chunk,$inp
  280. sub $chunk,$len
  281. mov \$$PADLOCK_CHUNK,$chunk
  282. jnz .L${mode}_loop
  283. test \$0x0f,$out
  284. jz .L${mode}_done
  285. mov %rbp,$len
  286. mov %rsp,$out
  287. sub %rsp,$len
  288. xor %rax,%rax
  289. shr \$3,$len
  290. .byte 0xf3,0x48,0xab # rep stosq
  291. .L${mode}_done:
  292. lea (%rbp),%rsp
  293. jmp .L${mode}_exit
  294. .align 16
  295. .L${mode}_aligned:
  296. ___
  297. $code.=<<___ if ($mode eq "ctr32");
  298. mov -4($ctx),%eax # pull 32-bit counter
  299. mov \$`16*0x10000`,$chunk
  300. bswap %eax
  301. cmp $len,$chunk
  302. cmova $len,$chunk
  303. neg %eax
  304. and \$0xffff,%eax
  305. jz .L${mode}_aligned_loop
  306. shl \$4,%eax
  307. cmp %rax,$len
  308. cmova %rax,$chunk # don't let counter cross 2^16
  309. jmp .L${mode}_aligned_loop
  310. .align 16
  311. .L${mode}_aligned_loop:
  312. cmp $len,$chunk
  313. cmova $len,$chunk
  314. mov $len,%r10 # save parameters
  315. mov $chunk,$len
  316. mov $chunk,%r11
  317. ___
  318. $code.=<<___;
  319. lea -16($ctx),%rax # ivp
  320. lea 16($ctx),%rbx # key
  321. shr \$4,$len # len/=AES_BLOCK_SIZE
  322. .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
  323. ___
  324. $code.=<<___ if ($mode !~ /ecb|ctr/);
  325. movdqa (%rax),%xmm0
  326. movdqa %xmm0,-16($ctx) # copy [or refresh] iv
  327. ___
  328. $code.=<<___ if ($mode eq "ctr32");
  329. mov -4($ctx),%eax # pull 32-bit counter
  330. bswap %eax
  331. add \$0x10000,%eax
  332. bswap %eax
  333. mov %eax,-4($ctx)
  334. mov %r11,$chunk # restore paramters
  335. mov %r10,$len
  336. sub $chunk,$len
  337. mov \$`16*0x10000`,$chunk
  338. jnz .L${mode}_aligned_loop
  339. ___
  340. $code.=<<___;
  341. .L${mode}_exit:
  342. mov \$1,%eax
  343. lea 8(%rsp),%rsp
  344. .L${mode}_abort:
  345. pop %rbx
  346. pop %rbp
  347. ret
  348. .size padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
  349. ___
  350. }
  351. &generate_mode("ecb",0xc8);
  352. &generate_mode("cbc",0xd0);
  353. &generate_mode("cfb",0xe0);
  354. &generate_mode("ofb",0xe8);
  355. &generate_mode("ctr32",0xd8); # all 64-bit CPUs have working CTR...
  356. $code.=<<___;
  357. .asciz "VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
  358. .align 16
  359. .data
  360. .align 8
  361. .Lpadlock_saved_context:
  362. .quad 0
  363. ___
  364. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  365. print $code;
  366. close STDOUT;