#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
######################################################################

# September 2011.
#
# Interface to OpenSSL as "almost" drop-in replacement for
# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original nor does it make assumptions
# about its alignment...
#
# Performance summary. The aes-x86_64.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with a 128-bit key, and the vpaes-x86_64.pl column
# lists [also large-block CBC] encrypt/decrypt results.
#
#		aes-x86_64.pl		vpaes-x86_64.pl
#
# Core 2(**)	29.6/41.1/14.3		21.9/25.2(***)
# Nehalem	29.6/40.3/14.6		10.0/11.8
# Atom		57.3/74.2/32.1		60.9/77.2(***)
# Silvermont	52.7/64.0/19.5		48.8/60.8(***)
# Goldmont	38.9/49.0/17.8		10.6/12.6
#
# (*)	"Hyper-threading" in this context refers to cache shared among
#	multiple cores rather than to Intel HTT specifically. As the
#	vast majority of contemporary cores share cache, the slower
#	code path is commonplace. In other words, "with-hyper-threading-
#	off" results are presented mostly for reference purposes.
#
# (**)	"Core 2" refers to the initial 65nm design, a.k.a. Conroe.
#
# (***)	The less impressive improvement on Core 2 and Atom is due to
#	slow pshufb. Still, it's a respectable +36%/+62% improvement on
#	Core 2 (as implied, over the "hyper-threading-safe" code path).
#
# <appro@openssl.org>
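#
# For orientation, a sketch of the C-level prototypes the generated
# entry points are meant to be used with (inferred from the @function
# argument counts below and the AES_* API they mirror; an assumption,
# not part of the original file):
#
#	int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
#	                           AES_KEY *key);
#	int  vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
#	                           AES_KEY *key);
#	void vpaes_encrypt(const unsigned char *in, unsigned char *out,
#	                   const AES_KEY *key);
#	void vpaes_decrypt(const unsigned char *in, unsigned char *out,
#	                   const AES_KEY *key);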

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$PREFIX="vpaes";

$code.=<<___;
.text

##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm9-%xmm15 as in _vpaes_preheat
##    (%rdx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
##  Preserves %xmm6 - %xmm8 so you get some local vectors
##
##
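##  A sketch (assumed-equivalent C, for orientation only) of the nibble
##  split performed at the top of every round, per byte:
##
##	hi = x >> 4;	/* "i": pandn/psrld against %xmm9 = 0x0F..0F */
##	lo = x & 0x0F;	/* "k": pand against %xmm9 */
##
##  The S-box is then evaluated as pshufb lookups into the 16-byte
##  tables loaded by _vpaes_preheat, as annotated in .Lenc_entry below.
##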
.type	_vpaes_encrypt_core,\@abi-omnipotent
.align	16
_vpaes_encrypt_core:
.cfi_startproc
	mov	%rdx,	%r9
	mov	\$16,	%r11
	mov	240(%rdx),%eax
	movdqa	%xmm9,	%xmm1
	movdqa	.Lk_ipt(%rip), %xmm2	# iptlo
	pandn	%xmm0,	%xmm1
	movdqu	(%r9),	%xmm5		# round0 key
	psrld	\$4,	%xmm1
	pand	%xmm9,	%xmm0
	pshufb	%xmm0,	%xmm2
	movdqa	.Lk_ipt+16(%rip), %xmm0	# ipthi
	pshufb	%xmm1,	%xmm0
	pxor	%xmm5,	%xmm2
	add	\$16,	%r9
	pxor	%xmm2,	%xmm0
	lea	.Lk_mc_backward(%rip),%r10
	jmp	.Lenc_entry

.align 16
.Lenc_loop:
	# middle of middle round
	movdqa	%xmm13,	%xmm4	# 4 : sb1u
	movdqa	%xmm12,	%xmm0	# 0 : sb1t
	pshufb	%xmm2,	%xmm4	# 4 = sb1u
	pshufb	%xmm3,	%xmm0	# 0 = sb1t
	pxor	%xmm5,	%xmm4	# 4 = sb1u + k
	movdqa	%xmm15,	%xmm5	# 4 : sb2u
	pxor	%xmm4,	%xmm0	# 0 = A
	movdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	pshufb	%xmm2,	%xmm5	# 4 = sb2u
	movdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	movdqa	%xmm14,	%xmm2	# 2 : sb2t
	pshufb	%xmm3,	%xmm2	# 2 = sb2t
	movdqa	%xmm0,	%xmm3	# 3 = A
	pxor	%xmm5,	%xmm2	# 2 = 2A
	pshufb	%xmm1,	%xmm0	# 0 = B
	add	\$16,	%r9	# next key
	pxor	%xmm2,	%xmm0	# 0 = 2A+B
	pshufb	%xmm4,	%xmm3	# 3 = D
	add	\$16,	%r11	# next mc
	pxor	%xmm0,	%xmm3	# 3 = 2A+B+D
	pshufb	%xmm1,	%xmm0	# 0 = 2B+C
	and	\$0x30,	%r11	# ... mod 4
	sub	\$1,%rax	# nr--
	pxor	%xmm3,	%xmm0	# 0 = 2A+3B+C+D

.Lenc_entry:
	# top of round
	movdqa	%xmm9,	%xmm1	# 1 : i
	movdqa	%xmm11, %xmm5	# 2 : a/k
	pandn	%xmm0,	%xmm1	# 1 = i<<4
	psrld	\$4,	%xmm1	# 1 = i
	pand	%xmm9,	%xmm0	# 0 = k
	pshufb	%xmm0,	%xmm5	# 2 = a/k
	movdqa	%xmm10,	%xmm3	# 3 : 1/i
	pxor	%xmm1,	%xmm0	# 0 = j
	pshufb	%xmm1,	%xmm3	# 3 = 1/i
	movdqa	%xmm10,	%xmm4	# 4 : 1/j
	pxor	%xmm5,	%xmm3	# 3 = iak = 1/i + a/k
	pshufb	%xmm0,	%xmm4	# 4 = 1/j
	movdqa	%xmm10,	%xmm2	# 2 : 1/iak
	pxor	%xmm5,	%xmm4	# 4 = jak = 1/j + a/k
	pshufb	%xmm3,	%xmm2	# 2 = 1/iak
	movdqa	%xmm10,	%xmm3	# 3 : 1/jak
	pxor	%xmm0,	%xmm2	# 2 = io
	pshufb	%xmm4,	%xmm3	# 3 = 1/jak
	movdqu	(%r9),	%xmm5
	pxor	%xmm1,	%xmm3	# 3 = jo
	jnz	.Lenc_loop

	# middle of last round
	movdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
	movdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	pshufb	%xmm2,	%xmm4	# 4 = sbou
	pxor	%xmm5,	%xmm4	# 4 = sb1u + k
	pshufb	%xmm3,	%xmm0	# 0 = sb1t
	movdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	pxor	%xmm4,	%xmm0	# 0 = A
	pshufb	%xmm1,	%xmm0
	ret
.cfi_endproc
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core

##
##  Decryption core
##
##  Same API as encryption core.
##
.type	_vpaes_decrypt_core,\@abi-omnipotent
.align	16
_vpaes_decrypt_core:
.cfi_startproc
	mov	%rdx,	%r9		# load key
	mov	240(%rdx),%eax
	movdqa	%xmm9,	%xmm1
	movdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	pandn	%xmm0,	%xmm1
	mov	%rax,	%r11
	psrld	\$4,	%xmm1
	movdqu	(%r9),	%xmm5		# round0 key
	shl	\$4,	%r11
	pand	%xmm9,	%xmm0
	pshufb	%xmm0,	%xmm2
	movdqa	.Lk_dipt+16(%rip), %xmm0 # ipthi
	xor	\$0x30,	%r11
	lea	.Lk_dsbd(%rip),%r10
	pshufb	%xmm1,	%xmm0
	and	\$0x30,	%r11
	pxor	%xmm5,	%xmm2
	movdqa	.Lk_mc_forward+48(%rip), %xmm5
	pxor	%xmm2,	%xmm0
	add	\$16,	%r9
	add	%r10,	%r11
	jmp	.Ldec_entry

.align 16
.Ldec_loop:
##
##  Inverse mix columns
##
	movdqa	-0x20(%r10),%xmm4	# 4 : sb9u
	movdqa	-0x10(%r10),%xmm1	# 0 : sb9t
	pshufb	%xmm2,	%xmm4		# 4 = sb9u
	pshufb	%xmm3,	%xmm1		# 0 = sb9t
	pxor	%xmm4,	%xmm0
	movdqa	0x00(%r10),%xmm4	# 4 : sbdu
	pxor	%xmm1,	%xmm0		# 0 = ch
	movdqa	0x10(%r10),%xmm1	# 0 : sbdt

	pshufb	%xmm2,	%xmm4		# 4 = sbdu
	pshufb	%xmm5,	%xmm0		# MC ch
	pshufb	%xmm3,	%xmm1		# 0 = sbdt
	pxor	%xmm4,	%xmm0		# 4 = ch
	movdqa	0x20(%r10),%xmm4	# 4 : sbbu
	pxor	%xmm1,	%xmm0		# 0 = ch
	movdqa	0x30(%r10),%xmm1	# 0 : sbbt

	pshufb	%xmm2,	%xmm4		# 4 = sbbu
	pshufb	%xmm5,	%xmm0		# MC ch
	pshufb	%xmm3,	%xmm1		# 0 = sbbt
	pxor	%xmm4,	%xmm0		# 4 = ch
	movdqa	0x40(%r10),%xmm4	# 4 : sbeu
	pxor	%xmm1,	%xmm0		# 0 = ch
	movdqa	0x50(%r10),%xmm1	# 0 : sbet

	pshufb	%xmm2,	%xmm4		# 4 = sbeu
	pshufb	%xmm5,	%xmm0		# MC ch
	pshufb	%xmm3,	%xmm1		# 0 = sbet
	pxor	%xmm4,	%xmm0		# 4 = ch
	add	\$16, %r9		# next round key
	palignr	\$12,	%xmm5,	%xmm5
	pxor	%xmm1,	%xmm0		# 0 = ch
	sub	\$1,%rax		# nr--

.Ldec_entry:
	# top of round
	movdqa	%xmm9,	%xmm1	# 1 : i
	pandn	%xmm0,	%xmm1	# 1 = i<<4
	movdqa	%xmm11, %xmm2	# 2 : a/k
	psrld	\$4,	%xmm1	# 1 = i
	pand	%xmm9,	%xmm0	# 0 = k
	pshufb	%xmm0,	%xmm2	# 2 = a/k
	movdqa	%xmm10,	%xmm3	# 3 : 1/i
	pxor	%xmm1,	%xmm0	# 0 = j
	pshufb	%xmm1,	%xmm3	# 3 = 1/i
	movdqa	%xmm10,	%xmm4	# 4 : 1/j
	pxor	%xmm2,	%xmm3	# 3 = iak = 1/i + a/k
	pshufb	%xmm0,	%xmm4	# 4 = 1/j
	pxor	%xmm2,	%xmm4	# 4 = jak = 1/j + a/k
	movdqa	%xmm10,	%xmm2	# 2 : 1/iak
	pshufb	%xmm3,	%xmm2	# 2 = 1/iak
	movdqa	%xmm10,	%xmm3	# 3 : 1/jak
	pxor	%xmm0,	%xmm2	# 2 = io
	pshufb	%xmm4,	%xmm3	# 3 = 1/jak
	movdqu	(%r9),	%xmm0
	pxor	%xmm1,	%xmm3	# 3 = jo
	jnz	.Ldec_loop

	# middle of last round
	movdqa	0x60(%r10), %xmm4	# 3 : sbou
	pshufb	%xmm2,	%xmm4	# 4 = sbou
	pxor	%xmm0,	%xmm4	# 4 = sb1u + k
	movdqa	0x70(%r10), %xmm0	# 0 : sbot
	movdqa	-0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	pshufb	%xmm3,	%xmm0	# 0 = sb1t
	pxor	%xmm4,	%xmm0	# 0 = A
	pshufb	%xmm2,	%xmm0
	ret
.cfi_endproc
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
.type	_vpaes_schedule_core,\@abi-omnipotent
.align	16
_vpaes_schedule_core:
.cfi_startproc
	# rdi = key
	# rsi = size in bits
	# rdx = buffer
	# rcx = direction.  0=encrypt, 1=decrypt

	call	_vpaes_preheat		# load the tables
	movdqa	.Lk_rcon(%rip), %xmm8	# load rcon
	movdqu	(%rdi),	%xmm0		# load key (unaligned)

	# input transform
	movdqa	%xmm0,	%xmm3
	lea	.Lk_ipt(%rip), %r11
	call	_vpaes_schedule_transform
	movdqa	%xmm0,	%xmm7

	lea	.Lk_sr(%rip),%r10
	test	%rcx,	%rcx
	jnz	.Lschedule_am_decrypting

	# encrypting, output zeroth round key after transform
	movdqu	%xmm0,	(%rdx)
	jmp	.Lschedule_go

.Lschedule_am_decrypting:
	# decrypting, output zeroth round key after shiftrows
	movdqa	(%r8,%r10),%xmm1
	pshufb	%xmm1,	%xmm3
	movdqu	%xmm3,	(%rdx)
	xor	\$0x30,	%r8

.Lschedule_go:
	cmp	\$192,	%esi
	ja	.Lschedule_256
	je	.Lschedule_192
	# 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
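##  (Ten passes through .Loop_schedule_128: the first nine store a
##  round key each via _vpaes_schedule_mangle and the tenth exits
##  through .Lschedule_mangle_last; with the round-0 key stored above,
##  that yields the 11 round keys of AES-128.)
##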
.Lschedule_128:
	mov	\$10, %esi

.Loop_schedule_128:
	call	_vpaes_schedule_round
	dec	%rsi
	jz	.Lschedule_mangle_last
	call	_vpaes_schedule_mangle	# write output
	jmp	.Loop_schedule_128

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
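##  (Four passes through .Loop_schedule_192: the first three store
##  three round keys each, the fourth stores two before exiting
##  through .Lschedule_mangle_last for the final one; with the round-0
##  key stored earlier, that yields the 13 round keys of AES-192.)
##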
.align	16
.Lschedule_192:
	movdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
	call	_vpaes_schedule_transform	# input transform
	movdqa	%xmm0,	%xmm6		# save short part
	pxor	%xmm4,	%xmm4		# clear 4
	movhlps	%xmm4,	%xmm6		# clobber low side with zeros
	mov	\$4,	%esi

.Loop_schedule_192:
	call	_vpaes_schedule_round
	palignr	\$8,%xmm6,%xmm0
	call	_vpaes_schedule_mangle	# save key n
	call	_vpaes_schedule_192_smear
	call	_vpaes_schedule_mangle	# save key n+1
	call	_vpaes_schedule_round
	dec	%rsi
	jz	.Lschedule_mangle_last
	call	_vpaes_schedule_mangle	# save key n+2
	call	_vpaes_schedule_192_smear
	jmp	.Loop_schedule_192

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
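##  (Seven passes through .Loop_schedule_256: the first six store two
##  round keys each and the seventh stores one before exiting through
##  .Lschedule_mangle_last; with the round-0 key stored earlier, that
##  yields the 15 round keys of AES-256.)
##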
.align	16
.Lschedule_256:
	movdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
	call	_vpaes_schedule_transform	# input transform
	mov	\$7, %esi

.Loop_schedule_256:
	call	_vpaes_schedule_mangle	# output low result
	movdqa	%xmm0,	%xmm6		# save cur_lo in xmm6

	# high round
	call	_vpaes_schedule_round
	dec	%rsi
	jz	.Lschedule_mangle_last
	call	_vpaes_schedule_mangle

	# low round. swap xmm7 and xmm6
	pshufd	\$0xFF,	%xmm0,	%xmm0
	movdqa	%xmm7,	%xmm5
	movdqa	%xmm6,	%xmm7
	call	_vpaes_schedule_low_round
	movdqa	%xmm5,	%xmm7

	jmp	.Loop_schedule_256

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
.align	16
.Lschedule_mangle_last:
	# schedule last round key from xmm0
	lea	.Lk_deskew(%rip),%r11	# prepare to deskew
	test	%rcx,	%rcx
	jnz	.Lschedule_mangle_last_dec

	# encrypting
	movdqa	(%r8,%r10),%xmm1
	pshufb	%xmm1,	%xmm0		# output permute
	lea	.Lk_opt(%rip),	%r11	# prepare to output transform
	add	\$32,	%rdx

.Lschedule_mangle_last_dec:
	add	\$-16,	%rdx
	pxor	.Lk_s63(%rip),	%xmm0
	call	_vpaes_schedule_transform # output transform
	movdqu	%xmm0,	(%rdx)		# save last key

	# cleanup
	pxor	%xmm0,	%xmm0
	pxor	%xmm1,	%xmm1
	pxor	%xmm2,	%xmm2
	pxor	%xmm3,	%xmm3
	pxor	%xmm4,	%xmm4
	pxor	%xmm5,	%xmm5
	pxor	%xmm6,	%xmm6
	pxor	%xmm7,	%xmm7
	ret
.cfi_endproc
.size	_vpaes_schedule_core,.-_vpaes_schedule_core

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6:  low side, d  c  0  0
##    %xmm13: 0
##
##  Outputs:
##    %xmm6: b+c+d  b+c 0 0
##    %xmm0: b+c+d  b+c b a
##
.type	_vpaes_schedule_192_smear,\@abi-omnipotent
.align	16
_vpaes_schedule_192_smear:
.cfi_startproc
	pshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
	pshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
	pxor	%xmm1,	%xmm6		# -> c+d c 0 0
	pxor	%xmm1,	%xmm1
	pxor	%xmm0,	%xmm6		# -> b+c+d b+c b a
	movdqa	%xmm6,	%xmm0
	movhlps	%xmm1,	%xmm6		# clobber low side with zeros
	ret
.cfi_endproc
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm4, %r11.
##
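##  (For reference, this corresponds to the textbook expansion step; in
##  32-bit words,
##
##	W[i] = W[i-4] ^ SubWord(RotWord(W[i-1])) ^ Rcon
##
##  with the smear supplying the telescoped W[i-4]^...^W[i-4+j] terms
##  and the broadcast SubWord/RotWord result xored into all four
##  dwords. A sketch of the correspondence, not an exact mapping.)
##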
.type	_vpaes_schedule_round,\@abi-omnipotent
.align	16
_vpaes_schedule_round:
.cfi_startproc
	# extract rcon from xmm8
	pxor	%xmm1,	%xmm1
	palignr	\$15,	%xmm8,	%xmm1
	palignr	\$15,	%xmm8,	%xmm8
	pxor	%xmm1,	%xmm7

	# rotate
	pshufd	\$0xFF,	%xmm0,	%xmm0
	palignr	\$1,	%xmm0,	%xmm0

	# fall through...

	# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	# smear xmm7
	movdqa	%xmm7,	%xmm1
	pslldq	\$4,	%xmm7
	pxor	%xmm1,	%xmm7
	movdqa	%xmm7,	%xmm1
	pslldq	\$8,	%xmm7
	pxor	%xmm1,	%xmm7
	pxor	.Lk_s63(%rip), %xmm7

	# subbytes
	movdqa	%xmm9,	%xmm1
	pandn	%xmm0,	%xmm1
	psrld	\$4,	%xmm1		# 1 = i
	pand	%xmm9,	%xmm0		# 0 = k
	movdqa	%xmm11, %xmm2		# 2 : a/k
	pshufb	%xmm0,	%xmm2		# 2 = a/k
	pxor	%xmm1,	%xmm0		# 0 = j
	movdqa	%xmm10,	%xmm3		# 3 : 1/i
	pshufb	%xmm1,	%xmm3		# 3 = 1/i
	pxor	%xmm2,	%xmm3		# 3 = iak = 1/i + a/k
	movdqa	%xmm10,	%xmm4		# 4 : 1/j
	pshufb	%xmm0,	%xmm4		# 4 = 1/j
	pxor	%xmm2,	%xmm4		# 4 = jak = 1/j + a/k
	movdqa	%xmm10,	%xmm2		# 2 : 1/iak
	pshufb	%xmm3,	%xmm2		# 2 = 1/iak
	pxor	%xmm0,	%xmm2		# 2 = io
	movdqa	%xmm10, %xmm3		# 3 : 1/jak
	pshufb	%xmm4,	%xmm3		# 3 = 1/jak
	pxor	%xmm1,	%xmm3		# 3 = jo
	movdqa	%xmm13, %xmm4		# 4 : sbou
	pshufb	%xmm2,	%xmm4		# 4 = sbou
	movdqa	%xmm12, %xmm0		# 0 : sbot
	pshufb	%xmm3,	%xmm0		# 0 = sb1t
	pxor	%xmm4,	%xmm0		# 0 = sbox output

	# add in smeared stuff
	pxor	%xmm7,	%xmm0
	movdqa	%xmm0,	%xmm7
	ret
.cfi_endproc
.size	_vpaes_schedule_round,.-_vpaes_schedule_round

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%r11)
##
##  Requires that %xmm9 = 0x0F0F... as in preheat
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
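##  (Per byte this computes, in C terms,
##
##	out = lo_tbl[x & 0x0F] ^ hi_tbl[x >> 4]
##
##  with lo_tbl at (%r11) and hi_tbl at 16(%r11).)
##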
.type	_vpaes_schedule_transform,\@abi-omnipotent
.align	16
_vpaes_schedule_transform:
.cfi_startproc
	movdqa	%xmm9,	%xmm1
	pandn	%xmm0,	%xmm1
	psrld	\$4,	%xmm1
	pand	%xmm9,	%xmm0
	movdqa	(%r11), %xmm2	# lo
	pshufb	%xmm0,	%xmm2
	movdqa	16(%r11), %xmm0	# hi
	pshufb	%xmm1,	%xmm0
	pxor	%xmm2,	%xmm0
	ret
.cfi_endproc
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%rdx), and increments or decrements it
##  Keeps track of round number mod 4 in %r8
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##
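##  (On the encrypt path the three pshufb/pxor pairs below in effect
##  compute the 0,1,1,1 circulant product as
##
##	y = rot(x) ^ rot^2(x) ^ rot^3(x)
##
##  where rot() is the byte rotation encoded in .Lk_mc_forward.)
##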
.type	_vpaes_schedule_mangle,\@abi-omnipotent
.align	16
_vpaes_schedule_mangle:
.cfi_startproc
	movdqa	%xmm0,	%xmm4	# save xmm0 for later
	movdqa	.Lk_mc_forward(%rip),%xmm5
	test	%rcx,	%rcx
	jnz	.Lschedule_mangle_dec

	# encrypting
	add	\$16,	%rdx
	pxor	.Lk_s63(%rip),%xmm4
	pshufb	%xmm5,	%xmm4
	movdqa	%xmm4,	%xmm3
	pshufb	%xmm5,	%xmm4
	pxor	%xmm4,	%xmm3
	pshufb	%xmm5,	%xmm4
	pxor	%xmm4,	%xmm3

	jmp	.Lschedule_mangle_both
.align	16
.Lschedule_mangle_dec:
	# inverse mix columns
	lea	.Lk_dksd(%rip),%r11
	movdqa	%xmm9,	%xmm1
	pandn	%xmm4,	%xmm1
	psrld	\$4,	%xmm1	# 1 = hi
	pand	%xmm9,	%xmm4	# 4 = lo

	movdqa	0x00(%r11), %xmm2
	pshufb	%xmm4,	%xmm2
	movdqa	0x10(%r11), %xmm3
	pshufb	%xmm1,	%xmm3
	pxor	%xmm2,	%xmm3
	pshufb	%xmm5,	%xmm3

	movdqa	0x20(%r11), %xmm2
	pshufb	%xmm4,	%xmm2
	pxor	%xmm3,	%xmm2
	movdqa	0x30(%r11), %xmm3
	pshufb	%xmm1,	%xmm3
	pxor	%xmm2,	%xmm3
	pshufb	%xmm5,	%xmm3

	movdqa	0x40(%r11), %xmm2
	pshufb	%xmm4,	%xmm2
	pxor	%xmm3,	%xmm2
	movdqa	0x50(%r11), %xmm3
	pshufb	%xmm1,	%xmm3
	pxor	%xmm2,	%xmm3
	pshufb	%xmm5,	%xmm3

	movdqa	0x60(%r11), %xmm2
	pshufb	%xmm4,	%xmm2
	pxor	%xmm3,	%xmm2
	movdqa	0x70(%r11), %xmm3
	pshufb	%xmm1,	%xmm3
	pxor	%xmm2,	%xmm3

	add	\$-16,	%rdx

.Lschedule_mangle_both:
	movdqa	(%r8,%r10),%xmm1
	pshufb	%xmm1,%xmm3
	add	\$-16,	%r8
	and	\$0x30,	%r8
	movdqu	%xmm3,	(%rdx)
	ret
.cfi_endproc
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle

#
# Interface to OpenSSL
#
.globl	${PREFIX}_set_encrypt_key
.type	${PREFIX}_set_encrypt_key,\@function,3
.align	16
${PREFIX}_set_encrypt_key:
.cfi_startproc
___
$code.=<<___ if ($win64);
	lea	-0xb8(%rsp),%rsp
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Lenc_key_body:
___
$code.=<<___;
	mov	%esi,%eax
	shr	\$5,%eax
	add	\$5,%eax
	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
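				# (i.e. 9, 11 or 13 for 128-, 192- or
				# 256-bit keys respectively)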
	mov	\$0,%ecx
	mov	\$0x30,%r8d
	call	_vpaes_schedule_core
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
	lea	0xb8(%rsp),%rsp
.Lenc_key_epilogue:
___
$code.=<<___;
	xor	%eax,%eax
	ret
.cfi_endproc
.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key

.globl	${PREFIX}_set_decrypt_key
.type	${PREFIX}_set_decrypt_key,\@function,3
.align	16
${PREFIX}_set_decrypt_key:
.cfi_startproc
___
$code.=<<___ if ($win64);
	lea	-0xb8(%rsp),%rsp
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Ldec_key_body:
___
$code.=<<___;
	mov	%esi,%eax
	shr	\$5,%eax
	add	\$5,%eax
	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	shl	\$4,%eax
	lea	16(%rdx,%rax),%rdx
	mov	\$1,%ecx
	mov	%esi,%r8d
	shr	\$1,%r8d
	and	\$32,%r8d
	xor	\$32,%r8d	# nbits==192?0:32
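				# ((nbits>>1)&32 is 32 only when
				# nbits==192; the xor flips it, so %r8d
				# ends up 0 for 192-bit keys and 32 for
				# 128/256-bit keys)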
	call	_vpaes_schedule_core
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
	lea	0xb8(%rsp),%rsp
.Ldec_key_epilogue:
___
$code.=<<___;
	xor	%eax,%eax
	ret
.cfi_endproc
.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key

.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@function,3
.align	16
${PREFIX}_encrypt:
.cfi_startproc
___
$code.=<<___ if ($win64);
	lea	-0xb8(%rsp),%rsp
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Lenc_body:
___
$code.=<<___;
	movdqu	(%rdi),%xmm0
	call	_vpaes_preheat
	call	_vpaes_encrypt_core
	movdqu	%xmm0,(%rsi)
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
	lea	0xb8(%rsp),%rsp
.Lenc_epilogue:
___
$code.=<<___;
	ret
.cfi_endproc
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@function,3
.align	16
${PREFIX}_decrypt:
.cfi_startproc
___
$code.=<<___ if ($win64);
	lea	-0xb8(%rsp),%rsp
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Ldec_body:
___
$code.=<<___;
	movdqu	(%rdi),%xmm0
	call	_vpaes_preheat
	call	_vpaes_decrypt_core
	movdqu	%xmm0,(%rsi)
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
	lea	0xb8(%rsp),%rsp
.Ldec_epilogue:
___
$code.=<<___;
	ret
.cfi_endproc
.size	${PREFIX}_decrypt,.-${PREFIX}_decrypt
___
{
my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#                       size_t length, const AES_KEY *key,
#                       unsigned char *ivp,const int enc);
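#
# A minimal usage sketch (hypothetical caller, not from the original;
# length must be a multiple of 16, since partial vectors are not
# handled, as noted at the top of this file):
#
#	unsigned char iv[16];
#	memcpy(iv, iv0, sizeof(iv));		/* iv is updated in place */
#	vpaes_cbc_encrypt(in, out, len, &ks, iv, 1);	/* 1 = encrypt */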
$code.=<<___;
.globl	${PREFIX}_cbc_encrypt
.type	${PREFIX}_cbc_encrypt,\@function,6
.align	16
${PREFIX}_cbc_encrypt:
.cfi_startproc
	xchg	$key,$len
___
($len,$key)=($key,$len);
$code.=<<___;
	sub	\$16,$len
	jc	.Lcbc_abort
___
$code.=<<___ if ($win64);
	lea	-0xb8(%rsp),%rsp
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)
.Lcbc_body:
___
$code.=<<___;
	movdqu	($ivp),%xmm6		# load IV
	sub	$inp,$out
	call	_vpaes_preheat
	cmp	\$0,${enc}d
	je	.Lcbc_dec_loop
	jmp	.Lcbc_enc_loop
.align	16
.Lcbc_enc_loop:
	movdqu	($inp),%xmm0
	pxor	%xmm6,%xmm0
	call	_vpaes_encrypt_core
	movdqa	%xmm0,%xmm6
	movdqu	%xmm0,($out,$inp)
	lea	16($inp),$inp
	sub	\$16,$len
	jnc	.Lcbc_enc_loop
	jmp	.Lcbc_done
.align	16
.Lcbc_dec_loop:
	movdqu	($inp),%xmm0
	movdqa	%xmm0,%xmm7
	call	_vpaes_decrypt_core
	pxor	%xmm6,%xmm0
	movdqa	%xmm7,%xmm6
	movdqu	%xmm0,($out,$inp)
	lea	16($inp),$inp
	sub	\$16,$len
	jnc	.Lcbc_dec_loop
.Lcbc_done:
	movdqu	%xmm6,($ivp)		# save IV
___
$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15
	lea	0xb8(%rsp),%rsp
.Lcbc_epilogue:
___
$code.=<<___;
.Lcbc_abort:
	ret
.cfi_endproc
.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
}
$code.=<<___;
##
##  _aes_preheat
##
##  Fills register %r10 -> .aes_consts (so you can -fPIC)
##  and %xmm9-%xmm15 as specified below.
##
.type	_vpaes_preheat,\@abi-omnipotent
.align	16
_vpaes_preheat:
.cfi_startproc
	lea	.Lk_s0F(%rip), %r10
	movdqa	-0x20(%r10), %xmm10	# .Lk_inv
	movdqa	-0x10(%r10), %xmm11	# .Lk_inv+16
	movdqa	0x00(%r10), %xmm9	# .Lk_s0F
	movdqa	0x30(%r10), %xmm13	# .Lk_sb1
	movdqa	0x40(%r10), %xmm12	# .Lk_sb1+16
	movdqa	0x50(%r10), %xmm15	# .Lk_sb2
	movdqa	0x60(%r10), %xmm14	# .Lk_sb2+16
	ret
.cfi_endproc
.size	_vpaes_preheat,.-_vpaes_preheat
########################################################
##                                                    ##
##                     Constants                      ##
##                                                    ##
########################################################
.type	_vpaes_consts,\@object
.align	64
_vpaes_consts:
.Lk_inv:	# inv, inva
	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
	.quad	0x01040A060F0B0780, 0x030D0E0C02050809

.Lk_s0F:	# s0F
	.quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F

.Lk_ipt:	# input transform (lo, hi)
	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81

.Lk_sb1:	# sb1u, sb1t
	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.Lk_sb2:	# sb2u, sb2t
	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
.Lk_sbo:	# sbou, sbot
	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA

.Lk_mc_forward:	# mc_forward
	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
	.quad	0x000302010C0F0E0D, 0x080B0A0904070605

.Lk_mc_backward:# mc_backward
	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
	.quad	0x0A09080B06050407, 0x020100030E0D0C0F

.Lk_sr:		# sr
	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
	.quad	0x0F060D040B020900, 0x070E050C030A0108
	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

.Lk_rcon:	# rcon
	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_s63:	# s63: all equal to 0x63 transformed
	.quad	0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B

.Lk_opt:	# output transform
	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0

.Lk_deskew:	# deskew tables: inverts the sbox's "skew"
	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

##
##  Decryption stuff
##  Key schedule constants
##
.Lk_dksd:	# decryption key schedule: invskew x*D
	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	# decryption key schedule: invskew x*B
	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	# decryption key schedule: invskew x*9
	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

##
##  Decryption stuff
##  Round function constants
##
.Lk_dipt:	# decryption input transform
	.quad	0x0F505B040B545F00, 0x154A411E114E451A
	.quad	0x86E383E660056500, 0x12771772F491F194

.Lk_dsb9:	# decryption sbox output *9*u, *9*t
	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	# decryption sbox output *D*u, *D*t
	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	# decryption sbox output *B*u, *B*t
	.quad	0xD022649296B44200, 0x602646F6B0F2D404
	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	# decryption sbox output *E*u, *E*t
	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
.Lk_dsbo:	# decryption sbox final output
	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.asciz	"Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
.align	64
.size	_vpaes_consts,.-_vpaes_consts
___
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	16(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xb8(%rax),%rax		# adjust stack pointer

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_${PREFIX}_set_encrypt_key
	.rva	.LSEH_end_${PREFIX}_set_encrypt_key
	.rva	.LSEH_info_${PREFIX}_set_encrypt_key

	.rva	.LSEH_begin_${PREFIX}_set_decrypt_key
	.rva	.LSEH_end_${PREFIX}_set_decrypt_key
	.rva	.LSEH_info_${PREFIX}_set_decrypt_key

	.rva	.LSEH_begin_${PREFIX}_encrypt
	.rva	.LSEH_end_${PREFIX}_encrypt
	.rva	.LSEH_info_${PREFIX}_encrypt

	.rva	.LSEH_begin_${PREFIX}_decrypt
	.rva	.LSEH_end_${PREFIX}_decrypt
	.rva	.LSEH_info_${PREFIX}_decrypt

	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
	.rva	.LSEH_info_${PREFIX}_cbc_encrypt

.section	.xdata
.align	8
.LSEH_info_${PREFIX}_set_encrypt_key:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lenc_key_body,.Lenc_key_epilogue	# HandlerData[]
.LSEH_info_${PREFIX}_set_decrypt_key:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Ldec_key_body,.Ldec_key_epilogue	# HandlerData[]
.LSEH_info_${PREFIX}_encrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lenc_body,.Lenc_epilogue		# HandlerData[]
.LSEH_info_${PREFIX}_decrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Ldec_body,.Ldec_epilogue		# HandlerData[]
.LSEH_info_${PREFIX}_cbc_encrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_body,.Lcbc_epilogue		# HandlerData[]
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;