2
0

vpaes-loongarch64.pl 29 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2015-2023 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. ######################################################################
  9. ## Constant-time SSSE3 AES core implementation.
  10. ## version 0.1
  11. ##
  12. ## By Mike Hamburg (Stanford University), 2009
  13. ## Public domain.
  14. ##
  15. ## For details see http://shiftleft.org/papers/vector_aes/ and
  16. ## http://crypto.stanford.edu/vpaes/.
  17. ##
  18. ######################################################################
  19. # Loongarch64 LSX adaptation by <zhuchen@loongson.cn>,
  20. # <lujingfeng@loongson.cn> and <shichenlong@loongson.cn>
  21. #
  22. ($zero,$ra,$tp,$sp)=map("\$r$_",(0..3)); # ABI names for integer registers r0-r3
  23. ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$r$_",(4..11)); # argument registers r4-r11
  24. ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9)=map("\$r$_",(12..21)); # temporaries r12-r21
  25. ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$r$_",(23..30)); # r23-r30
  26. ($vr0,$vr1,$vr2,$vr3,$vr4,$vr5,$vr6,$vr7,$vr8,$vr9,$vr10,$vr11,$vr12,$vr13,$vr14,$vr15,$vr16,$vr17,$vr18,$vr19)=map("\$vr$_",(0..19)); # LSX vector registers vr0-vr19
  27. ($fp)=map("\$r$_",(22)); # r22
  28. # Output file = first argument that looks like "name.ext"; leading arguments
  29. # that do not look like a file are skipped. Without a file, keep writing to stdout.
  30. while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
  31. if ($output) { open(STDOUT,">",$output) or die "cannot open $output: $!"; }
  32. $PREFIX="vpaes"; # prefix of every exported symbol
  33. $code.=<<___;
  34. ##
  35. ## _vpaes_encrypt_core
  36. ##
  37. ## AES-encrypt %vr0.
  38. ##
  39. ## Inputs:
  40. ## %vr0 = input
  41. ## %vr9-%vr15 and %vr18 (= 0) as in _vpaes_preheat
  42. ## (%a2) = scheduled keys, round count at offset 240
  43. ##
  44. ## Output in %vr0
  45. ## Clobbers %vr1-%vr5, %r9, %r10, %r11 (= %a5, %a6, %a7), %t0, %t5
  46. ## Preserves %vr6 - %vr8 so you get some local vectors
  47. ##
  48. ##
  49. ##.type _vpaes_encrypt_core
  50. .align 4
  51. _vpaes_encrypt_core:
  52. .cfi_startproc
  53. move $a5,$a2 # a5 = pointer to the current round key
  54. li.d $a7,0x10 # a7 = byte offset into the mc/sr tables, kept within 0x00..0x30
  55. ld.w $t5,$a2,240 # t5 = round count stored at key+240
  56. vori.b $vr1,$vr9,0 # 1 : 0x0F mask
  57. la.local $t0,Lk_ipt
  58. vld $vr2,$t0,0 # iptlo
  59. vandn.v $vr1,$vr1,$vr0 # 1 = i<<4
  60. vld $vr5,$a5,0 # round0 key
  61. vsrli.w $vr1,$vr1,4 # 1 = i
  62. vand.v $vr0,$vr0,$vr9 # 0 = k
  63. vshuf.b $vr2,$vr18,$vr2,$vr0 # 2 = iptlo lookup by k
  64. vld $vr0,$t0,16 # ipthi
  65. vshuf.b $vr0,$vr18,$vr0,$vr1 # 0 = ipthi lookup by i
  66. vxor.v $vr2,$vr2,$vr5 # 2 = iptlo part + round0 key
  67. addi.d $a5,$a5,16 # next key
  68. vxor.v $vr0,$vr0,$vr2 # 0 = transformed input + round0 key
  69. la.local $a6,Lk_mc_backward # a6 = base for the relative table loads below
  70. b .Lenc_entry
  71. .align 4
  72. .Lenc_loop:
  73. # middle of middle round
  74. vori.b $vr4,$vr13,0 # 4 : sb1u
  75. vori.b $vr0,$vr12,0 # 0 : sb1t
  76. vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sb1u
  77. vshuf.b $vr0,$vr18,$vr0,$vr3 # 0 = sb1t
  78. vxor.v $vr4,$vr4,$vr5 # 4 = sb1u + k
  79. vori.b $vr5,$vr15,0 # 5 : sb2u
  80. vxor.v $vr0,$vr0,$vr4 # 0 = A
  81. add.d $t0,$a7,$a6 # t0 = Lk_mc_backward + a7
  82. vld $vr1,$t0,-0x40 # 1 : Lk_mc_forward[] (= Lk_mc_backward-0x40)
  83. vshuf.b $vr5,$vr18,$vr5,$vr2 # 5 = sb2u
  84. vld $vr4,$t0,0 # 4 : Lk_mc_backward[]
  85. vori.b $vr2,$vr14,0 # 2 : sb2t
  86. vshuf.b $vr2,$vr18,$vr2,$vr3 # 2 = sb2t
  87. vori.b $vr3,$vr0,0 # 3 = A
  88. vxor.v $vr2,$vr5,$vr2 # 2 = 2A
  89. vshuf.b $vr0,$vr18,$vr0,$vr1 # 0 = B
  90. addi.d $a5,$a5,16 # next key
  91. vxor.v $vr0,$vr0,$vr2 # 0 = 2A+B
  92. vshuf.b $vr3,$vr18,$vr3,$vr4 # 3 = D
  93. addi.d $a7,$a7,16 # next mc
  94. vxor.v $vr3,$vr3,$vr0 # 3 = 2A+B+D
  95. vshuf.b $vr0,$vr18,$vr0,$vr1 # 0 = 2B+C
  96. andi $a7,$a7,0x30 # ... mod 4
  97. addi.d $t5,$t5,-1 # nr--
  98. vxor.v $vr0,$vr0,$vr3 # 0 = 2A+3B+C+D
  99. .Lenc_entry:
  100. # top of round
  101. vori.b $vr1,$vr9,0 # 1 : i
  102. vori.b $vr5,$vr11,0 # 5 : a/k
  103. vandn.v $vr1,$vr1,$vr0 # 1 = i<<4
  104. vsrli.w $vr1,$vr1,4 # 1 = i
  105. vand.v $vr0,$vr0,$vr9 # 0 = k
  106. vshuf.b $vr5,$vr18,$vr5,$vr0 # 5 = a/k
  107. vori.b $vr3,$vr10,0 # 3 : 1/i
  108. vxor.v $vr0,$vr0,$vr1 # 0 = j
  109. vshuf.b $vr3,$vr18,$vr3,$vr1 # 3 = 1/i
  110. vori.b $vr4,$vr10,0 # 4 : 1/j
  111. vxor.v $vr3,$vr3,$vr5 # 3 = iak = 1/i + a/k
  112. vshuf.b $vr4,$vr18,$vr4,$vr0 # 4 = 1/j
  113. vori.b $vr2,$vr10,0 # 2 : 1/iak
  114. vxor.v $vr4,$vr4,$vr5 # 4 = jak = 1/j + a/k
  115. vshuf.b $vr2,$vr18,$vr2,$vr3 # 2 = 1/iak
  116. vori.b $vr3,$vr10,0 # 3 : 1/jak
  117. vxor.v $vr2,$vr2,$vr0 # 2 = io
  118. vshuf.b $vr3,$vr18,$vr3,$vr4 # 3 = 1/jak
  119. vld $vr5,$a5,0 # 5 : round key for the next step
  120. vxor.v $vr3,$vr3,$vr1 # 3 = jo
  121. bnez $t5,.Lenc_loop # more middle rounds while nr != 0
  122. # middle of last round
  123. vld $vr4,$a6, -0x60 # 4 : sbou (Lk_sbo = Lk_mc_backward-0x60)
  124. vld $vr0,$a6, -0x50 # 0 : sbot (Lk_sbo+16)
  125. vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sbou
  126. vxor.v $vr4,$vr4,$vr5 # 4 = sbou + k
  127. vshuf.b $vr0,$vr18,$vr0,$vr3 # 0 = sbot
  128. add.d $t0,$a7,$a6 # t0 = Lk_mc_backward + a7
  129. vld $vr1,$t0,0x40 # 1 : Lk_sr[] (= Lk_mc_backward+0x40)
  130. vxor.v $vr0,$vr0,$vr4 # 0 = A
  131. vshuf.b $vr0,$vr18,$vr0,$vr1 # 0 = A permuted by the Lk_sr entry
  132. jr $ra
  133. .cfi_endproc
  134. .size _vpaes_encrypt_core,.-_vpaes_encrypt_core
  135. ##
  136. ## Decryption core
  137. ##
  138. ## Same API as encryption core; additionally clobbers %vr16.
  139. ##
  140. #.type _vpaes_decrypt_core,\@abi-omnipotent
  141. .align 4
  142. _vpaes_decrypt_core:
  143. .cfi_startproc
  144. move $a5,$a2 # load key
  145. ld.w $t5,$a2,240 # t5 = round count stored at key+240
  146. vori.b $vr1,$vr9,0 # 1 : 0x0F mask
  147. la.local $t0,Lk_dipt
  148. vld $vr2,$t0,0 # iptlo
  149. vandn.v $vr1,$vr1,$vr0 # 1 = i<<4
  150. move $a7,$t5 # a7 = rounds
  151. vsrli.w $vr1,$vr1,4 # 1 = i
  152. vld $vr5,$a5,0 # round0 key
  153. slli.d $a7,$a7,4 # a7 = rounds*16
  154. vand.v $vr0,$vr9,$vr0 # 0 = k
  155. vshuf.b $vr2,$vr18,$vr2,$vr0 # 2 = iptlo lookup by k
  156. vld $vr0,$t0,16 # ipthi
  157. xori $a7,$a7,0x30
  158. la.local $a6,Lk_dsbd # a6 = base for the relative table loads below
  159. vshuf.b $vr0,$vr18,$vr0,$vr1 # 0 = ipthi lookup by i
  160. andi $a7,$a7,0x30 # a7 = (rounds*16 ^ 0x30) & 0x30
  161. vxor.v $vr2,$vr2,$vr5 # 2 = iptlo part + round0 key
  162. la.local $t0,Lk_mc_forward
  163. vld $vr5,$t0,48 # 5 : Lk_mc_forward+48
  164. vxor.v $vr0,$vr0,$vr2 # 0 = transformed input + round0 key
  165. addi.d $a5,$a5,16 # next round key
  166. add.d $a7,$a7,$a6 # a7 = Lk_dsbd + offset, used for the final Lk_sr load
  167. b .Ldec_entry
  168. .align 4
  169. .Ldec_loop:
  170. ##
  171. ## Inverse mix columns
  172. ##
  173. vld $vr4,$a6,-0x20 # 4 : sb9u (Lk_dsb9 = Lk_dsbd-0x20)
  174. vld $vr1,$a6,-0x10 # 1 : sb9t
  175. vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sb9u
  176. vshuf.b $vr1,$vr18,$vr1,$vr3 # 1 = sb9t
  177. vxor.v $vr0,$vr0,$vr4 # 0 = round key + sb9u
  178. vld $vr4,$a6,0x0 # 4 : sbdu
  179. vxor.v $vr0,$vr0,$vr1 # 0 = ch
  180. vld $vr1,$a6,0x10 # 1 : sbdt
  181. vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sbdu
  182. vshuf.b $vr0,$vr18,$vr0,$vr5 # MC ch
  183. vshuf.b $vr1,$vr18,$vr1,$vr3 # 1 = sbdt
  184. vxor.v $vr0,$vr0,$vr4 # 0 = ch
  185. vld $vr4,$a6,0x20 # 4 : sbbu (Lk_dsbb)
  186. vxor.v $vr0,$vr0,$vr1 # 0 = ch
  187. vld $vr1,$a6,0x30 # 1 : sbbt
  188. vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sbbu
  189. vshuf.b $vr0,$vr18,$vr0,$vr5 # MC ch
  190. vshuf.b $vr1,$vr18,$vr1,$vr3 # 1 = sbbt
  191. vxor.v $vr0,$vr0,$vr4 # 0 = ch
  192. vld $vr4,$a6,0x40 # 4 : sbeu (Lk_dsbe)
  193. vxor.v $vr0,$vr0,$vr1 # 0 = ch
  194. vld $vr1,$a6,0x50 # 1 : sbet
  195. vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sbeu
  196. vshuf.b $vr0,$vr18,$vr0,$vr5 # MC ch
  197. vshuf.b $vr1,$vr18,$vr1,$vr3 # 1 = sbet
  198. vxor.v $vr0,$vr0,$vr4 # 0 = ch
  199. addi.d $a5,$a5, 16 # next round key
  200. vbsrl.v $vr16,$vr5,0xc # 16 = vr5 shifted right by 12 bytes (scratch)
  201. vbsll.v $vr5,$vr5,0x4 # 5 = vr5 shifted left by 4 bytes
  202. vor.v $vr5,$vr5,$vr16 # 5 = MC permutation rotated by 4 bytes
  203. vxor.v $vr0,$vr0,$vr1 # 0 = ch
  204. addi.d $t5,$t5,-1 # nr--
  205. .Ldec_entry:
  206. # top of round
  207. vori.b $vr1,$vr9,0 # 1 : i
  208. vandn.v $vr1,$vr1,$vr0 # 1 = i<<4
  209. vori.b $vr2,$vr11,0 # 2 : a/k
  210. vsrli.w $vr1,$vr1,4 # 1 = i
  211. vand.v $vr0,$vr0,$vr9 # 0 = k
  212. vshuf.b $vr2,$vr18,$vr2,$vr0 # 2 = a/k
  213. vori.b $vr3,$vr10,0 # 3 : 1/i
  214. vxor.v $vr0,$vr0,$vr1 # 0 = j
  215. vshuf.b $vr3,$vr18,$vr3,$vr1 # 3 = 1/i
  216. vori.b $vr4,$vr10,0 # 4 : 1/j
  217. vxor.v $vr3,$vr3,$vr2 # 3 = iak = 1/i + a/k
  218. vshuf.b $vr4,$vr18,$vr4,$vr0 # 4 = 1/j
  219. vxor.v $vr4,$vr4,$vr2 # 4 = jak = 1/j + a/k
  220. vori.b $vr2,$vr10,0 # 2 : 1/iak
  221. vshuf.b $vr2,$vr18,$vr2,$vr3 # 2 = 1/iak
  222. vori.b $vr3,$vr10,0 # 3 : 1/jak
  223. vxor.v $vr2,$vr2,$vr0 # 2 = io
  224. vshuf.b $vr3,$vr18,$vr3,$vr4 # 3 = 1/jak
  225. vld $vr0,$a5,0 # 0 : round key for the next step
  226. vxor.v $vr3,$vr3,$vr1 # 3 = jo
  227. bnez $t5,.Ldec_loop # more middle rounds while nr != 0
  228. # middle of last round
  229. vld $vr4,$a6,0x60 # 4 : sbou (Lk_dsbo = Lk_dsbd+0x60)
  230. vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sbou
  231. vxor.v $vr4,$vr4,$vr0 # 4 = sbou + k
  232. vld $vr0,$a6,0x70 # 0 : sbot
  233. vld $vr2,$a7,-0x160 # 2 : Lk_sr[] entry; Lk_sr-Lk_dsbd = -0x160
  234. vshuf.b $vr0,$vr18,$vr0,$vr3 # 0 = sbot
  235. vxor.v $vr0,$vr0,$vr4 # 0 = A
  236. vshuf.b $vr0,$vr18,$vr0,$vr2 # 0 = A permuted by the Lk_sr entry
  237. jr $ra
  238. .cfi_endproc
  239. .size _vpaes_decrypt_core,.-_vpaes_decrypt_core
  240. ########################################################
  241. ## ##
  242. ## AES key schedule ##
  243. ## ##
  244. ########################################################
  245. #.type _vpaes_schedule_core,\@abi-omnipotent
  246. .align 4
  247. _vpaes_schedule_core:
  248. .cfi_startproc
  249. # a0 = key
  250. # a1 = size in bits
  251. # a2 = buffer
  252. # a3 = direction. 0=encrypt, 1=decrypt; a4 = initial byte offset into Lk_sr (set by the caller)
  253. addi.d $sp,$sp,-48 # 48-byte frame
  254. st.d $ra,$sp,40 # ra is overwritten by the calls below
  255. st.d $fp,$sp,32
  256. bl _vpaes_preheat # load the tables
  257. la.local $t0,Lk_rcon
  258. vld $vr8,$t0,0 # load rcon
  259. vld $vr0,$a0,0 # load key (unaligned)
  260. # input transform
  261. vori.b $vr3,$vr0,0 # 3 = untransformed key, used by the decrypting path below
  262. la.local $a7,Lk_ipt # a7 = tables for _vpaes_schedule_transform
  263. bl _vpaes_schedule_transform
  264. vori.b $vr7,$vr0,0 # 7 = transformed key
  265. la.local $a6,Lk_sr # a6 = Lk_sr base for the whole schedule
  266. bnez $a3,.Lschedule_am_decrypting
  267. # encrypting, output zeroth round key after transform
  268. vst $vr0,$a2,0
  269. b .Lschedule_go
  270. .Lschedule_am_decrypting:
  271. # decrypting, output zeroth round key after shiftrows
  272. add.d $t2,$a4,$a6 # t2 = Lk_sr + a4
  273. vld $vr1,$t2,0
  274. vshuf.b $vr3,$vr18,$vr3,$vr1 # 3 = untransformed key permuted by the Lk_sr entry
  275. vst $vr3,$a2,0
  276. xori $a4,$a4,0x30
  277. .Lschedule_go:
  278. li.d $t6,192
  279. bltu $t6,$a1,.Lschedule_256 # key size above 192 bits
  280. beq $t6,$a1,.Lschedule_192 # key size exactly 192 bits
  281. # 128: fall through
  282. ##
  283. ## .schedule_128
  284. ##
  285. ## 128-bit specific part of key schedule.
  286. ##
  287. ## This schedule is really simple, because all its parts
  288. ## are accomplished by the subroutines.
  289. ##
  290. .Lschedule_128:
  291. li.w $a1,10 # a1 = iteration counter
  292. .Loop_schedule_128:
  293. bl _vpaes_schedule_round
  294. addi.w $a1,$a1,-1
  295. beqz $a1,.Lschedule_mangle_last # last key takes the final mangle path
  296. bl _vpaes_schedule_mangle
  297. b .Loop_schedule_128
  298. ##
  299. ## .aes_schedule_192
  300. ##
  301. ## 192-bit specific part of key schedule.
  302. ##
  303. ## The main body of this schedule is the same as the 128-bit
  304. ## schedule, but with more smearing. The long, high side is
  305. ## stored in %vr7 as before, and the short, low side is in
  306. ## the high bits of %vr6.
  307. ##
  308. ## This schedule is somewhat nastier, however, because each
  309. ## round produces 192 bits of key material, or 1.5 round keys.
  310. ## Therefore, on each cycle we do 2 rounds and produce 3 round
  311. ## keys.
  312. ##
  313. .align 4
  314. .Lschedule_192:
  315. vld $vr0,$a0,8 # load key part 2 (bytes 8..23 of the key)
  316. bl _vpaes_schedule_transform # input transform
  317. vaddi.du $vr6,$vr0,0x0 # save short part (add 0 = copy)
  318. vxor.v $vr4,$vr4,$vr4 # clear 4
  319. vpackod.d $vr6,$vr6,$vr4 # clobber low side with zeros
  320. li.w $a1,4 # a1 = iteration counter
  321. .Loop_schedule_192:
  322. bl _vpaes_schedule_round
  323. vbsrl.v $vr16,$vr6,0x8 # 16 = vr6 shifted right by 8 bytes
  324. vbsll.v $vr0,$vr0,0x8 # 0 = vr0 shifted left by 8 bytes
  325. vor.v $vr0,$vr0,$vr16 # 0 = low half of vr0 on top, high half of vr6 below
  326. bl _vpaes_schedule_mangle # save key n
  327. bl _vpaes_schedule_192_smear
  328. bl _vpaes_schedule_mangle # save key n+1
  329. bl _vpaes_schedule_round
  330. addi.w $a1,$a1,-1
  331. beqz $a1,.Lschedule_mangle_last # last key takes the final mangle path
  332. bl _vpaes_schedule_mangle # save key n+2
  333. bl _vpaes_schedule_192_smear
  334. b .Loop_schedule_192
  335. ##
  336. ## .aes_schedule_256
  337. ##
  338. ## 256-bit specific part of key schedule.
  339. ##
  340. ## The structure here is very similar to the 128-bit
  341. ## schedule, but with an additional "low side" in
  342. ## %vr6. The low side's rounds are the same as the
  343. ## high side's, except no rcon and no rotation.
  344. ##
  345. .align 4
  346. .Lschedule_256:
  347. vld $vr0,$a0,16 # load key part 2 (unaligned)
  348. bl _vpaes_schedule_transform # input transform
  349. addi.w $a1,$zero,7 # a1 = iteration counter
  350. .Loop_schedule_256:
  351. bl _vpaes_schedule_mangle # output low result
  352. vori.b $vr6,$vr0,0 # save cur_lo in vr6
  353. # high round
  354. bl _vpaes_schedule_round
  355. addi.d $a1,$a1,-1
  356. beqz $a1,.Lschedule_mangle_last # last key takes the final mangle path
  357. bl _vpaes_schedule_mangle
  358. # low round. swap vr7 and vr6
  359. vshuf4i.w $vr0,$vr0,0xFF # every 2-bit selector is 3: copy word 3 of vr0 into all four words
  360. vori.b $vr5,$vr7,0 # 5 = saved high side
  361. vori.b $vr7,$vr6,0 # 7 = low side for the low round
  362. bl _vpaes_schedule_low_round
  363. vori.b $vr7,$vr5,0 # restore high side into vr7
  364. b .Loop_schedule_256
  365. ##
  366. ## .aes_schedule_mangle_last
  367. ##
  368. ## Mangler for last round of key schedule
  369. ## Mangles %vr0
  370. ## when encrypting, outputs out(%vr0) ^ 63
  371. ## when decrypting, outputs unskew(%vr0)
  372. ##
  373. ## Always called right before return... jumps to cleanup and exits
  374. ##
  375. .align 4
  376. .Lschedule_mangle_last:
  377. # schedule last round key from vr0
  378. la.local $a7,Lk_deskew # prepare to deskew
  379. bnez $a3,.Lschedule_mangle_last_dec
  380. # encrypting
  381. add.d $t0,$a4,$a6 # t0 = Lk_sr + a4
  382. vld $vr1,$t0,0
  383. vshuf.b $vr0,$vr18,$vr0,$vr1 # output permute
  384. la.local $a7,Lk_opt # prepare to output transform
  385. addi.d $a2,$a2,32 # with the -16 below: net +16 when encrypting
  386. .Lschedule_mangle_last_dec:
  387. addi.d $a2,$a2,-16
  388. la.local $t0,Lk_s63
  389. vld $vr16,$t0,0
  390. vxor.v $vr0,$vr0,$vr16 # 0 ^= s63
  391. bl _vpaes_schedule_transform # output transform
  392. vst $vr0,$a2,0 # save last key
  393. # cleanup: zero the working vectors vr0-vr7
  394. vxor.v $vr0,$vr0,$vr0
  395. vxor.v $vr1,$vr1,$vr1
  396. vxor.v $vr2,$vr2,$vr2
  397. vxor.v $vr3,$vr3,$vr3
  398. vxor.v $vr4,$vr4,$vr4
  399. vxor.v $vr5,$vr5,$vr5
  400. vxor.v $vr6,$vr6,$vr6
  401. vxor.v $vr7,$vr7,$vr7
  402. ld.d $ra,$sp,40
  403. ld.d $fp,$sp,32
  404. addi.d $sp,$sp,48
  405. jr $ra
  406. .cfi_endproc
  407. .size _vpaes_schedule_core,.-_vpaes_schedule_core
  408. ##
  409. ## .aes_schedule_192_smear
  410. ##
  411. ## Smear the short, low side in the 192-bit key schedule.
  412. ##
  413. ## Inputs:
  414. ## %vr7: high side, b a x y
  415. ## %vr6: low side, d c 0 0
  416. ## (%vr13 is not read here; %vr1 is zeroed internally)
  417. ##
  418. ## Outputs:
  419. ## %vr6: b+c+d b+c 0 0
  420. ## %vr0: b+c+d b+c b a
  421. ## Clobbers %vr1
  422. #.type _vpaes_schedule_192_smear,\@abi-omnipotent
  423. .align 4
  424. _vpaes_schedule_192_smear:
  425. .cfi_startproc
  426. vshuf4i.w $vr1,$vr6,0x80 # d c 0 0 -> c 0 0 0
  427. vshuf4i.w $vr0,$vr7,0xFE # b a _ _ -> b b b a
  428. vxor.v $vr6,$vr6,$vr1 # -> c+d c 0 0
  429. vxor.v $vr1,$vr1,$vr1 # 1 = 0
  430. vxor.v $vr6,$vr6,$vr0 # -> b+c+d b+c b a
  431. vori.b $vr0,$vr6,0 # 0 = full result
  432. vilvh.d $vr6,$vr6,$vr1 # clobber low side with zeros
  433. jr $ra
  434. .cfi_endproc
  435. .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
  436. ##
  437. ## .aes_schedule_round
  438. ##
  439. ## Runs one main round of the key schedule on %vr0, %vr7
  440. ##
  441. ## Specifically, runs subbytes on the high dword of %vr0
  442. ## then rotates it by one byte and xors into the low dword of
  443. ## %vr7.
  444. ##
  445. ## Adds rcon from the top byte (byte 15) of %vr8, then rotates
  446. ## %vr8 by one byte for the next rcon.
  447. ##
  448. ## Smears the dwords of %vr7 by xoring the low into the
  449. ## second low, result into third, result into highest.
  450. ##
  451. ## Returns results in %vr7 = %vr0.
  452. ## Clobbers %vr1-%vr4, %vr16.
  453. ##
  454. #.type _vpaes_schedule_round,\@abi-omnipotent
  455. .align 4
  456. _vpaes_schedule_round:
  457. .cfi_startproc
  458. # extract rcon from vr8
  459. vxor.v $vr1,$vr1,$vr1 # 1 = 0
  460. vbsrl.v $vr16,$vr8,0xf # 16 = byte 15 of vr8 moved down to byte 0
  461. vbsll.v $vr1,$vr1,0x1 # shifting the zero vector leaves it zero
  462. vor.v $vr1,$vr1,$vr16 # 1 = current rcon byte in byte 0
  463. vbsrl.v $vr16,$vr8,0xf
  464. vbsll.v $vr8,$vr8,0x1
  465. vor.v $vr8,$vr8,$vr16 # 8 = vr8 rotated left by one byte
  466. vxor.v $vr7,$vr7,$vr1 # add rcon into vr7
  467. # rotate
  468. vshuf4i.w $vr0,$vr0,0xff # copy the highest 32-bit word of $vr0 into all four words
  469. vbsrl.v $vr16,$vr0,0x1
  470. vbsll.v $vr0,$vr0,0xf
  471. vor.v $vr0,$vr0,$vr16 # 0 = vr0 rotated right by one byte
  472. # fall through...
  473. # low round: same as high round, but no rotation and no rcon.
  474. _vpaes_schedule_low_round:
  475. # smear vr7
  476. vaddi.du $vr1,$vr7,0x0 # 1 = copy of vr7 (add 0)
  477. vbsll.v $vr7,$vr7,0x4 # 7 <<= 4 bytes
  478. vxor.v $vr7,$vr7,$vr1
  479. vaddi.du $vr1,$vr7,0x0 # 1 = copy of vr7 (add 0)
  480. vbsll.v $vr7,$vr7,0x8 # 7 <<= 8 bytes
  481. vxor.v $vr7,$vr7,$vr1
  482. vxori.b $vr7,$vr7,0x5B # xor every byte with 0x5B, the byte value stored in Lk_s63
  483. # subbytes
  484. vaddi.du $vr1,$vr9,0x0 # 1 : 0x0F mask
  485. vandn.v $vr1,$vr1,$vr0 # 1 = i<<4
  486. vsrli.w $vr1,$vr1,0x4 # 1 = i
  487. vand.v $vr0,$vr0,$vr9 # 0 = k
  488. vaddi.du $vr2,$vr11,0x0 # 2 : a/k
  489. vshuf.b $vr2,$vr18,$vr2,$vr0 # 2 = a/k
  490. vxor.v $vr0,$vr0,$vr1 # 0 = j
  491. vaddi.du $vr3,$vr10,0x0 # 3 : 1/i
  492. vshuf.b $vr3,$vr18,$vr3,$vr1 # 3 = 1/i
  493. vxor.v $vr3,$vr3,$vr2 # 3 = iak = 1/i + a/k
  494. vaddi.du $vr4,$vr10,0x0 # 4 : 1/j
  495. vshuf.b $vr4,$vr18,$vr4,$vr0 # 4 = 1/j
  496. vxor.v $vr4,$vr4,$vr2 # 4 = jak = 1/j + a/k
  497. vaddi.du $vr2,$vr10,0x0 # 2 : 1/iak
  498. vshuf.b $vr2,$vr18,$vr2,$vr3 # 2 = 1/iak
  499. vxor.v $vr2,$vr2,$vr0 # 2 = io
  500. vaddi.du $vr3,$vr10,0x0 # 3 : 1/jak
  501. vshuf.b $vr3,$vr18,$vr3,$vr4 # 3 = 1/jak
  502. vxor.v $vr3,$vr3,$vr1 # 3 = jo
  503. vaddi.du $vr4,$vr13,0x0 # 4 : sb1u (vr13 holds Lk_sb1 after preheat)
  504. vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sb1u
  505. vaddi.du $vr0,$vr12,0x0 # 0 : sb1t (vr12 holds Lk_sb1+16 after preheat)
  506. vshuf.b $vr0,$vr18,$vr0,$vr3 # 0 = sb1t
  507. vxor.v $vr0,$vr0,$vr4 # 0 = sbox output
  508. # add in smeared stuff
  509. vxor.v $vr0,$vr0,$vr7
  510. vaddi.du $vr7,$vr0,0x0 # 7 = copy of the result, so vr7 = vr0 on return
  511. jr $ra
  512. .cfi_endproc
  513. .size _vpaes_schedule_round,.-_vpaes_schedule_round
  514. ##
  515. ## .aes_schedule_transform
  516. ##
  517. ## Linear-transform %vr0 according to tables at (%r11), i.e. (%a7):
  518. ## 16 bytes indexed by the low nibbles, then 16 bytes indexed by the high nibbles.
  519. ## Requires that %vr9 = 0x0F0F... as in preheat
  520. ## Output in %vr0
  521. ## Clobbers %vr1, %vr2
  522. ##
  523. #.type _vpaes_schedule_transform,\@abi-omnipotent
  524. .align 4
  525. _vpaes_schedule_transform:
  526. .cfi_startproc
  527. vori.b $vr1,$vr9,0 # 1 : 0x0F mask
  528. vandn.v $vr1,$vr1,$vr0 # 1 = high nibbles, still shifted left by 4
  529. vsrli.w $vr1,$vr1,4 # 1 = high nibbles
  530. vand.v $vr0,$vr0,$vr9 # 0 = low nibbles
  531. vld $vr2,$a7,0 # lo
  532. vshuf.b $vr2,$vr18,$vr2,$vr0 # 2 = lo table lookup
  533. vld $vr0,$a7,16 # hi
  534. vshuf.b $vr0,$vr18,$vr0,$vr1 # 0 = hi table lookup
  535. vxor.v $vr0,$vr0,$vr2 # 0 = hi lookup + lo lookup
  536. jr $ra
  537. .cfi_endproc
  538. .size _vpaes_schedule_transform,.-_vpaes_schedule_transform
  539. ##
  540. ## .aes_schedule_mangle
  541. ##
  542. ## Mangle vr0 from (basis-transformed) standard version
  543. ## to our version.
  544. ##
  545. ## On encrypt,
  546. ## xor with 0x63
  547. ## multiply by circulant 0,1,1,1
  548. ## apply shiftrows transform
  549. ##
  550. ## On decrypt,
  551. ## xor with 0x63
  552. ## multiply by "inverse mixcolumns" circulant E,B,D,9
  553. ## deskew
  554. ## apply shiftrows transform
  555. ##
  556. ##
  557. ## Writes out to (%a2), and increments or decrements it
  558. ## Keeps track of round number mod 4 in %a4
  559. ## Preserves vr0
  560. ## Clobbers vr1-vr5, t0, t2; also vr16 when encrypting and a7 when decrypting
  561. ##
  562. #.type _vpaes_schedule_mangle,\@abi-omnipotent
  563. .align 4
  564. _vpaes_schedule_mangle:
  565. .cfi_startproc
  566. vori.b $vr4,$vr0,0 # save vr0 for later
  567. la.local $t0,Lk_mc_forward
  568. vld $vr5,$t0,0 # 5 : first Lk_mc_forward entry
  569. bnez $a3,.Lschedule_mangle_dec
  570. # encrypting
  571. addi.d $a2,$a2,16 # advance the output pointer
  572. la.local $t0,Lk_s63
  573. vld $vr16,$t0,0
  574. vxor.v $vr4,$vr4,$vr16 # 4 = key + s63
  575. vshuf.b $vr4,$vr18,$vr4,$vr5 # 4 permuted once by mc_forward
  576. vori.b $vr3,$vr4,0 # 3 = first permutation
  577. vshuf.b $vr4,$vr18,$vr4,$vr5 # 4 permuted twice
  578. vxor.v $vr3,$vr3,$vr4
  579. vshuf.b $vr4,$vr18,$vr4,$vr5 # 4 permuted three times
  580. vxor.v $vr3,$vr3,$vr4 # 3 = xor of the three permutations
  581. b .Lschedule_mangle_both
  582. .align 4
  583. .Lschedule_mangle_dec:
  584. # inverse mix columns
  585. la.local $a7,Lk_dksd # a7 = base of the four 32-byte dks tables
  586. vori.b $vr1,$vr9,0
  587. vandn.v $vr1,$vr1,$vr4
  588. vsrli.w $vr1,$vr1,4 # 1 = hi
  589. vand.v $vr4,$vr4,$vr9 # 4 = lo
  590. vld $vr2,$a7,0 # Lk_dksd, lo table
  591. vshuf.b $vr2,$vr18,$vr2,$vr4
  592. vld $vr3,$a7,0x10 # Lk_dksd, hi table
  593. vshuf.b $vr3,$vr18,$vr3,$vr1
  594. vxor.v $vr3,$vr3,$vr2
  595. vshuf.b $vr3,$vr18,$vr3,$vr5 # permute by mc_forward
  596. vld $vr2,$a7,0x20 # Lk_dksb, lo table
  597. vshuf.b $vr2,$vr18,$vr2,$vr4
  598. vxor.v $vr2,$vr2,$vr3
  599. vld $vr3,$a7,0x30 # Lk_dksb, hi table
  600. vshuf.b $vr3,$vr18,$vr3,$vr1
  601. vxor.v $vr3,$vr3,$vr2
  602. vshuf.b $vr3,$vr18,$vr3,$vr5 # permute by mc_forward
  603. vld $vr2,$a7,0x40 # Lk_dkse, lo table
  604. vshuf.b $vr2,$vr18,$vr2,$vr4
  605. vxor.v $vr2,$vr2,$vr3
  606. vld $vr3,$a7,0x50 # Lk_dkse, hi table
  607. vshuf.b $vr3,$vr18,$vr3,$vr1
  608. vxor.v $vr3,$vr3,$vr2
  609. vshuf.b $vr3,$vr18,$vr3,$vr5 # permute by mc_forward
  610. vld $vr2,$a7,0x60 # Lk_dks9, lo table
  611. vshuf.b $vr2,$vr18,$vr2,$vr4
  612. vxor.v $vr2,$vr2,$vr3
  613. vld $vr3,$a7,0x70 # Lk_dks9, hi table
  614. vshuf.b $vr3,$vr18,$vr3,$vr1
  615. vxor.v $vr3,$vr3,$vr2
  616. addi.d $a2,$a2,-16 # step the output pointer backwards
  617. .Lschedule_mangle_both:
  618. add.d $t2,$a4,$a6 # t2 = a6 + a4 (a6 is Lk_sr when called from _vpaes_schedule_core)
  619. vld $vr1,$t2,0
  620. vshuf.b $vr3,$vr18,$vr3,$vr1 # apply the permutation loaded from a6 + a4
  621. addi.d $a4,$a4,-16
  622. andi $a4,$a4,0x30 # a4 = (a4 - 16) mod 64
  623. vst $vr3,$a2,0 # write the mangled round key
  624. jirl $zero,$ra,0
  625. .cfi_endproc
  626. .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
  627. #
  628. # Interface to OpenSSL
  629. #
  630. .globl ${PREFIX}_set_encrypt_key
  631. #.type ${PREFIX}_set_encrypt_key,\@function,3
  632. .align 4
  633. ${PREFIX}_set_encrypt_key:
  634. .cfi_startproc
  635. ___
  636. $code.=<<___;
  637. addi.d $sp,$sp,-48 # 48-byte frame
  638. st.d $ra,$sp,40 # ra is overwritten by the call below
  639. st.d $fp,$sp,32
  640. move $t5,$a1 # t5 = nbits
  641. srli.w $t5,$t5,0x5
  642. addi.w $t5,$t5,0x5
  643. st.w $t5,$a2,240 # AES_KEY->rounds = nbits/32+5;
  644. move $a3,$zero # a3 = 0: encrypt direction
  645. li.d $a4,0x30 # a4 = initial Lk_sr offset for the schedule
  646. bl _vpaes_schedule_core
  647. ___
  648. $code.=<<___;
  649. xor $a0,$a0,$a0 # return 0
  650. ld.d $ra,$sp,40
  651. ld.d $fp,$sp,32
  652. addi.d $sp,$sp,48
  653. jirl $zero,$ra,0
  654. .cfi_endproc
  655. .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
  656. .globl ${PREFIX}_set_decrypt_key
  657. #.type ${PREFIX}_set_decrypt_key,\@function,3
  658. .align 4
  659. ${PREFIX}_set_decrypt_key:
  660. .cfi_startproc
  661. .Ldec_key_body:
  662. ___
  663. $code.=<<___;
  664. addi.d $sp,$sp,-48 # 48-byte frame
  665. st.d $ra,$sp,40 # ra is overwritten by the call below
  666. st.d $fp,$sp,32
  667. move $t5,$a1 # t5 = nbits
  668. srli.w $t5,$t5,5
  669. addi.w $t5,$t5,5
  670. st.w $t5,$a2,240 # AES_KEY->rounds = nbits/32+5;
  671. slli.w $t5,$t5,4 # t5 = rounds*16
  672. add.d $t0,$a2,$t5
  673. addi.d $a2,$t0,16 # a2 = key + rounds*16 + 16; the decrypt schedule steps a2 down by 16 per key
  674. li.d $a3,0x1 # a3 = 1: decrypt direction
  675. move $a4,$a1
  676. srli.w $a4,$a4,1
  677. andi $a4,$a4,32
  678. xori $a4,$a4,32 # nbits==192?0:32
  679. bl _vpaes_schedule_core
  680. .Ldec_key_epilogue:
  681. ___
  682. $code.=<<___;
  683. xor $a0,$a0,$a0 # return 0
  684. ld.d $ra,$sp,40
  685. ld.d $fp,$sp,32
  686. addi.d $sp,$sp,48
  687. jirl $zero,$ra,0
  688. .cfi_endproc
  689. .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
  690. .globl ${PREFIX}_encrypt
  691. #.type ${PREFIX}_encrypt,\@function,3
  692. .align 4
  693. ${PREFIX}_encrypt:
  694. .cfi_startproc
  695. .Lenc_body:
  696. ___
  697. $code.=<<___;
  698. addi.d $sp,$sp,-48 # 48-byte frame
  699. st.d $ra,$sp,40 # ra is overwritten by the calls below
  700. st.d $fp,$sp,32
  701. vld $vr0,$a0,0x0 # 0 = 16-byte input block at a0
  702. bl _vpaes_preheat # load constant tables into vr9-vr15, zero vr18
  703. bl _vpaes_encrypt_core # key schedule pointer is taken from a2
  704. vst $vr0,$a1,0x0 # store the 16-byte result at a1
  705. .Lenc_epilogue:
  706. ___
  707. $code.=<<___;
  708. ld.d $ra,$sp,40
  709. ld.d $fp,$sp,32
  710. addi.d $sp,$sp,48
  711. jirl $zero,$ra,0
  712. .cfi_endproc
  713. .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
  714. .globl ${PREFIX}_decrypt
  715. #.type ${PREFIX}_decrypt,\@function,3
  716. .align 4
  717. ${PREFIX}_decrypt:
  718. .cfi_startproc
  719. ___
  720. $code.=<<___;
  721. addi.d $sp,$sp,-48 # 48-byte frame
  722. st.d $ra,$sp,40 # ra is overwritten by the calls below
  723. st.d $fp,$sp,32
  724. vld $vr0,$a0,0x0 # 0 = 16-byte input block at a0
  725. bl _vpaes_preheat # load constant tables into vr9-vr15, zero vr18
  726. bl _vpaes_decrypt_core # key schedule pointer is taken from a2
  727. vst $vr0,$a1,0x0 # store the 16-byte result at a1
  728. ___
  729. $code.=<<___;
  730. ld.d $ra,$sp,40
  731. ld.d $fp,$sp,32
  732. addi.d $sp,$sp,48
  733. jirl $zero,$ra,0
  734. .cfi_endproc
  735. .size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
  736. ___
  737. {
  738. my ($inp,$out,$len,$key,$ivp,$enc)=("$a0","$a1","$a2","$a3","$a4","$a5");
  739. # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
  740. # size_t length, const AES_KEY *key,
  741. # unsigned char *ivp,const int enc);
  742. $code.=<<___;
  743. .globl ${PREFIX}_cbc_encrypt
  744. #.type ${PREFIX}_cbc_encrypt,\@function,6
  745. .align 4
  746. ${PREFIX}_cbc_encrypt:
  747. .cfi_startproc
  748. addi.d $sp,$sp,-48 # 48-byte frame
  749. st.d $ra,$sp,40 # ra is overwritten by the calls below
  750. st.d $fp,$sp,32
  751. ori $t0,$len,0 # swap a2 and a3 through t0: the cores read the key pointer from a2
  752. ori $len,$key,0
  753. ori $key,$t0,0
  754. ___
  755. ($len,$key)=($key,$len); # keep the Perl names in step with the register swap emitted above
  756. $code.=<<___;
  757. addi.d $len,$len,-16 # len -= 16
  758. blt $len,$zero,.Lcbc_abort # fewer than 16 bytes: nothing to do
  759. ___
  760. $code.=<<___;
  761. vld $vr6,$ivp,0 # load IV
  762. sub.d $out,$out,$inp # out = out - inp, so inp + out addresses the output block
  763. bl _vpaes_preheat
  764. beqz $a5,.Lcbc_dec_loop # enc == 0: decrypt
  765. b .Lcbc_enc_loop
  766. .align 4
  767. .Lcbc_enc_loop:
  768. vld $vr0,$inp,0 # 0 = plaintext block
  769. vxor.v $vr0,$vr0,$vr6 # xor with IV / previous ciphertext
  770. bl _vpaes_encrypt_core
  771. vori.b $vr6,$vr0,0 # 6 = ciphertext, chained into the next block
  772. add.d $t0,$out,$inp
  773. vst $vr0,$t0,0 # store ciphertext block
  774. addi.d $inp,$inp,16
  775. addi.d $len,$len,-16
  776. bge $len,$zero,.Lcbc_enc_loop # loop while a whole block remains
  777. b .Lcbc_done
  778. .align 4
  779. .Lcbc_dec_loop:
  780. vld $vr0,$inp,0 # 0 = ciphertext block
  781. vori.b $vr7,$vr0,0 # 7 = copy of the ciphertext (the core preserves vr6-vr8)
  782. bl _vpaes_decrypt_core
  783. vxor.v $vr0,$vr0,$vr6 # xor with IV / previous ciphertext
  784. vori.b $vr6,$vr7,0 # 6 = this ciphertext, chained into the next block
  785. add.d $t0,$out,$inp
  786. vst $vr0,$t0,0 # store plaintext block
  787. addi.d $inp,$inp,16
  788. addi.d $len,$len,-16
  789. bge $len,$zero,.Lcbc_dec_loop # loop while a whole block remains
  790. .Lcbc_done:
  791. vst $vr6,$ivp,0 # save IV
  792. ___
  793. $code.=<<___;
  794. .Lcbc_abort:
  795. ld.d $ra,$sp,40
  796. ld.d $fp,$sp,32
  797. addi.d $sp,$sp,48
  798. jirl $zero,$ra,0
  799. .cfi_endproc
  800. .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
  801. ___
  802. }
  803. {
  804. $code.=<<___;
  805. ##
  806. ## _vpaes_preheat
  807. ##
  808. ## Points %a6 at Lk_s0F, loads the constant tables around it into
  809. ## %vr9-%vr15 as specified below, and zeroes %vr18.
  810. ##
  811. #.type _vpaes_preheat,\@abi-omnipotent
  812. .align 4
  813. _vpaes_preheat:
  814. .cfi_startproc
  815. la.local $a6,Lk_s0F # a6 = base for the relative loads below
  816. vld $vr10,$a6,-0x20 # Lk_inv
  817. vld $vr11,$a6,-0x10 # Lk_inv+16
  818. vld $vr9,$a6,0 # Lk_s0F
  819. vld $vr13,$a6,0x30 # Lk_sb1
  820. vld $vr12,$a6,0x40 # Lk_sb1+16
  821. vld $vr15,$a6,0x50 # Lk_sb2
  822. vld $vr14,$a6,0x60 # Lk_sb2+16
  823. vldi $vr18,0 # $vr18 in this program is equal to 0
  824. jirl $zero,$ra,0
  825. .cfi_endproc
  826. .size _vpaes_preheat,.-_vpaes_preheat
  827. ___
  828. }
  829. ########################################################
  830. ## ##
  831. ## Constants ##
  832. ## ##
  833. ########################################################
  834. $code.=<<___;
  835. .section .rodata
  836. .align 6 # the code reaches tables by fixed offsets from one another: keep order and sizes
  837. Lk_inv: # inv, inva (offset 0x000); NOTE(review): the 0x10 index bytes presumably select a zero byte from vr18 in vshuf.b - confirm
  838. .quad 0x0E05060F0D080110, 0x040703090A0B0C02
  839. .quad 0x01040A060F0B0710, 0x030D0E0C02050809
  840. Lk_s0F: # s0F (offset 0x020)
  841. .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
  842. Lk_ipt: # input transform (lo, hi) (offset 0x030)
  843. .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
  844. .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
  845. Lk_sb1: # sb1u, sb1t (offset 0x050)
  846. .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
  847. .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
  848. Lk_sb2: # sb2u, sb2t (offset 0x070)
  849. .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
  850. .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
  851. Lk_sbo: # sbou, sbot (offset 0x090)
  852. .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
  853. .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
  854. Lk_mc_forward: # mc_forward (offset 0x0B0, four 16-byte entries)
  855. .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
  856. .quad 0x080B0A0904070605, 0x000302010C0F0E0D
  857. .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
  858. .quad 0x000302010C0F0E0D, 0x080B0A0904070605
  859. Lk_mc_backward:# mc_backward (offset 0x0F0, four 16-byte entries)
  860. .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
  861. .quad 0x020100030E0D0C0F, 0x0A09080B06050407
  862. .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
  863. .quad 0x0A09080B06050407, 0x020100030E0D0C0F
  864. Lk_sr: # sr (offset 0x130, four 16-byte entries)
  865. .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
  866. .quad 0x030E09040F0A0500, 0x0B06010C07020D08
  867. .quad 0x0F060D040B020900, 0x070E050C030A0108
  868. .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
  869. Lk_rcon: # rcon (offset 0x170)
  870. .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
  871. Lk_s63: # s63: all equal to 0x63 transformed (offset 0x180)
  872. .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
  873. Lk_opt: # output transform (offset 0x190)
  874. .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
  875. .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
  876. Lk_deskew: # deskew tables: inverts the sbox's "skew" (offset 0x1B0)
  877. .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
  878. .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
  879. ##
  880. ## Decryption stuff
  881. ## Key schedule constants (read as Lk_dksd+0x00..0x70 by _vpaes_schedule_mangle)
  882. ##
  883. Lk_dksd: # decryption key schedule: invskew x*D (offset 0x1D0)
  884. .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
  885. .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
  886. Lk_dksb: # decryption key schedule: invskew x*B (offset 0x1F0)
  887. .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
  888. .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
  889. Lk_dkse: # decryption key schedule: invskew x*E + 0x63 (offset 0x210)
  890. .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
  891. .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
  892. Lk_dks9: # decryption key schedule: invskew x*9 (offset 0x230)
  893. .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
  894. .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
  895. ##
  896. ## Decryption stuff
  897. ## Round function constants (read relative to Lk_dsbd by _vpaes_decrypt_core)
  898. ##
  899. Lk_dipt: # decryption input transform (offset 0x250)
  900. .quad 0x0F505B040B545F00, 0x154A411E114E451A
  901. .quad 0x86E383E660056500, 0x12771772F491F194
  902. Lk_dsb9: # decryption sbox output *9*u, *9*t (offset 0x270)
  903. .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
  904. .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
  905. Lk_dsbd: # decryption sbox output *D*u, *D*t (offset 0x290)
  906. .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
  907. .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
  908. Lk_dsbb: # decryption sbox output *B*u, *B*t (offset 0x2B0)
  909. .quad 0xD022649296B44200, 0x602646F6B0F2D404
  910. .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
  911. Lk_dsbe: # decryption sbox output *E*u, *E*t (offset 0x2D0)
  912. .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
  913. .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
  914. Lk_dsbo: # decryption sbox final output (offset 0x2F0)
  915. .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
  916. .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
  917. .asciz "Vector Permutation AES for loongarch64/lsx, Mike Hamburg (Stanford University)"
  918. .align 6
  919. ___
  920. $code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace each backtick-quoted span with the value eval returns for it
  921. print $code; # emit the generated assembly on STDOUT
  922. close STDOUT or die "error closing STDOUT: $!"; # a failed close is fatal