#! /usr/bin/env perl
# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# ChaCha20 for ARMv8 via SVE
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

sub AUTOLOAD()  # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
  $arg = "#$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
my ($state) = ("x5");
my ($veclen_w,$veclen,$blocks) = ("w6","x6","x7");
my ($saved_outp) = ("x8");
my ($wctr, $xctr) = ("w9", "x9");
my @mx=map("z$_",(0..7,16..23));
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
my @xt=map("z$_",(24..31,8..11));
my ($rot8) = ("z12");
my ($zctr) = ("z13");
my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7,$xt8,$xt9,$xt10,$xt11)=@xt;
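
# Vector register layout (as used below):
#   @mx[0..15]  - the 16 ChaCha state words; each Z register holds one state
#                 word replicated across all 32-bit lanes, so every lane works
#                 on an independent 64-byte block in parallel.
#   @xt[0..11]  - scratch vectors, also used to stage input/output data.
#   $rot8 (z12) - TBL byte-permutation index implementing rotate-left-by-8.
#   $zctr (z13) - per-lane block counter (lane n = initial counter + n).
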
my $debug_encoder=0;

sub SVE_ADD() {
    my $x = shift;
    my $y = shift;
    $code.=<<___;
add @mx[$x].s,@mx[$x].s,@mx[$y].s
___
    if (@_) {
        &SVE_ADD(@_);
    }
}

sub SVE_EOR() {
    my $x = shift;
    my $y = shift;
    $code.=<<___;
eor @mx[$x].d,@mx[$x].d,@mx[$y].d
___
    if (@_) {
        &SVE_EOR(@_);
    }
}

sub SVE_LSL() {
    my $bits = shift;
    my $x = shift;
    my $y = shift;
    my $next = $x + 1;
    $code.=<<___;
lsl @xt[$x].s,@mx[$y].s,$bits
___
    if (@_) {
        &SVE_LSL($bits,$next,@_);
    }
}

sub SVE_LSR() {
    my $bits = shift;
    my $x = shift;
    $code.=<<___;
lsr @mx[$x].s,@mx[$x].s,$bits
___
    if (@_) {
        &SVE_LSR($bits,@_);
    }
}

sub SVE_ORR() {
    my $x = shift;
    my $y = shift;
    my $next = $x + 1;
    $code.=<<___;
orr @mx[$y].d,@mx[$y].d,@xt[$x].d
___
    if (@_) {
        &SVE_ORR($next,@_);
    }
}

sub SVE_REV16() {
    my $x = shift;
    $code.=<<___;
revh @mx[$x].s,p0/m,@mx[$x].s
___
    if (@_) {
        &SVE_REV16(@_);
    }
}

sub SVE_ROT8() {
    my $x = shift;
    $code.=<<___;
tbl @mx[$x].b,{@mx[$x].b},$rot8.b
___
    if (@_) {
        &SVE_ROT8(@_);
    }
}

sub SVE2_XAR() {
    my $bits = shift;
    my $x = shift;
    my $y = shift;
    my $rbits = 32-$bits;
    $code.=<<___;
xar @mx[$x].s,@mx[$x].s,@mx[$y].s,$rbits
___
    if (@_) {
        &SVE2_XAR($bits,@_);
    }
}
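
# SVE_QR_GROUP generates four ChaCha quarter-rounds, interleaved across the
# (a,b,c,d) column/diagonal tuples passed in.  For reference, one quarter-round
# on 32-bit words is:
#   a += b; d ^= a; d = rol(d,16);
#   c += d; b ^= c; b = rol(b,12);
#   a += b; d ^= a; d = rol(d, 8);
#   c += d; b ^= c; b = rol(b, 7);
# rol 16 is done with REVH, rol 8 with a TBL byte shuffle, and rol 12/7 with
# LSL/LSR/ORR on plain SVE or a single XAR on SVE2 (currently disabled).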
sub SVE_QR_GROUP() {
    my $have_sve2 = shift;
    my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,
        $a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;

    &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
    &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
    &SVE_REV16($d0,$d1,$d2,$d3);

    &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
    if ($have_sve2 == 0) {
        &SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
        &SVE_LSL(12,0,$b0,$b1,$b2,$b3);
        &SVE_LSR(20,$b0,$b1,$b2,$b3);
        &SVE_ORR(0,$b0,$b1,$b2,$b3);
    } else {
        &SVE2_XAR(12,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
    }

    &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
    &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
    &SVE_ROT8($d0,$d1,$d2,$d3);

    &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
    if ($have_sve2 == 0) {
        &SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
        &SVE_LSL(7,0,$b0,$b1,$b2,$b3);
        &SVE_LSR(25,$b0,$b1,$b2,$b3);
        &SVE_ORR(0,$b0,$b1,$b2,$b3);
    } else {
        &SVE2_XAR(7,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
    }
}
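
# One SVE_INNER_BLOCK is one ChaCha double round: a column round followed by
# a diagonal round over all 16 state vectors.  The SVE2 (XAR) variant is kept
# under "#if 0" until it can be validated on real hardware.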
sub SVE_INNER_BLOCK() {
    $code.=<<___;
//cbnz $sve2flag, 10f
___
    &SVE_QR_GROUP(0,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
    &SVE_QR_GROUP(0,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
    $code.=<<___;
// SVE2 not enabled until hardware is available
#if 0
b 11f
10:
___
    # &SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
    # &SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
    $code.=<<___;
11:
#endif
___
}

{{{
my ($dlen,$rsize,$tmp) = ("x10","x11","x12");
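
# load() reads eight consecutive vectors of keystream input into the given
# Z registers and advances $inp by eight vector lengths; store() is the
# mirror image for $outp.  Two load()/store() pairs per SVE_TRANSFORMS call
# cover one batch of $veclen blocks.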
sub load() {
    my $x0 = shift;
    my $x1 = shift;
    my $x2 = shift;
    my $x3 = shift;
    my $x4 = shift;
    my $x5 = shift;
    my $x6 = shift;
    my $x7 = shift;

    $code.=<<___;
ld1w {$x0.s},p0/z,[$inp]
ld1w {$x1.s},p0/z,[$inp, #1, MUL VL]
ld1w {$x2.s},p0/z,[$inp, #2, MUL VL]
ld1w {$x3.s},p0/z,[$inp, #3, MUL VL]
ld1w {$x4.s},p0/z,[$inp, #4, MUL VL]
ld1w {$x5.s},p0/z,[$inp, #5, MUL VL]
ld1w {$x6.s},p0/z,[$inp, #6, MUL VL]
ld1w {$x7.s},p0/z,[$inp, #7, MUL VL]
addvl $inp,$inp,#8
___
}

sub store() {
    my $x0 = shift;
    my $x1 = shift;
    my $x2 = shift;
    my $x3 = shift;
    my $x4 = shift;
    my $x5 = shift;
    my $x6 = shift;
    my $x7 = shift;

    $code.=<<___;
st1w {$x0.s},p0,[$outp]
st1w {$x1.s},p0,[$outp, #1, MUL VL]
st1w {$x2.s},p0,[$outp, #2, MUL VL]
st1w {$x3.s},p0,[$outp, #3, MUL VL]
st1w {$x4.s},p0,[$outp, #4, MUL VL]
st1w {$x5.s},p0,[$outp, #5, MUL VL]
st1w {$x6.s},p0,[$outp, #6, MUL VL]
st1w {$x7.s},p0,[$outp, #7, MUL VL]
addvl $outp,$outp,#8
___
}
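
# transpose() interleaves the four input vectors with zip1/zip2 at .s and
# then .d granularity.  Combined with a second transpose pass in
# SVE_TRANSFORMS, this reorders the "one state word per vector" layout into
# the order in which the output blocks are stored.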
sub transpose() {
    my $xa = shift;
    my $xb = shift;
    my $xc = shift;
    my $xd = shift;

    $code.=<<___;
zip1 $xt8.s,$xa.s,$xb.s
zip2 $xt9.s,$xa.s,$xb.s
zip1 $xt10.s,$xc.s,$xd.s
zip2 $xt11.s,$xc.s,$xd.s
zip1 $xa.d,$xt8.d,$xt10.d
zip2 $xb.d,$xt8.d,$xt10.d
zip1 $xc.d,$xt9.d,$xt11.d
zip2 $xd.d,$xt9.d,$xt11.d
___
}
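
# add_states() re-adds the original input state to the working state (the
# ChaCha feed-forward).  Words 0..11 and 13..15 are broadcast from the copy
# saved at $state; word 12 is added from $zctr so every lane gets its own
# block counter.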
sub add_states() {
    my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");

    $code.=<<___;
ldp $tmpw0,$tmpw1,[$state]
ldp $tmpw2,$tmpw3,[$state,#8]
dup $xt0.s,$tmpw0
dup $xt1.s,$tmpw1
dup $xt2.s,$tmpw2
dup $xt3.s,$tmpw3
ldp $tmpw0,$tmpw1,[$state,#16]
ldp $tmpw2,$tmpw3,[$state,#24]
add @mx[0].s,@mx[0].s,$xt0.s
add @mx[1].s,@mx[1].s,$xt1.s
add @mx[2].s,@mx[2].s,$xt2.s
add @mx[3].s,@mx[3].s,$xt3.s
dup $xt4.s,$tmpw0
dup $xt5.s,$tmpw1
dup $xt6.s,$tmpw2
dup $xt7.s,$tmpw3
ldp $tmpw0,$tmpw1,[$state,#32]
ldp $tmpw2,$tmpw3,[$state,#40]
add @mx[4].s,@mx[4].s,$xt4.s
add @mx[5].s,@mx[5].s,$xt5.s
add @mx[6].s,@mx[6].s,$xt6.s
add @mx[7].s,@mx[7].s,$xt7.s
dup $xt0.s,$tmpw0
dup $xt1.s,$tmpw1
dup $xt2.s,$tmpw2
dup $xt3.s,$tmpw3
ldp $tmpw0,$tmpw1,[$state,#48]
ldp $tmpw2,$tmpw3,[$state,#56]
add @mx[8].s,@mx[8].s,$xt0.s
add @mx[9].s,@mx[9].s,$xt1.s
add @mx[10].s,@mx[10].s,$xt2.s
add @mx[11].s,@mx[11].s,$xt3.s
dup $xt5.s,$tmpw1
dup $xt6.s,$tmpw2
dup $xt7.s,$tmpw3
add @mx[12].s,@mx[12].s,$zctr.s
add @mx[13].s,@mx[13].s,$xt5.s
add @mx[14].s,@mx[14].s,$xt6.s
add @mx[15].s,@mx[15].s,$xt7.s
___
}
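
# SVE_TRANSFORMS() finalizes one batch of $veclen blocks: feed-forward add,
# transpose the state vectors into output order, XOR them with the input
# stream fetched via load(), store the result, and bump both the scalar
# ($xctr) and vector ($zctr) block counters by the number of 32-bit lanes.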
sub SVE_TRANSFORMS() {
    &add_states();
    &transpose($xa0,$xb0,$xc0,$xd0);
    &transpose($xa1,$xb1,$xc1,$xd1);
    &transpose($xa2,$xb2,$xc2,$xd2);
    &transpose($xa3,$xb3,$xc3,$xd3);
    &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
    &transpose($xa0,$xa1,$xa2,$xa3);
    &transpose($xb0,$xb1,$xb2,$xb3);
    $code.=<<___;
eor $xa0.d,$xa0.d,$xt0.d
eor $xa1.d,$xa1.d,$xt1.d
eor $xa2.d,$xa2.d,$xt2.d
eor $xa3.d,$xa3.d,$xt3.d
eor $xb0.d,$xb0.d,$xt4.d
eor $xb1.d,$xb1.d,$xt5.d
eor $xb2.d,$xb2.d,$xt6.d
eor $xb3.d,$xb3.d,$xt7.d
___
    &transpose($xc0,$xc1,$xc2,$xc3);
    &store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
    &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
    &transpose($xd0,$xd1,$xd2,$xd3);
    $code.=<<___;
eor $xc0.d,$xc0.d,$xt0.d
eor $xc1.d,$xc1.d,$xt1.d
eor $xc2.d,$xc2.d,$xt2.d
eor $xc3.d,$xc3.d,$xt3.d
eor $xd0.d,$xd0.d,$xt4.d
eor $xd1.d,$xd1.d,$xt5.d
eor $xd2.d,$xd2.d,$xt6.d
eor $xd3.d,$xd3.d,$xt7.d
___
    &store($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
    $code.=<<___;
incw $xctr, ALL, MUL #1
incw $zctr.s, ALL, MUL #1
___
}
}}}
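
# SVE_LOAD_STATES() (re)initializes the working state for the next batch of
# blocks: every state word is broadcast (dup) from the 64-byte copy at
# $state, except word 12, which is taken from the lane-indexed counter $zctr.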
sub SVE_LOAD_STATES() {
    my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");

    $code.=<<___;
// FIXME: the following code is not functionally necessary,
// but it appears to enhance performance
#if 1
ptrues p2.s,ALL
ptrues p2.s,ALL
ptrues p2.s,ALL
ptrues p2.s,ALL
ptrues p2.s,ALL
ptrues p2.s,ALL
#endif
___

    $code.=<<___;
ldp $tmpw0,$tmpw1,[$state]
ldp $tmpw2,$tmpw3,[$state,#8]
dup @mx[0].s,$tmpw0
dup @mx[1].s,$tmpw1
dup @mx[2].s,$tmpw2
dup @mx[3].s,$tmpw3
ldp $tmpw0,$tmpw1,[$state,#16]
ldp $tmpw2,$tmpw3,[$state,#24]
dup @mx[4].s,$tmpw0
dup @mx[5].s,$tmpw1
dup @mx[6].s,$tmpw2
dup @mx[7].s,$tmpw3
ldp $tmpw0,$tmpw1,[$state,#32]
ldp $tmpw2,$tmpw3,[$state,#40]
dup @mx[8].s,$tmpw0
dup @mx[9].s,$tmpw1
dup @mx[10].s,$tmpw2
dup @mx[11].s,$tmpw3
ldp $tmpw0,$tmpw1,[$state,#48]
ldp $tmpw2,$tmpw3,[$state,#56]
mov @mx[12].s,p0/m,$zctr.s
dup @mx[13].s,$tmpw1
dup @mx[14].s,$tmpw2
dup @mx[15].s,$tmpw3
___
}
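
# sve_handle_blocks() runs the core: load the state, iterate the double round
# 10 times (i.e. the full 20 ChaCha20 rounds), then apply SVE_TRANSFORMS to
# produce $veclen blocks of output.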
sub sve_handle_blocks() {
    my ($counter) = ("x10");

    &SVE_LOAD_STATES();
    $code.=<<___;
mov $counter,#10
.align 5
1:
___
    &SVE_INNER_BLOCK();
    $code.=<<___;
subs $counter,$counter,1
b.ne 1b
___
    &SVE_TRANSFORMS();
}
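
# Main processing loop: as long as at least $veclen whole 64-byte blocks
# remain, generate and consume them one batch at a time; anything smaller
# falls through to .Lexit and is left unprocessed here.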
sub chacha20_process() {
    my ($counter) = ("x10");
    my ($tmpw) = ("w11");

    $code.=<<___;
.align 5
.Loop:
cmp $blocks,$veclen
b.lt .Lexit
___
    &sve_handle_blocks();
    $code.=<<___;
subs $blocks,$blocks,$veclen
b.gt .Loop
.Lexit:
___
}
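
# ChaCha20_ctr32_sve(out, in, len, key, counter):
# probe the SVE vector length (whilelo/cntp), bail out early if we only have
# 128-bit SVE or fewer than $veclen whole blocks, otherwise build the 64-byte
# state block on the stack (constants, key, counter/nonce), seed $zctr and
# $rot8, run the main loop, then write the advanced counter back and fold the
# number of unprocessed bytes back into $len for the caller.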
{{{
my ($tmp,$tmpw) = ("x10", "w10");
my ($tmpw0,$tmpw1) = ("w11", "w12");
my ($ptr) = ("x13");

$code.=<<___;
#include "arm_arch.h"
.arch armv8-a
#if 0
.extern OPENSSL_armcap_P
.hidden OPENSSL_armcap_P
#endif
.text
.align 5
.Lchacha20_consts:
.word 0x61707865
.word 0x3320646e
.word 0x79622d32
.word 0x6b206574
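// .Lrot8 seeds an "index" instruction below: starting value 0x02010003 with
// per-lane step 0x04040404 yields byte indices {3,0,1,2}+4n, i.e. a TBL
// permutation that rotates each 32-bit element left by 8.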
.Lrot8:
.word 0x02010003,0x04040404,0x02010003,0x04040404
.globl ChaCha20_ctr32_sve
.type ChaCha20_ctr32_sve,%function
.align 5
ChaCha20_ctr32_sve:
AARCH64_VALID_CALL_TARGET
mov $tmp, #64
whilelo p0.s,xzr,$tmp
cntp $veclen,p0,p0.s
// run Neon if we only have 128-bit SVE
// in the future, we need to check SVE2
cmp $veclen,4
b.le .Lreturn
lsr $blocks,$len,#6
cmp $blocks,$veclen
b.lt .Lreturn
stp d8,d9,[sp,-48]!
stp d10,d11,[sp,16]
stp d12,d13,[sp,32]
sub sp,sp,#64
adr $tmp,.Lchacha20_consts
ld1 {v0.4s},[$tmp]
adr $tmp,.Lrot8
ldp $tmpw0,$tmpw1,[$tmp]
ld1 {v1.4s,v2.4s},[$key]
ld1 {v3.4s},[$ctr]
ldr $wctr,[$ctr]
index $zctr.s,$wctr,1
index $rot8.s,$tmpw0,$tmpw1
st1 {v0.4s,v1.4s,v2.4s,v3.4s},[sp]
mov $state,sp
#if 0
// SVE2 code not enabled until we have hardware
// for verification
mov $sve2flag,0
adrp $tmp,OPENSSL_armcap_P
ldr $tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
tst $tmpw,#ARMV8_SVE2
b.eq 1f
mov $sve2flag,1
1:
#endif
___
&chacha20_process();
$code.=<<___;
add sp,sp,#64
ldp d10,d11,[sp,16]
ldp d12,d13,[sp,32]
ldp d8,d9,[sp],48
str $wctr,[$ctr]
and $len,$len,#63
add $len,$len,$blocks,lsl #6
.Lreturn:
ret
.size ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve
___
}}}
########################################
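# The rest of the file post-processes $code: every SVE/SVE2 instruction
# emitted above is converted into a raw ".inst" word from the opcode tables
# below (so the generated file does not require an SVE-aware assembler).
# The base opcodes are OR'ed with register numbers, element-size and
# predicate fields extracted from the textual operands.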
{
my %opcode_unpred = (
    "eor" => 0x04a03000,
    "add" => 0x04200000,
    "orr" => 0x04603000,
    "lsl" => 0x04209C00,
    "lsr" => 0x04209400,
    "incw" => 0x04B0C000,
    "xar" => 0x04203400,
    "zip1" => 0x05206000,
    "zip2" => 0x05206400,
    "uzp1" => 0x05206800,
    "uzp2" => 0x05206C00,
    "index" => 0x04204C00,
    "mov" => 0x05203800,
    "dup" => 0x05203800,
    "tbl" => 0x05203000);

my %opcode_imm_unpred = (
    "dup" => 0x2538C000,
    "index" => 0x04204400);

my %opcode_scalar_pred = (
    "mov" => 0x0528A000,
    "cpy" => 0x0528A000,
    "st4w" => 0xE5606000,
    "st1w" => 0xE5004000,
    "ld1w" => 0xA5404000);

my %opcode_gather_pred = (
    "ld1w" => 0x85204000);

my %opcode_pred = (
    "eor" => 0x04190000,
    "add" => 0x04000000,
    "orr" => 0x04180000,
    "whilelo" => 0x25200C00,
    "whilelt" => 0x25200400,
    "cntp" => 0x25208000,
    "addvl" => 0x04205000,
    "lsl" => 0x04038000,
    "lsr" => 0x04018000,
    "sel" => 0x0520C000,
    "mov" => 0x0520C000,
    "ptrue" => 0x2518E000,
    "pfalse" => 0x2518E400,
    "ptrues" => 0x2519E000,
    "pnext" => 0x2519C400,
    "ld4w" => 0xA560E000,
    "st4w" => 0xE570E000,
    "st1w" => 0xE500E000,
    "ld1w" => 0xA540A000,
    "revh" => 0x05258000);

my %tsize = (
    'b' => 0,
    'h' => 1,
    's' => 2,
    'd' => 3);

my %sf = (
    "w" => 0,
    "x" => 1);

my %pattern = (
    "POW2" => 0,
    "VL1" => 1,
    "VL2" => 2,
    "VL3" => 3,
    "VL4" => 4,
    "VL5" => 5,
    "VL6" => 6,
    "VL7" => 7,
    "VL8" => 8,
    "VL16" => 9,
    "VL32" => 10,
    "VL64" => 11,
    "VL128" => 12,
    "VL256" => 13,
    "MUL4" => 29,
    "MUL3" => 30,
    "ALL" => 31);
sub create_verifier {
    my $filename="./compile_sve.sh";

$scripts = <<___;
#! /bin/bash
set -e
CROSS_COMPILE=\${CROSS_COMPILE:-'aarch64-none-linux-gnu-'}

[ -z "\$1" ] && exit 1
ARCH=`uname -p | xargs echo -n`

# need gcc-10 and above to compile SVE code
# change this according to your system during debugging
if [ \$ARCH == 'aarch64' ]; then
CC=gcc-11
OBJDUMP=objdump
else
CC=\${CROSS_COMPILE}gcc
OBJDUMP=\${CROSS_COMPILE}objdump
fi
TMPFILE=/tmp/\$\$
cat > \$TMPFILE.c << EOF
extern __attribute__((noinline, section("disasm_output"))) void dummy_func()
{
asm("\$@\\t\\n");
}
int main(int argc, char *argv[])
{
}
EOF
\$CC -march=armv8.2-a+sve+sve2 -o \$TMPFILE.out \$TMPFILE.c
\$OBJDUMP -d \$TMPFILE.out | awk -F"\\n" -v RS="\\n\\n" '\$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",\$2}'
rm \$TMPFILE.c \$TMPFILE.out
___

    open(FH, '>', $filename) or die $!;
    print FH $scripts;
    close(FH);
    system("chmod a+x ./compile_sve.sh");
}
sub compile_sve {
    return `./compile_sve.sh '@_'`
}

sub verify_inst {
    my ($code,$inst)=@_;
    my $hexcode = (sprintf "%08x", $code);

    if ($debug_encoder == 1) {
        my $expect=&compile_sve($inst);
        if ($expect ne $hexcode) {
            return (sprintf "%s // Encode Error! expect [%s] actual [%s]", $inst, $expect, $hexcode);
        }
    }
    return (sprintf ".inst\t0x%s\t//%s", $hexcode, $inst);
}

sub reg_code {
    my $code = shift;

    if ($code eq "zr") {
        return "31";
    }
    return $code;
}
sub encode_size_imm() {
    my ($mnemonic, $isize, $const)=@_;
    my $esize = (8<<$tsize{$isize});
    my $tsize_imm = $esize + $const;

    if ($mnemonic eq "lsr" || $mnemonic eq "xar") {
        $tsize_imm = 2*$esize - $const;
    }
    return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<16);
}

sub encode_shift_pred() {
    my ($mnemonic, $isize, $const)=@_;
    my $esize = (8<<$tsize{$isize});
    my $tsize_imm = $esize + $const;

    if ($mnemonic eq "lsr") {
        $tsize_imm = 2*$esize - $const;
    }
    return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<5);
}
sub sve_unpred {
    my ($mnemonic,$arg)=@_;
    my $inst = (sprintf "%s %s", $mnemonic,$arg);

    if ($arg =~ m/z([0-9]+)\.([bhsd]),\s*\{\s*z([0-9]+)\.[bhsd].*\},\s*z([0-9]+)\.[bhsd].*/o) {
        return &verify_inst($opcode_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22)|($4<<16),
                            $inst)
    } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*([zwx][0-9]+.*)/o) {
        my $regd = $1;
        my $isize = $2;
        my $regs=$3;

        if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
            if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
                && ((8<<$tsize{$isize}) > $2)) {
                return &verify_inst($opcode_unpred{$mnemonic}|$regd|($1<<5)|&encode_size_imm($mnemonic,$isize,$2),
                                    $inst);
            }
        } elsif($regs =~ m/[wx]([0-9]+),\s*[wx]([0-9]+)/o) {
            return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
        } elsif ($regs =~ m/[wx]([0-9]+),\s*#?([0-9]+)/o) {
            return &verify_inst($opcode_imm_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
        } elsif ($regs =~ m/[wx]([0-9]+)/o) {
            return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5), $inst);
        } else {
            my $encoded_size = 0;

            if (($mnemonic eq "add") || ($mnemonic =~ /zip./) || ($mnemonic =~ /uzp./) ) {
                $encoded_size = ($tsize{$isize}<<22);
            }
            if ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd],\s*([0-9]+)/o &&
                $1 == $regd) {
                return &verify_inst($opcode_unpred{$mnemonic}|$regd|($2<<5)|&encode_size_imm($mnemonic,$isize,$3), $inst);
            } elsif ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd]/o) {
                return &verify_inst($opcode_unpred{$mnemonic}|$regd|$encoded_size|($1<<5)|($2<<16), $inst);
            }
        }
    } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*#?([0-9]+)/o) {
        return &verify_inst($opcode_imm_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22),
                            $inst)
    }
    sprintf "%s // fail to parse", $inst;
}
sub sve_pred {
    my ($mnemonic,$arg)=@_;
    my $inst = (sprintf "%s %s", $mnemonic,$arg);

    if ($arg =~ m/\{\s*z([0-9]+)\.([bhsd]).*\},\s*p([0-9]+)(\/z)?,\s*\[(\s*[xs].*)\]/o) {
        my $zt = $1;
        my $size = $tsize{$2};
        my $pg = $3;
        my $addr = $5;
        my $xn = 31;

        if ($addr =~ m/x([0-9]+)\s*/o) {
            $xn = $1;
        }

        if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
            return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
        } elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
            my $xs = ($2 eq "SXTW") ? 1 : 0;
            return &verify_inst($opcode_gather_pred{$mnemonic}|($xs<<22)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
        } elsif($addr =~ m/\w+\s*,\s*#?([0-9]+)/o) {
            return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
        } else {
            return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($xn<<5),$inst);
        }
    } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*p([0-9]+)\/([mz]),\s*([zwx][0-9]+.*)/o) {
        my $regd = $1;
        my $isize = $2;
        my $pg = $3;
        my $mod = $4;
        my $regs = $5;

        if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
            if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
                && $regd == $1
                && $mod eq 'm'
                && ((8<<$tsize{$isize}) > $2)) {
                return &verify_inst($opcode_pred{$mnemonic}|$regd|($pg<<10)|&encode_shift_pred($mnemonic,$isize,$2), $inst);
            }
        } elsif($regs =~ m/[wx]([0-9]+)/o) {
            return &verify_inst($opcode_scalar_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
        } elsif ($regs =~ m/z([0-9]+)[^,]*(?:,\s*z([0-9]+))?/o) {
            if ($mnemonic eq "sel") {
                return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($2<<16), $inst);
            } elsif ($mnemonic eq "mov") {
                return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($regd<<16), $inst);
            } elsif (length $2 > 0) {
                return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($2<<5), $inst);
            } else {
                return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
            }
        }
    } elsif ($arg =~ m/p([0-9]+)\.([bhsd]),\s*(\w+.*)/o) {
        my $pg = $1;
        my $isize = $2;
        my $regs = $3;

        if ($regs =~ m/([wx])(zr|[0-9]+),\s*[wx](zr|[0-9]+)/o) {
            return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($sf{$1}<<12)|(&reg_code($2)<<5)|(&reg_code($3)<<16), $inst);
        } elsif ($regs =~ m/p([0-9]+),\s*p([0-9]+)\.[bhsd]/o) {
            return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($1<<5), $inst);
        } else {
            return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($pattern{$regs}<<5), $inst);
        }
    } elsif ($arg =~ m/p([0-9]+)\.([bhsd])/o) {
        return &verify_inst($opcode_pred{$mnemonic}|$1, $inst);
    }
    sprintf "%s // fail to parse", $inst;
}
sub sve_other {
    my ($mnemonic,$arg)=@_;
    my $inst = (sprintf "%s %s", $mnemonic,$arg);

    if ($arg =~ m/x([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*p([0-9]+)\.([bhsd])/o) {
        return &verify_inst($opcode_pred{$mnemonic}|($tsize{$4}<<22)|$1|($2<<10)|($3<<5), $inst);
    } elsif ($mnemonic =~ /inc[bhdw]/) {
        if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
            return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(2<<12)|(($3 - 1)<<16), $inst);
        } elsif ($arg =~ m/z([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
            return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
        } elsif ($arg =~ m/x([0-9]+)/o) {
            return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16), $inst);
        }
    } elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
        return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
    }
    sprintf "%s // fail to parse", $inst;
}
}

open SELF,$0;
while(<SELF>) {
    next if (/^#!/);
    last if (!s/^#/\/\// and !/^$/);
    print;
}
close SELF;
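
# Emit the final output: optionally set up the encoding verifier, then run
# every generated line through the regex dispatchers above so SVE mnemonics
# are replaced by their ".inst" encodings before being printed (and piped
# through arm-xlate.pl).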
if ($debug_encoder == 1) {
    &create_verifier();
}

foreach(split("\n",$code)) {
    s/\`([^\`]*)\`/eval($1)/ge;

    s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
    s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
    s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
    s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
    s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
    s/\b(cntp|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;

    print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";