#! /usr/bin/env perl
# Copyright 2022-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# ChaCha20 for ARMv8 via SVE
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
  $arg = "#$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
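
# Register map: x0-x4 carry the outp/inp/len/key/counter arguments; x5
# holds the SVE vector length in 32-bit words; x6 is a scratch loop
# counter; w7-w22 keep a scalar copy of the 16-word ChaCha state for the
# mixed scalar+vector path; x23-x30 cache the initial state as eight
# 64-bit pairs.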
my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
my ($veclen) = ("x5");
my ($counter) = ("x6");
my ($counter_w) = ("w6");
my @xx=(7..22);
my @sxx=map("x$_",@xx);
my @sx=map("w$_",@xx);
my @K=map("x$_",(23..30));
my @elem=(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
my @KL=map("w$_",(23..30));
my @mx=map("z$_",@elem);
my @vx=map("v$_",@elem);
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
my ($zctr) = ("z16");
my @tt=(17..24);
my @xt=map("z$_",@tt);
my @vt=map("v$_",@tt);
my @perm=map("z$_",(25..30));
my ($rot8) = ("z31");
my @bak=(@perm[0],@perm[1],@perm[2],@perm[3],@perm[4],@perm[5],@xt[4],@xt[5],@xt[6],@xt[7],@xt[0],@xt[1],$zctr,@xt[2],@xt[3],$rot8);
my $debug_encoder=0;
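
# The helpers below take a flat list of state indices and recurse, so a
# single call emits the same instruction for all four columns (or
# diagonals).  Under ".if mixin == 1" each vector op is paired with a
# scalar twin that runs one extra 64-byte block in w7-w22.
#
# @bak aliases registers that are idle during the rounds (@perm, @xt,
# $zctr, $rot8) and holds a copy of the initial state for the final
# accumulation; $zctr doubles as bak[12] (the counter lanes), and $rot8
# as bak[15], which the plain-SVE path cannot spare (see the
# "bak[15] not available for SVE" workaround in SVE_ACCUM_STATES).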
sub SVE_ADD() {
	my $x = shift;
	my $y = shift;

$code.=<<___;
	add	@mx[$x].s,@mx[$x].s,@mx[$y].s
.if mixin == 1
	add	@sx[$x],@sx[$x],@sx[$y]
.endif
___
	if (@_) {
		&SVE_ADD(@_);
	}
}

sub SVE_EOR() {
	my $x = shift;
	my $y = shift;

$code.=<<___;
	eor	@mx[$x].d,@mx[$x].d,@mx[$y].d
.if mixin == 1
	eor	@sx[$x],@sx[$x],@sx[$y]
.endif
___
	if (@_) {
		&SVE_EOR(@_);
	}
}

sub SVE_LSL() {
	my $bits = shift;
	my $x = shift;
	my $y = shift;
	my $next = $x + 1;

$code.=<<___;
	lsl	@xt[$x].s,@mx[$y].s,$bits
___
	if (@_) {
		&SVE_LSL($bits,$next,@_);
	}
}

sub SVE_LSR() {
	my $bits = shift;
	my $x = shift;

$code.=<<___;
	lsr	@mx[$x].s,@mx[$x].s,$bits
.if mixin == 1
	ror	@sx[$x],@sx[$x],$bits
.endif
___
	if (@_) {
		&SVE_LSR($bits,@_);
	}
}

sub SVE_ORR() {
	my $x = shift;
	my $y = shift;
	my $next = $x + 1;

$code.=<<___;
	orr	@mx[$y].d,@mx[$y].d,@xt[$x].d
___
	if (@_) {
		&SVE_ORR($next,@_);
	}
}

sub SVE_REV16() {
	my $x = shift;

$code.=<<___;
	revh	@mx[$x].s,p0/m,@mx[$x].s
.if mixin == 1
	ror	@sx[$x],@sx[$x],#16
.endif
___
	if (@_) {
		&SVE_REV16(@_);
	}
}

sub SVE_ROT8() {
	my $x = shift;

$code.=<<___;
	tbl	@mx[$x].b,{@mx[$x].b},$rot8.b
.if mixin == 1
	ror	@sx[$x],@sx[$x],#24
.endif
___
	if (@_) {
		&SVE_ROT8(@_);
	}
}
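
# SVE2 provides xar (XOR and rotate right), which fuses the eor+rotate
# pair of a quarter-round step into one instruction; a left-rotate by
# $bits becomes a right-rotate by 32-$bits.  The scalar twin still uses
# separate eor+ror instructions.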
sub SVE2_XAR() {
	my $bits = shift;
	my $x = shift;
	my $y = shift;
	my $rbits = 32-$bits;

$code.=<<___;
.if mixin == 1
	eor	@sx[$x],@sx[$x],@sx[$y]
.endif
	xar	@mx[$x].s,@mx[$x].s,@mx[$y].s,$rbits
.if mixin == 1
	ror	@sx[$x],@sx[$x],$rbits
.endif
___
	if (@_) {
		&SVE2_XAR($bits,@_);
	}
}
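
# One ChaCha20 quarter-round over four (a,b,c,d) quadruples at a time:
#	a += b;  d ^= a;  d <<<= 16;
#	c += d;  b ^= c;  b <<<= 12;
#	a += b;  d ^= a;  d <<<=  8;
#	c += d;  b ^= c;  b <<<=  7;
# SVE2 uses xar for every rotate; plain SVE synthesizes the rotates with
# revh (16), lsl+lsr+orr (12 and 7) and a tbl byte shuffle (8).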
sub SVE2_QR_GROUP() {
	my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE2_XAR(16,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE2_XAR(12,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE2_XAR(8,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE2_XAR(7,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
}

sub SVE_QR_GROUP() {
	my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
	&SVE_REV16($d0,$d1,$d2,$d3);
	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
	&SVE_LSL(12,0,$b0,$b1,$b2,$b3);
	&SVE_LSR(20,$b0,$b1,$b2,$b3);
	&SVE_ORR(0,$b0,$b1,$b2,$b3);
	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
	&SVE_ROT8($d0,$d1,$d2,$d3);
	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
	&SVE_LSL(7,0,$b0,$b1,$b2,$b3);
	&SVE_LSR(25,$b0,$b1,$b2,$b3);
	&SVE_ORR(0,$b0,$b1,$b2,$b3);
}
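
# The inner block is the standard 20 ChaCha rounds: ten iterations of a
# column round followed by a diagonal round.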
sub SVE_INNER_BLOCK() {
$code.=<<___;
	mov	$counter,#10
10:
.align	5
___
	&SVE_QR_GROUP(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
	&SVE_QR_GROUP(0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
	sub	$counter,$counter,1
	cbnz	$counter,10b
___
}

sub SVE2_INNER_BLOCK() {
$code.=<<___;
	mov	$counter,#10
10:
.align	5
___
	&SVE2_QR_GROUP(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
	&SVE2_QR_GROUP(0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
	sub	$counter,$counter,1
	cbnz	$counter,10b
___
}
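
# Bulk vector load/store: ld1w/st1w with "#n, MUL VL" addressing access
# the n-th vector-length-sized chunk from the base pointer, and addvl
# then advances the pointer by the number of vectors processed.  On
# big-endian builds each 32-bit word is byte-swapped with revb.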
sub load_regs() {
	my $offset = shift;
	my $reg = shift;
	my $next_offset = $offset + 1;

$code.=<<___;
	ld1w	{$reg.s},p0/z,[$inp,#$offset,MUL VL]
#ifdef	__AARCH64EB__
	revb	$reg.s,p0/m,$reg.s
#endif
___
	if (@_) {
		&load_regs($next_offset, @_);
	} else {
$code.=<<___;
	addvl	$inp,$inp,$next_offset
___
	}
}

sub load() {
	if (@_) {
		&load_regs(0, @_);
	}
}

sub store_regs() {
	my $offset = shift;
	my $reg = shift;
	my $next_offset = $offset + 1;

$code.=<<___;
#ifdef	__AARCH64EB__
	revb	$reg.s,p0/m,$reg.s
#endif
	st1w	{$reg.s},p0,[$outp,#$offset,MUL VL]
___
	if (@_) {
		&store_regs($next_offset, @_);
	} else {
$code.=<<___;
	addvl	$outp,$outp,$next_offset
___
	}
}

sub store() {
	if (@_) {
		&store_regs(0, @_);
	}
}
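
# In-register transpose of eight vectors (two groups of four): zip1/zip2
# at .s then .d granularity rearrange lanes so that, within each 128-bit
# segment, each output vector holds four consecutive words of one
# keystream block.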
sub transpose() {
	my $xa = shift;
	my $xb = shift;
	my $xc = shift;
	my $xd = shift;
	my $xa1 = shift;
	my $xb1 = shift;
	my $xc1 = shift;
	my $xd1 = shift;

$code.=<<___;
	zip1	@xt[0].s,$xa.s,$xb.s
	zip2	@xt[1].s,$xa.s,$xb.s
	zip1	@xt[2].s,$xc.s,$xd.s
	zip2	@xt[3].s,$xc.s,$xd.s
	zip1	@xt[4].s,$xa1.s,$xb1.s
	zip2	@xt[5].s,$xa1.s,$xb1.s
	zip1	@xt[6].s,$xc1.s,$xd1.s
	zip2	@xt[7].s,$xc1.s,$xd1.s
	zip1	$xa.d,@xt[0].d,@xt[2].d
	zip2	$xb.d,@xt[0].d,@xt[2].d
	zip1	$xc.d,@xt[1].d,@xt[3].d
	zip2	$xd.d,@xt[1].d,@xt[3].d
	zip1	$xa1.d,@xt[4].d,@xt[6].d
	zip2	$xb1.d,@xt[4].d,@xt[6].d
	zip1	$xc1.d,@xt[5].d,@xt[7].d
	zip2	$xd1.d,@xt[5].d,@xt[7].d
___
}
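
# ACCUM adds the saved initial state (normally @bak, overridable via the
# optional arguments) back into the permuted state, the final step of
# the ChaCha block function.  On the scalar side the odd word gets the
# high half of the 64-bit @K register, and the even/odd pair is packed
# into one 64-bit register so SCA_SAVE can store it with a single stp.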
sub ACCUM() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
	my $x0 = @sx[$idx0];
	my $xx0 = @sxx[$idx0];
	my $x1 = @sx[$idx1];
	my $xx1 = @sxx[$idx1];
	my $d = $idx0/2;
	my ($tmp,$tmpw) = ($counter,$counter_w);
	my $bk0 = @_ ? shift : @bak[$idx0];
	my $bk1 = @_ ? shift : @bak[$idx1];

$code.=<<___;
.if mixin == 1
	add	@sx[$idx0],@sx[$idx0],@KL[$d]
.endif
	add	@mx[$idx0].s,@mx[$idx0].s,$bk0.s
.if mixin == 1
	add	@sxx[$idx1],@sxx[$idx1],@K[$d],lsr #32
.endif
	add	@mx[$idx1].s,@mx[$idx1].s,$bk1.s
.if mixin == 1
	add	@sxx[$idx0],@sxx[$idx0],$sxx[$idx1],lsl #32	// pack
.endif
___
}
sub SCA_INP() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 2;

$code.=<<___;
.if mixin == 1
	ldp	@sxx[$idx0],@sxx[$idx1],[$inp],#16
.endif
___
}

sub SVE_ACCUM_STATES() {
	my ($tmp,$tmpw) = ($counter,$counter_w);

$code.=<<___;
	lsr	$tmp,@K[5],#32
	dup	@bak[10].s,@KL[5]
	dup	@bak[11].s,$tmpw
	lsr	$tmp,@K[6],#32
	dup	@bak[13].s,$tmpw
	lsr	$tmp,@K[7],#32
___
	&ACCUM(0);
	&ACCUM(2);
	&SCA_INP(1);
	&ACCUM(4);
	&ACCUM(6);
	&SCA_INP(5);
	&ACCUM(8);
	&ACCUM(10);
	&SCA_INP(9);
$code.=<<___;
	dup	@bak[14].s,@KL[7]
	dup	@bak[0].s,$tmpw	// bak[15] not available for SVE
___
	&ACCUM(12);
	&ACCUM(14, @bak[14],@bak[0]);
	&SCA_INP(13);
}

sub SVE2_ACCUM_STATES() {
	&ACCUM(0);
	&ACCUM(2);
	&SCA_INP(1);
	&ACCUM(4);
	&ACCUM(6);
	&SCA_INP(5);
	&ACCUM(8);
	&ACCUM(10);
	&SCA_INP(9);
	&ACCUM(12);
	&ACCUM(14);
	&SCA_INP(13);
}
sub SCA_EOR() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;

$code.=<<___;
.if mixin == 1
	eor	@sxx[$idx0],@sxx[$idx0],@sxx[$idx1]
.endif
___
}

sub SCA_SAVE() {
	my $idx0 = shift;
	my $idx1 = shift;

$code.=<<___;
.if mixin == 1
	stp	@sxx[$idx0],@sxx[$idx1],[$outp],#16
.endif
___
}

sub SVE_VL128_TRANSFORMS() {
	&SCA_EOR(0);
	&SCA_EOR(2);
	&SCA_EOR(4);
	&transpose($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
	&SCA_EOR(6);
	&SCA_EOR(8);
	&SCA_EOR(10);
	&transpose($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
	&SCA_EOR(12);
	&SCA_EOR(14);
$code.=<<___;
	ld1	{@vt[0].4s-@vt[3].4s},[$inp],#64
	ld1	{@vt[4].4s-@vt[7].4s},[$inp],#64
	eor	$xa0.d,$xa0.d,@xt[0].d
	eor	$xb0.d,$xb0.d,@xt[1].d
	eor	$xc0.d,$xc0.d,@xt[2].d
	eor	$xd0.d,$xd0.d,@xt[3].d
	eor	$xa1.d,$xa1.d,@xt[4].d
	eor	$xb1.d,$xb1.d,@xt[5].d
	eor	$xc1.d,$xc1.d,@xt[6].d
	eor	$xd1.d,$xd1.d,@xt[7].d
	ld1	{@vt[0].4s-@vt[3].4s},[$inp],#64
	ld1	{@vt[4].4s-@vt[7].4s},[$inp],#64
___
	&SCA_SAVE(0,2);
$code.=<<___;
	eor	$xa2.d,$xa2.d,@xt[0].d
	eor	$xb2.d,$xb2.d,@xt[1].d
___
	&SCA_SAVE(4,6);
$code.=<<___;
	eor	$xc2.d,$xc2.d,@xt[2].d
	eor	$xd2.d,$xd2.d,@xt[3].d
___
	&SCA_SAVE(8,10);
$code.=<<___;
	eor	$xa3.d,$xa3.d,@xt[4].d
	eor	$xb3.d,$xb3.d,@xt[5].d
___
	&SCA_SAVE(12,14);
$code.=<<___;
	eor	$xc3.d,$xc3.d,@xt[6].d
	eor	$xd3.d,$xd3.d,@xt[7].d
	st1	{@vx[0].4s-@vx[12].4s},[$outp],#64
	st1	{@vx[1].4s-@vx[13].4s},[$outp],#64
	st1	{@vx[2].4s-@vx[14].4s},[$outp],#64
	st1	{@vx[3].4s-@vx[15].4s},[$outp],#64
___
}
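
# SVE_TRANSFORMS XORs the keystream into the input and stores the
# result.  A 128-bit vector length takes the NEON-style VL128 path
# above; otherwise the generic path transposes the state and streams it
# through ld1w/st1w.  At 210 the counter lanes in @K[6] advance by one
# per 32-bit lane (incw), on top of the extra +1 applied earlier when a
# scalar block was mixed in.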
sub SVE_TRANSFORMS() {
$code.=<<___;
#ifdef	__AARCH64EB__
	rev	@sxx[0],@sxx[0]
	rev	@sxx[2],@sxx[2]
	rev	@sxx[4],@sxx[4]
	rev	@sxx[6],@sxx[6]
	rev	@sxx[8],@sxx[8]
	rev	@sxx[10],@sxx[10]
	rev	@sxx[12],@sxx[12]
	rev	@sxx[14],@sxx[14]
#endif
.if mixin == 1
	add	@K[6],@K[6],#1
.endif
	cmp	$veclen,4
	b.ne	200f
___
	&SVE_VL128_TRANSFORMS();
$code.=<<___;
	b	210f
200:
___
	&transpose($xa0,$xb0,$xc0,$xd0,$xa1,$xb1,$xc1,$xd1);
	&SCA_EOR(0);
	&SCA_EOR(2);
	&transpose($xa2,$xb2,$xc2,$xd2,$xa3,$xb3,$xc3,$xd3);
	&SCA_EOR(4);
	&SCA_EOR(6);
	&transpose($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
	&SCA_EOR(8);
	&SCA_EOR(10);
	&transpose($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
	&SCA_EOR(12);
	&SCA_EOR(14);
	&load(@xt[0],@xt[1],@xt[2],@xt[3],@xt[4],@xt[5],@xt[6],@xt[7]);
$code.=<<___;
	eor	$xa0.d,$xa0.d,@xt[0].d
	eor	$xa1.d,$xa1.d,@xt[1].d
	eor	$xa2.d,$xa2.d,@xt[2].d
	eor	$xa3.d,$xa3.d,@xt[3].d
	eor	$xb0.d,$xb0.d,@xt[4].d
	eor	$xb1.d,$xb1.d,@xt[5].d
	eor	$xb2.d,$xb2.d,@xt[6].d
	eor	$xb3.d,$xb3.d,@xt[7].d
___
	&load(@xt[0],@xt[1],@xt[2],@xt[3],@xt[4],@xt[5],@xt[6],@xt[7]);
	&SCA_SAVE(0,2);
$code.=<<___;
	eor	$xc0.d,$xc0.d,@xt[0].d
	eor	$xc1.d,$xc1.d,@xt[1].d
___
	&SCA_SAVE(4,6);
$code.=<<___;
	eor	$xc2.d,$xc2.d,@xt[2].d
	eor	$xc3.d,$xc3.d,@xt[3].d
___
	&SCA_SAVE(8,10);
$code.=<<___;
	eor	$xd0.d,$xd0.d,@xt[4].d
	eor	$xd1.d,$xd1.d,@xt[5].d
___
	&SCA_SAVE(12,14);
$code.=<<___;
	eor	$xd2.d,$xd2.d,@xt[6].d
	eor	$xd3.d,$xd3.d,@xt[7].d
___
	&store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
	&store($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
$code.=<<___;
210:
	incw	@K[6], ALL, MUL #1
___
}
sub SET_STATE_BAK() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
	my $x0 = @sx[$idx0];
	my $xx0 = @sxx[$idx0];
	my $x1 = @sx[$idx1];
	my $xx1 = @sxx[$idx1];
	my $d = $idx0/2;

$code.=<<___;
	lsr	$xx1,@K[$d],#32
	dup	@mx[$idx0].s,@KL[$d]
	dup	@bak[$idx0].s,@KL[$d]
.if mixin == 1
	mov	$x0,@KL[$d]
.endif
	dup	@mx[$idx1].s,$x1
	dup	@bak[$idx1].s,$x1
___
}

sub SET_STATE() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
	my $x0 = @sx[$idx0];
	my $xx0 = @sxx[$idx0];
	my $x1 = @sx[$idx1];
	my $xx1 = @sxx[$idx1];
	my $d = $idx0/2;

$code.=<<___;
	lsr	$xx1,@K[$d],#32
	dup	@mx[$idx0].s,@KL[$d]
.if mixin == 1
	mov	$x0,@KL[$d]
.endif
	dup	@mx[$idx1].s,$x1
___
}

sub SVE_LOAD_STATES() {
	&SET_STATE_BAK(0);
	&SET_STATE_BAK(2);
	&SET_STATE_BAK(4);
	&SET_STATE_BAK(6);
	&SET_STATE_BAK(8);
	&SET_STATE(10);
	&SET_STATE(14);
$code.=<<___;
.if mixin == 1
	add	@sx[13],@KL[6],#1
	mov	@sx[12],@KL[6]
	index	$zctr.s,@sx[13],1
	index	@mx[12].s,@sx[13],1
.else
	index	$zctr.s,@KL[6],1
	index	@mx[12].s,@KL[6],1
.endif
	lsr	@sxx[13],@K[6],#32
	dup	@mx[13].s,@sx[13]
___
}

sub SVE2_LOAD_STATES() {
	&SET_STATE_BAK(0);
	&SET_STATE_BAK(2);
	&SET_STATE_BAK(4);
	&SET_STATE_BAK(6);
	&SET_STATE_BAK(8);
	&SET_STATE_BAK(10);
	&SET_STATE_BAK(14);
$code.=<<___;
.if mixin == 1
	add	@sx[13],@KL[6],#1
	mov	@sx[12],@KL[6]
	index	$zctr.s,@sx[13],1
	index	@mx[12].s,@sx[13],1
.else
	index	$zctr.s,@KL[6],1
	index	@mx[12].s,@KL[6],1
.endif
	lsr	@sxx[13],@K[6],#32
	dup	@mx[13].s,@sx[13]
	dup	@bak[13].s,@sx[13]
___
}
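
# Main loop shape, shared by the SVE and SVE2 paths: label 100 runs one
# full vector batch per iteration; while at least one further 64-byte
# block remains, mixin=1 folds an extra scalar block into the batch,
# otherwise the mixin=0 copy at 101 handles the final batch; 110 exits.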
sub chacha20_sve() {
	my ($tmp) = (@sxx[0]);

$code.=<<___;
.align	5
100:
	subs	$tmp,$len,$veclen,lsl #6
	b.lt	110f
	mov	$len,$tmp
	b.eq	101f
	cmp	$len,64
	b.lt	101f
mixin=1
___
	&SVE_LOAD_STATES();
	&SVE_INNER_BLOCK();
	&SVE_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
	subs	$len,$len,64
	b.gt	100b
	b	110f
101:
mixin=0
___
	&SVE_LOAD_STATES();
	&SVE_INNER_BLOCK();
	&SVE_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
110:
___
}

sub chacha20_sve2() {
	my ($tmp) = (@sxx[0]);

$code.=<<___;
.align	5
100:
	subs	$tmp,$len,$veclen,lsl #6
	b.lt	110f
	mov	$len,$tmp
	b.eq	101f
	cmp	$len,64
	b.lt	101f
mixin=1
___
	&SVE2_LOAD_STATES();
	&SVE2_INNER_BLOCK();
	&SVE2_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
	subs	$len,$len,64
	b.gt	100b
	b	110f
101:
mixin=0
___
	&SVE2_LOAD_STATES();
	&SVE2_INNER_BLOCK();
	&SVE2_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
110:
___
}
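
# Entry point.  ChaCha20_ctr32_sve(outp, inp, len, key, ctr) returns
# immediately unless at least one full vector batch of input is
# available; the plain-SVE path additionally requires a vector length
# above 128 bits (shorter vectors are presumably better served by the
# Neon code).  The capability check reads OPENSSL_armcap_P for SVE2,
# the non-SVE2 path materializes the tbl pattern for 8-bit rotates from
# .Lrot8, and the low counter word is written back to [$ctr] on exit.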
{{{
my ($tmp,$tmpw) = ("x6", "w6");
my ($tmpw0,$tmp0,$tmpw1,$tmp1) = ("w9","x9", "w10","x10");
my ($sve2flag) = ("x7");

$code.=<<___;
#include "arm_arch.h"

.arch	armv8-a

.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P

.text
.align	5
.Lchacha20_consts:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lrot8:
.word	0x02010003,0x04040404,0x02010003,0x04040404
.globl	ChaCha20_ctr32_sve
.type	ChaCha20_ctr32_sve,%function
.align	5
ChaCha20_ctr32_sve:
	AARCH64_VALID_CALL_TARGET
	cntw	$veclen, ALL, MUL #1
	cmp	$len,$veclen,lsl #6
	b.lt	.Lreturn
	mov	$sve2flag,0
	adrp	$tmp,OPENSSL_armcap_P
	ldr	$tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
	tst	$tmpw,#ARMV8_SVE2
	b.eq	1f
	mov	$sve2flag,1
	b	2f
1:
	cmp	$veclen,4
	b.le	.Lreturn
	adr	$tmp,.Lrot8
	ldp	$tmpw0,$tmpw1,[$tmp]
	index	$rot8.s,$tmpw0,$tmpw1
2:
	AARCH64_SIGN_LINK_REGISTER
	stp	d8,d9,[sp,-192]!
	stp	d10,d11,[sp,16]
	stp	d12,d13,[sp,32]
	stp	d14,d15,[sp,48]
	stp	x16,x17,[sp,64]
	stp	x18,x19,[sp,80]
	stp	x20,x21,[sp,96]
	stp	x22,x23,[sp,112]
	stp	x24,x25,[sp,128]
	stp	x26,x27,[sp,144]
	stp	x28,x29,[sp,160]
	str	x30,[sp,176]
	adr	$tmp,.Lchacha20_consts
	ldp	@K[0],@K[1],[$tmp]
	ldp	@K[2],@K[3],[$key]
	ldp	@K[4],@K[5],[$key, 16]
	ldp	@K[6],@K[7],[$ctr]
	ptrues	p0.s,ALL
#ifdef	__AARCH64EB__
	ror	@K[2],@K[2],#32
	ror	@K[3],@K[3],#32
	ror	@K[4],@K[4],#32
	ror	@K[5],@K[5],#32
	ror	@K[6],@K[6],#32
	ror	@K[7],@K[7],#32
#endif
	cbz	$sve2flag, 1f
___
	&chacha20_sve2();
$code.=<<___;
	b	2f
1:
___
	&chacha20_sve();
$code.=<<___;
2:
	str	@KL[6],[$ctr]
	ldp	d10,d11,[sp,16]
	ldp	d12,d13,[sp,32]
	ldp	d14,d15,[sp,48]
	ldp	x16,x17,[sp,64]
	ldp	x18,x19,[sp,80]
	ldp	x20,x21,[sp,96]
	ldp	x22,x23,[sp,112]
	ldp	x24,x25,[sp,128]
	ldp	x26,x27,[sp,144]
	ldp	x28,x29,[sp,160]
	ldr	x30,[sp,176]
	ldp	d8,d9,[sp],192
	AARCH64_VALIDATE_LINK_REGISTER
.Lreturn:
	ret
.size	ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve
___
}}}
########################################
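# Hand assembler: SVE/SVE2 mnemonics are not understood by older
# assemblers, so every vector instruction emitted above is translated
# below into a raw .inst word.  The tables map each mnemonic to the base
# opcode of its unpredicated/predicated encoding; operand fields are
# OR-ed in by sve_unpred, sve_pred and sve_other.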
{
my %opcode_unpred = (
	"movprfx" => 0x0420BC00,
	"eor" => 0x04a03000,
	"add" => 0x04200000,
	"orr" => 0x04603000,
	"lsl" => 0x04209C00,
	"lsr" => 0x04209400,
	"incw" => 0x04B00000,
	"xar" => 0x04203400,
	"zip1" => 0x05206000,
	"zip2" => 0x05206400,
	"uzp1" => 0x05206800,
	"uzp2" => 0x05206C00,
	"index" => 0x04204C00,
	"mov" => 0x05203800,
	"dup" => 0x05203800,
	"cntw" => 0x04A0E000,
	"tbl" => 0x05203000);

my %opcode_imm_unpred = (
	"dup" => 0x2538C000,
	"index" => 0x04204400);

my %opcode_scalar_pred = (
	"mov" => 0x0528A000,
	"cpy" => 0x0528A000,
	"st4w" => 0xE5606000,
	"st1w" => 0xE5004000,
	"ld1w" => 0xA5404000);

my %opcode_gather_pred = (
	"ld1w" => 0x85204000);

my %opcode_pred = (
	"eor" => 0x04190000,
	"add" => 0x04000000,
	"orr" => 0x04180000,
	"whilelo" => 0x25200C00,
	"whilelt" => 0x25200400,
	"cntp" => 0x25208000,
	"addvl" => 0x04205000,
	"lsl" => 0x04038000,
	"lsr" => 0x04018000,
	"sel" => 0x0520C000,
	"mov" => 0x0520C000,
	"ptrue" => 0x2518E000,
	"pfalse" => 0x2518E400,
	"ptrues" => 0x2519E000,
	"pnext" => 0x2519C400,
	"ld4w" => 0xA560E000,
	"st4w" => 0xE570E000,
	"st1w" => 0xE500E000,
	"ld1w" => 0xA540A000,
	"ld1rw" => 0x8540C000,
	"lasta" => 0x0520A000,
	"revh" => 0x05258000,
	"revb" => 0x05248000);

my %tsize = (
	'b' => 0,
	'h' => 1,
	's' => 2,
	'd' => 3);

my %sf = (
	"w" => 0,
	"x" => 1);

my %pattern = (
	"POW2" => 0,
	"VL1" => 1,
	"VL2" => 2,
	"VL3" => 3,
	"VL4" => 4,
	"VL5" => 5,
	"VL6" => 6,
	"VL7" => 7,
	"VL8" => 8,
	"VL16" => 9,
	"VL32" => 10,
	"VL64" => 11,
	"VL128" => 12,
	"VL256" => 13,
	"MUL4" => 29,
	"MUL3" => 30,
	"ALL" => 31);
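
# Debug aid: with $debug_encoder=1, compile_sve.sh is generated and each
# hand-assembled word is checked against the encoding produced by a real
# toolchain (gcc-10 or later for SVE code, per the note in the script).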
sub create_verifier {
	my $filename="./compile_sve.sh";

$scripts = <<___;
#! /bin/bash
set -e
CROSS_COMPILE=\${CROSS_COMPILE:-'aarch64-none-linux-gnu-'}

[ -z "\$1" ] && exit 1

ARCH=`uname -p | xargs echo -n`

# need gcc-10 and above to compile SVE code
# change this according to your system during debugging
if [ \$ARCH == 'aarch64' ]; then
	CC=gcc-11
	OBJDUMP=objdump
else
	CC=\${CROSS_COMPILE}gcc
	OBJDUMP=\${CROSS_COMPILE}objdump
fi
TMPFILE=/tmp/\$\$
cat > \$TMPFILE.c << EOF
extern __attribute__((noinline, section("disasm_output"))) void dummy_func()
{
	asm("\$@\\t\\n");
}
int main(int argc, char *argv[])
{
}
EOF
\$CC -march=armv8.2-a+sve+sve2 -o \$TMPFILE.out \$TMPFILE.c
\$OBJDUMP -d \$TMPFILE.out | awk -F"\\n" -v RS="\\n\\n" '\$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",\$2}'
rm \$TMPFILE.c \$TMPFILE.out
___
	open(FH, '>', $filename) or die $!;
	print FH $scripts;
	close(FH);
	system("chmod a+x ./compile_sve.sh");
}

sub compile_sve {
	return `./compile_sve.sh '@_'`
}
sub verify_inst {
	my ($code,$inst)=@_;
	my $hexcode = (sprintf "%08x", $code);

	if ($debug_encoder == 1) {
		my $expect=&compile_sve($inst);
		if ($expect ne $hexcode) {
			return (sprintf "%s // Encode Error! expect [%s] actual [%s]", $inst, $expect, $hexcode);
		}
	}
	return (sprintf ".inst\t0x%s\t//%s", $hexcode, $inst);
}
sub reg_code {
	my $code = shift;

	# "zr" must be compared as a string; a numeric == would also
	# treat register number 0 as a match.
	if ($code eq "zr") {
		return "31";
	}
	return $code;
}
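
# Shift/rotate immediates live in a combined tszh:tszl:imm field: these
# helpers encode esize+imm for lsl and 2*esize-imm for lsr/xar, placing
# the low bits at bit 16 (unpredicated form) or bit 5 (predicated form).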
sub encode_size_imm() {
	my ($mnemonic, $isize, $const)=@_;
	my $esize = (8<<$tsize{$isize});
	my $tsize_imm = $esize + $const;

	if ($mnemonic eq "lsr" || $mnemonic eq "xar") {
		$tsize_imm = 2*$esize - $const;
	}
	return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<16);
}

sub encode_shift_pred() {
	my ($mnemonic, $isize, $const)=@_;
	my $esize = (8<<$tsize{$isize});
	my $tsize_imm = $esize + $const;

	if ($mnemonic eq "lsr") {
		$tsize_imm = 2*$esize - $const;
	}
	return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<5);
}
sub sve_unpred {
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	if ($arg =~ m/z([0-9]+)\.([bhsd]),\s*\{\s*z([0-9]+)\.[bhsd].*\},\s*z([0-9]+)\.[bhsd].*/o) {
		return &verify_inst($opcode_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22)|($4<<16),
				    $inst)
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*([zwx][0-9]+.*)/o) {
		my $regd = $1;
		my $isize = $2;
		my $regs=$3;

		if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
			if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
			    && ((8<<$tsize{$isize}) > $2)) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|($1<<5)|&encode_size_imm($mnemonic,$isize,$2),
						    $inst);
			}
		} elsif($regs =~ m/[wx]([0-9]+),\s*[wx]([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
		} elsif ($regs =~ m/[wx]([0-9]+),\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_imm_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
		} elsif ($regs =~ m/[wx]([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5), $inst);
		} else {
			my $encoded_size = 0;

			if (($mnemonic eq "add") || ($mnemonic =~ /zip./) || ($mnemonic =~ /uzp./) ) {
				$encoded_size = ($tsize{$isize}<<22);
			}
			if ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd],\s*([0-9]+)/o &&
			    $1 == $regd) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|($2<<5)|&encode_size_imm($mnemonic,$isize,$3), $inst);
			} elsif ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd]/o) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|$encoded_size|($1<<5)|($2<<16), $inst);
			}
		}
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*#?([0-9]+)/o) {
		return &verify_inst($opcode_imm_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22),
				    $inst)
	}
	sprintf "%s // fail to parse", $inst;
}
sub sve_pred {
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	if ($arg =~ m/\{\s*z([0-9]+)\.([bhsd]).*\},\s*p([0-9])+(\/z)?,\s*\[(\s*[xs].*)\]/o) {
		my $zt = $1;
		my $size = $tsize{$2};
		my $pg = $3;
		my $addr = $5;
		my $xn = 31;

		if ($addr =~ m/x([0-9]+)\s*/o) {
			$xn = $1;
		}

		if ($mnemonic =~m/ld1r[bhwd]/o) {
			$size = 0;
		}
		if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
			return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
			my $xs = ($2 eq "SXTW") ? 1 : 0;
			return &verify_inst($opcode_gather_pred{$mnemonic}|($xs<<22)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} elsif($addr =~ m/\w+\s*,\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} else {
			return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($xn<<5),$inst);
		}
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*p([0-9]+)\/([mz]),\s*([zwx][0-9]+.*)/o) {
		my $regd = $1;
		my $isize = $2;
		my $pg = $3;
		my $mod = $4;
		my $regs = $5;

		if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
			# merging predication only; "==" on the strings
			# 'm'/'z' would compare them as numbers
			if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
			    && $regd == $1
			    && $mod eq 'm'
			    && ((8<<$tsize{$isize}) > $2)) {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($pg<<10)|&encode_shift_pred($mnemonic,$isize,$2), $inst);
			}
		} elsif($regs =~ m/[wx]([0-9]+)/o) {
			return &verify_inst($opcode_scalar_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
		} elsif ($regs =~ m/z([0-9]+)[^,]*(?:,\s*z([0-9]+))?/o) {
			if ($mnemonic eq "sel") {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($2<<16), $inst);
			} elsif ($mnemonic eq "mov") {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($regd<<16), $inst);
			} elsif (length $2 > 0) {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($2<<5), $inst);
			} else {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
			}
		}
	} elsif ($arg =~ m/p([0-9]+)\.([bhsd]),\s*(\w+.*)/o) {
		my $pg = $1;
		my $isize = $2;
		my $regs = $3;

		if ($regs =~ m/([wx])(zr|[0-9]+),\s*[wx](zr|[0-9]+)/o) {
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($sf{$1}<<12)|(&reg_code($2)<<5)|(&reg_code($3)<<16), $inst);
		} elsif ($regs =~ m/p([0-9]+),\s*p([0-9]+)\.[bhsd]/o) {
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($1<<5), $inst);
		} else {
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($pattern{$regs}<<5), $inst);
		}
	} elsif ($arg =~ m/p([0-9]+)\.([bhsd])/o) {
		return &verify_inst($opcode_pred{$mnemonic}|$1, $inst);
	}
	sprintf "%s // fail to parse", $inst;
}
sub sve_other {
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	if ($arg =~ m/x([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*p([0-9]+)\.([bhsd])/o) {
		return &verify_inst($opcode_pred{$mnemonic}|($tsize{$4}<<22)|$1|($2<<10)|($3<<5), $inst);
	} elsif ($arg =~ m/(x|w)([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*z([0-9]+)\.([bhsd])/o) {
		return &verify_inst($opcode_pred{$mnemonic}|($tsize{$5}<<22)|$1|($3<<10)|($4<<5)|$2, $inst);
	} elsif ($mnemonic =~ /inc[bhdw]/) {
		if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(2<<12)|(($3 - 1)<<16)|0xE000, $inst);
		} elsif ($arg =~ m/z([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16)|0xC000, $inst);
		} elsif ($arg =~ m/x([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16)|0xE000, $inst);
		}
	} elsif ($mnemonic =~ /cnt[bhdw]/) {
		if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
		}
	} elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
		return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
	} elsif ($arg =~ m/z([0-9]+)[^,]*,\s*z([0-9]+)/o) {
		return &verify_inst($opcode_unpred{$mnemonic}|$1|($2<<5), $inst);
	}
	sprintf "%s // fail to parse", $inst;
}
}
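
# Output stage: echo this script's leading comment block (turning '#'
# into '//'), then post-process $code line by line, rewriting each SVE
# mnemonic the assembler may not know into a .inst word via the parsers
# above.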
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/\/\// and !/^$/);
	print;
}
close SELF;

if ($debug_encoder == 1) {
	&create_verifier();
}

foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
	s/\b(\w+[1-4]r[bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
	s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
	s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
	s/\b(movprfx|lasta|cntp|cnt[bhdw]|addvl|inc[bhdw])\s+((x|z|w).*)/sve_other($1,$2)/ge;
	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";