#! /usr/bin/env perl
# Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2015
#
# ChaCha20 for s390x.
#
# 3 times faster than compiler-generated code.
#
# August 2018
#
# Add vx code path: 4x"vertical".
#
# Copyright IBM Corp. 2018
# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
#
# February 2019
#
# Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
# 4x"vertical" submission [on z13] and >3x faster than scalar code.
# But to keep short-input overheads in check, inputs of at most 256
# bytes are handled by a transliteration of the VSX code path from
# the chacha-ppc module, which is also 4x"vertical".
use strict;
use FindBin qw($Bin);
use lib "$Bin/../..";
use perlasm::s390x qw(:DEFAULT :VX :EI AUTOLOAD LABEL INCLUDE);
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
$z=0; # S/390 ABI
$SIZE_T=4;
} else {
$z=1; # zSeries ABI
$SIZE_T=8;
}
my $sp="%r15";
my $stdframe=16*$SIZE_T+4*8;
sub ROUND {
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("$_",@t);
# Consider the order in which the variables are addressed by their
# index:
#
#	a   b   c   d
#
#	0   4   8  12 < even round
#	1   5   9  13
#	2   6  10  14
#	3   7  11  15
#	0   5  10  15 < odd round
#	1   6  11  12
#	2   7   8  13
#	3   4   9  14
#
# 'a's, 'b's and 'd's are permanently allocated in registers,
# @x[0..7,12..15], while 'c's are maintained in memory. If you observe
# the 'c' column, you'll notice that the pair of 'c's held in registers
# at the end of one round is exactly the pair needed at the start of
# the next one. This means the pairs only have to be swapped once per
# round, in the middle, which is why you'll see 'c' stores and loads
# there, but none at the beginning or end.
alr (@x[$a0],@x[$b0]); # Q1
alr (@x[$a1],@x[$b1]); # Q2
xr (@x[$d0],@x[$a0]);
xr (@x[$d1],@x[$a1]);
rll (@x[$d0],@x[$d0],16);
rll (@x[$d1],@x[$d1],16);
alr ($xc,@x[$d0]);
alr ($xc_,@x[$d1]);
xr (@x[$b0],$xc);
xr (@x[$b1],$xc_);
rll (@x[$b0],@x[$b0],12);
rll (@x[$b1],@x[$b1],12);
alr (@x[$a0],@x[$b0]);
alr (@x[$a1],@x[$b1]);
xr (@x[$d0],@x[$a0]);
xr (@x[$d1],@x[$a1]);
rll (@x[$d0],@x[$d0],8);
rll (@x[$d1],@x[$d1],8);
alr ($xc,@x[$d0]);
alr ($xc_,@x[$d1]);
xr (@x[$b0],$xc);
xr (@x[$b1],$xc_);
rll (@x[$b0],@x[$b0],7);
rll (@x[$b1],@x[$b1],7);
stm ($xc,$xc_,"$stdframe+4*8+4*$c0($sp)"); # reload pair of 'c's
lm ($xc,$xc_,"$stdframe+4*8+4*$c2($sp)");
alr (@x[$a2],@x[$b2]); # Q3
alr (@x[$a3],@x[$b3]); # Q4
xr (@x[$d2],@x[$a2]);
xr (@x[$d3],@x[$a3]);
rll (@x[$d2],@x[$d2],16);
rll (@x[$d3],@x[$d3],16);
alr ($xc,@x[$d2]);
alr ($xc_,@x[$d3]);
xr (@x[$b2],$xc);
xr (@x[$b3],$xc_);
rll (@x[$b2],@x[$b2],12);
rll (@x[$b3],@x[$b3],12);
alr (@x[$a2],@x[$b2]);
alr (@x[$a3],@x[$b3]);
xr (@x[$d2],@x[$a2]);
xr (@x[$d3],@x[$a3]);
rll (@x[$d2],@x[$d2],8);
rll (@x[$d3],@x[$d3],8);
alr ($xc,@x[$d2]);
alr ($xc_,@x[$d3]);
xr (@x[$b2],$xc);
xr (@x[$b3],$xc_);
rll (@x[$b2],@x[$b2],7);
rll (@x[$b3],@x[$b3],7);
}
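
# For reference, the ChaCha quarter-round that ROUND above interleaves
# four of (Q1..Q4), and that the VX code paths below compute on vector
# registers.  A minimal pure-Perl sketch, kept as an editorial aid only
# (it is not called by the code generator and assumes a 64-bit perl):
sub _chacha_quarter_round_ref {
	my ($a,$b,$c,$d)=@_;		# 32-bit unsigned words
	$a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<<16)|($d>>16))&0xffffffff;
	$c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<<12)|($b>>20))&0xffffffff;
	$a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<<8)|($d>>24))&0xffffffff;
	$c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<<7)|($b>>25))&0xffffffff;
	return ($a,$b,$c,$d);
}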
sub VX_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("%v$_",(0..15));
vaf (@x[$a0],@x[$a0],@x[$b0]); # Q1
vx (@x[$d0],@x[$d0],@x[$a0]);
verllf (@x[$d0],@x[$d0],16);
vaf (@x[$a1],@x[$a1],@x[$b1]); # Q2
vx (@x[$d1],@x[$d1],@x[$a1]);
verllf (@x[$d1],@x[$d1],16);
vaf (@x[$a2],@x[$a2],@x[$b2]); # Q3
vx (@x[$d2],@x[$d2],@x[$a2]);
verllf (@x[$d2],@x[$d2],16);
vaf (@x[$a3],@x[$a3],@x[$b3]); # Q4
vx (@x[$d3],@x[$d3],@x[$a3]);
verllf (@x[$d3],@x[$d3],16);
vaf (@x[$c0],@x[$c0],@x[$d0]);
vx (@x[$b0],@x[$b0],@x[$c0]);
verllf (@x[$b0],@x[$b0],12);
vaf (@x[$c1],@x[$c1],@x[$d1]);
vx (@x[$b1],@x[$b1],@x[$c1]);
verllf (@x[$b1],@x[$b1],12);
vaf (@x[$c2],@x[$c2],@x[$d2]);
vx (@x[$b2],@x[$b2],@x[$c2]);
verllf (@x[$b2],@x[$b2],12);
vaf (@x[$c3],@x[$c3],@x[$d3]);
vx (@x[$b3],@x[$b3],@x[$c3]);
verllf (@x[$b3],@x[$b3],12);
vaf (@x[$a0],@x[$a0],@x[$b0]);
vx (@x[$d0],@x[$d0],@x[$a0]);
verllf (@x[$d0],@x[$d0],8);
vaf (@x[$a1],@x[$a1],@x[$b1]);
vx (@x[$d1],@x[$d1],@x[$a1]);
verllf (@x[$d1],@x[$d1],8);
vaf (@x[$a2],@x[$a2],@x[$b2]);
vx (@x[$d2],@x[$d2],@x[$a2]);
verllf (@x[$d2],@x[$d2],8);
vaf (@x[$a3],@x[$a3],@x[$b3]);
vx (@x[$d3],@x[$d3],@x[$a3]);
verllf (@x[$d3],@x[$d3],8);
vaf (@x[$c0],@x[$c0],@x[$d0]);
vx (@x[$b0],@x[$b0],@x[$c0]);
verllf (@x[$b0],@x[$b0],7);
vaf (@x[$c1],@x[$c1],@x[$d1]);
vx (@x[$b1],@x[$b1],@x[$c1]);
verllf (@x[$b1],@x[$b1],7);
vaf (@x[$c2],@x[$c2],@x[$d2]);
vx (@x[$b2],@x[$b2],@x[$c2]);
verllf (@x[$b2],@x[$b2],7);
vaf (@x[$c3],@x[$c3],@x[$d3]);
vx (@x[$b3],@x[$b3],@x[$c3]);
verllf (@x[$b3],@x[$b3],7);
}
sub VX_ROUND {
my @a=@_[0..5];
my @b=@_[6..11];
my @c=@_[12..17];
my @d=@_[18..23];
my $odd=@_[24];
vaf (@a[$_],@a[$_],@b[$_]) for (0..5);
vx (@d[$_],@d[$_],@a[$_]) for (0..5);
verllf (@d[$_],@d[$_],16) for (0..5);
vaf (@c[$_],@c[$_],@d[$_]) for (0..5);
vx (@b[$_],@b[$_],@c[$_]) for (0..5);
verllf (@b[$_],@b[$_],12) for (0..5);
vaf (@a[$_],@a[$_],@b[$_]) for (0..5);
vx (@d[$_],@d[$_],@a[$_]) for (0..5);
verllf (@d[$_],@d[$_],8) for (0..5);
vaf (@c[$_],@c[$_],@d[$_]) for (0..5);
vx (@b[$_],@b[$_],@c[$_]) for (0..5);
verllf (@b[$_],@b[$_],7) for (0..5);
vsldb (@c[$_],@c[$_],@c[$_],8) for (0..5);
vsldb (@b[$_],@b[$_],@b[$_],$odd?12:4) for (0..5);
vsldb (@d[$_],@d[$_],@d[$_],$odd?4:12) for (0..5);
}
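
# The vsldb group at the end of VX_ROUND rotates the 32-bit word lanes
# of the b/c/d rows within each vector: b/c/d by 1/2/3 words after the
# even (column) half of a double round, and by 3/2/1 words after the odd
# half to undo it, so the same column code also computes the diagonal
# quarter-rounds in the "horizontal" layout.  A plain-Perl sketch of one
# such per-row rotation, as an editorial aid only (an array ref of four
# words stands in for a vector register):
sub _rotl_row_words_ref {
	my ($row,$n)=@_;		# vsldb(x,x,x,4*$n) equivalent
	return [ @{$row}[$n..3,0..$n-1] ];
}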
PERLASM_BEGIN($output);
INCLUDE ("s390x_arch.h");
TEXT ();
################
# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
# const unsigned int key[8], const unsigned int counter[4])
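
# The 16-word state implied by the prototype above: the four sigma
# constants (see .Lsigma at the end of this file), then key[0..7], then
# counter[0..3], with the 32-bit block counter in word 12.  A minimal
# pure-Perl sketch of that layout, as an editorial aid only (not called
# by the code generator):
sub _chacha_initial_state_ref {
	my ($key,$counter)=@_;		# refs to 8 and 4 32-bit words
	return (0x61707865,0x3320646e,0x79622d32,0x6b206574,
	    @$key,@$counter);
}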
my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
{
my $frame=$stdframe+4*20;
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));
GLOBL ("ChaCha20_ctr32");
TYPE ("ChaCha20_ctr32","\@function");
ALIGN (32);
LABEL ("ChaCha20_ctr32");
larl ("%r1","OPENSSL_s390xcap_P");
lghi ("%r0",64);
&{$z? \&ltgr:\&ltr} ($len,$len); # len==0?
bzr ("%r14");
lg ("%r1","S390X_STFLE+16(%r1)");
&{$z? \&clgr:\&clr} ($len,"%r0");
jle (".Lshort");
tmhh ("%r1",0x4000); # check for vx bit
jnz (".LChaCha20_ctr32_vx");
LABEL (".Lshort");
&{$z? \&aghi:\&ahi} ($len,-64);
&{$z? \&lghi:\&lhi} ("%r1",-$frame);
&{$z? \&stmg:\&stm} ("%r6","%r15","6*$SIZE_T($sp)");
&{$z? \&slgr:\&slr} ($out,$inp); # difference
la ($len,"0($inp,$len)"); # end of input minus 64
larl ("%r7",".Lsigma");
lgr ("%r0",$sp);
la ($sp,"0(%r1,$sp)");
&{$z? \&stg:\&st} ("%r0","0($sp)");
lmg ("%r8","%r11","0($key)"); # load key
lmg ("%r12","%r13","0($counter)"); # load counter
lmg ("%r6","%r7","0(%r7)"); # load sigma constant
la ("%r14","0($inp)");
&{$z? \&stg:\&st} ($out,"$frame+3*$SIZE_T($sp)");
&{$z? \&stg:\&st} ($len,"$frame+4*$SIZE_T($sp)");
stmg ("%r6","%r13","$stdframe($sp)");# copy key schedule to stack
srlg (@x[12],"%r12",32); # 32-bit counter value
j (".Loop_outer");
ALIGN (16);
LABEL (".Loop_outer");
lm (@x[0],@x[7],"$stdframe+4*0($sp)"); # load x[0]-x[7]
lm (@t[0],@t[1],"$stdframe+4*10($sp)"); # load x[10]-x[11]
lm (@x[13],@x[15],"$stdframe+4*13($sp)"); # load x[13]-x[15]
stm (@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11]
lm (@t[0],@t[1],"$stdframe+4*8($sp)"); # load x[8]-x[9]
st (@x[12],"$stdframe+4*12($sp)"); # save counter
&{$z? \&stg:\&st} ("%r14","$frame+2*$SIZE_T($sp)");# save input pointer
lhi ("%r14",10);
j (".Loop");
ALIGN (4);
LABEL (".Loop");
ROUND (0, 4, 8,12);
ROUND (0, 5,10,15);
brct ("%r14",".Loop");
&{$z? \&lg:\&l} ("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer
stm (@t[0],@t[1],"$stdframe+4*8+4*8($sp)"); # offload x[8]-x[9]
&{$z? \&lmg:\&lm} (@t[0],@t[1],"$frame+3*$SIZE_T($sp)");
al (@x[0],"$stdframe+4*0($sp)"); # accumulate key schedule
al (@x[1],"$stdframe+4*1($sp)");
al (@x[2],"$stdframe+4*2($sp)");
al (@x[3],"$stdframe+4*3($sp)");
al (@x[4],"$stdframe+4*4($sp)");
al (@x[5],"$stdframe+4*5($sp)");
al (@x[6],"$stdframe+4*6($sp)");
al (@x[7],"$stdframe+4*7($sp)");
lrvr (@x[0],@x[0]);
lrvr (@x[1],@x[1]);
lrvr (@x[2],@x[2]);
lrvr (@x[3],@x[3]);
lrvr (@x[4],@x[4]);
lrvr (@x[5],@x[5]);
lrvr (@x[6],@x[6]);
lrvr (@x[7],@x[7]);
al (@x[12],"$stdframe+4*12($sp)");
al (@x[13],"$stdframe+4*13($sp)");
al (@x[14],"$stdframe+4*14($sp)");
al (@x[15],"$stdframe+4*15($sp)");
lrvr (@x[12],@x[12]);
lrvr (@x[13],@x[13]);
lrvr (@x[14],@x[14]);
lrvr (@x[15],@x[15]);
la (@t[0],"0(@t[0],%r14)"); # reconstruct output pointer
&{$z? \&clgr:\&clr} ("%r14",@t[1]);
jh (".Ltail");
x (@x[0],"4*0(%r14)"); # xor with input
x (@x[1],"4*1(%r14)");
st (@x[0],"4*0(@t[0])"); # store output
x (@x[2],"4*2(%r14)");
st (@x[1],"4*1(@t[0])");
x (@x[3],"4*3(%r14)");
st (@x[2],"4*2(@t[0])");
x (@x[4],"4*4(%r14)");
st (@x[3],"4*3(@t[0])");
lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); # load x[8]-x[11]
x (@x[5],"4*5(%r14)");
st (@x[4],"4*4(@t[0])");
x (@x[6],"4*6(%r14)");
al (@x[0],"$stdframe+4*8($sp)");
st (@x[5],"4*5(@t[0])");
x (@x[7],"4*7(%r14)");
al (@x[1],"$stdframe+4*9($sp)");
st (@x[6],"4*6(@t[0])");
x (@x[12],"4*12(%r14)");
al (@x[2],"$stdframe+4*10($sp)");
st (@x[7],"4*7(@t[0])");
x (@x[13],"4*13(%r14)");
al (@x[3],"$stdframe+4*11($sp)");
st (@x[12],"4*12(@t[0])");
x (@x[14],"4*14(%r14)");
st (@x[13],"4*13(@t[0])");
x (@x[15],"4*15(%r14)");
st (@x[14],"4*14(@t[0])");
lrvr (@x[0],@x[0]);
st (@x[15],"4*15(@t[0])");
lrvr (@x[1],@x[1]);
lrvr (@x[2],@x[2]);
lrvr (@x[3],@x[3]);
lhi (@x[12],1);
x (@x[0],"4*8(%r14)");
al (@x[12],"$stdframe+4*12($sp)"); # increment counter
x (@x[1],"4*9(%r14)");
st (@x[0],"4*8(@t[0])");
x (@x[2],"4*10(%r14)");
st (@x[1],"4*9(@t[0])");
x (@x[3],"4*11(%r14)");
st (@x[2],"4*10(@t[0])");
st (@x[3],"4*11(@t[0])");
&{$z? \&clgr:\&clr} ("%r14",@t[1]); # done yet?
la ("%r14","64(%r14)");
jl (".Loop_outer");
LABEL (".Ldone");
xgr ("%r0","%r0");
xgr ("%r1","%r1");
xgr ("%r2","%r2");
xgr ("%r3","%r3");
stmg ("%r0","%r3","$stdframe+4*4($sp)"); # wipe key copy
stmg ("%r0","%r3","$stdframe+4*12($sp)");
&{$z? \&lmg:\&lm} ("%r6","%r15","$frame+6*$SIZE_T($sp)");
br ("%r14");
ALIGN (16);
LABEL (".Ltail");
la (@t[1],"64($t[1])");
stm (@x[0],@x[7],"$stdframe+4*0($sp)");
&{$z? \&slgr:\&slr} (@t[1],"%r14");
lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
&{$z? \&lghi:\&lhi} (@x[6],0);
stm (@x[12],@x[15],"$stdframe+4*12($sp)");
al (@x[0],"$stdframe+4*8($sp)");
al (@x[1],"$stdframe+4*9($sp)");
al (@x[2],"$stdframe+4*10($sp)");
al (@x[3],"$stdframe+4*11($sp)");
lrvr (@x[0],@x[0]);
lrvr (@x[1],@x[1]);
lrvr (@x[2],@x[2]);
lrvr (@x[3],@x[3]);
stm (@x[0],@x[3],"$stdframe+4*8($sp)");
LABEL (".Loop_tail");
llgc (@x[4],"0(@x[6],%r14)");
llgc (@x[5],"$stdframe(@x[6],$sp)");
xr (@x[5],@x[4]);
stc (@x[5],"0(@x[6],@t[0])");
la (@x[6],"1(@x[6])");
brct (@t[1],".Loop_tail");
j (".Ldone");
SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32");
}
########################################################################
# The 4x"vertical" layout minimizes the number of instructions, but the
# pipeline runs underutilized [because of the vector instructions' high
# latency]. On the other hand, the minimum amount of data it takes to
# fully utilize the pipeline is higher, so short inputs would effectively
# be processed more slowly. Hence this code path, targeting lengths of at
# most 256 bytes.
#
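
# In the "vertical" layout each vector lane belongs to a different block
# (lane i carries block i), so once the rounds are done the four groups
# of four vectors have to be transposed back into per-block order; that
# is what the vmrhf/vmrlf/vpdi sequence after .Loop_4x does.  A plain-Perl
# sketch of that 4x4 word transpose, as an editorial aid only (array refs
# of four words stand in for vector registers):
sub _transpose_4x4_ref {
	my @rows=@_;			# four refs to 4-word rows
	return map { my $i=$_; [ map { $_->[$i] } @rows ] } 0..3;
}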
{
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
$xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15));
my @K=map("%v$_",(16..19));
my $CTR="%v26";
my ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30));
my $beperm="%v31";
my ($x00,$x10,$x20,$x30)=(0,map("r$_",(8..10)));
my $FRAME=$stdframe+4*16;
ALIGN (32);
LABEL ("ChaCha20_ctr32_4x");
LABEL (".LChaCha20_ctr32_4x");
&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
if (!$z) {
std ("%f4","16*$SIZE_T+2*8($sp)");
std ("%f6","16*$SIZE_T+3*8($sp)");
}
&{$z? \&lghi:\&lhi} ("%r1",-$FRAME);
lgr ("%r0",$sp);
la ($sp,"0(%r1,$sp)");
&{$z? \&stg:\&st} ("%r0","0($sp)"); # back-chain
if ($z) {
std ("%f8","$stdframe+8*0($sp)");
std ("%f9","$stdframe+8*1($sp)");
std ("%f10","$stdframe+8*2($sp)");
std ("%f11","$stdframe+8*3($sp)");
std ("%f12","$stdframe+8*4($sp)");
std ("%f13","$stdframe+8*5($sp)");
std ("%f14","$stdframe+8*6($sp)");
std ("%f15","$stdframe+8*7($sp)");
}
larl ("%r7",".Lsigma");
lhi ("%r0",10);
lhi ("%r1",0);
vl (@K[0],"0(%r7)"); # load sigma
vl (@K[1],"0($key)"); # load key
vl (@K[2],"16($key)");
vl (@K[3],"0($counter)"); # load counter
vl ($beperm,"0x40(%r7)");
vl ($xt1,"0x50(%r7)");
vrepf ($CTR,@K[3],0);
vlvgf (@K[3],"%r1",0); # clear @K[3].word[0]
vaf ($CTR,$CTR,$xt1);
#LABEL (".Loop_outer_4x");
vlm ($xa0,$xa3,"0x60(%r7)"); # load [smashed] sigma
vrepf ($xb0,@K[1],0); # smash the key
vrepf ($xb1,@K[1],1);
vrepf ($xb2,@K[1],2);
vrepf ($xb3,@K[1],3);
vrepf ($xc0,@K[2],0);
vrepf ($xc1,@K[2],1);
vrepf ($xc2,@K[2],2);
vrepf ($xc3,@K[2],3);
vlr ($xd0,$CTR);
vrepf ($xd1,@K[3],1);
vrepf ($xd2,@K[3],2);
vrepf ($xd3,@K[3],3);
LABEL (".Loop_4x");
VX_lane_ROUND(0, 4, 8,12);
VX_lane_ROUND(0, 5,10,15);
brct ("%r0",".Loop_4x");
vaf ($xd0,$xd0,$CTR);
vmrhf ($xt0,$xa0,$xa1); # transpose data
vmrhf ($xt1,$xa2,$xa3);
vmrlf ($xt2,$xa0,$xa1);
vmrlf ($xt3,$xa2,$xa3);
vpdi ($xa0,$xt0,$xt1,0b0000);
vpdi ($xa1,$xt0,$xt1,0b0101);
vpdi ($xa2,$xt2,$xt3,0b0000);
vpdi ($xa3,$xt2,$xt3,0b0101);
vmrhf ($xt0,$xb0,$xb1);
vmrhf ($xt1,$xb2,$xb3);
vmrlf ($xt2,$xb0,$xb1);
vmrlf ($xt3,$xb2,$xb3);
vpdi ($xb0,$xt0,$xt1,0b0000);
vpdi ($xb1,$xt0,$xt1,0b0101);
vpdi ($xb2,$xt2,$xt3,0b0000);
vpdi ($xb3,$xt2,$xt3,0b0101);
vmrhf ($xt0,$xc0,$xc1);
vmrhf ($xt1,$xc2,$xc3);
vmrlf ($xt2,$xc0,$xc1);
vmrlf ($xt3,$xc2,$xc3);
vpdi ($xc0,$xt0,$xt1,0b0000);
vpdi ($xc1,$xt0,$xt1,0b0101);
vpdi ($xc2,$xt2,$xt3,0b0000);
vpdi ($xc3,$xt2,$xt3,0b0101);
vmrhf ($xt0,$xd0,$xd1);
vmrhf ($xt1,$xd2,$xd3);
vmrlf ($xt2,$xd0,$xd1);
vmrlf ($xt3,$xd2,$xd3);
vpdi ($xd0,$xt0,$xt1,0b0000);
vpdi ($xd1,$xt0,$xt1,0b0101);
vpdi ($xd2,$xt2,$xt3,0b0000);
vpdi ($xd3,$xt2,$xt3,0b0101);
#vrepif ($xt0,4);
#vaf ($CTR,$CTR,$xt0); # next counter value
vaf ($xa0,$xa0,@K[0]);
vaf ($xb0,$xb0,@K[1]);
vaf ($xc0,$xc0,@K[2]);
vaf ($xd0,$xd0,@K[3]);
vperm ($xa0,$xa0,$xa0,$beperm);
vperm ($xb0,$xb0,$xb0,$beperm);
vperm ($xc0,$xc0,$xc0,$beperm);
vperm ($xd0,$xd0,$xd0,$beperm);
#&{$z? \&clgfi:\&clfi} ($len,0x40);
#jl (".Ltail_4x");
vlm ($xt0,$xt3,"0($inp)");
vx ($xt0,$xt0,$xa0);
vx ($xt1,$xt1,$xb0);
vx ($xt2,$xt2,$xc0);
vx ($xt3,$xt3,$xd0);
vstm ($xt0,$xt3,"0($out)");
la ($inp,"0x40($inp)");
la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
#je (".Ldone_4x");
vaf ($xa0,$xa1,@K[0]);
vaf ($xb0,$xb1,@K[1]);
vaf ($xc0,$xc1,@K[2]);
vaf ($xd0,$xd1,@K[3]);
vperm ($xa0,$xa0,$xa0,$beperm);
vperm ($xb0,$xb0,$xb0,$beperm);
vperm ($xc0,$xc0,$xc0,$beperm);
vperm ($xd0,$xd0,$xd0,$beperm);
&{$z? \&clgfi:\&clfi} ($len,0x40);
jl (".Ltail_4x");
vlm ($xt0,$xt3,"0($inp)");
vx ($xt0,$xt0,$xa0);
vx ($xt1,$xt1,$xb0);
vx ($xt2,$xt2,$xc0);
vx ($xt3,$xt3,$xd0);
vstm ($xt0,$xt3,"0($out)");
la ($inp,"0x40($inp)");
la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
je (".Ldone_4x");
vaf ($xa0,$xa2,@K[0]);
vaf ($xb0,$xb2,@K[1]);
vaf ($xc0,$xc2,@K[2]);
vaf ($xd0,$xd2,@K[3]);
vperm ($xa0,$xa0,$xa0,$beperm);
vperm ($xb0,$xb0,$xb0,$beperm);
vperm ($xc0,$xc0,$xc0,$beperm);
vperm ($xd0,$xd0,$xd0,$beperm);
&{$z? \&clgfi:\&clfi} ($len,0x40);
jl (".Ltail_4x");
vlm ($xt0,$xt3,"0($inp)");
vx ($xt0,$xt0,$xa0);
vx ($xt1,$xt1,$xb0);
vx ($xt2,$xt2,$xc0);
vx ($xt3,$xt3,$xd0);
vstm ($xt0,$xt3,"0($out)");
la ($inp,"0x40($inp)");
la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
je (".Ldone_4x");
vaf ($xa0,$xa3,@K[0]);
vaf ($xb0,$xb3,@K[1]);
vaf ($xc0,$xc3,@K[2]);
vaf ($xd0,$xd3,@K[3]);
vperm ($xa0,$xa0,$xa0,$beperm);
vperm ($xb0,$xb0,$xb0,$beperm);
vperm ($xc0,$xc0,$xc0,$beperm);
vperm ($xd0,$xd0,$xd0,$beperm);
&{$z? \&clgfi:\&clfi} ($len,0x40);
jl (".Ltail_4x");
vlm ($xt0,$xt3,"0($inp)");
vx ($xt0,$xt0,$xa0);
vx ($xt1,$xt1,$xb0);
vx ($xt2,$xt2,$xc0);
vx ($xt3,$xt3,$xd0);
vstm ($xt0,$xt3,"0($out)");
#la $inp,0x40($inp));
#la $out,0x40($out));
#lhi %r0,10);
#&{$z? \&aghi:\&ahi} $len,-0x40);
#jne .Loop_outer_4x);
LABEL (".Ldone_4x");
if (!$z) {
ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
ld ("%f8","$stdframe+8*0($sp)");
ld ("%f9","$stdframe+8*1($sp)");
ld ("%f10","$stdframe+8*2($sp)");
ld ("%f11","$stdframe+8*3($sp)");
ld ("%f12","$stdframe+8*4($sp)");
ld ("%f13","$stdframe+8*5($sp)");
ld ("%f14","$stdframe+8*6($sp)");
ld ("%f15","$stdframe+8*7($sp)");
}
&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
la ($sp,"$FRAME($sp)");
br ("%r14");
ALIGN (16);
LABEL (".Ltail_4x");
if (!$z) {
vlr ($xt0,$xb0);
ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
vst ($xa0,"$stdframe+0x00($sp)");
vst ($xt0,"$stdframe+0x10($sp)");
vst ($xc0,"$stdframe+0x20($sp)");
vst ($xd0,"$stdframe+0x30($sp)");
} else {
vlr ($xt0,$xc0);
ld ("%f8","$stdframe+8*0($sp)");
ld ("%f9","$stdframe+8*1($sp)");
ld ("%f10","$stdframe+8*2($sp)");
ld ("%f11","$stdframe+8*3($sp)");
vlr ($xt1,$xd0);
ld ("%f12","$stdframe+8*4($sp)");
ld ("%f13","$stdframe+8*5($sp)");
ld ("%f14","$stdframe+8*6($sp)");
ld ("%f15","$stdframe+8*7($sp)");
vst ($xa0,"$stdframe+0x00($sp)");
vst ($xb0,"$stdframe+0x10($sp)");
vst ($xt0,"$stdframe+0x20($sp)");
vst ($xt1,"$stdframe+0x30($sp)");
}
lghi ("%r1",0);
LABEL (".Loop_tail_4x");
llgc ("%r5","0(%r1,$inp)");
llgc ("%r6","$stdframe(%r1,$sp)");
xr ("%r6","%r5");
stc ("%r6","0(%r1,$out)");
la ("%r1","1(%r1)");
brct ($len,".Loop_tail_4x");
&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
la ($sp,"$FRAME($sp)");
br ("%r14");
SIZE ("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x");
}
########################################################################
# The 6x"horizontal" layout is the optimal fit for the platform in its
# current shape, more specifically for the given vector instructions'
# latency. The computational part of an 8x"vertical" layout would be
# faster, but it consumes all the vector registers, and dealing with
# that would diminish the return...
#
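
# Each of the six blocks processed per outer iteration starts from the
# same sigma/key rows and a counter row K[3]+n (n=0..5); only the first
# 32-bit word differs, since the increment vectors loaded from .Lsigma
# are (1,0,0,0), (2,0,0,0) and (3,0,0,0).  A plain-Perl sketch of that
# setup, as an editorial aid only (an array ref stands in for the
# counter row):
sub _counter_rows_ref {
	my ($d,$n)=@_;			# $d = [ctr, nonce0, nonce1, nonce2]
	return map { [ ($d->[0]+$_)&0xffffffff, @{$d}[1..3] ] } 0..$n-1;
}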
{
my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
$a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
$a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23));
my @K=map("%v$_",(27,24..26));
my ($t0,$t1,$t2,$t3)=map("%v$_",27..30);
my $beperm="%v31";
my $FRAME=$stdframe + 4*16;
GLOBL ("ChaCha20_ctr32_vx");
ALIGN (32);
LABEL ("ChaCha20_ctr32_vx");
LABEL (".LChaCha20_ctr32_vx");
&{$z? \&clgfi:\&clfi} ($len,256);
jle (".LChaCha20_ctr32_4x");
&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
if (!$z) {
std ("%f4","16*$SIZE_T+2*8($sp)");
std ("%f6","16*$SIZE_T+3*8($sp)");
}
&{$z? \&lghi:\&lhi} ("%r1",-$FRAME);
lgr ("%r0",$sp);
la ($sp,"0(%r1,$sp)");
&{$z? \&stg:\&st} ("%r0","0($sp)"); # back-chain
if ($z) {
std ("%f8","$FRAME-8*8($sp)");
std ("%f9","$FRAME-8*7($sp)");
std ("%f10","$FRAME-8*6($sp)");
std ("%f11","$FRAME-8*5($sp)");
std ("%f12","$FRAME-8*4($sp)");
std ("%f13","$FRAME-8*3($sp)");
std ("%f14","$FRAME-8*2($sp)");
std ("%f15","$FRAME-8*1($sp)");
}
larl ("%r7",".Lsigma");
lhi ("%r0",10);
vlm (@K[1],@K[2],"0($key)"); # load key
vl (@K[3],"0($counter)"); # load counter
vlm (@K[0],"$beperm","0(%r7)"); # load sigma, increments, ...
LABEL (".Loop_outer_vx");
vlr ($a0,@K[0]);
vlr ($b0,@K[1]);
vlr ($a1,@K[0]);
vlr ($b1,@K[1]);
vlr ($a2,@K[0]);
vlr ($b2,@K[1]);
vlr ($a3,@K[0]);
vlr ($b3,@K[1]);
vlr ($a4,@K[0]);
vlr ($b4,@K[1]);
vlr ($a5,@K[0]);
vlr ($b5,@K[1]);
vlr ($d0,@K[3]);
vaf ($d1,@K[3],$t1); # K[3]+1
vaf ($d2,@K[3],$t2); # K[3]+2
vaf ($d3,@K[3],$t3); # K[3]+3
vaf ($d4,$d2,$t2); # K[3]+4
vaf ($d5,$d2,$t3); # K[3]+5
vlr ($c0,@K[2]);
vlr ($c1,@K[2]);
vlr ($c2,@K[2]);
vlr ($c3,@K[2]);
vlr ($c4,@K[2]);
vlr ($c5,@K[2]);
vlr ($t1,$d1);
vlr ($t2,$d2);
vlr ($t3,$d3);
ALIGN (4);
LABEL (".Loop_vx");
VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
$b0,$b1,$b2,$b3,$b4,$b5,
$c0,$c1,$c2,$c3,$c4,$c5,
$d0,$d1,$d2,$d3,$d4,$d5,
0);
VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
$b0,$b1,$b2,$b3,$b4,$b5,
$c0,$c1,$c2,$c3,$c4,$c5,
$d0,$d1,$d2,$d3,$d4,$d5,
1);
brct ("%r0",".Loop_vx");
vaf ($a0,$a0,@K[0]);
vaf ($b0,$b0,@K[1]);
vaf ($c0,$c0,@K[2]);
vaf ($d0,$d0,@K[3]);
vaf ($a1,$a1,@K[0]);
vaf ($d1,$d1,$t1); # +K[3]+1
vperm ($a0,$a0,$a0,$beperm);
vperm ($b0,$b0,$b0,$beperm);
vperm ($c0,$c0,$c0,$beperm);
vperm ($d0,$d0,$d0,$beperm);
&{$z? \&clgfi:\&clfi} ($len,0x40);
jl (".Ltail_vx");
vaf ($d2,$d2,$t2); # +K[3]+2
vaf ($d3,$d3,$t3); # +K[3]+3
vlm ($t0,$t3,"0($inp)");
vx ($a0,$a0,$t0);
vx ($b0,$b0,$t1);
vx ($c0,$c0,$t2);
vx ($d0,$d0,$t3);
vlm (@K[0],$t3,"0(%r7)"); # re-load sigma and increments
vstm ($a0,$d0,"0($out)");
la ($inp,"0x40($inp)");
la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
je (".Ldone_vx");
vaf ($b1,$b1,@K[1]);
vaf ($c1,$c1,@K[2]);
vperm ($a0,$a1,$a1,$beperm);
vperm ($b0,$b1,$b1,$beperm);
vperm ($c0,$c1,$c1,$beperm);
vperm ($d0,$d1,$d1,$beperm);
&{$z? \&clgfi:\&clfi} ($len,0x40);
jl (".Ltail_vx");
vlm ($a1,$d1,"0($inp)");
vx ($a0,$a0,$a1);
vx ($b0,$b0,$b1);
vx ($c0,$c0,$c1);
vx ($d0,$d0,$d1);
vstm ($a0,$d0,"0($out)");
la ($inp,"0x40($inp)");
la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
je (".Ldone_vx");
vaf ($a2,$a2,@K[0]);
vaf ($b2,$b2,@K[1]);
vaf ($c2,$c2,@K[2]);
vperm ($a0,$a2,$a2,$beperm);
vperm ($b0,$b2,$b2,$beperm);
vperm ($c0,$c2,$c2,$beperm);
vperm ($d0,$d2,$d2,$beperm);
&{$z? \&clgfi:\&clfi} ($len,0x40);
jl (".Ltail_vx");
vlm ($a1,$d1,"0($inp)");
vx ($a0,$a0,$a1);
vx ($b0,$b0,$b1);
vx ($c0,$c0,$c1);
vx ($d0,$d0,$d1);
vstm ($a0,$d0,"0($out)");
la ($inp,"0x40($inp)");
la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
je (".Ldone_vx");
vaf ($a3,$a3,@K[0]);
vaf ($b3,$b3,@K[1]);
vaf ($c3,$c3,@K[2]);
vaf ($d2,@K[3],$t3); # K[3]+3
vperm ($a0,$a3,$a3,$beperm);
vperm ($b0,$b3,$b3,$beperm);
vperm ($c0,$c3,$c3,$beperm);
vperm ($d0,$d3,$d3,$beperm);
&{$z? \&clgfi:\&clfi} ($len,0x40);
jl (".Ltail_vx");
vaf ($d3,$d2,$t1); # K[3]+4
vlm ($a1,$d1,"0($inp)");
vx ($a0,$a0,$a1);
vx ($b0,$b0,$b1);
vx ($c0,$c0,$c1);
vx ($d0,$d0,$d1);
vstm ($a0,$d0,"0($out)");
la ($inp,"0x40($inp)");
la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
je (".Ldone_vx");
vaf ($a4,$a4,@K[0]);
vaf ($b4,$b4,@K[1]);
vaf ($c4,$c4,@K[2]);
vaf ($d4,$d4,$d3); # +K[3]+4
vaf ($d3,$d3,$t1); # K[3]+5
vaf (@K[3],$d2,$t3); # K[3]+=6
vperm ($a0,$a4,$a4,$beperm);
vperm ($b0,$b4,$b4,$beperm);
vperm ($c0,$c4,$c4,$beperm);
vperm ($d0,$d4,$d4,$beperm);
&{$z? \&clgfi:\&clfi} ($len,0x40);
jl (".Ltail_vx");
vlm ($a1,$d1,"0($inp)");
vx ($a0,$a0,$a1);
vx ($b0,$b0,$b1);
vx ($c0,$c0,$c1);
vx ($d0,$d0,$d1);
vstm ($a0,$d0,"0($out)");
la ($inp,"0x40($inp)");
la ($out,"0x40($out)");
&{$z? \&aghi:\&ahi} ($len,-0x40);
je (".Ldone_vx");
vaf ($a5,$a5,@K[0]);
vaf ($b5,$b5,@K[1]);
vaf ($c5,$c5,@K[2]);
vaf ($d5,$d5,$d3); # +K[3]+5
vperm ($a0,$a5,$a5,$beperm);
vperm ($b0,$b5,$b5,$beperm);
vperm ($c0,$c5,$c5,$beperm);
vperm ($d0,$d5,$d5,$beperm);
&{$z? \&clgfi:\&clfi} ($len,0x40);
jl (".Ltail_vx");
vlm ($a1,$d1,"0($inp)");
vx ($a0,$a0,$a1);
vx ($b0,$b0,$b1);
vx ($c0,$c0,$c1);
vx ($d0,$d0,$d1);
vstm ($a0,$d0,"0($out)");
la ($inp,"0x40($inp)");
la ($out,"0x40($out)");
lhi ("%r0",10);
&{$z? \&aghi:\&ahi} ($len,-0x40);
jne (".Loop_outer_vx");
LABEL (".Ldone_vx");
if (!$z) {
ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
ld ("%f8","$FRAME-8*8($sp)");
ld ("%f9","$FRAME-8*7($sp)");
ld ("%f10","$FRAME-8*6($sp)");
ld ("%f11","$FRAME-8*5($sp)");
ld ("%f12","$FRAME-8*4($sp)");
ld ("%f13","$FRAME-8*3($sp)");
ld ("%f14","$FRAME-8*2($sp)");
ld ("%f15","$FRAME-8*1($sp)");
}
&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
la ($sp,"$FRAME($sp)");
br ("%r14");
ALIGN (16);
LABEL (".Ltail_vx");
if (!$z) {
ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
ld ("%f8","$FRAME-8*8($sp)");
ld ("%f9","$FRAME-8*7($sp)");
ld ("%f10","$FRAME-8*6($sp)");
ld ("%f11","$FRAME-8*5($sp)");
ld ("%f12","$FRAME-8*4($sp)");
ld ("%f13","$FRAME-8*3($sp)");
ld ("%f14","$FRAME-8*2($sp)");
ld ("%f15","$FRAME-8*1($sp)");
}
vstm ($a0,$d0,"$stdframe($sp)");
lghi ("%r1",0);
LABEL (".Loop_tail_vx");
llgc ("%r5","0(%r1,$inp)");
llgc ("%r6","$stdframe(%r1,$sp)");
xr ("%r6","%r5");
stc ("%r6","0(%r1,$out)");
la ("%r1","1(%r1)");
brct ($len,".Loop_tail_vx");
&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
la ($sp,"$FRAME($sp)");
br ("%r14");
SIZE ("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");
}
################
ALIGN (32);
LABEL (".Lsigma");
LONG (0x61707865,0x3320646e,0x79622d32,0x6b206574); # endian-neutral sigma
LONG (1,0,0,0);
LONG (2,0,0,0);
LONG (3,0,0,0);
LONG (0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c); # byte swap
LONG (0,1,2,3);
LONG (0x61707865,0x61707865,0x61707865,0x61707865); # smashed sigma
LONG (0x3320646e,0x3320646e,0x3320646e,0x3320646e);
LONG (0x79622d32,0x79622d32,0x79622d32,0x79622d32);
LONG (0x6b206574,0x6b206574,0x6b206574,0x6b206574);
ASCIZ ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
ALIGN (4);
PERLASM_END();