chacha-s390x.pl 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. #! /usr/bin/env perl
  2. # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # December 2015
  17. #
  18. # ChaCha20 for s390x.
  19. #
  20. # 3 times faster than compiler-generated code.
  21. $flavour = shift;
  22. if ($flavour =~ /3[12]/) {
  23. $SIZE_T=4;
  24. $g="";
  25. } else {
  26. $SIZE_T=8;
  27. $g="g";
  28. }
  29. while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
  30. open STDOUT,">$output";
  31. sub AUTOLOAD() # thunk [simplified] x86-style perlasm
  32. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  33. $code .= "\t$opcode\t".join(',',@_)."\n";
  34. }
  35. my $sp="%r15";
  36. my $stdframe=16*$SIZE_T+4*8;
  37. my $frame=$stdframe+4*20;
  38. my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
  39. my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
  40. my @t=map("%r$_",(8,9));
  41. sub ROUND {
  42. my ($a0,$b0,$c0,$d0)=@_;
  43. my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
  44. my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
  45. my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
  46. my ($xc,$xc_)=map("\"$_\"",@t);
  47. my @x=map("\"$_\"",@x);
  48. # Consider order in which variables are addressed by their
  49. # index:
  50. #
  51. # a b c d
  52. #
  53. # 0 4 8 12 < even round
  54. # 1 5 9 13
  55. # 2 6 10 14
  56. # 3 7 11 15
  57. # 0 5 10 15 < odd round
  58. # 1 6 11 12
  59. # 2 7 8 13
  60. # 3 4 9 14
  61. #
  62. # 'a', 'b' and 'd's are permanently allocated in registers,
  63. # @x[0..7,12..15], while 'c's are maintained in memory. If
  64. # you observe 'c' column, you'll notice that pair of 'c's is
  65. # invariant between rounds. This means that we have to reload
  66. # them once per round, in the middle. This is why you'll see
  67. # 'c' stores and loads in the middle, but none in the beginning
  68. # or end.
  69. (
  70. "&alr (@x[$a0],@x[$b0])", # Q1
  71. "&alr (@x[$a1],@x[$b1])", # Q2
  72. "&xr (@x[$d0],@x[$a0])",
  73. "&xr (@x[$d1],@x[$a1])",
  74. "&rll (@x[$d0],@x[$d0],16)",
  75. "&rll (@x[$d1],@x[$d1],16)",
  76. "&alr ($xc,@x[$d0])",
  77. "&alr ($xc_,@x[$d1])",
  78. "&xr (@x[$b0],$xc)",
  79. "&xr (@x[$b1],$xc_)",
  80. "&rll (@x[$b0],@x[$b0],12)",
  81. "&rll (@x[$b1],@x[$b1],12)",
  82. "&alr (@x[$a0],@x[$b0])",
  83. "&alr (@x[$a1],@x[$b1])",
  84. "&xr (@x[$d0],@x[$a0])",
  85. "&xr (@x[$d1],@x[$a1])",
  86. "&rll (@x[$d0],@x[$d0],8)",
  87. "&rll (@x[$d1],@x[$d1],8)",
  88. "&alr ($xc,@x[$d0])",
  89. "&alr ($xc_,@x[$d1])",
  90. "&xr (@x[$b0],$xc)",
  91. "&xr (@x[$b1],$xc_)",
  92. "&rll (@x[$b0],@x[$b0],7)",
  93. "&rll (@x[$b1],@x[$b1],7)",
  94. "&stm ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's
  95. "&lm ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')",
  96. "&alr (@x[$a2],@x[$b2])", # Q3
  97. "&alr (@x[$a3],@x[$b3])", # Q4
  98. "&xr (@x[$d2],@x[$a2])",
  99. "&xr (@x[$d3],@x[$a3])",
  100. "&rll (@x[$d2],@x[$d2],16)",
  101. "&rll (@x[$d3],@x[$d3],16)",
  102. "&alr ($xc,@x[$d2])",
  103. "&alr ($xc_,@x[$d3])",
  104. "&xr (@x[$b2],$xc)",
  105. "&xr (@x[$b3],$xc_)",
  106. "&rll (@x[$b2],@x[$b2],12)",
  107. "&rll (@x[$b3],@x[$b3],12)",
  108. "&alr (@x[$a2],@x[$b2])",
  109. "&alr (@x[$a3],@x[$b3])",
  110. "&xr (@x[$d2],@x[$a2])",
  111. "&xr (@x[$d3],@x[$a3])",
  112. "&rll (@x[$d2],@x[$d2],8)",
  113. "&rll (@x[$d3],@x[$d3],8)",
  114. "&alr ($xc,@x[$d2])",
  115. "&alr ($xc_,@x[$d3])",
  116. "&xr (@x[$b2],$xc)",
  117. "&xr (@x[$b3],$xc_)",
  118. "&rll (@x[$b2],@x[$b2],7)",
  119. "&rll (@x[$b3],@x[$b3],7)"
  120. );
  121. }
# First assembly template: entry of ChaCha20_ctr32 up to the .Loop label.
#
# Perl interpolates $g, $SIZE_T, $frame, $stdframe, the register names and
# the @x[...]/@t[...] elements while this here-document is appended to
# $code; the `...` expressions are left in place and are only evaluated by
# the output loop at the end of the script.
#
# What the emitted code does, in order:
#   - returns straight away when $len is zero;
#   - saves %r6-%r15, lowers $sp by $frame and stores the previous $sp
#     value at 0($sp);
#   - turns $out into the difference $out-$inp and $len into "end of input
#     minus 64", and saves both above the new frame;
#   - copies sigma, key and counter to $stdframe($sp) as the key schedule;
#   - .Loop_outer: loads the working state from that copy (x[8]-x[11] via
#     the @t registers and their offload slots), saves the block counter
#     and the input pointer, and loads 10 into %r14, the count that the
#     brct following the rounds decrements.
$code.=<<___;
.text
.globl ChaCha20_ctr32
.type ChaCha20_ctr32,\@function
.align 32
ChaCha20_ctr32:
lt${g}r $len,$len # $len==0?
bzr %r14
a${g}hi $len,-64
l${g}hi %r1,-$frame
stm${g} %r6,%r15,`6*$SIZE_T`($sp)
sl${g}r $out,$inp # difference
la $len,0($inp,$len) # end of input minus 64
larl %r7,.Lsigma
lgr %r0,$sp
la $sp,0(%r1,$sp)
st${g} %r0,0($sp)
lmg %r8,%r11,0($key) # load key
lmg %r12,%r13,0($counter) # load counter
lmg %r6,%r7,0(%r7) # load sigma constant
la %r14,0($inp)
st${g} $out,$frame+3*$SIZE_T($sp)
st${g} $len,$frame+4*$SIZE_T($sp)
stmg %r6,%r13,$stdframe($sp) # copy key schedule to stack
srlg @x[12],%r12,32 # 32-bit counter value
j .Loop_outer
.align 16
.Loop_outer:
lm @x[0],@x[7],$stdframe+4*0($sp) # load x[0]-x[7]
lm @t[0],@t[1],$stdframe+4*10($sp) # load x[10]-x[11]
lm @x[13],@x[15],$stdframe+4*13($sp) # load x[13]-x[15]
stm @t[0],@t[1],$stdframe+4*8+4*10($sp) # offload x[10]-x[11]
lm @t[0],@t[1],$stdframe+4*8($sp) # load x[8]-x[9]
st @x[12],$stdframe+4*12($sp) # save counter
st${g} %r14,$frame+2*$SIZE_T($sp) # save input pointer
lhi %r14,10
j .Loop
.align 4
.Loop:
___
  162. foreach (&ROUND(0, 4, 8,12)) { eval; }
  163. foreach (&ROUND(0, 5,10,15)) { eval; }
# Second assembly template: everything after the rounds of .Loop.
#
# What the emitted code does, in order:
#   - closes .Loop with brct on %r14 and reloads the input pointer;
#   - offloads x[8]-x[9] next to x[10]-x[11], adds the saved key schedule
#     to the working words held in registers and applies lrvr (load
#     reversed) to each of them;
#   - rebuilds the output pointer from the saved difference and branches
#     to .Ltail when the input pointer is beyond "end of input minus 64";
#   - otherwise xors the 16 words with the input and stores them,
#     interleaved with the same add/lrvr treatment of x[8]-x[11], bumps
#     the block counter by one and repeats .Loop_outer while input remains;
#   - .Ldone: overwrites stack words 4..19 of the key-schedule area with
#     zeros, restores %r6-%r15 and returns;
#   - .Ltail: writes the finished 64 key-stream bytes to the stack and
#     xors the remaining input with them one byte at a time.
#
# .Lsigma holds the four 32-bit constants loaded as "sigma" by the
# prologue, followed by an identification string.
$code.=<<___;
brct %r14,.Loop
l${g} %r14,$frame+2*$SIZE_T($sp) # pull input pointer
stm @t[0],@t[1],$stdframe+4*8+4*8($sp) # offload x[8]-x[9]
lm${g} @t[0],@t[1],$frame+3*$SIZE_T($sp)
al @x[0],$stdframe+4*0($sp) # accumulate key schedule
al @x[1],$stdframe+4*1($sp)
al @x[2],$stdframe+4*2($sp)
al @x[3],$stdframe+4*3($sp)
al @x[4],$stdframe+4*4($sp)
al @x[5],$stdframe+4*5($sp)
al @x[6],$stdframe+4*6($sp)
al @x[7],$stdframe+4*7($sp)
lrvr @x[0],@x[0]
lrvr @x[1],@x[1]
lrvr @x[2],@x[2]
lrvr @x[3],@x[3]
lrvr @x[4],@x[4]
lrvr @x[5],@x[5]
lrvr @x[6],@x[6]
lrvr @x[7],@x[7]
al @x[12],$stdframe+4*12($sp)
al @x[13],$stdframe+4*13($sp)
al @x[14],$stdframe+4*14($sp)
al @x[15],$stdframe+4*15($sp)
lrvr @x[12],@x[12]
lrvr @x[13],@x[13]
lrvr @x[14],@x[14]
lrvr @x[15],@x[15]
la @t[0],0(@t[0],%r14) # reconstruct output pointer
cl${g}r %r14,@t[1]
jh .Ltail
x @x[0],4*0(%r14) # xor with input
x @x[1],4*1(%r14)
st @x[0],4*0(@t[0]) # store output
x @x[2],4*2(%r14)
st @x[1],4*1(@t[0])
x @x[3],4*3(%r14)
st @x[2],4*2(@t[0])
x @x[4],4*4(%r14)
st @x[3],4*3(@t[0])
lm @x[0],@x[3],$stdframe+4*8+4*8($sp) # load x[8]-x[11]
x @x[5],4*5(%r14)
st @x[4],4*4(@t[0])
x @x[6],4*6(%r14)
al @x[0],$stdframe+4*8($sp)
st @x[5],4*5(@t[0])
x @x[7],4*7(%r14)
al @x[1],$stdframe+4*9($sp)
st @x[6],4*6(@t[0])
x @x[12],4*12(%r14)
al @x[2],$stdframe+4*10($sp)
st @x[7],4*7(@t[0])
x @x[13],4*13(%r14)
al @x[3],$stdframe+4*11($sp)
st @x[12],4*12(@t[0])
x @x[14],4*14(%r14)
st @x[13],4*13(@t[0])
x @x[15],4*15(%r14)
st @x[14],4*14(@t[0])
lrvr @x[0],@x[0]
st @x[15],4*15(@t[0])
lrvr @x[1],@x[1]
lrvr @x[2],@x[2]
lrvr @x[3],@x[3]
lhi @x[12],1
x @x[0],4*8(%r14)
al @x[12],$stdframe+4*12($sp) # increment counter
x @x[1],4*9(%r14)
st @x[0],4*8(@t[0])
x @x[2],4*10(%r14)
st @x[1],4*9(@t[0])
x @x[3],4*11(%r14)
st @x[2],4*10(@t[0])
st @x[3],4*11(@t[0])
cl${g}r %r14,@t[1] # done yet?
la %r14,64(%r14)
jl .Loop_outer
.Ldone:
xgr %r0,%r0
xgr %r1,%r1
xgr %r2,%r2
xgr %r3,%r3
stmg %r0,%r3,$stdframe+4*4($sp) # wipe key copy
stmg %r0,%r3,$stdframe+4*12($sp)
lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
br %r14
.align 16
.Ltail:
la @t[1],64($t[1])
stm @x[0],@x[7],$stdframe+4*0($sp)
sl${g}r @t[1],%r14
lm @x[0],@x[3],$stdframe+4*8+4*8($sp)
l${g}hi @x[6],0
stm @x[12],@x[15],$stdframe+4*12($sp)
al @x[0],$stdframe+4*8($sp)
al @x[1],$stdframe+4*9($sp)
al @x[2],$stdframe+4*10($sp)
al @x[3],$stdframe+4*11($sp)
lrvr @x[0],@x[0]
lrvr @x[1],@x[1]
lrvr @x[2],@x[2]
lrvr @x[3],@x[3]
stm @x[0],@x[3],$stdframe+4*8($sp)
.Loop_tail:
llgc @x[4],0(@x[6],%r14)
llgc @x[5],$stdframe(@x[6],$sp)
xr @x[5],@x[4]
stc @x[5],0(@x[6],@t[0])
la @x[6],1(@x[6])
brct @t[1],.Loop_tail
j .Ldone
.size ChaCha20_ctr32,.-ChaCha20_ctr32
.align 32
.Lsigma:
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
.asciz "ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
  283. foreach (split("\n",$code)) {
  284. s/\`([^\`]*)\`/eval $1/ge;
  285. print $_,"\n";
  286. }
  287. close STDOUT;