chacha-ia64.pl 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293
  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov, @dot-asm, initially for use with OpenSSL.
  5. # ====================================================================
  6. #
  7. # ChaCha20 for Itanium.
  8. #
  9. # March 2019
  10. #
  11. # Itanium 9xxx, which has a pair of shifters, manages to process one
  12. # byte in 9.3 cycles. This aligns perfectly with the theoretical estimate.
  13. # On the other hand, a pre-9000 CPU has a single shifter, and each extr/dep
  14. # pair below takes an additional cycle. The final input->xor->output
  15. # pass then runs slower than expected... The overall result is 15.6 cpb,
  16. # two cycles more than the theoretical estimate.
  # Take the last command-line argument as the output file name and, when it
  # is a true value, reopen STDOUT onto that file. The generated assembly is
  # accumulated in $code and printed to STDOUT at the end of the script.
  # NOTE(review): this is a two-argument open with no error check of its own;
  # presumably a failure only surfaces at the final "close STDOUT" -- confirm.
  17. $output = pop and open STDOUT, ">$output";
  # Symbolic register names that are interpolated into the assembly templates.
  # @k (r16-r31): the 16 input-block words (constants, key, counter); they are
  # loaded once, added to @x after the rounds and used to re-seed @x.
  18. my @k = map("r$_",(16..31));
  # @x (r38-r53): the 16 working state words that ROUND operates on.
  19. my @x = map("r$_",(38..53));
  # @y (r8-r11): scratch registers for the extracted/shifted parts of a word.
  20. my @y = map("r$_",(8..11));
  # @z (r15, r35-r37): temporaries. Note that r35 and r36 are also $key and
  # $counter below; the templates first write @z[1]/@z[2] only after the key
  # and counter words have been loaded.
  21. my @z = map("r$_",(15,35..37));
  # Registers r32-r36, in order: output pointer, input pointer, length,
  # key pointer, counter pointer.
  22. my ($out,$inp,$len,$key,$counter) = map("r$_",(32..36));
  # Emit the entry sequence of ChaCha20_ctr32 up to the .Loop_top label:
  #  - define ADDP as addp4 on HP-UX without _LP64, and as add otherwise,
  #    and apply it to the four pointer arguments;
  #  - save ar.pfs in r2, ar.lc in r3 and pr in r14;
  #  - put the four constant words into @k[0..3] with movl, load eight 32-bit
  #    words through $key (and a second pointer at $key+4) into @k[4..11], and
  #    four 32-bit words through $counter (and $counter+4) into @k[12..15];
  #  - copy @k[0..14] into @x[0..14];
  #  - at .Loop_outer: copy @k[15] into @x[15], set ar.lc=9 and ar.ec=1 for
  #    the br.ctop loop, set p6 from the unsigned compare 64 >= $len, and
  #    compute @z[1] = 64 - $len.
  23. $code.=<<___;
  24. #if defined(_HPUX_SOURCE)
  25. # if !defined(_LP64)
  26. # define ADDP addp4
  27. # else
  28. # define ADDP add
  29. # endif
  30. #else
  31. # define ADDP add
  32. #endif
  33. .text
  34. .global ChaCha20_ctr32#
  35. .proc ChaCha20_ctr32#
  36. .align 32
  37. ChaCha20_ctr32:
  38. .prologue
  39. .save ar.pfs,r2
  40. { .mmi; alloc r2=ar.pfs,5,17,0,0
  41. ADDP @k[11]=4,$key
  42. .save ar.lc,r3
  43. mov r3=ar.lc }
  44. { .mmi; ADDP $out=0,$out
  45. ADDP $inp=0,$inp }
  46. { .mmi; ADDP $key=0,$key
  47. ADDP $counter=0,$counter
  48. .save pr,r14
  49. mov r14=pr };;
  50. .body
  51. { .mlx; ld4 @k[4]=[$key],8
  52. movl @k[0]=0x61707865 }
  53. { .mlx; ld4 @k[5]=[@k[11]],8
  54. movl @k[1]=0x3320646e };;
  55. { .mlx; ld4 @k[6]=[$key],8
  56. movl @k[2]=0x79622d32 }
  57. { .mlx; ld4 @k[7]=[@k[11]],8
  58. movl @k[3]=0x6b206574 };;
  59. { .mmi; ld4 @k[8]=[$key],8
  60. ld4 @k[9]=[@k[11]],8
  61. add @k[15]=4,$counter };;
  62. { .mmi; ld4 @k[10]=[$key]
  63. ld4 @k[11]=[@k[11]]
  64. mov @x[0]=@k[0] };;
  65. { .mmi; ld4 @k[12]=[$counter],8
  66. ld4 @k[13]=[@k[15]],8
  67. mov @x[1]=@k[1] };;
  68. { .mmi; ld4 @k[14]=[$counter]
  69. ld4 @k[15]=[@k[15]]
  70. mov @x[2]=@k[2] }
  71. { .mmi; mov @x[3]=@k[3]
  72. mov @x[4]=@k[4]
  73. mov @x[5]=@k[5] };;
  74. { .mmi; mov @x[6]=@k[6]
  75. mov @x[7]=@k[7]
  76. mov @x[8]=@k[8] }
  77. { .mmi; mov @x[9]=@k[9]
  78. mov @x[10]=@k[10]
  79. mov @x[11]=@k[11] }
  80. { .mmi; mov @x[12]=@k[12]
  81. mov @x[13]=@k[13]
  82. mov @x[14]=@k[14] };;
  83. .Loop_outer:
  84. { .mii; mov @x[15]=@k[15]
  85. mov ar.lc=9
  86. mov ar.ec=1 }
  87. { .mmb; cmp.geu p6,p0=64,$len
  88. sub @z[1]=64,$len
  89. brp.loop.imp .Loop_top,.Loop_end-16 };;
  90. .Loop_top:
  91. ___
  # ROUND(a0, b0, c0, d0)
  #
  # Appends to $code the assembly for four interleaved ChaCha quarter-rounds.
  # The arguments are indices into @x naming the a/b/c/d words of the first
  # quarter-round; the indices of the other three are derived from them.
  # Each quarter-round is the add/xor/rotate sequence with left-rotate
  # amounts 16, 12, 8 and 7. The rotates are built from extr.u plus dep
  # (for 16) or extr.u plus dep.z plus or (for 12, 8 and 7), using @y as
  # scratch. The bundles of the four quarter-rounds are interleaved, so the
  # order of the template lines is significant.
  # Returns nothing useful; the only effect is the text appended to $code.
  92. sub ROUND {
  93. my ($a0,$b0,$c0,$d0)=@_;
  # Derive the next index by keeping the row ($_ & ~3) and advancing the
  # column by one modulo 4. For (0,4,8,12) this yields (1,5,9,13), (2,6,10,14)
  # and (3,7,11,15); for (0,5,10,15) it yields (1,6,11,12), (2,7,8,13) and
  # (3,4,9,14).
  94. my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
  95. my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
  96. my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
  # Main body shared by both invocations. It ends with the final rotate-by-7
  # results for b2 and b3 still split between @x and @y[2]/@y[3].
  97. $code.=<<___;
  98. { .mmi; add @x[$a0]=@x[$a0],@x[$b0]
  99. add @x[$a1]=@x[$a1],@x[$b1]
  100. add @x[$a2]=@x[$a2],@x[$b2] };;
  101. { .mmi; add @x[$a3]=@x[$a3],@x[$b3]
  102. xor @x[$d0]=@x[$d0],@x[$a0]
  103. xor @x[$d1]=@x[$d1],@x[$a1] };;
  104. { .mmi; xor @x[$d2]=@x[$d2],@x[$a2]
  105. xor @x[$d3]=@x[$d3],@x[$a3]
  106. extr.u @y[0]=@x[$d0],16,16 };;
  107. { .mii; extr.u @y[1]=@x[$d1],16,16
  108. dep @x[$d0]=@x[$d0],@y[0],16,16 };;
  109. { .mii; add @x[$c0]=@x[$c0],@x[$d0]
  110. extr.u @y[2]=@x[$d2],16,16
  111. dep @x[$d1]=@x[$d1],@y[1],16,16 };;
  112. { .mii; add @x[$c1]=@x[$c1],@x[$d1]
  113. xor @x[$b0]=@x[$b0],@x[$c0]
  114. extr.u @y[3]=@x[$d3],16,16 };;
  115. { .mii; xor @x[$b1]=@x[$b1],@x[$c1]
  116. dep @x[$d2]=@x[$d2],@y[2],16,16
  117. dep @x[$d3]=@x[$d3],@y[3],16,16 };;
  118. { .mmi; add @x[$c2]=@x[$c2],@x[$d2]
  119. add @x[$c3]=@x[$c3],@x[$d3]
  120. extr.u @y[0]=@x[$b0],20,12 };;
  121. { .mmi; xor @x[$b2]=@x[$b2],@x[$c2]
  122. xor @x[$b3]=@x[$b3],@x[$c3]
  123. dep.z @x[$b0]=@x[$b0],12,20 };;
  124. { .mii; or @x[$b0]=@x[$b0],@y[0]
  125. extr.u @y[1]=@x[$b1],20,12
  126. dep.z @x[$b1]=@x[$b1],12,20 };;
  127. { .mii; add @x[$a0]=@x[$a0],@x[$b0]
  128. extr.u @y[2]=@x[$b2],20,12
  129. extr.u @y[3]=@x[$b3],20,12 }
  130. { .mii; or @x[$b1]=@x[$b1],@y[1]
  131. dep.z @x[$b2]=@x[$b2],12,20
  132. dep.z @x[$b3]=@x[$b3],12,20 };;
  133. { .mmi; or @x[$b2]=@x[$b2],@y[2]
  134. or @x[$b3]=@x[$b3],@y[3]
  135. add @x[$a1]=@x[$a1],@x[$b1] };;
  136. { .mmi; add @x[$a2]=@x[$a2],@x[$b2]
  137. add @x[$a3]=@x[$a3],@x[$b3]
  138. xor @x[$d0]=@x[$d0],@x[$a0] };;
  139. { .mii; xor @x[$d1]=@x[$d1],@x[$a1]
  140. extr.u @y[0]=@x[$d0],24,8
  141. dep.z @x[$d0]=@x[$d0],8,24 };;
  142. { .mii; or @x[$d0]=@x[$d0],@y[0]
  143. extr.u @y[1]=@x[$d1],24,8
  144. dep.z @x[$d1]=@x[$d1],8,24 };;
  145. { .mmi; or @x[$d1]=@x[$d1],@y[1]
  146. xor @x[$d2]=@x[$d2],@x[$a2]
  147. xor @x[$d3]=@x[$d3],@x[$a3] };;
  148. { .mii; add @x[$c0]=@x[$c0],@x[$d0]
  149. extr.u @y[2]=@x[$d2],24,8
  150. dep.z @x[$d2]=@x[$d2],8,24 };;
  151. { .mii; xor @x[$b0]=@x[$b0],@x[$c0]
  152. extr.u @y[3]=@x[$d3],24,8
  153. dep.z @x[$d3]=@x[$d3],8,24 };;
  154. { .mmi; or @x[$d2]=@x[$d2],@y[2]
  155. or @x[$d3]=@x[$d3],@y[3]
  156. extr.u @y[0]=@x[$b0],25,7 };;
  157. { .mmi; add @x[$c1]=@x[$c1],@x[$d1]
  158. add @x[$c2]=@x[$c2],@x[$d2]
  159. dep.z @x[$b0]=@x[$b0],7,25 };;
  160. { .mmi; xor @x[$b1]=@x[$b1],@x[$c1]
  161. xor @x[$b2]=@x[$b2],@x[$c2]
  162. add @x[$c3]=@x[$c3],@x[$d3] };;
  163. { .mii; xor @x[$b3]=@x[$b3],@x[$c3]
  164. extr.u @y[1]=@x[$b1],25,7
  165. dep.z @x[$b1]=@x[$b1],7,25 };;
  166. { .mii; or @x[$b0]=@x[$b0],@y[0]
  167. extr.u @y[2]=@x[$b2],25,7
  168. dep.z @x[$b2]=@x[$b2],7,25 };;
  169. { .mii; or @x[$b1]=@x[$b1],@y[1]
  170. extr.u @y[3]=@x[$b3],25,7
  171. dep.z @x[$b3]=@x[$b3],7,25 };;
  172. ___
  # Closing bundle for the invocation with d0 == 12: finish the b2/b3 rotates
  # and use the free slot to set @z[0] to -1 (all ones).
  173. $code.=<<___ if ($d0 == 12);
  174. { .mmi; or @x[$b2]=@x[$b2],@y[2]
  175. or @x[$b3]=@x[$b3],@y[3]
  176. mov @z[0]=-1 };;
  177. ___
  # Closing bundle for the invocation with d0 == 15: finish the b2/b3 rotates
  # and branch back to .Loop_top with br.ctop.
  178. $code.=<<___ if ($d0 == 15);
  179. { .mmb; or @x[$b2]=@x[$b2],@y[2]
  180. or @x[$b3]=@x[$b3],@y[3]
  181. br.ctop.sptk .Loop_top };;
  182. ___
  183. }
  # Body of the .Loop_top loop: one round over the columns (0,4,8,12 and its
  # derived neighbours) followed by one round over the diagonals (0,5,10,15
  # and its derived neighbours). The second call also emits the br.ctop that
  # closes the loop.
  184. &ROUND(0, 4, 8, 12);
  185. &ROUND(0, 5, 10, 15);
  # Code following the loop:
  #  - add the saved input words @k[0..15] to the working state @x[0..15];
  #  - when p6 is set (64 >= $len), shift the all-ones value in @z[0] right
  #    by @z[1] = 64 - $len;
  #  - increment @k[12] (commented "next counter" in the template);
  #  - move @z[0] into the predicate registers with mask 0x1ffff, so that,
  #    per the template comment, each predicate bit corresponds to one byte
  #    to be processed;
  #  - start the byte pipeline for @x[0]: predicated ld1 of the first three
  #    input bytes, the shifts of @x[0] by 8, 16 and 24 into @y[1..3], and the
  #    xor for byte 0.
  186. $code.=<<___;
  187. .Loop_end:
  188. { .mmi; add @x[0]=@x[0],@k[0]
  189. add @x[1]=@x[1],@k[1]
  190. (p6) shr.u @z[0]=@z[0],@z[1] }
  191. { .mmb; add @x[2]=@x[2],@k[2]
  192. add @x[3]=@x[3],@k[3]
  193. clrrrb.pr };;
  194. { .mmi; add @x[4]=@x[4],@k[4]
  195. add @x[5]=@x[5],@k[5]
  196. add @x[6]=@x[6],@k[6] }
  197. { .mmi; add @x[7]=@x[7],@k[7]
  198. add @x[8]=@x[8],@k[8]
  199. add @x[9]=@x[9],@k[9] }
  200. { .mmi; add @x[10]=@x[10],@k[10]
  201. add @x[11]=@x[11],@k[11]
  202. add @x[12]=@x[12],@k[12] }
  203. { .mmi; add @x[13]=@x[13],@k[13]
  204. add @x[14]=@x[14],@k[14]
  205. add @x[15]=@x[15],@k[15] }
  206. { .mmi; add @k[12]=1,@k[12] // next counter
  207. mov pr=@z[0],0x1ffff };;
  208. //////////////////////////////////////////////////////////////////
  209. // Each predicate bit corresponds to byte to be processed. Note
  210. // that p0 is wired to 1, but it works out, because there always
  211. // is at least one byte to process...
  212. { .mmi; (p0) ld1 @z[0]=[$inp],1
  213. shr.u @y[1]=@x[0],8 };;
  214. { .mmi; (p1) ld1 @z[1]=[$inp],1
  215. (p2) shr.u @y[2]=@x[0],16 };;
  216. { .mmi; (p2) ld1 @z[2]=[$inp],1
  217. (p0) xor @z[0]=@z[0],@x[0]
  218. (p3) shr.u @y[3]=@x[0],24 };;
  219. ___
  # Emit the software-pipelined input->xor->output pass, four bytes per
  # iteration, for byte offsets $i0 = 0, 4, ..., 56. Every instruction is
  # guarded by the predicate whose number equals the byte offset it serves.
  # In one iteration the template stores output bytes $i0..$i0+3, loads input
  # bytes $i0+3..$i0+6 and extracts the bytes of the next state word @x[$k]
  # by shifting it right by 8, 16 and 24.
  220. for(my $i0=0; $i0<60; $i0+=4) {
  # Predicate numbers for the following seven byte offsets.
  221. my ($i1, $i2, $i3, $i4, $i5, $i6, $i7) = map($i0+$_,(1..7));
  # Index of the state word whose bytes are prepared in this iteration (1..15).
  # The previous word @x[$k-1] is reloaded from @k[$k-1] under p1 in the
  # template below.
  222. my $k = $i0/4+1;
  223. $code.=<<___;
  224. { .mmi; (p$i3) ld1 @z[3]=[$inp],1
  225. (p$i0) st1 [$out]=@z[0],1
  226. (p$i1) xor @z[1]=@z[1],@y[1] };;
  227. { .mmi; (p$i4) ld1 @z[0]=[$inp],1
  228. (p$i5) shr.u @y[1]=@x[$k],8 }
  229. { .mmi; (p$i1) st1 [$out]=@z[1],1
  230. (p$i2) xor @z[2]=@z[2],@y[2]
  231. (p1) mov @x[$k-1]=@k[$k-1] };;
  232. { .mfi; (p$i5) ld1 @z[1]=[$inp],1
  233. (p$i6) shr.u @y[2]=@x[$k],16 }
  234. { .mfi; (p$i2) st1 [$out]=@z[2],1
  235. (p$i3) xor @z[3]=@z[3],@y[3] };;
  236. { .mfi; (p$i6) ld1 @z[2]=[$inp],1
  237. (p$i7) shr.u @y[3]=@x[$k],24 }
  238. ___
  # First iteration only: the closing bundle additionally sets p1/p2 from the
  # unsigned compare 64 < $len. p1 later guards the branch back to
  # .Loop_outer and p2 guards the restore of ar.lc.
  239. $code.=<<___ if ($i0==0); # p1,p2 are available for reuse in first round
  240. { .mmi; (p$i3) st1 [$out]=@z[3],1
  241. (p$i4) xor @z[0]=@z[0],@x[$k]
  242. cmp.ltu p1,p2=64,$len };;
  243. ___
  # All later iterations: the same closing bundle without the compare.
  244. $code.=<<___ if ($i0>0);
  245. { .mfi; (p$i3) st1 [$out]=@z[3],1
  246. (p$i4) xor @z[0]=@z[0],@x[$k] };;
  247. ___
  248. }
  # Drain the byte pipeline and finish the function:
  #  - load/xor/store the remaining bytes guarded by p60..p63;
  #  - under p2, restore ar.lc from r3;
  #  - under p1, subtract 64 from $len and branch back to .Loop_outer;
  #  - otherwise zero @k[4..15] (commented "wipe key material" in the
  #    template), restore pr from r14 and return through b0.
  # The template ends with the .endp directive and an identification string.
  249. $code.=<<___;
  250. { .mmi; (p63) ld1 @z[3]=[$inp],1
  251. (p60) st1 [$out]=@z[0],1
  252. (p61) xor @z[1]=@z[1],@y[1] };;
  253. { .mmi; (p61) st1 [$out]=@z[1],1
  254. (p62) xor @z[2]=@z[2],@y[2] };;
  255. { .mmi; (p62) st1 [$out]=@z[2],1
  256. (p63) xor @z[3]=@z[3],@y[3]
  257. (p2) mov ar.lc=r3 };;
  258. { .mib; (p63) st1 [$out]=@z[3],1
  259. (p1) add $len=-64,$len
  260. (p1) br.dptk.many .Loop_outer };;
  261. { .mmi; mov @k[4]=0 // wipe key material
  262. mov @k[5]=0
  263. mov @k[6]=0 }
  264. { .mmi; mov @k[7]=0
  265. mov @k[8]=0
  266. mov @k[9]=0 }
  267. { .mmi; mov @k[10]=0
  268. mov @k[11]=0
  269. mov @k[12]=0 }
  270. { .mmi; mov @k[13]=0
  271. mov @k[14]=0
  272. mov @k[15]=0 }
  273. { .mib; mov pr=r14,0x1ffff
  274. br.ret.sptk.many b0 };;
  275. .endp ChaCha20_ctr32#
  276. stringz "ChaCha20 for IA64, CRYPTOGAMS by \@dot-asm"
  277. ___
  # Write the accumulated assembly to STDOUT (the output file when one was
  # given on the command line).
  278. print $code;
  # Die if closing STDOUT fails, so that a buffered write error is reported.
  279. close STDOUT or die "error closing STDOUT: $!";