keccak1600-s390x.pl 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562
  1. #!/usr/bin/env perl
  2. # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # Keccak-1600 for s390x.
  17. #
  18. # June 2017.
  19. #
# The code below is a [lane complementing] KECCAK_2X implementation (see
# sha/keccak1600.c) with C[5] and D[5] held in the register bank. Though
# instead of actually unrolling the loop pair-wise I simply flip
# pointers to T[][] and A[][] at the end of each round. Since the number
# of rounds is even, the last round writes to A[][] and everything works
# out. In a nutshell it's a transliteration of the x86_64 module, because
# both architectures have similar capabilities/limitations. Performance
# measurement is problematic as I don't have access to an idle system.
# It looks like z13 processes one byte [out of a long message] in ~14
# cycles. At least the result is consistent with an estimate based on
# the number of instructions and the assumed instruction issue rate.
# It's ~2.5x faster than compiler-generated code.
  32. # $output is the last argument if it looks like a file (it has an extension)
  33. # $flavour is the first argument if it doesn't look like a file
  34. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  35. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  36. if ($flavour =~ /3[12]/) {
  37. $SIZE_T=4;
  38. $g="";
  39. } else {
  40. $SIZE_T=8;
  41. $g="g";
  42. }
  43. $output and open STDOUT,">$output";
  44. my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
  45. my @C = map("%r$_",(0,1,5..7));
  46. my @D = map("%r$_",(8..12));
  47. my @T = map("%r$_",(13..14));
  48. my ($src,$dst,$iotas) = map("%r$_",(2..4));
  49. my $sp = "%r15";
  50. $stdframe=16*$SIZE_T+4*8;
  51. $frame=$stdframe+25*8;
  52. my @rhotates = ([ 0, 1, 62, 28, 27 ],
  53. [ 36, 44, 6, 55, 20 ],
  54. [ 3, 10, 43, 25, 39 ],
  55. [ 41, 45, 15, 21, 8 ],
  56. [ 18, 2, 61, 56, 14 ]);
  57. { my @C = @C; # copy, because we mess them up...
  58. my @D = @D;
  59. $code.=<<___;
  60. .text
  61. .type __KeccakF1600,\@function
  62. .align 32
  63. __KeccakF1600:
  64. st${g} %r14,$SIZE_T*14($sp)
  65. lg @C[0],$A[4][0]($src)
  66. lg @C[1],$A[4][1]($src)
  67. lg @C[2],$A[4][2]($src)
  68. lg @C[3],$A[4][3]($src)
  69. lg @C[4],$A[4][4]($src)
  70. larl $iotas,iotas
  71. j .Loop
  72. .align 16
  73. .Loop:
  74. lg @D[0],$A[0][0]($src)
  75. lg @D[1],$A[1][1]($src)
  76. lg @D[2],$A[2][2]($src)
  77. lg @D[3],$A[3][3]($src)
  78. xgr @C[0],@D[0]
  79. xg @C[1],$A[0][1]($src)
  80. xg @C[2],$A[0][2]($src)
  81. xg @C[3],$A[0][3]($src)
  82. lgr @D[4],@C[4]
  83. xg @C[4],$A[0][4]($src)
  84. xg @C[0],$A[1][0]($src)
  85. xgr @C[1],@D[1]
  86. xg @C[2],$A[1][2]($src)
  87. xg @C[3],$A[1][3]($src)
  88. xg @C[4],$A[1][4]($src)
  89. xg @C[0],$A[2][0]($src)
  90. xg @C[1],$A[2][1]($src)
  91. xgr @C[2],@D[2]
  92. xg @C[3],$A[2][3]($src)
  93. xg @C[4],$A[2][4]($src)
  94. xg @C[0],$A[3][0]($src)
  95. xg @C[1],$A[3][1]($src)
  96. xg @C[2],$A[3][2]($src)
  97. xgr @C[3],@D[3]
  98. xg @C[4],$A[3][4]($src)
  99. lgr @T[0],@C[2]
  100. rllg @C[2],@C[2],1
  101. xgr @C[2],@C[0] # D[1] = ROL64(C[2], 1) ^ C[0]
  102. rllg @C[0],@C[0],1
  103. xgr @C[0],@C[3] # D[4] = ROL64(C[0], 1) ^ C[3]
  104. rllg @C[3],@C[3],1
  105. xgr @C[3],@C[1] # D[2] = ROL64(C[3], 1) ^ C[1]
  106. rllg @C[1],@C[1],1
  107. xgr @C[1],@C[4] # D[0] = ROL64(C[1], 1) ^ C[4]
  108. rllg @C[4],@C[4],1
  109. xgr @C[4],@T[0] # D[3] = ROL64(C[4], 1) ^ C[2]
  110. ___
  111. (@D[0..4], @C) = (@C[1..4,0], @D);
  112. $code.=<<___;
  113. xgr @C[1],@D[1]
  114. xgr @C[2],@D[2]
  115. xgr @C[3],@D[3]
  116. rllg @C[1],@C[1],$rhotates[1][1]
  117. xgr @C[4],@D[4]
  118. rllg @C[2],@C[2],$rhotates[2][2]
  119. xgr @C[0],@D[0]
  120. lgr @T[0],@C[1]
  121. ogr @C[1],@C[2]
  122. rllg @C[3],@C[3],$rhotates[3][3]
  123. xgr @C[1],@C[0] # C[0] ^ ( C[1] | C[2])
  124. rllg @C[4],@C[4],$rhotates[4][4]
  125. xg @C[1],0($iotas)
  126. la $iotas,8($iotas)
  127. stg @C[1],$A[0][0]($dst) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
  128. lgr @T[1],@C[4]
  129. ngr @C[4],@C[3]
  130. lghi @C[1],-1 # no 'not' instruction :-(
  131. xgr @C[4],@C[2] # C[2] ^ ( C[4] & C[3])
  132. xgr @C[2],@C[1] # not @C[2]
  133. stg @C[4],$A[0][2]($dst) # R[0][2] = C[2] ^ ( C[4] & C[3])
  134. ogr @C[2],@C[3]
  135. xgr @C[2],@T[0] # C[1] ^ (~C[2] | C[3])
  136. ngr @T[0],@C[0]
  137. stg @C[2],$A[0][1]($dst) # R[0][1] = C[1] ^ (~C[2] | C[3])
  138. xgr @T[0],@T[1] # C[4] ^ ( C[1] & C[0])
  139. ogr @T[1],@C[0]
  140. stg @T[0],$A[0][4]($dst) # R[0][4] = C[4] ^ ( C[1] & C[0])
  141. xgr @T[1],@C[3] # C[3] ^ ( C[4] | C[0])
  142. stg @T[1],$A[0][3]($dst) # R[0][3] = C[3] ^ ( C[4] | C[0])
  143. lg @C[0],$A[0][3]($src)
  144. lg @C[4],$A[4][2]($src)
  145. lg @C[3],$A[3][1]($src)
  146. lg @C[1],$A[1][4]($src)
  147. lg @C[2],$A[2][0]($src)
  148. xgr @C[0],@D[3]
  149. xgr @C[4],@D[2]
  150. rllg @C[0],@C[0],$rhotates[0][3]
  151. xgr @C[3],@D[1]
  152. rllg @C[4],@C[4],$rhotates[4][2]
  153. xgr @C[1],@D[4]
  154. rllg @C[3],@C[3],$rhotates[3][1]
  155. xgr @C[2],@D[0]
  156. lgr @T[0],@C[0]
  157. ogr @C[0],@C[4]
  158. rllg @C[1],@C[1],$rhotates[1][4]
  159. xgr @C[0],@C[3] # C[3] ^ (C[0] | C[4])
  160. rllg @C[2],@C[2],$rhotates[2][0]
  161. stg @C[0],$A[1][3]($dst) # R[1][3] = C[3] ^ (C[0] | C[4])
  162. lgr @T[1],@C[1]
  163. ngr @C[1],@T[0]
  164. lghi @C[0],-1 # no 'not' instruction :-(
  165. xgr @C[1],@C[4] # C[4] ^ (C[1] & C[0])
  166. xgr @C[4],@C[0] # not @C[4]
  167. stg @C[1],$A[1][4]($dst) # R[1][4] = C[4] ^ (C[1] & C[0])
  168. ogr @C[4],@C[3]
  169. xgr @C[4],@C[2] # C[2] ^ (~C[4] | C[3])
  170. ngr @C[3],@C[2]
  171. stg @C[4],$A[1][2]($dst) # R[1][2] = C[2] ^ (~C[4] | C[3])
  172. xgr @C[3],@T[1] # C[1] ^ (C[3] & C[2])
  173. ogr @T[1],@C[2]
  174. stg @C[3],$A[1][1]($dst) # R[1][1] = C[1] ^ (C[3] & C[2])
  175. xgr @T[1],@T[0] # C[0] ^ (C[1] | C[2])
  176. stg @T[1],$A[1][0]($dst) # R[1][0] = C[0] ^ (C[1] | C[2])
  177. lg @C[2],$A[2][3]($src)
  178. lg @C[3],$A[3][4]($src)
  179. lg @C[1],$A[1][2]($src)
  180. lg @C[4],$A[4][0]($src)
  181. lg @C[0],$A[0][1]($src)
  182. xgr @C[2],@D[3]
  183. xgr @C[3],@D[4]
  184. rllg @C[2],@C[2],$rhotates[2][3]
  185. xgr @C[1],@D[2]
  186. rllg @C[3],@C[3],$rhotates[3][4]
  187. xgr @C[4],@D[0]
  188. rllg @C[1],@C[1],$rhotates[1][2]
  189. xgr @C[0],@D[1]
  190. lgr @T[0],@C[2]
  191. ngr @C[2],@C[3]
  192. rllg @C[4],@C[4],$rhotates[4][0]
  193. xgr @C[2],@C[1] # C[1] ^ ( C[2] & C[3])
  194. lghi @T[1],-1 # no 'not' instruction :-(
  195. stg @C[2],$A[2][1]($dst) # R[2][1] = C[1] ^ ( C[2] & C[3])
  196. xgr @C[3],@T[1] # not @C[3]
  197. lgr @T[1],@C[4]
  198. ngr @C[4],@C[3]
  199. rllg @C[0],@C[0],$rhotates[0][1]
  200. xgr @C[4],@T[0] # C[2] ^ ( C[4] & ~C[3])
  201. ogr @T[0],@C[1]
  202. stg @C[4],$A[2][2]($dst) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
  203. xgr @T[0],@C[0] # C[0] ^ ( C[2] | C[1])
  204. ngr @C[1],@C[0]
  205. stg @T[0],$A[2][0]($dst) # R[2][0] = C[0] ^ ( C[2] | C[1])
  206. xgr @C[1],@T[1] # C[4] ^ ( C[1] & C[0])
  207. ogr @C[0],@T[1]
  208. stg @C[1],$A[2][4]($dst) # R[2][4] = C[4] ^ ( C[1] & C[0])
  209. xgr @C[0],@C[3] # ~C[3] ^ ( C[0] | C[4])
  210. stg @C[0],$A[2][3]($dst) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
  211. lg @C[2],$A[2][1]($src)
  212. lg @C[3],$A[3][2]($src)
  213. lg @C[1],$A[1][0]($src)
  214. lg @C[4],$A[4][3]($src)
  215. lg @C[0],$A[0][4]($src)
  216. xgr @C[2],@D[1]
  217. xgr @C[3],@D[2]
  218. rllg @C[2],@C[2],$rhotates[2][1]
  219. xgr @C[1],@D[0]
  220. rllg @C[3],@C[3],$rhotates[3][2]
  221. xgr @C[4],@D[3]
  222. rllg @C[1],@C[1],$rhotates[1][0]
  223. xgr @C[0],@D[4]
  224. rllg @C[4],@C[4],$rhotates[4][3]
  225. lgr @T[0],@C[2]
  226. ogr @C[2],@C[3]
  227. lghi @T[1],-1 # no 'not' instruction :-(
  228. xgr @C[2],@C[1] # C[1] ^ ( C[2] | C[3])
  229. xgr @C[3],@T[1] # not @C[3]
  230. stg @C[2],$A[3][1]($dst) # R[3][1] = C[1] ^ ( C[2] | C[3])
  231. lgr @T[1],@C[4]
  232. ogr @C[4],@C[3]
  233. rllg @C[0],@C[0],$rhotates[0][4]
  234. xgr @C[4],@T[0] # C[2] ^ ( C[4] | ~C[3])
  235. ngr @T[0],@C[1]
  236. stg @C[4],$A[3][2]($dst) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
  237. xgr @T[0],@C[0] # C[0] ^ ( C[2] & C[1])
  238. ogr @C[1],@C[0]
  239. stg @T[0],$A[3][0]($dst) # R[3][0] = C[0] ^ ( C[2] & C[1])
  240. xgr @C[1],@T[1] # C[4] ^ ( C[1] | C[0])
  241. ngr @C[0],@T[1]
  242. stg @C[1],$A[3][4]($dst) # R[3][4] = C[4] ^ ( C[1] | C[0])
  243. xgr @C[0],@C[3] # ~C[3] ^ ( C[0] & C[4])
  244. stg @C[0],$A[3][3]($dst) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
  245. xg @D[2],$A[0][2]($src)
  246. xg @D[3],$A[1][3]($src)
  247. xg @D[1],$A[4][1]($src)
  248. xg @D[4],$A[2][4]($src)
  249. xgr $dst,$src # xchg $dst,$src
  250. rllg @D[2],@D[2],$rhotates[0][2]
  251. xg @D[0],$A[3][0]($src)
  252. rllg @D[3],@D[3],$rhotates[1][3]
  253. xgr $src,$dst
  254. rllg @D[1],@D[1],$rhotates[4][1]
  255. xgr $dst,$src
  256. rllg @D[4],@D[4],$rhotates[2][4]
  257. ___
  258. @C = @D[2..4,0,1];
  259. $code.=<<___;
  260. lgr @T[0],@C[0]
  261. ngr @C[0],@C[1]
  262. lghi @T[1],-1 # no 'not' instruction :-(
  263. xgr @C[0],@C[4] # C[4] ^ ( C[0] & C[1])
  264. xgr @C[1],@T[1] # not @C[1]
  265. stg @C[0],$A[4][4]($src) # R[4][4] = C[4] ^ ( C[0] & C[1])
  266. lgr @T[1],@C[2]
  267. ngr @C[2],@C[1]
  268. rllg @D[0],@D[0],$rhotates[3][0]
  269. xgr @C[2],@T[0] # C[0] ^ ( C[2] & ~C[1])
  270. ogr @T[0],@C[4]
  271. stg @C[2],$A[4][0]($src) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
  272. xgr @T[0],@C[3] # C[3] ^ ( C[0] | C[4])
  273. ngr @C[4],@C[3]
  274. stg @T[0],$A[4][3]($src) # R[4][3] = C[3] ^ ( C[0] | C[4])
  275. xgr @C[4],@T[1] # C[2] ^ ( C[4] & C[3])
  276. ogr @C[3],@T[1]
  277. stg @C[4],$A[4][2]($src) # R[4][2] = C[2] ^ ( C[4] & C[3])
  278. xgr @C[3],@C[1] # ~C[1] ^ ( C[2] | C[3])
  279. lgr @C[1],@C[0] # harmonize with the loop top
  280. lgr @C[0],@T[0]
  281. stg @C[3],$A[4][1]($src) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
  282. tmll $iotas,255
  283. jnz .Loop
  284. l${g} %r14,$SIZE_T*14($sp)
  285. br %r14
  286. .size __KeccakF1600,.-__KeccakF1600
  287. ___
  288. }
  289. {
  290. $code.=<<___;
  291. .type KeccakF1600,\@function
  292. .align 32
  293. KeccakF1600:
  294. .LKeccakF1600:
  295. lghi %r1,-$frame
  296. stm${g} %r6,%r15,$SIZE_T*6($sp)
  297. lgr %r0,$sp
  298. la $sp,0(%r1,$sp)
  299. st${g} %r0,0($sp)
  300. lghi @D[0],-1 # no 'not' instruction :-(
  301. lghi @D[1],-1
  302. lghi @D[2],-1
  303. lghi @D[3],-1
  304. lghi @D[4],-1
  305. lghi @T[0],-1
  306. xg @D[0],$A[0][1]($src)
  307. xg @D[1],$A[0][2]($src)
  308. xg @D[2],$A[1][3]($src)
  309. xg @D[3],$A[2][2]($src)
  310. xg @D[4],$A[3][2]($src)
  311. xg @T[0],$A[4][0]($src)
  312. stmg @D[0],@D[1],$A[0][1]($src)
  313. stg @D[2],$A[1][3]($src)
  314. stg @D[3],$A[2][2]($src)
  315. stg @D[4],$A[3][2]($src)
  316. stg @T[0],$A[4][0]($src)
  317. la $dst,$stdframe($sp)
  318. bras %r14,__KeccakF1600
  319. lghi @D[0],-1 # no 'not' instruction :-(
  320. lghi @D[1],-1
  321. lghi @D[2],-1
  322. lghi @D[3],-1
  323. lghi @D[4],-1
  324. lghi @T[0],-1
  325. xg @D[0],$A[0][1]($src)
  326. xg @D[1],$A[0][2]($src)
  327. xg @D[2],$A[1][3]($src)
  328. xg @D[3],$A[2][2]($src)
  329. xg @D[4],$A[3][2]($src)
  330. xg @T[0],$A[4][0]($src)
  331. stmg @D[0],@D[1],$A[0][1]($src)
  332. stg @D[2],$A[1][3]($src)
  333. stg @D[3],$A[2][2]($src)
  334. stg @D[4],$A[3][2]($src)
  335. stg @T[0],$A[4][0]($src)
  336. lm${g} %r6,%r15,$frame+6*$SIZE_T($sp)
  337. br %r14
  338. .size KeccakF1600,.-KeccakF1600
  339. ___
  340. }
  341. { my ($A_flat,$inp,$len,$bsz) = map("%r$_",(2..5));
  342. $code.=<<___;
  343. .globl SHA3_absorb
  344. .type SHA3_absorb,\@function
  345. .align 32
  346. SHA3_absorb:
  347. lghi %r1,-$frame
  348. stm${g} %r5,%r15,$SIZE_T*5($sp)
  349. lgr %r0,$sp
  350. la $sp,0(%r1,$sp)
  351. st${g} %r0,0($sp)
  352. lghi @D[0],-1 # no 'not' instruction :-(
  353. lghi @D[1],-1
  354. lghi @D[2],-1
  355. lghi @D[3],-1
  356. lghi @D[4],-1
  357. lghi @T[0],-1
  358. xg @D[0],$A[0][1]($src)
  359. xg @D[1],$A[0][2]($src)
  360. xg @D[2],$A[1][3]($src)
  361. xg @D[3],$A[2][2]($src)
  362. xg @D[4],$A[3][2]($src)
  363. xg @T[0],$A[4][0]($src)
  364. stmg @D[0],@D[1],$A[0][1]($src)
  365. stg @D[2],$A[1][3]($src)
  366. stg @D[3],$A[2][2]($src)
  367. stg @D[4],$A[3][2]($src)
  368. stg @T[0],$A[4][0]($src)
  369. .Loop_absorb:
  370. cl${g}r $len,$bsz
  371. jl .Ldone_absorb
  372. srl${g} $bsz,3
  373. la %r1,0($A_flat)
  374. .Lblock_absorb:
  375. lrvg %r0,0($inp)
  376. la $inp,8($inp)
  377. xg %r0,0(%r1)
  378. a${g}hi $len,-8
  379. stg %r0,0(%r1)
  380. la %r1,8(%r1)
  381. brct $bsz,.Lblock_absorb
  382. stm${g} $inp,$len,$frame+3*$SIZE_T($sp)
  383. la $dst,$stdframe($sp)
  384. bras %r14,__KeccakF1600
  385. lm${g} $inp,$bsz,$frame+3*$SIZE_T($sp)
  386. j .Loop_absorb
  387. .align 16
  388. .Ldone_absorb:
  389. lghi @D[0],-1 # no 'not' instruction :-(
  390. lghi @D[1],-1
  391. lghi @D[2],-1
  392. lghi @D[3],-1
  393. lghi @D[4],-1
  394. lghi @T[0],-1
  395. xg @D[0],$A[0][1]($src)
  396. xg @D[1],$A[0][2]($src)
  397. xg @D[2],$A[1][3]($src)
  398. xg @D[3],$A[2][2]($src)
  399. xg @D[4],$A[3][2]($src)
  400. xg @T[0],$A[4][0]($src)
  401. stmg @D[0],@D[1],$A[0][1]($src)
  402. stg @D[2],$A[1][3]($src)
  403. stg @D[3],$A[2][2]($src)
  404. stg @D[4],$A[3][2]($src)
  405. stg @T[0],$A[4][0]($src)
  406. lgr %r2,$len # return value
  407. lm${g} %r6,%r15,$frame+6*$SIZE_T($sp)
  408. br %r14
  409. .size SHA3_absorb,.-SHA3_absorb
  410. ___
  411. }
  412. { my ($A_flat,$out,$len,$bsz) = map("%r$_",(2..5));
  413. $code.=<<___;
  414. .globl SHA3_squeeze
  415. .type SHA3_squeeze,\@function
  416. .align 32
  417. SHA3_squeeze:
  418. srl${g} $bsz,3
  419. st${g} %r14,2*$SIZE_T($sp)
  420. lghi %r14,8
  421. st${g} $bsz,5*$SIZE_T($sp)
  422. la %r1,0($A_flat)
  423. j .Loop_squeeze
  424. .align 16
  425. .Loop_squeeze:
  426. cl${g}r $len,%r14
  427. jl .Ltail_squeeze
  428. lrvg %r0,0(%r1)
  429. la %r1,8(%r1)
  430. stg %r0,0($out)
  431. la $out,8($out)
  432. a${g}hi $len,-8 # len -= 8
  433. jz .Ldone_squeeze
  434. brct $bsz,.Loop_squeeze # bsz--
  435. stm${g} $out,$len,3*$SIZE_T($sp)
  436. bras %r14,.LKeccakF1600
  437. lm${g} $out,$bsz,3*$SIZE_T($sp)
  438. lghi %r14,8
  439. la %r1,0($A_flat)
  440. j .Loop_squeeze
  441. .Ltail_squeeze:
  442. lg %r0,0(%r1)
  443. .Loop_tail_squeeze:
  444. stc %r0,0($out)
  445. la $out,1($out)
  446. srlg %r0,8
  447. brct $len,.Loop_tail_squeeze
  448. .Ldone_squeeze:
  449. l${g} %r14,2*$SIZE_T($sp)
  450. br %r14
  451. .size SHA3_squeeze,.-SHA3_squeeze
  452. ___
  453. }
  454. $code.=<<___;
  455. .align 256
  456. .quad 0,0,0,0,0,0,0,0
  457. .type iotas,\@object
  458. iotas:
  459. .quad 0x0000000000000001
  460. .quad 0x0000000000008082
  461. .quad 0x800000000000808a
  462. .quad 0x8000000080008000
  463. .quad 0x000000000000808b
  464. .quad 0x0000000080000001
  465. .quad 0x8000000080008081
  466. .quad 0x8000000000008009
  467. .quad 0x000000000000008a
  468. .quad 0x0000000000000088
  469. .quad 0x0000000080008009
  470. .quad 0x000000008000000a
  471. .quad 0x000000008000808b
  472. .quad 0x800000000000008b
  473. .quad 0x8000000000008089
  474. .quad 0x8000000000008003
  475. .quad 0x8000000000008002
  476. .quad 0x8000000000000080
  477. .quad 0x000000000000800a
  478. .quad 0x800000008000000a
  479. .quad 0x8000000080008081
  480. .quad 0x8000000000008080
  481. .quad 0x0000000080000001
  482. .quad 0x8000000080008008
  483. .size iotas,.-iotas
  484. .asciz "Keccak-1600 absorb and squeeze for s390x, CRYPTOGAMS by <appro\@openssl.org>"
  485. ___
  486. # unlike 32-bit shift 64-bit one takes three arguments
  487. $code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
  488. print $code;
  489. close STDOUT;