poly1305-s390x.pl 24 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # This module implements Poly1305 hash for s390x.
  17. #
  18. # June 2015
  19. #
  20. # ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated
  21. # code. With older compilers the improvement coefficient is >3x, because
  22. # in that case base 2^64 and base 2^32 implementations are compared.
  23. #
  24. # On a side note, z13 enables a vector base 2^26 implementation...
  25. #
  26. # January 2019
  27. #
  28. # Add vx code path (base 2^26).
  29. #
  30. # Copyright IBM Corp. 2019
  31. # Author: Patrick Steuer <patrick.steuer@de.ibm.com>
  32. #
  33. # January 2019
  34. #
  35. # Add vector base 2^26 implementation. It's problematic to accurately
  36. # measure performance, because reference system is hardly idle. But
  37. # it's sub-cycle, i.e. less than 1 cycle per processed byte, and it's
  38. # >=20% faster than IBM's submission on long inputs, and much faster on
  39. # short ones, because calculation of key powers is postponed till we
  40. # know that input is long enough to justify the additional overhead.
  41. use strict;
  42. use FindBin qw($Bin);
  43. use lib "$Bin/../..";
  44. use perlasm::s390x qw(:DEFAULT :GE :EI :MI1 :VX AUTOLOAD LABEL INCLUDE);
  # Command line handling: the output file is the last argument when it
  # looks like a file name (it has an extension); the flavour is the first
  # argument when it does not look like a file name (it contains no dot).
  my $output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
  my $flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;

  # A flavour matching /3[12]/ selects the S/390 ABI ($z=0, $SIZE_T=4);
  # anything else selects the zSeries ABI ($z=1, $SIZE_T=8).
  my ($z, $SIZE_T) = ($flavour =~ /3[12]/) ? (0, 4) : (1, 8);

  # Amount by which __poly1305_blocks_vx lowers the stack pointer (the
  # zSeries variant adds another 8*8 bytes on top of it).
  my $stdframe = 16 * $SIZE_T + 4 * 8;
  my $sp       = "%r15";

  # %r2..%r5 hold the four arguments named in the C prototypes quoted
  # above each routine.
  my ($ctx, $inp, $len, $padbit) = map { "%r$_" } 2 .. 5;

  PERLASM_BEGIN($output);

  INCLUDE("s390x_arch.h");
  TEXT();
  ################
  # static void poly1305_init(void *ctx, const unsigned char key[16])
  #
  # Zeroes the hash value (ctx+0..23) and the is_base2_26 flag (ctx+24).
  # With a NULL key nothing else happens and %r2 is left at 0.  Otherwise
  # the key is loaded little-endian, masked with 0x0ffffffc0fffffff (first
  # half) and 0x0ffffffc0ffffffc (second half) and stored at ctx+32; the
  # addresses of the blocks routine (scalar or vx, chosen by the vx
  # capability bit) and of the emit routine are stored through %r4, and
  # %r2 is set to 1.
  # NOTE(review): the prototype above mentions neither the pointer expected
  # in %r4 nor the value left in %r2 -- confirm against the C caller.
  {
  my $func      = "poly1305_init";
  my $saved_ctx = "%r5";                # $ctx register is reused below

  # Pointer-sized operations differ between the two ABIs.
  my ($cmp_ptr, $xor_ptr, $and_ptr, $stm_ptr) =
      $z ? (\&clgr, \&xgr, \&ngr, \&stmg)
         : (\&clr,  \&xr,  \&nr,  \&stm);

  GLOBL   ($func);
  TYPE    ($func, "\@function");
  ALIGN   (16);
  LABEL   ($func);
      lghi    ("%r0", 0);
      lghi    ("%r1", -1);
      stg     ("%r0", "0($ctx)");       # zero hash value
      stg     ("%r0", "8($ctx)");
      stg     ("%r0", "16($ctx)");
      st      ("%r0", "24($ctx)");      # clear is_base2_26
      lgr     ($saved_ctx, $ctx);       # reassign $ctx
      lghi    ("%r2", 0);               # value left in %r2 if key is NULL

      $cmp_ptr->($inp, "%r0");          # key == NULL?
      je      (".Lno_key");

      lrvg    ("%r2", "0($inp)");       # load little-endian key
      lrvg    ("%r3", "8($inp)");

      nihl    ("%r1", 0xffc0);          # 0xffffffc0ffffffff
      srlg    ("%r0", "%r1", 4);        # 0x0ffffffc0fffffff
      srlg    ("%r1", "%r1", 4);
      nill    ("%r1", 0xfffc);          # 0x0ffffffc0ffffffc

      ngr     ("%r2", "%r0");           # apply the masks to the key
      ngr     ("%r3", "%r1");

      stmg    ("%r2", "%r3", "32($saved_ctx)");

      larl    ("%r1", "OPENSSL_s390xcap_P");
      lg      ("%r0", "16(%r1)");
      srlg    ("%r0", "%r0", 62);
      nill    ("%r0", 1);               # extract vx bit
      lcgr    ("%r0", "%r0");           # 0 or all-ones select mask
      larl    ("%r1", ".Lpoly1305_blocks");
      larl    ("%r2", ".Lpoly1305_blocks_vx");
      larl    ("%r3", ".Lpoly1305_emit");
      $xor_ptr->("%r2", "%r1");         # select between scalar and vector
      $and_ptr->("%r2", "%r0");
      $xor_ptr->("%r2", "%r1");
      $stm_ptr->("%r2", "%r3", "0(%r4)");   # store blocks/emit addresses
      lghi    ("%r2", 1);
  LABEL   (".Lno_key");
      br      ("%r14");
  SIZE    ($func, ".-$func");
  }
  ################
  # static void poly1305_blocks(void *ctx, const unsigned char *inp,
  #                             size_t len, u32 padbit)
  #
  # Scalar (base 2^64) code path.  Returns immediately when len is zero;
  # otherwise runs len>>4 iterations, each consuming 16 bytes of input:
  # the block (and padbit, into the top limb) is added to the hash, the
  # sum is multiplied by the key stored at ctx+32 and partially reduced.
  # The hash lives at ctx+0..23 and is written back at the end.
  # .Lpoly1305_blocks_entry is also entered from poly1305_blocks_vx with
  # the hash already in $h0/$h1/$h2 and %r6..%r14 already saved.
  {
  my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map { "%r$_" } 6 .. 14;
  my ($r0,$r1,$s1) = map { "%r$_" } 0 .. 2;

  my $func      = "poly1305_blocks";
  my $save_area = "6*$SIZE_T($sp)";     # slot for %r6..%r14
  my $ctx_slot  = "2*$SIZE_T($sp)";     # slot for $ctx while $s1 uses %r2

  # Pointer-/size_t-sized operations differ between the two ABIs.
  my ($load_test, $store_many, $load_many, $store_ptr, $load_ptr, $loop_on) =
      $z ? (\&ltgr, \&stmg, \&lmg, \&stg, \&lg, \&brctg)
         : (\&ltr,  \&stm,  \&lm,  \&st,  \&l,  \&brct);

  GLOBL   ($func);
  TYPE    ($func, "\@function");
  ALIGN   (16);
  LABEL   ($func);
  LABEL   (".L$func");
      $load_test->("%r0", $len);
      jz      (".Lno_data");

      $store_many->("%r6", "%r14", $save_area);

      lg      ($h0, "0($ctx)");         # load hash value
      lg      ($h1, "8($ctx)");
      lg      ($h2, "16($ctx)");

  LABEL   (".Lpoly1305_blocks_entry");
      if ($z) { srlg ($len, $len, 4); }     # number of 16-byte blocks
      else    { srl  ($len, 4);       }
      llgfr   ($padbit, $padbit);       # clear upper half, much needed with
                                        # non-64-bit ABI
      lg      ($r0, "32($ctx)");        # load key
      lg      ($r1, "40($ctx)");

      $store_ptr->($ctx, $ctx_slot);    # off-load $ctx
      srlg    ($s1, $r1, 2);
      algr    ($s1, $r1);               # s1 = r1 + r1>>2
      j       (".Loop");

  ALIGN   (16);
  LABEL   (".Loop");
      lrvg    ($d0lo, "0($inp)");       # load little-endian input
      lrvg    ($d1lo, "8($inp)");
      la      ($inp, "16($inp)");

      algr    ($d0lo, $h0);             # accumulate input
      alcgr   ($d1lo, $h1);
      alcgr   ($h2, $padbit);

      lgr     ($h0, $d0lo);
      mlgr    ($d0hi, $r0);             # h0*r0   -> $d0hi:$d0lo
      lgr     ($h1, $d1lo);
      mlgr    ($d1hi, $s1);             # h1*5*r1 -> $d1hi:$d1lo

      mlgr    ($t0, $r1);               # h0*r1   -> $t0:$h0
      mlgr    ($t1, $r0);               # h1*r0   -> $t1:$h1

      algr    ($d0lo, $d1lo);
      lgr     ($d1lo, $h2);
      alcgr   ($d0hi, $d1hi);
      lghi    ($d1hi, 0);

      algr    ($h1, $h0);
      alcgr   ($t1, $t0);

      msgr    ($d1lo, $s1);             # h2*s1
      msgr    ($h2, $r0);               # h2*r0

      algr    ($h1, $d1lo);
      alcgr   ($t1, $d1hi);             # $d1hi is zero

      algr    ($h1, $d0hi);
      alcgr   ($h2, $t1);

      lghi    ($h0, -4);                # final reduction step
      ngr     ($h0, $h2);
      srlg    ($t0, $h2, 2);
      algr    ($h0, $t0);
      lghi    ($t1, 3);
      ngr     ($h2, $t1);

      algr    ($h0, $d0lo);
      alcgr   ($h1, $d1hi);             # $d1hi is still zero
      alcgr   ($h2, $d1hi);             # $d1hi is still zero

      $loop_on->($len, ".Loop");

      $load_ptr->($ctx, $ctx_slot);     # restore $ctx

      stg     ($h0, "0($ctx)");         # store hash value
      stg     ($h1, "8($ctx)");
      stg     ($h2, "16($ctx)");

      $load_many->("%r6", "%r14", $save_area);
  LABEL   (".Lno_data");
      br      ("%r14");
  SIZE    ($func, ".-$func");
  }
  182. ################
  183. # static void poly1305_blocks_vx(void *ctx, const unsigned char *inp,
  184. # size_t len, u32 padbit)
  185. {
  # Register allocation for the vector code path.
  #   H0..H4     hash limbs             I0..I4   input limbs
  #   R0..R4     key power limbs        S1..S4   R1..R4 multiplied by 5
  #   ACC0..ACC4 product accumulators   T1..T4   raw input / scratch
  #   mask26, bswaplo/hi/mi             limb mask and byte swap masks
  my ($H0, $H1, $H2, $H3, $H4)           = map { "%v$_" } 0 .. 4;
  my ($I0, $I1, $I2, $I3, $I4)           = map { "%v$_" } 5 .. 9;
  my ($R0, $R1, $S1, $R2, $S2)           = map { "%v$_" } 10 .. 14;
  my ($R3, $S3, $R4, $S4)                = map { "%v$_" } 15 .. 18;
  my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map { "%v$_" } 19 .. 23;
  my ($T1, $T2, $T3, $T4)                = map { "%v$_" } 24 .. 27;
  my ($mask26, $bswaplo, $bswaphi, $bswapmi) = map { "%v$_" } 28 .. 31;

  # General registers; $h0/$h1/$h2 are %r11/%r13/%r14, the same registers
  # poly1305_blocks uses for the hash.
  my ($d2, $d0, $h0, $d1, $h1, $h2) = map { "%r$_" } 9 .. 14;
  # poly1305_blocks_vx: entry point of the vector code path.  Inputs of
  # 128 bytes or more are handed to __poly1305_blocks_vx.  Shorter inputs
  # are processed by the scalar loop: the hash is loaded, converted from
  # base 2^26 to base 2^64 when is_base2_26 (ctx+24) is set, the flag is
  # cleared, and control jumps to .Lpoly1305_blocks_entry.
  {
      my $func = "poly1305_blocks_vx";
      my ($cmp_len, $store_many) = $z ? (\&clgfi, \&stmg) : (\&clfi, \&stm);
      my @limbs = ([$h0, $d0], [$h1, $d1], [$h2, $d2]);

  TYPE    ($func, "\@function");
  ALIGN   (16);
  LABEL   ($func);
  LABEL   (".L$func");
      $cmp_len->($len, 128);
      jhe     ("__$func");

      $store_many->("%r6", "%r14", "6*$SIZE_T($sp)");

      lg      ($d0, "0($ctx)");
      lg      ($d1, "8($ctx)");
      lg      ($d2, "16($ctx)");

      llgfr   ("%r0", $d0);             # base 2^26 -> base 2^64
      srlg    ($h0, $d0, 32);
      llgfr   ("%r1", $d1);
      srlg    ($h1, $d1, 32);
      srlg    ($h2, $d2, 32);

      sllg    ("%r0", "%r0", 26);
      algr    ($h0, "%r0");
      sllg    ("%r0", $h1, 52);
      srlg    ($h1, $h1, 12);
      sllg    ("%r1", "%r1", 14);
      algr    ($h0, "%r0");
      alcgr   ($h1, "%r1");
      sllg    ("%r0", $h2, 40);
      srlg    ($h2, $h2, 24);
      lghi    ("%r1", 0);
      algr    ($h1, "%r0");
      alcgr   ($h2, "%r1");

      llgf    ("%r0", "24($ctx)");      # is_base2_26
      lcgr    ("%r0", "%r0");           # 0 or all-ones select mask

      # choose between radixes: h = ((h ^ d) & mask) ^ d for each limb
      xgr     (@$_)             for @limbs;
      ngr     ($_->[0], "%r0")  for @limbs;
      xgr     (@$_)             for @limbs;

      lhi     ("%r0", 0);
      st      ("%r0", "24($ctx)");      # clear is_base2_26

      j       (".Lpoly1305_blocks_entry");
  SIZE    ($func, ".-$func");
  }
  # __poly1305_mul: multiplies the hash limbs H0..H4 by the key limbs
  # R0..R4 (S1..S4 hold R1..R4 times 5) into ACC0..ACC4 and then performs
  # the lazy reduction, leaving the result in H0..H4.  Called with brasl
  # and returns through %r14.
  TYPE    ("__poly1305_mul", "\@function");
  ALIGN   (16);
  LABEL   ("__poly1305_mul");
  {
      my @hash = ($H0, $H1, $H2, $H3, $H4);
      my @r    = ($R0, $R1, $R2, $R3, $R4);
      my @s    = (undef, $S1, $S2, $S3, $S4);
      my @acc  = ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4);

      # ACC[j] accumulates H[i]*R[j-i] for j >= i and H[i]*S[5+j-i] for
      # j < i.  The H0 row initialises the accumulators, so it uses the
      # non-accumulating multiply.
      for my $i (0 .. 4) {
          for my $j (0 .. 4) {
              my $factor = $j >= $i ? $r[$j - $i] : $s[5 + $j - $i];
              if ($i == 0) {
                  vmlof   ($acc[$j], $hash[$i], $factor);
              }
              else {
                  vmalof  ($acc[$j], $hash[$i], $factor, $acc[$j]);
              }
          }
      }
  }

      ################################################################
      # lazy reduction

      vesrlg  ($H4, $ACC3, 26);
      vesrlg  ($H1, $ACC0, 26);
      vn      ($H3, $ACC3, $mask26);
      vn      ($H0, $ACC0, $mask26);
      vag     ($H4, $H4, $ACC4);        # h3 -> h4
      vag     ($H1, $H1, $ACC1);        # h0 -> h1

      vesrlg  ($ACC4, $H4, 26);
      vesrlg  ($ACC1, $H1, 26);
      vn      ($H4, $H4, $mask26);
      vn      ($H1, $H1, $mask26);
      vag     ($H0, $H0, $ACC4);
      vag     ($H2, $ACC2, $ACC1);      # h1 -> h2

      veslg   ($ACC4, $ACC4, 2);        # <<2
      vesrlg  ($ACC2, $H2, 26);
      vn      ($H2, $H2, $mask26);
      vag     ($H0, $H0, $ACC4);        # h4 -> h0
      vag     ($H3, $H3, $ACC2);        # h2 -> h3

      vesrlg  ($ACC0, $H0, 26);
      vesrlg  ($ACC3, $H3, 26);
      vn      ($H0, $H0, $mask26);
      vn      ($H3, $H3, $mask26);
      vag     ($H1, $H1, $ACC0);        # h0 -> h1
      vag     ($H4, $H4, $ACC3);        # h3 -> h4
      br      ("%r14");
  SIZE    ("__poly1305_mul", ".-__poly1305_mul");
  # __poly1305_blocks_vx: vector (base 2^26) code path, entered from
  # poly1305_blocks_vx when len >= 128.
  #
  #  1. Prologue: save %r10..%r15 and floating point registers, allocate
  #     a stack frame and load the constants from .Lconst.
  #  2. If is_base2_26 (ctx+24) is zero: convert the key to base 2^26,
  #     compute its powers with two calls to __poly1305_mul, convert the
  #     hash to base 2^26, set the flag and save the key schedule at
  #     ctx+48.  Otherwise (.Lskip_init) load the hash and the saved
  #     powers from ctx.
  #  3. .Loop_vx: each iteration multiplies the previously loaded 64
  #     bytes and loads/splits the next 64 bytes; (len>>6)-1 iterations.
  #  4. .Last: final multiplication pass, horizontal addition and lazy
  #     reduction.  If (-len)&0x30 is non-zero a last, partially
  #     redundant 64-byte block is loaded, masked and .Last runs once
  #     more.
  #  5. .Ldone: store the hash in base 2^26 and restore registers.
  TYPE    ("__poly1305_blocks_vx", "\@function");
  ALIGN   (16);
  LABEL   ("__poly1305_blocks_vx");
  &{$z ? \&lgr  : \&lr}  ("%r0", $sp);      # keep old $sp for back-chain
  &{$z ? \&stmg : \&stm} ("%r10", "%r15", "10*$SIZE_T($sp)");
  if (!$z) {
      std     ("%f4", "16*$SIZE_T+2*8($sp)");
      std     ("%f6", "16*$SIZE_T+3*8($sp)");
      ahi     ($sp, -$stdframe);
      st      ("%r0", "0($sp)");        # back-chain

      llgfr   ($len, $len);             # so that srlg works on $len
  } else {
      aghi    ($sp, "-($stdframe+8*8)");
      stg     ("%r0", "0($sp)");        # back-chain

      std     ("%f8",  "$stdframe+0*8($sp)");
      std     ("%f9",  "$stdframe+1*8($sp)");
      std     ("%f10", "$stdframe+2*8($sp)");
      std     ("%f11", "$stdframe+3*8($sp)");
      std     ("%f12", "$stdframe+4*8($sp)");
      std     ("%f13", "$stdframe+5*8($sp)");
      std     ("%f14", "$stdframe+6*8($sp)");
      std     ("%f15", "$stdframe+7*8($sp)");
  }
      larl    ("%r1", ".Lconst");
      vgmg    ($mask26, 38, 63);
      vlm     ($bswaplo, $bswapmi, "16(%r1)");

      # "lt" is a Perl operator, hence the explicit & call syntax
      &lt     ("%r0", "24($ctx)");      # is_base2_26?
      jnz     (".Lskip_init");

      lg      ($h0, "32($ctx)");        # load key base 2^64
      lg      ($h1, "40($ctx)");

      risbg   ($d0, $h0, 38, 0x80+63, 38);  # base 2^64 -> 2^26
      srlg    ($d1, $h0, 52);
      risbg   ($h0, $h0, 38, 0x80+63, 0);
      vlvgg   ($R0, $h0, 0);
      risbg   ($d1, $h1, 38, 51, 12);
      vlvgg   ($R1, $d0, 0);
      risbg   ($d0, $h1, 38, 63, 50);
      vlvgg   ($R2, $d1, 0);
      srlg    ($d1, $h1, 40);
      vlvgg   ($R3, $d0, 0);
      vlvgg   ($R4, $d1, 0);

      veslg   ($S1, $R1, 2);
      veslg   ($S2, $R2, 2);
      veslg   ($S3, $R3, 2);
      veslg   ($S4, $R4, 2);
      vlr     ($H0, $R0);
      vlr     ($H1, $R1);
      vlr     ($H2, $R2);
      vlr     ($H3, $R3);
      vlr     ($H4, $R4);
      vag     ($S1, $S1, $R1);          # * 5
      vag     ($S2, $S2, $R2);
      vag     ($S3, $S3, $R3);
      vag     ($S4, $S4, $R4);

      brasl   ("%r14", "__poly1305_mul");   # r^1:- * r^1:-

      vpdi    ($R0, $H0, $R0, 0);       # r^2:r^1
      vpdi    ($R1, $H1, $R1, 0);
      vpdi    ($R2, $H2, $R2, 0);
      vpdi    ($R3, $H3, $R3, 0);
      vpdi    ($R4, $H4, $R4, 0);
      vpdi    ($H0, $H0, $H0, 0);       # r^2:r^2
      vpdi    ($H1, $H1, $H1, 0);
      vpdi    ($H2, $H2, $H2, 0);
      vpdi    ($H3, $H3, $H3, 0);
      vpdi    ($H4, $H4, $H4, 0);
      veslg   ($S1, $R1, 2);
      veslg   ($S2, $R2, 2);
      veslg   ($S3, $R3, 2);
      veslg   ($S4, $R4, 2);
      vag     ($S1, $S1, $R1);          # * 5
      vag     ($S2, $S2, $R2);
      vag     ($S3, $S3, $R3);
      vag     ($S4, $S4, $R4);

      # operands passed separately, like the first call above
      brasl   ("%r14", "__poly1305_mul");   # r^2:r^2 * r^2:r^1

      vl      ($I0, "0(%r1)");          # borrow $I0
      vperm   ($R0, $R0, $H0, $I0);     # r^2:r^4:r^1:r^3
      vperm   ($R1, $R1, $H1, $I0);
      vperm   ($R2, $R2, $H2, $I0);
      vperm   ($R3, $R3, $H3, $I0);
      vperm   ($R4, $R4, $H4, $I0);
      veslf   ($S1, $R1, 2);
      veslf   ($S2, $R2, 2);
      veslf   ($S3, $R3, 2);
      veslf   ($S4, $R4, 2);
      vaf     ($S1, $S1, $R1);          # * 5
      vaf     ($S2, $S2, $R2);
      vaf     ($S3, $S3, $R3);
      vaf     ($S4, $S4, $R4);

      lg      ($h0, "0($ctx)");         # load hash base 2^64
      lg      ($h1, "8($ctx)");
      lg      ($h2, "16($ctx)");

      vzero   ($H0);
      vzero   ($H1);
      vzero   ($H2);
      vzero   ($H3);
      vzero   ($H4);

      risbg   ($d0, $h0, 38, 0x80+63, 38);  # base 2^64 -> 2^26
      srlg    ($d1, $h0, 52);
      risbg   ($h0, $h0, 38, 0x80+63, 0);
      vlvgg   ($H0, $h0, 0);
      risbg   ($d1, $h1, 38, 51, 12);
      vlvgg   ($H1, $d0, 0);
      risbg   ($d0, $h1, 38, 63, 50);
      vlvgg   ($H2, $d1, 0);
      srlg    ($d1, $h1, 40);
      vlvgg   ($H3, $d0, 0);
      risbg   ($d1, $h2, 37, 39, 24);
      vlvgg   ($H4, $d1, 0);

      lhi     ("%r0", 1);
      st      ("%r0", "24($ctx)");      # set is_base2_26

      vstm    ($R0, $S4, "48($ctx)");   # save key schedule base 2^26

      vpdi    ($R0, $R0, $R0, 0);       # broadcast r^2:r^4
      vpdi    ($R1, $R1, $R1, 0);
      vpdi    ($S1, $S1, $S1, 0);
      vpdi    ($R2, $R2, $R2, 0);
      vpdi    ($S2, $S2, $S2, 0);
      vpdi    ($R3, $R3, $R3, 0);
      vpdi    ($S3, $S3, $S3, 0);
      vpdi    ($R4, $R4, $R4, 0);
      vpdi    ($S4, $S4, $S4, 0);

      j       (".Loaded_hash");

  ALIGN   (16);
  LABEL   (".Lskip_init");
      vllezf  ($H0, "0($ctx)");         # load hash base 2^26
      vllezf  ($H1, "4($ctx)");
      vllezf  ($H2, "8($ctx)");
      vllezf  ($H3, "12($ctx)");
      vllezf  ($H4, "16($ctx)");

      vlrepg  ($R0, "0x30($ctx)");      # broadcast r^2:r^4
      vlrepg  ($R1, "0x40($ctx)");
      vlrepg  ($S1, "0x50($ctx)");
      vlrepg  ($R2, "0x60($ctx)");
      vlrepg  ($S2, "0x70($ctx)");
      vlrepg  ($R3, "0x80($ctx)");
      vlrepg  ($S3, "0x90($ctx)");
      vlrepg  ($R4, "0xa0($ctx)");
      vlrepg  ($S4, "0xb0($ctx)");

  LABEL   (".Loaded_hash");
      vzero   ($I1);
      vzero   ($I3);

      vlm     ($T1, $T4, "0x00($inp)"); # load first input block
      la      ($inp, "0x40($inp)");
      vgmg    ($mask26, 6, 31);
      vgmf    ($I4, 5, 5);              # padbit<<2

      vperm   ($I0, $T3, $T4, $bswaplo);
      vperm   ($I2, $T3, $T4, $bswapmi);
      vperm   ($T3, $T3, $T4, $bswaphi);

      verimg  ($I1, $I0, $mask26, 6);   # >>26
      veslg   ($I0, $I0, 32);
      veslg   ($I2, $I2, 28);           # >>4
      verimg  ($I3, $T3, $mask26, 18);  # >>14
      verimg  ($I4, $T3, $mask26, 58);  # >>38
      vn      ($I0, $I0, $mask26);
      vn      ($I2, $I2, $mask26);
      vesrlf  ($I4, $I4, 2);            # >>2

      vgmg    ($mask26, 38, 63);
      vperm   ($T3, $T1, $T2, $bswaplo);
      vperm   ($T4, $T1, $T2, $bswaphi);
      vperm   ($T2, $T1, $T2, $bswapmi);

      verimg  ($I0, $T3, $mask26, 0);
      verimg  ($I1, $T3, $mask26, 38);  # >>26
      verimg  ($I2, $T2, $mask26, 60);  # >>4
      verimg  ($I3, $T4, $mask26, 50);  # >>14
      vesrlg  ($T4, $T4, 40);
      vo      ($I4, $I4, $T4);

      srlg    ("%r0", $len, 6);         # loop counter: (len>>6)-1
  &{$z ? \&aghi : \&ahi} ("%r0", -1);

  ALIGN   (16);
  LABEL   (".Loop_vx");
      vmlef   ($ACC0, $I0, $R0);
      vmlef   ($ACC1, $I0, $R1);
      vmlef   ($ACC2, $I0, $R2);
      vmlef   ($ACC3, $I0, $R3);
      vmlef   ($ACC4, $I0, $R4);

      vmalef  ($ACC0, $I1, $S4, $ACC0);
      vmalef  ($ACC1, $I1, $R0, $ACC1);
      vmalef  ($ACC2, $I1, $R1, $ACC2);
      vmalef  ($ACC3, $I1, $R2, $ACC3);
      vmalef  ($ACC4, $I1, $R3, $ACC4);

       vaf    ($H2, $H2, $I2);
       vaf    ($H0, $H0, $I0);
       vaf    ($H3, $H3, $I3);
       vaf    ($H1, $H1, $I1);
       vaf    ($H4, $H4, $I4);

      vmalef  ($ACC0, $I2, $S3, $ACC0);
      vmalef  ($ACC1, $I2, $S4, $ACC1);
      vmalef  ($ACC2, $I2, $R0, $ACC2);
      vmalef  ($ACC3, $I2, $R1, $ACC3);
      vmalef  ($ACC4, $I2, $R2, $ACC4);

       vlm    ($T1, $T4, "0x00($inp)"); # load next input block
       la     ($inp, "0x40($inp)");
       vgmg   ($mask26, 6, 31);

      vmalef  ($ACC0, $I3, $S2, $ACC0);
      vmalef  ($ACC1, $I3, $S3, $ACC1);
      vmalef  ($ACC2, $I3, $S4, $ACC2);
      vmalef  ($ACC3, $I3, $R0, $ACC3);
      vmalef  ($ACC4, $I3, $R1, $ACC4);

       vperm  ($I0, $T3, $T4, $bswaplo);
       vperm  ($I2, $T3, $T4, $bswapmi);
       vperm  ($T3, $T3, $T4, $bswaphi);

      vmalef  ($ACC0, $I4, $S1, $ACC0);
      vmalef  ($ACC1, $I4, $S2, $ACC1);
      vmalef  ($ACC2, $I4, $S3, $ACC2);
      vmalef  ($ACC3, $I4, $S4, $ACC3);
      vmalef  ($ACC4, $I4, $R0, $ACC4);

       verimg ($I1, $I0, $mask26, 6);   # >>26
       veslg  ($I0, $I0, 32);
       veslg  ($I2, $I2, 28);           # >>4
       verimg ($I3, $T3, $mask26, 18);  # >>14

      vmalof  ($ACC0, $H0, $R0, $ACC0);
      vmalof  ($ACC1, $H0, $R1, $ACC1);
      vmalof  ($ACC2, $H0, $R2, $ACC2);
      vmalof  ($ACC3, $H0, $R3, $ACC3);
      vmalof  ($ACC4, $H0, $R4, $ACC4);

       vgmf   ($I4, 5, 5);              # padbit<<2
       verimg ($I4, $T3, $mask26, 58);  # >>38
       vn     ($I0, $I0, $mask26);
       vn     ($I2, $I2, $mask26);
       vesrlf ($I4, $I4, 2);            # >>2

      vmalof  ($ACC0, $H1, $S4, $ACC0);
      vmalof  ($ACC1, $H1, $R0, $ACC1);
      vmalof  ($ACC2, $H1, $R1, $ACC2);
      vmalof  ($ACC3, $H1, $R2, $ACC3);
      vmalof  ($ACC4, $H1, $R3, $ACC4);

       vgmg   ($mask26, 38, 63);
       vperm  ($T3, $T1, $T2, $bswaplo);
       vperm  ($T4, $T1, $T2, $bswaphi);
       vperm  ($T2, $T1, $T2, $bswapmi);

      vmalof  ($ACC0, $H2, $S3, $ACC0);
      vmalof  ($ACC1, $H2, $S4, $ACC1);
      vmalof  ($ACC2, $H2, $R0, $ACC2);
      vmalof  ($ACC3, $H2, $R1, $ACC3);
      vmalof  ($ACC4, $H2, $R2, $ACC4);

       verimg ($I0, $T3, $mask26, 0);
       verimg ($I1, $T3, $mask26, 38);  # >>26
       verimg ($I2, $T2, $mask26, 60);  # >>4

      vmalof  ($ACC0, $H3, $S2, $ACC0);
      vmalof  ($ACC1, $H3, $S3, $ACC1);
      vmalof  ($ACC2, $H3, $S4, $ACC2);
      vmalof  ($ACC3, $H3, $R0, $ACC3);
      vmalof  ($ACC4, $H3, $R1, $ACC4);

       verimg ($I3, $T4, $mask26, 50);  # >>14
       vesrlg ($T4, $T4, 40);
       vo     ($I4, $I4, $T4);

      vmalof  ($ACC0, $H4, $S1, $ACC0);
      vmalof  ($ACC1, $H4, $S2, $ACC1);
      vmalof  ($ACC2, $H4, $S3, $ACC2);
      vmalof  ($ACC3, $H4, $S4, $ACC3);
      vmalof  ($ACC4, $H4, $R0, $ACC4);

      ################################################################
      # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
      # and P. Schwabe

      vesrlg  ($H4, $ACC3, 26);
      vesrlg  ($H1, $ACC0, 26);
      vn      ($H3, $ACC3, $mask26);
      vn      ($H0, $ACC0, $mask26);
      vag     ($H4, $H4, $ACC4);        # h3 -> h4
      vag     ($H1, $H1, $ACC1);        # h0 -> h1

      vesrlg  ($ACC4, $H4, 26);
      vesrlg  ($ACC1, $H1, 26);
      vn      ($H4, $H4, $mask26);
      vn      ($H1, $H1, $mask26);
      vag     ($H0, $H0, $ACC4);
      vag     ($H2, $ACC2, $ACC1);      # h1 -> h2

      veslg   ($ACC4, $ACC4, 2);        # <<2
      vesrlg  ($ACC2, $H2, 26);
      vn      ($H2, $H2, $mask26);
      vag     ($H0, $H0, $ACC4);        # h4 -> h0
      vag     ($H3, $H3, $ACC2);        # h2 -> h3

      vesrlg  ($ACC0, $H0, 26);
      vesrlg  ($ACC3, $H3, 26);
      vn      ($H0, $H0, $mask26);
      vn      ($H3, $H3, $mask26);
      vag     ($H1, $H1, $ACC0);        # h0 -> h1
      vag     ($H4, $H4, $ACC3);        # h3 -> h4

  &{$z ? \&brctg : \&brct} ("%r0", ".Loop_vx");

      vlm     ($R0, $S4, "48($ctx)");   # load all powers

      # $len = (-$len) & 0x30; $inp is moved back by that amount before
      # the tail load below (whose redundant lanes are masked off)
      lghi    ("%r0", 0x30);
  &{$z ? \&lcgr : \&lcr} ($len, $len);
  &{$z ? \&ngr  : \&nr}  ($len, "%r0");
  &{$z ? \&slgr : \&slr} ($inp, $len);

  LABEL   (".Last");
      vmlef   ($ACC0, $I0, $R0);
      vmlef   ($ACC1, $I0, $R1);
      vmlef   ($ACC2, $I0, $R2);
      vmlef   ($ACC3, $I0, $R3);
      vmlef   ($ACC4, $I0, $R4);

      vmalef  ($ACC0, $I1, $S4, $ACC0);
      vmalef  ($ACC1, $I1, $R0, $ACC1);
      vmalef  ($ACC2, $I1, $R1, $ACC2);
      vmalef  ($ACC3, $I1, $R2, $ACC3);
      vmalef  ($ACC4, $I1, $R3, $ACC4);

       vaf    ($H0, $H0, $I0);
       vaf    ($H1, $H1, $I1);
       vaf    ($H2, $H2, $I2);
       vaf    ($H3, $H3, $I3);
       vaf    ($H4, $H4, $I4);

      vmalef  ($ACC0, $I2, $S3, $ACC0);
      vmalef  ($ACC1, $I2, $S4, $ACC1);
      vmalef  ($ACC2, $I2, $R0, $ACC2);
      vmalef  ($ACC3, $I2, $R1, $ACC3);
      vmalef  ($ACC4, $I2, $R2, $ACC4);

      vmalef  ($ACC0, $I3, $S2, $ACC0);
      vmalef  ($ACC1, $I3, $S3, $ACC1);
      vmalef  ($ACC2, $I3, $S4, $ACC2);
      vmalef  ($ACC3, $I3, $R0, $ACC3);
      vmalef  ($ACC4, $I3, $R1, $ACC4);

      vmalef  ($ACC0, $I4, $S1, $ACC0);
      vmalef  ($ACC1, $I4, $S2, $ACC1);
      vmalef  ($ACC2, $I4, $S3, $ACC2);
      vmalef  ($ACC3, $I4, $S4, $ACC3);
      vmalef  ($ACC4, $I4, $R0, $ACC4);

      vmalof  ($ACC0, $H0, $R0, $ACC0);
      vmalof  ($ACC1, $H0, $R1, $ACC1);
      vmalof  ($ACC2, $H0, $R2, $ACC2);
      vmalof  ($ACC3, $H0, $R3, $ACC3);
      vmalof  ($ACC4, $H0, $R4, $ACC4);

      vmalof  ($ACC0, $H1, $S4, $ACC0);
      vmalof  ($ACC1, $H1, $R0, $ACC1);
      vmalof  ($ACC2, $H1, $R1, $ACC2);
      vmalof  ($ACC3, $H1, $R2, $ACC3);
      vmalof  ($ACC4, $H1, $R3, $ACC4);

      vmalof  ($ACC0, $H2, $S3, $ACC0);
      vmalof  ($ACC1, $H2, $S4, $ACC1);
      vmalof  ($ACC2, $H2, $R0, $ACC2);
      vmalof  ($ACC3, $H2, $R1, $ACC3);
      vmalof  ($ACC4, $H2, $R2, $ACC4);

      vmalof  ($ACC0, $H3, $S2, $ACC0);
      vmalof  ($ACC1, $H3, $S3, $ACC1);
      vmalof  ($ACC2, $H3, $S4, $ACC2);
      vmalof  ($ACC3, $H3, $R0, $ACC3);
      vmalof  ($ACC4, $H3, $R1, $ACC4);

      vmalof  ($ACC0, $H4, $S1, $ACC0);
      vmalof  ($ACC1, $H4, $S2, $ACC1);
      vmalof  ($ACC2, $H4, $S3, $ACC2);
      vmalof  ($ACC3, $H4, $S4, $ACC3);
      vmalof  ($ACC4, $H4, $R0, $ACC4);

      ################################################################
      # horizontal addition

      vzero   ($H0);
      vsumqg  ($ACC0, $ACC0, $H0);
      vsumqg  ($ACC1, $ACC1, $H0);
      vsumqg  ($ACC2, $ACC2, $H0);
      vsumqg  ($ACC3, $ACC3, $H0);
      vsumqg  ($ACC4, $ACC4, $H0);

      ################################################################
      # lazy reduction

      vesrlg  ($H4, $ACC3, 26);
      vesrlg  ($H1, $ACC0, 26);
      vn      ($H3, $ACC3, $mask26);
      vn      ($H0, $ACC0, $mask26);
      vag     ($H4, $H4, $ACC4);        # h3 -> h4
      vag     ($H1, $H1, $ACC1);        # h0 -> h1

      vesrlg  ($ACC4, $H4, 26);
      vesrlg  ($ACC1, $H1, 26);
      vn      ($H4, $H4, $mask26);
      vn      ($H1, $H1, $mask26);
      vag     ($H0, $H0, $ACC4);
      vag     ($H2, $ACC2, $ACC1);      # h1 -> h2

      veslg   ($ACC4, $ACC4, 2);        # <<2
      vesrlg  ($ACC2, $H2, 26);
      vn      ($H2, $H2, $mask26);
      vag     ($H0, $H0, $ACC4);        # h4 -> h0
      vag     ($H3, $H3, $ACC2);        # h2 -> h3

      vesrlg  ($ACC0, $H0, 26);
      vesrlg  ($ACC3, $H3, 26);
      vn      ($H0, $H0, $mask26);
      vn      ($H3, $H3, $mask26);
      vag     ($H1, $H1, $ACC0);        # h0 -> h1
      vag     ($H4, $H4, $ACC3);        # h3 -> h4

  &{$z ? \&clgfi : \&clfi} ($len, 0);   # any tail left to process?
      je      (".Ldone");

      vlm     ($T1, $T4, "0x00($inp)"); # load last partial block
      vgmg    ($mask26, 6, 31);
      vgmf    ($I4, 5, 5);              # padbit<<2

      vperm   ($I0, $T3, $T4, $bswaplo);
      vperm   ($I2, $T3, $T4, $bswapmi);
      vperm   ($T3, $T3, $T4, $bswaphi);

      vl      ($ACC0, "0x30($len,%r1)");    # borrow $ACC0,1
      vl      ($ACC1, "0x60($len,%r1)");

      verimg  ($I1, $I0, $mask26, 6);   # >>26
      veslg   ($I0, $I0, 32);
      veslg   ($I2, $I2, 28);           # >>4
      verimg  ($I3, $T3, $mask26, 18);  # >>14
      verimg  ($I4, $T3, $mask26, 58);  # >>38
      vn      ($I0, $I0, $mask26);
      vn      ($I2, $I2, $mask26);
      vesrlf  ($I4, $I4, 2);            # >>2

      vgmg    ($mask26, 38, 63);
      vperm   ($T3, $T1, $T2, $bswaplo);
      vperm   ($T4, $T1, $T2, $bswaphi);
      vperm   ($T2, $T1, $T2, $bswapmi);

      verimg  ($I0, $T3, $mask26, 0);
      verimg  ($I1, $T3, $mask26, 38);  # >>26
      verimg  ($I2, $T2, $mask26, 60);  # >>4
      verimg  ($I3, $T4, $mask26, 50);  # >>14
      vesrlg  ($T4, $T4, 40);
      vo      ($I4, $I4, $T4);

      vperm   ($H0, $H0, $H0, $ACC0);   # move hash to right lane
      vn      ($I0, $I0, $ACC1);        # mask redundant lane[s]
      vperm   ($H1, $H1, $H1, $ACC0);
      vn      ($I1, $I1, $ACC1);
      vperm   ($H2, $H2, $H2, $ACC0);
      vn      ($I2, $I2, $ACC1);
      vperm   ($H3, $H3, $H3, $ACC0);
      vn      ($I3, $I3, $ACC1);
      vperm   ($H4, $H4, $H4, $ACC0);
      vn      ($I4, $I4, $ACC1);

      vaf     ($I0, $I0, $H0);          # accumulate hash
      vzero   ($H0);                    # wipe hash value
      vaf     ($I1, $I1, $H1);
      vzero   ($H1);
      vaf     ($I2, $I2, $H2);
      vzero   ($H2);
      vaf     ($I3, $I3, $H3);
      vzero   ($H3);
      vaf     ($I4, $I4, $H4);
      vzero   ($H4);

  &{$z ? \&lghi : \&lhi} ($len, 0);     # next pass through .Last is final
      j       (".Last");
      # I don't bother to tell apart cases when only one multiplication
      # pass is sufficient, because I argue that mispredicted branch
      # penalties are comparable to overhead of sometimes redundant
      # multiplication pass...

  LABEL   (".Ldone");
      vstef   ($H0, "0($ctx)", 3);      # store hash base 2^26
      vstef   ($H1, "4($ctx)", 3);
      vstef   ($H2, "8($ctx)", 3);
      vstef   ($H3, "12($ctx)", 3);
      vstef   ($H4, "16($ctx)", 3);

  if ($z) {
      ld      ("%f8",  "$stdframe+0*8($sp)");
      ld      ("%f9",  "$stdframe+1*8($sp)");
      ld      ("%f10", "$stdframe+2*8($sp)");
      ld      ("%f11", "$stdframe+3*8($sp)");
      ld      ("%f12", "$stdframe+4*8($sp)");
      ld      ("%f13", "$stdframe+5*8($sp)");
      ld      ("%f14", "$stdframe+6*8($sp)");
      ld      ("%f15", "$stdframe+7*8($sp)");
      lmg     ("%r10", "%r15", "$stdframe+8*8+10*$SIZE_T($sp)");
  } else {
      ld      ("%f4", "$stdframe+16*$SIZE_T+2*8($sp)");
      ld      ("%f6", "$stdframe+16*$SIZE_T+3*8($sp)");
      lm      ("%r10", "%r15", "$stdframe+10*$SIZE_T($sp)");
  }
      br      ("%r14");
  SIZE    ("__poly1305_blocks_vx", ".-__poly1305_blocks_vx");
  738. }
  ################
  # static void poly1305_emit(void *ctx, unsigned char mac[16],
  #                           const u32 nonce[4])
  #
  # Loads the hash from ctx, converts it from base 2^26 to base 2^64 when
  # is_base2_26 (ctx+24) is set, adds 5 and keeps that sum instead of the
  # hash if the addition carried out of the low two bits of the top limb
  # ("compare to modulus"), adds the nonce and writes the low 128 bits to
  # mac in little-endian byte order.
  {
  my ($mac, $nonce) = ($inp, $len);
  my ($h0, $h1, $h2, $d0, $d1, $d2) = map { "%r$_" } 5 .. 10;

  my $func      = "poly1305_emit";
  my $save_area = "6*$SIZE_T($sp)";     # slot for %r6..%r10
  my ($store_many, $load_many) = $z ? (\&stmg, \&lmg) : (\&stm, \&lm);
  my @limbs = ([$h0, $d0], [$h1, $d1], [$h2, $d2]);

  GLOBL   ($func);
  TYPE    ($func, "\@function");
  ALIGN   (16);
  LABEL   ($func);
  LABEL   (".L$func");
      $store_many->("%r6", "%r10", $save_area);

      lg      ($d0, "0($ctx)");
      lg      ($d1, "8($ctx)");
      lg      ($d2, "16($ctx)");

      llgfr   ("%r0", $d0);             # base 2^26 -> base 2^64
      srlg    ($h0, $d0, 32);
      llgfr   ("%r1", $d1);
      srlg    ($h1, $d1, 32);
      srlg    ($h2, $d2, 32);

      sllg    ("%r0", "%r0", 26);
      algr    ($h0, "%r0");
      sllg    ("%r0", $h1, 52);
      srlg    ($h1, $h1, 12);
      sllg    ("%r1", "%r1", 14);
      algr    ($h0, "%r0");
      alcgr   ($h1, "%r1");
      sllg    ("%r0", $h2, 40);
      srlg    ($h2, $h2, 24);
      lghi    ("%r1", 0);
      algr    ($h1, "%r0");
      alcgr   ($h2, "%r1");

      llgf    ("%r0", "24($ctx)");      # is_base2_26
      lcgr    ("%r0", "%r0");           # 0 or all-ones select mask

      # choose between radixes: h = ((h ^ d) & mask) ^ d for each limb
      xgr     (@$_)             for @limbs;
      ngr     ($_->[0], "%r0")  for @limbs;
      xgr     (@$_)             for @limbs;

      lghi    ("%r0", 5);
      lgr     ($d0, $h0);               # keep a copy of the hash
      lgr     ($d1, $h1);

      algr    ($h0, "%r0");             # compare to modulus
      alcgr   ($h1, "%r1");             # %r1 is zero here
      alcgr   ($h2, "%r1");

      srlg    ($h2, $h2, 2);            # did it borrow/carry?
      slgr    ("%r1", $h2);             # 0-$h2>>2
      lg      ($d2, "0($nonce)");       # load nonce
      lg      ($ctx, "8($nonce)");

      xgr     ($h0, $d0);               # select h+5 or h using mask %r1
      xgr     ($h1, $d1);
      ngr     ($h0, "%r1");
      ngr     ($h1, "%r1");
      xgr     ($h0, $d0);
       rllg   ($d0, $d2, 32);           # flip nonce words
      xgr     ($h1, $d1);
       rllg   ($d1, $ctx, 32);

      algr    ($h0, $d0);               # accumulate nonce
      alcgr   ($h1, $d1);

      strvg   ($h0, "0($mac)");         # write little-endian result
      strvg   ($h1, "8($mac)");

      $load_many->("%r6", "%r10", $save_area);
      br      ("%r14");
  SIZE    ($func, ".-$func");
  }
  ################
  # Constant pool used by __poly1305_blocks_vx (addressed through %r1):
  #   +0x00        selector loaded into $I0 ("merge odd")
  #   +0x10..0x30  byte swap masks, loaded into $bswaplo..$bswapmi
  #   +0x40..0x60  selectors for moving the hash lane, indexed by $len
  #   +0x70..0x90  AND masks for redundant input lanes, indexed by $len
  ALIGN   (16);
  LABEL   (".Lconst");
  LONG    (@$_) for (
      [0x04050607, 0x14151617, 0x0c0d0e0f, 0x1c1d1e1f],   # merge odd
      [0x07060504, 0x03020100, 0x17161514, 0x13121110],   # byte swap masks
      [0x0f0e0d0c, 0x0b0a0908, 0x1f1e1d1c, 0x1b1a1918],
      [0x00000000, 0x09080706, 0x00000000, 0x19181716],

      [0x00000000, 0x00000000, 0x00000000, 0x0c0d0e0f],   # magic tail masks
      [0x0c0d0e0f, 0x00000000, 0x00000000, 0x00000000],
      [0x00000000, 0x00000000, 0x0c0d0e0f, 0x00000000],

      [0xffffffff, 0x00000000, 0xffffffff, 0xffffffff],
      [0xffffffff, 0x00000000, 0xffffffff, 0x00000000],
      [0x00000000, 0x00000000, 0xffffffff, 0x00000000],
  );

  STRING  ("\"Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");

  PERLASM_END();