2
0

ghashv8-armx.pl 19 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
  17. #
  18. # June 2014
  19. #
  20. # Initial version was developed in tight cooperation with Ard
  21. # Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
  22. # Just like aesv8-armx.pl this module supports both AArch32 and
  23. # AArch64 execution modes.
  24. #
  25. # July 2014
  26. #
  27. # Implement 2x aggregated reduction [see ghash-x86.pl for background
  28. # information].
  29. #
  30. # November 2017
  31. #
  32. # Use the larger AArch64 register bank to accommodate 4x aggregated
  33. # reduction and improve performance by 20-70% depending on processor.
  34. #
  35. # Current performance in cycles per processed byte:
  36. #
  37. # 64-bit PMULL 32-bit PMULL 32-bit NEON(*)
  38. # Apple A7 0.58 0.92 5.62
  39. # Cortex-A53 0.85 1.01 8.39
  40. # Cortex-A57 0.73 1.17 7.61
  41. # Denver 0.51 0.65 6.02
  42. # Mongoose 0.65 1.10 8.06
  43. # Kryo 0.76 1.16 8.00
  44. # ThunderX2 1.05
  45. #
  46. # (*) presented for reference/comparison purposes;
  47. # $output is the last argument if it looks like a file (it has an extension)
  48. # $flavour is the first argument if it doesn't look like a file
  # Expected command line: [flavour] [output-file].  Either argument may be
  # omitted, in which case the corresponding variable stays undefined.
  $output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

  # Directory component (including the trailing separator) of the path this
  # script was started with; used to find the translator next to it.
  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;

  # Look for arm-xlate.pl beside this script first, then in ../../perlasm.
  ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  die "can't locate arm-xlate.pl";

  # Everything printed to STDOUT from here on is piped through arm-xlate.pl.
  # The translator path is quoted like the interpreter and output paths so
  # that a source tree under a directory with whitespace in its name works.
  open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
      or die "can't call $xlate: $!";
  *STDOUT=*OUT;
  # Integer registers used by the generated code: x0-x3 carry the four
  # arguments in order, x12 is the post-increment for the input pointer.
  ($Xi,$Htbl,$inp,$len) = map("x$_",(0..3)); # argument block
  $inc = "x12";
  63. {
  # NEON working registers: $Xl,$Xm,$Xh,$IN are q0-q3 and
  # $t0,$t1,$t2,$xC2,$H,$Hhl,$H2 are q8-q14.
  64. my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
  65. my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
  # Directive used by INST() to emit raw bytes: "DCB" when the flavour
  # name contains "win", ".byte" otherwise.
  66. my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
  # Preamble shared by every flavour.
  67. $code=<<___;
  68. #include "arm_arch.h"
  69. #if __ARM_MAX_ARCH__>=7
  70. ___
  # 64-bit flavours enable the crypto extension in the assembler directly.
  71. $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
  # All other flavours get an INST(a,b,c,d) macro that emits one instruction
  # as four raw bytes; the Thumb-2 and ARM variants order the bytes
  # differently and each uses a fixed constant in place of d.
  72. $code.=<<___ if ($flavour !~ /64/);
  73. .fpu neon
  74. #ifdef __thumb2__
  75. .syntax unified
  76. .thumb
  77. # define INST(a,b,c,d) $_byte c,0xef,a,b
  78. #else
  79. .code 32
  80. # define INST(a,b,c,d) $_byte a,b,c,0xf2
  81. #endif
  82. .text
  83. ___
  84. ################################################################################
  85. # void gcm_init_v8(u128 Htable[16],const u64 H[2]);
  86. #
  87. # input: 128-bit H - secret parameter E(K,0^128)
  88. # output: precomputed table filled with powers of twisted H;
  89. # H is twisted to handle reverse bitness of GHASH;
  90. # only a few of the 16 slots of Htable[16] are used;
  91. # data is opaque to the outside world (which allows
  92. # the code to be optimized independently);
  93. #
  # Part common to all flavours: stores twisted H in Htable[0], then the
  # packed Karatsuba pre-processed halves ($Hhl) and H^2 in Htable[1..2].
  94. $code.=<<___;
  95. .global gcm_init_v8
  96. .type gcm_init_v8,%function
  97. .align 4
  98. gcm_init_v8:
  99. vld1.64 {$t1},[x1] @ load input H
  100. vmov.i8 $xC2,#0xe1
  101. vshl.i64 $xC2,$xC2,#57 @ 0xc2.0
  102. vext.8 $IN,$t1,$t1,#8
  103. vshr.u64 $t2,$xC2,#63
  104. vdup.32 $t1,${t1}[1]
  105. vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01
  106. vshr.u64 $t2,$IN,#63
  107. vshr.s32 $t1,$t1,#31 @ broadcast carry bit
  108. vand $t2,$t2,$t0
  109. vshl.i64 $IN,$IN,#1
  110. vext.8 $t2,$t2,$t2,#8
  111. vand $t0,$t0,$t1
  112. vorr $IN,$IN,$t2 @ H<<<=1
  113. veor $H,$IN,$t0 @ twisted H
  114. vst1.64 {$H},[x0],#16 @ store Htable[0]
  115. @ calculate H^2
  116. vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing
  117. vpmull.p64 $Xl,$H,$H
  118. veor $t0,$t0,$H
  119. vpmull2.p64 $Xh,$H,$H
  120. vpmull.p64 $Xm,$t0,$t0
  121. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  122. veor $t2,$Xl,$Xh
  123. veor $Xm,$Xm,$t1
  124. veor $Xm,$Xm,$t2
  125. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
  126. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  127. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  128. veor $Xl,$Xm,$t2
  129. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
  130. vpmull.p64 $Xl,$Xl,$xC2
  131. veor $t2,$t2,$Xh
  132. veor $H2,$Xl,$t2
  133. vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing
  134. veor $t1,$t1,$H2
  135. vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
  136. vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2]
  137. ___
  # 64-bit flavours additionally compute H^3 and H^4, using q4-q7 as extra
  # temporaries.  $H and $H2 are overwritten with H^3 and H^4, $Hhl is
  # re-packed for them, and the three registers are stored as Htable[3..5].
  138. if ($flavour =~ /64/) {
  139. my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));
  140. $code.=<<___;
  141. @ calculate H^3 and H^4
  142. vpmull.p64 $Xl,$H, $H2
  143. vpmull.p64 $Yl,$H2,$H2
  144. vpmull2.p64 $Xh,$H, $H2
  145. vpmull2.p64 $Yh,$H2,$H2
  146. vpmull.p64 $Xm,$t0,$t1
  147. vpmull.p64 $Ym,$t1,$t1
  148. vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
  149. vext.8 $t1,$Yl,$Yh,#8
  150. veor $t2,$Xl,$Xh
  151. veor $Xm,$Xm,$t0
  152. veor $t3,$Yl,$Yh
  153. veor $Ym,$Ym,$t1
  154. veor $Xm,$Xm,$t2
  155. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
  156. veor $Ym,$Ym,$t3
  157. vpmull.p64 $t3,$Yl,$xC2
  158. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  159. vmov $Yh#lo,$Ym#hi
  160. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  161. vmov $Ym#hi,$Yl#lo
  162. veor $Xl,$Xm,$t2
  163. veor $Yl,$Ym,$t3
  164. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
  165. vext.8 $t3,$Yl,$Yl,#8
  166. vpmull.p64 $Xl,$Xl,$xC2
  167. vpmull.p64 $Yl,$Yl,$xC2
  168. veor $t2,$t2,$Xh
  169. veor $t3,$t3,$Yh
  170. veor $H, $Xl,$t2 @ H^3
  171. veor $H2,$Yl,$t3 @ H^4
  172. vext.8 $t0,$H, $H,#8 @ Karatsuba pre-processing
  173. vext.8 $t1,$H2,$H2,#8
  174. veor $t0,$t0,$H
  175. veor $t1,$t1,$H2
  176. vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
  177. vst1.64 {$H-$H2},[x0] @ store Htable[3..5]
  178. ___
  179. }
  # Common epilogue.
  180. $code.=<<___;
  181. ret
  182. .size gcm_init_v8,.-gcm_init_v8
  183. ___
  184. ################################################################################
  185. # void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
  186. #
  187. # input: Xi - current hash value;
  188. # Htable - table precomputed in gcm_init_v8;
  189. # output: Xi - next hash value Xi;
  190. #
  # Single multiplication of Xi by twisted H: three polynomial multiplies
  # (Karatsuba) followed by a two-phase reduction with the constant composed
  # in $xC2.  The byte reversal of Xi on load and store is skipped when
  # __ARMEB__ is defined.
  191. $code.=<<___;
  192. .global gcm_gmult_v8
  193. .type gcm_gmult_v8,%function
  194. .align 4
  195. gcm_gmult_v8:
  196. vld1.64 {$t1},[$Xi] @ load Xi
  197. vmov.i8 $xC2,#0xe1
  198. vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ...
  199. vshl.u64 $xC2,$xC2,#57
  200. #ifndef __ARMEB__
  201. vrev64.8 $t1,$t1
  202. #endif
  203. vext.8 $IN,$t1,$t1,#8
  204. vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
  205. veor $t1,$t1,$IN @ Karatsuba pre-processing
  206. vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
  207. vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
  208. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  209. veor $t2,$Xl,$Xh
  210. veor $Xm,$Xm,$t1
  211. veor $Xm,$Xm,$t2
  212. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
  213. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  214. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  215. veor $Xl,$Xm,$t2
  216. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
  217. vpmull.p64 $Xl,$Xl,$xC2
  218. veor $t2,$t2,$Xh
  219. veor $Xl,$Xl,$t2
  220. #ifndef __ARMEB__
  221. vrev64.8 $Xl,$Xl
  222. #endif
  223. vext.8 $Xl,$Xl,$Xl,#8
  224. vst1.64 {$Xl},[$Xi] @ write out Xi
  225. ret
  226. .size gcm_gmult_v8,.-gcm_gmult_v8
  227. ___
  228. ################################################################################
  229. # void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
  230. #
  231. # input: table precomputed in gcm_init_v8;
  232. # current hash value Xi;
  233. # pointer to input data;
  234. # length of input data in bytes, but divisible by block size;
  235. # output: next hash value Xi;
  236. #
  237. $code.=<<___;
  238. .global gcm_ghash_v8
  239. .type gcm_ghash_v8,%function
  240. .align 4
  241. gcm_ghash_v8:
  242. ___
  # 64-bit flavours hand inputs of 64 bytes or more to the 4x routine.
  243. $code.=<<___ if ($flavour =~ /64/);
  244. cmp $len,#64
  245. b.hs .Lgcm_ghash_v8_4x
  246. ___
  # Other flavours save d8-d15 on entry; they are restored before returning.
  247. $code.=<<___ if ($flavour !~ /64/);
  248. vstmdb sp!,{d8-d15} @ 32-bit ABI says so
  249. ___
  # Common set-up: load Xi, the table entries and the first input block,
  # then branch to the single-block tail if fewer than 32 bytes were given.
  250. $code.=<<___;
  251. vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
  252. @ "[rotated]" means that
  253. @ loaded value would have
  254. @ to be rotated in order to
  255. @ make it appear as in
  256. @ algorithm specification
  257. subs $len,$len,#32 @ see if $len is 32 or larger
  258. mov $inc,#16 @ $inc is used as post-
  259. @ increment for input pointer;
  260. @ as loop is modulo-scheduled
  261. @ $inc is zeroed just in time
  262. @ to preclude overstepping
  263. @ inp[len], which means that
  264. @ last block[s] are actually
  265. @ loaded twice, but last
  266. @ copy is not processed
  267. vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2
  268. vmov.i8 $xC2,#0xe1
  269. vld1.64 {$H2},[$Htbl]
  270. cclr $inc,eq @ is it time to zero $inc?
  271. vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi
  272. vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0]
  273. vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
  274. #ifndef __ARMEB__
  275. vrev64.8 $t0,$t0
  276. vrev64.8 $Xl,$Xl
  277. #endif
  278. vext.8 $IN,$t0,$t0,#8 @ rotate I[0]
  279. b.lo .Lodd_tail_v8 @ $len was less than 32
  280. ___
  # 2x aggregated loop.  q4-q7 hold the second block of each pair ($In)
  # and its partial products ($Xln,$Xmn,$Xhn).
  281. { my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
  282. #######
  283. # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
  284. # [(H*Ii+1) + (H*Xi+1)] mod P =
  285. # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
  286. #
  287. $code.=<<___;
  288. vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1]
  289. #ifndef __ARMEB__
  290. vrev64.8 $t1,$t1
  291. #endif
  292. vext.8 $In,$t1,$t1,#8
  293. veor $IN,$IN,$Xl @ I[i]^=Xi
  294. vpmull.p64 $Xln,$H,$In @ H·Ii+1
  295. veor $t1,$t1,$In @ Karatsuba pre-processing
  296. vpmull2.p64 $Xhn,$H,$In
  297. b .Loop_mod2x_v8
  298. .align 4
  299. .Loop_mod2x_v8:
  300. vext.8 $t2,$IN,$IN,#8
  301. subs $len,$len,#32 @ is there more data?
  302. vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo
  303. cclr $inc,lo @ is it time to zero $inc?
  304. vpmull.p64 $Xmn,$Hhl,$t1
  305. veor $t2,$t2,$IN @ Karatsuba pre-processing
  306. vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi
  307. veor $Xl,$Xl,$Xln @ accumulate
  308. vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
  309. vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2]
  310. veor $Xh,$Xh,$Xhn
  311. cclr $inc,eq @ is it time to zero $inc?
  312. veor $Xm,$Xm,$Xmn
  313. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  314. veor $t2,$Xl,$Xh
  315. veor $Xm,$Xm,$t1
  316. vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3]
  317. #ifndef __ARMEB__
  318. vrev64.8 $t0,$t0
  319. #endif
  320. veor $Xm,$Xm,$t2
  321. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
  322. #ifndef __ARMEB__
  323. vrev64.8 $t1,$t1
  324. #endif
  325. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  326. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  327. vext.8 $In,$t1,$t1,#8
  328. vext.8 $IN,$t0,$t0,#8
  329. veor $Xl,$Xm,$t2
  330. vpmull.p64 $Xln,$H,$In @ H·Ii+1
  331. veor $IN,$IN,$Xh @ accumulate $IN early
  332. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
  333. vpmull.p64 $Xl,$Xl,$xC2
  334. veor $IN,$IN,$t2
  335. veor $t1,$t1,$In @ Karatsuba pre-processing
  336. veor $IN,$IN,$Xl
  337. vpmull2.p64 $Xhn,$H,$In
  338. b.hs .Loop_mod2x_v8 @ there was at least 32 more bytes
  339. veor $Xh,$Xh,$t2
  340. vext.8 $IN,$t0,$t0,#8 @ re-construct $IN
  341. adds $len,$len,#32 @ re-construct $len
  342. veor $Xl,$Xl,$Xh @ re-construct $Xl
  343. b.eq .Ldone_v8 @ is $len zero?
  344. ___
  345. }
  # Tail for one remaining block, followed by the common write-out of Xi.
  346. $code.=<<___;
  347. .Lodd_tail_v8:
  348. vext.8 $t2,$Xl,$Xl,#8
  349. veor $IN,$IN,$Xl @ inp^=Xi
  350. veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi
  351. vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
  352. veor $t1,$t1,$IN @ Karatsuba pre-processing
  353. vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
  354. vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
  355. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  356. veor $t2,$Xl,$Xh
  357. veor $Xm,$Xm,$t1
  358. veor $Xm,$Xm,$t2
  359. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
  360. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  361. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  362. veor $Xl,$Xm,$t2
  363. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
  364. vpmull.p64 $Xl,$Xl,$xC2
  365. veor $t2,$t2,$Xh
  366. veor $Xl,$Xl,$t2
  367. .Ldone_v8:
  368. #ifndef __ARMEB__
  369. vrev64.8 $Xl,$Xl
  370. #endif
  371. vext.8 $Xl,$Xl,$Xl,#8
  372. vst1.64 {$Xl},[$Xi] @ write out Xi
  373. ___
  # Counterpart of the d8-d15 save emitted on entry for non-64-bit flavours.
  374. $code.=<<___ if ($flavour !~ /64/);
  375. vldmia sp!,{d8-d15} @ 32-bit ABI says so
  376. ___
  377. $code.=<<___;
  378. ret
  379. .size gcm_ghash_v8,.-gcm_ghash_v8
  380. ___
  # 4x aggregated variant, generated for 64-bit flavours only and entered
  # from gcm_ghash_v8 when the length is at least 64 bytes.
  381. if ($flavour =~ /64/) { # 4x subroutine
  # Extra registers: $I0,$j1,$j2,$j3 are q4-q7; $I1,$I2,$I3 are q15-q17;
  # $H3,$H34,$H4 are q18-q20; $Yl,$Ym,$Yh are q21-q23.
  382. my ($I0,$j1,$j2,$j3,
  383. $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));
  # .Loop4x consumes 64 bytes per iteration; .Ltail4x finishes the pending
  # group and then dispatches to .Lone/.Ltwo/.Lthree for 16, 32 or 48
  # remaining bytes, all of which end in the shared reduction at .Ldone4x.
  384. $code.=<<___;
  385. .type gcm_ghash_v8_4x,%function
  386. .align 4
  387. gcm_ghash_v8_4x:
  388. .Lgcm_ghash_v8_4x:
  389. vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
  390. vld1.64 {$H-$H2},[$Htbl],#48 @ load twisted H, ..., H^2
  391. vmov.i8 $xC2,#0xe1
  392. vld1.64 {$H3-$H4},[$Htbl] @ load twisted H^3, ..., H^4
  393. vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
  394. vld1.64 {$I0-$j3},[$inp],#64
  395. #ifndef __ARMEB__
  396. vrev64.8 $Xl,$Xl
  397. vrev64.8 $j1,$j1
  398. vrev64.8 $j2,$j2
  399. vrev64.8 $j3,$j3
  400. vrev64.8 $I0,$I0
  401. #endif
  402. vext.8 $I3,$j3,$j3,#8
  403. vext.8 $I2,$j2,$j2,#8
  404. vext.8 $I1,$j1,$j1,#8
  405. vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
  406. veor $j3,$j3,$I3
  407. vpmull2.p64 $Yh,$H,$I3
  408. vpmull.p64 $Ym,$Hhl,$j3
  409. vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
  410. veor $j2,$j2,$I2
  411. vpmull2.p64 $I2,$H2,$I2
  412. vpmull2.p64 $j2,$Hhl,$j2
  413. veor $Yl,$Yl,$t0
  414. veor $Yh,$Yh,$I2
  415. veor $Ym,$Ym,$j2
  416. vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
  417. veor $j1,$j1,$I1
  418. vpmull2.p64 $I1,$H3,$I1
  419. vpmull.p64 $j1,$H34,$j1
  420. veor $Yl,$Yl,$j3
  421. veor $Yh,$Yh,$I1
  422. veor $Ym,$Ym,$j1
  423. subs $len,$len,#128
  424. b.lo .Ltail4x
  425. b .Loop4x
  426. .align 4
  427. .Loop4x:
  428. veor $t0,$I0,$Xl
  429. vld1.64 {$I0-$j3},[$inp],#64
  430. vext.8 $IN,$t0,$t0,#8
  431. #ifndef __ARMEB__
  432. vrev64.8 $j1,$j1
  433. vrev64.8 $j2,$j2
  434. vrev64.8 $j3,$j3
  435. vrev64.8 $I0,$I0
  436. #endif
  437. vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
  438. veor $t0,$t0,$IN
  439. vpmull2.p64 $Xh,$H4,$IN
  440. vext.8 $I3,$j3,$j3,#8
  441. vpmull2.p64 $Xm,$H34,$t0
  442. veor $Xl,$Xl,$Yl
  443. veor $Xh,$Xh,$Yh
  444. vext.8 $I2,$j2,$j2,#8
  445. veor $Xm,$Xm,$Ym
  446. vext.8 $I1,$j1,$j1,#8
  447. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  448. veor $t2,$Xl,$Xh
  449. vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
  450. veor $j3,$j3,$I3
  451. veor $Xm,$Xm,$t1
  452. vpmull2.p64 $Yh,$H,$I3
  453. veor $Xm,$Xm,$t2
  454. vpmull.p64 $Ym,$Hhl,$j3
  455. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
  456. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  457. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  458. vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
  459. veor $j2,$j2,$I2
  460. vpmull2.p64 $I2,$H2,$I2
  461. veor $Xl,$Xm,$t2
  462. vpmull2.p64 $j2,$Hhl,$j2
  463. veor $Yl,$Yl,$t0
  464. veor $Yh,$Yh,$I2
  465. veor $Ym,$Ym,$j2
  466. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
  467. vpmull.p64 $Xl,$Xl,$xC2
  468. vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
  469. veor $j1,$j1,$I1
  470. veor $t2,$t2,$Xh
  471. vpmull2.p64 $I1,$H3,$I1
  472. vpmull.p64 $j1,$H34,$j1
  473. veor $Xl,$Xl,$t2
  474. veor $Yl,$Yl,$j3
  475. veor $Yh,$Yh,$I1
  476. vext.8 $Xl,$Xl,$Xl,#8
  477. veor $Ym,$Ym,$j1
  478. subs $len,$len,#64
  479. b.hs .Loop4x
  480. .Ltail4x:
  481. veor $t0,$I0,$Xl
  482. vext.8 $IN,$t0,$t0,#8
  483. vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
  484. veor $t0,$t0,$IN
  485. vpmull2.p64 $Xh,$H4,$IN
  486. vpmull2.p64 $Xm,$H34,$t0
  487. veor $Xl,$Xl,$Yl
  488. veor $Xh,$Xh,$Yh
  489. veor $Xm,$Xm,$Ym
  490. adds $len,$len,#64
  491. b.eq .Ldone4x
  492. cmp $len,#32
  493. b.lo .Lone
  494. b.eq .Ltwo
  495. .Lthree:
  496. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  497. veor $t2,$Xl,$Xh
  498. veor $Xm,$Xm,$t1
  499. vld1.64 {$I0-$j2},[$inp]
  500. veor $Xm,$Xm,$t2
  501. #ifndef __ARMEB__
  502. vrev64.8 $j1,$j1
  503. vrev64.8 $j2,$j2
  504. vrev64.8 $I0,$I0
  505. #endif
  506. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
  507. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  508. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  509. vext.8 $I2,$j2,$j2,#8
  510. vext.8 $I1,$j1,$j1,#8
  511. veor $Xl,$Xm,$t2
  512. vpmull.p64 $Yl,$H,$I2 @ H·Ii+2
  513. veor $j2,$j2,$I2
  514. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
  515. vpmull.p64 $Xl,$Xl,$xC2
  516. veor $t2,$t2,$Xh
  517. vpmull2.p64 $Yh,$H,$I2
  518. vpmull.p64 $Ym,$Hhl,$j2
  519. veor $Xl,$Xl,$t2
  520. vpmull.p64 $j3,$H2,$I1 @ H^2·Ii+1
  521. veor $j1,$j1,$I1
  522. vext.8 $Xl,$Xl,$Xl,#8
  523. vpmull2.p64 $I1,$H2,$I1
  524. veor $t0,$I0,$Xl
  525. vpmull2.p64 $j1,$Hhl,$j1
  526. vext.8 $IN,$t0,$t0,#8
  527. veor $Yl,$Yl,$j3
  528. veor $Yh,$Yh,$I1
  529. veor $Ym,$Ym,$j1
  530. vpmull.p64 $Xl,$H3,$IN @ H^3·(Xi+Ii)
  531. veor $t0,$t0,$IN
  532. vpmull2.p64 $Xh,$H3,$IN
  533. vpmull.p64 $Xm,$H34,$t0
  534. veor $Xl,$Xl,$Yl
  535. veor $Xh,$Xh,$Yh
  536. veor $Xm,$Xm,$Ym
  537. b .Ldone4x
  538. .align 4
  539. .Ltwo:
  540. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  541. veor $t2,$Xl,$Xh
  542. veor $Xm,$Xm,$t1
  543. vld1.64 {$I0-$j1},[$inp]
  544. veor $Xm,$Xm,$t2
  545. #ifndef __ARMEB__
  546. vrev64.8 $j1,$j1
  547. vrev64.8 $I0,$I0
  548. #endif
  549. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
  550. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  551. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  552. vext.8 $I1,$j1,$j1,#8
  553. veor $Xl,$Xm,$t2
  554. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
  555. vpmull.p64 $Xl,$Xl,$xC2
  556. veor $t2,$t2,$Xh
  557. veor $Xl,$Xl,$t2
  558. vext.8 $Xl,$Xl,$Xl,#8
  559. vpmull.p64 $Yl,$H,$I1 @ H·Ii+1
  560. veor $j1,$j1,$I1
  561. veor $t0,$I0,$Xl
  562. vext.8 $IN,$t0,$t0,#8
  563. vpmull2.p64 $Yh,$H,$I1
  564. vpmull.p64 $Ym,$Hhl,$j1
  565. vpmull.p64 $Xl,$H2,$IN @ H^2·(Xi+Ii)
  566. veor $t0,$t0,$IN
  567. vpmull2.p64 $Xh,$H2,$IN
  568. vpmull2.p64 $Xm,$Hhl,$t0
  569. veor $Xl,$Xl,$Yl
  570. veor $Xh,$Xh,$Yh
  571. veor $Xm,$Xm,$Ym
  572. b .Ldone4x
  573. .align 4
  574. .Lone:
  575. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  576. veor $t2,$Xl,$Xh
  577. veor $Xm,$Xm,$t1
  578. vld1.64 {$I0},[$inp]
  579. veor $Xm,$Xm,$t2
  580. #ifndef __ARMEB__
  581. vrev64.8 $I0,$I0
  582. #endif
  583. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
  584. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  585. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  586. veor $Xl,$Xm,$t2
  587. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
  588. vpmull.p64 $Xl,$Xl,$xC2
  589. veor $t2,$t2,$Xh
  590. veor $Xl,$Xl,$t2
  591. vext.8 $Xl,$Xl,$Xl,#8
  592. veor $t0,$I0,$Xl
  593. vext.8 $IN,$t0,$t0,#8
  594. vpmull.p64 $Xl,$H,$IN
  595. veor $t0,$t0,$IN
  596. vpmull2.p64 $Xh,$H,$IN
  597. vpmull.p64 $Xm,$Hhl,$t0
  598. .Ldone4x:
  599. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  600. veor $t2,$Xl,$Xh
  601. veor $Xm,$Xm,$t1
  602. veor $Xm,$Xm,$t2
  603. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
  604. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  605. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  606. veor $Xl,$Xm,$t2
  607. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
  608. vpmull.p64 $Xl,$Xl,$xC2
  609. veor $t2,$t2,$Xh
  610. veor $Xl,$Xl,$t2
  611. vext.8 $Xl,$Xl,$Xl,#8
  612. #ifndef __ARMEB__
  613. vrev64.8 $Xl,$Xl
  614. #endif
  615. vst1.64 {$Xl},[$Xi] @ write out Xi
  616. ret
  617. .size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
  618. ___
  619. }
  620. }
  # Identification string, plus the #endif that closes the
  # __ARM_MAX_ARCH__ guard opened in the preamble.
  621. $code.=<<___;
  622. .asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
  623. .align 2
  624. #endif
  625. ___
  626. if ($flavour =~ /64/) { ######## 64-bit code
  # Turn the operands of a legacy "vmov qD#lo|hi,qS#lo|hi" into the AArch64
  # "ins vD.d[i],vS.d[j]" form.  Register numbers of 8 and above are moved
  # up by eight; "lo" selects lane 0 and "hi" lane 1.  A false value is
  # returned when the operands do not have that shape.
  sub unvmov {
      my ($operands) = @_;

      my $recognized =
          $operands =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o;
      return $recognized unless $recognized;

      my ($dst, $dst_half, $src, $src_half) = ($1, $2, $3, $4);
      $dst += 8 if $dst >= 8;
      $src += 8 if $src >= 8;

      return sprintf "ins v%d.d[%d],v%d.d[%d]",
          $dst, ($dst_half eq "lo" ? 0 : 1),
          $src, ($src_half eq "lo" ? 0 : 1);
  }
  # Rewrite the generated code line by line into AArch64 syntax and print it.
  633. foreach(split("\n",$code)) {
  # Mnemonic fix-ups.  The alternatives are chained with "or", so at most
  # one of them is applied to any given line: cclr becomes a csel against
  # the zero register, vmov is handed to unvmov(), and any other leading
  # "v" is stripped.
  634. s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
  635. s/vmov\.i8/movi/o or # fix up legacy mnemonics
  636. s/vmov\s+(.*)/unvmov($1)/geo or
  637. s/vext\.8/ext/o or
  638. s/vshr\.s/sshr\.s/o or
  639. s/vshr/ushr/o or
  640. s/^(\s+)v/$1/o or # strip off v prefix
  641. s/\bbx\s+lr\b/ret/o;
  # qN becomes vN.16b, with N moved up by eight for N >= 8.
  642. s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
  643. s/@\s/\/\//o; # old->new style commentary
  644. # fix up remaining legacy suffixes
  # Each size suffix is dropped from the mnemonic and the .16b arrangement
  # written above is replaced by the one matching that size.
  645. s/\.[ui]?8(\s)/$1/o;
  646. s/\.[uis]?32//o and s/\.16b/\.4s/go;
  # For polynomial multiplies the first operand becomes .1q; "l.p64" only
  # matches pmull (not pmull2), whose sources then become .1d, while the
  # sources of pmull2 fall through to .2d on the next line.
  647. m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
  648. m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
  649. s/\.[uisp]?64//o and s/\.16b/\.2d/go;
  # A lane reference such as ".4s[1]" loses its element count: ".s[1]".
  650. s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
  651. print $_,"\n";
  652. }
  653. } else { ######## 32-bit code
  # Rewrite the operands of "vdup.32 qD,qS[lane]" so that the source is a
  # d register: lanes 0-1 of qS live in d(2*S), lanes 2-3 in d(2*S+1).
  # A false value is returned when the operands do not have that shape.
  sub unvdup32 {
      my ($operands) = @_;

      my $recognized =
          $operands =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o;
      return $recognized unless $recognized;

      my ($dst_q, $src_q, $lane) = ($1, $2, $3);
      my $src_d = 2*$src_q + ($lane >> 1);

      return sprintf "vdup.32 q%d,d%d[%d]", $dst_q, $src_d, $lane & 1;
  }
  # Hand-encode "pmull[2].p64 qD,qN,qM" as an INST(...) macro invocation,
  # keeping the original mnemonic and operands as a trailing comment.
  # A false value is returned when the operands are not three q registers.
  sub unvpmullp64 {
      my ($mnemonic, $operands) = @_;

      my $recognized =
          $operands =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o;
      return $recognized unless $recognized;

      my ($qd, $qn, $qm) = ($1, $2, $3);

      # Low three bits and the fourth bit of every register number go
      # into separate fields of the instruction word.
      my $opcode = 0xf2a00e00;
      $opcode |= (($qd & 7) << 13) | (($qd & 8) << 19);
      $opcode |= (($qn & 7) << 17) | (($qn & 8) << 4);
      $opcode |= (($qm & 7) << 1)  | (($qm & 8) << 2);
      $opcode |= 0x00010001 if $mnemonic =~ "2";

      # ARMv7 instructions are always encoded little-endian, so the bytes
      # are listed least-significant first.  The correct solution would be
      # the .inst directive, but older assemblers don't implement it.
      my @bytes = map { ($opcode >> $_) & 0xff } (0, 8, 16, 24);

      return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
          @bytes, $mnemonic, $operands;
  }
  # Rewrite the generated code line by line into 32-bit ARM syntax and
  # print it.
  675. foreach(split("\n",$code)) {
  676. s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
  677. s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
  678. s/\/\/\s?/@ /o; # new->old style commentary
  679. # fix up remaining new-style suffixes
  # A "],#imm" post-increment is rewritten as "]!".
  680. s/\],#[0-9]+/]!/o;
  # Instruction fix-ups.  The alternatives are chained with "or", so at
  # most one of them is applied to any given line: cclr becomes a
  # conditional "mov.<cond> reg,#0", vdup.32 and pmull[2].p64 are handed to
  # their helpers, qN#lo/qN#hi become d registers, "b.<cond>" loses its
  # dot and "ret" becomes "bx lr".
  681. s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
  682. s/vdup\.32\s+(.*)/unvdup32($1)/geo or
  683. s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
  684. s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
  685. s/^(\s+)b\./$1b/o or
  686. s/^(\s+)ret/$1bx\tlr/o;
  # A conditional "mov.<cond>" is printed as "mov<cond>", preceded by an
  # "it <cond>" line.
  687. if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
  688. print " it $2\n";
  689. }
  690. print $_,"\n";
  691. }
  692. }
  # STDOUT was aliased to the pipe into arm-xlate.pl near the top of the
  # file, so a failure to close it is treated as fatal.
  693. close STDOUT or die "error closing STDOUT"; # enforce flush