#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
# Just like aesv8-armx.pl this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# AArch64 register bank to "accommodate" 4x aggregated reduction and
# improve performance by 20-70% depending on processor.
#
# Current performance in cycles per processed byte:
#
#                 64-bit PMULL    32-bit PMULL    32-bit NEON(*)
# Apple A7        0.58            0.92            5.62
# Cortex-A53      0.85            1.01            8.39
# Cortex-A57      0.73            1.17            7.61
# Denver          0.51            0.65            6.02
# Mongoose        0.65            1.10            8.06
# Kryo            0.76            1.16            8.00
#
# (*) presented for reference/comparison purposes;
$flavour = shift;
$output = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$Xi="x0";       # argument block
$Htbl="x1";
$inp="x2";
$len="x3";

$inc="x12";
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch armv8-a+crypto\n"         if ($flavour =~ /64/);
$code.=<<___                            if ($flavour !~ /64/);
.fpu    neon
.code   32
#undef  __thumb2__
___
################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input:  128-bit H - secret parameter E(K,0^128)
# output: precomputed table filled with degrees of twisted H;
#         H is twisted to handle reverse bitness of GHASH;
#         only a few of the 16 slots of Htable[16] are used;
#         data is opaque to the outside world (which allows the
#         code to be optimized independently);
#
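# Editor's note, a hedged sketch (not part of the original code): judging by
# the stores below, the private Htable layout is roughly
#
#       Htable[0]       twisted H
#       Htable[1]       Karatsuba pre-processed halves, H.lo+H.hi packed
#                       together with H^2.lo+H^2.hi
#       Htable[2]       twisted H^2
#       Htable[3..5]    twisted H^3, its pre-processed halves and twisted H^4
#                       (filled in on AArch64 only, for the 4x code path below)
#
# gcm_gmult_v8 and gcm_ghash_v8 depend on this layout; callers must not.
#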
$code.=<<___;
.global gcm_init_v8
.type   gcm_init_v8,%function
.align  4
gcm_init_v8:
        vld1.64         {$t1},[x1]              @ load input H
        vmov.i8         $xC2,#0xe1
        vshl.i64        $xC2,$xC2,#57           @ 0xc2.0
        vext.8          $IN,$t1,$t1,#8
        vshr.u64        $t2,$xC2,#63
        vdup.32         $t1,${t1}[1]
        vext.8          $t0,$t2,$xC2,#8         @ t0=0xc2....01
        vshr.u64        $t2,$IN,#63
        vshr.s32        $t1,$t1,#31             @ broadcast carry bit
        vand            $t2,$t2,$t0
        vshl.i64        $IN,$IN,#1
        vext.8          $t2,$t2,$t2,#8
        vand            $t0,$t0,$t1
        vorr            $IN,$IN,$t2             @ H<<<=1
        veor            $H,$IN,$t0              @ twisted H
        vst1.64         {$H},[x0],#16           @ store Htable[0]
        @ calculate H^2
        vext.8          $t0,$H,$H,#8            @ Karatsuba pre-processing
        vpmull.p64      $Xl,$H,$H
        veor            $t0,$t0,$H
        vpmull2.p64     $Xh,$H,$H
        vpmull.p64      $Xm,$t0,$t0
        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2
        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $H2,$Xl,$t2
        vext.8          $t1,$H2,$H2,#8          @ Karatsuba pre-processing
        veor            $t1,$t1,$H2
        vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
        vst1.64         {$Hhl-$H2},[x0],#32     @ store Htable[1..2]
___
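# Editor's note (hedged): the "Karatsuba pre/post-processing" above is the
# usual one-level Karatsuba trick over GF(2)[x]. Writing A = A1*x^64 + A0 and
# B = B1*x^64 + B0,
#
#       A*B = A1*B1*x^128 + [(A1+A0)*(B1+B0) + A1*B1 + A0*B0]*x^64 + A0*B0
#
# where "+" is XOR, so a 128x128-bit product takes three PMULL/PMULL2
# operations plus XORs instead of four multiplications.
#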
if ($flavour =~ /64/) {
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));

$code.=<<___;
        @ calculate H^3 and H^4
        vpmull.p64      $Xl,$H, $H2
        vpmull.p64      $Yl,$H2,$H2
        vpmull2.p64     $Xh,$H, $H2
        vpmull2.p64     $Yh,$H2,$H2
        vpmull.p64      $Xm,$t0,$t1
        vpmull.p64      $Ym,$t1,$t1
        vext.8          $t0,$Xl,$Xh,#8          @ Karatsuba post-processing
        vext.8          $t1,$Yl,$Yh,#8
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t0
        veor            $t3,$Yl,$Yh
        veor            $Ym,$Ym,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase
        veor            $Ym,$Ym,$t3
        vpmull.p64      $t3,$Yl,$xC2
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Yh#lo,$Ym#hi
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        vmov            $Ym#hi,$Yl#lo
        veor            $Xl,$Xm,$t2
        veor            $Yl,$Ym,$t3
        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
        vext.8          $t3,$Yl,$Yl,#8
        vpmull.p64      $Xl,$Xl,$xC2
        vpmull.p64      $Yl,$Yl,$xC2
        veor            $t2,$t2,$Xh
        veor            $t3,$t3,$Yh
        veor            $H, $Xl,$t2             @ H^3
        veor            $H2,$Yl,$t3             @ H^4
        vext.8          $t0,$H, $H,#8           @ Karatsuba pre-processing
        vext.8          $t1,$H2,$H2,#8
        veor            $t0,$t0,$H
        veor            $t1,$t1,$H2
        vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
        vst1.64         {$H-$H2},[x0]           @ store Htable[3..5]
___
}
$code.=<<___;
        ret
.size gcm_init_v8,.-gcm_init_v8
___
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input:  Xi - current hash value;
#         Htable - table precomputed in gcm_init_v8;
# output: Xi - next hash value Xi;
#
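# Editor's sketch (hedged, not part of the original code): in C terms this is
# a single GHASH multiplication without any data block, Xi <- (Xi*H) mod P,
# updating Xi in place:
#
#       u64  Xi[2];             /* current hash value, updated in place */
#       u128 Htable[16];        /* precomputed by gcm_init_v8()         */
#
#       gcm_gmult_v8(Xi, Htable);
#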
$code.=<<___;
.global gcm_gmult_v8
.type   gcm_gmult_v8,%function
.align  4
gcm_gmult_v8:
        vld1.64         {$t1},[$Xi]             @ load Xi
        vmov.i8         $xC2,#0xe1
        vld1.64         {$H-$Hhl},[$Htbl]       @ load twisted H, ...
        vshl.u64        $xC2,$xC2,#57
#ifndef __ARMEB__
        vrev64.8        $t1,$t1
#endif
        vext.8          $IN,$t1,$t1,#8
        vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
        veor            $t1,$t1,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
        vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2
        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2
#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $Xl,$Xl,$Xl,#8
        vst1.64         {$Xl},[$Xi]             @ write out Xi
        ret
.size gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input:  table precomputed in gcm_init_v8;
#         current hash value Xi;
#         pointer to input data;
#         length of input data in bytes, which must be divisible by
#         the block size;
# output: next hash value Xi;
#
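# Editor's sketch (hedged, not part of the original code): a typical call
# sequence; `inp` and `len` stand in for the caller's buffer, and len must be
# a multiple of the 16-byte block size:
#
#       u64  Xi[2] = {0, 0};            /* initial hash value           */
#       u128 Htable[16];
#       u64  H[2];                      /* E(K,0^128), see gcm_init_v8  */
#
#       gcm_init_v8(Htable, H);
#       gcm_ghash_v8(Xi, Htable, inp, len);
#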
$code.=<<___;
.global gcm_ghash_v8
.type   gcm_ghash_v8,%function
.align  4
gcm_ghash_v8:
___
$code.=<<___            if ($flavour =~ /64/);
        cmp             $len,#64
        b.hs            .Lgcm_ghash_v8_4x
___
$code.=<<___            if ($flavour !~ /64/);
        vstmdb          sp!,{d8-d15}            @ 32-bit ABI says so
___
$code.=<<___;
        vld1.64         {$Xl},[$Xi]             @ load [rotated] Xi
                                                @ "[rotated]" means that
                                                @ loaded value would have
                                                @ to be rotated in order to
                                                @ make it appear as in
                                                @ algorithm specification
        subs            $len,$len,#32           @ see if $len is 32 or larger
        mov             $inc,#16                @ $inc is used as post-
                                                @ increment for input pointer;
                                                @ as loop is modulo-scheduled
                                                @ $inc is zeroed just in time
                                                @ to preclude overstepping
                                                @ inp[len], which means that
                                                @ last block[s] are actually
                                                @ loaded twice, but last
                                                @ copy is not processed
        vld1.64         {$H-$Hhl},[$Htbl],#32   @ load twisted H, ..., H^2
        vmov.i8         $xC2,#0xe1
        vld1.64         {$H2},[$Htbl]
        cclr            $inc,eq                 @ is it time to zero $inc?
        vext.8          $Xl,$Xl,$Xl,#8          @ rotate Xi
        vld1.64         {$t0},[$inp],#16        @ load [rotated] I[0]
        vshl.u64        $xC2,$xC2,#57           @ compose 0xc2.0 constant
#ifndef __ARMEB__
        vrev64.8        $t0,$t0
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $IN,$t0,$t0,#8          @ rotate I[0]
        b.lo            .Lodd_tail_v8           @ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
#######
# Xi+2 = [H*(Ii+1 + Xi+1)] mod P =
#        [(H*Ii+1) + (H*Xi+1)] mod P =
#        [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
$code.=<<___;
        vld1.64         {$t1},[$inp],$inc       @ load [rotated] I[1]
#ifndef __ARMEB__
        vrev64.8        $t1,$t1
#endif
        vext.8          $In,$t1,$t1,#8
        veor            $IN,$IN,$Xl             @ I[i]^=Xi
        vpmull.p64      $Xln,$H,$In             @ H·Ii+1
        veor            $t1,$t1,$In             @ Karatsuba pre-processing
        vpmull2.p64     $Xhn,$H,$In
        b               .Loop_mod2x_v8

.align  4
.Loop_mod2x_v8:
        vext.8          $t2,$IN,$IN,#8
        subs            $len,$len,#32           @ is there more data?
        vpmull.p64      $Xl,$H2,$IN             @ H^2.lo·Xi.lo
        cclr            $inc,lo                 @ is it time to zero $inc?
        vpmull.p64      $Xmn,$Hhl,$t1
        veor            $t2,$t2,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H2,$IN             @ H^2.hi·Xi.hi
        veor            $Xl,$Xl,$Xln            @ accumulate
        vpmull2.p64     $Xm,$Hhl,$t2            @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
        vld1.64         {$t0},[$inp],$inc       @ load [rotated] I[i+2]
        veor            $Xh,$Xh,$Xhn
        cclr            $inc,eq                 @ is it time to zero $inc?
        veor            $Xm,$Xm,$Xmn
        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        vld1.64         {$t1},[$inp],$inc       @ load [rotated] I[i+3]
#ifndef __ARMEB__
        vrev64.8        $t0,$t0
#endif
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
#ifndef __ARMEB__
        vrev64.8        $t1,$t1
#endif
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        vext.8          $In,$t1,$t1,#8
        vext.8          $IN,$t0,$t0,#8
        veor            $Xl,$Xm,$t2
        vpmull.p64      $Xln,$H,$In             @ H·Ii+1
        veor            $IN,$IN,$Xh             @ accumulate $IN early
        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $IN,$IN,$t2
        veor            $t1,$t1,$In             @ Karatsuba pre-processing
        veor            $IN,$IN,$Xl
        vpmull2.p64     $Xhn,$H,$In
        b.hs            .Loop_mod2x_v8          @ there were at least 32 more bytes

        veor            $Xh,$Xh,$t2
        vext.8          $IN,$t0,$t0,#8          @ re-construct $IN
        adds            $len,$len,#32           @ re-construct $len
        veor            $Xl,$Xl,$Xh             @ re-construct $Xl
        b.eq            .Ldone_v8               @ is $len zero?
___
}
$code.=<<___;
.Lodd_tail_v8:
        vext.8          $t2,$Xl,$Xl,#8
        veor            $IN,$IN,$Xl             @ inp^=Xi
        veor            $t1,$t0,$t2             @ $t1 is rotated inp^Xi
        vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
        veor            $t1,$t1,$IN             @ Karatsuba pre-processing
        vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
        vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2
        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2

.Ldone_v8:
#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif
        vext.8          $Xl,$Xl,$Xl,#8
        vst1.64         {$Xl},[$Xi]             @ write out Xi
___
$code.=<<___            if ($flavour !~ /64/);
        vldmia          sp!,{d8-d15}            @ 32-bit ABI says so
___
$code.=<<___;
        ret
.size gcm_ghash_v8,.-gcm_ghash_v8
___
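#######
# Editor's note (hedged, inferred from the per-instruction comments below): the
# 4x code path extends the 2x aggregation above to four blocks per reduction,
#
# Xi+4 = [H^4*(Xi+Ii) + H^3*Ii+1 + H^2*Ii+2 + H*Ii+3] mod P
#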
if ($flavour =~ /64/) {                         # 4x subroutine
my ($I0,$j1,$j2,$j3,
    $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));

$code.=<<___;
.type   gcm_ghash_v8_4x,%function
.align  4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
        vld1.64         {$Xl},[$Xi]             @ load [rotated] Xi
        vld1.64         {$H-$H2},[$Htbl],#48    @ load twisted H, ..., H^2
        vmov.i8         $xC2,#0xe1
        vld1.64         {$H3-$H4},[$Htbl]       @ load twisted H^3, ..., H^4
        vshl.u64        $xC2,$xC2,#57           @ compose 0xc2.0 constant
        vld1.64         {$I0-$j3},[$inp],#64
#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
        vrev64.8        $j1,$j1
        vrev64.8        $j2,$j2
        vrev64.8        $j3,$j3
        vrev64.8        $I0,$I0
#endif
        vext.8          $I3,$j3,$j3,#8
        vext.8          $I2,$j2,$j2,#8
        vext.8          $I1,$j1,$j1,#8
        vpmull.p64      $Yl,$H,$I3              @ H·Ii+3
        veor            $j3,$j3,$I3
        vpmull2.p64     $Yh,$H,$I3
        vpmull.p64      $Ym,$Hhl,$j3
        vpmull.p64      $t0,$H2,$I2             @ H^2·Ii+2
        veor            $j2,$j2,$I2
        vpmull2.p64     $I2,$H2,$I2
        vpmull2.p64     $j2,$Hhl,$j2
        veor            $Yl,$Yl,$t0
        veor            $Yh,$Yh,$I2
        veor            $Ym,$Ym,$j2
        vpmull.p64      $j3,$H3,$I1             @ H^3·Ii+1
        veor            $j1,$j1,$I1
        vpmull2.p64     $I1,$H3,$I1
        vpmull.p64      $j1,$H34,$j1
        veor            $Yl,$Yl,$j3
        veor            $Yh,$Yh,$I1
        veor            $Ym,$Ym,$j1
        subs            $len,$len,#128
        b.lo            .Ltail4x
        b               .Loop4x
.align  4
.Loop4x:
        veor            $t0,$I0,$Xl
        vld1.64         {$I0-$j3},[$inp],#64
        vext.8          $IN,$t0,$t0,#8
#ifndef __ARMEB__
        vrev64.8        $j1,$j1
        vrev64.8        $j2,$j2
        vrev64.8        $j3,$j3
        vrev64.8        $I0,$I0
#endif
        vpmull.p64      $Xl,$H4,$IN             @ H^4·(Xi+Ii)
        veor            $t0,$t0,$IN
        vpmull2.p64     $Xh,$H4,$IN
        vext.8          $I3,$j3,$j3,#8
        vpmull2.p64     $Xm,$H34,$t0
        veor            $Xl,$Xl,$Yl
        veor            $Xh,$Xh,$Yh
        vext.8          $I2,$j2,$j2,#8
        veor            $Xm,$Xm,$Ym
        vext.8          $I1,$j1,$j1,#8
        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        vpmull.p64      $Yl,$H,$I3              @ H·Ii+3
        veor            $j3,$j3,$I3
        veor            $Xm,$Xm,$t1
        vpmull2.p64     $Yh,$H,$I3
        veor            $Xm,$Xm,$t2
        vpmull.p64      $Ym,$Hhl,$j3
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        vpmull.p64      $t0,$H2,$I2             @ H^2·Ii+2
        veor            $j2,$j2,$I2
        vpmull2.p64     $I2,$H2,$I2
        veor            $Xl,$Xm,$t2
        vpmull2.p64     $j2,$Hhl,$j2
        veor            $Yl,$Yl,$t0
        veor            $Yh,$Yh,$I2
        veor            $Ym,$Ym,$j2
        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        vpmull.p64      $j3,$H3,$I1             @ H^3·Ii+1
        veor            $j1,$j1,$I1
        veor            $t2,$t2,$Xh
        vpmull2.p64     $I1,$H3,$I1
        vpmull.p64      $j1,$H34,$j1
        veor            $Xl,$Xl,$t2
        veor            $Yl,$Yl,$j3
        veor            $Yh,$Yh,$I1
        vext.8          $Xl,$Xl,$Xl,#8
        veor            $Ym,$Ym,$j1
        subs            $len,$len,#64
        b.hs            .Loop4x
.Ltail4x:
        veor            $t0,$I0,$Xl
        vext.8          $IN,$t0,$t0,#8
        vpmull.p64      $Xl,$H4,$IN             @ H^4·(Xi+Ii)
        veor            $t0,$t0,$IN
        vpmull2.p64     $Xh,$H4,$IN
        vpmull2.p64     $Xm,$H34,$t0
        veor            $Xl,$Xl,$Yl
        veor            $Xh,$Xh,$Yh
        veor            $Xm,$Xm,$Ym
        adds            $len,$len,#64
        b.eq            .Ldone4x
        cmp             $len,#32
        b.lo            .Lone
        b.eq            .Ltwo
.Lthree:
        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        vld1.64         {$I0-$j2},[$inp]
        veor            $Xm,$Xm,$t2
#ifndef __ARMEB__
        vrev64.8        $j1,$j1
        vrev64.8        $j2,$j2
        vrev64.8        $I0,$I0
#endif
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        vext.8          $I2,$j2,$j2,#8
        vext.8          $I1,$j1,$j1,#8
        veor            $Xl,$Xm,$t2
        vpmull.p64      $Yl,$H,$I2              @ H·Ii+2
        veor            $j2,$j2,$I2
        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        vpmull2.p64     $Yh,$H,$I2
        vpmull.p64      $Ym,$Hhl,$j2
        veor            $Xl,$Xl,$t2
        vpmull.p64      $j3,$H2,$I1             @ H^2·Ii+1
        veor            $j1,$j1,$I1
        vext.8          $Xl,$Xl,$Xl,#8
        vpmull2.p64     $I1,$H2,$I1
        veor            $t0,$I0,$Xl
        vpmull2.p64     $j1,$Hhl,$j1
        vext.8          $IN,$t0,$t0,#8
        veor            $Yl,$Yl,$j3
        veor            $Yh,$Yh,$I1
        veor            $Ym,$Ym,$j1
        vpmull.p64      $Xl,$H3,$IN             @ H^3·(Xi+Ii)
        veor            $t0,$t0,$IN
        vpmull2.p64     $Xh,$H3,$IN
        vpmull.p64      $Xm,$H34,$t0
        veor            $Xl,$Xl,$Yl
        veor            $Xh,$Xh,$Yh
        veor            $Xm,$Xm,$Ym
        b               .Ldone4x
.align  4
.Ltwo:
        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        vld1.64         {$I0-$j1},[$inp]
        veor            $Xm,$Xm,$t2
#ifndef __ARMEB__
        vrev64.8        $j1,$j1
        vrev64.8        $I0,$I0
#endif
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        vext.8          $I1,$j1,$j1,#8
        veor            $Xl,$Xm,$t2
        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2
        vext.8          $Xl,$Xl,$Xl,#8
        vpmull.p64      $Yl,$H,$I1              @ H·Ii+1
        veor            $j1,$j1,$I1
        veor            $t0,$I0,$Xl
        vext.8          $IN,$t0,$t0,#8
        vpmull2.p64     $Yh,$H,$I1
        vpmull.p64      $Ym,$Hhl,$j1
        vpmull.p64      $Xl,$H2,$IN             @ H^2·(Xi+Ii)
        veor            $t0,$t0,$IN
        vpmull2.p64     $Xh,$H2,$IN
        vpmull2.p64     $Xm,$Hhl,$t0
        veor            $Xl,$Xl,$Yl
        veor            $Xh,$Xh,$Yh
        veor            $Xm,$Xm,$Ym
        b               .Ldone4x
.align  4
.Lone:
        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        vld1.64         {$I0},[$inp]
        veor            $Xm,$Xm,$t2
#ifndef __ARMEB__
        vrev64.8        $I0,$I0
#endif
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2
        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2
        vext.8          $Xl,$Xl,$Xl,#8
        veor            $t0,$I0,$Xl
        vext.8          $IN,$t0,$t0,#8
        vpmull.p64      $Xl,$H,$IN
        veor            $t0,$t0,$IN
        vpmull2.p64     $Xh,$H,$IN
        vpmull.p64      $Xm,$Hhl,$t0
.Ldone4x:
        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
        veor            $Xm,$Xm,$t1
        veor            $Xm,$Xm,$t2
        vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
        vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
        vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
        veor            $Xl,$Xm,$t2
        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
        vpmull.p64      $Xl,$Xl,$xC2
        veor            $t2,$t2,$Xh
        veor            $Xl,$Xl,$t2
        vext.8          $Xl,$Xl,$Xl,#8
#ifndef __ARMEB__
        vrev64.8        $Xl,$Xl
#endif
        vst1.64         {$Xl},[$Xi]             @ write out Xi
        ret
.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
___
}
}

$code.=<<___;
.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#endif
___
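# Editor's illustration (hedged): the rewrite rules below turn the unified
# AArch32-style source into native AArch64. For example, with $Xl=q0, $H=q12
# and $IN=q3, a line such as
#
#       vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
#
# should come out as
#
#       pmull           v0.1q,v20.1d,v3.1d      // H.lo·Xi.lo
#
# Note that q8-q14 are shifted up to v16-v22, presumably to stay clear of the
# AArch64 callee-saved v8-v15 range.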
if ($flavour =~ /64/) {                         ######## 64-bit code
    sub unvmov {
        my $arg=shift;

        $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
        sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
                                          $3<8?$3:$3+8,($4 eq "lo")?0:1;
    }
    foreach(split("\n",$code)) {
        s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o    or
        s/vmov\.i8/movi/o               or      # fix up legacy mnemonics
        s/vmov\s+(.*)/unvmov($1)/geo    or
        s/vext\.8/ext/o                 or
        s/vshr\.s/sshr\.s/o             or
        s/vshr/ushr/o                   or
        s/^(\s+)v/$1/o                  or      # strip off v prefix
        s/\bbx\s+lr\b/ret/o;

        s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
        s/@\s/\/\//o;                           # old->new style commentary

        # fix up remaining legacy suffixes
        s/\.[ui]?8(\s)/$1/o;
        s/\.[uis]?32//o and s/\.16b/\.4s/go;
        m/\.p64/o and s/\.16b/\.1q/o;           # 1st pmull argument
        m/l\.p64/o and s/\.16b/\.1d/go;         # 2nd and 3rd pmull arguments
        s/\.[uisp]?64//o and s/\.16b/\.2d/go;
        s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

        print $_,"\n";
    }
} else {                                        ######## 32-bit code
    sub unvdup32 {
        my $arg=shift;

        $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
        sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
    sub unvpmullp64 {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
            my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
                                 |(($2&7)<<17)|(($2&8)<<4)
                                 |(($3&7)<<1) |(($3&8)<<2);
            $word |= 0x00010001 if ($mnemonic =~ "2");
            # ARMv7 instructions are always encoded little-endian, hence the
            # explicit byte order below; the correct solution would be the
            # .inst directive, but older assemblers don't implement it:-(
            sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }
    }
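    # Editor's illustration (hedged, just applying the formula above): a line
    # such as "vpmull.p64 q0,q12,q3" should be emitted as
    #
    #   .byte   0x86,0x0e,0xa8,0xf2     @ pmull q0,q12,q3
    #
    # i.e. the little-endian bytes of the 32-bit VMULL.P64 encoding 0xf2a80e86.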
    foreach(split("\n",$code)) {
        s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
        s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
        s/\/\/\s?/@ /o;                         # new->old style commentary

        # fix up remaining new-style suffixes
        s/\],#[0-9]+/]!/o;

        s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o                      or
        s/vdup\.32\s+(.*)/unvdup32($1)/geo                              or
        s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo                or
        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
        s/^(\s+)b\./$1b/o                                               or
        s/^(\s+)ret/$1bx\tlr/o;

        print $_,"\n";
    }
}

close STDOUT; # enforce flush