#! /usr/bin/env perl
# Copyright 2014-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
# Just like aesv8-armx.pl this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# AArch64 register bank to "accommodate" 4x aggregated reduction and
# improve performance by 20-70% depending on processor.
#
# Current performance in cycles per processed byte:
#
#               64-bit PMULL    32-bit PMULL    32-bit NEON(*)
# Apple A7      0.58            0.92            5.62
# Cortex-A53    0.85            1.01            8.39
# Cortex-A57    0.73            1.17            7.61
# Denver        0.51            0.65            6.02
# Mongoose      0.65            1.10            8.06
# Kryo          0.76            1.16            8.00
# ThunderX2     1.05
#
# (*)   presented for reference/comparison purposes;

  47. # $output is the last argument if it looks like a file (it has an extension)
  48. # $flavour is the first argument if it doesn't look like a file
  49. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  50. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  51. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  52. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  53. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  54. die "can't locate arm-xlate.pl";
  55. open OUT,"| \"$^X\" $xlate $flavour \"$output\""
  56. or die "can't call $xlate: $!";
  57. *STDOUT=*OUT;
  58. $Xi="x0"; # argument block
  59. $Htbl="x1";
  60. $inp="x2";
  61. $len="x3";
  62. $inc="x12";
  63. {
  64. my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
  65. my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
  66. my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
  67. $code=<<___;
  68. #include "arm_arch.h"
  69. #if __ARM_MAX_ARCH__>=7
  70. ___
  71. $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
  72. $code.=<<___ if ($flavour !~ /64/);
  73. .fpu neon
  74. #ifdef __thumb2__
  75. .syntax unified
  76. .thumb
  77. # define INST(a,b,c,d) $_byte c,0xef,a,b
  78. #else
  79. .code 32
  80. # define INST(a,b,c,d) $_byte a,b,c,0xf2
  81. #endif
  82. .text
  83. ___
  84. ################################################################################
  85. # void gcm_init_v8(u128 Htable[16],const u64 H[2]);
  86. #
  87. # input: 128-bit H - secret parameter E(K,0^128)
  88. # output: precomputed table filled with degrees of twisted H;
  89. # H is twisted to handle reverse bitness of GHASH;
  90. # only few of 16 slots of Htable[16] are used;
  91. # data is opaque to outside world (which allows to
  92. # optimize the code independently);
  93. #
  94. $code.=<<___;
  95. .global gcm_init_v8
  96. .type gcm_init_v8,%function
  97. .align 4
  98. gcm_init_v8:
  99. ___
  100. $code.=<<___ if ($flavour =~ /64/);
  101. AARCH64_VALID_CALL_TARGET
  102. ___
  103. $code.=<<___;
  104. vld1.64 {$t1},[x1] @ load input H
  105. vmov.i8 $xC2,#0xe1
  106. vshl.i64 $xC2,$xC2,#57 @ 0xc2.0
  107. vext.8 $IN,$t1,$t1,#8
  108. vshr.u64 $t2,$xC2,#63
  109. vdup.32 $t1,${t1}[1]
  110. vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01
  111. vshr.u64 $t2,$IN,#63
  112. vshr.s32 $t1,$t1,#31 @ broadcast carry bit
  113. vand $t2,$t2,$t0
  114. vshl.i64 $IN,$IN,#1
  115. vext.8 $t2,$t2,$t2,#8
  116. vand $t0,$t0,$t1
  117. vorr $IN,$IN,$t2 @ H<<<=1
  118. veor $H,$IN,$t0 @ twisted H
  119. vst1.64 {$H},[x0],#16 @ store Htable[0]
  120. @ calculate H^2
  121. vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing
  122. vpmull.p64 $Xl,$H,$H
  123. veor $t0,$t0,$H
  124. vpmull2.p64 $Xh,$H,$H
  125. vpmull.p64 $Xm,$t0,$t0
  126. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  127. veor $t2,$Xl,$Xh
  128. veor $Xm,$Xm,$t1
  129. veor $Xm,$Xm,$t2
  130. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
  131. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  132. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  133. veor $Xl,$Xm,$t2
  134. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
  135. vpmull.p64 $Xl,$Xl,$xC2
  136. veor $t2,$t2,$Xh
  137. veor $H2,$Xl,$t2
  138. vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing
  139. veor $t1,$t1,$H2
  140. vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
  141. vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2]
  142. ___
  143. if ($flavour =~ /64/) {
  144. my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));
  145. my ($H3,$H34k,$H4,$H5,$H56k,$H6,$H7,$H78k,$H8) = map("q$_",(15..23));
  146. $code.=<<___;
  147. @ calculate H^3 and H^4
  148. vpmull.p64 $Xl,$H, $H2
  149. vpmull.p64 $Yl,$H2,$H2
  150. vpmull2.p64 $Xh,$H, $H2
  151. vpmull2.p64 $Yh,$H2,$H2
  152. vpmull.p64 $Xm,$t0,$t1
  153. vpmull.p64 $Ym,$t1,$t1
  154. vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
  155. vext.8 $t1,$Yl,$Yh,#8
  156. veor $t2,$Xl,$Xh
  157. veor $Xm,$Xm,$t0
  158. veor $t3,$Yl,$Yh
  159. veor $Ym,$Ym,$t1
  160. veor $Xm,$Xm,$t2
  161. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
  162. veor $Ym,$Ym,$t3
  163. vpmull.p64 $t3,$Yl,$xC2
  164. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  165. vmov $Yh#lo,$Ym#hi
  166. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  167. vmov $Ym#hi,$Yl#lo
  168. veor $Xl,$Xm,$t2
  169. veor $Yl,$Ym,$t3
  170. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
  171. vext.8 $t3,$Yl,$Yl,#8
  172. vpmull.p64 $Xl,$Xl,$xC2
  173. vpmull.p64 $Yl,$Yl,$xC2
  174. veor $t2,$t2,$Xh
  175. veor $t3,$t3,$Yh
  176. veor $H3, $Xl,$t2 @ H^3
  177. veor $H4,$Yl,$t3 @ H^4
  178. vext.8 $t0,$H3, $H3,#8 @ Karatsuba pre-processing
  179. vext.8 $t1,$H4,$H4,#8
  180. vext.8 $t2,$H2,$H2,#8
  181. veor $t0,$t0,$H3
  182. veor $t1,$t1,$H4
  183. veor $t2,$t2,$H2
  184. vext.8 $H34k,$t0,$t1,#8 @ pack Karatsuba pre-processed
  185. vst1.64 {$H3-$H4},[x0],#48 @ store Htable[3..5]
  186. @ calculate H^5 and H^6
  187. vpmull.p64 $Xl,$H2, $H3
  188. vpmull.p64 $Yl,$H3,$H3
  189. vpmull2.p64 $Xh,$H2, $H3
  190. vpmull2.p64 $Yh,$H3,$H3
  191. vpmull.p64 $Xm,$t0,$t2
  192. vpmull.p64 $Ym,$t0,$t0
  193. vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
  194. vext.8 $t1,$Yl,$Yh,#8
  195. veor $t2,$Xl,$Xh
  196. veor $Xm,$Xm,$t0
  197. veor $t3,$Yl,$Yh
  198. veor $Ym,$Ym,$t1
  199. veor $Xm,$Xm,$t2
  200. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
  201. veor $Ym,$Ym,$t3
  202. vpmull.p64 $t3,$Yl,$xC2
  203. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  204. vmov $Yh#lo,$Ym#hi
  205. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  206. vmov $Ym#hi,$Yl#lo
  207. veor $Xl,$Xm,$t2
  208. veor $Yl,$Ym,$t3
  209. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
  210. vext.8 $t3,$Yl,$Yl,#8
  211. vpmull.p64 $Xl,$Xl,$xC2
  212. vpmull.p64 $Yl,$Yl,$xC2
  213. veor $t2,$t2,$Xh
  214. veor $t3,$t3,$Yh
  215. veor $H5,$Xl,$t2 @ H^5
  216. veor $H6,$Yl,$t3 @ H^6
  217. vext.8 $t0,$H5, $H5,#8 @ Karatsuba pre-processing
  218. vext.8 $t1,$H6,$H6,#8
  219. vext.8 $t2,$H2,$H2,#8
  220. veor $t0,$t0,$H5
  221. veor $t1,$t1,$H6
  222. veor $t2,$t2,$H2
  223. vext.8 $H56k,$t0,$t1,#8 @ pack Karatsuba pre-processed
  224. vst1.64 {$H5-$H6},[x0],#48 @ store Htable[6..8]
  225. @ calculate H^7 and H^8
  226. vpmull.p64 $Xl,$H2,$H5
  227. vpmull.p64 $Yl,$H2,$H6
  228. vpmull2.p64 $Xh,$H2,$H5
  229. vpmull2.p64 $Yh,$H2,$H6
  230. vpmull.p64 $Xm,$t0,$t2
  231. vpmull.p64 $Ym,$t1,$t2
  232. vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
  233. vext.8 $t1,$Yl,$Yh,#8
  234. veor $t2,$Xl,$Xh
  235. veor $Xm,$Xm,$t0
  236. veor $t3,$Yl,$Yh
  237. veor $Ym,$Ym,$t1
  238. veor $Xm,$Xm,$t2
  239. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
  240. veor $Ym,$Ym,$t3
  241. vpmull.p64 $t3,$Yl,$xC2
  242. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  243. vmov $Yh#lo,$Ym#hi
  244. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  245. vmov $Ym#hi,$Yl#lo
  246. veor $Xl,$Xm,$t2
  247. veor $Yl,$Ym,$t3
  248. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
  249. vext.8 $t3,$Yl,$Yl,#8
  250. vpmull.p64 $Xl,$Xl,$xC2
  251. vpmull.p64 $Yl,$Yl,$xC2
  252. veor $t2,$t2,$Xh
  253. veor $t3,$t3,$Yh
  254. veor $H7,$Xl,$t2 @ H^7
  255. veor $H8,$Yl,$t3 @ H^8
  256. vext.8 $t0,$H7,$H7,#8 @ Karatsuba pre-processing
  257. vext.8 $t1,$H8,$H8,#8
  258. veor $t0,$t0,$H7
  259. veor $t1,$t1,$H8
  260. vext.8 $H78k,$t0,$t1,#8 @ pack Karatsuba pre-processed
  261. vst1.64 {$H7-$H8},[x0] @ store Htable[9..11]
  262. ___
  263. }
  264. $code.=<<___;
  265. ret
  266. .size gcm_init_v8,.-gcm_init_v8
  267. ___
  268. ################################################################################
  269. # void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
  270. #
  271. # input: Xi - current hash value;
  272. # Htable - table precomputed in gcm_init_v8;
  273. # output: Xi - next hash value Xi;
  274. #
  275. $code.=<<___;
  276. .global gcm_gmult_v8
  277. .type gcm_gmult_v8,%function
  278. .align 4
  279. gcm_gmult_v8:
  280. ___
  281. $code.=<<___ if ($flavour =~ /64/);
  282. AARCH64_VALID_CALL_TARGET
  283. ___
  284. $code.=<<___;
  285. vld1.64 {$t1},[$Xi] @ load Xi
  286. vmov.i8 $xC2,#0xe1
  287. vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ...
  288. vshl.u64 $xC2,$xC2,#57
  289. #ifndef __ARMEB__
  290. vrev64.8 $t1,$t1
  291. #endif
  292. vext.8 $IN,$t1,$t1,#8
  293. vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
  294. veor $t1,$t1,$IN @ Karatsuba pre-processing
  295. vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
  296. vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
  297. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  298. veor $t2,$Xl,$Xh
  299. veor $Xm,$Xm,$t1
  300. veor $Xm,$Xm,$t2
  301. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
  302. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  303. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  304. veor $Xl,$Xm,$t2
  305. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
  306. vpmull.p64 $Xl,$Xl,$xC2
  307. veor $t2,$t2,$Xh
  308. veor $Xl,$Xl,$t2
  309. #ifndef __ARMEB__
  310. vrev64.8 $Xl,$Xl
  311. #endif
  312. vext.8 $Xl,$Xl,$Xl,#8
  313. vst1.64 {$Xl},[$Xi] @ write out Xi
  314. ret
  315. .size gcm_gmult_v8,.-gcm_gmult_v8
  316. ___
  317. ################################################################################
  318. # void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
  319. #
  320. # input: table precomputed in gcm_init_v8;
  321. # current hash value Xi;
  322. # pointer to input data;
  323. # length of input data in bytes, but divisible by block size;
  324. # output: next hash value Xi;
  325. #
  326. $code.=<<___;
  327. .global gcm_ghash_v8
  328. .type gcm_ghash_v8,%function
  329. .align 4
  330. gcm_ghash_v8:
  331. ___
  332. $code.=<<___ if ($flavour =~ /64/);
  333. AARCH64_VALID_CALL_TARGET
  334. cmp $len,#64
  335. b.hs .Lgcm_ghash_v8_4x
  336. ___
  337. $code.=<<___ if ($flavour !~ /64/);
  338. vstmdb sp!,{d8-d15} @ 32-bit ABI says so
  339. ___
  340. $code.=<<___;
  341. vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
  342. @ "[rotated]" means that
  343. @ loaded value would have
  344. @ to be rotated in order to
  345. @ make it appear as in
  346. @ algorithm specification
  347. subs $len,$len,#32 @ see if $len is 32 or larger
  348. mov $inc,#16 @ $inc is used as post-
  349. @ increment for input pointer;
  350. @ as loop is modulo-scheduled
  351. @ $inc is zeroed just in time
  352. @ to preclude overstepping
  353. @ inp[len], which means that
  354. @ last block[s] are actually
  355. @ loaded twice, but last
  356. @ copy is not processed
  357. vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2
  358. vmov.i8 $xC2,#0xe1
  359. vld1.64 {$H2},[$Htbl]
  360. cclr $inc,eq @ is it time to zero $inc?
  361. vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi
  362. vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0]
  363. vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
  364. #ifndef __ARMEB__
  365. vrev64.8 $t0,$t0
  366. vrev64.8 $Xl,$Xl
  367. #endif
  368. vext.8 $IN,$t0,$t0,#8 @ rotate I[0]
  369. b.lo .Lodd_tail_v8 @ $len was less than 32
  370. ___
  371. { my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
  372. #######
  373. # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
  374. # [(H*Ii+1) + (H*Xi+1)] mod P =
  375. # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
  376. #
  377. $code.=<<___;
  378. vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1]
  379. #ifndef __ARMEB__
  380. vrev64.8 $t1,$t1
  381. #endif
  382. vext.8 $In,$t1,$t1,#8
  383. veor $IN,$IN,$Xl @ I[i]^=Xi
  384. vpmull.p64 $Xln,$H,$In @ H·Ii+1
  385. veor $t1,$t1,$In @ Karatsuba pre-processing
  386. vpmull2.p64 $Xhn,$H,$In
  387. b .Loop_mod2x_v8
  388. .align 4
  389. .Loop_mod2x_v8:
  390. vext.8 $t2,$IN,$IN,#8
  391. subs $len,$len,#32 @ is there more data?
  392. vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo
  393. cclr $inc,lo @ is it time to zero $inc?
  394. vpmull.p64 $Xmn,$Hhl,$t1
  395. veor $t2,$t2,$IN @ Karatsuba pre-processing
  396. vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi
  397. veor $Xl,$Xl,$Xln @ accumulate
  398. vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
  399. vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2]
  400. veor $Xh,$Xh,$Xhn
  401. cclr $inc,eq @ is it time to zero $inc?
  402. veor $Xm,$Xm,$Xmn
  403. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  404. veor $t2,$Xl,$Xh
  405. veor $Xm,$Xm,$t1
  406. vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3]
  407. #ifndef __ARMEB__
  408. vrev64.8 $t0,$t0
  409. #endif
  410. veor $Xm,$Xm,$t2
  411. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
  412. #ifndef __ARMEB__
  413. vrev64.8 $t1,$t1
  414. #endif
  415. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  416. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  417. vext.8 $In,$t1,$t1,#8
  418. vext.8 $IN,$t0,$t0,#8
  419. veor $Xl,$Xm,$t2
  420. vpmull.p64 $Xln,$H,$In @ H·Ii+1
  421. veor $IN,$IN,$Xh @ accumulate $IN early
  422. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
  423. vpmull.p64 $Xl,$Xl,$xC2
  424. veor $IN,$IN,$t2
  425. veor $t1,$t1,$In @ Karatsuba pre-processing
  426. veor $IN,$IN,$Xl
  427. vpmull2.p64 $Xhn,$H,$In
  428. b.hs .Loop_mod2x_v8 @ there was at least 32 more bytes
  429. veor $Xh,$Xh,$t2
  430. vext.8 $IN,$t0,$t0,#8 @ re-construct $IN
  431. adds $len,$len,#32 @ re-construct $len
  432. veor $Xl,$Xl,$Xh @ re-construct $Xl
  433. b.eq .Ldone_v8 @ is $len zero?
  434. ___
  435. }
  436. $code.=<<___;
  437. .Lodd_tail_v8:
  438. vext.8 $t2,$Xl,$Xl,#8
  439. veor $IN,$IN,$Xl @ inp^=Xi
  440. veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi
  441. vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
  442. veor $t1,$t1,$IN @ Karatsuba pre-processing
  443. vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
  444. vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
  445. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  446. veor $t2,$Xl,$Xh
  447. veor $Xm,$Xm,$t1
  448. veor $Xm,$Xm,$t2
  449. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
  450. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  451. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  452. veor $Xl,$Xm,$t2
  453. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
  454. vpmull.p64 $Xl,$Xl,$xC2
  455. veor $t2,$t2,$Xh
  456. veor $Xl,$Xl,$t2
  457. .Ldone_v8:
  458. #ifndef __ARMEB__
  459. vrev64.8 $Xl,$Xl
  460. #endif
  461. vext.8 $Xl,$Xl,$Xl,#8
  462. vst1.64 {$Xl},[$Xi] @ write out Xi
  463. ___
  464. $code.=<<___ if ($flavour !~ /64/);
  465. vldmia sp!,{d8-d15} @ 32-bit ABI says so
  466. ___
  467. $code.=<<___;
  468. ret
  469. .size gcm_ghash_v8,.-gcm_ghash_v8
  470. ___
  471. if ($flavour =~ /64/) { # 4x subroutine
  472. my ($I0,$j1,$j2,$j3,
  473. $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));
  474. $code.=<<___;
  475. .type gcm_ghash_v8_4x,%function
  476. .align 4
  477. gcm_ghash_v8_4x:
  478. .Lgcm_ghash_v8_4x:
  479. vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
  480. vld1.64 {$H-$H2},[$Htbl],#48 @ load twisted H, ..., H^2
  481. vmov.i8 $xC2,#0xe1
  482. vld1.64 {$H3-$H4},[$Htbl] @ load twisted H^3, ..., H^4
  483. vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
  484. vld1.64 {$I0-$j3},[$inp],#64
  485. #ifndef __ARMEB__
  486. vrev64.8 $Xl,$Xl
  487. vrev64.8 $j1,$j1
  488. vrev64.8 $j2,$j2
  489. vrev64.8 $j3,$j3
  490. vrev64.8 $I0,$I0
  491. #endif
  492. vext.8 $I3,$j3,$j3,#8
  493. vext.8 $I2,$j2,$j2,#8
  494. vext.8 $I1,$j1,$j1,#8
  495. vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
  496. veor $j3,$j3,$I3
  497. vpmull2.p64 $Yh,$H,$I3
  498. vpmull.p64 $Ym,$Hhl,$j3
  499. vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
  500. veor $j2,$j2,$I2
  501. vpmull2.p64 $I2,$H2,$I2
  502. vpmull2.p64 $j2,$Hhl,$j2
  503. veor $Yl,$Yl,$t0
  504. veor $Yh,$Yh,$I2
  505. veor $Ym,$Ym,$j2
  506. vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
  507. veor $j1,$j1,$I1
  508. vpmull2.p64 $I1,$H3,$I1
  509. vpmull.p64 $j1,$H34,$j1
  510. veor $Yl,$Yl,$j3
  511. veor $Yh,$Yh,$I1
  512. veor $Ym,$Ym,$j1
  513. subs $len,$len,#128
  514. b.lo .Ltail4x
  515. b .Loop4x
  516. .align 4
  517. .Loop4x:
  518. veor $t0,$I0,$Xl
  519. vld1.64 {$I0-$j3},[$inp],#64
  520. vext.8 $IN,$t0,$t0,#8
  521. #ifndef __ARMEB__
  522. vrev64.8 $j1,$j1
  523. vrev64.8 $j2,$j2
  524. vrev64.8 $j3,$j3
  525. vrev64.8 $I0,$I0
  526. #endif
  527. vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
  528. veor $t0,$t0,$IN
  529. vpmull2.p64 $Xh,$H4,$IN
  530. vext.8 $I3,$j3,$j3,#8
  531. vpmull2.p64 $Xm,$H34,$t0
  532. veor $Xl,$Xl,$Yl
  533. veor $Xh,$Xh,$Yh
  534. vext.8 $I2,$j2,$j2,#8
  535. veor $Xm,$Xm,$Ym
  536. vext.8 $I1,$j1,$j1,#8
  537. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  538. veor $t2,$Xl,$Xh
  539. vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
  540. veor $j3,$j3,$I3
  541. veor $Xm,$Xm,$t1
  542. vpmull2.p64 $Yh,$H,$I3
  543. veor $Xm,$Xm,$t2
  544. vpmull.p64 $Ym,$Hhl,$j3
  545. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
  546. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  547. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  548. vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
  549. veor $j2,$j2,$I2
  550. vpmull2.p64 $I2,$H2,$I2
  551. veor $Xl,$Xm,$t2
  552. vpmull2.p64 $j2,$Hhl,$j2
  553. veor $Yl,$Yl,$t0
  554. veor $Yh,$Yh,$I2
  555. veor $Ym,$Ym,$j2
  556. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
  557. vpmull.p64 $Xl,$Xl,$xC2
  558. vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
  559. veor $j1,$j1,$I1
  560. veor $t2,$t2,$Xh
  561. vpmull2.p64 $I1,$H3,$I1
  562. vpmull.p64 $j1,$H34,$j1
  563. veor $Xl,$Xl,$t2
  564. veor $Yl,$Yl,$j3
  565. veor $Yh,$Yh,$I1
  566. vext.8 $Xl,$Xl,$Xl,#8
  567. veor $Ym,$Ym,$j1
  568. subs $len,$len,#64
  569. b.hs .Loop4x
  570. .Ltail4x:
  571. veor $t0,$I0,$Xl
  572. vext.8 $IN,$t0,$t0,#8
  573. vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
  574. veor $t0,$t0,$IN
  575. vpmull2.p64 $Xh,$H4,$IN
  576. vpmull2.p64 $Xm,$H34,$t0
  577. veor $Xl,$Xl,$Yl
  578. veor $Xh,$Xh,$Yh
  579. veor $Xm,$Xm,$Ym
  580. adds $len,$len,#64
  581. b.eq .Ldone4x
  582. cmp $len,#32
  583. b.lo .Lone
  584. b.eq .Ltwo
  585. .Lthree:
  586. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  587. veor $t2,$Xl,$Xh
  588. veor $Xm,$Xm,$t1
  589. vld1.64 {$I0-$j2},[$inp]
  590. veor $Xm,$Xm,$t2
  591. #ifndef __ARMEB__
  592. vrev64.8 $j1,$j1
  593. vrev64.8 $j2,$j2
  594. vrev64.8 $I0,$I0
  595. #endif
  596. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
  597. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  598. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  599. vext.8 $I2,$j2,$j2,#8
  600. vext.8 $I1,$j1,$j1,#8
  601. veor $Xl,$Xm,$t2
  602. vpmull.p64 $Yl,$H,$I2 @ H·Ii+2
  603. veor $j2,$j2,$I2
  604. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
  605. vpmull.p64 $Xl,$Xl,$xC2
  606. veor $t2,$t2,$Xh
  607. vpmull2.p64 $Yh,$H,$I2
  608. vpmull.p64 $Ym,$Hhl,$j2
  609. veor $Xl,$Xl,$t2
  610. vpmull.p64 $j3,$H2,$I1 @ H^2·Ii+1
  611. veor $j1,$j1,$I1
  612. vext.8 $Xl,$Xl,$Xl,#8
  613. vpmull2.p64 $I1,$H2,$I1
  614. veor $t0,$I0,$Xl
  615. vpmull2.p64 $j1,$Hhl,$j1
  616. vext.8 $IN,$t0,$t0,#8
  617. veor $Yl,$Yl,$j3
  618. veor $Yh,$Yh,$I1
  619. veor $Ym,$Ym,$j1
  620. vpmull.p64 $Xl,$H3,$IN @ H^3·(Xi+Ii)
  621. veor $t0,$t0,$IN
  622. vpmull2.p64 $Xh,$H3,$IN
  623. vpmull.p64 $Xm,$H34,$t0
  624. veor $Xl,$Xl,$Yl
  625. veor $Xh,$Xh,$Yh
  626. veor $Xm,$Xm,$Ym
  627. b .Ldone4x
  628. .align 4
  629. .Ltwo:
  630. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  631. veor $t2,$Xl,$Xh
  632. veor $Xm,$Xm,$t1
  633. vld1.64 {$I0-$j1},[$inp]
  634. veor $Xm,$Xm,$t2
  635. #ifndef __ARMEB__
  636. vrev64.8 $j1,$j1
  637. vrev64.8 $I0,$I0
  638. #endif
  639. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
  640. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  641. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  642. vext.8 $I1,$j1,$j1,#8
  643. veor $Xl,$Xm,$t2
  644. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
  645. vpmull.p64 $Xl,$Xl,$xC2
  646. veor $t2,$t2,$Xh
  647. veor $Xl,$Xl,$t2
  648. vext.8 $Xl,$Xl,$Xl,#8
  649. vpmull.p64 $Yl,$H,$I1 @ H·Ii+1
  650. veor $j1,$j1,$I1
  651. veor $t0,$I0,$Xl
  652. vext.8 $IN,$t0,$t0,#8
  653. vpmull2.p64 $Yh,$H,$I1
  654. vpmull.p64 $Ym,$Hhl,$j1
  655. vpmull.p64 $Xl,$H2,$IN @ H^2·(Xi+Ii)
  656. veor $t0,$t0,$IN
  657. vpmull2.p64 $Xh,$H2,$IN
  658. vpmull2.p64 $Xm,$Hhl,$t0
  659. veor $Xl,$Xl,$Yl
  660. veor $Xh,$Xh,$Yh
  661. veor $Xm,$Xm,$Ym
  662. b .Ldone4x
  663. .align 4
  664. .Lone:
  665. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  666. veor $t2,$Xl,$Xh
  667. veor $Xm,$Xm,$t1
  668. vld1.64 {$I0},[$inp]
  669. veor $Xm,$Xm,$t2
  670. #ifndef __ARMEB__
  671. vrev64.8 $I0,$I0
  672. #endif
  673. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
  674. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  675. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  676. veor $Xl,$Xm,$t2
  677. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
  678. vpmull.p64 $Xl,$Xl,$xC2
  679. veor $t2,$t2,$Xh
  680. veor $Xl,$Xl,$t2
  681. vext.8 $Xl,$Xl,$Xl,#8
  682. veor $t0,$I0,$Xl
  683. vext.8 $IN,$t0,$t0,#8
  684. vpmull.p64 $Xl,$H,$IN
  685. veor $t0,$t0,$IN
  686. vpmull2.p64 $Xh,$H,$IN
  687. vpmull.p64 $Xm,$Hhl,$t0
  688. .Ldone4x:
  689. vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
  690. veor $t2,$Xl,$Xh
  691. veor $Xm,$Xm,$t1
  692. veor $Xm,$Xm,$t2
  693. vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
  694. vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
  695. vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
  696. veor $Xl,$Xm,$t2
  697. vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
  698. vpmull.p64 $Xl,$Xl,$xC2
  699. veor $t2,$t2,$Xh
  700. veor $Xl,$Xl,$t2
  701. vext.8 $Xl,$Xl,$Xl,#8
  702. #ifndef __ARMEB__
  703. vrev64.8 $Xl,$Xl
  704. #endif
  705. vst1.64 {$Xl},[$Xi] @ write out Xi
  706. ret
  707. .size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
  708. ___
  709. }
  710. }
  711. $code.=<<___;
  712. .asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
  713. .align 2
  714. #endif
  715. ___
  716. if ($flavour =~ /64/) { ######## 64-bit code
  717. sub unvmov {
  718. my $arg=shift;
  719. $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
  720. sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
  721. $3<8?$3:$3+8,($4 eq "lo")?0:1;
  722. }
  723. foreach(split("\n",$code)) {
  724. s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
  725. s/vmov\.i8/movi/o or # fix up legacy mnemonics
  726. s/vmov\s+(.*)/unvmov($1)/geo or
  727. s/vext\.8/ext/o or
  728. s/vshr\.s/sshr\.s/o or
  729. s/vshr/ushr/o or
  730. s/^(\s+)v/$1/o or # strip off v prefix
  731. s/\bbx\s+lr\b/ret/o;
  732. s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
  733. s/@\s/\/\//o; # old->new style commentary
  734. # fix up remaining legacy suffixes
  735. s/\.[ui]?8(\s)/$1/o;
  736. s/\.[uis]?32//o and s/\.16b/\.4s/go;
  737. m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
  738. m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
  739. s/\.[uisp]?64//o and s/\.16b/\.2d/go;
  740. s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
  741. # Switch preprocessor checks to aarch64 versions.
  742. s/__ARME([BL])__/__AARCH64E$1__/go;
  743. print $_,"\n";
  744. }
  745. } else { ######## 32-bit code
  746. sub unvdup32 {
  747. my $arg=shift;
  748. $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
  749. sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
  750. }
  751. sub unvpmullp64 {
  752. my ($mnemonic,$arg)=@_;
  753. if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
  754. my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
  755. |(($2&7)<<17)|(($2&8)<<4)
  756. |(($3&7)<<1) |(($3&8)<<2);
  757. $word |= 0x00010001 if ($mnemonic =~ "2");
  758. # since ARMv7 instructions are always encoded little-endian.
  759. # correct solution is to use .inst directive, but older
  760. # assemblers don't implement it:-(
  761. sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
  762. $word&0xff,($word>>8)&0xff,
  763. ($word>>16)&0xff,($word>>24)&0xff,
  764. $mnemonic,$arg;
  765. }
  766. }
  767. foreach(split("\n",$code)) {
  768. s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
  769. s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
  770. s/\/\/\s?/@ /o; # new->old style commentary
  771. # fix up remaining new-style suffixes
  772. s/\],#[0-9]+/]!/o;
  773. s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
  774. s/vdup\.32\s+(.*)/unvdup32($1)/geo or
  775. s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
  776. s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
  777. s/^(\s+)b\./$1b/o or
  778. s/^(\s+)ret/$1bx\tlr/o;
  779. if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
  780. print " it $2\n";
  781. }
  782. print $_,"\n";
  783. }
  784. }
  785. close STDOUT or die "error closing STDOUT: $!"; # enforce flush