2
0

ghash-armv4.pl 11 KB


  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  5. # project. The module is, however, dual licensed under OpenSSL and
  6. # CRYPTOGAMS licenses depending on where you obtain it. For further
  7. # details see http://www.openssl.org/~appro/cryptogams/.
  8. # ====================================================================
  9. #
  10. # April 2010
  11. #
  12. # The module implements "4-bit" GCM GHASH function and underlying
  13. # single multiplication operation in GF(2^128). "4-bit" means that it
  14. # uses 256 bytes per-key table [+32 bytes shared table]. There is no
  15. # experimental performance data available yet. The only approximation
  16. # that can be made at this point is based on code size. Inner loop is
  17. # 32 instructions long and on single-issue core should execute in <40
  18. # cycles. Having verified that gcc 3.4 didn't unroll corresponding
  19. # loop, this assembler loop body was found to be ~3x smaller than
  20. # compiler-generated one...
  21. #
  22. # July 2010
  23. #
  24. # Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
  25. # Cortex A8 core and ~25 cycles per processed byte (which was observed
  26. # to be ~3 times faster than gcc-generated code:-)
  27. #
  28. # February 2011
  29. #
  30. # Profiler-assisted and platform-specific optimization resulted in 7%
  31. # improvement on Cortex A8 core and ~23.5 cycles per byte.
  32. #
  33. # March 2011
  34. #
  35. # Add NEON implementation featuring polynomial multiplication, i.e. no
  36. # lookup tables involved. On Cortex A8 it was measured to process one
  37. # byte in 15 cycles or 55% faster than integer-only code.
  38. # ====================================================================
  39. # Note about "528B" variant. In ARM case it makes lesser sense to
  40. # implement it for following reasons:
  41. #
  42. # - performance improvement won't be anywhere near 50%, because 128-
  43. # bit shift operation is neatly fused with 128-bit xor here, and
  44. # "528B" variant would eliminate only 4-5 instructions out of 32
  45. # in the inner loop (meaning that estimated improvement is ~15%);
  46. # - ARM-based systems are often embedded ones and extra memory
  47. # consumption might be unappreciated (for so little improvement);
  48. #
  49. # Byte order [in]dependence. =========================================
  50. #
  51. # Caller is expected to maintain specific *dword* order in Htable,
  52. # namely with *least* significant dword of 128-bit value at *lower*
  53. # address. This differs completely from C code and has everything to
  54. # do with ldm instruction and order in which dwords are "consumed" by
  55. # algorithm. *Byte* order within these dwords in turn is whatever
  56. # *native* byte order on current platform. See gcm128.c for working
  57. # example...
  # Locate the output file among the command-line arguments: arguments are
  # consumed one by one until one looks like a plain file name ("name.ext")
  # or the list runs out.
  while (($output = shift) && ($output !~ /^\w[\w\-]*\.\w+$/)) {}

  # Redirect STDOUT into that file.  The three-argument form keeps the name
  # from ever being interpreted as part of the open mode, and a failed open is
  # fatal instead of silently leaving the assembly on the previous STDOUT.
  # When no file name was supplied, STDOUT is left untouched so the output
  # goes to the inherited stream.
  if (defined $output && $output ne "") {
      open STDOUT, '>', $output or die "can't open $output: $!";
  }
  # Symbolic names for the ARM core registers used by the generated code.

  # r0-r3: the argument block.
  ($Xi, $Htbl, $inp, $len) = map { "r$_" } (0 .. 3);

  # r4-r11: working variables (the Z and T words).
  ($Zll, $Zlh, $Zhl, $Zhh) = map { "r$_" } (4 .. 7);
  ($Tll, $Tlh, $Thl, $Thh) = map { "r$_" } (8 .. 11);

  # r12 and r14 hold the nibble indices; r13 (stack pointer) and r15 (program
  # counter) are deliberately skipped.
  ($nlo, $nhi) = ("r12", "r14");

  # Aliases that reuse argument registers: $rem_4bit shares $inp's register
  # (used in gcm_gmult_4bit) and $cnt shares $len's register.
  ($rem_4bit, $cnt) = ($inp, $len);
  # Zsmash(@insns) - append to $code the instructions that store the four
  # result words $Zll, $Zlh, $Zhl, $Zhh to [$Xi] at offsets 12, 8, 4 and 0.
  #
  # For every word three preprocessor-selected variants are emitted:
  #   - __ARM_ARCH__>=7 on little-endian: byte-reverse with "rev", then "str";
  #   - big-endian (__ARMEB__): plain "str";
  #   - otherwise: four "strb" stores, most significant byte at the lowest
  #     address, using $Tlh, $Thl and $Thh as scratch registers.
  #
  # Each argument is one extra instruction to interleave: the first is emitted
  # after the $Zll store, the second after $Zlh, and so on.  A "\t...\n" line
  # is emitted after every word even when no instruction is left (it is then
  # blank), so the generated text does not depend on how many are passed.
  #
  # Declared without a prototype: the sub does take arguments, and the former
  # empty prototype only worked because every caller uses the &-form.
  sub Zsmash {
      my @extra_insns = @_;
      my $offset      = 12;

      for my $reg ($Zll, $Zlh, $Zhl, $Zhh) {
          $code .= <<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev $reg,$reg
str $reg,[$Xi,#$offset]
#elif defined(__ARMEB__)
str $reg,[$Xi,#$offset]
#else
mov $Tlh,$reg,lsr#8
strb $reg,[$Xi,#$offset+3]
mov $Thl,$reg,lsr#16
strb $Tlh,[$Xi,#$offset+2]
mov $Thh,$reg,lsr#24
strb $Thl,[$Xi,#$offset+1]
strb $Thh,[$Xi,#$offset]
#endif
___
          # Never concatenate undef once the caller's list is exhausted.
          my $insn = @extra_insns ? shift(@extra_insns) : "";
          $code .= "\t$insn\n";
          $offset -= 4;
      }
      return;
  }
  # Start the generated assembly; this assignment initialises $code.
  #
  # First piece: the shared rem_4bit table (sixteen 16-bit constants, i.e. 32
  # bytes) followed by rem_4bit_get, a four-instruction stub that computes the
  # table address into $rem_4bit from pc and branches to .Lrem_4bit_got (that
  # label is emitted later, inside gcm_gmult_4bit).  Nothing may be inserted
  # between the table, the stub and gcm_ghash_4bit: the stub and
  # gcm_ghash_4bit locate the table by subtracting the fixed offsets #32 and
  # #48 from a pc-derived address.
  $code = <<"END_REM_4BIT";
#include "arm_arch.h"
.text
.code 32
.type rem_4bit,%object
.align 5
rem_4bit:
.short 0x0000,0x1C20,0x3840,0x2460
.short 0x7080,0x6CA0,0x48C0,0x54E0
.short 0xE100,0xFD20,0xD940,0xC560
.short 0x9180,0x8DA0,0xA9C0,0xB5E0
.size rem_4bit,.-rem_4bit
.type rem_4bit_get,%function
rem_4bit_get:
sub $rem_4bit,pc,#8
sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
b .Lrem_4bit_got
nop
.size rem_4bit_get,.-rem_4bit_get
END_REM_4BIT

  # Second piece: gcm_ghash_4bit, from its entry up to the point where the
  # result words are stored back.  $len is turned into an end pointer and
  # saved on the stack because its register doubles as the loop counter
  # ($cnt); rem_4bit is copied to the stack and indexed through sp.  The
  # store of the result and the loop-closing branch are appended by the
  # statements that follow this one.
  $code .= <<"END_GHASH_4BIT";
.global gcm_ghash_4bit
.type gcm_ghash_4bit,%function
gcm_ghash_4bit:
sub r12,pc,#8
add $len,$inp,$len @ $len to point at the end
stmdb sp!,{r3-r11,lr} @ save $len/end too
sub r12,r12,#48 @ &rem_4bit
ldmia r12,{r4-r11} @ copy rem_4bit ...
stmdb sp!,{r4-r11} @ ... to stack
ldrb $nlo,[$inp,#15]
ldrb $nhi,[$Xi,#15]
.Louter:
eor $nlo,$nlo,$nhi
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
mov $cnt,#14
add $Zhh,$Htbl,$nlo,lsl#4
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
add $Thh,$Htbl,$nhi
ldrb $nlo,[$inp,#14]
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
ldrb $nhi,[$Xi,#14]
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
eor $nlo,$nlo,$nhi
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16
.Linner:
add $Thh,$Htbl,$nlo,lsl#4
and $nlo,$Zll,#0xf @ rem
subs $cnt,$cnt,#1
add $nlo,$nlo,$nlo
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
eor $Zll,$Tll,$Zll,lsr#4
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4
ldrplb $nlo,[$inp,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr#4
ldrplb $Tll,[$Xi,$cnt]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
ldrh $Tlh,[sp,$nhi]
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eorpl $nlo,$nlo,$Tll
eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
bpl .Linner
ldr $len,[sp,#32] @ re-load $len/end
add $inp,$inp,#16
mov $nhi,$Zll
END_GHASH_4BIT
  # Store the result words of gcm_ghash_4bit back to [$Xi].  Two instructions
  # are interleaved with the stores: the comparison of $inp against the end
  # pointer, and the conditional load of the next block's last byte.  The
  # &-form of the call is required while Zsmash carries its empty prototype.
  &Zsmash(
      "cmp\t$inp,$len",
      "ldrneb\t$nlo,[$inp,#15]",
  );

  # Loop-closing branch and epilogue of gcm_ghash_4bit.  The text contains no
  # Perl variables, hence the non-interpolating heredoc.  "add sp,sp,#36"
  # drops the 32-byte rem_4bit copy plus the saved end pointer before the
  # callee-saved registers are restored.
  $code .= <<'END_GHASH_EPILOGUE';
bne .Louter
add sp,sp,#36
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_ghash_4bit,.-gcm_ghash_4bit
END_GHASH_EPILOGUE

  # gcm_gmult_4bit, from its entry up to the point where the result words are
  # stored back.  It jumps through rem_4bit_get to obtain the table address in
  # $rem_4bit and resumes at .Lrem_4bit_got; unlike gcm_ghash_4bit it indexes
  # the table in place rather than through a stack copy.
  $code .= <<"END_GMULT_4BIT";
.global gcm_gmult_4bit
.type gcm_gmult_4bit,%function
gcm_gmult_4bit:
stmdb sp!,{r4-r11,lr}
ldrb $nlo,[$Xi,#15]
b rem_4bit_get
.Lrem_4bit_got:
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
mov $cnt,#14
add $Zhh,$Htbl,$nlo,lsl#4
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
ldrb $nlo,[$Xi,#14]
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
and $nhi,$nlo,#0xf0
eor $Zhh,$Zhh,$Tll,lsl#16
and $nlo,$nlo,#0x0f
.Loop:
add $Thh,$Htbl,$nlo,lsl#4
and $nlo,$Zll,#0xf @ rem
subs $cnt,$cnt,#1
add $nlo,$nlo,$nlo
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
eor $Zll,$Tll,$Zll,lsr#4
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4
ldrplb $nlo,[$Xi,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr#4
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop
END_GMULT_4BIT
  # Store the result words of gcm_gmult_4bit back to [$Xi]; no instructions
  # are interleaved here.  The &-form of the call is required while Zsmash
  # carries its empty prototype.
  &Zsmash();

  # Epilogue of gcm_gmult_4bit: restore the registers pushed on entry and
  # return.  The text contains no Perl variables, hence the non-interpolating
  # heredoc.
  $code .= <<'END_GMULT_EPILOGUE';
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_gmult_4bit,.-gcm_gmult_4bit
END_GMULT_EPILOGUE
  {
      # NEON code path: gcm_gmult_neon and gcm_ghash_neon, built on polynomial
      # multiplication (vmull.p8) instead of lookup tables.  The register
      # names below are lexical to this block; the helper subs are ordinary
      # package subs because they are called later, when the `...` spans of
      # the template are evaluated during post-processing.

      my $cnt = $Htbl;    # $Htbl is used once in the very beginning

      # Only as many names as are listed get assigned (d0-d5 and q8-q14); the
      # surplus values produced by map are discarded.
      my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_", (0..7));
      my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_", (8..15));

      # Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
      # in Zo. Or should I say "top bit", because GHASH is specified in
      # reverse bit order? Otherwise straightforward 128-bit H by one input
      # byte multiplication and modulo-reduction, times 16.

      # Register-name helpers.  They are declared without prototypes: each
      # takes one argument, and the former empty prototypes only worked
      # because every call uses the &-form.

      # Dlo("qN") returns "d(2N)", the D-register name for the low half of
      # qN; returns "" when the argument contains no q-register name.
      sub Dlo {
          my ($reg) = @_;
          return $reg =~ m|q([1]?[0-9])| ? "d" . ($1 * 2) : "";
      }

      # Dhi("qN") returns "d(2N+1)", the D-register name for the high half of
      # qN; returns "" when the argument contains no q-register name.
      sub Dhi {
          my ($reg) = @_;
          return $reg =~ m|q([1]?[0-9])| ? "d" . ($1 * 2 + 1) : "";
      }

      # Q("dM") returns "q(M/2)" for an even-numbered D register; returns ""
      # when no even D-register number can be matched.
      sub Q {
          my ($reg) = @_;
          return $reg =~ m|d([1-3]?[02468])| ? "q" . ($1 / 2) : "";
      }

      # gcm_gmult_neon only sets up its registers and branches into
      # .Linner_neon, which it shares with gcm_ghash_neon; it loads $len with
      # 16 so that the "subs $len,#16" after the inner loop ends after a
      # single block.  Every helper argument inside `...` is quoted so the
      # evaluated expression receives a string, not a bareword.
      $code .= <<___;
#if __ARM_ARCH__>=7
.fpu neon
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
sub $Htbl,#16 @ point at H in GCM128_CTX
vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
vmov.i32 $mod,#0xe1 @ our irreducible polynomial
vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
vshr.u64 $mod,#32
vldmia $Htbl,{$Hhi-$Hlo} @ load H
veor $zero,$zero
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
veor $Qpost,$Qpost
veor $R,$R
mov $cnt,#16
veor $Z,$Z
mov $len,#16
veor $Zo,$Zo
vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
b .Linner_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
vmov.i32 $mod,#0xe1 @ our irreducible polynomial
vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
vshr.u64 $mod,#32
vldmia $Xi,{$Hhi-$Hlo} @ load H
veor $zero,$zero
nop
#ifdef __ARMEL__
vrev64.8 $Z,$Z
#endif
.Louter_neon:
vld1.64 `&Dhi("$IN")`,[$inp]! @ load inp
veor $Qpost,$Qpost
vld1.64 `&Dlo("$IN")`,[$inp]!
veor $R,$R
mov $cnt,#16
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
veor $Zo,$Zo
veor $IN,$Z @ inp^=Xi
veor $Z,$Z
vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
.Linner_neon:
subs $cnt,$cnt,#1
vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i]
vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i]
vext.8 $IN,$zero,#1 @ IN>>=8
veor $Z,$Qpost @ modulo-scheduled part
vshl.i64 `&Dlo("$R")`,#48
vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
veor `&Dhi("$Z")`,`&Dlo("$R")`
vuzp.8 $Qlo,$Qhi
vsli.8 $Zo,$T,#1 @ compose the "carry" byte
vext.8 $Z,$zero,#1 @ Z>>=8
vmull.p8 $R,$Zo,$mod @ "carry"·0xe1
vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
veor $Z,$Qhi
bne .Linner_neon
veor $Z,$Qpost @ modulo-scheduled artefact
vshl.i64 `&Dlo("$R")`,#48
veor `&Dhi("$Z")`,`&Dlo("$R")`
@ finalization, normalize Z:Zo
vand $Zo,$mod @ suffices to mask the bit
vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
vshl.i64 $Z,#1
subs $len,#16
vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
bne .Louter_neon
#ifdef __ARMEL__
vrev64.8 $Z,$Z
#endif
sub $Xi,#16
vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
vst1.64 `&Dlo("$Z")`,[$Xi,:64]
bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
  }
  # Trailer of the generated file: identification string and final alignment.
  $code .= <<___;
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___

  # Post-processing.  Every `...` span in the template is replaced by the
  # result of evaluating its contents as Perl (these are the calls to the
  # Dlo/Dhi/Q register-name helpers).
  $code =~ s/\`([^\`]*)\`/eval $1/gem;

  # Replace "bx lr" with the raw instruction word to make it possible to
  # compile with -march=armv4.
  $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;

  print $code;

  # Closing enforces the flush.  Buffered write errors (full disk, broken
  # pipe) only surface here, so a failed close must abort instead of leaving
  # a truncated output file behind unnoticed.
  close STDOUT or die "error closing STDOUT: $!";