armv4-gf2m.pl 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332
  1. #! /usr/bin/env perl
  2. # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # May 2011
  17. #
  18. # The module implements bn_GF2m_mul_2x2 polynomial multiplication
  19. # used in bn_gf2m.c. It's kind of a low-hanging mechanical port from
  20. # C for the time being... Except that it has two code paths: pure
  21. # integer code suitable for any ARMv4 and later CPU, and NEON code
  22. # suitable for ARMv7. The pure integer 1x1 multiplication subroutine runs
  23. # in ~45 cycles on a dual-issue core such as Cortex A8, which is ~50%
  24. # faster than compiler-generated code. For ECDH and ECDSA verify (but
  25. # not for ECDSA sign) it means a 25%-45% improvement depending on key
  26. # length, more for longer keys. Even though the NEON 1x1 multiplication
  27. # runs in even fewer cycles, ~30, the improvement is measurable only on
  28. # longer keys. One has to optimize code elsewhere to get NEON glow...
  29. #
  30. # April 2014
  31. #
  32. # Double bn_GF2m_mul_2x2 performance by using the algorithm from the
  33. # paper referenced below, which improves ECDH and ECDSA verify benchmarks
  34. # by 18-40%.
  35. #
  36. # Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
  37. # Polynomial Multiplication on ARM Processors using the NEON Engine.
  38. #
  39. # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
  40. $flavour = shift;
  41. if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
  42. else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
  43. if ($flavour && $flavour ne "void") {
  44. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  45. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  46. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  47. die "can't locate arm-xlate.pl";
  48. open STDOUT,"| \"$^X\" $xlate $flavour $output";
  49. } else {
  50. open STDOUT,">$output";
  51. }
  52. $code=<<___;
  53. #include "arm_arch.h"
  54. .text
  55. #if defined(__thumb2__)
  56. .syntax unified
  57. .thumb
  58. #else
  59. .code 32
  60. #endif
  61. ___
################
# private interface to mul_1x1_ialu
#
# Integer-only carry-less (polynomial over GF(2)) 32x32->64-bit multiply.
#
# In:   $a (r1)      first operand
#       $b (r0)      second operand
#       $mask (r12)  table offset mask; the caller in bn_GF2m_mul_2x2
#                    loads it with 7<<2
#       sp           points at 32 bytes of scratch, used here as tab[8]
# Out:  $hi:$lo (r4:r5) = 64-bit product
# Writes r4-r9 and the condition flags (tst); $a, $b and $mask are only
# read. Returns with "mov pc,lr".
#
# Method: a1 is $a with its two top bits cleared, so a1<<2 still fits in
# 32 bits. tab[i], i=0..7, is filled with the XOR combination of a1, a1<<1
# and a1<<2 selected by the bits of i. $b is then consumed three bits at a
# time: ($b >> 3k) & 7, pre-scaled to a byte offset by $mask, selects a
# table word that is XOR-ed into $hi:$lo shifted left by 3k. The two bits
# of $a that were masked off are handled separately: if bit 30 (bit 31) of
# $a is set, $b<<30 ($b<<31) is XOR-ed into $hi:$lo as well.
#
$a="r1";
$b="r0";

# Both name lists alias r4..r9: the registers are used under the $a* names
# while the table is being built and under $hi/$lo/$t*/$i* while it is
# being read back. map() also yields "r12", which has no counterpart on the
# left-hand side and is dropped; r12 is named separately as $mask below.
($a0,$a1,$a2,$a12,$a4,$a14)=
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);

$mask="r12";

# Table stores are interleaved with the computation of the next entries,
# and each table load is issued ahead of the XORs that consume it.
$code.=<<___;
.type mul_1x1_ialu,%function
.align 5
mul_1x1_ialu:
	mov $a0,#0
	bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
	str $a0,[sp,#0] @ tab[0]=0
	add $a2,$a1,$a1 @ a2=a1<<1
	str $a1,[sp,#4] @ tab[1]=a1
	eor $a12,$a1,$a2 @ a1^a2
	str $a2,[sp,#8] @ tab[2]=a2
	mov $a4,$a1,lsl#2 @ a4=a1<<2
	str $a12,[sp,#12] @ tab[3]=a1^a2
	eor $a14,$a1,$a4 @ a1^a4
	str $a4,[sp,#16] @ tab[4]=a4
	eor $a0,$a2,$a4 @ a2^a4
	str $a14,[sp,#20] @ tab[5]=a1^a4
	eor $a12,$a12,$a4 @ a1^a2^a4
	str $a0,[sp,#24] @ tab[6]=a2^a4
	and $i0,$mask,$b,lsl#2
	str $a12,[sp,#28] @ tab[7]=a1^a2^a4
	and $i1,$mask,$b,lsr#1
	ldr $lo,[sp,$i0] @ tab[b & 0x7]
	and $i0,$mask,$b,lsr#4
	ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
	and $i1,$mask,$b,lsr#7
	ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
	eor $lo,$lo,$t1,lsl#3 @ stall
	mov $hi,$t1,lsr#29
	ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
	and $i0,$mask,$b,lsr#10
	eor $lo,$lo,$t0,lsl#6
	eor $hi,$hi,$t0,lsr#26
	ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
	and $i1,$mask,$b,lsr#13
	eor $lo,$lo,$t1,lsl#9
	eor $hi,$hi,$t1,lsr#23
	ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
	and $i0,$mask,$b,lsr#16
	eor $lo,$lo,$t0,lsl#12
	eor $hi,$hi,$t0,lsr#20
	ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
	and $i1,$mask,$b,lsr#19
	eor $lo,$lo,$t1,lsl#15
	eor $hi,$hi,$t1,lsr#17
	ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
	and $i0,$mask,$b,lsr#22
	eor $lo,$lo,$t0,lsl#18
	eor $hi,$hi,$t0,lsr#14
	ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
	and $i1,$mask,$b,lsr#25
	eor $lo,$lo,$t1,lsl#21
	eor $hi,$hi,$t1,lsr#11
	ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
	tst $a,#1<<30
	and $i0,$mask,$b,lsr#28
	eor $lo,$lo,$t0,lsl#24
	eor $hi,$hi,$t0,lsr#8
	ldr $t0,[sp,$i0] @ tab[b >> 30 ]
#ifdef __thumb2__
	itt ne
#endif
	eorne $lo,$lo,$b,lsl#30
	eorne $hi,$hi,$b,lsr#2
	tst $a,#1<<31
	eor $lo,$lo,$t1,lsl#27
	eor $hi,$hi,$t1,lsr#5
#ifdef __thumb2__
	itt ne
#endif
	eorne $lo,$lo,$b,lsl#31
	eorne $hi,$hi,$b,lsr#1
	eor $lo,$lo,$t0,lsl#30
	eor $hi,$hi,$t0,lsr#2
	mov pc,lr
.size mul_1x1_ialu,.-mul_1x1_ialu
___
################
# void bn_GF2m_mul_2x2(BN_ULONG *r,
# BN_ULONG a1,BN_ULONG a0,
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
#
# Entry point and integer code path. On entry r0=r, r1=a1, r2=a0, r3=b1 and
# b0 is the fifth argument, on the stack.
#
# When __ARM_MAX_ARCH__>=7 the function first reads OPENSSL_armcap_P
# (located through the PC-relative offset stored at .LOPENSSL_armcap) and
# tests ARMV7_NEON. If the bit is set it undoes its {r10,lr} push -- r10 is
# reloaded and sp is back at its entry value -- and branches to .LNEON.
# Otherwise r4-r9 are pushed as well, so that both preprocessor variants
# end up with r4-r10,lr saved and share one epilogue.
#
# The integer path performs three mul_1x1_ialu calls, Karatsuba style:
#   a1·b1            -> r[2],r[3]
#   a0·b0            -> r[0],r[1]
#   (a1+a0)·(b1+b0)  -> XOR-ed together with the two products above into
#                       r[1] and r[2]
{
$code.=<<___;
.global bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,%function
.align 5
bn_GF2m_mul_2x2:
#if __ARM_MAX_ARCH__>=7
	stmdb sp!,{r10,lr}
	ldr r12,.LOPENSSL_armcap
	adr r10,.LOPENSSL_armcap
	ldr r12,[r12,r10]
#ifdef __APPLE__
	ldr r12,[r12]
#endif
	tst r12,#ARMV7_NEON
	itt ne
	ldrne r10,[sp],#8
	bne .LNEON
	stmdb sp!,{r4-r9}
#else
	stmdb sp!,{r4-r10,lr}
#endif
___
# r0 doubles as $b inside mul_1x1_ialu, so the result pointer is moved to
# r10 before r0 is overwritten with b1.
$ret="r10"; # reassigned 1st argument
# Stack frame: r7 = (sp-36) rounded down to a multiple of 32 becomes the new
# sp and the base of tab[8]; the previous sp is kept in the word right
# after the table, at [tab,#32]. b0 is fetched from [sp,#32], i.e. just
# above the eight saved registers, before sp is moved.
#
# Between the first and second multiplication the three-EOR sequences swap
# $b with r3 and $a with r2; the two EORs before the third call then leave
# a0^a1 in $a and b0^b1 in $b.
$code.=<<___;
	mov $ret,r0 @ reassign 1st argument
	mov $b,r3 @ $b=b1
	sub r7,sp,#36
	mov r8,sp
	and r7,r7,#-32
	ldr r3,[sp,#32] @ load b0
	mov $mask,#7<<2
	mov sp,r7 @ allocate tab[8]
	str r8,[r7,#32]
	bl mul_1x1_ialu @ a1·b1
	str $lo,[$ret,#8]
	str $hi,[$ret,#12]
	eor $b,$b,r3 @ flip b0 and b1
	eor $a,$a,r2 @ flip a0 and a1
	eor r3,r3,$b
	eor r2,r2,$a
	eor $b,$b,r3
	eor $a,$a,r2
	bl mul_1x1_ialu @ a0·b0
	str $lo,[$ret]
	str $hi,[$ret,#4]
	eor $a,$a,r2
	eor $b,$b,r3
	bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
___
# r6..r9 receive r[0..3] as stored so far. They are the registers
# mul_1x1_ialu used for $t0,$t1,$i0,$i1 and are free once it has returned.
@r=map("r$_",(6..9));
# Combine the middle product with the low and high ones, restore sp from
# the slot after tab[8] and return. Before ARMv5 the return address is
# popped into lr and tested for the Thumb bit instead of being popped
# straight into pc; the "bx lr" there is rewritten to a .word by the
# post-processing loop at the end of this script.
$code.=<<___;
	ldmia $ret,{@r[0]-@r[3]}
	eor $lo,$lo,$hi
	ldr sp,[sp,#32] @ destroy tab[8]
	eor $hi,$hi,@r[1]
	eor $lo,$lo,@r[0]
	eor $hi,$hi,@r[2]
	eor $lo,$lo,@r[3]
	eor $hi,$hi,@r[3]
	str $hi,[$ret,#8]
	eor $lo,$lo,$hi
	str $lo,[$ret,#4]
#if __ARM_ARCH__>=5
	ldmia sp!,{r4-r10,pc}
#else
	ldmia sp!,{r4-r10,lr}
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
	bx lr @ interoperable with Thumb ISA:-)
#endif
___
}
# NEON code path of bn_GF2m_mul_2x2, reached through "bne .LNEON" with sp
# back at its value on function entry, so [sp] is the fifth argument (b0).
# Only emitted when __ARM_MAX_ARCH__>=7.
#
# Register names: $r,$t0,$t1,$t2,$t3 are q0,q1,q2,q3,q8 and
# $a,$b,$k48,$k32,$k16 are d26..d30; the extra names produced by the two
# map() calls are unused. The "#lo"/"#hi" suffixes on q registers are not
# assembler syntax: the post-processing loop at the end of this script
# rewrites them to the matching d registers, and rewrites "ret" to "bx lr".
#
# Method, as annotated in the code: A = a1:a0 and B = b1:b0 are packed
# into one d register each. vmull.p8 multiplies A and B against byte-rotated
# (vext.8) copies of each other, giving E..K; these are summed into L, M, N
# and K, folded and masked with $k48/$k32/$k16 (or zero), rotated into place
# with vext.8 #15/#14/#13/#12 and XOR-ed onto D = A*B. The 128-bit result is
# stored to [r0], i.e. r[0..3].
# NOTE(review): presumably this is the algorithm from the paper cited in the
# file header -- confirm against the paper.
{
my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.align 5
.LNEON:
	ldr r12, [sp] @ 5th argument
	vmov $a, r2, r1
	vmov $b, r12, r3
	vmov.i64 $k48, #0x0000ffffffffffff
	vmov.i64 $k32, #0x00000000ffffffff
	vmov.i64 $k16, #0x000000000000ffff
	vext.8 $t0#lo, $a, $a, #1 @ A1
	vmull.p8 $t0, $t0#lo, $b @ F = A1*B
	vext.8 $r#lo, $b, $b, #1 @ B1
	vmull.p8 $r, $a, $r#lo @ E = A*B1
	vext.8 $t1#lo, $a, $a, #2 @ A2
	vmull.p8 $t1, $t1#lo, $b @ H = A2*B
	vext.8 $t3#lo, $b, $b, #2 @ B2
	vmull.p8 $t3, $a, $t3#lo @ G = A*B2
	vext.8 $t2#lo, $a, $a, #3 @ A3
	veor $t0, $t0, $r @ L = E + F
	vmull.p8 $t2, $t2#lo, $b @ J = A3*B
	vext.8 $r#lo, $b, $b, #3 @ B3
	veor $t1, $t1, $t3 @ M = G + H
	vmull.p8 $r, $a, $r#lo @ I = A*B3
	veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
	vand $t0#hi, $t0#hi, $k48
	vext.8 $t3#lo, $b, $b, #4 @ B4
	veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
	vand $t1#hi, $t1#hi, $k32
	vmull.p8 $t3, $a, $t3#lo @ K = A*B4
	veor $t2, $t2, $r @ N = I + J
	veor $t0#lo, $t0#lo, $t0#hi
	veor $t1#lo, $t1#lo, $t1#hi
	veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
	vand $t2#hi, $t2#hi, $k16
	vext.8 $t0, $t0, $t0, #15
	veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
	vmov.i64 $t3#hi, #0
	vext.8 $t1, $t1, $t1, #14
	veor $t2#lo, $t2#lo, $t2#hi
	vmull.p8 $r, $a, $b @ D = A*B
	vext.8 $t3, $t3, $t3, #12
	vext.8 $t2, $t2, $t2, #13
	veor $t0, $t0, $t1
	veor $t2, $t2, $t3
	veor $r, $r, $t0
	veor $r, $r, $t2
	vst1.32 {$r}, [r0]
	ret @ bx lr
#endif
___
}
  281. $code.=<<___;
  282. .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
  283. #if __ARM_MAX_ARCH__>=7
  284. .align 5
  285. .LOPENSSL_armcap:
  286. .word OPENSSL_armcap_P-.
  287. #endif
  288. .asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
  289. .align 5
  290. #if __ARM_MAX_ARCH__>=7
  291. .comm OPENSSL_armcap_P,4,4
  292. #endif
  293. ___
  294. foreach (split("\n",$code)) {
  295. s/\`([^\`]*)\`/eval $1/geo;
  296. s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
  297. s/\bret\b/bx lr/go or
  298. s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
  299. print $_,"\n";
  300. }
  301. close STDOUT; # enforce flush