#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of a low-hanging mechanical port from
# C for the time being... Except that it has two code paths: pure
# integer code suitable for any ARMv4 and later CPU and NEON code
# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
# in ~45 cycles on a dual-issue core such as Cortex A8, which is ~50%
# faster than compiler-generated code. For ECDH and ECDSA verify (but
# not for ECDSA sign) it means 25%-45% improvement depending on key
# length, more for longer keys. Even though NEON 1x1 multiplication
# runs in even fewer cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...
#
# April 2014
#
# Double bn_GF2m_mul_2x2 performance by using the algorithm from the
# paper referenced below, which improves ECDH and ECDSA verify
# benchmarks by 18-40%.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf

  # Command line handling and output redirection.
#
# Usage: armv4-gf2m.pl [flavour] [...] [output]
#
# An argument counts as the output file name when it looks like
# "name.ext". If the very first argument already looks like that, there
# is no flavour; otherwise the first argument is the flavour and the
# remaining arguments are scanned until a file name is found.
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # A real flavour was given: pipe everything printed below through
    # arm-xlate.pl, looked up next to this script first and then in
    # ../../perlasm/ relative to it.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # A failed fork/exec used to go unnoticed and produced no output.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif ($output) {
    # No flavour (or "void"): write straight to the output file, and
    # report an unwritable file instead of silently dropping the text.
    open STDOUT,">$output"
        or die "can't open $output: $!";
}
# With neither flavour nor output name STDOUT is left untouched, so the
# generated text goes to the caller's standard output.

# Assembler preamble: pick the instruction set. Thumb-2 builds need
# unified syntax; everything else is assembled as 32-bit ARM code.
$code=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code   32
#endif
___
  ################
# private interface to mul_1x1_ialu
#
# Emits a file-local subroutine (no .global) that multiplies two 32-bit
# polynomials over GF(2) and yields the 64-bit carry-less product.
#
# In:    $a    (r1)   multiplicand
#        $b    (r0)   multiplier
#        $mask (r12)  must hold 7<<2 (byte-offset mask for a 3-bit index)
#        sp           base of an 8-word scratch table tab[0..7]
# Out:   $hi:$lo (r4:r5)
# Clobbers r4-r9 and the condition flags. r0, r1, r12 and lr are only
# read, so the caller can issue several calls back to back.
#
# Method:
#   1. a1 = a with its two top bits cleared, so that a1<<2 still fits in
#      32 bits. tab[i] is the XOR of a1, a1<<1 and a1<<2 selected by the
#      three bits of i, i.e. the GF(2) product of a1 and i.
#   2. b is consumed three bits at a time (shifts 0,3,...,27, plus the
#      final two bits at shift 30). Each window indexes tab[] and the
#      entry is XORed into $hi:$lo at the matching shift. The index is
#      formed directly as a byte offset: (b >> (k-2)) & (7<<2).
#   3. The two bits removed from a are folded back in by conditionally
#      XORing b<<30 (bit 30 of a) and b<<31 (bit 31 of a) into $hi:$lo.
#
# Table stores, index computation and loads are interleaved by hand;
# the instruction order below is deliberate and must not be rearranged.
#
$a="r1";
$b="r0";

# The same six registers serve first as table-building temporaries
# ($a0..$a14) and then as accumulator/lookup registers ($hi..$i1).
($a0,$a1,$a2,$a12,$a4,$a14)=
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9));

$mask="r12";

$code.=<<___;
.type   mul_1x1_ialu,%function
.align  5
mul_1x1_ialu:
    mov     $a0,#0
    bic     $a1,$a,#3<<30           @ a1=a&0x3fffffff
    str     $a0,[sp,#0]             @ tab[0]=0
    add     $a2,$a1,$a1             @ a2=a1<<1
    str     $a1,[sp,#4]             @ tab[1]=a1
    eor     $a12,$a1,$a2            @ a1^a2
    str     $a2,[sp,#8]             @ tab[2]=a2
    mov     $a4,$a1,lsl#2           @ a4=a1<<2
    str     $a12,[sp,#12]           @ tab[3]=a1^a2
    eor     $a14,$a1,$a4            @ a1^a4
    str     $a4,[sp,#16]            @ tab[4]=a4
    eor     $a0,$a2,$a4             @ a2^a4
    str     $a14,[sp,#20]           @ tab[5]=a1^a4
    eor     $a12,$a12,$a4           @ a1^a2^a4
    str     $a0,[sp,#24]            @ tab[6]=a2^a4
    and     $i0,$mask,$b,lsl#2
    str     $a12,[sp,#28]           @ tab[7]=a1^a2^a4

    and     $i1,$mask,$b,lsr#1
    ldr     $lo,[sp,$i0]            @ tab[b       & 0x7]
    and     $i0,$mask,$b,lsr#4
    ldr     $t1,[sp,$i1]            @ tab[b >>  3 & 0x7]
    and     $i1,$mask,$b,lsr#7
    ldr     $t0,[sp,$i0]            @ tab[b >>  6 & 0x7]
    eor     $lo,$lo,$t1,lsl#3       @ stall
    mov     $hi,$t1,lsr#29
    ldr     $t1,[sp,$i1]            @ tab[b >>  9 & 0x7]

    and     $i0,$mask,$b,lsr#10
    eor     $lo,$lo,$t0,lsl#6
    eor     $hi,$hi,$t0,lsr#26
    ldr     $t0,[sp,$i0]            @ tab[b >> 12 & 0x7]

    and     $i1,$mask,$b,lsr#13
    eor     $lo,$lo,$t1,lsl#9
    eor     $hi,$hi,$t1,lsr#23
    ldr     $t1,[sp,$i1]            @ tab[b >> 15 & 0x7]

    and     $i0,$mask,$b,lsr#16
    eor     $lo,$lo,$t0,lsl#12
    eor     $hi,$hi,$t0,lsr#20
    ldr     $t0,[sp,$i0]            @ tab[b >> 18 & 0x7]

    and     $i1,$mask,$b,lsr#19
    eor     $lo,$lo,$t1,lsl#15
    eor     $hi,$hi,$t1,lsr#17
    ldr     $t1,[sp,$i1]            @ tab[b >> 21 & 0x7]

    and     $i0,$mask,$b,lsr#22
    eor     $lo,$lo,$t0,lsl#18
    eor     $hi,$hi,$t0,lsr#14
    ldr     $t0,[sp,$i0]            @ tab[b >> 24 & 0x7]

    and     $i1,$mask,$b,lsr#25
    eor     $lo,$lo,$t1,lsl#21
    eor     $hi,$hi,$t1,lsr#11
    ldr     $t1,[sp,$i1]            @ tab[b >> 27 & 0x7]

    tst     $a,#1<<30
    and     $i0,$mask,$b,lsr#28
    eor     $lo,$lo,$t0,lsl#24
    eor     $hi,$hi,$t0,lsr#8
    ldr     $t0,[sp,$i0]            @ tab[b >> 30      ]

#ifdef  __thumb2__
    itt     ne
#endif
    eorne   $lo,$lo,$b,lsl#30
    eorne   $hi,$hi,$b,lsr#2
    tst     $a,#1<<31
    eor     $lo,$lo,$t1,lsl#27
    eor     $hi,$hi,$t1,lsr#5
#ifdef  __thumb2__
    itt     ne
#endif
    eorne   $lo,$lo,$b,lsl#31
    eorne   $hi,$hi,$b,lsr#1
    eor     $lo,$lo,$t0,lsl#30
    eor     $hi,$hi,$t0,lsr#2

    mov     pc,lr
.size   mul_1x1_ialu,.-mul_1x1_ialu
___
  ################
# void	bn_GF2m_mul_2x2(BN_ULONG *r,
#	BN_ULONG a1,BN_ULONG a0,
#	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0·b1b0
#
# Entry point and integer (non-NEON) code path.
#
# Arguments on entry: r0=r, r1=a1, r2=a0, r3=b1, fifth argument b0 on
# the stack.
#
# Dispatch (only when __ARM_MAX_ARCH__>=7): the capability word is
# fetched PC-relatively through .LOPENSSL_armcap (with one extra
# dereference on __APPLE__). If the ARMV7_NEON bit is set, r10 is
# restored, the 8-byte {r10,lr} frame is popped and control goes to
# .LNEON with the stack exactly as the caller left it.
#
# Integer path: r4-r10 and lr are saved (32 bytes in total, whichever
# prologue variant was taken), which is why b0 is then found at
# [sp,#32]. A 32-byte aligned, 8-word table for mul_1x1_ialu is carved
# out below the saved registers and the previous sp is kept in the word
# right after it ([sp,#32] once sp points at the table).
#
# The product is assembled Karatsuba style from three 1x1 products:
#   r[3..2] = a1·b1
#   r[1..0] = a0·b0
#   r[2..1] ^= (a1+a0)·(b1+b0) ^ a1·b1 ^ a0·b0
# Between the first two calls the (a1,a0) and (b1,b0) pairs are swapped
# with the three-XOR trick. One more XOR per operand afterwards leaves
# a1^a0 and b1^b0 in the argument registers for the third call.
{
$code.=<<___;
.global bn_GF2m_mul_2x2
.type   bn_GF2m_mul_2x2,%function
.align  5
bn_GF2m_mul_2x2:
#if __ARM_MAX_ARCH__>=7
    stmdb   sp!,{r10,lr}
    ldr     r12,.LOPENSSL_armcap
    adr     r10,.LOPENSSL_armcap
    ldr     r12,[r12,r10]
#ifdef  __APPLE__
    ldr     r12,[r12]
#endif
    tst     r12,#ARMV7_NEON
    itt     ne
    ldrne   r10,[sp],#8
    bne     .LNEON
    stmdb   sp!,{r4-r9}
#else
    stmdb   sp!,{r4-r10,lr}
#endif
___
# r0 is needed as mul_1x1_ialu's $b operand, so the result pointer is
# parked in r10 for the rest of the function.
$ret="r10";	# reassigned 1st argument
$code.=<<___;
    mov     $ret,r0                 @ reassign 1st argument
    mov     $b,r3                   @ $b=b1
    sub     r7,sp,#36
    mov     r8,sp
    and     r7,r7,#-32
    ldr     r3,[sp,#32]             @ load b0
    mov     $mask,#7<<2
    mov     sp,r7                   @ allocate tab[8]
    str     r8,[r7,#32]

    bl      mul_1x1_ialu            @ a1·b1
    str     $lo,[$ret,#8]
    str     $hi,[$ret,#12]

    eor     $b,$b,r3                @ flip b0 and b1
     eor    $a,$a,r2                @ flip a0 and a1
    eor     r3,r3,$b
     eor    r2,r2,$a
    eor     $b,$b,r3
     eor    $a,$a,r2
    bl      mul_1x1_ialu            @ a0·b0
    str     $lo,[$ret]
    str     $hi,[$ret,#4]

    eor     $a,$a,r2
    eor     $b,$b,r3
    bl      mul_1x1_ialu            @ (a1+a0)·(b1+b0)
___
# r6-r9 receive r[0..3] (the two partial products stored above) so the
# middle term can be folded into r[1] and r[2].
@r=map("r$_",(6..9));
$code.=<<___;
    ldmia   $ret,{@r[0]-@r[3]}
    eor     $lo,$lo,$hi
    ldr     sp,[sp,#32]             @ destroy tab[8]
    eor     $hi,$hi,@r[1]
    eor     $lo,$lo,@r[0]
    eor     $hi,$hi,@r[2]
    eor     $lo,$lo,@r[3]
    eor     $hi,$hi,@r[3]
    str     $hi,[$ret,#8]
    eor     $lo,$lo,$hi
    str     $lo,[$ret,#4]

#if __ARM_ARCH__>=5
    ldmia   sp!,{r4-r10,pc}
#else
    ldmia   sp!,{r4-r10,lr}
    tst     lr,#1
    moveq   pc,lr                   @ be binary compatible with V4, yet
    bx      lr                      @ interoperable with Thumb ISA:-)
#endif
___
}
  {
# NEON code path of bn_GF2m_mul_2x2, entered at .LNEON.
#
# State on entry (see the dispatch at the top of bn_GF2m_mul_2x2): the
# stack is as the caller left it, so [sp] is the fifth argument b0; lr
# still holds the caller's return address; r0=r, r1=a1, r2=a0, r3=b1.
#
# Register allocation. Both maps produce more names than are assigned;
# the surplus is discarded by the list assignment:
#   $r=q0  $t0=q1  $t1=q2  $t2=q3  $t3=q8
#   $a=d26 $b=d27  $k48=d28 $k32=d29 $k16=d30
# q4-q7 are skipped, presumably because the ARM procedure call standard
# makes d8-d15 callee-saved - TODO confirm against AAPCS.
# The "my" declarations shadow the integer path's $a/$b/$t0/$t1 only
# inside this block.
#
# The 64x64-bit carry-less product is assembled from 8x8-bit vmull.p8
# products of byte-rotated copies of the operands (the E..K, L..N and
# P0..P7 names in the comments below are the source's own and follow
# the paper cited in the file header). The partial sums are masked with
# $k48/$k32/$k16, rotated into place with vext.8 and XORed into
# D = A*B. The 128-bit result is stored to r[0..3].
#
# "q<n>#lo" / "q<n>#hi" and the closing pseudo-return are not real
# assembler syntax; the post-processing loop at the end of this file
# rewrites them.
my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch   armv7-a
.fpu    neon

.align  5
.LNEON:
    ldr         r12, [sp]                   @ 5th argument
    vmov        $a, r2, r1
    vmov        $b, r12, r3
    vmov.i64    $k48, #0x0000ffffffffffff
    vmov.i64    $k32, #0x00000000ffffffff
    vmov.i64    $k16, #0x000000000000ffff

    vext.8      $t0#lo, $a, $a, #1          @ A1
    vmull.p8    $t0, $t0#lo, $b             @ F = A1*B
    vext.8      $r#lo, $b, $b, #1           @ B1
    vmull.p8    $r, $a, $r#lo               @ E = A*B1
    vext.8      $t1#lo, $a, $a, #2          @ A2
    vmull.p8    $t1, $t1#lo, $b             @ H = A2*B
    vext.8      $t3#lo, $b, $b, #2          @ B2
    vmull.p8    $t3, $a, $t3#lo             @ G = A*B2
    vext.8      $t2#lo, $a, $a, #3          @ A3
    veor        $t0, $t0, $r                @ L = E + F
    vmull.p8    $t2, $t2#lo, $b             @ J = A3*B
    vext.8      $r#lo, $b, $b, #3           @ B3
    veor        $t1, $t1, $t3               @ M = G + H
    vmull.p8    $r, $a, $r#lo               @ I = A*B3
    veor        $t0#lo, $t0#lo, $t0#hi      @ t0 = (L) (P0 + P1) << 8
    vand        $t0#hi, $t0#hi, $k48
    vext.8      $t3#lo, $b, $b, #4          @ B4
    veor        $t1#lo, $t1#lo, $t1#hi      @ t1 = (M) (P2 + P3) << 16
    vand        $t1#hi, $t1#hi, $k32
    vmull.p8    $t3, $a, $t3#lo             @ K = A*B4
    veor        $t2, $t2, $r                @ N = I + J
    veor        $t0#lo, $t0#lo, $t0#hi
    veor        $t1#lo, $t1#lo, $t1#hi
    veor        $t2#lo, $t2#lo, $t2#hi      @ t2 = (N) (P4 + P5) << 24
    vand        $t2#hi, $t2#hi, $k16
    vext.8      $t0, $t0, $t0, #15
    veor        $t3#lo, $t3#lo, $t3#hi      @ t3 = (K) (P6 + P7) << 32
    vmov.i64    $t3#hi, #0
    vext.8      $t1, $t1, $t1, #14
    veor        $t2#lo, $t2#lo, $t2#hi
    vmull.p8    $r, $a, $b                  @ D = A*B
    vext.8      $t3, $t3, $t3, #12
    vext.8      $t2, $t2, $t2, #13
    veor        $t0, $t0, $t1
    veor        $t2, $t2, $t3
    veor        $r, $r, $t0
    veor        $r, $r, $t2

    vst1.32     {$r}, [r0]
    ret         @ bx lr
#endif
___
}
  $code.=<<___;
.size   bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_MAX_ARCH__>=7
.align  5
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-.
#endif
.asciz  "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  5

#if __ARM_MAX_ARCH__>=7
.comm   OPENSSL_armcap_P,4,4
#endif
___

# Post-process the accumulated text line by line and print it.
#
#  - `expr` spans are replaced by the result of evaluating expr as Perl.
#  - The remaining three substitutions are chained with "or", so at most
#    one of them fires per line (the first that matches):
#      q<n>#lo / q<n>#hi -> d<2n> / d<2n+1>, the D-register halves of a
#                           NEON Q register;
#      the pseudo-instruction "ret" -> "bx lr" (kept symbolic, because
#                           the chain stops here for that line);
#      a literal "bx lr"  -> .word 0xe12fff1e, the raw ARM encoding, so
#                           the file still assembles with -march=armv4.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/\bret\b/bx lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4

	print $_,"\n";
}

# Closing flushes the output. When STDOUT is a pipe, close also waits for
# the translator and returns false if it exited non-zero ($! is 0 then and
# the status is in $?). Either failure used to pass silently and could
# leave a truncated or empty output file behind a "successful" run.
close STDOUT
    or die "error closing STDOUT: " . ($! ? $! : "child exit status $?");