armv4-gf2m.pl

#! /usr/bin/env perl
# Copyright 2011-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. For the time being it's a fairly mechanical port
# from C... except that it has two code paths: pure integer code
# suitable for any ARMv4 and later CPU, and NEON code suitable for
# ARMv7. The pure integer 1x1 multiplication subroutine runs in ~45
# cycles on a dual-issue core such as the Cortex-A8, which is ~50%
# faster than compiler-generated code. For ECDH and ECDSA verify (but
# not for ECDSA sign) this means a 25%-45% improvement depending on
# key length, more for longer keys. Even though the NEON 1x1
# multiplication runs in even fewer cycles, ~30, the improvement is
# measurable only on longer keys. One has to optimize code elsewhere
# to get the full NEON glow...
#
# April 2014
#
# Double bn_GF2m_mul_2x2 performance by using the algorithm from the
# paper referred to below, which improves the ECDH and ECDSA verify
# benchmarks by 18-40%.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}
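
# Typical invocation (hypothetical example; the flavour must be one the
# local arm-xlate.pl understands, e.g. "linux32"):
#
#	perl armv4-gf2m.pl linux32 armv4-gf2m.S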
$code=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text
___
################
# private interface to mul_1x1_ialu
#
$a="r1";
$b="r0";

($a0,$a1,$a2,$a12,$a4,$a14)=
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);

$mask="r12";

$code.=<<___;
.type	mul_1x1_ialu,%function
.align	5
mul_1x1_ialu:
	mov	$a0,#0
	bic	$a1,$a,#3<<30		@ a1=a&0x3fffffff
	str	$a0,[sp,#0]		@ tab[0]=0
	add	$a2,$a1,$a1		@ a2=a1<<1
	str	$a1,[sp,#4]		@ tab[1]=a1
	eor	$a12,$a1,$a2		@ a1^a2
	str	$a2,[sp,#8]		@ tab[2]=a2
	mov	$a4,$a1,lsl#2		@ a4=a1<<2
	str	$a12,[sp,#12]		@ tab[3]=a1^a2
	eor	$a14,$a1,$a4		@ a1^a4
	str	$a4,[sp,#16]		@ tab[4]=a4
	eor	$a0,$a2,$a4		@ a2^a4
	str	$a14,[sp,#20]		@ tab[5]=a1^a4
	eor	$a12,$a12,$a4		@ a1^a2^a4
	str	$a0,[sp,#24]		@ tab[6]=a2^a4
	and	$i0,$mask,$b,lsl#2
	str	$a12,[sp,#28]		@ tab[7]=a1^a2^a4

	and	$i1,$mask,$b,lsr#1
	ldr	$lo,[sp,$i0]		@ tab[b       & 0x7]
	and	$i0,$mask,$b,lsr#4
	ldr	$t1,[sp,$i1]		@ tab[b >>  3 & 0x7]
	and	$i1,$mask,$b,lsr#7
	ldr	$t0,[sp,$i0]		@ tab[b >>  6 & 0x7]
	eor	$lo,$lo,$t1,lsl#3	@ stall
	mov	$hi,$t1,lsr#29
	ldr	$t1,[sp,$i1]		@ tab[b >>  9 & 0x7]

	and	$i0,$mask,$b,lsr#10
	eor	$lo,$lo,$t0,lsl#6
	eor	$hi,$hi,$t0,lsr#26
	ldr	$t0,[sp,$i0]		@ tab[b >> 12 & 0x7]

	and	$i1,$mask,$b,lsr#13
	eor	$lo,$lo,$t1,lsl#9
	eor	$hi,$hi,$t1,lsr#23
	ldr	$t1,[sp,$i1]		@ tab[b >> 15 & 0x7]

	and	$i0,$mask,$b,lsr#16
	eor	$lo,$lo,$t0,lsl#12
	eor	$hi,$hi,$t0,lsr#20
	ldr	$t0,[sp,$i0]		@ tab[b >> 18 & 0x7]

	and	$i1,$mask,$b,lsr#19
	eor	$lo,$lo,$t1,lsl#15
	eor	$hi,$hi,$t1,lsr#17
	ldr	$t1,[sp,$i1]		@ tab[b >> 21 & 0x7]

	and	$i0,$mask,$b,lsr#22
	eor	$lo,$lo,$t0,lsl#18
	eor	$hi,$hi,$t0,lsr#14
	ldr	$t0,[sp,$i0]		@ tab[b >> 24 & 0x7]

	and	$i1,$mask,$b,lsr#25
	eor	$lo,$lo,$t1,lsl#21
	eor	$hi,$hi,$t1,lsr#11
	ldr	$t1,[sp,$i1]		@ tab[b >> 27 & 0x7]

	tst	$a,#1<<30
	and	$i0,$mask,$b,lsr#28
	eor	$lo,$lo,$t0,lsl#24
	eor	$hi,$hi,$t0,lsr#8
	ldr	$t0,[sp,$i0]		@ tab[b >> 30     ]

#ifdef	__thumb2__
	itt	ne
#endif
	eorne	$lo,$lo,$b,lsl#30
	eorne	$hi,$hi,$b,lsr#2
	tst	$a,#1<<31
	eor	$lo,$lo,$t1,lsl#27
	eor	$hi,$hi,$t1,lsr#5
#ifdef	__thumb2__
	itt	ne
#endif
	eorne	$lo,$lo,$b,lsl#31
	eorne	$hi,$hi,$b,lsr#1
	eor	$lo,$lo,$t0,lsl#30
	eor	$hi,$hi,$t0,lsr#2

	mov	pc,lr
.size	mul_1x1_ialu,.-mul_1x1_ialu
___
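
################
# A reference model of mul_1x1_ialu above (hypothetical sketch, never
# called by this generator): a's low 30 bits drive an 8-entry table
# indexed by 3 bits of b per step, and a's two top bits, masked off by
# the bic above, are folded back in conditionally at the end.  Assumes
# a Perl built with 64-bit integers.
sub mul_1x1_ref {
    my ($aa, $bb) = @_;			# 32-bit polynomials over GF(2)
    my $a1 = $aa & 0x3fffffff;		# bic $a1,$a,#3<<30
    my @tab = map { (($_ & 1) ? $a1      : 0) ^
		    (($_ & 2) ? $a1 << 1 : 0) ^
		    (($_ & 4) ? $a1 << 2 : 0) } 0 .. 7;
    my $prod = 0;
    for (my $i = 0; $i < 32; $i += 3) {	# 3 bits of b per step, 2 in the last
	$prod ^= $tab[($bb >> $i) & 7] << $i;
    }
    $prod ^= $bb << 30 if $aa & (1 << 30);	# fold a's bit 30 back in
    $prod ^= $bb << 31 if $aa & (1 << 31);	# fold a's bit 31 back in
    return ($prod & 0xffffffff, ($prod >> 32) & 0xffffffff);	# (lo,hi)
}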
################
# void	bn_GF2m_mul_2x2(BN_ULONG *r,
#	BN_ULONG a1,BN_ULONG a0,
#	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0·b1b0
{
$code.=<<___;
.global	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,%function
.align	5
bn_GF2m_mul_2x2:
#if __ARM_MAX_ARCH__>=7
	stmdb	sp!,{r10,lr}
	ldr	r12,.LOPENSSL_armcap
# if !defined(_WIN32)
	adr	r10,.LOPENSSL_armcap
	ldr	r12,[r12,r10]
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
	tst	r12,#ARMV7_NEON
	itt	ne
	ldrne	r10,[sp],#8
	bne	.LNEON
	stmdb	sp!,{r4-r9}
#else
	stmdb	sp!,{r4-r10,lr}
#endif
___
  178. $ret="r10"; # reassigned 1st argument
  179. $code.=<<___;
  180. mov $ret,r0 @ reassign 1st argument
  181. mov $b,r3 @ $b=b1
  182. sub r7,sp,#36
  183. mov r8,sp
  184. and r7,r7,#-32
  185. ldr r3,[sp,#32] @ load b0
  186. mov $mask,#7<<2
  187. mov sp,r7 @ allocate tab[8]
  188. str r8,[r7,#32]
  189. bl mul_1x1_ialu @ a1·b1
  190. str $lo,[$ret,#8]
  191. str $hi,[$ret,#12]
  192. eor $b,$b,r3 @ flip b0 and b1
  193. eor $a,$a,r2 @ flip a0 and a1
  194. eor r3,r3,$b
  195. eor r2,r2,$a
  196. eor $b,$b,r3
  197. eor $a,$a,r2
  198. bl mul_1x1_ialu @ a0·b0
  199. str $lo,[$ret]
  200. str $hi,[$ret,#4]
  201. eor $a,$a,r2
  202. eor $b,$b,r3
  203. bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
  204. ___
@r=map("r$_",(6..9));
$code.=<<___;
	ldmia	$ret,{@r[0]-@r[3]}
	eor	$lo,$lo,$hi
	ldr	sp,[sp,#32]		@ destroy tab[8]
	eor	$hi,$hi,@r[1]
	eor	$lo,$lo,@r[0]
	eor	$hi,$hi,@r[2]
	eor	$lo,$lo,@r[3]
	eor	$hi,$hi,@r[3]
	str	$hi,[$ret,#8]
	eor	$lo,$lo,$hi
	str	$lo,[$ret,#4]

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r10,pc}
#else
	ldmia	sp!,{r4-r10,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
___
}
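
################
# A reference model of the Karatsuba recombination above (hypothetical
# sketch reusing mul_1x1_ref from further up, never called by this
# generator): three 1x1 products instead of four, with the middle term
# recovered by XOR, since addition and subtraction in GF(2)[x] are both
# just XOR.
sub gf2m_mul_2x2_ref {
    my ($a1, $a0, $b1, $b0) = @_;
    my ($hl, $hh) = mul_1x1_ref($a1, $b1);		# a1·b1
    my ($ll, $lh) = mul_1x1_ref($a0, $b0);		# a0·b0
    my ($ml, $mh) = mul_1x1_ref($a1 ^ $a0, $b1 ^ $b0);	# (a1+a0)·(b1+b0)
    $ml ^= $hl ^ $ll;			# XOR the outer products back out
    $mh ^= $hh ^ $lh;			# to recover the middle 64-bit term
    return ($ll, $lh ^ $ml, $hl ^ $mh, $hh);	# r[0..3], least significant first
}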
{
my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.align	5
.LNEON:
	ldr		r12, [sp]		@ 5th argument
	vmov		$a, r2, r1
	vmov		$b, r12, r3
	vmov.i64	$k48, #0x0000ffffffffffff
	vmov.i64	$k32, #0x00000000ffffffff
	vmov.i64	$k16, #0x000000000000ffff

	vext.8		$t0#lo, $a, $a, #1	@ A1
	vmull.p8	$t0, $t0#lo, $b		@ F = A1*B
	vext.8		$r#lo, $b, $b, #1	@ B1
	vmull.p8	$r, $a, $r#lo		@ E = A*B1
	vext.8		$t1#lo, $a, $a, #2	@ A2
	vmull.p8	$t1, $t1#lo, $b		@ H = A2*B
	vext.8		$t3#lo, $b, $b, #2	@ B2
	vmull.p8	$t3, $a, $t3#lo		@ G = A*B2
	vext.8		$t2#lo, $a, $a, #3	@ A3
	veor		$t0, $t0, $r		@ L = E + F
	vmull.p8	$t2, $t2#lo, $b		@ J = A3*B
	vext.8		$r#lo, $b, $b, #3	@ B3
	veor		$t1, $t1, $t3		@ M = G + H
	vmull.p8	$r, $a, $r#lo		@ I = A*B3
	veor		$t0#lo, $t0#lo, $t0#hi	@ t0 = (L) (P0 + P1) << 8
	vand		$t0#hi, $t0#hi, $k48
	vext.8		$t3#lo, $b, $b, #4	@ B4
	veor		$t1#lo, $t1#lo, $t1#hi	@ t1 = (M) (P2 + P3) << 16
	vand		$t1#hi, $t1#hi, $k32
	vmull.p8	$t3, $a, $t3#lo		@ K = A*B4
	veor		$t2, $t2, $r		@ N = I + J
	veor		$t0#lo, $t0#lo, $t0#hi
	veor		$t1#lo, $t1#lo, $t1#hi
	veor		$t2#lo, $t2#lo, $t2#hi	@ t2 = (N) (P4 + P5) << 24
	vand		$t2#hi, $t2#hi, $k16
	vext.8		$t0, $t0, $t0, #15
	veor		$t3#lo, $t3#lo, $t3#hi	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	$t3#hi, #0
	vext.8		$t1, $t1, $t1, #14
	veor		$t2#lo, $t2#lo, $t2#hi
	vmull.p8	$r, $a, $b		@ D = A*B
	vext.8		$t3, $t3, $t3, #12
	vext.8		$t2, $t2, $t2, #13
	veor		$t0, $t0, $t1
	veor		$t2, $t2, $t3
	veor		$r, $r, $t0
	veor		$r, $r, $t2

	vst1.32		{$r}, [r0]
	ret		@ bx lr
#endif
___
}
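
################
# A byte-wise model of what the NEON path computes (hypothetical sketch,
# never called by this generator; this is the plain schoolbook product,
# not the lane-parallel scheduling from the Câmara et al. paper): a
# 64x64->128-bit carry-less multiplication assembled from the 8x8-bit
# polynomial multiplications that vmull.p8 provides.  Assumes a Perl
# built with 64-bit integers.
sub clmul8_ref {			# one vmull.p8 lane: 8x8 -> up to 15 bits
    my ($x, $y) = @_;
    my $p = 0;
    $p ^= $x << $_ for grep { ($y >> $_) & 1 } 0 .. 7;
    return $p;
}
sub clmul64_ref {			# returns (lo64, hi64)
    my ($x, $y) = @_;
    my ($lo, $hi) = (0, 0);
    for my $i (0 .. 7) {
	for my $j (0 .. 7) {
	    my $p = clmul8_ref(($x >> 8*$i) & 0xff, ($y >> 8*$j) & 0xff);
	    my $s = 8 * ($i + $j);	# bit offset of this partial product
	    if ($s < 64) {
		$lo ^= ($p << $s) & 0xffffffffffffffff;
		$hi ^= $p >> (64 - $s) if $s > 48;	# bits that spill over
	    } else {
		$hi ^= $p << ($s - 64);
	    }
	}
    }
    return ($lo, $hi);
}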
$code.=<<___;
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.
# endif
#endif
.asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	5

#if __ARM_MAX_ARCH__>=7
.extern	OPENSSL_armcap_P
#endif
___
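
# The loop below post-processes the generated assembly: the backtick
# substitution evaluates any embedded Perl expressions (none in this
# file); the or-chain then either maps the synthetic qN#lo/qN#hi
# notation onto architectural D registers (q1#lo -> d2, q1#hi -> d3,
# and so on), lowers "ret" to "bx lr", or rewrites a pre-existing
# "bx lr" as its raw encoding .word 0xe12fff1e, so the pure-integer
# path still assembles with -march=armv4, where BX is unavailable.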
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush