#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Poly1305 hash for MIPS64.
#
# May 2016
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc
# R1x000	5.64/+120%	(big-endian)
# Octeon II	3.80/+280%	(little-endian)

######################################################################
  24. # There is a number of MIPS ABI in use, O32 and N32/64 are most
  25. # widely used. Then there is a new contender: NUBI. It appears that if
  26. # one picks the latter, it's possible to arrange code in ABI neutral
  27. # manner. Therefore let's stick to NUBI register layout:
  28. #
  29. ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
  30. ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
  31. ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
  32. ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
  33. #
  34. # The return value is placed in $a0. Following coding rules facilitate
  35. # interoperability:
  36. #
  37. # - never ever touch $tp, "thread pointer", former $gp [o32 can be
  38. # excluded from the rule, because it's specified volatile];
  39. # - copy return value to $t0, former $v0 [or to $a0 if you're adapting
  40. # old code];
  41. # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
  42. #
  43. # For reference here is register layout for N32/64 MIPS ABIs:
  44. #
  45. # ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
  46. # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
  47. # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
  48. # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
  49. # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
  50. #
  51. # <appro@openssl.org>
  52. #
  53. ######################################################################
  54. # $output is the last argument if it looks like a file (it has an extension)
  55. # $flavour is the first argument if it doesn't look like a file
  56. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  57. # supported flavours are o32,n32,64,nubi32,nubi64, default is o32
  58. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : "o32";
  59. die "MIPS64 only" unless ($flavour =~ /64|n32/i);
  60. $v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
  61. $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
  62. ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
  63. ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
  64. $code.=<<___;
  65. #include "mips_arch.h"
  66. #ifdef MIPSEB
  67. # define MSB 0
  68. # define LSB 7
  69. #else
  70. # define MSB 7
  71. # define LSB 0
  72. #endif
  73. .text
  74. .set noat
  75. .set noreorder
  76. .align 5
  77. .globl poly1305_init
  78. .ent poly1305_init
  79. poly1305_init:
  80. .frame $sp,0,$ra
  81. .set reorder
  82. sd $zero,0($ctx)
  83. sd $zero,8($ctx)
  84. sd $zero,16($ctx)
  85. beqz $inp,.Lno_key
  86. #if defined(_MIPS_ARCH_MIPS64R6)
  87. ld $in0,0($inp)
  88. ld $in1,8($inp)
  89. #else
  90. ldl $in0,0+MSB($inp)
  91. ldl $in1,8+MSB($inp)
  92. ldr $in0,0+LSB($inp)
  93. ldr $in1,8+LSB($inp)
  94. #endif
  95. #ifdef MIPSEB
  96. # if defined(_MIPS_ARCH_MIPS64R2)
  97. dsbh $in0,$in0 # byte swap
  98. dsbh $in1,$in1
  99. dshd $in0,$in0
  100. dshd $in1,$in1
  101. # else
  102. ori $tmp0,$zero,0xFF
  103. dsll $tmp2,$tmp0,32
  104. or $tmp0,$tmp2 # 0x000000FF000000FF
  105. and $tmp1,$in0,$tmp0 # byte swap
  106. and $tmp3,$in1,$tmp0
  107. dsrl $tmp2,$in0,24
  108. dsrl $tmp4,$in1,24
  109. dsll $tmp1,24
  110. dsll $tmp3,24
  111. and $tmp2,$tmp0
  112. and $tmp4,$tmp0
  113. dsll $tmp0,8 # 0x0000FF000000FF00
  114. or $tmp1,$tmp2
  115. or $tmp3,$tmp4
  116. and $tmp2,$in0,$tmp0
  117. and $tmp4,$in1,$tmp0
  118. dsrl $in0,8
  119. dsrl $in1,8
  120. dsll $tmp2,8
  121. dsll $tmp4,8
  122. and $in0,$tmp0
  123. and $in1,$tmp0
  124. or $tmp1,$tmp2
  125. or $tmp3,$tmp4
  126. or $in0,$tmp1
  127. or $in1,$tmp3
  128. dsrl $tmp1,$in0,32
  129. dsrl $tmp3,$in1,32
  130. dsll $in0,32
  131. dsll $in1,32
  132. or $in0,$tmp1
  133. or $in1,$tmp3
  134. # endif
  135. #endif
  136. li $tmp0,1
  137. dsll $tmp0,32
  138. daddiu $tmp0,-63
  139. dsll $tmp0,28
  140. daddiu $tmp0,-1 # 0ffffffc0fffffff
  141. and $in0,$tmp0
  142. daddiu $tmp0,-3 # 0ffffffc0ffffffc
  143. and $in1,$tmp0
  144. sd $in0,24($ctx)
  145. dsrl $tmp0,$in1,2
  146. sd $in1,32($ctx)
  147. daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
  148. sd $tmp0,40($ctx)
  149. .Lno_key:
  150. li $v0,0 # return 0
  151. jr $ra
  152. .end poly1305_init
  153. ___
  154. {
  155. my ($h0,$h1,$h2,$r0,$r1,$s1,$d0,$d1,$d2) =
  156. ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
  157. $code.=<<___;
  158. .align 5
  159. .globl poly1305_blocks
  160. .ent poly1305_blocks
  161. poly1305_blocks:
  162. .set noreorder
  163. dsrl $len,4 # number of complete blocks
  164. bnez $len,poly1305_blocks_internal
  165. nop
  166. jr $ra
  167. nop
  168. .end poly1305_blocks
  169. .align 5
  170. .ent poly1305_blocks_internal
  171. poly1305_blocks_internal:
  172. .frame $sp,6*8,$ra
  173. .mask $SAVED_REGS_MASK,-8
  174. .set noreorder
  175. dsubu $sp,6*8
  176. sd $s5,40($sp)
  177. sd $s4,32($sp)
  178. ___
  179. $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
  180. sd $s3,24($sp)
  181. sd $s2,16($sp)
  182. sd $s1,8($sp)
  183. sd $s0,0($sp)
  184. ___
  185. $code.=<<___;
  186. .set reorder
  187. ld $h0,0($ctx) # load hash value
  188. ld $h1,8($ctx)
  189. ld $h2,16($ctx)
  190. ld $r0,24($ctx) # load key
  191. ld $r1,32($ctx)
  192. ld $s1,40($ctx)
  193. .Loop:
  194. #if defined(_MIPS_ARCH_MIPS64R6)
  195. ld $in0,0($inp) # load input
  196. ld $in1,8($inp)
  197. #else
  198. ldl $in0,0+MSB($inp) # load input
  199. ldl $in1,8+MSB($inp)
  200. ldr $in0,0+LSB($inp)
  201. ldr $in1,8+LSB($inp)
  202. #endif
  203. daddiu $len,-1
  204. daddiu $inp,16
  205. #ifdef MIPSEB
  206. # if defined(_MIPS_ARCH_MIPS64R2)
  207. dsbh $in0,$in0 # byte swap
  208. dsbh $in1,$in1
  209. dshd $in0,$in0
  210. dshd $in1,$in1
  211. # else
  212. ori $tmp0,$zero,0xFF
  213. dsll $tmp2,$tmp0,32
  214. or $tmp0,$tmp2 # 0x000000FF000000FF
  215. and $tmp1,$in0,$tmp0 # byte swap
  216. and $tmp3,$in1,$tmp0
  217. dsrl $tmp2,$in0,24
  218. dsrl $tmp4,$in1,24
  219. dsll $tmp1,24
  220. dsll $tmp3,24
  221. and $tmp2,$tmp0
  222. and $tmp4,$tmp0
  223. dsll $tmp0,8 # 0x0000FF000000FF00
  224. or $tmp1,$tmp2
  225. or $tmp3,$tmp4
  226. and $tmp2,$in0,$tmp0
  227. and $tmp4,$in1,$tmp0
  228. dsrl $in0,8
  229. dsrl $in1,8
  230. dsll $tmp2,8
  231. dsll $tmp4,8
  232. and $in0,$tmp0
  233. and $in1,$tmp0
  234. or $tmp1,$tmp2
  235. or $tmp3,$tmp4
  236. or $in0,$tmp1
  237. or $in1,$tmp3
  238. dsrl $tmp1,$in0,32
  239. dsrl $tmp3,$in1,32
  240. dsll $in0,32
  241. dsll $in1,32
  242. or $in0,$tmp1
  243. or $in1,$tmp3
  244. # endif
  245. #endif
  246. daddu $h0,$in0 # accumulate input
  247. daddu $h1,$in1
  248. sltu $tmp0,$h0,$in0
  249. sltu $tmp1,$h1,$in1
  250. daddu $h1,$tmp0
  251. dmultu ($r0,$h0) # h0*r0
  252. daddu $h2,$padbit
  253. sltu $tmp0,$h1,$tmp0
  254. mflo ($d0,$r0,$h0)
  255. mfhi ($d1,$r0,$h0)
  256. dmultu ($s1,$h1) # h1*5*r1
  257. daddu $tmp0,$tmp1
  258. daddu $h2,$tmp0
  259. mflo ($tmp0,$s1,$h1)
  260. mfhi ($tmp1,$s1,$h1)
  261. dmultu ($r1,$h0) # h0*r1
  262. daddu $d0,$tmp0
  263. daddu $d1,$tmp1
  264. mflo ($tmp2,$r1,$h0)
  265. mfhi ($d2,$r1,$h0)
  266. sltu $tmp0,$d0,$tmp0
  267. daddu $d1,$tmp0
  268. dmultu ($r0,$h1) # h1*r0
  269. daddu $d1,$tmp2
  270. sltu $tmp2,$d1,$tmp2
  271. mflo ($tmp0,$r0,$h1)
  272. mfhi ($tmp1,$r0,$h1)
  273. daddu $d2,$tmp2
  274. dmultu ($s1,$h2) # h2*5*r1
  275. daddu $d1,$tmp0
  276. daddu $d2,$tmp1
  277. mflo ($tmp2,$s1,$h2)
  278. dmultu ($r0,$h2) # h2*r0
  279. sltu $tmp0,$d1,$tmp0
  280. daddu $d2,$tmp0
  281. mflo ($tmp3,$r0,$h2)
  282. daddu $d1,$tmp2
  283. daddu $d2,$tmp3
  284. sltu $tmp2,$d1,$tmp2
  285. daddu $d2,$tmp2
  286. li $tmp0,-4 # final reduction
  287. and $tmp0,$d2
  288. dsrl $tmp1,$d2,2
  289. andi $h2,$d2,3
  290. daddu $tmp0,$tmp1
  291. daddu $h0,$d0,$tmp0
  292. sltu $tmp0,$h0,$tmp0
  293. daddu $h1,$d1,$tmp0
  294. sltu $tmp0,$h1,$tmp0
  295. daddu $h2,$h2,$tmp0
  296. bnez $len,.Loop
  297. sd $h0,0($ctx) # store hash value
  298. sd $h1,8($ctx)
  299. sd $h2,16($ctx)
  300. .set noreorder
  301. ld $s5,40($sp) # epilogue
  302. ld $s4,32($sp)
  303. ___
  304. $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
  305. ld $s3,24($sp)
  306. ld $s2,16($sp)
  307. ld $s1,8($sp)
  308. ld $s0,0($sp)
  309. ___
  310. $code.=<<___;
  311. jr $ra
  312. daddu $sp,6*8
  313. .end poly1305_blocks_internal
  314. ___
  315. }
  316. {
  317. my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
  318. $code.=<<___;
  319. .align 5
  320. .globl poly1305_emit
  321. .ent poly1305_emit
  322. poly1305_emit:
  323. .frame $sp,0,$ra
  324. .set reorder
  325. ld $tmp0,0($ctx)
  326. ld $tmp1,8($ctx)
  327. ld $tmp2,16($ctx)
  328. daddiu $in0,$tmp0,5 # compare to modulus
  329. sltiu $tmp3,$in0,5
  330. daddu $in1,$tmp1,$tmp3
  331. sltu $tmp3,$in1,$tmp3
  332. daddu $tmp2,$tmp2,$tmp3
  333. dsrl $tmp2,2 # see if it carried/borrowed
  334. dsubu $tmp2,$zero,$tmp2
  335. nor $tmp3,$zero,$tmp2
  336. and $in0,$tmp2
  337. and $tmp0,$tmp3
  338. and $in1,$tmp2
  339. and $tmp1,$tmp3
  340. or $in0,$tmp0
  341. or $in1,$tmp1
  342. lwu $tmp0,0($nonce) # load nonce
  343. lwu $tmp1,4($nonce)
  344. lwu $tmp2,8($nonce)
  345. lwu $tmp3,12($nonce)
  346. dsll $tmp1,32
  347. dsll $tmp3,32
  348. or $tmp0,$tmp1
  349. or $tmp2,$tmp3
  350. daddu $in0,$tmp0 # accumulate nonce
  351. daddu $in1,$tmp2
  352. sltu $tmp0,$in0,$tmp0
  353. daddu $in1,$tmp0
  354. dsrl $tmp0,$in0,8 # write mac value
  355. dsrl $tmp1,$in0,16
  356. dsrl $tmp2,$in0,24
  357. sb $in0,0($mac)
  358. dsrl $tmp3,$in0,32
  359. sb $tmp0,1($mac)
  360. dsrl $tmp0,$in0,40
  361. sb $tmp1,2($mac)
  362. dsrl $tmp1,$in0,48
  363. sb $tmp2,3($mac)
  364. dsrl $tmp2,$in0,56
  365. sb $tmp3,4($mac)
  366. dsrl $tmp3,$in1,8
  367. sb $tmp0,5($mac)
  368. dsrl $tmp0,$in1,16
  369. sb $tmp1,6($mac)
  370. dsrl $tmp1,$in1,24
  371. sb $tmp2,7($mac)
  372. sb $in1,8($mac)
  373. dsrl $tmp2,$in1,32
  374. sb $tmp3,9($mac)
  375. dsrl $tmp3,$in1,40
  376. sb $tmp0,10($mac)
  377. dsrl $tmp0,$in1,48
  378. sb $tmp1,11($mac)
  379. dsrl $tmp1,$in1,56
  380. sb $tmp2,12($mac)
  381. sb $tmp3,13($mac)
  382. sb $tmp0,14($mac)
  383. sb $tmp1,15($mac)
  384. jr $ra
  385. .end poly1305_emit
  386. .rdata
  387. .asciiz "Poly1305 for MIPS64, CRYPTOGAMS by <appro\@openssl.org>"
  388. .align 2
  389. ___
  390. }
  391. $output and open STDOUT,">$output";
  392. print $code;
  393. close STDOUT or die "error closing STDOUT: $!";