auth.S 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514
  1. /* -*- Mode:MAL */
  2. /**
  3. * @author Caleb James DeLisle <cjd at cjdns dot fr>
  4. * @version 1.0
  5. * @since 28 Feb 2014
  6. *
  7. * Bernstein's Poly1305 ported to mips32r2 processors.
  8. * Based on the poly1305-donna algorithm by Floodyberry.
  9. *
  10. * This is free and unencumbered software released into the public domain.
  11. *
  12. * Anyone is free to copy, modify, publish, use, compile, sell, or
  13. * distribute this software, either in source code form or as a compiled
  14. * binary, for any purpose, commercial or non-commercial, and by any
  15. * means.
  16. *
  17. * In jurisdictions that recognize copyright laws, the author or authors
  18. * of this software dedicate any and all copyright interest in the
  19. * software to the public domain. We make this dedication for the benefit
  20. * of the public at large and to the detriment of our heirs and
  21. * successors. We intend this dedication to be an overt act of
  22. * relinquishment in perpetuity of all present and future rights to this
  23. * software under copyright law.
  24. *
  25. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  26. * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  27. * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  28. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  29. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  30. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  31. * OTHER DEALINGS IN THE SOFTWARE.
  32. */
  /* Scratch register and running carry used by the macros and the main routine. */
  #define SC      $2
  #define CA      $3

  /* Arguments: OUT receives the 16-byte tag, MSG/LEN describe the message,
   * KEY points at the 32-byte key (reloaded from the stack in the prologue). */
  #define OUT     $4
  #define MSG     $5
  #define LEN     $6
  #define KEY     $7

  /* H0..H4: the five limbs of the running accumulator h. */
  #define H0      $8
  #define H1      $9
  #define H2      $10
  #define H3      $11
  #define H4      $12

  /* R0..R4: the five limbs of the clamped key part r. */
  #define R0      $13
  #define R1      $14
  #define R2      $15
  #define R3      $16
  #define R4      $17

  /* O0..O4: temporaries (loaded words, multiplication row results, g limbs). */
  #define O0      $18
  #define O1      $19
  #define O2      $20
  #define O3      $21
  #define O4      $22

  /* S1..S4: r1*5 .. r4*5, precomputed once per call. */
  #define S1      $23
  #define S2      $24
  #define S3      $25
  #define S4      $30
  /* Select LITTLE_ENDIAN or BIG_ENDIAN from whichever MIPSEL / MIPSEB
   * spelling is defined; refuse to build when none of them is. */
  #if defined(__MIPSEL) || defined(_MIPSEL) || \
      defined(MIPSEL) || defined(__MIPSEL__)
  #  define LITTLE_ENDIAN
  #elif defined(__MIPSEB) || defined(_MIPSEB) || \
        defined(MIPSEB) || defined(__MIPSEB__)
  #  define BIG_ENDIAN
  #else
  #  error cannot determine byte order of target processor
  #endif
  /*
   * CARRY out, carryOut, carryIn, scratch
   *
   * Reduces the 64-bit HI:LO accumulator plus carryIn to one 26-bit limb:
   *   out      = low 26 bits of (HI:LO + carryIn)
   *   carryOut = (HI:LO + carryIn) >> 26, truncated to 32 bits
   * Clobbers scratch. carryIn is read before carryOut is written, so the
   * same register may be passed for both.
   */
  .macro CARRY out, carryOut, carryIn, scratch
          mflo    \out                            /* low word of the row sum           */
          mfhi    \scratch                        /* high word of the row sum          */
          addu    \carryOut, \carryIn, \out       /* low word + incoming carry         */
          sltu    \out, \carryOut, \out           /* 1 when that addition wrapped      */
          addu    \scratch, \scratch, \out        /* push the wrap into the high word  */
          ext     \out, \carryOut, 0, 26          /* the limb: bits 0..25              */
          srl     \carryOut, \carryOut, 26        /* bits 26..31 of the low word       */
          sll     \scratch, \scratch, 6           /* high word lined up above them     */
          addu    \carryOut, \scratch, \carryOut  /* combined carry into the next limb */
  .endm
  /*
   * MULT_ROW p1b..p5b, p1a..p5a
   *
   * HI:LO = p1a*p1b + p2a*p2b + p3a*p3b + p4a*p4b + p5a*p5b (unsigned).
   * The first multiply overwrites HI:LO, the remaining four accumulate into it.
   */
  .macro MULT_ROW p1b,p2b,p3b,p4b,p5b, p1a,p2a,p3a,p4a,p5a
          multu   \p1a, \p1b              /* start a fresh 64-bit sum */
          maddu   \p2a, \p2b
          maddu   \p3a, \p3b
          maddu   \p4a, \p4b
          maddu   \p5a, \p5b
  .endm
  /*
   * PLUS_5X output, input, input2, scratch
   *
   * output = input + 5 * input2, computed modulo 2^32.
   * Clobbers scratch, HI and LO. scratch must not be the same register as
   * input or input2, because it is overwritten before they are read.
   */
  .macro PLUS_5X output, input, input2, scratch
          addiu   \scratch, $zero, 5
          multu   \scratch, \input2       /* multiply by the 5 held in the scratch argument */
          mflo    \scratch                /* only the low 32 bits of the product are kept   */
          addu    \output, \input, \scratch
  .endm
  /*
   * BYTESWAP reg
   *
   * Reverses the order of the four bytes in reg, in place.
   */
  .macro BYTESWAP reg
          wsbh    \reg, \reg              /* swap the two bytes inside each halfword */
          rotr    \reg, \reg, 16          /* then swap the two halfwords             */
  .endm
  /*
   * LITTLE_ENDIAN_TO_HOST reg
   *
   * Converts a word that was loaded from little-endian data into host byte
   * order, in place. Expands to a byte swap when BIG_ENDIAN is defined and to
   * nothing otherwise.
   */
  .macro LITTLE_ENDIAN_TO_HOST reg
  #ifdef BIG_ENDIAN
          BYTESWAP \reg
  #endif
  .endm
  /*
   * HOST_TO_LITTLE_ENDIAN reg
   *
   * Converts a host-order word to little-endian, in place. The conversion is
   * its own inverse, so this simply forwards to LITTLE_ENDIAN_TO_HOST.
   */
  .macro HOST_TO_LITTLE_ENDIAN reg
          LITTLE_ENDIAN_TO_HOST \reg
  .endm
  /*
   * crypto_onetimeauth_poly1305_mips32r2donna
   *
   * Computes a Poly1305 one-time authenticator (poly1305-donna, 26-bit limbs).
   *
   * Presumed C prototype (NOTE(review): confirm against the declaring header):
   *   int f(unsigned char *out, const unsigned char *m,
   *         unsigned long long inlen, const unsigned char *k);
   *
   * In:   $4 = OUT, receives the 16-byte tag
   *       $5 = MSG, message bytes
   *       $6/$7 = 64-bit message length; only its low 32 bits are used
   *       16(caller $sp) = KEY, 32 bytes: r at 0..15, pad at 16..31
   * Out:  $2 = 0
   * Saves and restores $16-$23 and $fp; clobbers $2, $3, $8-$15, $24, $25,
   * HI and LO.
   * Stack: 56-byte frame (multiple of 8). 0..15($sp) is the buffer for the
   * final partial block, 16..48($sp) hold the saved registers.
   *
   * NOTE(review): KEY, MSG and OUT are accessed with lw/sw, which require
   * word-aligned addresses -- confirm that callers guarantee this.
   */
          .abicalls
          .text
          .set    nomips16
          .set    nomicromips
          .globl  crypto_onetimeauth_poly1305_mips32r2donna
          .ent    crypto_onetimeauth_poly1305_mips32r2donna
          .type   crypto_onetimeauth_poly1305_mips32r2donna, @function
  crypto_onetimeauth_poly1305_mips32r2donna:
          .frame  $sp,56,$31

          /* Save every callee-saved register this routine overwrites. */
          addiu   $sp,$sp,-56
          sw      $fp,48($sp)
          sw      $23,44($sp)
          sw      $22,40($sp)
          sw      $21,36($sp)
          sw      $20,32($sp)
          sw      $19,28($sp)
          sw      $18,24($sp)
          sw      $17,20($sp)
          sw      $16,16($sp)

          /*
           * The length is passed as a 64-bit value in the $6/$7 pair. This is a
           * 32-bit machine, so only the low word is used. On big-endian targets
           * the low word arrives in $7 (aliased KEY), so move it into LEN first.
           * The key pointer is the fifth argument word: 16 bytes above the
           * incoming $sp, i.e. 56 + 16 = 72 above the adjusted one.
           */
  #ifdef BIG_ENDIAN
          move    LEN,KEY
  #endif
          lw      KEY,72($sp)

          /* t0..t3 = U8TO32_LE(key + 0, 4, 8, 12) */
          lw      O0,0(KEY)
          lw      O1,4(KEY)
          lw      O2,8(KEY)
          lw      O3,12(KEY)
          LITTLE_ENDIAN_TO_HOST O0
          LITTLE_ENDIAN_TO_HOST O1
          LITTLE_ENDIAN_TO_HOST O2
          LITTLE_ENDIAN_TO_HOST O3

          /* r0 = t0 & 0x3ffffff; t0 >>= 26; t0 |= t1 << 6; */
          ext     R0,O0,0,26
          srl     O0,O0,26
          sll     SC,O1,6
          or      O0,SC,O0

          /* Clamp mask 0xffffff03 == (uint32_t) -253 */
          addiu   O4,$zero,-253

          /* r1 = t0 & 0x3ffff03; t1 >>= 20; t1 |= t2 << 12; */
          ext     R1,O0,0,26
          and     R1,R1,O4
          srl     O1,O1,20
          sll     SC,O2,12
          or      O1,SC,O1

          /* Rotate the mask left by 6 (right by 26): 0xffffc0ff */
          rotr    O4,O4,26

          /* r2 = t1 & 0x3ffc0ff; t2 >>= 14; t2 |= t3 << 18; */
          ext     R2,O1,0,26
          and     R2,R2,O4
          srl     O2,O2,14
          sll     SC,O3,18
          or      O2,SC,O2

          /* Rotate the mask left by another 6: 0xfff03fff */
          rotr    O4,O4,26

          /* r3 = t2 & 0x3f03fff; t3 >>= 8; */
          ext     R3,O2,0,26
          and     R3,R3,O4
          srl     O3,O3,8

          /* r4 = t3 & 0x00fffff; */
          ext     R4,O3,0,20

          /* s1..s4 = r1..r4 * 5 */
          PLUS_5X S1,$zero,R1,SC
          PLUS_5X S2,$zero,R2,SC
          PLUS_5X S3,$zero,R3,SC
          PLUS_5X S4,$zero,R4,SC

          /* h = 0 */
          move    H0,$zero
          move    H1,$zero
          move    H2,$zero
          move    H3,$zero
          move    H4,$zero

          /* Fewer than 16 bytes in total: go straight to the tail handling.
           * The comparison is unsigned because LEN is an unsigned byte count. */
          sltiu   SC,LEN,16
          bne     SC,$zero,poly1305_mips32r2donna_atmost15bytes

  poly1305_mips32r2donna_16bytes:
          /* Consume one full 16-byte block. */
          addiu   MSG,MSG,16
          addiu   LEN,LEN,-16
          lw      O0,-16(MSG)
          lw      O1,-12(MSG)
          lw      O2,-8(MSG)
          lw      O3,-4(MSG)

          /* h0 += t0 & 0x3ffffff; */
          LITTLE_ENDIAN_TO_HOST O0
          ext     SC,O0,0,26
          addu    H0,SC,H0

          /* h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; */
          srl     SC,O0,26
          addu    H1,SC,H1
          LITTLE_ENDIAN_TO_HOST O1
          ext     SC,O1,0,20              /* 26 - (32 - 26) bits */
          sll     SC,SC,6                 /* 32 - 26 */
          addu    H1,SC,H1

          /* h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; */
          srl     SC,O1,20
          addu    H2,SC,H2
          LITTLE_ENDIAN_TO_HOST O2
          ext     SC,O2,0,14              /* 26 - (32 - 20) bits */
          sll     SC,SC,12                /* 32 - 20 */
          addu    H2,SC,H2

          /* h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; */
          srl     SC,O2,14
          addu    H3,SC,H3
          LITTLE_ENDIAN_TO_HOST O3
          ext     SC,O3,0,8               /* 26 - (32 - 14) bits */
          sll     SC,SC,18                /* 32 - 14 */
          addu    H3,SC,H3

          /* h4 += (t3 >> 8) | (1 << 24); */
          addiu   SC,$zero,1
          sll     SC,SC,24
          addu    H4,SC,H4
          srl     SC,O3,8
          addu    H4,SC,H4

  poly1305_mips32r2donna_mult:
          /* h *= r: five product rows, each reduced to a 26-bit limb in O0..O4
           * with the carry chained through CA. */
          MULT_ROW H0,H1,H2,H3,H4, R0,S4,S3,S2,S1
          CARRY O0, CA, $zero, SC
          MULT_ROW H0,H1,H2,H3,H4, R1,R0,S4,S3,S2
          CARRY O1,CA,CA,SC
          MULT_ROW H0,H1,H2,H3,H4, R2,R1,R0,S4,S3
          CARRY O2,CA,CA,SC
          MULT_ROW H0,H1,H2,H3,H4, R3,R2,R1,R0,S4
          CARRY O3,CA,CA,SC
          MULT_ROW H0,H1,H2,H3,H4, R4,R3,R2,R1,R0
          CARRY O4,CA,CA,SC

          /* h0 = row0 + b * 5; h1..h4 = rows 1..4 */
          PLUS_5X H0,O0,CA,SC
          move    H1,O1
          move    H2,O2
          move    H3,O3
          move    H4,O4

          /* Loop while at least 16 bytes remain (unsigned comparison). */
          sltiu   SC,LEN,16
          beq     SC,$zero,poly1305_mips32r2donna_16bytes

  poly1305_mips32r2donna_atmost15bytes:
          /* Nothing left: finalize. Otherwise build a zero-padded final block
           * in the 16-byte buffer at the bottom of the frame. */
          beq     LEN,$zero,poly1305_mips32r2donna_finish
          sw      $zero,0($sp)
          sw      $zero,4($sp)
          sw      $zero,8($sp)
          sw      $zero,12($sp)

          /* for (j = 0; j < inlen; j++) mp[j] = m[j]; */
          move    O2,$sp                  /* write cursor into the buffer */
          addu    O1,MSG,LEN              /* one past the last message byte */
  poly1305_mips32r2donna_loadbyte:
          lbu     SC,0(MSG)
          sb      SC,0(O2)
          addiu   MSG,MSG,1
          addiu   O2,O2,1
          bne     MSG,O1,poly1305_mips32r2donna_loadbyte

          /* mp[j++] = 1; */
          addiu   SC,$zero,1
          sb      SC,0(O2)

          /* LEN = 0 so the check after the multiply falls through to finish. */
          move    LEN,$zero
          lw      O0,0($sp)
          lw      O1,4($sp)
          lw      O2,8($sp)
          lw      O3,12($sp)

          /* h0 += t0 & 0x3ffffff; */
          LITTLE_ENDIAN_TO_HOST O0
          ext     SC,O0,0,26
          addu    H0,H0,SC

          /* h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; */
          srl     SC,O0,26
          addu    H1,SC,H1
          LITTLE_ENDIAN_TO_HOST O1
          ext     SC,O1,0,20              /* 26 - (32 - 26) bits */
          sll     SC,SC,6                 /* 32 - 26 */
          addu    H1,SC,H1

          /* h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; */
          srl     SC,O1,20
          addu    H2,SC,H2
          LITTLE_ENDIAN_TO_HOST O2
          ext     SC,O2,0,14              /* 26 - (32 - 20) bits */
          sll     SC,SC,12                /* 32 - 20 */
          addu    H2,SC,H2

          /* h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; */
          srl     SC,O2,14
          addu    H3,SC,H3
          LITTLE_ENDIAN_TO_HOST O3
          ext     SC,O3,0,8               /* 26 - (32 - 14) bits */
          sll     SC,SC,18                /* 32 - 14 */
          addu    H3,SC,H3

          /* h4 += (t3 >> 8);  the padding byte stored above replaces 1 << 24 */
          srl     SC,O3,8
          addu    H4,SC,H4

          /* PC-relative branch: safe in position-independent code. */
          b       poly1305_mips32r2donna_mult

  poly1305_mips32r2donna_finish:
          /* Full carry propagation through h. */
          /* b = h0 >> 26; h0 = h0 & 0x3ffffff; */
          srl     CA,H0,26
          ext     H0,H0,0,26
          /* h1 += b; b = h1 >> 26; h1 = h1 & 0x3ffffff; */
          addu    H1,CA,H1
          srl     CA,H1,26
          ext     H1,H1,0,26
          /* h2 += b; b = h2 >> 26; h2 = h2 & 0x3ffffff; */
          addu    H2,CA,H2
          srl     CA,H2,26
          ext     H2,H2,0,26
          /* h3 += b; b = h3 >> 26; h3 = h3 & 0x3ffffff; */
          addu    H3,CA,H3
          srl     CA,H3,26
          ext     H3,H3,0,26
          /* h4 += b; b = h4 >> 26; h4 = h4 & 0x3ffffff; */
          addu    H4,CA,H4
          srl     CA,H4,26
          ext     H4,H4,0,26
          /* h0 += b * 5; */
          PLUS_5X H0,H0,CA,SC

          /* g = h + 5 - 2^130, limb by limb. */
          /* g0 = h0 + 5; b = g0 >> 26; g0 &= 0x3ffffff; */
          addiu   O0,H0,5
          srl     CA,O0,26
          ext     O0,O0,0,26
          /* g1 = h1 + b; b = g1 >> 26; g1 &= 0x3ffffff; */
          addu    O1,H1,CA
          srl     CA,O1,26
          ext     O1,O1,0,26
          /* g2 = h2 + b; b = g2 >> 26; g2 &= 0x3ffffff; */
          addu    O2,H2,CA
          srl     CA,O2,26
          ext     O2,O2,0,26
          /* g3 = h3 + b; b = g3 >> 26; g3 &= 0x3ffffff; */
          addu    O3,H3,CA
          srl     CA,O3,26
          ext     O3,O3,0,26
          /* g4 = h4 + b - (1 << 26); */
          addu    O4,H4,CA
          addiu   SC,$zero,1
          sll     SC,SC,26
          subu    O4,O4,SC

          /* Branch-free select between h and g. */
          /* b = (g4 >> 31) - 1;  all ones when g4 is non-negative, else zero */
          srl     CA,O4,31
          addiu   CA,CA,-1
          /* nb = ~b; */
          addiu   SC,$zero,-1
          xor     SC,CA,SC
          /* h0 = (h0 & nb) | (g0 & b); */
          and     H0,H0,SC
          and     O0,O0,CA
          or      H0,O0,H0
          /* h1 = (h1 & nb) | (g1 & b); */
          and     H1,H1,SC
          and     O1,O1,CA
          or      H1,O1,H1
          /* h2 = (h2 & nb) | (g2 & b); */
          and     H2,H2,SC
          and     O2,O2,CA
          or      H2,O2,H2
          /* h3 = (h3 & nb) | (g3 & b); */
          and     H3,H3,SC
          and     O3,O3,CA
          or      H3,O3,H3
          /* h4 = (h4 & nb) | (g4 & b); */
          and     H4,H4,SC
          and     O4,O4,CA
          or      H4,O4,H4

          /*
           * f0 = ((h0      ) | (h1 << 26)) + (uint64_t)U8TO32_LE(&key[16]);
           * f1 = ((h1 >>  6) | (h2 << 20)) + (uint64_t)U8TO32_LE(&key[20]);
           * f2 = ((h2 >> 12) | (h3 << 14)) + (uint64_t)U8TO32_LE(&key[24]);
           * f3 = ((h3 >> 18) | (h4 <<  8)) + (uint64_t)U8TO32_LE(&key[28]);
           * Done in stages: repack h into four 32-bit words, then add the pad
           * with a manually propagated carry.
           */
          lw      O0,16(KEY)
          lw      O1,20(KEY)
          lw      O2,24(KEY)
          lw      O3,28(KEY)

          /* h0 = ((h0      ) | (h1 << 26)); */
          sll     SC,H1,26
          or      H0,SC,H0
          /* h1 = ((h1 >>  6) | (h2 << 20)); */
          sll     SC,H2,20
          srl     H1,H1,6
          or      H1,SC,H1
          /* h2 = ((h2 >> 12) | (h3 << 14)); */
          sll     SC,H3,14
          srl     H2,H2,12
          or      H2,SC,H2
          /* h3 = ((h3 >> 18) | (h4 <<  8)); */
          sll     SC,H4,8
          srl     H3,H3,18
          or      H3,SC,H3

          /* o0 = h0 + pad0; CA = carry out */
          LITTLE_ENDIAN_TO_HOST O0
          addu    O0,O0,H0
          sltu    CA,O0,H0

          /* o1 = h1 + pad1 + carry; CA = carry out of either addition */
          LITTLE_ENDIAN_TO_HOST O1
          addu    O1,O1,H1
          sltu    SC,O1,H1
          addu    O1,O1,CA
          sltu    CA,O1,CA
          addu    CA,SC,CA

          /* o2 = h2 + pad2 + carry; CA = carry out of either addition */
          LITTLE_ENDIAN_TO_HOST O2
          addu    O2,O2,H2
          sltu    SC,O2,H2
          addu    O2,O2,CA
          sltu    CA,O2,CA
          addu    CA,SC,CA

          /* o3 = h3 + pad3 + carry; the carry out of the top word is discarded,
           * so it is not computed. */
          LITTLE_ENDIAN_TO_HOST O3
          addu    O3,O3,H3
          addu    O3,O3,CA

          /* Store the tag as four little-endian words. */
          HOST_TO_LITTLE_ENDIAN O0
          HOST_TO_LITTLE_ENDIAN O1
          HOST_TO_LITTLE_ENDIAN O2
          HOST_TO_LITTLE_ENDIAN O3
          sw      O0,0(OUT)
          sw      O1,4(OUT)
          sw      O2,8(OUT)
          sw      O3,12(OUT)

          /* return 0; */
          move    $2,$zero

          /* Restore the callee-saved registers and release the frame. */
          lw      $16,16($sp)
          lw      $17,20($sp)
          lw      $18,24($sp)
          lw      $19,28($sp)
          lw      $20,32($sp)
          lw      $21,36($sp)
          lw      $22,40($sp)
          lw      $23,44($sp)
          lw      $fp,48($sp)
          addiu   $sp,$sp,56
          jr      $31
          .end    crypto_onetimeauth_poly1305_mips32r2donna
          .size   crypto_onetimeauth_poly1305_mips32r2donna, .-crypto_onetimeauth_poly1305_mips32r2donna