blocks.q 12 KB


  # Variable declarations for crypto_onetimeauth_poly1305_neon2_blocks.
  # reg128 names hold 128-bit vector values, stack128 names are 128-bit
  # stack slots (the code takes their address with `&name`), and int32
  # names are 32-bit integers.
  #
  # r0..r4: the five accumulators that collect the products for one limb
  # position each; they are carried with shifts by 26 and `mask`.
  1. reg128 r0
  2. reg128 r1
  3. reg128 r2
  4. reg128 r3
  5. reg128 r4
  # x01, x23, x4: running state, loaded from input_0 and stored back at `end`.
  # Per the comments in the loop: x01 = f0,f0',f1,f1'; x23 = f2,f2',f3,f3';
  # x4 = f4,f4',?,?.
  6. reg128 x01
  7. reg128 x23
  8. reg128 x4
  # y0, y12, y34: first multiplier set, loaded from input_1.
  # 5y12, 5y34: five times y12 / y34, computed as (y << 2) + y.
  9. reg128 y0
  10. reg128 y12
  11. reg128 y34
  12. reg128 5y12
  13. reg128 5y34
  # Stack copies of the y values; mainloop2 reloads the registers from these.
  14. stack128 y0_stack
  15. stack128 y12_stack
  16. stack128 y34_stack
  17. stack128 5y12_stack
  18. stack128 5y34_stack
  # z0, z12, z34, 5z12, 5z34: second multiplier set, loaded from input_1
  # after an 8-byte gap; only mainloop2 multiplies by these.
  19. reg128 z0
  20. reg128 z12
  21. reg128 z34
  22. reg128 5z12
  23. reg128 5z34
  # Stack copies of the z values.
  24. stack128 z0_stack
  25. stack128 z12_stack
  26. stack128 z34_stack
  27. stack128 5z12_stack
  28. stack128 5z34_stack
  # two24: stack copy of u4, i.e. (0xff >> 7) << 24 = 2^24 in each 64-bit
  # element; r4/r0/u4 are reloaded from it inside the loops.
  29. stack128 two24
  # ptr: scratch pointer used to address the stack slots above.
  30. int32 ptr
  # c01, c23, d01, d23: raw 16-byte message loads from input_2.
  31. reg128 c01
  32. reg128 c23
  33. reg128 d01
  34. reg128 d23
  # t0..t4: carries (accumulator >> 26) used in the reduction chains.
  35. reg128 t0
  36. reg128 t1
  37. reg128 t2
  38. reg128 t3
  39. reg128 t4
  # mask: 0xffffffff >> 6 = 0x03ffffff in each 64-bit element.
  40. reg128 mask
  # u4 is scratch for the 2^24 constant. u0..u3 are declared but not
  # referenced anywhere in the code shown in this file.
  41. reg128 u0
  42. reg128 u1
  43. reg128 u2
  44. reg128 u3
  45. reg128 u4
  # v01, v23, v4: message limbs cut out of d01/d23 in mainloop2;
  # mid: helper holding the word pairs (m1,m2) and (m5,m6) for v23.
  46. reg128 v01
  47. reg128 mid
  48. reg128 v23
  49. reg128 v4
  # len: byte count, initialised from input_3 and returned by the function.
  50. int32 len
  # crypto_onetimeauth_poly1305_neon2_blocks
  #
  # Inputs, as used below:
  #   input_0: state pointer. 16+16+8 bytes are read into x01, x23, x4
  #            (aligned loads) and the updated values are stored back at
  #            `end`, whether or not either loop ran.
  #   input_1: multiplier table, read as 8+16+16 bytes (y0, y12, y34), an
  #            8-byte gap, then 8+16+16 bytes (z0, z12, z34).
  #            NOTE(review): presumably powers of the Poly1305 key r;
  #            confirm against the code that builds this table.
  #   input_2: message pointer.
  #   input_3: message length in bytes (copied to len).
  # Returns len after subtracting 64 per mainloop2 iteration and 32 per
  # mainloop iteration. Both loops test `len > N` (unsigned, strict), so
  # nothing is consumed when len <= 32, and otherwise 1..32 bytes remain
  # unprocessed and are reported through the return value.
  #
  # Notation: `a[0,1] += b[i] unsigned* c[j]` accumulates a 32x32-bit
  # product into the 64-bit element made of words 0 and 1 of a. Elements
  # [0,1] and [2,3] form two lanes (f and f' in the comments below).
  51. qpushenter crypto_onetimeauth_poly1305_neon2_blocks
  52. len = input_3
  53. # XXX: change storage format so the y,z loads are aligned
  # Load the y multipliers: 8 bytes into 64-bit element 0 of y0 (element 1
  # is whatever `new y0` left there), then two 16-byte vectors.
  54. new y0
  55. y0 = mem64[input_1]y0[1]; input_1 += 8
  56. y12 = mem128[input_1]; input_1 += 16
  57. y34 = mem128[input_1]; input_1 += 16
  # Skip 8 bytes, then load the z multipliers with the same layout.
  58. input_1 += 8
  59. new z0
  60. z0 = mem64[input_1]z0[1]; input_1 += 8
  61. z12 = mem128[input_1]; input_1 += 16
  62. z34 = mem128[input_1]; input_1 += 16
  # Start building the constants; the shifts that finish them are
  # interleaved with the loads below:
  #   mask = 0xffffffff >> 6        = 0x03ffffff per 64-bit element
  #   u4   = (0xff >> 7) << 24      = 2^24       per 64-bit element
  63. 2x mask = 0xffffffff
  64. 2x u4 = 0xff
  # Load the state; input_0 is advanced by 32 for the loads and then
  # restored so that `end` can store through it again.
  65. x01 aligned= mem128[input_0];input_0+=16
  66. x23 aligned= mem128[input_0];input_0+=16
  67. x4 aligned= mem64[input_0]x4[1]
  68. input_0 -= 32
  69. 2x mask unsigned>>=6
  70. 2x u4 unsigned>>= 7
  # 5y = (y << 2) + y, and likewise 5z.
  71. 4x 5y12 = y12 << 2
  72. 4x 5y34 = y34 << 2
  73. 4x 5y12 += y12
  74. 4x 5y34 += y34
  75. 2x u4 <<= 24
  76. 4x 5z12 = z12 << 2
  77. 4x 5z34 = z34 << 2
  78. 4x 5z12 += z12
  79. 4x 5z34 += z34
  # Spill the 2^24 constant and all multipliers to stack slots so the loops
  # can reload them when the registers are reused.
  80. new two24
  81. new y0_stack
  82. new y12_stack
  83. new y34_stack
  84. new 5y12_stack
  85. new 5y34_stack
  86. new z0_stack
  87. new z12_stack
  88. new z34_stack
  89. new 5z12_stack
  90. new 5z34_stack
  91. ptr = &two24
  92. mem128[ptr] aligned= u4
  # Seed r4 and r0 with (2^24, 0, 2^24, 0) for the first mainloop2
  # iteration; later iterations reload both from two24 in the carry code.
  93. r4 = u4
  94. r0 = u4
  95. ptr = &y0_stack
  96. mem128[ptr] aligned= y0
  97. ptr = &y12_stack
  98. mem128[ptr] aligned= y12
  99. ptr = &y34_stack
  100. mem128[ptr] aligned= y34
  101. ptr = &z0_stack
  102. mem128[ptr] aligned= z0
  103. ptr = &z12_stack
  104. mem128[ptr] aligned= z12
  105. ptr = &z34_stack
  106. mem128[ptr] aligned= z34
  107. ptr = &5y12_stack
  108. mem128[ptr] aligned= 5y12
  109. ptr = &5y34_stack
  110. mem128[ptr] aligned= 5y34
  111. ptr = &5z12_stack
  112. mem128[ptr] aligned= 5z12
  113. ptr = &5z34_stack
  114. mem128[ptr] aligned= 5z34
  # The 64-byte loop is entered only when len > 64 (unsigned).
  115. unsigned>? len - 64
  116. goto below64bytes if !unsigned>
  # Bias input_2 by 32: inside mainloop2 the first loads fetch bytes 32..63
  # of the current 64-byte chunk. The bias is removed after the loop.
  117. input_2 += 32
  # mainloop2: one iteration consumes 64 bytes.
  # Each accumulator r0..r4 receives three strands, whose statements are
  # interleaved below:
  #   - x*z products for its limb position (5z12/5z34 supply the terms that
  #     use the multiplied-by-5 values),
  #   - v*y products, where v01/v23/v4 are limbs cut from bytes 0..31 of
  #     the chunk (d01/d23), and
  #   - the raw words of bytes 32..63 (c01/c23), shifted by 0/6/12/18, plus
  #     the 2^24 already sitting in r4.
  # Reading x01/x23/x4 as f0..f4 and z12/z34 as z1..z4 (by analogy with the
  # x layout comment further down), the x*z part is:
  #   r4: f0*z4 + f1*z3 + f2*z2 + f3*z1 + f4*z0
  #   r3: f0*z3 + f1*z2 + f2*z1 + f3*z0 + f4*5z4
  #   r2: f0*z2 + f1*z1 + f2*z0 + f3*5z4 + f4*5z3
  #   r1: f0*z1 + f1*z0 + f2*5z4 + f3*5z3 + f4*5z2
  #   r0: f0*z0 + f1*5z4 + f2*5z3 + f3*5z2 + f4*5z1
  # and the v*y part has the same shape with v for f and y for z.
  118. mainloop2:
  119. c01 = mem128[input_2];input_2+=16 # m0 m1 m2 m3
  120. c23 = mem128[input_2];input_2+=16 # m4 m5 m6 m7
  121. r4[0,1] += x01[0] unsigned* z34[2]; r4[2,3] += x01[1] unsigned* z34[3]
  # z12 and z0 (and 5z34/5z12 below) are reloaded from the stack on every
  # iteration.
  122. ptr = &z12_stack
  123. z12 aligned= mem128[ptr]
  124. r4[0,1] += x01[2] unsigned* z34[0]; r4[2,3] += x01[3] unsigned* z34[1]
  125. ptr = &z0_stack
  126. z0 aligned= mem128[ptr]
  127. r4[0,1] += x23[0] unsigned* z12[2]; r4[2,3] += x23[1] unsigned* z12[3]
  # Swap c01[3] with c23[2]: c01 = m0 m1 m2 m6, c23 = m4 m5 m3 m7.
  128. c01 c23 = c01[0]c01[1]c01[2]c23[2]c23[0]c23[1]c01[3]c23[3]
  129. r4[0,1] += x23[2] unsigned* z12[0]; r4[2,3] += x23[3] unsigned* z12[1]
  130. r4[0,1] += x4[0] unsigned* z0[0]; r4[2,3] += x4[1] unsigned* z0[1]
  # r3 starts from the widened, shifted message words rather than from zero.
  131. r3[0,1] = c23[2]<<18; r3[2,3] = c23[3]<<18 # m3<<18 0 m7<<18 0
  # Swap c01[1] with c23[0] to finish the transpose.
  132. c01 c23 = c01[0]c23[0]c01[2]c01[3]c01[1]c23[1]c23[2]c23[3]
  133. # c01 c23 is now m0 m4 m2 m6; m1 m5 m3 m7
  134. r3[0,1] += x01[0] unsigned* z34[0]; r3[2,3] += x01[1] unsigned* z34[1]
  135. r3[0,1] += x01[2] unsigned* z12[2]; r3[2,3] += x01[3] unsigned* z12[3]
  # Build r0 = (m0, 0, m4, 0) in three steps, reusing the zero words of the
  # 2^24 constant that r0 holds on loop entry.
  136. r0 = r0[1]c01[0]r0[2,3] # 0 m0 2^24 0
  137. r3[0,1] += x23[0] unsigned* z12[0]; r3[2,3] += x23[1] unsigned* z12[1]
  # input_2 is at chunk+64 after the two loads above; step back to the
  # start of the chunk for the d01/d23 loads.
  138. input_2 -= 64
  139. r3[0,1] += x23[2] unsigned* z0[0]; r3[2,3] += x23[3] unsigned* z0[1]
  140. ptr = &5z34_stack
  141. 5z34 aligned= mem128[ptr]
  142. r3[0,1] += x4[0] unsigned* 5z34[2]; r3[2,3] += x4[1] unsigned* 5z34[3]
  143. r0 = r0[1]r0[0]r0[3]r0[2] # m0 0 0 2^24
  144. r2[0,1] = c01[2]<<12; r2[2,3] = c01[3]<<12 # m2<<12 0 m6<<12 0
  # d01 = bytes 0..15 of the chunk.
  145. d01 = mem128[input_2];input_2+=16 # m0 m1 m2 m3
  146. r2[0,1] += x01[0] unsigned* z12[2]; r2[2,3] += x01[1] unsigned* z12[3]
  147. r2[0,1] += x01[2] unsigned* z12[0]; r2[2,3] += x01[3] unsigned* z12[1]
  148. r2[0,1] += x23[0] unsigned* z0[0]; r2[2,3] += x23[1] unsigned* z0[1]
  149. r2[0,1] += x23[2] unsigned* 5z34[2]; r2[2,3] += x23[3] unsigned* 5z34[3]
  150. r2[0,1] += x4[0] unsigned* 5z34[0]; r2[2,3] += x4[1] unsigned* 5z34[1]
  151. r0 = r0[0,1]c01[1]r0[2] # m0 0 m4 0
  152. r1[0,1] = c23[0]<<6; r1[2,3] = c23[1]<<6 # m1<<6 0 m5<<6 0
  153. r1[0,1] += x01[0] unsigned* z12[0]; r1[2,3] += x01[1] unsigned* z12[1]
  154. r1[0,1] += x01[2] unsigned* z0[0]; r1[2,3] += x01[3] unsigned* z0[1]
  155. r1[0,1] += x23[0] unsigned* 5z34[2]; r1[2,3] += x23[1] unsigned* 5z34[3]
  156. r1[0,1] += x23[2] unsigned* 5z34[0]; r1[2,3] += x23[3] unsigned* 5z34[1]
  157. ptr = &5z12_stack
  158. 5z12 aligned= mem128[ptr]
  159. r1[0,1] += x4[0] unsigned* 5z12[2]; r1[2,3] += x4[1] unsigned* 5z12[3]
  # d23 = bytes 16..31 of the chunk.
  160. d23 = mem128[input_2];input_2+=16 # m4 m5 m6 m7
  # input_2 is at chunk+32 here; this moves it to chunk+64.
  161. input_2 += 32
  162. r0[0,1] += x4[0] unsigned* 5z12[0]; r0[2,3] += x4[1] unsigned* 5z12[1]
  163. r0[0,1] += x23[0] unsigned* 5z34[0]; r0[2,3] += x23[1] unsigned* 5z34[1]
  # Exchange the middle 64-bit halves of d01/d23 (64-bit indices here).
  164. d01 d23 = d01[0] d23[0] d01[1] d23[1] # m0 m1 m4 m5; m2 m3 m6 m7
  165. r0[0,1] += x23[2] unsigned* 5z12[2]; r0[2,3] += x23[3] unsigned* 5z12[3]
  166. r0[0,1] += x01[0] unsigned* z0[0]; r0[2,3] += x01[1] unsigned* z0[1]
  # Cut d01/d23 into limbs: bit offsets 0 and 26 go to v01, 52 and 78 to
  # v23 (via mid >> 20 and d23 >> 14), and 104 to v4 (d23 >> 40).
  167. new mid
  168. 2x v4 = d23 unsigned>> 40 # m2m3>>40, m6m7>>40
  169. mid = d01[1]d23[0] mid[2,3] # m1 m2 ? ?
  170. new v23
  171. v23[2] = d23[0,1] unsigned>> 14; v23[3] = d23[2,3] unsigned>> 14
  172. mid = mid[0,1] d01[3]d23[2] # m1 m2 m5 m6
  173. new v01
  174. v01[2] = d01[0,1] unsigned>> 26; v01[3] = d01[2,3] unsigned>> 26
  175. v01 = d01[1]d01[0] v01[2,3] # m1, m0, m0m1>>26, m4m5>>26
  176. r0[0,1] += x01[2] unsigned* 5z34[2]; r0[2,3] += x01[3] unsigned* 5z34[3]
  177. v01 = v01[1]d01[2] v01[2,3] # m0, m4, m0m1>>26, m4m5>>26
  178. v23[0] = mid[0,1] unsigned>> 20; v23[1] = mid[2,3] unsigned>> 20
  179. # v23: m1m2>>20, m5m6>>20, m2m3>>14, m6m7>>14
  180. v4 = v4[0]v4[2]v4[1]v4[3] # m2m3>>40, m6m7>>40, 0, 0
  # Truncate v01/v23 to 26 bits; v4 needs no mask (64-bit value >> 40) and
  # instead gets bit 24 set in every word. The y registers are reloaded
  # from the stack in between.
  # NOTE(review): bit 24 of limb 4 is bit 4*26+24 = 128, presumably the
  # Poly1305 per-block pad bit; confirm.
  181. 4x v01 &= 0x03ffffff
  182. ptr = &y34_stack
  183. y34 aligned= mem128[ptr]
  184. 4x v23 &= 0x03ffffff
  185. ptr = &y12_stack
  186. y12 aligned= mem128[ptr]
  187. 4x v4 |= 0x01000000
  188. ptr = &y0_stack
  189. y0 aligned= mem128[ptr]
  # v*y products, accumulated on top of the x*z products and message words.
  190. r4[0,1] += v01[0] unsigned* y34[2]; r4[2,3] += v01[1] unsigned* y34[3]
  191. r4[0,1] += v01[2] unsigned* y34[0]; r4[2,3] += v01[3] unsigned* y34[1]
  192. r4[0,1] += v23[0] unsigned* y12[2]; r4[2,3] += v23[1] unsigned* y12[3]
  193. r4[0,1] += v23[2] unsigned* y12[0]; r4[2,3] += v23[3] unsigned* y12[1]
  194. r4[0,1] += v4[0] unsigned* y0[0]; r4[2,3] += v4[1] unsigned* y0[1]
  195. ptr = &5y34_stack
  196. 5y34 aligned= mem128[ptr]
  197. r3[0,1] += v01[0] unsigned* y34[0]; r3[2,3] += v01[1] unsigned* y34[1]
  198. r3[0,1] += v01[2] unsigned* y12[2]; r3[2,3] += v01[3] unsigned* y12[3]
  199. r3[0,1] += v23[0] unsigned* y12[0]; r3[2,3] += v23[1] unsigned* y12[1]
  200. r3[0,1] += v23[2] unsigned* y0[0]; r3[2,3] += v23[3] unsigned* y0[1]
  201. r3[0,1] += v4[0] unsigned* 5y34[2]; r3[2,3] += v4[1] unsigned* 5y34[3]
  202. ptr = &5y12_stack
  203. 5y12 aligned= mem128[ptr]
  204. r0[0,1] += v4[0] unsigned* 5y12[0]; r0[2,3] += v4[1] unsigned* 5y12[1]
  205. r0[0,1] += v23[0] unsigned* 5y34[0]; r0[2,3] += v23[1] unsigned* 5y34[1]
  206. r0[0,1] += v23[2] unsigned* 5y12[2]; r0[2,3] += v23[3] unsigned* 5y12[3]
  207. r0[0,1] += v01[0] unsigned* y0[0]; r0[2,3] += v01[1] unsigned* y0[1]
  208. r0[0,1] += v01[2] unsigned* 5y34[2]; r0[2,3] += v01[3] unsigned* 5y34[3]
  209. r1[0,1] += v01[0] unsigned* y12[0]; r1[2,3] += v01[1] unsigned* y12[1]
  210. r1[0,1] += v01[2] unsigned* y0[0]; r1[2,3] += v01[3] unsigned* y0[1]
  211. r1[0,1] += v23[0] unsigned* 5y34[2]; r1[2,3] += v23[1] unsigned* 5y34[3]
  212. r1[0,1] += v23[2] unsigned* 5y34[0]; r1[2,3] += v23[3] unsigned* 5y34[1]
  213. r1[0,1] += v4[0] unsigned* 5y12[2]; r1[2,3] += v4[1] unsigned* 5y12[3]
  214. r2[0,1] += v01[0] unsigned* y12[2]; r2[2,3] += v01[1] unsigned* y12[3]
  215. r2[0,1] += v01[2] unsigned* y12[0]; r2[2,3] += v01[3] unsigned* y12[1]
  216. r2[0,1] += v23[0] unsigned* y0[0]; r2[2,3] += v23[1] unsigned* y0[1]
  217. r2[0,1] += v23[2] unsigned* 5y34[2]; r2[2,3] += v23[3] unsigned* 5y34[3]
  218. r2[0,1] += v4[0] unsigned* 5y34[0]; r2[2,3] += v4[1] unsigned* 5y34[1]
  # Carry chain. Two chains run interleaved:
  #   r0 -> r1 -> r2 -> r3 -> x4   and   r3 -> x4 -> x01 -> r1
  # The carry t0 out of x4 is added to limb 0 as t0 + (t0 << 2) = 5*t0
  # (consistent with reduction modulo 2^130 - 5). r4 and r0 are reloaded
  # with the 2^24 constant as soon as their sums have moved to x4 and x01,
  # and z34 is reloaded for the next iteration.
  219. ptr = &two24
  220. 2x t1 = r0 unsigned>> 26
  221. len -= 64
  222. r0 &= mask
  223. 2x r1 += t1
  224. 2x t4 = r3 unsigned>> 26
  225. r3 &= mask
  226. 2x x4 = r4 + t4
  227. r4 aligned= mem128[ptr]
  228. 2x t2 = r1 unsigned>> 26
  229. r1 &= mask
  230. 2x t0 = x4 unsigned>> 26
  231. 2x r2 += t2
  232. x4 &= mask
  233. 2x x01 = r0 + t0
  234. r0 aligned= mem128[ptr]
  235. ptr = &z34_stack
  236. 2x t0 <<= 2
  237. 2x t3 = r2 unsigned>> 26
  238. 2x x01 += t0
  239. z34 aligned= mem128[ptr]
  240. x23 = r2 & mask
  241. 2x r3 += t3
  # input_2 goes from chunk+64 to chunk+96, i.e. 32 bytes into the next
  # chunk, which restores the bias expected at the top of the loop.
  242. input_2 += 32
  243. 2x t1 = x01 unsigned>> 26
  # The [0,2,1,3] permutations move both lane values into words 0 and 1 so
  # that two limbs can be packed into one register.
  244. x23 = x23[0,2,1,3]
  245. x01 = x01 & mask
  246. 2x r1 += t1
  247. 2x t4 = r3 unsigned>> 26
  248. x01 = x01[0,2,1,3]
  249. r3 &= mask
  250. r1 = r1[0,2,1,3]
  251. 2x x4 += t4
  252. r3 = r3[0,2,1,3]
  253. x01 = x01[0,1] r1[0,1]
  254. x23 = x23[0,1] r3[0,1]
  255. x4 = x4[0,2,1,3]
  256. # before these final permutations:
  257. # x01 had f0,0,f0',0
  258. # r1 had f1,0,f1',0
  259. # x23 had f2,0,f2',0
  260. # r3 had f3,0,f3',0
  261. # x4 had f4,0,f4',0
  262. # x01 has f0,f0',f1,f1'
  263. # x23 has f2,f2',f3,f3'
  264. # x4 has f4,f4',?,?
  # Repeat while more than 64 bytes remain.
  265. unsigned>? len - 64
  266. goto mainloop2 if unsigned>
  # Remove the 32-byte bias so input_2 points at the next unread byte.
  267. input_2 -= 32
  # 32-byte loop: runs only while len > 32 (unsigned, strict).
  268. below64bytes:
  269. unsigned>? len - 32
  270. goto end if !unsigned>
  # mainloop: one iteration consumes 32 bytes. Only the y multipliers are
  # used here; the y registers are not reloaded in this loop, so they hold
  # either the initial loads or the last reloads done by mainloop2.
  # Per lane the result is x*y plus the message words m0..m3 (lane 0) or
  # m4..m7 (lane 1) shifted by 0/6/12/18, plus 2^24 in r4 from two24.
  271. mainloop:
  272. new r0
  273. ptr = &two24
  274. r4 aligned= mem128[ptr]
  # u4 is reloaded only as a source of zero words (u4[1]) for building r0.
  275. u4 aligned= mem128[ptr]
  276. c01 = mem128[input_2];input_2+=16 # m0 m1 m2 m3
  277. r4[0,1] += x01[0] unsigned* y34[2]; r4[2,3] += x01[1] unsigned* y34[3]
  278. c23 = mem128[input_2];input_2+=16 # m4 m5 m6 m7
  279. r4[0,1] += x01[2] unsigned* y34[0]; r4[2,3] += x01[3] unsigned* y34[1]
  280. r0 = u4[1]c01[0]r0[2,3] # 0 m0 ? ?
  281. r4[0,1] += x23[0] unsigned* y12[2]; r4[2,3] += x23[1] unsigned* y12[3]
  282. r0 = r0[0,1]u4[1]c23[0] # 0 m0 0 m4
  283. r4[0,1] += x23[2] unsigned* y12[0]; r4[2,3] += x23[3] unsigned* y12[1]
  284. r0 = r0[1]r0[0]r0[3]r0[2] # m0 0 m4 0
  285. r4[0,1] += x4[0] unsigned* y0[0]; r4[2,3] += x4[1] unsigned* y0[1]
  286. r0[0,1] += x4[0] unsigned* 5y12[0]; r0[2,3] += x4[1] unsigned* 5y12[1]
  287. r0[0,1] += x23[0] unsigned* 5y34[0]; r0[2,3] += x23[1] unsigned* 5y34[1]
  288. r0[0,1] += x23[2] unsigned* 5y12[2]; r0[2,3] += x23[3] unsigned* 5y12[3]
  # Full transpose of the 32-bit words, done in one step here.
  289. c01 c23 = c01[0]c23[0]c01[2]c23[2]c01[1]c23[1]c01[3]c23[3] # m0 m4 m2 m6; m1 m5 m3 m7
  290. r0[0,1] += x01[0] unsigned* y0[0]; r0[2,3] += x01[1] unsigned* y0[1]
  291. r3[0,1] = c23[2]<<18; r3[2,3] = c23[3]<<18 # m3<<18 0 m7<<18 0
  292. r0[0,1] += x01[2] unsigned* 5y34[2]; r0[2,3] += x01[3] unsigned* 5y34[3]
  293. r3[0,1] += x01[0] unsigned* y34[0]; r3[2,3] += x01[1] unsigned* y34[1]
  294. r3[0,1] += x01[2] unsigned* y12[2]; r3[2,3] += x01[3] unsigned* y12[3]
  295. r3[0,1] += x23[0] unsigned* y12[0]; r3[2,3] += x23[1] unsigned* y12[1]
  296. r3[0,1] += x23[2] unsigned* y0[0]; r3[2,3] += x23[3] unsigned* y0[1]
  297. r1[0,1] = c23[0]<<6; r1[2,3] = c23[1]<<6 # m1<<6 0 m5<<6 0
  298. r3[0,1] += x4[0] unsigned* 5y34[2]; r3[2,3] += x4[1] unsigned* 5y34[3]
  299. r1[0,1] += x01[0] unsigned* y12[0]; r1[2,3] += x01[1] unsigned* y12[1]
  300. r1[0,1] += x01[2] unsigned* y0[0]; r1[2,3] += x01[3] unsigned* y0[1]
  301. r1[0,1] += x23[0] unsigned* 5y34[2]; r1[2,3] += x23[1] unsigned* 5y34[3]
  302. r1[0,1] += x23[2] unsigned* 5y34[0]; r1[2,3] += x23[3] unsigned* 5y34[1]
  303. r2[0,1] = c01[2]<<12; r2[2,3] = c01[3]<<12 # m2<<12 0 m6<<12 0
  304. r1[0,1] += x4[0] unsigned* 5y12[2]; r1[2,3] += x4[1] unsigned* 5y12[3]
  305. r2[0,1] += x01[0] unsigned* y12[2]; r2[2,3] += x01[1] unsigned* y12[3]
  306. r2[0,1] += x01[2] unsigned* y12[0]; r2[2,3] += x01[3] unsigned* y12[1]
  307. r2[0,1] += x23[0] unsigned* y0[0]; r2[2,3] += x23[1] unsigned* y0[1]
  308. r2[0,1] += x23[2] unsigned* 5y34[2]; r2[2,3] += x23[3] unsigned* 5y34[3]
  309. r2[0,1] += x4[0] unsigned* 5y34[0]; r2[2,3] += x4[1] unsigned* 5y34[1]
  # Carry chain, same structure as in mainloop2 but carried out in r0/r4
  # directly: the carry t0 out of r4 is added to r0 as t0 + (t0 << 2) =
  # 5*t0, and the results are moved into x01/x23/x4 at the end.
  310. 2x t1 = r0 unsigned>> 26
  311. r0 &= mask
  312. 2x r1 += t1
  313. 2x t4 = r3 unsigned>> 26
  314. r3 &= mask
  315. 2x r4 += t4
  316. 2x t2 = r1 unsigned>> 26
  317. r1 &= mask
  318. 2x t0 = r4 unsigned>> 26
  319. 2x r2 += t2
  320. r4 &= mask
  321. 2x r0 += t0
  322. 2x t0 <<= 2
  323. 2x t3 = r2 unsigned>> 26
  324. 2x r0 += t0
  325. x23 = r2 & mask
  326. 2x r3 += t3
  327. 2x t1 = r0 unsigned>> 26
  328. x01 = r0 & mask
  329. 2x r1 += t1
  330. 2x t4 = r3 unsigned>> 26
  331. r3 &= mask
  332. 2x x4 = r4 + t4
  333. len -= 32
  # Repack into the x01 = f0,f0',f1,f1' / x23 = f2,f2',f3,f3' /
  # x4 = f4,f4',?,? layout.
  334. x01 = x01[0,2,1,3]
  335. x23 = x23[0,2,1,3]
  336. r1 = r1[0,2,1,3]
  337. r3 = r3[0,2,1,3]
  338. x4 = x4[0,2,1,3]
  339. x01 = x01[0,1] r1[0,1]
  340. x23 = x23[0,1] r3[0,1]
  # Repeat while more than 32 bytes remain.
  341. unsigned>? len - 32
  342. goto mainloop if unsigned>
  # Write the state back (16+16+8 bytes) and return the remaining length.
  # These stores are written without the `aligned` qualifier that the
  # loads from input_0 use.
  343. end:
  344. mem128[input_0] = x01;input_0+=16
  345. mem128[input_0] = x23;input_0+=16
  346. mem64[input_0] = x4[0]
  # Self-assignment; the value of len is unchanged.
  347. len = len
  348. qpopreturn len