/* poly1305_asm
 *
 * Copyright (C) 2006-2020 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */
#ifndef HAVE_INTEL_AVX1
#define HAVE_INTEL_AVX1
#endif /* HAVE_INTEL_AVX1 */
#ifndef NO_AVX2_SUPPORT
#define HAVE_INTEL_AVX2
#endif /* NO_AVX2_SUPPORT */
#ifdef HAVE_INTEL_AVX1
#ifndef __APPLE__
.text
.globl poly1305_setkey_avx
.type poly1305_setkey_avx,@function
.align 16
poly1305_setkey_avx:
#else
.section __TEXT,__text
.globl _poly1305_setkey_avx
.p2align 4
_poly1305_setkey_avx:
#endif /* __APPLE__ */
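# poly1305_setkey_avx(ctx=%rdi, key=%rsi)
# Clamps r (key bytes 0-15) with the Poly1305 masks, zeroes the
# accumulator h, stores the pad (key bytes 16-31), and precomputes the
# tables k*r[0] (offset 352) and k*r[1] (offset 408) for k = 0..6 that
# the scalar block routines below index by h[2].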
movabsq $0xffffffc0fffffff, %r10
movabsq $0xffffffc0ffffffc, %r11
movq (%rsi), %rdx
movq 8(%rsi), %rax
movq 16(%rsi), %rcx
movq 24(%rsi), %r8
andq %r10, %rdx
andq %r11, %rax
movq %rdx, %r10
movq %rax, %r11
xorq %r9, %r9
movq %rdx, (%rdi)
movq %rax, 8(%rdi)
movq %r9, 24(%rdi)
movq %r9, 32(%rdi)
movq %r9, 40(%rdi)
movq %rcx, 48(%rdi)
movq %r8, 56(%rdi)
movq %r9, 352(%rdi)
movq %r9, 408(%rdi)
movq %rdx, 360(%rdi)
movq %rax, 416(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 368(%rdi)
movq %r11, 424(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 376(%rdi)
movq %r11, 432(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 384(%rdi)
movq %r11, 440(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 392(%rdi)
movq %r11, 448(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 400(%rdi)
movq %r11, 456(%rdi)
movq %r9, 608(%rdi)
movb $0x01, 616(%rdi)
repz retq
#ifndef __APPLE__
.size poly1305_setkey_avx,.-poly1305_setkey_avx
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_block_avx
.type poly1305_block_avx,@function
.align 16
poly1305_block_avx:
#else
.section __TEXT,__text
.globl _poly1305_block_avx
.p2align 4
_poly1305_block_avx:
#endif /* __APPLE__ */
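# poly1305_block_avx(ctx=%rdi, m=%rsi)
# Processes a single 16-byte block: h += m (plus the per-block high bit
# kept at ctx offset 616), then h = (h * r) mod 2^130 - 5 using 64-bit
# multiplies and the precomputed h[2]*r tables, storing h back at
# offsets 24/32/40.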
pushq %r15
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
movq (%rdi), %r15
movq 8(%rdi), %rbx
movq 24(%rdi), %r8
movq 32(%rdi), %r9
movq 40(%rdi), %r10
xorq %r14, %r14
movb 616(%rdi), %r14b
# h += m
movq (%rsi), %r11
movq 8(%rsi), %r12
addq %r11, %r8
adcq %r12, %r9
movq %rbx, %rax
adcq %r14, %r10
# r[1] * h[0] => rdx, rax ==> t2, t1
mulq %r8
movq %rax, %r12
movq %rdx, %r13
# r[0] * h[1] => rdx, rax ++> t2, t1
movq %r15, %rax
mulq %r9
addq %rax, %r12
movq %r15, %rax
adcq %rdx, %r13
# r[0] * h[0] => rdx, rax ==> t4, t0
mulq %r8
movq %rax, %r11
movq %rdx, %r8
# r[1] * h[1] => rdx, rax =+> t3, t2
movq %rbx, %rax
mulq %r9
# r[0] * h[2] +> t2
addq 352(%rdi,%r10,8), %r13
movq %rdx, %r14
addq %r8, %r12
adcq %rax, %r13
# r[1] * h[2] +> t3
adcq 408(%rdi,%r10,8), %r14
# r * h in r14, r13, r12, r11
# h = (r * h) mod 2^130 - 5
movq %r13, %r10
andq $-4, %r13
andq $3, %r10
addq %r13, %r11
movq %r13, %r8
adcq %r14, %r12
adcq $0x00, %r10
shrdq $2, %r14, %r8
shrq $2, %r14
addq %r11, %r8
adcq %r14, %r12
movq %r12, %r9
adcq $0x00, %r10
# h in r10, r9, r8
# Store h to ctx
movq %r8, 24(%rdi)
movq %r9, 32(%rdi)
movq %r10, 40(%rdi)
popq %r14
popq %r13
popq %r12
popq %rbx
popq %r15
repz retq
#ifndef __APPLE__
.size poly1305_block_avx,.-poly1305_block_avx
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_blocks_avx
.type poly1305_blocks_avx,@function
.align 16
poly1305_blocks_avx:
#else
.section __TEXT,__text
.globl _poly1305_blocks_avx
.p2align 4
_poly1305_blocks_avx:
#endif /* __APPLE__ */
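# poly1305_blocks_avx(ctx=%rdi, m=%rsi, bytes=%rdx)
# Loops over full 16-byte blocks. The lookups at offsets 360/416 are one
# table entry above the 352/408 bases, which effectively multiplies r[0]
# and r[1] by h[2] + 1, folding in the implicit 2^128 padding bit that
# is set on every full block.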
pushq %r15
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
movq %rdx, %rcx
movq (%rdi), %r15
movq 8(%rdi), %rbx
movq 24(%rdi), %r8
movq 32(%rdi), %r9
movq 40(%rdi), %r10
L_poly1305_avx_blocks_start:
# h += m
movq (%rsi), %r11
movq 8(%rsi), %r12
addq %r11, %r8
adcq %r12, %r9
movq %rbx, %rax
adcq $0x00, %r10
# r[1] * h[0] => rdx, rax ==> t2, t1
mulq %r8
movq %rax, %r12
movq %rdx, %r13
# r[0] * h[1] => rdx, rax ++> t2, t1
movq %r15, %rax
mulq %r9
addq %rax, %r12
movq %r15, %rax
adcq %rdx, %r13
# r[0] * h[0] => rdx, rax ==> t4, t0
mulq %r8
movq %rax, %r11
movq %rdx, %r8
# r[1] * h[1] => rdx, rax =+> t3, t2
movq %rbx, %rax
mulq %r9
# r[0] * h[2] +> t2
addq 360(%rdi,%r10,8), %r13
movq %rdx, %r14
addq %r8, %r12
adcq %rax, %r13
# r[1] * h[2] +> t3
adcq 416(%rdi,%r10,8), %r14
# r * h in r14, r13, r12, r11
# h = (r * h) mod 2^130 - 5
movq %r13, %r10
andq $-4, %r13
andq $3, %r10
addq %r13, %r11
movq %r13, %r8
adcq %r14, %r12
adcq $0x00, %r10
shrdq $2, %r14, %r8
shrq $2, %r14
addq %r11, %r8
adcq %r14, %r12
movq %r12, %r9
adcq $0x00, %r10
# h in r10, r9, r8
# Next block from message
addq $16, %rsi
subq $16, %rcx
jg L_poly1305_avx_blocks_start
# Store h to ctx
movq %r8, 24(%rdi)
movq %r9, 32(%rdi)
movq %r10, 40(%rdi)
popq %r14
popq %r13
popq %r12
popq %rbx
popq %r15
repz retq
#ifndef __APPLE__
.size poly1305_blocks_avx,.-poly1305_blocks_avx
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_final_avx
.type poly1305_final_avx,@function
.align 16
poly1305_final_avx:
#else
.section __TEXT,__text
.globl _poly1305_final_avx
.p2align 4
_poly1305_final_avx:
#endif /* __APPLE__ */
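# poly1305_final_avx(ctx=%rdi, mac=%rsi)
# Pads any buffered partial block (0x01 then zeros) and runs it through
# poly1305_block_avx with the high-bit flag cleared, performs the final
# reduction mod 2^130 - 5, adds the pad (key bytes 16-31), writes the
# 16-byte tag to %rsi, and zeroes r, h and the pad in the context.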
pushq %rbx
pushq %r12
movq %rsi, %rbx
movq 608(%rdi), %rax
testq %rax, %rax
je L_poly1305_avx_final_no_more
movb $0x01, 480(%rdi,%rax,1)
jmp L_poly1305_avx_final_cmp_rem
L_poly1305_avx_final_zero_rem:
movb $0x00, 480(%rdi,%rax,1)
L_poly1305_avx_final_cmp_rem:
incb %al
cmpq $16, %rax
jl L_poly1305_avx_final_zero_rem
movb $0x00, 616(%rdi)
leaq 480(%rdi), %rsi
#ifndef __APPLE__
callq poly1305_block_avx@plt
#else
callq _poly1305_block_avx
#endif /* __APPLE__ */
L_poly1305_avx_final_no_more:
movq 24(%rdi), %rax
movq 32(%rdi), %rdx
movq 40(%rdi), %rcx
movq 48(%rdi), %r11
movq 56(%rdi), %r12
# h %= p
# h = (h + pad)
# mod 2^130 - 5
movq %rcx, %r8
andq $3, %rcx
shrq $2, %r8
# Multiply by 5
leaq 0(%r8,%r8,4), %r8
addq %r8, %rax
adcq $0x00, %rdx
adcq $0x00, %rcx
# Fixup when h is between (1 << 130) - 5 and (1 << 130) - 1
movq %rax, %r8
movq %rdx, %r9
movq %rcx, %r10
addq $5, %r8
adcq $0x00, %r9
adcq $0x00, %r10
cmpq $4, %r10
cmoveq %r8, %rax
cmoveq %r9, %rdx
# h += pad
addq %r11, %rax
adcq %r12, %rdx
movq %rax, (%rbx)
movq %rdx, 8(%rbx)
# Zero out r
movq $0x00, (%rdi)
movq $0x00, 8(%rdi)
# Zero out h
movq $0x00, 24(%rdi)
movq $0x00, 32(%rdi)
movq $0x00, 40(%rdi)
# Zero out pad
movq $0x00, 48(%rdi)
movq $0x00, 56(%rdi)
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size poly1305_final_avx,.-poly1305_final_avx
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX1 */
#ifdef HAVE_INTEL_AVX2
#ifndef __APPLE__
.text
.globl poly1305_calc_powers_avx2
.type poly1305_calc_powers_avx2,@function
.align 16
poly1305_calc_powers_avx2:
#else
.section __TEXT,__text
.globl _poly1305_calc_powers_avx2
.p2align 4
_poly1305_calc_powers_avx2:
#endif /* __APPLE__ */
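# poly1305_calc_powers_avx2(ctx=%rdi)
# Computes r^2, r^3 and r^4 mod 2^130 - 5 and stores r, r^2, r^3 and r^4
# as five 26-bit limbs (one per 32-bit word) at offsets 224, 256, 288
# and 320 for use by poly1305_blocks_avx2.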
pushq %r12
pushq %r13
pushq %r14
pushq %r15
pushq %rbx
pushq %rbp
movq (%rdi), %rcx
movq 8(%rdi), %r8
xorq %r9, %r9
# Convert to 26 bits in 32
movq %rcx, %rax
movq %rcx, %rdx
movq %rcx, %rsi
movq %r8, %rbx
movq %r8, %rbp
shrq $26, %rdx
shrdq $52, %r8, %rsi
shrq $14, %rbx
shrdq $40, %r9, %rbp
andq $0x3ffffff, %rax
andq $0x3ffffff, %rdx
andq $0x3ffffff, %rsi
andq $0x3ffffff, %rbx
andq $0x3ffffff, %rbp
movl %eax, 224(%rdi)
movl %edx, 228(%rdi)
movl %esi, 232(%rdi)
movl %ebx, 236(%rdi)
movl %ebp, 240(%rdi)
movl $0x00, 244(%rdi)
# Square 128-bit
movq %r8, %rax
mulq %rcx
xorq %r13, %r13
movq %rax, %r11
movq %rdx, %r12
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
movq %rcx, %rax
mulq %rax
movq %rax, %r10
movq %rdx, %r15
movq %r8, %rax
mulq %rax
addq %r15, %r11
adcq %rax, %r12
adcq %rdx, %r13
# Reduce 256-bit to 130-bit
movq %r12, %rax
movq %r13, %rdx
andq $-4, %rax
andq $3, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
shrdq $2, %rdx, %rax
shrq $2, %rdx
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
movq %r12, %rax
shrq $2, %rax
leaq 0(%rax,%rax,4), %rax
andq $3, %r12
addq %rax, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Convert to 26 bits in 32
movq %r10, %rax
movq %r10, %rdx
movq %r10, %rsi
movq %r11, %rbx
movq %r11, %rbp
shrq $26, %rdx
shrdq $52, %r11, %rsi
shrq $14, %rbx
shrdq $40, %r12, %rbp
andq $0x3ffffff, %rax
andq $0x3ffffff, %rdx
andq $0x3ffffff, %rsi
andq $0x3ffffff, %rbx
andq $0x3ffffff, %rbp
movl %eax, 256(%rdi)
movl %edx, 260(%rdi)
movl %esi, 264(%rdi)
movl %ebx, 268(%rdi)
movl %ebp, 272(%rdi)
movl $0x00, 276(%rdi)
# Multiply 128-bit by 130-bit
# r1[0] * r2[0]
movq %rcx, %rax
mulq %r10
movq %rax, %r13
movq %rdx, %r14
# r1[0] * r2[1]
movq %rcx, %rax
mulq %r11
movq $0x00, %r15
addq %rax, %r14
adcq %rdx, %r15
# r1[1] * r2[0]
movq %r8, %rax
mulq %r10
movq $0x00, %rsi
addq %rax, %r14
adcq %rdx, %r15
adcq $0x00, %rsi
# r1[0] * r2[2]
movq %rcx, %rax
mulq %r12
addq %rax, %r15
adcq %rdx, %rsi
# r1[1] * r2[1]
movq %r8, %rax
mulq %r11
movq $0x00, %rbx
addq %rax, %r15
adcq %rdx, %rsi
adcq $0x00, %rbx
# r1[1] * r2[2]
movq %r8, %rax
mulq %r12
addq %rax, %rsi
adcq %rdx, %rbx
# Reduce 260-bit to 130-bit
movq %r15, %rax
movq %rsi, %rdx
movq %rbx, %rbx
andq $-4, %rax
andq $3, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq %rbx, %r15
shrdq $2, %rdx, %rax
shrdq $2, %rbx, %rdx
shrq $2, %rbx
addq %rax, %r13
adcq %rdx, %r14
adcq %rbx, %r15
movq %r15, %rax
andq $3, %r15
shrq $2, %rax
leaq 0(%rax,%rax,4), %rax
addq %rax, %r13
adcq $0x00, %r14
adcq $0x00, %r15
# Convert to 26 bits in 32
movq %r13, %rax
movq %r13, %rdx
movq %r13, %rsi
movq %r14, %rbx
movq %r14, %rbp
shrq $26, %rdx
shrdq $52, %r14, %rsi
shrq $14, %rbx
shrdq $40, %r15, %rbp
andq $0x3ffffff, %rax
andq $0x3ffffff, %rdx
andq $0x3ffffff, %rsi
andq $0x3ffffff, %rbx
andq $0x3ffffff, %rbp
movl %eax, 288(%rdi)
movl %edx, 292(%rdi)
movl %esi, 296(%rdi)
movl %ebx, 300(%rdi)
movl %ebp, 304(%rdi)
movl $0x00, 308(%rdi)
# Square 130-bit
movq %r11, %rax
mulq %r10
xorq %r13, %r13
movq %rax, %r8
movq %rdx, %r9
addq %rax, %r8
adcq %rdx, %r9
adcq $0x00, %r13
movq %r10, %rax
mulq %rax
movq %rax, %rcx
movq %rdx, %r15
movq %r11, %rax
mulq %rax
addq %r15, %r8
adcq %rax, %r9
adcq %rdx, %r13
movq %r12, %rax
mulq %rax
movq %rax, %r14
movq %r12, %rax
mulq %r10
addq %rax, %r9
adcq %rdx, %r13
adcq $0x00, %r14
addq %rax, %r9
adcq %rdx, %r13
adcq $0x00, %r14
movq %r12, %rax
mulq %r11
addq %rax, %r13
adcq %rdx, %r14
addq %rax, %r13
adcq %rdx, %r14
# Reduce 260-bit to 130-bit
movq %r9, %rax
movq %r13, %rdx
movq %r14, %r15
andq $-4, %rax
andq $3, %r9
addq %rax, %rcx
adcq %rdx, %r8
adcq %r15, %r9
shrdq $2, %rdx, %rax
shrdq $2, %r15, %rdx
shrq $2, %r15
addq %rax, %rcx
adcq %rdx, %r8
adcq %r15, %r9
movq %r9, %rax
andq $3, %r9
shrq $2, %rax
leaq 0(%rax,%rax,4), %rax
addq %rax, %rcx
adcq $0x00, %r8
adcq $0x00, %r9
# Convert to 26 bits in 32
movq %rcx, %rax
movq %rcx, %rdx
movq %rcx, %rsi
movq %r8, %rbx
movq %r8, %rbp
shrq $26, %rdx
shrdq $52, %r8, %rsi
shrq $14, %rbx
shrdq $40, %r9, %rbp
andq $0x3ffffff, %rax
andq $0x3ffffff, %rdx
andq $0x3ffffff, %rsi
andq $0x3ffffff, %rbx
andq $0x3ffffff, %rbp
movl %eax, 320(%rdi)
movl %edx, 324(%rdi)
movl %esi, 328(%rdi)
movl %ebx, 332(%rdi)
movl %ebp, 336(%rdi)
movl $0x00, 340(%rdi)
popq %rbp
popq %rbx
popq %r15
popq %r14
popq %r13
popq %r12
repz retq
#ifndef __APPLE__
.size poly1305_calc_powers_avx2,.-poly1305_calc_powers_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_setkey_avx2
.type poly1305_setkey_avx2,@function
.align 16
poly1305_setkey_avx2:
#else
.section __TEXT,__text
.globl _poly1305_setkey_avx2
.p2align 4
_poly1305_setkey_avx2:
#endif /* __APPLE__ */
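# poly1305_setkey_avx2(ctx=%rdi, key=%rsi)
# Reuses the scalar AVX setkey, then clears the five 256-bit H limb
# vectors at offset 64 and resets the buffered-byte count (608) and the
# two state flag bytes (616/617).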
#ifndef __APPLE__
callq poly1305_setkey_avx@plt
#else
callq _poly1305_setkey_avx
#endif /* __APPLE__ */
vpxor %ymm0, %ymm0, %ymm0
vmovdqu %ymm0, 64(%rdi)
vmovdqu %ymm0, 96(%rdi)
vmovdqu %ymm0, 128(%rdi)
vmovdqu %ymm0, 160(%rdi)
vmovdqu %ymm0, 192(%rdi)
movq $0x00, 608(%rdi)
movw $0x00, 616(%rdi)
repz retq
#ifndef __APPLE__
.size poly1305_setkey_avx2,.-poly1305_setkey_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 32
#else
.p2align 5
#endif /* __APPLE__ */
L_poly1305_avx2_blocks_mask:
.quad 0x3ffffff, 0x3ffffff
.quad 0x3ffffff, 0x3ffffff
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 32
#else
.p2align 5
#endif /* __APPLE__ */
L_poly1305_avx2_blocks_hibit:
.quad 0x1000000, 0x1000000
.quad 0x1000000, 0x1000000
#ifndef __APPLE__
.text
.globl poly1305_blocks_avx2
.type poly1305_blocks_avx2,@function
.align 16
poly1305_blocks_avx2:
#else
.section __TEXT,__text
.globl _poly1305_blocks_avx2
.p2align 4
_poly1305_blocks_avx2:
#endif /* __APPLE__ */
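# poly1305_blocks_avx2(ctx=%rdi, m=%rsi, bytes=%rdx)
# Processes 64 bytes (four blocks) per iteration. Each block is split
# into five 26-bit limbs spread across the four 64-bit lanes of
# %ymm0-%ymm4; the four running sums are multiplied by r^4 each
# iteration and, on the final pass (flag byte 616 set), by r^4, r^3,
# r^2 and r before the lanes are summed and folded back into the
# 130-bit h at offsets 24/32/40.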
pushq %r12
pushq %rbx
subq $0x140, %rsp
movq %rsp, %rcx
andq $-32, %rcx
addq $32, %rcx
vpxor %ymm15, %ymm15, %ymm15
movq %rcx, %rbx
leaq 64(%rdi), %rax
addq $0xa0, %rbx
cmpw $0x00, 616(%rdi)
jne L_poly1305_avx2_blocks_begin_h
# Load the message data
vmovdqu (%rsi), %ymm0
vmovdqu 32(%rsi), %ymm1
vperm2i128 $32, %ymm1, %ymm0, %ymm2
vperm2i128 $49, %ymm1, %ymm0, %ymm0
vpunpckldq %ymm0, %ymm2, %ymm1
vpunpckhdq %ymm0, %ymm2, %ymm3
vpunpckldq %ymm15, %ymm1, %ymm0
vpunpckhdq %ymm15, %ymm1, %ymm1
vpunpckldq %ymm15, %ymm3, %ymm2
vpunpckhdq %ymm15, %ymm3, %ymm3
vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm4
vpsllq $6, %ymm1, %ymm1
vpsllq $12, %ymm2, %ymm2
vpsllq $18, %ymm3, %ymm3
vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14
# Reduce, in place, the message data
vpsrlq $26, %ymm0, %ymm10
vpsrlq $26, %ymm3, %ymm11
vpand %ymm14, %ymm0, %ymm0
vpand %ymm14, %ymm3, %ymm3
vpaddq %ymm1, %ymm10, %ymm1
vpaddq %ymm4, %ymm11, %ymm4
vpsrlq $26, %ymm1, %ymm10
vpsrlq $26, %ymm4, %ymm11
vpand %ymm14, %ymm1, %ymm1
vpand %ymm14, %ymm4, %ymm4
vpaddq %ymm2, %ymm10, %ymm2
vpslld $2, %ymm11, %ymm12
vpaddd %ymm12, %ymm11, %ymm12
vpsrlq $26, %ymm2, %ymm10
vpaddq %ymm0, %ymm12, %ymm0
vpsrlq $26, %ymm0, %ymm11
vpand %ymm14, %ymm2, %ymm2
vpand %ymm14, %ymm0, %ymm0
vpaddq %ymm3, %ymm10, %ymm3
vpaddq %ymm1, %ymm11, %ymm1
vpsrlq $26, %ymm3, %ymm10
vpand %ymm14, %ymm3, %ymm3
vpaddq %ymm4, %ymm10, %ymm4
addq $0x40, %rsi
subq $0x40, %rdx
jz L_poly1305_avx2_blocks_store
jmp L_poly1305_avx2_blocks_load_r4
L_poly1305_avx2_blocks_begin_h:
# Load the H values.
vmovdqu (%rax), %ymm0
vmovdqu 32(%rax), %ymm1
vmovdqu 64(%rax), %ymm2
vmovdqu 96(%rax), %ymm3
vmovdqu 128(%rax), %ymm4
# Check if there is a power of r to load - otherwise use r^4.
cmpb $0x00, 616(%rdi)
je L_poly1305_avx2_blocks_load_r4
# Load the 4 powers of r - r^4, r^3, r^2, r^1.
vmovdqu 224(%rdi), %ymm8
vmovdqu 256(%rdi), %ymm7
vmovdqu 288(%rdi), %ymm6
vmovdqu 320(%rdi), %ymm5
vpermq $0xd8, %ymm5, %ymm5
vpermq $0xd8, %ymm6, %ymm6
vpermq $0xd8, %ymm7, %ymm7
vpermq $0xd8, %ymm8, %ymm8
vpunpcklqdq %ymm6, %ymm5, %ymm10
vpunpckhqdq %ymm6, %ymm5, %ymm11
vpunpcklqdq %ymm8, %ymm7, %ymm12
vpunpckhqdq %ymm8, %ymm7, %ymm13
vperm2i128 $32, %ymm12, %ymm10, %ymm5
vperm2i128 $49, %ymm12, %ymm10, %ymm7
vperm2i128 $32, %ymm13, %ymm11, %ymm9
vpsrlq $32, %ymm5, %ymm6
vpsrlq $32, %ymm7, %ymm8
jmp L_poly1305_avx2_blocks_mul_5
L_poly1305_avx2_blocks_load_r4:
# Load r^4 into all four positions.
vmovdqu 320(%rdi), %ymm13
vpermq $0x00, %ymm13, %ymm5
vpsrlq $32, %ymm13, %ymm14
vpermq $0x55, %ymm13, %ymm7
vpermq $0xaa, %ymm13, %ymm9
vpermq $0x00, %ymm14, %ymm6
vpermq $0x55, %ymm14, %ymm8
L_poly1305_avx2_blocks_mul_5:
# Multiply the top 4 26-bit limbs of each power of r by 5
vpslld $2, %ymm6, %ymm10
vpslld $2, %ymm7, %ymm11
vpslld $2, %ymm8, %ymm12
vpslld $2, %ymm9, %ymm13
vpaddq %ymm10, %ymm6, %ymm10
vpaddq %ymm11, %ymm7, %ymm11
vpaddq %ymm12, %ymm8, %ymm12
vpaddq %ymm13, %ymm9, %ymm13
# Store the powers of r and their multiples of 5 for use in the multiply.
vmovdqa %ymm10, (%rbx)
vmovdqa %ymm11, 32(%rbx)
vmovdqa %ymm12, 64(%rbx)
vmovdqa %ymm13, 96(%rbx)
vmovdqa %ymm5, (%rcx)
vmovdqa %ymm6, 32(%rcx)
vmovdqa %ymm7, 64(%rcx)
vmovdqa %ymm8, 96(%rcx)
vmovdqa %ymm9, 128(%rcx)
vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14
# If not finished then loop over data
cmpb $0x01, 616(%rdi)
jne L_poly1305_avx2_blocks_start
# Do last multiply, reduce, add the four H together and move to
# 32-bit registers
vpmuludq (%rbx), %ymm4, %ymm5
vpmuludq 32(%rbx), %ymm3, %ymm10
vpmuludq 32(%rbx), %ymm4, %ymm6
vpmuludq 64(%rbx), %ymm3, %ymm11
vpmuludq 64(%rbx), %ymm4, %ymm7
vpaddq %ymm5, %ymm10, %ymm5
vpmuludq 64(%rbx), %ymm2, %ymm12
vpmuludq 96(%rbx), %ymm4, %ymm8
vpaddq %ymm6, %ymm11, %ymm6
vpmuludq 96(%rbx), %ymm1, %ymm13
vpmuludq 96(%rbx), %ymm2, %ymm10
vpaddq %ymm5, %ymm12, %ymm5
vpmuludq 96(%rbx), %ymm3, %ymm11
vpmuludq (%rcx), %ymm3, %ymm12
vpaddq %ymm5, %ymm13, %ymm5
vpmuludq (%rcx), %ymm4, %ymm9
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq (%rcx), %ymm0, %ymm13
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq (%rcx), %ymm1, %ymm10
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq (%rcx), %ymm2, %ymm11
vpmuludq 32(%rcx), %ymm2, %ymm12
vpaddq %ymm5, %ymm13, %ymm5
vpmuludq 32(%rcx), %ymm3, %ymm13
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq 32(%rcx), %ymm0, %ymm10
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq 32(%rcx), %ymm1, %ymm11
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq 64(%rcx), %ymm1, %ymm12
vpaddq %ymm9, %ymm13, %ymm9
vpmuludq 64(%rcx), %ymm2, %ymm13
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq 64(%rcx), %ymm0, %ymm10
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq 96(%rcx), %ymm0, %ymm11
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq 96(%rcx), %ymm1, %ymm12
vpaddq %ymm9, %ymm13, %ymm9
vpaddq %ymm7, %ymm10, %ymm7
vpmuludq 128(%rcx), %ymm0, %ymm13
vpaddq %ymm8, %ymm11, %ymm8
vpaddq %ymm9, %ymm12, %ymm9
vpaddq %ymm9, %ymm13, %ymm9
vpsrlq $26, %ymm5, %ymm10
vpsrlq $26, %ymm8, %ymm11
vpand %ymm14, %ymm5, %ymm5
vpand %ymm14, %ymm8, %ymm8
vpaddq %ymm6, %ymm10, %ymm6
vpaddq %ymm9, %ymm11, %ymm9
vpsrlq $26, %ymm6, %ymm10
vpsrlq $26, %ymm9, %ymm11
vpand %ymm14, %ymm6, %ymm1
vpand %ymm14, %ymm9, %ymm4
vpaddq %ymm7, %ymm10, %ymm7
vpslld $2, %ymm11, %ymm12
vpaddd %ymm12, %ymm11, %ymm12
vpsrlq $26, %ymm7, %ymm10
vpaddq %ymm5, %ymm12, %ymm5
vpsrlq $26, %ymm5, %ymm11
vpand %ymm14, %ymm7, %ymm2
vpand %ymm14, %ymm5, %ymm0
vpaddq %ymm8, %ymm10, %ymm8
vpaddq %ymm1, %ymm11, %ymm1
vpsrlq $26, %ymm8, %ymm10
vpand %ymm14, %ymm8, %ymm3
vpaddq %ymm4, %ymm10, %ymm4
vpsrldq $8, %ymm0, %ymm5
vpsrldq $8, %ymm1, %ymm6
vpsrldq $8, %ymm2, %ymm7
vpsrldq $8, %ymm3, %ymm8
vpsrldq $8, %ymm4, %ymm9
vpaddq %ymm0, %ymm5, %ymm0
vpaddq %ymm1, %ymm6, %ymm1
vpaddq %ymm2, %ymm7, %ymm2
vpaddq %ymm3, %ymm8, %ymm3
vpaddq %ymm4, %ymm9, %ymm4
vpermq $2, %ymm0, %ymm5
vpermq $2, %ymm1, %ymm6
vpermq $2, %ymm2, %ymm7
vpermq $2, %ymm3, %ymm8
vpermq $2, %ymm4, %ymm9
vpaddq %ymm0, %ymm5, %ymm0
vpaddq %ymm1, %ymm6, %ymm1
vpaddq %ymm2, %ymm7, %ymm2
vpaddq %ymm3, %ymm8, %ymm3
vpaddq %ymm4, %ymm9, %ymm4
vmovd %xmm0, %r8d
vmovd %xmm1, %r9d
vmovd %xmm2, %r10d
vmovd %xmm3, %r11d
vmovd %xmm4, %r12d
jmp L_poly1305_avx2_blocks_end_calc
L_poly1305_avx2_blocks_start:
vmovdqu (%rsi), %ymm5
vmovdqu 32(%rsi), %ymm6
vperm2i128 $32, %ymm6, %ymm5, %ymm7
vperm2i128 $49, %ymm6, %ymm5, %ymm5
vpunpckldq %ymm5, %ymm7, %ymm6
vpunpckhdq %ymm5, %ymm7, %ymm8
vpunpckldq %ymm15, %ymm6, %ymm5
vpunpckhdq %ymm15, %ymm6, %ymm6
vpunpckldq %ymm15, %ymm8, %ymm7
vpunpckhdq %ymm15, %ymm8, %ymm8
vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm9
vpsllq $6, %ymm6, %ymm6
vpsllq $12, %ymm7, %ymm7
vpsllq $18, %ymm8, %ymm8
vpmuludq (%rbx), %ymm4, %ymm10
vpaddq %ymm5, %ymm10, %ymm5
vpmuludq 32(%rbx), %ymm3, %ymm10
vpmuludq 32(%rbx), %ymm4, %ymm11
vpaddq %ymm6, %ymm11, %ymm6
vpmuludq 64(%rbx), %ymm3, %ymm11
vpmuludq 64(%rbx), %ymm4, %ymm12
vpaddq %ymm7, %ymm12, %ymm7
vpaddq %ymm5, %ymm10, %ymm5
vpmuludq 64(%rbx), %ymm2, %ymm12
vpmuludq 96(%rbx), %ymm4, %ymm13
vpaddq %ymm8, %ymm13, %ymm8
vpaddq %ymm6, %ymm11, %ymm6
vpmuludq 96(%rbx), %ymm1, %ymm13
vpmuludq 96(%rbx), %ymm2, %ymm10
vpaddq %ymm5, %ymm12, %ymm5
vpmuludq 96(%rbx), %ymm3, %ymm11
vpmuludq (%rcx), %ymm3, %ymm12
vpaddq %ymm5, %ymm13, %ymm5
vpmuludq (%rcx), %ymm4, %ymm13
vpaddq %ymm9, %ymm13, %ymm9
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq (%rcx), %ymm0, %ymm13
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq (%rcx), %ymm1, %ymm10
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq (%rcx), %ymm2, %ymm11
vpmuludq 32(%rcx), %ymm2, %ymm12
vpaddq %ymm5, %ymm13, %ymm5
vpmuludq 32(%rcx), %ymm3, %ymm13
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq 32(%rcx), %ymm0, %ymm10
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq 32(%rcx), %ymm1, %ymm11
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq 64(%rcx), %ymm1, %ymm12
vpaddq %ymm9, %ymm13, %ymm9
vpmuludq 64(%rcx), %ymm2, %ymm13
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq 64(%rcx), %ymm0, %ymm10
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq 96(%rcx), %ymm0, %ymm11
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq 96(%rcx), %ymm1, %ymm12
vpaddq %ymm9, %ymm13, %ymm9
vpaddq %ymm7, %ymm10, %ymm7
vpmuludq 128(%rcx), %ymm0, %ymm13
vpaddq %ymm8, %ymm11, %ymm8
vpaddq %ymm9, %ymm12, %ymm9
vpaddq %ymm9, %ymm13, %ymm9
vpsrlq $26, %ymm5, %ymm10
vpsrlq $26, %ymm8, %ymm11
vpand %ymm14, %ymm5, %ymm5
vpand %ymm14, %ymm8, %ymm8
vpaddq %ymm6, %ymm10, %ymm6
vpaddq %ymm9, %ymm11, %ymm9
vpsrlq $26, %ymm6, %ymm10
vpsrlq $26, %ymm9, %ymm11
vpand %ymm14, %ymm6, %ymm1
vpand %ymm14, %ymm9, %ymm4
vpaddq %ymm7, %ymm10, %ymm7
vpslld $2, %ymm11, %ymm12
vpaddd %ymm12, %ymm11, %ymm12
vpsrlq $26, %ymm7, %ymm10
vpaddq %ymm5, %ymm12, %ymm5
vpsrlq $26, %ymm5, %ymm11
vpand %ymm14, %ymm7, %ymm2
vpand %ymm14, %ymm5, %ymm0
vpaddq %ymm8, %ymm10, %ymm8
vpaddq %ymm1, %ymm11, %ymm1
vpsrlq $26, %ymm8, %ymm10
vpand %ymm14, %ymm8, %ymm3
vpaddq %ymm4, %ymm10, %ymm4
addq $0x40, %rsi
subq $0x40, %rdx
jnz L_poly1305_avx2_blocks_start
L_poly1305_avx2_blocks_store:
# Store four H values - state
vmovdqu %ymm0, (%rax)
vmovdqu %ymm1, 32(%rax)
vmovdqu %ymm2, 64(%rax)
vmovdqu %ymm3, 96(%rax)
vmovdqu %ymm4, 128(%rax)
L_poly1305_avx2_blocks_end_calc:
cmpb $0x00, 616(%rdi)
je L_poly1305_avx2_blocks_complete
movq %r8, %rax
movq %r10, %rdx
movq %r12, %rcx
shrq $12, %rdx
shrq $24, %rcx
shlq $26, %r9
shlq $52, %r10
shlq $14, %r11
shlq $40, %r12
addq %r9, %rax
adcq %r10, %rax
adcq %r11, %rdx
adcq %r12, %rdx
adcq $0x00, %rcx
movq %rcx, %r8
andq $3, %rcx
shrq $2, %r8
leaq 0(%r8,%r8,4), %r8
addq %r8, %rax
adcq $0x00, %rdx
adcq $0x00, %rcx
movq %rax, 24(%rdi)
movq %rdx, 32(%rdi)
movq %rcx, 40(%rdi)
L_poly1305_avx2_blocks_complete:
movb $0x01, 617(%rdi)
addq $0x140, %rsp
popq %rbx
popq %r12
repz retq
#ifndef __APPLE__
.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_final_avx2
.type poly1305_final_avx2,@function
.align 16
poly1305_final_avx2:
#else
.section __TEXT,__text
.globl _poly1305_final_avx2
.p2align 4
_poly1305_final_avx2:
#endif /* __APPLE__ */
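# poly1305_final_avx2(ctx=%rdi, mac=%rsi)
# Sets the finish flag (616) and, if the AVX2 block path has been used
# (617 set), calls poly1305_blocks_avx2 once more to collapse the four
# lanes into h. Any complete 16-byte blocks still buffered at offset 480
# are run through poly1305_blocks_avx, the remaining bytes are moved to
# the front of the buffer, and poly1305_final_avx writes the tag before
# the AVX2 H limbs and powers of r are zeroed.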
movb $0x01, 616(%rdi)
movb 617(%rdi), %cl
cmpb $0x00, %cl
je L_poly1305_avx2_final_done_blocks_X4
pushq %rsi
movq $0x40, %rdx
xorq %rsi, %rsi
#ifndef __APPLE__
callq poly1305_blocks_avx2@plt
#else
callq _poly1305_blocks_avx2
#endif /* __APPLE__ */
popq %rsi
L_poly1305_avx2_final_done_blocks_X4:
movq 608(%rdi), %rax
movq %rax, %rcx
andq $-16, %rcx
cmpb $0x00, %cl
je L_poly1305_avx2_final_done_blocks
pushq %rcx
pushq %rax
pushq %rsi
movq %rcx, %rdx
leaq 480(%rdi), %rsi
#ifndef __APPLE__
callq poly1305_blocks_avx@plt
#else
callq _poly1305_blocks_avx
#endif /* __APPLE__ */
popq %rsi
popq %rax
popq %rcx
L_poly1305_avx2_final_done_blocks:
subq %rcx, 608(%rdi)
xorq %rdx, %rdx
jmp L_poly1305_avx2_final_cmp_copy
L_poly1305_avx2_final_start_copy:
movb 480(%rdi,%rcx,1), %r8b
movb %r8b, 480(%rdi,%rdx,1)
incb %cl
incb %dl
L_poly1305_avx2_final_cmp_copy:
cmp %rcx, %rax
jne L_poly1305_avx2_final_start_copy
#ifndef __APPLE__
callq poly1305_final_avx@plt
#else
callq _poly1305_final_avx
#endif /* __APPLE__ */
vpxor %ymm0, %ymm0, %ymm0
vmovdqu %ymm0, 64(%rdi)
vmovdqu %ymm0, 96(%rdi)
vmovdqu %ymm0, 128(%rdi)
vmovdqu %ymm0, 160(%rdi)
vmovdqu %ymm0, 192(%rdi)
vmovdqu %ymm0, 224(%rdi)
vmovdqu %ymm0, 256(%rdi)
vmovdqu %ymm0, 288(%rdi)
vmovdqu %ymm0, 320(%rdi)
movq $0x00, 608(%rdi)
movw $0x00, 616(%rdi)
repz retq
#ifndef __APPLE__
.size poly1305_final_avx2,.-poly1305_final_avx2
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif