/* poly1305_asm.S */
/*
 * Copyright (C) 2006-2023 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */
#ifdef WOLFSSL_USER_SETTINGS
#ifdef WOLFSSL_USER_SETTINGS_ASM
/*
 * user_settings_asm.h is a file generated by the script user_settings_asm.sh.
 * The script takes in a user_settings.h and produces user_settings_asm.h, which
 * is a stripped down version of user_settings.h containing only preprocessor
 * directives. This makes the header safe to include in assembly (.S) files.
 */
#include "user_settings_asm.h"
#else
/*
 * Note: if user_settings.h contains any C code (e.g. a typedef or function
 * prototype), including it here in an assembly (.S) file will cause an
 * assembler failure. See user_settings_asm.h above.
 */
#include "user_settings.h"
#endif /* WOLFSSL_USER_SETTINGS_ASM */
#endif /* WOLFSSL_USER_SETTINGS */
#ifndef HAVE_INTEL_AVX1
#define HAVE_INTEL_AVX1
#endif /* HAVE_INTEL_AVX1 */
#ifndef NO_AVX2_SUPPORT
#define HAVE_INTEL_AVX2
#endif /* NO_AVX2_SUPPORT */
#ifdef WOLFSSL_X86_64_BUILD
#ifdef HAVE_INTEL_AVX1
#ifndef __APPLE__
.text
.globl poly1305_setkey_avx
.type poly1305_setkey_avx,@function
.align 16
poly1305_setkey_avx:
#else
.section __TEXT,__text
.globl _poly1305_setkey_avx
.p2align 4
_poly1305_setkey_avx:
#endif /* __APPLE__ */
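# poly1305_setkey_avx(ctx=%rdi, key=%rsi)
# Clamp the low 16 bytes of the key to form r (stored at 0/8), clear h
# (24/32/40), keep the high 16 bytes of the key as the pad (48/56),
# reset the buffered-byte count (608) and set the per-block pad bit (616).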
movabsq $0xffffffc0fffffff, %r10
movabsq $0xffffffc0ffffffc, %r11
movq (%rsi), %rdx
movq 8(%rsi), %rax
movq 16(%rsi), %rcx
movq 24(%rsi), %r8
andq %r10, %rdx
andq %r11, %rax
movq %rdx, %r10
movq %rax, %r11
xorq %r9, %r9
movq %rdx, (%rdi)
movq %rax, 8(%rdi)
movq %r9, 24(%rdi)
movq %r9, 32(%rdi)
movq %r9, 40(%rdi)
movq %rcx, 48(%rdi)
movq %r8, 56(%rdi)
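# Build small-multiple tables for the block routines: 352+8*i holds
# i*r[0] and 408+8*i holds i*r[1], for i = 0..6, so the h[2]*r terms
# (h[2] is only a few bits) become a table lookup instead of a multiply.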
movq %r9, 352(%rdi)
movq %r9, 408(%rdi)
movq %rdx, 360(%rdi)
movq %rax, 416(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 368(%rdi)
movq %r11, 424(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 376(%rdi)
movq %r11, 432(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 384(%rdi)
movq %r11, 440(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 392(%rdi)
movq %r11, 448(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 400(%rdi)
movq %r11, 456(%rdi)
movq %r9, 608(%rdi)
movb $0x01, 616(%rdi)
repz retq
#ifndef __APPLE__
.size poly1305_setkey_avx,.-poly1305_setkey_avx
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_block_avx
.type poly1305_block_avx,@function
.align 16
poly1305_block_avx:
#else
.section __TEXT,__text
.globl _poly1305_block_avx
.p2align 4
_poly1305_block_avx:
#endif /* __APPLE__ */
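# poly1305_block_avx(ctx=%rdi, 16-byte block=%rsi)
# One block: h += m + (pad_bit << 128), with the pad bit taken from byte
# 616 of the context, then h = (h * r) mod (2^130 - 5).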
pushq %r15
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
movq (%rdi), %r15
movq 8(%rdi), %rbx
movq 24(%rdi), %r8
movq 32(%rdi), %r9
movq 40(%rdi), %r10
xorq %r14, %r14
movb 616(%rdi), %r14b
# h += m
movq (%rsi), %r11
movq 8(%rsi), %r12
addq %r11, %r8
adcq %r12, %r9
movq %rbx, %rax
adcq %r14, %r10
# r[1] * h[0] => rdx, rax ==> t2, t1
mulq %r8
movq %rax, %r12
movq %rdx, %r13
# r[0] * h[1] => rdx, rax ++> t2, t1
movq %r15, %rax
mulq %r9
addq %rax, %r12
movq %r15, %rax
adcq %rdx, %r13
# r[0] * h[0] => rdx, rax ==> t4, t0
mulq %r8
movq %rax, %r11
movq %rdx, %r8
# r[1] * h[1] => rdx, rax =+> t3, t2
movq %rbx, %rax
mulq %r9
# r[0] * h[2] +> t2
addq 352(%rdi,%r10,8), %r13
movq %rdx, %r14
addq %r8, %r12
adcq %rax, %r13
# r[1] * h[2] +> t3
adcq 408(%rdi,%r10,8), %r14
# r * h in r14, r13, r12, r11
# h = (r * h) mod 2^130 - 5
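# Since 2^130 = 5 (mod 2^130 - 5): split the product into its low 130
# bits and c = product >> 130 (r14:r13 with the low two bits of r13
# masked off is 4*c), then add back 4*c + c = 5*c.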
movq %r13, %r10
andq $-4, %r13
andq $3, %r10
addq %r13, %r11
movq %r13, %r8
adcq %r14, %r12
adcq $0x00, %r10
shrdq $2, %r14, %r8
shrq $2, %r14
addq %r11, %r8
adcq %r14, %r12
movq %r12, %r9
adcq $0x00, %r10
# h in r10, r9, r8
# Store h to ctx
movq %r8, 24(%rdi)
movq %r9, 32(%rdi)
movq %r10, 40(%rdi)
popq %r14
popq %r13
popq %r12
popq %rbx
popq %r15
repz retq
#ifndef __APPLE__
.size poly1305_block_avx,.-poly1305_block_avx
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_blocks_avx
.type poly1305_blocks_avx,@function
.align 16
poly1305_blocks_avx:
#else
.section __TEXT,__text
.globl _poly1305_blocks_avx
.p2align 4
_poly1305_blocks_avx:
#endif /* __APPLE__ */
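# poly1305_blocks_avx(ctx=%rdi, msg=%rsi, len=%rdx)
# Process len bytes (a multiple of 16) one block at a time, as in
# poly1305_block_avx. The 2^128 pad bit of each full block is folded in
# implicitly: the multiple tables are indexed one entry higher (base
# 360/416 instead of 352/408), i.e. with h[2] + 1.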
pushq %r15
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
movq %rdx, %rcx
movq (%rdi), %r15
movq 8(%rdi), %rbx
movq 24(%rdi), %r8
movq 32(%rdi), %r9
movq 40(%rdi), %r10
L_poly1305_avx_blocks_start:
# h += m
movq (%rsi), %r11
movq 8(%rsi), %r12
addq %r11, %r8
adcq %r12, %r9
movq %rbx, %rax
adcq $0x00, %r10
# r[1] * h[0] => rdx, rax ==> t2, t1
mulq %r8
movq %rax, %r12
movq %rdx, %r13
# r[0] * h[1] => rdx, rax ++> t2, t1
movq %r15, %rax
mulq %r9
addq %rax, %r12
movq %r15, %rax
adcq %rdx, %r13
# r[0] * h[0] => rdx, rax ==> t4, t0
mulq %r8
movq %rax, %r11
movq %rdx, %r8
# r[1] * h[1] => rdx, rax =+> t3, t2
movq %rbx, %rax
mulq %r9
# r[0] * h[2] +> t2
addq 360(%rdi,%r10,8), %r13
movq %rdx, %r14
addq %r8, %r12
adcq %rax, %r13
# r[1] * h[2] +> t3
adcq 416(%rdi,%r10,8), %r14
# r * h in r14, r13, r12, r11
# h = (r * h) mod 2^130 - 5
movq %r13, %r10
andq $-4, %r13
andq $3, %r10
addq %r13, %r11
movq %r13, %r8
adcq %r14, %r12
adcq $0x00, %r10
shrdq $2, %r14, %r8
shrq $2, %r14
addq %r11, %r8
adcq %r14, %r12
movq %r12, %r9
adcq $0x00, %r10
# h in r10, r9, r8
# Next block from message
addq $16, %rsi
subq $16, %rcx
jg L_poly1305_avx_blocks_start
# Store h to ctx
movq %r8, 24(%rdi)
movq %r9, 32(%rdi)
movq %r10, 40(%rdi)
popq %r14
popq %r13
popq %r12
popq %rbx
popq %r15
repz retq
#ifndef __APPLE__
.size poly1305_blocks_avx,.-poly1305_blocks_avx
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_final_avx
.type poly1305_final_avx,@function
.align 16
poly1305_final_avx:
#else
.section __TEXT,__text
.globl _poly1305_final_avx
.p2align 4
_poly1305_final_avx:
#endif /* __APPLE__ */
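# poly1305_final_avx(ctx=%rdi, mac=%rsi)
# Pad any buffered bytes at 480 (count at 608) with a 0x01 byte and
# zeros, clear the pad-bit flag and process that last block, then reduce
# h modulo 2^130 - 5, add the 128-bit pad and store the 16-byte tag.
# r, h and the pad are zeroed afterwards.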
pushq %rbx
pushq %r12
movq %rsi, %rbx
movq 608(%rdi), %rax
testq %rax, %rax
je L_poly1305_avx_final_no_more
movb $0x01, 480(%rdi,%rax,1)
jmp L_poly1305_avx_final_cmp_rem
L_poly1305_avx_final_zero_rem:
movb $0x00, 480(%rdi,%rax,1)
L_poly1305_avx_final_cmp_rem:
incb %al
cmpq $16, %rax
jl L_poly1305_avx_final_zero_rem
movb $0x00, 616(%rdi)
leaq 480(%rdi), %rsi
#ifndef __APPLE__
callq poly1305_block_avx@plt
#else
callq _poly1305_block_avx
#endif /* __APPLE__ */
L_poly1305_avx_final_no_more:
movq 24(%rdi), %rax
movq 32(%rdi), %rdx
movq 40(%rdi), %rcx
movq 48(%rdi), %r11
movq 56(%rdi), %r12
# h %= p
# h = (h + pad)
# mod 2^130 - 5
movq %rcx, %r8
andq $3, %rcx
shrq $2, %r8
# Multiply by 5
leaq 0(%r8,%r8,4), %r8
addq %r8, %rax
adcq $0x00, %rdx
adcq $0x00, %rcx
# Fixup when h is between (1 << 130) - 5 and (1 << 130) - 1
movq %rax, %r8
movq %rdx, %r9
movq %rcx, %r10
addq $5, %r8
adcq $0x00, %r9
adcq $0x00, %r10
cmpq $4, %r10
cmoveq %r8, %rax
cmoveq %r9, %rdx
# h += pad
addq %r11, %rax
adcq %r12, %rdx
movq %rax, (%rbx)
movq %rdx, 8(%rbx)
# Zero out r
movq $0x00, (%rdi)
movq $0x00, 8(%rdi)
# Zero out h
movq $0x00, 24(%rdi)
movq $0x00, 32(%rdi)
movq $0x00, 40(%rdi)
# Zero out pad
movq $0x00, 48(%rdi)
movq $0x00, 56(%rdi)
popq %r12
popq %rbx
repz retq
#ifndef __APPLE__
.size poly1305_final_avx,.-poly1305_final_avx
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX1 */
#ifdef HAVE_INTEL_AVX2
#ifndef __APPLE__
.text
.globl poly1305_calc_powers_avx2
.type poly1305_calc_powers_avx2,@function
.align 16
poly1305_calc_powers_avx2:
#else
.section __TEXT,__text
.globl _poly1305_calc_powers_avx2
.p2align 4
_poly1305_calc_powers_avx2:
#endif /* __APPLE__ */
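# poly1305_calc_powers_avx2(ctx=%rdi)
# Compute r^2, r^3 and r^4 modulo 2^130 - 5 from r at 0/8 and store r,
# r^2, r^3 and r^4 at 224, 256, 288 and 320 as five 26-bit limbs in
# 32-bit words, the form used by the 4-way AVX2 block code.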
pushq %r12
pushq %r13
pushq %r14
pushq %r15
pushq %rbx
pushq %rbp
movq (%rdi), %rcx
movq 8(%rdi), %r8
xorq %r9, %r9
# Convert to 26 bits in 32
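# Limb i = (value >> (26 * i)) & 0x3ffffff; the shrd/shr below pick out
# limbs 1..4 across the 64-bit word boundaries, and each limb is stored
# in its own 32-bit word.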
movq %rcx, %rax
movq %rcx, %rdx
movq %rcx, %rsi
movq %r8, %rbx
movq %r8, %rbp
shrq $26, %rdx
shrdq $52, %r8, %rsi
shrq $14, %rbx
shrdq $40, %r9, %rbp
andq $0x3ffffff, %rax
andq $0x3ffffff, %rdx
andq $0x3ffffff, %rsi
andq $0x3ffffff, %rbx
andq $0x3ffffff, %rbp
movl %eax, 224(%rdi)
movl %edx, 228(%rdi)
movl %esi, 232(%rdi)
movl %ebx, 236(%rdi)
movl %ebp, 240(%rdi)
movl $0x00, 244(%rdi)
# Square 128-bit
movq %r8, %rax
mulq %rcx
xorq %r13, %r13
movq %rax, %r11
movq %rdx, %r12
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
movq %rcx, %rax
mulq %rax
movq %rax, %r10
movq %rdx, %r15
movq %r8, %rax
mulq %rax
addq %r15, %r11
adcq %rax, %r12
adcq %rdx, %r13
# Reduce 256-bit to 130-bit
movq %r12, %rax
movq %r13, %rdx
andq $-4, %rax
andq $3, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
shrdq $2, %rdx, %rax
shrq $2, %rdx
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
movq %r12, %rax
shrq $2, %rax
leaq 0(%rax,%rax,4), %rax
andq $3, %r12
addq %rax, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Convert to 26 bits in 32
movq %r10, %rax
movq %r10, %rdx
movq %r10, %rsi
movq %r11, %rbx
movq %r11, %rbp
shrq $26, %rdx
shrdq $52, %r11, %rsi
shrq $14, %rbx
shrdq $40, %r12, %rbp
andq $0x3ffffff, %rax
andq $0x3ffffff, %rdx
andq $0x3ffffff, %rsi
andq $0x3ffffff, %rbx
andq $0x3ffffff, %rbp
movl %eax, 256(%rdi)
movl %edx, 260(%rdi)
movl %esi, 264(%rdi)
movl %ebx, 268(%rdi)
movl %ebp, 272(%rdi)
movl $0x00, 276(%rdi)
# Multiply 128-bit by 130-bit
# r1[0] * r2[0]
movq %rcx, %rax
mulq %r10
movq %rax, %r13
movq %rdx, %r14
# r1[0] * r2[1]
movq %rcx, %rax
mulq %r11
movq $0x00, %r15
addq %rax, %r14
adcq %rdx, %r15
# r1[1] * r2[0]
movq %r8, %rax
mulq %r10
movq $0x00, %rsi
addq %rax, %r14
adcq %rdx, %r15
adcq $0x00, %rsi
# r1[0] * r2[2]
movq %rcx, %rax
mulq %r12
addq %rax, %r15
adcq %rdx, %rsi
# r1[1] * r2[1]
movq %r8, %rax
mulq %r11
movq $0x00, %rbx
addq %rax, %r15
adcq %rdx, %rsi
adcq $0x00, %rbx
# r1[1] * r2[2]
movq %r8, %rax
mulq %r12
addq %rax, %rsi
adcq %rdx, %rbx
# Reduce 260-bit to 130-bit
movq %r15, %rax
movq %rsi, %rdx
movq %rbx, %rbx
andq $-4, %rax
andq $3, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq %rbx, %r15
shrdq $2, %rdx, %rax
shrdq $2, %rbx, %rdx
shrq $2, %rbx
addq %rax, %r13
adcq %rdx, %r14
adcq %rbx, %r15
movq %r15, %rax
andq $3, %r15
shrq $2, %rax
leaq 0(%rax,%rax,4), %rax
addq %rax, %r13
adcq $0x00, %r14
adcq $0x00, %r15
# Convert to 26 bits in 32
movq %r13, %rax
movq %r13, %rdx
movq %r13, %rsi
movq %r14, %rbx
movq %r14, %rbp
shrq $26, %rdx
shrdq $52, %r14, %rsi
shrq $14, %rbx
shrdq $40, %r15, %rbp
andq $0x3ffffff, %rax
andq $0x3ffffff, %rdx
andq $0x3ffffff, %rsi
andq $0x3ffffff, %rbx
andq $0x3ffffff, %rbp
movl %eax, 288(%rdi)
movl %edx, 292(%rdi)
movl %esi, 296(%rdi)
movl %ebx, 300(%rdi)
movl %ebp, 304(%rdi)
movl $0x00, 308(%rdi)
# Square 130-bit
movq %r11, %rax
mulq %r10
xorq %r13, %r13
movq %rax, %r8
movq %rdx, %r9
addq %rax, %r8
adcq %rdx, %r9
adcq $0x00, %r13
movq %r10, %rax
mulq %rax
movq %rax, %rcx
movq %rdx, %r15
movq %r11, %rax
mulq %rax
addq %r15, %r8
adcq %rax, %r9
adcq %rdx, %r13
movq %r12, %rax
mulq %rax
movq %rax, %r14
movq %r12, %rax
mulq %r10
addq %rax, %r9
adcq %rdx, %r13
adcq $0x00, %r14
addq %rax, %r9
adcq %rdx, %r13
adcq $0x00, %r14
movq %r12, %rax
mulq %r11
addq %rax, %r13
adcq %rdx, %r14
addq %rax, %r13
adcq %rdx, %r14
# Reduce 260-bit to 130-bit
movq %r9, %rax
movq %r13, %rdx
movq %r14, %r15
andq $-4, %rax
andq $3, %r9
addq %rax, %rcx
adcq %rdx, %r8
adcq %r15, %r9
shrdq $2, %rdx, %rax
shrdq $2, %r15, %rdx
shrq $2, %r15
addq %rax, %rcx
adcq %rdx, %r8
adcq %r15, %r9
movq %r9, %rax
andq $3, %r9
shrq $2, %rax
leaq 0(%rax,%rax,4), %rax
addq %rax, %rcx
adcq $0x00, %r8
adcq $0x00, %r9
# Convert to 26 bits in 32
movq %rcx, %rax
movq %rcx, %rdx
movq %rcx, %rsi
movq %r8, %rbx
movq %r8, %rbp
shrq $26, %rdx
shrdq $52, %r8, %rsi
shrq $14, %rbx
shrdq $40, %r9, %rbp
andq $0x3ffffff, %rax
andq $0x3ffffff, %rdx
andq $0x3ffffff, %rsi
andq $0x3ffffff, %rbx
andq $0x3ffffff, %rbp
movl %eax, 320(%rdi)
movl %edx, 324(%rdi)
movl %esi, 328(%rdi)
movl %ebx, 332(%rdi)
movl %ebp, 336(%rdi)
movl $0x00, 340(%rdi)
popq %rbp
popq %rbx
popq %r15
popq %r14
popq %r13
popq %r12
repz retq
#ifndef __APPLE__
.size poly1305_calc_powers_avx2,.-poly1305_calc_powers_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_setkey_avx2
.type poly1305_setkey_avx2,@function
.align 16
poly1305_setkey_avx2:
#else
.section __TEXT,__text
.globl _poly1305_setkey_avx2
.p2align 4
_poly1305_setkey_avx2:
#endif /* __APPLE__ */
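# poly1305_setkey_avx2(ctx=%rdi, key=%rsi)
# Set up the scalar context via poly1305_setkey_avx, then clear the
# four-lane H accumulators at 64..223 and reset the buffered-byte count
# (608) and the two state flags (616/617).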
#ifndef __APPLE__
callq poly1305_setkey_avx@plt
#else
callq _poly1305_setkey_avx
#endif /* __APPLE__ */
vpxor %ymm0, %ymm0, %ymm0
vmovdqu %ymm0, 64(%rdi)
vmovdqu %ymm0, 96(%rdi)
vmovdqu %ymm0, 128(%rdi)
vmovdqu %ymm0, 160(%rdi)
vmovdqu %ymm0, 192(%rdi)
movq $0x00, 608(%rdi)
movw $0x00, 616(%rdi)
repz retq
#ifndef __APPLE__
.size poly1305_setkey_avx2,.-poly1305_setkey_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 32
#else
.p2align 5
#endif /* __APPLE__ */
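# Mask of the low 26 bits in each 64-bit lane.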
L_poly1305_avx2_blocks_mask:
.quad 0x3ffffff, 0x3ffffff
.quad 0x3ffffff, 0x3ffffff
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 32
#else
.p2align 5
#endif /* __APPLE__ */
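# 2^24 in each 64-bit lane: the 2^128 pad bit of a 16-byte block,
# expressed in the top (fifth) 26-bit limb.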
L_poly1305_avx2_blocks_hibit:
.quad 0x1000000, 0x1000000
.quad 0x1000000, 0x1000000
#ifndef __APPLE__
.text
.globl poly1305_blocks_avx2
.type poly1305_blocks_avx2,@function
.align 16
poly1305_blocks_avx2:
#else
.section __TEXT,__text
.globl _poly1305_blocks_avx2
.p2align 4
_poly1305_blocks_avx2:
#endif /* __APPLE__ */
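# poly1305_blocks_avx2(ctx=%rdi, msg=%rsi, len=%rdx)
# Process len bytes 64 (four blocks) at a time, keeping four interleaved
# accumulators as five 26-bit limbs in ymm0..ymm4. An aligned scratch
# area on the stack caches the powers of r one limb at a time across all
# four lanes (%rcx) and five times limbs 1..4 of those powers (%rbx);
# %rax points at the H state saved in the context.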
pushq %r12
pushq %rbx
subq $0x140, %rsp
movq %rsp, %rcx
andq $-32, %rcx
addq $32, %rcx
vpxor %ymm15, %ymm15, %ymm15
movq %rcx, %rbx
leaq 64(%rdi), %rax
addq $0xa0, %rbx
cmpw $0x00, 616(%rdi)
jne L_poly1305_avx2_blocks_begin_h
# Load the message data
vmovdqu (%rsi), %ymm0
vmovdqu 32(%rsi), %ymm1
vperm2i128 $32, %ymm1, %ymm0, %ymm2
vperm2i128 $49, %ymm1, %ymm0, %ymm0
vpunpckldq %ymm0, %ymm2, %ymm1
vpunpckhdq %ymm0, %ymm2, %ymm3
vpunpckldq %ymm15, %ymm1, %ymm0
vpunpckhdq %ymm15, %ymm1, %ymm1
vpunpckldq %ymm15, %ymm3, %ymm2
vpunpckhdq %ymm15, %ymm3, %ymm3
vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm4
vpsllq $6, %ymm1, %ymm1
vpsllq $12, %ymm2, %ymm2
vpsllq $18, %ymm3, %ymm3
vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14
# Reduce, in place, the message data
vpsrlq $26, %ymm0, %ymm10
vpsrlq $26, %ymm3, %ymm11
vpand %ymm14, %ymm0, %ymm0
vpand %ymm14, %ymm3, %ymm3
vpaddq %ymm1, %ymm10, %ymm1
vpaddq %ymm4, %ymm11, %ymm4
vpsrlq $26, %ymm1, %ymm10
vpsrlq $26, %ymm4, %ymm11
vpand %ymm14, %ymm1, %ymm1
vpand %ymm14, %ymm4, %ymm4
vpaddq %ymm2, %ymm10, %ymm2
vpslld $2, %ymm11, %ymm12
vpaddd %ymm12, %ymm11, %ymm12
vpsrlq $26, %ymm2, %ymm10
vpaddq %ymm0, %ymm12, %ymm0
vpsrlq $26, %ymm0, %ymm11
vpand %ymm14, %ymm2, %ymm2
vpand %ymm14, %ymm0, %ymm0
vpaddq %ymm3, %ymm10, %ymm3
vpaddq %ymm1, %ymm11, %ymm1
vpsrlq $26, %ymm3, %ymm10
vpand %ymm14, %ymm3, %ymm3
vpaddq %ymm4, %ymm10, %ymm4
addq $0x40, %rsi
subq $0x40, %rdx
jz L_poly1305_avx2_blocks_store
jmp L_poly1305_avx2_blocks_load_r4
L_poly1305_avx2_blocks_begin_h:
# Load the H values.
vmovdqu (%rax), %ymm0
vmovdqu 32(%rax), %ymm1
vmovdqu 64(%rax), %ymm2
vmovdqu 96(%rax), %ymm3
vmovdqu 128(%rax), %ymm4
# Check if there is a power of r to load - otherwise use r^4.
cmpb $0x00, 616(%rdi)
je L_poly1305_avx2_blocks_load_r4
# Load the 4 powers of r - r^4, r^3, r^2, r^1.
vmovdqu 224(%rdi), %ymm8
vmovdqu 256(%rdi), %ymm7
vmovdqu 288(%rdi), %ymm6
vmovdqu 320(%rdi), %ymm5
vpermq $0xd8, %ymm5, %ymm5
vpermq $0xd8, %ymm6, %ymm6
vpermq $0xd8, %ymm7, %ymm7
vpermq $0xd8, %ymm8, %ymm8
vpunpcklqdq %ymm6, %ymm5, %ymm10
vpunpckhqdq %ymm6, %ymm5, %ymm11
vpunpcklqdq %ymm8, %ymm7, %ymm12
vpunpckhqdq %ymm8, %ymm7, %ymm13
vperm2i128 $32, %ymm12, %ymm10, %ymm5
vperm2i128 $49, %ymm12, %ymm10, %ymm7
vperm2i128 $32, %ymm13, %ymm11, %ymm9
vpsrlq $32, %ymm5, %ymm6
vpsrlq $32, %ymm7, %ymm8
jmp L_poly1305_avx2_blocks_mul_5
L_poly1305_avx2_blocks_load_r4:
# Load r^4 into all four positions.
vmovdqu 320(%rdi), %ymm13
vpermq $0x00, %ymm13, %ymm5
vpsrlq $32, %ymm13, %ymm14
vpermq $0x55, %ymm13, %ymm7
vpermq $0xaa, %ymm13, %ymm9
vpermq $0x00, %ymm14, %ymm6
vpermq $0x55, %ymm14, %ymm8
L_poly1305_avx2_blocks_mul_5:
# Multiply the top 4 26-bit limbs of each power of r by 5
vpslld $2, %ymm6, %ymm10
vpslld $2, %ymm7, %ymm11
vpslld $2, %ymm8, %ymm12
vpslld $2, %ymm9, %ymm13
vpaddq %ymm10, %ymm6, %ymm10
vpaddq %ymm11, %ymm7, %ymm11
vpaddq %ymm12, %ymm8, %ymm12
vpaddq %ymm13, %ymm9, %ymm13
# Store powers of r and multiple of 5 for use in multiply.
vmovdqa %ymm10, (%rbx)
vmovdqa %ymm11, 32(%rbx)
vmovdqa %ymm12, 64(%rbx)
vmovdqa %ymm13, 96(%rbx)
vmovdqa %ymm5, (%rcx)
vmovdqa %ymm6, 32(%rcx)
vmovdqa %ymm7, 64(%rcx)
vmovdqa %ymm8, 96(%rcx)
vmovdqa %ymm9, 128(%rcx)
vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14
# If not finished then loop over data
cmpb $0x01, 616(%rdi)
jne L_poly1305_avx2_blocks_start
# Do last multiply, reduce, add the four H together and move to
# 32-bit registers
vpmuludq (%rbx), %ymm4, %ymm5
vpmuludq 32(%rbx), %ymm3, %ymm10
vpmuludq 32(%rbx), %ymm4, %ymm6
vpmuludq 64(%rbx), %ymm3, %ymm11
vpmuludq 64(%rbx), %ymm4, %ymm7
vpaddq %ymm5, %ymm10, %ymm5
vpmuludq 64(%rbx), %ymm2, %ymm12
vpmuludq 96(%rbx), %ymm4, %ymm8
vpaddq %ymm6, %ymm11, %ymm6
vpmuludq 96(%rbx), %ymm1, %ymm13
vpmuludq 96(%rbx), %ymm2, %ymm10
vpaddq %ymm5, %ymm12, %ymm5
vpmuludq 96(%rbx), %ymm3, %ymm11
vpmuludq (%rcx), %ymm3, %ymm12
vpaddq %ymm5, %ymm13, %ymm5
vpmuludq (%rcx), %ymm4, %ymm9
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq (%rcx), %ymm0, %ymm13
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq (%rcx), %ymm1, %ymm10
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq (%rcx), %ymm2, %ymm11
vpmuludq 32(%rcx), %ymm2, %ymm12
vpaddq %ymm5, %ymm13, %ymm5
vpmuludq 32(%rcx), %ymm3, %ymm13
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq 32(%rcx), %ymm0, %ymm10
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq 32(%rcx), %ymm1, %ymm11
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq 64(%rcx), %ymm1, %ymm12
vpaddq %ymm9, %ymm13, %ymm9
vpmuludq 64(%rcx), %ymm2, %ymm13
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq 64(%rcx), %ymm0, %ymm10
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq 96(%rcx), %ymm0, %ymm11
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq 96(%rcx), %ymm1, %ymm12
vpaddq %ymm9, %ymm13, %ymm9
vpaddq %ymm7, %ymm10, %ymm7
vpmuludq 128(%rcx), %ymm0, %ymm13
vpaddq %ymm8, %ymm11, %ymm8
vpaddq %ymm9, %ymm12, %ymm9
vpaddq %ymm9, %ymm13, %ymm9
vpsrlq $26, %ymm5, %ymm10
vpsrlq $26, %ymm8, %ymm11
vpand %ymm14, %ymm5, %ymm5
vpand %ymm14, %ymm8, %ymm8
vpaddq %ymm6, %ymm10, %ymm6
vpaddq %ymm9, %ymm11, %ymm9
vpsrlq $26, %ymm6, %ymm10
vpsrlq $26, %ymm9, %ymm11
vpand %ymm14, %ymm6, %ymm1
vpand %ymm14, %ymm9, %ymm4
vpaddq %ymm7, %ymm10, %ymm7
vpslld $2, %ymm11, %ymm12
vpaddd %ymm12, %ymm11, %ymm12
vpsrlq $26, %ymm7, %ymm10
vpaddq %ymm5, %ymm12, %ymm5
vpsrlq $26, %ymm5, %ymm11
vpand %ymm14, %ymm7, %ymm2
vpand %ymm14, %ymm5, %ymm0
vpaddq %ymm8, %ymm10, %ymm8
vpaddq %ymm1, %ymm11, %ymm1
vpsrlq $26, %ymm8, %ymm10
vpand %ymm14, %ymm8, %ymm3
vpaddq %ymm4, %ymm10, %ymm4
vpsrldq $8, %ymm0, %ymm5
vpsrldq $8, %ymm1, %ymm6
vpsrldq $8, %ymm2, %ymm7
vpsrldq $8, %ymm3, %ymm8
vpsrldq $8, %ymm4, %ymm9
vpaddq %ymm0, %ymm5, %ymm0
vpaddq %ymm1, %ymm6, %ymm1
vpaddq %ymm2, %ymm7, %ymm2
vpaddq %ymm3, %ymm8, %ymm3
vpaddq %ymm4, %ymm9, %ymm4
vpermq $2, %ymm0, %ymm5
vpermq $2, %ymm1, %ymm6
vpermq $2, %ymm2, %ymm7
vpermq $2, %ymm3, %ymm8
vpermq $2, %ymm4, %ymm9
vpaddq %ymm0, %ymm5, %ymm0
vpaddq %ymm1, %ymm6, %ymm1
vpaddq %ymm2, %ymm7, %ymm2
vpaddq %ymm3, %ymm8, %ymm3
vpaddq %ymm4, %ymm9, %ymm4
vmovd %xmm0, %r8d
vmovd %xmm1, %r9d
vmovd %xmm2, %r10d
vmovd %xmm3, %r11d
vmovd %xmm4, %r12d
jmp L_poly1305_avx2_blocks_end_calc
L_poly1305_avx2_blocks_start:
vmovdqu (%rsi), %ymm5
vmovdqu 32(%rsi), %ymm6
vperm2i128 $32, %ymm6, %ymm5, %ymm7
vperm2i128 $49, %ymm6, %ymm5, %ymm5
vpunpckldq %ymm5, %ymm7, %ymm6
vpunpckhdq %ymm5, %ymm7, %ymm8
vpunpckldq %ymm15, %ymm6, %ymm5
vpunpckhdq %ymm15, %ymm6, %ymm6
vpunpckldq %ymm15, %ymm8, %ymm7
vpunpckhdq %ymm15, %ymm8, %ymm8
vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm9
vpsllq $6, %ymm6, %ymm6
vpsllq $12, %ymm7, %ymm7
vpsllq $18, %ymm8, %ymm8
vpmuludq (%rbx), %ymm4, %ymm10
vpaddq %ymm5, %ymm10, %ymm5
vpmuludq 32(%rbx), %ymm3, %ymm10
vpmuludq 32(%rbx), %ymm4, %ymm11
vpaddq %ymm6, %ymm11, %ymm6
vpmuludq 64(%rbx), %ymm3, %ymm11
vpmuludq 64(%rbx), %ymm4, %ymm12
vpaddq %ymm7, %ymm12, %ymm7
vpaddq %ymm5, %ymm10, %ymm5
vpmuludq 64(%rbx), %ymm2, %ymm12
vpmuludq 96(%rbx), %ymm4, %ymm13
vpaddq %ymm8, %ymm13, %ymm8
vpaddq %ymm6, %ymm11, %ymm6
vpmuludq 96(%rbx), %ymm1, %ymm13
vpmuludq 96(%rbx), %ymm2, %ymm10
vpaddq %ymm5, %ymm12, %ymm5
vpmuludq 96(%rbx), %ymm3, %ymm11
vpmuludq (%rcx), %ymm3, %ymm12
vpaddq %ymm5, %ymm13, %ymm5
vpmuludq (%rcx), %ymm4, %ymm13
vpaddq %ymm9, %ymm13, %ymm9
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq (%rcx), %ymm0, %ymm13
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq (%rcx), %ymm1, %ymm10
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq (%rcx), %ymm2, %ymm11
vpmuludq 32(%rcx), %ymm2, %ymm12
vpaddq %ymm5, %ymm13, %ymm5
vpmuludq 32(%rcx), %ymm3, %ymm13
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq 32(%rcx), %ymm0, %ymm10
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq 32(%rcx), %ymm1, %ymm11
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq 64(%rcx), %ymm1, %ymm12
vpaddq %ymm9, %ymm13, %ymm9
vpmuludq 64(%rcx), %ymm2, %ymm13
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq 64(%rcx), %ymm0, %ymm10
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq 96(%rcx), %ymm0, %ymm11
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq 96(%rcx), %ymm1, %ymm12
vpaddq %ymm9, %ymm13, %ymm9
vpaddq %ymm7, %ymm10, %ymm7
vpmuludq 128(%rcx), %ymm0, %ymm13
vpaddq %ymm8, %ymm11, %ymm8
vpaddq %ymm9, %ymm12, %ymm9
vpaddq %ymm9, %ymm13, %ymm9
vpsrlq $26, %ymm5, %ymm10
vpsrlq $26, %ymm8, %ymm11
vpand %ymm14, %ymm5, %ymm5
vpand %ymm14, %ymm8, %ymm8
vpaddq %ymm6, %ymm10, %ymm6
vpaddq %ymm9, %ymm11, %ymm9
vpsrlq $26, %ymm6, %ymm10
vpsrlq $26, %ymm9, %ymm11
vpand %ymm14, %ymm6, %ymm1
vpand %ymm14, %ymm9, %ymm4
vpaddq %ymm7, %ymm10, %ymm7
vpslld $2, %ymm11, %ymm12
vpaddd %ymm12, %ymm11, %ymm12
vpsrlq $26, %ymm7, %ymm10
vpaddq %ymm5, %ymm12, %ymm5
vpsrlq $26, %ymm5, %ymm11
vpand %ymm14, %ymm7, %ymm2
vpand %ymm14, %ymm5, %ymm0
vpaddq %ymm8, %ymm10, %ymm8
vpaddq %ymm1, %ymm11, %ymm1
vpsrlq $26, %ymm8, %ymm10
vpand %ymm14, %ymm8, %ymm3
vpaddq %ymm4, %ymm10, %ymm4
addq $0x40, %rsi
subq $0x40, %rdx
jnz L_poly1305_avx2_blocks_start
L_poly1305_avx2_blocks_store:
# Store four H values - state
vmovdqu %ymm0, (%rax)
vmovdqu %ymm1, 32(%rax)
vmovdqu %ymm2, 64(%rax)
vmovdqu %ymm3, 96(%rax)
vmovdqu %ymm4, 128(%rax)
L_poly1305_avx2_blocks_end_calc:
cmpb $0x00, 616(%rdi)
je L_poly1305_avx2_blocks_complete
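# Recombine the 26-bit limb sums (r8..r12) into 64-bit h[0..2]:
# h = l0 + l1*2^26 + l2*2^52 + l3*2^78 + l4*2^104, then fold the bits at
# 2^130 and above back in multiplied by 5.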
movq %r8, %rax
movq %r10, %rdx
movq %r12, %rcx
shrq $12, %rdx
shrq $24, %rcx
shlq $26, %r9
shlq $52, %r10
shlq $14, %r11
shlq $40, %r12
addq %r9, %rax
adcq %r10, %rax
adcq %r11, %rdx
adcq %r12, %rdx
adcq $0x00, %rcx
movq %rcx, %r8
andq $3, %rcx
shrq $2, %r8
leaq 0(%r8,%r8,4), %r8
addq %r8, %rax
adcq $0x00, %rdx
adcq $0x00, %rcx
movq %rax, 24(%rdi)
movq %rdx, 32(%rdi)
movq %rcx, 40(%rdi)
L_poly1305_avx2_blocks_complete:
movb $0x01, 617(%rdi)
addq $0x140, %rsp
popq %rbx
popq %r12
repz retq
#ifndef __APPLE__
.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_final_avx2
.type poly1305_final_avx2,@function
.align 16
poly1305_final_avx2:
#else
.section __TEXT,__text
.globl _poly1305_final_avx2
.p2align 4
_poly1305_final_avx2:
#endif /* __APPLE__ */
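# poly1305_final_avx2(ctx=%rdi, mac=%rsi)
# Mark the state as finishing and, if the 4-way accumulators are in use
# (617), call poly1305_blocks_avx2 once (length 64, no message read on
# that path) to fold the four lanes into h. Whole 16-byte blocks left in
# the 480 buffer go through poly1305_blocks_avx, remaining bytes are
# moved to the front of the buffer, and poly1305_final_avx writes the
# tag. The AVX2 H state and powers of r are then cleared.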
movb $0x01, 616(%rdi)
movb 617(%rdi), %cl
cmpb $0x00, %cl
je L_poly1305_avx2_final_done_blocks_X4
pushq %rsi
movq $0x40, %rdx
xorq %rsi, %rsi
#ifndef __APPLE__
callq poly1305_blocks_avx2@plt
#else
callq _poly1305_blocks_avx2
#endif /* __APPLE__ */
popq %rsi
L_poly1305_avx2_final_done_blocks_X4:
movq 608(%rdi), %rax
movq %rax, %rcx
andq $-16, %rcx
cmpb $0x00, %cl
je L_poly1305_avx2_final_done_blocks
pushq %rcx
pushq %rax
pushq %rsi
movq %rcx, %rdx
leaq 480(%rdi), %rsi
#ifndef __APPLE__
callq poly1305_blocks_avx@plt
#else
callq _poly1305_blocks_avx
#endif /* __APPLE__ */
popq %rsi
popq %rax
popq %rcx
L_poly1305_avx2_final_done_blocks:
subq %rcx, 608(%rdi)
xorq %rdx, %rdx
jmp L_poly1305_avx2_final_cmp_copy
L_poly1305_avx2_final_start_copy:
movb 480(%rdi,%rcx,1), %r8b
movb %r8b, 480(%rdi,%rdx,1)
incb %cl
incb %dl
L_poly1305_avx2_final_cmp_copy:
cmp %rcx, %rax
jne L_poly1305_avx2_final_start_copy
#ifndef __APPLE__
callq poly1305_final_avx@plt
#else
callq _poly1305_final_avx
#endif /* __APPLE__ */
vpxor %ymm0, %ymm0, %ymm0
vmovdqu %ymm0, 64(%rdi)
vmovdqu %ymm0, 96(%rdi)
vmovdqu %ymm0, 128(%rdi)
vmovdqu %ymm0, 160(%rdi)
vmovdqu %ymm0, 192(%rdi)
vmovdqu %ymm0, 224(%rdi)
vmovdqu %ymm0, 256(%rdi)
vmovdqu %ymm0, 288(%rdi)
vmovdqu %ymm0, 320(%rdi)
movq $0x00, 608(%rdi)
movw $0x00, 616(%rdi)
repz retq
#ifndef __APPLE__
.size poly1305_final_avx2,.-poly1305_final_avx2
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
#endif /* WOLFSSL_X86_64_BUILD */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif