# addmulmod.q

  # crypto_onetimeauth_poly1305_neon2_addmulmod
  #
  # Straight-line routine computing out = (x + c) * y for two independent
  # values at once.  Each value is five limbs in radix 2^26; products that
  # overflow past limb 4 are folded back multiplied by 5 (2^130 = 5), i.e.
  # the arithmetic is modulo 2^130 - 5.  The result is carried but not put
  # into a canonical fully reduced form: limb 1 and limb 4 receive a final
  # carry and are not masked again afterwards.
  #
  #   input_0  destination; 16 + 16 + 8 bytes are stored
  #   input_1  x;           16 + 16 + 8 bytes are loaded
  #   input_2  y;           16 + 16 + 8 bytes are loaded
  #   input_3  c;           16 + 16 + 8 bytes are loaded
  #
  # Each of the four pointers is advanced by 32 along the way.
  # Every memory access uses the "aligned=" form.
  # NOTE(review): confirm the exact alignment these forms require of callers.
  #
  # Data layout, as implied by the multiply indices below: a "..01" register
  # holds limb 0 of both values in words [0],[1] and limb 1 in words [2],[3];
  # a "..23" register does the same for limbs 2 and 3; a "..4" register
  # holds limb 4 in words [0],[1].  The output is written in the same layout.
  #
  # The instruction order is an interleaved schedule; do not reorder.

  # Product accumulators: limb k of both results, one 64-bit sum per value.
  reg128 r0
  reg128 r1
  reg128 r2
  reg128 r3
  reg128 r4
  # x, then x + c; reused at the end to assemble the output.
  reg128 x01
  reg128 x23
  reg128 x4
  # y.
  reg128 y01
  reg128 y23
  reg128 y4
  # 5 * y, used for the terms that wrap around past limb 4.
  reg128 _5y01
  reg128 _5y23
  reg128 _5y4
  # c.
  reg128 c01
  reg128 c23
  reg128 c4
  # Carries out of r0..r4.
  reg128 t0
  reg128 t1
  reg128 t2
  reg128 t3
  reg128 t4
  # 2^26 - 1 in each 64-bit half (0xffffffff shifted right by 6).
  reg128 mask

  enter crypto_onetimeauth_poly1305_neon2_addmulmod

  # Load y, x and c; form 5*y as (y << 2) + y; form x + c; finish the mask.
  # The mem64 loads fetch 64 bits; the trailing "reg[1]" presumably means
  # the upper 64-bit half of the register is left as it was (TODO confirm
  # against the qhasm machine description).  Only words [0],[1] of y4, _5y4
  # and x4 are read by the multiplies below.
  2x mask = 0xffffffff
  y01 aligned= mem128[input_2];input_2+=16
  4x _5y01 = y01 << 2
  y23 aligned= mem128[input_2];input_2+=16
  4x _5y23 = y23 << 2
  y4 aligned= mem64[input_2]y4[1]
  4x _5y4 = y4 << 2
  x01 aligned= mem128[input_1];input_1+=16
  4x _5y01 += y01
  x23 aligned= mem128[input_1];input_1+=16
  4x _5y23 += y23
  4x _5y4 += y4
  c01 aligned= mem128[input_3];input_3+=16
  4x x01 += c01
  c23 aligned= mem128[input_3];input_3+=16
  4x x23 += c23
  x4 aligned= mem64[input_1]x4[1]
  2x mask unsigned>>=6
  c4 aligned= mem64[input_3]c4[1]
  4x x4 += c4

  # r0 = x0*y0 + x1*5y4 + x2*5y3 + x3*5y2 + x4*5y1
  r0[0,1] = x01[0] unsigned* y01[0]; r0[2,3] = x01[1] unsigned* y01[1]
  r0[0,1] += x01[2] unsigned* _5y4[0]; r0[2,3] += x01[3] unsigned* _5y4[1]
  r0[0,1] += x23[0] unsigned* _5y23[2]; r0[2,3] += x23[1] unsigned* _5y23[3]
  r0[0,1] += x23[2] unsigned* _5y23[0]; r0[2,3] += x23[3] unsigned* _5y23[1]
  r0[0,1] += x4[0] unsigned* _5y01[2]; r0[2,3] += x4[1] unsigned* _5y01[3]

  # r1 = x0*y1 + x1*y0 + x2*5y4 + x3*5y3 + x4*5y2
  r1[0,1] = x01[0] unsigned* y01[2]; r1[2,3] = x01[1] unsigned* y01[3]
  r1[0,1] += x01[2] unsigned* y01[0]; r1[2,3] += x01[3] unsigned* y01[1]
  r1[0,1] += x23[0] unsigned* _5y4[0]; r1[2,3] += x23[1] unsigned* _5y4[1]
  r1[0,1] += x23[2] unsigned* _5y23[2]; r1[2,3] += x23[3] unsigned* _5y23[3]
  r1[0,1] += x4[0] unsigned* _5y23[0]; r1[2,3] += x4[1] unsigned* _5y23[1]

  # r2 = x0*y2 + x1*y1 + x2*y0 + x3*5y4 + x4*5y3
  r2[0,1] = x01[0] unsigned* y23[0]; r2[2,3] = x01[1] unsigned* y23[1]
  r2[0,1] += x01[2] unsigned* y01[2]; r2[2,3] += x01[3] unsigned* y01[3]
  r2[0,1] += x23[0] unsigned* y01[0]; r2[2,3] += x23[1] unsigned* y01[1]
  r2[0,1] += x23[2] unsigned* _5y4[0]; r2[2,3] += x23[3] unsigned* _5y4[1]
  r2[0,1] += x4[0] unsigned* _5y23[2]; r2[2,3] += x4[1] unsigned* _5y23[3]

  # r3 = x0*y3 + x1*y2 + x2*y1 + x3*y0 + x4*5y4
  r3[0,1] = x01[0] unsigned* y23[2]; r3[2,3] = x01[1] unsigned* y23[3]
  r3[0,1] += x01[2] unsigned* y23[0]; r3[2,3] += x01[3] unsigned* y23[1]
  r3[0,1] += x23[0] unsigned* y01[2]; r3[2,3] += x23[1] unsigned* y01[3]
  r3[0,1] += x23[2] unsigned* y01[0]; r3[2,3] += x23[3] unsigned* y01[1]
  r3[0,1] += x4[0] unsigned* _5y4[0]; r3[2,3] += x4[1] unsigned* _5y4[1]

  # r4 = x0*y4 + x1*y3 + x2*y2 + x3*y1 + x4*y0
  r4[0,1] = x01[0] unsigned* y4[0]; r4[2,3] = x01[1] unsigned* y4[1]
  r4[0,1] += x01[2] unsigned* y23[2]; r4[2,3] += x01[3] unsigned* y23[3]
  r4[0,1] += x23[0] unsigned* y23[0]; r4[2,3] += x23[1] unsigned* y23[1]
  r4[0,1] += x23[2] unsigned* y01[2]; r4[2,3] += x23[3] unsigned* y01[3]
  r4[0,1] += x4[0] unsigned* y01[0]; r4[2,3] += x4[1] unsigned* y01[1]

  # Carry propagation and output packing, interleaved.
  # Each carry is (limb >> 26); the limb itself is then masked to 26 bits.
  # Carry order: r0->r1, r3->r4, r1->r2, r4->r0, r2->r3, r0->r1, r3->r4.
  # The carry out of r4 wraps to r0 multiplied by 5: it is added once, then
  # shifted left by 2 and added again.
  # The [0,2,1,3] reorderings bring the low 32-bit word of each 64-bit half
  # into words [0],[1]; "a = a[0,1] b[0,1]" then joins two limbs into the
  # packed layout described at the top of the file.
  2x t1 = r0 unsigned>> 26
  r0 &= mask
  2x r1 += t1
  2x t4 = r3 unsigned>> 26
  r3 &= mask
  2x r4 += t4
  2x t2 = r1 unsigned>> 26
  r1 &= mask
  2x t0 = r4 unsigned>> 26
  2x r2 += t2
  r4 &= mask
  2x r0 += t0
  2x t0 <<= 2
  2x t3 = r2 unsigned>> 26
  2x r0 += t0
  x23 = r2 & mask
  2x r3 += t3
  2x t1 = r0 unsigned>> 26
  x23 = x23[0,2,1,3]
  x01 = r0 & mask
  2x r1 += t1
  2x t4 = r3 unsigned>> 26
  x01 = x01[0,2,1,3]
  r3 &= mask
  r1 = r1[0,2,1,3]
  2x x4 = r4 + t4
  r3 = r3[0,2,1,3]
  x01 = x01[0,1] r1[0,1]
  x23 = x23[0,1] r3[0,1]
  x4 = x4[0,2,1,3]

  # Store limbs 0-1, limbs 2-3, then limb 4 (64 bits only).
  mem128[input_0] aligned= x01;input_0+=16
  mem128[input_0] aligned= x23;input_0+=16
  mem64[input_0] aligned= x4[0]

  return