// poly1305-ia64.S
// ====================================================================
// Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL
// project.
// ====================================================================
//
// Poly1305 for Itanium.
//
// January 2019
//
// Performance was reported to be ~2.1 cycles per byte on Itanium 2,
// with the exception of processors in the 95xx family, which have higher
// floating-point instruction latencies and deliver ~2.6 cpb.
// Comparison to compiler-generated code is not exactly fair, because
// of the different radixes. But just for reference, it was observed to
// be >3x faster. Originally it was argued that a floating-point base 2^32
// implementation would be optimal. Upon closer look, the estimate for
// the integer base 2^64 implementation below turned out to be
// approximately the same on Itanium 2. But floating-point code would be
// larger and have higher overhead, which would negatively affect
// small-block performance...
  // --------------------------------------------------------------------
  // Platform glue.
  //
  // ADDP  forms an address from a pointer argument: addp4 on HP-UX when
  //       _LP64 is not defined, plain add in every other configuration.
  // RUM / SUM expand to rum / sum on HP-UX and to nop elsewhere; they
  //       are used with the mask 1<<1 around the block loop to switch
  //       to little-endian and back.
  // --------------------------------------------------------------------
#if defined(_HPUX_SOURCE)
# if !defined(_LP64)
#  define ADDP  addp4
# else
#  define ADDP  add
# endif
# define RUM    rum
# define SUM    sum
#else
# define ADDP   add
# define RUM    nop
# define SUM    nop
#endif

.text
.explicit
  // --------------------------------------------------------------------
  // poly1305_init(ctx, key)
  //
  //   in:  r32 = ctx, r33 = key (NULL is accepted)
  //   out: r8  = 0
  //
  // Clears the accumulator words h0, h1, h2 at ctx+0, ctx+8, ctx+16.
  // When key is NULL the routine returns right after that. Otherwise the
  // 16 key bytes are read one byte at a time, clamped and assembled in
  // little-endian order, and the result is stored as
  //
  //   ctx+24: r0 = key bytes 0..7
  //   ctx+32: r1 = key bytes 8..15
  //   ctx+40: s1 = r1 + (r1 >> 2)
  //
  // Clamping: bytes 3, 7, 11, 15 keep only their low 4 bits; bytes 4, 8,
  // 12 get their low 2 bits cleared.
  //
  // Note on names: in "st8 [..]=r0" the operand r0 is the general
  // register that always reads as zero, while "ctx->r0" in the comments
  // is the first key word.
  // --------------------------------------------------------------------
.global poly1305_init#
.proc   poly1305_init#
.align  64
poly1305_init:
        .prologue
        .save           ar.pfs,r2
{ .mmi; alloc           r2=ar.pfs,2,0,0,0
        cmp.eq          p6,p7=0,r33     }       // key == NULL?
{ .mmi; ADDP            r9=8,r32                // r9  = &ctx->h1
        ADDP            r10=16,r32              // r10 = &ctx->h2
        ADDP            r32=0,r32       };;     // r32 = &ctx->h0
        .body
{ .mmi; st8             [r32]=r0,24             // ctx->h0 = 0
        st8             [r9]=r0                 // ctx->h1 = 0
(p7)    ADDP            r8=0,r33        }       // r8 = key + 0
{ .mib; st8             [r10]=r0                // ctx->h2 = 0
(p6)    mov             r8=0
(p6)    br.ret.spnt     b0              };;     // NULL key: return 0

        // Four byte pointers, each stepping by 4, cover key[0..15]:
        // r8 -> 0,4,8,12   r9 -> 1,5,9,13   r10 -> 2,6,10,14   r11 -> 3,7,11,15
{ .mmi; ADDP            r9=1,r33
        ADDP            r10=2,r33
        ADDP            r11=3,r33       };;
{ .mmi; ld1             r16=[r8],4              // load key, little-endian
        ld1             r17=[r9],4      }
{ .mmi; ld1             r18=[r10],4
        ld1             r19=[r11],4     };;
{ .mmi; ld1             r20=[r8],4
        ld1             r21=[r9],4      }
{ .mmi; ld1             r22=[r10],4
        ld1             r23=[r11],4
        and             r19=15,r19      };;     // clamp key[3]
{ .mmi; ld1             r24=[r8],4
        ld1             r25=[r9],4
        and             r20=-4,r20      }       // clamp key[4]
{ .mmi; ld1             r26=[r10],4
        ld1             r27=[r11],4
        and             r23=15,r23      };;     // clamp key[7]
{ .mmi; ld1             r28=[r8],4
        ld1             r29=[r9],4
        and             r24=-4,r24      }       // clamp key[8]
{ .mmi; ld1             r30=[r10],4
        ld1             r31=[r11],4
        and             r27=15,r27      };;     // clamp key[11]

        // Merge bytes pairwise into 16-bit pieces, then deposit the
        // pieces into r16 (key bytes 0..7) and r24 (key bytes 8..15).
{ .mii; and             r28=-4,r28              // clamp key[12]
        dep             r16=r17,r16,8,8
        dep             r18=r19,r18,8,8 };;
{ .mii; and             r31=15,r31              // clamp key[15]
        dep             r16=r18,r16,16,16
        dep             r20=r21,r20,8,8 };;
{ .mii; dep             r16=r20,r16,32,16
        dep             r22=r23,r22,8,8 };;
{ .mii; dep             r16=r22,r16,48,16
        dep             r24=r25,r24,8,8 };;
{ .mii; dep             r26=r27,r26,8,8
        dep             r28=r29,r28,8,8 };;
{ .mii; dep             r24=r26,r24,16,16
        dep             r30=r31,r30,8,8 };;
{ .mii; st8             [r32]=r16,8             // ctx->r0
        dep             r24=r28,r24,32,16;;
        dep             r24=r30,r24,48,16       };;
{ .mii; st8             [r32]=r24,8             // ctx->r1
        shr.u           r25=r24,2;;
        add             r25=r25,r24     };;     // s1 = r1 + (r1 >> 2)
{ .mib; st8             [r32]=r25               // ctx->s1
        mov             r8=0
        br.ret.sptk     b0              };;
.endp   poly1305_init#
  // --------------------------------------------------------------------
  // poly1305_blocks(ctx, inp, len, padbit)
  //
  //   in:  r32 = ctx, r33 = inp, r34 = len in bytes, r35 = padbit
  //
  // Absorbs len/16 blocks of 16 bytes from inp into the accumulator
  // h0:h1:h2 stored at ctx+0/+8/+16, using the key words r0 (ctx+24),
  // r1 (ctx+32) and s1 (ctx+40). The low four bits of len are dropped by
  // the shift that forms the block count. r35 is added to h2 once per
  // block (presumably the Poly1305 pad bit; confirm against the caller).
  //
  // inp may have any alignment. It is rounded down to 8 bytes and read
  // with ld8; r29 = inp & 7 selects one of the mutually exclusive
  // predicates p8..p15 (p8 = aligned, p9..p15 = misaligned by 1..7
  // bytes), which pick the shrp that reassembles each 64-bit input word.
  //
  // The loop is a br.ctop counted loop. For more than one block ar.lc is
  // blocks-2 and ar.ec is 2, so the body runs once per block and p16 is
  // true in every pass except the last; p16 gates the loads that fetch
  // the next block while the current one is being multiplied.
  //
  // NOTE(review): a block count of 0 takes the same path as a count of 1
  // (ar.lc = 0, ar.ec = 1), so the loop body still runs once. Presumably
  // callers never pass len < 16 - confirm.
  //
  // ar.lc and the predicate registers are saved on entry (r3, r36) and
  // restored before return.
  // --------------------------------------------------------------------
h0=r17;  h1=r18;  h2=r19;               // accumulator words
i0=r20;  i1=r21;                        // current input block, low/high
HF0=f8;  HF1=f9;  HF2=f10;              // accumulator copies for xmpy
RF0=f11; RF1=f12; SF1=f13;              // key words r0, r1, s1 for xmpy

.global poly1305_blocks#
.proc   poly1305_blocks#
.align  64
poly1305_blocks:
        .prologue
        .save           ar.pfs,r2
{ .mii; alloc           r2=ar.pfs,4,1,0,0
        .save           ar.lc,r3
        mov             r3=ar.lc
        .save           pr,r36
        mov             r36=pr          }

        .body
{ .mmi; ADDP            r8=0,r32
        ADDP            r9=8,r32
        and             r29=7,r33       };;     // r29 = misalignment of inp
{ .mmi; ld8             h0=[r8],16
        ld8             h1=[r9],16
        and             r33=-8,r33      };;     // round inp down to 8 bytes
{ .mmi; ld8             h2=[r8],16
        ldf8            RF0=[r9],16             // from ctx+24
        shr.u           r34=r34,4       };;     // bytes -> blocks
{ .mmi; ldf8            RF1=[r8],-32            // from ctx+32; r8 back at ctx+0
        ldf8            SF1=[r9],-32            // from ctx+40; r9 back at ctx+8
        cmp.ltu         p16,p17=1,r34   };;     // p16: more than one block
{ .mmi;
(p16)   add             r34=-2,r34
(p17)   mov             r34=0
        ADDP            r10=0,r33       }       // r10 -> low input word
{ .mii; ADDP            r11=8,r33               // r11 -> high input word
(p16)   mov             ar.ec=2
(p17)   mov             ar.ec=1         };;
{ .mib; RUM             1<<1                    // go little-endian
        mov             ar.lc=r34
        brp.loop.imp    .Loop,.Lcend-16 }       // hint for the loop branch

        // Decode the misalignment into p8..p15; p7 = not aligned.
{ .mmi; cmp.eq          p8,p7=0,r29
        cmp.eq          p9,p0=1,r29
        cmp.eq          p10,p0=2,r29    }
{ .mmi; cmp.eq          p11,p0=3,r29
        cmp.eq          p12,p0=4,r29
        cmp.eq          p13,p0=5,r29    }
{ .mmi; cmp.eq          p14,p0=6,r29
        cmp.eq          p15,p0=7,r29
        add             r16=16,r10      };;     // address of third input word

{ .mmb;
(p8)    ld8             i0=[r10],16             // aligned input
(p8)    ld8             i1=[r11],16
(p8)    br.cond.sptk    .Loop           };;

        // align first block
        // Three aligned words r14, r15, r16 straddle the first block;
        // shrp extracts i0 and i1 from adjacent pairs. r10 and r11 are
        // advanced by 24 so they point at the two words that follow r16.
        .pred.rel       "mutex",p8,p9,p10,p11,p12,p13,p14,p15
{ .mmi; (p7)    ld8     r14=[r10],24
        (p7)    ld8     r15=[r11],24    }
{ .mii; (p7)    ld8     r16=[r16]
        nop.i           0;;
        (p15)   shrp    i0=r15,r14,56   }
{ .mii; (p15)   shrp    i1=r16,r15,56
        (p14)   shrp    i0=r15,r14,48   }
{ .mii; (p14)   shrp    i1=r16,r15,48
        (p13)   shrp    i0=r15,r14,40   }
{ .mii; (p13)   shrp    i1=r16,r15,40
        (p12)   shrp    i0=r15,r14,32   }
{ .mii; (p12)   shrp    i1=r16,r15,32
        (p11)   shrp    i0=r15,r14,24   }
{ .mii; (p11)   shrp    i1=r16,r15,24
        (p10)   shrp    i0=r15,r14,16   }
{ .mii; (p10)   shrp    i1=r16,r15,16
        (p9)    shrp    i0=r15,r14,8    }
{ .mii; (p9)    shrp    i1=r16,r15,8
        mov             r14=r16         };;     // carry last word to next pass

.Loop:
        .pred.rel       "mutex",p8,p9,p10,p11,p12,p13,p14,p15
        // h += block, h2 += padbit, with carry propagation h0 -> h1 -> h2.
{ .mmi; add             h0=h0,i0
        add             h1=h1,i1
        add             h2=h2,r35       };;
{ .mmi; setf.sig        HF0=h0
        cmp.ltu         p6,p0=h0,i0             // p6: carry out of h0
        cmp.ltu         p7,p0=h1,i1     };;     // p7: carry out of h1
{ .mmi; (p6)    add     h1=1,h1;;
        setf.sig        HF1=h1
        (p6)    cmp.eq.or p7,p0=0,h1    };;     // h1 wrapped to 0: carry too
{ .mmi; (p7)    add     h2=1,h2;;
        setf.sig        HF2=h2          };;

        // 64x64 products on the FP unit (lu = low half, hu = high half):
        //   f32/f33 = h0*r0   f34/f35 = h1*s1   f40 = low(h2*s1)
        //   f36/f37 = h0*r1   f38/f39 = h1*r0   f41 = low(h2*r0)
        // Under p16 the next block is loaded meanwhile, and the shrp
        // selected by p9..p15 realigns it into i0/i1.
{ .mfi; (p16)   ld8     r15=[r10],16
        xmpy.lu         f32=HF0,RF0     }
{ .mfi; (p16)   ld8     r16=[r11],16
        xmpy.hu         f33=HF0,RF0     }
{ .mfi; xmpy.lu         f36=HF0,RF1     }
{ .mfi; xmpy.hu         f37=HF0,RF1     };;
{ .mfi; xmpy.lu         f34=HF1,SF1
        (p15)   shrp    i0=r15,r14,56   }
{ .mfi; xmpy.hu         f35=HF1,SF1     }
{ .mfi; xmpy.lu         f38=HF1,RF0
        (p15)   shrp    i1=r16,r15,56   }
{ .mfi; xmpy.hu         f39=HF1,RF0     }
{ .mfi; xmpy.lu         f40=HF2,SF1
        (p14)   shrp    i0=r15,r14,48   }
{ .mfi; xmpy.lu         f41=HF2,RF0     };;

{ .mmi; getf.sig        r22=f32
        getf.sig        r23=f33
        (p14)   shrp    i1=r16,r15,48   }
{ .mmi; getf.sig        r24=f34
        getf.sig        r25=f35
        (p13)   shrp    i0=r15,r14,40   }
{ .mmi; getf.sig        r26=f36
        getf.sig        r27=f37
        (p13)   shrp    i1=r16,r15,40   }
{ .mmi; getf.sig        r28=f38
        getf.sig        r29=f39
        (p12)   shrp    i0=r15,r14,32   }
{ .mmi; getf.sig        r30=f40
        getf.sig        r31=f41         };;

        // Sum the partial products into the new h0, h1, h2.
{ .mmi; add             h0=r22,r24              // low(h0*r0) + low(h1*s1)
        add             r23=r23,r25             // high halves of the same
        (p12)   shrp    i1=r16,r15,32   }
{ .mmi; add             h1=r26,r28              // low(h0*r1) + low(h1*r0)
        add             r27=r27,r29             // high halves of the same
        (p11)   shrp    i0=r15,r14,24   };;
{ .mmi; cmp.ltu         p6,p0=h0,r24            // carry out of h0 sum
        cmp.ltu         p7,p0=h1,r28            // carry out of h1 sum
        add             r23=r23,r30     };;     // + low(h2*s1)
{ .mmi; (p6)    add     r23=1,r23
        (p7)    add     r27=1,r27
        (p11)   shrp    i1=r16,r15,24   };;
{ .mmi; add             h1=h1,r23;;
        cmp.ltu         p6,p7=h1,r23
        (p10)   shrp    i0=r15,r14,16   };;
{ .mmi; (p6)    add     h2=r31,r27,1            // r31 + r27 + carry
        (p7)    add     h2=r31,r27
        (p10)   shrp    i1=r16,r15,16   };;

        // Partial reduction: fold the bits of h2 above bit 1 back into
        // h0 as (h2 & -4) + (h2 >> 2), keeping h2 & 3.
{ .mmi; (p8)    mov     i0=r15                  // aligned: next block as is
        and             r22=-4,h2
        shr.u           r23=h2,2        };;
{ .mmi; add             r22=r22,r23
        and             h2=3,h2
        (p9)    shrp    i0=r15,r14,8    };;

{ .mmi; add             h0=h0,r22;;
        cmp.ltu         p6,p0=h0,r22            // carry out of h0
        (p9)    shrp    i1=r16,r15,8    };;
{ .mmi; (p8)    mov     i1=r16
        (p6)    cmp.eq.unc p7,p0=-1,h1          // p7: h1+1 will wrap (else 0)
        (p6)    add     h1=1,h1         };;
{ .mmb; (p7)    add     h2=1,h2
        mov             r14=r16                 // carry last word to next pass
        br.ctop.sptk    .Loop           };;
.Lcend:

{ .mii; SUM             1<<1                    // back to big-endian
        mov             ar.lc=r3        };;     // restore loop counter

{ .mmi; st8             [r8]=h0,16              // ctx+0
        st8             [r9]=h1                 // ctx+8
        mov             pr=r36,0x1ffff  };;     // restore predicates
{ .mmb; st8             [r8]=h2                 // ctx+16
        rum             1<<5                    // clear user-mask bit 5 (psr.mfh)
        br.ret.sptk     b0              };;
.endp   poly1305_blocks#
  // --------------------------------------------------------------------
  // poly1305_emit(ctx, mac, nonce)
  //
  //   in:  r32 = ctx, r33 = mac (16 output bytes), r34 = nonce
  //
  // Loads the accumulator h0:h1:h2 from ctx+0/+8/+16 and the nonce as
  // four 32-bit words. Computes h+5 across all three words; if the top
  // word of h+5 is at least 4, h+5 is used in place of h. The low 128
  // bits of the selected value plus the 128-bit nonce are written to mac
  // one byte at a time in little-endian order.
  //
  // Fix relative to the previous version: the carry from h1 into h2
  // while forming h+5 is now raised only when h0+5 carried AND h1 was
  // all ones. Previously the h0 carry was forwarded to h2 regardless of
  // h1, which could select h+5 wrongly for h2 == 3.
  // --------------------------------------------------------------------
.global poly1305_emit#
.proc   poly1305_emit#
.align  64
poly1305_emit:
        .prologue
        .save           ar.pfs,r2
{ .mmi; alloc           r2=ar.pfs,3,0,0,0
        ADDP            r8=0,r32
        ADDP            r9=8,r32        };;

        .body
{ .mmi; ld8             r16=[r8],16             // load hash
        ld8             r17=[r9]
        ADDP            r10=0,r34       };;
{ .mmi; ld8             r18=[r8]
        ld4             r24=[r10],8             // load nonce
        ADDP            r11=4,r34       };;

{ .mmi; ld4             r25=[r11],8
        ld4             r26=[r10]
        add             r20=5,r16       };;     // r20 = h0 + 5

{ .mmi; ld4             r27=[r11]
        cmp.ltu         p6,p7=r20,r16           // p6: h0 + 5 carried
        shl             r25=r25,32      };;
{ .mmi;
(p6)    add             r21=1,r17               // r21 = h1 + carry
(p7)    add             r21=0,r17
        // A carry into h2 exists only if h0+5 carried AND h1 was all
        // ones. When h1 != -1 this clears p6 and sets p7; otherwise both
        // stay as they are (same effect as cmp.eq.and.orcm p6,p7=-1,r17).
        // The former cmp.eq.or.andcm p6,p7 could only re-assert p6=1,
        // p7=0 and therefore never cancelled the carry.
(p6)    cmp.ne.or.andcm p7,p6=-1,r17    };;
{ .mmi;
(p6)    add             r22=1,r18               // r22 = h2 + carry
(p7)    add             r22=0,r18
        shl             r27=r27,32      };;
{ .mmi; or              r24=r24,r25             // nonce, low 64 bits
        or              r26=r26,r27             // nonce, high 64 bits
        cmp.leu         p6,p7=4,r22     };;     // p6: use h+5, p7: use h
{ .mmi;
(p6)    add             r16=r20,r24
(p7)    add             r16=r16,r24
(p6)    add             r17=r21,r26     };;
{ .mii;
(p7)    add             r17=r17,r26
        cmp.ltu         p6,p7=r16,r24;;         // carry out of the low sum
(p6)    add             r17=1,r17       };;

        // Four byte pointers write mac[0..3], mac[4..7], mac[8..11] and
        // mac[12..15] from r16, r16>>32, r17 and r17>>32 respectively.
{ .mmi; ADDP            r8=0,r33
        ADDP            r9=4,r33
        shr.u           r20=r16,32      }
{ .mmi; ADDP            r10=8,r33
        ADDP            r11=12,r33
        shr.u           r21=r17,32      };;

{ .mmi; st1             [r8]=r16,1              // write mac, little-endian
        st1             [r9]=r20,1
        shr.u           r16=r16,8       }
{ .mii; st1             [r10]=r17,1
        shr.u           r20=r20,8
        shr.u           r17=r17,8       }
{ .mmi; st1             [r11]=r21,1
        shr.u           r21=r21,8       };;

{ .mmi; st1             [r8]=r16,1
        st1             [r9]=r20,1
        shr.u           r16=r16,8       }
{ .mii; st1             [r10]=r17,1
        shr.u           r20=r20,8
        shr.u           r17=r17,8       }
{ .mmi; st1             [r11]=r21,1
        shr.u           r21=r21,8       };;

{ .mmi; st1             [r8]=r16,1
        st1             [r9]=r20,1
        shr.u           r16=r16,8       }
{ .mii; st1             [r10]=r17,1
        shr.u           r20=r20,8
        shr.u           r17=r17,8       }
{ .mmi; st1             [r11]=r21,1
        shr.u           r21=r21,8       };;

{ .mmi; st1             [r8]=r16
        st1             [r9]=r20        }
{ .mmb; st1             [r10]=r17
        st1             [r11]=r21
        br.ret.sptk     b0              };;
.endp   poly1305_emit#

// Identification string emitted after the code.
stringz "Poly1305 for IA64, CRYPTOGAMS by \@dot-asm"