ghash-alpha.pl

#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Even though
# loops are aggressively modulo-scheduled with respect to references to
# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
# scheduling "glitch," because uprofile(1) indicates uniform sample
# distribution, as if all instruction bundles execute in 1.5 cycles.
# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% better than code generated
# by the vendor compiler.
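
# The code emitted below is a modulo-scheduled rendition of the following
# plain-Perl reference model.  The sub is a minimal sketch for illustration
# only and is never called by this module; it assumes a 64-bit perl, a
# 16-byte string $Xi, and a caller-built $Htable of sixteen [hi,lo] pairs
# holding the 4-bit multiples of H in the layout gcm_init_4bit produces
# (the sub name and argument shapes are illustrative, not an OpenSSL
# interface).
sub _gmult_4bit_ref {
	my ($Xi,$Htable)=@_;
	my @rem_4bit=map { $_<<48 }
	    (0x0000,0x1C20,0x3840,0x2460, 0x7080,0x6CA0,0x48C0,0x54E0,
	     0xE100,0xFD20,0xD940,0xC560, 0x9180,0x8DA0,0xA9C0,0xB5E0);
	my $M=0xFFFFFFFFFFFFFFFF;		# 64-bit mask
	my @x=unpack("C16",$Xi);		# Xi bytes, x[15] is processed first
	my $cnt=15;
	my $nlo=$x[15]&0x0f;
	my $nhi=$x[15]>>4;
	my ($Zhi,$Zlo)=@{$Htable->[$nlo]};	# Z = Htable[low nibble of last byte]

	while (1) {
		my $rem=$Zlo&0x0f;			# bits about to drop off Z.lo
		$Zlo=(($Zhi<<60)&$M)|($Zlo>>4);		# Z >>= 4 across 128 bits
		$Zhi=($Zhi>>4)^$rem_4bit[$rem];		# fold dropped bits back in
		$Zhi^=$Htable->[$nhi][0];		# Z ^= Htable[high nibble]
		$Zlo^=$Htable->[$nhi][1];

		last if --$cnt<0;

		$nlo=$x[$cnt]&0x0f;
		$nhi=$x[$cnt]>>4;

		$rem=$Zlo&0x0f;
		$Zlo=(($Zhi<<60)&$M)|($Zlo>>4);
		$Zhi=($Zhi>>4)^$rem_4bit[$rem];
		$Zhi^=$Htable->[$nlo][0];		# Z ^= Htable[low nibble]
		$Zlo^=$Htable->[$nlo][1];
	}
	return ($Zhi,$Zlo);	# gcm_gmult_4bit below additionally byte-swaps
				# this into Xi's big-endian in-memory order
}
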
$cnt="v0";	# $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3";	# $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7";	# $8
#################
$Xi="a0";	# $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4";	# $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10";	# $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT";	# $28
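
# loop() emits the body of one GF(2^128) multiplication: Xi is consumed one
# byte at a time, two 4-bit table lookups per byte.  .Looplo walks the low
# register half ($Xlo, loaded from Xi+8) and .Loophi the high half ($Xhi),
# while the partially reduced product accumulates in $Zhi:$Zlo with rem_4bit
# supplying the reduction constants.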
{ my $N;
sub loop() {
$N++;
$code.=<<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo
	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo
	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo
	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi
	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo
	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)
.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo
	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo
	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo
	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi
	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo
	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo
	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo
	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi
	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop
.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo
	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo
	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo
	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi
	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo
	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo
	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)
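	# PIC trick: a br with a register destination writes the address of the
	# next instruction (.Lpic1) into that register; the lda then adds the
	# link-time constant rem_4bit-.Lpic1 to form the address of the
	# rem_4bit table.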
	br	$rem_4bit,.Lpic1
.Lpic1:	lda	$rem_4bit,rem_4bit-.Lpic1($rem_4bit)
___
&loop();
$code.=<<___;
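	# Z is held in host (little-endian) order; convert it to the big-endian
	# byte order of Xi.  The srl/sll by 24 and 8 with zapnot masks reverse
	# the bytes within each 32-bit half, and the final srl/sll by 32 swap
	# the two halves.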
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1
	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1
	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi
	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi
	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)
	ret	(ra)
.end	gcm_gmult_4bit
___

$inhi="s0";
$inlo="s1";
$code.=<<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0
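
	# The input pointer may be unaligned: each ldq_u fetches the aligned
	# quadword containing the requested byte, and the extql/extqh pairs in
	# .Louter below splice each pair of fetches into 8 input bytes.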
	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)
	br	$rem_4bit,.Lpic2
.Lpic2:	lda	$rem_4bit,rem_4bit-.Lpic2($rem_4bit)

.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)
	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len
	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___
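# Xi has just been xor-ed with the next 16 input bytes; loop() now emits the
# multiplication of that sum by H.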
&loop();
$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1
	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1
	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi
	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)
	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)
	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi
	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter
.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi
	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)
	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit
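
	# Reduction constants: rem_4bit[i] is xor-ed into the top of Z.hi when
	# the nibble i is shifted off the bottom of Z.lo; the <<48 shifts are
	# evaluated at assembly time so each 16-bit value lands in bits 63:48.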
.align	4
rem_4bit:
	.quad	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
	.quad	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
	.quad	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
	.quad	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
	.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___

$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;