c64xplus-gf2m.pl 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  5. # project. The module is, however, dual licensed under OpenSSL and
  6. # CRYPTOGAMS licenses depending on where you obtain it. For further
  7. # details see http://www.openssl.org/~appro/cryptogams/.
  8. # ====================================================================
  9. #
  10. # February 2012
  11. #
  # The module implements bn_GF2m_mul_2x2 polynomial multiplication
  # used in bn_gf2m.c. It's kind of a low-hanging mechanical port from
  # C for the time being... The subroutine runs in 37 cycles, which is
  # 4.5x faster than compiler-generated code. Though the comparison is
  # totally unfair, because this module utilizes the Galois Field
  # Multiply instruction.
  # Take the output file name from the command line: arguments are consumed
  # one by one until one of them ends in something that looks like a file
  # name with an extension ("name.ext"); everything before it is skipped.
  while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

  # Redirect STDOUT to that file.  STDOUT is left untouched when no file
  # name was found, and a failure to create the file is fatal instead of
  # silently leaving the script without a usable output handle.
  if (defined($output) && length($output)) {
      open(STDOUT, '>', $output) or die "can't open $output: $!";
  }

  # Symbolic names for the registers referenced by the generated code.
  ($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8");       # argument vector

  # Low halfword of the first operand, followed by the registers receiving
  # its XORMPY products with bytes 0..3 of the second operand (A16..A20).
  ($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));
  # Same layout for the high halfword of the first operand (B16..B20).
  ($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));
  # The four bytes the second operand is split into.
  ($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");

  # Registers in which the operands of the second and third multiplication
  # are staged.  They are deliberately the same registers as $Alo and $B_1.
  ($A,$B)=($Alo,$B_1);

  # Register the generated code loads with 0xFF and uses as a byte mask.
  $xFF="B1";
  # mul_1x1_upper($A,$B)
  #
  # Appends to $code the opening half of a one-word by one-word polynomial
  # multiplication of the registers named by $A and $B:
  #   - $B is split into four bytes, held in $B_0..$B_3 ($xFF is the mask
  #     used for byte 0);
  #   - $A is split into two halfwords, held in $Alo and $Ahi;
  #   - each halfword is multiplied by each byte with XORMPY, leaving the
  #     eight partial products in $Alox0..$Alox3 and $Ahix0..$Ahix3.
  # The partial products are not combined here; that is left to the code
  # emitted by mul_1x1_merged or mul_1x1_lower.
  #
  # Parameters:
  #   $A, $B - names of the registers holding the two operands.
  # Returns nothing useful; the only effect is the text appended to $code.
  #
  # Lines starting with "||" belong to the same execute packet as the line
  # before them, so the order of the lines must not be changed.  Mnemonics
  # are kept out of column 1, where the assembler expects labels.
  sub mul_1x1_upper {
      my ($A,$B)=@_;

      $code.=<<___;
        EXTU    $B,8,24,$B_2            ; smash $B to 4 bytes
||      AND     $B,$xFF,$B_0
||      SHRU    $B,24,$B_3
        SHRU    $A,16, $Ahi             ; smash $A to two halfwords
||      EXTU    $A,16,16,$Alo
        XORMPY  $Alo,$B_2,$Alox2        ; 16x8 bits multiplication
||      XORMPY  $Ahi,$B_2,$Ahix2
||      EXTU    $B,16,24,$B_1
        XORMPY  $Alo,$B_0,$Alox0
||      XORMPY  $Ahi,$B_0,$Ahix0
        XORMPY  $Alo,$B_3,$Alox3
||      XORMPY  $Ahi,$B_3,$Ahix3
        XORMPY  $Alo,$B_1,$Alox1
||      XORMPY  $Ahi,$B_1,$Ahix1
___
  }
  # mul_1x1_merged($OUTlo,$OUThi,$A,$B)
  #
  # Appends to $code an instruction stream that does two jobs at once:
  #   1. it folds the eight partial products left in $Alox0..$Alox3 and
  #      $Ahix0..$Ahix3 by the PREVIOUS multiplication into the register
  #      pair $OUTlo/$OUThi (the XOR/SHL/SHRU/MV instructions);
  #   2. it splits the registers named by $A and $B and issues the eight
  #      XORMPYs of the NEXT multiplication, which overwrite those same
  #      partial-product registers.
  #
  # Parameters:
  #   $OUTlo, $OUThi - names of the registers receiving the low and high
  #                    word of the previous product.
  #   $A, $B         - names of the registers holding the next operands.
  # Returns nothing useful; the only effect is the text appended to $code.
  #
  # The two jobs share registers, so the order of the emitted lines is
  # significant and is reproduced exactly.  In particular the new
  # $Alo x $B_0 product is first produced in A1 and only copied to $Alox0
  # by the final MV, because the old $Alox0 is still read by a later XOR.
  # Lines starting with "||" belong to the same execute packet as the line
  # before them.  Mnemonics are kept out of column 1, where the assembler
  # expects labels.
  sub mul_1x1_merged {
      my ($OUTlo,$OUThi,$A,$B)=@_;

      $code.=<<___;
        EXTU    $B,8,24,$B_2            ; smash $B to 4 bytes
||      AND     $B,$xFF,$B_0
||      SHRU    $B,24,$B_3
        SHRU    $A,16, $Ahi             ; smash $A to two halfwords
||      EXTU    $A,16,16,$Alo
        XOR     $Ahix0,$Alox2,$Ahix0
||      MV      $Ahix2,$OUThi
||      XORMPY  $Alo,$B_2,$Alox2
        XORMPY  $Ahi,$B_2,$Ahix2
||      EXTU    $B,16,24,$B_1
||      XORMPY  $Alo,$B_0,A1            ; $Alox0
        XOR     $Ahix1,$Alox3,$Ahix1
||      SHL     $Ahix0,16,$OUTlo
||      SHRU    $Ahix0,16,$Ahix0
        XOR     $Alox0,$OUTlo,$OUTlo
||      XOR     $Ahix0,$OUThi,$OUThi
||      XORMPY  $Ahi,$B_0,$Ahix0
||      XORMPY  $Alo,$B_3,$Alox3
||      SHL     $Alox1,8,$Alox1
||      SHL     $Ahix3,8,$Ahix3
        XOR     $Alox1,$OUTlo,$OUTlo
||      XOR     $Ahix3,$OUThi,$OUThi
||      XORMPY  $Ahi,$B_3,$Ahix3
||      SHL     $Ahix1,24,$Alox1
||      SHRU    $Ahix1,8, $Ahix1
        XOR     $Alox1,$OUTlo,$OUTlo
||      XOR     $Ahix1,$OUThi,$OUThi
||      XORMPY  $Alo,$B_1,$Alox1
||      XORMPY  $Ahi,$B_1,$Ahix1
||      MV      A1,$Alox0
___
  }
  # mul_1x1_lower($OUTlo,$OUThi)
  #
  # Appends to $code the closing half of a one-word by one-word polynomial
  # multiplication: the eight partial products left in $Alox0..$Alox3 and
  # $Ahix0..$Ahix3 are shifted into position and XORed together into the
  # register pair $OUTlo/$OUThi.  It is the same folding sequence that
  # mul_1x1_merged emits, without a following multiplication interleaved.
  #
  # Parameters:
  #   $OUTlo, $OUThi - names of the registers receiving the low and high
  #                    word of the product.
  # Returns nothing useful; the only effect is the text appended to $code.
  #
  # The first emitted line is an assembler comment (";NOP"), i.e. a
  # disabled instruction, while the later NOP is a real one; both are kept
  # as they are.  Lines starting with "||" belong to the same execute
  # packet as the line before them.  Mnemonics are kept out of column 1,
  # where the assembler expects labels.
  sub mul_1x1_lower {
      my ($OUTlo,$OUThi)=@_;

      $code.=<<___;
        ;NOP
        XOR     $Ahix0,$Alox2,$Ahix0
||      MV      $Ahix2,$OUThi
        NOP
        XOR     $Ahix1,$Alox3,$Ahix1
||      SHL     $Ahix0,16,$OUTlo
||      SHRU    $Ahix0,16,$Ahix0
        XOR     $Alox0,$OUTlo,$OUTlo
||      XOR     $Ahix0,$OUThi,$OUThi
||      SHL     $Alox1,8,$Alox1
||      SHL     $Ahix3,8,$Ahix3
        XOR     $Alox1,$OUTlo,$OUTlo
||      XOR     $Ahix3,$OUThi,$OUThi
||      SHL     $Ahix1,24,$Alox1
||      SHRU    $Ahix1,8, $Ahix1
        XOR     $Alox1,$OUTlo,$OUTlo
||      XOR     $Ahix1,$OUThi,$OUThi
___
  }
  # Assemble the body of _bn_GF2m_mul_2x2 and write it out.
  #
  # Three one-word multiplications are chained: a0·b0, a1·b1 and
  # (a0+a1)·(b0+b1).  mul_1x1_upper starts the first one, each
  # mul_1x1_merged call finishes the previous product while starting the
  # next, and mul_1x1_lower finishes the last.  The snippets between the
  # calls that begin with "||" attach to the last instruction emitted by
  # the preceding call, so calls and snippets must stay in this order.
  #
  # Register use, as written below:
  #   A28/B28 - a0·b0             A31/B31 - a1·b1
  #   A29/B29 - a0·b0+a1·b1       A30/B30 - (a0+a1)·(b0+b1), then the
  #                                         middle term after the XORs
  # The four result words are stored to ${rp}[0..3] from A28, A30, A31
  # and B31 respectively.

  # Prologue: section, symbol, and the 0xFF byte mask.
  $code.=<<___;
        .text
        .global _bn_GF2m_mul_2x2
_bn_GF2m_mul_2x2:
        .asmfunc
        MVK     0xFF,$xFF
___
  mul_1x1_upper($a0,$b0);                         # a0·b0
  # Stage a1/b1 in the operand registers of the next multiplication.
  $code.=<<___;
||      MV      $b1,$B
        MV      $a1,$A
___
  mul_1x1_merged("A28","B28",$A,$B);              # a0·b0/a1·b1
  # Stage a0+a1 / b0+b1 for the third multiplication.
  $code.=<<___;
||      XOR     $b0,$b1,$B
        XOR     $a0,$a1,$A
___
  mul_1x1_merged("A31","B31",$A,$B);              # a1·b1/(a0+a1)·(b0+b1)
  $code.=<<___;
        XOR     A28,A31,A29
||      XOR     B28,B31,B29                     ; a0·b0+a1·b1
___
  mul_1x1_lower("A30","B30");                     # (a0+a1)·(b0+b1)
  # Return branch, combination of the three products, and result stores.
  $code.=<<___;
||      BNOP    B3
        XOR     A29,A30,A30
||      XOR     B29,B30,B30                     ; (a0+a1)·(b0+b1)-a0·b0-a1·b1
        XOR     B28,A30,A30
||      STW     A28,*${rp}[0]
        XOR     B30,A31,A31
||      STW     A30,*${rp}[1]
        STW     A31,*${rp}[2]
        STW     B31,*${rp}[3]
        .endasmfunc
___

  print $code;
  # Buffered write errors (e.g. a full disk) only surface at close time,
  # so a failed close must not go unnoticed.
  close STDOUT or die "error closing STDOUT: $!";