c64xplus-gf2m.pl 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. #! /usr/bin/env perl
  2. # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # February 2012
  17. #
  18. # The module implements bn_GF2m_mul_2x2 polynomial multiplication
  19. # used in bn_gf2m.c. It's kind of a low-hanging mechanical port from
  20. # C for the time being... The subroutine runs in 37 cycles, which is
  21. # 4.5x faster than compiler-generated code. Though the comparison is
  22. # totally unfair, because this module utilizes the Galois Field
  23. # Multiply instruction.
  # Send the generated assembly to the file named by the last command-line
  # argument, if one was given; without an argument the inherited STDOUT is
  # used. The three-argument open takes the name literally (it is never parsed
  # as part of the mode), and a failed open stops the script immediately with
  # the file name and system error instead of surfacing only as a close
  # failure at the very end of the script.
  24. if ($output = pop) { open STDOUT,">",$output or die "can't open $output: $!"; }
  # Register allocation. Each variable holds the NAME of a register; the names
  # are interpolated into the assembly templates appended to $code below.
  #
  # Registers in which the five arguments arrive: result pointer, then the
  # a1, a0, b1, b0 operand words.
  25. ($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8"); # argument vector
  # $Alo/$Ahi receive the low/high halfword of the first operand of a 1x1
  # multiplication; $Alox0..3/$Ahix0..3 receive the XORMPY products of that
  # halfword with bytes 0..3 of the second operand (A16-A20 and B16-B20).
  26. ($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));
  27. ($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));
  # Registers receiving the four bytes split out of the second operand
  # ($B_0 is the one produced by the AND with the 0xFF mask).
  28. ($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");
  # Operand registers used for the second and third multiplications. Note that
  # these deliberately name the SAME registers as $Alo and $B_1, so the
  # templates overwrite the operands while splitting them.
  29. ($A,$B)=($Alo,$B_1);
  # Register that is loaded with the constant 0xFF (byte mask) at function entry.
  30. $xFF="B1";
  # mul_1x1_upper(A, B)
  #
  # Appends to the global $code the FIRST half of a one-word by one-word
  # polynomial (XORMPY-based) multiplication: operand B is split into four
  # bytes ($B_0..$B_3), operand A into two halfwords ($Alo,$Ahi), and the eight
  # halfword-by-byte XORMPY products are issued into $Alox0..3 / $Ahix0..3.
  # The products are NOT combined here; that is done by the code emitted by a
  # following mul_1x1_merged or mul_1x1_lower call.
  #
  # Parameters:
  #   A, B - names of the registers holding the two operand words.
  # Side effects: appends to $code; reads the global register-name variables
  #   ($xFF, $B_*, $Alo*, $Ahi*). The return value is not meaningful.
  #
  # In the template, a line starting with "||" is issued in parallel with the
  # line before it, and ";" starts an assembler comment. All template lines are
  # emitted verbatim, so nothing inside the heredoc may be reformatted.
  31. sub mul_1x1_upper {
  # Lexical $A/$B shadow the file-level globals of the same name.
  32. my ($A,$B)=@_;
  33. $code.=<<___;
  34. EXTU $B,8,24,$B_2 ; smash $B to 4 bytes
  35. || AND $B,$xFF,$B_0
  36. || SHRU $B,24,$B_3
  37. SHRU $A,16, $Ahi ; smash $A to two halfwords
  38. || EXTU $A,16,16,$Alo
  39. XORMPY $Alo,$B_2,$Alox2 ; 16x8 bits multiplication
  40. || XORMPY $Ahi,$B_2,$Ahix2
  41. || EXTU $B,16,24,$B_1
  42. XORMPY $Alo,$B_0,$Alox0
  43. || XORMPY $Ahi,$B_0,$Ahix0
  44. XORMPY $Alo,$B_3,$Alox3
  45. || XORMPY $Ahi,$B_3,$Ahix3
  46. XORMPY $Alo,$B_1,$Alox1
  47. || XORMPY $Ahi,$B_1,$Ahix1
  48. ___
  49. }
  # mul_1x1_merged(OUTlo, OUThi, A, B)
  #
  # Appends to the global $code one interleaved step that does two things at
  # once:
  #   * the TAIL of the previous 1x1 multiplication - the partial products
  #     already sitting in $Alox0..3 / $Ahix0..3 are shifted and XORed together
  #     into the OUTlo/OUThi register pair;
  #   * the HEAD of the next 1x1 multiplication - operands A and B are split
  #     into halfwords/bytes and eight new XORMPY products are issued into the
  #     same $Alox*/$Ahix* registers.
  #
  # Parameters:
  #   OUTlo, OUThi - names of the registers receiving the low and high word of
  #                  the PREVIOUS product.
  #   A, B         - names of the registers holding the operands of the NEXT
  #                  multiplication. The callers in this file pass the globals
  #                  $A/$B, which name the same registers as $Alo/$B_1.
  # Side effects: appends to $code; the emitted code also uses A1 as a
  #   hard-coded scratch register. The return value is not meaningful.
  #
  # NOTE(review): the new $Alo x $B_0 product is written to A1 and only moved
  # into $Alox0 in the last packet; presumably this is because the previous
  # $Alox0 is still read by the XOR into OUTlo in between - confirm against the
  # instruction latencies before reordering anything.
  #
  # Every template line is emitted verbatim and the instruction order is
  # significant, so nothing inside the heredoc may be reformatted.
  50. sub mul_1x1_merged {
  # Lexical $A/$B shadow the file-level globals of the same name.
  51. my ($OUTlo,$OUThi,$A,$B)=@_;
  52. $code.=<<___;
  53. EXTU $B,8,24,$B_2 ; smash $B to 4 bytes
  54. || AND $B,$xFF,$B_0
  55. || SHRU $B,24,$B_3
  56. SHRU $A,16, $Ahi ; smash $A to two halfwords
  57. || EXTU $A,16,16,$Alo
  58. XOR $Ahix0,$Alox2,$Ahix0
  59. || MV $Ahix2,$OUThi
  60. || XORMPY $Alo,$B_2,$Alox2
  61. XORMPY $Ahi,$B_2,$Ahix2
  62. || EXTU $B,16,24,$B_1
  63. || XORMPY $Alo,$B_0,A1 ; $Alox0
  64. XOR $Ahix1,$Alox3,$Ahix1
  65. || SHL $Ahix0,16,$OUTlo
  66. || SHRU $Ahix0,16,$Ahix0
  67. XOR $Alox0,$OUTlo,$OUTlo
  68. || XOR $Ahix0,$OUThi,$OUThi
  69. || XORMPY $Ahi,$B_0,$Ahix0
  70. || XORMPY $Alo,$B_3,$Alox3
  71. || SHL $Alox1,8,$Alox1
  72. || SHL $Ahix3,8,$Ahix3
  73. XOR $Alox1,$OUTlo,$OUTlo
  74. || XOR $Ahix3,$OUThi,$OUThi
  75. || XORMPY $Ahi,$B_3,$Ahix3
  76. || SHL $Ahix1,24,$Alox1
  77. || SHRU $Ahix1,8, $Ahix1
  78. XOR $Alox1,$OUTlo,$OUTlo
  79. || XOR $Ahix1,$OUThi,$OUThi
  80. || XORMPY $Alo,$B_1,$Alox1
  81. || XORMPY $Ahi,$B_1,$Ahix1
  82. || MV A1,$Alox0
  83. ___
  84. }
  # mul_1x1_lower(OUTlo, OUThi)
  #
  # Appends to the global $code the TAIL of a 1x1 multiplication only: the
  # eight partial products left in $Alox0..3 / $Ahix0..3 by the preceding
  # mul_1x1_upper/mul_1x1_merged code are combined into a two-word result.
  # Following the emitted instructions, the result is
  #   OUTlo = Alox0 ^ (Alox1<<8) ^ ((Ahix0^Alox2)<<16) ^ ((Ahix1^Alox3)<<24)
  #   OUThi = Ahix2 ^ (Ahix3<<8) ^ ((Ahix0^Alox2)>>16) ^ ((Ahix1^Alox3)>>8)
  # These are the same combining instructions that mul_1x1_merged emits, minus
  # the operand splitting and XORMPY lines.
  #
  # Parameters:
  #   OUTlo, OUThi - names of the registers receiving the low and high word of
  #                  the product.
  # Side effects: appends to $code; the emitted code clobbers $Alox1, $Ahix0,
  #   $Ahix1 and $Ahix3. The return value is not meaningful.
  #
  # The first template line is a commented-out NOP (";" starts an assembler
  # comment) and is still emitted as text. Template lines are emitted verbatim,
  # so nothing inside the heredoc may be reformatted.
  85. sub mul_1x1_lower {
  86. my ($OUTlo,$OUThi)=@_;
  87. $code.=<<___;
  88. ;NOP
  89. XOR $Ahix0,$Alox2,$Ahix0
  90. || MV $Ahix2,$OUThi
  91. NOP
  92. XOR $Ahix1,$Alox3,$Ahix1
  93. || SHL $Ahix0,16,$OUTlo
  94. || SHRU $Ahix0,16,$Ahix0
  95. XOR $Alox0,$OUTlo,$OUTlo
  96. || XOR $Ahix0,$OUThi,$OUThi
  97. || SHL $Alox1,8,$Alox1
  98. || SHL $Ahix3,8,$Ahix3
  99. XOR $Alox1,$OUTlo,$OUTlo
  100. || XOR $Ahix3,$OUThi,$OUThi
  101. || SHL $Ahix1,24,$Alox1
  102. || SHRU $Ahix1,8, $Ahix1
  103. XOR $Alox1,$OUTlo,$OUTlo
  104. || XOR $Ahix1,$OUThi,$OUThi
  105. ___
  106. }
  # Assemble the body of bn_GF2m_mul_2x2 in $code and print it.
  #
  # The 2x2-word product is built from three 1x1 multiplications (Karatsuba):
  # a0*b0, a1*b1 and (a0+a1)*(b0+b1), where "+" is XOR. The three
  # multiplications are overlapped: mul_1x1_upper starts the first one, each
  # mul_1x1_merged finishes one while starting the next, and mul_1x1_lower
  # finishes the last.
  #
  # Each short heredoc that follows a sub call begins with a "||" line, so its
  # first instruction joins the last packet emitted by that call; the order of
  # the statements below therefore must not be changed.
  #
  # Prologue: section, symbol naming for EABI vs. older assemblers, the global
  # entry label, and loading the 0xFF byte mask into $xFF.
  107. $code.=<<___;
  108. .text
  109. .if .ASSEMBLER_VERSION<7000000
  110. .asg 0,__TI_EABI__
  111. .endif
  112. .if __TI_EABI__
  113. .asg bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
  114. .endif
  115. .global _bn_GF2m_mul_2x2
  116. _bn_GF2m_mul_2x2:
  117. .asmfunc
  118. MVK 0xFF,$xFF
  119. ___
  # Start a0*b0 directly from the argument registers.
  120. &mul_1x1_upper($a0,$b0); # a0·b0
  # Copy a1/b1 into the working operand registers $A/$B for the next step.
  121. $code.=<<___;
  122. || MV $b1,$B
  123. MV $a1,$A
  124. ___
  # Finish a0*b0 into A28 (low) / B28 (high) while starting a1*b1.
  125. &mul_1x1_merged("A28","B28",$A,$B); # a0·b0/a1·b1
  # Form the operands a0^a1 and b0^b1 for the third multiplication.
  126. $code.=<<___;
  127. || XOR $b0,$b1,$B
  128. XOR $a0,$a1,$A
  129. ___
  # Finish a1*b1 into A31 (low) / B31 (high) while starting (a0+a1)*(b0+b1).
  130. &mul_1x1_merged("A31","B31",$A,$B); # a1·b1/(a0+a1)·(b0+b1)
  # A29/B29 = low/high word of a0*b0 ^ a1*b1.
  131. $code.=<<___;
  132. XOR A28,A31,A29
  133. || XOR B28,B31,B29 ; a0·b0+a1·b1
  134. ___
  # Finish (a0+a1)*(b0+b1) into A30 (low) / B30 (high).
  135. &mul_1x1_lower("A30","B30"); # (a0+a1)·(b0+b1)
  # Epilogue: issue the return branch through B3, reduce A30/B30 to the middle
  # Karatsuba term, fold it into the two inner result words and store the four
  # result words to rp[0..3]. NOTE(review): the instructions after BNOP
  # presumably execute in the branch delay slots - confirm before reordering.
  136. $code.=<<___;
  137. || BNOP B3
  138. XOR A29,A30,A30
  139. || XOR B29,B30,B30 ; (a0+a1)·(b0+b1)-a0·b0-a1·b1
  140. XOR B28,A30,A30
  141. || STW A28,*${rp}[0]
  142. XOR B30,A31,A31
  143. || STW A30,*${rp}[1]
  144. STW A31,*${rp}[2]
  145. STW B31,*${rp}[3]
  146. .endasmfunc
  147. ___
  # Emit the accumulated assembly; the checked close reports buffered write
  # errors that would otherwise go unnoticed.
  148. print $code;
  149. close STDOUT or die "error closing STDOUT";