2
0

c64xplus-gf2m.pl 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. #! /usr/bin/env perl
  2. # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # February 2012
  17. #
  18. # The module implements bn_GF2m_mul_2x2 polynomial multiplication
  19. # used in bn_gf2m.c. It's kind of a low-hanging mechanical port from
  20. # C for the time being... The subroutine runs in 37 cycles, which is
  21. # 4.5x faster than compiler-generated code. Though comparison is
  22. # totally unfair, because this module utilizes Galois Field Multiply
  23. # instruction.
  24. while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
  25. open STDOUT,">$output";
  26. ($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8"); # argument vector
  27. ($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));
  28. ($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));
  29. ($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");
  30. ($A,$B)=($Alo,$B_1);
  31. $xFF="B1";
  32. sub mul_1x1_upper {
  33. my ($A,$B)=@_;
  34. $code.=<<___;
  35. EXTU $B,8,24,$B_2 ; smash $B to 4 bytes
  36. || AND $B,$xFF,$B_0
  37. || SHRU $B,24,$B_3
  38. SHRU $A,16, $Ahi ; smash $A to two halfwords
  39. || EXTU $A,16,16,$Alo
  40. XORMPY $Alo,$B_2,$Alox2 ; 16x8 bits multiplication
  41. || XORMPY $Ahi,$B_2,$Ahix2
  42. || EXTU $B,16,24,$B_1
  43. XORMPY $Alo,$B_0,$Alox0
  44. || XORMPY $Ahi,$B_0,$Ahix0
  45. XORMPY $Alo,$B_3,$Alox3
  46. || XORMPY $Ahi,$B_3,$Ahix3
  47. XORMPY $Alo,$B_1,$Alox1
  48. || XORMPY $Ahi,$B_1,$Ahix1
  49. ___
  50. }
  51. sub mul_1x1_merged {
  52. my ($OUTlo,$OUThi,$A,$B)=@_;
  53. $code.=<<___;
  54. EXTU $B,8,24,$B_2 ; smash $B to 4 bytes
  55. || AND $B,$xFF,$B_0
  56. || SHRU $B,24,$B_3
  57. SHRU $A,16, $Ahi ; smash $A to two halfwords
  58. || EXTU $A,16,16,$Alo
  59. XOR $Ahix0,$Alox2,$Ahix0
  60. || MV $Ahix2,$OUThi
  61. || XORMPY $Alo,$B_2,$Alox2
  62. XORMPY $Ahi,$B_2,$Ahix2
  63. || EXTU $B,16,24,$B_1
  64. || XORMPY $Alo,$B_0,A1 ; $Alox0
  65. XOR $Ahix1,$Alox3,$Ahix1
  66. || SHL $Ahix0,16,$OUTlo
  67. || SHRU $Ahix0,16,$Ahix0
  68. XOR $Alox0,$OUTlo,$OUTlo
  69. || XOR $Ahix0,$OUThi,$OUThi
  70. || XORMPY $Ahi,$B_0,$Ahix0
  71. || XORMPY $Alo,$B_3,$Alox3
  72. || SHL $Alox1,8,$Alox1
  73. || SHL $Ahix3,8,$Ahix3
  74. XOR $Alox1,$OUTlo,$OUTlo
  75. || XOR $Ahix3,$OUThi,$OUThi
  76. || XORMPY $Ahi,$B_3,$Ahix3
  77. || SHL $Ahix1,24,$Alox1
  78. || SHRU $Ahix1,8, $Ahix1
  79. XOR $Alox1,$OUTlo,$OUTlo
  80. || XOR $Ahix1,$OUThi,$OUThi
  81. || XORMPY $Alo,$B_1,$Alox1
  82. || XORMPY $Ahi,$B_1,$Ahix1
  83. || MV A1,$Alox0
  84. ___
  85. }
  86. sub mul_1x1_lower {
  87. my ($OUTlo,$OUThi)=@_;
  88. $code.=<<___;
  89. ;NOP
  90. XOR $Ahix0,$Alox2,$Ahix0
  91. || MV $Ahix2,$OUThi
  92. NOP
  93. XOR $Ahix1,$Alox3,$Ahix1
  94. || SHL $Ahix0,16,$OUTlo
  95. || SHRU $Ahix0,16,$Ahix0
  96. XOR $Alox0,$OUTlo,$OUTlo
  97. || XOR $Ahix0,$OUThi,$OUThi
  98. || SHL $Alox1,8,$Alox1
  99. || SHL $Ahix3,8,$Ahix3
  100. XOR $Alox1,$OUTlo,$OUTlo
  101. || XOR $Ahix3,$OUThi,$OUThi
  102. || SHL $Ahix1,24,$Alox1
  103. || SHRU $Ahix1,8, $Ahix1
  104. XOR $Alox1,$OUTlo,$OUTlo
  105. || XOR $Ahix1,$OUThi,$OUThi
  106. ___
  107. }
  108. $code.=<<___;
  109. .text
  110. .if .ASSEMBLER_VERSION<7000000
  111. .asg 0,__TI_EABI__
  112. .endif
  113. .if __TI_EABI__
  114. .asg bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
  115. .endif
  116. .global _bn_GF2m_mul_2x2
  117. _bn_GF2m_mul_2x2:
  118. .asmfunc
  119. MVK 0xFF,$xFF
  120. ___
  121. &mul_1x1_upper($a0,$b0); # a0·b0
  122. $code.=<<___;
  123. || MV $b1,$B
  124. MV $a1,$A
  125. ___
  126. &mul_1x1_merged("A28","B28",$A,$B); # a0·b0/a1·b1
  127. $code.=<<___;
  128. || XOR $b0,$b1,$B
  129. XOR $a0,$a1,$A
  130. ___
  131. &mul_1x1_merged("A31","B31",$A,$B); # a1·b1/(a0+a1)·(b0+b1)
  132. $code.=<<___;
  133. XOR A28,A31,A29
  134. || XOR B28,B31,B29 ; a0·b0+a1·b1
  135. ___
  136. &mul_1x1_lower("A30","B30"); # (a0+a1)·(b0+b1)
  137. $code.=<<___;
  138. || BNOP B3
  139. XOR A29,A30,A30
  140. || XOR B29,B30,B30 ; (a0+a1)·(b0+b1)-a0·b0-a1·b1
  141. XOR B28,A30,A30
  142. || STW A28,*${rp}[0]
  143. XOR B30,A31,A31
  144. || STW A30,*${rp}[1]
  145. STW A31,*${rp}[2]
  146. STW B31,*${rp}[3]
  147. .endasmfunc
  148. ___
  149. print $code;
  150. close STDOUT;