/* memset-power.s: memset for PowerPC, written in Plan 9-style assembler syntax */

  /*
   * memset(p, c, n): fill n bytes starting at p with the low byte of c.
   *
   * In:   R3 = p (saved to its argument slot on entry),
   *       c and n read from the argument frame at c+4(FP) and n+8(FP).
   * Out:  R3 = p, reloaded from 0(FP) just before RETURN.
   * Uses: R4 (remaining count), R5..R8 (16 copies of the fill byte),
   *       R9, R10 (scratch), XER (byte count for STSW), CTR (loop count).
   *
   * BDNZ is defined below as BC 16,0, i.e. decrement CTR and branch
   * while CTR is non-zero.
   *
   * NOTE(review): the leading "N." on every line looks like listing
   * numbers carried over from a web view rather than assembler syntax;
   * presumably they must be stripped before this will assemble.
   */
  1. TEXT memset(SB),$0
  2. #define BDNZ BC 16,0,
  3. MOVW R3, p+0(FP) /* R3 is pointer; stash it in its argument slot for the reload at ret */
  4. /*
  5. * performance:
  6. * about 100mbytes/sec (8k blocks) on a 603/105 without L2 cache
  7. * drops to 40mbytes/sec (10k blocks) and 28mbytes/sec with 32k blocks
  8. */
  9. MOVW n+8(FP), R4 /* R4 is count */
  10. CMP R4, $0 /* nothing to store when count <= 0 */
  /* NOTE(review): CMP/BLE looks like a signed test, so a count with the top bit set would also return without storing -- confirm that is intended */
  11. BLE ret
  12. MOVW c+4(FP), R5 /* R5 is char */
  13. /*
  14. * create 16 copies of c in R5 .. R8
  15. */
  16. RLWNM $0, R5, $0xff, R5 /* keep only the low byte of c */
  17. RLWMI $8, R5, $0xff00, R5 /* insert a copy of that byte under mask 0xff00 */
  18. RLWMI $16, R5, $0xffff0000, R5 /* copy the low halfword into the high halfword: R5 = 4 copies */
  19. MOVW R5, R6 /* R6..R8 = R5, giving 16 fill bytes in R5..R8 */
  20. MOVW R5, R7
  21. MOVW R5, R8
  22. /*
  23. * let STSW do the work for 16 characters or less; aligned and unaligned
  24. */
  25. CMP R4, $16
  26. BLE out /* short fill: the single STSW at out covers it */
  27. /*
  28. * store enough bytes to align pointer
  29. */
  30. ANDCC $7,R3, R9 /* R9 = p & 7, condition codes set for the BEQ */
  31. BEQ l2 /* already on an 8-byte boundary */
  32. SUBC R9, $8, R9 /* R9 = 8 - R9: bytes up to the next 8-byte boundary */
  33. MOVW R9, XER /* STSW takes its byte count from XER */
  34. STSW R5, (R3) /* store those R9 bytes from R5.. at p */
  35. ADD R9, R3 /* step the pointer past the bytes just stored */
  36. SUB R9, R4 /* and take them off the count */
  37. /*
  38. * store 16 at a time while there's room
  39. * STSW was used here originally, but it's `completion serialised'
  40. */
  41. l2:
  42. SRAWCC $4, R4, R9 /* R9 = count >> 4 = number of whole 16-byte chunks */
  43. BLE out /* no whole chunk left: finish with the STSW at out */
  44. MOVW R9, CTR /* loop count consumed by BDNZ */
  45. l3:
  46. MOVW R5, 0(R3) /* each pass stores R5..R8 as four words (16 bytes) */
  47. ADD $8, R3, R10 /* R10 = R3 + 8, addresses the second half of the chunk */
  48. MOVW R6, 4(R3)
  49. MOVW R7, 0(R10)
  50. ADD $8, R10, R3 /* R3 = R10 + 8: pointer is now 16 bytes ahead for the next pass */
  51. MOVW R8, 4(R10)
  52. BDNZ l3 /* repeat until CTR reaches zero */
  53. RLWNMCC $0, R4, $15, R4 /* residue: count & 15 bytes still to store */
  54. BEQ ret /* nothing left over */
  55. /*
  56. * store up to 16 bytes from R5 .. R8; aligned and unaligned
  57. */
  58. out:
  59. MOVW R4, XER /* byte count for the final STSW */
  60. STSW R5, (R3)
  61. ret:
  62. MOVW 0(FP), R3 /* reload the pointer saved on entry */
  63. RETURN
  64. END