/*
 * Copyright (c) 2013-2024, Arm Limited and Contributors. All rights reserved.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include <arch.h>
#include <asm_macros.S>
#include <assert_macros.S>
#include <common/bl_common.h>
#include <lib/xlat_tables/xlat_tables_defs.h>

        .globl  smc

        .globl  zero_normalmem
        .globl  zeromem
        .globl  memcpy16

        .globl  disable_mmu_el1
        .globl  disable_mmu_el3
        .globl  disable_mmu_icache_el1
        .globl  disable_mmu_icache_el3
        .globl  fixup_gdt_reloc

#if SUPPORT_VFP
        .globl  enable_vfp
#endif
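
/* -----------------------------------------------------------------------
 * Issue an SMC. The function identifier and arguments are expected to
 * already be in the registers mandated by the SMC Calling Convention
 * (x0-x7); any results come back in the same registers.
 * -----------------------------------------------------------------------
 */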
func smc
        smc     #0
endfunc smc

/* -----------------------------------------------------------------------
 * void zero_normalmem(void *mem, unsigned int length);
 *
 * Initialise a region in normal memory to 0. This function complies with the
 * AAPCS and can be called from C code.
 *
 * NOTE: The MMU must be enabled when using this function, as it can only
 * operate on normal memory. It is intended to be used mainly from C code,
 * where the MMU is usually enabled.
 * -----------------------------------------------------------------------
 */
.equ    zero_normalmem, zeromem_dczva

/* -----------------------------------------------------------------------
 * void zeromem(void *mem, unsigned int length);
 *
 * Initialise a region of device memory to 0. This function complies with the
 * AAPCS and can be called from C code.
 *
 * NOTE: When data caches and MMU are enabled, zero_normalmem can usually be
 * used instead for faster zeroing.
 * -----------------------------------------------------------------------
 */
func zeromem
        /* x2 is the address past the last zeroed address */
        add     x2, x0, x1
        /*
         * Use the fallback path, which does not rely on the DC ZVA
         * instruction and therefore does not require the MMU to be enabled.
         */
        b       .Lzeromem_dczva_fallback_entry
endfunc zeromem
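
/*
 * Illustrative C-level usage of the two zeroing helpers above (a sketch
 * only; the buffer name and size are hypothetical, not part of this file):
 *
 *     #include <stdint.h>
 *
 *     void zero_normalmem(void *mem, unsigned int length);
 *     void zeromem(void *mem, unsigned int length);
 *
 *     static uint8_t scratch[256];
 *
 *     void clear_scratch(void)
 *     {
 *             zero_normalmem(scratch, sizeof(scratch)); // MMU and D-cache on
 *     }
 *
 *     void clear_scratch_early(void)
 *     {
 *             zeromem(scratch, sizeof(scratch));        // works with MMU off
 *     }
 */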

/* -----------------------------------------------------------------------
 * void zeromem_dczva(void *mem, unsigned int length);
 *
 * Fill a region of normal memory of size "length" in bytes with null bytes.
 * The MMU must be enabled and the memory must be of Normal type. This is
 * because this function internally uses the DC ZVA instruction, which
 * generates an Alignment fault if used on any type of Device memory (see
 * section D3.4.9 of the ARMv8 ARM, issue k). When the MMU is disabled, all
 * memory behaves like Device-nGnRnE memory (see section D4.2.8), hence the
 * requirement on the MMU being enabled.
 *
 * NOTE: The code assumes that the block size as defined in the DCZID_EL0
 * register is at least 16 bytes.
 * -----------------------------------------------------------------------
 */
func zeromem_dczva

        /*
         * The function consists of a series of loops that zero memory one
         * byte at a time, 16 bytes at a time, or, using the DC ZVA
         * instruction, one aligned block at a time (the block size is
         * assumed to be at least 16 bytes). In the case where the DC ZVA
         * instruction cannot be used, or if the first 16-byte loop would
         * overflow, there is a fallback path that does not use DC ZVA.
         * Note: The fallback path is also used by the zeromem function,
         * which branches to it directly.
         *
         *              +---------+   zeromem_dczva
         *              |  entry  |
         *              +----+----+
         *                   |
         *                   v
         *              +---------+
         *              | checks  |>o-------+ (If any check fails, fallback)
         *              +----+----+         |
         *                   |              |---------------+
         *                   v              | Fallback path |
         *            +------+------+       |---------------+
         *            | 1 byte loop |       |
         *            +------+------+ .Lzeromem_dczva_initial_1byte_aligned_end
         *                   |              |
         *                   v              |
         *           +-------+-------+      |
         *           | 16 bytes loop |      |
         *           +-------+-------+      |
         *                   |              |
         *                   v              |
         *            +------+------+ .Lzeromem_dczva_blocksize_aligned
         *            | DC ZVA loop |       |
         *            +------+------+       |
         *       +--------+  |              |
         *       |        |  |              |
         *       |        v  v              |
         *       |  +-------+-------+ .Lzeromem_dczva_final_16bytes_aligned
         *       |  | 16 bytes loop |       |
         *       |  +-------+-------+       |
         *       |          |               |
         *       |          v               |
         *       |   +------+------+ .Lzeromem_dczva_final_1byte_aligned
         *       |   | 1 byte loop |        |
         *       |   +-------------+        |
         *       |          |               |
         *       |          v               |
         *       |      +---+--+            |
         *       |      | exit |            |
         *       |      +------+            |
         *       |                          |
         *       |          +---------------+   +------------------+ zeromem
         *       |          |  +----------------| zeromem function |
         *       |          |  |                +------------------+
         *       |          v  v
         *       |  +-------------+ .Lzeromem_dczva_fallback_entry
         *       |  | 1 byte loop |
         *       |  +------+------+
         *       |         |
         *       +---------+
         */

        /*
         * Readable names for the registers.
         *
         * Registers x0, x1 and x2 are also set by zeromem, which branches
         * into the fallback path directly, so cursor, length and
         * stop_address must not be retargeted to other registers.
         */
        cursor       .req x0 /* Start address and then current address */
        length       .req x1 /* Length in bytes of the region to zero out */
        /* Reusing x1 as length is never used after block_mask is set */
        block_mask   .req x1 /* Bitmask of the block size read in DCZID_EL0 */
        stop_address .req x2 /* Address past the last zeroed byte */
        block_size   .req x3 /* Size of a block in bytes as read in DCZID_EL0 */
        tmp1         .req x4
        tmp2         .req x5

#if ENABLE_ASSERTIONS
        /*
         * Check the M bit (MMU enabled) of the current SCTLR_EL(1|3)
         * register value and panic if the MMU is disabled.
         */
#if defined(IMAGE_BL1) || defined(IMAGE_BL31) || (defined(IMAGE_BL2) && \
        BL2_RUNS_AT_EL3)
        mrs     tmp1, sctlr_el3
#else
        mrs     tmp1, sctlr_el1
#endif

        tst     tmp1, #SCTLR_M_BIT
        ASM_ASSERT(ne)
#endif /* ENABLE_ASSERTIONS */

        /* stop_address is the address past the last to zero */
        add     stop_address, cursor, length

        /*
         * Get block_size = (log2(<block size>) >> 2) (see encoding of
         * dczid_el0 reg)
         */
        mrs     block_size, dczid_el0

        /*
         * Select the 4 lowest bits and convert the extracted log2(<block
         * size in words>) to <block size in bytes>
         */
        ubfx    block_size, block_size, #0, #4
        mov     tmp2, #(1 << 2)
        lsl     block_size, tmp2, block_size
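
        /*
         * Worked example (illustration only): DCZID_EL0.BS holds
         * log2(block size in words), so the block size in bytes is
         * 2^(BS + 2). A typical BS value of 4 gives
         * block_size = 4 << 4 = 64 bytes.
         */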

#if ENABLE_ASSERTIONS
        /*
         * The block size is assumed to be at least 16 bytes, to avoid
         * manual realignment of the cursor at the end of the DC ZVA loop.
         */
        cmp     block_size, #16
        ASM_ASSERT(hs)
#endif
        /*
         * It is not worth doing all the setup for a region smaller than a
         * block, and this check protects against zeroing a whole block when
         * the area to zero is smaller than that. Also, as the block size is
         * assumed to be at least 16 bytes, it protects the initial aligning
         * loops from trying to zero 16 bytes when length is less than 16.
         */
        cmp     length, block_size
        b.lo    .Lzeromem_dczva_fallback_entry

        /*
         * Calculate the bitmask of the block alignment. It will never
         * underflow as the block size is between 4 bytes and 2kB.
         * block_mask = block_size - 1
         */
        sub     block_mask, block_size, #1

        /*
         * The length alias must not be used after this point unless it is
         * redefined as a register other than block_mask's.
         */
        .unreq  length

        /*
         * If the start address is already aligned to the zero block size,
         * go straight to the cache zeroing loop. This is safe because at
         * this point, the length cannot be smaller than a block size.
         */
        tst     cursor, block_mask
        b.eq    .Lzeromem_dczva_blocksize_aligned

        /*
         * Calculate the first block-size-aligned address. It is assumed
         * that the zero block size is at least 16 bytes. This address is
         * the last address of this initial loop.
         */
        orr     tmp1, cursor, block_mask
        add     tmp1, tmp1, #1
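        /*
         * Illustration: (cursor | block_mask) + 1 rounds the cursor up to
         * the next block boundary. E.g. with cursor = 0x1234 and a 64-byte
         * block (block_mask = 0x3f): (0x1234 | 0x3f) + 1 = 0x1240.
         */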
        /*
         * If the addition overflows, skip the cache zeroing loops. This is
         * quite unlikely however.
         */
        cbz     tmp1, .Lzeromem_dczva_fallback_entry

        /*
         * If the first block-size-aligned address is past the last address,
         * fall back to the simpler code.
         */
        cmp     tmp1, stop_address
        b.hi    .Lzeromem_dczva_fallback_entry

        /*
         * If the start address is already aligned to 16 bytes, skip this
         * loop. It is safe to do this because tmp1 (the stop address of the
         * initial 16-byte loop) will never be greater than the final stop
         * address.
         */
        tst     cursor, #0xf
        b.eq    .Lzeromem_dczva_initial_1byte_aligned_end

        /* Calculate the next address aligned to 16 bytes */
        orr     tmp2, cursor, #0xf
        add     tmp2, tmp2, #1
        /* If it overflows, fall back to the simple path (unlikely) */
        cbz     tmp2, .Lzeromem_dczva_fallback_entry
        /*
         * The next aligned address cannot be after the stop address because
         * the length cannot be smaller than 16 at this point.
         */

        /* First loop: zero byte per byte */
1:
        strb    wzr, [cursor], #1
        cmp     cursor, tmp2
        b.ne    1b
.Lzeromem_dczva_initial_1byte_aligned_end:

        /*
         * Second loop: we need to zero 16 bytes at a time from cursor to
         * tmp1 before being able to use the code that deals with
         * block-size-aligned addresses.
         */
        cmp     cursor, tmp1
        b.hs    2f
1:
        stp     xzr, xzr, [cursor], #16
        cmp     cursor, tmp1
        b.lo    1b
2:

        /*
         * Third loop: zero a block at a time using the DC ZVA cache block
         * zeroing instruction.
         */
.Lzeromem_dczva_blocksize_aligned:
        /*
         * Calculate the last block-size-aligned address. If the result
         * equals the start address, the loop will exit immediately.
         */
        bic     tmp1, stop_address, block_mask

        cmp     cursor, tmp1
        b.hs    2f
1:
        /* Zero the block containing the cursor */
        dc      zva, cursor
        /* Increment the cursor by the size of a block */
        add     cursor, cursor, block_size
        cmp     cursor, tmp1
        b.lo    1b
2:

        /*
         * Fourth loop: zero 16 bytes at a time, then byte per byte for the
         * remaining area.
         */
.Lzeromem_dczva_final_16bytes_aligned:
        /*
         * Calculate the last 16-byte aligned address. The block size is
         * assumed to never be smaller than 16 bytes, so the current cursor
         * is aligned to at least a 16-byte boundary.
         */
        bic     tmp1, stop_address, #15

        cmp     cursor, tmp1
        b.hs    2f
1:
        stp     xzr, xzr, [cursor], #16
        cmp     cursor, tmp1
        b.lo    1b
2:

        /* Fifth and final loop: zero byte per byte */
.Lzeromem_dczva_final_1byte_aligned:
        cmp     cursor, stop_address
        b.eq    2f
1:
        strb    wzr, [cursor], #1
        cmp     cursor, stop_address
        b.ne    1b
2:
        ret

        /* Fallback for unaligned start addresses */
.Lzeromem_dczva_fallback_entry:
        /*
         * If the start address is already aligned to 16 bytes, skip this
         * loop.
         */
        tst     cursor, #0xf
        b.eq    .Lzeromem_dczva_final_16bytes_aligned

        /* Calculate the next address aligned to 16 bytes */
        orr     tmp1, cursor, #15
        add     tmp1, tmp1, #1
        /* If it overflows, fall back to byte-per-byte zeroing */
        cbz     tmp1, .Lzeromem_dczva_final_1byte_aligned
        /* If the next aligned address is after the stop address, fall back */
        cmp     tmp1, stop_address
        b.hs    .Lzeromem_dczva_final_1byte_aligned

        /* Fallback entry loop: zero byte per byte */
1:
        strb    wzr, [cursor], #1
        cmp     cursor, tmp1
        b.ne    1b

        b       .Lzeromem_dczva_final_16bytes_aligned

        .unreq  cursor
        /*
         * length is already unreq'ed to reuse the register for another
         * variable.
         */
        .unreq  stop_address
        .unreq  block_size
        .unreq  block_mask
        .unreq  tmp1
        .unreq  tmp2
endfunc zeromem_dczva

/* --------------------------------------------------------------------------
 * void memcpy16(void *dest, const void *src, unsigned int length)
 *
 * Copy length bytes from memory area src to memory area dest.
 * The memory areas should not overlap.
 * Destination and source addresses must be 16-byte aligned.
 * --------------------------------------------------------------------------
 */
func memcpy16
#if ENABLE_ASSERTIONS
        orr     x3, x0, x1
        tst     x3, #0xf
        ASM_ASSERT(eq)
#endif
        /* copy 16 bytes at a time */
m_loop16:
        cmp     x2, #16
        b.lo    m_loop1
        ldp     x3, x4, [x1], #16
        stp     x3, x4, [x0], #16
        sub     x2, x2, #16
        b       m_loop16
        /* copy byte per byte */
m_loop1:
        cbz     x2, m_end
        ldrb    w3, [x1], #1
        strb    w3, [x0], #1
        subs    x2, x2, #1
        b.ne    m_loop1
m_end:
        ret
endfunc memcpy16
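
/*
 * Illustrative C-level usage of memcpy16 (a sketch only; the source and
 * destination buffers are hypothetical). Both pointers must be 16-byte
 * aligned, which is asserted above when ENABLE_ASSERTIONS is set:
 *
 *     #include <stdint.h>
 *
 *     void memcpy16(void *dest, const void *src, unsigned int length);
 *
 *     static uint8_t src_buf[64] __attribute__((aligned(16)));
 *     static uint8_t dst_buf[64] __attribute__((aligned(16)));
 *
 *     void copy_buf(void)
 *     {
 *             memcpy16(dst_buf, src_buf, sizeof(src_buf));
 *     }
 */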

/* ---------------------------------------------------------------------------
 * Disable the MMU at EL3
 * ---------------------------------------------------------------------------
 */
func disable_mmu_el3
        mov     x1, #(SCTLR_M_BIT | SCTLR_C_BIT)
do_disable_mmu_el3:
        mrs     x0, sctlr_el3
        bic     x0, x0, x1
        msr     sctlr_el3, x0
        isb     /* ensure MMU is off */
        dsb     sy
        ret
endfunc disable_mmu_el3
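
/*
 * Variant that also disables the instruction cache: it additionally clears
 * SCTLR_EL3.I and then reuses the tail of disable_mmu_el3 above.
 */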

func disable_mmu_icache_el3
        mov     x1, #(SCTLR_M_BIT | SCTLR_C_BIT | SCTLR_I_BIT)
        b       do_disable_mmu_el3
endfunc disable_mmu_icache_el3

/* ---------------------------------------------------------------------------
 * Disable the MMU at EL1
 * ---------------------------------------------------------------------------
 */
func disable_mmu_el1
        mov     x1, #(SCTLR_M_BIT | SCTLR_C_BIT)
do_disable_mmu_el1:
        mrs     x0, sctlr_el1
        bic     x0, x0, x1
        msr     sctlr_el1, x0
        isb     /* ensure MMU is off */
        dsb     sy
        ret
endfunc disable_mmu_el1

func disable_mmu_icache_el1
        mov     x1, #(SCTLR_M_BIT | SCTLR_C_BIT | SCTLR_I_BIT)
        b       do_disable_mmu_el1
endfunc disable_mmu_icache_el1

/* ---------------------------------------------------------------------------
 * Enable the use of VFP at EL3
 * ---------------------------------------------------------------------------
 */
#if SUPPORT_VFP
func enable_vfp
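        /*
         * Set CPACR_EL1.FPEN so that FP/SIMD accesses are not trapped at
         * EL1/EL0, and clear CPTR_EL3.TFP so that they are not trapped to
         * EL3 either.
         */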
        mrs     x0, cpacr_el1
        orr     x0, x0, #CPACR_VFP_BITS
        msr     cpacr_el1, x0
        mrs     x0, cptr_el3
        mov     x1, #AARCH64_CPTR_TFP
        bic     x0, x0, x1
        msr     cptr_el3, x0
        isb
        ret
endfunc enable_vfp
#endif

/* ---------------------------------------------------------------------------
 * Helper to fix up the Global Offset Table (GOT) and dynamic relocations
 * (.rela.dyn) at runtime.
 *
 * This function is meant to be used when the firmware is compiled with -fpie
 * and linked with -pie options. We rely on the linker script exporting
 * appropriate markers for the start and end of the sections. For the GOT, we
 * expect __GOT_START__ and __GOT_END__. Similarly for .rela.dyn, we expect
 * __RELA_START__ and __RELA_END__.
 *
 * The function takes the limits of the memory to apply fixups to as
 * arguments (which are usually the limits of the relocatable BL image).
 * x0 - the start of the fixup region
 * x1 - the limit of the fixup region
 * These addresses have to be 4KB page aligned.
 * ---------------------------------------------------------------------------
 */

/* Relocation codes */
#define R_AARCH64_NONE          0
#define R_AARCH64_RELATIVE      1027

func fixup_gdt_reloc
        mov     x6, x0
        mov     x7, x1

#if ENABLE_ASSERTIONS
        /* Test if the limits are 4KB aligned */
        orr     x0, x0, x1
        tst     x0, #(PAGE_SIZE_MASK)
        ASM_ASSERT(eq)
#endif
        /*
         * Calculate the offset based on the return address in x30.
         * Assume that this function is called within a page at the start of
         * the fixup region.
         */
        and     x2, x30, #~(PAGE_SIZE_MASK)
        subs    x0, x2, x6      /* Diff(S) = Current Address - Compiled Address */
        b.eq    3f              /* Diff(S) = 0. No relocation needed */

        adrp    x1, __GOT_START__
        add     x1, x1, :lo12:__GOT_START__
        adrp    x2, __GOT_END__
        add     x2, x2, :lo12:__GOT_END__

        /*
         * The GOT is an array of 64-bit addresses which must be fixed up as
         * new_addr = old_addr + Diff(S),
         * where new_addr is the address the binary is currently executing
         * from and old_addr is the address at compile time.
         */
1:      ldr     x3, [x1]

        /* Skip adding offset if address is < lower limit */
        cmp     x3, x6
        b.lo    2f

        /* Skip adding offset if address is > upper limit */
        cmp     x3, x7
        b.hi    2f
        add     x3, x3, x0
        str     x3, [x1]

2:      add     x1, x1, #8
        cmp     x1, x2
        b.lo    1b

        /* Starting dynamic relocations. Use adrp/add to get RELA_START and END */
3:      adrp    x1, __RELA_START__
        add     x1, x1, :lo12:__RELA_START__
        adrp    x2, __RELA_END__
        add     x2, x2, :lo12:__RELA_END__

        /*
         * According to the ELF-64 specification, the RELA data structure is
         * as follows:
         *      typedef struct {
         *              Elf64_Addr r_offset;
         *              Elf64_Xword r_info;
         *              Elf64_Sxword r_addend;
         *      } Elf64_Rela;
         *
         * r_offset is the address of the reference.
         * r_info is the symbol index and type of relocation (in this case
         * code 1027, which corresponds to R_AARCH64_RELATIVE).
         * r_addend is the constant part of the expression.
         *
         * The size of the Elf64_Rela structure is 24 bytes.
         */
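        /*
         * Illustration of the fixup applied below for each
         * R_AARCH64_RELATIVE entry whose r_addend lies inside the image:
         *
         *      *(Diff(S) + r_offset) = Diff(S) + r_addend
         *
         * i.e. the word at the relocated r_offset is rewritten to point at
         * the relocated target.
         */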
        /* Skip R_AARCH64_NONE entries, which have code 0 */
1:      ldr     x3, [x1, #8]
        cbz     x3, 2f

#if ENABLE_ASSERTIONS
        /* Assert that the relocation type is R_AARCH64_RELATIVE */
        cmp     x3, #R_AARCH64_RELATIVE
        ASM_ASSERT(eq)
#endif
        ldr     x3, [x1]        /* r_offset */
        add     x3, x0, x3
        ldr     x4, [x1, #16]   /* r_addend */

        /* Skip adding offset if r_addend is < lower limit */
        cmp     x4, x6
        b.lo    2f

        /* Skip adding offset if r_addend is > upper limit */
        cmp     x4, x7
        b.hi    2f

        add     x4, x0, x4      /* Diff(S) + r_addend */
        str     x4, [x3]

2:      add     x1, x1, #24
        cmp     x1, x2
        b.lo    1b
        ret
endfunc fixup_gdt_reloc