mmu.c

#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"

#define DATASEGM(p)	{ 0xFFFF, SEGG|SEGB|(0xF<<16)|SEGP|SEGPL(p)|SEGDATA|SEGW }
#define EXECSEGM(p)	{ 0xFFFF, SEGG|SEGD|(0xF<<16)|SEGP|SEGPL(p)|SEGEXEC|SEGR }
#define TSSSEGM(b,p)	{ ((b)<<16)|sizeof(Tss),\
			  ((b)&0xFF000000)|(((b)>>16)&0xFF)|SEGTSS|SEGPL(p)|SEGP }

Segdesc gdt[NGDT] =
{
[NULLSEG]	{ 0, 0},	/* null descriptor */
[KDSEG]		DATASEGM(0),	/* kernel data/stack */
[KESEG]		EXECSEGM(0),	/* kernel code */
[UDSEG]		DATASEGM(3),	/* user data/stack */
[UESEG]		EXECSEGM(3),	/* user code */
[TSSSEG]	TSSSEGM(0,0),	/* tss segment */
};

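/*
 * Point the TSS at the given kernel stack (used on entry from
 * user mode) and page directory, then load CR3 to switch to
 * that page directory.
 */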
static void
taskswitch(ulong pdb, ulong stack)
{
	Tss *tss;

	tss = m->tss;
	tss->ss0 = KDSEL;
	tss->esp0 = stack;
	tss->ss1 = KDSEL;
	tss->esp1 = stack;
	tss->ss2 = KDSEL;
	tss->esp2 = stack;
	tss->cr3 = pdb;
	putcr3(pdb);
}

/*
 * On processors that support it, we set the PTEGLOBAL bit in
 * page table and page directory entries that map kernel memory.
 * Doing this tells the processor not to bother flushing them
 * from the TLB when doing the TLB flush associated with a
 * context switch (write to CR3). Since kernel memory mappings
 * are never removed, this is safe. (If we ever remove kernel memory
 * mappings, we can do a full flush by turning off the PGE bit in CR4,
 * writing to CR3, and then turning the PGE bit back on.)
 *
 * See also mmukmap below.
 *
 * Processor support for the PTEGLOBAL bit is enabled in devarch.c.
 */
static void
memglobal(void)
{
	int i, j;
	ulong *pde, *pte;

	/* only need to do this once, on bootstrap processor */
	if(m->machno != 0)
		return;

	if(!m->havepge)
		return;

	pde = m->pdb;
	for(i=512; i<1024; i++){	/* 512: start at entry for virtual 0x80000000 */
		if(pde[i] & PTEVALID){
			pde[i] |= PTEGLOBAL;
			if(!(pde[i] & PTESIZE)){
				pte = KADDR(pde[i]&~(BY2PG-1));
				for(j=0; j<1024; j++)
					if(pte[j] & PTEVALID)
						pte[j] |= PTEGLOBAL;
			}
		}
	}
}

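/*
 * A minimal sketch of the full flush described above, for
 * illustration only (nothing calls it, and the name is ours):
 * clearing CR4.PGE makes global entries flushable, the CR3 reload
 * flushes the whole TLB, and restoring CR4.PGE re-enables global
 * pages. Assumes the getcr3/getcr4/putcr3/putcr4 accessors from l.s.
 */
static void
pgeflushall(void)
{
	ulong cr4;

	cr4 = getcr4();
	putcr4(cr4 & ~0x80);	/* clear CR4.PGE (bit 7) */
	putcr3(getcr3());	/* reload CR3: flush the TLB, global entries included */
	putcr4(cr4);		/* restore CR4.PGE */
}
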
void
mmuinit(void)
{
	ulong x, *p;
	ushort ptr[3];

	memglobal();

	m->tss = malloc(sizeof(Tss));
	memset(m->tss, 0, sizeof(Tss));
	m->tss->iomap = 0xDFFF<<16;

	/*
	 * We used to keep the GDT in the Mach structure, but it
	 * turns out that that slows down access to the rest of the
	 * page. Since the Mach structure is accessed quite often,
	 * it pays off anywhere from a factor of 1.25 to 2 on real
	 * hardware to separate them (the AMDs are more sensitive
	 * than Intels in this regard). Under VMware it pays off
	 * a factor of about 10 to 100.
	 */
	memmove(m->gdt, gdt, sizeof gdt);
	x = (ulong)m->tss;
	m->gdt[TSSSEG].d0 = (x<<16)|sizeof(Tss);
	m->gdt[TSSSEG].d1 = (x&0xFF000000)|((x>>16)&0xFF)|SEGTSS|SEGPL(0)|SEGP;

	ptr[0] = sizeof(gdt)-1;
	x = (ulong)m->gdt;
	ptr[1] = x & 0xFFFF;
	ptr[2] = (x>>16) & 0xFFFF;
	lgdt(ptr);

	ptr[0] = sizeof(Segdesc)*256-1;
	x = IDTADDR;
	ptr[1] = x & 0xFFFF;
	ptr[2] = (x>>16) & 0xFFFF;
	lidt(ptr);

	/* make kernel text unwritable */
	for(x = KTZERO; x < (ulong)etext; x += BY2PG){
		p = mmuwalk(m->pdb, x, 2, 0);
		if(p == nil)
			panic("mmuinit");
		*p &= ~PTEWRITE;
	}

	taskswitch(PADDR(m->pdb), (ulong)m + BY2PG);
	ltr(TSSSEL);
}

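/*
 * Throw away the current process's user mappings: flag them
 * stale and call mmuswitch() to detach the page-table pages
 * and reload CR3.
 */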
void
flushmmu(void)
{
	int s;

	s = splhi();
	up->newtlb = 1;
	mmuswitch(up);
	splx(s);
}

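/*
 * Clear the user page-directory entries for every page-table page
 * on proc->mmuused and move those pages onto proc->mmufree.
 */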
static void
mmuptefree(Proc* proc)
{
	ulong *pdb;
	Page **last, *page;

	if(proc->mmupdb && proc->mmuused){
		pdb = (ulong*)proc->mmupdb->va;
		last = &proc->mmuused;
		for(page = *last; page; page = page->next){
			pdb[page->daddr] = 0;
			last = &page->next;
		}
		*last = proc->mmufree;
		proc->mmufree = proc->mmuused;
		proc->mmuused = 0;
	}
}

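/*
 * Switch to proc's address space: discard stale page tables if
 * newtlb is set, copy the MACHADDR mapping into proc's pdb and use
 * it (or fall back to the prototype m->pdb if the process has none),
 * and point the TSS at proc's kernel stack.
 */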
void
mmuswitch(Proc* proc)
{
	ulong *pdb;

	if(proc->newtlb){
		mmuptefree(proc);
		proc->newtlb = 0;
	}

	if(proc->mmupdb){
		pdb = (ulong*)proc->mmupdb->va;
		pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)];
		taskswitch(proc->mmupdb->pa, (ulong)(proc->kstack+KSTACK));
	}
	else
		taskswitch(PADDR(m->pdb), (ulong)(proc->kstack+KSTACK));
}

void
mmurelease(Proc* proc)
{
	Page *page, *next;

	/*
	 * Release any pages allocated for a page directory base or page-tables
	 * for this process:
	 *   switch to the prototype pdb for this processor (m->pdb);
	 *   call mmuptefree() to place all pages used for page-tables (proc->mmuused)
	 *   onto the process' free list (proc->mmufree). This has the side-effect of
	 *   cleaning any user entries in the pdb (proc->mmupdb);
	 *   if there's a pdb put it in the cache of pre-initialised pdb's
	 *   for this processor (m->pdbpool) or on the process' free list;
	 *   finally, place any pages freed back into the free pool (palloc).
	 * This routine is only called from sched() with palloc locked.
	 */
	taskswitch(PADDR(m->pdb), (ulong)m + BY2PG);
	mmuptefree(proc);

	if(proc->mmupdb){
		if(m->pdbcnt > 10){
			proc->mmupdb->next = proc->mmufree;
			proc->mmufree = proc->mmupdb;
		}
		else{
			proc->mmupdb->next = m->pdbpool;
			m->pdbpool = proc->mmupdb;
			m->pdbcnt++;
		}
		proc->mmupdb = 0;
	}

	for(page = proc->mmufree; page; page = next){
		next = page->next;
		if(--page->ref)
			panic("mmurelease: page->ref %d\n", page->ref);
		pagechainhead(page);
	}
	if(proc->mmufree && palloc.r.p)
		wakeup(&palloc.r);
	proc->mmufree = 0;
}

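/*
 * Return a page directory for a process: reuse one from this
 * processor's pool of pre-initialised pdbs if possible, otherwise
 * allocate a fresh page and copy the prototype m->pdb into it.
 */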
static Page*
mmupdballoc(void)
{
	int s;
	Page *page;

	s = splhi();
	if(m->pdbpool == 0){
		spllo();
		page = newpage(0, 0, 0);
		page->va = VA(kmap(page));
		memmove((void*)page->va, m->pdb, BY2PG);
	}
	else{
		page = m->pdbpool;
		m->pdbpool = page->next;
		m->pdbcnt--;
	}
	splx(s);
	return page;
}

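/*
 * Sanity check: if the current process already has a mapping for
 * va, complain if it doesn't point at pa.
 */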
void
checkmmu(ulong va, ulong pa)
{
	ulong *pdb, *pte;
	int pdbx;

	if(up->mmupdb == 0)
		return;

	pdb = (ulong*)up->mmupdb->va;
	pdbx = PDX(va);
	if(PPN(pdb[pdbx]) == 0){
		/* okay to be empty - will fault and get filled */
		return;
	}

	pte = KADDR(PPN(pdb[pdbx]));
	if(pte[PTX(va)] == 0)
		return;

	if((pte[PTX(va)]&~4095) != pa)
		print("%ld %s: va=0x%08lux pa=0x%08lux pte=0x%08lux\n",
			up->pid, up->text,
			va, pa, pte[PTX(va)]);
}

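/*
 * Install the user mapping va -> pa for the current process,
 * allocating a pdb and a page-table page if necessary, then
 * reload CR3 to make it visible.
 */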
void
putmmu(ulong va, ulong pa, Page*)
{
	int pdbx;
	Page *page;
	ulong *pdb, *pte;
	int s;

	if(up->mmupdb == 0)
		up->mmupdb = mmupdballoc();
	pdb = (ulong*)up->mmupdb->va;
	pdbx = PDX(va);

	if(PPN(pdb[pdbx]) == 0){
		if(up->mmufree == 0){
			page = newpage(1, 0, 0);
			page->va = VA(kmap(page));
		}
		else {
			page = up->mmufree;
			up->mmufree = page->next;
			memset((void*)page->va, 0, BY2PG);
		}
		pdb[pdbx] = PPN(page->pa)|PTEUSER|PTEWRITE|PTEVALID;
		page->daddr = pdbx;
		page->next = up->mmuused;
		up->mmuused = page;
	}

	pte = KADDR(PPN(pdb[pdbx]));
	pte[PTX(va)] = pa|PTEUSER;

	s = splhi();
	pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)];
	mmuflushtlb(up->mmupdb->pa);
	splx(s);
}

ulong*
mmuwalk(ulong* pdb, ulong va, int level, int create)
{
	ulong pa, *table;

	/*
	 * Walk the page-table pointed to by pdb and return a pointer
	 * to the entry for virtual address va at the requested level.
	 * If the entry is invalid and create isn't requested then bail
	 * out early. Otherwise, for the 2nd level walk, allocate a new
	 * page-table page and register it in the 1st level.
	 */
	table = &pdb[PDX(va)];
	if(!(*table & PTEVALID) && create == 0)
		return 0;

	switch(level){

	default:
		return 0;

	case 1:
		return table;

	case 2:
		if(*table & PTESIZE)
			panic("mmuwalk2: va %luX entry %luX\n", va, *table);
		if(!(*table & PTEVALID)){
			pa = PADDR(xspanalloc(BY2PG, BY2PG, 0));
			*table = pa|PTEWRITE|PTEVALID;
		}
		table = KADDR(PPN(*table));
		return &table[PTX(va)];
	}
}

static Lock mmukmaplock;

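/*
 * Make a kernel mapping created in the boot processor's pdb visible
 * here: copy the page directory entry for va into m->pdb (if not
 * already present) and into the current process's pdb, then flush
 * the TLB. Returns 0 if va isn't mapped in the boot processor's pdb.
 */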
int
mmukmapsync(ulong va)
{
	Mach *mach0;
	ulong entry, *pte;

	mach0 = MACHP(0);

	ilock(&mmukmaplock);

	if((pte = mmuwalk(mach0->pdb, va, 1, 0)) == nil){
		iunlock(&mmukmaplock);
		return 0;
	}
	if(!(*pte & PTESIZE) && mmuwalk(mach0->pdb, va, 2, 0) == nil){
		iunlock(&mmukmaplock);
		return 0;
	}
	entry = *pte;

	if(!(m->pdb[PDX(va)] & PTEVALID))
		m->pdb[PDX(va)] = entry;
	if(up && up->mmupdb){
		((ulong*)up->mmupdb->va)[PDX(va)] = entry;
		mmuflushtlb(up->mmupdb->pa);
	}
	else
		mmuflushtlb(PADDR(m->pdb));

	iunlock(&mmukmaplock);

	return 1;
}

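/*
 * Add a kernel mapping for the physical range [pa, pa+size) to the
 * boot processor's pdb, starting at va (or at KADDR(pa) if va is 0).
 * 4MB pages are used when the processor supports them and alignment
 * allows; entries are marked uncached and, for KZERO addresses,
 * global. Returns the physical address at which mapping stopped.
 */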
ulong
mmukmap(ulong pa, ulong va, int size)
{
	Mach *mach0;
	ulong ova, pae, *table, pgsz, *pte, x;
	int pse, sync;

	mach0 = MACHP(0);
	if((mach0->cpuiddx & 0x08) && (getcr4() & 0x10))
		pse = 1;
	else
		pse = 0;
	sync = 0;

	pa = PPN(pa);
	if(va == 0)
		va = (ulong)KADDR(pa);
	else
		va = PPN(va);
	ova = va;

	pae = pa + size;
	ilock(&mmukmaplock);
	while(pa < pae){
		table = &mach0->pdb[PDX(va)];
		/*
		 * Possibly already mapped.
		 */
		if(*table & PTEVALID){
			if(*table & PTESIZE){
				/*
				 * Big page. Does it fit within?
				 * If it does, adjust pgsz so the correct end can be
				 * returned and get out.
				 * If not, adjust pgsz up to the next 4MB boundary
				 * and continue.
				 */
				x = PPN(*table);
				if(x != pa)
					panic("mmukmap1: pa %luX entry %luX\n",
						pa, *table);
				x += 4*MB;
				if(pae <= x){
					pa = pae;
					break;
				}
				pgsz = x - pa;
				pa += pgsz;
				va += pgsz;
				continue;
			}
			else{
				/*
				 * Little page. Walk to the entry.
				 * If the entry is valid, set pgsz and continue.
				 * If not, make it so, set pgsz, sync and continue.
				 */
				pte = mmuwalk(mach0->pdb, va, 2, 0);
				if(pte && *pte & PTEVALID){
					x = PPN(*pte);
					if(x != pa)
						panic("mmukmap2: pa %luX entry %luX\n",
							pa, *pte);
					pgsz = BY2PG;
					pa += pgsz;
					va += pgsz;
					sync++;
					continue;
				}
			}
		}

		/*
		 * Not mapped. Check if it can be mapped using a big page -
		 * starts on a 4MB boundary, size >= 4MB and processor can do it.
		 * If not a big page, walk the walk, talk the talk.
		 * Sync is set.
		 *
		 * If we're creating a kernel mapping, we know that it will never
		 * expire and thus we can set the PTEGLOBAL bit to make the entry
		 * persist in the TLB across flushes. If we do add support later for
		 * unmapping kernel addresses, see devarch.c for instructions on
		 * how to do a full TLB flush.
		 */
		if(pse && (pa % (4*MB)) == 0 && (pae >= pa+4*MB)){
			*table = pa|PTESIZE|PTEWRITE|PTEUNCACHED|PTEVALID;
			if((va&KZERO) && m->havepge)
				*table |= PTEGLOBAL;
			pgsz = 4*MB;
		}
		else{
			pte = mmuwalk(mach0->pdb, va, 2, 1);
			*pte = pa|PTEWRITE|PTEUNCACHED|PTEVALID;
			if((va&KZERO) && m->havepge)
				*pte |= PTEGLOBAL;
			pgsz = BY2PG;
		}
		pa += pgsz;
		va += pgsz;
		sync++;
	}
	iunlock(&mmukmaplock);

	/*
	 * If something was added
	 * then need to sync up.
	 */
	if(sync)
		mmukmapsync(ova);

	return pa;
}