/* mmu.c */

#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"

#define DATASEGM(p)	{ 0xFFFF, SEGG|SEGB|(0xF<<16)|SEGP|SEGPL(p)|SEGDATA|SEGW }
#define EXECSEGM(p)	{ 0xFFFF, SEGG|SEGD|(0xF<<16)|SEGP|SEGPL(p)|SEGEXEC|SEGR }
#define TSSSEGM(b,p)	{ ((b)<<16)|sizeof(Tss),\
			  ((b)&0xFF000000)|(((b)>>16)&0xFF)|SEGTSS|SEGPL(p)|SEGP }

Segdesc gdt[NGDT] =
{
[NULLSEG]	{ 0, 0},	/* null descriptor */
[KDSEG]		DATASEGM(0),	/* kernel data/stack */
[KESEG]		EXECSEGM(0),	/* kernel code */
[UDSEG]		DATASEGM(3),	/* user data/stack */
[UESEG]		EXECSEGM(3),	/* user code */
[TSSSEG]	TSSSEGM(0,0),	/* tss segment */
};

/*
 * Record the kernel stack for traps/interrupts in the TSS and
 * switch to the given page directory (load CR3).
 */
static void
taskswitch(ulong pdb, ulong stack)
{
	Tss *tss;

	tss = m->tss;
	tss->ss0 = KDSEL;
	tss->esp0 = stack;
	tss->ss1 = KDSEL;
	tss->esp1 = stack;
	tss->ss2 = KDSEL;
	tss->esp2 = stack;
	tss->cr3 = pdb;
	putcr3(pdb);
}

/*
 * On processors that support it, we set the PTEGLOBAL bit in
 * page table and page directory entries that map kernel memory.
 * Doing this tells the processor not to bother flushing them
 * from the TLB when doing the TLB flush associated with a
 * context switch (write to CR3). Since kernel memory mappings
 * are never removed, this is safe. (If we ever remove kernel memory
 * mappings, we can do a full flush by turning off the PGE bit in CR4,
 * writing to CR3, and then turning the PGE bit back on.)
 *
 * See also mmukmap below.
 *
 * Processor support for the PTEGLOBAL bit is enabled in devarch.c.
 */
static void
memglobal(void)
{
	int i, j;
	ulong *pde, *pte;

	/* only need to do this once, on bootstrap processor */
	if(m->machno != 0)
		return;

	if(!m->havepge)
		return;

	pde = m->pdb;
	for(i=512; i<1024; i++){	/* 512: start at entry for virtual 0x80000000 */
		if(pde[i] & PTEVALID){
			pde[i] |= PTEGLOBAL;
			if(!(pde[i] & PTESIZE)){
				pte = KADDR(pde[i]&~(BY2PG-1));
				for(j=0; j<1024; j++)
					if(pte[j] & PTEVALID)
						pte[j] |= PTEGLOBAL;
			}
		}
	}
}

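/*
 * Illustrative sketch only, not part of the original source: the full
 * TLB flush described in the comment above could look roughly like this.
 * flushglobaltlb is a hypothetical helper; it assumes getcr4/putcr4 and
 * getcr3/putcr3 accessors like those used elsewhere in this kernel, and
 * that bit 7 of CR4 is PGE.
 */
static void
flushglobaltlb(void)
{
	ulong cr4;

	cr4 = getcr4();
	putcr4(cr4 & ~0x80);	/* clear CR4.PGE: global entries become flushable */
	putcr3(getcr3());	/* reload CR3: flush the entire TLB */
	putcr4(cr4);		/* restore CR4.PGE */
}
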
void
mmuinit(void)
{
	ulong x, *p;
	ushort ptr[3];

	memglobal();

	m->tss = malloc(sizeof(Tss));
	memset(m->tss, 0, sizeof(Tss));

	/*
	 * We used to keep the GDT in the Mach structure, but it
	 * turns out that that slows down access to the rest of the
	 * page. Since the Mach structure is accessed quite often,
	 * it pays off anywhere from a factor of 1.25 to 2 on real
	 * hardware to separate them (the AMDs are more sensitive
	 * than Intels in this regard). Under VMware it pays off
	 * a factor of about 10 to 100.
	 */
	memmove(m->gdt, gdt, sizeof gdt);
	x = (ulong)m->tss;
	m->gdt[TSSSEG].d0 = (x<<16)|sizeof(Tss);
	m->gdt[TSSSEG].d1 = (x&0xFF000000)|((x>>16)&0xFF)|SEGTSS|SEGPL(0)|SEGP;

	ptr[0] = sizeof(gdt)-1;
	x = (ulong)m->gdt;
	ptr[1] = x & 0xFFFF;
	ptr[2] = (x>>16) & 0xFFFF;
	lgdt(ptr);

	ptr[0] = sizeof(Segdesc)*256-1;
	x = IDTADDR;
	ptr[1] = x & 0xFFFF;
	ptr[2] = (x>>16) & 0xFFFF;
	lidt(ptr);

	/* make kernel text unwritable */
	for(x = KTZERO; x < (ulong)etext; x += BY2PG){
		p = mmuwalk(m->pdb, x, 2, 0);
		if(p == nil)
			panic("mmuinit");
		*p &= ~PTEWRITE;
	}

	taskswitch(PADDR(m->pdb), (ulong)m + BY2PG);
	ltr(TSSSEL);
}

/*
 * Mark the current process's user mappings stale and reload them.
 */
void
flushmmu(void)
{
	int s;

	s = splhi();
	up->newtlb = 1;
	mmuswitch(up);
	splx(s);
}

/*
 * Detach all page-table pages the process has in use: zero their
 * entries in the process's pdb and move the pages from the mmuused
 * list to the mmufree list.
 */
static void
mmuptefree(Proc* proc)
{
	ulong *pdb;
	Page **last, *page;

	if(proc->mmupdb && proc->mmuused){
		pdb = (ulong*)proc->mmupdb->va;
		last = &proc->mmuused;
		for(page = *last; page; page = page->next){
			pdb[page->daddr] = 0;
			last = &page->next;
		}
		*last = proc->mmufree;
		proc->mmufree = proc->mmuused;
		proc->mmuused = 0;
	}
}

/*
 * Switch to the process's address space, or to the prototype
 * pdb for this processor if the process has none of its own.
 */
void
mmuswitch(Proc* proc)
{
	ulong *pdb;

	if(proc->newtlb){
		mmuptefree(proc);
		proc->newtlb = 0;
	}

	if(proc->mmupdb){
		pdb = (ulong*)proc->mmupdb->va;
		pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)];
		taskswitch(proc->mmupdb->pa, (ulong)(proc->kstack+KSTACK));
	}
	else
		taskswitch(PADDR(m->pdb), (ulong)(proc->kstack+KSTACK));
}

void
mmurelease(Proc* proc)
{
	Page *page, *next;

	/*
	 * Release any pages allocated for a page directory base or page-tables
	 * for this process:
	 *	switch to the prototype pdb for this processor (m->pdb);
	 *	call mmuptefree() to place all pages used for page-tables (proc->mmuused)
	 *	onto the process' free list (proc->mmufree). This has the side-effect of
	 *	cleaning any user entries in the pdb (proc->mmupdb);
	 *	if there's a pdb put it in the cache of pre-initialised pdb's
	 *	for this processor (m->pdbpool) or on the process' free list;
	 *	finally, place any pages freed back into the free pool (palloc).
	 * This routine is only called from sched() with palloc locked.
	 */
	taskswitch(PADDR(m->pdb), (ulong)m + BY2PG);
	mmuptefree(proc);

	if(proc->mmupdb){
		if(m->pdbcnt > 10){
			proc->mmupdb->next = proc->mmufree;
			proc->mmufree = proc->mmupdb;
		}
		else{
			proc->mmupdb->next = m->pdbpool;
			m->pdbpool = proc->mmupdb;
			m->pdbcnt++;
		}
		proc->mmupdb = 0;
	}

	for(page = proc->mmufree; page; page = next){
		next = page->next;
		if(--page->ref)
			panic("mmurelease: page->ref %d\n", page->ref);
		pagechainhead(page);
	}
	if(proc->mmufree && palloc.r.p)
		wakeup(&palloc.r);
	proc->mmufree = 0;
}

/*
 * Allocate a page directory for a process: take one from this
 * processor's pool of pre-initialised pdbs if possible, otherwise
 * make a fresh copy of the prototype pdb.
 */
static Page*
mmupdballoc(void)
{
	int s;
	Page *page;

	s = splhi();
	if(m->pdbpool == 0){
		spllo();
		page = newpage(0, 0, 0);
		page->va = VA(kmap(page));
		memmove((void*)page->va, m->pdb, BY2PG);
	}
	else{
		page = m->pdbpool;
		m->pdbpool = page->next;
		m->pdbcnt--;
	}
	splx(s);
	return page;
}

/*
 * Install a user mapping from va to pa in the current process's
 * page tables, allocating a pdb and a page-table page if needed.
 */
void
putmmu(ulong va, ulong pa, Page*)
{
	int pdbx;
	Page *page;
	ulong *pdb, *pte;
	int s;

	if(up->mmupdb == 0)
		up->mmupdb = mmupdballoc();
	pdb = (ulong*)up->mmupdb->va;
	pdbx = PDX(va);

	if(PPN(pdb[pdbx]) == 0){
		if(up->mmufree == 0){
			page = newpage(1, 0, 0);
			page->va = VA(kmap(page));
		}
		else {
			page = up->mmufree;
			up->mmufree = page->next;
			memset((void*)page->va, 0, BY2PG);
		}
		pdb[pdbx] = PPN(page->pa)|PTEUSER|PTEWRITE|PTEVALID;
		page->daddr = pdbx;
		page->next = up->mmuused;
		up->mmuused = page;
	}

	pte = KADDR(PPN(pdb[pdbx]));
	pte[PTX(va)] = pa|PTEUSER;

	s = splhi();
	pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)];
	mmuflushtlb(up->mmupdb->pa);
	splx(s);
}

ulong*
mmuwalk(ulong* pdb, ulong va, int level, int create)
{
	ulong pa, *table;

	/*
	 * Walk the page-table pointed to by pdb and return a pointer
	 * to the entry for virtual address va at the requested level.
	 * If the entry is invalid and create isn't requested then bail
	 * out early. Otherwise, for the 2nd level walk, allocate a new
	 * page-table page and register it in the 1st level.
	 */
	table = &pdb[PDX(va)];
	if(!(*table & PTEVALID) && create == 0)
		return 0;

	switch(level){
	default:
		return 0;

	case 1:
		return table;

	case 2:
		if(*table & PTESIZE)
			panic("mmuwalk2: va %luX entry %luX\n", va, *table);
		if(!(*table & PTEVALID)){
			pa = PADDR(xspanalloc(BY2PG, BY2PG, 0));
			*table = pa|PTEWRITE|PTEVALID;
		}
		table = KADDR(PPN(*table));
		return &table[PTX(va)];
	}
}

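/*
 * Illustrative usage sketch only, not part of the original source:
 * translating a kernel virtual address to a physical address with
 * mmuwalk(). va2pa is a hypothetical helper and assumes the address
 * is mapped with a 4KB page, i.e. has a valid level-2 entry in the
 * boot processor's pdb.
 */
static ulong
va2pa(ulong va)
{
	ulong *pte;

	pte = mmuwalk(MACHP(0)->pdb, va, 2, 0);
	if(pte == nil || !(*pte & PTEVALID))
		return 0;
	return PPN(*pte) | (va & (BY2PG-1));
}
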
static Lock mmukmaplock;

/*
 * Propagate a kernel mapping already present in the boot processor's
 * pdb into this processor's pdb and, if there is one, the current
 * process's pdb.
 */
int
mmukmapsync(ulong va)
{
	Mach *mach0;
	ulong entry, *pte;

	mach0 = MACHP(0);

	lock(&mmukmaplock);

	if((pte = mmuwalk(mach0->pdb, va, 1, 0)) == nil){
		unlock(&mmukmaplock);
		return 0;
	}
	if(!(*pte & PTESIZE) && mmuwalk(mach0->pdb, va, 2, 0) == nil){
		unlock(&mmukmaplock);
		return 0;
	}
	entry = *pte;

	if(!(m->pdb[PDX(va)] & PTEVALID))
		m->pdb[PDX(va)] = entry;
	if(up && up->mmupdb){
		((ulong*)up->mmupdb->va)[PDX(va)] = entry;
		mmuflushtlb(up->mmupdb->pa);
	}
	else
		mmuflushtlb(PADDR(m->pdb));

	unlock(&mmukmaplock);

	return 1;
}

/*
 * Map physical address pa at kernel virtual address va for size bytes
 * in the boot processor's pdb, using 4MB pages where the processor
 * allows it. Returns the physical address at which mapping stopped.
 */
ulong
mmukmap(ulong pa, ulong va, int size)
{
	Mach *mach0;
	ulong ova, pae, *table, pgsz, *pte, x;
	int pse, sync;

	mach0 = MACHP(0);
	if((mach0->cpuiddx & 0x08) && (getcr4() & 0x10))
		pse = 1;
	else
		pse = 0;
	sync = 0;

	pa = PPN(pa);
	if(va == 0)
		va = (ulong)KADDR(pa);
	else
		va = PPN(va);
	ova = va;

	pae = pa + size;
	lock(&mmukmaplock);
	while(pa < pae){
		table = &mach0->pdb[PDX(va)];
		/*
		 * Possibly already mapped.
		 */
		if(*table & PTEVALID){
			if(*table & PTESIZE){
				/*
				 * Big page. Does it fit within?
				 * If it does, adjust pgsz so the correct end can be
				 * returned and get out.
				 * If not, adjust pgsz up to the next 4MB boundary
				 * and continue.
				 */
				x = PPN(*table);
				if(x != pa)
					panic("mmukmap1: pa %luX entry %luX\n",
						pa, *table);
				x += 4*MB;
				if(pae <= x){
					pa = pae;
					break;
				}
				pgsz = x - pa;
				pa += pgsz;
				va += pgsz;
				continue;
			}
			else{
				/*
				 * Little page. Walk to the entry.
				 * If the entry is valid, set pgsz and continue.
				 * If not, make it so, set pgsz, sync and continue.
				 */
				pte = mmuwalk(mach0->pdb, va, 2, 0);
				if(pte && *pte & PTEVALID){
					x = PPN(*pte);
					if(x != pa)
						panic("mmukmap2: pa %luX entry %luX\n",
							pa, *pte);
					pgsz = BY2PG;
					pa += pgsz;
					va += pgsz;
					sync++;
					continue;
				}
			}
		}

		/*
		 * Not mapped. Check if it can be mapped using a big page -
		 * starts on a 4MB boundary, size >= 4MB and processor can do it.
		 * If not a big page, walk the walk, talk the talk.
		 * Sync is set.
		 *
		 * If we're creating a kernel mapping, we know that it will never
		 * expire and thus we can set the PTEGLOBAL bit to make the entry
		 * persist in the TLB across flushes. If we do add support later for
		 * unmapping kernel addresses, see devarch.c for instructions on
		 * how to do a full TLB flush.
		 */
		if(pse && (pa % (4*MB)) == 0 && (pae >= pa+4*MB)){
			*table = pa|PTESIZE|PTEWRITE|PTEUNCACHED|PTEVALID;
			if((va&KZERO) && m->havepge)
				*table |= PTEGLOBAL;
			pgsz = 4*MB;
		}
		else{
			pte = mmuwalk(mach0->pdb, va, 2, 1);
			*pte = pa|PTEWRITE|PTEUNCACHED|PTEVALID;
			if((va&KZERO) && m->havepge)
				*pte |= PTEGLOBAL;
			pgsz = BY2PG;
		}
		pa += pgsz;
		va += pgsz;
		sync++;
	}
	unlock(&mmukmaplock);

	/*
	 * If something was added
	 * then need to sync up.
	 */
	if(sync)
		mmukmapsync(ova);

	return pa;
}