#include <lunaix/mm/procvm.h>
#include <lunaix/mm/valloc.h>
#include <lunaix/mm/region.h>
#include <lunaix/mm/page.h>
#include <lunaix/mm/mmap.h>
#include <lunaix/process.h>
#include <lunaix/syslog.h>

#include <asm/mm_defs.h>

#include <klibc/string.h>

/*
 * Wrappers around page table allocation and release; the extra `ord` and
 * `level` arguments describe the allocation site, presumably for tracing.
 */
#define alloc_pagetable_trace(ptep, pte, ord, level)        \
    ({                                                      \
        alloc_kpage_at(ptep, pte, ord);                     \
    })

#define free_pagetable_trace(ptep, pte, level)              \
    ({                                                      \
        struct leaflet* leaflet = pte_leaflet_aligned(pte); \
        assert(leaflet_order(leaflet) == 0);                \
        leaflet_return(leaflet);                            \
        set_pte(ptep, null_pte);                            \
    })

struct proc_mm*
procvm_create(struct proc_info* proc) {
    struct proc_mm* mm = vzalloc(sizeof(struct proc_mm));

    llist_init_head(&mm->regions);

    mm->proc = proc;

    return mm;
}
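
/*
 * How many extra ptep slots to skip once a mapping backed by a
 * higher-order leaflet has been handled, so the walk does not visit the
 * sibling entries covered by the same leaflet again.
 */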
static inline unsigned int
__ptep_advancement(struct leaflet* leaflet, int level)
{
    size_t shifts = MAX(MAX_LEVEL - level - 1, 1) * LEVEL_SHIFT;
    return (1 << (leaflet_order(leaflet) % shifts)) - 1;
}
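
/*
 * Walk the source and destination hierarchies in lockstep from the top
 * level towards the leaf for `va`, stopping early at unmapped or huge
 * entries. When `alloc` is set, missing page tables are allocated on the
 * destination side along the way. Returns the level where the walk stopped.
 */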
static int
__descend(ptr_t dest_mnt, ptr_t src_mnt, ptr_t va, bool alloc)
{
    pte_t *dest, *src, pte;
    int i = 0;

    while (!pt_last_level(i))
    {
        dest = mklntep_va(i, dest_mnt, va);
        src = mklntep_va(i, src_mnt, va);

        pte = pte_at(src);
        if (!pte_isloaded(pte) || pte_huge(pte)) {
            break;
        }

        if (alloc && pte_isnull(pte_at(dest))) {
            alloc_pagetable_trace(dest, pte, 0, i);
        }

        i++;
    }

    return i;
}
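
/*
 * Free the page-table hierarchy covering `va`, deepest level first: a
 * table is released only when every entry in it has become null.
 */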
static void
__free_hierarchy(ptr_t mnt, ptr_t va, int level)
{
    pte_t pte, *ptep, *ptep_next;

    if (pt_last_level(level)) {
        return;
    }

    __free_hierarchy(mnt, va, level + 1);

    ptep = mklntep_va(level, mnt, va);
    pte = pte_at(ptep);

    if (pte_isnull(pte)) {
        return;
    }

    ptep_next = ptep_step_into(ptep);
    for (unsigned i = 0; i < LEVEL_SIZE; i++, ptep_next++)
    {
        if (!pte_isnull(pte_at(ptep_next))) {
            return;
        }
    }

    free_pagetable_trace(ptep, pte, level);
}
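
/*
 * Duplicate a terminal mapping (regular or huge page) into the destination
 * table. The backing leaflet is borrowed once, when the pte refers to the
 * leaflet's first page.
 */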
static void
copy_leaf(pte_t* dest, pte_t* src, pte_t pte, int level)
{
    struct leaflet* leaflet;

    set_pte(dest, pte);

    if (!pte_isloaded(pte)) {
        return;
    }

    leaflet = pte_leaflet(pte);
    assert(leaflet_refcount(leaflet));

    if (leaflet_ppfn(leaflet) == pte_ppfn(pte)) {
        leaflet_borrow(leaflet);
    }
}
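
/*
 * Duplicate a non-terminal entry by giving the destination a fresh page
 * table to descend into.
 */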
static void
copy_root(pte_t* dest, pte_t* src, pte_t pte, int level)
{
    alloc_pagetable_trace(dest, pte, 0, level);
}
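
/*
 * Copy all mappings of a region from the vms mounted at src_mnt into the
 * one mounted at dest_mnt, allocating destination page tables on demand.
 */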
static void
vmrcpy(ptr_t dest_mnt, ptr_t src_mnt, struct mm_region* region)
{
    pte_t *dest, *src, pte;
    ptr_t loc = region->start;
    struct leaflet* leaflet;
    int level;

    src = mkptep_va(src_mnt, loc);
    dest = mkptep_va(dest_mnt, loc);

    level = __descend(dest_mnt, src_mnt, loc, true);

    while (loc < region->end)
    {
        pte = pte_at(src);

        if (pte_isnull(pte)) {
            // nothing mapped here; skip the whole range this entry covers
        }
        else if (pt_last_level(level) || pte_huge(pte)) {
            copy_leaf(dest, src, pte, level);
        }
        else if (!pt_last_level(level)) {
            copy_root(dest, src, pte, level);

            src = ptep_step_into(src);
            dest = ptep_step_into(dest);
            level++;

            continue;
        }

        loc += lnt_page_size(level);
        while (ptep_vfn(src) == MAX_PTEN - 1) {
            level--;
            src = ptep_step_out(src);
            dest = ptep_step_out(dest);
        }

        src++;
        dest++;
    }
}
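
/*
 * Tear down the page-table hierarchy behind a region, after its mappings
 * have been released by vmrfree.
 */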
static void
vmrfree_hierachy(ptr_t vm_mnt, struct mm_region* region)
{
    __free_hierarchy(vm_mnt, region->start, 0);
}
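
/*
 * Release every mapping within a region: return the backing leaflets and
 * free leaf page tables that are walked to completion on the way.
 */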
static void
vmrfree(ptr_t vm_mnt, struct mm_region* region)
{
    pte_t *src, *end, pte;
    ptr_t loc = region->start;
    struct leaflet* leaflet;
    int level;

    src = mkptep_va(vm_mnt, region->start);
    end = mkptep_va(vm_mnt, region->end);

    level = __descend(vm_mnt, vm_mnt, loc, false);

    while (src < end)
    {
        pte = pte_at(src);
        ptr_t pa = pte_paddr(pte);

        if (pte_isnull(pte)) {
            // already unmapped; just advance
        }
        else if (!pt_last_level(level) && !pte_huge(pte)) {
            src = ptep_step_into(src);
            level++;

            continue;
        }
        else {
            set_pte(src, null_pte);

            if (pte_isloaded(pte)) {
                leaflet = pte_leaflet_aligned(pte);
                leaflet_return(leaflet);

                src += __ptep_advancement(leaflet, level);
            }
        }

        while (ptep_vfn(src) == MAX_PTEN - 1) {
            src = ptep_step_out(src);
            free_pagetable_trace(src, pte_at(src), level);
            level--;
        }

        src++;
    }
}
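
/*
 * Clone a virtual memory space: build the recursive self-reference for the
 * new root mounted at dest_mm->vm_mnt, copy every region of src_mm into
 * it, then link the kernel portion.
 */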
static void
vmscpy(struct proc_mm* dest_mm, struct proc_mm* src_mm)
{
    // Build the self-reference on dest vms

    /*
     * -- What the heck are ptep_ssm and ptep_sms ? --
     *
     * ptep_dest points to the page table itself, which is mounted
     * at dest_mnt (or simply mnt):
     *      mnt -> self -> self -> self -> L0TE@offset
     *
     * ptep_sms shallows the recursion chain:
     *      self -> mnt -> self -> self -> L0TE@self
     *
     * ptep_ssm shallows the recursion chain:
     *      self -> self -> mnt -> self -> L0TE@self
     *
     * Now, here is the problem: back on x86_32, the translation is
     * a depth-3 recursion:
     *      L0T -> LFT -> Page
     *
     * So ptep_ssm will terminate at mnt and give us a leaf
     * slot for allocating a fresh page table for mnt:
     *      self -> self -> L0TE@mnt
     *
     * But in x86_64, the translation has two extra steps:
     *      L0T -> L1T -> L2T -> LFT -> Page
     *
     * So we must continue pushing down....
     * ptep_sssms shallows the recursion chain:
     *      self -> self -> self -> mnt -> L0TE@self
     *
     * ptep_ssssm shallows the recursion chain:
     *      self -> self -> self -> self -> L0TE@mnt
     *
     * Note: PML4: 2 extra steps
     *       PML5: 3 extra steps
     */

    ptr_t dest_mnt, src_mnt;

    dest_mnt = dest_mm->vm_mnt;

    pte_t* ptep_ssm = mkl0tep_va(VMS_SELF, dest_mnt);
    pte_t* ptep_smx = mkl1tep_va(VMS_SELF, dest_mnt);
    pte_t pte_sms = mkpte_prot(KERNEL_PGTAB);

    pte_sms = alloc_pagetable_trace(ptep_ssm, pte_sms, 0, 0);
    set_pte(&ptep_smx[VMS_SELF_L0TI], pte_sms);

    tlb_flush_kernel((ptr_t)dest_mnt);

    src_mnt = src_mm->vm_mnt;

    struct mm_region *pos, *n;
    llist_for_each(pos, n, &src_mm->regions, head)
    {
        vmrcpy(dest_mnt, src_mnt, pos);
    }

    procvm_link_kernel(dest_mnt);

    dest_mm->vmroot = pte_paddr(pte_sms);
}
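
/*
 * Destroy a mounted virtual memory space: release all region mappings,
 * then their page-table hierarchies, unlink the kernel portion, and
 * finally return the root table itself.
 */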
static void
vmsfree(struct proc_mm* mm)
{
    struct leaflet* leaflet;
    struct mm_region *pos, *n;

    pte_t* ptep_self;
    ptr_t vm_mnt = mm->vm_mnt;

    ptep_self = mkl0tep(mkptep_va(vm_mnt, VMS_SELF));

    // first pass: free region mappings
    llist_for_each(pos, n, &mm->regions, head)
    {
        vmrfree(vm_mnt, pos);
    }

    // second pass: free the hierarchical page tables behind them
    llist_for_each(pos, n, &mm->regions, head)
    {
        vmrfree_hierachy(vm_mnt, pos);
    }

    procvm_unlink_kernel();

    free_pagetable_trace(ptep_self, pte_at(ptep_self), 0);
}
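
/*
 * Book-keeping for mounting a foreign ("guest") vms into the current
 * address space; at most one guest can be attached at a time.
 */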
static inline void
__attach_to_current_vms(struct proc_mm* guest_mm)
{
    struct proc_mm* mm_current = vmspace(__current);

    assert(!mm_current->guest_mm);
    mm_current->guest_mm = guest_mm;
}

static inline void
__detach_from_current_vms(struct proc_mm* guest_mm)
{
    struct proc_mm* mm_current = vmspace(__current);

    assert(mm_current->guest_mm == guest_mm);
    mm_current->guest_mm = NULL;
}

void
procvm_prune_vmr(ptr_t vm_mnt, struct mm_region* region)
{
    vmrfree(vm_mnt, region);
    vmrfree_hierachy(vm_mnt, region);
}
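
/*
 * Duplicate the current process's vms into `mm` and leave the copy mounted
 * at VMS_MOUNT_1 for further manipulation by the caller.
 */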
void
procvm_dupvms_mount(struct proc_mm* mm) {
    struct proc_mm* mm_current = vmspace(__current);

    __attach_to_current_vms(mm);

    mm->heap = mm_current->heap;
    mm->vm_mnt = VMS_MOUNT_1;

    vmscpy(mm, mm_current);
    region_copy_mm(mm_current, mm);
}
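
/*
 * Mount another process's vms at VMS_MOUNT_1 of the current address space,
 * unless it is already the active one.
 */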
void
procvm_mount(struct proc_mm* mm)
{
    // if current mm is already active
    if (active_vms(mm->vm_mnt)) {
        return;
    }

    // we are double mounting
    vms_mount(VMS_MOUNT_1, mm->vmroot);

    __attach_to_current_vms(mm);

    mm->vm_mnt = VMS_MOUNT_1;
}

void
procvm_unmount(struct proc_mm* mm)
{
    if (active_vms(mm->vm_mnt)) {
        return;
    }

    vms_unmount(VMS_MOUNT_1);

    struct proc_mm* mm_current = vmspace(__current);

    mm_current->guest_mm = NULL;
    mm->vm_mnt = 0;
}

void
procvm_initvms_mount(struct proc_mm* mm)
{
    __attach_to_current_vms(mm);

    mm->vm_mnt = VMS_MOUNT_1;
}
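
/*
 * Synchronize, release and detach a guest vms that was previously mounted
 * into the current address space.
 */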
void
procvm_unmount_release(struct proc_mm* mm) {
    ptr_t vm_mnt = mm->vm_mnt;
    struct mm_region *pos, *n;

    llist_for_each(pos, n, &mm->regions, head)
    {
        mem_sync_pages(vm_mnt, pos, pos->start, pos->end - pos->start, 0);
    }

    llist_for_each(pos, n, &mm->regions, head)
    {
        procvm_prune_vmr(vm_mnt, pos);
    }

    __detach_from_current_vms(mm);
}

void
procvm_mount_self(struct proc_mm* mm)
{
    mm->vm_mnt = VMS_SELF;
}

void
procvm_unmount_self(struct proc_mm* mm)
{
    assert(active_vms(mm->vm_mnt));

    mm->vm_mnt = 0;
}
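
/*
 * Remote access: map a window of another (mounted) process's address space
 * at PG_MOUNT_VAR so the kernel can copy data into that process on its
 * behalf, allocating fresh pages for parts that are not yet populated.
 */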
int
procvm_enter_remote(struct remote_vmctx* rvmctx, struct proc_mm* mm,
                    ptr_t remote_base, size_t size)
{
    ptr_t vm_mnt = mm->vm_mnt;

    pfn_t size_pn = pfn(size + PAGE_SIZE);
    assert(size_pn < REMOTEVM_MAX_PAGES);

    struct mm_region* region = region_get(&mm->regions, remote_base);
    assert(region && region_contains(region, remote_base + size));

    rvmctx->vms_mnt = vm_mnt;
    rvmctx->page_cnt = size_pn;

    remote_base = page_aligned(remote_base);
    rvmctx->remote = remote_base;
    rvmctx->local_mnt = PG_MOUNT_VAR;

    pte_t* rptep = mkptep_va(vm_mnt, remote_base);
    pte_t* lptep = mkptep_va(VMS_SELF, rvmctx->local_mnt);

    pte_t pte, rpte = null_pte;
    rpte = region_tweakpte(region, rpte);

    for (size_t i = 0; i < size_pn; i++)
    {
        pte = vmm_tryptep(rptep, PAGE_SIZE);
        if (pte_isloaded(pte)) {
            // already populated in the remote vms: map the same page locally
            set_pte(lptep, mkpte(pte_paddr(pte), KERNEL_DATA));
        }
        else {
            // not yet populated: back it with a fresh page on both sides
            ptr_t pa = ppage_addr(pmm_alloc_normal(0));
            set_pte(lptep, mkpte(pa, KERNEL_DATA));
            set_pte(rptep, pte_setpaddr(rpte, pa));
        }

        lptep++;
        rptep++;
    }

    return 0;
}
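
/*
 * Copy `sz` bytes from a kernel-local buffer into the remote window set up
 * by procvm_enter_remote, bounds-checking against the window first.
 */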
int
procvm_copy_remote_transaction(struct remote_vmctx* rvmctx,
                               ptr_t remote_dest, void* local_src, size_t sz)
{
    if (remote_dest < rvmctx->remote) {
        return -1;
    }

    ptr_t offset = remote_dest - rvmctx->remote;
    if (pfn(offset + sz) >= rvmctx->page_cnt) {
        return -1;
    }

    memcpy((void*)(rvmctx->local_mnt + offset), local_src, sz);

    return sz;
}

void
procvm_exit_remote(struct remote_vmctx* rvmctx)
{
    pte_t* lptep = mkptep_va(VMS_SELF, rvmctx->local_mnt);
    vmm_unset_ptes(lptep, rvmctx->page_cnt);
}