X-Git-Url: https://scm.lunaixsky.com/lunaix-os.git/blobdiff_plain/11b423249f224e3c9b7d940862c3cae310f81a23..669e8fc63dd1aa7fe4c830b1d40371a1ab47fc4f:/lunaix-os/kernel/mm/procvm.c diff --git a/lunaix-os/kernel/mm/procvm.c b/lunaix-os/kernel/mm/procvm.c index 7ba6f53..8081971 100644 --- a/lunaix-os/kernel/mm/procvm.c +++ b/lunaix-os/kernel/mm/procvm.c @@ -1,15 +1,28 @@ #include #include #include -#include -#include +#include #include #include +#include -#include +#include #include +#define alloc_pagetable_trace(ptep, pte, ord, level) \ + ({ \ + alloc_kpage_at(ptep, pte, ord); \ + }) + +#define free_pagetable_trace(ptep, pte, level) \ + ({ \ + struct leaflet* leaflet = pte_leaflet_aligned(pte); \ + assert(leaflet_order(leaflet) == 0); \ + leaflet_return(leaflet); \ + set_pte(ptep, null_pte); \ + }) + struct proc_mm* procvm_create(struct proc_info* proc) { struct proc_mm* mm = vzalloc(sizeof(struct proc_mm)); @@ -23,100 +36,132 @@ procvm_create(struct proc_info* proc) { return mm; } -static ptr_t -vmscpy(ptr_t dest_mnt, ptr_t src_mnt, bool only_kernel) +static inline unsigned int +__ptep_advancement(struct leaflet* leaflet, int level) { - pte_t* ptep_dest = mkl0tep(mkptep_va(dest_mnt, 0)); - pte_t* ptep = mkl0tep(mkptep_va(src_mnt, 0)); - pte_t* ptepd_kernel = mkl0tep(mkptep_va(dest_mnt, KERNEL_RESIDENT)); - pte_t* ptep_kernel = mkl0tep(mkptep_va(src_mnt, KERNEL_RESIDENT)); + size_t shifts = MAX(MAX_LEVEL - level - 1, 1) * LEVEL_SHIFT; + return (1 << (leaflet_order(leaflet) % shifts)) - 1; +} - // Build the self-reference on dest vms - pte_t* ptep_sms = mkptep_va(VMS_SELF, (ptr_t)ptep_dest); - pte_t* ptep_ssm = mkptep_va(VMS_SELF, (ptr_t)ptep_sms); - pte_t pte_sms = mkpte_prot(KERNEL_DATA); +static inline int +__descend(ptr_t dest_mnt, ptr_t src_mnt, ptr_t va, bool alloc) +{ + pte_t *dest, *src, pte; + + int i = 0; + while (!pt_last_level(i)) + { + dest = mklntep_va(i, dest_mnt, va); + src = mklntep_va(i, src_mnt, va); + pte = pte_at(src); + + if (!pte_isloaded(pte) || pte_huge(pte)) { + break; + } + + if (alloc && pte_isnull(pte_at(dest))) { + alloc_pagetable_trace(dest, pte, 0, i); + } + + i++; + } + + return i; +} - pte_sms = vmm_alloc_page(ptep_ssm, pte_sms); - set_pte(ptep_sms, pte_sms); +static inline void +copy_leaf(pte_t* dest, pte_t* src, pte_t pte, int level) +{ + struct leaflet* leaflet; + + set_pte(dest, pte); + + if (!pte_isloaded(pte)) { + return; + } + + leaflet = pte_leaflet(pte); + assert(leaflet_refcount(leaflet)); - cpu_flush_page((ptr_t)dest_mnt); - - if (only_kernel) { - ptep = ptep_kernel; - ptep_dest += ptep_vfn(ptep_kernel); - } else { - ptep++; - ptep_dest++; + if (leaflet_ppfn(leaflet) == pte_ppfn(pte)) { + leaflet_borrow(leaflet); } +} + +static inline void +copy_root(pte_t* dest, pte_t* src, pte_t pte, int level) +{ + alloc_pagetable_trace(dest, pte, 0, level); +} + +static void +vmrcpy(ptr_t dest_mnt, ptr_t src_mnt, struct mm_region* region) +{ + pte_t *src, *dest; + ptr_t loc; + int level; + struct leaflet* leaflet; - int level = 0; - while (ptep < ptep_kernel) + loc = region->start; + src = mkptep_va(src_mnt, loc); + dest = mkptep_va(dest_mnt, loc); + + level = __descend(dest_mnt, src_mnt, loc, true); + + while (loc < region->end) { - pte_t pte = *ptep; - ptr_t pa = pte_paddr(pte); + pte_t pte = *src; if (pte_isnull(pte)) { goto cont; } if (pt_last_level(level) || pte_huge(pte)) { - set_pte(ptep_dest, pte); - - if (pte_isloaded(pte)) - pmm_ref_page(pa); + copy_leaf(dest, src, pte, level); + goto cont; } - else if (!pt_last_level(level)) { - 
vmm_alloc_page(ptep_dest, pte); + + if (!pt_last_level(level)) { + copy_root(dest, src, pte, level); - ptep = ptep_step_into(ptep); - ptep_dest = ptep_step_into(ptep_dest); + src = ptep_step_into(src); + dest = ptep_step_into(dest); level++; continue; } cont: - if (ptep_vfn(ptep) == MAX_PTEN - 1) { + loc += lnt_page_size(level); + while (ptep_vfn(src) == MAX_PTEN - 1) { assert(level > 0); - ptep = ptep_step_out(ptep); - ptep_dest = ptep_step_out(ptep_dest); + src = ptep_step_out(src); + dest = ptep_step_out(dest); level--; } - ptep++; - ptep_dest++; - } - - // Ensure we step back to L0T - assert(!level); - assert(ptep_dest == ptepd_kernel); - - // Carry over the kernel (exclude last two entry) - while (ptep_vfn(ptep) < MAX_PTEN - 2) { - pte_t pte = *ptep; - assert(!pte_isnull(pte)); - - set_pte(ptep_dest, pte); - pmm_ref_page(pte_paddr(pte)); - - ptep++; - ptep_dest++; + src++; + dest++; } - - return pte_paddr(*(ptep_dest + 1)); } static void -vmsfree(ptr_t vm_mnt) +vmrfree(ptr_t vm_mnt, struct mm_region* region) { - pte_t* ptep_head = mkl0tep(mkptep_va(vm_mnt, 0)); - pte_t* ptep_kernel = mkl0tep(mkptep_va(vm_mnt, KERNEL_RESIDENT)); + pte_t *src, *end; + ptr_t loc; + int level; + struct leaflet* leaflet; + + loc = region->start; + src = mkptep_va(vm_mnt, region->start); + end = mkptep_va(vm_mnt, region->end); + + level = __descend(vm_mnt, vm_mnt, loc, false); - int level = 0; - pte_t* ptep = ptep_head; - while (ptep < ptep_kernel) + while (src < end) { - pte_t pte = *ptep; + pte_t pte = *src; ptr_t pa = pte_paddr(pte); if (pte_isnull(pte)) { @@ -124,27 +169,159 @@ vmsfree(ptr_t vm_mnt) } if (!pt_last_level(level) && !pte_huge(pte)) { - ptep = ptep_step_into(ptep); + src = ptep_step_into(src); level++; continue; } - if (pte_isloaded(pte)) - pmm_free_any(pa); + set_pte(src, null_pte); + + if (pte_isloaded(pte)) { + leaflet = pte_leaflet_aligned(pte); + leaflet_return(leaflet); + + src += __ptep_advancement(leaflet, level); + } cont: - if (ptep_vfn(ptep) == MAX_PTEN - 1) { - ptep = ptep_step_out(ptep); - pmm_free_any(pte_paddr(pte_at(ptep))); + while (ptep_vfn(src) == MAX_PTEN - 1) { + src = ptep_step_out(src); + free_pagetable_trace(src, pte_at(src), level); + level--; } - ptep++; + src++; + } +} + +static void +vmscpy(struct proc_mm* dest_mm, struct proc_mm* src_mm) +{ + // Build the self-reference on dest vms + + /* + * -- What the heck are ptep_ssm and ptep_sms ? -- + * + * ptep_dest point to the pagetable itself that is mounted + * at dest_mnt (or simply mnt): + * mnt -> self -> self -> self -> L0TE@offset + * + * ptep_sms shallowed the recursion chain: + * self -> mnt -> self -> self -> L0TE@self + * + * ptep_ssm shallowed the recursion chain: + * self -> self -> mnt -> self -> L0TE@self + * + * Now, here is the problem, back to x86_32, the translation is + * a depth-3 recursion: + * L0T -> LFT -> Page + * + * So ptep_ssm will terminate at mnt and give us a leaf + * slot for allocate a fresh page table for mnt: + * self -> self -> L0TE@mnt + * + * but in x86_64 translation has extra two more step: + * L0T -> L1T -> L2T -> LFT -> Page + * + * So we must continue push down.... 
+ * ptep_sssms shallowed the recursion chain: + * self -> self -> self -> mnt -> L0TE@self + * + * ptep_ssssm shallowed the recursion chain: + * self -> self -> self -> self -> L0TE@mnt + * + * Note: PML4: 2 extra steps + * PML5: 3 extra steps + */ + + ptr_t dest_mnt, src_mnt; + + dest_mnt = dest_mm->vm_mnt; + assert(dest_mnt); + + pte_t* ptep_ssm = mkl0tep_va(VMS_SELF, dest_mnt); + pte_t* ptep_smx = mkl1tep_va(VMS_SELF, dest_mnt); + pte_t pte_sms = mkpte_prot(KERNEL_PGTAB); + + pte_sms = alloc_pagetable_trace(ptep_ssm, pte_sms, 0, 0); + set_pte(&ptep_smx[VMS_SELF_L0TI], pte_sms); + + tlb_flush_kernel((ptr_t)dest_mnt); + + if (!src_mm) { + goto done; + } + + src_mnt = src_mm->vm_mnt; + + struct mm_region *pos, *n; + llist_for_each(pos, n, &src_mm->regions, head) + { + vmrcpy(dest_mnt, src_mnt, pos); + } + +done:; + procvm_link_kernel(dest_mnt); + + dest_mm->vmroot = pte_paddr(pte_sms); +} + +static void +__purge_vms_residual(struct proc_mm* mm, int level, ptr_t va) +{ + pte_t *ptep, pte; + ptr_t _va; + + if (level >= MAX_LEVEL) { + return; + } + + ptep = mklntep_va(level, mm->vm_mnt, va); + + for (unsigned i = 0; i < LEVEL_SIZE; i++, ptep++) + { + pte = pte_at(ptep); + if (pte_isnull(pte) || !pte_isloaded(pte)) { + continue; + } + + if (lntep_implie_vmnts(ptep, lnt_page_size(level))) { + continue; + } + + _va = va + (i * lnt_page_size(level)); + __purge_vms_residual(mm, level + 1, _va); + + set_pte(ptep, null_pte); + leaflet_return(pte_leaflet_aligned(pte)); } +} - ptr_t self_pa = pte_paddr(ptep_head[MAX_PTEN - 1]); - pmm_free_any(self_pa); +static void +vmsfree(struct proc_mm* mm) +{ + struct leaflet* leaflet; + struct mm_region *pos, *n; + ptr_t vm_mnt; + pte_t* ptep_self; + + vm_mnt = mm->vm_mnt; + ptep_self = mkl0tep_va(vm_mnt, VMS_SELF); + + // first pass: free region mappings + llist_for_each(pos, n, &mm->regions, head) + { + vmrfree(vm_mnt, pos); + } + + procvm_unlink_kernel(vm_mnt); + + // free up all allocated tables on intermediate levels + __purge_vms_residual(mm, 0, 0); + + free_pagetable_trace(ptep_self, pte_at(ptep_self), 0); } static inline void @@ -167,6 +344,11 @@ __detach_from_current_vms(struct proc_mm* guest_mm) } } +void +procvm_prune_vmr(ptr_t vm_mnt, struct mm_region* region) +{ + vmrfree(vm_mnt, region); +} void procvm_dupvms_mount(struct proc_mm* mm) { @@ -179,14 +361,20 @@ procvm_dupvms_mount(struct proc_mm* mm) { mm->heap = mm_current->heap; mm->vm_mnt = VMS_MOUNT_1; - mm->vmroot = vmscpy(VMS_MOUNT_1, VMS_SELF, false); + vmscpy(mm, mm_current); region_copy_mm(mm_current, mm); } void procvm_mount(struct proc_mm* mm) { + // if current mm is already active + if (active_vms(mm->vm_mnt)) { + return; + } + + // we are double mounting assert(!mm->vm_mnt); assert(mm->vmroot); @@ -200,9 +388,13 @@ procvm_mount(struct proc_mm* mm) void procvm_unmount(struct proc_mm* mm) { + if (active_vms(mm->vm_mnt)) { + return; + } + assert(mm->vm_mnt); - vms_unmount(VMS_MOUNT_1); + struct proc_mm* mm_current = vmspace(__current); if (mm_current) { mm_current->guest_mm = NULL; @@ -219,22 +411,28 @@ procvm_initvms_mount(struct proc_mm* mm) __attach_to_current_vms(mm); mm->vm_mnt = VMS_MOUNT_1; - mm->vmroot = vmscpy(VMS_MOUNT_1, VMS_SELF, true); + vmscpy(mm, NULL); } void procvm_unmount_release(struct proc_mm* mm) { ptr_t vm_mnt = mm->vm_mnt; struct mm_region *pos, *n; + llist_for_each(pos, n, &mm->regions, head) { mem_sync_pages(vm_mnt, pos, pos->start, pos->end - pos->start, 0); + } + + vmsfree(mm); + + llist_for_each(pos, n, &mm->regions, head) + { region_release(pos); } - vfree(mm); - 
vmsfree(vm_mnt); vms_unmount(vm_mnt); + vfree(mm); __detach_from_current_vms(mm); } @@ -243,7 +441,6 @@ void procvm_mount_self(struct proc_mm* mm) { assert(!mm->vm_mnt); - assert(!mm->guest_mm); mm->vm_mnt = VMS_SELF; } @@ -251,7 +448,7 @@ procvm_mount_self(struct proc_mm* mm) void procvm_unmount_self(struct proc_mm* mm) { - assert(mm->vm_mnt == VMS_SELF); + assert(active_vms(mm->vm_mnt)); mm->vm_mnt = 0; } @@ -263,7 +460,7 @@ procvm_enter_remote(struct remote_vmctx* rvmctx, struct proc_mm* mm, ptr_t vm_mnt = mm->vm_mnt; assert(vm_mnt); - pfn_t size_pn = pfn(size + MEM_PAGE); + pfn_t size_pn = pfn(size + PAGE_SIZE); assert(size_pn < REMOTEVM_MAX_PAGES); struct mm_region* region = region_get(&mm->regions, remote_base); @@ -272,25 +469,27 @@ procvm_enter_remote(struct remote_vmctx* rvmctx, struct proc_mm* mm, rvmctx->vms_mnt = vm_mnt; rvmctx->page_cnt = size_pn; - remote_base = va_align(remote_base); + remote_base = page_aligned(remote_base); rvmctx->remote = remote_base; - rvmctx->local_mnt = PG_MOUNT_4_END + 1; + rvmctx->local_mnt = PG_MOUNT_VAR; pte_t* rptep = mkptep_va(vm_mnt, remote_base); pte_t* lptep = mkptep_va(VMS_SELF, rvmctx->local_mnt); - unsigned int pattr = region_pteprot(region); + + pte_t pte, rpte = null_pte; + rpte = region_tweakpte(region, rpte); for (size_t i = 0; i < size_pn; i++) { - pte_t pte = vmm_tryptep(rptep, PAGE_SIZE); + pte = vmm_tryptep(rptep, PAGE_SIZE); if (pte_isloaded(pte)) { set_pte(lptep, pte); continue; } - ptr_t pa = pmm_alloc_page(0); + ptr_t pa = ppage_addr(pmm_alloc_normal(0)); set_pte(lptep, mkpte(pa, KERNEL_DATA)); - set_pte(rptep, mkpte(pa, pattr)); + set_pte(rptep, pte_setpaddr(rpte, pa)); } return vm_mnt;
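
The new vmrcpy() and vmrfree() bodies above replace the old whole-VMS copy with a per-region walk that uses ptep_step_into()/ptep_step_out() to descend into non-leaf entries and climb back out when a table is exhausted, all without recursion. The toy program below shows only that control-flow shape on a software-defined tree; struct node, the 16-entry fanout, the fixed depth and the walk() helper are assumptions made for illustration, not the Lunaix page-table API.

/*
 * Toy sketch of the step-into / step-out walk used by vmrcpy() and
 * vmrfree().  Everything here (struct node, FANOUT, DEPTH) is invented
 * for illustration; only the control-flow pattern mirrors the diff.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

#define FANOUT 16   /* entries per table level (assumed) */
#define DEPTH   4   /* maximum nesting depth (assumed)   */

struct node {
    bool is_leaf;
    int value;                   /* meaningful when is_leaf           */
    struct node *child[FANOUT];  /* meaningful when !is_leaf          */
};

/* Visit every leaf iteratively, keeping one cursor per level. */
static void walk(struct node *root)
{
    struct node *tab[DEPTH];     /* table being scanned at each level */
    int idx[DEPTH];              /* next entry index at each level    */
    int level = 0;

    tab[0] = root;
    idx[0] = 0;

    while (level >= 0) {
        if (idx[level] == FANOUT) {          /* table done: step out  */
            level--;
            continue;
        }

        struct node *entry = tab[level]->child[idx[level]++];

        if (!entry)                          /* empty slot: skip      */
            continue;

        if (entry->is_leaf) {                /* leaf: act on mapping  */
            printf("leaf %d found below level %d\n", entry->value, level);
            continue;
        }

        level++;                             /* non-leaf: step into   */
        tab[level] = entry;
        idx[level] = 0;
    }
}

static struct node *leaf(int v)
{
    struct node *n = calloc(1, sizeof(*n));
    n->is_leaf = true;
    n->value = v;
    return n;
}

int main(void)
{
    struct node root = { 0 }, mid = { 0 };

    mid.child[3] = leaf(42);
    mid.child[9] = leaf(7);
    root.child[1] = &mid;
    root.child[5] = leaf(1);

    walk(&root);
    return 0;
}

The real loops differ in the bookkeeping (they advance a virtual address by lnt_page_size(level) and step out when ptep_vfn() hits MAX_PTEN - 1), but the descend/ascend structure is the same.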
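
copy_leaf() in the new vmrcpy() duplicates a present mapping by writing the same PTE into the destination table and taking an extra reference on the backing leaflet via leaflet_borrow(), so no page contents are copied when a VMS is duplicated. The sketch below shows that share-by-reference idea with made-up stand-ins (struct frame, PTE_PRESENT, pte_frame()); it also bumps the count unconditionally for present mappings, whereas the real copy_leaf() only borrows when the PTE points at the leaflet's head page.

/*
 * Sketch of the share-by-reference copy performed by copy_leaf():
 * the destination gets the same PTE bits and the backing frame's
 * reference count is bumped, so nothing is memcpy'd at fork time.
 * struct frame and the helpers are simplified stand-ins, not the
 * Lunaix leaflet API.
 */
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

typedef uint64_t pte_t;
#define PTE_PRESENT 0x1ull
#define NFRAMES     1024

struct frame {
    int refcount;
};

static struct frame frames[NFRAMES];        /* toy physical frame table */

static struct frame *pte_frame(pte_t pte)   /* frame backing a mapping  */
{
    return &frames[(pte >> 12) % NFRAMES];
}

/* Copy one leaf mapping from src to dest, sharing the frame. */
static void copy_leaf_sketch(pte_t *dest, const pte_t *src)
{
    pte_t pte = *src;

    *dest = pte;                            /* same translation + flags */

    if (!(pte & PTE_PRESENT))
        return;                             /* nothing mapped, done     */

    struct frame *f = pte_frame(pte);
    assert(f->refcount > 0);                /* source must hold a ref   */
    f->refcount++;                          /* cf. leaflet_borrow()     */
}

int main(void)
{
    pte_t parent = (5ull << 12) | PTE_PRESENT;  /* frame 5, present     */
    pte_t child  = 0;

    frames[5].refcount = 1;                 /* owned by the parent      */
    copy_leaf_sketch(&child, &parent);

    printf("child pte=%#llx, frame refcount=%d\n",
           (unsigned long long)child, frames[5].refcount);
    return 0;
}

The matching teardown in vmrfree() is the mirror image: it clears the PTE and calls leaflet_return(), so the frame is only released once the last address space referencing it is gone.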
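
The long comment in the new vmscpy() explains the recursive self-reference slot (ptep_sms / ptep_ssm) and why deeper page-table formats need extra trips through it. The arithmetic behind that trick is sketched below for a 4-level x86_64-style geometry: each additional pass through the self slot strips one translation step, exposing the next higher table as ordinary memory. SELF_INDEX, the bit widths and the helper names are assumptions for illustration and are not taken from the Lunaix headers.

/*
 * Sketch of the recursive ("self reference") mapping arithmetic that
 * the ptep_sms / ptep_ssm comment in vmscpy() describes.  Only the
 * push-one-more-level-per-walk idea is the point; the constants are
 * assumed, not Lunaix's.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12u
#define LEVEL_BITS  9u                 /* 512 entries per table        */
#define SELF_INDEX 510ull              /* L0 slot pointing back at L0  */

/* Sign-extend a 48-bit virtual address into canonical form. */
static uint64_t canonical(uint64_t va)
{
    return (va & (1ull << 47)) ? (va | 0xffff000000000000ull) : va;
}

/*
 * One trip through SELF_INDEX removes the final translation step, so
 * the leaf PTE mapping `va` becomes an addressable 8-byte slot.
 */
static uint64_t leaf_pte_va(uint64_t va)
{
    uint64_t vpn = (va >> PAGE_SHIFT) & ((1ull << 36) - 1);
    return canonical((SELF_INDEX << 39) | (vpn << 3));
}

/*
 * Two trips through SELF_INDEX surface the next level up (the entry
 * that maps the page table itself); three and four trips continue the
 * pattern, which is the extra "push down" the comment calls out for
 * PML4/PML5 compared with the depth-3 x86_32 case.
 */
static uint64_t dir_pte_va(uint64_t va)
{
    uint64_t idx = (va >> (PAGE_SHIFT + LEVEL_BITS)) & ((1ull << 27) - 1);
    return canonical((SELF_INDEX << 39) | (SELF_INDEX << 30) | (idx << 3));
}

int main(void)
{
    uint64_t va = 0x00007f0000001000ull;

    printf("va  %#018llx\n", (unsigned long long)va);
    printf("pte %#018llx\n", (unsigned long long)leaf_pte_va(va));
    printf("pde %#018llx\n", (unsigned long long)dir_pte_va(va));
    return 0;
}

This is why vmscpy() only has to allocate one fresh table and install it at the destination mount's self slot before flushing the TLB: once that slot exists, every deeper table of the new VMS can be reached through the same recursive addressing without mapping it anywhere else.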