X-Git-Url: https://scm.lunaixsky.com/lunaix-os.git/blobdiff_plain/b60166b327a9108b07e3069fa6568a451529ffd9..34f6af4f61e0eec9c96113e07f140b609b4113c8:/lunaix-os/kernel/mm/procvm.c

diff --git a/lunaix-os/kernel/mm/procvm.c b/lunaix-os/kernel/mm/procvm.c
index d9e357a..31d9f3f 100644
--- a/lunaix-os/kernel/mm/procvm.c
+++ b/lunaix-os/kernel/mm/procvm.c
@@ -1,18 +1,17 @@
 #include
 #include
 #include
-#include
-#include
+#include
 #include
 #include
-#include
+#include
 #include
 
 struct proc_mm*
 procvm_create(struct proc_info* proc) {
-    struct proc_mm* mm = valloc(sizeof(struct proc_mm));
+    struct proc_mm* mm = vzalloc(sizeof(struct proc_mm));
 
     assert(mm);
 
@@ -23,122 +22,387 @@ procvm_create(struct proc_info* proc) {
     return mm;
 }
 
+static inline unsigned int
+__ptep_advancement(struct leaflet* leaflet, int level)
+{
+    size_t shifts = MAX(MAX_LEVEL - level - 1, 1) * LEVEL_SHIFT;
+    return (1 << (leaflet_order(leaflet) % shifts)) - 1;
+}
+
+static inline int
+__descend(ptr_t dest_mnt, ptr_t src_mnt, ptr_t va, bool alloc)
+{
+    pte_t *dest, *src, pte;
+
+    int i = 0;
+    while (!pt_last_level(i))
+    {
+        dest = mklntep_va(i, dest_mnt, va);
+        src = mklntep_va(i, src_mnt, va);
+        pte = pte_at(src);
+
+        if (!pte_isloaded(pte) || pte_huge(pte)) {
+            break;
+        }
+
+        if (alloc && pte_isnull(pte_at(dest))) {
+            alloc_kpage_at(dest, pte, 0);
+        }
+
+        i++;
+    }
+
+    return i;
+}
+
+static inline void
+copy_leaf(pte_t* dest, pte_t* src, pte_t pte, int level)
+{
+    struct leaflet* leaflet;
+
+    set_pte(dest, pte);
+
+    if (!pte_isloaded(pte)) {
+        return;
+    }
+
+    leaflet = pte_leaflet(pte);
+    assert(leaflet_refcount(leaflet));
+
+    if (leaflet_ppfn(leaflet) == pte_ppfn(pte)) {
+        leaflet_borrow(leaflet);
+    }
+}
 
-static ptr_t
-__dup_vmspace(ptr_t mount_point, bool only_kernel)
+static inline void
+copy_root(pte_t* dest, pte_t* src, pte_t pte, int level)
 {
-    ptr_t ptd_pp = pmm_alloc_page(PP_FGPERSIST);
-    vmm_set_mapping(VMS_SELF, PG_MOUNT_1, ptd_pp, PG_PREM_RW, VMAP_NULL);
+    alloc_kpage_at(dest, pte, 0);
+}
+
+static void
+vmrcpy(ptr_t dest_mnt, ptr_t src_mnt, struct mm_region* region)
+{
+    pte_t *src, *dest;
+    ptr_t loc;
+    int level;
+    struct leaflet* leaflet;
+
+    loc = region->start;
+    src = mkptep_va(src_mnt, loc);
+    dest = mkptep_va(dest_mnt, loc);
+
+    level = __descend(dest_mnt, src_mnt, loc, true);
+
+    while (loc < region->end)
+    {
+        pte_t pte = *src;
+
+        if (pte_isnull(pte)) {
+            goto cont;
+        }
+
+        if (pt_last_level(level) || pte_huge(pte)) {
+            copy_leaf(dest, src, pte, level);
+            goto cont;
+        }
+
+        if (!pt_last_level(level)) {
+            copy_root(dest, src, pte, level);
-    x86_page_table* ptd = (x86_page_table*)PG_MOUNT_1;
-    x86_page_table* pptd = (x86_page_table*)(mount_point | (0x3FF << 12));
+
+            src = ptep_step_into(src);
+            dest = ptep_step_into(dest);
+            level++;
-    size_t kspace_l1inx = L1_INDEX(KERNEL_EXEC);
-    size_t i = 1; // skip the first 4MiB, to avoid bringing in other threads' stacks
+
+            continue;
+        }
+
+    cont:
+        loc += lnt_page_size(level);
+        while (ptep_vfn(src) == MAX_PTEN - 1) {
+            assert(level > 0);
+            src = ptep_step_out(src);
+            dest = ptep_step_out(dest);
+            level--;
+        }
-    ptd->entry[0] = 0;
-    if (only_kernel) {
-        i = kspace_l1inx;
-        memset(ptd, 0, PG_SIZE);
+
+        src++;
+        dest++;
+    }
 }
 
-    for (; i < PG_MAX_ENTRIES - 1; i++) {
+static void
+vmrfree(ptr_t vm_mnt, struct mm_region* region)
+{
+    pte_t *src, *end;
+    ptr_t loc;
+    int level;
+    struct leaflet* leaflet;
+
+    loc = region->start;
+    src = mkptep_va(vm_mnt, region->start);
+    end = mkptep_va(vm_mnt, region->end);
+
+    level = __descend(0, vm_mnt, loc, false);
+
+    while (src < end)
+    {
+        pte_t pte = *src;
+        ptr_t pa = pte_paddr(pte);
+
+        if (pte_isnull(pte)) {
+            goto cont;
+        }
+
+        if (!pt_last_level(level) && !pte_huge(pte)) {
+            src = ptep_step_into(src);
+            level++;
-        x86_pte_t ptde = pptd->entry[i];
-        // Empty or non-present L1 entries are copied over verbatim.
-        // The kernel address space is shared directly.
-        if (!ptde || i >= kspace_l1inx || !(ptde & PG_PRESENT)) {
-            ptd->entry[i] = ptde;
             continue;
         }
-        // Copy the L2 page table
-        ptr_t pt_pp = pmm_alloc_page(PP_FGPERSIST);
-        vmm_set_mapping(VMS_SELF, PG_MOUNT_2, pt_pp, PG_PREM_RW, VMAP_NULL);
+        if (pte_isloaded(pte)) {
+            leaflet = pte_leaflet_aligned(pte);
+            leaflet_return(leaflet);
-        x86_page_table* ppt = (x86_page_table*)(mount_point | (i << 12));
-        x86_page_table* pt = (x86_page_table*)PG_MOUNT_2;
+
+            src += __ptep_advancement(leaflet, level);
+        }
-        for (size_t j = 0; j < PG_MAX_ENTRIES; j++) {
-            x86_pte_t pte = ppt->entry[j];
-            pmm_ref_page(PG_ENTRY_ADDR(pte));
-            pt->entry[j] = pte;
+
+    cont:
+        while (ptep_vfn(src) == MAX_PTEN - 1) {
+            src = ptep_step_out(src);
+            leaflet = pte_leaflet_aligned(pte_at(src));
+
+            assert(leaflet_order(leaflet) == 0);
+            leaflet_return(leaflet);
+
+            level--;
         }
-        ptd->entry[i] = (ptr_t)pt_pp | PG_ENTRY_FLAGS(ptde);
 
+        src++;
     }
 }
+
+static void
+vmscpy(struct proc_mm* dest_mm, struct proc_mm* src_mm)
+{
+    // Build the self-reference on the dest vms
+
+    /*
+     * -- What the heck are ptep_ssm and ptep_sms? --
+     *
+     * ptep_dest points to the page table itself, which is mounted
+     * at dest_mnt (or simply mnt):
+     *      mnt -> self -> self -> self -> L0TE@offset
+     *
+     * ptep_sms shortcuts the recursion chain:
+     *      self -> mnt -> self -> self -> L0TE@self
+     *
+     * ptep_ssm shortcuts the recursion chain:
+     *      self -> self -> mnt -> self -> L0TE@self
+     *
+     * Now, here is the problem: back on x86_32, the translation is
+     * a depth-3 recursion:
+     *      L0T -> LFT -> Page
+     *
+     * So ptep_ssm will terminate at mnt and give us a leaf
+     * slot for allocating a fresh page table for mnt:
+     *      self -> self -> L0TE@mnt
+     *
+     * but on x86_64 the translation has two extra steps:
+     *      L0T -> L1T -> L2T -> LFT -> Page
+     *
+     * So we must continue pushing down...
+     * ptep_sssms shortcuts the recursion chain:
+     *      self -> self -> self -> mnt -> L0TE@self
+     *
+     * ptep_ssssm shortcuts the recursion chain:
+     *      self -> self -> self -> self -> L0TE@mnt
+     *
+     * Note: PML4: 2 extra steps
+     *       PML5: 3 extra steps
+     */
+
+    ptr_t dest_mnt, src_mnt;
+
+    dest_mnt = dest_mm->vm_mnt;
+    assert(dest_mnt);
+
+    pte_t* ptep_ssm = mkl0tep_va(VMS_SELF, dest_mnt);
+    pte_t* ptep_smx = mkl1tep_va(VMS_SELF, dest_mnt);
+    pte_t pte_sms = mkpte_prot(KERNEL_PGTAB);
-    ptd->entry[PG_MAX_ENTRIES - 1] = NEW_L1_ENTRY(T_SELF_REF_PERM, ptd_pp);
+    pte_sms = alloc_kpage_at(ptep_ssm, pte_sms, 0);
+    set_pte(&ptep_smx[VMS_SELF_L0TI], pte_sms);
+
+    tlb_flush_kernel((ptr_t)dest_mnt);
-    return ptd_pp;
+    if (!src_mm) {
+        goto done;
+    }
+
+    src_mnt = src_mm->vm_mnt;
+
+    struct mm_region *pos, *n;
+    llist_for_each(pos, n, &src_mm->regions, head)
+    {
+        vmrcpy(dest_mnt, src_mnt, pos);
+    }
+
+done:;
+    procvm_link_kernel(dest_mnt);
+
+    dest_mm->vmroot = pte_paddr(pte_sms);
 }
 
+static void
+vmsfree(struct proc_mm* mm)
+{
+    struct leaflet* leaflet;
+    ptr_t vm_mnt;
+    pte_t* ptep_self;
+
+    vm_mnt = mm->vm_mnt;
+    ptep_self = mkl0tep(mkptep_va(vm_mnt, VMS_SELF));
+
+    struct mm_region *pos, *n;
+    llist_for_each(pos, n, &mm->regions, head)
+    {
+        vmrfree(vm_mnt, pos);
+    }
+
+    procvm_unlink_kernel();
+
+    leaflet = pte_leaflet_aligned(pte_at(ptep_self));
+    leaflet_return(leaflet);
+}
+
+static inline void
+__attach_to_current_vms(struct proc_mm* guest_mm)
+{
+    struct proc_mm* mm_current = vmspace(__current);
+    if (mm_current) {
+        assert(!mm_current->guest_mm);
+        mm_current->guest_mm = guest_mm;
+    }
+}
+
+static inline void
+__detach_from_current_vms(struct proc_mm* guest_mm)
+{
+    struct proc_mm* mm_current = vmspace(__current);
+    if (mm_current) {
+        assert(mm_current->guest_mm == guest_mm);
+        mm_current->guest_mm = NULL;
+    }
+}
+
 
 void
-procvm_dup(struct proc_info* proc) {
-    struct proc_mm* mm = vmspace(proc);
-    struct proc_mm* mm_current = vmspace(__current);
-
-    mm->heap = mm_current->heap;
-    mm->vmroot = __dup_vmspace(VMS_SELF, false);
+procvm_dupvms_mount(struct proc_mm* mm) {
+    assert(__current);
+    assert(!mm->vm_mnt);
+
+    struct proc_mm* mm_current = vmspace(__current);
+
+    __attach_to_current_vms(mm);
 
-    region_copy_mm(mm_current, mm);
+    mm->heap = mm_current->heap;
+    mm->vm_mnt = VMS_MOUNT_1;
+
+    vmscpy(mm, mm_current);
+    region_copy_mm(mm_current, mm);
 }
 
 void
-procvm_init_clean(struct proc_info* proc)
+procvm_mount(struct proc_mm* mm)
 {
-    struct proc_mm* mm = vmspace(proc);
-    mm->vmroot = __dup_vmspace(VMS_SELF, true);
-}
+    // no-op if this mm is already the active vms
+    if (active_vms(mm->vm_mnt)) {
+        return;
+    }
+
+    // otherwise, we must not be double mounting
+    assert(!mm->vm_mnt);
+    assert(mm->vmroot);
 
+    vms_mount(VMS_MOUNT_1, mm->vmroot);
-static void
-__delete_vmspace(ptr_t vm_mnt)
+    __attach_to_current_vms(mm);
+
+    mm->vm_mnt = VMS_MOUNT_1;
+}
+
+void
+procvm_unmount(struct proc_mm* mm)
 {
-    x86_page_table* pptd = (x86_page_table*)(vm_mnt | (0x3FF << 12));
+    if (active_vms(mm->vm_mnt)) {
+        return;
+    }
+
+    assert(mm->vm_mnt);
+    vms_unmount(VMS_MOUNT_1);
+
+    struct proc_mm* mm_current = vmspace(__current);
+    if (mm_current) {
+        mm_current->guest_mm = NULL;
+    }
 
-    // only remove the user address space
-    for (size_t i = 0; i < L1_INDEX(KERNEL_EXEC); i++) {
-        x86_pte_t ptde = pptd->entry[i];
-        if (!ptde || !(ptde & PG_PRESENT)) {
-            continue;
-        }
+    mm->vm_mnt = 0;
+}
 
-        x86_page_table* ppt = (x86_page_table*)(vm_mnt | (i << 12));
+void
+procvm_initvms_mount(struct proc_mm* mm)
+{
+    assert(!mm->vm_mnt);
 
-        for (size_t j = 0; j < PG_MAX_ENTRIES; j++) {
-            x86_pte_t pte = ppt->entry[j];
-            // free the 4KB data page
-            if ((pte & PG_PRESENT)) {
-                pmm_free_page(PG_ENTRY_ADDR(pte));
-            }
-        }
-        // free the L2 page table
-        pmm_free_page(PG_ENTRY_ADDR(ptde));
-    }
-    // free the L1 directory
-    pmm_free_page(PG_ENTRY_ADDR(pptd->entry[PG_MAX_ENTRIES - 1]));
+    __attach_to_current_vms(mm);
+
+    mm->vm_mnt = VMS_MOUNT_1;
+    vmscpy(mm, NULL);
 }
 
 void
-procvm_cleanup(ptr_t vm_mnt, struct proc_info* proc) {
+procvm_unmount_release(struct proc_mm* mm) {
+    ptr_t vm_mnt = mm->vm_mnt;
     struct mm_region *pos, *n;
-    llist_for_each(pos, n, vmregions(proc), head)
+    llist_for_each(pos, n, &mm->regions, head)
     {
         mem_sync_pages(vm_mnt, pos, pos->start, pos->end - pos->start, 0);
         region_release(pos);
     }
 
-    vfree(proc->mm);
+    vmsfree(mm);
+    vms_unmount(vm_mnt);
+    vfree(mm);
 
-    __delete_vmspace(vm_mnt);
+    __detach_from_current_vms(mm);
+}
+
+void
+procvm_mount_self(struct proc_mm* mm)
+{
+    assert(!mm->vm_mnt);
+
+    mm->vm_mnt = VMS_SELF;
+}
+
+void
+procvm_unmount_self(struct proc_mm* mm)
+{
+    assert(active_vms(mm->vm_mnt));
+
+    mm->vm_mnt = 0;
 }
 
 ptr_t
 procvm_enter_remote(struct remote_vmctx* rvmctx, struct proc_mm* mm,
-                    ptr_t vm_mnt, ptr_t remote_base, size_t size)
+                    ptr_t remote_base, size_t size)
 {
-    ptr_t size_pn = PN(size + MEM_PAGE);
+    ptr_t vm_mnt = mm->vm_mnt;
+    assert(vm_mnt);
+
+    pfn_t size_pn = pfn(size + PAGE_SIZE);
     assert(size_pn < REMOTEVM_MAX_PAGES);
 
     struct mm_region* region = region_get(&mm->regions, remote_base);
@@ -147,23 +411,27 @@ procvm_enter_remote(struct remote_vmctx* rvmctx, struct proc_mm* mm,
     rvmctx->vms_mnt = vm_mnt;
     rvmctx->page_cnt = size_pn;
 
-    remote_base = PG_ALIGN(remote_base);
+    remote_base = page_aligned(remote_base);
     rvmctx->remote = remote_base;
-    rvmctx->local_mnt = PG_MOUNT_4_END + 1;
+    rvmctx->local_mnt = PG_MOUNT_VAR;
+
+    pte_t* rptep = mkptep_va(vm_mnt, remote_base);
+    pte_t* lptep = mkptep_va(VMS_SELF, rvmctx->local_mnt);
 
-    v_mapping m;
-    unsigned int pattr = region_ptattr(region);
-    ptr_t raddr = remote_base, lmnt = rvmctx->local_mnt;
-    for (size_t i = 0; i < size_pn; i++, lmnt += MEM_PAGE, raddr += MEM_PAGE)
+    pte_t pte, rpte = null_pte;
+    rpte = region_tweakpte(region, rpte);
+
+    for (size_t i = 0; i < size_pn; i++)
     {
-        if (vmm_lookupat(vm_mnt, raddr, &m) && PG_IS_PRESENT(m.flags)) {
-            vmm_set_mapping(VMS_SELF, lmnt, m.pa, PG_PREM_RW, 0);
+        pte = vmm_tryptep(rptep, PAGE_SIZE);
+        if (pte_isloaded(pte)) {
+            set_pte(lptep, pte);
             continue;
         }
 
-        ptr_t pa = pmm_alloc_page(0);
-        vmm_set_mapping(VMS_SELF, lmnt, pa, PG_PREM_RW, 0);
-        vmm_set_mapping(vm_mnt, raddr, pa, pattr, 0);
+        ptr_t pa = ppage_addr(pmm_alloc_normal(0));
+        set_pte(lptep, mkpte(pa, KERNEL_DATA));
+        set_pte(rptep, pte_setpaddr(rpte, pa));
     }
 
     return vm_mnt;
@@ -179,7 +447,7 @@ procvm_copy_remote_transaction(struct remote_vmctx* rvmctx,
     }
 
     ptr_t offset = remote_dest - rvmctx->remote;
-    if (PN(offset + sz) >= rvmctx->page_cnt) {
+    if (pfn(offset + sz) >= rvmctx->page_cnt) {
        return -1;
     }
 
@@ -189,11 +457,8 @@
 }
 
 void
-procvm_exit_remote_transaction(struct remote_vmctx* rvmctx)
+procvm_exit_remote(struct remote_vmctx* rvmctx)
 {
-    ptr_t lmnt = rvmctx->local_mnt;
-    for (size_t i = 0; i < rvmctx->page_cnt; i++, lmnt += MEM_PAGE)
-    {
-        vmm_del_mapping(VMS_SELF, lmnt);
-    }
+    pte_t* lptep = mkptep_va(VMS_SELF, rvmctx->local_mnt);
+    vmm_unset_ptes(lptep, rvmctx->page_cnt);
 }
\ No newline at end of file
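
For reference, below is a minimal, userspace-runnable sketch of the recursive
page-table arithmetic that the vmscpy() comment in this diff describes,
assuming 4-level x86_64 paging (9-bit indices, 4KiB pages, 8-byte entries).
SELF_IDX, MOUNT_IDX and guest_ptep_va() are hypothetical stand-ins, not
Lunaix's mkptep_va()/mkl0tep_va(); they only demonstrate how routing a table
walk through the self-reference slot ("self") and a mount slot ("mnt") of the
current root exposes a mounted guest's page-table entries as ordinary virtual
addresses.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SELF_IDX  510ULL   /* hypothetical L0 slot holding the self-reference   */
#define MOUNT_IDX 508ULL   /* hypothetical L0 slot where a guest vms is mounted */

/* Sign-extend bit 47 so the computed address is canonical on x86_64. */
static uint64_t canonical(uint64_t u)
{
    return (u & (1ULL << 47)) ? (u | 0xffff000000000000ULL) : u;
}

/*
 * Virtual address at which the level-`lvl` entry translating guest VA `v`
 * becomes addressable (lvl 3 = leaf PTE, lvl 0 = entry in the guest root).
 * lvl 0 walks self -> self -> self -> mnt before the final access, the same
 * chain shape the vmscpy() comment sketches; lvl 3 enters the mount slot
 * immediately.
 */
static uint64_t guest_ptep_va(uint64_t v, int lvl)
{
    uint64_t u = 0;
    int pos = 39;

    v &= (1ULL << 48) - 1;              /* keep the 48 translated bits */

    for (int k = 0; k < 3 - lvl; k++) { /* loop through the self slot  */
        u |= SELF_IDX << pos;
        pos -= 9;
    }
    u |= MOUNT_IDX << pos;              /* one step into the guest     */

    /* Replay v's upper indices one level down; the index into the
     * target table becomes the byte offset (8 bytes per entry). */
    u |= (v >> (9 * (4 - lvl))) & ~7ULL;

    return canonical(u);
}

int main(void)
{
    uint64_t va = 0x00007f1234567000ULL; /* arbitrary guest user VA */

    for (int lvl = 0; lvl < 4; lvl++)
        printf("level %d entry for 0x%" PRIx64 " is at 0x%" PRIx64 "\n",
               lvl, va, guest_ptep_va(va, lvl));

    return 0;
}

With PML5 the same construction gains one more self step per chain, which is
what the comment's "PML5: 3 extra steps" note refers to.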