Boot framework rework (#45)

[lunaix-os.git] / lunaix-os / kernel / process / sched.c
diff --git a/lunaix-os/kernel/process/sched.c b/lunaix-os/kernel/process/sched.c

index 91932ca3dd5d7d4aa3ac3c7c9e170b89cc8d2cb2..945fc01bf6db0a8ae5edb1fcc48f867732fb3cbd 100644 (file)
--- a/lunaix-os/kernel/process/sched.c
+++ b/lunaix-os/kernel/process/sched.c
@@ -1,15 +1,15 @@
-#include <arch/x86/interrupts.h>
-#include <arch/x86/tss.h>
+#include <sys/abi.h>
+#include <sys/mm/mempart.h>
  
-#include <hal/apic.h>
-#include <hal/cpu.h>
+#include <sys/cpu.h>
  
  #include <lunaix/fs/taskfs.h>
  #include <lunaix/mm/cake.h>
-#include <lunaix/mm/kalloc.h>
+#include <lunaix/mm/mmap.h>
  #include <lunaix/mm/pmm.h>
  #include <lunaix/mm/valloc.h>
  #include <lunaix/mm/vmm.h>
+#include <lunaix/mm/procvm.h>
  #include <lunaix/process.h>
  #include <lunaix/sched.h>
  #include <lunaix/signal.h>
@@ -17,113 +17,147 @@
  #include <lunaix/status.h>
  #include <lunaix/syscall.h>
  #include <lunaix/syslog.h>
+#include <lunaix/hart_state.h>
+#include <lunaix/kpreempt.h>
  
-volatile struct proc_info* __current;
+#include <lunaix/generic/isrm.h>
+
+#include <klibc/string.h>
  
-static struct proc_info dummy_proc;
+struct thread empty_thread_obj;
  
-struct proc_info dummy;
+volatile struct proc_info* __current;
+volatile struct thread* current_thread = &empty_thread_obj;
  
  struct scheduler sched_ctx;
  
-struct cake_pile* proc_pile;
+struct cake_pile *proc_pile ,*thread_pile;
  
-LOG_MODULE("SCHED")
+#define root_process   (sched_ctx.procs[1])
  
-void
-sched_init_dummy();
+LOG_MODULE("SCHED")
  
  void
  sched_init()
  {
      proc_pile = cake_new_pile("proc", sizeof(struct proc_info), 1, 0);
+    thread_pile = cake_new_pile("thread", sizeof(struct thread), 1, 0);
      cake_set_constructor(proc_pile, cake_ctor_zeroing);
+    cake_set_constructor(thread_pile, cake_ctor_zeroing);
  
-    sched_ctx = (struct scheduler){ ._procs = vzalloc(PROC_TABLE_SIZE),
-                                    .ptable_len = 0,
-                                    .procs_index = 0 };
-
-    // TODO initialize dummy_proc
-    sched_init_dummy();
+    sched_ctx = (struct scheduler){
+        .procs = vzalloc(PROC_TABLE_SIZE), .ptable_len = 0, .procs_index = 0};
+    
+    llist_init_head(&sched_ctx.sleepers);
  }
  
  void
-sched_init_dummy()
+run(struct thread* thread)
  {
-    // This surely need to be simplified or encapsulated!
-    // It is a living nightmare!
-
-    extern void my_dummy();
-    static char dummy_stack[1024] __attribute__((aligned(16)));
-
-    // memset to 0
-    dummy_proc = (struct proc_info){};
-    dummy_proc.intr_ctx =
-      (isr_param){ .registers = { .ds = KDATA_SEG,
-                                  .es = KDATA_SEG,
-                                  .fs = KDATA_SEG,
-                                  .gs = KDATA_SEG,
-                                  .esp = (void*)dummy_stack + 1004 },
-                   .cs = KCODE_SEG,
-                   .eip = (void*)my_dummy,
-                   .ss = KDATA_SEG,
-                   .eflags = cpu_reflags() | 0x0200 };
-
-    *(u32_t*)(&dummy_stack[1020]) = dummy_proc.intr_ctx.eflags;
-    *(u32_t*)(&dummy_stack[1016]) = KCODE_SEG;
-    *(u32_t*)(&dummy_stack[1012]) = dummy_proc.intr_ctx.eip;
-
-    dummy_proc.page_table = cpu_rcr3();
-    dummy_proc.state = PS_READY;
-    dummy_proc.parent = &dummy_proc;
-
-    __current = &dummy_proc;
+    thread->state = PS_RUNNING;
+    thread->process->state = PS_RUNNING;
+    thread->process->th_active = thread;
+
+    procvm_mount_self(vmspace(thread->process));
+    set_current_executing(thread);
+
+    switch_context();
+
+    fail("unexpected return from switching");
  }
  
+/*
+    Currently, we do not allow a thread to destroy itself: doing
+    so would eliminate the current kernel stack, which is a disaster.
+    A compromise solution is to perform a regular scan and
+    clean-up of these threads in the preemptible kernel thread.
+*/
+
  void
-run(struct proc_info* proc)
+cleanup_detached_threads() 
  {
-    proc->state = PS_RUNNING;
-
-    /*
-        将tss.esp0设置为上次调度前的esp值。
-        当处理信号时，上下文信息是不会恢复的，而是保存在用户栈中，然后直接跳转进位于用户空间的sig_wrapper进行
-          信号的处理。当用户自定义的信号处理函数返回时，sigreturn的系统调用才开始进行上下文的恢复（或者说是进行
-          另一次调度。
-        由于这中间没有进行地址空间的交换，所以第二次跳转使用的是同一个内核栈，而之前默认tss.esp0的值是永远指向最顶部
-        这样一来就有可能会覆盖更早的上下文信息（比如嵌套的信号捕获函数）
-    */
-    tss_update_esp(proc->intr_ctx.registers.esp);
-
-    apic_done_servicing();
-
-    asm volatile("pushl %0\n"
-                 "jmp switch_to\n" ::"r"(proc)
-                 : "memory"); // kernel/asm/x86/interrupt.S
+    // XXX maybe a lock on sched_context would be the most appropriate?
+    cpu_disable_interrupt();
+
+    int i = 0;
+    struct thread *pos, *n;
+    llist_for_each(pos, n, sched_ctx.threads, sched_sibs) {
+        if (likely(!proc_terminated(pos) || !thread_detached(pos))) {
+            continue;
+        }
+
+        struct proc_mm* mm = vmspace(pos->process);
+
+        procvm_mount(mm);
+        destory_thread(pos);
+        procvm_unmount(mm);
+        
+        i++;
+    }
+
+    if (i) {
+        INFO("cleaned %d terminated detached thread(s)", i);
+    }
+
+    cpu_enable_interrupt();
  }
  
-int
-can_schedule(struct proc_info* proc)
+bool
+can_schedule(struct thread* thread)
  {
-    if (__SIGTEST(proc->sig_pending, _SIGCONT)) {
-        __SIGCLEAR(proc->sig_pending, _SIGSTOP);
-    } else if (__SIGTEST(proc->sig_pending, _SIGSTOP)) {
-        // 如果进程受到SIGSTOP，则该进程不给予调度。
+    if (!thread) {
          return 0;
      }
  
-    return 1;
+    if (proc_terminated(thread)) {
+        return false;
+    }
+
+    if (preempt_check_stalled(thread)) {
+        thread_flags_set(thread, TH_STALLED);
+        return true;
+    }
+
+    if (unlikely(kernel_process(thread->process))) {
+        // a kernel process is always runnable
+        return thread->state == PS_READY;
+    }
+
+    struct sigctx* sh = &thread->sigctx;
+
+    if ((thread->state & PS_PAUSED)) {
+        return !!(sh->sig_pending & ~1);
+    }
+
+    if ((thread->state & PS_BLOCKED)) {
+        return sigset_test(sh->sig_pending, _SIGINT);
+    }
+
+    if (sigset_test(sh->sig_pending, _SIGSTOP)) {
+        // If one thread has SIGSTOP pending, then we know
+        // all other threads are stopped too (as per POSIX.1-2008),
+        // in which case the entire process is stopped.
+        thread->state = PS_STOPPED;
+        return false;
+    }
+    
+    if (sigset_test(sh->sig_pending, _SIGCONT)) {
+        thread->state = PS_READY;
+    }
+
+    return (thread->state == PS_READY) \
+            && proc_runnable(thread->process);
  }
  
  void
  check_sleepers()
  {
-    struct proc_info* leader = sched_ctx._procs[0];
-    struct proc_info *pos, *n;
-    time_t now = clock_systime();
-    llist_for_each(pos, n, &leader->sleep.sleepers, sleep.sleepers)
+    struct thread *pos, *n;
+    time_t now = clock_systime() / 1000;
+
+    llist_for_each(pos, n, &sched_ctx.sleepers, sleep.sleepers)
      {
-        if (PROC_TERMINATED(pos->state)) {
+        if (proc_terminated(pos)) {
              goto del;
          }
  
@@ -137,7 +171,7 @@ check_sleepers()
  
          if (atime && now >= atime) {
              pos->sleep.alarm_time = 0;
-            __SIGSET(pos->sig_pending, _SIGALRM);
+            thread_setsignal(pos, _SIGALRM);
          }
  
          if (!wtime && !atime) {
@@ -150,51 +184,47 @@ check_sleepers()
  void
  schedule()
  {
-    if (!sched_ctx.ptable_len) {
-        return;
-    }
+    assert(sched_ctx.ptable_len && sched_ctx.ttable_len);
  
      // 上下文切换相当的敏感！我们不希望任何的中断打乱栈的顺序……
-    cpu_disable_interrupt();
-    struct proc_info* next;
-    int prev_ptr = sched_ctx.procs_index;
-    int ptr = prev_ptr;
+    no_preemption();
  
-    if (!(__current->state & ~PS_RUNNING)) {
+    if (!(current_thread->state & ~PS_RUNNING)) {
+        current_thread->state = PS_READY;
          __current->state = PS_READY;
+
      }
  
+    procvm_unmount_self(vmspace(__current));
      check_sleepers();
  
      // round-robin scheduler
-redo:
+    
+    struct thread* current = current_thread;
+    struct thread* to_check = current;
+    
      do {
-        ptr = (ptr + 1) % sched_ctx.ptable_len;
-        next = sched_ctx._procs[ptr];
-    } while (!next || (next->state != PS_READY && ptr != prev_ptr));
+        to_check = list_next(to_check, struct thread, sched_sibs);
  
-    sched_ctx.procs_index = ptr;
+        if (can_schedule(to_check)) {
+            break;
+        }
  
-    if (next->state != PS_READY) {
-        // schedule the dummy process if we're out of choice
-        next = &dummy_proc;
-        goto done;
-    }
+        if (to_check == current) {
+            // FIXME do something less lethal here
+            fail("Ran out of threads!")
+            goto done;  
+        }
  
-    if (!can_schedule(next)) {
-        // 如果该进程不给予调度，则尝试重新选择
-        goto redo;
-    }
+    } while (1);
+
+    sched_ctx.procs_index = to_check->process->pid;
  
  done:
-    run(next);
-}
+    isrm_notify_eos(0);
+    run(to_check);
  
-void
-sched_yieldk()
-{
-    cpu_enable_interrupt();
-    cpu_int(LUNAIX_SCHED);
+    fail("unexpected return from scheduler");
  }
  
  __DEFINE_LXSYSCALL1(unsigned int, sleep, unsigned int, seconds)
@@ -203,38 +233,45 @@ __DEFINE_LXSYSCALL1(unsigned int, sleep, unsigned int, seconds)
          return 0;
      }
  
-    if (__current->sleep.wakeup_time) {
-        return (__current->sleep.wakeup_time - clock_systime()) / 1000U;
+    time_t systime = clock_systime() / 1000;
+    struct haybed* bed = &current_thread->sleep;
+
+    if (bed->wakeup_time) {
+        return (bed->wakeup_time - systime);
      }
  
-    struct proc_info* root_proc = sched_ctx._procs[0];
-    __current->sleep.wakeup_time = clock_systime() + seconds * 1000;
-    llist_append(&root_proc->sleep.sleepers, &__current->sleep.sleepers);
+    bed->wakeup_time = systime + seconds;
+
+    if (llist_empty(&bed->sleepers)) {
+        llist_append(&sched_ctx.sleepers, &bed->sleepers);
+    }
  
-    __current->intr_ctx.registers.eax = seconds;
+    store_retval(seconds);
  
-    block_current();
+    block_current_thread();
      schedule();
+
+    return 0;
  }
  
  __DEFINE_LXSYSCALL1(unsigned int, alarm, unsigned int, seconds)
  {
-    time_t prev_ddl = __current->sleep.alarm_time;
-    time_t now = clock_systime();
+    struct haybed* bed = &current_thread->sleep;
+    time_t prev_ddl = bed->alarm_time;
+    time_t now = clock_systime() / 1000;
  
-    __current->sleep.alarm_time = seconds ? now + seconds * 1000 : 0;
+    bed->alarm_time = seconds ? now + seconds : 0;
  
-    struct proc_info* root_proc = sched_ctx._procs[0];
-    if (llist_empty(&__current->sleep.sleepers)) {
-        llist_append(&root_proc->sleep.sleepers, &__current->sleep.sleepers);
+    if (llist_empty(&bed->sleepers)) {
+        llist_append(&sched_ctx.sleepers, &bed->sleepers);
      }
  
-    return prev_ddl ? (prev_ddl - now) / 1000 : 0;
+    return prev_ddl ? (prev_ddl - now) : 0;
  }
  
  __DEFINE_LXSYSCALL1(void, exit, int, status)
  {
-    terminate_proc(status);
+    terminate_current(status);
      schedule();
  }
  
@@ -258,7 +295,7 @@ __DEFINE_LXSYSCALL3(pid_t, waitpid, pid_t, pid, int*, status, int, options)
  
  __DEFINE_LXSYSCALL(int, geterrno)
  {
-    return __current->k_status;
+    return current_thread->syscall_ret;
  }
  
  pid_t
@@ -272,6 +309,7 @@ _wait(pid_t wpid, int* status, int options)
      }
  
      wpid = wpid ? wpid : -__current->pgid;
+
  repeat:
      llist_for_each(proc, n, &__current->children, siblings)
      {
@@ -290,94 +328,187 @@ repeat:
          return 0;
      }
      // 放弃当前的运行机会
-    sched_yieldk();
+    yield_current();
      goto repeat;
  
  done:
-    status_flags |= PEXITSIG * (proc->sig_inprogress != 0);
      if (status) {
-        *status = proc->exit_code | status_flags;
+        *status = PEXITNUM(status_flags, proc->exit_code);
      }
      return destroy_process(proc->pid);
  }
  
-struct proc_info*
-alloc_process()
-{
+static inline pid_t
+get_free_pid() {
      pid_t i = 0;
-    for (; i < sched_ctx.ptable_len && sched_ctx._procs[i]; i++)
+    
+    for (; i < sched_ctx.ptable_len && sched_ctx.procs[i]; i++)
          ;
-
-    if (i == MAX_PROCESS) {
+    
+    if (unlikely(i == MAX_PROCESS)) {
          panick("Panic in Ponyville shimmer!");
      }
  
+    return i;
+}
+
+struct thread*
+alloc_thread(struct proc_info* process) {
+    if (process->thread_count >= MAX_THREAD_PP) {
+        return NULL;
+    }
+    
+    struct thread* th = cake_grab(thread_pile);
+
+    th->process = process;
+    th->created = clock_systime();
+
+    // FIXME we need a better tid allocation method!
+    th->tid = th->created;
+    th->tid = (th->created ^ ((ptr_t)th)) % MAX_THREAD_PP;
+
+    th->state = PS_CREATED;
+    
+    llist_init_head(&th->sleep.sleepers);
+    llist_init_head(&th->sched_sibs);
+    llist_init_head(&th->proc_sibs);
+    waitq_init(&th->waitqueue);
+
+    return th;
+}
+
+struct proc_info*
+alloc_process()
+{
+    pid_t i = get_free_pid();
+
      if (i == sched_ctx.ptable_len) {
          sched_ctx.ptable_len++;
      }
  
      struct proc_info* proc = cake_grab(proc_pile);
+    if (!proc) {
+        return NULL;
+    }
  
      proc->state = PS_CREATED;
      proc->pid = i;
      proc->created = clock_systime();
      proc->pgid = proc->pid;
+
+    proc->sigreg = vzalloc(sizeof(struct sigregistry));
      proc->fdtable = vzalloc(sizeof(struct v_fdtable));
-    proc->fxstate =
-      vzalloc_dma(512); // FXSAVE需要十六位对齐地址，使用DMA块（128位对齐）
  
-    llist_init_head(&proc->mm.regions.head);
+    proc->mm = procvm_create(proc);
+    
      llist_init_head(&proc->tasks);
      llist_init_head(&proc->children);
      llist_init_head(&proc->grp_member);
-    llist_init_head(&proc->sleep.sleepers);
-    waitq_init(&proc->waitqueue);
+    llist_init_head(&proc->threads);
+
+    iopoll_init(&proc->pollctx);
  
-    sched_ctx._procs[i] = proc;
+    sched_ctx.procs[i] = proc;
  
      return proc;
  }
  
  void
-commit_process(struct proc_info* process)
-{
-    assert(process == sched_ctx._procs[process->pid]);
+commit_thread(struct thread* thread) {
+    struct proc_info* process = thread->process;
  
-    if (process->state != PS_CREATED) {
-        __current->k_status = EINVAL;
-        return;
+    assert(process && !proc_terminated(process));
+
+    llist_append(&process->threads, &thread->proc_sibs);
+    
+    if (sched_ctx.threads) {
+        llist_append(sched_ctx.threads, &thread->sched_sibs);
+    } else {
+        sched_ctx.threads = &thread->sched_sibs;
      }
  
+    sched_ctx.ttable_len++;
+    process->thread_count++;
+    thread->state = PS_READY;
+}
+
+void
+commit_process(struct proc_info* process)
+{
+    assert(process == sched_ctx.procs[process->pid]);
+    assert(process->state == PS_CREATED);
+
      // every process is the child of first process (pid=1)
      if (!process->parent) {
-        process->parent = sched_ctx._procs[1];
+        if (likely(!kernel_process(process))) {
+            process->parent = root_process;
+        } else {
+            process->parent = process;
+        }
+    } else {
+        assert(!proc_terminated(process->parent));
+    }
+
+    if (sched_ctx.proc_list) {
+        llist_append(sched_ctx.proc_list, &process->tasks);
+    } else {
+        sched_ctx.proc_list = &process->tasks;
      }
  
      llist_append(&process->parent->children, &process->siblings);
-    llist_append(&sched_ctx._procs[0]->tasks, &process->tasks);
  
      process->state = PS_READY;
  }
  
-// from <kernel/process.c>
-extern void
-__del_pagetable(pid_t pid, uintptr_t mount_point);
+void
+destory_thread(struct thread* thread) 
+{
+    cake_ensure_valid(thread);
+    
+    struct proc_info* proc = thread->process;
+
+    llist_delete(&thread->sched_sibs);
+    llist_delete(&thread->proc_sibs);
+    llist_delete(&thread->sleep.sleepers);
+    waitq_cancel_wait(&thread->waitqueue);
  
-pid_t
-destroy_process(pid_t pid)
+    thread_release_mem(thread);
+
+    proc->thread_count--;
+    sched_ctx.ttable_len--;
+
+    cake_release(thread_pile, thread);
+}
+
+static void
+orphan_children(struct proc_info* proc)
  {
-    int index = pid;
-    if (index <= 0 || index > sched_ctx.ptable_len) {
-        __current->k_status = EINVAL;
-        return;
+    struct proc_info *root;
+    struct proc_info *pos, *n;
+
+    root = root_process;
+
+    llist_for_each(pos, n, &proc->children, siblings) {
+        pos->parent = root;
+        llist_append(&root->children, &pos->siblings);
      }
-    struct proc_info* proc = sched_ctx._procs[index];
-    sched_ctx._procs[index] = 0;
+}
+
+void 
+delete_process(struct proc_info* proc)
+{
+    pid_t pid = proc->pid;
+    struct proc_mm* mm = vmspace(proc);
+
+    assert(pid);    // long live the pid0 !!
+
+    sched_ctx.procs[pid] = NULL;
  
      llist_delete(&proc->siblings);
      llist_delete(&proc->grp_member);
      llist_delete(&proc->tasks);
-    llist_delete(&proc->sleep.sleepers);
+
+    iopoll_free(proc);
  
      taskfs_invalidate(pid);
  
@@ -385,6 +516,10 @@ destroy_process(pid_t pid)
          vfs_unref_dnode(proc->cwd);
      }
  
+    if (proc->cmd) {
+        vfree(proc->cmd);
+    }
+
      for (size_t i = 0; i < VFS_MAX_FD; i++) {
          struct v_fd* fd = proc->fdtable->fds[i];
          if (fd) {
@@ -394,32 +529,85 @@ destroy_process(pid_t pid)
      }
  
      vfree(proc->fdtable);
-    vfree_dma(proc->fxstate);
  
-    struct mm_region *pos, *n;
-    llist_for_each(pos, n, &proc->mm.regions.head, head)
-    {
-        vfree(pos);
-    }
+    signal_free_registry(proc->sigreg);
  
-    vmm_mount_pd(PD_MOUNT_1, proc->page_table);
+    procvm_mount(mm);
+    
+    struct thread *pos, *n;
+    llist_for_each(pos, n, &proc->threads, proc_sibs) {
+        // terminate and destroy all threads unconditionally
+        destory_thread(pos);
+    }
  
-    __del_pagetable(pid, PD_MOUNT_1);
+    orphan_children(proc);
  
-    vmm_unmount_pd(PD_MOUNT_1);
+    procvm_unmount_release(mm);
  
      cake_release(proc_pile, proc);
+}
+
+pid_t
+destroy_process(pid_t pid)
+{    
+    int index = pid;
+    if (index <= 0 || index > sched_ctx.ptable_len) {
+        syscall_result(EINVAL);
+        return -1;
+    }
+
+    struct proc_info* proc = sched_ctx.procs[index];
+    delete_process(proc);
  
      return pid;
  }
  
+static void 
+terminate_proc_only(struct proc_info* proc, int exit_code) {
+    assert(proc->pid != 0);
+
+    proc->state = PS_TERMNAT;
+    proc->exit_code = exit_code;
+
+    proc_setsignal(proc->parent, _SIGCHLD);
+}
+
  void
-terminate_proc(int exit_code)
-{
-    __current->state = PS_TERMNAT;
-    __current->exit_code = exit_code;
+terminate_thread(struct thread* thread, ptr_t val) {
+    thread->exit_val = val;
+    thread->state = PS_TERMNAT;
+
+    struct proc_info* proc = thread->process;
+    if (proc->thread_count == 1) {
+        terminate_proc_only(thread->process, 0);
+    }
+}
+
+void
+terminate_current_thread(ptr_t val) {
+    terminate_thread(current_thread, val);
+}
+
+void 
+terminate_proccess(struct proc_info* proc, int exit_code) {
+    assert(!kernel_process(proc));
  
-    __SIGSET(__current->parent->sig_pending, _SIGCHLD);
+    if (proc->pid == 1) {
+        panick("Attempt to kill init");
+    }
+
+    terminate_proc_only(proc, exit_code);
+
+    struct thread *pos, *n;
+    llist_for_each(pos, n, &proc->threads, proc_sibs) {
+        pos->state = PS_TERMNAT;
+    }
+}
+
+void
+terminate_current(int exit_code)
+{
+    terminate_proccess(__current, exit_code);
  }
  
  struct proc_info*
@@ -429,7 +617,7 @@ get_process(pid_t pid)
      if (index < 0 || index > sched_ctx.ptable_len) {
          return NULL;
      }
-    return sched_ctx._procs[index];
+    return sched_ctx.procs[index];
  }
  
  int
@@ -439,10 +627,10 @@ orphaned_proc(pid_t pid)
          return 0;
      if (pid >= sched_ctx.ptable_len)
          return 0;
-    struct proc_info* proc = sched_ctx._procs[pid];
+    struct proc_info* proc = sched_ctx.procs[pid];
      struct proc_info* parent = proc->parent;
  
      // 如果其父进程的状态是terminated 或 destroy中的一种
      // 或者其父进程是在该进程之后创建的，那么该进程为孤儿进程
-    return PROC_TERMINATED(parent->state) || parent->created > proc->created;
+    return proc_terminated(parent) || parent->created > proc->created;
  }
 \ No newline at end of file