refactor: remove kernel memory overhead for saving x87 context by saving it into user stack
--- /dev/null
+*.S.inc
\ No newline at end of file
#define UDATA_SEG 0x23
#define TSS_SEG 0x28
+#define stack_alignment 0xfffffff0
+
#endif /* __LUNAIX_I386_ASM_H */
--- /dev/null
+
+#define regsize 4
+
+/* stack layout: saved interrupt context */
+ .struct 0
+idepth:
+ .struct idepth + regsize
+ieax:
+ .struct ieax + regsize
+iebx:
+ .struct iebx + regsize
+iecx:
+ .struct iecx + regsize
+iedx:
+ .struct iedx + regsize
+iedi:
+ .struct iedi + regsize
+iebp:
+ .struct iebp + regsize
+iesi:
+ .struct iesi + regsize
+ids:
+ .struct ids + regsize
+ies:
+ .struct ies + regsize
+ifs:
+ .struct ifs + regsize
+igs:
+ .struct igs + regsize
+iexecp: /* alias: same offset as iesp (no .struct advance in between); use this name when the slot is read as a pointer to the execution state context (ex* layout) */
+iesp:
+ .struct iesp + regsize
+isave_prev:
+ .struct isave_prev + regsize
+ivec:
+ .struct ivec + regsize
+iecode:
+ .struct iecode + regsize
+ieip:
+ .struct ieip + regsize
+ics:
+ .struct ics + regsize
+ieflags:
+ .struct ieflags + regsize
+iuesp:
+ .struct iuesp + regsize
+iuss:
+
+
+/* stack layout: execution (flow-control) state context */
+ .struct 0
+exsave_prev:
+ .struct exsave_prev + regsize
+exvec:
+ .struct exvec + regsize
+execode:
+ .struct execode + regsize
+exeip:
+ .struct exeip + regsize
+excs:
+ .struct excs + regsize
+exeflags:
+ .struct exeflags + regsize
+exuesp:
+ .struct exuesp + regsize
+exuss:
+
+/* struct layout: critical section of struct proc_info */
+ .struct 0
+proc_pid:
+ .struct proc_pid + regsize
+proc_parent:
+ .struct proc_parent + regsize
+proc_intr_ctx:
+ .struct proc_intr_ctx + regsize
+proc_ustack_top:
+ .struct proc_ustack_top + regsize
+proc_page_table:
+ .struct proc_page_table + regsize
+proc_fxstate:
+
+/* struct layout: proc_sig */
+ .struct 0
+psig_signum:
+ .struct psig_signum + regsize
+psig_sigact:
+ .struct psig_sigact + regsize
+psig_sighand:
+ .struct psig_sighand + regsize
+psig_saved_ictx:
\ No newline at end of file
#define KSTACK_SIZE MEM_1MB
#define KSTACK_START (USER_START - KSTACK_SIZE)
#define KSTACK_TOP ((USER_START - 1) & ~0xf)
+#define within_kstack(addr) (KSTACK_START <= (addr) && (addr) <= KSTACK_TOP)
#define KERNEL_MM_BASE 0xC0000000
#define PS_GrBP (PS_PAUSED | PS_BLOCKED)
#define PS_GrDT (PS_TERMNAT | PS_DESTROY)
-#define PROC_TERMINATED(state) ((state)&PS_GrDT)
-#define PROC_HANGED(state) ((state)&PS_BLOCKED)
-#define PROC_RUNNABLE(state) ((state)&PS_PAUSED)
+#define proc_terminated(proc) (((proc)->state) & PS_GrDT)
+#define proc_hanged(proc) (((proc)->state) & PS_BLOCKED)
+#define proc_runnable(proc) (((proc)->state) & PS_PAUSED)
#define PROC_FINPAUSE 1
-struct proc_sigstate
-{
- isr_param proc_regs;
- char fxstate[512] __attribute__((aligned(16)));
-};
-
struct sigact
{
struct sigact* prev;
int sig_num;
void* sigact;
void* sighand;
- struct proc_sigstate prev_context;
+ isr_param* saved_ictx;
} __attribute__((packed));
struct proc_info
isr_param* intr_ctx; // offset = 8
ptr_t ustack_top; // offset = 84 -> 56 -> 60 -> 12
ptr_t page_table; // offset = 88 -> 60 -> 64 -> 16
- void* fxstate; // offset = 92 -> 64 -> 68 -> 20
/* ---- critical section end ---- */
#include <arch/x86/i386_asm.h>
#include <arch/x86/tss.h>
#include <lunaix/syscall.h>
+#include <arch/x86/interrupt.S.inc>
+
#define __ASM_INTR_DIAGNOSIS
#ifdef __ASM_INTR_DIAGNOSIS
*/
-#define regsize 4
-
-/* stack layout: saved interrupt context */
- .struct 0
-idepth:
- .struct idepth + regsize
-ieax:
- .struct ieax + regsize
-iebx:
- .struct iebx + regsize
-iecx:
- .struct iecx + regsize
-iedx:
- .struct iedx + regsize
-iedi:
- .struct iedi + regsize
-iebp:
- .struct iebp + regsize
-iesi:
- .struct iesi + regsize
-ids:
- .struct ids + regsize
-ies:
- .struct ies + regsize
-ifs:
- .struct ifs + regsize
-igs:
- .struct igs + regsize
-iesp:
- .struct iesp + regsize
-isave_prev:
- .struct isave_prev + regsize
-ivec:
- .struct ivec + regsize
-iecode:
- .struct iecode + regsize
-ieip:
- .struct ieip + regsize
-ics:
- .struct ics + regsize
-ieflags:
- .struct ieflags + regsize
-iuesp:
- .struct iuesp + regsize
-iuss:
-
-
-/* stack layout: execution (flow-control) state context */
- .struct 0
-exsave_prev:
- .struct exsave_prev + regsize
-exvec:
- .struct exvec + regsize
-execode:
- .struct execode + regsize
-exeip:
- .struct exeip + regsize
-excs:
- .struct excs + regsize
-exeflags:
- .struct exeflags + regsize
-exuesp:
- .struct exuesp + regsize
-exuss:
-
-/* struct layout: critical section of struct proc_info */
- .struct 0
-proc_pid:
- .struct proc_pid + regsize
-proc_parent:
- .struct proc_parent + regsize
-proc_intr_ctx:
- .struct proc_intr_ctx + regsize
-proc_ustack_top:
- .struct proc_ustack_top + regsize
-proc_page_table:
- .struct proc_page_table + regsize
-proc_fxstate:
-
.section .text
.global interrupt_wrapper
interrupt_wrapper:
andl $0x3, %eax /* 判断 RPL */
jz 1f
- movw $KDATA_SEG, %ax /* 如果从用户模式转来,则切换至内核数据段 */
+ /* crossing the user/kernel boundary */
+ movw $KDATA_SEG, %ax
movw %ax, %gs
movw %ax, %fs
movw %ax, %ds
movl __current, %eax
- # FIXME: Save x87 context to user stack, rather than kernel's memory.
- # 保存x87FPU的状态
- movl proc_fxstate(%eax), %ebx
- fxsave (%ebx)
-
# 保存用户栈顶指针。因为我们允许同级中断的产生,所以需要该手段跟踪用户栈的地址。
movl iuesp(%esp), %ebx # 取出esp
movl %ebx, proc_ustack_top(%eax) # 存入__current->ustack_top
+ # Save x87 context to user stack, rather than kernel's memory.
+ # XXX: what will happen if we triggered a page fault during fxsave?
+ movl iuesp(%esp), %eax
+ andl $stack_alignment, %eax
+ subl $512, %eax
+ fxsave (%eax)
+
+ /* kernel space same-level switch */
1:
movl %esp, %eax
- andl $0xfffffff0, %esp
+ andl $stack_alignment, %esp
subl $16, %esp
movl %eax, (%esp)
movl exeip(%eax), %eax
movl %eax, (debug_resv + 4) # eip
#endif
- movl __current, %eax
- movl proc_fxstate(%eax), %eax
+ // movl __current, %eax
+ // movl proc_fxstate(%eax), %eax
- test %eax, %eax # do we have stored x87 context?
+ // test %eax, %eax # do we have stored x87 context?
+
+ movl ics(%esp), %eax
+ andl $3, %eax
jz 1f
- fxrstor (%eax)
+
+ movl iuesp(%esp), %eax
+ andl $stack_alignment, %eax
+ subl $512, %eax
+ fxrstor (%eax)
1:
popl %eax # discard isr_param::depth
# 我们已经处在了新的地址空间,为了避免影响其先前的栈布局
# 需要使用一个临时的栈空间
movl $tmp_stack, %esp
-
- # 更新 tss
- movl proc_intr_ctx(%ebx), %eax # proc->intr_ctx
- movl iesp(%eax), %eax # intr_ctx->esp
- movl %eax, (tss_esp0_off + _tss)
call signal_dispatch # kernel/signal.c
test %eax, %eax # do we have signal to handle?
jz 1f
+
+ # 更新 tss
+ movl proc_intr_ctx(%ebx), %ecx # __current->intr_ctx
+ movl %ecx, (tss_esp0_off + _tss)
+
jmp handle_signal
1:
movl proc_intr_ctx(%ebx), %eax
# 注意1:任何对proc_sig的布局改动,都须及时的保证这里的一致性!
# 注意2:handle_signal在调用之前,须确保proc_sig已经写入用户栈!
# arg1 in %eax: addr of proc_sig structure in user stack
- leal 12(%eax), %ebx # %ebx = &proc_sig->prev_context
+ movl psig_saved_ictx(%eax), %ebx # %ebx = proc_sig->saved_ictx (the pointer value itself, not its address)
- pushl $UDATA_SEG # proc_sig->prev_context.proc_regs.ss
+ pushl $UDATA_SEG
pushl %eax # esp
- movl 48(%ebx), %ebx
- pushl 68(%ebx) # proc_sig->prev_context.proc_regs.execp->eflags
+ movl iexecp(%ebx), %ebx
+ pushl exeflags(%ebx) # proc_sig->saved_ictx->execp->eflags
pushl $UCODE_SEG # cs
- pushl 4(%eax) # %eip = proc_sig->sigact
+ pushl psig_sigact(%eax) # %eip = proc_sig->sigact
movw $UDATA_SEG, %cx # switch data seg to user mode
movw %cx, %es
proc0->intr_ctx = isrp;
- // 加载x87默认配置
- asm volatile("fninit\n"
- "fxsave (%%eax)" ::"a"(proc0->fxstate)
- : "memory");
-
// 向调度器注册进程。
commit_process(proc0);
pcb->intr_ctx = __current->intr_ctx;
pcb->parent = __current;
- memcpy(pcb->fxstate, __current->fxstate, 512);
-
if (__current->cwd) {
pcb->cwd = __current->cwd;
vfs_ref_dnode(pcb->cwd);
time_t now = clock_systime();
llist_for_each(pos, n, &leader->sleep.sleepers, sleep.sleepers)
{
- if (PROC_TERMINATED(pos->state)) {
+ if (proc_terminated(pos)) {
goto del;
}
proc->created = clock_systime();
proc->pgid = proc->pid;
proc->fdtable = vzalloc(sizeof(struct v_fdtable));
- proc->fxstate =
- vzalloc_dma(512); // FXSAVE需要十六位对齐地址,使用DMA块(128位对齐)
llist_init_head(&proc->mm.regions);
llist_init_head(&proc->tasks);
}
vfree(proc->fdtable);
- vfree_dma(proc->fxstate);
vmm_mount_pd(VMS_MOUNT_1, proc->page_table);
// 如果其父进程的状态是terminated 或 destroy中的一种
// 或者其父进程是在该进程之后创建的,那么该进程为孤儿进程
- return PROC_TERMINATED(parent->state) || parent->created > proc->created;
+ return proc_terminated(parent) || parent->created > proc->created;
}
\ No newline at end of file
#include <lunaix/spike.h>
#include <lunaix/status.h>
#include <lunaix/syscall.h>
+#include <lunaix/syslog.h>
+
+LOG_MODULE("SIG")
#include <klibc/string.h>
#define UNMASKABLE (sigset(SIGKILL) | sigset(SIGTERM))
#define TERMSIG (sigset(SIGSEGV) | sigset(SIGINT) | UNMASKABLE)
+#define CORE (sigset(SIGSEGV))
+
+static inline void // helper: terminate via terminate_proc with a signal-tagged exit code
+signal_terminate(int errcode)
+{
+ terminate_proc(errcode | PEXITSIG); // OR PEXITSIG into the code; presumably marks "exited due to signal" — confirm against PEXITSIG's definition
+}
// Referenced in kernel/asm/x86/interrupt.S
void*
if (!action->sa_actor) {
if (sigset_test(TERMSIG, sig_selected)) {
- terminate_proc(sig_selected | PEXITSIG);
+ signal_terminate(sig_selected);
schedule();
// never return
}
struct proc_sig* sigframe =
(struct proc_sig*)((ustack - sizeof(struct proc_sig)) & ~0xf);
- /*
- 这是一个相当恶心的坑。
- 问题是出在原本的sigframe->prev_context = __current->intr_ctx的上面
- 这个语句会被gcc在编译时,用更加高效的 rep movsl 来代替。
-
- 由于我们采用按需分页,所以在很多情况下,用户栈实际被分配的空间不允许我们进行完整的
- 注入,而需要走page fault handler进行动态分页。
-
- 竞态条件就出现在这里!
-
- 假若我们的__current->intr_ctx注入了一半,然后产生page-fault中断,
- 那么这就会导致我们的__current->intr_ctx被这个page-fault中断导致的
- 上下文信息覆盖。那么当page-fault handler成功分配了一个页,返回,
- 拷贝也就得以进行。遗憾的是,只不过这次拷贝的内容和前面的拷贝是没有任何的关系
- (因为此时的intr_ctx已经不是之前的intr_ctx了!)
- 而这就会导致我们保存在信号上下文中的进程上下文信息不完整,从而在soft_iret时
- 触发#GP。
-
- 解决办法就是先吧intr_ctx拷贝到一个静态分配的区域里,然后再注入到用户栈。
- */
- static volatile struct proc_sigstate __temp_save;
- __temp_save.proc_regs = *__current->intr_ctx;
- memcpy(__temp_save.fxstate, __current->fxstate, 512);
-
sigframe->sig_num = sig_selected;
-
sigframe->sigact = action->sa_actor;
sigframe->sighand = action->sa_handler;
- sigframe->prev_context = __temp_save;
+
+ sigframe->saved_ictx = __current->intr_ctx;
action->prev = prev_working;
psig->inprogress = action;
}
send_single:
- if (PROC_TERMINATED(proc->state)) {
+ if (proc_terminated(proc)) {
__current->k_status = EINVAL;
return -1;
}
__DEFINE_LXSYSCALL1(int, sigreturn, struct proc_sig, *sig_ctx)
{
- memcpy(__current->fxstate, sig_ctx->prev_context.fxstate, 512);
- // FIXME: Interrupt context is exposed to user space!
- *__current->intr_ctx = sig_ctx->prev_context.proc_regs;
+ __current->intr_ctx = sig_ctx->saved_ictx;
struct sigact* current = __current->sigctx.inprogress;
if (current) {
__current->sigctx.inprogress = NULL;
}
+ if (proc_terminated(__current)) {
+ __current->exit_code |= PEXITSIG;
+ } else if (sigset_test(CORE, sig_ctx->sig_num)) {
+ signal_terminate(sig_ctx->sig_num);
+ }
+
+ ptr_t ictx = (ptr_t)__current->intr_ctx;
+
+ /*
+ Ensure the restored context lies within the kernel stack.
+
+ This prevents a user from forging their own context in order to
+ execute arbitrary code at supervisor level.
+ */
+ if (!within_kstack(ictx)) {
+ signal_terminate(SIGSEGV);
+ }
+
schedule();
// never reach!
waitpid(pid, &err, 0);
- if (err) {
+ if (WEXITSTATUS(err)) {
printf("shell exit abnormally (%d)", err);
}
.global _start
_start:
xorl %eax, %eax
+ fninit
call main
1:
printf("I, pid %d, have received an alarm!\n", pid);
}
-void
+int
main()
{
signal(SIGCHLD, sigchild_handler);
}
printf("done\n");
+
+ return 0;
}
\ No newline at end of file