docs/tutorial/8-multitasking.md

   1 ## 准备工作
   2
   3 ```sh
   4 git checkout 088403ac98acf7991507715d29a282dcba222053
   5 ```
   6
   7 观看对应视频
   8
   9 ### 实现多进程的要素
  10
  11 连续性：记录切换进程的CS:IP
  12
  13 正确性：记录进程的EFLAGS、各种寄存器、栈、堆、程序本身
  14
  15 公平性：时间分配合理
  16
  17 在正确性中：栈、堆、程序本身和页表关联，进程的EFLAGS、各种寄存器和中断上下文关联
  18
  19 ### 中断上下文
  20
  21 根据Intel IA32手册[1]，有两种情况
  22
  23 第一种是特权级不变，也就是内核代码切换到内核，或者Ring3用户代码切换到Ring3
  24
  25 栈上由高到低存放EFLAGS、CS、EIP、ERROR CODE，并且ESP指向ERROR CODE
  26
  27 lunaix使用中断来进行进程切换，那么看看怎么处理栈
  28
  29 中断程序如下，注：中断程序执行第一句时，栈空间已经是上述状态
  30
  31 可以看到，这个代码不管有没有权限改变都把除了ss的段寄存器push入栈（cs已经入栈了）
  32
  33 ```assembly
  34         pushl %esp
  35
  36         subl $16, %esp
  37         movw %gs, 12(%esp)
  38         movw %fs,  8(%esp)
  39         movw %es,  4(%esp)
  40         movw %ds,   (%esp)
  41
  42         pushl %esi
  43         pushl %ebp
  44         pushl %edi
  45         pushl %edx
  46         pushl %ecx
  47         pushl %ebx
  48         pushl %eax
  49 ```
  50
  51 下面这段代码则是通过60(%esp)取出cs，假设为0x10
  52
  53 ```assembly
  54         movl 60(%esp), %eax
  55         andl $0x3, %eax
  56         jz somewheref
  57 ```
  58
  59 `eax`与0x3可以获得第2号段描述符，实际上设置了第二号为ring0代码段，所以这样可以判断是不是发生了权限切换
  60
  61 如果从ring3到ring0，则把ring0相关段地址填充一下
  62
  63 ```assembly
  64         movw $segment, %ax
  65         movw %ax, %gs
  66         movw %ax, %fs
  67         movw %ax, %ds
  68         movw %ax, %es
  69 ```
  70
  71 ### 中断函数派发
  72
  73 子函数其实是一个dispatcher，派发到不同的函数
  74
  75 例如我们派发到系统调用函数
  76
  77 这个函数又会根据栈上的eax的值，即陷入前根据eax的值来判断调用哪个系统调用
  78
  79 这样我们可以用变化的eax和一个中断向量号来执行多个系统调用（当然可以通过其他寄存器传参数，反正都保存在栈上了）
  80
  81 ### fork函数实现
  82
  83 这是一个非常经典的函数
  84
  85 还是介绍一下它：fork用于创建一个进程，所创建的进程复制父进程的代码段/数据段/BSS段/堆/栈等所有用户空间信息；在内核中操作系统重新为其申请了一个PCB，并使用父进程的PCB进行初始化。
  86
  87 先看看第一部分代码
  88
  89 ```c
  90 struct proc_info {
  91     pid_t pid;
  92     struct proc_info* parent;
  93     isr_param intr_ctx;//存储中断上下文
  94     struct llist_header siblings;//与遍历兄弟姊妹有关
  95     struct llist_header children;//与遍历孩子进程有关
  96     struct proc_mm mm;//描述该进程的占用的虚拟内存范围和属性
  97     void* page_table;//指向页目录
  98     time_t created;//创建时的时间戳
  99     uint8_t state;
 100     int32_t exit_code;
 101     int32_t k_status;
 102     struct lx_timer* timer;//与时间控制有关
 103 };
 104 ```
 105
 106 下面是fork实现的第一段，用于初始化pcb
 107
 108 ```c
 109     struct proc_info pcb;
 110     init_proc(&pcb);
 111     pcb.mm = __current->mm;
 112     pcb.intr_ctx = __current->intr_ctx;
 113     pcb.parent = __current;
 114 ```
 115
 116 根据上面的描述，初始化就是复制父进程的代码段/数据段/BSS段/堆/栈、标记进程为创建状态等
 117
 118 ```c
 119 setup_proc_mem(&pcb, PD_REFERENCED);
 120 ```
 121
 122 PD_REFERENCED是页表虚拟地址
 123
 124 这个函数作用是复制页表相关操作，稍后分析
 125
 126 TODO:setup_proc_mem
 127
 128 ```c
 129     llist_init_head(&pcb.mm.regions);
 130     struct mm_region *pos, *n;
 131     llist_for_each(pos, n, &__current->mm.regions->head, head)
 132     {
 133 ```
 134
 135 llish_for_each是一个遍历regions的宏，接下来就遍历__current的各种区域（regions也可视为一个链表）
 136
 137 ```c
 138         region_add(&pcb, pos->start, pos->end, pos->attr);
 139 ```
 140
 141 这个就是复制了，不多说
 142
 143 ```c
 144 if ((pos->attr & REGION_WSHARED)) {
 145             continue;
 146         }
 147 ```
 148
 149 如果这个区域共享写，就没什么要处理的了
 150
 151 ```c
 152         uintptr_t start_vpn = PG_ALIGN(pos->start) >> 12;
 153         uintptr_t end_vpn = PG_ALIGN(pos->end) >> 12;
 154         for (size_t i = start_vpn; i < end_vpn; i++) {
 155 ```
 156
 157 后面又是一个循环，用于遍历该区域，如果不是共享写的话
 158
 159 ```c
 160             x86_pte_t* curproc = &PTE_MOUNTED(PD_MOUNT_1, i);
 161             x86_pte_t* newproc = &PTE_MOUNTED(PD_MOUNT_2, i);
 162             cpu_invplg(newproc);
 163
 164             if (pos->attr == REGION_RSHARED) {
 165                 cpu_invplg(curproc);
 166                 *curproc = *curproc & ~PG_WRITE;
 167                 *newproc = *newproc & ~PG_WRITE;
 168             } else {
 169                 *newproc = 0;
 170             }
 171 ```
 172
 173 PTE_MOUNTED 这个宏意思就是获得页目录的第几号页表项
 174
 175 PD_MOUNT_1这个页表就是当前进程的地址
 176
 177 ```c
 178             if (pos->attr == REGION_RSHARED) {
 179                 cpu_invplg(curproc);
 180                 *curproc = *curproc & ~PG_WRITE;[2]
 181                 *newproc = *newproc & ~PG_WRITE;
 182 ```
 183
 184 如果是共享读，就根据Intel manual 把页表设置成Read
 185
 186 ```c
 187             } else {
 188                 *newproc = 0;
 189             }
 190 ```
 191
 192 否则视为私有，子进程不继承该页表，直接清空
 193
 194 上面这个阶段就是复制页表
 195
 196 最后看看TODO:setup_proc_mem(&pcb, PD_REFERENCED);
 197
 198 ```c
 199 void
 200 setup_proc_mem(struct proc_info* proc, uintptr_t usedMnt)
 201 {
 202     pid_t pid = proc->pid;
 203     void* pt_copy = __dup_pagetable(pid, usedMnt);
 204     vmm_mount_pd(PD_MOUNT_2, pt_copy);
 205     // copy the kernel stack
 206     for (size_t i = KSTACK_START >> 12; i <= KSTACK_TOP >> 12; i++) {
 207         volatile x86_pte_t* ppte = &PTE_MOUNTED(PD_MOUNT_2, i);
 208         cpu_invplg(ppte);
 209         x86_pte_t p = *ppte;
 210         void* ppa = vmm_dup_page(pid, PG_ENTRY_ADDR(p));
 211         *ppte = (p & 0xfff) | (uintptr_t)ppa;
 212     }
 213     proc->page_table = pt_copy;
 214 }
 215 ```
 216
 217 __dup_pagetable(pid, usedMnt);这个函数是复制全部kernel页表
 218
 219 先不看，加为TODO:__dup_pagetable(pid, usedMnt);
 220
 221 vmm_mount_pd(PD_MOUNT_2, pt_copy);复制到二号页目录挂载点
 222
 223 TODO:`__dup_pagetable(pid, usedMnt);` `vmm_mount_pd(PD_MOUNT_2, pt_copy);`
 224
 225 `vmm_dup_page(pid, PG_ENTRY_ADDR(p));`这个也加为TODO，最后的循环暂且知道是处理了每个页目录项即可。
 226
 227 最后执行not cpoy段
 228
 229 ```c
 230 not_copy:
 231     vmm_unmount_pd(PD_MOUNT_2);
 232     pcb.intr_ctx.registers.eax = 0;
 233     push_process(&pcb);//把子进程放入执行队列，轮询制
 234     return pcb.pid;
 235 ```
 236
 237 ### __dup_pagetable
 238
 239 解决第一个TODO
 240
 241 这个函数作用是复制全部kernel页表、页目录
 242
 243
 244
 245 ```c
 246     void* ptd_pp = pmm_alloc_page(pid, PP_FGPERSIST);
 247 ```
 248
 249 先调用pmm_alloc_page给当前pid分配一个物理页，篇幅有限，就不说了，后面的一些函数又是如此
 250
 251
 252
 253 ```c
 254     x86_page_table* ptd = vmm_fmap_page(pid, PG_MOUNT_1, ptd_pp, PG_PREM_RW);
 255     x86_page_table* pptd = (x86_page_table*)(mount_point | (0x3FF << 12));
 256 ```
 257
 258 把分配的物理页映射到这个挂载点，其实就是给挂载点分配物理空间，拿到页目录指针ptd
 259
 260 pptd指向kernel页目录第一项（这个kernel页表使用了循环引用，页目录最后一项指向页目录基址）
 261
 262
 263
 264 ```c
 265     for (size_t i = 0; i < PG_MAX_ENTRIES - 1; i++) {
 266 ```
 267
 268 遍历所有page entry，除了最后一个
 269
 270 ```c
 271         x86_pte_t ptde = pptd->entry[i];
 272         if (!ptde || !(ptde & PG_PRESENT)) {
 273             ptd->entry[i] = ptde;
 274             continue;
 275         }
 276 ```
 277
 278 取出kernel page directory entry，简单判断并复制页目录的每一项
 279
 280 ```c
 281         x86_page_table* ppt = (x86_page_table*)(mount_point | (i << 12));
 282         void* pt_pp = pmm_alloc_page(pid, PP_FGPERSIST);
 283         x86_page_table* pt = vmm_fmap_page(pid, PG_MOUNT_2, pt_pp, PG_PREM_RW);
 284 ```
 285
 286 ppt指向 kernel page table entry，接着给二号页表挂载点分配物理页并映射
 287
 288 ```c
 289         for (size_t j = 0; j < PG_MAX_ENTRIES; j++) {
 290             x86_pte_t pte = ppt->entry[j];
 291             pmm_ref_page(pid, PG_ENTRY_ADDR(pte));
 292             pt->entry[j] = pte;
 293         }
 294         ptd->entry[i] = (uintptr_t)pt_pp | PG_PREM_RW;
 295 ```
 296
 297 上面是这个循环中最后一段，复制页表，最后设置权限
 298
 299 pmm_ref_page(pid, PG_ENTRY_ADDR(pte));这个是增加页面引用计数，和内存共享有关
 300
 301
 302
 303 循环结束后
 304
 305 ```c
 306     ptd->entry[PG_MAX_ENTRIES - 1] = NEW_L1_ENTRY(T_SELF_REF_PERM, ptd_pp);
 307     return ptd_pp;
 308 ```
 309
 310 同样做一个循环映射
 311
 312 返回挂载点1的虚拟地址
 313
 314 ### vmm_mount_pd
 315
 316 回顾：它的第二个参数是页目录
 317
 318 ```c
 319 void* pt_copy = __dup_pagetable(pid, usedMnt);
 320     vmm_mount_pd(PD_MOUNT_2, pt_copy);
 321 ```
 322
 323 下面是它的代码
 324
 325 ```c
 326 void*
 327 vmm_mount_pd(uintptr_t mnt, void* pde) {
 328     x86_page_table* l1pt = (x86_page_table*)L1_BASE_VADDR;
 329     l1pt->entry[(mnt >> 22)] = NEW_L1_ENTRY(T_SELF_REF_PERM, pde);
 330     cpu_invplg(mnt);
 331     return mnt;
 332 }
 333 ```
 334
 335 很简单，就是把kernel的页目录指向新页目录，于是我们可以通过虚拟地址访问这个页目录了
 336
 337 ### vmm_dup_page
 338
 339
 340
 341 先回顾一下：它是在复制kernel栈的情况下使用的
 342
 343 ```c
 344     // copy the kernel stack
 345     for (size_t i = KSTACK_START >> 12; i <= KSTACK_TOP >> 12; i++) {
 346         volatile x86_pte_t* ppte = &PTE_MOUNTED(PD_MOUNT_2, i);
 347         cpu_invplg(ppte);
 348         x86_pte_t p = *ppte;
 349         void* ppa = vmm_dup_page(pid, PG_ENTRY_ADDR(p));
 350         *ppte = (p & 0xfff) | (uintptr_t)ppa;
 351     }
 352 ```
 353
 354
 355
 356 ```c
 357 void* vmm_dup_page(pid_t pid, void* pa) {
 358     void* new_ppg = pmm_alloc_page(pid, 0);
 359     vmm_fmap_page(pid, PG_MOUNT_3, new_ppg, PG_PREM_RW);
 360     vmm_fmap_page(pid, PG_MOUNT_4, pa, PG_PREM_RW);
 361
 362     asm volatile (
 363         "movl %1, %%edi\n"
 364         "movl %2, %%esi\n"
 365         "rep movsl\n"
 366         :: "c"(1024), "r"(PG_MOUNT_3), "r"(PG_MOUNT_4)
 367         : "memory", "%edi", "%esi");
 368
 369     vmm_unset_mapping(PG_MOUNT_3);
 370     vmm_unset_mapping(PG_MOUNT_4);
 371
 372     return new_ppg;
 373 }
 374 ```
 375
 376 参数pa是页表项的值取高20位
 377
 378 最后实现 把pa指向页面值复制到新页面
 379
 380 其实实现的就是页面复制到新地址，并返回
 381
 382
 383
 384 ### exit系统调用实现
 385
 386 先来看看exit的说明
 387 void exit(int status) 立即终止调用进程。任何属于该进程的打开的文件描述符都会被关闭，该进程的子进程由进程 1 继承，初始化，且会向父进程发送一个 SIGCHLD 信号。
 388
 389 我们的exit还是有一些不一样的
 390
 391 exit定义如下，依旧是通过中断陷入TRAP
 392 ```c
 393 static void _exit(int status) {
 394     // 使用汇编语句设置寄存器ebx的值为status
 395     asm("" :: "b"(status));
 396     int v;
 397     // 使用汇编语句触发一个中断，中断号为33（通常用于系统调用），功能号为8
 398     asm volatile("int %1\n" : "=a"(v) : "i"(33), "a"(8));
 399     // 返回中断处理后的结果
 400     return (void)v;
 401 }
 402 ```
 403 为什么要设置ebx的值为status呢，因为上一次我们的中断框架规定了ebx传递第一个参数
 404
 405 ```c
 406 terminate_proc(status);
 407 void
 408 terminate_proc(int exit_code)
 409 {
 410     __current->state = PROC_TERMNAT;
 411     __current->exit_code = exit_code;
 412     schedule();
 413 }
 414 ```
 415 这个代码很简单，设置返回值为status和状态为终止，最后调用schedule
 416
 417 ```c
 418     if (!sched_ctx.ptable_len) {
 419         return;
 420     }
 421 ```
 422 在schedule函数中，首先检查进程表长度，如果为空就不需要schedule了
 423
 424 ```c
 425     cpu_disable_interrupt();
 426     struct proc_info* next;
 427     int prev_ptr = sched_ctx.procs_index;
 428     int ptr = prev_ptr;
 429
 430     do {
 431         ptr = (ptr + 1) % sched_ctx.ptable_len;
 432         next = &sched_ctx._procs[ptr];
 433     } while (next->state != PROC_STOPPED && ptr != prev_ptr);
 434
 435     sched_ctx.procs_index = ptr;
 436
 437     run(next);
 438 ```
 439 如果不为空，则（以轮询算法的方式）获得下一个进程指针存入next，而且该进程不能为停止状态
 440 更新index，最后run
 441
 442 ```c
 443     if (!(__current->state & ~PROC_RUNNING)) {
 444         __current->state = PROC_STOPPED;
 445     }
 446     proc->state = PROC_RUNNING;
 447 ```
 448 run函数也是先检查状态
 449
 450 ```c
 451     if (__current->page_table != proc->page_table) {
 452         __current = proc;
 453         cpu_lcr3(__current->page_table);
 454         // from now on, the we are in the kstack of another process
 455     } else {
 456         __current = proc;
 457     }
 458 ```
 459 更新__current的信息，然后更新CR3寄存器
 460
 461 ```c
 462 static inline void
 463 cpu_lcr3(reg32 v)
 464 {
 465     asm("mov %0, %%cr3" ::"r"(v));
 466 }
 467 ```
 468 更新CR3寄存器很简单
 469
 470 ```c
 471     apic_done_servicing();
 472     asm volatile("pushl %0\n"
 473                  "jmp soft_iret\n" ::"r"(&__current->intr_ctx)
 474                  : "memory");
 475 ```
 476 将当前中断上下文push，然后中断返回
 477 我们在中断context中已经保存了需要切换的进程的信息
 478
 479 ```c
 480     .global soft_iret
 481     soft_iret:
 482         popl %esp
 483 ```
 484 先pop，也就是让esp等于(&__current->intr_ctx)
 485
 486 esp此时指向__current->intr_ctx这个结构体
 487 回顾一下这个结构体
 488 ```c
 489 typedef struct
 490 {
 491     struct
 492     {
 493         reg32 eax;
 494         reg32 ebx;
 495         reg32 ecx;
 496         reg32 edx;
 497         reg32 edi;
 498         reg32 ebp;
 499         reg32 esi;
 500         reg32 ds;
 501         reg32 es;
 502         reg32 fs;
 503         reg32 gs;
 504         reg32 esp;
 505     } registers;
 506     unsigned int vector;
 507     unsigned int err_code;
 508     unsigned int eip;
 509     unsigned int cs;
 510     unsigned int eflags;
 511     unsigned int esp;
 512     unsigned int ss;
 513 } __attribute__((packed)) isr_param;
 514 ```
 515 就是一堆寄存器，继续看soft_iret代码
 516
 517 ```c
 518         popl %eax
 519         popl %ebx
 520         popl %ecx
 521         popl %edx
 522         popl %edi
 523         popl %ebp
 524         popl %esi
 525 ```
 526 这个就是把保存的寄存器都恢复出来
 527
 528 ```c
 529         movw   (%esp), %ds
 530         movw  4(%esp), %es
 531         movw  8(%esp), %fs
 532         movw 12(%esp), %gs
 533 ```
 534 恢复段寄存器
 535
 536 ```c
 537         movl 16(%esp), %esp
 538         addl $8, %esp
 539         iret
 540 ```
 541 这个过程是和中断wrapper调用时完全相反的，不多说了
 542
 543 总的来说，exit会返回exit code 并且中断跳到sched_ctx._procs数组中的一个进程
 544
 545 那么，接下来看看这个数组有哪些相关操作
 546
 547 ```c
 548 void push_process(struct proc_info* process);
 549 ```
 550 挑push_process来说，它的作用是把一个进程放入轮询调度器
 551
 552 之前实现了timer就能很快实现调度器了，略过
 553
 554 push_process内容是对proc_info内容做一些检查最后写入数组，就没了
 555
 556 最后看看push_process的使用
 557
 558 然后是创建0号进程的时候调用了push_process
 559
 560 那么看看为什么0号进程的创建和创建意义
 561
 562 ### 0号进程创建及其意义
 563 ```c
 564     struct proc_info proc0;
 565     init_proc(&proc0);
 566     proc0.intr_ctx = (isr_param){ .registers = { .ds = KDATA_SEG,
 567                                                  .es = KDATA_SEG,
 568                                                  .fs = KDATA_SEG,
 569                                                  .gs = KDATA_SEG },
 570                                   .cs = KCODE_SEG,
 571                                   .eip = (void*)__proc0,
 572                                   .ss = KDATA_SEG,
 573                                   .eflags = cpu_reflags() };
 574 ```
 575 上面这些是PCB的初始化，略过
 576
 577 ```c
 578 setup_proc_mem(&proc0, PD_REFERENCED);
 579 ```
 580 复制内核的页目录到0号进程信息里面
 581
 582 ```c
 583     asm volatile("movl %%cr3, %%eax\n"
 584                  "movl %%esp, %%ebx\n"
 585                  "movl %1, %%cr3\n"
 586                  "movl %2, %%esp\n"
 587                  "pushf\n"
 588                  "pushl %3\n"
 589                  "pushl %4\n"
 590                  "pushl $0\n"
 591                  "pushl $0\n"
 592                  "movl %%esp, %0\n"
 593                  "movl %%eax, %%cr3\n"
 594                  "movl %%ebx, %%esp\n"
 595                  : "=m"(proc0.intr_ctx.registers.esp)
 596                  : "r"(proc0.page_table),
 597                    "i"(KSTACK_TOP),
 598                    "i"(KCODE_SEG),
 599                    "r"(proc0.intr_ctx.eip)
 600                  : "%eax", "%ebx", "memory");
 601 ```
 602 布置好中断栈，依次push eflags、代码段地址（cs）、eip、0（error code）、0（vector）
 603
 604 ```c
 605     // 向调度器注册进程。
 606     push_process(&proc0);
 607
 608     // 由于时钟中断未就绪，我们需要手动通知调度器进行第一次调度。这里也会同时隐式地恢复我们的eflags.IF位
 609     schedule();
 610 ```
 611 最后进行注册和调度
 612
 613
 614 ### 进程调用过程细节
 615 进程调用是由调度器管理
 616 假设我们调用fork执行了一个进程，之后调度器触发了进程切换
 617
 618 ```c
 619
 620 static void
 621 timer_update(const isr_param* param)
 622 {
 623     /*...*/
 624     sched_ticks_counter++;
 625
 626     if (sched_ticks_counter >= sched_ticks) {
 627         sched_ticks_counter = 0;
 628         schedule();
 629     }
 630 }
 631 ```
 632 timer会每隔固定时间利用外部中断执行timer_update
 633 timer_update函数会调用schedule
 634
 635 schedule也分析过了，它会取出其中一个PCB信息，并调用run函数
 636 run函数上面也分析了
 637 **就此进程管理大部分实现**
 638
 639 ### sleep实现
 640
 641 ```c
 642     if (!seconds) {
 643         return 0;
 644     }
 645
 646     if (__current->timer) {
 647         return __current->timer->counter / timer_context()->running_frequency;
 648     }
 649
 650     struct lx_timer* timer =
 651       timer_run_second(seconds, proc_timer_callback, __current, 0);
 652     __current->timer = timer;
 653     __current->intr_ctx.registers.eax = seconds;
 654     __current->state = PROC_BLOCKED;
 655     schedule();
 656 ```
 657 主要逻辑就是每秒运行proc_timer_callback
 658 同时设置阻塞状态等，最后调用schedule函数
 659
 660 ```c
 661 static void
 662 proc_timer_callback(struct proc_info* proc)
 663 {
 664     proc->timer = NULL;
 665     proc->state = PROC_STOPPED;
 666 }
 667 ```
 668 proc_timer_callback就是清空timer，设置为STOP
 669
 670 ```c
 671     if (__current->timer) {
 672         return __current->timer->counter / timer_context()->running_frequency;
 673     }
 674 ```
 675 sleep函数还有这一段没解释，当调用了proc_timer_callback，timer为空就不会返回
 676
 677 ## 参考
 678
 679 [1]Intel Manual, Vol 1, 6-15, Figure 6-7. Stack Usage on Transfers to Interrupt and Exception Handling Routines
 680
 681 [2]Intel Manual, Vol 3A, 4-13, Table 4-6. Format of a 32-Bit Page-Table Entry that Maps a 4-KByte Page