lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
 * @brief Lunaix virtual file system - an abstraction layer for all file systems.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
  14 /*
  15  TODO vfs & device todos checklist
  16
  17     It is overseen by Twilight Sparkle ;)
  18
  19  1. Get inodes hooked into lru (CHECKED)
  20  2. Get dnodes hooked into lru (CHECKED)
 3. Get inodes properly hashed so they can be reused by the underlying fs (CHECKED)
  22  4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
  23         [good idea] or a constructor/destructor pattern in cake allocator ?
 5. (mount) Figure out a way to identify a busy mount point before unmount
            maybe a unified mount_point structure that maintains a reference
            counter on any dnodes within the subtree? Such a counter will only
            increment if a file is opened or a dnode is being used as a working
            directory, and decrement conversely. (CHECKED)
  29  6. (mount) Ability to track all mount points (including sub-mounts)
  30             so we can be confident to clean up everything when we
  31             unmount. (CHECKED)
  32  7. (mount) Figure out a way to acquire the device represented by a dnode.
  33             so it can be used to mount. (e.g. we wish to get `struct device*`
  34             out of the dnode at /dev/sda)
  35             [tip] we should pay attention at twifs and add a private_data field
  36             under struct v_dnode? (CHECKED)
  37  8. (mount) Then, we should refactor on mount/unmount mechanism. (CHECKED)
 9. (mount) (future) Ability to mount anything? e.g. Linux can mount a disk
                    image file using a so-called "loopback" pseudo device. Maybe
                    we can do a similar thing in Lunaix? A block device emulation
                    on top of the regular file when we mount it.
  42  10. (device) device number (dev_t) allocation
  43             [good idea] <class>:<subclass>:<uniq_id> composition
  44 */
  45
  46 #include <klibc/string.h>
  47 #include <lunaix/dirent.h>
  48 #include <lunaix/foptions.h>
  49 #include <lunaix/fs.h>
  50 #include <lunaix/mm/cake.h>
  51 #include <lunaix/mm/page.h>
  52 #include <lunaix/mm/valloc.h>
  53 #include <lunaix/process.h>
  54 #include <lunaix/spike.h>
  55 #include <lunaix/syscall.h>
  56
  57 #include <lunaix/fs/twifs.h>
  58
// Dedicated cake piles, one per VFS object type. All are created in vfs_init.
static struct cake_pile* dnode_pile;
static struct cake_pile* inode_pile;
static struct cake_pile* file_pile;
static struct cake_pile* superblock_pile;
static struct cake_pile* fd_pile;

// Root dnode of the whole VFS tree; its parent points to itself.
struct v_dnode* vfs_sysroot;
// Global dnode hash table with VFS_HASHTABLE_SIZE buckets.
static struct hbucket* dnode_cache;

// LRU zones tracking dnodes and inodes so they can be evicted under pressure.
struct lru_zone *dnode_lru, *inode_lru;

// Pre-built names for the special path components "..", "." and "".
struct hstr vfs_ddot = HSTR("..", 2);
struct hstr vfs_dot = HSTR(".", 1);
struct hstr vfs_empty = HSTR("", 0);

// Forward declarations for functions defined later in this file.
struct v_superblock*
vfs_sb_alloc();

void
vfs_sb_free(struct v_superblock* sb);

// LRU eviction callbacks, registered with lru_new_zone in vfs_init.
static int
__vfs_try_evict_dnode(struct lru_node* obj);

static int
__vfs_try_evict_inode(struct lru_node* obj);
  85
  86 void
  87 vfs_init()
  88 {
  89     // 为他们专门创建一个蛋糕堆，而不使用valloc，这样我们可以最小化内碎片的产生
  90     dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
  91     inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
  92     file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
  93     fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
  94     superblock_pile =
  95       cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
  96
  97     dnode_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
  98
  99     dnode_lru = lru_new_zone(__vfs_try_evict_dnode);
 100     inode_lru = lru_new_zone(__vfs_try_evict_inode);
 101
 102     hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
 103     hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
 104
 105     // 创建一个根dnode。
 106     vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
 107     vfs_sysroot->parent = vfs_sysroot;
 108     atomic_fetch_add(&vfs_sysroot->ref_count, 1);
 109 }
 110
 111 inline struct hbucket*
 112 __dcache_hash(struct v_dnode* parent, uint32_t* hash)
 113 {
 114     uint32_t _hash = *hash;
 115     // 确保低位更加随机
 116     _hash = _hash ^ (_hash >> VFS_HASHBITS);
 117     // 与parent的指针值做加法，来减小碰撞的可能性。
 118     _hash += (uint32_t)parent;
 119     *hash = _hash;
 120     return &dnode_cache[_hash & VFS_HASH_MASK];
 121 }
 122
 123 struct v_dnode*
 124 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 125 {
 126     if (!str->len || HSTR_EQ(str, &vfs_dot))
 127         return parent;
 128
 129     if (HSTR_EQ(str, &vfs_ddot)) {
 130         return parent->parent;
 131     }
 132
 133     uint32_t hash = str->hash;
 134     struct hbucket* slot = __dcache_hash(parent, &hash);
 135
 136     struct v_dnode *pos, *n;
 137     hashtable_bucket_foreach(slot, pos, n, hash_list)
 138     {
 139         if (pos->name.hash == hash) {
 140             return pos;
 141         }
 142     }
 143     return NULL;
 144 }
 145
 146 void
 147 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 148 {
 149     assert(parent);
 150
 151     atomic_fetch_add(&dnode->ref_count, 1);
 152     dnode->parent = parent;
 153     llist_append(&parent->children, &dnode->siblings);
 154
 155     struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash);
 156     hlist_add(&bucket->head, &dnode->hash_list);
 157 }
 158
 159 void
 160 vfs_dcache_remove(struct v_dnode* dnode)
 161 {
 162     assert(dnode);
 163     assert(dnode->ref_count == 1);
 164
 165     llist_delete(&dnode->siblings);
 166     llist_delete(&dnode->aka_list);
 167     hlist_delete(&dnode->hash_list);
 168
 169     dnode->parent = NULL;
 170     atomic_fetch_sub(&dnode->ref_count, 1);
 171 }
 172
 173 void
 174 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
 175 {
 176     assert(new_parent);
 177
 178     hstr_rehash(&dnode->name, HSTR_FULL_HASH);
 179     vfs_dcache_remove(dnode);
 180     vfs_dcache_add(new_parent, dnode);
 181 }
 182
 183 int
 184 vfs_open(struct v_dnode* dnode, struct v_file** file)
 185 {
 186     if (!dnode->inode || !dnode->inode->ops->open) {
 187         return ENOTSUP;
 188     }
 189
 190     struct v_inode* inode = dnode->inode;
 191
 192     lock_inode(inode);
 193
 194     struct v_file* vfile = cake_grab(file_pile);
 195     memset(vfile, 0, sizeof(*vfile));
 196
 197     vfile->dnode = dnode;
 198     vfile->inode = inode;
 199     vfile->ref_count = ATOMIC_VAR_INIT(1);
 200     vfile->ops = inode->default_fops;
 201
 202     if ((inode->itype & VFS_IFFILE) && !inode->pg_cache) {
 203         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 204         pcache_init(pcache);
 205         pcache->master = inode;
 206         inode->pg_cache = pcache;
 207     }
 208
 209     int errno = inode->ops->open(inode, vfile);
 210     if (errno) {
 211         cake_release(file_pile, vfile);
 212     } else {
 213         atomic_fetch_add(&dnode->ref_count, 1);
 214         inode->open_count++;
 215         mnt_mkbusy(dnode->mnt);
 216
 217         *file = vfile;
 218     }
 219
 220     unlock_inode(inode);
 221
 222     return errno;
 223 }
 224
 225 void
 226 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
 227 {
 228     if (assign_to->inode) {
 229         llist_delete(&assign_to->aka_list);
 230         assign_to->inode->link_count--;
 231     }
 232     llist_append(&inode->aka_dnodes, &assign_to->aka_list);
 233     assign_to->inode = inode;
 234     inode->link_count++;
 235 }
 236
 237 int
 238 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 239 {
 240     int errno;
 241
 242     if ((errno = vfs_check_writable(to_link))) {
 243         return errno;
 244     }
 245
 246     lock_inode(to_link->inode);
 247     if (to_link->super_block->root != name->super_block->root) {
 248         errno = EXDEV;
 249     } else if (!to_link->inode->ops->link) {
 250         errno = ENOTSUP;
 251     } else if (!(errno = to_link->inode->ops->link(to_link->inode, name))) {
 252         vfs_assign_inode(name, to_link->inode);
 253     }
 254     unlock_inode(to_link->inode);
 255
 256     return errno;
 257 }
 258
 259 int
 260 vfs_pclose(struct v_file* file, pid_t pid)
 261 {
 262     int errno = 0;
 263     if (file->ref_count > 1) {
 264         atomic_fetch_sub(&file->ref_count, 1);
 265     } else if (!(errno = file->ops->close(file))) {
 266         atomic_fetch_sub(&file->dnode->ref_count, 1);
 267         file->inode->open_count--;
 268
 269         // Prevent dead lock.
 270         // This happened when process is terminated while blocking on read.
 271         // In that case, the process is still holding the inode lock and it will
 272         // never get released.
 273         /*
 274          * The unlocking should also include ownership check.
 275          *
 276          * To see why, consider two process both open the same file both with
 277          * fd=x.
 278          *      Process A: busy on reading x
 279          *      Process B: do nothing with x
 280          * Assuming that, after a very short time, process B get terminated
 281          * while process A is still busy in it's reading business. By this
 282          * design, the inode lock of this file x is get released by B rather
 283          * than A. And this will cause a probable race condition on A if other
 284          * process is writing to this file later after B exit.
 285          */
 286         if (mutex_on_hold(&file->inode->lock)) {
 287             mutex_unlock_for(&file->inode->lock, pid);
 288         }
 289         mnt_chillax(file->dnode->mnt);
 290
 291         pcache_commit_all(file->inode);
 292         cake_release(file_pile, file);
 293     }
 294     return errno;
 295 }
 296
 297 int
 298 vfs_close(struct v_file* file)
 299 {
 300     return vfs_pclose(file, __current->pid);
 301 }
 302
 303 int
 304 vfs_fsync(struct v_file* file)
 305 {
 306     int errno;
 307     if ((errno = vfs_check_writable(file->dnode))) {
 308         return errno;
 309     }
 310
 311     lock_inode(file->inode);
 312
 313     pcache_commit_all(file->inode);
 314
 315     errno = ENOTSUP;
 316     if (file->ops->sync) {
 317         errno = file->ops->sync(file);
 318     }
 319
 320     unlock_inode(file->inode);
 321
 322     return errno;
 323 }
 324
 325 int
 326 vfs_alloc_fdslot(int* fd)
 327 {
 328     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 329         if (!__current->fdtable->fds[i]) {
 330             *fd = i;
 331             return 0;
 332         }
 333     }
 334     return EMFILE;
 335 }
 336
 337 struct v_superblock*
 338 vfs_sb_alloc()
 339 {
 340     struct v_superblock* sb = cake_grab(superblock_pile);
 341     memset(sb, 0, sizeof(*sb));
 342     llist_init_head(&sb->sb_list);
 343     sb->i_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 344     return sb;
 345 }
 346
 347 void
 348 vfs_sb_free(struct v_superblock* sb)
 349 {
 350     vfree(sb->i_cache);
 351     cake_release(superblock_pile, sb);
 352 }
 353
 354 static int
 355 __vfs_try_evict_dnode(struct lru_node* obj)
 356 {
 357     struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
 358
 359     if (!dnode->ref_count) {
 360         vfs_d_free(dnode);
 361         return 1;
 362     }
 363     return 0;
 364 }
 365
 366 static int
 367 __vfs_try_evict_inode(struct lru_node* obj)
 368 {
 369     struct v_inode* inode = container_of(obj, struct v_inode, lru);
 370
 371     if (!inode->link_count && !inode->open_count) {
 372         vfs_i_free(inode);
 373         return 1;
 374     }
 375     return 0;
 376 }
 377
 378 struct v_dnode*
 379 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
 380 {
 381     struct v_dnode* dnode = cake_grab(dnode_pile);
 382     if (!dnode) {
 383         lru_evict_half(dnode_lru);
 384
 385         if (!(dnode = cake_grab(dnode_pile))) {
 386             return NULL;
 387         }
 388     }
 389
 390     memset(dnode, 0, sizeof(*dnode));
 391     llist_init_head(&dnode->children);
 392     llist_init_head(&dnode->siblings);
 393     llist_init_head(&dnode->aka_list);
 394     mutex_init(&dnode->lock);
 395
 396     dnode->ref_count = ATOMIC_VAR_INIT(0);
 397     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 398
 399     hstrcpy(&dnode->name, name);
 400
 401     if (parent) {
 402         dnode->super_block = parent->super_block;
 403         dnode->mnt = parent->mnt;
 404     }
 405
 406     lru_use_one(dnode_lru, &dnode->lru);
 407
 408     return dnode;
 409 }
 410
 411 void
 412 vfs_d_free(struct v_dnode* dnode)
 413 {
 414     assert(dnode->ref_count == 1);
 415
 416     if (dnode->inode) {
 417         assert(dnode->inode->link_count > 0);
 418         dnode->inode->link_count--;
 419     }
 420
 421     vfs_dcache_remove(dnode);
 422     // Make sure the children de-referencing their parent.
 423     // With lru presented, the eviction will be propagated over the entire
 424     // detached subtree eventually
 425     struct v_dnode *pos, *n;
 426     llist_for_each(pos, n, &dnode->children, siblings)
 427     {
 428         vfs_dcache_remove(pos);
 429     }
 430
 431     vfree(dnode->name.value);
 432     cake_release(dnode_pile, dnode);
 433 }
 434
 435 struct v_inode*
 436 vfs_i_find(struct v_superblock* sb, uint32_t i_id)
 437 {
 438     struct hbucket* slot = &sb->i_cache[i_id & VFS_HASH_MASK];
 439     struct v_inode *pos, *n;
 440     hashtable_bucket_foreach(slot, pos, n, hash_list)
 441     {
 442         if (pos->id == i_id) {
 443             lru_use_one(inode_lru, &pos->lru);
 444             return pos;
 445         }
 446     }
 447
 448     return NULL;
 449 }
 450
 451 void
 452 vfs_i_addhash(struct v_inode* inode)
 453 {
 454     struct hbucket* slot = &inode->sb->i_cache[inode->id & VFS_HASH_MASK];
 455
 456     hlist_delete(&inode->hash_list);
 457     hlist_add(&slot->head, &inode->hash_list);
 458 }
 459
 460 struct v_inode*
 461 vfs_i_alloc(struct v_superblock* sb)
 462 {
 463     assert(sb->ops.init_inode);
 464
 465     struct v_inode* inode;
 466     if (!(inode = cake_grab(inode_pile))) {
 467         lru_evict_half(inode_lru);
 468         if (!(inode = cake_grab(inode_pile))) {
 469             return NULL;
 470         }
 471     }
 472
 473     memset(inode, 0, sizeof(*inode));
 474     mutex_init(&inode->lock);
 475     llist_init_head(&inode->xattrs);
 476     llist_init_head(&inode->aka_dnodes);
 477
 478     sb->ops.init_inode(sb, inode);
 479
 480     inode->sb = sb;
 481     inode->ctime = clock_unixtime();
 482     inode->atime = inode->ctime;
 483     inode->mtime = inode->ctime;
 484
 485 done:
 486     lru_use_one(inode_lru, &inode->lru);
 487     return inode;
 488 }
 489
 490 void
 491 vfs_i_free(struct v_inode* inode)
 492 {
 493     if (inode->pg_cache) {
 494         pcache_release(inode->pg_cache);
 495         vfree(inode->pg_cache);
 496     }
 497     inode->ops->sync(inode);
 498     hlist_delete(&inode->hash_list);
 499     cake_release(inode_pile, inode);
 500 }
 501
 502 /* ---- System call definition and support ---- */
 503
 504 #define FLOCATE_CREATE_EMPTY 1
 505
 506 int
 507 vfs_getfd(int fd, struct v_fd** fd_s)
 508 {
 509     if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) {
 510         return 0;
 511     }
 512     return EBADF;
 513 }
 514
 515 int
 516 __vfs_try_locate_file(const char* path,
 517                       struct v_dnode** fdir,
 518                       struct v_dnode** file,
 519                       int options)
 520 {
 521     char name_str[VFS_NAME_MAXLEN];
 522     struct hstr name = HSTR(name_str, 0);
 523     int errno;
 524
 525     name_str[0] = 0;
 526     if ((errno = vfs_walk_proc(path, fdir, &name, VFS_WALK_PARENT))) {
 527         return errno;
 528     }
 529
 530     errno = vfs_walk(*fdir, name.value, file, NULL, 0);
 531     if (errno != ENOENT || !(options & FLOCATE_CREATE_EMPTY)) {
 532         return errno;
 533     }
 534
 535     struct v_dnode* parent = *fdir;
 536     struct v_dnode* file_new = vfs_d_alloc(parent, &name);
 537
 538     if (!file_new) {
 539         return ENOMEM;
 540     }
 541
 542     lock_dnode(parent);
 543
 544     if (!(errno = parent->inode->ops->create(parent->inode, file_new))) {
 545         vfs_dcache_add(parent, file_new);
 546         *file = file_new;
 547     } else {
 548         vfs_d_free(file_new);
 549     }
 550
 551     unlock_dnode(parent);
 552
 553     return errno;
 554 }
 555
 556 int
 557 vfs_do_open(const char* path, int options)
 558 {
 559     int errno, fd;
 560     struct v_dnode *dentry, *file;
 561     struct v_file* ofile = 0;
 562
 563     errno = __vfs_try_locate_file(
 564       path, &dentry, &file, (options & FO_CREATE) ? FLOCATE_CREATE_EMPTY : 0);
 565
 566     if (errno || (errno = vfs_open(file, &ofile))) {
 567         return errno;
 568     }
 569
 570     struct v_inode* o_inode = ofile->inode;
 571
 572     if (!errno && !(errno = vfs_alloc_fdslot(&fd))) {
 573         struct v_fd* fd_s = vzalloc(sizeof(*fd_s));
 574         ofile->f_pos = ofile->inode->fsize & -((options & FO_APPEND) != 0);
 575         fd_s->file = ofile;
 576         fd_s->flags = options;
 577         __current->fdtable->fds[fd] = fd_s;
 578         return fd;
 579     }
 580
 581     return errno;
 582 }
 583
 584 __DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
 585 {
 586     int errno = vfs_do_open(path, options);
 587     return DO_STATUS_OR_RETURN(errno);
 588 }
 589
 590 __DEFINE_LXSYSCALL1(int, close, int, fd)
 591 {
 592     struct v_fd* fd_s;
 593     int errno = 0;
 594     if ((errno = vfs_getfd(fd, &fd_s))) {
 595         goto done_err;
 596     }
 597
 598     if ((errno = vfs_close(fd_s->file))) {
 599         goto done_err;
 600     }
 601
 602     vfree(fd_s);
 603     __current->fdtable->fds[fd] = 0;
 604
 605 done_err:
 606     return DO_STATUS(errno);
 607 }
 608
 609 void
 610 __vfs_readdir_callback(struct dir_context* dctx,
 611                        const char* name,
 612                        const int len,
 613                        const int dtype)
 614 {
 615     struct dirent* dent = (struct dirent*)dctx->cb_data;
 616     strncpy(dent->d_name, name, DIRENT_NAME_MAX_LEN);
 617     dent->d_nlen = len;
 618     dent->d_type = dtype;
 619 }
 620
 621 __DEFINE_LXSYSCALL2(int, readdir, int, fd, struct dirent*, dent)
 622 {
 623     struct v_fd* fd_s;
 624     int errno;
 625
 626     if ((errno = vfs_getfd(fd, &fd_s))) {
 627         goto done;
 628     }
 629
 630     struct v_inode* inode = fd_s->file->inode;
 631
 632     lock_inode(inode);
 633
 634     if (!(inode->itype & VFS_IFDIR)) {
 635         errno = ENOTDIR;
 636     } else {
 637         struct dir_context dctx =
 638           (struct dir_context){ .cb_data = dent,
 639                                 .index = dent->d_offset,
 640                                 .read_complete_callback =
 641                                   __vfs_readdir_callback };
 642         errno = 1;
 643         if (dent->d_offset == 0) {
 644             __vfs_readdir_callback(&dctx, vfs_dot.value, vfs_dot.len, DT_DIR);
 645         } else if (dent->d_offset == 1) {
 646             __vfs_readdir_callback(&dctx, vfs_ddot.value, vfs_ddot.len, DT_DIR);
 647         } else {
 648             dctx.index -= 2;
 649             if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
 650                 unlock_inode(inode);
 651                 goto done;
 652             }
 653         }
 654         dent->d_offset++;
 655     }
 656
 657     unlock_inode(inode);
 658
 659 done:
 660     return DO_STATUS_OR_RETURN(errno);
 661 }
 662
 663 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 664 {
 665     int errno = 0;
 666     struct v_fd* fd_s;
 667     if ((errno = vfs_getfd(fd, &fd_s))) {
 668         goto done;
 669     }
 670
 671     struct v_file* file = fd_s->file;
 672     if ((file->inode->itype & VFS_IFDIR)) {
 673         errno = EISDIR;
 674         goto done;
 675     }
 676
 677     lock_inode(file->inode);
 678
 679     file->inode->atime = clock_unixtime();
 680
 681     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 682         errno = file->ops->read(file->inode, buf, count, file->f_pos);
 683     } else {
 684         errno = pcache_read(file->inode, buf, count, file->f_pos);
 685     }
 686
 687     if (errno > 0) {
 688         file->f_pos += errno;
 689         unlock_inode(file->inode);
 690         return errno;
 691     }
 692
 693     unlock_inode(file->inode);
 694
 695 done:
 696     return DO_STATUS(errno);
 697 }
 698
 699 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
 700 {
 701     int errno = 0;
 702     struct v_fd* fd_s;
 703     if ((errno = vfs_getfd(fd, &fd_s))) {
 704         goto done;
 705     }
 706
 707     struct v_file* file = fd_s->file;
 708
 709     if ((errno = vfs_check_writable(file->dnode))) {
 710         goto done;
 711     }
 712
 713     if ((file->inode->itype & VFS_IFDIR)) {
 714         errno = EISDIR;
 715         goto done;
 716     }
 717
 718     lock_inode(file->inode);
 719
 720     file->inode->mtime = clock_unixtime();
 721
 722     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 723         errno = file->ops->write(file->inode, buf, count, file->f_pos);
 724     } else {
 725         errno = pcache_write(file->inode, buf, count, file->f_pos);
 726     }
 727
 728     if (errno > 0) {
 729         file->f_pos += errno;
 730         unlock_inode(file->inode);
 731         return errno;
 732     }
 733
 734     unlock_inode(file->inode);
 735
 736 done:
 737     return DO_STATUS(errno);
 738 }
 739
 740 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
 741 {
 742     int errno = 0;
 743     struct v_fd* fd_s;
 744     if ((errno = vfs_getfd(fd, &fd_s))) {
 745         goto done;
 746     }
 747
 748     struct v_file* file = fd_s->file;
 749
 750     if (!file->ops->seek) {
 751         errno = ENOTSUP;
 752         goto done;
 753     }
 754
 755     lock_inode(file->inode);
 756
 757     int overflow = 0;
 758     int fpos = file->f_pos;
 759     switch (options) {
 760         case FSEEK_CUR:
 761             overflow = __builtin_sadd_overflow((int)file->f_pos, offset, &fpos);
 762             break;
 763         case FSEEK_END:
 764             overflow =
 765               __builtin_sadd_overflow((int)file->inode->fsize, offset, &fpos);
 766             break;
 767         case FSEEK_SET:
 768             fpos = offset;
 769             break;
 770     }
 771     if (overflow) {
 772         errno = EOVERFLOW;
 773     } else if (!(errno = file->ops->seek(file->inode, fpos))) {
 774         file->f_pos = fpos;
 775     }
 776
 777     unlock_inode(file->inode);
 778
 779 done:
 780     return DO_STATUS(errno);
 781 }
 782
 783 int
 784 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
 785 {
 786     if (!dnode) {
 787         return 0;
 788     }
 789
 790     if (depth > 64) {
 791         return ENAMETOOLONG;
 792     }
 793
 794     size_t len = 0;
 795
 796     if (dnode->parent != dnode) {
 797         len = vfs_get_path(dnode->parent, buf, size, depth + 1);
 798     }
 799
 800     if (len >= size) {
 801         return len;
 802     }
 803
 804     if (!len || buf[len - 1] != VFS_PATH_DELIM) {
 805         buf[len++] = VFS_PATH_DELIM;
 806     }
 807
 808     size_t cpy_size = MIN(dnode->name.len, size - len);
 809     strncpy(buf + len, dnode->name.value, cpy_size);
 810     len += cpy_size;
 811
 812     return len;
 813 }
 814
 815 int
 816 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
 817 {
 818     const char* link;
 819     struct v_inode* inode = dnode->inode;
 820     if (inode->ops->read_symlink) {
 821         lock_inode(inode);
 822
 823         int errno = inode->ops->read_symlink(inode, &link);
 824         strncpy(buf, link, size);
 825
 826         unlock_inode(inode);
 827         return errno;
 828     }
 829     return 0;
 830 }
 831
 832 int
 833 vfs_get_dtype(int itype)
 834 {
 835     switch (itype) {
 836         case VFS_IFDIR:
 837             return DT_DIR;
 838         case VFS_IFSYMLINK:
 839             return DT_SYMLINK;
 840         default:
 841             return DT_PIPE;
 842     }
 843 }
 844
 845 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
 846 {
 847     int errno;
 848     struct v_fd* fd_s;
 849     if ((errno = vfs_getfd(fd, &fd_s))) {
 850         goto done;
 851     }
 852
 853     struct v_dnode* dnode;
 854     errno = vfs_get_path(fd_s->file->dnode, buf, size, 0);
 855
 856     if (errno >= 0) {
 857         return errno;
 858     }
 859
 860 done:
 861     return DO_STATUS(errno);
 862 }
 863
 864 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
 865 {
 866     int errno;
 867     struct v_dnode* dnode;
 868     if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 869         errno = vfs_readlink(dnode, buf, size);
 870     }
 871
 872     if (errno >= 0) {
 873         return errno;
 874     }
 875
 876     return DO_STATUS(errno);
 877 }
 878
 879 __DEFINE_LXSYSCALL4(int,
 880                     readlinkat,
 881                     int,
 882                     dirfd,
 883                     const char*,
 884                     pathname,
 885                     char*,
 886                     buf,
 887                     size_t,
 888                     size)
 889 {
 890     int errno;
 891     struct v_fd* fd_s;
 892     if ((errno = vfs_getfd(dirfd, &fd_s))) {
 893         goto done;
 894     }
 895
 896     struct v_dnode* dnode;
 897     if (!(errno = vfs_walk(
 898             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 899         errno = vfs_readlink(fd_s->file->dnode, buf, size);
 900     }
 901
 902     if (errno >= 0) {
 903         return errno;
 904     }
 905
 906 done:
 907     return DO_STATUS(errno);
 908 }
 909
/*
    NOTE
    When we perform an operation that could affect the layout of a
    directory (i.e., rename, mkdir, rmdir), we must lock the parent dir
    whenever possible. This blocks any ongoing path walk from reaching
    it, and hence avoids exposing any partial state.
*/
 917
 918 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
 919 {
 920     int errno;
 921     struct v_dnode* dnode;
 922     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
 923         return DO_STATUS(errno);
 924     }
 925
 926     lock_dnode(dnode);
 927
 928     if ((errno = vfs_check_writable(dnode))) {
 929         goto done;
 930     }
 931
 932     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
 933         errno = EROFS;
 934         goto done;
 935     }
 936
 937     if (dnode->ref_count > 1 || dnode->inode->open_count) {
 938         errno = EBUSY;
 939         goto done;
 940     }
 941
 942     if (!llist_empty(&dnode->children)) {
 943         errno = ENOTEMPTY;
 944         goto done;
 945     }
 946
 947     struct v_dnode* parent = dnode->parent;
 948
 949     if (!parent) {
 950         errno = EINVAL;
 951         goto done;
 952     }
 953
 954     lock_dnode(parent);
 955     lock_inode(parent->inode);
 956
 957     if ((dnode->inode->itype & VFS_IFDIR)) {
 958         errno = parent->inode->ops->rmdir(parent->inode, dnode);
 959         if (!errno) {
 960             vfs_dcache_remove(dnode);
 961         }
 962     } else {
 963         errno = ENOTDIR;
 964     }
 965
 966     unlock_inode(parent->inode);
 967     unlock_dnode(parent);
 968
 969 done:
 970     unlock_dnode(dnode);
 971     return DO_STATUS(errno);
 972 }
 973
 974 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
 975 {
 976     int errno = 0;
 977     struct v_dnode *parent, *dir;
 978     char name_value[VFS_NAME_MAXLEN];
 979     struct hstr name = HHSTR(name_value, 0, 0);
 980
 981     if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
 982         goto done;
 983     }
 984
 985     if ((errno = vfs_check_writable(parent))) {
 986         goto done;
 987     }
 988
 989     if (!(dir = vfs_d_alloc(parent, &name))) {
 990         errno = ENOMEM;
 991         goto done;
 992     }
 993
 994     lock_dnode(parent);
 995     lock_inode(parent->inode);
 996
 997     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
 998         errno = ENOTSUP;
 999     } else if (!parent->inode->ops->mkdir) {
1000         errno = ENOTSUP;
1001     } else if (!(parent->inode->itype & VFS_IFDIR)) {
1002         errno = ENOTDIR;
1003     } else if (!(errno = parent->inode->ops->mkdir(parent->inode, dir))) {
1004         vfs_dcache_add(parent, dir);
1005         goto cleanup;
1006     }
1007
1008     vfs_d_free(dir);
1009
1010 cleanup:
1011     unlock_inode(parent->inode);
1012     unlock_dnode(parent);
1013 done:
1014     return DO_STATUS(errno);
1015 }
1016
1017 int
1018 __vfs_do_unlink(struct v_dnode* dnode)
1019 {
1020     int errno;
1021     struct v_inode* inode = dnode->inode;
1022
1023     if (dnode->ref_count > 1) {
1024         return EBUSY;
1025     }
1026
1027     if ((errno = vfs_check_writable(dnode))) {
1028         return errno;
1029     }
1030
1031     lock_inode(inode);
1032
1033     if (inode->open_count) {
1034         errno = EBUSY;
1035     } else if (!(inode->itype & VFS_IFDIR)) {
1036         // The underlying unlink implementation should handle
1037         //  symlink case
1038         errno = inode->ops->unlink(inode);
1039         if (!errno) {
1040             vfs_d_free(dnode);
1041         }
1042     } else {
1043         errno = EISDIR;
1044     }
1045
1046     unlock_inode(inode);
1047
1048     return errno;
1049 }
1050
1051 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1052 {
1053     int errno;
1054     struct v_dnode* dnode;
1055     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1056         goto done;
1057     }
1058
1059     errno = __vfs_do_unlink(dnode);
1060
1061 done:
1062     return DO_STATUS(errno);
1063 }
1064
1065 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1066 {
1067     int errno;
1068     struct v_fd* fd_s;
1069     if ((errno = vfs_getfd(fd, &fd_s))) {
1070         goto done;
1071     }
1072
1073     struct v_dnode* dnode;
1074     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1075         errno = __vfs_do_unlink(dnode);
1076     }
1077
1078 done:
1079     return DO_STATUS(errno);
1080 }
1081
1082 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1083 {
1084     int errno;
1085     struct v_dnode *dentry, *to_link, *name_dentry, *name_file;
1086
1087     errno = __vfs_try_locate_file(oldpath, &dentry, &to_link, 0);
1088     if (!errno) {
1089         errno = __vfs_try_locate_file(
1090           newpath, &name_dentry, &name_file, FLOCATE_CREATE_EMPTY);
1091         if (!errno) {
1092             errno = EEXIST;
1093         } else if (name_file) {
1094             errno = vfs_link(to_link, name_file);
1095         }
1096     }
1097     return DO_STATUS(errno);
1098 }
1099
1100 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1101 {
1102     int errno;
1103     struct v_fd* fd_s;
1104
1105     if (!(errno = vfs_getfd(fildes, &fd_s))) {
1106         errno = vfs_fsync(fd_s->file);
1107     }
1108
1109     return DO_STATUS(errno);
1110 }
1111
1112 int
1113 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1114 {
1115     int errno = 0;
1116     struct v_fd* copied = cake_grab(fd_pile);
1117
1118     memcpy(copied, old, sizeof(struct v_fd));
1119
1120     atomic_fetch_add(&old->file->ref_count, 1);
1121
1122     *new = copied;
1123
1124     return errno;
1125 }
1126
1127 int
1128 vfs_dup2(int oldfd, int newfd)
1129 {
1130     if (newfd == oldfd) {
1131         return newfd;
1132     }
1133
1134     int errno;
1135     struct v_fd *oldfd_s, *newfd_s;
1136     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1137         goto done;
1138     }
1139
1140     if (!TEST_FD(newfd)) {
1141         errno = EBADF;
1142         goto done;
1143     }
1144
1145     newfd_s = __current->fdtable->fds[newfd];
1146     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1147         goto done;
1148     }
1149
1150     if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1151         __current->fdtable->fds[newfd] = newfd_s;
1152         return newfd;
1153     }
1154
1155 done:
1156     return DO_STATUS(errno);
1157 }
1158
1159 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1160 {
1161     return vfs_dup2(oldfd, newfd);
1162 }
1163
1164 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1165 {
1166     int errno, newfd;
1167     struct v_fd *oldfd_s, *newfd_s;
1168     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1169         goto done;
1170     }
1171
1172     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1173         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1174         __current->fdtable->fds[newfd] = newfd_s;
1175         return newfd;
1176     }
1177
1178 done:
1179     return DO_STATUS(errno);
1180 }
1181
1182 __DEFINE_LXSYSCALL2(int,
1183                     symlink,
1184                     const char*,
1185                     pathname,
1186                     const char*,
1187                     link_target)
1188 {
1189     int errno;
1190     struct v_dnode* dnode;
1191     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1192         goto done;
1193     }
1194
1195     if (errno = vfs_check_writable(dnode)) {
1196         goto done;
1197     }
1198
1199     if (!dnode->inode->ops->set_symlink) {
1200         errno = ENOTSUP;
1201         goto done;
1202     }
1203
1204     lock_inode(dnode->inode);
1205
1206     errno = dnode->inode->ops->set_symlink(dnode->inode, link_target);
1207
1208     unlock_inode(dnode->inode);
1209
1210 done:
1211     return DO_STATUS(errno);
1212 }
1213
1214 void
1215 vfs_ref_dnode(struct v_dnode* dnode)
1216 {
1217     atomic_fetch_add(&dnode->ref_count, 1);
1218     mnt_mkbusy(dnode->mnt);
1219 }
1220
1221 void
1222 vfs_unref_dnode(struct v_dnode* dnode)
1223 {
1224     atomic_fetch_sub(&dnode->ref_count, 1);
1225     mnt_chillax(dnode->mnt);
1226 }
1227
1228 int
1229 vfs_do_chdir(struct proc_info* proc, struct v_dnode* dnode)
1230 {
1231     int errno = 0;
1232
1233     lock_dnode(dnode);
1234
1235     if (!(dnode->inode->itype & VFS_IFDIR)) {
1236         errno = ENOTDIR;
1237         goto done;
1238     }
1239
1240     if (proc->cwd) {
1241         vfs_unref_dnode(proc->cwd);
1242     }
1243
1244     vfs_ref_dnode(dnode);
1245     proc->cwd = dnode;
1246
1247     unlock_dnode(dnode);
1248
1249 done:
1250     return errno;
1251 }
1252
1253 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1254 {
1255     struct v_dnode* dnode;
1256     int errno = 0;
1257
1258     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1259         goto done;
1260     }
1261
1262     errno = vfs_do_chdir(__current, dnode);
1263
1264 done:
1265     return DO_STATUS(errno);
1266 }
1267
1268 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1269 {
1270     struct v_fd* fd_s;
1271     int errno = 0;
1272
1273     if ((errno = vfs_getfd(fd, &fd_s))) {
1274         goto done;
1275     }
1276
1277     errno = vfs_do_chdir(__current, fd_s->file->dnode);
1278
1279 done:
1280     return DO_STATUS(errno);
1281 }
1282
1283 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1284 {
1285     int errno = 0;
1286     char* ret_ptr = 0;
1287     if (size < 2) {
1288         errno = ERANGE;
1289         goto done;
1290     }
1291
1292     size_t len = 0;
1293
1294     if (!__current->cwd) {
1295         *buf = VFS_PATH_DELIM;
1296         len = 1;
1297     } else {
1298         len = vfs_get_path(__current->cwd, buf, size, 0);
1299         if (len == size) {
1300             errno = ERANGE;
1301             goto done;
1302         }
1303     }
1304
1305     buf[len + 1] = '\0';
1306
1307     ret_ptr = buf;
1308
1309 done:
1310     __current->k_status = errno;
1311     return ret_ptr;
1312 }
1313
1314 int
1315 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1316 {
1317     int errno = 0;
1318     if (current->inode->id == target->inode->id) {
1319         // hard link
1320         return 0;
1321     }
1322
1323     if (errno = vfs_check_writable(current)) {
1324         return errno;
1325     }
1326
1327     if (current->ref_count > 1 || target->ref_count > 1) {
1328         return EBUSY;
1329     }
1330
1331     if (current->super_block != target->super_block) {
1332         return EXDEV;
1333     }
1334
1335     struct v_dnode* oldparent = current->parent;
1336     struct v_dnode* newparent = target->parent;
1337
1338     lock_dnode(current);
1339     lock_dnode(target);
1340     if (oldparent)
1341         lock_dnode(oldparent);
1342     if (newparent)
1343         lock_dnode(newparent);
1344
1345     if (!llist_empty(&target->children)) {
1346         errno = ENOTEMPTY;
1347         unlock_dnode(target);
1348         goto cleanup;
1349     }
1350
1351     if ((errno =
1352            current->inode->ops->rename(current->inode, current, target))) {
1353         unlock_dnode(target);
1354         goto cleanup;
1355     }
1356
1357     // re-position current
1358     hstrcpy(&current->name, &target->name);
1359     vfs_dcache_rehash(newparent, current);
1360
1361     // detach target
1362     vfs_d_free(target);
1363
1364     unlock_dnode(target);
1365
1366 cleanup:
1367     unlock_dnode(current);
1368     if (oldparent)
1369         unlock_dnode(oldparent);
1370     if (newparent)
1371         unlock_dnode(newparent);
1372
1373     return errno;
1374 }
1375
1376 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1377 {
1378     struct v_dnode *cur, *target_parent, *target;
1379     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1380     int errno = 0;
1381
1382     if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1383         goto done;
1384     }
1385
1386     if ((errno = vfs_walk(
1387            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1388         goto done;
1389     }
1390
1391     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1392     if (errno == ENOENT) {
1393         target = vfs_d_alloc(target_parent, &name);
1394         vfs_dcache_add(target_parent, target);
1395     } else if (errno) {
1396         goto done;
1397     }
1398
1399     if (!target) {
1400         errno = ENOMEM;
1401         goto done;
1402     }
1403
1404     errno = vfs_do_rename(cur, target);
1405
1406 done:
1407     vfree(name.value);
1408     return DO_STATUS(errno);
1409 }