lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
   4  * @brief Lunaix virtual file system - an abstraction layer for all file system.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
  14 /*
  15  TODO vfs & device todos checklist
  16
  17     It is overseen by Twilight Sparkle ;)
  18
  19  1. Get inodes hooked into lru (CHECKED)
  20  2. Get dnodes hooked into lru (CHECKED)
  21  3. Get inodes properly hashed so they can be reused by underlying fs (CHECKED)
  22  4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
  23         [good idea] or a constructor/destructor pattern in cake allocator ?
  24  5. (mount) Figure out a way to identify a busy mount point before unmount
  25             maybe a unified mount_point structure that maintains a reference
  26             counter on any dnodes within the subtree? Such a counter will only
  27             increment if a file is opened or a dnode is being used as working
  28             directory, and decrement conversely. (CHECKED)
  29  6. (mount) Ability to track all mount points (including sub-mounts)
  30             so we can be confident to clean up everything when we
  31             unmount. (CHECKED)
  32  7. (mount) Figure out a way to acquire the device represented by a dnode.
  33             so it can be used to mount. (e.g. we wish to get `struct device*`
  34             out of the dnode at /dev/sda)
  35             [tip] we should pay attention at twifs and add a private_data field
  36             under struct v_dnode? (CHECKED)
  37  8. (mount) Then, we should refactor on mount/unmount mechanism. (CHECKED)
  38  9. (mount) (future) Ability to mount any thing? e.g. Linux can mount a disk
  39                     image file using a so called "loopback" pseudo device. Maybe
  40                     we can do similar thing in Lunaix? A block device emulation
  41                     above the regular file when we mount it on.
  42  10. (device) device number (dev_t) allocation
  43             [good idea] <class>:<subclass>:<uniq_id> composition
  44 */
  45
  46 #include <klibc/string.h>
  47 #include <lunaix/dirent.h>
  48 #include <lunaix/foptions.h>
  49 #include <lunaix/fs.h>
  50 #include <lunaix/mm/cake.h>
  51 #include <lunaix/mm/page.h>
  52 #include <lunaix/mm/valloc.h>
  53 #include <lunaix/process.h>
  54 #include <lunaix/spike.h>
  55 #include <lunaix/syscall.h>
  56
  57 #include <lunaix/fs/twifs.h>
  58
  59 static struct cake_pile* dnode_pile;
  60 static struct cake_pile* inode_pile;
  61 static struct cake_pile* file_pile;
  62 static struct cake_pile* superblock_pile;
  63 static struct cake_pile* fd_pile;
  64
  65 struct v_dnode* vfs_sysroot;
  66 static struct hbucket* dnode_cache;
  67
  68 struct lru_zone *dnode_lru, *inode_lru;
  69
  70 struct hstr vfs_ddot = HSTR("..", 2);
  71 struct hstr vfs_dot = HSTR(".", 1);
  72 struct hstr vfs_empty = HSTR("", 0);
  73
  74 struct v_superblock*
  75 vfs_sb_alloc();
  76
  77 void
  78 vfs_sb_free(struct v_superblock* sb);
  79
  80 static int
  81 __vfs_try_evict_dnode(struct lru_node* obj);
  82
  83 static int
  84 __vfs_try_evict_inode(struct lru_node* obj);
  85
  86 void
  87 vfs_init()
  88 {
  89     // 为他们专门创建一个蛋糕堆，而不使用valloc，这样我们可以最小化内碎片的产生
  90     dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
  91     inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
  92     file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
  93     fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
  94     superblock_pile =
  95       cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
  96
  97     dnode_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
  98
  99     dnode_lru = lru_new_zone(__vfs_try_evict_dnode);
 100     inode_lru = lru_new_zone(__vfs_try_evict_inode);
 101
 102     hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
 103     hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
 104
 105     // 创建一个根dnode。
 106     vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
 107     vfs_sysroot->parent = vfs_sysroot;
 108     atomic_fetch_add(&vfs_sysroot->ref_count, 1);
 109 }
 110
 111 inline struct hbucket*
 112 __dcache_hash(struct v_dnode* parent, uint32_t* hash)
 113 {
 114     uint32_t _hash = *hash;
 115     // 与parent的指针值做加法，来减小碰撞的可能性。
 116     _hash += (uint32_t)parent;
 117     // 确保低位更加随机
 118     _hash = _hash ^ (_hash >> VFS_HASHBITS);
 119     *hash = _hash;
 120     return &dnode_cache[_hash & VFS_HASH_MASK];
 121 }
 122
 123 struct v_dnode*
 124 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 125 {
 126     if (!str->len || HSTR_EQ(str, &vfs_dot))
 127         return parent;
 128
 129     if (HSTR_EQ(str, &vfs_ddot)) {
 130         return parent->parent;
 131     }
 132
 133     uint32_t hash = str->hash;
 134     struct hbucket* slot = __dcache_hash(parent, &hash);
 135
 136     struct v_dnode *pos, *n;
 137     hashtable_bucket_foreach(slot, pos, n, hash_list)
 138     {
 139         if (pos->name.hash == hash) {
 140             return pos;
 141         }
 142     }
 143     return NULL;
 144 }
 145
 146 void
 147 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 148 {
 149     assert(parent);
 150
 151     atomic_fetch_add(&dnode->ref_count, 1);
 152     dnode->parent = parent;
 153     llist_append(&parent->children, &dnode->siblings);
 154
 155     struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash);
 156     hlist_add(&bucket->head, &dnode->hash_list);
 157 }
 158
 159 void
 160 vfs_dcache_remove(struct v_dnode* dnode)
 161 {
 162     assert(dnode);
 163     assert(dnode->ref_count == 1);
 164
 165     llist_delete(&dnode->siblings);
 166     hlist_delete(&dnode->hash_list);
 167
 168     dnode->parent = NULL;
 169     atomic_fetch_sub(&dnode->ref_count, 1);
 170 }
 171
 172 void
 173 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
 174 {
 175     assert(new_parent);
 176
 177     hstr_rehash(&dnode->name, HSTR_FULL_HASH);
 178     vfs_dcache_remove(dnode);
 179     vfs_dcache_add(new_parent, dnode);
 180 }
 181
 182 int
 183 vfs_open(struct v_dnode* dnode, struct v_file** file)
 184 {
 185     if (!dnode->inode || !dnode->inode->ops->open) {
 186         return ENOTSUP;
 187     }
 188
 189     struct v_inode* inode = dnode->inode;
 190
 191     lock_inode(inode);
 192
 193     struct v_file* vfile = cake_grab(file_pile);
 194     memset(vfile, 0, sizeof(*vfile));
 195
 196     vfile->dnode = dnode;
 197     vfile->inode = inode;
 198     vfile->ref_count = ATOMIC_VAR_INIT(1);
 199     vfile->ops = inode->default_fops;
 200
 201     if ((inode->itype & VFS_IFFILE) && !inode->pg_cache) {
 202         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 203         pcache_init(pcache);
 204         pcache->master = inode;
 205         inode->pg_cache = pcache;
 206     }
 207
 208     int errno = inode->ops->open(inode, vfile);
 209     if (errno) {
 210         cake_release(file_pile, vfile);
 211     } else {
 212         atomic_fetch_add(&dnode->ref_count, 1);
 213         inode->open_count++;
 214         mnt_mkbusy(dnode->mnt);
 215
 216         *file = vfile;
 217     }
 218
 219     unlock_inode(inode);
 220
 221     return errno;
 222 }
 223
 224 void
 225 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
 226 {
 227     if (assign_to->inode) {
 228         assign_to->inode->link_count--;
 229     }
 230     assign_to->inode = inode;
 231     inode->link_count++;
 232 }
 233
 234 int
 235 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 236 {
 237     int errno;
 238
 239     if ((errno = vfs_check_writable(to_link))) {
 240         return errno;
 241     }
 242
 243     lock_inode(to_link->inode);
 244     if (to_link->super_block->root != name->super_block->root) {
 245         errno = EXDEV;
 246     } else if (!to_link->inode->ops->link) {
 247         errno = ENOTSUP;
 248     } else if (!(errno = to_link->inode->ops->link(to_link->inode, name))) {
 249         vfs_assign_inode(name, to_link->inode);
 250     }
 251     unlock_inode(to_link->inode);
 252
 253     return errno;
 254 }
 255
 256 int
 257 vfs_close(struct v_file* file)
 258 {
 259     int errno = 0;
 260     if (!(errno = file->ops->close(file))) {
 261         atomic_fetch_sub(&file->dnode->ref_count, 1);
 262         file->inode->open_count--;
 263         mnt_chillax(file->dnode->mnt);
 264
 265         pcache_commit_all(file->inode);
 266         cake_release(file_pile, file);
 267     }
 268     return errno;
 269 }
 270
 271 int
 272 vfs_fsync(struct v_file* file)
 273 {
 274     int errno;
 275     if ((errno = vfs_check_writable(file->dnode))) {
 276         return errno;
 277     }
 278
 279     lock_inode(file->inode);
 280
 281     pcache_commit_all(file->inode);
 282
 283     errno = ENOTSUP;
 284     if (file->ops->sync) {
 285         errno = file->ops->sync(file);
 286     }
 287
 288     unlock_inode(file->inode);
 289
 290     return errno;
 291 }
 292
 293 int
 294 vfs_alloc_fdslot(int* fd)
 295 {
 296     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 297         if (!__current->fdtable->fds[i]) {
 298             *fd = i;
 299             return 0;
 300         }
 301     }
 302     return EMFILE;
 303 }
 304
 305 struct v_superblock*
 306 vfs_sb_alloc()
 307 {
 308     struct v_superblock* sb = cake_grab(superblock_pile);
 309     memset(sb, 0, sizeof(*sb));
 310     llist_init_head(&sb->sb_list);
 311     sb->i_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 312     return sb;
 313 }
 314
 315 void
 316 vfs_sb_free(struct v_superblock* sb)
 317 {
 318     vfree(sb->i_cache);
 319     cake_release(superblock_pile, sb);
 320 }
 321
 322 static int
 323 __vfs_try_evict_dnode(struct lru_node* obj)
 324 {
 325     struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
 326
 327     if (!dnode->ref_count) {
 328         vfs_d_free(dnode);
 329         return 1;
 330     }
 331     return 0;
 332 }
 333
 334 static int
 335 __vfs_try_evict_inode(struct lru_node* obj)
 336 {
 337     struct v_inode* inode = container_of(obj, struct v_inode, lru);
 338
 339     if (!inode->link_count && !inode->open_count) {
 340         vfs_i_free(inode);
 341         return 1;
 342     }
 343     return 0;
 344 }
 345
 346 struct v_dnode*
 347 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
 348 {
 349     struct v_dnode* dnode = cake_grab(dnode_pile);
 350     if (!dnode) {
 351         lru_evict_half(dnode_lru);
 352
 353         if (!(dnode = cake_grab(dnode_pile))) {
 354             return NULL;
 355         }
 356     }
 357
 358     memset(dnode, 0, sizeof(*dnode));
 359     llist_init_head(&dnode->children);
 360     llist_init_head(&dnode->siblings);
 361     mutex_init(&dnode->lock);
 362
 363     dnode->ref_count = ATOMIC_VAR_INIT(0);
 364     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 365
 366     hstrcpy(&dnode->name, name);
 367
 368     if (parent) {
 369         dnode->super_block = parent->super_block;
 370         dnode->mnt = parent->mnt;
 371     }
 372
 373     lru_use_one(dnode_lru, &dnode->lru);
 374
 375     return dnode;
 376 }
 377
 378 void
 379 vfs_d_free(struct v_dnode* dnode)
 380 {
 381     assert(dnode->ref_count == 1);
 382
 383     if (dnode->inode) {
 384         assert(dnode->inode->link_count > 0);
 385         dnode->inode->link_count--;
 386     }
 387
 388     vfs_dcache_remove(dnode);
 389     // Make sure the children de-referencing their parent.
 390     // With lru presented, the eviction will be propagated over the entire
 391     // detached subtree eventually
 392     struct v_dnode *pos, *n;
 393     llist_for_each(pos, n, &dnode->children, siblings)
 394     {
 395         vfs_dcache_remove(pos);
 396     }
 397
 398     vfree(dnode->name.value);
 399     cake_release(dnode_pile, dnode);
 400 }
 401
 402 struct v_inode*
 403 vfs_i_find(struct v_superblock* sb, uint32_t i_id)
 404 {
 405     struct hbucket* slot = &sb->i_cache[i_id & VFS_HASH_MASK];
 406     struct v_inode *pos, *n;
 407     hashtable_bucket_foreach(slot, pos, n, hash_list)
 408     {
 409         if (pos->id == i_id) {
 410             lru_use_one(inode_lru, &pos->lru);
 411             return pos;
 412         }
 413     }
 414
 415     return NULL;
 416 }
 417
 418 void
 419 vfs_i_addhash(struct v_inode* inode)
 420 {
 421     struct hbucket* slot = &inode->sb->i_cache[inode->id & VFS_HASH_MASK];
 422
 423     hlist_delete(&inode->hash_list);
 424     hlist_add(&slot->head, &inode->hash_list);
 425 }
 426
 427 struct v_inode*
 428 vfs_i_alloc(struct v_superblock* sb)
 429 {
 430     assert(sb->ops.init_inode);
 431
 432     struct v_inode* inode;
 433     if (!(inode = cake_grab(inode_pile))) {
 434         lru_evict_half(inode_lru);
 435         if (!(inode = cake_grab(inode_pile))) {
 436             return NULL;
 437         }
 438     }
 439
 440     memset(inode, 0, sizeof(*inode));
 441     mutex_init(&inode->lock);
 442     llist_init_head(&inode->xattrs);
 443
 444     sb->ops.init_inode(sb, inode);
 445
 446     inode->sb = sb;
 447     inode->ctime = clock_unixtime();
 448     inode->atime = inode->ctime;
 449     inode->mtime = inode->ctime;
 450
 451 done:
 452     lru_use_one(inode_lru, &inode->lru);
 453     return inode;
 454 }
 455
 456 void
 457 vfs_i_free(struct v_inode* inode)
 458 {
 459     if (inode->pg_cache) {
 460         pcache_release(inode->pg_cache);
 461         vfree(inode->pg_cache);
 462     }
 463     inode->ops->sync(inode);
 464     hlist_delete(&inode->hash_list);
 465     cake_release(inode_pile, inode);
 466 }
 467
 468 /* ---- System call definition and support ---- */
 469
 470 #define FLOCATE_CREATE_EMPTY 1
 471
 472 int
 473 vfs_getfd(int fd, struct v_fd** fd_s)
 474 {
 475     if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) {
 476         return 0;
 477     }
 478     return EBADF;
 479 }
 480
 481 int
 482 __vfs_try_locate_file(const char* path,
 483                       struct v_dnode** fdir,
 484                       struct v_dnode** file,
 485                       int options)
 486 {
 487     char name_str[VFS_NAME_MAXLEN];
 488     struct hstr name = HSTR(name_str, 0);
 489     int errno;
 490
 491     name_str[0] = 0;
 492     if ((errno = vfs_walk_proc(path, fdir, &name, VFS_WALK_PARENT))) {
 493         return errno;
 494     }
 495
 496     errno = vfs_walk(*fdir, name.value, file, NULL, 0);
 497     if (errno != ENOENT || !(options & FLOCATE_CREATE_EMPTY)) {
 498         return errno;
 499     }
 500
 501     struct v_dnode* parent = *fdir;
 502     struct v_dnode* file_new = vfs_d_alloc(parent, &name);
 503
 504     if (!file_new) {
 505         return ENOMEM;
 506     }
 507
 508     lock_dnode(parent);
 509
 510     if (!(errno = parent->inode->ops->create(parent->inode, file_new))) {
 511         vfs_dcache_add(parent, file_new);
 512         *file = file_new;
 513     } else {
 514         vfs_d_free(file_new);
 515     }
 516
 517     unlock_dnode(parent);
 518
 519     return errno;
 520 }
 521
 522 int
 523 vfs_do_open(const char* path, int options)
 524 {
 525     int errno, fd;
 526     struct v_dnode *dentry, *file;
 527     struct v_file* ofile = 0;
 528
 529     errno = __vfs_try_locate_file(
 530       path, &dentry, &file, (options & FO_CREATE) ? FLOCATE_CREATE_EMPTY : 0);
 531
 532     if (errno || (errno = vfs_open(file, &ofile))) {
 533         return errno;
 534     }
 535
 536     struct v_inode* o_inode = ofile->inode;
 537
 538     if (!errno && !(errno = vfs_alloc_fdslot(&fd))) {
 539         struct v_fd* fd_s = vzalloc(sizeof(*fd_s));
 540         ofile->f_pos = ofile->inode->fsize & -((options & FO_APPEND) != 0);
 541         fd_s->file = ofile;
 542         fd_s->flags = options;
 543         __current->fdtable->fds[fd] = fd_s;
 544         return fd;
 545     }
 546
 547     return errno;
 548 }
 549
 550 __DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
 551 {
 552     int errno = vfs_do_open(path, options);
 553     return DO_STATUS_OR_RETURN(errno);
 554 }
 555
 556 __DEFINE_LXSYSCALL1(int, close, int, fd)
 557 {
 558     struct v_fd* fd_s;
 559     int errno = 0;
 560     if ((errno = vfs_getfd(fd, &fd_s))) {
 561         goto done_err;
 562     }
 563
 564     if (fd_s->file->ref_count > 1) {
 565         fd_s->file->ref_count--;
 566     } else if ((errno = vfs_close(fd_s->file))) {
 567         goto done_err;
 568     }
 569
 570     vfree(fd_s);
 571     __current->fdtable->fds[fd] = 0;
 572
 573 done_err:
 574     return DO_STATUS(errno);
 575 }
 576
 577 void
 578 __vfs_readdir_callback(struct dir_context* dctx,
 579                        const char* name,
 580                        const int len,
 581                        const int dtype)
 582 {
 583     struct dirent* dent = (struct dirent*)dctx->cb_data;
 584     strncpy(dent->d_name, name, DIRENT_NAME_MAX_LEN);
 585     dent->d_nlen = len;
 586     dent->d_type = dtype;
 587 }
 588
 589 __DEFINE_LXSYSCALL2(int, readdir, int, fd, struct dirent*, dent)
 590 {
 591     struct v_fd* fd_s;
 592     int errno;
 593
 594     if ((errno = vfs_getfd(fd, &fd_s))) {
 595         goto done;
 596     }
 597
 598     struct v_inode* inode = fd_s->file->inode;
 599
 600     lock_inode(inode);
 601
 602     if (!(inode->itype & VFS_IFDIR)) {
 603         errno = ENOTDIR;
 604     } else {
 605         struct dir_context dctx =
 606           (struct dir_context){ .cb_data = dent,
 607                                 .index = dent->d_offset,
 608                                 .read_complete_callback =
 609                                   __vfs_readdir_callback };
 610         errno = 1;
 611         if (dent->d_offset == 0) {
 612             __vfs_readdir_callback(&dctx, vfs_dot.value, vfs_dot.len, DT_DIR);
 613         } else if (dent->d_offset == 1) {
 614             __vfs_readdir_callback(&dctx, vfs_ddot.value, vfs_ddot.len, DT_DIR);
 615         } else {
 616             dctx.index -= 2;
 617             if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
 618                 unlock_inode(inode);
 619                 goto done;
 620             }
 621         }
 622         dent->d_offset++;
 623     }
 624
 625     unlock_inode(inode);
 626
 627 done:
 628     return DO_STATUS_OR_RETURN(errno);
 629 }
 630
 631 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 632 {
 633     int errno = 0;
 634     struct v_fd* fd_s;
 635     if ((errno = vfs_getfd(fd, &fd_s))) {
 636         goto done;
 637     }
 638
 639     struct v_file* file = fd_s->file;
 640     if ((file->inode->itype & VFS_IFDIR)) {
 641         errno = EISDIR;
 642         goto done;
 643     }
 644
 645     lock_inode(file->inode);
 646
 647     file->inode->atime = clock_unixtime();
 648
 649     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 650         errno = file->ops->read(file->inode, buf, count, file->f_pos);
 651     } else {
 652         errno = pcache_read(file->inode, buf, count, file->f_pos);
 653     }
 654
 655     if (errno > 0) {
 656         file->f_pos += errno;
 657         unlock_inode(file->inode);
 658         return errno;
 659     }
 660
 661     unlock_inode(file->inode);
 662
 663 done:
 664     return DO_STATUS(errno);
 665 }
 666
 667 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
 668 {
 669     int errno = 0;
 670     struct v_fd* fd_s;
 671     if ((errno = vfs_getfd(fd, &fd_s))) {
 672         goto done;
 673     }
 674
 675     struct v_file* file = fd_s->file;
 676
 677     if ((errno = vfs_check_writable(file->dnode))) {
 678         goto done;
 679     }
 680
 681     if ((file->inode->itype & VFS_IFDIR)) {
 682         errno = EISDIR;
 683         goto done;
 684     }
 685
 686     lock_inode(file->inode);
 687
 688     file->inode->mtime = clock_unixtime();
 689
 690     __SYSCALL_INTERRUPTIBLE({
 691         if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 692             errno = file->ops->write(file->inode, buf, count, file->f_pos);
 693         } else {
 694             errno = pcache_write(file->inode, buf, count, file->f_pos);
 695         }
 696     })
 697
 698     if (errno > 0) {
 699         file->f_pos += errno;
 700         unlock_inode(file->inode);
 701         return errno;
 702     }
 703
 704     unlock_inode(file->inode);
 705
 706 done:
 707     return DO_STATUS(errno);
 708 }
 709
 710 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
 711 {
 712     int errno = 0;
 713     struct v_fd* fd_s;
 714     if ((errno = vfs_getfd(fd, &fd_s))) {
 715         goto done;
 716     }
 717
 718     struct v_file* file = fd_s->file;
 719
 720     if (!file->ops->seek) {
 721         errno = ENOTSUP;
 722         goto done;
 723     }
 724
 725     lock_inode(file->inode);
 726
 727     int overflow = 0;
 728     int fpos = file->f_pos;
 729     switch (options) {
 730         case FSEEK_CUR:
 731             overflow = __builtin_sadd_overflow((int)file->f_pos, offset, &fpos);
 732             break;
 733         case FSEEK_END:
 734             overflow =
 735               __builtin_sadd_overflow((int)file->inode->fsize, offset, &fpos);
 736             break;
 737         case FSEEK_SET:
 738             fpos = offset;
 739             break;
 740     }
 741     if (overflow) {
 742         errno = EOVERFLOW;
 743     } else if (!(errno = file->ops->seek(file->inode, fpos))) {
 744         file->f_pos = fpos;
 745     }
 746
 747     unlock_inode(file->inode);
 748
 749 done:
 750     return DO_STATUS(errno);
 751 }
 752
 753 int
 754 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
 755 {
 756     if (!dnode || dnode->parent == dnode) {
 757         return 0;
 758     }
 759
 760     if (depth > 64) {
 761         return ENAMETOOLONG;
 762     }
 763
 764     size_t len = vfs_get_path(dnode->parent, buf, size, depth + 1);
 765
 766     if (len >= size) {
 767         return len;
 768     }
 769
 770     buf[len++] = VFS_PATH_DELIM;
 771
 772     size_t cpy_size = MIN(dnode->name.len, size - len);
 773     strncpy(buf + len, dnode->name.value, cpy_size);
 774     len += cpy_size;
 775
 776     return len;
 777 }
 778
 779 int
 780 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
 781 {
 782     const char* link;
 783     struct v_inode* inode = dnode->inode;
 784     if (inode->ops->read_symlink) {
 785         lock_inode(inode);
 786
 787         int errno = inode->ops->read_symlink(inode, &link);
 788         strncpy(buf, link, size);
 789
 790         unlock_inode(inode);
 791         return errno;
 792     }
 793     return 0;
 794 }
 795
 796 int
 797 vfs_get_dtype(int itype)
 798 {
 799     switch (itype) {
 800         case VFS_IFDIR:
 801             return DT_DIR;
 802         case VFS_IFSYMLINK:
 803             return DT_SYMLINK;
 804         default:
 805             return DT_PIPE;
 806     }
 807 }
 808
 809 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
 810 {
 811     int errno;
 812     struct v_fd* fd_s;
 813     if ((errno = vfs_getfd(fd, &fd_s))) {
 814         goto done;
 815     }
 816
 817     struct v_dnode* dnode;
 818     errno = vfs_get_path(fd_s->file->dnode, buf, size, 0);
 819
 820     if (errno >= 0) {
 821         return errno;
 822     }
 823
 824 done:
 825     return DO_STATUS(errno);
 826 }
 827
 828 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
 829 {
 830     int errno;
 831     struct v_dnode* dnode;
 832     if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 833         errno = vfs_readlink(dnode, buf, size);
 834     }
 835
 836     if (errno >= 0) {
 837         return errno;
 838     }
 839
 840     return DO_STATUS(errno);
 841 }
 842
 843 __DEFINE_LXSYSCALL4(int,
 844                     readlinkat,
 845                     int,
 846                     dirfd,
 847                     const char*,
 848                     pathname,
 849                     char*,
 850                     buf,
 851                     size_t,
 852                     size)
 853 {
 854     int errno;
 855     struct v_fd* fd_s;
 856     if ((errno = vfs_getfd(dirfd, &fd_s))) {
 857         goto done;
 858     }
 859
 860     struct v_dnode* dnode;
 861     if (!(errno = vfs_walk(
 862             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 863         errno = vfs_readlink(fd_s->file->dnode, buf, size);
 864     }
 865
 866     if (errno >= 0) {
 867         return errno;
 868     }
 869
 870 done:
 871     return DO_STATUS(errno);
 872 }
 873
 874 /*
 875     NOTE
 876     When we perform an operation that could affect the layout of a
 877     directory (e.g., rename, mkdir, rmdir), we must lock the parent dir
 878     whenever possible. This blocks any ongoing path walk from reaching
 879     it, and hence avoids exposing any partial state.
 880 */
 881
 882 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
 883 {
 884     int errno;
 885     struct v_dnode* dnode;
 886     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
 887         return DO_STATUS(errno);
 888     }
 889
 890     lock_dnode(dnode);
 891
 892     if ((errno = vfs_check_writable(dnode))) {
 893         goto done;
 894     }
 895
 896     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
 897         errno = EROFS;
 898         goto done;
 899     }
 900
 901     if (dnode->ref_count > 1 || dnode->inode->open_count) {
 902         errno = EBUSY;
 903         goto done;
 904     }
 905
 906     if (!llist_empty(&dnode->children)) {
 907         errno = ENOTEMPTY;
 908         goto done;
 909     }
 910
 911     struct v_dnode* parent = dnode->parent;
 912
 913     if (!parent) {
 914         errno = EINVAL;
 915         goto done;
 916     }
 917
 918     lock_dnode(parent);
 919     lock_inode(parent->inode);
 920
 921     if ((dnode->inode->itype & VFS_IFDIR)) {
 922         errno = parent->inode->ops->rmdir(parent->inode, dnode);
 923         if (!errno) {
 924             vfs_dcache_remove(dnode);
 925         }
 926     } else {
 927         errno = ENOTDIR;
 928     }
 929
 930     unlock_inode(parent->inode);
 931     unlock_dnode(parent);
 932
 933 done:
 934     unlock_dnode(dnode);
 935     return DO_STATUS(errno);
 936 }
 937
 938 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
 939 {
 940     int errno = 0;
 941     struct v_dnode *parent, *dir;
 942     char name_value[VFS_NAME_MAXLEN];
 943     struct hstr name = HHSTR(name_value, 0, 0);
 944
 945     if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
 946         goto done;
 947     }
 948
 949     if ((errno = vfs_check_writable(parent))) {
 950         goto done;
 951     }
 952
 953     if (!(dir = vfs_d_alloc(parent, &name))) {
 954         errno = ENOMEM;
 955         goto done;
 956     }
 957
 958     lock_dnode(parent);
 959     lock_inode(parent->inode);
 960
 961     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
 962         errno = ENOTSUP;
 963     } else if (!parent->inode->ops->mkdir) {
 964         errno = ENOTSUP;
 965     } else if (!(parent->inode->itype & VFS_IFDIR)) {
 966         errno = ENOTDIR;
 967     } else if (!(errno = parent->inode->ops->mkdir(parent->inode, dir))) {
 968         vfs_dcache_add(parent, dir);
 969         goto cleanup;
 970     }
 971
 972     vfs_d_free(dir);
 973
 974 cleanup:
 975     unlock_inode(parent->inode);
 976     unlock_dnode(parent);
 977 done:
 978     return DO_STATUS(errno);
 979 }
 980
 981 int
 982 __vfs_do_unlink(struct v_dnode* dnode)
 983 {
 984     int errno;
 985     struct v_inode* inode = dnode->inode;
 986
 987     if (dnode->ref_count > 1) {
 988         return EBUSY;
 989     }
 990
 991     if ((errno = vfs_check_writable(dnode))) {
 992         return errno;
 993     }
 994
 995     lock_inode(inode);
 996
 997     if (inode->open_count) {
 998         errno = EBUSY;
 999     } else if (!(inode->itype & VFS_IFDIR)) {
1000         // The underlying unlink implementation should handle
1001         //  symlink case
1002         errno = inode->ops->unlink(inode);
1003         if (!errno) {
1004             vfs_d_free(dnode);
1005         }
1006     } else {
1007         errno = EISDIR;
1008     }
1009
1010     unlock_inode(inode);
1011
1012     return errno;
1013 }
1014
1015 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1016 {
1017     int errno;
1018     struct v_dnode* dnode;
1019     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1020         goto done;
1021     }
1022
1023     errno = __vfs_do_unlink(dnode);
1024
1025 done:
1026     return DO_STATUS(errno);
1027 }
1028
1029 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1030 {
1031     int errno;
1032     struct v_fd* fd_s;
1033     if ((errno = vfs_getfd(fd, &fd_s))) {
1034         goto done;
1035     }
1036
1037     struct v_dnode* dnode;
1038     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1039         errno = __vfs_do_unlink(dnode);
1040     }
1041
1042 done:
1043     return DO_STATUS(errno);
1044 }
1045
1046 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1047 {
1048     int errno;
1049     struct v_dnode *dentry, *to_link, *name_dentry, *name_file;
1050
1051     errno = __vfs_try_locate_file(oldpath, &dentry, &to_link, 0);
1052     if (!errno) {
1053         errno = __vfs_try_locate_file(
1054           newpath, &name_dentry, &name_file, FLOCATE_CREATE_EMPTY);
1055         if (!errno) {
1056             errno = EEXIST;
1057         } else if (name_file) {
1058             errno = vfs_link(to_link, name_file);
1059         }
1060     }
1061     return DO_STATUS(errno);
1062 }
1063
1064 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1065 {
1066     int errno;
1067     struct v_fd* fd_s;
1068
1069     if (!(errno = vfs_getfd(fildes, &fd_s))) {
1070         errno = vfs_fsync(fd_s->file);
1071     }
1072
1073     return DO_STATUS(errno);
1074 }
1075
1076 int
1077 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1078 {
1079     int errno = 0;
1080     struct v_fd* copied = cake_grab(fd_pile);
1081
1082     memcpy(copied, old, sizeof(struct v_fd));
1083
1084     atomic_fetch_add(&old->file->ref_count, 1);
1085
1086     *new = copied;
1087
1088     return errno;
1089 }
1090
1091 int
1092 vfs_dup2(int oldfd, int newfd)
1093 {
1094     if (newfd == oldfd) {
1095         return newfd;
1096     }
1097
1098     int errno;
1099     struct v_fd *oldfd_s, *newfd_s;
1100     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1101         goto done;
1102     }
1103
1104     if (!TEST_FD(newfd)) {
1105         errno = EBADF;
1106         goto done;
1107     }
1108
1109     newfd_s = __current->fdtable->fds[newfd];
1110     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1111         goto done;
1112     }
1113
1114     if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1115         __current->fdtable->fds[newfd] = newfd_s;
1116         return newfd;
1117     }
1118
1119 done:
1120     return DO_STATUS(errno);
1121 }
1122
1123 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1124 {
1125     return vfs_dup2(oldfd, newfd);
1126 }
1127
1128 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1129 {
1130     int errno, newfd;
1131     struct v_fd *oldfd_s, *newfd_s;
1132     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1133         goto done;
1134     }
1135
1136     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1137         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1138         __current->fdtable->fds[newfd] = newfd_s;
1139         return newfd;
1140     }
1141
1142 done:
1143     return DO_STATUS(errno);
1144 }
1145
1146 __DEFINE_LXSYSCALL2(int,
1147                     symlink,
1148                     const char*,
1149                     pathname,
1150                     const char*,
1151                     link_target)
1152 {
1153     int errno;
1154     struct v_dnode* dnode;
1155     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1156         goto done;
1157     }
1158
1159     if (errno = vfs_check_writable(dnode)) {
1160         goto done;
1161     }
1162
1163     if (!dnode->inode->ops->set_symlink) {
1164         errno = ENOTSUP;
1165         goto done;
1166     }
1167
1168     lock_inode(dnode->inode);
1169
1170     errno = dnode->inode->ops->set_symlink(dnode->inode, link_target);
1171
1172     unlock_inode(dnode->inode);
1173
1174 done:
1175     return DO_STATUS(errno);
1176 }
1177
1178 int
1179 __vfs_do_chdir(struct v_dnode* dnode)
1180 {
1181     int errno = 0;
1182
1183     lock_dnode(dnode);
1184
1185     if (!(dnode->inode->itype & VFS_IFDIR)) {
1186         errno = ENOTDIR;
1187         goto done;
1188     }
1189
1190     if (__current->cwd) {
1191         atomic_fetch_sub(&__current->cwd->ref_count, 1);
1192         mnt_chillax(__current->cwd->mnt);
1193     }
1194
1195     atomic_fetch_add(&dnode->ref_count, 1);
1196     mnt_mkbusy(dnode->mnt);
1197     __current->cwd = dnode;
1198
1199     unlock_dnode(dnode);
1200
1201 done:
1202     return errno;
1203 }
1204
1205 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1206 {
1207     struct v_dnode* dnode;
1208     int errno = 0;
1209
1210     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1211         goto done;
1212     }
1213
1214     errno = __vfs_do_chdir(dnode);
1215
1216 done:
1217     return DO_STATUS(errno);
1218 }
1219
1220 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1221 {
1222     struct v_fd* fd_s;
1223     int errno = 0;
1224
1225     if ((errno = vfs_getfd(fd, &fd_s))) {
1226         goto done;
1227     }
1228
1229     errno = __vfs_do_chdir(fd_s->file->dnode);
1230
1231 done:
1232     return DO_STATUS(errno);
1233 }
1234
1235 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1236 {
1237     int errno = 0;
1238     char* ret_ptr = 0;
1239     if (size < 2) {
1240         errno = ERANGE;
1241         goto done;
1242     }
1243
1244     size_t len = 0;
1245
1246     if (!__current->cwd) {
1247         *buf = VFS_PATH_DELIM;
1248         len = 1;
1249     } else {
1250         len = vfs_get_path(__current->cwd, buf, size, 0);
1251         if (len == size) {
1252             errno = ERANGE;
1253             goto done;
1254         }
1255     }
1256
1257     buf[len + 1] = '\0';
1258
1259     ret_ptr = buf;
1260
1261 done:
1262     __current->k_status = errno;
1263     return ret_ptr;
1264 }
1265
1266 int
1267 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1268 {
1269     int errno = 0;
1270     if (current->inode->id == target->inode->id) {
1271         // hard link
1272         return 0;
1273     }
1274
1275     if (errno = vfs_check_writable(current)) {
1276         return errno;
1277     }
1278
1279     if (current->ref_count > 1 || target->ref_count > 1) {
1280         return EBUSY;
1281     }
1282
1283     if (current->super_block != target->super_block) {
1284         return EXDEV;
1285     }
1286
1287     struct v_dnode* oldparent = current->parent;
1288     struct v_dnode* newparent = target->parent;
1289
1290     lock_dnode(current);
1291     lock_dnode(target);
1292     if (oldparent)
1293         lock_dnode(oldparent);
1294     if (newparent)
1295         lock_dnode(newparent);
1296
1297     if (!llist_empty(&target->children)) {
1298         errno = ENOTEMPTY;
1299         unlock_dnode(target);
1300         goto cleanup;
1301     }
1302
1303     if ((errno =
1304            current->inode->ops->rename(current->inode, current, target))) {
1305         unlock_dnode(target);
1306         goto cleanup;
1307     }
1308
1309     // re-position current
1310     hstrcpy(&current->name, &target->name);
1311     vfs_dcache_rehash(newparent, current);
1312
1313     // detach target
1314     vfs_d_free(target);
1315
1316     unlock_dnode(target);
1317
1318 cleanup:
1319     unlock_dnode(current);
1320     if (oldparent)
1321         unlock_dnode(oldparent);
1322     if (newparent)
1323         unlock_dnode(newparent);
1324
1325     return errno;
1326 }
1327
1328 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1329 {
1330     struct v_dnode *cur, *target_parent, *target;
1331     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1332     int errno = 0;
1333
1334     if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1335         goto done;
1336     }
1337
1338     if ((errno = vfs_walk(
1339            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1340         goto done;
1341     }
1342
1343     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1344     if (errno == ENOENT) {
1345         target = vfs_d_alloc(target_parent, &name);
1346         vfs_dcache_add(target_parent, target);
1347     } else if (errno) {
1348         goto done;
1349     }
1350
1351     if (!target) {
1352         errno = ENOMEM;
1353         goto done;
1354     }
1355
1356     errno = vfs_do_rename(cur, target);
1357
1358 done:
1359     vfree(name.value);
1360     return DO_STATUS(errno);
1361 }