lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
   4  * @brief Lunaix virtual file system - an abstraction layer for all file system.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
  14 /*
  15  TODO vfs & device todos checklist
  16
  17     It is overseen by Twilight Sparkle ;)
  18
  19  1. Get inodes hooked into lru (CHECKED)
  20  2. Get dnodes hooked into lru (CHECKED)
 3. Get inodes properly hashed so they can be reused by underlying fs (CHECKED)
  22  4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
  23         [good idea] or a constructor/destructor pattern in cake allocator ?
 5. (mount) Figure out a way to identify a busy mount point before unmount
            maybe a unified mount_point structure that maintains a reference
            counter on any dnodes within the subtree? Such a counter will only
            increment if a file is opened or a dnode is being used as working
            directory, and decrement conversely. (CHECKED)
  29  6. (mount) Ability to track all mount points (including sub-mounts)
  30             so we can be confident to clean up everything when we
  31             unmount. (CHECKED)
  32  7. (mount) Figure out a way to acquire the device represented by a dnode.
  33             so it can be used to mount. (e.g. we wish to get `struct device*`
  34             out of the dnode at /dev/sda)
  35             [tip] we should pay attention at twifs and add a private_data field
  36             under struct v_dnode? (CHECKED)
  37  8. (mount) Then, we should refactor on mount/unmount mechanism. (CHECKED)
  38  9. (mount) (future) Ability to mount any thing? e.g. Linux can mount a disk
  39                     image file using a so called "loopback" pseudo device. Maybe
  40                     we can do similar thing in Lunaix? A block device emulation
  41                     above the regular file when we mount it on.
  42  10. (device) device number (dev_t) allocation
  43             [good idea] <class>:<subclass>:<uniq_id> composition (CHECKED)
  44 */
  45
  46 #include <klibc/string.h>
  47 #include <lunaix/foptions.h>
  48 #include <lunaix/fs.h>
  49 #include <lunaix/mm/cake.h>
  50 #include <lunaix/mm/page.h>
  51 #include <lunaix/mm/valloc.h>
  52 #include <lunaix/process.h>
  53 #include <lunaix/spike.h>
  54 #include <lunaix/syscall.h>
  55 #include <lunaix/syscall_utils.h>
  56
  57 #include <lunaix/fs/twifs.h>
  58
  59 #include <usr/lunaix/dirent_defs.h>
  60
/* Dedicated cake piles, one per VFS object type. All are created in
 * vfs_init(). */
static struct cake_pile* dnode_pile;
static struct cake_pile* inode_pile;
static struct cake_pile* file_pile;
static struct cake_pile* superblock_pile;
static struct cake_pile* fd_pile;

/* Root dnode of the whole tree; allocated in vfs_init() and made its own
 * parent there. */
struct v_dnode* vfs_sysroot;
/* Bucket array of the dnode hash table (VFS_HASHTABLE_SIZE buckets),
 * allocated in vfs_init() and indexed through __dcache_hash(). */
static struct hbucket* dnode_cache;

/* LRU zones for dnodes and inodes; their eviction callbacks are
 * __vfs_try_evict_dnode() and __vfs_try_evict_inode() respectively. */
struct lru_zone *dnode_lru, *inode_lru;

/* Well-known path components. vfs_ddot and vfs_dot get their hashes computed
 * in vfs_init(); vfs_empty is used as the name of the root dnode. */
struct hstr vfs_ddot = HSTR("..", 2);
struct hstr vfs_dot = HSTR(".", 1);
struct hstr vfs_empty = HSTR("", 0);

/* Forward declarations of functions defined later in this file. */
struct v_superblock*
vfs_sb_alloc();

void
vfs_sb_free(struct v_superblock* sb);

static int
__vfs_try_evict_dnode(struct lru_node* obj);

static int
__vfs_try_evict_inode(struct lru_node* obj);
  87
  88 void
  89 vfs_init()
  90 {
  91     // 为他们专门创建一个蛋糕堆，而不使用valloc，这样我们可以最小化内碎片的产生
  92     dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
  93     inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
  94     file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
  95     fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
  96     superblock_pile =
  97       cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
  98
  99     dnode_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 100
 101     dnode_lru = lru_new_zone(__vfs_try_evict_dnode);
 102     inode_lru = lru_new_zone(__vfs_try_evict_inode);
 103
 104     hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
 105     hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
 106
 107     // 创建一个根dnode。
 108     vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
 109     vfs_sysroot->parent = vfs_sysroot;
 110     atomic_fetch_add(&vfs_sysroot->ref_count, 1);
 111 }
 112
 113 static inline struct hbucket*
 114 __dcache_hash(struct v_dnode* parent, u32_t* hash)
 115 {
 116     u32_t _hash = *hash;
 117     // 确保低位更加随机
 118     _hash = _hash ^ (_hash >> VFS_HASHBITS);
 119     // 与parent的指针值做加法，来减小碰撞的可能性。
 120     _hash += (u32_t)parent;
 121     *hash = _hash;
 122     return &dnode_cache[_hash & VFS_HASH_MASK];
 123 }
 124
 125 struct v_dnode*
 126 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 127 {
 128     if (!str->len || HSTR_EQ(str, &vfs_dot))
 129         return parent;
 130
 131     if (HSTR_EQ(str, &vfs_ddot)) {
 132         return parent->parent;
 133     }
 134
 135     u32_t hash = str->hash;
 136     struct hbucket* slot = __dcache_hash(parent, &hash);
 137
 138     struct v_dnode *pos, *n;
 139     hashtable_bucket_foreach(slot, pos, n, hash_list)
 140     {
 141         if (pos->name.hash == hash) {
 142             return pos;
 143         }
 144     }
 145     return NULL;
 146 }
 147
 148 void
 149 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 150 {
 151     assert(parent);
 152
 153     atomic_fetch_add(&dnode->ref_count, 1);
 154     dnode->parent = parent;
 155     llist_append(&parent->children, &dnode->siblings);
 156
 157     struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash);
 158     hlist_add(&bucket->head, &dnode->hash_list);
 159 }
 160
 161 void
 162 vfs_dcache_remove(struct v_dnode* dnode)
 163 {
 164     assert(dnode);
 165     assert(dnode->ref_count == 1);
 166
 167     llist_delete(&dnode->siblings);
 168     llist_delete(&dnode->aka_list);
 169     hlist_delete(&dnode->hash_list);
 170
 171     dnode->parent = NULL;
 172     atomic_fetch_sub(&dnode->ref_count, 1);
 173 }
 174
 175 void
 176 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
 177 {
 178     assert(new_parent);
 179
 180     hstr_rehash(&dnode->name, HSTR_FULL_HASH);
 181     vfs_dcache_remove(dnode);
 182     vfs_dcache_add(new_parent, dnode);
 183 }
 184
 185 int
 186 vfs_open(struct v_dnode* dnode, struct v_file** file)
 187 {
 188     if (!dnode->inode || !dnode->inode->ops->open) {
 189         return ENOTSUP;
 190     }
 191
 192     struct v_inode* inode = dnode->inode;
 193
 194     lock_inode(inode);
 195
 196     struct v_file* vfile = cake_grab(file_pile);
 197     memset(vfile, 0, sizeof(*vfile));
 198
 199     vfile->dnode = dnode;
 200     vfile->inode = inode;
 201     vfile->ref_count = ATOMIC_VAR_INIT(1);
 202     vfile->ops = inode->default_fops;
 203
 204     if ((inode->itype & F_MFILE) && !inode->pg_cache) {
 205         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 206         pcache_init(pcache);
 207         pcache->master = inode;
 208         inode->pg_cache = pcache;
 209     }
 210
 211     int errno = inode->ops->open(inode, vfile);
 212     if (errno) {
 213         cake_release(file_pile, vfile);
 214     } else {
 215         atomic_fetch_add(&dnode->ref_count, 1);
 216         inode->open_count++;
 217         mnt_mkbusy(dnode->mnt);
 218
 219         *file = vfile;
 220     }
 221
 222     unlock_inode(inode);
 223
 224     return errno;
 225 }
 226
 227 void
 228 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
 229 {
 230     if (assign_to->inode) {
 231         llist_delete(&assign_to->aka_list);
 232         assign_to->inode->link_count--;
 233     }
 234     llist_append(&inode->aka_dnodes, &assign_to->aka_list);
 235     assign_to->inode = inode;
 236     inode->link_count++;
 237 }
 238
 239 int
 240 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 241 {
 242     int errno;
 243
 244     if ((errno = vfs_check_writable(to_link))) {
 245         return errno;
 246     }
 247
 248     lock_inode(to_link->inode);
 249     if (to_link->super_block->root != name->super_block->root) {
 250         errno = EXDEV;
 251     } else if (!to_link->inode->ops->link) {
 252         errno = ENOTSUP;
 253     } else if (!(errno = to_link->inode->ops->link(to_link->inode, name))) {
 254         vfs_assign_inode(name, to_link->inode);
 255     }
 256     unlock_inode(to_link->inode);
 257
 258     return errno;
 259 }
 260
/**
 * Close `file` on behalf of process `pid`.
 *
 * While other references remain (ref_count > 1) only the reference count is
 * dropped. Otherwise the file's `close` operation is invoked and, if it
 * succeeds, the dnode reference and the inode's open_count are released, the
 * mount is un-busied, the page cache is committed and the v_file is returned
 * to its pile.
 *
 * @param file the open file to close
 * @param pid  process on whose behalf a held inode lock is released
 * @return 0 on success, or the error returned by `file->ops->close`
 */
int
vfs_pclose(struct v_file* file, pid_t pid)
{
    int errno = 0;
    if (file->ref_count > 1) {
        atomic_fetch_sub(&file->ref_count, 1);
    } else if (!(errno = file->ops->close(file))) {
        atomic_fetch_sub(&file->dnode->ref_count, 1);
        file->inode->open_count--;

        /*
         * Prevent a deadlock.
         * It happens when a process is terminated while blocked on a read:
         * the process is still holding the inode lock, which would then
         * never be released.
         * The unlock must also include an ownership check.
         *
         * To see why, consider two processes that both opened the same file
         * with fd=x.
         *      Process A: busy reading x
         *      Process B: does nothing with x
         * Assume that shortly afterwards process B gets terminated while
         * process A is still busy reading. Without the ownership check the
         * inode lock of x would be released by B rather than A, which opens
         * a probable race on A if another process writes to this file after
         * B exits.
         */
        if (mutex_on_hold(&file->inode->lock)) {
            mutex_unlock_for(&file->inode->lock, pid);
        }
        mnt_chillax(file->dnode->mnt);

        // Flush the page cache before the v_file goes away.
        pcache_commit_all(file->inode);
        cake_release(file_pile, file);
    }
    return errno;
}
 298
 299 int
 300 vfs_close(struct v_file* file)
 301 {
 302     return vfs_pclose(file, __current->pid);
 303 }
 304
 305 void
 306 vfs_free_fd(struct v_fd* fd)
 307 {
 308     cake_release(fd_pile, fd);
 309 }
 310
 311 int
 312 vfs_fsync(struct v_file* file)
 313 {
 314     int errno;
 315     if ((errno = vfs_check_writable(file->dnode))) {
 316         return errno;
 317     }
 318
 319     lock_inode(file->inode);
 320
 321     pcache_commit_all(file->inode);
 322
 323     errno = ENOTSUP;
 324     if (file->ops->sync) {
 325         errno = file->ops->sync(file);
 326     }
 327
 328     unlock_inode(file->inode);
 329
 330     return errno;
 331 }
 332
 333 int
 334 vfs_alloc_fdslot(int* fd)
 335 {
 336     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 337         if (!__current->fdtable->fds[i]) {
 338             *fd = i;
 339             return 0;
 340         }
 341     }
 342     return EMFILE;
 343 }
 344
 345 struct v_superblock*
 346 vfs_sb_alloc()
 347 {
 348     struct v_superblock* sb = cake_grab(superblock_pile);
 349     memset(sb, 0, sizeof(*sb));
 350     llist_init_head(&sb->sb_list);
 351     sb->i_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 352     return sb;
 353 }
 354
 355 void
 356 vfs_sb_free(struct v_superblock* sb)
 357 {
 358     vfree(sb->i_cache);
 359     cake_release(superblock_pile, sb);
 360 }
 361
 362 static int
 363 __vfs_try_evict_dnode(struct lru_node* obj)
 364 {
 365     struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
 366
 367     if (!dnode->ref_count) {
 368         vfs_d_free(dnode);
 369         return 1;
 370     }
 371     return 0;
 372 }
 373
 374 static int
 375 __vfs_try_evict_inode(struct lru_node* obj)
 376 {
 377     struct v_inode* inode = container_of(obj, struct v_inode, lru);
 378
 379     if (!inode->link_count && !inode->open_count) {
 380         vfs_i_free(inode);
 381         return 1;
 382     }
 383     return 0;
 384 }
 385
 386 struct v_dnode*
 387 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
 388 {
 389     struct v_dnode* dnode = cake_grab(dnode_pile);
 390     if (!dnode) {
 391         lru_evict_half(dnode_lru);
 392
 393         if (!(dnode = cake_grab(dnode_pile))) {
 394             return NULL;
 395         }
 396     }
 397
 398     memset(dnode, 0, sizeof(*dnode));
 399     llist_init_head(&dnode->children);
 400     llist_init_head(&dnode->siblings);
 401     llist_init_head(&dnode->aka_list);
 402     mutex_init(&dnode->lock);
 403
 404     dnode->ref_count = ATOMIC_VAR_INIT(0);
 405     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 406
 407     hstrcpy(&dnode->name, name);
 408
 409     if (parent) {
 410         dnode->super_block = parent->super_block;
 411         dnode->mnt = parent->mnt;
 412     }
 413
 414     lru_use_one(dnode_lru, &dnode->lru);
 415
 416     return dnode;
 417 }
 418
/**
 * Destroy a dnode: drop its link on the inode, remove it (and detach its
 * direct children) from the dcache, then free the name buffer and the dnode.
 *
 * The dnode is expected to hold exactly one reference.
 *
 * NOTE(review): vfs_d_alloc() hands out dnodes with ref_count 0, and callers
 * such as the mkdir syscall free a freshly allocated dnode on their error
 * path without ever adding it to the dcache - this looks like it would trip
 * the assertion below; confirm.
 */
void
vfs_d_free(struct v_dnode* dnode)
{
    assert(dnode->ref_count == 1);

    // The dnode no longer counts as a link to its inode.
    if (dnode->inode) {
        assert(dnode->inode->link_count > 0);
        dnode->inode->link_count--;
    }

    vfs_dcache_remove(dnode);
    // Make sure the children stop referencing their parent.
    // With the LRU in place, eviction will eventually propagate over the
    // entire detached subtree.
    // NOTE(review): vfs_dcache_remove() asserts ref_count == 1 on each child
    // as well - verify that children can never be in use at this point.
    struct v_dnode *pos, *n;
    llist_for_each(pos, n, &dnode->children, siblings)
    {
        vfs_dcache_remove(pos);
    }

    // The name buffer was obtained with vzalloc() in vfs_d_alloc().
    vfree((void*)dnode->name.value);
    cake_release(dnode_pile, dnode);
}
 442
 443 struct v_inode*
 444 vfs_i_find(struct v_superblock* sb, u32_t i_id)
 445 {
 446     struct hbucket* slot = &sb->i_cache[i_id & VFS_HASH_MASK];
 447     struct v_inode *pos, *n;
 448     hashtable_bucket_foreach(slot, pos, n, hash_list)
 449     {
 450         if (pos->id == i_id) {
 451             lru_use_one(inode_lru, &pos->lru);
 452             return pos;
 453         }
 454     }
 455
 456     return NULL;
 457 }
 458
 459 void
 460 vfs_i_addhash(struct v_inode* inode)
 461 {
 462     struct hbucket* slot = &inode->sb->i_cache[inode->id & VFS_HASH_MASK];
 463
 464     hlist_delete(&inode->hash_list);
 465     hlist_add(&slot->head, &inode->hash_list);
 466 }
 467
 468 struct v_inode*
 469 vfs_i_alloc(struct v_superblock* sb)
 470 {
 471     assert(sb->ops.init_inode);
 472
 473     struct v_inode* inode;
 474     if (!(inode = cake_grab(inode_pile))) {
 475         lru_evict_half(inode_lru);
 476         if (!(inode = cake_grab(inode_pile))) {
 477             return NULL;
 478         }
 479     }
 480
 481     memset(inode, 0, sizeof(*inode));
 482     mutex_init(&inode->lock);
 483     llist_init_head(&inode->xattrs);
 484     llist_init_head(&inode->aka_dnodes);
 485
 486     sb->ops.init_inode(sb, inode);
 487
 488     inode->sb = sb;
 489     inode->ctime = clock_unixtime();
 490     inode->atime = inode->ctime;
 491     inode->mtime = inode->ctime;
 492
 493     lru_use_one(inode_lru, &inode->lru);
 494     return inode;
 495 }
 496
 497 void
 498 vfs_i_free(struct v_inode* inode)
 499 {
 500     if (inode->pg_cache) {
 501         pcache_release(inode->pg_cache);
 502         vfree(inode->pg_cache);
 503     }
 504     // we don't need to sync inode.
 505     // If an inode can be free, then it must be properly closed.
 506     // Hence it must be synced already!
 507     if (inode->destruct) {
 508         inode->destruct(inode);
 509     }
 510     hlist_delete(&inode->hash_list);
 511     cake_release(inode_pile, inode);
 512 }
 513
 514 /* ---- System call definition and support ---- */
 515
/* Option bits for __vfs_try_locate_file(). */
/* Create the file when the lookup ends with ENOENT. */
#define FLOCATE_CREATE_EMPTY 1
/* Like CREATE_EMPTY, but report EEXIST unless the lookup ends with ENOENT. */
#define FLOCATE_CREATE_ONLY 2
/* Passed on to the path walk as VFS_WALK_NOFOLLOW. */
#define FLOCATE_NOFOLLOW 4
 519
 520 int
 521 vfs_getfd(int fd, struct v_fd** fd_s)
 522 {
 523     if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) {
 524         return 0;
 525     }
 526     return EBADF;
 527 }
 528
 529 int
 530 __vfs_try_locate_file(const char* path,
 531                       struct v_dnode** fdir,
 532                       struct v_dnode** file,
 533                       int options)
 534 {
 535     char name_str[VFS_NAME_MAXLEN];
 536     struct hstr name = HSTR(name_str, 0);
 537     int errno, woption = 0;
 538
 539     if ((options & FLOCATE_NOFOLLOW)) {
 540         woption |= VFS_WALK_NOFOLLOW;
 541     }
 542
 543     name_str[0] = 0;
 544     if ((errno = vfs_walk_proc(path, fdir, &name, woption | VFS_WALK_PARENT))) {
 545         return errno;
 546     }
 547
 548     errno = vfs_walk(*fdir, name.value, file, NULL, woption);
 549
 550     if (errno != ENOENT && (options & FLOCATE_CREATE_ONLY)) {
 551         return EEXIST;
 552     }
 553
 554     if (errno != ENOENT ||
 555         !(options & (FLOCATE_CREATE_EMPTY | FLOCATE_CREATE_ONLY))) {
 556         return errno;
 557     }
 558
 559     struct v_dnode* parent = *fdir;
 560     struct v_dnode* file_new = vfs_d_alloc(parent, &name);
 561
 562     if (!file_new) {
 563         return ENOMEM;
 564     }
 565
 566     lock_dnode(parent);
 567
 568     if (!(errno = parent->inode->ops->create(parent->inode, file_new))) {
 569         vfs_dcache_add(parent, file_new);
 570         *file = file_new;
 571     } else {
 572         vfs_d_free(file_new);
 573     }
 574
 575     unlock_dnode(parent);
 576
 577     return errno;
 578 }
 579
 580 int
 581 vfs_do_open(const char* path, int options)
 582 {
 583     int errno, fd, loptions = 0;
 584     struct v_dnode *dentry, *file;
 585     struct v_file* ofile = NULL;
 586
 587     if ((options & FO_CREATE)) {
 588         loptions |= FLOCATE_CREATE_EMPTY;
 589     } else if ((options & FO_NOFOLLOW)) {
 590         loptions |= FLOCATE_NOFOLLOW;
 591     }
 592
 593     errno = __vfs_try_locate_file(path, &dentry, &file, loptions);
 594
 595     if (!errno && !(errno = vfs_alloc_fdslot(&fd))) {
 596
 597         if (errno || (errno = vfs_open(file, &ofile))) {
 598             return errno;
 599         }
 600
 601         struct v_fd* fd_s = cake_grab(fd_pile);
 602         memset(fd_s, 0, sizeof(*fd_s));
 603
 604         ofile->f_pos = ofile->inode->fsize & -((options & FO_APPEND) != 0);
 605         fd_s->file = ofile;
 606         fd_s->flags = options;
 607         __current->fdtable->fds[fd] = fd_s;
 608         return fd;
 609     }
 610
 611     return errno;
 612 }
 613
 614 __DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
 615 {
 616     int errno = vfs_do_open(path, options);
 617     return DO_STATUS_OR_RETURN(errno);
 618 }
 619
 620 __DEFINE_LXSYSCALL1(int, close, int, fd)
 621 {
 622     struct v_fd* fd_s;
 623     int errno = 0;
 624     if ((errno = vfs_getfd(fd, &fd_s))) {
 625         goto done_err;
 626     }
 627
 628     if ((errno = vfs_close(fd_s->file))) {
 629         goto done_err;
 630     }
 631
 632     cake_release(fd_pile, fd_s);
 633     __current->fdtable->fds[fd] = 0;
 634
 635 done_err:
 636     return DO_STATUS(errno);
 637 }
 638
 639 void
 640 __vfs_readdir_callback(struct dir_context* dctx,
 641                        const char* name,
 642                        const int len,
 643                        const int dtype)
 644 {
 645     struct lx_dirent* dent = (struct lx_dirent*)dctx->cb_data;
 646     strncpy(dent->d_name, name, DIRENT_NAME_MAX_LEN);
 647     dent->d_nlen = len;
 648     dent->d_type = dtype;
 649 }
 650
/**
 * readdir system call: read the directory entry at `dent->d_offset` from
 * the directory open on `fd` into `dent`, then advance `dent->d_offset`.
 *
 * Offsets 0 and 1 are synthesised here as "." and ".."; every other offset
 * is forwarded to the file's `readdir` operation with the index shifted
 * down by two.
 *
 * The status is 1 when an entry was produced. Any other value coming back
 * from the `readdir` operation is returned as is, without advancing the
 * offset. ENOTDIR is reported when the inode has F_FILE set.
 */
__DEFINE_LXSYSCALL2(int, sys_readdir, int, fd, struct lx_dirent*, dent)
{
    struct v_fd* fd_s;
    int errno;

    if ((errno = vfs_getfd(fd, &fd_s))) {
        goto done;
    }

    struct v_inode* inode = fd_s->file->inode;

    lock_inode(inode);

    if ((inode->itype & F_FILE)) {
        errno = ENOTDIR;
    } else {
        struct dir_context dctx =
          (struct dir_context){ .cb_data = dent,
                                .index = dent->d_offset,
                                .read_complete_callback =
                                  __vfs_readdir_callback };
        // 1 means "one entry delivered"; the two synthetic entries below
        // always succeed.
        errno = 1;
        if (dent->d_offset == 0) {
            __vfs_readdir_callback(&dctx, vfs_dot.value, vfs_dot.len, DT_DIR);
        } else if (dent->d_offset == 1) {
            __vfs_readdir_callback(&dctx, vfs_ddot.value, vfs_ddot.len, DT_DIR);
        } else {
            // The file system does not know about "." and "..".
            dctx.index -= 2;
            if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
                // No entry delivered: keep the offset where it is.
                unlock_inode(inode);
                goto done;
            }
        }
        dent->d_offset++;
    }

    unlock_inode(inode);

done:
    return DO_STATUS_OR_RETURN(errno);
}
 692
 693 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 694 {
 695     int errno = 0;
 696     struct v_fd* fd_s;
 697     if ((errno = vfs_getfd(fd, &fd_s))) {
 698         goto done;
 699     }
 700
 701     struct v_file* file = fd_s->file;
 702     if (!(file->inode->itype & F_FILE)) {
 703         errno = EISDIR;
 704         goto done;
 705     }
 706
 707     lock_inode(file->inode);
 708
 709     file->inode->atime = clock_unixtime();
 710
 711     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 712         errno = file->ops->read(file->inode, buf, count, file->f_pos);
 713     } else {
 714         errno = pcache_read(file->inode, buf, count, file->f_pos);
 715     }
 716
 717     if (errno > 0) {
 718         file->f_pos += errno;
 719         unlock_inode(file->inode);
 720         return errno;
 721     }
 722
 723     unlock_inode(file->inode);
 724
 725 done:
 726     return DO_STATUS(errno);
 727 }
 728
 729 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
 730 {
 731     int errno = 0;
 732     struct v_fd* fd_s;
 733     if ((errno = vfs_getfd(fd, &fd_s))) {
 734         goto done;
 735     }
 736
 737     struct v_file* file = fd_s->file;
 738
 739     if ((errno = vfs_check_writable(file->dnode))) {
 740         goto done;
 741     }
 742
 743     if (!(file->inode->itype & F_FILE)) {
 744         errno = EISDIR;
 745         goto done;
 746     }
 747
 748     lock_inode(file->inode);
 749
 750     file->inode->mtime = clock_unixtime();
 751
 752     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 753         errno = file->ops->write(file->inode, buf, count, file->f_pos);
 754     } else {
 755         errno = pcache_write(file->inode, buf, count, file->f_pos);
 756     }
 757
 758     if (errno > 0) {
 759         file->f_pos += errno;
 760         unlock_inode(file->inode);
 761         return errno;
 762     }
 763
 764     unlock_inode(file->inode);
 765
 766 done:
 767     return DO_STATUS(errno);
 768 }
 769
 770 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
 771 {
 772     int errno = 0;
 773     struct v_fd* fd_s;
 774     if ((errno = vfs_getfd(fd, &fd_s))) {
 775         goto done;
 776     }
 777
 778     struct v_file* file = fd_s->file;
 779
 780     if (!file->ops->seek) {
 781         errno = ENOTSUP;
 782         goto done;
 783     }
 784
 785     lock_inode(file->inode);
 786
 787     int overflow = 0;
 788     int fpos = file->f_pos;
 789     switch (options) {
 790         case FSEEK_CUR:
 791             overflow = __builtin_sadd_overflow((int)file->f_pos, offset, &fpos);
 792             break;
 793         case FSEEK_END:
 794             overflow =
 795               __builtin_sadd_overflow((int)file->inode->fsize, offset, &fpos);
 796             break;
 797         case FSEEK_SET:
 798             fpos = offset;
 799             break;
 800     }
 801     if (overflow) {
 802         errno = EOVERFLOW;
 803     } else if (!(errno = file->ops->seek(file->inode, fpos))) {
 804         file->f_pos = fpos;
 805     }
 806
 807     unlock_inode(file->inode);
 808
 809 done:
 810     return DO_STATUS(errno);
 811 }
 812
 813 int
 814 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
 815 {
 816     if (!dnode) {
 817         return 0;
 818     }
 819
 820     if (depth > 64) {
 821         return ENAMETOOLONG;
 822     }
 823
 824     size_t len = 0;
 825
 826     if (dnode->parent != dnode) {
 827         len = vfs_get_path(dnode->parent, buf, size, depth + 1);
 828     }
 829
 830     if (len >= size) {
 831         return len;
 832     }
 833
 834     if (!len || buf[len - 1] != VFS_PATH_DELIM) {
 835         buf[len++] = VFS_PATH_DELIM;
 836     }
 837
 838     size_t cpy_size = MIN(dnode->name.len, size - len);
 839     strncpy(buf + len, dnode->name.value, cpy_size);
 840     len += cpy_size;
 841
 842     return len;
 843 }
 844
 845 int
 846 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
 847 {
 848     const char* link;
 849     struct v_inode* inode = dnode->inode;
 850     if (inode->ops->read_symlink) {
 851         lock_inode(inode);
 852
 853         int errno = inode->ops->read_symlink(inode, &link);
 854         strncpy(buf, link, size);
 855
 856         unlock_inode(inode);
 857         return errno;
 858     }
 859     return 0;
 860 }
 861
 862 int
 863 vfs_get_dtype(int itype)
 864 {
 865     if ((itype & VFS_IFSYMLINK)) {
 866         return DT_SYMLINK;
 867     } else if (!(itype & VFS_IFFILE)) {
 868         return DT_DIR;
 869     } else {
 870         return DT_FILE;
 871     }
 872 }
 873
 874 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
 875 {
 876     int errno;
 877     struct v_fd* fd_s;
 878     if ((errno = vfs_getfd(fd, &fd_s))) {
 879         goto done;
 880     }
 881
 882     struct v_dnode* dnode;
 883     errno = vfs_get_path(fd_s->file->dnode, buf, size, 0);
 884
 885     if (errno >= 0) {
 886         return errno;
 887     }
 888
 889 done:
 890     return DO_STATUS(errno);
 891 }
 892
 893 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
 894 {
 895     int errno;
 896     struct v_dnode* dnode;
 897     if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 898         errno = vfs_readlink(dnode, buf, size);
 899     }
 900
 901     if (errno >= 0) {
 902         return errno;
 903     }
 904
 905     return DO_STATUS(errno);
 906 }
 907
 908 __DEFINE_LXSYSCALL4(int,
 909                     readlinkat,
 910                     int,
 911                     dirfd,
 912                     const char*,
 913                     pathname,
 914                     char*,
 915                     buf,
 916                     size_t,
 917                     size)
 918 {
 919     int errno;
 920     struct v_fd* fd_s;
 921     if ((errno = vfs_getfd(dirfd, &fd_s))) {
 922         goto done;
 923     }
 924
 925     pathname = pathname ? pathname : "";
 926
 927     struct v_dnode* dnode;
 928     if (!(errno = vfs_walk(
 929             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 930         errno = vfs_readlink(fd_s->file->dnode, buf, size);
 931     }
 932
 933     if (errno >= 0) {
 934         return errno;
 935     }
 936
 937 done:
 938     return DO_STATUS(errno);
 939 }
 940
/*
    NOTE
    Whenever we perform an operation that could affect the layout of a
    directory (i.e., rename, mkdir, rmdir), we must lock the parent dir
    whenever possible. This blocks any ongoing path walk from reaching
    it and hence avoids exposing any partial state.
*/
 948
/**
 * rmdir(pathname) system call: remove an empty directory.
 *
 * Status: the error from walking the path or from vfs_check_writable();
 * EROFS on a file system flagged FSTYPE_ROFS; EBUSY while the dnode is
 * referenced more than once or its inode is open; ENOTEMPTY while it has
 * cached children; EINVAL when it has no parent; ENOTDIR when its inode has
 * F_MFILE set; otherwise the result of the parent inode's `rmdir` operation.
 */
__DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
{
    int errno;
    struct v_dnode* dnode;
    if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
        return DO_STATUS(errno);
    }

    // From here on every exit goes through `done`, which unlocks dnode.
    lock_dnode(dnode);

    if ((errno = vfs_check_writable(dnode))) {
        goto done;
    }

    if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
        errno = EROFS;
        goto done;
    }

    // More than the dcache's own reference, or an open file, means in use.
    if (dnode->ref_count > 1 || dnode->inode->open_count) {
        errno = EBUSY;
        goto done;
    }

    if (!llist_empty(&dnode->children)) {
        errno = ENOTEMPTY;
        goto done;
    }

    struct v_dnode* parent = dnode->parent;

    if (!parent) {
        errno = EINVAL;
        goto done;
    }

    // Lock the parent so that no path walk observes a half-removed entry.
    lock_dnode(parent);
    lock_inode(parent->inode);

    if (!(dnode->inode->itype & F_MFILE)) {
        // NOTE(review): ops->rmdir is called without a NULL check - confirm
        // every file system provides it.
        errno = parent->inode->ops->rmdir(parent->inode, dnode);
        if (!errno) {
            vfs_dcache_remove(dnode);
        }
    } else {
        errno = ENOTDIR;
    }

    unlock_inode(parent->inode);
    unlock_dnode(parent);

done:
    unlock_dnode(dnode);
    return DO_STATUS(errno);
}
1004
1005 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
1006 {
1007     int errno = 0;
1008     struct v_dnode *parent, *dir;
1009     char name_value[VFS_NAME_MAXLEN];
1010     struct hstr name = HHSTR(name_value, 0, 0);
1011
1012     if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
1013         goto done;
1014     }
1015
1016     if ((errno = vfs_check_writable(parent))) {
1017         goto done;
1018     }
1019
1020     if (!(dir = vfs_d_alloc(parent, &name))) {
1021         errno = ENOMEM;
1022         goto done;
1023     }
1024
1025     lock_dnode(parent);
1026     lock_inode(parent->inode);
1027
1028     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
1029         errno = ENOTSUP;
1030     } else if (!parent->inode->ops->mkdir) {
1031         errno = ENOTSUP;
1032     } else if ((parent->inode->itype & F_FILE)) {
1033         errno = ENOTDIR;
1034     } else if (!(errno = parent->inode->ops->mkdir(parent->inode, dir))) {
1035         vfs_dcache_add(parent, dir);
1036         goto cleanup;
1037     }
1038
1039     vfs_d_free(dir);
1040
1041 cleanup:
1042     unlock_inode(parent->inode);
1043     unlock_dnode(parent);
1044 done:
1045     return DO_STATUS(errno);
1046 }
1047
1048 int
1049 __vfs_do_unlink(struct v_dnode* dnode)
1050 {
1051     int errno;
1052     struct v_inode* inode = dnode->inode;
1053
1054     if (dnode->ref_count > 1) {
1055         return EBUSY;
1056     }
1057
1058     if ((errno = vfs_check_writable(dnode))) {
1059         return errno;
1060     }
1061
1062     lock_inode(inode);
1063
1064     if (inode->open_count) {
1065         errno = EBUSY;
1066     } else if ((inode->itype & F_MFILE)) {
1067         errno = inode->ops->unlink(inode);
1068         if (!errno) {
1069             vfs_d_free(dnode);
1070         }
1071     } else {
1072         errno = EISDIR;
1073     }
1074
1075     unlock_inode(inode);
1076
1077     return errno;
1078 }
1079
1080 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1081 {
1082     int errno;
1083     struct v_dnode* dnode;
1084     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1085         goto done;
1086     }
1087
1088     errno = __vfs_do_unlink(dnode);
1089
1090 done:
1091     return DO_STATUS(errno);
1092 }
1093
1094 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1095 {
1096     int errno;
1097     struct v_fd* fd_s;
1098     if ((errno = vfs_getfd(fd, &fd_s))) {
1099         goto done;
1100     }
1101
1102     struct v_dnode* dnode;
1103     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1104         errno = __vfs_do_unlink(dnode);
1105     }
1106
1107 done:
1108     return DO_STATUS(errno);
1109 }
1110
1111 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1112 {
1113     int errno;
1114     struct v_dnode *dentry, *to_link, *name_dentry, *name_file;
1115
1116     errno = __vfs_try_locate_file(oldpath, &dentry, &to_link, 0);
1117     if (!errno) {
1118         errno = __vfs_try_locate_file(
1119           newpath, &name_dentry, &name_file, FLOCATE_CREATE_ONLY);
1120         if (!errno) {
1121             errno = vfs_link(to_link, name_file);
1122         }
1123     }
1124     return DO_STATUS(errno);
1125 }
1126
1127 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1128 {
1129     int errno;
1130     struct v_fd* fd_s;
1131
1132     if (!(errno = vfs_getfd(fildes, &fd_s))) {
1133         errno = vfs_fsync(fd_s->file);
1134     }
1135
1136     return DO_STATUS(errno);
1137 }
1138
1139 int
1140 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1141 {
1142     int errno = 0;
1143     struct v_fd* copied = cake_grab(fd_pile);
1144
1145     memcpy(copied, old, sizeof(struct v_fd));
1146
1147     atomic_fetch_add(&old->file->ref_count, 1);
1148
1149     *new = copied;
1150
1151     return errno;
1152 }
1153
1154 int
1155 vfs_dup2(int oldfd, int newfd)
1156 {
1157     if (newfd == oldfd) {
1158         return newfd;
1159     }
1160
1161     int errno;
1162     struct v_fd *oldfd_s, *newfd_s;
1163     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1164         goto done;
1165     }
1166
1167     if (!TEST_FD(newfd)) {
1168         errno = EBADF;
1169         goto done;
1170     }
1171
1172     newfd_s = __current->fdtable->fds[newfd];
1173     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1174         goto done;
1175     }
1176
1177     if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1178         __current->fdtable->fds[newfd] = newfd_s;
1179         return newfd;
1180     }
1181
1182 done:
1183     return DO_STATUS(errno);
1184 }
1185
1186 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1187 {
1188     return vfs_dup2(oldfd, newfd);
1189 }
1190
1191 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1192 {
1193     int errno, newfd;
1194     struct v_fd *oldfd_s, *newfd_s;
1195     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1196         goto done;
1197     }
1198
1199     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1200         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1201         __current->fdtable->fds[newfd] = newfd_s;
1202         return newfd;
1203     }
1204
1205 done:
1206     return DO_STATUS(errno);
1207 }
1208
1209 __DEFINE_LXSYSCALL2(int,
1210                     symlink,
1211                     const char*,
1212                     pathname,
1213                     const char*,
1214                     link_target)
1215 {
1216     int errno;
1217     struct v_dnode *dnode, *file;
1218     if ((errno = __vfs_try_locate_file(
1219            pathname, &dnode, &file, FLOCATE_CREATE_ONLY))) {
1220         goto done;
1221     }
1222
1223     if ((errno = vfs_check_writable(file))) {
1224         goto done;
1225     }
1226
1227     if (!file->inode->ops->set_symlink) {
1228         errno = ENOTSUP;
1229         goto done;
1230     }
1231
1232     lock_inode(file->inode);
1233
1234     errno = file->inode->ops->set_symlink(file->inode, link_target);
1235
1236     unlock_inode(file->inode);
1237
1238 done:
1239     return DO_STATUS(errno);
1240 }
1241
1242 void
1243 vfs_ref_file(struct v_file* file)
1244 {
1245     atomic_fetch_add(&file->ref_count, 1);
1246 }
1247
1248 void
1249 vfs_ref_dnode(struct v_dnode* dnode)
1250 {
1251     atomic_fetch_add(&dnode->ref_count, 1);
1252     mnt_mkbusy(dnode->mnt);
1253 }
1254
1255 void
1256 vfs_unref_dnode(struct v_dnode* dnode)
1257 {
1258     atomic_fetch_sub(&dnode->ref_count, 1);
1259     mnt_chillax(dnode->mnt);
1260 }
1261
1262 int
1263 vfs_do_chdir(struct proc_info* proc, struct v_dnode* dnode)
1264 {
1265     int errno = 0;
1266
1267     lock_dnode(dnode);
1268
1269     if ((dnode->inode->itype & F_FILE)) {
1270         errno = ENOTDIR;
1271         goto done;
1272     }
1273
1274     if (proc->cwd) {
1275         vfs_unref_dnode(proc->cwd);
1276     }
1277
1278     vfs_ref_dnode(dnode);
1279     proc->cwd = dnode;
1280
1281     unlock_dnode(dnode);
1282
1283 done:
1284     return errno;
1285 }
1286
1287 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1288 {
1289     struct v_dnode* dnode;
1290     int errno = 0;
1291
1292     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1293         goto done;
1294     }
1295
1296     errno = vfs_do_chdir((struct proc_info*)__current, dnode);
1297
1298 done:
1299     return DO_STATUS(errno);
1300 }
1301
1302 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1303 {
1304     struct v_fd* fd_s;
1305     int errno = 0;
1306
1307     if ((errno = vfs_getfd(fd, &fd_s))) {
1308         goto done;
1309     }
1310
1311     errno = vfs_do_chdir((struct proc_info*)__current, fd_s->file->dnode);
1312
1313 done:
1314     return DO_STATUS(errno);
1315 }
1316
1317 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1318 {
1319     int errno = 0;
1320     char* ret_ptr = 0;
1321     if (size < 2) {
1322         errno = ERANGE;
1323         goto done;
1324     }
1325
1326     size_t len = 0;
1327
1328     if (!__current->cwd) {
1329         *buf = VFS_PATH_DELIM;
1330         len = 1;
1331     } else {
1332         len = vfs_get_path(__current->cwd, buf, size, 0);
1333         if (len == size) {
1334             errno = ERANGE;
1335             goto done;
1336         }
1337     }
1338
1339     buf[len] = '\0';
1340
1341     ret_ptr = buf;
1342
1343 done:
1344     __current->k_status = errno;
1345     return ret_ptr;
1346 }
1347
1348 int
1349 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1350 {
1351     int errno = 0;
1352     if (current->inode->id == target->inode->id) {
1353         // hard link
1354         return 0;
1355     }
1356
1357     if ((errno = vfs_check_writable(current))) {
1358         return errno;
1359     }
1360
1361     if (current->ref_count > 1 || target->ref_count > 1) {
1362         return EBUSY;
1363     }
1364
1365     if (current->super_block != target->super_block) {
1366         return EXDEV;
1367     }
1368
1369     struct v_dnode* oldparent = current->parent;
1370     struct v_dnode* newparent = target->parent;
1371
1372     lock_dnode(current);
1373     lock_dnode(target);
1374     if (oldparent)
1375         lock_dnode(oldparent);
1376     if (newparent)
1377         lock_dnode(newparent);
1378
1379     if (!llist_empty(&target->children)) {
1380         errno = ENOTEMPTY;
1381         unlock_dnode(target);
1382         goto cleanup;
1383     }
1384
1385     if ((errno =
1386            current->inode->ops->rename(current->inode, current, target))) {
1387         unlock_dnode(target);
1388         goto cleanup;
1389     }
1390
1391     // re-position current
1392     hstrcpy(&current->name, &target->name);
1393     vfs_dcache_rehash(newparent, current);
1394
1395     // detach target
1396     vfs_d_free(target);
1397
1398     unlock_dnode(target);
1399
1400 cleanup:
1401     unlock_dnode(current);
1402     if (oldparent)
1403         unlock_dnode(oldparent);
1404     if (newparent)
1405         unlock_dnode(newparent);
1406
1407     return errno;
1408 }
1409
1410 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1411 {
1412     struct v_dnode *cur, *target_parent, *target;
1413     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1414     int errno = 0;
1415
1416     if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1417         goto done;
1418     }
1419
1420     if ((errno = vfs_walk(
1421            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1422         goto done;
1423     }
1424
1425     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1426     if (errno == ENOENT) {
1427         target = vfs_d_alloc(target_parent, &name);
1428         vfs_dcache_add(target_parent, target);
1429     } else if (errno) {
1430         goto done;
1431     }
1432
1433     if (!target) {
1434         errno = ENOMEM;
1435         goto done;
1436     }
1437
1438     errno = vfs_do_rename(cur, target);
1439
1440 done:
1441     vfree((void*)name.value);
1442     return DO_STATUS(errno);
1443 }
1444
1445 __DEFINE_LXSYSCALL2(int, fstat, int, fd, struct file_stat*, stat)
1446 {
1447     int errno = 0;
1448     struct v_fd* fds;
1449
1450     if ((errno = vfs_getfd(fd, &fds))) {
1451         goto done;
1452     }
1453
1454     struct v_inode* vino = fds->file->inode;
1455     struct device* fdev = vino->sb->dev;
1456
1457     *stat = (struct file_stat){ .st_ino = vino->id,
1458                                 .st_blocks = vino->lb_usage,
1459                                 .st_size = vino->fsize,
1460                                 .mode = vino->itype,
1461                                 .st_ioblksize = PG_SIZE,
1462                                 .st_blksize = vino->sb->blksize };
1463
1464     if (VFS_DEVFILE(vino->itype)) {
1465         struct device* rdev = (struct device*)vino->data;
1466         if (!rdev || rdev->magic != DEV_STRUCT_MAGIC) {
1467             errno = EINVAL;
1468             goto done;
1469         }
1470
1471         stat->st_rdev = (dev_t){ .meta = rdev->ident.fn_grp,
1472                                  .unique = rdev->ident.unique,
1473                                  .index = rdev->dev_uid };
1474     }
1475
1476     if (fdev) {
1477         stat->st_dev = (dev_t){ .meta = fdev->ident.fn_grp,
1478                                 .unique = fdev->ident.unique,
1479                                 .index = fdev->dev_uid };
1480     }
1481
1482 done:
1483     return DO_STATUS(errno);
1484 }