lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
   4  * @brief Lunaix virtual file system - an abstraction layer for all file systems.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
  14 /*
  15  TODO vfs & device todos checklist
  16
  17     It is overseen by Twilight Sparkle ;)
  18
  19  1. Get inodes hooked into lru (CHECKED)
  20  2. Get dnodes hooked into lru (CHECKED)
  21  3. Get inodes properly hashed so they can be reused by the underlying fs (CHECKED)
  22  4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
  23         [good idea] or a constructor/destructor pattern in cake allocator ?
  24  5. (mount) Figure out a way to identify a busy mount point before unmount
  25             maybe a unified mount_point structure that maintains a reference
  26             counter on any dnodes within the subtree? Such a counter will only
  27             increment if a file is opened or a dnode is being used as working
  28             directory, and decrement conversely. (CHECKED)
  29  6. (mount) Ability to track all mount points (including sub-mounts)
  30             so we can be confident to clean up everything when we
  31             unmount. (CHECKED)
  32  7. (mount) Figure out a way to acquire the device represented by a dnode.
  33             so it can be used to mount. (e.g. we wish to get `struct device*`
  34             out of the dnode at /dev/sda)
  35             [tip] we should pay attention at twifs and add a private_data field
  36             under struct v_dnode? (CHECKED)
  37  8. (mount) Then, we should refactor on mount/unmount mechanism. (CHECKED)
  38  9. (mount) (future) Ability to mount anything? e.g. Linux can mount a disk
  39                     image file using a so-called "loopback" pseudo device. Maybe
  40                     we can do a similar thing in Lunaix? A block device emulation
  41                     on top of the regular file when we mount it.
  42  10. (device) device number (dev_t) allocation
  43             [good idea] <class>:<subclass>:<uniq_id> composition
  44 */
  45
  46 #include <klibc/string.h>
  47 #include <lunaix/foptions.h>
  48 #include <lunaix/fs.h>
  49 #include <lunaix/mm/cake.h>
  50 #include <lunaix/mm/page.h>
  51 #include <lunaix/mm/valloc.h>
  52 #include <lunaix/process.h>
  53 #include <lunaix/spike.h>
  54 #include <lunaix/syscall.h>
  55 #include <lunaix/syscall_utils.h>
  56
  57 #include <lunaix/fs/twifs.h>
  58
  59 #include <sys/dirent_defs.h>
  60
  61 static struct cake_pile* dnode_pile;
  62 static struct cake_pile* inode_pile;
  63 static struct cake_pile* file_pile;
  64 static struct cake_pile* superblock_pile;
  65 static struct cake_pile* fd_pile;
  66
  67 struct v_dnode* vfs_sysroot;
  68 static struct hbucket* dnode_cache;
  69
  70 struct lru_zone *dnode_lru, *inode_lru;
  71
  72 struct hstr vfs_ddot = HSTR("..", 2);
  73 struct hstr vfs_dot = HSTR(".", 1);
  74 struct hstr vfs_empty = HSTR("", 0);
  75
  76 struct v_superblock*
  77 vfs_sb_alloc();
  78
  79 void
  80 vfs_sb_free(struct v_superblock* sb);
  81
  82 static int
  83 __vfs_try_evict_dnode(struct lru_node* obj);
  84
  85 static int
  86 __vfs_try_evict_inode(struct lru_node* obj);
  87
  88 void
  89 vfs_init()
  90 {
  91     // 为他们专门创建一个蛋糕堆，而不使用valloc，这样我们可以最小化内碎片的产生
  92     dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
  93     inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
  94     file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
  95     fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
  96     superblock_pile =
  97       cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
  98
  99     dnode_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 100
 101     dnode_lru = lru_new_zone(__vfs_try_evict_dnode);
 102     inode_lru = lru_new_zone(__vfs_try_evict_inode);
 103
 104     hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
 105     hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
 106
 107     // 创建一个根dnode。
 108     vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
 109     vfs_sysroot->parent = vfs_sysroot;
 110     atomic_fetch_add(&vfs_sysroot->ref_count, 1);
 111 }
 112
 113 static inline struct hbucket*
 114 __dcache_hash(struct v_dnode* parent, u32_t* hash)
 115 {
 116     u32_t _hash = *hash;
 117     // 确保低位更加随机
 118     _hash = _hash ^ (_hash >> VFS_HASHBITS);
 119     // 与parent的指针值做加法，来减小碰撞的可能性。
 120     _hash += (u32_t)parent;
 121     *hash = _hash;
 122     return &dnode_cache[_hash & VFS_HASH_MASK];
 123 }
 124
 125 struct v_dnode*
 126 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 127 {
 128     if (!str->len || HSTR_EQ(str, &vfs_dot))
 129         return parent;
 130
 131     if (HSTR_EQ(str, &vfs_ddot)) {
 132         return parent->parent;
 133     }
 134
 135     u32_t hash = str->hash;
 136     struct hbucket* slot = __dcache_hash(parent, &hash);
 137
 138     struct v_dnode *pos, *n;
 139     hashtable_bucket_foreach(slot, pos, n, hash_list)
 140     {
 141         if (pos->name.hash == hash) {
 142             return pos;
 143         }
 144     }
 145     return NULL;
 146 }
 147
 148 void
 149 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 150 {
 151     assert(parent);
 152
 153     atomic_fetch_add(&dnode->ref_count, 1);
 154     dnode->parent = parent;
 155     llist_append(&parent->children, &dnode->siblings);
 156
 157     struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash);
 158     hlist_add(&bucket->head, &dnode->hash_list);
 159 }
 160
 161 void
 162 vfs_dcache_remove(struct v_dnode* dnode)
 163 {
 164     assert(dnode);
 165     assert(dnode->ref_count == 1);
 166
 167     llist_delete(&dnode->siblings);
 168     llist_delete(&dnode->aka_list);
 169     hlist_delete(&dnode->hash_list);
 170
 171     dnode->parent = NULL;
 172     atomic_fetch_sub(&dnode->ref_count, 1);
 173 }
 174
 175 void
 176 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
 177 {
 178     assert(new_parent);
 179
 180     hstr_rehash(&dnode->name, HSTR_FULL_HASH);
 181     vfs_dcache_remove(dnode);
 182     vfs_dcache_add(new_parent, dnode);
 183 }
 184
 185 int
 186 vfs_open(struct v_dnode* dnode, struct v_file** file)
 187 {
 188     if (!dnode->inode || !dnode->inode->ops->open) {
 189         return ENOTSUP;
 190     }
 191
 192     struct v_inode* inode = dnode->inode;
 193
 194     lock_inode(inode);
 195
 196     struct v_file* vfile = cake_grab(file_pile);
 197     memset(vfile, 0, sizeof(*vfile));
 198
 199     vfile->dnode = dnode;
 200     vfile->inode = inode;
 201     vfile->ref_count = ATOMIC_VAR_INIT(1);
 202     vfile->ops = inode->default_fops;
 203
 204     if ((inode->itype & VFS_IFFILE) && !inode->pg_cache) {
 205         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 206         pcache_init(pcache);
 207         pcache->master = inode;
 208         inode->pg_cache = pcache;
 209     }
 210
 211     int errno = inode->ops->open(inode, vfile);
 212     if (errno) {
 213         cake_release(file_pile, vfile);
 214     } else {
 215         atomic_fetch_add(&dnode->ref_count, 1);
 216         inode->open_count++;
 217         mnt_mkbusy(dnode->mnt);
 218
 219         *file = vfile;
 220     }
 221
 222     unlock_inode(inode);
 223
 224     return errno;
 225 }
 226
 227 void
 228 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
 229 {
 230     if (assign_to->inode) {
 231         llist_delete(&assign_to->aka_list);
 232         assign_to->inode->link_count--;
 233     }
 234     llist_append(&inode->aka_dnodes, &assign_to->aka_list);
 235     assign_to->inode = inode;
 236     inode->link_count++;
 237 }
 238
 239 int
 240 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 241 {
 242     int errno;
 243
 244     if ((errno = vfs_check_writable(to_link))) {
 245         return errno;
 246     }
 247
 248     lock_inode(to_link->inode);
 249     if (to_link->super_block->root != name->super_block->root) {
 250         errno = EXDEV;
 251     } else if (!to_link->inode->ops->link) {
 252         errno = ENOTSUP;
 253     } else if (!(errno = to_link->inode->ops->link(to_link->inode, name))) {
 254         vfs_assign_inode(name, to_link->inode);
 255     }
 256     unlock_inode(to_link->inode);
 257
 258     return errno;
 259 }
 260
 261 int
 262 vfs_pclose(struct v_file* file, pid_t pid)
 263 {
 264     int errno = 0;
 265     if (file->ref_count > 1) {
 266         atomic_fetch_sub(&file->ref_count, 1);
 267     } else if (!(errno = file->ops->close(file))) {
 268         atomic_fetch_sub(&file->dnode->ref_count, 1);
 269         file->inode->open_count--;
 270
 271         /*
 272          * Prevent dead lock.
 273          * This happened when process is terminated while blocking on read.
 274          * In that case, the process is still holding the inode lock and it
 275              will never get released.
 276          * The unlocking should also include ownership check.
 277          *
 278          * To see why, consider two process both open the same file both with
 279          * fd=x.
 280          *      Process A: busy on reading x
 281          *      Process B: do nothing with x
 282          * Assuming that, after a very short time, process B get terminated
 283          * while process A is still busy in it's reading business. By this
 284          * design, the inode lock of this file x is get released by B rather
 285          * than A. And this will cause a probable race condition on A if other
 286          * process is writing to this file later after B exit.
 287          */
 288         if (mutex_on_hold(&file->inode->lock)) {
 289             mutex_unlock_for(&file->inode->lock, pid);
 290         }
 291         mnt_chillax(file->dnode->mnt);
 292
 293         pcache_commit_all(file->inode);
 294         cake_release(file_pile, file);
 295     }
 296     return errno;
 297 }
 298
 299 int
 300 vfs_close(struct v_file* file)
 301 {
 302     return vfs_pclose(file, __current->pid);
 303 }
 304
 305 void
 306 vfs_free_fd(struct v_fd* fd)
 307 {
 308     cake_release(fd_pile, fd);
 309 }
 310
 311 int
 312 vfs_fsync(struct v_file* file)
 313 {
 314     int errno;
 315     if ((errno = vfs_check_writable(file->dnode))) {
 316         return errno;
 317     }
 318
 319     lock_inode(file->inode);
 320
 321     pcache_commit_all(file->inode);
 322
 323     errno = ENOTSUP;
 324     if (file->ops->sync) {
 325         errno = file->ops->sync(file);
 326     }
 327
 328     unlock_inode(file->inode);
 329
 330     return errno;
 331 }
 332
 333 int
 334 vfs_alloc_fdslot(int* fd)
 335 {
 336     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 337         if (!__current->fdtable->fds[i]) {
 338             *fd = i;
 339             return 0;
 340         }
 341     }
 342     return EMFILE;
 343 }
 344
 345 struct v_superblock*
 346 vfs_sb_alloc()
 347 {
 348     struct v_superblock* sb = cake_grab(superblock_pile);
 349     memset(sb, 0, sizeof(*sb));
 350     llist_init_head(&sb->sb_list);
 351     sb->i_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 352     return sb;
 353 }
 354
 355 void
 356 vfs_sb_free(struct v_superblock* sb)
 357 {
 358     vfree(sb->i_cache);
 359     cake_release(superblock_pile, sb);
 360 }
 361
 362 static int
 363 __vfs_try_evict_dnode(struct lru_node* obj)
 364 {
 365     struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
 366
 367     if (!dnode->ref_count) {
 368         vfs_d_free(dnode);
 369         return 1;
 370     }
 371     return 0;
 372 }
 373
 374 static int
 375 __vfs_try_evict_inode(struct lru_node* obj)
 376 {
 377     struct v_inode* inode = container_of(obj, struct v_inode, lru);
 378
 379     if (!inode->link_count && !inode->open_count) {
 380         vfs_i_free(inode);
 381         return 1;
 382     }
 383     return 0;
 384 }
 385
 386 struct v_dnode*
 387 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
 388 {
 389     struct v_dnode* dnode = cake_grab(dnode_pile);
 390     if (!dnode) {
 391         lru_evict_half(dnode_lru);
 392
 393         if (!(dnode = cake_grab(dnode_pile))) {
 394             return NULL;
 395         }
 396     }
 397
 398     memset(dnode, 0, sizeof(*dnode));
 399     llist_init_head(&dnode->children);
 400     llist_init_head(&dnode->siblings);
 401     llist_init_head(&dnode->aka_list);
 402     mutex_init(&dnode->lock);
 403
 404     dnode->ref_count = ATOMIC_VAR_INIT(0);
 405     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 406
 407     hstrcpy(&dnode->name, name);
 408
 409     if (parent) {
 410         dnode->super_block = parent->super_block;
 411         dnode->mnt = parent->mnt;
 412     }
 413
 414     lru_use_one(dnode_lru, &dnode->lru);
 415
 416     return dnode;
 417 }
 418
 419 void
 420 vfs_d_free(struct v_dnode* dnode)
 421 {
 422     assert(dnode->ref_count == 1);
 423
 424     if (dnode->inode) {
 425         assert(dnode->inode->link_count > 0);
 426         dnode->inode->link_count--;
 427     }
 428
 429     vfs_dcache_remove(dnode);
 430     // Make sure the children de-referencing their parent.
 431     // With lru presented, the eviction will be propagated over the entire
 432     // detached subtree eventually
 433     struct v_dnode *pos, *n;
 434     llist_for_each(pos, n, &dnode->children, siblings)
 435     {
 436         vfs_dcache_remove(pos);
 437     }
 438
 439     vfree((void*)dnode->name.value);
 440     cake_release(dnode_pile, dnode);
 441 }
 442
 443 struct v_inode*
 444 vfs_i_find(struct v_superblock* sb, u32_t i_id)
 445 {
 446     struct hbucket* slot = &sb->i_cache[i_id & VFS_HASH_MASK];
 447     struct v_inode *pos, *n;
 448     hashtable_bucket_foreach(slot, pos, n, hash_list)
 449     {
 450         if (pos->id == i_id) {
 451             lru_use_one(inode_lru, &pos->lru);
 452             return pos;
 453         }
 454     }
 455
 456     return NULL;
 457 }
 458
 459 void
 460 vfs_i_addhash(struct v_inode* inode)
 461 {
 462     struct hbucket* slot = &inode->sb->i_cache[inode->id & VFS_HASH_MASK];
 463
 464     hlist_delete(&inode->hash_list);
 465     hlist_add(&slot->head, &inode->hash_list);
 466 }
 467
 468 struct v_inode*
 469 vfs_i_alloc(struct v_superblock* sb)
 470 {
 471     assert(sb->ops.init_inode);
 472
 473     struct v_inode* inode;
 474     if (!(inode = cake_grab(inode_pile))) {
 475         lru_evict_half(inode_lru);
 476         if (!(inode = cake_grab(inode_pile))) {
 477             return NULL;
 478         }
 479     }
 480
 481     memset(inode, 0, sizeof(*inode));
 482     mutex_init(&inode->lock);
 483     llist_init_head(&inode->xattrs);
 484     llist_init_head(&inode->aka_dnodes);
 485
 486     sb->ops.init_inode(sb, inode);
 487
 488     inode->sb = sb;
 489     inode->ctime = clock_unixtime();
 490     inode->atime = inode->ctime;
 491     inode->mtime = inode->ctime;
 492
 493     lru_use_one(inode_lru, &inode->lru);
 494     return inode;
 495 }
 496
 497 void
 498 vfs_i_free(struct v_inode* inode)
 499 {
 500     if (inode->pg_cache) {
 501         pcache_release(inode->pg_cache);
 502         vfree(inode->pg_cache);
 503     }
 504     // we don't need to sync inode.
 505     // If an inode can be free, then it must be properly closed.
 506     // Hence it must be synced already!
 507     if (inode->destruct) {
 508         inode->destruct(inode);
 509     }
 510     hlist_delete(&inode->hash_list);
 511     cake_release(inode_pile, inode);
 512 }
 513
 514 /* ---- System call definition and support ---- */
 515
 516 #define FLOCATE_CREATE_EMPTY 1
 517 #define FLOCATE_CREATE_ONLY 2
 518
 519 int
 520 vfs_getfd(int fd, struct v_fd** fd_s)
 521 {
 522     if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) {
 523         return 0;
 524     }
 525     return EBADF;
 526 }
 527
 528 int
 529 __vfs_try_locate_file(const char* path,
 530                       struct v_dnode** fdir,
 531                       struct v_dnode** file,
 532                       int options)
 533 {
 534     char name_str[VFS_NAME_MAXLEN];
 535     struct hstr name = HSTR(name_str, 0);
 536     int errno;
 537
 538     name_str[0] = 0;
 539     if ((errno = vfs_walk_proc(path, fdir, &name, VFS_WALK_PARENT))) {
 540         return errno;
 541     }
 542
 543     errno = vfs_walk(*fdir, name.value, file, NULL, 0);
 544
 545     if (errno != ENOENT && (options & FLOCATE_CREATE_ONLY)) {
 546         return EEXIST;
 547     }
 548
 549     if (errno != ENOENT ||
 550         !(options & (FLOCATE_CREATE_EMPTY | FLOCATE_CREATE_ONLY))) {
 551         return errno;
 552     }
 553
 554     struct v_dnode* parent = *fdir;
 555     struct v_dnode* file_new = vfs_d_alloc(parent, &name);
 556
 557     if (!file_new) {
 558         return ENOMEM;
 559     }
 560
 561     lock_dnode(parent);
 562
 563     if (!(errno = parent->inode->ops->create(parent->inode, file_new))) {
 564         vfs_dcache_add(parent, file_new);
 565         *file = file_new;
 566     } else {
 567         vfs_d_free(file_new);
 568     }
 569
 570     unlock_dnode(parent);
 571
 572     return errno;
 573 }
 574
 575 int
 576 vfs_do_open(const char* path, int options)
 577 {
 578     int errno, fd;
 579     struct v_dnode *dentry, *file;
 580     struct v_file* ofile = NULL;
 581
 582     errno = __vfs_try_locate_file(
 583       path, &dentry, &file, (options & FO_CREATE) ? FLOCATE_CREATE_EMPTY : 0);
 584
 585     if (!errno && !(errno = vfs_alloc_fdslot(&fd))) {
 586
 587         if (errno || (errno = vfs_open(file, &ofile))) {
 588             return errno;
 589         }
 590
 591         struct v_fd* fd_s = cake_grab(fd_pile);
 592         memset(fd_s, 0, sizeof(*fd_s));
 593
 594         ofile->f_pos = ofile->inode->fsize & -((options & FO_APPEND) != 0);
 595         fd_s->file = ofile;
 596         fd_s->flags = options;
 597         __current->fdtable->fds[fd] = fd_s;
 598         return fd;
 599     }
 600
 601     return errno;
 602 }
 603
 604 __DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
 605 {
 606     int errno = vfs_do_open(path, options);
 607     return DO_STATUS_OR_RETURN(errno);
 608 }
 609
 610 __DEFINE_LXSYSCALL1(int, close, int, fd)
 611 {
 612     struct v_fd* fd_s;
 613     int errno = 0;
 614     if ((errno = vfs_getfd(fd, &fd_s))) {
 615         goto done_err;
 616     }
 617
 618     if ((errno = vfs_close(fd_s->file))) {
 619         goto done_err;
 620     }
 621
 622     cake_release(fd_pile, fd_s);
 623     __current->fdtable->fds[fd] = 0;
 624
 625 done_err:
 626     return DO_STATUS(errno);
 627 }
 628
 629 void
 630 __vfs_readdir_callback(struct dir_context* dctx,
 631                        const char* name,
 632                        const int len,
 633                        const int dtype)
 634 {
 635     struct lx_dirent* dent = (struct lx_dirent*)dctx->cb_data;
 636     strncpy(dent->d_name, name, DIRENT_NAME_MAX_LEN);
 637     dent->d_nlen = len;
 638     dent->d_type = dtype;
 639 }
 640
 641 __DEFINE_LXSYSCALL2(int, sys_readdir, int, fd, struct lx_dirent*, dent)
 642 {
 643     struct v_fd* fd_s;
 644     int errno;
 645
 646     if ((errno = vfs_getfd(fd, &fd_s))) {
 647         goto done;
 648     }
 649
 650     struct v_inode* inode = fd_s->file->inode;
 651
 652     lock_inode(inode);
 653
 654     if (!(inode->itype & VFS_IFDIR)) {
 655         errno = ENOTDIR;
 656     } else {
 657         struct dir_context dctx =
 658           (struct dir_context){ .cb_data = dent,
 659                                 .index = dent->d_offset,
 660                                 .read_complete_callback =
 661                                   __vfs_readdir_callback };
 662         errno = 1;
 663         if (dent->d_offset == 0) {
 664             __vfs_readdir_callback(&dctx, vfs_dot.value, vfs_dot.len, DT_DIR);
 665         } else if (dent->d_offset == 1) {
 666             __vfs_readdir_callback(&dctx, vfs_ddot.value, vfs_ddot.len, DT_DIR);
 667         } else {
 668             dctx.index -= 2;
 669             if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
 670                 unlock_inode(inode);
 671                 goto done;
 672             }
 673         }
 674         dent->d_offset++;
 675     }
 676
 677     unlock_inode(inode);
 678
 679 done:
 680     return DO_STATUS_OR_RETURN(errno);
 681 }
 682
 683 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 684 {
 685     int errno = 0;
 686     struct v_fd* fd_s;
 687     if ((errno = vfs_getfd(fd, &fd_s))) {
 688         goto done;
 689     }
 690
 691     struct v_file* file = fd_s->file;
 692     if ((file->inode->itype & VFS_IFDIR)) {
 693         errno = EISDIR;
 694         goto done;
 695     }
 696
 697     lock_inode(file->inode);
 698
 699     file->inode->atime = clock_unixtime();
 700
 701     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 702         errno = file->ops->read(file->inode, buf, count, file->f_pos);
 703     } else {
 704         errno = pcache_read(file->inode, buf, count, file->f_pos);
 705     }
 706
 707     if (errno > 0) {
 708         file->f_pos += errno;
 709         unlock_inode(file->inode);
 710         return errno;
 711     }
 712
 713     unlock_inode(file->inode);
 714
 715 done:
 716     return DO_STATUS(errno);
 717 }
 718
 719 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
 720 {
 721     int errno = 0;
 722     struct v_fd* fd_s;
 723     if ((errno = vfs_getfd(fd, &fd_s))) {
 724         goto done;
 725     }
 726
 727     struct v_file* file = fd_s->file;
 728
 729     if ((errno = vfs_check_writable(file->dnode))) {
 730         goto done;
 731     }
 732
 733     if ((file->inode->itype & VFS_IFDIR)) {
 734         errno = EISDIR;
 735         goto done;
 736     }
 737
 738     lock_inode(file->inode);
 739
 740     file->inode->mtime = clock_unixtime();
 741
 742     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 743         errno = file->ops->write(file->inode, buf, count, file->f_pos);
 744     } else {
 745         errno = pcache_write(file->inode, buf, count, file->f_pos);
 746     }
 747
 748     if (errno > 0) {
 749         file->f_pos += errno;
 750         unlock_inode(file->inode);
 751         return errno;
 752     }
 753
 754     unlock_inode(file->inode);
 755
 756 done:
 757     return DO_STATUS(errno);
 758 }
 759
 760 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
 761 {
 762     int errno = 0;
 763     struct v_fd* fd_s;
 764     if ((errno = vfs_getfd(fd, &fd_s))) {
 765         goto done;
 766     }
 767
 768     struct v_file* file = fd_s->file;
 769
 770     if (!file->ops->seek) {
 771         errno = ENOTSUP;
 772         goto done;
 773     }
 774
 775     lock_inode(file->inode);
 776
 777     int overflow = 0;
 778     int fpos = file->f_pos;
 779     switch (options) {
 780         case FSEEK_CUR:
 781             overflow = __builtin_sadd_overflow((int)file->f_pos, offset, &fpos);
 782             break;
 783         case FSEEK_END:
 784             overflow =
 785               __builtin_sadd_overflow((int)file->inode->fsize, offset, &fpos);
 786             break;
 787         case FSEEK_SET:
 788             fpos = offset;
 789             break;
 790     }
 791     if (overflow) {
 792         errno = EOVERFLOW;
 793     } else if (!(errno = file->ops->seek(file->inode, fpos))) {
 794         file->f_pos = fpos;
 795     }
 796
 797     unlock_inode(file->inode);
 798
 799 done:
 800     return DO_STATUS(errno);
 801 }
 802
 803 int
 804 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
 805 {
 806     if (!dnode) {
 807         return 0;
 808     }
 809
 810     if (depth > 64) {
 811         return ENAMETOOLONG;
 812     }
 813
 814     size_t len = 0;
 815
 816     if (dnode->parent != dnode) {
 817         len = vfs_get_path(dnode->parent, buf, size, depth + 1);
 818     }
 819
 820     if (len >= size) {
 821         return len;
 822     }
 823
 824     if (!len || buf[len - 1] != VFS_PATH_DELIM) {
 825         buf[len++] = VFS_PATH_DELIM;
 826     }
 827
 828     size_t cpy_size = MIN(dnode->name.len, size - len);
 829     strncpy(buf + len, dnode->name.value, cpy_size);
 830     len += cpy_size;
 831
 832     return len;
 833 }
 834
 835 int
 836 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
 837 {
 838     const char* link;
 839     struct v_inode* inode = dnode->inode;
 840     if (inode->ops->read_symlink) {
 841         lock_inode(inode);
 842
 843         int errno = inode->ops->read_symlink(inode, &link);
 844         strncpy(buf, link, size);
 845
 846         unlock_inode(inode);
 847         return errno;
 848     }
 849     return 0;
 850 }
 851
 852 int
 853 vfs_get_dtype(int itype)
 854 {
 855     switch (itype) {
 856         case VFS_IFDIR:
 857             return DT_DIR;
 858         case VFS_IFSYMLINK:
 859             return DT_SYMLINK;
 860         default:
 861             return DT_PIPE;
 862     }
 863 }
 864
 865 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
 866 {
 867     int errno;
 868     struct v_fd* fd_s;
 869     if ((errno = vfs_getfd(fd, &fd_s))) {
 870         goto done;
 871     }
 872
 873     struct v_dnode* dnode;
 874     errno = vfs_get_path(fd_s->file->dnode, buf, size, 0);
 875
 876     if (errno >= 0) {
 877         return errno;
 878     }
 879
 880 done:
 881     return DO_STATUS(errno);
 882 }
 883
 884 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
 885 {
 886     int errno;
 887     struct v_dnode* dnode;
 888     if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 889         errno = vfs_readlink(dnode, buf, size);
 890     }
 891
 892     if (errno >= 0) {
 893         return errno;
 894     }
 895
 896     return DO_STATUS(errno);
 897 }
 898
 899 __DEFINE_LXSYSCALL4(int,
 900                     readlinkat,
 901                     int,
 902                     dirfd,
 903                     const char*,
 904                     pathname,
 905                     char*,
 906                     buf,
 907                     size_t,
 908                     size)
 909 {
 910     int errno;
 911     struct v_fd* fd_s;
 912     if ((errno = vfs_getfd(dirfd, &fd_s))) {
 913         goto done;
 914     }
 915
 916     struct v_dnode* dnode;
 917     if (!(errno = vfs_walk(
 918             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 919         errno = vfs_readlink(fd_s->file->dnode, buf, size);
 920     }
 921
 922     if (errno >= 0) {
 923         return errno;
 924     }
 925
 926 done:
 927     return DO_STATUS(errno);
 928 }
 929
 930 /*
 931     NOTE
 932     When we perform an operation that could affect the layout of a
 933     directory (i.e., rename, mkdir, rmdir), we must lock the parent dir
 934     whenever possible. This blocks any ongoing path walk from reaching
 935     it, and hence avoids exposing any partial state.
 936 */
 937
 938 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
 939 {
 940     int errno;
 941     struct v_dnode* dnode;
 942     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
 943         return DO_STATUS(errno);
 944     }
 945
 946     lock_dnode(dnode);
 947
 948     if ((errno = vfs_check_writable(dnode))) {
 949         goto done;
 950     }
 951
 952     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
 953         errno = EROFS;
 954         goto done;
 955     }
 956
 957     if (dnode->ref_count > 1 || dnode->inode->open_count) {
 958         errno = EBUSY;
 959         goto done;
 960     }
 961
 962     if (!llist_empty(&dnode->children)) {
 963         errno = ENOTEMPTY;
 964         goto done;
 965     }
 966
 967     struct v_dnode* parent = dnode->parent;
 968
 969     if (!parent) {
 970         errno = EINVAL;
 971         goto done;
 972     }
 973
 974     lock_dnode(parent);
 975     lock_inode(parent->inode);
 976
 977     if ((dnode->inode->itype & VFS_IFDIR)) {
 978         errno = parent->inode->ops->rmdir(parent->inode, dnode);
 979         if (!errno) {
 980             vfs_dcache_remove(dnode);
 981         }
 982     } else {
 983         errno = ENOTDIR;
 984     }
 985
 986     unlock_inode(parent->inode);
 987     unlock_dnode(parent);
 988
 989 done:
 990     unlock_dnode(dnode);
 991     return DO_STATUS(errno);
 992 }
 993
 994 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
 995 {
 996     int errno = 0;
 997     struct v_dnode *parent, *dir;
 998     char name_value[VFS_NAME_MAXLEN];
 999     struct hstr name = HHSTR(name_value, 0, 0);
1000
1001     if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
1002         goto done;
1003     }
1004
1005     if ((errno = vfs_check_writable(parent))) {
1006         goto done;
1007     }
1008
1009     if (!(dir = vfs_d_alloc(parent, &name))) {
1010         errno = ENOMEM;
1011         goto done;
1012     }
1013
1014     lock_dnode(parent);
1015     lock_inode(parent->inode);
1016
1017     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
1018         errno = ENOTSUP;
1019     } else if (!parent->inode->ops->mkdir) {
1020         errno = ENOTSUP;
1021     } else if (!(parent->inode->itype & VFS_IFDIR)) {
1022         errno = ENOTDIR;
1023     } else if (!(errno = parent->inode->ops->mkdir(parent->inode, dir))) {
1024         vfs_dcache_add(parent, dir);
1025         goto cleanup;
1026     }
1027
1028     vfs_d_free(dir);
1029
1030 cleanup:
1031     unlock_inode(parent->inode);
1032     unlock_dnode(parent);
1033 done:
1034     return DO_STATUS(errno);
1035 }
1036
1037 int
1038 __vfs_do_unlink(struct v_dnode* dnode)
1039 {
1040     int errno;
1041     struct v_inode* inode = dnode->inode;
1042
1043     if (dnode->ref_count > 1) {
1044         return EBUSY;
1045     }
1046
1047     if ((errno = vfs_check_writable(dnode))) {
1048         return errno;
1049     }
1050
1051     lock_inode(inode);
1052
1053     if (inode->open_count) {
1054         errno = EBUSY;
1055     } else if (!(inode->itype & VFS_IFDIR)) {
1056         errno = inode->ops->unlink(inode);
1057         if (!errno) {
1058             vfs_d_free(dnode);
1059         }
1060     } else {
1061         errno = EISDIR;
1062     }
1063
1064     unlock_inode(inode);
1065
1066     return errno;
1067 }
1068
1069 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1070 {
1071     int errno;
1072     struct v_dnode* dnode;
1073     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1074         goto done;
1075     }
1076
1077     errno = __vfs_do_unlink(dnode);
1078
1079 done:
1080     return DO_STATUS(errno);
1081 }
1082
1083 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1084 {
1085     int errno;
1086     struct v_fd* fd_s;
1087     if ((errno = vfs_getfd(fd, &fd_s))) {
1088         goto done;
1089     }
1090
1091     struct v_dnode* dnode;
1092     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1093         errno = __vfs_do_unlink(dnode);
1094     }
1095
1096 done:
1097     return DO_STATUS(errno);
1098 }
1099
1100 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1101 {
1102     int errno;
1103     struct v_dnode *dentry, *to_link, *name_dentry, *name_file;
1104
1105     errno = __vfs_try_locate_file(oldpath, &dentry, &to_link, 0);
1106     if (!errno) {
1107         errno = __vfs_try_locate_file(
1108           newpath, &name_dentry, &name_file, FLOCATE_CREATE_ONLY);
1109         if (!errno) {
1110             errno = vfs_link(to_link, name_file);
1111         }
1112     }
1113     return DO_STATUS(errno);
1114 }
1115
1116 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1117 {
1118     int errno;
1119     struct v_fd* fd_s;
1120
1121     if (!(errno = vfs_getfd(fildes, &fd_s))) {
1122         errno = vfs_fsync(fd_s->file);
1123     }
1124
1125     return DO_STATUS(errno);
1126 }
1127
1128 int
1129 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1130 {
1131     int errno = 0;
1132     struct v_fd* copied = cake_grab(fd_pile);
1133
1134     memcpy(copied, old, sizeof(struct v_fd));
1135
1136     atomic_fetch_add(&old->file->ref_count, 1);
1137
1138     *new = copied;
1139
1140     return errno;
1141 }
1142
1143 int
1144 vfs_dup2(int oldfd, int newfd)
1145 {
1146     if (newfd == oldfd) {
1147         return newfd;
1148     }
1149
1150     int errno;
1151     struct v_fd *oldfd_s, *newfd_s;
1152     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1153         goto done;
1154     }
1155
1156     if (!TEST_FD(newfd)) {
1157         errno = EBADF;
1158         goto done;
1159     }
1160
1161     newfd_s = __current->fdtable->fds[newfd];
1162     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1163         goto done;
1164     }
1165
1166     if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1167         __current->fdtable->fds[newfd] = newfd_s;
1168         return newfd;
1169     }
1170
1171 done:
1172     return DO_STATUS(errno);
1173 }
1174
1175 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1176 {
1177     return vfs_dup2(oldfd, newfd);
1178 }
1179
1180 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1181 {
1182     int errno, newfd;
1183     struct v_fd *oldfd_s, *newfd_s;
1184     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1185         goto done;
1186     }
1187
1188     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1189         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1190         __current->fdtable->fds[newfd] = newfd_s;
1191         return newfd;
1192     }
1193
1194 done:
1195     return DO_STATUS(errno);
1196 }
1197
1198 __DEFINE_LXSYSCALL2(int,
1199                     symlink,
1200                     const char*,
1201                     pathname,
1202                     const char*,
1203                     link_target)
1204 {
1205     int errno;
1206     struct v_dnode *dnode, *file;
1207     if ((errno = __vfs_try_locate_file(
1208            pathname, &dnode, &file, FLOCATE_CREATE_ONLY))) {
1209         goto done;
1210     }
1211
1212     if ((errno = vfs_check_writable(file))) {
1213         goto done;
1214     }
1215
1216     if (!file->inode->ops->set_symlink) {
1217         errno = ENOTSUP;
1218         goto done;
1219     }
1220
1221     lock_inode(file->inode);
1222
1223     errno = file->inode->ops->set_symlink(file->inode, link_target);
1224
1225     unlock_inode(file->inode);
1226
1227 done:
1228     return DO_STATUS(errno);
1229 }
1230
1231 void
1232 vfs_ref_file(struct v_file* file)
1233 {
1234     atomic_fetch_add(&file->ref_count, 1);
1235 }
1236
1237 void
1238 vfs_ref_dnode(struct v_dnode* dnode)
1239 {
1240     atomic_fetch_add(&dnode->ref_count, 1);
1241     mnt_mkbusy(dnode->mnt);
1242 }
1243
1244 void
1245 vfs_unref_dnode(struct v_dnode* dnode)
1246 {
1247     atomic_fetch_sub(&dnode->ref_count, 1);
1248     mnt_chillax(dnode->mnt);
1249 }
1250
1251 int
1252 vfs_do_chdir(struct proc_info* proc, struct v_dnode* dnode)
1253 {
1254     int errno = 0;
1255
1256     lock_dnode(dnode);
1257
1258     if (!(dnode->inode->itype & VFS_IFDIR)) {
1259         errno = ENOTDIR;
1260         goto done;
1261     }
1262
1263     if (proc->cwd) {
1264         vfs_unref_dnode(proc->cwd);
1265     }
1266
1267     vfs_ref_dnode(dnode);
1268     proc->cwd = dnode;
1269
1270     unlock_dnode(dnode);
1271
1272 done:
1273     return errno;
1274 }
1275
1276 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1277 {
1278     struct v_dnode* dnode;
1279     int errno = 0;
1280
1281     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1282         goto done;
1283     }
1284
1285     errno = vfs_do_chdir((struct proc_info*)__current, dnode);
1286
1287 done:
1288     return DO_STATUS(errno);
1289 }
1290
1291 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1292 {
1293     struct v_fd* fd_s;
1294     int errno = 0;
1295
1296     if ((errno = vfs_getfd(fd, &fd_s))) {
1297         goto done;
1298     }
1299
1300     errno = vfs_do_chdir((struct proc_info*)__current, fd_s->file->dnode);
1301
1302 done:
1303     return DO_STATUS(errno);
1304 }
1305
1306 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1307 {
1308     int errno = 0;
1309     char* ret_ptr = 0;
1310     if (size < 2) {
1311         errno = ERANGE;
1312         goto done;
1313     }
1314
1315     size_t len = 0;
1316
1317     if (!__current->cwd) {
1318         *buf = VFS_PATH_DELIM;
1319         len = 1;
1320     } else {
1321         len = vfs_get_path(__current->cwd, buf, size, 0);
1322         if (len == size) {
1323             errno = ERANGE;
1324             goto done;
1325         }
1326     }
1327
1328     buf[len] = '\0';
1329
1330     ret_ptr = buf;
1331
1332 done:
1333     __current->k_status = errno;
1334     return ret_ptr;
1335 }
1336
1337 int
1338 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1339 {
1340     int errno = 0;
1341     if (current->inode->id == target->inode->id) {
1342         // hard link
1343         return 0;
1344     }
1345
1346     if ((errno = vfs_check_writable(current))) {
1347         return errno;
1348     }
1349
1350     if (current->ref_count > 1 || target->ref_count > 1) {
1351         return EBUSY;
1352     }
1353
1354     if (current->super_block != target->super_block) {
1355         return EXDEV;
1356     }
1357
1358     struct v_dnode* oldparent = current->parent;
1359     struct v_dnode* newparent = target->parent;
1360
1361     lock_dnode(current);
1362     lock_dnode(target);
1363     if (oldparent)
1364         lock_dnode(oldparent);
1365     if (newparent)
1366         lock_dnode(newparent);
1367
1368     if (!llist_empty(&target->children)) {
1369         errno = ENOTEMPTY;
1370         unlock_dnode(target);
1371         goto cleanup;
1372     }
1373
1374     if ((errno =
1375            current->inode->ops->rename(current->inode, current, target))) {
1376         unlock_dnode(target);
1377         goto cleanup;
1378     }
1379
1380     // re-position current
1381     hstrcpy(&current->name, &target->name);
1382     vfs_dcache_rehash(newparent, current);
1383
1384     // detach target
1385     vfs_d_free(target);
1386
1387     unlock_dnode(target);
1388
1389 cleanup:
1390     unlock_dnode(current);
1391     if (oldparent)
1392         unlock_dnode(oldparent);
1393     if (newparent)
1394         unlock_dnode(newparent);
1395
1396     return errno;
1397 }
1398
1399 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1400 {
1401     struct v_dnode *cur, *target_parent, *target;
1402     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1403     int errno = 0;
1404
1405     if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1406         goto done;
1407     }
1408
1409     if ((errno = vfs_walk(
1410            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1411         goto done;
1412     }
1413
1414     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1415     if (errno == ENOENT) {
1416         target = vfs_d_alloc(target_parent, &name);
1417         vfs_dcache_add(target_parent, target);
1418     } else if (errno) {
1419         goto done;
1420     }
1421
1422     if (!target) {
1423         errno = ENOMEM;
1424         goto done;
1425     }
1426
1427     errno = vfs_do_rename(cur, target);
1428
1429 done:
1430     vfree((void*)name.value);
1431     return DO_STATUS(errno);
1432 }