lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
   4  * @brief Lunaix virtual file system - an abstraction layer for all file system.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
  14 /*
  15  TODO vfs & device todos checklist
  16
  17     It is overseen by Twilight Sparkle ;)
  18
  19  1. Get inodes hooked into lru (CHECKED)
  20  2. Get dnodes hooked into lru (CHECKED)
  21  3. Get inodes properly hashed so they can be reused by underling fs (CHECKED)
  22  4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
  23         [good idea] or a constructor/destructor pattern in cake allocator ?
  24  5. (mount) Figure out a way to identify a busy mount point before unmount
  25             maybe a unified mount_point structure that maintain a referencing
  26             counter on any dnodes within the subtree? Such a counter will only
  27             increament if a file is opened or a dnode is being used as working
  28             directory and decreamenting conversely. (CHECKED)
  29  6. (mount) Ability to track all mount points (including sub-mounts)
  30             so we can be confident to clean up everything when we
  31             unmount. (CHECKED)
  32  7. (mount) Figure out a way to acquire the device represented by a dnode.
  33             so it can be used to mount. (e.g. we wish to get `struct device*`
  34             out of the dnode at /dev/sda)
  35             [tip] we should pay attention at twifs and add a private_data field
  36             under struct v_dnode? (CHECKED)
  37  8. (mount) Then, we should refactor on mount/unmount mechanism. (CHECKED)
  38  9. (mount) (future) Ability to mount any thing? e.g. Linux can mount a disk
  39                     image file using a so called "loopback" pseudo device. Maybe
  40                     we can do similar thing in Lunaix? A block device emulation
  41                     above the regular file when we mount it on.
  42  10. (device) device number (dev_t) allocation
  43             [good idea] <class>:<subclass>:<uniq_id> composition
  44 */
  45
  46 #include <klibc/string.h>
  47 #include <lunaix/foptions.h>
  48 #include <lunaix/fs.h>
  49 #include <lunaix/mm/cake.h>
  50 #include <lunaix/mm/page.h>
  51 #include <lunaix/mm/valloc.h>
  52 #include <lunaix/process.h>
  53 #include <lunaix/spike.h>
  54 #include <lunaix/syscall.h>
  55 #include <lunaix/syscall_utils.h>
  56
  57 #include <lunaix/fs/twifs.h>
  58
  59 #include <sys/dirent_defs.h>
  60
  61 static struct cake_pile* dnode_pile;
  62 static struct cake_pile* inode_pile;
  63 static struct cake_pile* file_pile;
  64 static struct cake_pile* superblock_pile;
  65 static struct cake_pile* fd_pile;
  66
  67 struct v_dnode* vfs_sysroot;
  68 static struct hbucket* dnode_cache;
  69
  70 struct lru_zone *dnode_lru, *inode_lru;
  71
  72 struct hstr vfs_ddot = HSTR("..", 2);
  73 struct hstr vfs_dot = HSTR(".", 1);
  74 struct hstr vfs_empty = HSTR("", 0);
  75
  76 struct v_superblock*
  77 vfs_sb_alloc();
  78
  79 void
  80 vfs_sb_free(struct v_superblock* sb);
  81
  82 static int
  83 __vfs_try_evict_dnode(struct lru_node* obj);
  84
  85 static int
  86 __vfs_try_evict_inode(struct lru_node* obj);
  87
  88 void
  89 vfs_init()
  90 {
  91     // 为他们专门创建一个蛋糕堆，而不使用valloc，这样我们可以最小化内碎片的产生
  92     dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
  93     inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
  94     file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
  95     fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
  96     superblock_pile =
  97       cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
  98
  99     dnode_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 100
 101     dnode_lru = lru_new_zone(__vfs_try_evict_dnode);
 102     inode_lru = lru_new_zone(__vfs_try_evict_inode);
 103
 104     hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
 105     hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
 106
 107     // 创建一个根dnode。
 108     vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
 109     vfs_sysroot->parent = vfs_sysroot;
 110     atomic_fetch_add(&vfs_sysroot->ref_count, 1);
 111 }
 112
 113 inline struct hbucket*
 114 __dcache_hash(struct v_dnode* parent, u32_t* hash)
 115 {
 116     u32_t _hash = *hash;
 117     // 确保低位更加随机
 118     _hash = _hash ^ (_hash >> VFS_HASHBITS);
 119     // 与parent的指针值做加法，来减小碰撞的可能性。
 120     _hash += (u32_t)parent;
 121     *hash = _hash;
 122     return &dnode_cache[_hash & VFS_HASH_MASK];
 123 }
 124
 125 struct v_dnode*
 126 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 127 {
 128     if (!str->len || HSTR_EQ(str, &vfs_dot))
 129         return parent;
 130
 131     if (HSTR_EQ(str, &vfs_ddot)) {
 132         return parent->parent;
 133     }
 134
 135     u32_t hash = str->hash;
 136     struct hbucket* slot = __dcache_hash(parent, &hash);
 137
 138     struct v_dnode *pos, *n;
 139     hashtable_bucket_foreach(slot, pos, n, hash_list)
 140     {
 141         if (pos->name.hash == hash) {
 142             return pos;
 143         }
 144     }
 145     return NULL;
 146 }
 147
 148 void
 149 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 150 {
 151     assert(parent);
 152
 153     atomic_fetch_add(&dnode->ref_count, 1);
 154     dnode->parent = parent;
 155     llist_append(&parent->children, &dnode->siblings);
 156
 157     struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash);
 158     hlist_add(&bucket->head, &dnode->hash_list);
 159 }
 160
 161 void
 162 vfs_dcache_remove(struct v_dnode* dnode)
 163 {
 164     assert(dnode);
 165     assert(dnode->ref_count == 1);
 166
 167     llist_delete(&dnode->siblings);
 168     llist_delete(&dnode->aka_list);
 169     hlist_delete(&dnode->hash_list);
 170
 171     dnode->parent = NULL;
 172     atomic_fetch_sub(&dnode->ref_count, 1);
 173 }
 174
 175 void
 176 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
 177 {
 178     assert(new_parent);
 179
 180     hstr_rehash(&dnode->name, HSTR_FULL_HASH);
 181     vfs_dcache_remove(dnode);
 182     vfs_dcache_add(new_parent, dnode);
 183 }
 184
 185 int
 186 vfs_open(struct v_dnode* dnode, struct v_file** file)
 187 {
 188     if (!dnode->inode || !dnode->inode->ops->open) {
 189         return ENOTSUP;
 190     }
 191
 192     struct v_inode* inode = dnode->inode;
 193
 194     lock_inode(inode);
 195
 196     struct v_file* vfile = cake_grab(file_pile);
 197     memset(vfile, 0, sizeof(*vfile));
 198
 199     vfile->dnode = dnode;
 200     vfile->inode = inode;
 201     vfile->ref_count = ATOMIC_VAR_INIT(1);
 202     vfile->ops = inode->default_fops;
 203
 204     if ((inode->itype & VFS_IFFILE) && !inode->pg_cache) {
 205         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 206         pcache_init(pcache);
 207         pcache->master = inode;
 208         inode->pg_cache = pcache;
 209     }
 210
 211     int errno = inode->ops->open(inode, vfile);
 212     if (errno) {
 213         cake_release(file_pile, vfile);
 214     } else {
 215         atomic_fetch_add(&dnode->ref_count, 1);
 216         inode->open_count++;
 217         mnt_mkbusy(dnode->mnt);
 218
 219         *file = vfile;
 220     }
 221
 222     unlock_inode(inode);
 223
 224     return errno;
 225 }
 226
 227 void
 228 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
 229 {
 230     if (assign_to->inode) {
 231         llist_delete(&assign_to->aka_list);
 232         assign_to->inode->link_count--;
 233     }
 234     llist_append(&inode->aka_dnodes, &assign_to->aka_list);
 235     assign_to->inode = inode;
 236     inode->link_count++;
 237 }
 238
 239 int
 240 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 241 {
 242     int errno;
 243
 244     if ((errno = vfs_check_writable(to_link))) {
 245         return errno;
 246     }
 247
 248     lock_inode(to_link->inode);
 249     if (to_link->super_block->root != name->super_block->root) {
 250         errno = EXDEV;
 251     } else if (!to_link->inode->ops->link) {
 252         errno = ENOTSUP;
 253     } else if (!(errno = to_link->inode->ops->link(to_link->inode, name))) {
 254         vfs_assign_inode(name, to_link->inode);
 255     }
 256     unlock_inode(to_link->inode);
 257
 258     return errno;
 259 }
 260
 261 int
 262 vfs_pclose(struct v_file* file, pid_t pid)
 263 {
 264     int errno = 0;
 265     if (file->ref_count > 1) {
 266         atomic_fetch_sub(&file->ref_count, 1);
 267     } else if (!(errno = file->ops->close(file))) {
 268         atomic_fetch_sub(&file->dnode->ref_count, 1);
 269         file->inode->open_count--;
 270
 271         /*
 272          * Prevent dead lock.
 273          * This happened when process is terminated while blocking on read.
 274          * In that case, the process is still holding the inode lock and it
 275              will never get released.
 276          * The unlocking should also include ownership check.
 277          *
 278          * To see why, consider two process both open the same file both with
 279          * fd=x.
 280          *      Process A: busy on reading x
 281          *      Process B: do nothing with x
 282          * Assuming that, after a very short time, process B get terminated
 283          * while process A is still busy in it's reading business. By this
 284          * design, the inode lock of this file x is get released by B rather
 285          * than A. And this will cause a probable race condition on A if other
 286          * process is writing to this file later after B exit.
 287          */
 288         if (mutex_on_hold(&file->inode->lock)) {
 289             mutex_unlock_for(&file->inode->lock, pid);
 290         }
 291         mnt_chillax(file->dnode->mnt);
 292
 293         pcache_commit_all(file->inode);
 294         cake_release(file_pile, file);
 295     }
 296     return errno;
 297 }
 298
 299 int
 300 vfs_close(struct v_file* file)
 301 {
 302     return vfs_pclose(file, __current->pid);
 303 }
 304
 305 void
 306 vfs_free_fd(struct v_fd* fd)
 307 {
 308     cake_release(fd_pile, fd);
 309 }
 310
 311 int
 312 vfs_fsync(struct v_file* file)
 313 {
 314     int errno;
 315     if ((errno = vfs_check_writable(file->dnode))) {
 316         return errno;
 317     }
 318
 319     lock_inode(file->inode);
 320
 321     pcache_commit_all(file->inode);
 322
 323     errno = ENOTSUP;
 324     if (file->ops->sync) {
 325         errno = file->ops->sync(file);
 326     }
 327
 328     unlock_inode(file->inode);
 329
 330     return errno;
 331 }
 332
 333 int
 334 vfs_alloc_fdslot(int* fd)
 335 {
 336     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 337         if (!__current->fdtable->fds[i]) {
 338             *fd = i;
 339             return 0;
 340         }
 341     }
 342     return EMFILE;
 343 }
 344
 345 struct v_superblock*
 346 vfs_sb_alloc()
 347 {
 348     struct v_superblock* sb = cake_grab(superblock_pile);
 349     memset(sb, 0, sizeof(*sb));
 350     llist_init_head(&sb->sb_list);
 351     sb->i_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 352     return sb;
 353 }
 354
 355 void
 356 vfs_sb_free(struct v_superblock* sb)
 357 {
 358     vfree(sb->i_cache);
 359     cake_release(superblock_pile, sb);
 360 }
 361
 362 static int
 363 __vfs_try_evict_dnode(struct lru_node* obj)
 364 {
 365     struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
 366
 367     if (!dnode->ref_count) {
 368         vfs_d_free(dnode);
 369         return 1;
 370     }
 371     return 0;
 372 }
 373
 374 static int
 375 __vfs_try_evict_inode(struct lru_node* obj)
 376 {
 377     struct v_inode* inode = container_of(obj, struct v_inode, lru);
 378
 379     if (!inode->link_count && !inode->open_count) {
 380         vfs_i_free(inode);
 381         return 1;
 382     }
 383     return 0;
 384 }
 385
 386 struct v_dnode*
 387 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
 388 {
 389     struct v_dnode* dnode = cake_grab(dnode_pile);
 390     if (!dnode) {
 391         lru_evict_half(dnode_lru);
 392
 393         if (!(dnode = cake_grab(dnode_pile))) {
 394             return NULL;
 395         }
 396     }
 397
 398     memset(dnode, 0, sizeof(*dnode));
 399     llist_init_head(&dnode->children);
 400     llist_init_head(&dnode->siblings);
 401     llist_init_head(&dnode->aka_list);
 402     mutex_init(&dnode->lock);
 403
 404     dnode->ref_count = ATOMIC_VAR_INIT(0);
 405     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 406
 407     hstrcpy(&dnode->name, name);
 408
 409     if (parent) {
 410         dnode->super_block = parent->super_block;
 411         dnode->mnt = parent->mnt;
 412     }
 413
 414     lru_use_one(dnode_lru, &dnode->lru);
 415
 416     return dnode;
 417 }
 418
 419 void
 420 vfs_d_free(struct v_dnode* dnode)
 421 {
 422     assert(dnode->ref_count == 1);
 423
 424     if (dnode->inode) {
 425         assert(dnode->inode->link_count > 0);
 426         dnode->inode->link_count--;
 427     }
 428
 429     vfs_dcache_remove(dnode);
 430     // Make sure the children de-referencing their parent.
 431     // With lru presented, the eviction will be propagated over the entire
 432     // detached subtree eventually
 433     struct v_dnode *pos, *n;
 434     llist_for_each(pos, n, &dnode->children, siblings)
 435     {
 436         vfs_dcache_remove(pos);
 437     }
 438
 439     vfree(dnode->name.value);
 440     cake_release(dnode_pile, dnode);
 441 }
 442
 443 struct v_inode*
 444 vfs_i_find(struct v_superblock* sb, u32_t i_id)
 445 {
 446     struct hbucket* slot = &sb->i_cache[i_id & VFS_HASH_MASK];
 447     struct v_inode *pos, *n;
 448     hashtable_bucket_foreach(slot, pos, n, hash_list)
 449     {
 450         if (pos->id == i_id) {
 451             lru_use_one(inode_lru, &pos->lru);
 452             return pos;
 453         }
 454     }
 455
 456     return NULL;
 457 }
 458
 459 void
 460 vfs_i_addhash(struct v_inode* inode)
 461 {
 462     struct hbucket* slot = &inode->sb->i_cache[inode->id & VFS_HASH_MASK];
 463
 464     hlist_delete(&inode->hash_list);
 465     hlist_add(&slot->head, &inode->hash_list);
 466 }
 467
 468 struct v_inode*
 469 vfs_i_alloc(struct v_superblock* sb)
 470 {
 471     assert(sb->ops.init_inode);
 472
 473     struct v_inode* inode;
 474     if (!(inode = cake_grab(inode_pile))) {
 475         lru_evict_half(inode_lru);
 476         if (!(inode = cake_grab(inode_pile))) {
 477             return NULL;
 478         }
 479     }
 480
 481     memset(inode, 0, sizeof(*inode));
 482     mutex_init(&inode->lock);
 483     llist_init_head(&inode->xattrs);
 484     llist_init_head(&inode->aka_dnodes);
 485
 486     sb->ops.init_inode(sb, inode);
 487
 488     inode->sb = sb;
 489     inode->ctime = clock_unixtime();
 490     inode->atime = inode->ctime;
 491     inode->mtime = inode->ctime;
 492
 493 done:
 494     lru_use_one(inode_lru, &inode->lru);
 495     return inode;
 496 }
 497
 498 void
 499 vfs_i_free(struct v_inode* inode)
 500 {
 501     if (inode->pg_cache) {
 502         pcache_release(inode->pg_cache);
 503         vfree(inode->pg_cache);
 504     }
 505     // we don't need to sync inode.
 506     // If an inode can be free, then it must be properly closed.
 507     // Hence it must be synced already!
 508     if (inode->destruct) {
 509         inode->destruct(inode);
 510     }
 511     hlist_delete(&inode->hash_list);
 512     cake_release(inode_pile, inode);
 513 }
 514
 515 /* ---- System call definition and support ---- */
 516
 517 #define FLOCATE_CREATE_EMPTY 1
 518 #define FLOCATE_CREATE_ONLY 2
 519
 520 int
 521 vfs_getfd(int fd, struct v_fd** fd_s)
 522 {
 523     if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) {
 524         return 0;
 525     }
 526     return EBADF;
 527 }
 528
 529 int
 530 __vfs_try_locate_file(const char* path,
 531                       struct v_dnode** fdir,
 532                       struct v_dnode** file,
 533                       int options)
 534 {
 535     char name_str[VFS_NAME_MAXLEN];
 536     struct hstr name = HSTR(name_str, 0);
 537     int errno;
 538
 539     name_str[0] = 0;
 540     if ((errno = vfs_walk_proc(path, fdir, &name, VFS_WALK_PARENT))) {
 541         return errno;
 542     }
 543
 544     errno = vfs_walk(*fdir, name.value, file, NULL, 0);
 545
 546     if (errno != ENOENT && (options & FLOCATE_CREATE_ONLY)) {
 547         return EEXIST;
 548     }
 549
 550     if (errno != ENOENT ||
 551         !(options & (FLOCATE_CREATE_EMPTY | FLOCATE_CREATE_ONLY))) {
 552         return errno;
 553     }
 554
 555     struct v_dnode* parent = *fdir;
 556     struct v_dnode* file_new = vfs_d_alloc(parent, &name);
 557
 558     if (!file_new) {
 559         return ENOMEM;
 560     }
 561
 562     lock_dnode(parent);
 563
 564     if (!(errno = parent->inode->ops->create(parent->inode, file_new))) {
 565         vfs_dcache_add(parent, file_new);
 566         *file = file_new;
 567     } else {
 568         vfs_d_free(file_new);
 569     }
 570
 571     unlock_dnode(parent);
 572
 573     return errno;
 574 }
 575
 576 int
 577 vfs_do_open(const char* path, int options)
 578 {
 579     int errno, fd;
 580     struct v_dnode *dentry, *file;
 581     struct v_file* ofile = NULL;
 582
 583     errno = __vfs_try_locate_file(
 584       path, &dentry, &file, (options & FO_CREATE) ? FLOCATE_CREATE_EMPTY : 0);
 585
 586     if (!errno && !(errno = vfs_alloc_fdslot(&fd))) {
 587
 588         if (errno || (errno = vfs_open(file, &ofile))) {
 589             return errno;
 590         }
 591
 592         struct v_fd* fd_s = cake_grab(fd_pile);
 593         memset(fd_s, 0, sizeof(*fd_s));
 594
 595         ofile->f_pos = ofile->inode->fsize & -((options & FO_APPEND) != 0);
 596         fd_s->file = ofile;
 597         fd_s->flags = options;
 598         __current->fdtable->fds[fd] = fd_s;
 599         return fd;
 600     }
 601
 602     return errno;
 603 }
 604
 605 __DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
 606 {
 607     int errno = vfs_do_open(path, options);
 608     return DO_STATUS_OR_RETURN(errno);
 609 }
 610
 611 __DEFINE_LXSYSCALL1(int, close, int, fd)
 612 {
 613     struct v_fd* fd_s;
 614     int errno = 0;
 615     if ((errno = vfs_getfd(fd, &fd_s))) {
 616         goto done_err;
 617     }
 618
 619     if ((errno = vfs_close(fd_s->file))) {
 620         goto done_err;
 621     }
 622
 623     cake_release(fd_pile, fd_s);
 624     __current->fdtable->fds[fd] = 0;
 625
 626 done_err:
 627     return DO_STATUS(errno);
 628 }
 629
 630 void
 631 __vfs_readdir_callback(struct dir_context* dctx,
 632                        const char* name,
 633                        const int len,
 634                        const int dtype)
 635 {
 636     struct lx_dirent* dent = (struct lx_dirent*)dctx->cb_data;
 637     strncpy(dent->d_name, name, DIRENT_NAME_MAX_LEN);
 638     dent->d_nlen = len;
 639     dent->d_type = dtype;
 640 }
 641
 642 __DEFINE_LXSYSCALL2(int, sys_readdir, int, fd, struct lx_dirent*, dent)
 643 {
 644     struct v_fd* fd_s;
 645     int errno;
 646
 647     if ((errno = vfs_getfd(fd, &fd_s))) {
 648         goto done;
 649     }
 650
 651     struct v_inode* inode = fd_s->file->inode;
 652
 653     lock_inode(inode);
 654
 655     if (!(inode->itype & VFS_IFDIR)) {
 656         errno = ENOTDIR;
 657     } else {
 658         struct dir_context dctx =
 659           (struct dir_context){ .cb_data = dent,
 660                                 .index = dent->d_offset,
 661                                 .read_complete_callback =
 662                                   __vfs_readdir_callback };
 663         errno = 1;
 664         if (dent->d_offset == 0) {
 665             __vfs_readdir_callback(&dctx, vfs_dot.value, vfs_dot.len, DT_DIR);
 666         } else if (dent->d_offset == 1) {
 667             __vfs_readdir_callback(&dctx, vfs_ddot.value, vfs_ddot.len, DT_DIR);
 668         } else {
 669             dctx.index -= 2;
 670             if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
 671                 unlock_inode(inode);
 672                 goto done;
 673             }
 674         }
 675         dent->d_offset++;
 676     }
 677
 678     unlock_inode(inode);
 679
 680 done:
 681     return DO_STATUS_OR_RETURN(errno);
 682 }
 683
 684 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 685 {
 686     int errno = 0;
 687     struct v_fd* fd_s;
 688     if ((errno = vfs_getfd(fd, &fd_s))) {
 689         goto done;
 690     }
 691
 692     struct v_file* file = fd_s->file;
 693     if ((file->inode->itype & VFS_IFDIR)) {
 694         errno = EISDIR;
 695         goto done;
 696     }
 697
 698     lock_inode(file->inode);
 699
 700     file->inode->atime = clock_unixtime();
 701
 702     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 703         errno = file->ops->read(file->inode, buf, count, file->f_pos);
 704     } else {
 705         errno = pcache_read(file->inode, buf, count, file->f_pos);
 706     }
 707
 708     if (errno > 0) {
 709         file->f_pos += errno;
 710         unlock_inode(file->inode);
 711         return errno;
 712     }
 713
 714     unlock_inode(file->inode);
 715
 716 done:
 717     return DO_STATUS(errno);
 718 }
 719
 720 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
 721 {
 722     int errno = 0;
 723     struct v_fd* fd_s;
 724     if ((errno = vfs_getfd(fd, &fd_s))) {
 725         goto done;
 726     }
 727
 728     struct v_file* file = fd_s->file;
 729
 730     if ((errno = vfs_check_writable(file->dnode))) {
 731         goto done;
 732     }
 733
 734     if ((file->inode->itype & VFS_IFDIR)) {
 735         errno = EISDIR;
 736         goto done;
 737     }
 738
 739     lock_inode(file->inode);
 740
 741     file->inode->mtime = clock_unixtime();
 742
 743     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 744         errno = file->ops->write(file->inode, buf, count, file->f_pos);
 745     } else {
 746         errno = pcache_write(file->inode, buf, count, file->f_pos);
 747     }
 748
 749     if (errno > 0) {
 750         file->f_pos += errno;
 751         unlock_inode(file->inode);
 752         return errno;
 753     }
 754
 755     unlock_inode(file->inode);
 756
 757 done:
 758     return DO_STATUS(errno);
 759 }
 760
 761 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
 762 {
 763     int errno = 0;
 764     struct v_fd* fd_s;
 765     if ((errno = vfs_getfd(fd, &fd_s))) {
 766         goto done;
 767     }
 768
 769     struct v_file* file = fd_s->file;
 770
 771     if (!file->ops->seek) {
 772         errno = ENOTSUP;
 773         goto done;
 774     }
 775
 776     lock_inode(file->inode);
 777
 778     int overflow = 0;
 779     int fpos = file->f_pos;
 780     switch (options) {
 781         case FSEEK_CUR:
 782             overflow = __builtin_sadd_overflow((int)file->f_pos, offset, &fpos);
 783             break;
 784         case FSEEK_END:
 785             overflow =
 786               __builtin_sadd_overflow((int)file->inode->fsize, offset, &fpos);
 787             break;
 788         case FSEEK_SET:
 789             fpos = offset;
 790             break;
 791     }
 792     if (overflow) {
 793         errno = EOVERFLOW;
 794     } else if (!(errno = file->ops->seek(file->inode, fpos))) {
 795         file->f_pos = fpos;
 796     }
 797
 798     unlock_inode(file->inode);
 799
 800 done:
 801     return DO_STATUS(errno);
 802 }
 803
 804 int
 805 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
 806 {
 807     if (!dnode) {
 808         return 0;
 809     }
 810
 811     if (depth > 64) {
 812         return ENAMETOOLONG;
 813     }
 814
 815     size_t len = 0;
 816
 817     if (dnode->parent != dnode) {
 818         len = vfs_get_path(dnode->parent, buf, size, depth + 1);
 819     }
 820
 821     if (len >= size) {
 822         return len;
 823     }
 824
 825     if (!len || buf[len - 1] != VFS_PATH_DELIM) {
 826         buf[len++] = VFS_PATH_DELIM;
 827     }
 828
 829     size_t cpy_size = MIN(dnode->name.len, size - len);
 830     strncpy(buf + len, dnode->name.value, cpy_size);
 831     len += cpy_size;
 832
 833     return len;
 834 }
 835
 836 int
 837 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
 838 {
 839     const char* link;
 840     struct v_inode* inode = dnode->inode;
 841     if (inode->ops->read_symlink) {
 842         lock_inode(inode);
 843
 844         int errno = inode->ops->read_symlink(inode, &link);
 845         strncpy(buf, link, size);
 846
 847         unlock_inode(inode);
 848         return errno;
 849     }
 850     return 0;
 851 }
 852
 853 int
 854 vfs_get_dtype(int itype)
 855 {
 856     switch (itype) {
 857         case VFS_IFDIR:
 858             return DT_DIR;
 859         case VFS_IFSYMLINK:
 860             return DT_SYMLINK;
 861         default:
 862             return DT_PIPE;
 863     }
 864 }
 865
 866 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
 867 {
 868     int errno;
 869     struct v_fd* fd_s;
 870     if ((errno = vfs_getfd(fd, &fd_s))) {
 871         goto done;
 872     }
 873
 874     struct v_dnode* dnode;
 875     errno = vfs_get_path(fd_s->file->dnode, buf, size, 0);
 876
 877     if (errno >= 0) {
 878         return errno;
 879     }
 880
 881 done:
 882     return DO_STATUS(errno);
 883 }
 884
 885 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
 886 {
 887     int errno;
 888     struct v_dnode* dnode;
 889     if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 890         errno = vfs_readlink(dnode, buf, size);
 891     }
 892
 893     if (errno >= 0) {
 894         return errno;
 895     }
 896
 897     return DO_STATUS(errno);
 898 }
 899
 900 __DEFINE_LXSYSCALL4(int,
 901                     readlinkat,
 902                     int,
 903                     dirfd,
 904                     const char*,
 905                     pathname,
 906                     char*,
 907                     buf,
 908                     size_t,
 909                     size)
 910 {
 911     int errno;
 912     struct v_fd* fd_s;
 913     if ((errno = vfs_getfd(dirfd, &fd_s))) {
 914         goto done;
 915     }
 916
 917     struct v_dnode* dnode;
 918     if (!(errno = vfs_walk(
 919             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 920         errno = vfs_readlink(fd_s->file->dnode, buf, size);
 921     }
 922
 923     if (errno >= 0) {
 924         return errno;
 925     }
 926
 927 done:
 928     return DO_STATUS(errno);
 929 }
 930
 931 /*
 932     NOTE
 933     When we perform operation that could affect the layout of
 934     directory (i.e., rename, mkdir, rmdir). We must lock the parent dir
 935     whenever possible. This will blocking any ongoing path walking to reach
 936     it hence avoid any partial state.
 937 */
 938
 939 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
 940 {
 941     int errno;
 942     struct v_dnode* dnode;
 943     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
 944         return DO_STATUS(errno);
 945     }
 946
 947     lock_dnode(dnode);
 948
 949     if ((errno = vfs_check_writable(dnode))) {
 950         goto done;
 951     }
 952
 953     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
 954         errno = EROFS;
 955         goto done;
 956     }
 957
 958     if (dnode->ref_count > 1 || dnode->inode->open_count) {
 959         errno = EBUSY;
 960         goto done;
 961     }
 962
 963     if (!llist_empty(&dnode->children)) {
 964         errno = ENOTEMPTY;
 965         goto done;
 966     }
 967
 968     struct v_dnode* parent = dnode->parent;
 969
 970     if (!parent) {
 971         errno = EINVAL;
 972         goto done;
 973     }
 974
 975     lock_dnode(parent);
 976     lock_inode(parent->inode);
 977
 978     if ((dnode->inode->itype & VFS_IFDIR)) {
 979         errno = parent->inode->ops->rmdir(parent->inode, dnode);
 980         if (!errno) {
 981             vfs_dcache_remove(dnode);
 982         }
 983     } else {
 984         errno = ENOTDIR;
 985     }
 986
 987     unlock_inode(parent->inode);
 988     unlock_dnode(parent);
 989
 990 done:
 991     unlock_dnode(dnode);
 992     return DO_STATUS(errno);
 993 }
 994
 995 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
 996 {
 997     int errno = 0;
 998     struct v_dnode *parent, *dir;
 999     char name_value[VFS_NAME_MAXLEN];
1000     struct hstr name = HHSTR(name_value, 0, 0);
1001
1002     if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
1003         goto done;
1004     }
1005
1006     if ((errno = vfs_check_writable(parent))) {
1007         goto done;
1008     }
1009
1010     if (!(dir = vfs_d_alloc(parent, &name))) {
1011         errno = ENOMEM;
1012         goto done;
1013     }
1014
1015     lock_dnode(parent);
1016     lock_inode(parent->inode);
1017
1018     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
1019         errno = ENOTSUP;
1020     } else if (!parent->inode->ops->mkdir) {
1021         errno = ENOTSUP;
1022     } else if (!(parent->inode->itype & VFS_IFDIR)) {
1023         errno = ENOTDIR;
1024     } else if (!(errno = parent->inode->ops->mkdir(parent->inode, dir))) {
1025         vfs_dcache_add(parent, dir);
1026         goto cleanup;
1027     }
1028
1029     vfs_d_free(dir);
1030
1031 cleanup:
1032     unlock_inode(parent->inode);
1033     unlock_dnode(parent);
1034 done:
1035     return DO_STATUS(errno);
1036 }
1037
1038 int
1039 __vfs_do_unlink(struct v_dnode* dnode)
1040 {
1041     int errno;
1042     struct v_inode* inode = dnode->inode;
1043
1044     if (dnode->ref_count > 1) {
1045         return EBUSY;
1046     }
1047
1048     if ((errno = vfs_check_writable(dnode))) {
1049         return errno;
1050     }
1051
1052     lock_inode(inode);
1053
1054     if (inode->open_count) {
1055         errno = EBUSY;
1056     } else if (!(inode->itype & VFS_IFDIR)) {
1057         // The underlying unlink implementation should handle
1058         //  symlink case
1059         errno = inode->ops->unlink(inode);
1060         if (!errno) {
1061             vfs_d_free(dnode);
1062         }
1063     } else {
1064         errno = EISDIR;
1065     }
1066
1067     unlock_inode(inode);
1068
1069     return errno;
1070 }
1071
1072 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1073 {
1074     int errno;
1075     struct v_dnode* dnode;
1076     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1077         goto done;
1078     }
1079
1080     errno = __vfs_do_unlink(dnode);
1081
1082 done:
1083     return DO_STATUS(errno);
1084 }
1085
1086 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1087 {
1088     int errno;
1089     struct v_fd* fd_s;
1090     if ((errno = vfs_getfd(fd, &fd_s))) {
1091         goto done;
1092     }
1093
1094     struct v_dnode* dnode;
1095     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1096         errno = __vfs_do_unlink(dnode);
1097     }
1098
1099 done:
1100     return DO_STATUS(errno);
1101 }
1102
1103 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1104 {
1105     int errno;
1106     struct v_dnode *dentry, *to_link, *name_dentry, *name_file;
1107
1108     errno = __vfs_try_locate_file(oldpath, &dentry, &to_link, 0);
1109     if (!errno) {
1110         errno = __vfs_try_locate_file(
1111           newpath, &name_dentry, &name_file, FLOCATE_CREATE_EMPTY);
1112         if (!errno) {
1113             errno = EEXIST;
1114         } else if (name_file) {
1115             errno = vfs_link(to_link, name_file);
1116         }
1117     }
1118     return DO_STATUS(errno);
1119 }
1120
1121 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1122 {
1123     int errno;
1124     struct v_fd* fd_s;
1125
1126     if (!(errno = vfs_getfd(fildes, &fd_s))) {
1127         errno = vfs_fsync(fd_s->file);
1128     }
1129
1130     return DO_STATUS(errno);
1131 }
1132
1133 int
1134 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1135 {
1136     int errno = 0;
1137     struct v_fd* copied = cake_grab(fd_pile);
1138
1139     memcpy(copied, old, sizeof(struct v_fd));
1140
1141     atomic_fetch_add(&old->file->ref_count, 1);
1142
1143     *new = copied;
1144
1145     return errno;
1146 }
1147
1148 int
1149 vfs_dup2(int oldfd, int newfd)
1150 {
1151     if (newfd == oldfd) {
1152         return newfd;
1153     }
1154
1155     int errno;
1156     struct v_fd *oldfd_s, *newfd_s;
1157     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1158         goto done;
1159     }
1160
1161     if (!TEST_FD(newfd)) {
1162         errno = EBADF;
1163         goto done;
1164     }
1165
1166     newfd_s = __current->fdtable->fds[newfd];
1167     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1168         goto done;
1169     }
1170
1171     if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1172         __current->fdtable->fds[newfd] = newfd_s;
1173         return newfd;
1174     }
1175
1176 done:
1177     return DO_STATUS(errno);
1178 }
1179
1180 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1181 {
1182     return vfs_dup2(oldfd, newfd);
1183 }
1184
1185 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1186 {
1187     int errno, newfd;
1188     struct v_fd *oldfd_s, *newfd_s;
1189     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1190         goto done;
1191     }
1192
1193     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1194         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1195         __current->fdtable->fds[newfd] = newfd_s;
1196         return newfd;
1197     }
1198
1199 done:
1200     return DO_STATUS(errno);
1201 }
1202
1203 __DEFINE_LXSYSCALL2(int,
1204                     symlink,
1205                     const char*,
1206                     pathname,
1207                     const char*,
1208                     link_target)
1209 {
1210     int errno;
1211     struct v_dnode *dnode, *file;
1212     if ((errno = __vfs_try_locate_file(
1213            pathname, &dnode, &file, FLOCATE_CREATE_ONLY))) {
1214         goto done;
1215     }
1216
1217     if (errno = vfs_check_writable(file)) {
1218         goto done;
1219     }
1220
1221     if (!file->inode->ops->set_symlink) {
1222         errno = ENOTSUP;
1223         goto done;
1224     }
1225
1226     lock_inode(file->inode);
1227
1228     errno = file->inode->ops->set_symlink(file->inode, link_target);
1229
1230     unlock_inode(file->inode);
1231
1232 done:
1233     return DO_STATUS(errno);
1234 }
1235
1236 void
1237 vfs_ref_file(struct v_file* file)
1238 {
1239     atomic_fetch_add(&file->ref_count, 1);
1240 }
1241
1242 void
1243 vfs_ref_dnode(struct v_dnode* dnode)
1244 {
1245     atomic_fetch_add(&dnode->ref_count, 1);
1246     mnt_mkbusy(dnode->mnt);
1247 }
1248
1249 void
1250 vfs_unref_dnode(struct v_dnode* dnode)
1251 {
1252     atomic_fetch_sub(&dnode->ref_count, 1);
1253     mnt_chillax(dnode->mnt);
1254 }
1255
1256 int
1257 vfs_do_chdir(struct proc_info* proc, struct v_dnode* dnode)
1258 {
1259     int errno = 0;
1260
1261     lock_dnode(dnode);
1262
1263     if (!(dnode->inode->itype & VFS_IFDIR)) {
1264         errno = ENOTDIR;
1265         goto done;
1266     }
1267
1268     if (proc->cwd) {
1269         vfs_unref_dnode(proc->cwd);
1270     }
1271
1272     vfs_ref_dnode(dnode);
1273     proc->cwd = dnode;
1274
1275     unlock_dnode(dnode);
1276
1277 done:
1278     return errno;
1279 }
1280
1281 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1282 {
1283     struct v_dnode* dnode;
1284     int errno = 0;
1285
1286     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1287         goto done;
1288     }
1289
1290     errno = vfs_do_chdir(__current, dnode);
1291
1292 done:
1293     return DO_STATUS(errno);
1294 }
1295
1296 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1297 {
1298     struct v_fd* fd_s;
1299     int errno = 0;
1300
1301     if ((errno = vfs_getfd(fd, &fd_s))) {
1302         goto done;
1303     }
1304
1305     errno = vfs_do_chdir(__current, fd_s->file->dnode);
1306
1307 done:
1308     return DO_STATUS(errno);
1309 }
1310
1311 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1312 {
1313     int errno = 0;
1314     char* ret_ptr = 0;
1315     if (size < 2) {
1316         errno = ERANGE;
1317         goto done;
1318     }
1319
1320     size_t len = 0;
1321
1322     if (!__current->cwd) {
1323         *buf = VFS_PATH_DELIM;
1324         len = 1;
1325     } else {
1326         len = vfs_get_path(__current->cwd, buf, size, 0);
1327         if (len == size) {
1328             errno = ERANGE;
1329             goto done;
1330         }
1331     }
1332
1333     buf[len] = '\0';
1334
1335     ret_ptr = buf;
1336
1337 done:
1338     __current->k_status = errno;
1339     return ret_ptr;
1340 }
1341
1342 int
1343 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1344 {
1345     int errno = 0;
1346     if (current->inode->id == target->inode->id) {
1347         // hard link
1348         return 0;
1349     }
1350
1351     if (errno = vfs_check_writable(current)) {
1352         return errno;
1353     }
1354
1355     if (current->ref_count > 1 || target->ref_count > 1) {
1356         return EBUSY;
1357     }
1358
1359     if (current->super_block != target->super_block) {
1360         return EXDEV;
1361     }
1362
1363     struct v_dnode* oldparent = current->parent;
1364     struct v_dnode* newparent = target->parent;
1365
1366     lock_dnode(current);
1367     lock_dnode(target);
1368     if (oldparent)
1369         lock_dnode(oldparent);
1370     if (newparent)
1371         lock_dnode(newparent);
1372
1373     if (!llist_empty(&target->children)) {
1374         errno = ENOTEMPTY;
1375         unlock_dnode(target);
1376         goto cleanup;
1377     }
1378
1379     if ((errno =
1380            current->inode->ops->rename(current->inode, current, target))) {
1381         unlock_dnode(target);
1382         goto cleanup;
1383     }
1384
1385     // re-position current
1386     hstrcpy(&current->name, &target->name);
1387     vfs_dcache_rehash(newparent, current);
1388
1389     // detach target
1390     vfs_d_free(target);
1391
1392     unlock_dnode(target);
1393
1394 cleanup:
1395     unlock_dnode(current);
1396     if (oldparent)
1397         unlock_dnode(oldparent);
1398     if (newparent)
1399         unlock_dnode(newparent);
1400
1401     return errno;
1402 }
1403
1404 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1405 {
1406     struct v_dnode *cur, *target_parent, *target;
1407     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1408     int errno = 0;
1409
1410     if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1411         goto done;
1412     }
1413
1414     if ((errno = vfs_walk(
1415            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1416         goto done;
1417     }
1418
1419     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1420     if (errno == ENOENT) {
1421         target = vfs_d_alloc(target_parent, &name);
1422         vfs_dcache_add(target_parent, target);
1423     } else if (errno) {
1424         goto done;
1425     }
1426
1427     if (!target) {
1428         errno = ENOMEM;
1429         goto done;
1430     }
1431
1432     errno = vfs_do_rename(cur, target);
1433
1434 done:
1435     vfree(name.value);
1436     return DO_STATUS(errno);
1437 }