lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
   4  * @brief Lunaix virtual file system - an abstraction layer for all file system.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
  14 /*
  15  TODO vfs & device todos checklist
  16
  17     It is overseen by Twilight Sparkle ;)
  18
  19  1. Get inodes hooked into lru (CHECKED)
  20  2. Get dnodes hooked into lru (CHECKED)
 3. Get inodes properly hashed so they can be reused by underlying fs (CHECKED)
  22  4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
  23         [good idea] or a constructor/destructor pattern in cake allocator ?
 5. (mount) Figure out a way to identify a busy mount point before unmount
            maybe a unified mount_point structure that maintains a reference
            counter on any dnodes within the subtree? Such a counter will only
            increment if a file is opened or a dnode is being used as working
            directory, and decrement conversely. (CHECKED)
  29  6. (mount) Ability to track all mount points (including sub-mounts)
  30             so we can be confident to clean up everything when we
  31             unmount. (CHECKED)
  32  7. (mount) Figure out a way to acquire the device represented by a dnode.
  33             so it can be used to mount. (e.g. we wish to get `struct device*`
  34             out of the dnode at /dev/sda)
  35             [tip] we should pay attention at twifs and add a private_data field
  36             under struct v_dnode? (CHECKED)
  37  8. (mount) Then, we should refactor on mount/unmount mechanism. (CHECKED)
  38  9. (mount) (future) Ability to mount any thing? e.g. Linux can mount a disk
  39                     image file using a so called "loopback" pseudo device. Maybe
  40                     we can do similar thing in Lunaix? A block device emulation
  41                     above the regular file when we mount it on.
  42  10. (device) device number (dev_t) allocation
  43             [good idea] <class>:<subclass>:<uniq_id> composition
  44 */
  45
  46 #include <klibc/string.h>
  47 #include <lunaix/dirent.h>
  48 #include <lunaix/foptions.h>
  49 #include <lunaix/fs.h>
  50 #include <lunaix/mm/cake.h>
  51 #include <lunaix/mm/page.h>
  52 #include <lunaix/mm/valloc.h>
  53 #include <lunaix/process.h>
  54 #include <lunaix/spike.h>
  55 #include <lunaix/syscall.h>
  56
  57 #include <lunaix/fs/twifs.h>
  58
// Dedicated cake piles, one per VFS object type; created in vfs_init().
static struct cake_pile* dnode_pile;
static struct cake_pile* inode_pile;
static struct cake_pile* file_pile;
static struct cake_pile* superblock_pile;
static struct cake_pile* fd_pile;

// Root dnode of the whole tree; vfs_init() makes it its own parent.
struct v_dnode* vfs_sysroot;
// Global dnode hash table (VFS_HASHTABLE_SIZE buckets), see __dcache_hash().
static struct hbucket* dnode_cache;

// LRU zones whose eviction callbacks are __vfs_try_evict_dnode/_inode.
struct lru_zone *dnode_lru, *inode_lru;

// Well-known names; ".." and "." get their hashes computed in vfs_init().
struct hstr vfs_ddot = HSTR("..", 2);
struct hstr vfs_dot = HSTR(".", 1);
struct hstr vfs_empty = HSTR("", 0);

// Forward declarations for functions defined later in this file.
struct v_superblock*
vfs_sb_alloc();

void
vfs_sb_free(struct v_superblock* sb);

static int
__vfs_try_evict_dnode(struct lru_node* obj);

static int
__vfs_try_evict_inode(struct lru_node* obj);
  85
  86 void
  87 vfs_init()
  88 {
  89     // 为他们专门创建一个蛋糕堆，而不使用valloc，这样我们可以最小化内碎片的产生
  90     dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
  91     inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
  92     file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
  93     fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
  94     superblock_pile =
  95       cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
  96
  97     dnode_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
  98
  99     dnode_lru = lru_new_zone(__vfs_try_evict_dnode);
 100     inode_lru = lru_new_zone(__vfs_try_evict_inode);
 101
 102     hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
 103     hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
 104
 105     // 创建一个根dnode。
 106     vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
 107     vfs_sysroot->parent = vfs_sysroot;
 108     atomic_fetch_add(&vfs_sysroot->ref_count, 1);
 109 }
 110
 111 inline struct hbucket*
 112 __dcache_hash(struct v_dnode* parent, u32_t* hash)
 113 {
 114     u32_t _hash = *hash;
 115     // 确保低位更加随机
 116     _hash = _hash ^ (_hash >> VFS_HASHBITS);
 117     // 与parent的指针值做加法，来减小碰撞的可能性。
 118     _hash += (u32_t)parent;
 119     *hash = _hash;
 120     return &dnode_cache[_hash & VFS_HASH_MASK];
 121 }
 122
 123 struct v_dnode*
 124 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 125 {
 126     if (!str->len || HSTR_EQ(str, &vfs_dot))
 127         return parent;
 128
 129     if (HSTR_EQ(str, &vfs_ddot)) {
 130         return parent->parent;
 131     }
 132
 133     u32_t hash = str->hash;
 134     struct hbucket* slot = __dcache_hash(parent, &hash);
 135
 136     struct v_dnode *pos, *n;
 137     hashtable_bucket_foreach(slot, pos, n, hash_list)
 138     {
 139         if (pos->name.hash == hash) {
 140             return pos;
 141         }
 142     }
 143     return NULL;
 144 }
 145
 146 void
 147 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 148 {
 149     assert(parent);
 150
 151     atomic_fetch_add(&dnode->ref_count, 1);
 152     dnode->parent = parent;
 153     llist_append(&parent->children, &dnode->siblings);
 154
 155     struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash);
 156     hlist_add(&bucket->head, &dnode->hash_list);
 157 }
 158
 159 void
 160 vfs_dcache_remove(struct v_dnode* dnode)
 161 {
 162     assert(dnode);
 163     assert(dnode->ref_count == 1);
 164
 165     llist_delete(&dnode->siblings);
 166     llist_delete(&dnode->aka_list);
 167     hlist_delete(&dnode->hash_list);
 168
 169     dnode->parent = NULL;
 170     atomic_fetch_sub(&dnode->ref_count, 1);
 171 }
 172
 173 void
 174 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
 175 {
 176     assert(new_parent);
 177
 178     hstr_rehash(&dnode->name, HSTR_FULL_HASH);
 179     vfs_dcache_remove(dnode);
 180     vfs_dcache_add(new_parent, dnode);
 181 }
 182
 183 int
 184 vfs_open(struct v_dnode* dnode, struct v_file** file)
 185 {
 186     if (!dnode->inode || !dnode->inode->ops->open) {
 187         return ENOTSUP;
 188     }
 189
 190     struct v_inode* inode = dnode->inode;
 191
 192     lock_inode(inode);
 193
 194     struct v_file* vfile = cake_grab(file_pile);
 195     memset(vfile, 0, sizeof(*vfile));
 196
 197     vfile->dnode = dnode;
 198     vfile->inode = inode;
 199     vfile->ref_count = ATOMIC_VAR_INIT(1);
 200     vfile->ops = inode->default_fops;
 201
 202     if ((inode->itype & VFS_IFFILE) && !inode->pg_cache) {
 203         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 204         pcache_init(pcache);
 205         pcache->master = inode;
 206         inode->pg_cache = pcache;
 207     }
 208
 209     int errno = inode->ops->open(inode, vfile);
 210     if (errno) {
 211         cake_release(file_pile, vfile);
 212     } else {
 213         atomic_fetch_add(&dnode->ref_count, 1);
 214         inode->open_count++;
 215         mnt_mkbusy(dnode->mnt);
 216
 217         *file = vfile;
 218     }
 219
 220     unlock_inode(inode);
 221
 222     return errno;
 223 }
 224
 225 void
 226 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
 227 {
 228     if (assign_to->inode) {
 229         llist_delete(&assign_to->aka_list);
 230         assign_to->inode->link_count--;
 231     }
 232     llist_append(&inode->aka_dnodes, &assign_to->aka_list);
 233     assign_to->inode = inode;
 234     inode->link_count++;
 235 }
 236
 237 int
 238 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 239 {
 240     int errno;
 241
 242     if ((errno = vfs_check_writable(to_link))) {
 243         return errno;
 244     }
 245
 246     lock_inode(to_link->inode);
 247     if (to_link->super_block->root != name->super_block->root) {
 248         errno = EXDEV;
 249     } else if (!to_link->inode->ops->link) {
 250         errno = ENOTSUP;
 251     } else if (!(errno = to_link->inode->ops->link(to_link->inode, name))) {
 252         vfs_assign_inode(name, to_link->inode);
 253     }
 254     unlock_inode(to_link->inode);
 255
 256     return errno;
 257 }
 258
 259 int
 260 vfs_pclose(struct v_file* file, pid_t pid)
 261 {
 262     int errno = 0;
 263     if (file->ref_count > 1) {
 264         atomic_fetch_sub(&file->ref_count, 1);
 265     } else if (!(errno = file->ops->close(file))) {
 266         atomic_fetch_sub(&file->dnode->ref_count, 1);
 267         file->inode->open_count--;
 268
 269         // Prevent dead lock.
 270         // This happened when process is terminated while blocking on read.
 271         // In that case, the process is still holding the inode lock and it will
 272         // never get released.
 273         /*
 274          * The unlocking should also include ownership check.
 275          *
 276          * To see why, consider two process both open the same file both with
 277          * fd=x.
 278          *      Process A: busy on reading x
 279          *      Process B: do nothing with x
 280          * Assuming that, after a very short time, process B get terminated
 281          * while process A is still busy in it's reading business. By this
 282          * design, the inode lock of this file x is get released by B rather
 283          * than A. And this will cause a probable race condition on A if other
 284          * process is writing to this file later after B exit.
 285          */
 286         if (mutex_on_hold(&file->inode->lock)) {
 287             mutex_unlock_for(&file->inode->lock, pid);
 288         }
 289         mnt_chillax(file->dnode->mnt);
 290
 291         pcache_commit_all(file->inode);
 292         cake_release(file_pile, file);
 293     }
 294     return errno;
 295 }
 296
 297 int
 298 vfs_close(struct v_file* file)
 299 {
 300     return vfs_pclose(file, __current->pid);
 301 }
 302
 303 void
 304 vfs_free_fd(struct v_fd* fd)
 305 {
 306     cake_release(fd_pile, fd);
 307 }
 308
 309 int
 310 vfs_fsync(struct v_file* file)
 311 {
 312     int errno;
 313     if ((errno = vfs_check_writable(file->dnode))) {
 314         return errno;
 315     }
 316
 317     lock_inode(file->inode);
 318
 319     pcache_commit_all(file->inode);
 320
 321     errno = ENOTSUP;
 322     if (file->ops->sync) {
 323         errno = file->ops->sync(file);
 324     }
 325
 326     unlock_inode(file->inode);
 327
 328     return errno;
 329 }
 330
 331 int
 332 vfs_alloc_fdslot(int* fd)
 333 {
 334     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 335         if (!__current->fdtable->fds[i]) {
 336             *fd = i;
 337             return 0;
 338         }
 339     }
 340     return EMFILE;
 341 }
 342
 343 struct v_superblock*
 344 vfs_sb_alloc()
 345 {
 346     struct v_superblock* sb = cake_grab(superblock_pile);
 347     memset(sb, 0, sizeof(*sb));
 348     llist_init_head(&sb->sb_list);
 349     sb->i_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 350     return sb;
 351 }
 352
 353 void
 354 vfs_sb_free(struct v_superblock* sb)
 355 {
 356     vfree(sb->i_cache);
 357     cake_release(superblock_pile, sb);
 358 }
 359
 360 static int
 361 __vfs_try_evict_dnode(struct lru_node* obj)
 362 {
 363     struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
 364
 365     if (!dnode->ref_count) {
 366         vfs_d_free(dnode);
 367         return 1;
 368     }
 369     return 0;
 370 }
 371
 372 static int
 373 __vfs_try_evict_inode(struct lru_node* obj)
 374 {
 375     struct v_inode* inode = container_of(obj, struct v_inode, lru);
 376
 377     if (!inode->link_count && !inode->open_count) {
 378         vfs_i_free(inode);
 379         return 1;
 380     }
 381     return 0;
 382 }
 383
 384 struct v_dnode*
 385 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
 386 {
 387     struct v_dnode* dnode = cake_grab(dnode_pile);
 388     if (!dnode) {
 389         lru_evict_half(dnode_lru);
 390
 391         if (!(dnode = cake_grab(dnode_pile))) {
 392             return NULL;
 393         }
 394     }
 395
 396     memset(dnode, 0, sizeof(*dnode));
 397     llist_init_head(&dnode->children);
 398     llist_init_head(&dnode->siblings);
 399     llist_init_head(&dnode->aka_list);
 400     mutex_init(&dnode->lock);
 401
 402     dnode->ref_count = ATOMIC_VAR_INIT(0);
 403     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 404
 405     hstrcpy(&dnode->name, name);
 406
 407     if (parent) {
 408         dnode->super_block = parent->super_block;
 409         dnode->mnt = parent->mnt;
 410     }
 411
 412     lru_use_one(dnode_lru, &dnode->lru);
 413
 414     return dnode;
 415 }
 416
 417 void
 418 vfs_d_free(struct v_dnode* dnode)
 419 {
 420     assert(dnode->ref_count == 1);
 421
 422     if (dnode->inode) {
 423         assert(dnode->inode->link_count > 0);
 424         dnode->inode->link_count--;
 425     }
 426
 427     vfs_dcache_remove(dnode);
 428     // Make sure the children de-referencing their parent.
 429     // With lru presented, the eviction will be propagated over the entire
 430     // detached subtree eventually
 431     struct v_dnode *pos, *n;
 432     llist_for_each(pos, n, &dnode->children, siblings)
 433     {
 434         vfs_dcache_remove(pos);
 435     }
 436
 437     vfree(dnode->name.value);
 438     cake_release(dnode_pile, dnode);
 439 }
 440
 441 struct v_inode*
 442 vfs_i_find(struct v_superblock* sb, u32_t i_id)
 443 {
 444     struct hbucket* slot = &sb->i_cache[i_id & VFS_HASH_MASK];
 445     struct v_inode *pos, *n;
 446     hashtable_bucket_foreach(slot, pos, n, hash_list)
 447     {
 448         if (pos->id == i_id) {
 449             lru_use_one(inode_lru, &pos->lru);
 450             return pos;
 451         }
 452     }
 453
 454     return NULL;
 455 }
 456
 457 void
 458 vfs_i_addhash(struct v_inode* inode)
 459 {
 460     struct hbucket* slot = &inode->sb->i_cache[inode->id & VFS_HASH_MASK];
 461
 462     hlist_delete(&inode->hash_list);
 463     hlist_add(&slot->head, &inode->hash_list);
 464 }
 465
 466 struct v_inode*
 467 vfs_i_alloc(struct v_superblock* sb)
 468 {
 469     assert(sb->ops.init_inode);
 470
 471     struct v_inode* inode;
 472     if (!(inode = cake_grab(inode_pile))) {
 473         lru_evict_half(inode_lru);
 474         if (!(inode = cake_grab(inode_pile))) {
 475             return NULL;
 476         }
 477     }
 478
 479     memset(inode, 0, sizeof(*inode));
 480     mutex_init(&inode->lock);
 481     llist_init_head(&inode->xattrs);
 482     llist_init_head(&inode->aka_dnodes);
 483
 484     sb->ops.init_inode(sb, inode);
 485
 486     inode->sb = sb;
 487     inode->ctime = clock_unixtime();
 488     inode->atime = inode->ctime;
 489     inode->mtime = inode->ctime;
 490
 491 done:
 492     lru_use_one(inode_lru, &inode->lru);
 493     return inode;
 494 }
 495
 496 void
 497 vfs_i_free(struct v_inode* inode)
 498 {
 499     if (inode->pg_cache) {
 500         pcache_release(inode->pg_cache);
 501         vfree(inode->pg_cache);
 502     }
 503     // we don't need to sync inode.
 504     // If an inode can be free, then it must be properly closed.
 505     // Hence it must be synced already!
 506     if (inode->destruct) {
 507         inode->destruct(inode);
 508     }
 509     hlist_delete(&inode->hash_list);
 510     cake_release(inode_pile, inode);
 511 }
 512
 513 /* ---- System call definition and support ---- */
 514
 515 #define FLOCATE_CREATE_EMPTY 1
 516
 517 int
 518 vfs_getfd(int fd, struct v_fd** fd_s)
 519 {
 520     if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) {
 521         return 0;
 522     }
 523     return EBADF;
 524 }
 525
 526 int
 527 __vfs_try_locate_file(const char* path,
 528                       struct v_dnode** fdir,
 529                       struct v_dnode** file,
 530                       int options)
 531 {
 532     char name_str[VFS_NAME_MAXLEN];
 533     struct hstr name = HSTR(name_str, 0);
 534     int errno;
 535
 536     name_str[0] = 0;
 537     if ((errno = vfs_walk_proc(path, fdir, &name, VFS_WALK_PARENT))) {
 538         return errno;
 539     }
 540
 541     errno = vfs_walk(*fdir, name.value, file, NULL, 0);
 542     if (errno != ENOENT || !(options & FLOCATE_CREATE_EMPTY)) {
 543         return errno;
 544     }
 545
 546     struct v_dnode* parent = *fdir;
 547     struct v_dnode* file_new = vfs_d_alloc(parent, &name);
 548
 549     if (!file_new) {
 550         return ENOMEM;
 551     }
 552
 553     lock_dnode(parent);
 554
 555     if (!(errno = parent->inode->ops->create(parent->inode, file_new))) {
 556         vfs_dcache_add(parent, file_new);
 557         *file = file_new;
 558     } else {
 559         vfs_d_free(file_new);
 560     }
 561
 562     unlock_dnode(parent);
 563
 564     return errno;
 565 }
 566
 567 int
 568 vfs_do_open(const char* path, int options)
 569 {
 570     int errno, fd;
 571     struct v_dnode *dentry, *file;
 572     struct v_file* ofile = 0;
 573
 574     errno = __vfs_try_locate_file(
 575       path, &dentry, &file, (options & FO_CREATE) ? FLOCATE_CREATE_EMPTY : 0);
 576
 577     if (errno || (errno = vfs_open(file, &ofile))) {
 578         return errno;
 579     }
 580
 581     struct v_inode* o_inode = ofile->inode;
 582
 583     if (!errno && !(errno = vfs_alloc_fdslot(&fd))) {
 584         struct v_fd* fd_s = cake_grab(fd_pile);
 585         memset(fd_s, 0, sizeof(*fd_s));
 586
 587         ofile->f_pos = ofile->inode->fsize & -((options & FO_APPEND) != 0);
 588         fd_s->file = ofile;
 589         fd_s->flags = options;
 590         __current->fdtable->fds[fd] = fd_s;
 591         return fd;
 592     }
 593
 594     return errno;
 595 }
 596
 597 __DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
 598 {
 599     int errno = vfs_do_open(path, options);
 600     return DO_STATUS_OR_RETURN(errno);
 601 }
 602
 603 __DEFINE_LXSYSCALL1(int, close, int, fd)
 604 {
 605     struct v_fd* fd_s;
 606     int errno = 0;
 607     if ((errno = vfs_getfd(fd, &fd_s))) {
 608         goto done_err;
 609     }
 610
 611     if ((errno = vfs_close(fd_s->file))) {
 612         goto done_err;
 613     }
 614
 615     cake_release(fd_pile, fd_s);
 616     __current->fdtable->fds[fd] = 0;
 617
 618 done_err:
 619     return DO_STATUS(errno);
 620 }
 621
 622 void
 623 __vfs_readdir_callback(struct dir_context* dctx,
 624                        const char* name,
 625                        const int len,
 626                        const int dtype)
 627 {
 628     struct dirent* dent = (struct dirent*)dctx->cb_data;
 629     strncpy(dent->d_name, name, DIRENT_NAME_MAX_LEN);
 630     dent->d_nlen = len;
 631     dent->d_type = dtype;
 632 }
 633
 634 __DEFINE_LXSYSCALL2(int, readdir, int, fd, struct dirent*, dent)
 635 {
 636     struct v_fd* fd_s;
 637     int errno;
 638
 639     if ((errno = vfs_getfd(fd, &fd_s))) {
 640         goto done;
 641     }
 642
 643     struct v_inode* inode = fd_s->file->inode;
 644
 645     lock_inode(inode);
 646
 647     if (!(inode->itype & VFS_IFDIR)) {
 648         errno = ENOTDIR;
 649     } else {
 650         struct dir_context dctx =
 651           (struct dir_context){ .cb_data = dent,
 652                                 .index = dent->d_offset,
 653                                 .read_complete_callback =
 654                                   __vfs_readdir_callback };
 655         errno = 1;
 656         if (dent->d_offset == 0) {
 657             __vfs_readdir_callback(&dctx, vfs_dot.value, vfs_dot.len, DT_DIR);
 658         } else if (dent->d_offset == 1) {
 659             __vfs_readdir_callback(&dctx, vfs_ddot.value, vfs_ddot.len, DT_DIR);
 660         } else {
 661             dctx.index -= 2;
 662             if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
 663                 unlock_inode(inode);
 664                 goto done;
 665             }
 666         }
 667         dent->d_offset++;
 668     }
 669
 670     unlock_inode(inode);
 671
 672 done:
 673     return DO_STATUS_OR_RETURN(errno);
 674 }
 675
 676 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 677 {
 678     int errno = 0;
 679     struct v_fd* fd_s;
 680     if ((errno = vfs_getfd(fd, &fd_s))) {
 681         goto done;
 682     }
 683
 684     struct v_file* file = fd_s->file;
 685     if ((file->inode->itype & VFS_IFDIR)) {
 686         errno = EISDIR;
 687         goto done;
 688     }
 689
 690     lock_inode(file->inode);
 691
 692     file->inode->atime = clock_unixtime();
 693
 694     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 695         errno = file->ops->read(file->inode, buf, count, file->f_pos);
 696     } else {
 697         errno = pcache_read(file->inode, buf, count, file->f_pos);
 698     }
 699
 700     if (errno > 0) {
 701         file->f_pos += errno;
 702         unlock_inode(file->inode);
 703         return errno;
 704     }
 705
 706     unlock_inode(file->inode);
 707
 708 done:
 709     return DO_STATUS(errno);
 710 }
 711
 712 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
 713 {
 714     int errno = 0;
 715     struct v_fd* fd_s;
 716     if ((errno = vfs_getfd(fd, &fd_s))) {
 717         goto done;
 718     }
 719
 720     struct v_file* file = fd_s->file;
 721
 722     if ((errno = vfs_check_writable(file->dnode))) {
 723         goto done;
 724     }
 725
 726     if ((file->inode->itype & VFS_IFDIR)) {
 727         errno = EISDIR;
 728         goto done;
 729     }
 730
 731     lock_inode(file->inode);
 732
 733     file->inode->mtime = clock_unixtime();
 734
 735     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 736         errno = file->ops->write(file->inode, buf, count, file->f_pos);
 737     } else {
 738         errno = pcache_write(file->inode, buf, count, file->f_pos);
 739     }
 740
 741     if (errno > 0) {
 742         file->f_pos += errno;
 743         unlock_inode(file->inode);
 744         return errno;
 745     }
 746
 747     unlock_inode(file->inode);
 748
 749 done:
 750     return DO_STATUS(errno);
 751 }
 752
 753 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
 754 {
 755     int errno = 0;
 756     struct v_fd* fd_s;
 757     if ((errno = vfs_getfd(fd, &fd_s))) {
 758         goto done;
 759     }
 760
 761     struct v_file* file = fd_s->file;
 762
 763     if (!file->ops->seek) {
 764         errno = ENOTSUP;
 765         goto done;
 766     }
 767
 768     lock_inode(file->inode);
 769
 770     int overflow = 0;
 771     int fpos = file->f_pos;
 772     switch (options) {
 773         case FSEEK_CUR:
 774             overflow = __builtin_sadd_overflow((int)file->f_pos, offset, &fpos);
 775             break;
 776         case FSEEK_END:
 777             overflow =
 778               __builtin_sadd_overflow((int)file->inode->fsize, offset, &fpos);
 779             break;
 780         case FSEEK_SET:
 781             fpos = offset;
 782             break;
 783     }
 784     if (overflow) {
 785         errno = EOVERFLOW;
 786     } else if (!(errno = file->ops->seek(file->inode, fpos))) {
 787         file->f_pos = fpos;
 788     }
 789
 790     unlock_inode(file->inode);
 791
 792 done:
 793     return DO_STATUS(errno);
 794 }
 795
 796 int
 797 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
 798 {
 799     if (!dnode) {
 800         return 0;
 801     }
 802
 803     if (depth > 64) {
 804         return ENAMETOOLONG;
 805     }
 806
 807     size_t len = 0;
 808
 809     if (dnode->parent != dnode) {
 810         len = vfs_get_path(dnode->parent, buf, size, depth + 1);
 811     }
 812
 813     if (len >= size) {
 814         return len;
 815     }
 816
 817     if (!len || buf[len - 1] != VFS_PATH_DELIM) {
 818         buf[len++] = VFS_PATH_DELIM;
 819     }
 820
 821     size_t cpy_size = MIN(dnode->name.len, size - len);
 822     strncpy(buf + len, dnode->name.value, cpy_size);
 823     len += cpy_size;
 824
 825     return len;
 826 }
 827
 828 int
 829 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
 830 {
 831     const char* link;
 832     struct v_inode* inode = dnode->inode;
 833     if (inode->ops->read_symlink) {
 834         lock_inode(inode);
 835
 836         int errno = inode->ops->read_symlink(inode, &link);
 837         strncpy(buf, link, size);
 838
 839         unlock_inode(inode);
 840         return errno;
 841     }
 842     return 0;
 843 }
 844
 845 int
 846 vfs_get_dtype(int itype)
 847 {
 848     switch (itype) {
 849         case VFS_IFDIR:
 850             return DT_DIR;
 851         case VFS_IFSYMLINK:
 852             return DT_SYMLINK;
 853         default:
 854             return DT_PIPE;
 855     }
 856 }
 857
 858 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
 859 {
 860     int errno;
 861     struct v_fd* fd_s;
 862     if ((errno = vfs_getfd(fd, &fd_s))) {
 863         goto done;
 864     }
 865
 866     struct v_dnode* dnode;
 867     errno = vfs_get_path(fd_s->file->dnode, buf, size, 0);
 868
 869     if (errno >= 0) {
 870         return errno;
 871     }
 872
 873 done:
 874     return DO_STATUS(errno);
 875 }
 876
 877 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
 878 {
 879     int errno;
 880     struct v_dnode* dnode;
 881     if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 882         errno = vfs_readlink(dnode, buf, size);
 883     }
 884
 885     if (errno >= 0) {
 886         return errno;
 887     }
 888
 889     return DO_STATUS(errno);
 890 }
 891
 892 __DEFINE_LXSYSCALL4(int,
 893                     readlinkat,
 894                     int,
 895                     dirfd,
 896                     const char*,
 897                     pathname,
 898                     char*,
 899                     buf,
 900                     size_t,
 901                     size)
 902 {
 903     int errno;
 904     struct v_fd* fd_s;
 905     if ((errno = vfs_getfd(dirfd, &fd_s))) {
 906         goto done;
 907     }
 908
 909     struct v_dnode* dnode;
 910     if (!(errno = vfs_walk(
 911             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 912         errno = vfs_readlink(fd_s->file->dnode, buf, size);
 913     }
 914
 915     if (errno >= 0) {
 916         return errno;
 917     }
 918
 919 done:
 920     return DO_STATUS(errno);
 921 }
 922
/*
    NOTE
    When we perform an operation that could affect the layout of a
    directory (i.e., rename, mkdir, rmdir), we must lock the parent dir
    whenever possible. This blocks any ongoing path walk from reaching
    it and hence avoids exposing any partial state.
*/
 930
 931 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
 932 {
 933     int errno;
 934     struct v_dnode* dnode;
 935     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
 936         return DO_STATUS(errno);
 937     }
 938
 939     lock_dnode(dnode);
 940
 941     if ((errno = vfs_check_writable(dnode))) {
 942         goto done;
 943     }
 944
 945     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
 946         errno = EROFS;
 947         goto done;
 948     }
 949
 950     if (dnode->ref_count > 1 || dnode->inode->open_count) {
 951         errno = EBUSY;
 952         goto done;
 953     }
 954
 955     if (!llist_empty(&dnode->children)) {
 956         errno = ENOTEMPTY;
 957         goto done;
 958     }
 959
 960     struct v_dnode* parent = dnode->parent;
 961
 962     if (!parent) {
 963         errno = EINVAL;
 964         goto done;
 965     }
 966
 967     lock_dnode(parent);
 968     lock_inode(parent->inode);
 969
 970     if ((dnode->inode->itype & VFS_IFDIR)) {
 971         errno = parent->inode->ops->rmdir(parent->inode, dnode);
 972         if (!errno) {
 973             vfs_dcache_remove(dnode);
 974         }
 975     } else {
 976         errno = ENOTDIR;
 977     }
 978
 979     unlock_inode(parent->inode);
 980     unlock_dnode(parent);
 981
 982 done:
 983     unlock_dnode(dnode);
 984     return DO_STATUS(errno);
 985 }
 986
 987 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
 988 {
 989     int errno = 0;
 990     struct v_dnode *parent, *dir;
 991     char name_value[VFS_NAME_MAXLEN];
 992     struct hstr name = HHSTR(name_value, 0, 0);
 993
 994     if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
 995         goto done;
 996     }
 997
 998     if ((errno = vfs_check_writable(parent))) {
 999         goto done;
1000     }
1001
1002     if (!(dir = vfs_d_alloc(parent, &name))) {
1003         errno = ENOMEM;
1004         goto done;
1005     }
1006
1007     lock_dnode(parent);
1008     lock_inode(parent->inode);
1009
1010     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
1011         errno = ENOTSUP;
1012     } else if (!parent->inode->ops->mkdir) {
1013         errno = ENOTSUP;
1014     } else if (!(parent->inode->itype & VFS_IFDIR)) {
1015         errno = ENOTDIR;
1016     } else if (!(errno = parent->inode->ops->mkdir(parent->inode, dir))) {
1017         vfs_dcache_add(parent, dir);
1018         goto cleanup;
1019     }
1020
1021     vfs_d_free(dir);
1022
1023 cleanup:
1024     unlock_inode(parent->inode);
1025     unlock_dnode(parent);
1026 done:
1027     return DO_STATUS(errno);
1028 }
1029
1030 int
1031 __vfs_do_unlink(struct v_dnode* dnode)
1032 {
1033     int errno;
1034     struct v_inode* inode = dnode->inode;
1035
1036     if (dnode->ref_count > 1) {
1037         return EBUSY;
1038     }
1039
1040     if ((errno = vfs_check_writable(dnode))) {
1041         return errno;
1042     }
1043
1044     lock_inode(inode);
1045
1046     if (inode->open_count) {
1047         errno = EBUSY;
1048     } else if (!(inode->itype & VFS_IFDIR)) {
1049         // The underlying unlink implementation should handle
1050         //  symlink case
1051         errno = inode->ops->unlink(inode);
1052         if (!errno) {
1053             vfs_d_free(dnode);
1054         }
1055     } else {
1056         errno = EISDIR;
1057     }
1058
1059     unlock_inode(inode);
1060
1061     return errno;
1062 }
1063
1064 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1065 {
1066     int errno;
1067     struct v_dnode* dnode;
1068     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1069         goto done;
1070     }
1071
1072     errno = __vfs_do_unlink(dnode);
1073
1074 done:
1075     return DO_STATUS(errno);
1076 }
1077
1078 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1079 {
1080     int errno;
1081     struct v_fd* fd_s;
1082     if ((errno = vfs_getfd(fd, &fd_s))) {
1083         goto done;
1084     }
1085
1086     struct v_dnode* dnode;
1087     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1088         errno = __vfs_do_unlink(dnode);
1089     }
1090
1091 done:
1092     return DO_STATUS(errno);
1093 }
1094
1095 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1096 {
1097     int errno;
1098     struct v_dnode *dentry, *to_link, *name_dentry, *name_file;
1099
1100     errno = __vfs_try_locate_file(oldpath, &dentry, &to_link, 0);
1101     if (!errno) {
1102         errno = __vfs_try_locate_file(
1103           newpath, &name_dentry, &name_file, FLOCATE_CREATE_EMPTY);
1104         if (!errno) {
1105             errno = EEXIST;
1106         } else if (name_file) {
1107             errno = vfs_link(to_link, name_file);
1108         }
1109     }
1110     return DO_STATUS(errno);
1111 }
1112
1113 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1114 {
1115     int errno;
1116     struct v_fd* fd_s;
1117
1118     if (!(errno = vfs_getfd(fildes, &fd_s))) {
1119         errno = vfs_fsync(fd_s->file);
1120     }
1121
1122     return DO_STATUS(errno);
1123 }
1124
1125 int
1126 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1127 {
1128     int errno = 0;
1129     struct v_fd* copied = cake_grab(fd_pile);
1130
1131     memcpy(copied, old, sizeof(struct v_fd));
1132
1133     atomic_fetch_add(&old->file->ref_count, 1);
1134
1135     *new = copied;
1136
1137     return errno;
1138 }
1139
1140 int
1141 vfs_dup2(int oldfd, int newfd)
1142 {
1143     if (newfd == oldfd) {
1144         return newfd;
1145     }
1146
1147     int errno;
1148     struct v_fd *oldfd_s, *newfd_s;
1149     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1150         goto done;
1151     }
1152
1153     if (!TEST_FD(newfd)) {
1154         errno = EBADF;
1155         goto done;
1156     }
1157
1158     newfd_s = __current->fdtable->fds[newfd];
1159     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1160         goto done;
1161     }
1162
1163     if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1164         __current->fdtable->fds[newfd] = newfd_s;
1165         return newfd;
1166     }
1167
1168 done:
1169     return DO_STATUS(errno);
1170 }
1171
1172 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1173 {
1174     return vfs_dup2(oldfd, newfd);
1175 }
1176
1177 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1178 {
1179     int errno, newfd;
1180     struct v_fd *oldfd_s, *newfd_s;
1181     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1182         goto done;
1183     }
1184
1185     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1186         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1187         __current->fdtable->fds[newfd] = newfd_s;
1188         return newfd;
1189     }
1190
1191 done:
1192     return DO_STATUS(errno);
1193 }
1194
1195 __DEFINE_LXSYSCALL2(int,
1196                     symlink,
1197                     const char*,
1198                     pathname,
1199                     const char*,
1200                     link_target)
1201 {
1202     int errno;
1203     struct v_dnode* dnode;
1204     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1205         goto done;
1206     }
1207
1208     if (errno = vfs_check_writable(dnode)) {
1209         goto done;
1210     }
1211
1212     if (!dnode->inode->ops->set_symlink) {
1213         errno = ENOTSUP;
1214         goto done;
1215     }
1216
1217     lock_inode(dnode->inode);
1218
1219     errno = dnode->inode->ops->set_symlink(dnode->inode, link_target);
1220
1221     unlock_inode(dnode->inode);
1222
1223 done:
1224     return DO_STATUS(errno);
1225 }
1226
/**
 * Take a reference on `dnode`: atomically add one to its ref_count, then
 * call mnt_mkbusy() on the mount recorded in dnode->mnt.
 *
 * Counterpart of vfs_unref_dnode().
 */
void
vfs_ref_dnode(struct v_dnode* dnode)
{
    atomic_fetch_add(&dnode->ref_count, 1);
    // NOTE(review): presumably marks the owning mount point as in use so it
    // cannot be unmounted while referenced -- confirm in mnt_mkbusy.
    mnt_mkbusy(dnode->mnt);
}
1233
/**
 * Drop a reference on `dnode`: atomically subtract one from its ref_count,
 * then call mnt_chillax() on the mount recorded in dnode->mnt.
 *
 * Counterpart of vfs_ref_dnode(). Nothing is freed here; only the counter
 * is adjusted.
 */
void
vfs_unref_dnode(struct v_dnode* dnode)
{
    atomic_fetch_sub(&dnode->ref_count, 1);
    // NOTE(review): presumably undoes one mnt_mkbusy() on the owning mount
    // -- confirm in mnt_chillax.
    mnt_chillax(dnode->mnt);
}
1240
1241 int
1242 vfs_do_chdir(struct proc_info* proc, struct v_dnode* dnode)
1243 {
1244     int errno = 0;
1245
1246     lock_dnode(dnode);
1247
1248     if (!(dnode->inode->itype & VFS_IFDIR)) {
1249         errno = ENOTDIR;
1250         goto done;
1251     }
1252
1253     if (proc->cwd) {
1254         vfs_unref_dnode(proc->cwd);
1255     }
1256
1257     vfs_ref_dnode(dnode);
1258     proc->cwd = dnode;
1259
1260     unlock_dnode(dnode);
1261
1262 done:
1263     return errno;
1264 }
1265
1266 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1267 {
1268     struct v_dnode* dnode;
1269     int errno = 0;
1270
1271     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1272         goto done;
1273     }
1274
1275     errno = vfs_do_chdir(__current, dnode);
1276
1277 done:
1278     return DO_STATUS(errno);
1279 }
1280
1281 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1282 {
1283     struct v_fd* fd_s;
1284     int errno = 0;
1285
1286     if ((errno = vfs_getfd(fd, &fd_s))) {
1287         goto done;
1288     }
1289
1290     errno = vfs_do_chdir(__current, fd_s->file->dnode);
1291
1292 done:
1293     return DO_STATUS(errno);
1294 }
1295
1296 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1297 {
1298     int errno = 0;
1299     char* ret_ptr = 0;
1300     if (size < 2) {
1301         errno = ERANGE;
1302         goto done;
1303     }
1304
1305     size_t len = 0;
1306
1307     if (!__current->cwd) {
1308         *buf = VFS_PATH_DELIM;
1309         len = 1;
1310     } else {
1311         len = vfs_get_path(__current->cwd, buf, size, 0);
1312         if (len == size) {
1313             errno = ERANGE;
1314             goto done;
1315         }
1316     }
1317
1318     buf[len + 1] = '\0';
1319
1320     ret_ptr = buf;
1321
1322 done:
1323     __current->k_status = errno;
1324     return ret_ptr;
1325 }
1326
1327 int
1328 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1329 {
1330     int errno = 0;
1331     if (current->inode->id == target->inode->id) {
1332         // hard link
1333         return 0;
1334     }
1335
1336     if (errno = vfs_check_writable(current)) {
1337         return errno;
1338     }
1339
1340     if (current->ref_count > 1 || target->ref_count > 1) {
1341         return EBUSY;
1342     }
1343
1344     if (current->super_block != target->super_block) {
1345         return EXDEV;
1346     }
1347
1348     struct v_dnode* oldparent = current->parent;
1349     struct v_dnode* newparent = target->parent;
1350
1351     lock_dnode(current);
1352     lock_dnode(target);
1353     if (oldparent)
1354         lock_dnode(oldparent);
1355     if (newparent)
1356         lock_dnode(newparent);
1357
1358     if (!llist_empty(&target->children)) {
1359         errno = ENOTEMPTY;
1360         unlock_dnode(target);
1361         goto cleanup;
1362     }
1363
1364     if ((errno =
1365            current->inode->ops->rename(current->inode, current, target))) {
1366         unlock_dnode(target);
1367         goto cleanup;
1368     }
1369
1370     // re-position current
1371     hstrcpy(&current->name, &target->name);
1372     vfs_dcache_rehash(newparent, current);
1373
1374     // detach target
1375     vfs_d_free(target);
1376
1377     unlock_dnode(target);
1378
1379 cleanup:
1380     unlock_dnode(current);
1381     if (oldparent)
1382         unlock_dnode(oldparent);
1383     if (newparent)
1384         unlock_dnode(newparent);
1385
1386     return errno;
1387 }
1388
1389 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1390 {
1391     struct v_dnode *cur, *target_parent, *target;
1392     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1393     int errno = 0;
1394
1395     if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1396         goto done;
1397     }
1398
1399     if ((errno = vfs_walk(
1400            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1401         goto done;
1402     }
1403
1404     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1405     if (errno == ENOENT) {
1406         target = vfs_d_alloc(target_parent, &name);
1407         vfs_dcache_add(target_parent, target);
1408     } else if (errno) {
1409         goto done;
1410     }
1411
1412     if (!target) {
1413         errno = ENOMEM;
1414         goto done;
1415     }
1416
1417     errno = vfs_do_rename(cur, target);
1418
1419 done:
1420     vfree(name.value);
1421     return DO_STATUS(errno);
1422 }