lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
   4  * @brief Lunaix virtual file system - an abstraction layer for all file system.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
  14 /*
  15  TODO vfs & device todos checklist
  16
  17     It is overseen by Twilight Sparkle ;)
  18
  19  1. Get inodes hooked into lru (CHECKED)
  20  2. Get dnodes hooked into lru (CHECKED)
 3. Get inodes properly hashed so they can be reused by underlying fs (CHECKED)
  22  4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
  23         [good idea] or a constructor/destructor pattern in cake allocator ?
 5. (mount) Figure out a way to identify a busy mount point before unmount
            maybe a unified mount_point structure that maintains a reference
            counter on any dnodes within the subtree? Such a counter will only
            increment if a file is opened or a dnode is being used as working
            directory, and decrement conversely. (CHECKED)
  29  6. (mount) Ability to track all mount points (including sub-mounts)
  30             so we can be confident to clean up everything when we
  31             unmount. (CHECKED)
  32  7. (mount) Figure out a way to acquire the device represented by a dnode.
  33             so it can be used to mount. (e.g. we wish to get `struct device*`
  34             out of the dnode at /dev/sda)
  35             [tip] we should pay attention at twifs and add a private_data field
  36             under struct v_dnode? (CHECKED)
  37  8. (mount) Then, we should refactor on mount/unmount mechanism. (CHECKED)
  38  9. (mount) (future) Ability to mount any thing? e.g. Linux can mount a disk
  39                     image file using a so called "loopback" pseudo device. Maybe
  40                     we can do similar thing in Lunaix? A block device emulation
  41                     above the regular file when we mount it on.
  42  10. (device) device number (dev_t) allocation
  43             [good idea] <class>:<subclass>:<uniq_id> composition
  44 */
  45
  46 #include <klibc/string.h>
  47 #include <lunaix/dirent.h>
  48 #include <lunaix/foptions.h>
  49 #include <lunaix/fs.h>
  50 #include <lunaix/mm/cake.h>
  51 #include <lunaix/mm/page.h>
  52 #include <lunaix/mm/valloc.h>
  53 #include <lunaix/process.h>
  54 #include <lunaix/spike.h>
  55 #include <lunaix/syscall.h>
  56 #include <lunaix/syscall_utils.h>
  57
  58 #include <lunaix/fs/twifs.h>
  59
// Dedicated cake piles, one per VFS object type. All of them are created in
// vfs_init().
static struct cake_pile* dnode_pile;
static struct cake_pile* inode_pile;
static struct cake_pile* file_pile;
static struct cake_pile* superblock_pile;
static struct cake_pile* fd_pile;

// Root dnode of the VFS tree; vfs_init() makes it its own parent.
struct v_dnode* vfs_sysroot;
// Bucket array of the dnode cache (VFS_HASHTABLE_SIZE buckets).
static struct hbucket* dnode_cache;

// LRU zones used to evict dnodes / inodes when their pile runs dry.
struct lru_zone *dnode_lru, *inode_lru;

// Well-known path components: "..", "." and the empty name.
struct hstr vfs_ddot = HSTR("..", 2);
struct hstr vfs_dot = HSTR(".", 1);
struct hstr vfs_empty = HSTR("", 0);

// Forward declarations of the superblock helpers defined later in this file.
struct v_superblock*
vfs_sb_alloc();

void
vfs_sb_free(struct v_superblock* sb);

// Eviction callbacks registered with the LRU zones in vfs_init().
static int
__vfs_try_evict_dnode(struct lru_node* obj);

static int
__vfs_try_evict_inode(struct lru_node* obj);
  86
  87 void
  88 vfs_init()
  89 {
  90     // 为他们专门创建一个蛋糕堆，而不使用valloc，这样我们可以最小化内碎片的产生
  91     dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
  92     inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
  93     file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
  94     fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
  95     superblock_pile =
  96       cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
  97
  98     dnode_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
  99
 100     dnode_lru = lru_new_zone(__vfs_try_evict_dnode);
 101     inode_lru = lru_new_zone(__vfs_try_evict_inode);
 102
 103     hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
 104     hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
 105
 106     // 创建一个根dnode。
 107     vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
 108     vfs_sysroot->parent = vfs_sysroot;
 109     atomic_fetch_add(&vfs_sysroot->ref_count, 1);
 110 }
 111
 112 inline struct hbucket*
 113 __dcache_hash(struct v_dnode* parent, u32_t* hash)
 114 {
 115     u32_t _hash = *hash;
 116     // 确保低位更加随机
 117     _hash = _hash ^ (_hash >> VFS_HASHBITS);
 118     // 与parent的指针值做加法，来减小碰撞的可能性。
 119     _hash += (u32_t)parent;
 120     *hash = _hash;
 121     return &dnode_cache[_hash & VFS_HASH_MASK];
 122 }
 123
 124 struct v_dnode*
 125 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 126 {
 127     if (!str->len || HSTR_EQ(str, &vfs_dot))
 128         return parent;
 129
 130     if (HSTR_EQ(str, &vfs_ddot)) {
 131         return parent->parent;
 132     }
 133
 134     u32_t hash = str->hash;
 135     struct hbucket* slot = __dcache_hash(parent, &hash);
 136
 137     struct v_dnode *pos, *n;
 138     hashtable_bucket_foreach(slot, pos, n, hash_list)
 139     {
 140         if (pos->name.hash == hash) {
 141             return pos;
 142         }
 143     }
 144     return NULL;
 145 }
 146
 147 void
 148 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 149 {
 150     assert(parent);
 151
 152     atomic_fetch_add(&dnode->ref_count, 1);
 153     dnode->parent = parent;
 154     llist_append(&parent->children, &dnode->siblings);
 155
 156     struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash);
 157     hlist_add(&bucket->head, &dnode->hash_list);
 158 }
 159
 160 void
 161 vfs_dcache_remove(struct v_dnode* dnode)
 162 {
 163     assert(dnode);
 164     assert(dnode->ref_count == 1);
 165
 166     llist_delete(&dnode->siblings);
 167     llist_delete(&dnode->aka_list);
 168     hlist_delete(&dnode->hash_list);
 169
 170     dnode->parent = NULL;
 171     atomic_fetch_sub(&dnode->ref_count, 1);
 172 }
 173
 174 void
 175 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
 176 {
 177     assert(new_parent);
 178
 179     hstr_rehash(&dnode->name, HSTR_FULL_HASH);
 180     vfs_dcache_remove(dnode);
 181     vfs_dcache_add(new_parent, dnode);
 182 }
 183
 184 int
 185 vfs_open(struct v_dnode* dnode, struct v_file** file)
 186 {
 187     if (!dnode->inode || !dnode->inode->ops->open) {
 188         return ENOTSUP;
 189     }
 190
 191     struct v_inode* inode = dnode->inode;
 192
 193     lock_inode(inode);
 194
 195     struct v_file* vfile = cake_grab(file_pile);
 196     memset(vfile, 0, sizeof(*vfile));
 197
 198     vfile->dnode = dnode;
 199     vfile->inode = inode;
 200     vfile->ref_count = ATOMIC_VAR_INIT(1);
 201     vfile->ops = inode->default_fops;
 202
 203     if ((inode->itype & VFS_IFFILE) && !inode->pg_cache) {
 204         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 205         pcache_init(pcache);
 206         pcache->master = inode;
 207         inode->pg_cache = pcache;
 208     }
 209
 210     int errno = inode->ops->open(inode, vfile);
 211     if (errno) {
 212         cake_release(file_pile, vfile);
 213     } else {
 214         atomic_fetch_add(&dnode->ref_count, 1);
 215         inode->open_count++;
 216         mnt_mkbusy(dnode->mnt);
 217
 218         *file = vfile;
 219     }
 220
 221     unlock_inode(inode);
 222
 223     return errno;
 224 }
 225
 226 void
 227 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
 228 {
 229     if (assign_to->inode) {
 230         llist_delete(&assign_to->aka_list);
 231         assign_to->inode->link_count--;
 232     }
 233     llist_append(&inode->aka_dnodes, &assign_to->aka_list);
 234     assign_to->inode = inode;
 235     inode->link_count++;
 236 }
 237
 238 int
 239 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 240 {
 241     int errno;
 242
 243     if ((errno = vfs_check_writable(to_link))) {
 244         return errno;
 245     }
 246
 247     lock_inode(to_link->inode);
 248     if (to_link->super_block->root != name->super_block->root) {
 249         errno = EXDEV;
 250     } else if (!to_link->inode->ops->link) {
 251         errno = ENOTSUP;
 252     } else if (!(errno = to_link->inode->ops->link(to_link->inode, name))) {
 253         vfs_assign_inode(name, to_link->inode);
 254     }
 255     unlock_inode(to_link->inode);
 256
 257     return errno;
 258 }
 259
 260 int
 261 vfs_pclose(struct v_file* file, pid_t pid)
 262 {
 263     int errno = 0;
 264     if (file->ref_count > 1) {
 265         atomic_fetch_sub(&file->ref_count, 1);
 266     } else if (!(errno = file->ops->close(file))) {
 267         atomic_fetch_sub(&file->dnode->ref_count, 1);
 268         file->inode->open_count--;
 269
 270         /*
 271          * Prevent dead lock.
 272          * This happened when process is terminated while blocking on read.
 273          * In that case, the process is still holding the inode lock and it
 274              will never get released.
 275          * The unlocking should also include ownership check.
 276          *
 277          * To see why, consider two process both open the same file both with
 278          * fd=x.
 279          *      Process A: busy on reading x
 280          *      Process B: do nothing with x
 281          * Assuming that, after a very short time, process B get terminated
 282          * while process A is still busy in it's reading business. By this
 283          * design, the inode lock of this file x is get released by B rather
 284          * than A. And this will cause a probable race condition on A if other
 285          * process is writing to this file later after B exit.
 286          */
 287         if (mutex_on_hold(&file->inode->lock)) {
 288             mutex_unlock_for(&file->inode->lock, pid);
 289         }
 290         mnt_chillax(file->dnode->mnt);
 291
 292         pcache_commit_all(file->inode);
 293         cake_release(file_pile, file);
 294     }
 295     return errno;
 296 }
 297
 298 int
 299 vfs_close(struct v_file* file)
 300 {
 301     return vfs_pclose(file, __current->pid);
 302 }
 303
/**
 * @brief Return a file descriptor structure to the fd pile.
 *
 * Only the v_fd itself is released; the file it refers to is not closed
 * here.
 */
void
vfs_free_fd(struct v_fd* fd)
{
    cake_release(fd_pile, fd);
}
 309
 310 int
 311 vfs_fsync(struct v_file* file)
 312 {
 313     int errno;
 314     if ((errno = vfs_check_writable(file->dnode))) {
 315         return errno;
 316     }
 317
 318     lock_inode(file->inode);
 319
 320     pcache_commit_all(file->inode);
 321
 322     errno = ENOTSUP;
 323     if (file->ops->sync) {
 324         errno = file->ops->sync(file);
 325     }
 326
 327     unlock_inode(file->inode);
 328
 329     return errno;
 330 }
 331
 332 int
 333 vfs_alloc_fdslot(int* fd)
 334 {
 335     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 336         if (!__current->fdtable->fds[i]) {
 337             *fd = i;
 338             return 0;
 339         }
 340     }
 341     return EMFILE;
 342 }
 343
 344 struct v_superblock*
 345 vfs_sb_alloc()
 346 {
 347     struct v_superblock* sb = cake_grab(superblock_pile);
 348     memset(sb, 0, sizeof(*sb));
 349     llist_init_head(&sb->sb_list);
 350     sb->i_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 351     return sb;
 352 }
 353
 354 void
 355 vfs_sb_free(struct v_superblock* sb)
 356 {
 357     vfree(sb->i_cache);
 358     cake_release(superblock_pile, sb);
 359 }
 360
 361 static int
 362 __vfs_try_evict_dnode(struct lru_node* obj)
 363 {
 364     struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
 365
 366     if (!dnode->ref_count) {
 367         vfs_d_free(dnode);
 368         return 1;
 369     }
 370     return 0;
 371 }
 372
 373 static int
 374 __vfs_try_evict_inode(struct lru_node* obj)
 375 {
 376     struct v_inode* inode = container_of(obj, struct v_inode, lru);
 377
 378     if (!inode->link_count && !inode->open_count) {
 379         vfs_i_free(inode);
 380         return 1;
 381     }
 382     return 0;
 383 }
 384
 385 struct v_dnode*
 386 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
 387 {
 388     struct v_dnode* dnode = cake_grab(dnode_pile);
 389     if (!dnode) {
 390         lru_evict_half(dnode_lru);
 391
 392         if (!(dnode = cake_grab(dnode_pile))) {
 393             return NULL;
 394         }
 395     }
 396
 397     memset(dnode, 0, sizeof(*dnode));
 398     llist_init_head(&dnode->children);
 399     llist_init_head(&dnode->siblings);
 400     llist_init_head(&dnode->aka_list);
 401     mutex_init(&dnode->lock);
 402
 403     dnode->ref_count = ATOMIC_VAR_INIT(0);
 404     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 405
 406     hstrcpy(&dnode->name, name);
 407
 408     if (parent) {
 409         dnode->super_block = parent->super_block;
 410         dnode->mnt = parent->mnt;
 411     }
 412
 413     lru_use_one(dnode_lru, &dnode->lru);
 414
 415     return dnode;
 416 }
 417
 418 void
 419 vfs_d_free(struct v_dnode* dnode)
 420 {
 421     assert(dnode->ref_count == 1);
 422
 423     if (dnode->inode) {
 424         assert(dnode->inode->link_count > 0);
 425         dnode->inode->link_count--;
 426     }
 427
 428     vfs_dcache_remove(dnode);
 429     // Make sure the children de-referencing their parent.
 430     // With lru presented, the eviction will be propagated over the entire
 431     // detached subtree eventually
 432     struct v_dnode *pos, *n;
 433     llist_for_each(pos, n, &dnode->children, siblings)
 434     {
 435         vfs_dcache_remove(pos);
 436     }
 437
 438     vfree(dnode->name.value);
 439     cake_release(dnode_pile, dnode);
 440 }
 441
 442 struct v_inode*
 443 vfs_i_find(struct v_superblock* sb, u32_t i_id)
 444 {
 445     struct hbucket* slot = &sb->i_cache[i_id & VFS_HASH_MASK];
 446     struct v_inode *pos, *n;
 447     hashtable_bucket_foreach(slot, pos, n, hash_list)
 448     {
 449         if (pos->id == i_id) {
 450             lru_use_one(inode_lru, &pos->lru);
 451             return pos;
 452         }
 453     }
 454
 455     return NULL;
 456 }
 457
 458 void
 459 vfs_i_addhash(struct v_inode* inode)
 460 {
 461     struct hbucket* slot = &inode->sb->i_cache[inode->id & VFS_HASH_MASK];
 462
 463     hlist_delete(&inode->hash_list);
 464     hlist_add(&slot->head, &inode->hash_list);
 465 }
 466
 467 struct v_inode*
 468 vfs_i_alloc(struct v_superblock* sb)
 469 {
 470     assert(sb->ops.init_inode);
 471
 472     struct v_inode* inode;
 473     if (!(inode = cake_grab(inode_pile))) {
 474         lru_evict_half(inode_lru);
 475         if (!(inode = cake_grab(inode_pile))) {
 476             return NULL;
 477         }
 478     }
 479
 480     memset(inode, 0, sizeof(*inode));
 481     mutex_init(&inode->lock);
 482     llist_init_head(&inode->xattrs);
 483     llist_init_head(&inode->aka_dnodes);
 484
 485     sb->ops.init_inode(sb, inode);
 486
 487     inode->sb = sb;
 488     inode->ctime = clock_unixtime();
 489     inode->atime = inode->ctime;
 490     inode->mtime = inode->ctime;
 491
 492 done:
 493     lru_use_one(inode_lru, &inode->lru);
 494     return inode;
 495 }
 496
 497 void
 498 vfs_i_free(struct v_inode* inode)
 499 {
 500     if (inode->pg_cache) {
 501         pcache_release(inode->pg_cache);
 502         vfree(inode->pg_cache);
 503     }
 504     // we don't need to sync inode.
 505     // If an inode can be free, then it must be properly closed.
 506     // Hence it must be synced already!
 507     if (inode->destruct) {
 508         inode->destruct(inode);
 509     }
 510     hlist_delete(&inode->hash_list);
 511     cake_release(inode_pile, inode);
 512 }
 513
/* ---- System call definition and support ---- */

// Option for __vfs_try_locate_file(): create the file through the parent's
// create operation when the final path component does not exist.
#define FLOCATE_CREATE_EMPTY 1
 517
 518 int
 519 vfs_getfd(int fd, struct v_fd** fd_s)
 520 {
 521     if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) {
 522         return 0;
 523     }
 524     return EBADF;
 525 }
 526
 527 int
 528 __vfs_try_locate_file(const char* path,
 529                       struct v_dnode** fdir,
 530                       struct v_dnode** file,
 531                       int options)
 532 {
 533     char name_str[VFS_NAME_MAXLEN];
 534     struct hstr name = HSTR(name_str, 0);
 535     int errno;
 536
 537     name_str[0] = 0;
 538     if ((errno = vfs_walk_proc(path, fdir, &name, VFS_WALK_PARENT))) {
 539         return errno;
 540     }
 541
 542     errno = vfs_walk(*fdir, name.value, file, NULL, 0);
 543     if (errno != ENOENT || !(options & FLOCATE_CREATE_EMPTY)) {
 544         return errno;
 545     }
 546
 547     struct v_dnode* parent = *fdir;
 548     struct v_dnode* file_new = vfs_d_alloc(parent, &name);
 549
 550     if (!file_new) {
 551         return ENOMEM;
 552     }
 553
 554     lock_dnode(parent);
 555
 556     if (!(errno = parent->inode->ops->create(parent->inode, file_new))) {
 557         vfs_dcache_add(parent, file_new);
 558         *file = file_new;
 559     } else {
 560         vfs_d_free(file_new);
 561     }
 562
 563     unlock_dnode(parent);
 564
 565     return errno;
 566 }
 567
 568 int
 569 vfs_do_open(const char* path, int options)
 570 {
 571     int errno, fd;
 572     struct v_dnode *dentry, *file;
 573     struct v_file* ofile = NULL;
 574
 575     errno = __vfs_try_locate_file(
 576       path, &dentry, &file, (options & FO_CREATE) ? FLOCATE_CREATE_EMPTY : 0);
 577
 578     if (!errno && !(errno = vfs_alloc_fdslot(&fd))) {
 579
 580         if (errno || (errno = vfs_open(file, &ofile))) {
 581             return errno;
 582         }
 583
 584         struct v_fd* fd_s = cake_grab(fd_pile);
 585         memset(fd_s, 0, sizeof(*fd_s));
 586
 587         ofile->f_pos = ofile->inode->fsize & -((options & FO_APPEND) != 0);
 588         fd_s->file = ofile;
 589         fd_s->flags = options;
 590         __current->fdtable->fds[fd] = fd_s;
 591         return fd;
 592     }
 593
 594     return errno;
 595 }
 596
// open system call: thin wrapper around vfs_do_open(). The result (a
// descriptor number or an error) is passed through DO_STATUS_OR_RETURN.
__DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
{
    int errno = vfs_do_open(path, options);
    return DO_STATUS_OR_RETURN(errno);
}
 602
 603 __DEFINE_LXSYSCALL1(int, close, int, fd)
 604 {
 605     struct v_fd* fd_s;
 606     int errno = 0;
 607     if ((errno = vfs_getfd(fd, &fd_s))) {
 608         goto done_err;
 609     }
 610
 611     if ((errno = vfs_close(fd_s->file))) {
 612         goto done_err;
 613     }
 614
 615     cake_release(fd_pile, fd_s);
 616     __current->fdtable->fds[fd] = 0;
 617
 618 done_err:
 619     return DO_STATUS(errno);
 620 }
 621
 622 void
 623 __vfs_readdir_callback(struct dir_context* dctx,
 624                        const char* name,
 625                        const int len,
 626                        const int dtype)
 627 {
 628     struct dirent* dent = (struct dirent*)dctx->cb_data;
 629     strncpy(dent->d_name, name, DIRENT_NAME_MAX_LEN);
 630     dent->d_nlen = len;
 631     dent->d_type = dtype;
 632 }
 633
 634 __DEFINE_LXSYSCALL2(int, sys_readdir, int, fd, struct dirent*, dent)
 635 {
 636     struct v_fd* fd_s;
 637     int errno;
 638
 639     if ((errno = vfs_getfd(fd, &fd_s))) {
 640         goto done;
 641     }
 642
 643     struct v_inode* inode = fd_s->file->inode;
 644
 645     lock_inode(inode);
 646
 647     if (!(inode->itype & VFS_IFDIR)) {
 648         errno = ENOTDIR;
 649     } else {
 650         struct dir_context dctx =
 651           (struct dir_context){ .cb_data = dent,
 652                                 .index = dent->d_offset,
 653                                 .read_complete_callback =
 654                                   __vfs_readdir_callback };
 655         errno = 1;
 656         if (dent->d_offset == 0) {
 657             __vfs_readdir_callback(&dctx, vfs_dot.value, vfs_dot.len, DT_DIR);
 658         } else if (dent->d_offset == 1) {
 659             __vfs_readdir_callback(&dctx, vfs_ddot.value, vfs_ddot.len, DT_DIR);
 660         } else {
 661             dctx.index -= 2;
 662             if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
 663                 unlock_inode(inode);
 664                 goto done;
 665             }
 666         }
 667         dent->d_offset++;
 668     }
 669
 670     unlock_inode(inode);
 671
 672 done:
 673     return DO_STATUS_OR_RETURN(errno);
 674 }
 675
 676 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 677 {
 678     int errno = 0;
 679     struct v_fd* fd_s;
 680     if ((errno = vfs_getfd(fd, &fd_s))) {
 681         goto done;
 682     }
 683
 684     struct v_file* file = fd_s->file;
 685     if ((file->inode->itype & VFS_IFDIR)) {
 686         errno = EISDIR;
 687         goto done;
 688     }
 689
 690     lock_inode(file->inode);
 691
 692     file->inode->atime = clock_unixtime();
 693
 694     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 695         errno = file->ops->read(file->inode, buf, count, file->f_pos);
 696     } else {
 697         errno = pcache_read(file->inode, buf, count, file->f_pos);
 698     }
 699
 700     if (errno > 0) {
 701         file->f_pos += errno;
 702         unlock_inode(file->inode);
 703         return errno;
 704     }
 705
 706     unlock_inode(file->inode);
 707
 708 done:
 709     return DO_STATUS(errno);
 710 }
 711
 712 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
 713 {
 714     int errno = 0;
 715     struct v_fd* fd_s;
 716     if ((errno = vfs_getfd(fd, &fd_s))) {
 717         goto done;
 718     }
 719
 720     struct v_file* file = fd_s->file;
 721
 722     if ((errno = vfs_check_writable(file->dnode))) {
 723         goto done;
 724     }
 725
 726     if ((file->inode->itype & VFS_IFDIR)) {
 727         errno = EISDIR;
 728         goto done;
 729     }
 730
 731     lock_inode(file->inode);
 732
 733     file->inode->mtime = clock_unixtime();
 734
 735     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 736         errno = file->ops->write(file->inode, buf, count, file->f_pos);
 737     } else {
 738         errno = pcache_write(file->inode, buf, count, file->f_pos);
 739     }
 740
 741     if (errno > 0) {
 742         file->f_pos += errno;
 743         unlock_inode(file->inode);
 744         return errno;
 745     }
 746
 747     unlock_inode(file->inode);
 748
 749 done:
 750     return DO_STATUS(errno);
 751 }
 752
 753 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
 754 {
 755     int errno = 0;
 756     struct v_fd* fd_s;
 757     if ((errno = vfs_getfd(fd, &fd_s))) {
 758         goto done;
 759     }
 760
 761     struct v_file* file = fd_s->file;
 762
 763     if (!file->ops->seek) {
 764         errno = ENOTSUP;
 765         goto done;
 766     }
 767
 768     lock_inode(file->inode);
 769
 770     int overflow = 0;
 771     int fpos = file->f_pos;
 772     switch (options) {
 773         case FSEEK_CUR:
 774             overflow = __builtin_sadd_overflow((int)file->f_pos, offset, &fpos);
 775             break;
 776         case FSEEK_END:
 777             overflow =
 778               __builtin_sadd_overflow((int)file->inode->fsize, offset, &fpos);
 779             break;
 780         case FSEEK_SET:
 781             fpos = offset;
 782             break;
 783     }
 784     if (overflow) {
 785         errno = EOVERFLOW;
 786     } else if (!(errno = file->ops->seek(file->inode, fpos))) {
 787         file->f_pos = fpos;
 788     }
 789
 790     unlock_inode(file->inode);
 791
 792 done:
 793     return DO_STATUS(errno);
 794 }
 795
 796 int
 797 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
 798 {
 799     if (!dnode) {
 800         return 0;
 801     }
 802
 803     if (depth > 64) {
 804         return ENAMETOOLONG;
 805     }
 806
 807     size_t len = 0;
 808
 809     if (dnode->parent != dnode) {
 810         len = vfs_get_path(dnode->parent, buf, size, depth + 1);
 811     }
 812
 813     if (len >= size) {
 814         return len;
 815     }
 816
 817     if (!len || buf[len - 1] != VFS_PATH_DELIM) {
 818         buf[len++] = VFS_PATH_DELIM;
 819     }
 820
 821     size_t cpy_size = MIN(dnode->name.len, size - len);
 822     strncpy(buf + len, dnode->name.value, cpy_size);
 823     len += cpy_size;
 824
 825     return len;
 826 }
 827
 828 int
 829 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
 830 {
 831     const char* link;
 832     struct v_inode* inode = dnode->inode;
 833     if (inode->ops->read_symlink) {
 834         lock_inode(inode);
 835
 836         int errno = inode->ops->read_symlink(inode, &link);
 837         strncpy(buf, link, size);
 838
 839         unlock_inode(inode);
 840         return errno;
 841     }
 842     return 0;
 843 }
 844
 845 int
 846 vfs_get_dtype(int itype)
 847 {
 848     switch (itype) {
 849         case VFS_IFDIR:
 850             return DT_DIR;
 851         case VFS_IFSYMLINK:
 852             return DT_SYMLINK;
 853         default:
 854             return DT_PIPE;
 855     }
 856 }
 857
 858 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
 859 {
 860     int errno;
 861     struct v_fd* fd_s;
 862     if ((errno = vfs_getfd(fd, &fd_s))) {
 863         goto done;
 864     }
 865
 866     struct v_dnode* dnode;
 867     errno = vfs_get_path(fd_s->file->dnode, buf, size, 0);
 868
 869     if (errno >= 0) {
 870         return errno;
 871     }
 872
 873 done:
 874     return DO_STATUS(errno);
 875 }
 876
 877 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
 878 {
 879     int errno;
 880     struct v_dnode* dnode;
 881     if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 882         errno = vfs_readlink(dnode, buf, size);
 883     }
 884
 885     if (errno >= 0) {
 886         return errno;
 887     }
 888
 889     return DO_STATUS(errno);
 890 }
 891
 892 __DEFINE_LXSYSCALL4(int,
 893                     readlinkat,
 894                     int,
 895                     dirfd,
 896                     const char*,
 897                     pathname,
 898                     char*,
 899                     buf,
 900                     size_t,
 901                     size)
 902 {
 903     int errno;
 904     struct v_fd* fd_s;
 905     if ((errno = vfs_getfd(dirfd, &fd_s))) {
 906         goto done;
 907     }
 908
 909     struct v_dnode* dnode;
 910     if (!(errno = vfs_walk(
 911             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 912         errno = vfs_readlink(fd_s->file->dnode, buf, size);
 913     }
 914
 915     if (errno >= 0) {
 916         return errno;
 917     }
 918
 919 done:
 920     return DO_STATUS(errno);
 921 }
 922
/*
    NOTE
    When we perform an operation that could affect the layout of a
    directory (i.e., rename, mkdir, rmdir), we must lock the parent dir
    whenever possible. This blocks any ongoing path walk from reaching
    it, hence avoiding any partial state.
*/
 930
 931 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
 932 {
 933     int errno;
 934     struct v_dnode* dnode;
 935     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
 936         return DO_STATUS(errno);
 937     }
 938
 939     lock_dnode(dnode);
 940
 941     if ((errno = vfs_check_writable(dnode))) {
 942         goto done;
 943     }
 944
 945     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
 946         errno = EROFS;
 947         goto done;
 948     }
 949
 950     if (dnode->ref_count > 1 || dnode->inode->open_count) {
 951         errno = EBUSY;
 952         goto done;
 953     }
 954
 955     if (!llist_empty(&dnode->children)) {
 956         errno = ENOTEMPTY;
 957         goto done;
 958     }
 959
 960     struct v_dnode* parent = dnode->parent;
 961
 962     if (!parent) {
 963         errno = EINVAL;
 964         goto done;
 965     }
 966
 967     lock_dnode(parent);
 968     lock_inode(parent->inode);
 969
 970     if ((dnode->inode->itype & VFS_IFDIR)) {
 971         errno = parent->inode->ops->rmdir(parent->inode, dnode);
 972         if (!errno) {
 973             vfs_dcache_remove(dnode);
 974         }
 975     } else {
 976         errno = ENOTDIR;
 977     }
 978
 979     unlock_inode(parent->inode);
 980     unlock_dnode(parent);
 981
 982 done:
 983     unlock_dnode(dnode);
 984     return DO_STATUS(errno);
 985 }
 986
 987 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
 988 {
 989     int errno = 0;
 990     struct v_dnode *parent, *dir;
 991     char name_value[VFS_NAME_MAXLEN];
 992     struct hstr name = HHSTR(name_value, 0, 0);
 993
 994     if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
 995         goto done;
 996     }
 997
 998     if ((errno = vfs_check_writable(parent))) {
 999         goto done;
1000     }
1001
1002     if (!(dir = vfs_d_alloc(parent, &name))) {
1003         errno = ENOMEM;
1004         goto done;
1005     }
1006
1007     lock_dnode(parent);
1008     lock_inode(parent->inode);
1009
1010     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
1011         errno = ENOTSUP;
1012     } else if (!parent->inode->ops->mkdir) {
1013         errno = ENOTSUP;
1014     } else if (!(parent->inode->itype & VFS_IFDIR)) {
1015         errno = ENOTDIR;
1016     } else if (!(errno = parent->inode->ops->mkdir(parent->inode, dir))) {
1017         vfs_dcache_add(parent, dir);
1018         goto cleanup;
1019     }
1020
1021     vfs_d_free(dir);
1022
1023 cleanup:
1024     unlock_inode(parent->inode);
1025     unlock_dnode(parent);
1026 done:
1027     return DO_STATUS(errno);
1028 }
1029
1030 int
1031 __vfs_do_unlink(struct v_dnode* dnode)
1032 {
1033     int errno;
1034     struct v_inode* inode = dnode->inode;
1035
1036     if (dnode->ref_count > 1) {
1037         return EBUSY;
1038     }
1039
1040     if ((errno = vfs_check_writable(dnode))) {
1041         return errno;
1042     }
1043
1044     lock_inode(inode);
1045
1046     if (inode->open_count) {
1047         errno = EBUSY;
1048     } else if (!(inode->itype & VFS_IFDIR)) {
1049         // The underlying unlink implementation should handle
1050         //  symlink case
1051         errno = inode->ops->unlink(inode);
1052         if (!errno) {
1053             vfs_d_free(dnode);
1054         }
1055     } else {
1056         errno = EISDIR;
1057     }
1058
1059     unlock_inode(inode);
1060
1061     return errno;
1062 }
1063
1064 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1065 {
1066     int errno;
1067     struct v_dnode* dnode;
1068     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1069         goto done;
1070     }
1071
1072     errno = __vfs_do_unlink(dnode);
1073
1074 done:
1075     return DO_STATUS(errno);
1076 }
1077
1078 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1079 {
1080     int errno;
1081     struct v_fd* fd_s;
1082     if ((errno = vfs_getfd(fd, &fd_s))) {
1083         goto done;
1084     }
1085
1086     struct v_dnode* dnode;
1087     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1088         errno = __vfs_do_unlink(dnode);
1089     }
1090
1091 done:
1092     return DO_STATUS(errno);
1093 }
1094
1095 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1096 {
1097     int errno;
1098     struct v_dnode *dentry, *to_link, *name_dentry, *name_file;
1099
1100     errno = __vfs_try_locate_file(oldpath, &dentry, &to_link, 0);
1101     if (!errno) {
1102         errno = __vfs_try_locate_file(
1103           newpath, &name_dentry, &name_file, FLOCATE_CREATE_EMPTY);
1104         if (!errno) {
1105             errno = EEXIST;
1106         } else if (name_file) {
1107             errno = vfs_link(to_link, name_file);
1108         }
1109     }
1110     return DO_STATUS(errno);
1111 }
1112
1113 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1114 {
1115     int errno;
1116     struct v_fd* fd_s;
1117
1118     if (!(errno = vfs_getfd(fildes, &fd_s))) {
1119         errno = vfs_fsync(fd_s->file);
1120     }
1121
1122     return DO_STATUS(errno);
1123 }
1124
1125 int
1126 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1127 {
1128     int errno = 0;
1129     struct v_fd* copied = cake_grab(fd_pile);
1130
1131     memcpy(copied, old, sizeof(struct v_fd));
1132
1133     atomic_fetch_add(&old->file->ref_count, 1);
1134
1135     *new = copied;
1136
1137     return errno;
1138 }
1139
1140 int
1141 vfs_dup2(int oldfd, int newfd)
1142 {
1143     if (newfd == oldfd) {
1144         return newfd;
1145     }
1146
1147     int errno;
1148     struct v_fd *oldfd_s, *newfd_s;
1149     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1150         goto done;
1151     }
1152
1153     if (!TEST_FD(newfd)) {
1154         errno = EBADF;
1155         goto done;
1156     }
1157
1158     newfd_s = __current->fdtable->fds[newfd];
1159     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1160         goto done;
1161     }
1162
1163     if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1164         __current->fdtable->fds[newfd] = newfd_s;
1165         return newfd;
1166     }
1167
1168 done:
1169     return DO_STATUS(errno);
1170 }
1171
1172 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1173 {
1174     return vfs_dup2(oldfd, newfd);
1175 }
1176
1177 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1178 {
1179     int errno, newfd;
1180     struct v_fd *oldfd_s, *newfd_s;
1181     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1182         goto done;
1183     }
1184
1185     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1186         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1187         __current->fdtable->fds[newfd] = newfd_s;
1188         return newfd;
1189     }
1190
1191 done:
1192     return DO_STATUS(errno);
1193 }
1194
1195 __DEFINE_LXSYSCALL2(int,
1196                     symlink,
1197                     const char*,
1198                     pathname,
1199                     const char*,
1200                     link_target)
1201 {
1202     int errno;
1203     struct v_dnode* dnode;
1204     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1205         goto done;
1206     }
1207
1208     if (errno = vfs_check_writable(dnode)) {
1209         goto done;
1210     }
1211
1212     if (!dnode->inode->ops->set_symlink) {
1213         errno = ENOTSUP;
1214         goto done;
1215     }
1216
1217     lock_inode(dnode->inode);
1218
1219     errno = dnode->inode->ops->set_symlink(dnode->inode, link_target);
1220
1221     unlock_inode(dnode->inode);
1222
1223 done:
1224     return DO_STATUS(errno);
1225 }
1226
1227 void
1228 vfs_ref_dnode(struct v_dnode* dnode)
1229 {
1230     atomic_fetch_add(&dnode->ref_count, 1);
1231     mnt_mkbusy(dnode->mnt);
1232 }
1233
1234 void
1235 vfs_unref_dnode(struct v_dnode* dnode)
1236 {
1237     atomic_fetch_sub(&dnode->ref_count, 1);
1238     mnt_chillax(dnode->mnt);
1239 }
1240
1241 int
1242 vfs_do_chdir(struct proc_info* proc, struct v_dnode* dnode)
1243 {
1244     int errno = 0;
1245
1246     lock_dnode(dnode);
1247
1248     if (!(dnode->inode->itype & VFS_IFDIR)) {
1249         errno = ENOTDIR;
1250         goto done;
1251     }
1252
1253     if (proc->cwd) {
1254         vfs_unref_dnode(proc->cwd);
1255     }
1256
1257     vfs_ref_dnode(dnode);
1258     proc->cwd = dnode;
1259
1260     unlock_dnode(dnode);
1261
1262 done:
1263     return errno;
1264 }
1265
1266 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1267 {
1268     struct v_dnode* dnode;
1269     int errno = 0;
1270
1271     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1272         goto done;
1273     }
1274
1275     errno = vfs_do_chdir(__current, dnode);
1276
1277 done:
1278     return DO_STATUS(errno);
1279 }
1280
1281 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1282 {
1283     struct v_fd* fd_s;
1284     int errno = 0;
1285
1286     if ((errno = vfs_getfd(fd, &fd_s))) {
1287         goto done;
1288     }
1289
1290     errno = vfs_do_chdir(__current, fd_s->file->dnode);
1291
1292 done:
1293     return DO_STATUS(errno);
1294 }
1295
1296 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1297 {
1298     int errno = 0;
1299     char* ret_ptr = 0;
1300     if (size < 2) {
1301         errno = ERANGE;
1302         goto done;
1303     }
1304
1305     size_t len = 0;
1306
1307     if (!__current->cwd) {
1308         *buf = VFS_PATH_DELIM;
1309         len = 1;
1310     } else {
1311         len = vfs_get_path(__current->cwd, buf, size, 0);
1312         if (len == size) {
1313             errno = ERANGE;
1314             goto done;
1315         }
1316     }
1317
1318     buf[len + 1] = '\0';
1319
1320     ret_ptr = buf;
1321
1322 done:
1323     __current->k_status = errno;
1324     return ret_ptr;
1325 }
1326
1327 int
1328 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1329 {
1330     int errno = 0;
1331     if (current->inode->id == target->inode->id) {
1332         // hard link
1333         return 0;
1334     }
1335
1336     if (errno = vfs_check_writable(current)) {
1337         return errno;
1338     }
1339
1340     if (current->ref_count > 1 || target->ref_count > 1) {
1341         return EBUSY;
1342     }
1343
1344     if (current->super_block != target->super_block) {
1345         return EXDEV;
1346     }
1347
1348     struct v_dnode* oldparent = current->parent;
1349     struct v_dnode* newparent = target->parent;
1350
1351     lock_dnode(current);
1352     lock_dnode(target);
1353     if (oldparent)
1354         lock_dnode(oldparent);
1355     if (newparent)
1356         lock_dnode(newparent);
1357
1358     if (!llist_empty(&target->children)) {
1359         errno = ENOTEMPTY;
1360         unlock_dnode(target);
1361         goto cleanup;
1362     }
1363
1364     if ((errno =
1365            current->inode->ops->rename(current->inode, current, target))) {
1366         unlock_dnode(target);
1367         goto cleanup;
1368     }
1369
1370     // re-position current
1371     hstrcpy(&current->name, &target->name);
1372     vfs_dcache_rehash(newparent, current);
1373
1374     // detach target
1375     vfs_d_free(target);
1376
1377     unlock_dnode(target);
1378
1379 cleanup:
1380     unlock_dnode(current);
1381     if (oldparent)
1382         unlock_dnode(oldparent);
1383     if (newparent)
1384         unlock_dnode(newparent);
1385
1386     return errno;
1387 }
1388
1389 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1390 {
1391     struct v_dnode *cur, *target_parent, *target;
1392     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1393     int errno = 0;
1394
1395     if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1396         goto done;
1397     }
1398
1399     if ((errno = vfs_walk(
1400            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1401         goto done;
1402     }
1403
1404     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1405     if (errno == ENOENT) {
1406         target = vfs_d_alloc(target_parent, &name);
1407         vfs_dcache_add(target_parent, target);
1408     } else if (errno) {
1409         goto done;
1410     }
1411
1412     if (!target) {
1413         errno = ENOMEM;
1414         goto done;
1415     }
1416
1417     errno = vfs_do_rename(cur, target);
1418
1419 done:
1420     vfree(name.value);
1421     return DO_STATUS(errno);
1422 }