lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
   4  * @brief Lunaix virtual file system - an abstraction layer for all file systems.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
  14 /*
  15  TODO vfs & device todos checklist
  16
  17     It is overseen by Twilight Sparkle ;)
  18
  19  1. Get inodes hooked into lru (CHECKED)
  20  2. Get dnodes hooked into lru (CHECKED)
  21  3. Get inodes properly hashed so they can be reused by underlying fs (CHECKED)
  22  4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
  23         [good idea] or a constructor/destructor pattern in cake allocator ?
  24  5. (mount) Figure out a way to identify a busy mount point before unmount
  25             maybe a unified mount_point structure that maintains a reference
  26             counter on any dnodes within the subtree? Such a counter will only
  27             increment if a file is opened or a dnode is being used as working
  28             directory, and decrement conversely. (CHECKED)
  29  6. (mount) Ability to track all mount points (including sub-mounts)
  30             so we can be confident to clean up everything when we
  31             unmount. (CHECKED)
  32  7. (mount) Figure out a way to acquire the device represented by a dnode.
  33             so it can be used to mount. (e.g. we wish to get `struct device*`
  34             out of the dnode at /dev/sda)
  35             [tip] we should pay attention at twifs and add a private_data field
  36             under struct v_dnode? (CHECKED)
  37  8. (mount) Then, we should refactor on mount/unmount mechanism. (CHECKED)
  38  9. (mount) (future) Ability to mount anything? e.g. Linux can mount a disk
  39                     image file using a so-called "loopback" pseudo device. Maybe
  40                     we can do a similar thing in Lunaix? A block device emulation
  41                     on top of the regular file when we mount it.
  42  10. (device) device number (dev_t) allocation
  43             [good idea] <class>:<subclass>:<uniq_id> composition (CHECKED)
  44 */
  45
  46 #include <klibc/string.h>
  47 #include <lunaix/foptions.h>
  48 #include <lunaix/fs.h>
  49 #include <lunaix/mm/cake.h>
  50 #include <lunaix/mm/valloc.h>
  51 #include <lunaix/process.h>
  52 #include <lunaix/spike.h>
  53 #include <lunaix/syscall.h>
  54 #include <lunaix/syscall_utils.h>
  55
  56 #include <lunaix/fs/twifs.h>
  57
  58 #include <usr/lunaix/dirent_defs.h>
  59
  60 static struct cake_pile* dnode_pile;
  61 static struct cake_pile* inode_pile;
  62 static struct cake_pile* file_pile;
  63 static struct cake_pile* superblock_pile;
  64 static struct cake_pile* fd_pile;
  65
  66 struct v_dnode* vfs_sysroot;
  67 static struct hbucket* dnode_cache;
  68
  69 struct lru_zone *dnode_lru, *inode_lru;
  70
  71 struct hstr vfs_ddot = HSTR("..", 2);
  72 struct hstr vfs_dot = HSTR(".", 1);
  73 struct hstr vfs_empty = HSTR("", 0);
  74
  75 struct v_superblock*
  76 vfs_sb_alloc();
  77
  78 void
  79 vfs_sb_free(struct v_superblock* sb);
  80
  81 static int
  82 __vfs_try_evict_dnode(struct lru_node* obj);
  83
  84 static int
  85 __vfs_try_evict_inode(struct lru_node* obj);
  86
  87 void
  88 vfs_init()
  89 {
  90     // 为他们专门创建一个蛋糕堆，而不使用valloc，这样我们可以最小化内碎片的产生
  91     dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
  92     inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
  93     file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
  94     fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
  95     superblock_pile =
  96       cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
  97
  98     dnode_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
  99
 100     dnode_lru = lru_new_zone(__vfs_try_evict_dnode);
 101     inode_lru = lru_new_zone(__vfs_try_evict_inode);
 102
 103     hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
 104     hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
 105
 106     // 创建一个根dnode。
 107     vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
 108     vfs_sysroot->parent = vfs_sysroot;
 109     atomic_fetch_add(&vfs_sysroot->ref_count, 1);
 110 }
 111
 112 static inline struct hbucket*
 113 __dcache_hash(struct v_dnode* parent, u32_t* hash)
 114 {
 115     u32_t _hash = *hash;
 116     // 确保低位更加随机
 117     _hash = _hash ^ (_hash >> VFS_HASHBITS);
 118     // 与parent的指针值做加法，来减小碰撞的可能性。
 119     _hash += (u32_t)parent;
 120     *hash = _hash;
 121     return &dnode_cache[_hash & VFS_HASH_MASK];
 122 }
 123
 124 struct v_dnode*
 125 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 126 {
 127     if (!str->len || HSTR_EQ(str, &vfs_dot))
 128         return parent;
 129
 130     if (HSTR_EQ(str, &vfs_ddot)) {
 131         return parent->parent;
 132     }
 133
 134     u32_t hash = str->hash;
 135     struct hbucket* slot = __dcache_hash(parent, &hash);
 136
 137     struct v_dnode *pos, *n;
 138     hashtable_bucket_foreach(slot, pos, n, hash_list)
 139     {
 140         if (pos->name.hash == hash) {
 141             return pos;
 142         }
 143     }
 144     return NULL;
 145 }
 146
 147 void
 148 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 149 {
 150     assert(parent);
 151
 152     atomic_fetch_add(&dnode->ref_count, 1);
 153     dnode->parent = parent;
 154     llist_append(&parent->children, &dnode->siblings);
 155
 156     struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash);
 157     hlist_add(&bucket->head, &dnode->hash_list);
 158 }
 159
 160 void
 161 vfs_dcache_remove(struct v_dnode* dnode)
 162 {
 163     assert(dnode);
 164     assert(dnode->ref_count == 1);
 165
 166     llist_delete(&dnode->siblings);
 167     llist_delete(&dnode->aka_list);
 168     hlist_delete(&dnode->hash_list);
 169
 170     dnode->parent = NULL;
 171     atomic_fetch_sub(&dnode->ref_count, 1);
 172 }
 173
 174 void
 175 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
 176 {
 177     assert(new_parent);
 178
 179     hstr_rehash(&dnode->name, HSTR_FULL_HASH);
 180     vfs_dcache_remove(dnode);
 181     vfs_dcache_add(new_parent, dnode);
 182 }
 183
 184 int
 185 vfs_open(struct v_dnode* dnode, struct v_file** file)
 186 {
 187     if (!dnode->inode || !dnode->inode->ops->open) {
 188         return ENOTSUP;
 189     }
 190
 191     struct v_inode* inode = dnode->inode;
 192
 193     lock_inode(inode);
 194
 195     struct v_file* vfile = cake_grab(file_pile);
 196     memset(vfile, 0, sizeof(*vfile));
 197
 198     vfile->dnode = dnode;
 199     vfile->inode = inode;
 200     vfile->ref_count = ATOMIC_VAR_INIT(1);
 201     vfile->ops = inode->default_fops;
 202
 203     if ((inode->itype & F_MFILE) && !inode->pg_cache) {
 204         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 205         pcache_init(pcache);
 206         pcache->master = inode;
 207         inode->pg_cache = pcache;
 208     }
 209
 210     int errno = inode->ops->open(inode, vfile);
 211     if (errno) {
 212         cake_release(file_pile, vfile);
 213     } else {
 214         atomic_fetch_add(&dnode->ref_count, 1);
 215         inode->open_count++;
 216         mnt_mkbusy(dnode->mnt);
 217
 218         *file = vfile;
 219     }
 220
 221     unlock_inode(inode);
 222
 223     return errno;
 224 }
 225
 226 void
 227 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
 228 {
 229     if (assign_to->inode) {
 230         llist_delete(&assign_to->aka_list);
 231         assign_to->inode->link_count--;
 232     }
 233     llist_append(&inode->aka_dnodes, &assign_to->aka_list);
 234     assign_to->inode = inode;
 235     inode->link_count++;
 236 }
 237
 238 int
 239 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 240 {
 241     int errno;
 242
 243     if ((errno = vfs_check_writable(to_link))) {
 244         return errno;
 245     }
 246
 247     lock_inode(to_link->inode);
 248     if (to_link->super_block->root != name->super_block->root) {
 249         errno = EXDEV;
 250     } else if (!to_link->inode->ops->link) {
 251         errno = ENOTSUP;
 252     } else if (!(errno = to_link->inode->ops->link(to_link->inode, name))) {
 253         vfs_assign_inode(name, to_link->inode);
 254     }
 255     unlock_inode(to_link->inode);
 256
 257     return errno;
 258 }
 259
 260 int
 261 vfs_pclose(struct v_file* file, pid_t pid)
 262 {
 263     int errno = 0;
 264     if (file->ref_count > 1) {
 265         atomic_fetch_sub(&file->ref_count, 1);
 266     } else if (!(errno = file->ops->close(file))) {
 267         atomic_fetch_sub(&file->dnode->ref_count, 1);
 268         file->inode->open_count--;
 269
 270         /*
 271          * Prevent dead lock.
 272          * This happened when process is terminated while blocking on read.
 273          * In that case, the process is still holding the inode lock and it
 274              will never get released.
 275          * The unlocking should also include ownership check.
 276          *
 277          * To see why, consider two process both open the same file both with
 278          * fd=x.
 279          *      Process A: busy on reading x
 280          *      Process B: do nothing with x
 281          * Assuming that, after a very short time, process B get terminated
 282          * while process A is still busy in it's reading business. By this
 283          * design, the inode lock of this file x is get released by B rather
 284          * than A. And this will cause a probable race condition on A if other
 285          * process is writing to this file later after B exit.
 286          */
 287         if (mutex_on_hold(&file->inode->lock)) {
 288             mutex_unlock_for(&file->inode->lock, pid);
 289         }
 290         mnt_chillax(file->dnode->mnt);
 291
 292         pcache_commit_all(file->inode);
 293         cake_release(file_pile, file);
 294     }
 295     return errno;
 296 }
 297
 298 int
 299 vfs_close(struct v_file* file)
 300 {
 301     return vfs_pclose(file, __current->pid);
 302 }
 303
 304 void
 305 vfs_free_fd(struct v_fd* fd)
 306 {
 307     cake_release(fd_pile, fd);
 308 }
 309
 310 int
 311 vfs_fsync(struct v_file* file)
 312 {
 313     int errno;
 314     if ((errno = vfs_check_writable(file->dnode))) {
 315         return errno;
 316     }
 317
 318     lock_inode(file->inode);
 319
 320     pcache_commit_all(file->inode);
 321
 322     errno = ENOTSUP;
 323     if (file->ops->sync) {
 324         errno = file->ops->sync(file);
 325     }
 326
 327     unlock_inode(file->inode);
 328
 329     return errno;
 330 }
 331
 332 int
 333 vfs_alloc_fdslot(int* fd)
 334 {
 335     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 336         if (!__current->fdtable->fds[i]) {
 337             *fd = i;
 338             return 0;
 339         }
 340     }
 341     return EMFILE;
 342 }
 343
 344 struct v_superblock*
 345 vfs_sb_alloc()
 346 {
 347     struct v_superblock* sb = cake_grab(superblock_pile);
 348     memset(sb, 0, sizeof(*sb));
 349     llist_init_head(&sb->sb_list);
 350     sb->i_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 351     return sb;
 352 }
 353
 354 void
 355 vfs_sb_free(struct v_superblock* sb)
 356 {
 357     vfree(sb->i_cache);
 358     cake_release(superblock_pile, sb);
 359 }
 360
 361 static int
 362 __vfs_try_evict_dnode(struct lru_node* obj)
 363 {
 364     struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
 365
 366     if (!dnode->ref_count) {
 367         vfs_d_free(dnode);
 368         return 1;
 369     }
 370     return 0;
 371 }
 372
 373 static int
 374 __vfs_try_evict_inode(struct lru_node* obj)
 375 {
 376     struct v_inode* inode = container_of(obj, struct v_inode, lru);
 377
 378     if (!inode->link_count && !inode->open_count) {
 379         vfs_i_free(inode);
 380         return 1;
 381     }
 382     return 0;
 383 }
 384
 385 struct v_dnode*
 386 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
 387 {
 388     struct v_dnode* dnode = cake_grab(dnode_pile);
 389     if (!dnode) {
 390         lru_evict_half(dnode_lru);
 391
 392         if (!(dnode = cake_grab(dnode_pile))) {
 393             return NULL;
 394         }
 395     }
 396
 397     memset(dnode, 0, sizeof(*dnode));
 398     llist_init_head(&dnode->children);
 399     llist_init_head(&dnode->siblings);
 400     llist_init_head(&dnode->aka_list);
 401     mutex_init(&dnode->lock);
 402
 403     dnode->ref_count = ATOMIC_VAR_INIT(0);
 404     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 405
 406     hstrcpy(&dnode->name, name);
 407
 408     if (parent) {
 409         dnode->super_block = parent->super_block;
 410         dnode->mnt = parent->mnt;
 411     }
 412
 413     lru_use_one(dnode_lru, &dnode->lru);
 414
 415     return dnode;
 416 }
 417
 418 void
 419 vfs_d_free(struct v_dnode* dnode)
 420 {
 421     assert(dnode->ref_count == 1);
 422
 423     if (dnode->inode) {
 424         assert(dnode->inode->link_count > 0);
 425         dnode->inode->link_count--;
 426     }
 427
 428     vfs_dcache_remove(dnode);
 429     // Make sure the children de-referencing their parent.
 430     // With lru presented, the eviction will be propagated over the entire
 431     // detached subtree eventually
 432     struct v_dnode *pos, *n;
 433     llist_for_each(pos, n, &dnode->children, siblings)
 434     {
 435         vfs_dcache_remove(pos);
 436     }
 437
 438     vfree((void*)dnode->name.value);
 439     cake_release(dnode_pile, dnode);
 440 }
 441
 442 struct v_inode*
 443 vfs_i_find(struct v_superblock* sb, u32_t i_id)
 444 {
 445     struct hbucket* slot = &sb->i_cache[i_id & VFS_HASH_MASK];
 446     struct v_inode *pos, *n;
 447     hashtable_bucket_foreach(slot, pos, n, hash_list)
 448     {
 449         if (pos->id == i_id) {
 450             lru_use_one(inode_lru, &pos->lru);
 451             return pos;
 452         }
 453     }
 454
 455     return NULL;
 456 }
 457
 458 void
 459 vfs_i_addhash(struct v_inode* inode)
 460 {
 461     struct hbucket* slot = &inode->sb->i_cache[inode->id & VFS_HASH_MASK];
 462
 463     hlist_delete(&inode->hash_list);
 464     hlist_add(&slot->head, &inode->hash_list);
 465 }
 466
 467 struct v_inode*
 468 vfs_i_alloc(struct v_superblock* sb)
 469 {
 470     assert(sb->ops.init_inode);
 471
 472     struct v_inode* inode;
 473     if (!(inode = cake_grab(inode_pile))) {
 474         lru_evict_half(inode_lru);
 475         if (!(inode = cake_grab(inode_pile))) {
 476             return NULL;
 477         }
 478     }
 479
 480     memset(inode, 0, sizeof(*inode));
 481     mutex_init(&inode->lock);
 482     llist_init_head(&inode->xattrs);
 483     llist_init_head(&inode->aka_dnodes);
 484
 485     sb->ops.init_inode(sb, inode);
 486
 487     inode->sb = sb;
 488     inode->ctime = clock_unixtime();
 489     inode->atime = inode->ctime;
 490     inode->mtime = inode->ctime;
 491
 492     lru_use_one(inode_lru, &inode->lru);
 493     return inode;
 494 }
 495
 496 void
 497 vfs_i_free(struct v_inode* inode)
 498 {
 499     if (inode->pg_cache) {
 500         pcache_release(inode->pg_cache);
 501         vfree(inode->pg_cache);
 502     }
 503     // we don't need to sync inode.
 504     // If an inode can be free, then it must be properly closed.
 505     // Hence it must be synced already!
 506     if (inode->destruct) {
 507         inode->destruct(inode);
 508     }
 509     hlist_delete(&inode->hash_list);
 510     cake_release(inode_pile, inode);
 511 }
 512
 513 /* ---- System call definition and support ---- */
 514
 515 #define FLOCATE_CREATE_EMPTY 1
 516 #define FLOCATE_CREATE_ONLY 2
 517 #define FLOCATE_NOFOLLOW 4
 518
 519 int
 520 vfs_getfd(int fd, struct v_fd** fd_s)
 521 {
 522     if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) {
 523         return 0;
 524     }
 525     return EBADF;
 526 }
 527
 528 int
 529 __vfs_try_locate_file(const char* path,
 530                       struct v_dnode** fdir,
 531                       struct v_dnode** file,
 532                       int options)
 533 {
 534     char name_str[VFS_NAME_MAXLEN];
 535     struct hstr name = HSTR(name_str, 0);
 536     int errno, woption = 0;
 537
 538     if ((options & FLOCATE_NOFOLLOW)) {
 539         woption |= VFS_WALK_NOFOLLOW;
 540     }
 541
 542     name_str[0] = 0;
 543     if ((errno = vfs_walk_proc(path, fdir, &name, woption | VFS_WALK_PARENT))) {
 544         return errno;
 545     }
 546
 547     errno = vfs_walk(*fdir, name.value, file, NULL, woption);
 548
 549     if (errno != ENOENT && (options & FLOCATE_CREATE_ONLY)) {
 550         return EEXIST;
 551     }
 552
 553     if (errno != ENOENT ||
 554         !(options & (FLOCATE_CREATE_EMPTY | FLOCATE_CREATE_ONLY))) {
 555         return errno;
 556     }
 557
 558     struct v_dnode* parent = *fdir;
 559     struct v_dnode* file_new = vfs_d_alloc(parent, &name);
 560
 561     if (!file_new) {
 562         return ENOMEM;
 563     }
 564
 565     lock_dnode(parent);
 566
 567     if (!(errno = parent->inode->ops->create(parent->inode, file_new))) {
 568         vfs_dcache_add(parent, file_new);
 569         *file = file_new;
 570     } else {
 571         vfs_d_free(file_new);
 572     }
 573
 574     unlock_dnode(parent);
 575
 576     return errno;
 577 }
 578
 579 int
 580 vfs_do_open(const char* path, int options)
 581 {
 582     int errno, fd, loptions = 0;
 583     struct v_dnode *dentry, *file;
 584     struct v_file* ofile = NULL;
 585
 586     if ((options & FO_CREATE)) {
 587         loptions |= FLOCATE_CREATE_EMPTY;
 588     } else if ((options & FO_NOFOLLOW)) {
 589         loptions |= FLOCATE_NOFOLLOW;
 590     }
 591
 592     errno = __vfs_try_locate_file(path, &dentry, &file, loptions);
 593
 594     if (!errno && !(errno = vfs_alloc_fdslot(&fd))) {
 595
 596         if (errno || (errno = vfs_open(file, &ofile))) {
 597             return errno;
 598         }
 599
 600         struct v_fd* fd_s = cake_grab(fd_pile);
 601         memset(fd_s, 0, sizeof(*fd_s));
 602
 603         ofile->f_pos = ofile->inode->fsize & -((options & FO_APPEND) != 0);
 604         fd_s->file = ofile;
 605         fd_s->flags = options;
 606         __current->fdtable->fds[fd] = fd_s;
 607         return fd;
 608     }
 609
 610     return errno;
 611 }
 612
 613 __DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
 614 {
 615     int errno = vfs_do_open(path, options);
 616     return DO_STATUS_OR_RETURN(errno);
 617 }
 618
 619 __DEFINE_LXSYSCALL1(int, close, int, fd)
 620 {
 621     struct v_fd* fd_s;
 622     int errno = 0;
 623     if ((errno = vfs_getfd(fd, &fd_s))) {
 624         goto done_err;
 625     }
 626
 627     if ((errno = vfs_close(fd_s->file))) {
 628         goto done_err;
 629     }
 630
 631     cake_release(fd_pile, fd_s);
 632     __current->fdtable->fds[fd] = 0;
 633
 634 done_err:
 635     return DO_STATUS(errno);
 636 }
 637
 638 void
 639 __vfs_readdir_callback(struct dir_context* dctx,
 640                        const char* name,
 641                        const int len,
 642                        const int dtype)
 643 {
 644     struct lx_dirent* dent = (struct lx_dirent*)dctx->cb_data;
 645     strncpy(dent->d_name, name, DIRENT_NAME_MAX_LEN);
 646     dent->d_nlen = len;
 647     dent->d_type = dtype;
 648 }
 649
 650 __DEFINE_LXSYSCALL2(int, sys_readdir, int, fd, struct lx_dirent*, dent)
 651 {
 652     struct v_fd* fd_s;
 653     int errno;
 654
 655     if ((errno = vfs_getfd(fd, &fd_s))) {
 656         goto done;
 657     }
 658
 659     struct v_inode* inode = fd_s->file->inode;
 660
 661     lock_inode(inode);
 662
 663     if ((inode->itype & F_FILE)) {
 664         errno = ENOTDIR;
 665     } else {
 666         struct dir_context dctx = (struct dir_context){
 667           .cb_data = dent,
 668           .index = dent->d_offset,
 669           .read_complete_callback = __vfs_readdir_callback};
 670         errno = 1;
 671         if (dent->d_offset == 0) {
 672             __vfs_readdir_callback(&dctx, vfs_dot.value, vfs_dot.len, DT_DIR);
 673         } else if (dent->d_offset == 1) {
 674             __vfs_readdir_callback(&dctx, vfs_ddot.value, vfs_ddot.len, DT_DIR);
 675         } else {
 676             dctx.index -= 2;
 677             if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
 678                 unlock_inode(inode);
 679                 goto done;
 680             }
 681         }
 682         dent->d_offset++;
 683     }
 684
 685     unlock_inode(inode);
 686
 687 done:
 688     return DO_STATUS_OR_RETURN(errno);
 689 }
 690
 691 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 692 {
 693     int errno = 0;
 694     struct v_fd* fd_s;
 695     if ((errno = vfs_getfd(fd, &fd_s))) {
 696         goto done;
 697     }
 698
 699     struct v_file* file = fd_s->file;
 700     if (!(file->inode->itype & F_FILE)) {
 701         errno = EISDIR;
 702         goto done;
 703     }
 704
 705     lock_inode(file->inode);
 706
 707     file->inode->atime = clock_unixtime();
 708
 709     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 710         errno = file->ops->read(file->inode, buf, count, file->f_pos);
 711     } else {
 712         errno = pcache_read(file->inode, buf, count, file->f_pos);
 713     }
 714
 715     if (errno > 0) {
 716         file->f_pos += errno;
 717         unlock_inode(file->inode);
 718         return errno;
 719     }
 720
 721     unlock_inode(file->inode);
 722
 723 done:
 724     return DO_STATUS(errno);
 725 }
 726
 727 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
 728 {
 729     int errno = 0;
 730     struct v_fd* fd_s;
 731     if ((errno = vfs_getfd(fd, &fd_s))) {
 732         goto done;
 733     }
 734
 735     struct v_file* file = fd_s->file;
 736
 737     if ((errno = vfs_check_writable(file->dnode))) {
 738         goto done;
 739     }
 740
 741     if (!(file->inode->itype & F_FILE)) {
 742         errno = EISDIR;
 743         goto done;
 744     }
 745
 746     lock_inode(file->inode);
 747
 748     file->inode->mtime = clock_unixtime();
 749
 750     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 751         errno = file->ops->write(file->inode, buf, count, file->f_pos);
 752     } else {
 753         errno = pcache_write(file->inode, buf, count, file->f_pos);
 754     }
 755
 756     if (errno > 0) {
 757         file->f_pos += errno;
 758         unlock_inode(file->inode);
 759         return errno;
 760     }
 761
 762     unlock_inode(file->inode);
 763
 764 done:
 765     return DO_STATUS(errno);
 766 }
 767
 768 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
 769 {
 770     int errno = 0;
 771     struct v_fd* fd_s;
 772     if ((errno = vfs_getfd(fd, &fd_s))) {
 773         goto done;
 774     }
 775
 776     struct v_file* file = fd_s->file;
 777
 778     if (!file->ops->seek) {
 779         errno = ENOTSUP;
 780         goto done;
 781     }
 782
 783     lock_inode(file->inode);
 784
 785     int overflow = 0;
 786     int fpos = file->f_pos;
 787     switch (options) {
 788         case FSEEK_CUR:
 789             overflow = sadd_overflow((int)file->f_pos, offset, &fpos);
 790             break;
 791         case FSEEK_END:
 792             overflow = sadd_overflow((int)file->inode->fsize, offset, &fpos);
 793             break;
 794         case FSEEK_SET:
 795             fpos = offset;
 796             break;
 797     }
 798     if (overflow) {
 799         errno = EOVERFLOW;
 800     } else if (!(errno = file->ops->seek(file->inode, fpos))) {
 801         file->f_pos = fpos;
 802     }
 803
 804     unlock_inode(file->inode);
 805
 806 done:
 807     return DO_STATUS(errno);
 808 }
 809
 810 int
 811 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
 812 {
 813     if (!dnode) {
 814         return 0;
 815     }
 816
 817     if (depth > 64) {
 818         return ENAMETOOLONG;
 819     }
 820
 821     size_t len = 0;
 822
 823     if (dnode->parent != dnode) {
 824         len = vfs_get_path(dnode->parent, buf, size, depth + 1);
 825     }
 826
 827     if (len >= size) {
 828         return len;
 829     }
 830
 831     if (!len || buf[len - 1] != VFS_PATH_DELIM) {
 832         buf[len++] = VFS_PATH_DELIM;
 833     }
 834
 835     size_t cpy_size = MIN(dnode->name.len, size - len);
 836     strncpy(buf + len, dnode->name.value, cpy_size);
 837     len += cpy_size;
 838
 839     return len;
 840 }
 841
 842 int
 843 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
 844 {
 845     const char* link;
 846     struct v_inode* inode = dnode->inode;
 847     if (inode->ops->read_symlink) {
 848         lock_inode(inode);
 849
 850         int errno = inode->ops->read_symlink(inode, &link);
 851         strncpy(buf, link, size);
 852
 853         unlock_inode(inode);
 854         return errno;
 855     }
 856     return 0;
 857 }
 858
 859 int
 860 vfs_get_dtype(int itype)
 861 {
 862     if ((itype & VFS_IFSYMLINK) == VFS_IFSYMLINK) {
 863         return DT_SYMLINK;
 864     } else if (!(itype & VFS_IFFILE)) {
 865         return DT_DIR;
 866     } else {
 867         return DT_FILE;
 868     }
 869 }
 870
 871 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
 872 {
 873     int errno;
 874     struct v_fd* fd_s;
 875     if ((errno = vfs_getfd(fd, &fd_s))) {
 876         goto done;
 877     }
 878
 879     struct v_dnode* dnode;
 880     errno = vfs_get_path(fd_s->file->dnode, buf, size, 0);
 881
 882     if (errno >= 0) {
 883         return errno;
 884     }
 885
 886 done:
 887     return DO_STATUS(errno);
 888 }
 889
 890 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
 891 {
 892     int errno;
 893     struct v_dnode* dnode;
 894     if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 895         errno = vfs_readlink(dnode, buf, size);
 896     }
 897
 898     if (errno >= 0) {
 899         return errno;
 900     }
 901
 902     return DO_STATUS(errno);
 903 }
 904
 905 __DEFINE_LXSYSCALL4(
 906   int, readlinkat, int, dirfd, const char*, pathname, char*, buf, size_t, size)
 907 {
 908     int errno;
 909     struct v_fd* fd_s;
 910     if ((errno = vfs_getfd(dirfd, &fd_s))) {
 911         goto done;
 912     }
 913
 914     pathname = pathname ? pathname : "";
 915
 916     struct v_dnode* dnode;
 917     if (!(errno = vfs_walk(
 918             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 919         errno = vfs_readlink(fd_s->file->dnode, buf, size);
 920     }
 921
 922     if (errno >= 0) {
 923         return errno;
 924     }
 925
 926 done:
 927     return DO_STATUS(errno);
 928 }
 929
 930 /*
 931     NOTE
 932     When we perform an operation that could affect the layout of a
 933     directory (e.g., rename, mkdir, rmdir), we must lock the parent dir
 934     whenever possible. This blocks any ongoing path walk from reaching
 935     it, and hence avoids exposing any partial state.
 936 */
 937
 938 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
 939 {
 940     int errno;
 941     struct v_dnode* dnode;
 942     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
 943         return DO_STATUS(errno);
 944     }
 945
 946     lock_dnode(dnode);
 947
 948     if ((errno = vfs_check_writable(dnode))) {
 949         goto done;
 950     }
 951
 952     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
 953         errno = EROFS;
 954         goto done;
 955     }
 956
 957     if (dnode->ref_count > 1 || dnode->inode->open_count) {
 958         errno = EBUSY;
 959         goto done;
 960     }
 961
 962     if (!llist_empty(&dnode->children)) {
 963         errno = ENOTEMPTY;
 964         goto done;
 965     }
 966
 967     struct v_dnode* parent = dnode->parent;
 968
 969     if (!parent) {
 970         errno = EINVAL;
 971         goto done;
 972     }
 973
 974     lock_dnode(parent);
 975     lock_inode(parent->inode);
 976
 977     if (!(dnode->inode->itype & F_MFILE)) {
 978         errno = parent->inode->ops->rmdir(parent->inode, dnode);
 979         if (!errno) {
 980             vfs_dcache_remove(dnode);
 981         }
 982     } else {
 983         errno = ENOTDIR;
 984     }
 985
 986     unlock_inode(parent->inode);
 987     unlock_dnode(parent);
 988
 989 done:
 990     unlock_dnode(dnode);
 991     return DO_STATUS(errno);
 992 }
 993
 994 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
 995 {
 996     int errno = 0;
 997     struct v_dnode *parent, *dir;
 998     char name_value[VFS_NAME_MAXLEN];
 999     struct hstr name = HHSTR(name_value, 0, 0);
1000
1001     if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
1002         goto done;
1003     }
1004
1005     if (!(errno = vfs_walk(parent, name_value, &dir, NULL, 0))) {
1006         errno = EEXIST;
1007         goto done;
1008     }
1009
1010     if ((errno = vfs_check_writable(parent))) {
1011         goto done;
1012     }
1013
1014     if (!(dir = vfs_d_alloc(parent, &name))) {
1015         errno = ENOMEM;
1016         goto done;
1017     }
1018
1019     lock_dnode(parent);
1020     lock_inode(parent->inode);
1021
1022     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
1023         errno = ENOTSUP;
1024     } else if (!parent->inode->ops->mkdir) {
1025         errno = ENOTSUP;
1026     } else if ((parent->inode->itype & F_FILE)) {
1027         errno = ENOTDIR;
1028     } else if (!(errno = parent->inode->ops->mkdir(parent->inode, dir))) {
1029         vfs_dcache_add(parent, dir);
1030         goto cleanup;
1031     }
1032
1033     vfs_d_free(dir);
1034
1035 cleanup:
1036     unlock_inode(parent->inode);
1037     unlock_dnode(parent);
1038 done:
1039     return DO_STATUS(errno);
1040 }
1041
1042 int
1043 __vfs_do_unlink(struct v_dnode* dnode)
1044 {
1045     int errno;
1046     struct v_inode* inode = dnode->inode;
1047
1048     if (dnode->ref_count > 1) {
1049         return EBUSY;
1050     }
1051
1052     if ((errno = vfs_check_writable(dnode))) {
1053         return errno;
1054     }
1055
1056     lock_inode(inode);
1057
1058     if (inode->open_count) {
1059         errno = EBUSY;
1060     } else if ((inode->itype & F_MFILE)) {
1061         errno = inode->ops->unlink(inode);
1062         if (!errno) {
1063             vfs_d_free(dnode);
1064         }
1065     } else {
1066         errno = EISDIR;
1067     }
1068
1069     unlock_inode(inode);
1070
1071     return errno;
1072 }
1073
1074 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1075 {
1076     int errno;
1077     struct v_dnode* dnode;
1078     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1079         goto done;
1080     }
1081
1082     errno = __vfs_do_unlink(dnode);
1083
1084 done:
1085     return DO_STATUS(errno);
1086 }
1087
1088 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1089 {
1090     int errno;
1091     struct v_fd* fd_s;
1092     if ((errno = vfs_getfd(fd, &fd_s))) {
1093         goto done;
1094     }
1095
1096     struct v_dnode* dnode;
1097     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1098         errno = __vfs_do_unlink(dnode);
1099     }
1100
1101 done:
1102     return DO_STATUS(errno);
1103 }
1104
1105 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1106 {
1107     int errno;
1108     struct v_dnode *dentry, *to_link, *name_dentry, *name_file;
1109
1110     errno = __vfs_try_locate_file(oldpath, &dentry, &to_link, 0);
1111     if (!errno) {
1112         errno = __vfs_try_locate_file(
1113           newpath, &name_dentry, &name_file, FLOCATE_CREATE_ONLY);
1114         if (!errno) {
1115             errno = vfs_link(to_link, name_file);
1116         }
1117     }
1118     return DO_STATUS(errno);
1119 }
1120
1121 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1122 {
1123     int errno;
1124     struct v_fd* fd_s;
1125
1126     if (!(errno = vfs_getfd(fildes, &fd_s))) {
1127         errno = vfs_fsync(fd_s->file);
1128     }
1129
1130     return DO_STATUS(errno);
1131 }
1132
1133 int
1134 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1135 {
1136     int errno = 0;
1137     struct v_fd* copied = cake_grab(fd_pile);
1138
1139     memcpy(copied, old, sizeof(struct v_fd));
1140
1141     atomic_fetch_add(&old->file->ref_count, 1);
1142
1143     *new = copied;
1144
1145     return errno;
1146 }
1147
1148 int
1149 vfs_dup2(int oldfd, int newfd)
1150 {
1151     if (newfd == oldfd) {
1152         return newfd;
1153     }
1154
1155     int errno;
1156     struct v_fd *oldfd_s, *newfd_s;
1157     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1158         goto done;
1159     }
1160
1161     if (!TEST_FD(newfd)) {
1162         errno = EBADF;
1163         goto done;
1164     }
1165
1166     newfd_s = __current->fdtable->fds[newfd];
1167     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1168         goto done;
1169     }
1170
1171     if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1172         __current->fdtable->fds[newfd] = newfd_s;
1173         return newfd;
1174     }
1175
1176 done:
1177     return DO_STATUS(errno);
1178 }
1179
1180 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1181 {
1182     return vfs_dup2(oldfd, newfd);
1183 }
1184
1185 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1186 {
1187     int errno, newfd;
1188     struct v_fd *oldfd_s, *newfd_s;
1189     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1190         goto done;
1191     }
1192
1193     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1194         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1195         __current->fdtable->fds[newfd] = newfd_s;
1196         return newfd;
1197     }
1198
1199 done:
1200     return DO_STATUS(errno);
1201 }
1202
1203 __DEFINE_LXSYSCALL2(
1204   int, symlink, const char*, pathname, const char*, link_target)
1205 {
1206     int errno;
1207     struct v_dnode *dnode, *file;
1208     if ((errno = __vfs_try_locate_file(
1209            pathname, &dnode, &file, FLOCATE_CREATE_ONLY))) {
1210         goto done;
1211     }
1212
1213     if ((errno = vfs_check_writable(file))) {
1214         goto done;
1215     }
1216
1217     if (!file->inode->ops->set_symlink) {
1218         errno = ENOTSUP;
1219         goto done;
1220     }
1221
1222     lock_inode(file->inode);
1223
1224     errno = file->inode->ops->set_symlink(file->inode, link_target);
1225
1226     unlock_inode(file->inode);
1227
1228 done:
1229     return DO_STATUS(errno);
1230 }
1231
1232 void
1233 vfs_ref_file(struct v_file* file)
1234 {
1235     atomic_fetch_add(&file->ref_count, 1);
1236 }
1237
1238 void
1239 vfs_ref_dnode(struct v_dnode* dnode)
1240 {
1241     atomic_fetch_add(&dnode->ref_count, 1);
1242
1243     if (dnode->mnt) {
1244         mnt_mkbusy(dnode->mnt);
1245     }
1246 }
1247
1248 void
1249 vfs_unref_dnode(struct v_dnode* dnode)
1250 {
1251     atomic_fetch_sub(&dnode->ref_count, 1);
1252     if (dnode->mnt) {
1253         mnt_chillax(dnode->mnt);
1254     }
1255 }
1256
1257 int
1258 vfs_do_chdir(struct proc_info* proc, struct v_dnode* dnode)
1259 {
1260     int errno = 0;
1261
1262     lock_dnode(dnode);
1263
1264     if ((dnode->inode->itype & F_FILE)) {
1265         errno = ENOTDIR;
1266         goto done;
1267     }
1268
1269     if (proc->cwd) {
1270         vfs_unref_dnode(proc->cwd);
1271     }
1272
1273     vfs_ref_dnode(dnode);
1274     proc->cwd = dnode;
1275
1276     unlock_dnode(dnode);
1277
1278 done:
1279     return errno;
1280 }
1281
1282 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1283 {
1284     struct v_dnode* dnode;
1285     int errno = 0;
1286
1287     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1288         goto done;
1289     }
1290
1291     errno = vfs_do_chdir((struct proc_info*)__current, dnode);
1292
1293 done:
1294     return DO_STATUS(errno);
1295 }
1296
1297 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1298 {
1299     struct v_fd* fd_s;
1300     int errno = 0;
1301
1302     if ((errno = vfs_getfd(fd, &fd_s))) {
1303         goto done;
1304     }
1305
1306     errno = vfs_do_chdir((struct proc_info*)__current, fd_s->file->dnode);
1307
1308 done:
1309     return DO_STATUS(errno);
1310 }
1311
1312 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1313 {
1314     int errno = 0;
1315     char* ret_ptr = 0;
1316     if (size < 2) {
1317         errno = ERANGE;
1318         goto done;
1319     }
1320
1321     size_t len = 0;
1322
1323     if (!__current->cwd) {
1324         *buf = VFS_PATH_DELIM;
1325         len = 1;
1326     } else {
1327         len = vfs_get_path(__current->cwd, buf, size, 0);
1328         if (len == size) {
1329             errno = ERANGE;
1330             goto done;
1331         }
1332     }
1333
1334     buf[len] = '\0';
1335
1336     ret_ptr = buf;
1337
1338 done:
1339     syscall_result(errno);
1340     return ret_ptr;
1341 }
1342
1343 int
1344 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1345 {
1346     int errno = 0;
1347     if (current->inode->id == target->inode->id) {
1348         // hard link
1349         return 0;
1350     }
1351
1352     if ((errno = vfs_check_writable(current))) {
1353         return errno;
1354     }
1355
1356     if (current->ref_count > 1 || target->ref_count > 1) {
1357         return EBUSY;
1358     }
1359
1360     if (current->super_block != target->super_block) {
1361         return EXDEV;
1362     }
1363
1364     struct v_dnode* oldparent = current->parent;
1365     struct v_dnode* newparent = target->parent;
1366
1367     lock_dnode(current);
1368     lock_dnode(target);
1369     if (oldparent)
1370         lock_dnode(oldparent);
1371     if (newparent)
1372         lock_dnode(newparent);
1373
1374     if (!llist_empty(&target->children)) {
1375         errno = ENOTEMPTY;
1376         unlock_dnode(target);
1377         goto cleanup;
1378     }
1379
1380     if ((errno =
1381            current->inode->ops->rename(current->inode, current, target))) {
1382         unlock_dnode(target);
1383         goto cleanup;
1384     }
1385
1386     // re-position current
1387     hstrcpy(&current->name, &target->name);
1388     vfs_dcache_rehash(newparent, current);
1389
1390     // detach target
1391     vfs_d_free(target);
1392
1393     unlock_dnode(target);
1394
1395 cleanup:
1396     unlock_dnode(current);
1397     if (oldparent)
1398         unlock_dnode(oldparent);
1399     if (newparent)
1400         unlock_dnode(newparent);
1401
1402     return errno;
1403 }
1404
1405 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1406 {
1407     struct v_dnode *cur, *target_parent, *target;
1408     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1409     int errno = 0;
1410
1411     if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1412         goto done;
1413     }
1414
1415     if ((errno = vfs_walk(
1416            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1417         goto done;
1418     }
1419
1420     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1421     if (errno == ENOENT) {
1422         target = vfs_d_alloc(target_parent, &name);
1423         vfs_dcache_add(target_parent, target);
1424     } else if (errno) {
1425         goto done;
1426     }
1427
1428     if (!target) {
1429         errno = ENOMEM;
1430         goto done;
1431     }
1432
1433     errno = vfs_do_rename(cur, target);
1434
1435 done:
1436     vfree((void*)name.value);
1437     return DO_STATUS(errno);
1438 }
1439
1440 __DEFINE_LXSYSCALL2(int, fstat, int, fd, struct file_stat*, stat)
1441 {
1442     int errno = 0;
1443     struct v_fd* fds;
1444
1445     if ((errno = vfs_getfd(fd, &fds))) {
1446         goto done;
1447     }
1448
1449     struct v_inode* vino = fds->file->inode;
1450     struct device* fdev = vino->sb->dev;
1451
1452     *stat = (struct file_stat){.st_ino = vino->id,
1453                                .st_blocks = vino->lb_usage,
1454                                .st_size = vino->fsize,
1455                                .mode = vino->itype,
1456                                .st_ioblksize = PAGE_SIZE,
1457                                .st_blksize = vino->sb->blksize};
1458
1459     if (VFS_DEVFILE(vino->itype)) {
1460         struct device* rdev = resolve_device(vino->data);
1461         if (!rdev || rdev->magic != DEV_STRUCT_MAGIC) {
1462             errno = EINVAL;
1463             goto done;
1464         }
1465
1466         stat->st_rdev = (dev_t){.meta = rdev->ident.fn_grp,
1467                                 .unique = rdev->ident.unique,
1468                                 .index = rdev->dev_uid};
1469     }
1470
1471     if (fdev) {
1472         stat->st_dev = (dev_t){.meta = fdev->ident.fn_grp,
1473                                .unique = fdev->ident.unique,
1474                                .index = fdev->dev_uid};
1475     }
1476
1477 done:
1478     return DO_STATUS(errno);
1479 }