lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
   4  * @brief Lunaix virtual file system - an abstraction layer for all file system.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
  14 /*
  15  TODO vfs & device todos checklist
  16
  17     It is overseen by Twilight Sparkle ;)
  18
  19  1. Get inodes hooked into lru (CHECKED)
  20  2. Get dnodes hooked into lru (CHECKED)
 3. Get inodes properly hashed so they can be reused by the underlying fs (CHECKED)
  22  4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
  23         [good idea] or a constructor/destructor pattern in cake allocator ?
 5. (mount) Figure out a way to identify a busy mount point before unmount.
            Maybe a unified mount_point structure that maintains a reference
            counter on any dnodes within the subtree? Such a counter will only
            increment when a file is opened or a dnode is being used as working
            directory, and decrement conversely. (CHECKED)
  29  6. (mount) Ability to track all mount points (including sub-mounts)
  30             so we can be confident to clean up everything when we
  31             unmount. (CHECKED)
  32  7. (mount) Figure out a way to acquire the device represented by a dnode.
  33             so it can be used to mount. (e.g. we wish to get `struct device*`
  34             out of the dnode at /dev/sda)
  35             [tip] we should pay attention at twifs and add a private_data field
  36             under struct v_dnode? (CHECKED)
  37  8. (mount) Then, we should refactor on mount/unmount mechanism. (CHECKED)
  38  9. (mount) (future) Ability to mount any thing? e.g. Linux can mount a disk
  39                     image file using a so called "loopback" pseudo device. Maybe
  40                     we can do similar thing in Lunaix? A block device emulation
  41                     above the regular file when we mount it on.
  42  10. (device) device number (dev_t) allocation
  43             [good idea] <class>:<subclass>:<uniq_id> composition (CHECKED)
  44 */
  45
  46 #include <klibc/string.h>
  47 #include <lunaix/foptions.h>
  48 #include <lunaix/fs.h>
  49 #include <lunaix/mm/cake.h>
  50 #include <lunaix/mm/valloc.h>
  51 #include <lunaix/process.h>
  52 #include <lunaix/spike.h>
  53 #include <lunaix/syscall.h>
  54 #include <lunaix/syscall_utils.h>
  55
  56 #include <lunaix/fs/twifs.h>
  57
  58 #include <usr/lunaix/dirent_defs.h>
  59
  60 #define INODE_ACCESSED  0
  61 #define INODE_MODIFY    1
  62
  63 static struct cake_pile* dnode_pile;
  64 static struct cake_pile* inode_pile;
  65 static struct cake_pile* file_pile;
  66 static struct cake_pile* superblock_pile;
  67 static struct cake_pile* fd_pile;
  68
  69 struct v_dnode* vfs_sysroot = NULL;
  70
  71 struct lru_zone *dnode_lru, *inode_lru;
  72
  73 struct hstr vfs_ddot = HSTR("..", 2);
  74 struct hstr vfs_dot = HSTR(".", 1);
  75 struct hstr vfs_empty = HSTR("", 0);
  76
  77 static int
  78 __vfs_try_evict_dnode(struct lru_node* obj);
  79
  80 static int
  81 __vfs_try_evict_inode(struct lru_node* obj);
  82
  83 void
  84 vfs_init()
  85 {
  86     // 为他们专门创建一个蛋糕堆，而不使用valloc，这样我们可以最小化内碎片的产生
  87     dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
  88     inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
  89     file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
  90     fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
  91     superblock_pile =
  92       cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
  93
  94     dnode_lru = lru_new_zone("vfs_dnode", __vfs_try_evict_dnode);
  95     inode_lru = lru_new_zone("vfs_inode", __vfs_try_evict_inode);
  96
  97     hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
  98     hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
  99
 100     // 创建一个根dnode。
 101     vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
 102     vfs_sysroot->parent = vfs_sysroot;
 103
 104     vfs_ref_dnode(vfs_sysroot);
 105 }
 106
 107 static inline struct hbucket*
 108 __dcache_hash(struct v_dnode* parent, u32_t* hash)
 109 {
 110     struct hbucket* d_cache;
 111     u32_t _hash;
 112
 113     d_cache = parent->super_block->d_cache;
 114     _hash = *hash;
 115     _hash = _hash ^ (_hash >> VFS_HASHBITS);
 116     _hash += (u32_t)__ptr(parent);
 117
 118     *hash = _hash;
 119     return &d_cache[_hash & VFS_HASH_MASK];
 120 }
 121
 122 static inline int
 123 __sync_inode_nolock(struct v_inode* inode)
 124 {
 125     pcache_commit_all(inode);
 126
 127     int errno = ENOTSUP;
 128     if (inode->ops->sync) {
 129         errno = inode->ops->sync(inode);
 130     }
 131
 132     return errno;
 133 }
 134
 135 struct v_dnode*
 136 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 137 {
 138     if (!str->len || HSTR_EQ(str, &vfs_dot))
 139         return parent;
 140
 141     if (HSTR_EQ(str, &vfs_ddot)) {
 142         return parent->parent;
 143     }
 144
 145     u32_t hash = str->hash;
 146     struct hbucket* slot = __dcache_hash(parent, &hash);
 147
 148     struct v_dnode *pos, *n;
 149     hashtable_bucket_foreach(slot, pos, n, hash_list)
 150     {
 151         if (pos->name.hash == hash && pos->parent == parent) {
 152             return pos;
 153         }
 154     }
 155     return NULL;
 156 }
 157
 158 static void
 159 __vfs_touch_inode(struct v_inode* inode, const int type)
 160 {
 161     if (type == INODE_MODIFY) {
 162         inode->mtime = clock_unixtime();
 163     }
 164
 165     else if (type == INODE_ACCESSED) {
 166         inode->atime = clock_unixtime();
 167     }
 168
 169     lru_use_one(inode_lru, &inode->lru);
 170 }
 171
 172 void
 173 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 174 {
 175     assert(parent);
 176
 177     dnode->ref_count = 1;
 178     dnode->parent = parent;
 179     llist_append(&parent->children, &dnode->siblings);
 180
 181     struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash);
 182     hlist_add(&bucket->head, &dnode->hash_list);
 183 }
 184
 185 void
 186 vfs_dcache_remove(struct v_dnode* dnode)
 187 {
 188     assert(dnode);
 189     assert(dnode->ref_count == 1);
 190
 191     llist_delete(&dnode->siblings);
 192     llist_delete(&dnode->aka_list);
 193     hlist_delete(&dnode->hash_list);
 194
 195     dnode->parent = NULL;
 196     dnode->ref_count = 0;
 197 }
 198
 199 void
 200 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
 201 {
 202     assert(new_parent);
 203
 204     hstr_rehash(&dnode->name, HSTR_FULL_HASH);
 205     vfs_dcache_remove(dnode);
 206     vfs_dcache_add(new_parent, dnode);
 207 }
 208
 209 int
 210 vfs_open(struct v_dnode* dnode, struct v_file** file)
 211 {
 212     struct v_inode* inode = dnode->inode;
 213
 214     if (!inode || !inode->ops->open) {
 215         return ENOTSUP;
 216     }
 217
 218     lock_inode(inode);
 219
 220     struct v_file* vfile = cake_grab(file_pile);
 221     memset(vfile, 0, sizeof(*vfile));
 222
 223     vfile->dnode = dnode;
 224     vfile->inode = inode;
 225     vfile->ref_count = 1;
 226     vfile->ops = inode->default_fops;
 227
 228     if (check_regfile_node(inode) && !inode->pg_cache) {
 229         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 230         pcache_init(pcache);
 231         pcache->master = inode;
 232         inode->pg_cache = pcache;
 233     }
 234
 235     int errno = inode->ops->open(inode, vfile);
 236     if (errno) {
 237         cake_release(file_pile, vfile);
 238     } else {
 239         vfs_ref_dnode(dnode);
 240         inode->open_count++;
 241
 242         *file = vfile;
 243     }
 244
 245     unlock_inode(inode);
 246
 247     return errno;
 248 }
 249
 250 void
 251 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
 252 {
 253     if (assign_to->inode) {
 254         llist_delete(&assign_to->aka_list);
 255         assign_to->inode->link_count--;
 256     }
 257
 258     llist_append(&inode->aka_dnodes, &assign_to->aka_list);
 259     assign_to->inode = inode;
 260     inode->link_count++;
 261 }
 262
 263 int
 264 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 265 {
 266     int errno;
 267
 268     if ((errno = vfs_check_writable(to_link))) {
 269         return errno;
 270     }
 271
 272     lock_inode(to_link->inode);
 273     if (to_link->super_block->root != name->super_block->root) {
 274         errno = EXDEV;
 275     } else if (!to_link->inode->ops->link) {
 276         errno = ENOTSUP;
 277     } else if (!(errno = to_link->inode->ops->link(to_link->inode, name))) {
 278         vfs_assign_inode(name, to_link->inode);
 279     }
 280     unlock_inode(to_link->inode);
 281
 282     return errno;
 283 }
 284
 285 int
 286 vfs_pclose(struct v_file* file, pid_t pid)
 287 {
 288     struct v_inode* inode;
 289     int errno = 0;
 290
 291     inode = file->inode;
 292
 293     /*
 294      * Prevent dead lock.
 295      * This happened when process is terminated while blocking on read.
 296      * In that case, the process is still holding the inode lock and it
 297          will never get released.
 298      * The unlocking should also include ownership check.
 299      *
 300      * To see why, consider two process both open the same file both with
 301      * fd=x.
 302      *      Process A: busy on reading x
 303      *      Process B: do nothing with x
 304      * Assuming that, after a very short time, process B get terminated
 305      * while process A is still busy in it's reading business. By this
 306      * design, the inode lock of this file x is get released by B rather
 307      * than A. And this will cause a probable race condition on A if other
 308      * process is writing to this file later after B exit.
 309     */
 310
 311     mutex_unlock_for(&inode->lock, pid);
 312
 313     if (vfs_check_duped_file(file)) {
 314         vfs_unref_file(file);
 315         return 0;
 316     }
 317
 318     if ((errno = file->ops->close(file))) {
 319         goto done;
 320     }
 321
 322     vfs_unref_dnode(file->dnode);
 323     cake_release(file_pile, file);
 324
 325     /*
 326         if the current inode is not being locked by other
 327         threads that does not share same open context,
 328         then we can try to do sync opportunistically
 329     */
 330     if (mutex_on_hold(&inode->lock)) {
 331         goto done;
 332     }
 333
 334     lock_inode(inode);
 335
 336     pcache_commit_all(inode);
 337     inode->open_count--;
 338
 339     if (!inode->open_count) {
 340         __sync_inode_nolock(inode);
 341     }
 342
 343     unlock_inode(inode);
 344
 345 done:
 346     return errno;
 347 }
 348
 349 int
 350 vfs_close(struct v_file* file)
 351 {
 352     return vfs_pclose(file, __current->pid);
 353 }
 354
 355 void
 356 vfs_free_fd(struct v_fd* fd)
 357 {
 358     cake_release(fd_pile, fd);
 359 }
 360
 361 int
 362 vfs_isync(struct v_inode* inode)
 363 {
 364     lock_inode(inode);
 365
 366     int errno = __sync_inode_nolock(inode);
 367
 368     unlock_inode(inode);
 369
 370     return errno;
 371 }
 372
 373 int
 374 vfs_fsync(struct v_file* file)
 375 {
 376     int errno;
 377     if ((errno = vfs_check_writable(file->dnode))) {
 378         return errno;
 379     }
 380
 381     return vfs_isync(file->inode);
 382 }
 383
 384 int
 385 vfs_alloc_fdslot(int* fd)
 386 {
 387     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 388         if (!__current->fdtable->fds[i]) {
 389             *fd = i;
 390             return 0;
 391         }
 392     }
 393     return EMFILE;
 394 }
 395
 396 struct v_superblock*
 397 vfs_sb_alloc()
 398 {
 399     struct v_superblock* sb = cake_grab(superblock_pile);
 400     memset(sb, 0, sizeof(*sb));
 401     llist_init_head(&sb->sb_list);
 402
 403     sb->i_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 404     sb->d_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 405
 406     sb->ref_count = 1;
 407     return sb;
 408 }
 409
 410 void
 411 vfs_sb_ref(struct v_superblock* sb)
 412 {
 413     sb->ref_count++;
 414 }
 415
 416 void
 417 vfs_sb_unref(struct v_superblock* sb)
 418 {
 419     assert(sb->ref_count);
 420
 421     sb->ref_count--;
 422     if (likely(sb->ref_count)) {
 423         return;
 424     }
 425
 426     if (sb->ops.release) {
 427         sb->ops.release(sb);
 428     }
 429
 430     vfree(sb->i_cache);
 431     vfree(sb->d_cache);
 432
 433     cake_release(superblock_pile, sb);
 434 }
 435
 436 static int
 437 __vfs_try_evict_dnode(struct lru_node* obj)
 438 {
 439     struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
 440
 441     if (!dnode->ref_count) {
 442         vfs_d_free(dnode);
 443         return 1;
 444     }
 445     return 0;
 446 }
 447
 448 static int
 449 __vfs_try_evict_inode(struct lru_node* obj)
 450 {
 451     struct v_inode* inode = container_of(obj, struct v_inode, lru);
 452
 453     if (!inode->link_count && !inode->open_count) {
 454         vfs_i_free(inode);
 455         return 1;
 456     }
 457     return 0;
 458 }
 459
 460 struct v_dnode*
 461 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
 462 {
 463     struct v_dnode* dnode = cake_grab(dnode_pile);
 464     if (!dnode) {
 465         lru_evict_half(dnode_lru);
 466
 467         if (!(dnode = cake_grab(dnode_pile))) {
 468             return NULL;
 469         }
 470     }
 471
 472     memset(dnode, 0, sizeof(*dnode));
 473     llist_init_head(&dnode->children);
 474     llist_init_head(&dnode->siblings);
 475     llist_init_head(&dnode->aka_list);
 476     mutex_init(&dnode->lock);
 477
 478     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 479
 480     hstrcpy(&dnode->name, name);
 481
 482     if (parent) {
 483         vfs_d_assign_sb(dnode, parent->super_block);
 484         dnode->mnt = parent->mnt;
 485     }
 486
 487     lru_use_one(dnode_lru, &dnode->lru);
 488
 489     return dnode;
 490 }
 491
 492 void
 493 vfs_d_free(struct v_dnode* dnode)
 494 {
 495     assert(dnode->ref_count == 1);
 496
 497     if (dnode->inode) {
 498         assert(dnode->inode->link_count > 0);
 499         dnode->inode->link_count--;
 500     }
 501
 502     vfs_dcache_remove(dnode);
 503     // Make sure the children de-referencing their parent.
 504     // With lru presented, the eviction will be propagated over the entire
 505     // detached subtree eventually
 506     struct v_dnode *pos, *n;
 507     llist_for_each(pos, n, &dnode->children, siblings)
 508     {
 509         vfs_dcache_remove(pos);
 510     }
 511
 512     if (dnode->destruct) {
 513         dnode->destruct(dnode);
 514     }
 515
 516     vfs_sb_unref(dnode->super_block);
 517     vfree((void*)dnode->name.value);
 518     cake_release(dnode_pile, dnode);
 519 }
 520
 521 struct v_inode*
 522 vfs_i_find(struct v_superblock* sb, u32_t i_id)
 523 {
 524     struct hbucket* slot = &sb->i_cache[i_id & VFS_HASH_MASK];
 525     struct v_inode *pos, *n;
 526     hashtable_bucket_foreach(slot, pos, n, hash_list)
 527     {
 528         if (pos->id == i_id) {
 529             lru_use_one(inode_lru, &pos->lru);
 530             return pos;
 531         }
 532     }
 533
 534     return NULL;
 535 }
 536
 537 void
 538 vfs_i_addhash(struct v_inode* inode)
 539 {
 540     struct hbucket* slot = &inode->sb->i_cache[inode->id & VFS_HASH_MASK];
 541
 542     hlist_delete(&inode->hash_list);
 543     hlist_add(&slot->head, &inode->hash_list);
 544 }
 545
 546 struct v_inode*
 547 vfs_i_alloc(struct v_superblock* sb)
 548 {
 549     assert(sb->ops.init_inode);
 550
 551     struct v_inode* inode;
 552     if (!(inode = cake_grab(inode_pile))) {
 553         lru_evict_half(inode_lru);
 554         if (!(inode = cake_grab(inode_pile))) {
 555             return NULL;
 556         }
 557     }
 558
 559     memset(inode, 0, sizeof(*inode));
 560     mutex_init(&inode->lock);
 561     llist_init_head(&inode->xattrs);
 562     llist_init_head(&inode->aka_dnodes);
 563
 564     sb->ops.init_inode(sb, inode);
 565
 566     inode->ctime = clock_unixtime();
 567     inode->atime = inode->ctime;
 568     inode->mtime = inode->ctime;
 569
 570     vfs_i_assign_sb(inode, sb);
 571     lru_use_one(inode_lru, &inode->lru);
 572     return inode;
 573 }
 574
 575 void
 576 vfs_i_free(struct v_inode* inode)
 577 {
 578     if (inode->pg_cache) {
 579         pcache_release(inode->pg_cache);
 580         vfree(inode->pg_cache);
 581     }
 582     // we don't need to sync inode.
 583     // If an inode can be free, then it must be properly closed.
 584     // Hence it must be synced already!
 585     if (inode->destruct) {
 586         inode->destruct(inode);
 587     }
 588
 589     vfs_sb_unref(inode->sb);
 590     hlist_delete(&inode->hash_list);
 591     cake_release(inode_pile, inode);
 592 }
 593
 594 /* ---- System call definition and support ---- */
 595
 596 // make a new name when not exists
 597 #define FLOC_MAYBE_MKNAME 1
 598
 599 // name must be non-exist and made.
 600 #define FLOC_MKNAME 2
 601
 602 // no follow symlink
 603 #define FLOC_NOFOLLOW 4
 604
 605 int
 606 vfs_getfd(int fd, struct v_fd** fd_s)
 607 {
 608     if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) {
 609         return 0;
 610     }
 611     return EBADF;
 612 }
 613
 614 static int
 615 __vfs_mknod(struct v_inode* parent, struct v_dnode* dnode,
 616             unsigned int itype, dev_t* dev)
 617 {
 618     int errno;
 619
 620     errno = parent->ops->create(parent, dnode, itype);
 621     if (errno) {
 622         return errno;
 623     }
 624
 625     return 0;
 626 }
 627
 628 struct file_locator {
 629     struct v_dnode* dir;
 630     struct v_dnode* file;
 631     bool fresh;
 632 };
 633
 634 /**
 635  * @brief unlock the file locator (floc) if possible.
 636  *        If the file to be located if not exists, and
 637  *        any FLOC_*MKNAME flag is set, then the parent
 638  *        dnode will be locked until the file has been properly
 639  *        finalised by subsequent logic.
 640  *
 641  * @param floc
 642  */
 643 static inline void
 644 __floc_try_unlock(struct file_locator* floc)
 645 {
 646     if (floc->fresh) {
 647         assert(floc->dir);
 648         unlock_dnode(floc->dir);
 649     }
 650 }
 651
 652 static int
 653 __vfs_try_locate_file(const char* path,
 654                       struct file_locator* floc,
 655                       int options)
 656 {
 657     char name_str[VFS_NAME_MAXLEN];
 658     struct v_dnode *fdir, *file;
 659     struct hstr name = HSTR(name_str, 0);
 660     int errno, woption = 0;
 661
 662     if ((options & FLOC_NOFOLLOW)) {
 663         woption |= VFS_WALK_NOFOLLOW;
 664         options &= ~FLOC_NOFOLLOW;
 665     }
 666
 667     floc->fresh = false;
 668     name_str[0] = 0;
 669     errno = vfs_walk_proc(path, &fdir, &name, woption | VFS_WALK_PARENT);
 670     if (errno) {
 671         return errno;
 672     }
 673
 674     errno = vfs_walk(fdir, name.value, &file, NULL, woption);
 675
 676     if (errno && errno != ENOENT) {
 677         goto done;
 678     }
 679
 680     if (!errno) {
 681         if ((options & FLOC_MKNAME)) {
 682             errno = EEXIST;
 683         }
 684         goto done;
 685     }
 686
 687     // errno == ENOENT
 688     if (!options) {
 689         goto done;
 690     }
 691
 692     errno = vfs_check_writable(fdir);
 693     if (errno) {
 694         goto done;
 695     }
 696
 697     floc->fresh = true;
 698
 699     file = vfs_d_alloc(fdir, &name);
 700
 701     if (!file) {
 702         return ENOMEM;
 703     }
 704
 705     lock_dnode(fdir);
 706
 707     vfs_dcache_add(fdir, file);
 708
 709 done:
 710     floc->dir   = fdir;
 711     floc->file  = file;
 712
 713     return errno;
 714 }
 715
 716
 717 static bool
 718 __check_unlinkable(struct v_dnode* dnode)
 719 {
 720     int acl;
 721     bool wr_self, wr_parent;
 722     struct v_dnode* parent;
 723
 724     parent = dnode->parent;
 725     acl = dnode->inode->acl;
 726
 727     wr_self = check_allow_write(dnode->inode);
 728     wr_parent = check_allow_write(parent->inode);
 729
 730     if (!fsacl_test(acl, svtx)) {
 731         return wr_self;
 732     }
 733
 734     if (current_euid() == dnode->inode->uid) {
 735         return true;
 736     }
 737
 738     return wr_self && wr_parent;
 739 }
 740
 741 int
 742 vfs_do_open(const char* path, int options)
 743 {
 744     int errno, fd, loptions = 0;
 745     struct v_dnode *dentry, *file;
 746     struct v_file* ofile = NULL;
 747     struct file_locator floc;
 748     struct v_inode* inode;
 749
 750     if ((options & FO_CREATE)) {
 751         loptions |= FLOC_MAYBE_MKNAME;
 752     } else if ((options & FO_NOFOLLOW)) {
 753         loptions |= FLOC_NOFOLLOW;
 754     }
 755
 756     errno = __vfs_try_locate_file(path, &floc, loptions);
 757
 758     if (errno || (errno = vfs_alloc_fdslot(&fd))) {
 759         return errno;
 760     }
 761
 762     file   = floc.file;
 763     dentry = floc.dir;
 764
 765     if (floc.fresh) {
 766         errno = __vfs_mknod(dentry->inode, file, VFS_IFFILE, NULL);
 767         if (errno) {
 768             vfs_d_free(file);
 769             __floc_try_unlock(&floc);
 770             return errno;
 771         }
 772
 773         __floc_try_unlock(&floc);
 774     }
 775
 776
 777     if ((errno = vfs_open(file, &ofile))) {
 778         return errno;
 779     }
 780
 781     inode = ofile->inode;
 782     lock_inode(inode);
 783
 784     struct v_fd* fd_s = cake_grab(fd_pile);
 785     memset(fd_s, 0, sizeof(*fd_s));
 786
 787     if ((options & O_TRUNC)) {
 788         file->inode->fsize = 0;
 789     }
 790
 791     if (vfs_get_dtype(inode->itype) == DT_DIR) {
 792         ofile->f_pos = 0;
 793     }
 794
 795     fd_s->file = ofile;
 796     fd_s->flags = options;
 797     __current->fdtable->fds[fd] = fd_s;
 798
 799     unlock_inode(inode);
 800
 801     return fd;
 802 }
 803
 804 __DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
 805 {
 806     int errno = vfs_do_open(path, options);
 807     return DO_STATUS_OR_RETURN(errno);
 808 }
 809
 810 __DEFINE_LXSYSCALL1(int, close, int, fd)
 811 {
 812     struct v_fd* fd_s;
 813     int errno = 0;
 814     if ((errno = vfs_getfd(fd, &fd_s))) {
 815         goto done_err;
 816     }
 817
 818     if ((errno = vfs_close(fd_s->file))) {
 819         goto done_err;
 820     }
 821
 822     cake_release(fd_pile, fd_s);
 823     __current->fdtable->fds[fd] = 0;
 824
 825 done_err:
 826     return DO_STATUS(errno);
 827 }
 828
 829 void
 830 __vfs_readdir_callback(struct dir_context* dctx,
 831                        const char* name,
 832                        const int len,
 833                        const int dtype)
 834 {
 835     struct lx_dirent* dent = (struct lx_dirent*)dctx->cb_data;
 836     strncpy(dent->d_name, name, MIN(len, DIRENT_NAME_MAX_LEN));
 837     dent->d_nlen = len;
 838     dent->d_type = dtype;
 839 }
 840
 841 __DEFINE_LXSYSCALL2(int, sys_readdir, int, fd, struct lx_dirent*, dent)
 842 {
 843     struct v_fd* fd_s;
 844     int errno;
 845
 846     if ((errno = vfs_getfd(fd, &fd_s))) {
 847         goto done;
 848     }
 849
 850     struct v_inode* inode = fd_s->file->inode;
 851
 852     lock_inode(inode);
 853
 854     if (!check_directory_node(inode)) {
 855         errno = ENOTDIR;
 856         goto unlock;
 857     }
 858
 859     if (!check_allow_read(inode)) {
 860         errno = EPERM;
 861         goto unlock;
 862     }
 863
 864     struct dir_context dctx = (struct dir_context) {
 865         .cb_data = dent,
 866         .read_complete_callback = __vfs_readdir_callback
 867     };
 868
 869     if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
 870         goto unlock;
 871     }
 872     dent->d_offset++;
 873     fd_s->file->f_pos++;
 874
 875 unlock:
 876     unlock_inode(inode);
 877
 878 done:
 879     return DO_STATUS_OR_RETURN(errno);
 880 }
 881
 882 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 883 {
 884     int errno = 0;
 885     struct v_fd* fd_s;
 886     struct v_inode* inode;
 887
 888     if ((errno = vfs_getfd(fd, &fd_s))) {
 889         goto done;
 890     }
 891
 892     struct v_file* file = fd_s->file;
 893     if (check_directory_node(file->inode)) {
 894         errno = EISDIR;
 895         goto done;
 896     }
 897
 898     if (!check_allow_read(file->inode)) {
 899         errno = EPERM;
 900         goto done;
 901     }
 902
 903     inode = file->inode;
 904     lock_inode(inode);
 905
 906     __vfs_touch_inode(inode, INODE_ACCESSED);
 907
 908     if (check_seqdev_node(inode) || (fd_s->flags & FO_DIRECT)) {
 909         errno = file->ops->read(inode, buf, count, file->f_pos);
 910     } else {
 911         errno = pcache_read(inode, buf, count, file->f_pos);
 912     }
 913
 914     if (errno > 0) {
 915         file->f_pos += errno;
 916         unlock_inode(inode);
 917         return errno;
 918     }
 919
 920     unlock_inode(inode);
 921
 922 done:
 923     return DO_STATUS(errno);
 924 }
 925
 926 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
 927 {
 928     int errno = 0;
 929     struct v_fd* fd_s;
 930     if ((errno = vfs_getfd(fd, &fd_s))) {
 931         goto done;
 932     }
 933
 934     struct v_inode* inode;
 935     struct v_file* file = fd_s->file;
 936
 937     if ((errno = vfs_check_writable(file->dnode))) {
 938         goto done;
 939     }
 940
 941     if (check_directory_node(file->inode)) {
 942         errno = EISDIR;
 943         goto done;
 944     }
 945
 946     inode = file->inode;
 947     lock_inode(inode);
 948
 949     __vfs_touch_inode(inode, INODE_MODIFY);
 950     if ((fd_s->flags & O_APPEND)) {
 951         file->f_pos = inode->fsize;
 952     }
 953
 954     if (check_seqdev_node(inode) || (fd_s->flags & FO_DIRECT)) {
 955         errno = file->ops->write(inode, buf, count, file->f_pos);
 956     } else {
 957         errno = pcache_write(inode, buf, count, file->f_pos);
 958     }
 959
 960     if (errno > 0) {
 961         file->f_pos += errno;
 962         inode->fsize = MAX(inode->fsize, file->f_pos);
 963
 964         unlock_inode(inode);
 965         return errno;
 966     }
 967
 968     unlock_inode(inode);
 969
 970 done:
 971     return DO_STATUS(errno);
 972 }
 973
 974 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
 975 {
 976     int errno = 0;
 977     struct v_fd* fd_s;
 978     if ((errno = vfs_getfd(fd, &fd_s))) {
 979         goto done;
 980     }
 981
 982     struct v_file* file = fd_s->file;
 983     struct v_inode* inode = file->inode;
 984
 985     if (!file->ops->seek) {
 986         errno = ENOTSUP;
 987         goto done;
 988     }
 989
 990     if (!check_allow_read(inode)) {
 991         errno = EPERM;
 992         goto done;
 993     }
 994
 995     lock_inode(inode);
 996
 997     int overflow = 0;
 998     int fpos = file->f_pos;
 999
1000     if (vfs_get_dtype(inode->itype) == DT_DIR) {
1001         options = (options != FSEEK_END) ? options : FSEEK_SET;
1002     }
1003
1004     switch (options) {
1005         case FSEEK_CUR:
1006             overflow = sadd_of((int)file->f_pos, offset, &fpos);
1007             break;
1008         case FSEEK_END:
1009             overflow = sadd_of((int)inode->fsize, offset, &fpos);
1010             break;
1011         case FSEEK_SET:
1012             fpos = offset;
1013             break;
1014     }
1015
1016     if (overflow) {
1017         errno = EOVERFLOW;
1018     }
1019     else {
1020         errno = file->ops->seek(file, fpos);
1021     }
1022
1023     unlock_inode(inode);
1024
1025 done:
1026     return DO_STATUS(errno);
1027 }
1028
1029 int
1030 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
1031 {
1032     if (!dnode) {
1033         return 0;
1034     }
1035
1036     if (depth > 64) {
1037         return ENAMETOOLONG;
1038     }
1039
1040     size_t len = 0;
1041
1042     if (dnode->parent != dnode) {
1043         len = vfs_get_path(dnode->parent, buf, size, depth + 1);
1044     }
1045
1046     if (len >= size) {
1047         return len;
1048     }
1049
1050     if (!len || buf[len - 1] != VFS_PATH_DELIM) {
1051         buf[len++] = VFS_PATH_DELIM;
1052     }
1053
1054     size_t cpy_size = MIN(dnode->name.len, size - len);
1055     strncpy(buf + len, dnode->name.value, cpy_size);
1056     len += cpy_size;
1057
1058     return len;
1059 }
1060
1061 int
1062 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
1063 {
1064     const char* link;
1065     struct v_inode* inode = dnode->inode;
1066
1067     if (!check_symlink_node(inode)) {
1068         return EINVAL;
1069     }
1070
1071     if (!inode->ops->read_symlink) {
1072         return ENOTSUP;
1073     }
1074
1075     if (!check_allow_read(inode)) {
1076         return EPERM;
1077     }
1078
1079     lock_inode(inode);
1080
1081     int errno = inode->ops->read_symlink(inode, &link);
1082     if (errno >= 0) {
1083         strncpy(buf, link, MIN(size, (size_t)errno));
1084     }
1085
1086     unlock_inode(inode);
1087     return errno;
1088 }
1089
1090 int
1091 vfs_get_dtype(int itype)
1092 {
1093     int dtype = DT_FILE;
1094     if (check_itype(itype, VFS_IFSYMLINK)) {
1095         dtype |= DT_SYMLINK;
1096     }
1097
1098     if (check_itype(itype, VFS_IFDIR)) {
1099         dtype |= DT_DIR;
1100         return dtype;
1101     }
1102
1103     // TODO other types
1104
1105     return dtype;
1106 }
1107
1108 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
1109 {
1110     int errno;
1111     struct v_fd* fd_s;
1112     if ((errno = vfs_getfd(fd, &fd_s))) {
1113         goto done;
1114     }
1115
1116     struct v_dnode* dnode;
1117     errno = vfs_get_path(fd_s->file->dnode, buf, size, 0);
1118
1119     if (errno >= 0) {
1120         return errno;
1121     }
1122
1123 done:
1124     return DO_STATUS(errno);
1125 }
1126
1127 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
1128 {
1129     int errno;
1130     struct v_dnode* dnode;
1131     if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
1132         errno = vfs_readlink(dnode, buf, size);
1133     }
1134
1135     if (errno >= 0) {
1136         return errno;
1137     }
1138
1139     return DO_STATUS(errno);
1140 }
1141
1142 __DEFINE_LXSYSCALL4(
1143   int, readlinkat, int, dirfd, const char*, pathname, char*, buf, size_t, size)
1144 {
1145     int errno;
1146     struct v_fd* fd_s;
1147     if ((errno = vfs_getfd(dirfd, &fd_s))) {
1148         goto done;
1149     }
1150
1151     pathname = pathname ? pathname : "";
1152
1153     struct v_dnode* dnode;
1154     if (!(errno = vfs_walk(
1155             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
1156         errno = vfs_readlink(fd_s->file->dnode, buf, size);
1157     }
1158
1159     if (errno >= 0) {
1160         return errno;
1161     }
1162
1163 done:
1164     return DO_STATUS(errno);
1165 }
1166
/*
    NOTE
    When we perform an operation that could affect the layout of a
    directory (i.e., rename, mkdir, rmdir), we must lock the parent dir
    whenever possible. This blocks any ongoing path walk from reaching
    it, hence avoiding any partial state.
*/
1174
1175 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
1176 {
1177     int errno;
1178     struct v_dnode* dnode;
1179     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1180         return DO_STATUS(errno);
1181     }
1182
1183     lock_dnode(dnode);
1184
1185     if (!__check_unlinkable(dnode)) {
1186         errno = EPERM;
1187         goto done;
1188     }
1189
1190     if ((errno = vfs_check_writable(dnode))) {
1191         goto done;
1192     }
1193
1194     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
1195         errno = EROFS;
1196         goto done;
1197     }
1198
1199     if (dnode->ref_count > 1 || dnode->inode->open_count) {
1200         errno = EBUSY;
1201         goto done;
1202     }
1203
1204     if (!llist_empty(&dnode->children)) {
1205         errno = ENOTEMPTY;
1206         goto done;
1207     }
1208
1209     struct v_dnode* parent = dnode->parent;
1210
1211     if (!parent) {
1212         errno = EINVAL;
1213         goto done;
1214     }
1215
1216     lock_dnode(parent);
1217     lock_inode(parent->inode);
1218
1219     if (check_directory_node(dnode->inode)) {
1220         errno = parent->inode->ops->rmdir(parent->inode, dnode);
1221         if (!errno) {
1222             vfs_dcache_remove(dnode);
1223         }
1224     } else {
1225         errno = ENOTDIR;
1226     }
1227
1228     unlock_inode(parent->inode);
1229     unlock_dnode(parent);
1230
1231 done:
1232     unlock_dnode(dnode);
1233     return DO_STATUS(errno);
1234 }
1235
1236 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
1237 {
1238     int errno = 0;
1239     struct v_dnode *parent, *dir;
1240     char name_value[VFS_NAME_MAXLEN];
1241     struct hstr name = HHSTR(name_value, 0, 0);
1242
1243     if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
1244         goto done;
1245     }
1246
1247     if (!(errno = vfs_walk(parent, name_value, &dir, NULL, 0))) {
1248         errno = EEXIST;
1249         goto done;
1250     }
1251
1252     if ((errno = vfs_check_writable(parent))) {
1253         goto done;
1254     }
1255
1256     if (!(dir = vfs_d_alloc(parent, &name))) {
1257         errno = ENOMEM;
1258         goto done;
1259     }
1260
1261     struct v_inode* inode = parent->inode;
1262
1263     lock_dnode(parent);
1264     lock_inode(inode);
1265
1266     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
1267         errno = ENOTSUP;
1268     } else if (!inode->ops->mkdir) {
1269         errno = ENOTSUP;
1270     } else if (!check_directory_node(inode)) {
1271         errno = ENOTDIR;
1272     } else if (!(errno = inode->ops->mkdir(inode, dir))) {
1273         vfs_dcache_add(parent, dir);
1274         goto cleanup;
1275     }
1276
1277     vfs_d_free(dir);
1278
1279 cleanup:
1280     unlock_inode(inode);
1281     unlock_dnode(parent);
1282 done:
1283     return DO_STATUS(errno);
1284 }
1285
1286 static int
1287 __vfs_do_unlink(struct v_dnode* dnode)
1288 {
1289     int errno;
1290     struct v_inode* inode = dnode->inode;
1291
1292     if (dnode->ref_count > 1) {
1293         return EBUSY;
1294     }
1295
1296     if (!__check_unlinkable(dnode)) {
1297         return EPERM;
1298     }
1299
1300     if ((errno = vfs_check_writable(dnode))) {
1301         return errno;
1302     }
1303
1304     lock_inode(inode);
1305
1306     if (inode->open_count) {
1307         errno = EBUSY;
1308     } else if (!check_directory_node(inode)) {
1309         errno = inode->ops->unlink(inode, dnode);
1310         if (!errno) {
1311             vfs_d_free(dnode);
1312         }
1313     } else {
1314         errno = EISDIR;
1315     }
1316
1317     unlock_inode(inode);
1318
1319     return errno;
1320 }
1321
1322 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1323 {
1324     int errno;
1325     struct v_dnode* dnode;
1326     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1327         goto done;
1328     }
1329
1330     errno = __vfs_do_unlink(dnode);
1331
1332 done:
1333     return DO_STATUS(errno);
1334 }
1335
1336 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1337 {
1338     int errno;
1339     struct v_fd* fd_s;
1340     if ((errno = vfs_getfd(fd, &fd_s))) {
1341         goto done;
1342     }
1343
1344     struct v_dnode* dnode;
1345     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1346         errno = __vfs_do_unlink(dnode);
1347     }
1348
1349 done:
1350     return DO_STATUS(errno);
1351 }
1352
1353 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1354 {
1355     int errno;
1356     struct file_locator floc;
1357     struct v_dnode *to_link, *name_file;
1358
1359     errno = __vfs_try_locate_file(oldpath, &floc, 0);
1360     if (errno) {
1361         goto done;
1362     }
1363
1364     __floc_try_unlock(&floc);
1365
1366     to_link = floc.file;
1367     errno = __vfs_try_locate_file(newpath, &floc, FLOC_MKNAME);
1368     if (!errno) {
1369         goto done;
1370     }
1371
1372     name_file = floc.file;
1373     errno = vfs_link(to_link, name_file);
1374     if (errno) {
1375         vfs_d_free(name_file);
1376     }
1377
1378 done:
1379     __floc_try_unlock(&floc);
1380     return DO_STATUS(errno);
1381 }
1382
1383 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1384 {
1385     int errno;
1386     struct v_fd* fd_s;
1387
1388     if (!(errno = vfs_getfd(fildes, &fd_s))) {
1389         errno = vfs_fsync(fd_s->file);
1390     }
1391
1392     return DO_STATUS(errno);
1393 }
1394
1395 int
1396 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1397 {
1398     int errno = 0;
1399     struct v_fd* copied = cake_grab(fd_pile);
1400
1401     memcpy(copied, old, sizeof(struct v_fd));
1402
1403     vfs_ref_file(old->file);
1404
1405     *new = copied;
1406
1407     return errno;
1408 }
1409
1410 int
1411 vfs_dup2(int oldfd, int newfd)
1412 {
1413     if (newfd == oldfd) {
1414         return newfd;
1415     }
1416
1417     int errno;
1418     struct v_fd *oldfd_s, *newfd_s;
1419     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1420         goto done;
1421     }
1422
1423     if (!TEST_FD(newfd)) {
1424         errno = EBADF;
1425         goto done;
1426     }
1427
1428     newfd_s = __current->fdtable->fds[newfd];
1429     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1430         goto done;
1431     }
1432
1433     if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1434         __current->fdtable->fds[newfd] = newfd_s;
1435         return newfd;
1436     }
1437
1438 done:
1439     return DO_STATUS(errno);
1440 }
1441
1442 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1443 {
1444     return vfs_dup2(oldfd, newfd);
1445 }
1446
1447 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1448 {
1449     int errno, newfd;
1450     struct v_fd *oldfd_s, *newfd_s;
1451     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1452         goto done;
1453     }
1454
1455     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1456         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1457         __current->fdtable->fds[newfd] = newfd_s;
1458         return newfd;
1459     }
1460
1461 done:
1462     return DO_STATUS(errno);
1463 }
1464
1465 __DEFINE_LXSYSCALL2(
1466   int, symlink, const char*, pathname, const char*, link_target)
1467 {
1468     int errno;
1469     struct file_locator floc;
1470     struct v_dnode *file;
1471     struct v_inode *f_ino;
1472
1473     errno = __vfs_try_locate_file(pathname, &floc, FLOC_MKNAME);
1474     if (errno) {
1475         goto done;
1476     }
1477
1478     file = floc.file;
1479     errno = __vfs_mknod(floc.dir->inode, file, VFS_IFSYMLINK, NULL);
1480     if (errno) {
1481         vfs_d_free(file);
1482         goto done;
1483     }
1484
1485     f_ino = file->inode;
1486
1487     assert(f_ino);
1488
1489     errno = vfs_check_writable(file);
1490     if (errno) {
1491         goto done;
1492     }
1493
1494     if (!f_ino->ops->set_symlink) {
1495         errno = ENOTSUP;
1496         goto done;
1497     }
1498
1499     lock_inode(f_ino);
1500
1501     errno = f_ino->ops->set_symlink(f_ino, link_target);
1502
1503     unlock_inode(f_ino);
1504
1505 done:
1506     __floc_try_unlock(&floc);
1507     return DO_STATUS(errno);
1508 }
1509
1510 static int
1511 vfs_do_chdir_nolock(struct proc_info* proc, struct v_dnode* dnode)
1512 {
1513     if (!check_directory_node(dnode->inode)) {
1514         return ENOTDIR;
1515     }
1516
1517     if (proc->cwd) {
1518         vfs_unref_dnode(proc->cwd);
1519     }
1520
1521     vfs_ref_dnode(dnode);
1522     proc->cwd = dnode;
1523
1524     return 0;
1525 }
1526
/**
 * @brief Locked wrapper around vfs_do_chdir_nolock(): the target dnode is
 *        held locked for the duration of the switch.
 *
 * @param proc  process whose cwd is changed
 * @param dnode new working directory
 * @return the status reported by vfs_do_chdir_nolock()
 */
static int
vfs_do_chdir(struct proc_info* proc, struct v_dnode* dnode)
{
    int status;

    lock_dnode(dnode);
    status = vfs_do_chdir_nolock(proc, dnode);
    unlock_dnode(dnode);

    return status;
}
1540
1541 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1542 {
1543     struct v_dnode* dnode;
1544     int errno = 0;
1545
1546     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1547         goto done;
1548     }
1549
1550     errno = vfs_do_chdir((struct proc_info*)__current, dnode);
1551
1552 done:
1553     return DO_STATUS(errno);
1554 }
1555
1556 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1557 {
1558     struct v_fd* fd_s;
1559     int errno = 0;
1560
1561     if ((errno = vfs_getfd(fd, &fd_s))) {
1562         goto done;
1563     }
1564
1565     errno = vfs_do_chdir((struct proc_info*)__current, fd_s->file->dnode);
1566
1567 done:
1568     return DO_STATUS(errno);
1569 }
1570
1571
1572 __DEFINE_LXSYSCALL1(int, chroot, const char*, path)
1573 {
1574     int errno;
1575     struct v_dnode* dnode;
1576     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1577         return errno;
1578     }
1579
1580     lock_dnode(dnode);
1581
1582     errno = vfs_do_chdir_nolock(__current, dnode);
1583     if (errno) {
1584         unlock_dnode(dnode);
1585         goto done;
1586     }
1587
1588     __current->root = dnode;
1589
1590     unlock_dnode(dnode);
1591
1592 done:
1593     return DO_STATUS(errno);
1594 }
1595
1596 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1597 {
1598     int errno = 0;
1599     char* ret_ptr = 0;
1600     if (size < 2) {
1601         errno = ERANGE;
1602         goto done;
1603     }
1604
1605     size_t len = 0;
1606
1607     if (!__current->cwd) {
1608         *buf = VFS_PATH_DELIM;
1609         len = 1;
1610     } else {
1611         len = vfs_get_path(__current->cwd, buf, size, 0);
1612         if (len == size) {
1613             errno = ERANGE;
1614             goto done;
1615         }
1616     }
1617
1618     buf[len] = '\0';
1619
1620     ret_ptr = buf;
1621
1622 done:
1623     syscall_result(errno);
1624     return ret_ptr;
1625 }
1626
1627 int
1628 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1629 {
1630     int errno = 0;
1631     if (current->inode->id == target->inode->id) {
1632         // hard link
1633         return 0;
1634     }
1635
1636     if ((errno = vfs_check_writable(current))) {
1637         return errno;
1638     }
1639
1640     if (current->ref_count > 1 || target->ref_count > 1) {
1641         return EBUSY;
1642     }
1643
1644     if (current->super_block != target->super_block) {
1645         return EXDEV;
1646     }
1647
1648     struct v_dnode* oldparent = current->parent;
1649     struct v_dnode* newparent = target->parent;
1650
1651     lock_dnode(current);
1652     lock_dnode(target);
1653     if (oldparent)
1654         lock_dnode(oldparent);
1655     if (newparent)
1656         lock_dnode(newparent);
1657
1658     if (!llist_empty(&target->children)) {
1659         errno = ENOTEMPTY;
1660         unlock_dnode(target);
1661         goto cleanup;
1662     }
1663
1664     if ((errno =
1665            current->inode->ops->rename(current->inode, current, target))) {
1666         unlock_dnode(target);
1667         goto cleanup;
1668     }
1669
1670     // re-position current
1671     hstrcpy(&current->name, &target->name);
1672     vfs_dcache_rehash(newparent, current);
1673
1674     // detach target
1675     vfs_d_free(target);
1676
1677     unlock_dnode(target);
1678
1679 cleanup:
1680     unlock_dnode(current);
1681     if (oldparent)
1682         unlock_dnode(oldparent);
1683     if (newparent)
1684         unlock_dnode(newparent);
1685
1686     return errno;
1687 }
1688
1689 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1690 {
1691     struct v_dnode *cur, *target_parent, *target;
1692     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1693     int errno = 0;
1694
1695     if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1696         goto done;
1697     }
1698
1699     if ((errno = vfs_walk(
1700            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1701         goto done;
1702     }
1703
1704     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1705     if (errno == ENOENT) {
1706         target = vfs_d_alloc(target_parent, &name);
1707         vfs_dcache_add(target_parent, target);
1708     } else if (errno) {
1709         goto done;
1710     }
1711
1712     if (!target) {
1713         errno = ENOMEM;
1714         goto done;
1715     }
1716
1717     errno = vfs_do_rename(cur, target);
1718
1719 done:
1720     vfree((void*)name.value);
1721     return DO_STATUS(errno);
1722 }
1723
1724 __DEFINE_LXSYSCALL2(int, fstat, int, fd, struct file_stat*, stat)
1725 {
1726     int errno = 0;
1727     struct v_fd* fds;
1728
1729     if ((errno = vfs_getfd(fd, &fds))) {
1730         goto done;
1731     }
1732
1733     struct v_inode* vino = fds->file->inode;
1734     struct device* fdev = vino->sb->dev;
1735
1736     *stat = (struct file_stat){.st_ino = vino->id,
1737                                .st_blocks = vino->lb_usage,
1738                                .st_size = vino->fsize,
1739                                .mode = vino->itype,
1740                                .st_ioblksize = PAGE_SIZE,
1741                                .st_blksize = vino->sb->blksize};
1742
1743     if (check_device_node(vino)) {
1744         struct device* rdev = resolve_device(vino->data);
1745         if (!rdev) {
1746             errno = EINVAL;
1747             goto done;
1748         }
1749
1750         stat->st_rdev = (dev_t){.meta = rdev->ident.fn_grp,
1751                                 .unique = rdev->ident.unique,
1752                                 .index = dev_uid(rdev) };
1753     }
1754
1755     if (fdev) {
1756         stat->st_dev = (dev_t){.meta = fdev->ident.fn_grp,
1757                                .unique = fdev->ident.unique,
1758                                .index = dev_uid(fdev) };
1759     }
1760
1761 done:
1762     return DO_STATUS(errno);
1763 }
1764
1765 __DEFINE_LXSYSCALL4(int, fchmodat, int, fd,
1766                     const char*, path, int, mode, int, flags)
1767 {
1768     int errno;
1769     struct v_dnode *dnode;
1770     struct v_inode* inode;
1771
1772     errno = vfs_walkat(fd, path, flags, &dnode);
1773     if (errno) {
1774         goto done;
1775     }
1776
1777     errno = vfs_check_writable(dnode);
1778     if (errno) {
1779         return errno;
1780     }
1781
1782     inode = dnode->inode;
1783     lock_inode(inode);
1784
1785     if (!current_is_root()) {
1786         mode = mode & FSACL_RWXMASK;
1787     }
1788
1789     inode->acl = mode;
1790     __vfs_touch_inode(inode, INODE_MODIFY);
1791
1792     unlock_inode(inode);
1793
1794 done:
1795     return DO_STATUS(errno);
1796 }
1797
1798 __DEFINE_LXSYSCALL5(int, fchownat, int, fd,
1799                     const char*, path, uid_t, uid, gid_t, gid, int, flags)
1800 {
1801     int errno;
1802     struct v_dnode *dnode;
1803     struct v_inode *inode;
1804
1805     errno = vfs_walkat(fd, path, flags, &dnode);
1806     if (errno) {
1807         goto done;
1808     }
1809
1810     errno = vfs_check_writable(dnode);
1811     if (errno) {
1812         return errno;
1813     }
1814
1815     inode = dnode->inode;
1816     lock_inode(inode);
1817
1818     inode->uid = uid;
1819     inode->gid = gid;
1820     __vfs_touch_inode(inode, INODE_MODIFY);
1821
1822     unlock_inode(inode);
1823
1824 done:
1825     return DO_STATUS(errno);
1826 }
1827
1828 __DEFINE_LXSYSCALL4(int, faccessat, int, fd,
1829                     const char*, path, int, amode, int, flags)
1830 {
1831     int errno, acl;
1832     struct v_dnode *dnode;
1833     struct v_inode *inode;
1834     struct user_scope* uscope;
1835
1836     uid_t tuid;
1837     gid_t tgid;
1838
1839     errno = vfs_walkat(fd, path, flags, &dnode);
1840     if (errno) {
1841         goto done;
1842     }
1843
1844     if ((flags & AT_EACCESS)) {
1845         tuid = current_euid();
1846         tgid = current_egid();
1847     }
1848     else {
1849         uscope = current_user_scope();
1850         tuid = uscope->ruid;
1851         tgid = uscope->rgid;
1852     }
1853
1854     inode = dnode->inode;
1855
1856     acl  = inode->acl;
1857     acl &= amode;
1858     acl &= check_acl_between(inode->uid, inode->gid, tuid, tgid);
1859     if (!acl) {
1860         errno = EACCESS;
1861     }
1862
1863 done:
1864     return DO_STATUS(errno);
1865 }