lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
 * @brief Lunaix virtual file system - an abstraction layer for all file systems.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
  14 /*
  15  TODO vfs & device todos checklist
  16
  17     It is overseen by Twilight Sparkle ;)
  18
  19  1. Get inodes hooked into lru (CHECKED)
  20  2. Get dnodes hooked into lru (CHECKED)
 3. Get inodes properly hashed so they can be reused by the underlying fs (CHECKED)
  22  4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
  23         [good idea] or a constructor/destructor pattern in cake allocator ?
 5. (mount) Figure out a way to identify a busy mount point before unmount
            maybe a unified mount_point structure that maintains a reference
            counter on any dnodes within the subtree? Such a counter will only
            increment if a file is opened or a dnode is being used as working
            directory, and decrement conversely. (CHECKED)
  29  6. (mount) Ability to track all mount points (including sub-mounts)
  30             so we can be confident to clean up everything when we
  31             unmount. (CHECKED)
  32  7. (mount) Figure out a way to acquire the device represented by a dnode.
  33             so it can be used to mount. (e.g. we wish to get `struct device*`
  34             out of the dnode at /dev/sda)
  35             [tip] we should pay attention at twifs and add a private_data field
  36             under struct v_dnode? (CHECKED)
  37  8. (mount) Then, we should refactor on mount/unmount mechanism. (CHECKED)
 9. (mount) (future) Ability to mount anything? e.g. Linux can mount a disk
                    image file using a so-called "loopback" pseudo device. Maybe
                    we can do a similar thing in Lunaix? A block device emulation
                    above the regular file when we mount it.
  42  10. (device) device number (dev_t) allocation
  43             [good idea] <class>:<subclass>:<uniq_id> composition (CHECKED)
  44 */
  45
  46 #include <klibc/string.h>
  47 #include <lunaix/foptions.h>
  48 #include <lunaix/fs.h>
  49 #include <lunaix/mm/cake.h>
  50 #include <lunaix/mm/valloc.h>
  51 #include <lunaix/process.h>
  52 #include <lunaix/spike.h>
  53 #include <lunaix/syscall.h>
  54 #include <lunaix/syscall_utils.h>
  55
  56 #include <lunaix/fs/twifs.h>
  57
  58 #include <usr/lunaix/dirent_defs.h>
  59
/* Kinds of timestamp update understood by __vfs_touch_inode(). */
#define INODE_ACCESSED  0
#define INODE_MODIFY    1

/* Object piles, one per VFS structure type; created by vfs_init(). */
static struct cake_pile* dnode_pile;
static struct cake_pile* inode_pile;
static struct cake_pile* file_pile;
static struct cake_pile* superblock_pile;
static struct cake_pile* fd_pile;

/* Root dnode; allocated by vfs_init(), which makes it its own parent. */
struct v_dnode* vfs_sysroot = NULL;

/* LRU zones for dnodes and inodes; created by vfs_init(). */
struct lru_zone *dnode_lru, *inode_lru;

/* Ready-made names for "..", "." and the empty string. */
struct hstr vfs_ddot = HSTR("..", 2);
struct hstr vfs_dot = HSTR(".", 1);
struct hstr vfs_empty = HSTR("", 0);

/* Eviction callback handed to the dnode LRU zone in vfs_init(). */
static int
__vfs_try_evict_dnode(struct lru_node* obj);

/* Eviction callback handed to the inode LRU zone in vfs_init(). */
static int
__vfs_try_evict_inode(struct lru_node* obj);
  82
  83 void
  84 vfs_init()
  85 {
  86     // 为他们专门创建一个蛋糕堆，而不使用valloc，这样我们可以最小化内碎片的产生
  87     dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
  88     inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
  89     file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
  90     fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
  91     superblock_pile =
  92       cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
  93
  94     dnode_lru = lru_new_zone("vfs_dnode", __vfs_try_evict_dnode);
  95     inode_lru = lru_new_zone("vfs_inode", __vfs_try_evict_inode);
  96
  97     hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
  98     hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
  99
 100     // 创建一个根dnode。
 101     vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
 102     vfs_sysroot->parent = vfs_sysroot;
 103
 104     vfs_ref_dnode(vfs_sysroot);
 105 }
 106
 107 static inline struct hbucket*
 108 __dcache_hash(struct v_dnode* parent, u32_t* hash)
 109 {
 110     struct hbucket* d_cache;
 111     u32_t _hash;
 112
 113     d_cache = parent->super_block->d_cache;
 114     _hash = *hash;
 115     _hash = _hash ^ (_hash >> VFS_HASHBITS);
 116     _hash += (u32_t)__ptr(parent);
 117
 118     *hash = _hash;
 119     return &d_cache[_hash & VFS_HASH_MASK];
 120 }
 121
 122 static inline int
 123 __sync_inode_nolock(struct v_inode* inode)
 124 {
 125     pcache_commit_all(inode);
 126
 127     int errno = ENOTSUP;
 128     if (inode->ops->sync) {
 129         errno = inode->ops->sync(inode);
 130     }
 131
 132     return errno;
 133 }
 134
 135 struct v_dnode*
 136 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 137 {
 138     if (!str->len || HSTR_EQ(str, &vfs_dot))
 139         return parent;
 140
 141     if (HSTR_EQ(str, &vfs_ddot)) {
 142         return parent->parent;
 143     }
 144
 145     u32_t hash = str->hash;
 146     struct hbucket* slot = __dcache_hash(parent, &hash);
 147
 148     struct v_dnode *pos, *n;
 149     hashtable_bucket_foreach(slot, pos, n, hash_list)
 150     {
 151         if (pos->name.hash == hash && pos->parent == parent) {
 152             return pos;
 153         }
 154     }
 155     return NULL;
 156 }
 157
 158 static void
 159 __vfs_touch_inode(struct v_inode* inode, const int type)
 160 {
 161     if (type == INODE_MODIFY) {
 162         inode->mtime = clock_unixtime();
 163     }
 164
 165     else if (type == INODE_ACCESSED) {
 166         inode->atime = clock_unixtime();
 167     }
 168
 169     lru_use_one(inode_lru, &inode->lru);
 170 }
 171
 172 void
 173 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 174 {
 175     assert(parent);
 176
 177     dnode->ref_count = 1;
 178     dnode->parent = parent;
 179     llist_append(&parent->children, &dnode->siblings);
 180
 181     struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash);
 182     hlist_add(&bucket->head, &dnode->hash_list);
 183 }
 184
 185 void
 186 vfs_dcache_remove(struct v_dnode* dnode)
 187 {
 188     assert(dnode);
 189     assert(dnode->ref_count == 1);
 190
 191     llist_delete(&dnode->siblings);
 192     llist_delete(&dnode->aka_list);
 193     hlist_delete(&dnode->hash_list);
 194
 195     dnode->parent = NULL;
 196     dnode->ref_count = 0;
 197 }
 198
 199 void
 200 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
 201 {
 202     assert(new_parent);
 203
 204     hstr_rehash(&dnode->name, HSTR_FULL_HASH);
 205     vfs_dcache_remove(dnode);
 206     vfs_dcache_add(new_parent, dnode);
 207 }
 208
 209 int
 210 vfs_open(struct v_dnode* dnode, struct v_file** file)
 211 {
 212     struct v_inode* inode = dnode->inode;
 213
 214     if (!inode || !inode->ops->open) {
 215         return ENOTSUP;
 216     }
 217
 218     lock_inode(inode);
 219
 220     struct v_file* vfile = cake_grab(file_pile);
 221     memset(vfile, 0, sizeof(*vfile));
 222
 223     vfile->dnode = dnode;
 224     vfile->inode = inode;
 225     vfile->ref_count = 1;
 226     vfile->ops = inode->default_fops;
 227
 228     if (check_regfile_node(inode) && !inode->pg_cache) {
 229         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 230         pcache_init(pcache);
 231         pcache->master = inode;
 232         inode->pg_cache = pcache;
 233     }
 234
 235     int errno = inode->ops->open(inode, vfile);
 236     if (errno) {
 237         cake_release(file_pile, vfile);
 238     } else {
 239         vfs_ref_dnode(dnode);
 240         inode->open_count++;
 241
 242         *file = vfile;
 243     }
 244
 245     unlock_inode(inode);
 246
 247     return errno;
 248 }
 249
 250 void
 251 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
 252 {
 253     if (assign_to->inode) {
 254         llist_delete(&assign_to->aka_list);
 255         assign_to->inode->link_count--;
 256     }
 257
 258     llist_append(&inode->aka_dnodes, &assign_to->aka_list);
 259     assign_to->inode = inode;
 260     inode->link_count++;
 261 }
 262
 263 int
 264 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 265 {
 266     int errno;
 267
 268     if ((errno = vfs_check_writable(to_link))) {
 269         return errno;
 270     }
 271
 272     lock_inode(to_link->inode);
 273     if (to_link->super_block->root != name->super_block->root) {
 274         errno = EXDEV;
 275     } else if (!to_link->inode->ops->link) {
 276         errno = ENOTSUP;
 277     } else if (!(errno = to_link->inode->ops->link(to_link->inode, name))) {
 278         vfs_assign_inode(name, to_link->inode);
 279     }
 280     unlock_inode(to_link->inode);
 281
 282     return errno;
 283 }
 284
 285 int
 286 vfs_pclose(struct v_file* file, pid_t pid)
 287 {
 288     struct v_inode* inode;
 289     int errno = 0;
 290
 291     inode = file->inode;
 292
 293     if (vfs_check_duped_file(file)) {
 294         vfs_unref_file(file);
 295         return 0;
 296     }
 297
 298     /*
 299      * Prevent dead lock.
 300      * This happened when process is terminated while blocking on read.
 301      * In that case, the process is still holding the inode lock and it
 302          will never get released.
 303      * The unlocking should also include ownership check.
 304      *
 305      * To see why, consider two process both open the same file both with
 306      * fd=x.
 307      *      Process A: busy on reading x
 308      *      Process B: do nothing with x
 309      * Assuming that, after a very short time, process B get terminated
 310      * while process A is still busy in it's reading business. By this
 311      * design, the inode lock of this file x is get released by B rather
 312      * than A. And this will cause a probable race condition on A if other
 313      * process is writing to this file later after B exit.
 314     */
 315     mutex_unlock_for(&inode->lock, pid);
 316
 317     // now regain lock for inode syncing
 318
 319     lock_inode(inode);
 320
 321     if ((errno = file->ops->close(file))) {
 322         goto done;
 323     }
 324
 325     vfs_unref_dnode(file->dnode);
 326     cake_release(file_pile, file);
 327
 328     pcache_commit_all(inode);
 329     inode->open_count--;
 330
 331     if (!inode->open_count) {
 332         __sync_inode_nolock(inode);
 333     }
 334
 335 done:
 336     unlock_inode(inode);
 337     return errno;
 338 }
 339
 340 int
 341 vfs_close(struct v_file* file)
 342 {
 343     return vfs_pclose(file, __current->pid);
 344 }
 345
/**
 * @brief Hand a descriptor object back to the fd pile.
 *
 * @param fd descriptor to release; passed unchanged to cake_release()
 */
void
vfs_free_fd(struct v_fd* fd)
{
    cake_release(fd_pile, fd);
}
 351
 352 int
 353 vfs_isync(struct v_inode* inode)
 354 {
 355     lock_inode(inode);
 356
 357     int errno = __sync_inode_nolock(inode);
 358
 359     unlock_inode(inode);
 360
 361     return errno;
 362 }
 363
 364 int
 365 vfs_fsync(struct v_file* file)
 366 {
 367     int errno;
 368     if ((errno = vfs_check_writable(file->dnode))) {
 369         return errno;
 370     }
 371
 372     return vfs_isync(file->inode);
 373 }
 374
 375 int
 376 vfs_alloc_fdslot(int* fd)
 377 {
 378     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 379         if (!__current->fdtable->fds[i]) {
 380             *fd = i;
 381             return 0;
 382         }
 383     }
 384     return EMFILE;
 385 }
 386
 387 struct v_superblock*
 388 vfs_sb_alloc()
 389 {
 390     struct v_superblock* sb = cake_grab(superblock_pile);
 391     memset(sb, 0, sizeof(*sb));
 392     llist_init_head(&sb->sb_list);
 393
 394     sb->i_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 395     sb->d_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 396
 397     sb->ref_count = 1;
 398     return sb;
 399 }
 400
 401 void
 402 vfs_sb_ref(struct v_superblock* sb)
 403 {
 404     sb->ref_count++;
 405 }
 406
 407 void
 408 vfs_sb_unref(struct v_superblock* sb)
 409 {
 410     assert(sb->ref_count);
 411
 412     sb->ref_count--;
 413     if (likely(sb->ref_count)) {
 414         return;
 415     }
 416
 417     if (sb->ops.release) {
 418         sb->ops.release(sb);
 419     }
 420
 421     vfree(sb->i_cache);
 422     vfree(sb->d_cache);
 423
 424     cake_release(superblock_pile, sb);
 425 }
 426
 427 static int
 428 __vfs_try_evict_dnode(struct lru_node* obj)
 429 {
 430     struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
 431
 432     if (!dnode->ref_count) {
 433         vfs_d_free(dnode);
 434         return 1;
 435     }
 436     return 0;
 437 }
 438
 439 static int
 440 __vfs_try_evict_inode(struct lru_node* obj)
 441 {
 442     struct v_inode* inode = container_of(obj, struct v_inode, lru);
 443
 444     if (!inode->link_count && !inode->open_count) {
 445         vfs_i_free(inode);
 446         return 1;
 447     }
 448     return 0;
 449 }
 450
 451 struct v_dnode*
 452 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
 453 {
 454     struct v_dnode* dnode = cake_grab(dnode_pile);
 455     if (!dnode) {
 456         lru_evict_half(dnode_lru);
 457
 458         if (!(dnode = cake_grab(dnode_pile))) {
 459             return NULL;
 460         }
 461     }
 462
 463     memset(dnode, 0, sizeof(*dnode));
 464     llist_init_head(&dnode->children);
 465     llist_init_head(&dnode->siblings);
 466     llist_init_head(&dnode->aka_list);
 467     mutex_init(&dnode->lock);
 468
 469     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 470
 471     hstrcpy(&dnode->name, name);
 472
 473     if (parent) {
 474         vfs_d_assign_sb(dnode, parent->super_block);
 475         dnode->mnt = parent->mnt;
 476     }
 477
 478     lru_use_one(dnode_lru, &dnode->lru);
 479
 480     return dnode;
 481 }
 482
 483 void
 484 vfs_d_free(struct v_dnode* dnode)
 485 {
 486     assert(dnode->ref_count == 1);
 487
 488     if (dnode->inode) {
 489         assert(dnode->inode->link_count > 0);
 490         dnode->inode->link_count--;
 491     }
 492
 493     vfs_dcache_remove(dnode);
 494     // Make sure the children de-referencing their parent.
 495     // With lru presented, the eviction will be propagated over the entire
 496     // detached subtree eventually
 497     struct v_dnode *pos, *n;
 498     llist_for_each(pos, n, &dnode->children, siblings)
 499     {
 500         vfs_dcache_remove(pos);
 501     }
 502
 503     if (dnode->destruct) {
 504         dnode->destruct(dnode);
 505     }
 506
 507     vfs_sb_unref(dnode->super_block);
 508     vfree((void*)dnode->name.value);
 509     cake_release(dnode_pile, dnode);
 510 }
 511
 512 struct v_inode*
 513 vfs_i_find(struct v_superblock* sb, u32_t i_id)
 514 {
 515     struct hbucket* slot = &sb->i_cache[i_id & VFS_HASH_MASK];
 516     struct v_inode *pos, *n;
 517     hashtable_bucket_foreach(slot, pos, n, hash_list)
 518     {
 519         if (pos->id == i_id) {
 520             lru_use_one(inode_lru, &pos->lru);
 521             return pos;
 522         }
 523     }
 524
 525     return NULL;
 526 }
 527
 528 void
 529 vfs_i_addhash(struct v_inode* inode)
 530 {
 531     struct hbucket* slot = &inode->sb->i_cache[inode->id & VFS_HASH_MASK];
 532
 533     hlist_delete(&inode->hash_list);
 534     hlist_add(&slot->head, &inode->hash_list);
 535 }
 536
 537 struct v_inode*
 538 vfs_i_alloc(struct v_superblock* sb)
 539 {
 540     assert(sb->ops.init_inode);
 541
 542     struct v_inode* inode;
 543     if (!(inode = cake_grab(inode_pile))) {
 544         lru_evict_half(inode_lru);
 545         if (!(inode = cake_grab(inode_pile))) {
 546             return NULL;
 547         }
 548     }
 549
 550     memset(inode, 0, sizeof(*inode));
 551     mutex_init(&inode->lock);
 552     llist_init_head(&inode->xattrs);
 553     llist_init_head(&inode->aka_dnodes);
 554
 555     sb->ops.init_inode(sb, inode);
 556
 557     inode->ctime = clock_unixtime();
 558     inode->atime = inode->ctime;
 559     inode->mtime = inode->ctime;
 560
 561     vfs_i_assign_sb(inode, sb);
 562     lru_use_one(inode_lru, &inode->lru);
 563     return inode;
 564 }
 565
 566 void
 567 vfs_i_free(struct v_inode* inode)
 568 {
 569     if (inode->pg_cache) {
 570         pcache_release(inode->pg_cache);
 571         vfree(inode->pg_cache);
 572     }
 573     // we don't need to sync inode.
 574     // If an inode can be free, then it must be properly closed.
 575     // Hence it must be synced already!
 576     if (inode->destruct) {
 577         inode->destruct(inode);
 578     }
 579
 580     vfs_sb_unref(inode->sb);
 581     hlist_delete(&inode->hash_list);
 582     cake_release(inode_pile, inode);
 583 }
 584
/* ---- System call definition and support ---- */

/*
 * Option flags for __vfs_try_locate_file().
 */

// Create the name if it does not exist yet.
#define FLOC_MAYBE_MKNAME 1

// The name must not exist yet; it is created, otherwise EEXIST is returned.
#define FLOC_MKNAME 2

// Do not follow a symlink (mapped to VFS_WALK_NOFOLLOW).
#define FLOC_NOFOLLOW 4
 595
 596 int
 597 vfs_getfd(int fd, struct v_fd** fd_s)
 598 {
 599     if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) {
 600         return 0;
 601     }
 602     return EBADF;
 603 }
 604
 605 static int
 606 __vfs_mknod(struct v_inode* parent, struct v_dnode* dnode,
 607             unsigned int itype, dev_t* dev)
 608 {
 609     int errno;
 610
 611     errno = parent->ops->create(parent, dnode, itype);
 612     if (errno) {
 613         return errno;
 614     }
 615
 616     return 0;
 617 }
 618
/**
 * @brief Result of __vfs_try_locate_file().
 */
struct file_locator {
    // directory that contains (or will contain) the file
    struct v_dnode* dir;
    // the located or freshly allocated file dnode
    struct v_dnode* file;
    // true when `file` was just allocated; `dir` is then still locked
    bool fresh;
};
 624
 625 /**
 626  * @brief unlock the file locator (floc) if possible.
 627  *        If the file to be located if not exists, and
 628  *        any FLOC_*MKNAME flag is set, then the parent
 629  *        dnode will be locked until the file has been properly
 630  *        finalised by subsequent logic.
 631  *
 632  * @param floc
 633  */
 634 static inline void
 635 __floc_try_unlock(struct file_locator* floc)
 636 {
 637     if (floc->fresh) {
 638         assert(floc->dir);
 639         unlock_dnode(floc->dir);
 640     }
 641 }
 642
 643 static int
 644 __vfs_try_locate_file(const char* path,
 645                       struct file_locator* floc,
 646                       int options)
 647 {
 648     char name_str[VFS_NAME_MAXLEN];
 649     struct v_dnode *fdir, *file;
 650     struct hstr name = HSTR(name_str, 0);
 651     int errno, woption = 0;
 652
 653     if ((options & FLOC_NOFOLLOW)) {
 654         woption |= VFS_WALK_NOFOLLOW;
 655         options &= ~FLOC_NOFOLLOW;
 656     }
 657
 658     floc->fresh = false;
 659     name_str[0] = 0;
 660     errno = vfs_walk_proc(path, &fdir, &name, woption | VFS_WALK_PARENT);
 661     if (errno) {
 662         return errno;
 663     }
 664
 665     lock_dnode(fdir);
 666
 667     errno = vfs_walk(fdir, name.value, &file, NULL, woption);
 668
 669     if (errno && errno != ENOENT) {
 670         goto error;
 671     }
 672
 673     if (!errno && (options & FLOC_MKNAME)) {
 674         errno = EEXIST;
 675         goto error;
 676     }
 677
 678     if (!errno) {
 679         // the file present, no need to hold the directory lock
 680         unlock_dnode(fdir);
 681         goto done;
 682     }
 683
 684     // errno == ENOENT
 685     if (!options) {
 686         goto error;
 687     }
 688
 689     errno = vfs_check_writable(fdir);
 690     if (errno) {
 691         goto error;
 692     }
 693
 694     floc->fresh = true;
 695
 696     file = vfs_d_alloc(fdir, &name);
 697
 698     if (!file) {
 699         errno = ENOMEM;
 700         goto error;
 701     }
 702
 703     vfs_dcache_add(fdir, file);
 704
 705 done:
 706     floc->dir   = fdir;
 707     floc->file  = file;
 708
 709     return errno;
 710
 711 error:
 712     unlock_dnode(fdir);
 713     return errno;
 714 }
 715
 716
 717 static bool
 718 __check_unlinkable(struct v_dnode* dnode)
 719 {
 720     int acl;
 721     bool wr_self, wr_parent;
 722     struct v_dnode* parent;
 723
 724     parent = dnode->parent;
 725     acl = dnode->inode->acl;
 726
 727     wr_self = check_allow_write(dnode->inode);
 728     wr_parent = check_allow_write(parent->inode);
 729
 730     if (!fsacl_test(acl, svtx)) {
 731         return wr_self;
 732     }
 733
 734     if (current_euid() == dnode->inode->uid) {
 735         return true;
 736     }
 737
 738     return wr_self && wr_parent;
 739 }
 740
 741 int
 742 vfs_do_open(const char* path, int options)
 743 {
 744     int errno, fd, loptions = 0;
 745     struct v_dnode *dentry, *file;
 746     struct v_file* ofile = NULL;
 747     struct file_locator floc;
 748     struct v_inode* inode;
 749
 750     if ((options & FO_CREATE)) {
 751         loptions |= FLOC_MAYBE_MKNAME;
 752     } else if ((options & FO_NOFOLLOW)) {
 753         loptions |= FLOC_NOFOLLOW;
 754     }
 755
 756     errno = __vfs_try_locate_file(path, &floc, loptions);
 757
 758     if (errno || (errno = vfs_alloc_fdslot(&fd))) {
 759         return errno;
 760     }
 761
 762     file   = floc.file;
 763     dentry = floc.dir;
 764
 765     if (floc.fresh) {
 766         errno = __vfs_mknod(dentry->inode, file, VFS_IFFILE, NULL);
 767         if (errno) {
 768             vfs_d_free(file);
 769             __floc_try_unlock(&floc);
 770             return errno;
 771         }
 772
 773         __floc_try_unlock(&floc);
 774     }
 775
 776
 777     if ((errno = vfs_open(file, &ofile))) {
 778         return errno;
 779     }
 780
 781     inode = ofile->inode;
 782     lock_inode(inode);
 783
 784     struct v_fd* fd_s = cake_grab(fd_pile);
 785     memset(fd_s, 0, sizeof(*fd_s));
 786
 787     if ((options & O_TRUNC)) {
 788         file->inode->fsize = 0;
 789     }
 790
 791     if (vfs_get_dtype(inode->itype) == DT_DIR) {
 792         ofile->f_pos = 0;
 793     }
 794
 795     fd_s->file = ofile;
 796     fd_s->flags = options;
 797     __current->fdtable->fds[fd] = fd_s;
 798
 799     unlock_inode(inode);
 800
 801     return fd;
 802 }
 803
 804 __DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
 805 {
 806     int errno = vfs_do_open(path, options);
 807     return DO_STATUS_OR_RETURN(errno);
 808 }
 809
 810 __DEFINE_LXSYSCALL1(int, close, int, fd)
 811 {
 812     struct v_fd* fd_s;
 813     int errno = 0;
 814     if ((errno = vfs_getfd(fd, &fd_s))) {
 815         goto done_err;
 816     }
 817
 818     if ((errno = vfs_close(fd_s->file))) {
 819         goto done_err;
 820     }
 821
 822     cake_release(fd_pile, fd_s);
 823     __current->fdtable->fds[fd] = 0;
 824
 825 done_err:
 826     return DO_STATUS(errno);
 827 }
 828
 829 void
 830 __vfs_readdir_callback(struct dir_context* dctx,
 831                        const char* name,
 832                        const int len,
 833                        const int dtype)
 834 {
 835     struct lx_dirent* dent = (struct lx_dirent*)dctx->cb_data;
 836     strncpy(dent->d_name, name, MIN(len, DIRENT_NAME_MAX_LEN));
 837     dent->d_nlen = len;
 838     dent->d_type = dtype;
 839 }
 840
 841 __DEFINE_LXSYSCALL2(int, sys_readdir, int, fd, struct lx_dirent*, dent)
 842 {
 843     struct v_fd* fd_s;
 844     int errno;
 845
 846     if ((errno = vfs_getfd(fd, &fd_s))) {
 847         goto done;
 848     }
 849
 850     struct v_inode* inode = fd_s->file->inode;
 851
 852     lock_inode(inode);
 853
 854     if (!check_directory_node(inode)) {
 855         errno = ENOTDIR;
 856         goto unlock;
 857     }
 858
 859     if (!check_allow_read(inode)) {
 860         errno = EPERM;
 861         goto unlock;
 862     }
 863
 864     struct dir_context dctx = (struct dir_context) {
 865         .cb_data = dent,
 866         .read_complete_callback = __vfs_readdir_callback
 867     };
 868
 869     if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
 870         goto unlock;
 871     }
 872     dent->d_offset++;
 873     fd_s->file->f_pos++;
 874
 875 unlock:
 876     unlock_inode(inode);
 877
 878 done:
 879     return DO_STATUS_OR_RETURN(errno);
 880 }
 881
 882 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 883 {
 884     int errno = 0;
 885     struct v_fd* fd_s;
 886     struct v_inode* inode;
 887
 888     if ((errno = vfs_getfd(fd, &fd_s))) {
 889         goto done;
 890     }
 891
 892     struct v_file* file = fd_s->file;
 893     if (check_directory_node(file->inode)) {
 894         errno = EISDIR;
 895         goto done;
 896     }
 897
 898     if (!check_allow_read(file->inode)) {
 899         errno = EPERM;
 900         goto done;
 901     }
 902
 903     inode = file->inode;
 904     lock_inode(inode);
 905
 906     __vfs_touch_inode(inode, INODE_ACCESSED);
 907
 908     if (check_seqdev_node(inode) || (fd_s->flags & FO_DIRECT)) {
 909         errno = file->ops->read(inode, buf, count, file->f_pos);
 910     } else {
 911         errno = pcache_read(inode, buf, count, file->f_pos);
 912     }
 913
 914     if (errno > 0) {
 915         file->f_pos += errno;
 916         unlock_inode(inode);
 917         return errno;
 918     }
 919
 920     unlock_inode(inode);
 921
 922 done:
 923     return DO_STATUS(errno);
 924 }
 925
 926 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
 927 {
 928     int errno = 0;
 929     struct v_fd* fd_s;
 930     if ((errno = vfs_getfd(fd, &fd_s))) {
 931         goto done;
 932     }
 933
 934     struct v_inode* inode;
 935     struct v_file* file = fd_s->file;
 936
 937     if ((errno = vfs_check_writable(file->dnode))) {
 938         goto done;
 939     }
 940
 941     if (check_directory_node(file->inode)) {
 942         errno = EISDIR;
 943         goto done;
 944     }
 945
 946     inode = file->inode;
 947     lock_inode(inode);
 948
 949     __vfs_touch_inode(inode, INODE_MODIFY);
 950     if ((fd_s->flags & O_APPEND)) {
 951         file->f_pos = inode->fsize;
 952     }
 953
 954     if (check_seqdev_node(inode) || (fd_s->flags & FO_DIRECT)) {
 955         errno = file->ops->write(inode, buf, count, file->f_pos);
 956     } else {
 957         errno = pcache_write(inode, buf, count, file->f_pos);
 958     }
 959
 960     if (errno > 0) {
 961         file->f_pos += errno;
 962         inode->fsize = MAX(inode->fsize, file->f_pos);
 963
 964         unlock_inode(inode);
 965         return errno;
 966     }
 967
 968     unlock_inode(inode);
 969
 970 done:
 971     return DO_STATUS(errno);
 972 }
 973
 974 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
 975 {
 976     int errno = 0;
 977     struct v_fd* fd_s;
 978     if ((errno = vfs_getfd(fd, &fd_s))) {
 979         goto done;
 980     }
 981
 982     struct v_file* file = fd_s->file;
 983     struct v_inode* inode = file->inode;
 984
 985     if (!file->ops->seek) {
 986         errno = ENOTSUP;
 987         goto done;
 988     }
 989
 990     if (!check_allow_read(inode)) {
 991         errno = EPERM;
 992         goto done;
 993     }
 994
 995     lock_inode(inode);
 996
 997     int overflow = 0;
 998     int fpos = file->f_pos;
 999
1000     if (vfs_get_dtype(inode->itype) == DT_DIR) {
1001         options = (options != FSEEK_END) ? options : FSEEK_SET;
1002     }
1003
1004     switch (options) {
1005         case FSEEK_CUR:
1006             overflow = sadd_of((int)file->f_pos, offset, &fpos);
1007             break;
1008         case FSEEK_END:
1009             overflow = sadd_of((int)inode->fsize, offset, &fpos);
1010             break;
1011         case FSEEK_SET:
1012             fpos = offset;
1013             break;
1014     }
1015
1016     if (overflow) {
1017         errno = EOVERFLOW;
1018     }
1019     else {
1020         errno = file->ops->seek(file, fpos);
1021     }
1022
1023     unlock_inode(inode);
1024
1025 done:
1026     return DO_STATUS(errno);
1027 }
1028
1029 int
1030 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
1031 {
1032     if (!dnode) {
1033         return 0;
1034     }
1035
1036     if (depth > 64) {
1037         return ENAMETOOLONG;
1038     }
1039
1040     size_t len = 0;
1041
1042     if (dnode->parent != dnode) {
1043         len = vfs_get_path(dnode->parent, buf, size, depth + 1);
1044     }
1045
1046     if (len >= size) {
1047         return len;
1048     }
1049
1050     if (!len || buf[len - 1] != VFS_PATH_DELIM) {
1051         buf[len++] = VFS_PATH_DELIM;
1052     }
1053
1054     size_t cpy_size = MIN(dnode->name.len, size - len);
1055     strncpy(buf + len, dnode->name.value, cpy_size);
1056     len += cpy_size;
1057
1058     return len;
1059 }
1060
1061 int
1062 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
1063 {
1064     const char* link;
1065     struct v_inode* inode = dnode->inode;
1066
1067     if (!check_symlink_node(inode)) {
1068         return EINVAL;
1069     }
1070
1071     if (!inode->ops->read_symlink) {
1072         return ENOTSUP;
1073     }
1074
1075     if (!check_allow_read(inode)) {
1076         return EPERM;
1077     }
1078
1079     lock_inode(inode);
1080
1081     int errno = inode->ops->read_symlink(inode, &link);
1082     if (errno >= 0) {
1083         strncpy(buf, link, MIN(size, (size_t)errno));
1084     }
1085
1086     unlock_inode(inode);
1087     return errno;
1088 }
1089
1090 int
1091 vfs_get_dtype(int itype)
1092 {
1093     int dtype = DT_FILE;
1094     if (check_itype(itype, VFS_IFSYMLINK)) {
1095         dtype |= DT_SYMLINK;
1096     }
1097
1098     if (check_itype(itype, VFS_IFDIR)) {
1099         dtype |= DT_DIR;
1100         return dtype;
1101     }
1102
1103     // TODO other types
1104
1105     return dtype;
1106 }
1107
1108 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
1109 {
1110     int errno;
1111     struct v_fd* fd_s;
1112     if ((errno = vfs_getfd(fd, &fd_s))) {
1113         goto done;
1114     }
1115
1116     struct v_dnode* dnode;
1117     errno = vfs_get_path(fd_s->file->dnode, buf, size, 0);
1118
1119     if (errno >= 0) {
1120         return errno;
1121     }
1122
1123 done:
1124     return DO_STATUS(errno);
1125 }
1126
1127 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
1128 {
1129     int errno;
1130     struct v_dnode* dnode;
1131     if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
1132         errno = vfs_readlink(dnode, buf, size);
1133     }
1134
1135     if (errno >= 0) {
1136         return errno;
1137     }
1138
1139     return DO_STATUS(errno);
1140 }
1141
1142 __DEFINE_LXSYSCALL4(
1143   int, readlinkat, int, dirfd, const char*, pathname, char*, buf, size_t, size)
1144 {
1145     int errno;
1146     struct v_fd* fd_s;
1147     if ((errno = vfs_getfd(dirfd, &fd_s))) {
1148         goto done;
1149     }
1150
1151     pathname = pathname ? pathname : "";
1152
1153     struct v_dnode* dnode;
1154     if (!(errno = vfs_walk(
1155             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
1156         errno = vfs_readlink(fd_s->file->dnode, buf, size);
1157     }
1158
1159     if (errno >= 0) {
1160         return errno;
1161     }
1162
1163 done:
1164     return DO_STATUS(errno);
1165 }
1166
/*
    NOTE
    When we perform an operation that could affect the layout of a
    directory (i.e., rename, mkdir, rmdir), we must lock the parent dir
    whenever possible. This blocks any ongoing path walk from reaching
    it and hence avoids exposing any partial state.
*/
1174
1175 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
1176 {
1177     int errno;
1178     struct v_dnode* dnode;
1179     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1180         return DO_STATUS(errno);
1181     }
1182
1183     lock_dnode(dnode);
1184
1185     if (!__check_unlinkable(dnode)) {
1186         errno = EPERM;
1187         goto done;
1188     }
1189
1190     if ((errno = vfs_check_writable(dnode))) {
1191         goto done;
1192     }
1193
1194     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
1195         errno = EROFS;
1196         goto done;
1197     }
1198
1199     if (dnode->ref_count > 1 || dnode->inode->open_count) {
1200         errno = EBUSY;
1201         goto done;
1202     }
1203
1204     if (!llist_empty(&dnode->children)) {
1205         errno = ENOTEMPTY;
1206         goto done;
1207     }
1208
1209     struct v_dnode* parent = dnode->parent;
1210
1211     if (!parent) {
1212         errno = EINVAL;
1213         goto done;
1214     }
1215
1216     lock_dnode(parent);
1217     lock_inode(parent->inode);
1218
1219     if (check_directory_node(dnode->inode)) {
1220         errno = parent->inode->ops->rmdir(parent->inode, dnode);
1221         if (!errno) {
1222             vfs_dcache_remove(dnode);
1223         }
1224     } else {
1225         errno = ENOTDIR;
1226     }
1227
1228     unlock_inode(parent->inode);
1229     unlock_dnode(parent);
1230
1231 done:
1232     unlock_dnode(dnode);
1233     return DO_STATUS(errno);
1234 }
1235
1236 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
1237 {
1238     int errno = 0;
1239     struct v_dnode *parent, *dir;
1240     char name_value[VFS_NAME_MAXLEN];
1241     struct hstr name = HHSTR(name_value, 0, 0);
1242
1243     if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
1244         goto done;
1245     }
1246
1247     if (!(errno = vfs_walk(parent, name_value, &dir, NULL, 0))) {
1248         errno = EEXIST;
1249         goto done;
1250     }
1251
1252     if ((errno = vfs_check_writable(parent))) {
1253         goto done;
1254     }
1255
1256     if (!(dir = vfs_d_alloc(parent, &name))) {
1257         errno = ENOMEM;
1258         goto done;
1259     }
1260
1261     struct v_inode* inode = parent->inode;
1262
1263     lock_dnode(parent);
1264     lock_inode(inode);
1265
1266     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
1267         errno = ENOTSUP;
1268     } else if (!inode->ops->mkdir) {
1269         errno = ENOTSUP;
1270     } else if (!check_directory_node(inode)) {
1271         errno = ENOTDIR;
1272     } else if (!(errno = inode->ops->mkdir(inode, dir))) {
1273         vfs_dcache_add(parent, dir);
1274         goto cleanup;
1275     }
1276
1277     vfs_d_free(dir);
1278
1279 cleanup:
1280     unlock_inode(inode);
1281     unlock_dnode(parent);
1282 done:
1283     return DO_STATUS(errno);
1284 }
1285
1286 static int
1287 __vfs_do_unlink(struct v_dnode* dnode)
1288 {
1289     int errno;
1290     struct v_inode* inode = dnode->inode;
1291
1292     if (dnode->ref_count > 1) {
1293         return EBUSY;
1294     }
1295
1296     if (!__check_unlinkable(dnode)) {
1297         return EPERM;
1298     }
1299
1300     if ((errno = vfs_check_writable(dnode))) {
1301         return errno;
1302     }
1303
1304     lock_inode(inode);
1305
1306     if (inode->open_count) {
1307         errno = EBUSY;
1308     } else if (!check_directory_node(inode)) {
1309         errno = inode->ops->unlink(inode, dnode);
1310         if (!errno) {
1311             vfs_d_free(dnode);
1312         }
1313     } else {
1314         errno = EISDIR;
1315     }
1316
1317     unlock_inode(inode);
1318
1319     return errno;
1320 }
1321
1322 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1323 {
1324     int errno;
1325     struct v_dnode* dnode;
1326     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1327         goto done;
1328     }
1329
1330     errno = __vfs_do_unlink(dnode);
1331
1332 done:
1333     return DO_STATUS(errno);
1334 }
1335
1336 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1337 {
1338     int errno;
1339     struct v_fd* fd_s;
1340     if ((errno = vfs_getfd(fd, &fd_s))) {
1341         goto done;
1342     }
1343
1344     struct v_dnode* dnode;
1345     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1346         errno = __vfs_do_unlink(dnode);
1347     }
1348
1349 done:
1350     return DO_STATUS(errno);
1351 }
1352
1353 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1354 {
1355     int errno;
1356     struct file_locator floc;
1357     struct v_dnode *to_link, *name_file;
1358
1359     errno = __vfs_try_locate_file(oldpath, &floc, 0);
1360     if (errno) {
1361         goto done;
1362     }
1363
1364     __floc_try_unlock(&floc);
1365
1366     to_link = floc.file;
1367     errno = __vfs_try_locate_file(newpath, &floc, FLOC_MKNAME);
1368     if (!errno) {
1369         goto done;
1370     }
1371
1372     name_file = floc.file;
1373     errno = vfs_link(to_link, name_file);
1374     if (errno) {
1375         vfs_d_free(name_file);
1376     }
1377
1378 done:
1379     __floc_try_unlock(&floc);
1380     return DO_STATUS(errno);
1381 }
1382
1383 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1384 {
1385     int errno;
1386     struct v_fd* fd_s;
1387
1388     if (!(errno = vfs_getfd(fildes, &fd_s))) {
1389         errno = vfs_fsync(fd_s->file);
1390     }
1391
1392     return DO_STATUS(errno);
1393 }
1394
1395 int
1396 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1397 {
1398     int errno = 0;
1399     struct v_fd* copied = cake_grab(fd_pile);
1400
1401     memcpy(copied, old, sizeof(struct v_fd));
1402
1403     vfs_ref_file(old->file);
1404
1405     *new = copied;
1406
1407     return errno;
1408 }
1409
1410 int
1411 vfs_dup2(int oldfd, int newfd)
1412 {
1413     if (newfd == oldfd) {
1414         return newfd;
1415     }
1416
1417     int errno;
1418     struct v_fd *oldfd_s, *newfd_s;
1419     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1420         goto done;
1421     }
1422
1423     if (!TEST_FD(newfd)) {
1424         errno = EBADF;
1425         goto done;
1426     }
1427
1428     newfd_s = __current->fdtable->fds[newfd];
1429     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1430         goto done;
1431     }
1432
1433     if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1434         __current->fdtable->fds[newfd] = newfd_s;
1435         return newfd;
1436     }
1437
1438 done:
1439     return DO_STATUS(errno);
1440 }
1441
1442 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1443 {
1444     return vfs_dup2(oldfd, newfd);
1445 }
1446
1447 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1448 {
1449     int errno, newfd;
1450     struct v_fd *oldfd_s, *newfd_s;
1451     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1452         goto done;
1453     }
1454
1455     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1456         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1457         __current->fdtable->fds[newfd] = newfd_s;
1458         return newfd;
1459     }
1460
1461 done:
1462     return DO_STATUS(errno);
1463 }
1464
1465 __DEFINE_LXSYSCALL2(
1466   int, symlink, const char*, pathname, const char*, link_target)
1467 {
1468     int errno;
1469     struct file_locator floc;
1470     struct v_dnode *file;
1471     struct v_inode *f_ino;
1472
1473     errno = __vfs_try_locate_file(pathname, &floc, FLOC_MKNAME);
1474     if (errno) {
1475         goto done;
1476     }
1477
1478     file = floc.file;
1479     errno = __vfs_mknod(floc.dir->inode, file, VFS_IFSYMLINK, NULL);
1480     if (errno) {
1481         vfs_d_free(file);
1482         goto done;
1483     }
1484
1485     f_ino = file->inode;
1486
1487     assert(f_ino);
1488
1489     errno = vfs_check_writable(file);
1490     if (errno) {
1491         goto done;
1492     }
1493
1494     if (!f_ino->ops->set_symlink) {
1495         errno = ENOTSUP;
1496         goto done;
1497     }
1498
1499     lock_inode(f_ino);
1500
1501     errno = f_ino->ops->set_symlink(f_ino, link_target);
1502
1503     unlock_inode(f_ino);
1504
1505 done:
1506     __floc_try_unlock(&floc);
1507     return DO_STATUS(errno);
1508 }
1509
1510 static int
1511 vfs_do_chdir_nolock(struct proc_info* proc, struct v_dnode* dnode)
1512 {
1513     if (!check_directory_node(dnode->inode)) {
1514         return ENOTDIR;
1515     }
1516
1517     if (proc->cwd) {
1518         vfs_unref_dnode(proc->cwd);
1519     }
1520
1521     vfs_ref_dnode(dnode);
1522     proc->cwd = dnode;
1523
1524     return 0;
1525 }
1526
/**
 * @brief Locked variant of vfs_do_chdir_nolock(): holds the lock of `dnode`
 *        while the working directory of `proc` is switched.
 *
 * @return the status of vfs_do_chdir_nolock().
 */
static int
vfs_do_chdir(struct proc_info* proc, struct v_dnode* dnode)
{
    int status;

    lock_dnode(dnode);
    status = vfs_do_chdir_nolock(proc, dnode);
    unlock_dnode(dnode);

    return status;
}
1540
1541 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1542 {
1543     struct v_dnode* dnode;
1544     int errno = 0;
1545
1546     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1547         goto done;
1548     }
1549
1550     errno = vfs_do_chdir((struct proc_info*)__current, dnode);
1551
1552 done:
1553     return DO_STATUS(errno);
1554 }
1555
1556 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1557 {
1558     struct v_fd* fd_s;
1559     int errno = 0;
1560
1561     if ((errno = vfs_getfd(fd, &fd_s))) {
1562         goto done;
1563     }
1564
1565     errno = vfs_do_chdir((struct proc_info*)__current, fd_s->file->dnode);
1566
1567 done:
1568     return DO_STATUS(errno);
1569 }
1570
1571
1572 __DEFINE_LXSYSCALL1(int, chroot, const char*, path)
1573 {
1574     int errno;
1575     struct v_dnode* dnode;
1576     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1577         return errno;
1578     }
1579
1580     lock_dnode(dnode);
1581
1582     errno = vfs_do_chdir_nolock(__current, dnode);
1583     if (errno) {
1584         unlock_dnode(dnode);
1585         goto done;
1586     }
1587
1588     __current->root = dnode;
1589
1590     unlock_dnode(dnode);
1591
1592 done:
1593     return DO_STATUS(errno);
1594 }
1595
1596 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1597 {
1598     int errno = 0;
1599     char* ret_ptr = 0;
1600     if (size < 2) {
1601         errno = ERANGE;
1602         goto done;
1603     }
1604
1605     size_t len = 0;
1606
1607     if (!__current->cwd) {
1608         *buf = VFS_PATH_DELIM;
1609         len = 1;
1610     } else {
1611         len = vfs_get_path(__current->cwd, buf, size, 0);
1612         if (len == size) {
1613             errno = ERANGE;
1614             goto done;
1615         }
1616     }
1617
1618     buf[len] = '\0';
1619
1620     ret_ptr = buf;
1621
1622 done:
1623     syscall_result(errno);
1624     return ret_ptr;
1625 }
1626
1627 int
1628 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1629 {
1630     int errno = 0;
1631     if (current->inode->id == target->inode->id) {
1632         // hard link
1633         return 0;
1634     }
1635
1636     if ((errno = vfs_check_writable(current))) {
1637         return errno;
1638     }
1639
1640     if (current->ref_count > 1 || target->ref_count > 1) {
1641         return EBUSY;
1642     }
1643
1644     if (current->super_block != target->super_block) {
1645         return EXDEV;
1646     }
1647
1648     struct v_dnode* oldparent = current->parent;
1649     struct v_dnode* newparent = target->parent;
1650
1651     lock_dnode(current);
1652     lock_dnode(target);
1653     if (oldparent)
1654         lock_dnode(oldparent);
1655     if (newparent)
1656         lock_dnode(newparent);
1657
1658     if (!llist_empty(&target->children)) {
1659         errno = ENOTEMPTY;
1660         unlock_dnode(target);
1661         goto cleanup;
1662     }
1663
1664     if ((errno =
1665            current->inode->ops->rename(current->inode, current, target))) {
1666         unlock_dnode(target);
1667         goto cleanup;
1668     }
1669
1670     // re-position current
1671     hstrcpy(&current->name, &target->name);
1672     vfs_dcache_rehash(newparent, current);
1673
1674     // detach target
1675     vfs_d_free(target);
1676
1677     unlock_dnode(target);
1678
1679 cleanup:
1680     unlock_dnode(current);
1681     if (oldparent)
1682         unlock_dnode(oldparent);
1683     if (newparent)
1684         unlock_dnode(newparent);
1685
1686     return errno;
1687 }
1688
1689 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1690 {
1691     struct v_dnode *cur, *target_parent, *target;
1692     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1693     int errno = 0;
1694
1695     if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1696         goto done;
1697     }
1698
1699     if ((errno = vfs_walk(
1700            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1701         goto done;
1702     }
1703
1704     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1705     if (errno == ENOENT) {
1706         target = vfs_d_alloc(target_parent, &name);
1707         vfs_dcache_add(target_parent, target);
1708     } else if (errno) {
1709         goto done;
1710     }
1711
1712     if (!target) {
1713         errno = ENOMEM;
1714         goto done;
1715     }
1716
1717     errno = vfs_do_rename(cur, target);
1718
1719 done:
1720     vfree((void*)name.value);
1721     return DO_STATUS(errno);
1722 }
1723
1724 __DEFINE_LXSYSCALL2(int, fstat, int, fd, struct file_stat*, stat)
1725 {
1726     int errno = 0;
1727     struct v_fd* fds;
1728
1729     if ((errno = vfs_getfd(fd, &fds))) {
1730         goto done;
1731     }
1732
1733     struct v_inode* vino = fds->file->inode;
1734     struct device* fdev = vino->sb->dev;
1735
1736     stat->st_ino     = vino->id;
1737     stat->st_blocks  = vino->lb_usage;
1738     stat->st_size    = vino->fsize;
1739     stat->st_blksize = vino->sb->blksize;
1740     stat->st_nlink   = vino->link_count;
1741     stat->st_uid     = vino->uid;
1742     stat->st_gid     = vino->gid;
1743
1744     stat->st_ctim    = vino->ctime;
1745     stat->st_atim    = vino->atime;
1746     stat->st_mtim    = vino->mtime;
1747
1748     stat->st_mode    = (vino->itype << 16) | vino->acl;
1749
1750     stat->st_ioblksize = PAGE_SIZE;
1751
1752     if (check_device_node(vino)) {
1753         struct device* rdev = resolve_device(vino->data);
1754         if (!rdev) {
1755             errno = EINVAL;
1756             goto done;
1757         }
1758
1759         stat->st_rdev = (dev_t){.meta = rdev->ident.fn_grp,
1760                                 .unique = rdev->ident.unique,
1761                                 .index = dev_uid(rdev) };
1762     }
1763
1764     if (fdev) {
1765         stat->st_dev = (dev_t){.meta = fdev->ident.fn_grp,
1766                                .unique = fdev->ident.unique,
1767                                .index = dev_uid(fdev) };
1768     }
1769
1770 done:
1771     return DO_STATUS(errno);
1772 }
1773
1774 __DEFINE_LXSYSCALL4(int, fchmodat, int, fd,
1775                     const char*, path, int, mode, int, flags)
1776 {
1777     int errno;
1778     struct v_dnode *dnode;
1779     struct v_inode* inode;
1780
1781     errno = vfs_walkat(fd, path, flags, &dnode);
1782     if (errno) {
1783         goto done;
1784     }
1785
1786     errno = vfs_check_writable(dnode);
1787     if (errno) {
1788         return errno;
1789     }
1790
1791     inode = dnode->inode;
1792     lock_inode(inode);
1793
1794     if (!current_is_root()) {
1795         mode = mode & FSACL_RWXMASK;
1796     }
1797
1798     inode->acl = mode;
1799     __vfs_touch_inode(inode, INODE_MODIFY);
1800
1801     unlock_inode(inode);
1802
1803 done:
1804     return DO_STATUS(errno);
1805 }
1806
1807 __DEFINE_LXSYSCALL5(int, fchownat, int, fd,
1808                     const char*, path, uid_t, uid, gid_t, gid, int, flags)
1809 {
1810     int errno;
1811     struct v_dnode *dnode;
1812     struct v_inode *inode;
1813
1814     errno = vfs_walkat(fd, path, flags, &dnode);
1815     if (errno) {
1816         goto done;
1817     }
1818
1819     errno = vfs_check_writable(dnode);
1820     if (errno) {
1821         return errno;
1822     }
1823
1824     inode = dnode->inode;
1825     lock_inode(inode);
1826
1827     inode->uid = uid;
1828     inode->gid = gid;
1829     __vfs_touch_inode(inode, INODE_MODIFY);
1830
1831     unlock_inode(inode);
1832
1833 done:
1834     return DO_STATUS(errno);
1835 }
1836
1837 __DEFINE_LXSYSCALL4(int, faccessat, int, fd,
1838                     const char*, path, int, amode, int, flags)
1839 {
1840     int errno, acl;
1841     struct v_dnode *dnode;
1842     struct v_inode *inode;
1843     struct user_scope* uscope;
1844
1845     uid_t tuid;
1846     gid_t tgid;
1847
1848     errno = vfs_walkat(fd, path, flags, &dnode);
1849     if (errno) {
1850         goto done;
1851     }
1852
1853     if ((flags & AT_EACCESS)) {
1854         tuid = current_euid();
1855         tgid = current_egid();
1856     }
1857     else {
1858         uscope = current_user_scope();
1859         tuid = uscope->ruid;
1860         tgid = uscope->rgid;
1861     }
1862
1863     inode = dnode->inode;
1864
1865     acl  = inode->acl;
1866     acl &= amode;
1867     acl &= check_acl_between(inode->uid, inode->gid, tuid, tgid);
1868     if (!acl) {
1869         errno = EACCESS;
1870     }
1871
1872 done:
1873     return DO_STATUS(errno);
1874 }