lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
   4  * @brief Lunaix virtual file system - an abstraction layer for all file system.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
  14 /*
  15  TODO vfs & device todos checklist
  16
  17     It is overseen by Twilight Sparkle ;)
  18
  19  1. Get inodes hooked into lru (CHECKED)
  20  2. Get dnodes hooked into lru (CHECKED)
 3. Get inodes properly hashed so they can be reused by underlying fs (CHECKED)
  22  4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
  23         [good idea] or a constructor/destructor pattern in cake allocator ?
 5. (mount) Figure out a way to identify a busy mount point before unmount
            maybe a unified mount_point structure that maintains a reference
            counter on any dnodes within the subtree? Such a counter will only
            increment if a file is opened or a dnode is being used as working
            directory, and decrement conversely. (CHECKED)
  29  6. (mount) Ability to track all mount points (including sub-mounts)
  30             so we can be confident to clean up everything when we
  31             unmount. (CHECKED)
  32  7. (mount) Figure out a way to acquire the device represented by a dnode.
  33             so it can be used to mount. (e.g. we wish to get `struct device*`
  34             out of the dnode at /dev/sda)
  35             [tip] we should pay attention at twifs and add a private_data field
  36             under struct v_dnode? (CHECKED)
  37  8. (mount) Then, we should refactor on mount/unmount mechanism. (CHECKED)
 9. (mount) (future) Ability to mount anything? e.g. Linux can mount a disk
  39                     image file using a so called "loopback" pseudo device. Maybe
  40                     we can do similar thing in Lunaix? A block device emulation
  41                     above the regular file when we mount it on.
  42  10. (device) device number (dev_t) allocation
  43             [good idea] <class>:<subclass>:<uniq_id> composition (CHECKED)
  44 */
  45
  46 #include <klibc/string.h>
  47 #include <lunaix/foptions.h>
  48 #include <lunaix/fs.h>
  49 #include <lunaix/mm/cake.h>
  50 #include <lunaix/mm/valloc.h>
  51 #include <lunaix/process.h>
  52 #include <lunaix/spike.h>
  53 #include <lunaix/syscall.h>
  54 #include <lunaix/syscall_utils.h>
  55
  56 #include <lunaix/fs/twifs.h>
  57
  58 #include <usr/lunaix/dirent.h>
  59
  60 #define INODE_ACCESSED  0
  61 #define INODE_MODIFY    1
  62
  63 static struct cake_pile* dnode_pile;
  64 static struct cake_pile* inode_pile;
  65 static struct cake_pile* file_pile;
  66 static struct cake_pile* superblock_pile;
  67 static struct cake_pile* fd_pile;
  68
  69 struct v_dnode* vfs_sysroot = NULL;
  70
  71 struct lru_zone *dnode_lru, *inode_lru;
  72
  73 struct hstr vfs_ddot = HSTR("..", 2);
  74 struct hstr vfs_dot = HSTR(".", 1);
  75 struct hstr vfs_empty = HSTR("", 0);
  76
  77 static int
  78 __vfs_try_evict_dnode(struct lru_node* obj);
  79
  80 static int
  81 __vfs_try_evict_inode(struct lru_node* obj);
  82
  83 void
  84 vfs_init()
  85 {
  86     // 为他们专门创建一个蛋糕堆，而不使用valloc，这样我们可以最小化内碎片的产生
  87     dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
  88     inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
  89     file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
  90     fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
  91     superblock_pile =
  92       cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
  93
  94     dnode_lru = lru_new_zone("vfs_dnode", __vfs_try_evict_dnode);
  95     inode_lru = lru_new_zone("vfs_inode", __vfs_try_evict_inode);
  96
  97     hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
  98     hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
  99
 100     // 创建一个根dnode。
 101     vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
 102     vfs_sysroot->parent = vfs_sysroot;
 103
 104     vfs_ref_dnode(vfs_sysroot);
 105     lru_remove(dnode_lru, &vfs_sysroot->lru);
 106 }
 107
 108 void
 109 vfs_vncache_init(struct vncache* cache)
 110 {
 111     cache->pool = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 112     rwlock_init(&cache->lock);
 113 }
 114
 115 void
 116 vfs_vncache_free(struct vncache* cache)
 117 {
 118     // clear all other reader/writer
 119     rwlock_begin_write(&cache->lock);
 120     vfree(cache->pool);
 121
 122     // already freed, so as the lock
 123 }
 124
 125 void
 126 vfs_vncache_add(struct vncache* cache, size_t key, struct hlist_node* node)
 127 {
 128     struct hbucket* slot;
 129
 130     cache_atomic_write(cache,
 131     {
 132         slot = &cache->pool[key & VFS_HASH_MASK];
 133         hlist_delete(node);
 134         hlist_add(&slot->head, node);
 135     });
 136 }
 137
 138 static inline struct hbucket*
 139 __dcache_hash_nolock(struct v_dnode* parent, u32_t* hash)
 140 {
 141     struct v_superblock* sb;
 142     struct hbucket* d_cache;
 143     u32_t _hash;
 144
 145     sb = parent->super_block;
 146
 147     _hash = *hash;
 148     _hash = _hash ^ (_hash >> VFS_HASHBITS);
 149     _hash += (u32_t)__ptr(parent);
 150
 151     *hash = _hash;
 152     return &sb->d_cache.pool[_hash & VFS_HASH_MASK];
 153 }
 154
 155 static inline int
 156 __sync_inode_nolock(struct v_inode* inode)
 157 {
 158     pcache_commit_all(inode);
 159
 160     int errno = ENOTSUP;
 161     if (inode->ops->sync) {
 162         errno = inode->ops->sync(inode);
 163     }
 164
 165     return errno;
 166 }
 167
 168 struct v_dnode*
 169 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 170 {
 171     u32_t hash;
 172     struct hbucket* slot;
 173     struct v_dnode *pos, *n;
 174     struct vncache *dcache;
 175
 176     if (!str->len || HSTR_EQ(str, &vfs_dot))
 177         return parent;
 178
 179     if (HSTR_EQ(str, &vfs_ddot)) {
 180         return parent->parent;
 181     }
 182
 183     hash = str->hash;
 184     dcache = dnode_cache(parent);
 185
 186     vncache_lock_read(dcache);
 187
 188     slot = __dcache_hash_nolock(parent, &hash);
 189     hashtable_bucket_foreach(slot, pos, n, hash_list)
 190     {
 191         if (pos->name.hash != hash || pos->parent != parent) {
 192             continue;
 193         }
 194
 195         vncache_unlock_read(dcache);
 196         return pos;
 197     }
 198
 199     vncache_unlock_read(dcache);
 200     return NULL;
 201 }
 202
 203 static void
 204 __vfs_touch_inode(struct v_inode* inode, const int type)
 205 {
 206     if (type == INODE_MODIFY) {
 207         inode->mtime = clock_unixtime();
 208     }
 209
 210     else if (type == INODE_ACCESSED) {
 211         inode->atime = clock_unixtime();
 212     }
 213
 214     lru_use_one(inode_lru, &inode->lru);
 215 }
 216
 217 void
 218 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 219 {
 220     struct hbucket* bucket;
 221     struct vncache* cache;
 222
 223     assert(parent);
 224     assert(locked_node(parent));
 225
 226     dnode->ref_count = 1;
 227     dnode->parent = parent;
 228     llist_append(&parent->children, &dnode->siblings);
 229
 230     cache_atomic_write(dnode_cache(parent),
 231     {
 232         bucket = __dcache_hash_nolock(parent, &dnode->name.hash);
 233         hlist_add(&bucket->head, &dnode->hash_list);
 234     });
 235 }
 236
 237 void
 238 vfs_dcache_remove(struct v_dnode* dnode)
 239 {
 240     assert(dnode);
 241     assert(dnode->ref_count == 1);
 242
 243     llist_delete(&dnode->siblings);
 244     llist_delete(&dnode->aka_list);
 245     lru_remove(dnode_lru, &dnode->lru);
 246
 247     cache_atomic_write(dnode_cache(dnode),
 248     {
 249         hlist_delete(&dnode->hash_list);
 250     });
 251
 252     dnode->parent = NULL;
 253     dnode->ref_count = 0;
 254 }
 255
 256 void
 257 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
 258 {
 259     assert(new_parent);
 260     assert(locked_node(new_parent));
 261
 262     dnode_atomic(dnode,
 263     {
 264         hstr_rehash(&dnode->name, HSTR_FULL_HASH);
 265         vfs_dcache_remove(dnode);
 266         vfs_dcache_add(new_parent, dnode);
 267     });
 268 }
 269
 270 int
 271 vfs_open(struct v_dnode* dnode, struct v_file** file)
 272 {
 273     struct v_inode* inode = dnode->inode;
 274
 275     if (!inode || !inode->ops->open) {
 276         return ENOTSUP;
 277     }
 278
 279     lock_inode(inode);
 280
 281     struct v_file* vfile = cake_grab(file_pile);
 282     memset(vfile, 0, sizeof(*vfile));
 283
 284     vfile->dnode = dnode;
 285     vfile->inode = inode;
 286     vfile->ref_count = 1;
 287     vfile->ops = inode->default_fops;
 288
 289     if (check_regfile_node(inode) && !inode->pg_cache) {
 290         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 291         pcache_init(pcache);
 292         pcache->master = inode;
 293         inode->pg_cache = pcache;
 294     }
 295
 296     int errno = inode->ops->open(inode, vfile);
 297     if (errno) {
 298         cake_release(file_pile, vfile);
 299     } else {
 300         vfs_ref_dnode(dnode);
 301         inode->open_count++;
 302
 303         *file = vfile;
 304     }
 305
 306     unlock_inode(inode);
 307
 308     return errno;
 309 }
 310
 311 void
 312 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
 313 {
 314     lock_dnode(assign_to);
 315
 316     if (assign_to->inode) {
 317         llist_delete(&assign_to->aka_list);
 318         assign_to->inode->link_count--;
 319     }
 320
 321     llist_append(&inode->aka_dnodes, &assign_to->aka_list);
 322     assign_to->inode = inode;
 323     inode->link_count++;
 324
 325     unlock_dnode(assign_to);
 326 }
 327
 328 int
 329 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 330 {
 331     int errno;
 332     struct v_inode* inode;
 333
 334     inode = to_link->inode;
 335
 336     if ((errno = vfs_check_writable(to_link))) {
 337         return errno;
 338     }
 339
 340     lock_inode(inode);
 341
 342     if (to_link->super_block->root != name->super_block->root) {
 343         errno = EXDEV;
 344     } else if (!inode->ops->link) {
 345         errno = ENOTSUP;
 346     } else if (!(errno = inode->ops->link(inode, name))) {
 347         vfs_assign_inode(name, inode);
 348     }
 349
 350     unlock_inode(inode);
 351
 352     return errno;
 353 }
 354
 355 int
 356 vfs_pclose(struct v_file* file, pid_t pid)
 357 {
 358     struct v_inode* inode;
 359     int errno = 0;
 360
 361     inode = file->inode;
 362
 363     if (vfs_check_duped_file(file)) {
 364         vfs_unref_file(file);
 365         return 0;
 366     }
 367
 368     /*
 369      * Prevent dead lock.
 370      * This happened when process is terminated while blocking on read.
 371      * In that case, the process is still holding the inode lock and it
 372          will never get released.
 373      * The unlocking should also include ownership check.
 374      *
 375      * To see why, consider two process both open the same file both with
 376      * fd=x.
 377      *      Process A: busy on reading x
 378      *      Process B: do nothing with x
 379      * Assuming that, after a very short time, process B get terminated
 380      * while process A is still busy in it's reading business. By this
 381      * design, the inode lock of this file x is get released by B rather
 382      * than A. And this will cause a probable race condition on A if other
 383      * process is writing to this file later after B exit.
 384     */
 385     mutex_unlock_for(&inode->lock, pid);
 386
 387     // now regain lock for inode syncing
 388
 389     lock_inode(inode);
 390
 391     if ((errno = file->ops->close(file))) {
 392         goto done;
 393     }
 394
 395     vfs_unref_dnode(file->dnode);
 396     cake_release(file_pile, file);
 397
 398     pcache_commit_all(inode);
 399     inode->open_count--;
 400
 401     if (!inode->open_count) {
 402         __sync_inode_nolock(inode);
 403     }
 404
 405 done:
 406     unlock_inode(inode);
 407     return errno;
 408 }
 409
 410 int
 411 vfs_close(struct v_file* file)
 412 {
 413     return vfs_pclose(file, __current->pid);
 414 }
 415
 416 void
 417 vfs_free_fd(struct v_fd* fd)
 418 {
 419     cake_release(fd_pile, fd);
 420 }
 421
 422 int
 423 vfs_isync(struct v_inode* inode)
 424 {
 425     lock_inode(inode);
 426
 427     int errno = __sync_inode_nolock(inode);
 428
 429     unlock_inode(inode);
 430
 431     return errno;
 432 }
 433
 434 int
 435 vfs_fsync(struct v_file* file)
 436 {
 437     int errno;
 438     if ((errno = vfs_check_writable(file->dnode))) {
 439         return errno;
 440     }
 441
 442     return vfs_isync(file->inode);
 443 }
 444
 445 int
 446 vfs_alloc_fdslot(int* fd)
 447 {
 448     struct v_fdtable* fdtab;
 449
 450     fdtab = __current->fdtable;
 451     lock_fdtable(fdtab);
 452
 453     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 454         if (__current->fdtable->fds[i]) {
 455             continue;
 456         }
 457
 458         *fd = i;
 459         unlock_fdtable(fdtab);
 460         return 0;
 461     }
 462
 463     unlock_fdtable(fdtab);
 464     return EMFILE;
 465 }
 466
 467 struct v_superblock*
 468 vfs_sb_alloc()
 469 {
 470     struct v_superblock* sb = cake_grab(superblock_pile);
 471     memset(sb, 0, sizeof(*sb));
 472     llist_init_head(&sb->sb_list);
 473
 474     vfs_vncache_init(&sb->i_cache);
 475     vfs_vncache_init(&sb->d_cache);
 476
 477     sb->ref_count = 1;
 478     return sb;
 479 }
 480
 481 void
 482 vfs_sb_ref(struct v_superblock* sb)
 483 {
 484     sb->ref_count++;
 485 }
 486
 487 void
 488 vfs_sb_unref(struct v_superblock* sb)
 489 {
 490     assert(sb->ref_count);
 491
 492     sb->ref_count--;
 493     if (likely(sb->ref_count)) {
 494         return;
 495     }
 496
 497     if (sb->ops.release) {
 498         sb->ops.release(sb);
 499     }
 500
 501     vfs_vncache_free(&sb->i_cache);
 502     vfs_vncache_free(&sb->d_cache);
 503
 504     cake_release(superblock_pile, sb);
 505 }
 506
 507 static inline bool
 508 __dnode_evictable(struct v_dnode* dnode)
 509 {
 510     return dnode->ref_count == 1
 511         && llist_empty(&dnode->children);
 512 }
 513
 514 static bool
 515 __vfs_try_evict_dnode(struct lru_node* obj)
 516 {
 517     struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
 518
 519     if (mutex_on_hold(&dnode->lock))
 520         return false;
 521
 522     if (!__dnode_evictable(dnode)) {
 523         return false;
 524     }
 525
 526     vfs_d_free(dnode);
 527     return true;
 528 }
 529
 530 static bool
 531 __vfs_try_evict_inode(struct lru_node* obj)
 532 {
 533     struct v_inode* inode = container_of(obj, struct v_inode, lru);
 534
 535     if (!inode->link_count && !inode->open_count) {
 536         vfs_i_free(inode);
 537         return 1;
 538     }
 539     return 0;
 540 }
 541
 542 struct v_dnode*
 543 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
 544 {
 545     struct v_dnode* dnode = cake_grab(dnode_pile);
 546     if (!dnode) {
 547         lru_evict_half(dnode_lru);
 548
 549         if (!(dnode = cake_grab(dnode_pile))) {
 550             return NULL;
 551         }
 552     }
 553
 554     memset(dnode, 0, sizeof(*dnode));
 555     llist_init_head(&dnode->children);
 556     llist_init_head(&dnode->siblings);
 557     llist_init_head(&dnode->aka_list);
 558     mutex_init(&dnode->lock);
 559
 560     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 561
 562     hstrcpy(&dnode->name, name);
 563
 564     if (parent) {
 565         vfs_d_assign_sb(dnode, parent->super_block);
 566         dnode->mnt = parent->mnt;
 567     }
 568
 569     lru_use_one(dnode_lru, &dnode->lru);
 570
 571     return dnode;
 572 }
 573
 574 void
 575 vfs_d_free(struct v_dnode* dnode)
 576 {
 577     assert(dnode->ref_count == 1);
 578
 579     if (dnode->inode) {
 580         assert(dnode->inode->link_count > 0);
 581         dnode->inode->link_count--;
 582     }
 583
 584     vfs_dcache_remove(dnode);
 585
 586     // Make sure the children de-referencing their parent.
 587     // With lru presented, the eviction will be propagated over the entire
 588     // detached subtree eventually
 589     struct v_dnode *pos, *n;
 590     llist_for_each(pos, n, &dnode->children, siblings)
 591     {
 592         vfs_dcache_remove(pos);
 593     }
 594
 595     if (dnode->destruct) {
 596         dnode->destruct(dnode);
 597     }
 598
 599     vfs_sb_unref(dnode->super_block);
 600
 601     vfree((void*)dnode->name.value);
 602     cake_release(dnode_pile, dnode);
 603 }
 604
 605 struct v_inode*
 606 vfs_i_find(struct v_superblock* sb, u32_t i_id)
 607 {
 608     struct hbucket* slot;
 609     struct v_inode *pos, *n, *found = NULL;
 610
 611     cache_atomic_read(&sb->i_cache,
 612     {
 613         slot = &sb->i_cache.pool[i_id & VFS_HASH_MASK];
 614
 615         hashtable_bucket_foreach(slot, pos, n, hash_list)
 616         {
 617             if (pos->id != i_id) {
 618                 continue;
 619             }
 620
 621             lru_use_one(inode_lru, &pos->lru);
 622             found = pos;
 623             break;
 624         }
 625     });
 626
 627     return found;
 628 }
 629
 630 void
 631 vfs_i_addhash(struct v_inode* inode)
 632 {
 633     vfs_vncache_add(inode_cache(inode), inode->id, &inode->hash_list);
 634 }
 635
 636 struct v_inode*
 637 vfs_i_alloc(struct v_superblock* sb)
 638 {
 639     assert(sb->ops.init_inode);
 640
 641     struct v_inode* inode;
 642     if (!(inode = cake_grab(inode_pile))) {
 643         lru_evict_half(inode_lru);
 644         if (!(inode = cake_grab(inode_pile))) {
 645             return NULL;
 646         }
 647     }
 648
 649     memset(inode, 0, sizeof(*inode));
 650     mutex_init(&inode->lock);
 651     llist_init_head(&inode->xattrs);
 652     llist_init_head(&inode->aka_dnodes);
 653
 654     sb->ops.init_inode(sb, inode);
 655
 656     inode->ctime = clock_unixtime();
 657     inode->atime = inode->ctime;
 658     inode->mtime = inode->ctime;
 659
 660     vfs_i_assign_sb(inode, sb);
 661     lru_use_one(inode_lru, &inode->lru);
 662
 663     return inode;
 664 }
 665
 666 void
 667 vfs_i_free(struct v_inode* inode)
 668 {
 669     if (inode->pg_cache) {
 670         pcache_release(inode->pg_cache);
 671         vfree(inode->pg_cache);
 672     }
 673
 674     // we don't need to sync inode.
 675     // If an inode can be free, then it must be properly closed.
 676     // Hence it must be synced already!
 677     if (inode->destruct) {
 678         inode->destruct(inode);
 679     }
 680
 681     vfs_sb_unref(inode->sb);
 682
 683     hlist_delete(&inode->hash_list);
 684     lru_remove(inode_lru, &inode->lru);
 685
 686     cake_release(inode_pile, inode);
 687 }
 688
 689 /* ---- System call definition and support ---- */
 690
 691 // make a new name when not exists
 692 #define FLOC_MAYBE_MKNAME 1
 693
 694 // name must be non-exist and made.
 695 #define FLOC_MKNAME 2
 696
 697 // no follow symlink
 698 #define FLOC_NOFOLLOW 4
 699
 700 int
 701 vfs_getfd(int fd, struct v_fd** fd_s)
 702 {
 703     struct v_fdtable* fdtab;
 704
 705     if (!TEST_FD(fd)) {
 706         return EBADF;
 707     }
 708
 709     fdtab = __current->fdtable;
 710
 711     lock_fdtable(fdtab);
 712     *fd_s = __current->fdtable->fds[fd];
 713     unlock_fdtable(fdtab);
 714
 715     return !*fd_s ? EBADF : 0;
 716 }
 717
 718 static int
 719 __vfs_mknod(struct v_inode* parent, struct v_dnode* dnode,
 720             unsigned int itype, dev_t* dev)
 721 {
 722     int errno;
 723
 724     errno = parent->ops->create(parent, dnode, itype);
 725     if (errno) {
 726         return errno;
 727     }
 728
 729     return 0;
 730 }
 731
 732 struct file_locator {
 733     struct v_dnode* dir;
 734     struct v_dnode* file;
 735     bool fresh;
 736 };
 737
 738 /**
 739  * @brief unlock the file locator (floc) if possible.
 740  *        If the file to be located if not exists, and
 741  *        any FLOC_*MKNAME flag is set, then the parent
 742  *        dnode will be locked until the file has been properly
 743  *        finalised by subsequent logic.
 744  *
 745  * @param floc
 746  */
 747 static inline void
 748 __floc_try_unlock(struct file_locator* floc)
 749 {
 750     if (floc->fresh) {
 751         assert(floc->dir);
 752         unlock_dnode(floc->dir);
 753     }
 754 }
 755
 756 static int
 757 __vfs_try_locate_file(const char* path,
 758                       struct file_locator* floc,
 759                       int options)
 760 {
 761     char name_str[VFS_NAME_MAXLEN];
 762     struct v_dnode *fdir, *file;
 763     struct hstr name = HSTR(name_str, 0);
 764     int errno, woption = 0;
 765
 766     if ((options & FLOC_NOFOLLOW)) {
 767         woption |= VFS_WALK_NOFOLLOW;
 768         options &= ~FLOC_NOFOLLOW;
 769     }
 770
 771     floc->fresh = false;
 772     name_str[0] = 0;
 773     errno = vfs_walk_proc(path, &fdir, &name, woption | VFS_WALK_PARENT);
 774     if (errno) {
 775         return errno;
 776     }
 777
 778     lock_dnode(fdir);
 779
 780     errno = vfs_walk(fdir, name.value, &file, NULL, woption);
 781
 782     if (errno && errno != ENOENT) {
 783         goto error;
 784     }
 785
 786     if (!errno && (options & FLOC_MKNAME)) {
 787         errno = EEXIST;
 788         goto error;
 789     }
 790
 791     if (!errno) {
 792         // the file present, no need to hold the directory lock
 793         unlock_dnode(fdir);
 794         goto done;
 795     }
 796
 797     // errno == ENOENT
 798     if (!options) {
 799         goto error;
 800     }
 801
 802     errno = vfs_check_writable(fdir);
 803     if (errno) {
 804         goto error;
 805     }
 806
 807     floc->fresh = true;
 808
 809     file = vfs_d_alloc(fdir, &name);
 810
 811     if (!file) {
 812         errno = ENOMEM;
 813         goto error;
 814     }
 815
 816     vfs_dcache_add(fdir, file);
 817
 818 done:
 819     floc->dir   = fdir;
 820     floc->file  = file;
 821
 822     return errno;
 823
 824 error:
 825     unlock_dnode(fdir);
 826     return errno;
 827 }
 828
 829
 830 static bool
 831 __check_unlinkable(struct v_dnode* dnode)
 832 {
 833     int acl;
 834     bool wr_self, wr_parent;
 835     struct v_dnode* parent;
 836
 837     parent = dnode->parent;
 838     acl = dnode->inode->acl;
 839
 840     wr_self = check_allow_write(dnode->inode);
 841     wr_parent = check_allow_write(parent->inode);
 842
 843     if (!fsacl_test(acl, svtx)) {
 844         return wr_self;
 845     }
 846
 847     if (current_euid() == dnode->inode->uid) {
 848         return true;
 849     }
 850
 851     return wr_self && wr_parent;
 852 }
 853
 854 int
 855 vfs_do_open(const char* path, int options)
 856 {
 857     int errno, fd, loptions = 0;
 858     struct v_dnode *dentry, *file;
 859     struct v_file* ofile = NULL;
 860     struct file_locator floc;
 861     struct v_inode* inode;
 862
 863     if ((options & FO_CREATE)) {
 864         loptions |= FLOC_MAYBE_MKNAME;
 865     } else if ((options & FO_NOFOLLOW)) {
 866         loptions |= FLOC_NOFOLLOW;
 867     }
 868
 869     errno = __vfs_try_locate_file(path, &floc, loptions);
 870
 871     if (errno || (errno = vfs_alloc_fdslot(&fd))) {
 872         return errno;
 873     }
 874
 875     file   = floc.file;
 876     dentry = floc.dir;
 877
 878     if (floc.fresh) {
 879         errno = __vfs_mknod(dentry->inode, file, VFS_IFFILE, NULL);
 880         if (errno) {
 881             vfs_d_free(file);
 882             __floc_try_unlock(&floc);
 883             return errno;
 884         }
 885
 886         __floc_try_unlock(&floc);
 887     }
 888
 889
 890     if ((errno = vfs_open(file, &ofile))) {
 891         return errno;
 892     }
 893
 894     inode = ofile->inode;
 895     lock_inode(inode);
 896
 897     struct v_fd* fd_s = cake_grab(fd_pile);
 898     memset(fd_s, 0, sizeof(*fd_s));
 899
 900     if ((options & O_TRUNC)) {
 901         file->inode->fsize = 0;
 902     }
 903
 904     if (vfs_get_dtype(inode->itype) == DT_DIR) {
 905         ofile->f_pos = 0;
 906     }
 907
 908     fd_s->file = ofile;
 909     fd_s->flags = options;
 910     __current->fdtable->fds[fd] = fd_s;
 911
 912     unlock_inode(inode);
 913
 914     return fd;
 915 }
 916
 917 __DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
 918 {
 919     int errno = vfs_do_open(path, options);
 920     return DO_STATUS_OR_RETURN(errno);
 921 }
 922
 923 __DEFINE_LXSYSCALL1(int, close, int, fd)
 924 {
 925     struct v_fd* fd_s;
 926     int errno = 0;
 927     if ((errno = vfs_getfd(fd, &fd_s))) {
 928         goto done_err;
 929     }
 930
 931     if ((errno = vfs_close(fd_s->file))) {
 932         goto done_err;
 933     }
 934
 935     cake_release(fd_pile, fd_s);
 936     __current->fdtable->fds[fd] = 0;
 937
 938 done_err:
 939     return DO_STATUS(errno);
 940 }
 941
 942 void
 943 __vfs_readdir_callback(struct dir_context* dctx,
 944                        const char* name,
 945                        const int len,
 946                        const int dtype)
 947 {
 948     struct lx_dirent* dent = (struct lx_dirent*)dctx->cb_data;
 949     strncpy(dent->d_name, name, MIN(len, DIRENT_NAME_MAX_LEN));
 950     dent->d_nlen = len;
 951     dent->d_type = dtype;
 952 }
 953
 954 __DEFINE_LXSYSCALL2(int, sys_readdir, int, fd, struct lx_dirent*, dent)
 955 {
 956     struct v_fd* fd_s;
 957     int errno;
 958
 959     if ((errno = vfs_getfd(fd, &fd_s))) {
 960         goto done;
 961     }
 962
 963     struct v_inode* inode = fd_s->file->inode;
 964
 965     lock_inode(inode);
 966
 967     if (!check_directory_node(inode)) {
 968         errno = ENOTDIR;
 969         goto unlock;
 970     }
 971
 972     if (!check_allow_read(inode)) {
 973         errno = EPERM;
 974         goto unlock;
 975     }
 976
 977     struct dir_context dctx = (struct dir_context) {
 978         .cb_data = dent,
 979         .read_complete_callback = __vfs_readdir_callback
 980     };
 981
 982     if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
 983         goto unlock;
 984     }
 985
 986     dent->d_offset++;
 987     fd_s->file->f_pos++;
 988
 989 unlock:
 990     unlock_inode(inode);
 991
 992 done:
 993     return DO_STATUS_OR_RETURN(errno);
 994 }
 995
 996 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 997 {
 998     int errno = 0;
 999     struct v_fd* fd_s;
1000     struct v_inode* inode;
1001
1002     if ((errno = vfs_getfd(fd, &fd_s))) {
1003         goto done;
1004     }
1005
1006     struct v_file* file = fd_s->file;
1007     if (check_directory_node(file->inode)) {
1008         errno = EISDIR;
1009         goto done;
1010     }
1011
1012     if (!check_allow_read(file->inode)) {
1013         errno = EPERM;
1014         goto done;
1015     }
1016
1017     inode = file->inode;
1018     lock_inode(inode);
1019
1020     __vfs_touch_inode(inode, INODE_ACCESSED);
1021
1022     if (check_seqdev_node(inode) || (fd_s->flags & FO_DIRECT)) {
1023         errno = file->ops->read(inode, buf, count, file->f_pos);
1024     } else {
1025         errno = pcache_read(inode, buf, count, file->f_pos);
1026     }
1027
1028     if (errno > 0) {
1029         file->f_pos += errno;
1030         unlock_inode(inode);
1031         return errno;
1032     }
1033
1034     unlock_inode(inode);
1035
1036 done:
1037     return DO_STATUS(errno);
1038 }
1039
1040 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
1041 {
1042     int errno = 0;
1043     struct v_fd* fd_s;
1044     if ((errno = vfs_getfd(fd, &fd_s))) {
1045         goto done;
1046     }
1047
1048     struct v_inode* inode;
1049     struct v_file* file = fd_s->file;
1050
1051     if ((errno = vfs_check_writable(file->dnode))) {
1052         goto done;
1053     }
1054
1055     if (check_directory_node(file->inode)) {
1056         errno = EISDIR;
1057         goto done;
1058     }
1059
1060     inode = file->inode;
1061     lock_inode(inode);
1062
1063     __vfs_touch_inode(inode, INODE_MODIFY);
1064     if ((fd_s->flags & O_APPEND)) {
1065         file->f_pos = inode->fsize;
1066     }
1067
1068     if (check_seqdev_node(inode) || (fd_s->flags & FO_DIRECT)) {
1069         errno = file->ops->write(inode, buf, count, file->f_pos);
1070     } else {
1071         errno = pcache_write(inode, buf, count, file->f_pos);
1072     }
1073
1074     if (errno > 0) {
1075         file->f_pos += errno;
1076         inode->fsize = MAX(inode->fsize, file->f_pos);
1077
1078         unlock_inode(inode);
1079         return errno;
1080     }
1081
1082     unlock_inode(inode);
1083
1084 done:
1085     return DO_STATUS(errno);
1086 }
1087
1088 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
1089 {
1090     int errno = 0;
1091     struct v_fd* fd_s;
1092     if ((errno = vfs_getfd(fd, &fd_s))) {
1093         goto done;
1094     }
1095
1096     struct v_file* file = fd_s->file;
1097     struct v_inode* inode = file->inode;
1098
1099     if (!file->ops->seek) {
1100         errno = ENOTSUP;
1101         goto done;
1102     }
1103
1104     if (!check_allow_read(inode)) {
1105         errno = EPERM;
1106         goto done;
1107     }
1108
1109     lock_inode(inode);
1110
1111     int overflow = 0;
1112     int fpos = file->f_pos;
1113
1114     if (vfs_get_dtype(inode->itype) == DT_DIR) {
1115         options = (options != FSEEK_END) ? options : FSEEK_SET;
1116     }
1117
1118     switch (options) {
1119         case FSEEK_CUR:
1120             overflow = sadd_of((int)file->f_pos, offset, &fpos);
1121             break;
1122         case FSEEK_END:
1123             overflow = sadd_of((int)inode->fsize, offset, &fpos);
1124             break;
1125         case FSEEK_SET:
1126             fpos = offset;
1127             break;
1128     }
1129
1130     if (overflow) {
1131         errno = EOVERFLOW;
1132     }
1133     else {
1134         errno = file->ops->seek(file, fpos);
1135     }
1136
1137     unlock_inode(inode);
1138
1139 done:
1140     return DO_STATUS(errno);
1141 }
1142
1143 int
1144 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
1145 {
1146     if (!dnode) {
1147         return 0;
1148     }
1149
1150     if (depth > 64) {
1151         return ENAMETOOLONG;
1152     }
1153
1154     size_t len = 0;
1155
1156     if (dnode->parent != dnode) {
1157         len = vfs_get_path(dnode->parent, buf, size, depth + 1);
1158     }
1159
1160     if (len >= size) {
1161         return len;
1162     }
1163
1164     if (!len || buf[len - 1] != VFS_PATH_DELIM) {
1165         buf[len++] = VFS_PATH_DELIM;
1166     }
1167
1168     size_t cpy_size = MIN(dnode->name.len, size - len);
1169     strncpy(buf + len, dnode->name.value, cpy_size);
1170     len += cpy_size;
1171
1172     return len;
1173 }
1174
1175 int
1176 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
1177 {
1178     const char* link;
1179     struct v_inode* inode = dnode->inode;
1180
1181     if (!check_symlink_node(inode)) {
1182         return EINVAL;
1183     }
1184
1185     if (!inode->ops->read_symlink) {
1186         return ENOTSUP;
1187     }
1188
1189     if (!check_allow_read(inode)) {
1190         return EPERM;
1191     }
1192
1193     lock_inode(inode);
1194
1195     int errno = inode->ops->read_symlink(inode, &link);
1196     if (errno >= 0) {
1197         strncpy(buf, link, MIN(size, (size_t)errno));
1198     }
1199
1200     unlock_inode(inode);
1201     return errno;
1202 }
1203
1204 int
1205 vfs_get_dtype(int itype)
1206 {
1207     int dtype = DT_FILE;
1208     if (check_itype(itype, VFS_IFSYMLINK)) {
1209         dtype |= DT_SYMLINK;
1210     }
1211
1212     if (check_itype(itype, VFS_IFDIR)) {
1213         dtype |= DT_DIR;
1214         return dtype;
1215     }
1216
1217     // TODO other types
1218
1219     return dtype;
1220 }
1221
1222 struct v_fdtable*
1223 fdtable_create()
1224 {
1225     struct v_fdtable* fdtab;
1226
1227     fdtab = vzalloc(sizeof(struct v_fdtable));
1228     mutex_init(&fdtab->lock);
1229
1230     return fdtab;
1231 }
1232
1233 void
1234 fdtable_copy(struct v_fdtable* dest, struct v_fdtable* src)
1235 {
1236     lock_fdtable(dest);
1237     lock_fdtable(src);
1238
1239     for (size_t i = 0; i < VFS_MAX_FD; i++) {
1240         struct v_fd* fd = src->fds[i];
1241         if (!fd)
1242             continue;
1243         vfs_dup_fd(fd, &dest->fds[i]);
1244     }
1245
1246     unlock_fdtable(dest);
1247     unlock_fdtable(src);
1248 }
1249
1250 void
1251 fdtable_free(struct v_fdtable* table)
1252 {
1253     assert(!mutex_on_hold(&table->lock));
1254
1255     vfree(table);
1256 }
1257
1258 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
1259 {
1260     int errno;
1261     struct v_fd* fd_s;
1262     if ((errno = vfs_getfd(fd, &fd_s))) {
1263         goto done;
1264     }
1265
1266     struct v_dnode* dnode;
1267
1268     dnode = fd_s->file->dnode;
1269
1270     lock_dnode(dnode);
1271     errno = vfs_get_path(dnode, buf, size, 0);
1272     unlock_dnode(dnode);
1273
1274 done:
1275     return DO_STATUS(errno);
1276 }
1277
1278 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
1279 {
1280     int errno;
1281     struct v_dnode* dnode;
1282     if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
1283         errno = vfs_readlink(dnode, buf, size);
1284     }
1285
1286     if (errno >= 0) {
1287         return errno;
1288     }
1289
1290     return DO_STATUS(errno);
1291 }
1292
1293 __DEFINE_LXSYSCALL4(
1294   int, readlinkat, int, dirfd, const char*, pathname, char*, buf, size_t, size)
1295 {
1296     int errno;
1297     struct v_fd* fd_s;
1298     if ((errno = vfs_getfd(dirfd, &fd_s))) {
1299         goto done;
1300     }
1301
1302     pathname = pathname ? pathname : "";
1303
1304     struct v_dnode* dnode;
1305     if (!(errno = vfs_walk(
1306             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
1307         errno = vfs_readlink(fd_s->file->dnode, buf, size);
1308     }
1309
1310     if (errno >= 0) {
1311         return errno;
1312     }
1313
1314 done:
1315     return DO_STATUS(errno);
1316 }
1317
/*
    NOTE
    When we perform an operation that could affect the layout of a
    directory (i.e., rename, mkdir, rmdir), we must lock the parent dir
    whenever possible. This blocks any ongoing path walk from reaching
    it, hence avoiding any partial state.
*/
1325
1326 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
1327 {
1328     int errno;
1329     struct v_dnode* dnode;
1330     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1331         return DO_STATUS(errno);
1332     }
1333
1334     lock_dnode(dnode);
1335
1336     if (!__check_unlinkable(dnode)) {
1337         errno = EPERM;
1338         goto done;
1339     }
1340
1341     if ((errno = vfs_check_writable(dnode))) {
1342         goto done;
1343     }
1344
1345     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
1346         errno = EROFS;
1347         goto done;
1348     }
1349
1350     if (dnode->ref_count > 1 || dnode->inode->open_count) {
1351         errno = EBUSY;
1352         goto done;
1353     }
1354
1355     if (!llist_empty(&dnode->children)) {
1356         errno = ENOTEMPTY;
1357         goto done;
1358     }
1359
1360     struct v_dnode* parent = dnode->parent;
1361
1362     if (!parent) {
1363         errno = EINVAL;
1364         goto done;
1365     }
1366
1367     lock_dnode(parent);
1368     lock_inode(parent->inode);
1369
1370     if (check_directory_node(dnode->inode)) {
1371         errno = parent->inode->ops->rmdir(parent->inode, dnode);
1372         if (!errno) {
1373             vfs_dcache_remove(dnode);
1374         }
1375     } else {
1376         errno = ENOTDIR;
1377     }
1378
1379     unlock_inode(parent->inode);
1380     unlock_dnode(parent);
1381
1382 done:
1383     unlock_dnode(dnode);
1384     return DO_STATUS(errno);
1385 }
1386
1387 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
1388 {
1389     int errno;
1390     struct hstr name;
1391     struct v_inode* inode;
1392     struct v_dnode *parent, *dir;
1393     char name_value[VFS_NAME_MAXLEN];
1394
1395     name = HHSTR(name_value, 0, 0);
1396
1397     if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
1398         goto done;
1399     }
1400
1401     if (!(errno = vfs_walk(parent, name_value, &dir, NULL, 0))) {
1402         errno = EEXIST;
1403         goto done;
1404     }
1405
1406     if ((errno = vfs_check_writable(parent))) {
1407         goto done;
1408     }
1409
1410     if (!(dir = vfs_d_alloc(parent, &name))) {
1411         errno = ENOMEM;
1412         goto done;
1413     }
1414
1415     inode = parent->inode;
1416
1417     lock_dnode(parent);
1418     lock_inode(inode);
1419
1420     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
1421         errno = ENOTSUP;
1422     } else if (!inode->ops->mkdir) {
1423         errno = ENOTSUP;
1424     } else if (!check_directory_node(inode)) {
1425         errno = ENOTDIR;
1426     } else if (!(errno = inode->ops->mkdir(inode, dir))) {
1427         vfs_dcache_add(parent, dir);
1428         goto cleanup;
1429     }
1430
1431     vfs_d_free(dir);
1432
1433 cleanup:
1434     unlock_inode(inode);
1435     unlock_dnode(parent);
1436 done:
1437     return DO_STATUS(errno);
1438 }
1439
1440 static int
1441 __vfs_do_unlink(struct v_dnode* dnode)
1442 {
1443     int errno;
1444     struct v_inode* inode = dnode->inode;
1445
1446     if (dnode->ref_count > 1) {
1447         return EBUSY;
1448     }
1449
1450     if (!__check_unlinkable(dnode)) {
1451         return EPERM;
1452     }
1453
1454     if ((errno = vfs_check_writable(dnode))) {
1455         return errno;
1456     }
1457
1458     lock_inode(inode);
1459
1460     if (inode->open_count) {
1461         errno = EBUSY;
1462     } else if (!check_directory_node(inode)) {
1463         errno = inode->ops->unlink(inode, dnode);
1464         if (!errno) {
1465             vfs_d_free(dnode);
1466         }
1467     } else {
1468         errno = EISDIR;
1469     }
1470
1471     unlock_inode(inode);
1472
1473     return errno;
1474 }
1475
1476 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1477 {
1478     int errno;
1479     struct v_dnode* dnode;
1480     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1481         goto done;
1482     }
1483
1484     errno = __vfs_do_unlink(dnode);
1485
1486 done:
1487     return DO_STATUS(errno);
1488 }
1489
1490 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1491 {
1492     int errno;
1493     struct v_fd* fd_s;
1494     if ((errno = vfs_getfd(fd, &fd_s))) {
1495         goto done;
1496     }
1497
1498     struct v_dnode* dnode;
1499     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1500         errno = __vfs_do_unlink(dnode);
1501     }
1502
1503 done:
1504     return DO_STATUS(errno);
1505 }
1506
1507 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1508 {
1509     int errno;
1510     struct file_locator floc;
1511     struct v_dnode *to_link, *name_file;
1512
1513     errno = __vfs_try_locate_file(oldpath, &floc, 0);
1514     if (errno) {
1515         goto done;
1516     }
1517
1518     __floc_try_unlock(&floc);
1519
1520     to_link = floc.file;
1521     errno = __vfs_try_locate_file(newpath, &floc, FLOC_MKNAME);
1522     if (!errno) {
1523         goto done;
1524     }
1525
1526     name_file = floc.file;
1527     errno = vfs_link(to_link, name_file);
1528     if (errno) {
1529         vfs_d_free(name_file);
1530     }
1531
1532 done:
1533     __floc_try_unlock(&floc);
1534     return DO_STATUS(errno);
1535 }
1536
1537 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1538 {
1539     int errno;
1540     struct v_fd* fd_s;
1541
1542     if (!(errno = vfs_getfd(fildes, &fd_s))) {
1543         errno = vfs_fsync(fd_s->file);
1544     }
1545
1546     return DO_STATUS(errno);
1547 }
1548
1549 int
1550 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1551 {
1552     int errno = 0;
1553     struct v_fd* copied = cake_grab(fd_pile);
1554
1555     memcpy(copied, old, sizeof(struct v_fd));
1556
1557     vfs_ref_file(old->file);
1558
1559     *new = copied;
1560
1561     return errno;
1562 }
1563
1564 int
1565 vfs_dup2(int oldfd, int newfd)
1566 {
1567     int errno;
1568     struct v_fdtable* fdtab;
1569     struct v_fd *oldfd_s, *newfd_s;
1570
1571     if (newfd == oldfd) {
1572         return newfd;
1573     }
1574
1575     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1576         goto done;
1577     }
1578
1579     if (!TEST_FD(newfd)) {
1580         errno = EBADF;
1581         goto done;
1582     }
1583
1584     fdtab = __current->fdtable;
1585     lock_fdtable(fdtab);
1586
1587     newfd_s = fdtab->fds[newfd];
1588     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1589         goto unlock_and_done;
1590     }
1591
1592     if ((errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1593         goto unlock_and_done;
1594     }
1595
1596     fdtab->fds[newfd] = newfd_s;
1597
1598     unlock_fdtable(fdtab);
1599     return newfd;
1600
1601 unlock_and_done:
1602     unlock_fdtable(fdtab);
1603
1604 done:
1605     return DO_STATUS(errno);
1606 }
1607
1608 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1609 {
1610     return vfs_dup2(oldfd, newfd);
1611 }
1612
1613 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1614 {
1615     int errno, newfd;
1616     struct v_fd *oldfd_s, *newfd_s;
1617     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1618         goto done;
1619     }
1620
1621     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1622         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1623         __current->fdtable->fds[newfd] = newfd_s;
1624         return newfd;
1625     }
1626
1627 done:
1628     return DO_STATUS(errno);
1629 }
1630
1631 __DEFINE_LXSYSCALL2(
1632   int, symlink, const char*, pathname, const char*, link_target)
1633 {
1634     int errno;
1635     struct file_locator floc;
1636     struct v_dnode *file;
1637     struct v_inode *f_ino;
1638
1639     errno = __vfs_try_locate_file(pathname, &floc, FLOC_MKNAME);
1640     if (errno) {
1641         goto done;
1642     }
1643
1644     file = floc.file;
1645     errno = __vfs_mknod(floc.dir->inode, file, VFS_IFSYMLINK, NULL);
1646     if (errno) {
1647         vfs_d_free(file);
1648         goto done;
1649     }
1650
1651     f_ino = file->inode;
1652
1653     assert(f_ino);
1654
1655     errno = vfs_check_writable(file);
1656     if (errno) {
1657         goto done;
1658     }
1659
1660     if (!f_ino->ops->set_symlink) {
1661         errno = ENOTSUP;
1662         goto done;
1663     }
1664
1665     lock_inode(f_ino);
1666
1667     errno = f_ino->ops->set_symlink(f_ino, link_target);
1668
1669     unlock_inode(f_ino);
1670
1671 done:
1672     __floc_try_unlock(&floc);
1673     return DO_STATUS(errno);
1674 }
1675
1676 static int
1677 vfs_do_chdir_nolock(struct proc_info* proc, struct v_dnode* dnode)
1678 {
1679     if (!check_directory_node(dnode->inode)) {
1680         return ENOTDIR;
1681     }
1682
1683     if (proc->cwd) {
1684         vfs_unref_dnode(proc->cwd);
1685     }
1686
1687     vfs_ref_dnode(dnode);
1688     proc->cwd = dnode;
1689
1690     return 0;
1691 }
1692
/**
 * @brief Switch a process's working directory while holding the lock of
 *        the new directory's dnode.
 *
 * @param proc  process whose cwd is changed
 * @param dnode new working directory
 * @return the result of vfs_do_chdir_nolock()
 */
static int
vfs_do_chdir(struct proc_info* proc, struct v_dnode* dnode)
{
    int status;

    lock_dnode(dnode);
    status = vfs_do_chdir_nolock(proc, dnode);
    unlock_dnode(dnode);

    return status;
}
1706
1707 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1708 {
1709     struct v_dnode* dnode;
1710     int errno = 0;
1711
1712     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1713         goto done;
1714     }
1715
1716     errno = vfs_do_chdir((struct proc_info*)__current, dnode);
1717
1718 done:
1719     return DO_STATUS(errno);
1720 }
1721
1722 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1723 {
1724     struct v_fd* fd_s;
1725     int errno = 0;
1726
1727     if ((errno = vfs_getfd(fd, &fd_s))) {
1728         goto done;
1729     }
1730
1731     errno = vfs_do_chdir((struct proc_info*)__current, fd_s->file->dnode);
1732
1733 done:
1734     return DO_STATUS(errno);
1735 }
1736
1737
1738 __DEFINE_LXSYSCALL1(int, chroot, const char*, path)
1739 {
1740     int errno;
1741     struct v_dnode* dnode;
1742     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1743         return errno;
1744     }
1745
1746     lock_dnode(dnode);
1747
1748     errno = vfs_do_chdir_nolock(__current, dnode);
1749     if (errno) {
1750         unlock_dnode(dnode);
1751         goto done;
1752     }
1753
1754     __current->root = dnode;
1755
1756     unlock_dnode(dnode);
1757
1758 done:
1759     return DO_STATUS(errno);
1760 }
1761
1762 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1763 {
1764     int errno = 0;
1765     char* ret_ptr = 0;
1766     if (size < 2) {
1767         errno = ERANGE;
1768         goto done;
1769     }
1770
1771     size_t len = 0;
1772
1773     if (!__current->cwd) {
1774         *buf = VFS_PATH_DELIM;
1775         len = 1;
1776     } else {
1777         len = vfs_get_path(__current->cwd, buf, size, 0);
1778         if (len == size) {
1779             errno = ERANGE;
1780             goto done;
1781         }
1782     }
1783
1784     buf[len] = '\0';
1785
1786     ret_ptr = buf;
1787
1788 done:
1789     syscall_result(errno);
1790     return ret_ptr;
1791 }
1792
1793 int
1794 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1795 {
1796     int errno = 0;
1797     if (current->inode->id == target->inode->id) {
1798         // hard link
1799         return 0;
1800     }
1801
1802     if ((errno = vfs_check_writable(current))) {
1803         return errno;
1804     }
1805
1806     if (current->ref_count > 1 || target->ref_count > 1) {
1807         return EBUSY;
1808     }
1809
1810     if (current->super_block != target->super_block) {
1811         return EXDEV;
1812     }
1813
1814     struct v_dnode* oldparent = current->parent;
1815     struct v_dnode* newparent = target->parent;
1816
1817     lock_dnode(current);
1818     lock_dnode(target);
1819
1820     if (oldparent)
1821         lock_dnode(oldparent);
1822     if (newparent)
1823         lock_dnode(newparent);
1824
1825     if (!llist_empty(&target->children)) {
1826         errno = ENOTEMPTY;
1827         unlock_dnode(target);
1828         goto cleanup;
1829     }
1830
1831     if ((errno =
1832            current->inode->ops->rename(current->inode, current, target))) {
1833         unlock_dnode(target);
1834         goto cleanup;
1835     }
1836
1837     // re-position current
1838     hstrcpy(&current->name, &target->name);
1839     vfs_dcache_rehash(newparent, current);
1840
1841     // detach target
1842     vfs_d_free(target);
1843
1844     unlock_dnode(target);
1845
1846 cleanup:
1847     unlock_dnode(current);
1848
1849     if (oldparent)
1850         unlock_dnode(oldparent);
1851     if (newparent)
1852         unlock_dnode(newparent);
1853
1854     return errno;
1855 }
1856
1857 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1858 {
1859     struct v_dnode *cur, *target_parent, *target;
1860     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1861     int errno = 0;
1862
1863     if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1864         goto done;
1865     }
1866
1867     if ((errno = vfs_walk(
1868            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1869         goto done;
1870     }
1871
1872     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1873     if (errno == ENOENT) {
1874         target = vfs_d_alloc(target_parent, &name);
1875         vfs_dcache_add(target_parent, target);
1876     } else if (errno) {
1877         goto done;
1878     }
1879
1880     if (!target) {
1881         errno = ENOMEM;
1882         goto done;
1883     }
1884
1885     errno = vfs_do_rename(cur, target);
1886
1887 done:
1888     vfree((void*)name.value);
1889     return DO_STATUS(errno);
1890 }
1891
1892 __DEFINE_LXSYSCALL2(int, fstat, int, fd, struct file_stat*, stat)
1893 {
1894     int errno = 0;
1895     struct v_fd* fds;
1896
1897     if ((errno = vfs_getfd(fd, &fds))) {
1898         goto done;
1899     }
1900
1901     struct v_inode* vino = fds->file->inode;
1902     struct device* fdev = vino->sb->dev;
1903
1904     stat->st_ino     = vino->id;
1905     stat->st_blocks  = vino->lb_usage;
1906     stat->st_size    = vino->fsize;
1907     stat->st_blksize = vino->sb->blksize;
1908     stat->st_nlink   = vino->link_count;
1909     stat->st_uid     = vino->uid;
1910     stat->st_gid     = vino->gid;
1911
1912     stat->st_ctim    = vino->ctime;
1913     stat->st_atim    = vino->atime;
1914     stat->st_mtim    = vino->mtime;
1915
1916     stat->st_mode    = (vino->itype << 16) | vino->acl;
1917
1918     stat->st_ioblksize = PAGE_SIZE;
1919
1920     if (check_device_node(vino)) {
1921         struct device* rdev = resolve_device(vino->data);
1922         if (!rdev) {
1923             errno = EINVAL;
1924             goto done;
1925         }
1926
1927         stat->st_rdev = (dev_t){.meta = rdev->ident.fn_grp,
1928                                 .unique = rdev->ident.unique,
1929                                 .index = dev_uid(rdev) };
1930     }
1931
1932     if (fdev) {
1933         stat->st_dev = (dev_t){.meta = fdev->ident.fn_grp,
1934                                .unique = fdev->ident.unique,
1935                                .index = dev_uid(fdev) };
1936     }
1937
1938 done:
1939     return DO_STATUS(errno);
1940 }
1941
1942 __DEFINE_LXSYSCALL4(int, fchmodat, int, fd,
1943                     const char*, path, int, mode, int, flags)
1944 {
1945     int errno;
1946     struct v_dnode *dnode;
1947     struct v_inode* inode;
1948
1949     errno = vfs_walkat(fd, path, flags, &dnode);
1950     if (errno) {
1951         goto done;
1952     }
1953
1954     errno = vfs_check_writable(dnode);
1955     if (errno) {
1956         return errno;
1957     }
1958
1959     inode = dnode->inode;
1960     lock_inode(inode);
1961
1962     if (!current_is_root()) {
1963         mode = mode & FSACL_RWXMASK;
1964     }
1965
1966     inode->acl = mode;
1967     __vfs_touch_inode(inode, INODE_MODIFY);
1968
1969     unlock_inode(inode);
1970
1971 done:
1972     return DO_STATUS(errno);
1973 }
1974
1975 __DEFINE_LXSYSCALL5(int, fchownat, int, fd,
1976                     const char*, path, uid_t, uid, gid_t, gid, int, flags)
1977 {
1978     int errno;
1979     struct v_dnode *dnode;
1980     struct v_inode *inode;
1981
1982     errno = vfs_walkat(fd, path, flags, &dnode);
1983     if (errno) {
1984         goto done;
1985     }
1986
1987     errno = vfs_check_writable(dnode);
1988     if (errno) {
1989         return errno;
1990     }
1991
1992     inode = dnode->inode;
1993     lock_inode(inode);
1994
1995     inode->uid = uid;
1996     inode->gid = gid;
1997     __vfs_touch_inode(inode, INODE_MODIFY);
1998
1999     unlock_inode(inode);
2000
2001 done:
2002     return DO_STATUS(errno);
2003 }
2004
2005 __DEFINE_LXSYSCALL4(int, faccessat, int, fd,
2006                     const char*, path, int, amode, int, flags)
2007 {
2008     int errno, acl;
2009     struct v_dnode *dnode;
2010     struct v_inode *inode;
2011     struct user_scope* uscope;
2012
2013     uid_t tuid;
2014     gid_t tgid;
2015
2016     errno = vfs_walkat(fd, path, flags, &dnode);
2017     if (errno) {
2018         goto done;
2019     }
2020
2021     if ((flags & AT_EACCESS)) {
2022         tuid = current_euid();
2023         tgid = current_egid();
2024     }
2025     else {
2026         uscope = current_user_scope();
2027         tuid = uscope->ruid;
2028         tgid = uscope->rgid;
2029     }
2030
2031     inode = dnode->inode;
2032
2033     acl  = inode->acl;
2034     acl &= amode;
2035     acl &= check_acl_between(inode->uid, inode->gid, tuid, tgid);
2036     if (!acl) {
2037         errno = EACCESS;
2038     }
2039
2040 done:
2041     return DO_STATUS(errno);
2042 }