lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
   4  * @brief Lunaix virtual file system - an abstraction layer for all file system.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
  14 /*
  15  TODO vfs & device todos checklist
  16
  17     It is overseen by Twilight Sparkle ;)
  18
  19  1. Get inodes hooked into lru (CHECKED)
  20  2. Get dnodes hooked into lru (CHECKED)
 3. Get inodes properly hashed so they can be reused by the underlying fs (CHECKED)
  22  4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
  23         [good idea] or a constructor/destructor pattern in cake allocator ?
 5. (mount) Figure out a way to identify a busy mount point before unmount
            maybe a unified mount_point structure that maintains a reference
            counter on any dnodes within the subtree? Such a counter will only
            increment if a file is opened or a dnode is being used as working
            directory, and decrement conversely. (CHECKED)
  29  6. (mount) Ability to track all mount points (including sub-mounts)
  30             so we can be confident to clean up everything when we
  31             unmount. (CHECKED)
  32  7. (mount) Figure out a way to acquire the device represented by a dnode.
  33             so it can be used to mount. (e.g. we wish to get `struct device*`
  34             out of the dnode at /dev/sda)
  35             [tip] we should pay attention at twifs and add a private_data field
  36             under struct v_dnode? (CHECKED)
  37  8. (mount) Then, we should refactor on mount/unmount mechanism. (CHECKED)
  38  9. (mount) (future) Ability to mount any thing? e.g. Linux can mount a disk
  39                     image file using a so called "loopback" pseudo device. Maybe
  40                     we can do similar thing in Lunaix? A block device emulation
  41                     above the regular file when we mount it on.
  42  10. (device) device number (dev_t) allocation
  43             [good idea] <class>:<subclass>:<uniq_id> composition (CHECKED)
  44 */
  45
  46 #include <klibc/string.h>
  47 #include <lunaix/foptions.h>
  48 #include <lunaix/fs.h>
  49 #include <lunaix/mm/cake.h>
  50 #include <lunaix/mm/page.h>
  51 #include <lunaix/mm/valloc.h>
  52 #include <lunaix/process.h>
  53 #include <lunaix/spike.h>
  54 #include <lunaix/syscall.h>
  55 #include <lunaix/syscall_utils.h>
  56
  57 #include <lunaix/fs/twifs.h>
  58
  59 #include <usr/lunaix/dirent_defs.h>
  60
  61 static struct cake_pile* dnode_pile;
  62 static struct cake_pile* inode_pile;
  63 static struct cake_pile* file_pile;
  64 static struct cake_pile* superblock_pile;
  65 static struct cake_pile* fd_pile;
  66
  67 struct v_dnode* vfs_sysroot;
  68 static struct hbucket* dnode_cache;
  69
  70 struct lru_zone *dnode_lru, *inode_lru;
  71
  72 struct hstr vfs_ddot = HSTR("..", 2);
  73 struct hstr vfs_dot = HSTR(".", 1);
  74 struct hstr vfs_empty = HSTR("", 0);
  75
  76 struct v_superblock*
  77 vfs_sb_alloc();
  78
  79 void
  80 vfs_sb_free(struct v_superblock* sb);
  81
  82 static int
  83 __vfs_try_evict_dnode(struct lru_node* obj);
  84
  85 static int
  86 __vfs_try_evict_inode(struct lru_node* obj);
  87
  88 void
  89 vfs_init()
  90 {
  91     // 为他们专门创建一个蛋糕堆，而不使用valloc，这样我们可以最小化内碎片的产生
  92     dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
  93     inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
  94     file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
  95     fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
  96     superblock_pile =
  97       cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
  98
  99     dnode_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 100
 101     dnode_lru = lru_new_zone(__vfs_try_evict_dnode);
 102     inode_lru = lru_new_zone(__vfs_try_evict_inode);
 103
 104     hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
 105     hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
 106
 107     // 创建一个根dnode。
 108     vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
 109     vfs_sysroot->parent = vfs_sysroot;
 110     atomic_fetch_add(&vfs_sysroot->ref_count, 1);
 111 }
 112
 113 static inline struct hbucket*
 114 __dcache_hash(struct v_dnode* parent, u32_t* hash)
 115 {
 116     u32_t _hash = *hash;
 117     // 确保低位更加随机
 118     _hash = _hash ^ (_hash >> VFS_HASHBITS);
 119     // 与parent的指针值做加法，来减小碰撞的可能性。
 120     _hash += (u32_t)parent;
 121     *hash = _hash;
 122     return &dnode_cache[_hash & VFS_HASH_MASK];
 123 }
 124
 125 struct v_dnode*
 126 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 127 {
 128     if (!str->len || HSTR_EQ(str, &vfs_dot))
 129         return parent;
 130
 131     if (HSTR_EQ(str, &vfs_ddot)) {
 132         return parent->parent;
 133     }
 134
 135     u32_t hash = str->hash;
 136     struct hbucket* slot = __dcache_hash(parent, &hash);
 137
 138     struct v_dnode *pos, *n;
 139     hashtable_bucket_foreach(slot, pos, n, hash_list)
 140     {
 141         if (pos->name.hash == hash) {
 142             return pos;
 143         }
 144     }
 145     return NULL;
 146 }
 147
 148 void
 149 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 150 {
 151     assert(parent);
 152
 153     atomic_fetch_add(&dnode->ref_count, 1);
 154     dnode->parent = parent;
 155     llist_append(&parent->children, &dnode->siblings);
 156
 157     struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash);
 158     hlist_add(&bucket->head, &dnode->hash_list);
 159 }
 160
 161 void
 162 vfs_dcache_remove(struct v_dnode* dnode)
 163 {
 164     assert(dnode);
 165     assert(dnode->ref_count == 1);
 166
 167     llist_delete(&dnode->siblings);
 168     llist_delete(&dnode->aka_list);
 169     hlist_delete(&dnode->hash_list);
 170
 171     dnode->parent = NULL;
 172     atomic_fetch_sub(&dnode->ref_count, 1);
 173 }
 174
 175 void
 176 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
 177 {
 178     assert(new_parent);
 179
 180     hstr_rehash(&dnode->name, HSTR_FULL_HASH);
 181     vfs_dcache_remove(dnode);
 182     vfs_dcache_add(new_parent, dnode);
 183 }
 184
 185 int
 186 vfs_open(struct v_dnode* dnode, struct v_file** file)
 187 {
 188     if (!dnode->inode || !dnode->inode->ops->open) {
 189         return ENOTSUP;
 190     }
 191
 192     struct v_inode* inode = dnode->inode;
 193
 194     lock_inode(inode);
 195
 196     struct v_file* vfile = cake_grab(file_pile);
 197     memset(vfile, 0, sizeof(*vfile));
 198
 199     vfile->dnode = dnode;
 200     vfile->inode = inode;
 201     vfile->ref_count = ATOMIC_VAR_INIT(1);
 202     vfile->ops = inode->default_fops;
 203
 204     if ((inode->itype & F_MFILE) && !inode->pg_cache) {
 205         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 206         pcache_init(pcache);
 207         pcache->master = inode;
 208         inode->pg_cache = pcache;
 209     }
 210
 211     int errno = inode->ops->open(inode, vfile);
 212     if (errno) {
 213         cake_release(file_pile, vfile);
 214     } else {
 215         atomic_fetch_add(&dnode->ref_count, 1);
 216         inode->open_count++;
 217         mnt_mkbusy(dnode->mnt);
 218
 219         *file = vfile;
 220     }
 221
 222     unlock_inode(inode);
 223
 224     return errno;
 225 }
 226
 227 void
 228 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
 229 {
 230     if (assign_to->inode) {
 231         llist_delete(&assign_to->aka_list);
 232         assign_to->inode->link_count--;
 233     }
 234     llist_append(&inode->aka_dnodes, &assign_to->aka_list);
 235     assign_to->inode = inode;
 236     inode->link_count++;
 237 }
 238
 239 int
 240 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 241 {
 242     int errno;
 243
 244     if ((errno = vfs_check_writable(to_link))) {
 245         return errno;
 246     }
 247
 248     lock_inode(to_link->inode);
 249     if (to_link->super_block->root != name->super_block->root) {
 250         errno = EXDEV;
 251     } else if (!to_link->inode->ops->link) {
 252         errno = ENOTSUP;
 253     } else if (!(errno = to_link->inode->ops->link(to_link->inode, name))) {
 254         vfs_assign_inode(name, to_link->inode);
 255     }
 256     unlock_inode(to_link->inode);
 257
 258     return errno;
 259 }
 260
 261 int
 262 vfs_pclose(struct v_file* file, pid_t pid)
 263 {
 264     int errno = 0;
 265     if (file->ref_count > 1) {
 266         atomic_fetch_sub(&file->ref_count, 1);
 267     } else if (!(errno = file->ops->close(file))) {
 268         atomic_fetch_sub(&file->dnode->ref_count, 1);
 269         file->inode->open_count--;
 270
 271         /*
 272          * Prevent dead lock.
 273          * This happened when process is terminated while blocking on read.
 274          * In that case, the process is still holding the inode lock and it
 275              will never get released.
 276          * The unlocking should also include ownership check.
 277          *
 278          * To see why, consider two process both open the same file both with
 279          * fd=x.
 280          *      Process A: busy on reading x
 281          *      Process B: do nothing with x
 282          * Assuming that, after a very short time, process B get terminated
 283          * while process A is still busy in it's reading business. By this
 284          * design, the inode lock of this file x is get released by B rather
 285          * than A. And this will cause a probable race condition on A if other
 286          * process is writing to this file later after B exit.
 287          */
 288         if (mutex_on_hold(&file->inode->lock)) {
 289             mutex_unlock_for(&file->inode->lock, pid);
 290         }
 291         mnt_chillax(file->dnode->mnt);
 292
 293         pcache_commit_all(file->inode);
 294         cake_release(file_pile, file);
 295     }
 296     return errno;
 297 }
 298
 299 int
 300 vfs_close(struct v_file* file)
 301 {
 302     return vfs_pclose(file, __current->pid);
 303 }
 304
 305 void
 306 vfs_free_fd(struct v_fd* fd)
 307 {
 308     cake_release(fd_pile, fd);
 309 }
 310
 311 int
 312 vfs_fsync(struct v_file* file)
 313 {
 314     int errno;
 315     if ((errno = vfs_check_writable(file->dnode))) {
 316         return errno;
 317     }
 318
 319     lock_inode(file->inode);
 320
 321     pcache_commit_all(file->inode);
 322
 323     errno = ENOTSUP;
 324     if (file->ops->sync) {
 325         errno = file->ops->sync(file);
 326     }
 327
 328     unlock_inode(file->inode);
 329
 330     return errno;
 331 }
 332
 333 int
 334 vfs_alloc_fdslot(int* fd)
 335 {
 336     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 337         if (!__current->fdtable->fds[i]) {
 338             *fd = i;
 339             return 0;
 340         }
 341     }
 342     return EMFILE;
 343 }
 344
 345 struct v_superblock*
 346 vfs_sb_alloc()
 347 {
 348     struct v_superblock* sb = cake_grab(superblock_pile);
 349     memset(sb, 0, sizeof(*sb));
 350     llist_init_head(&sb->sb_list);
 351     sb->i_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 352     return sb;
 353 }
 354
 355 void
 356 vfs_sb_free(struct v_superblock* sb)
 357 {
 358     vfree(sb->i_cache);
 359     cake_release(superblock_pile, sb);
 360 }
 361
 362 static int
 363 __vfs_try_evict_dnode(struct lru_node* obj)
 364 {
 365     struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
 366
 367     if (!dnode->ref_count) {
 368         vfs_d_free(dnode);
 369         return 1;
 370     }
 371     return 0;
 372 }
 373
 374 static int
 375 __vfs_try_evict_inode(struct lru_node* obj)
 376 {
 377     struct v_inode* inode = container_of(obj, struct v_inode, lru);
 378
 379     if (!inode->link_count && !inode->open_count) {
 380         vfs_i_free(inode);
 381         return 1;
 382     }
 383     return 0;
 384 }
 385
 386 struct v_dnode*
 387 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
 388 {
 389     struct v_dnode* dnode = cake_grab(dnode_pile);
 390     if (!dnode) {
 391         lru_evict_half(dnode_lru);
 392
 393         if (!(dnode = cake_grab(dnode_pile))) {
 394             return NULL;
 395         }
 396     }
 397
 398     memset(dnode, 0, sizeof(*dnode));
 399     llist_init_head(&dnode->children);
 400     llist_init_head(&dnode->siblings);
 401     llist_init_head(&dnode->aka_list);
 402     mutex_init(&dnode->lock);
 403
 404     dnode->ref_count = ATOMIC_VAR_INIT(0);
 405     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 406
 407     hstrcpy(&dnode->name, name);
 408
 409     if (parent) {
 410         dnode->super_block = parent->super_block;
 411         dnode->mnt = parent->mnt;
 412     }
 413
 414     lru_use_one(dnode_lru, &dnode->lru);
 415
 416     return dnode;
 417 }
 418
 419 void
 420 vfs_d_free(struct v_dnode* dnode)
 421 {
 422     assert(dnode->ref_count == 1);
 423
 424     if (dnode->inode) {
 425         assert(dnode->inode->link_count > 0);
 426         dnode->inode->link_count--;
 427     }
 428
 429     vfs_dcache_remove(dnode);
 430     // Make sure the children de-referencing their parent.
 431     // With lru presented, the eviction will be propagated over the entire
 432     // detached subtree eventually
 433     struct v_dnode *pos, *n;
 434     llist_for_each(pos, n, &dnode->children, siblings)
 435     {
 436         vfs_dcache_remove(pos);
 437     }
 438
 439     vfree((void*)dnode->name.value);
 440     cake_release(dnode_pile, dnode);
 441 }
 442
 443 struct v_inode*
 444 vfs_i_find(struct v_superblock* sb, u32_t i_id)
 445 {
 446     struct hbucket* slot = &sb->i_cache[i_id & VFS_HASH_MASK];
 447     struct v_inode *pos, *n;
 448     hashtable_bucket_foreach(slot, pos, n, hash_list)
 449     {
 450         if (pos->id == i_id) {
 451             lru_use_one(inode_lru, &pos->lru);
 452             return pos;
 453         }
 454     }
 455
 456     return NULL;
 457 }
 458
 459 void
 460 vfs_i_addhash(struct v_inode* inode)
 461 {
 462     struct hbucket* slot = &inode->sb->i_cache[inode->id & VFS_HASH_MASK];
 463
 464     hlist_delete(&inode->hash_list);
 465     hlist_add(&slot->head, &inode->hash_list);
 466 }
 467
 468 struct v_inode*
 469 vfs_i_alloc(struct v_superblock* sb)
 470 {
 471     assert(sb->ops.init_inode);
 472
 473     struct v_inode* inode;
 474     if (!(inode = cake_grab(inode_pile))) {
 475         lru_evict_half(inode_lru);
 476         if (!(inode = cake_grab(inode_pile))) {
 477             return NULL;
 478         }
 479     }
 480
 481     memset(inode, 0, sizeof(*inode));
 482     mutex_init(&inode->lock);
 483     llist_init_head(&inode->xattrs);
 484     llist_init_head(&inode->aka_dnodes);
 485
 486     sb->ops.init_inode(sb, inode);
 487
 488     inode->sb = sb;
 489     inode->ctime = clock_unixtime();
 490     inode->atime = inode->ctime;
 491     inode->mtime = inode->ctime;
 492
 493     lru_use_one(inode_lru, &inode->lru);
 494     return inode;
 495 }
 496
 497 void
 498 vfs_i_free(struct v_inode* inode)
 499 {
 500     if (inode->pg_cache) {
 501         pcache_release(inode->pg_cache);
 502         vfree(inode->pg_cache);
 503     }
 504     // we don't need to sync inode.
 505     // If an inode can be free, then it must be properly closed.
 506     // Hence it must be synced already!
 507     if (inode->destruct) {
 508         inode->destruct(inode);
 509     }
 510     hlist_delete(&inode->hash_list);
 511     cake_release(inode_pile, inode);
 512 }
 513
 514 /* ---- System call definition and support ---- */
 515
 516 #define FLOCATE_CREATE_EMPTY 1
 517 #define FLOCATE_CREATE_ONLY 2
 518 #define FLOCATE_NOFOLLOW 4
 519
 520 int
 521 vfs_getfd(int fd, struct v_fd** fd_s)
 522 {
 523     if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) {
 524         return 0;
 525     }
 526     return EBADF;
 527 }
 528
 529 int
 530 __vfs_try_locate_file(const char* path,
 531                       struct v_dnode** fdir,
 532                       struct v_dnode** file,
 533                       int options)
 534 {
 535     char name_str[VFS_NAME_MAXLEN];
 536     struct hstr name = HSTR(name_str, 0);
 537     int errno, woption = 0;
 538
 539     if ((options & FLOCATE_NOFOLLOW)) {
 540         woption |= VFS_WALK_NOFOLLOW;
 541     }
 542
 543     name_str[0] = 0;
 544     if ((errno = vfs_walk_proc(path, fdir, &name, woption | VFS_WALK_PARENT))) {
 545         return errno;
 546     }
 547
 548     errno = vfs_walk(*fdir, name.value, file, NULL, woption);
 549
 550     if (errno != ENOENT && (options & FLOCATE_CREATE_ONLY)) {
 551         return EEXIST;
 552     }
 553
 554     if (errno != ENOENT ||
 555         !(options & (FLOCATE_CREATE_EMPTY | FLOCATE_CREATE_ONLY))) {
 556         return errno;
 557     }
 558
 559     struct v_dnode* parent = *fdir;
 560     struct v_dnode* file_new = vfs_d_alloc(parent, &name);
 561
 562     if (!file_new) {
 563         return ENOMEM;
 564     }
 565
 566     lock_dnode(parent);
 567
 568     if (!(errno = parent->inode->ops->create(parent->inode, file_new))) {
 569         vfs_dcache_add(parent, file_new);
 570         *file = file_new;
 571     } else {
 572         vfs_d_free(file_new);
 573     }
 574
 575     unlock_dnode(parent);
 576
 577     return errno;
 578 }
 579
 580 int
 581 vfs_do_open(const char* path, int options)
 582 {
 583     int errno, fd, loptions = 0;
 584     struct v_dnode *dentry, *file;
 585     struct v_file* ofile = NULL;
 586
 587     if ((options & FO_CREATE)) {
 588         loptions |= FLOCATE_CREATE_EMPTY;
 589     } else if ((options & FO_NOFOLLOW)) {
 590         loptions |= FLOCATE_NOFOLLOW;
 591     }
 592
 593     errno = __vfs_try_locate_file(path, &dentry, &file, loptions);
 594
 595     if (!errno && !(errno = vfs_alloc_fdslot(&fd))) {
 596
 597         if (errno || (errno = vfs_open(file, &ofile))) {
 598             return errno;
 599         }
 600
 601         struct v_fd* fd_s = cake_grab(fd_pile);
 602         memset(fd_s, 0, sizeof(*fd_s));
 603
 604         ofile->f_pos = ofile->inode->fsize & -((options & FO_APPEND) != 0);
 605         fd_s->file = ofile;
 606         fd_s->flags = options;
 607         __current->fdtable->fds[fd] = fd_s;
 608         return fd;
 609     }
 610
 611     return errno;
 612 }
 613
 614 __DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
 615 {
 616     int errno = vfs_do_open(path, options);
 617     return DO_STATUS_OR_RETURN(errno);
 618 }
 619
 620 __DEFINE_LXSYSCALL1(int, close, int, fd)
 621 {
 622     struct v_fd* fd_s;
 623     int errno = 0;
 624     if ((errno = vfs_getfd(fd, &fd_s))) {
 625         goto done_err;
 626     }
 627
 628     if ((errno = vfs_close(fd_s->file))) {
 629         goto done_err;
 630     }
 631
 632     cake_release(fd_pile, fd_s);
 633     __current->fdtable->fds[fd] = 0;
 634
 635 done_err:
 636     return DO_STATUS(errno);
 637 }
 638
 639 void
 640 __vfs_readdir_callback(struct dir_context* dctx,
 641                        const char* name,
 642                        const int len,
 643                        const int dtype)
 644 {
 645     struct lx_dirent* dent = (struct lx_dirent*)dctx->cb_data;
 646     strncpy(dent->d_name, name, DIRENT_NAME_MAX_LEN);
 647     dent->d_nlen = len;
 648     dent->d_type = dtype;
 649 }
 650
 651 __DEFINE_LXSYSCALL2(int, sys_readdir, int, fd, struct lx_dirent*, dent)
 652 {
 653     struct v_fd* fd_s;
 654     int errno;
 655
 656     if ((errno = vfs_getfd(fd, &fd_s))) {
 657         goto done;
 658     }
 659
 660     struct v_inode* inode = fd_s->file->inode;
 661
 662     lock_inode(inode);
 663
 664     if ((inode->itype & F_FILE)) {
 665         errno = ENOTDIR;
 666     } else {
 667         struct dir_context dctx = (struct dir_context){
 668           .cb_data = dent,
 669           .index = dent->d_offset,
 670           .read_complete_callback = __vfs_readdir_callback};
 671         errno = 1;
 672         if (dent->d_offset == 0) {
 673             __vfs_readdir_callback(&dctx, vfs_dot.value, vfs_dot.len, DT_DIR);
 674         } else if (dent->d_offset == 1) {
 675             __vfs_readdir_callback(&dctx, vfs_ddot.value, vfs_ddot.len, DT_DIR);
 676         } else {
 677             dctx.index -= 2;
 678             if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
 679                 unlock_inode(inode);
 680                 goto done;
 681             }
 682         }
 683         dent->d_offset++;
 684     }
 685
 686     unlock_inode(inode);
 687
 688 done:
 689     return DO_STATUS_OR_RETURN(errno);
 690 }
 691
 692 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 693 {
 694     int errno = 0;
 695     struct v_fd* fd_s;
 696     if ((errno = vfs_getfd(fd, &fd_s))) {
 697         goto done;
 698     }
 699
 700     struct v_file* file = fd_s->file;
 701     if (!(file->inode->itype & F_FILE)) {
 702         errno = EISDIR;
 703         goto done;
 704     }
 705
 706     lock_inode(file->inode);
 707
 708     file->inode->atime = clock_unixtime();
 709
 710     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 711         errno = file->ops->read(file->inode, buf, count, file->f_pos);
 712     } else {
 713         errno = pcache_read(file->inode, buf, count, file->f_pos);
 714     }
 715
 716     if (errno > 0) {
 717         file->f_pos += errno;
 718         unlock_inode(file->inode);
 719         return errno;
 720     }
 721
 722     unlock_inode(file->inode);
 723
 724 done:
 725     return DO_STATUS(errno);
 726 }
 727
 728 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
 729 {
 730     int errno = 0;
 731     struct v_fd* fd_s;
 732     if ((errno = vfs_getfd(fd, &fd_s))) {
 733         goto done;
 734     }
 735
 736     struct v_file* file = fd_s->file;
 737
 738     if ((errno = vfs_check_writable(file->dnode))) {
 739         goto done;
 740     }
 741
 742     if (!(file->inode->itype & F_FILE)) {
 743         errno = EISDIR;
 744         goto done;
 745     }
 746
 747     lock_inode(file->inode);
 748
 749     file->inode->mtime = clock_unixtime();
 750
 751     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 752         errno = file->ops->write(file->inode, buf, count, file->f_pos);
 753     } else {
 754         errno = pcache_write(file->inode, buf, count, file->f_pos);
 755     }
 756
 757     if (errno > 0) {
 758         file->f_pos += errno;
 759         unlock_inode(file->inode);
 760         return errno;
 761     }
 762
 763     unlock_inode(file->inode);
 764
 765 done:
 766     return DO_STATUS(errno);
 767 }
 768
 769 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
 770 {
 771     int errno = 0;
 772     struct v_fd* fd_s;
 773     if ((errno = vfs_getfd(fd, &fd_s))) {
 774         goto done;
 775     }
 776
 777     struct v_file* file = fd_s->file;
 778
 779     if (!file->ops->seek) {
 780         errno = ENOTSUP;
 781         goto done;
 782     }
 783
 784     lock_inode(file->inode);
 785
 786     int overflow = 0;
 787     int fpos = file->f_pos;
 788     switch (options) {
 789         case FSEEK_CUR:
 790             overflow = sadd_overflow((int)file->f_pos, offset, &fpos);
 791             break;
 792         case FSEEK_END:
 793             overflow = sadd_overflow((int)file->inode->fsize, offset, &fpos);
 794             break;
 795         case FSEEK_SET:
 796             fpos = offset;
 797             break;
 798     }
 799     if (overflow) {
 800         errno = EOVERFLOW;
 801     } else if (!(errno = file->ops->seek(file->inode, fpos))) {
 802         file->f_pos = fpos;
 803     }
 804
 805     unlock_inode(file->inode);
 806
 807 done:
 808     return DO_STATUS(errno);
 809 }
 810
 811 int
 812 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
 813 {
 814     if (!dnode) {
 815         return 0;
 816     }
 817
 818     if (depth > 64) {
 819         return ENAMETOOLONG;
 820     }
 821
 822     size_t len = 0;
 823
 824     if (dnode->parent != dnode) {
 825         len = vfs_get_path(dnode->parent, buf, size, depth + 1);
 826     }
 827
 828     if (len >= size) {
 829         return len;
 830     }
 831
 832     if (!len || buf[len - 1] != VFS_PATH_DELIM) {
 833         buf[len++] = VFS_PATH_DELIM;
 834     }
 835
 836     size_t cpy_size = MIN(dnode->name.len, size - len);
 837     strncpy(buf + len, dnode->name.value, cpy_size);
 838     len += cpy_size;
 839
 840     return len;
 841 }
 842
 843 int
 844 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
 845 {
 846     const char* link;
 847     struct v_inode* inode = dnode->inode;
 848     if (inode->ops->read_symlink) {
 849         lock_inode(inode);
 850
 851         int errno = inode->ops->read_symlink(inode, &link);
 852         strncpy(buf, link, size);
 853
 854         unlock_inode(inode);
 855         return errno;
 856     }
 857     return 0;
 858 }
 859
 860 int
 861 vfs_get_dtype(int itype)
 862 {
 863     if ((itype & VFS_IFSYMLINK) == VFS_IFSYMLINK) {
 864         return DT_SYMLINK;
 865     } else if (!(itype & VFS_IFFILE)) {
 866         return DT_DIR;
 867     } else {
 868         return DT_FILE;
 869     }
 870 }
 871
 872 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
 873 {
 874     int errno;
 875     struct v_fd* fd_s;
 876     if ((errno = vfs_getfd(fd, &fd_s))) {
 877         goto done;
 878     }
 879
 880     struct v_dnode* dnode;
 881     errno = vfs_get_path(fd_s->file->dnode, buf, size, 0);
 882
 883     if (errno >= 0) {
 884         return errno;
 885     }
 886
 887 done:
 888     return DO_STATUS(errno);
 889 }
 890
 891 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
 892 {
 893     int errno;
 894     struct v_dnode* dnode;
 895     if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 896         errno = vfs_readlink(dnode, buf, size);
 897     }
 898
 899     if (errno >= 0) {
 900         return errno;
 901     }
 902
 903     return DO_STATUS(errno);
 904 }
 905
 906 __DEFINE_LXSYSCALL4(
 907   int, readlinkat, int, dirfd, const char*, pathname, char*, buf, size_t, size)
 908 {
 909     int errno;
 910     struct v_fd* fd_s;
 911     if ((errno = vfs_getfd(dirfd, &fd_s))) {
 912         goto done;
 913     }
 914
 915     pathname = pathname ? pathname : "";
 916
 917     struct v_dnode* dnode;
 918     if (!(errno = vfs_walk(
 919             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 920         errno = vfs_readlink(fd_s->file->dnode, buf, size);
 921     }
 922
 923     if (errno >= 0) {
 924         return errno;
 925     }
 926
 927 done:
 928     return DO_STATUS(errno);
 929 }
 930
/*
    NOTE
    When we perform an operation that could affect the layout of a
    directory (i.e., rename, mkdir, rmdir), we must lock the parent dir
    whenever possible. This blocks any ongoing path walk from reaching
    it, hence avoiding any partially updated state.
*/
 938
 939 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
 940 {
 941     int errno;
 942     struct v_dnode* dnode;
 943     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
 944         return DO_STATUS(errno);
 945     }
 946
 947     lock_dnode(dnode);
 948
 949     if ((errno = vfs_check_writable(dnode))) {
 950         goto done;
 951     }
 952
 953     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
 954         errno = EROFS;
 955         goto done;
 956     }
 957
 958     if (dnode->ref_count > 1 || dnode->inode->open_count) {
 959         errno = EBUSY;
 960         goto done;
 961     }
 962
 963     if (!llist_empty(&dnode->children)) {
 964         errno = ENOTEMPTY;
 965         goto done;
 966     }
 967
 968     struct v_dnode* parent = dnode->parent;
 969
 970     if (!parent) {
 971         errno = EINVAL;
 972         goto done;
 973     }
 974
 975     lock_dnode(parent);
 976     lock_inode(parent->inode);
 977
 978     if (!(dnode->inode->itype & F_MFILE)) {
 979         errno = parent->inode->ops->rmdir(parent->inode, dnode);
 980         if (!errno) {
 981             vfs_dcache_remove(dnode);
 982         }
 983     } else {
 984         errno = ENOTDIR;
 985     }
 986
 987     unlock_inode(parent->inode);
 988     unlock_dnode(parent);
 989
 990 done:
 991     unlock_dnode(dnode);
 992     return DO_STATUS(errno);
 993 }
 994
 995 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
 996 {
 997     int errno = 0;
 998     struct v_dnode *parent, *dir;
 999     char name_value[VFS_NAME_MAXLEN];
1000     struct hstr name = HHSTR(name_value, 0, 0);
1001
1002     if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
1003         goto done;
1004     }
1005
1006     if (!(errno = vfs_walk(parent, name_value, &dir, NULL, 0))) {
1007         errno = EEXIST;
1008         goto done;
1009     }
1010
1011     if ((errno = vfs_check_writable(parent))) {
1012         goto done;
1013     }
1014
1015     if (!(dir = vfs_d_alloc(parent, &name))) {
1016         errno = ENOMEM;
1017         goto done;
1018     }
1019
1020     lock_dnode(parent);
1021     lock_inode(parent->inode);
1022
1023     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
1024         errno = ENOTSUP;
1025     } else if (!parent->inode->ops->mkdir) {
1026         errno = ENOTSUP;
1027     } else if ((parent->inode->itype & F_FILE)) {
1028         errno = ENOTDIR;
1029     } else if (!(errno = parent->inode->ops->mkdir(parent->inode, dir))) {
1030         vfs_dcache_add(parent, dir);
1031         goto cleanup;
1032     }
1033
1034     vfs_d_free(dir);
1035
1036 cleanup:
1037     unlock_inode(parent->inode);
1038     unlock_dnode(parent);
1039 done:
1040     return DO_STATUS(errno);
1041 }
1042
1043 int
1044 __vfs_do_unlink(struct v_dnode* dnode)
1045 {
1046     int errno;
1047     struct v_inode* inode = dnode->inode;
1048
1049     if (dnode->ref_count > 1) {
1050         return EBUSY;
1051     }
1052
1053     if ((errno = vfs_check_writable(dnode))) {
1054         return errno;
1055     }
1056
1057     lock_inode(inode);
1058
1059     if (inode->open_count) {
1060         errno = EBUSY;
1061     } else if ((inode->itype & F_MFILE)) {
1062         errno = inode->ops->unlink(inode);
1063         if (!errno) {
1064             vfs_d_free(dnode);
1065         }
1066     } else {
1067         errno = EISDIR;
1068     }
1069
1070     unlock_inode(inode);
1071
1072     return errno;
1073 }
1074
1075 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1076 {
1077     int errno;
1078     struct v_dnode* dnode;
1079     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1080         goto done;
1081     }
1082
1083     errno = __vfs_do_unlink(dnode);
1084
1085 done:
1086     return DO_STATUS(errno);
1087 }
1088
1089 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1090 {
1091     int errno;
1092     struct v_fd* fd_s;
1093     if ((errno = vfs_getfd(fd, &fd_s))) {
1094         goto done;
1095     }
1096
1097     struct v_dnode* dnode;
1098     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1099         errno = __vfs_do_unlink(dnode);
1100     }
1101
1102 done:
1103     return DO_STATUS(errno);
1104 }
1105
1106 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1107 {
1108     int errno;
1109     struct v_dnode *dentry, *to_link, *name_dentry, *name_file;
1110
1111     errno = __vfs_try_locate_file(oldpath, &dentry, &to_link, 0);
1112     if (!errno) {
1113         errno = __vfs_try_locate_file(
1114           newpath, &name_dentry, &name_file, FLOCATE_CREATE_ONLY);
1115         if (!errno) {
1116             errno = vfs_link(to_link, name_file);
1117         }
1118     }
1119     return DO_STATUS(errno);
1120 }
1121
1122 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1123 {
1124     int errno;
1125     struct v_fd* fd_s;
1126
1127     if (!(errno = vfs_getfd(fildes, &fd_s))) {
1128         errno = vfs_fsync(fd_s->file);
1129     }
1130
1131     return DO_STATUS(errno);
1132 }
1133
1134 int
1135 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1136 {
1137     int errno = 0;
1138     struct v_fd* copied = cake_grab(fd_pile);
1139
1140     memcpy(copied, old, sizeof(struct v_fd));
1141
1142     atomic_fetch_add(&old->file->ref_count, 1);
1143
1144     *new = copied;
1145
1146     return errno;
1147 }
1148
1149 int
1150 vfs_dup2(int oldfd, int newfd)
1151 {
1152     if (newfd == oldfd) {
1153         return newfd;
1154     }
1155
1156     int errno;
1157     struct v_fd *oldfd_s, *newfd_s;
1158     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1159         goto done;
1160     }
1161
1162     if (!TEST_FD(newfd)) {
1163         errno = EBADF;
1164         goto done;
1165     }
1166
1167     newfd_s = __current->fdtable->fds[newfd];
1168     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1169         goto done;
1170     }
1171
1172     if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1173         __current->fdtable->fds[newfd] = newfd_s;
1174         return newfd;
1175     }
1176
1177 done:
1178     return DO_STATUS(errno);
1179 }
1180
1181 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1182 {
1183     return vfs_dup2(oldfd, newfd);
1184 }
1185
1186 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1187 {
1188     int errno, newfd;
1189     struct v_fd *oldfd_s, *newfd_s;
1190     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1191         goto done;
1192     }
1193
1194     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1195         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1196         __current->fdtable->fds[newfd] = newfd_s;
1197         return newfd;
1198     }
1199
1200 done:
1201     return DO_STATUS(errno);
1202 }
1203
1204 __DEFINE_LXSYSCALL2(
1205   int, symlink, const char*, pathname, const char*, link_target)
1206 {
1207     int errno;
1208     struct v_dnode *dnode, *file;
1209     if ((errno = __vfs_try_locate_file(
1210            pathname, &dnode, &file, FLOCATE_CREATE_ONLY))) {
1211         goto done;
1212     }
1213
1214     if ((errno = vfs_check_writable(file))) {
1215         goto done;
1216     }
1217
1218     if (!file->inode->ops->set_symlink) {
1219         errno = ENOTSUP;
1220         goto done;
1221     }
1222
1223     lock_inode(file->inode);
1224
1225     errno = file->inode->ops->set_symlink(file->inode, link_target);
1226
1227     unlock_inode(file->inode);
1228
1229 done:
1230     return DO_STATUS(errno);
1231 }
1232
1233 void
1234 vfs_ref_file(struct v_file* file)
1235 {
1236     atomic_fetch_add(&file->ref_count, 1);
1237 }
1238
1239 void
1240 vfs_ref_dnode(struct v_dnode* dnode)
1241 {
1242     atomic_fetch_add(&dnode->ref_count, 1);
1243
1244     if (dnode->mnt) {
1245         mnt_mkbusy(dnode->mnt);
1246     }
1247 }
1248
1249 void
1250 vfs_unref_dnode(struct v_dnode* dnode)
1251 {
1252     atomic_fetch_sub(&dnode->ref_count, 1);
1253     if (dnode->mnt) {
1254         mnt_chillax(dnode->mnt);
1255     }
1256 }
1257
1258 int
1259 vfs_do_chdir(struct proc_info* proc, struct v_dnode* dnode)
1260 {
1261     int errno = 0;
1262
1263     lock_dnode(dnode);
1264
1265     if ((dnode->inode->itype & F_FILE)) {
1266         errno = ENOTDIR;
1267         goto done;
1268     }
1269
1270     if (proc->cwd) {
1271         vfs_unref_dnode(proc->cwd);
1272     }
1273
1274     vfs_ref_dnode(dnode);
1275     proc->cwd = dnode;
1276
1277     unlock_dnode(dnode);
1278
1279 done:
1280     return errno;
1281 }
1282
1283 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1284 {
1285     struct v_dnode* dnode;
1286     int errno = 0;
1287
1288     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1289         goto done;
1290     }
1291
1292     errno = vfs_do_chdir((struct proc_info*)__current, dnode);
1293
1294 done:
1295     return DO_STATUS(errno);
1296 }
1297
1298 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1299 {
1300     struct v_fd* fd_s;
1301     int errno = 0;
1302
1303     if ((errno = vfs_getfd(fd, &fd_s))) {
1304         goto done;
1305     }
1306
1307     errno = vfs_do_chdir((struct proc_info*)__current, fd_s->file->dnode);
1308
1309 done:
1310     return DO_STATUS(errno);
1311 }
1312
1313 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1314 {
1315     int errno = 0;
1316     char* ret_ptr = 0;
1317     if (size < 2) {
1318         errno = ERANGE;
1319         goto done;
1320     }
1321
1322     size_t len = 0;
1323
1324     if (!__current->cwd) {
1325         *buf = VFS_PATH_DELIM;
1326         len = 1;
1327     } else {
1328         len = vfs_get_path(__current->cwd, buf, size, 0);
1329         if (len == size) {
1330             errno = ERANGE;
1331             goto done;
1332         }
1333     }
1334
1335     buf[len] = '\0';
1336
1337     ret_ptr = buf;
1338
1339 done:
1340     syscall_result(errno);
1341     return ret_ptr;
1342 }
1343
1344 int
1345 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1346 {
1347     int errno = 0;
1348     if (current->inode->id == target->inode->id) {
1349         // hard link
1350         return 0;
1351     }
1352
1353     if ((errno = vfs_check_writable(current))) {
1354         return errno;
1355     }
1356
1357     if (current->ref_count > 1 || target->ref_count > 1) {
1358         return EBUSY;
1359     }
1360
1361     if (current->super_block != target->super_block) {
1362         return EXDEV;
1363     }
1364
1365     struct v_dnode* oldparent = current->parent;
1366     struct v_dnode* newparent = target->parent;
1367
1368     lock_dnode(current);
1369     lock_dnode(target);
1370     if (oldparent)
1371         lock_dnode(oldparent);
1372     if (newparent)
1373         lock_dnode(newparent);
1374
1375     if (!llist_empty(&target->children)) {
1376         errno = ENOTEMPTY;
1377         unlock_dnode(target);
1378         goto cleanup;
1379     }
1380
1381     if ((errno =
1382            current->inode->ops->rename(current->inode, current, target))) {
1383         unlock_dnode(target);
1384         goto cleanup;
1385     }
1386
1387     // re-position current
1388     hstrcpy(&current->name, &target->name);
1389     vfs_dcache_rehash(newparent, current);
1390
1391     // detach target
1392     vfs_d_free(target);
1393
1394     unlock_dnode(target);
1395
1396 cleanup:
1397     unlock_dnode(current);
1398     if (oldparent)
1399         unlock_dnode(oldparent);
1400     if (newparent)
1401         unlock_dnode(newparent);
1402
1403     return errno;
1404 }
1405
1406 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1407 {
1408     struct v_dnode *cur, *target_parent, *target;
1409     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1410     int errno = 0;
1411
1412     if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1413         goto done;
1414     }
1415
1416     if ((errno = vfs_walk(
1417            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1418         goto done;
1419     }
1420
1421     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1422     if (errno == ENOENT) {
1423         target = vfs_d_alloc(target_parent, &name);
1424         vfs_dcache_add(target_parent, target);
1425     } else if (errno) {
1426         goto done;
1427     }
1428
1429     if (!target) {
1430         errno = ENOMEM;
1431         goto done;
1432     }
1433
1434     errno = vfs_do_rename(cur, target);
1435
1436 done:
1437     vfree((void*)name.value);
1438     return DO_STATUS(errno);
1439 }
1440
1441 __DEFINE_LXSYSCALL2(int, fstat, int, fd, struct file_stat*, stat)
1442 {
1443     int errno = 0;
1444     struct v_fd* fds;
1445
1446     if ((errno = vfs_getfd(fd, &fds))) {
1447         goto done;
1448     }
1449
1450     struct v_inode* vino = fds->file->inode;
1451     struct device* fdev = vino->sb->dev;
1452
1453     *stat = (struct file_stat){.st_ino = vino->id,
1454                                .st_blocks = vino->lb_usage,
1455                                .st_size = vino->fsize,
1456                                .mode = vino->itype,
1457                                .st_ioblksize = PG_SIZE,
1458                                .st_blksize = vino->sb->blksize};
1459
1460     if (VFS_DEVFILE(vino->itype)) {
1461         struct device* rdev = resolve_device(vino->data);
1462         if (!rdev || rdev->magic != DEV_STRUCT_MAGIC) {
1463             errno = EINVAL;
1464             goto done;
1465         }
1466
1467         stat->st_rdev = (dev_t){.meta = rdev->ident.fn_grp,
1468                                 .unique = rdev->ident.unique,
1469                                 .index = rdev->dev_uid};
1470     }
1471
1472     if (fdev) {
1473         stat->st_dev = (dev_t){.meta = fdev->ident.fn_grp,
1474                                .unique = fdev->ident.unique,
1475                                .index = fdev->dev_uid};
1476     }
1477
1478 done:
1479     return DO_STATUS(errno);
1480 }