lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
   4  * @brief Lunaix virtual file system - an abstraction layer for all file system.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
/*
 TODO vfs & device todos checklist

    It is overseen by Twilight Sparkle ;)

 1. Get inodes hooked into lru (CHECKED)
 2. Get dnodes hooked into lru (CHECKED)
 3. Get inodes properly hashed so they can be reused by the underlying fs
    (CHECKED)
 4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
        [good idea] or a constructor/destructor pattern in cake allocator ?
 5. (mount) Figure out a way to identify a busy mount point before unmount
            maybe a unified mount_point structure that maintains a reference
            counter on any dnodes within the subtree? Such a counter will only
            increment if a file is opened or a dnode is being used as working
            directory, and decrement conversely. (CHECKED)
 6. (mount) Ability to track all mount points (including sub-mounts)
            so we can be confident to clean up everything when we
            unmount. (CHECKED)
 7. (mount) Figure out a way to acquire the device represented by a dnode.
            so it can be used to mount. (e.g. we wish to get `struct device*`
            out of the dnode at /dev/sda)
            [tip] we should pay attention to twifs and add a private_data field
            under struct v_dnode? (CHECKED)
 8. (mount) Then, we should refactor the mount/unmount mechanism. (CHECKED)
 9. (mount) (future) Ability to mount anything? e.g. Linux can mount a disk
                    image file using a so called "loopback" pseudo device. Maybe
                    we can do a similar thing in Lunaix? A block device
                    emulation above the regular file when we mount it on.
 10. (device) device number (dev_t) allocation
            [good idea] <class>:<subclass>:<uniq_id> composition
*/
  45
  46 #include <klibc/string.h>
  47 #include <lunaix/dirent.h>
  48 #include <lunaix/foptions.h>
  49 #include <lunaix/fs.h>
  50 #include <lunaix/mm/cake.h>
  51 #include <lunaix/mm/page.h>
  52 #include <lunaix/mm/valloc.h>
  53 #include <lunaix/process.h>
  54 #include <lunaix/spike.h>
  55 #include <lunaix/syscall.h>
  56
  57 #include <lunaix/fs/twifs.h>
  58
  59 static struct cake_pile* dnode_pile;
  60 static struct cake_pile* inode_pile;
  61 static struct cake_pile* file_pile;
  62 static struct cake_pile* superblock_pile;
  63 static struct cake_pile* fd_pile;
  64
  65 struct v_dnode* vfs_sysroot;
  66 static struct hbucket* dnode_cache;
  67
  68 struct lru_zone *dnode_lru, *inode_lru;
  69
  70 struct hstr vfs_ddot = HSTR("..", 2);
  71 struct hstr vfs_dot = HSTR(".", 1);
  72 struct hstr vfs_empty = HSTR("", 0);
  73
  74 struct v_superblock*
  75 vfs_sb_alloc();
  76
  77 void
  78 vfs_sb_free(struct v_superblock* sb);
  79
  80 static int
  81 __vfs_try_evict_dnode(struct lru_node* obj);
  82
  83 static int
  84 __vfs_try_evict_inode(struct lru_node* obj);
  85
  86 void
  87 vfs_init()
  88 {
  89     // 为他们专门创建一个蛋糕堆，而不使用valloc，这样我们可以最小化内碎片的产生
  90     dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
  91     inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
  92     file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
  93     fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
  94     superblock_pile =
  95       cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
  96
  97     dnode_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
  98
  99     dnode_lru = lru_new_zone(__vfs_try_evict_dnode);
 100     inode_lru = lru_new_zone(__vfs_try_evict_inode);
 101
 102     hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
 103     hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
 104
 105     // 创建一个根dnode。
 106     vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
 107     vfs_sysroot->parent = vfs_sysroot;
 108     atomic_fetch_add(&vfs_sysroot->ref_count, 1);
 109 }
 110
 111 inline struct hbucket*
 112 __dcache_hash(struct v_dnode* parent, uint32_t* hash)
 113 {
 114     uint32_t _hash = *hash;
 115     // 与parent的指针值做加法，来减小碰撞的可能性。
 116     _hash += (uint32_t)parent;
 117     // 确保低位更加随机
 118     _hash = _hash ^ (_hash >> VFS_HASHBITS);
 119     *hash = _hash;
 120     return &dnode_cache[_hash & VFS_HASH_MASK];
 121 }
 122
 123 struct v_dnode*
 124 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 125 {
 126     if (!str->len || HSTR_EQ(str, &vfs_dot))
 127         return parent;
 128
 129     if (HSTR_EQ(str, &vfs_ddot)) {
 130         return parent->parent;
 131     }
 132
 133     uint32_t hash = str->hash;
 134     struct hbucket* slot = __dcache_hash(parent, &hash);
 135
 136     struct v_dnode *pos, *n;
 137     hashtable_bucket_foreach(slot, pos, n, hash_list)
 138     {
 139         if (pos->name.hash == hash) {
 140             return pos;
 141         }
 142     }
 143     return NULL;
 144 }
 145
 146 void
 147 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 148 {
 149     assert(parent);
 150
 151     atomic_fetch_add(&dnode->ref_count, 1);
 152     dnode->parent = parent;
 153     llist_append(&parent->children, &dnode->siblings);
 154
 155     struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash);
 156     hlist_add(&bucket->head, &dnode->hash_list);
 157 }
 158
 159 void
 160 vfs_dcache_remove(struct v_dnode* dnode)
 161 {
 162     assert(dnode);
 163     assert(dnode->ref_count == 1);
 164
 165     llist_delete(&dnode->siblings);
 166     hlist_delete(&dnode->hash_list);
 167
 168     dnode->parent = NULL;
 169     atomic_fetch_sub(&dnode->ref_count, 1);
 170 }
 171
 172 void
 173 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
 174 {
 175     assert(new_parent);
 176
 177     hstr_rehash(&dnode->name, HSTR_FULL_HASH);
 178     vfs_dcache_remove(dnode);
 179     vfs_dcache_add(new_parent, dnode);
 180 }
 181
 182 int
 183 vfs_open(struct v_dnode* dnode, struct v_file** file)
 184 {
 185     if (!dnode->inode || !dnode->inode->ops->open) {
 186         return ENOTSUP;
 187     }
 188
 189     struct v_inode* inode = dnode->inode;
 190
 191     lock_inode(inode);
 192
 193     struct v_file* vfile = cake_grab(file_pile);
 194     memset(vfile, 0, sizeof(*vfile));
 195
 196     vfile->dnode = dnode;
 197     vfile->inode = inode;
 198     vfile->ref_count = ATOMIC_VAR_INIT(1);
 199     vfile->ops = inode->default_fops;
 200
 201     if ((inode->itype & VFS_IFFILE) && !inode->pg_cache) {
 202         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 203         pcache_init(pcache);
 204         pcache->master = inode;
 205         inode->pg_cache = pcache;
 206     }
 207
 208     int errno = inode->ops->open(inode, vfile);
 209     if (errno) {
 210         cake_release(file_pile, vfile);
 211     } else {
 212         atomic_fetch_add(&dnode->ref_count, 1);
 213         inode->open_count++;
 214         mnt_mkbusy(dnode->mnt);
 215
 216         *file = vfile;
 217     }
 218
 219     unlock_inode(inode);
 220
 221     return errno;
 222 }
 223
 224 void
 225 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
 226 {
 227     if (assign_to->inode) {
 228         assign_to->inode->link_count--;
 229     }
 230     assign_to->inode = inode;
 231     inode->link_count++;
 232 }
 233
 234 int
 235 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 236 {
 237     int errno;
 238
 239     if ((errno = vfs_check_writable(to_link))) {
 240         return errno;
 241     }
 242
 243     lock_inode(to_link->inode);
 244     if (to_link->super_block->root != name->super_block->root) {
 245         errno = EXDEV;
 246     } else if (!to_link->inode->ops->link) {
 247         errno = ENOTSUP;
 248     } else if (!(errno = to_link->inode->ops->link(to_link->inode, name))) {
 249         vfs_assign_inode(name, to_link->inode);
 250     }
 251     unlock_inode(to_link->inode);
 252
 253     return errno;
 254 }
 255
 256 int
 257 vfs_close(struct v_file* file)
 258 {
 259     int errno = 0;
 260     if (file->ref_count > 1) {
 261         atomic_fetch_sub(&file->ref_count, 1);
 262     } else if (!(errno = file->ops->close(file))) {
 263         atomic_fetch_sub(&file->dnode->ref_count, 1);
 264         file->inode->open_count--;
 265
 266         // Remove dead lock.
 267         // This happened when process is terminated while blocking on read.
 268         // In that case, the process is still holding the inode lock and it will
 269         // never get released.
 270         // FIXME is this a good solution?
 271         /*
 272          * Consider two process both open the same file both with fd=x.
 273          *      Process A: busy on reading x
 274          *      Process B: do nothing with x
 275          * Assume that, after a very short time, process B get terminated while
 276          * process A is still busy in it's reading business. By this design, the
 277          * inode lock of this file x is get released by B rather than A. And
 278          * this will cause a probable race condition on A if other process is
 279          * writing to this file later after B exit.
 280          *
 281          * A possible solution is to add a owner identification in the lock
 282          * context, so only the lock holder can do the release.
 283          */
 284         if (mutex_on_hold(&file->inode->lock)) {
 285             unlock_inode(file->inode);
 286         }
 287         mnt_chillax(file->dnode->mnt);
 288
 289         pcache_commit_all(file->inode);
 290         cake_release(file_pile, file);
 291     }
 292     return errno;
 293 }
 294
 295 int
 296 vfs_fsync(struct v_file* file)
 297 {
 298     int errno;
 299     if ((errno = vfs_check_writable(file->dnode))) {
 300         return errno;
 301     }
 302
 303     lock_inode(file->inode);
 304
 305     pcache_commit_all(file->inode);
 306
 307     errno = ENOTSUP;
 308     if (file->ops->sync) {
 309         errno = file->ops->sync(file);
 310     }
 311
 312     unlock_inode(file->inode);
 313
 314     return errno;
 315 }
 316
 317 int
 318 vfs_alloc_fdslot(int* fd)
 319 {
 320     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 321         if (!__current->fdtable->fds[i]) {
 322             *fd = i;
 323             return 0;
 324         }
 325     }
 326     return EMFILE;
 327 }
 328
 329 struct v_superblock*
 330 vfs_sb_alloc()
 331 {
 332     struct v_superblock* sb = cake_grab(superblock_pile);
 333     memset(sb, 0, sizeof(*sb));
 334     llist_init_head(&sb->sb_list);
 335     sb->i_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 336     return sb;
 337 }
 338
 339 void
 340 vfs_sb_free(struct v_superblock* sb)
 341 {
 342     vfree(sb->i_cache);
 343     cake_release(superblock_pile, sb);
 344 }
 345
 346 static int
 347 __vfs_try_evict_dnode(struct lru_node* obj)
 348 {
 349     struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
 350
 351     if (!dnode->ref_count) {
 352         vfs_d_free(dnode);
 353         return 1;
 354     }
 355     return 0;
 356 }
 357
 358 static int
 359 __vfs_try_evict_inode(struct lru_node* obj)
 360 {
 361     struct v_inode* inode = container_of(obj, struct v_inode, lru);
 362
 363     if (!inode->link_count && !inode->open_count) {
 364         vfs_i_free(inode);
 365         return 1;
 366     }
 367     return 0;
 368 }
 369
 370 struct v_dnode*
 371 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
 372 {
 373     struct v_dnode* dnode = cake_grab(dnode_pile);
 374     if (!dnode) {
 375         lru_evict_half(dnode_lru);
 376
 377         if (!(dnode = cake_grab(dnode_pile))) {
 378             return NULL;
 379         }
 380     }
 381
 382     memset(dnode, 0, sizeof(*dnode));
 383     llist_init_head(&dnode->children);
 384     llist_init_head(&dnode->siblings);
 385     mutex_init(&dnode->lock);
 386
 387     dnode->ref_count = ATOMIC_VAR_INIT(0);
 388     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 389
 390     hstrcpy(&dnode->name, name);
 391
 392     if (parent) {
 393         dnode->super_block = parent->super_block;
 394         dnode->mnt = parent->mnt;
 395     }
 396
 397     lru_use_one(dnode_lru, &dnode->lru);
 398
 399     return dnode;
 400 }
 401
 402 void
 403 vfs_d_free(struct v_dnode* dnode)
 404 {
 405     assert(dnode->ref_count == 1);
 406
 407     if (dnode->inode) {
 408         assert(dnode->inode->link_count > 0);
 409         dnode->inode->link_count--;
 410     }
 411
 412     vfs_dcache_remove(dnode);
 413     // Make sure the children de-referencing their parent.
 414     // With lru presented, the eviction will be propagated over the entire
 415     // detached subtree eventually
 416     struct v_dnode *pos, *n;
 417     llist_for_each(pos, n, &dnode->children, siblings)
 418     {
 419         vfs_dcache_remove(pos);
 420     }
 421
 422     vfree(dnode->name.value);
 423     cake_release(dnode_pile, dnode);
 424 }
 425
 426 struct v_inode*
 427 vfs_i_find(struct v_superblock* sb, uint32_t i_id)
 428 {
 429     struct hbucket* slot = &sb->i_cache[i_id & VFS_HASH_MASK];
 430     struct v_inode *pos, *n;
 431     hashtable_bucket_foreach(slot, pos, n, hash_list)
 432     {
 433         if (pos->id == i_id) {
 434             lru_use_one(inode_lru, &pos->lru);
 435             return pos;
 436         }
 437     }
 438
 439     return NULL;
 440 }
 441
 442 void
 443 vfs_i_addhash(struct v_inode* inode)
 444 {
 445     struct hbucket* slot = &inode->sb->i_cache[inode->id & VFS_HASH_MASK];
 446
 447     hlist_delete(&inode->hash_list);
 448     hlist_add(&slot->head, &inode->hash_list);
 449 }
 450
 451 struct v_inode*
 452 vfs_i_alloc(struct v_superblock* sb)
 453 {
 454     assert(sb->ops.init_inode);
 455
 456     struct v_inode* inode;
 457     if (!(inode = cake_grab(inode_pile))) {
 458         lru_evict_half(inode_lru);
 459         if (!(inode = cake_grab(inode_pile))) {
 460             return NULL;
 461         }
 462     }
 463
 464     memset(inode, 0, sizeof(*inode));
 465     mutex_init(&inode->lock);
 466     llist_init_head(&inode->xattrs);
 467
 468     sb->ops.init_inode(sb, inode);
 469
 470     inode->sb = sb;
 471     inode->ctime = clock_unixtime();
 472     inode->atime = inode->ctime;
 473     inode->mtime = inode->ctime;
 474
 475 done:
 476     lru_use_one(inode_lru, &inode->lru);
 477     return inode;
 478 }
 479
 480 void
 481 vfs_i_free(struct v_inode* inode)
 482 {
 483     if (inode->pg_cache) {
 484         pcache_release(inode->pg_cache);
 485         vfree(inode->pg_cache);
 486     }
 487     inode->ops->sync(inode);
 488     hlist_delete(&inode->hash_list);
 489     cake_release(inode_pile, inode);
 490 }
 491
 492 /* ---- System call definition and support ---- */
 493
 494 #define FLOCATE_CREATE_EMPTY 1
 495
 496 int
 497 vfs_getfd(int fd, struct v_fd** fd_s)
 498 {
 499     if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) {
 500         return 0;
 501     }
 502     return EBADF;
 503 }
 504
 505 int
 506 __vfs_try_locate_file(const char* path,
 507                       struct v_dnode** fdir,
 508                       struct v_dnode** file,
 509                       int options)
 510 {
 511     char name_str[VFS_NAME_MAXLEN];
 512     struct hstr name = HSTR(name_str, 0);
 513     int errno;
 514
 515     name_str[0] = 0;
 516     if ((errno = vfs_walk_proc(path, fdir, &name, VFS_WALK_PARENT))) {
 517         return errno;
 518     }
 519
 520     errno = vfs_walk(*fdir, name.value, file, NULL, 0);
 521     if (errno != ENOENT || !(options & FLOCATE_CREATE_EMPTY)) {
 522         return errno;
 523     }
 524
 525     struct v_dnode* parent = *fdir;
 526     struct v_dnode* file_new = vfs_d_alloc(parent, &name);
 527
 528     if (!file_new) {
 529         return ENOMEM;
 530     }
 531
 532     lock_dnode(parent);
 533
 534     if (!(errno = parent->inode->ops->create(parent->inode, file_new))) {
 535         vfs_dcache_add(parent, file_new);
 536         *file = file_new;
 537     } else {
 538         vfs_d_free(file_new);
 539     }
 540
 541     unlock_dnode(parent);
 542
 543     return errno;
 544 }
 545
 546 int
 547 vfs_do_open(const char* path, int options)
 548 {
 549     int errno, fd;
 550     struct v_dnode *dentry, *file;
 551     struct v_file* ofile = 0;
 552
 553     errno = __vfs_try_locate_file(
 554       path, &dentry, &file, (options & FO_CREATE) ? FLOCATE_CREATE_EMPTY : 0);
 555
 556     if (errno || (errno = vfs_open(file, &ofile))) {
 557         return errno;
 558     }
 559
 560     struct v_inode* o_inode = ofile->inode;
 561
 562     if (!errno && !(errno = vfs_alloc_fdslot(&fd))) {
 563         struct v_fd* fd_s = vzalloc(sizeof(*fd_s));
 564         ofile->f_pos = ofile->inode->fsize & -((options & FO_APPEND) != 0);
 565         fd_s->file = ofile;
 566         fd_s->flags = options;
 567         __current->fdtable->fds[fd] = fd_s;
 568         return fd;
 569     }
 570
 571     return errno;
 572 }
 573
 574 __DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
 575 {
 576     int errno = vfs_do_open(path, options);
 577     return DO_STATUS_OR_RETURN(errno);
 578 }
 579
 580 __DEFINE_LXSYSCALL1(int, close, int, fd)
 581 {
 582     struct v_fd* fd_s;
 583     int errno = 0;
 584     if ((errno = vfs_getfd(fd, &fd_s))) {
 585         goto done_err;
 586     }
 587
 588     if ((errno = vfs_close(fd_s->file))) {
 589         goto done_err;
 590     }
 591
 592     vfree(fd_s);
 593     __current->fdtable->fds[fd] = 0;
 594
 595 done_err:
 596     return DO_STATUS(errno);
 597 }
 598
 599 void
 600 __vfs_readdir_callback(struct dir_context* dctx,
 601                        const char* name,
 602                        const int len,
 603                        const int dtype)
 604 {
 605     struct dirent* dent = (struct dirent*)dctx->cb_data;
 606     strncpy(dent->d_name, name, DIRENT_NAME_MAX_LEN);
 607     dent->d_nlen = len;
 608     dent->d_type = dtype;
 609 }
 610
 611 __DEFINE_LXSYSCALL2(int, readdir, int, fd, struct dirent*, dent)
 612 {
 613     struct v_fd* fd_s;
 614     int errno;
 615
 616     if ((errno = vfs_getfd(fd, &fd_s))) {
 617         goto done;
 618     }
 619
 620     struct v_inode* inode = fd_s->file->inode;
 621
 622     lock_inode(inode);
 623
 624     if (!(inode->itype & VFS_IFDIR)) {
 625         errno = ENOTDIR;
 626     } else {
 627         struct dir_context dctx =
 628           (struct dir_context){ .cb_data = dent,
 629                                 .index = dent->d_offset,
 630                                 .read_complete_callback =
 631                                   __vfs_readdir_callback };
 632         errno = 1;
 633         if (dent->d_offset == 0) {
 634             __vfs_readdir_callback(&dctx, vfs_dot.value, vfs_dot.len, DT_DIR);
 635         } else if (dent->d_offset == 1) {
 636             __vfs_readdir_callback(&dctx, vfs_ddot.value, vfs_ddot.len, DT_DIR);
 637         } else {
 638             dctx.index -= 2;
 639             if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
 640                 unlock_inode(inode);
 641                 goto done;
 642             }
 643         }
 644         dent->d_offset++;
 645     }
 646
 647     unlock_inode(inode);
 648
 649 done:
 650     return DO_STATUS_OR_RETURN(errno);
 651 }
 652
 653 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 654 {
 655     int errno = 0;
 656     struct v_fd* fd_s;
 657     if ((errno = vfs_getfd(fd, &fd_s))) {
 658         goto done;
 659     }
 660
 661     struct v_file* file = fd_s->file;
 662     if ((file->inode->itype & VFS_IFDIR)) {
 663         errno = EISDIR;
 664         goto done;
 665     }
 666
 667     lock_inode(file->inode);
 668
 669     file->inode->atime = clock_unixtime();
 670
 671     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 672         errno = file->ops->read(file->inode, buf, count, file->f_pos);
 673     } else {
 674         errno = pcache_read(file->inode, buf, count, file->f_pos);
 675     }
 676
 677     if (errno > 0) {
 678         file->f_pos += errno;
 679         unlock_inode(file->inode);
 680         return errno;
 681     }
 682
 683     unlock_inode(file->inode);
 684
 685 done:
 686     return DO_STATUS(errno);
 687 }
 688
 689 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
 690 {
 691     int errno = 0;
 692     struct v_fd* fd_s;
 693     if ((errno = vfs_getfd(fd, &fd_s))) {
 694         goto done;
 695     }
 696
 697     struct v_file* file = fd_s->file;
 698
 699     if ((errno = vfs_check_writable(file->dnode))) {
 700         goto done;
 701     }
 702
 703     if ((file->inode->itype & VFS_IFDIR)) {
 704         errno = EISDIR;
 705         goto done;
 706     }
 707
 708     lock_inode(file->inode);
 709
 710     file->inode->mtime = clock_unixtime();
 711
 712     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 713         errno = file->ops->write(file->inode, buf, count, file->f_pos);
 714     } else {
 715         errno = pcache_write(file->inode, buf, count, file->f_pos);
 716     }
 717
 718     if (errno > 0) {
 719         file->f_pos += errno;
 720         unlock_inode(file->inode);
 721         return errno;
 722     }
 723
 724     unlock_inode(file->inode);
 725
 726 done:
 727     return DO_STATUS(errno);
 728 }
 729
 730 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
 731 {
 732     int errno = 0;
 733     struct v_fd* fd_s;
 734     if ((errno = vfs_getfd(fd, &fd_s))) {
 735         goto done;
 736     }
 737
 738     struct v_file* file = fd_s->file;
 739
 740     if (!file->ops->seek) {
 741         errno = ENOTSUP;
 742         goto done;
 743     }
 744
 745     lock_inode(file->inode);
 746
 747     int overflow = 0;
 748     int fpos = file->f_pos;
 749     switch (options) {
 750         case FSEEK_CUR:
 751             overflow = __builtin_sadd_overflow((int)file->f_pos, offset, &fpos);
 752             break;
 753         case FSEEK_END:
 754             overflow =
 755               __builtin_sadd_overflow((int)file->inode->fsize, offset, &fpos);
 756             break;
 757         case FSEEK_SET:
 758             fpos = offset;
 759             break;
 760     }
 761     if (overflow) {
 762         errno = EOVERFLOW;
 763     } else if (!(errno = file->ops->seek(file->inode, fpos))) {
 764         file->f_pos = fpos;
 765     }
 766
 767     unlock_inode(file->inode);
 768
 769 done:
 770     return DO_STATUS(errno);
 771 }
 772
 773 int
 774 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
 775 {
 776     if (!dnode || dnode->parent == dnode) {
 777         return 0;
 778     }
 779
 780     if (depth > 64) {
 781         return ENAMETOOLONG;
 782     }
 783
 784     size_t len = vfs_get_path(dnode->parent, buf, size, depth + 1);
 785
 786     if (len >= size) {
 787         return len;
 788     }
 789
 790     buf[len++] = VFS_PATH_DELIM;
 791
 792     size_t cpy_size = MIN(dnode->name.len, size - len);
 793     strncpy(buf + len, dnode->name.value, cpy_size);
 794     len += cpy_size;
 795
 796     return len;
 797 }
 798
 799 int
 800 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
 801 {
 802     const char* link;
 803     struct v_inode* inode = dnode->inode;
 804     if (inode->ops->read_symlink) {
 805         lock_inode(inode);
 806
 807         int errno = inode->ops->read_symlink(inode, &link);
 808         strncpy(buf, link, size);
 809
 810         unlock_inode(inode);
 811         return errno;
 812     }
 813     return 0;
 814 }
 815
 816 int
 817 vfs_get_dtype(int itype)
 818 {
 819     switch (itype) {
 820         case VFS_IFDIR:
 821             return DT_DIR;
 822         case VFS_IFSYMLINK:
 823             return DT_SYMLINK;
 824         default:
 825             return DT_PIPE;
 826     }
 827 }
 828
 829 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
 830 {
 831     int errno;
 832     struct v_fd* fd_s;
 833     if ((errno = vfs_getfd(fd, &fd_s))) {
 834         goto done;
 835     }
 836
 837     struct v_dnode* dnode;
 838     errno = vfs_get_path(fd_s->file->dnode, buf, size, 0);
 839
 840     if (errno >= 0) {
 841         return errno;
 842     }
 843
 844 done:
 845     return DO_STATUS(errno);
 846 }
 847
 848 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
 849 {
 850     int errno;
 851     struct v_dnode* dnode;
 852     if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 853         errno = vfs_readlink(dnode, buf, size);
 854     }
 855
 856     if (errno >= 0) {
 857         return errno;
 858     }
 859
 860     return DO_STATUS(errno);
 861 }
 862
 863 __DEFINE_LXSYSCALL4(int,
 864                     readlinkat,
 865                     int,
 866                     dirfd,
 867                     const char*,
 868                     pathname,
 869                     char*,
 870                     buf,
 871                     size_t,
 872                     size)
 873 {
 874     int errno;
 875     struct v_fd* fd_s;
 876     if ((errno = vfs_getfd(dirfd, &fd_s))) {
 877         goto done;
 878     }
 879
 880     struct v_dnode* dnode;
 881     if (!(errno = vfs_walk(
 882             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 883         errno = vfs_readlink(fd_s->file->dnode, buf, size);
 884     }
 885
 886     if (errno >= 0) {
 887         return errno;
 888     }
 889
 890 done:
 891     return DO_STATUS(errno);
 892 }
 893
/*
    NOTE
    When we perform an operation that could affect the layout of a
    directory (i.e., rename, mkdir, rmdir), we must lock the parent dir
    whenever possible. This blocks any ongoing path walk from reaching
    it and hence avoids exposing any partial state.
*/
 901
 902 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
 903 {
 904     int errno;
 905     struct v_dnode* dnode;
 906     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
 907         return DO_STATUS(errno);
 908     }
 909
 910     lock_dnode(dnode);
 911
 912     if ((errno = vfs_check_writable(dnode))) {
 913         goto done;
 914     }
 915
 916     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
 917         errno = EROFS;
 918         goto done;
 919     }
 920
 921     if (dnode->ref_count > 1 || dnode->inode->open_count) {
 922         errno = EBUSY;
 923         goto done;
 924     }
 925
 926     if (!llist_empty(&dnode->children)) {
 927         errno = ENOTEMPTY;
 928         goto done;
 929     }
 930
 931     struct v_dnode* parent = dnode->parent;
 932
 933     if (!parent) {
 934         errno = EINVAL;
 935         goto done;
 936     }
 937
 938     lock_dnode(parent);
 939     lock_inode(parent->inode);
 940
 941     if ((dnode->inode->itype & VFS_IFDIR)) {
 942         errno = parent->inode->ops->rmdir(parent->inode, dnode);
 943         if (!errno) {
 944             vfs_dcache_remove(dnode);
 945         }
 946     } else {
 947         errno = ENOTDIR;
 948     }
 949
 950     unlock_inode(parent->inode);
 951     unlock_dnode(parent);
 952
 953 done:
 954     unlock_dnode(dnode);
 955     return DO_STATUS(errno);
 956 }
 957
 958 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
 959 {
 960     int errno = 0;
 961     struct v_dnode *parent, *dir;
 962     char name_value[VFS_NAME_MAXLEN];
 963     struct hstr name = HHSTR(name_value, 0, 0);
 964
 965     if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
 966         goto done;
 967     }
 968
 969     if ((errno = vfs_check_writable(parent))) {
 970         goto done;
 971     }
 972
 973     if (!(dir = vfs_d_alloc(parent, &name))) {
 974         errno = ENOMEM;
 975         goto done;
 976     }
 977
 978     lock_dnode(parent);
 979     lock_inode(parent->inode);
 980
 981     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
 982         errno = ENOTSUP;
 983     } else if (!parent->inode->ops->mkdir) {
 984         errno = ENOTSUP;
 985     } else if (!(parent->inode->itype & VFS_IFDIR)) {
 986         errno = ENOTDIR;
 987     } else if (!(errno = parent->inode->ops->mkdir(parent->inode, dir))) {
 988         vfs_dcache_add(parent, dir);
 989         goto cleanup;
 990     }
 991
 992     vfs_d_free(dir);
 993
 994 cleanup:
 995     unlock_inode(parent->inode);
 996     unlock_dnode(parent);
 997 done:
 998     return DO_STATUS(errno);
 999 }
1000
1001 int
1002 __vfs_do_unlink(struct v_dnode* dnode)
1003 {
1004     int errno;
1005     struct v_inode* inode = dnode->inode;
1006
1007     if (dnode->ref_count > 1) {
1008         return EBUSY;
1009     }
1010
1011     if ((errno = vfs_check_writable(dnode))) {
1012         return errno;
1013     }
1014
1015     lock_inode(inode);
1016
1017     if (inode->open_count) {
1018         errno = EBUSY;
1019     } else if (!(inode->itype & VFS_IFDIR)) {
1020         // The underlying unlink implementation should handle
1021         //  symlink case
1022         errno = inode->ops->unlink(inode);
1023         if (!errno) {
1024             vfs_d_free(dnode);
1025         }
1026     } else {
1027         errno = EISDIR;
1028     }
1029
1030     unlock_inode(inode);
1031
1032     return errno;
1033 }
1034
1035 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1036 {
1037     int errno;
1038     struct v_dnode* dnode;
1039     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1040         goto done;
1041     }
1042
1043     errno = __vfs_do_unlink(dnode);
1044
1045 done:
1046     return DO_STATUS(errno);
1047 }
1048
1049 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1050 {
1051     int errno;
1052     struct v_fd* fd_s;
1053     if ((errno = vfs_getfd(fd, &fd_s))) {
1054         goto done;
1055     }
1056
1057     struct v_dnode* dnode;
1058     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1059         errno = __vfs_do_unlink(dnode);
1060     }
1061
1062 done:
1063     return DO_STATUS(errno);
1064 }
1065
1066 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1067 {
1068     int errno;
1069     struct v_dnode *dentry, *to_link, *name_dentry, *name_file;
1070
1071     errno = __vfs_try_locate_file(oldpath, &dentry, &to_link, 0);
1072     if (!errno) {
1073         errno = __vfs_try_locate_file(
1074           newpath, &name_dentry, &name_file, FLOCATE_CREATE_EMPTY);
1075         if (!errno) {
1076             errno = EEXIST;
1077         } else if (name_file) {
1078             errno = vfs_link(to_link, name_file);
1079         }
1080     }
1081     return DO_STATUS(errno);
1082 }
1083
1084 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1085 {
1086     int errno;
1087     struct v_fd* fd_s;
1088
1089     if (!(errno = vfs_getfd(fildes, &fd_s))) {
1090         errno = vfs_fsync(fd_s->file);
1091     }
1092
1093     return DO_STATUS(errno);
1094 }
1095
1096 int
1097 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1098 {
1099     int errno = 0;
1100     struct v_fd* copied = cake_grab(fd_pile);
1101
1102     memcpy(copied, old, sizeof(struct v_fd));
1103
1104     atomic_fetch_add(&old->file->ref_count, 1);
1105
1106     *new = copied;
1107
1108     return errno;
1109 }
1110
1111 int
1112 vfs_dup2(int oldfd, int newfd)
1113 {
1114     if (newfd == oldfd) {
1115         return newfd;
1116     }
1117
1118     int errno;
1119     struct v_fd *oldfd_s, *newfd_s;
1120     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1121         goto done;
1122     }
1123
1124     if (!TEST_FD(newfd)) {
1125         errno = EBADF;
1126         goto done;
1127     }
1128
1129     newfd_s = __current->fdtable->fds[newfd];
1130     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1131         goto done;
1132     }
1133
1134     if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1135         __current->fdtable->fds[newfd] = newfd_s;
1136         return newfd;
1137     }
1138
1139 done:
1140     return DO_STATUS(errno);
1141 }
1142
1143 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1144 {
1145     return vfs_dup2(oldfd, newfd);
1146 }
1147
1148 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1149 {
1150     int errno, newfd;
1151     struct v_fd *oldfd_s, *newfd_s;
1152     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1153         goto done;
1154     }
1155
1156     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1157         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1158         __current->fdtable->fds[newfd] = newfd_s;
1159         return newfd;
1160     }
1161
1162 done:
1163     return DO_STATUS(errno);
1164 }
1165
1166 __DEFINE_LXSYSCALL2(int,
1167                     symlink,
1168                     const char*,
1169                     pathname,
1170                     const char*,
1171                     link_target)
1172 {
1173     int errno;
1174     struct v_dnode* dnode;
1175     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1176         goto done;
1177     }
1178
1179     if (errno = vfs_check_writable(dnode)) {
1180         goto done;
1181     }
1182
1183     if (!dnode->inode->ops->set_symlink) {
1184         errno = ENOTSUP;
1185         goto done;
1186     }
1187
1188     lock_inode(dnode->inode);
1189
1190     errno = dnode->inode->ops->set_symlink(dnode->inode, link_target);
1191
1192     unlock_inode(dnode->inode);
1193
1194 done:
1195     return DO_STATUS(errno);
1196 }
1197
1198 void
1199 vfs_ref_dnode(struct v_dnode* dnode)
1200 {
1201     atomic_fetch_add(&dnode->ref_count, 1);
1202     mnt_mkbusy(dnode->mnt);
1203 }
1204
1205 void
1206 vfs_unref_dnode(struct v_dnode* dnode)
1207 {
1208     atomic_fetch_sub(&dnode->ref_count, 1);
1209     mnt_chillax(dnode->mnt);
1210 }
1211
1212 int
1213 vfs_do_chdir(struct proc_info* proc, struct v_dnode* dnode)
1214 {
1215     int errno = 0;
1216
1217     lock_dnode(dnode);
1218
1219     if (!(dnode->inode->itype & VFS_IFDIR)) {
1220         errno = ENOTDIR;
1221         goto done;
1222     }
1223
1224     if (proc->cwd) {
1225         vfs_unref_dnode(proc->cwd);
1226     }
1227
1228     vfs_ref_dnode(dnode);
1229     proc->cwd = dnode;
1230
1231     unlock_dnode(dnode);
1232
1233 done:
1234     return errno;
1235 }
1236
1237 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1238 {
1239     struct v_dnode* dnode;
1240     int errno = 0;
1241
1242     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1243         goto done;
1244     }
1245
1246     errno = vfs_do_chdir(__current, dnode);
1247
1248 done:
1249     return DO_STATUS(errno);
1250 }
1251
1252 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1253 {
1254     struct v_fd* fd_s;
1255     int errno = 0;
1256
1257     if ((errno = vfs_getfd(fd, &fd_s))) {
1258         goto done;
1259     }
1260
1261     errno = vfs_do_chdir(__current, fd_s->file->dnode);
1262
1263 done:
1264     return DO_STATUS(errno);
1265 }
1266
1267 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1268 {
1269     int errno = 0;
1270     char* ret_ptr = 0;
1271     if (size < 2) {
1272         errno = ERANGE;
1273         goto done;
1274     }
1275
1276     size_t len = 0;
1277
1278     if (!__current->cwd) {
1279         *buf = VFS_PATH_DELIM;
1280         len = 1;
1281     } else {
1282         len = vfs_get_path(__current->cwd, buf, size, 0);
1283         if (len == size) {
1284             errno = ERANGE;
1285             goto done;
1286         }
1287     }
1288
1289     buf[len + 1] = '\0';
1290
1291     ret_ptr = buf;
1292
1293 done:
1294     __current->k_status = errno;
1295     return ret_ptr;
1296 }
1297
1298 int
1299 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1300 {
1301     int errno = 0;
1302     if (current->inode->id == target->inode->id) {
1303         // hard link
1304         return 0;
1305     }
1306
1307     if (errno = vfs_check_writable(current)) {
1308         return errno;
1309     }
1310
1311     if (current->ref_count > 1 || target->ref_count > 1) {
1312         return EBUSY;
1313     }
1314
1315     if (current->super_block != target->super_block) {
1316         return EXDEV;
1317     }
1318
1319     struct v_dnode* oldparent = current->parent;
1320     struct v_dnode* newparent = target->parent;
1321
1322     lock_dnode(current);
1323     lock_dnode(target);
1324     if (oldparent)
1325         lock_dnode(oldparent);
1326     if (newparent)
1327         lock_dnode(newparent);
1328
1329     if (!llist_empty(&target->children)) {
1330         errno = ENOTEMPTY;
1331         unlock_dnode(target);
1332         goto cleanup;
1333     }
1334
1335     if ((errno =
1336            current->inode->ops->rename(current->inode, current, target))) {
1337         unlock_dnode(target);
1338         goto cleanup;
1339     }
1340
1341     // re-position current
1342     hstrcpy(&current->name, &target->name);
1343     vfs_dcache_rehash(newparent, current);
1344
1345     // detach target
1346     vfs_d_free(target);
1347
1348     unlock_dnode(target);
1349
1350 cleanup:
1351     unlock_dnode(current);
1352     if (oldparent)
1353         unlock_dnode(oldparent);
1354     if (newparent)
1355         unlock_dnode(newparent);
1356
1357     return errno;
1358 }
1359
1360 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1361 {
1362     struct v_dnode *cur, *target_parent, *target;
1363     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1364     int errno = 0;
1365
1366     if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1367         goto done;
1368     }
1369
1370     if ((errno = vfs_walk(
1371            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1372         goto done;
1373     }
1374
1375     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1376     if (errno == ENOENT) {
1377         target = vfs_d_alloc(target_parent, &name);
1378         vfs_dcache_add(target_parent, target);
1379     } else if (errno) {
1380         goto done;
1381     }
1382
1383     if (!target) {
1384         errno = ENOMEM;
1385         goto done;
1386     }
1387
1388     errno = vfs_do_rename(cur, target);
1389
1390 done:
1391     vfree(name.value);
1392     return DO_STATUS(errno);
1393 }