lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
   4  * @brief Lunaix virtual file system - an abstraction layer for all file system.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
  14 /*
  15  TODO vfs & device todos checklist
  16
  17     It is overseen by Twilight Sparkle ;)
  18
  19  1. Get inodes hooked into lru (CHECKED)
  20  2. Get dnodes hooked into lru (CHECKED)
  21  3. Get inodes properly hashed so they can be reused by underling fs (CHECKED)
  22  4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
  23         [good idea] or a constructor/destructor pattern in cake allocator ?
  24  5. (mount) Figure out a way to identify a busy mount point before unmount
  25             maybe a unified mount_point structure that maintain a referencing
  26             counter on any dnodes within the subtree? Such a counter will only
  27             increament if a file is opened or a dnode is being used as working
  28             directory and decreamenting conversely. (CHECKED)
  29  6. (mount) Ability to track all mount points (including sub-mounts)
  30             so we can be confident to clean up everything when we
  31             unmount. (CHECKED)
  32  7. (mount) Figure out a way to acquire the device represented by a dnode.
  33             so it can be used to mount. (e.g. we wish to get `struct device*`
  34             out of the dnode at /dev/sda)
  35             [tip] we should pay attention at twifs and add a private_data field
  36             under struct v_dnode? (CHECKED)
  37  8. (mount) Then, we should refactor on mount/unmount mechanism. (CHECKED)
  38  9. (mount) (future) Ability to mount any thing? e.g. Linux can mount a disk
  39                     image file using a so called "loopback" pseudo device. Maybe
  40                     we can do similar thing in Lunaix? A block device emulation
  41                     above the regular file when we mount it on.
  42  10. (device) device number (dev_t) allocation
  43             [good idea] <class>:<subclass>:<uniq_id> composition
  44 */
  45
  46 #include <klibc/string.h>
  47 #include <lunaix/dirent.h>
  48 #include <lunaix/foptions.h>
  49 #include <lunaix/fs.h>
  50 #include <lunaix/mm/cake.h>
  51 #include <lunaix/mm/page.h>
  52 #include <lunaix/mm/valloc.h>
  53 #include <lunaix/process.h>
  54 #include <lunaix/spike.h>
  55 #include <lunaix/syscall.h>
  56
  57 #include <lunaix/fs/twifs.h>
  58
  59 static struct cake_pile* dnode_pile;
  60 static struct cake_pile* inode_pile;
  61 static struct cake_pile* file_pile;
  62 static struct cake_pile* superblock_pile;
  63 static struct cake_pile* fd_pile;
  64
  65 struct v_dnode* vfs_sysroot;
  66 static struct hbucket* dnode_cache;
  67
  68 struct lru_zone *dnode_lru, *inode_lru;
  69
  70 struct hstr vfs_ddot = HSTR("..", 2);
  71 struct hstr vfs_dot = HSTR(".", 1);
  72 struct hstr vfs_empty = HSTR("", 0);
  73
  74 struct v_superblock*
  75 vfs_sb_alloc();
  76
  77 void
  78 vfs_sb_free(struct v_superblock* sb);
  79
  80 static int
  81 __vfs_try_evict_dnode(struct lru_node* obj);
  82
  83 static int
  84 __vfs_try_evict_inode(struct lru_node* obj);
  85
  86 void
  87 vfs_init()
  88 {
  89     // 为他们专门创建一个蛋糕堆，而不使用valloc，这样我们可以最小化内碎片的产生
  90     dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
  91     inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
  92     file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
  93     fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
  94     superblock_pile =
  95       cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
  96
  97     dnode_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
  98
  99     dnode_lru = lru_new_zone(__vfs_try_evict_dnode);
 100     inode_lru = lru_new_zone(__vfs_try_evict_inode);
 101
 102     hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
 103     hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
 104
 105     // 创建一个根dnode。
 106     vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
 107     vfs_sysroot->parent = vfs_sysroot;
 108     atomic_fetch_add(&vfs_sysroot->ref_count, 1);
 109 }
 110
 111 inline struct hbucket*
 112 __dcache_hash(struct v_dnode* parent, uint32_t* hash)
 113 {
 114     uint32_t _hash = *hash;
 115     // 与parent的指针值做加法，来减小碰撞的可能性。
 116     _hash += (uint32_t)parent;
 117     // 确保低位更加随机
 118     _hash = _hash ^ (_hash >> VFS_HASHBITS);
 119     *hash = _hash;
 120     return &dnode_cache[_hash & VFS_HASH_MASK];
 121 }
 122
 123 struct v_dnode*
 124 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 125 {
 126     if (!str->len || HSTR_EQ(str, &vfs_dot))
 127         return parent;
 128
 129     if (HSTR_EQ(str, &vfs_ddot)) {
 130         return parent->parent;
 131     }
 132
 133     uint32_t hash = str->hash;
 134     struct hbucket* slot = __dcache_hash(parent, &hash);
 135
 136     struct v_dnode *pos, *n;
 137     hashtable_bucket_foreach(slot, pos, n, hash_list)
 138     {
 139         if (pos->name.hash == hash) {
 140             return pos;
 141         }
 142     }
 143     return NULL;
 144 }
 145
 146 void
 147 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 148 {
 149     assert(parent);
 150
 151     atomic_fetch_add(&dnode->ref_count, 1);
 152     dnode->parent = parent;
 153     llist_append(&parent->children, &dnode->siblings);
 154
 155     struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash);
 156     hlist_add(&bucket->head, &dnode->hash_list);
 157 }
 158
 159 void
 160 vfs_dcache_remove(struct v_dnode* dnode)
 161 {
 162     assert(dnode);
 163     assert(dnode->ref_count == 1);
 164
 165     llist_delete(&dnode->siblings);
 166     hlist_delete(&dnode->hash_list);
 167
 168     dnode->parent = NULL;
 169     atomic_fetch_sub(&dnode->ref_count, 1);
 170 }
 171
 172 void
 173 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
 174 {
 175     assert(new_parent);
 176
 177     hstr_rehash(&dnode->name, HSTR_FULL_HASH);
 178     vfs_dcache_remove(dnode);
 179     vfs_dcache_add(new_parent, dnode);
 180 }
 181
 182 int
 183 vfs_open(struct v_dnode* dnode, struct v_file** file)
 184 {
 185     if (!dnode->inode || !dnode->inode->ops->open) {
 186         return ENOTSUP;
 187     }
 188
 189     struct v_inode* inode = dnode->inode;
 190
 191     lock_inode(inode);
 192
 193     struct v_file* vfile = cake_grab(file_pile);
 194     memset(vfile, 0, sizeof(*vfile));
 195
 196     vfile->dnode = dnode;
 197     vfile->inode = inode;
 198     vfile->ref_count = ATOMIC_VAR_INIT(1);
 199     vfile->ops = inode->default_fops;
 200
 201     if ((inode->itype & VFS_IFFILE) && !inode->pg_cache) {
 202         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 203         pcache_init(pcache);
 204         pcache->master = inode;
 205         inode->pg_cache = pcache;
 206     }
 207
 208     int errno = inode->ops->open(inode, vfile);
 209     if (errno) {
 210         cake_release(file_pile, vfile);
 211     } else {
 212         atomic_fetch_add(&dnode->ref_count, 1);
 213         inode->open_count++;
 214         mnt_mkbusy(dnode->mnt);
 215
 216         *file = vfile;
 217     }
 218
 219     unlock_inode(inode);
 220
 221     return errno;
 222 }
 223
 224 void
 225 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
 226 {
 227     if (assign_to->inode) {
 228         assign_to->inode->link_count--;
 229     }
 230     assign_to->inode = inode;
 231     inode->link_count++;
 232 }
 233
 234 int
 235 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 236 {
 237     int errno;
 238
 239     if ((errno = vfs_check_writable(to_link))) {
 240         return errno;
 241     }
 242
 243     lock_inode(to_link->inode);
 244     if (to_link->super_block->root != name->super_block->root) {
 245         errno = EXDEV;
 246     } else if (!to_link->inode->ops->link) {
 247         errno = ENOTSUP;
 248     } else if (!(errno = to_link->inode->ops->link(to_link->inode, name))) {
 249         vfs_assign_inode(name, to_link->inode);
 250     }
 251     unlock_inode(to_link->inode);
 252
 253     return errno;
 254 }
 255
 256 int
 257 vfs_pclose(struct v_file* file, pid_t pid)
 258 {
 259     int errno = 0;
 260     if (file->ref_count > 1) {
 261         atomic_fetch_sub(&file->ref_count, 1);
 262     } else if (!(errno = file->ops->close(file))) {
 263         atomic_fetch_sub(&file->dnode->ref_count, 1);
 264         file->inode->open_count--;
 265
 266         // Prevent dead lock.
 267         // This happened when process is terminated while blocking on read.
 268         // In that case, the process is still holding the inode lock and it will
 269         // never get released.
 270         /*
 271          * The unlocking should also include ownership check.
 272          *
 273          * To see why, consider two process both open the same file both with
 274          * fd=x.
 275          *      Process A: busy on reading x
 276          *      Process B: do nothing with x
 277          * Assuming that, after a very short time, process B get terminated
 278          * while process A is still busy in it's reading business. By this
 279          * design, the inode lock of this file x is get released by B rather
 280          * than A. And this will cause a probable race condition on A if other
 281          * process is writing to this file later after B exit.
 282          */
 283         if (mutex_on_hold(&file->inode->lock)) {
 284             mutex_unlock_for(&file->inode->lock, pid);
 285         }
 286         mnt_chillax(file->dnode->mnt);
 287
 288         pcache_commit_all(file->inode);
 289         cake_release(file_pile, file);
 290     }
 291     return errno;
 292 }
 293
 294 int
 295 vfs_close(struct v_file* file)
 296 {
 297     return vfs_pclose(file, __current->pid);
 298 }
 299
 300 int
 301 vfs_fsync(struct v_file* file)
 302 {
 303     int errno;
 304     if ((errno = vfs_check_writable(file->dnode))) {
 305         return errno;
 306     }
 307
 308     lock_inode(file->inode);
 309
 310     pcache_commit_all(file->inode);
 311
 312     errno = ENOTSUP;
 313     if (file->ops->sync) {
 314         errno = file->ops->sync(file);
 315     }
 316
 317     unlock_inode(file->inode);
 318
 319     return errno;
 320 }
 321
 322 int
 323 vfs_alloc_fdslot(int* fd)
 324 {
 325     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 326         if (!__current->fdtable->fds[i]) {
 327             *fd = i;
 328             return 0;
 329         }
 330     }
 331     return EMFILE;
 332 }
 333
 334 struct v_superblock*
 335 vfs_sb_alloc()
 336 {
 337     struct v_superblock* sb = cake_grab(superblock_pile);
 338     memset(sb, 0, sizeof(*sb));
 339     llist_init_head(&sb->sb_list);
 340     sb->i_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 341     return sb;
 342 }
 343
 344 void
 345 vfs_sb_free(struct v_superblock* sb)
 346 {
 347     vfree(sb->i_cache);
 348     cake_release(superblock_pile, sb);
 349 }
 350
 351 static int
 352 __vfs_try_evict_dnode(struct lru_node* obj)
 353 {
 354     struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
 355
 356     if (!dnode->ref_count) {
 357         vfs_d_free(dnode);
 358         return 1;
 359     }
 360     return 0;
 361 }
 362
 363 static int
 364 __vfs_try_evict_inode(struct lru_node* obj)
 365 {
 366     struct v_inode* inode = container_of(obj, struct v_inode, lru);
 367
 368     if (!inode->link_count && !inode->open_count) {
 369         vfs_i_free(inode);
 370         return 1;
 371     }
 372     return 0;
 373 }
 374
 375 struct v_dnode*
 376 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
 377 {
 378     struct v_dnode* dnode = cake_grab(dnode_pile);
 379     if (!dnode) {
 380         lru_evict_half(dnode_lru);
 381
 382         if (!(dnode = cake_grab(dnode_pile))) {
 383             return NULL;
 384         }
 385     }
 386
 387     memset(dnode, 0, sizeof(*dnode));
 388     llist_init_head(&dnode->children);
 389     llist_init_head(&dnode->siblings);
 390     mutex_init(&dnode->lock);
 391
 392     dnode->ref_count = ATOMIC_VAR_INIT(0);
 393     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 394
 395     hstrcpy(&dnode->name, name);
 396
 397     if (parent) {
 398         dnode->super_block = parent->super_block;
 399         dnode->mnt = parent->mnt;
 400     }
 401
 402     lru_use_one(dnode_lru, &dnode->lru);
 403
 404     return dnode;
 405 }
 406
 407 void
 408 vfs_d_free(struct v_dnode* dnode)
 409 {
 410     assert(dnode->ref_count == 1);
 411
 412     if (dnode->inode) {
 413         assert(dnode->inode->link_count > 0);
 414         dnode->inode->link_count--;
 415     }
 416
 417     vfs_dcache_remove(dnode);
 418     // Make sure the children de-referencing their parent.
 419     // With lru presented, the eviction will be propagated over the entire
 420     // detached subtree eventually
 421     struct v_dnode *pos, *n;
 422     llist_for_each(pos, n, &dnode->children, siblings)
 423     {
 424         vfs_dcache_remove(pos);
 425     }
 426
 427     vfree(dnode->name.value);
 428     cake_release(dnode_pile, dnode);
 429 }
 430
 431 struct v_inode*
 432 vfs_i_find(struct v_superblock* sb, uint32_t i_id)
 433 {
 434     struct hbucket* slot = &sb->i_cache[i_id & VFS_HASH_MASK];
 435     struct v_inode *pos, *n;
 436     hashtable_bucket_foreach(slot, pos, n, hash_list)
 437     {
 438         if (pos->id == i_id) {
 439             lru_use_one(inode_lru, &pos->lru);
 440             return pos;
 441         }
 442     }
 443
 444     return NULL;
 445 }
 446
 447 void
 448 vfs_i_addhash(struct v_inode* inode)
 449 {
 450     struct hbucket* slot = &inode->sb->i_cache[inode->id & VFS_HASH_MASK];
 451
 452     hlist_delete(&inode->hash_list);
 453     hlist_add(&slot->head, &inode->hash_list);
 454 }
 455
 456 struct v_inode*
 457 vfs_i_alloc(struct v_superblock* sb)
 458 {
 459     assert(sb->ops.init_inode);
 460
 461     struct v_inode* inode;
 462     if (!(inode = cake_grab(inode_pile))) {
 463         lru_evict_half(inode_lru);
 464         if (!(inode = cake_grab(inode_pile))) {
 465             return NULL;
 466         }
 467     }
 468
 469     memset(inode, 0, sizeof(*inode));
 470     mutex_init(&inode->lock);
 471     llist_init_head(&inode->xattrs);
 472
 473     sb->ops.init_inode(sb, inode);
 474
 475     inode->sb = sb;
 476     inode->ctime = clock_unixtime();
 477     inode->atime = inode->ctime;
 478     inode->mtime = inode->ctime;
 479
 480 done:
 481     lru_use_one(inode_lru, &inode->lru);
 482     return inode;
 483 }
 484
 485 void
 486 vfs_i_free(struct v_inode* inode)
 487 {
 488     if (inode->pg_cache) {
 489         pcache_release(inode->pg_cache);
 490         vfree(inode->pg_cache);
 491     }
 492     inode->ops->sync(inode);
 493     hlist_delete(&inode->hash_list);
 494     cake_release(inode_pile, inode);
 495 }
 496
 497 /* ---- System call definition and support ---- */
 498
 499 #define FLOCATE_CREATE_EMPTY 1
 500
 501 int
 502 vfs_getfd(int fd, struct v_fd** fd_s)
 503 {
 504     if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) {
 505         return 0;
 506     }
 507     return EBADF;
 508 }
 509
 510 int
 511 __vfs_try_locate_file(const char* path,
 512                       struct v_dnode** fdir,
 513                       struct v_dnode** file,
 514                       int options)
 515 {
 516     char name_str[VFS_NAME_MAXLEN];
 517     struct hstr name = HSTR(name_str, 0);
 518     int errno;
 519
 520     name_str[0] = 0;
 521     if ((errno = vfs_walk_proc(path, fdir, &name, VFS_WALK_PARENT))) {
 522         return errno;
 523     }
 524
 525     errno = vfs_walk(*fdir, name.value, file, NULL, 0);
 526     if (errno != ENOENT || !(options & FLOCATE_CREATE_EMPTY)) {
 527         return errno;
 528     }
 529
 530     struct v_dnode* parent = *fdir;
 531     struct v_dnode* file_new = vfs_d_alloc(parent, &name);
 532
 533     if (!file_new) {
 534         return ENOMEM;
 535     }
 536
 537     lock_dnode(parent);
 538
 539     if (!(errno = parent->inode->ops->create(parent->inode, file_new))) {
 540         vfs_dcache_add(parent, file_new);
 541         *file = file_new;
 542     } else {
 543         vfs_d_free(file_new);
 544     }
 545
 546     unlock_dnode(parent);
 547
 548     return errno;
 549 }
 550
 551 int
 552 vfs_do_open(const char* path, int options)
 553 {
 554     int errno, fd;
 555     struct v_dnode *dentry, *file;
 556     struct v_file* ofile = 0;
 557
 558     errno = __vfs_try_locate_file(
 559       path, &dentry, &file, (options & FO_CREATE) ? FLOCATE_CREATE_EMPTY : 0);
 560
 561     if (errno || (errno = vfs_open(file, &ofile))) {
 562         return errno;
 563     }
 564
 565     struct v_inode* o_inode = ofile->inode;
 566
 567     if (!errno && !(errno = vfs_alloc_fdslot(&fd))) {
 568         struct v_fd* fd_s = vzalloc(sizeof(*fd_s));
 569         ofile->f_pos = ofile->inode->fsize & -((options & FO_APPEND) != 0);
 570         fd_s->file = ofile;
 571         fd_s->flags = options;
 572         __current->fdtable->fds[fd] = fd_s;
 573         return fd;
 574     }
 575
 576     return errno;
 577 }
 578
 579 __DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
 580 {
 581     int errno = vfs_do_open(path, options);
 582     return DO_STATUS_OR_RETURN(errno);
 583 }
 584
 585 __DEFINE_LXSYSCALL1(int, close, int, fd)
 586 {
 587     struct v_fd* fd_s;
 588     int errno = 0;
 589     if ((errno = vfs_getfd(fd, &fd_s))) {
 590         goto done_err;
 591     }
 592
 593     if ((errno = vfs_close(fd_s->file))) {
 594         goto done_err;
 595     }
 596
 597     vfree(fd_s);
 598     __current->fdtable->fds[fd] = 0;
 599
 600 done_err:
 601     return DO_STATUS(errno);
 602 }
 603
 604 void
 605 __vfs_readdir_callback(struct dir_context* dctx,
 606                        const char* name,
 607                        const int len,
 608                        const int dtype)
 609 {
 610     struct dirent* dent = (struct dirent*)dctx->cb_data;
 611     strncpy(dent->d_name, name, DIRENT_NAME_MAX_LEN);
 612     dent->d_nlen = len;
 613     dent->d_type = dtype;
 614 }
 615
 616 __DEFINE_LXSYSCALL2(int, readdir, int, fd, struct dirent*, dent)
 617 {
 618     struct v_fd* fd_s;
 619     int errno;
 620
 621     if ((errno = vfs_getfd(fd, &fd_s))) {
 622         goto done;
 623     }
 624
 625     struct v_inode* inode = fd_s->file->inode;
 626
 627     lock_inode(inode);
 628
 629     if (!(inode->itype & VFS_IFDIR)) {
 630         errno = ENOTDIR;
 631     } else {
 632         struct dir_context dctx =
 633           (struct dir_context){ .cb_data = dent,
 634                                 .index = dent->d_offset,
 635                                 .read_complete_callback =
 636                                   __vfs_readdir_callback };
 637         errno = 1;
 638         if (dent->d_offset == 0) {
 639             __vfs_readdir_callback(&dctx, vfs_dot.value, vfs_dot.len, DT_DIR);
 640         } else if (dent->d_offset == 1) {
 641             __vfs_readdir_callback(&dctx, vfs_ddot.value, vfs_ddot.len, DT_DIR);
 642         } else {
 643             dctx.index -= 2;
 644             if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
 645                 unlock_inode(inode);
 646                 goto done;
 647             }
 648         }
 649         dent->d_offset++;
 650     }
 651
 652     unlock_inode(inode);
 653
 654 done:
 655     return DO_STATUS_OR_RETURN(errno);
 656 }
 657
 658 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 659 {
 660     int errno = 0;
 661     struct v_fd* fd_s;
 662     if ((errno = vfs_getfd(fd, &fd_s))) {
 663         goto done;
 664     }
 665
 666     struct v_file* file = fd_s->file;
 667     if ((file->inode->itype & VFS_IFDIR)) {
 668         errno = EISDIR;
 669         goto done;
 670     }
 671
 672     lock_inode(file->inode);
 673
 674     file->inode->atime = clock_unixtime();
 675
 676     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 677         errno = file->ops->read(file->inode, buf, count, file->f_pos);
 678     } else {
 679         errno = pcache_read(file->inode, buf, count, file->f_pos);
 680     }
 681
 682     if (errno > 0) {
 683         file->f_pos += errno;
 684         unlock_inode(file->inode);
 685         return errno;
 686     }
 687
 688     unlock_inode(file->inode);
 689
 690 done:
 691     return DO_STATUS(errno);
 692 }
 693
 694 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
 695 {
 696     int errno = 0;
 697     struct v_fd* fd_s;
 698     if ((errno = vfs_getfd(fd, &fd_s))) {
 699         goto done;
 700     }
 701
 702     struct v_file* file = fd_s->file;
 703
 704     if ((errno = vfs_check_writable(file->dnode))) {
 705         goto done;
 706     }
 707
 708     if ((file->inode->itype & VFS_IFDIR)) {
 709         errno = EISDIR;
 710         goto done;
 711     }
 712
 713     lock_inode(file->inode);
 714
 715     file->inode->mtime = clock_unixtime();
 716
 717     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 718         errno = file->ops->write(file->inode, buf, count, file->f_pos);
 719     } else {
 720         errno = pcache_write(file->inode, buf, count, file->f_pos);
 721     }
 722
 723     if (errno > 0) {
 724         file->f_pos += errno;
 725         unlock_inode(file->inode);
 726         return errno;
 727     }
 728
 729     unlock_inode(file->inode);
 730
 731 done:
 732     return DO_STATUS(errno);
 733 }
 734
 735 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
 736 {
 737     int errno = 0;
 738     struct v_fd* fd_s;
 739     if ((errno = vfs_getfd(fd, &fd_s))) {
 740         goto done;
 741     }
 742
 743     struct v_file* file = fd_s->file;
 744
 745     if (!file->ops->seek) {
 746         errno = ENOTSUP;
 747         goto done;
 748     }
 749
 750     lock_inode(file->inode);
 751
 752     int overflow = 0;
 753     int fpos = file->f_pos;
 754     switch (options) {
 755         case FSEEK_CUR:
 756             overflow = __builtin_sadd_overflow((int)file->f_pos, offset, &fpos);
 757             break;
 758         case FSEEK_END:
 759             overflow =
 760               __builtin_sadd_overflow((int)file->inode->fsize, offset, &fpos);
 761             break;
 762         case FSEEK_SET:
 763             fpos = offset;
 764             break;
 765     }
 766     if (overflow) {
 767         errno = EOVERFLOW;
 768     } else if (!(errno = file->ops->seek(file->inode, fpos))) {
 769         file->f_pos = fpos;
 770     }
 771
 772     unlock_inode(file->inode);
 773
 774 done:
 775     return DO_STATUS(errno);
 776 }
 777
 778 int
 779 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
 780 {
 781     if (!dnode) {
 782         return 0;
 783     }
 784
 785     if (depth > 64) {
 786         return ENAMETOOLONG;
 787     }
 788
 789     size_t len = 0;
 790
 791     if (dnode->parent != dnode) {
 792         len = vfs_get_path(dnode->parent, buf, size, depth + 1);
 793     }
 794
 795     if (len >= size) {
 796         return len;
 797     }
 798
 799     if (!len || buf[len - 1] != VFS_PATH_DELIM) {
 800         buf[len++] = VFS_PATH_DELIM;
 801     }
 802
 803     size_t cpy_size = MIN(dnode->name.len, size - len);
 804     strncpy(buf + len, dnode->name.value, cpy_size);
 805     len += cpy_size;
 806
 807     return len;
 808 }
 809
 810 int
 811 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
 812 {
 813     const char* link;
 814     struct v_inode* inode = dnode->inode;
 815     if (inode->ops->read_symlink) {
 816         lock_inode(inode);
 817
 818         int errno = inode->ops->read_symlink(inode, &link);
 819         strncpy(buf, link, size);
 820
 821         unlock_inode(inode);
 822         return errno;
 823     }
 824     return 0;
 825 }
 826
 827 int
 828 vfs_get_dtype(int itype)
 829 {
 830     switch (itype) {
 831         case VFS_IFDIR:
 832             return DT_DIR;
 833         case VFS_IFSYMLINK:
 834             return DT_SYMLINK;
 835         default:
 836             return DT_PIPE;
 837     }
 838 }
 839
 840 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
 841 {
 842     int errno;
 843     struct v_fd* fd_s;
 844     if ((errno = vfs_getfd(fd, &fd_s))) {
 845         goto done;
 846     }
 847
 848     struct v_dnode* dnode;
 849     errno = vfs_get_path(fd_s->file->dnode, buf, size, 0);
 850
 851     if (errno >= 0) {
 852         return errno;
 853     }
 854
 855 done:
 856     return DO_STATUS(errno);
 857 }
 858
 859 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
 860 {
 861     int errno;
 862     struct v_dnode* dnode;
 863     if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 864         errno = vfs_readlink(dnode, buf, size);
 865     }
 866
 867     if (errno >= 0) {
 868         return errno;
 869     }
 870
 871     return DO_STATUS(errno);
 872 }
 873
 874 __DEFINE_LXSYSCALL4(int,
 875                     readlinkat,
 876                     int,
 877                     dirfd,
 878                     const char*,
 879                     pathname,
 880                     char*,
 881                     buf,
 882                     size_t,
 883                     size)
 884 {
 885     int errno;
 886     struct v_fd* fd_s;
 887     if ((errno = vfs_getfd(dirfd, &fd_s))) {
 888         goto done;
 889     }
 890
 891     struct v_dnode* dnode;
 892     if (!(errno = vfs_walk(
 893             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 894         errno = vfs_readlink(fd_s->file->dnode, buf, size);
 895     }
 896
 897     if (errno >= 0) {
 898         return errno;
 899     }
 900
 901 done:
 902     return DO_STATUS(errno);
 903 }
 904
 905 /*
 906     NOTE
 907     When we perform operation that could affect the layout of
 908     directory (i.e., rename, mkdir, rmdir). We must lock the parent dir
 909     whenever possible. This will blocking any ongoing path walking to reach
 910     it hence avoid any partial state.
 911 */
 912
 913 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
 914 {
 915     int errno;
 916     struct v_dnode* dnode;
 917     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
 918         return DO_STATUS(errno);
 919     }
 920
 921     lock_dnode(dnode);
 922
 923     if ((errno = vfs_check_writable(dnode))) {
 924         goto done;
 925     }
 926
 927     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
 928         errno = EROFS;
 929         goto done;
 930     }
 931
 932     if (dnode->ref_count > 1 || dnode->inode->open_count) {
 933         errno = EBUSY;
 934         goto done;
 935     }
 936
 937     if (!llist_empty(&dnode->children)) {
 938         errno = ENOTEMPTY;
 939         goto done;
 940     }
 941
 942     struct v_dnode* parent = dnode->parent;
 943
 944     if (!parent) {
 945         errno = EINVAL;
 946         goto done;
 947     }
 948
 949     lock_dnode(parent);
 950     lock_inode(parent->inode);
 951
 952     if ((dnode->inode->itype & VFS_IFDIR)) {
 953         errno = parent->inode->ops->rmdir(parent->inode, dnode);
 954         if (!errno) {
 955             vfs_dcache_remove(dnode);
 956         }
 957     } else {
 958         errno = ENOTDIR;
 959     }
 960
 961     unlock_inode(parent->inode);
 962     unlock_dnode(parent);
 963
 964 done:
 965     unlock_dnode(dnode);
 966     return DO_STATUS(errno);
 967 }
 968
 969 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
 970 {
 971     int errno = 0;
 972     struct v_dnode *parent, *dir;
 973     char name_value[VFS_NAME_MAXLEN];
 974     struct hstr name = HHSTR(name_value, 0, 0);
 975
 976     if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
 977         goto done;
 978     }
 979
 980     if ((errno = vfs_check_writable(parent))) {
 981         goto done;
 982     }
 983
 984     if (!(dir = vfs_d_alloc(parent, &name))) {
 985         errno = ENOMEM;
 986         goto done;
 987     }
 988
 989     lock_dnode(parent);
 990     lock_inode(parent->inode);
 991
 992     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
 993         errno = ENOTSUP;
 994     } else if (!parent->inode->ops->mkdir) {
 995         errno = ENOTSUP;
 996     } else if (!(parent->inode->itype & VFS_IFDIR)) {
 997         errno = ENOTDIR;
 998     } else if (!(errno = parent->inode->ops->mkdir(parent->inode, dir))) {
 999         vfs_dcache_add(parent, dir);
1000         goto cleanup;
1001     }
1002
1003     vfs_d_free(dir);
1004
1005 cleanup:
1006     unlock_inode(parent->inode);
1007     unlock_dnode(parent);
1008 done:
1009     return DO_STATUS(errno);
1010 }
1011
1012 int
1013 __vfs_do_unlink(struct v_dnode* dnode)
1014 {
1015     int errno;
1016     struct v_inode* inode = dnode->inode;
1017
1018     if (dnode->ref_count > 1) {
1019         return EBUSY;
1020     }
1021
1022     if ((errno = vfs_check_writable(dnode))) {
1023         return errno;
1024     }
1025
1026     lock_inode(inode);
1027
1028     if (inode->open_count) {
1029         errno = EBUSY;
1030     } else if (!(inode->itype & VFS_IFDIR)) {
1031         // The underlying unlink implementation should handle
1032         //  symlink case
1033         errno = inode->ops->unlink(inode);
1034         if (!errno) {
1035             vfs_d_free(dnode);
1036         }
1037     } else {
1038         errno = EISDIR;
1039     }
1040
1041     unlock_inode(inode);
1042
1043     return errno;
1044 }
1045
1046 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1047 {
1048     int errno;
1049     struct v_dnode* dnode;
1050     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1051         goto done;
1052     }
1053
1054     errno = __vfs_do_unlink(dnode);
1055
1056 done:
1057     return DO_STATUS(errno);
1058 }
1059
1060 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1061 {
1062     int errno;
1063     struct v_fd* fd_s;
1064     if ((errno = vfs_getfd(fd, &fd_s))) {
1065         goto done;
1066     }
1067
1068     struct v_dnode* dnode;
1069     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1070         errno = __vfs_do_unlink(dnode);
1071     }
1072
1073 done:
1074     return DO_STATUS(errno);
1075 }
1076
1077 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1078 {
1079     int errno;
1080     struct v_dnode *dentry, *to_link, *name_dentry, *name_file;
1081
1082     errno = __vfs_try_locate_file(oldpath, &dentry, &to_link, 0);
1083     if (!errno) {
1084         errno = __vfs_try_locate_file(
1085           newpath, &name_dentry, &name_file, FLOCATE_CREATE_EMPTY);
1086         if (!errno) {
1087             errno = EEXIST;
1088         } else if (name_file) {
1089             errno = vfs_link(to_link, name_file);
1090         }
1091     }
1092     return DO_STATUS(errno);
1093 }
1094
1095 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1096 {
1097     int errno;
1098     struct v_fd* fd_s;
1099
1100     if (!(errno = vfs_getfd(fildes, &fd_s))) {
1101         errno = vfs_fsync(fd_s->file);
1102     }
1103
1104     return DO_STATUS(errno);
1105 }
1106
1107 int
1108 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1109 {
1110     int errno = 0;
1111     struct v_fd* copied = cake_grab(fd_pile);
1112
1113     memcpy(copied, old, sizeof(struct v_fd));
1114
1115     atomic_fetch_add(&old->file->ref_count, 1);
1116
1117     *new = copied;
1118
1119     return errno;
1120 }
1121
1122 int
1123 vfs_dup2(int oldfd, int newfd)
1124 {
1125     if (newfd == oldfd) {
1126         return newfd;
1127     }
1128
1129     int errno;
1130     struct v_fd *oldfd_s, *newfd_s;
1131     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1132         goto done;
1133     }
1134
1135     if (!TEST_FD(newfd)) {
1136         errno = EBADF;
1137         goto done;
1138     }
1139
1140     newfd_s = __current->fdtable->fds[newfd];
1141     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1142         goto done;
1143     }
1144
1145     if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1146         __current->fdtable->fds[newfd] = newfd_s;
1147         return newfd;
1148     }
1149
1150 done:
1151     return DO_STATUS(errno);
1152 }
1153
1154 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1155 {
1156     return vfs_dup2(oldfd, newfd);
1157 }
1158
1159 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1160 {
1161     int errno, newfd;
1162     struct v_fd *oldfd_s, *newfd_s;
1163     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1164         goto done;
1165     }
1166
1167     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1168         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1169         __current->fdtable->fds[newfd] = newfd_s;
1170         return newfd;
1171     }
1172
1173 done:
1174     return DO_STATUS(errno);
1175 }
1176
1177 __DEFINE_LXSYSCALL2(int,
1178                     symlink,
1179                     const char*,
1180                     pathname,
1181                     const char*,
1182                     link_target)
1183 {
1184     int errno;
1185     struct v_dnode* dnode;
1186     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1187         goto done;
1188     }
1189
1190     if (errno = vfs_check_writable(dnode)) {
1191         goto done;
1192     }
1193
1194     if (!dnode->inode->ops->set_symlink) {
1195         errno = ENOTSUP;
1196         goto done;
1197     }
1198
1199     lock_inode(dnode->inode);
1200
1201     errno = dnode->inode->ops->set_symlink(dnode->inode, link_target);
1202
1203     unlock_inode(dnode->inode);
1204
1205 done:
1206     return DO_STATUS(errno);
1207 }
1208
1209 void
1210 vfs_ref_dnode(struct v_dnode* dnode)
1211 {
1212     atomic_fetch_add(&dnode->ref_count, 1);
1213     mnt_mkbusy(dnode->mnt);
1214 }
1215
1216 void
1217 vfs_unref_dnode(struct v_dnode* dnode)
1218 {
1219     atomic_fetch_sub(&dnode->ref_count, 1);
1220     mnt_chillax(dnode->mnt);
1221 }
1222
1223 int
1224 vfs_do_chdir(struct proc_info* proc, struct v_dnode* dnode)
1225 {
1226     int errno = 0;
1227
1228     lock_dnode(dnode);
1229
1230     if (!(dnode->inode->itype & VFS_IFDIR)) {
1231         errno = ENOTDIR;
1232         goto done;
1233     }
1234
1235     if (proc->cwd) {
1236         vfs_unref_dnode(proc->cwd);
1237     }
1238
1239     vfs_ref_dnode(dnode);
1240     proc->cwd = dnode;
1241
1242     unlock_dnode(dnode);
1243
1244 done:
1245     return errno;
1246 }
1247
1248 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1249 {
1250     struct v_dnode* dnode;
1251     int errno = 0;
1252
1253     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1254         goto done;
1255     }
1256
1257     errno = vfs_do_chdir(__current, dnode);
1258
1259 done:
1260     return DO_STATUS(errno);
1261 }
1262
1263 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1264 {
1265     struct v_fd* fd_s;
1266     int errno = 0;
1267
1268     if ((errno = vfs_getfd(fd, &fd_s))) {
1269         goto done;
1270     }
1271
1272     errno = vfs_do_chdir(__current, fd_s->file->dnode);
1273
1274 done:
1275     return DO_STATUS(errno);
1276 }
1277
1278 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1279 {
1280     int errno = 0;
1281     char* ret_ptr = 0;
1282     if (size < 2) {
1283         errno = ERANGE;
1284         goto done;
1285     }
1286
1287     size_t len = 0;
1288
1289     if (!__current->cwd) {
1290         *buf = VFS_PATH_DELIM;
1291         len = 1;
1292     } else {
1293         len = vfs_get_path(__current->cwd, buf, size, 0);
1294         if (len == size) {
1295             errno = ERANGE;
1296             goto done;
1297         }
1298     }
1299
1300     buf[len + 1] = '\0';
1301
1302     ret_ptr = buf;
1303
1304 done:
1305     __current->k_status = errno;
1306     return ret_ptr;
1307 }
1308
1309 int
1310 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1311 {
1312     int errno = 0;
1313     if (current->inode->id == target->inode->id) {
1314         // hard link
1315         return 0;
1316     }
1317
1318     if (errno = vfs_check_writable(current)) {
1319         return errno;
1320     }
1321
1322     if (current->ref_count > 1 || target->ref_count > 1) {
1323         return EBUSY;
1324     }
1325
1326     if (current->super_block != target->super_block) {
1327         return EXDEV;
1328     }
1329
1330     struct v_dnode* oldparent = current->parent;
1331     struct v_dnode* newparent = target->parent;
1332
1333     lock_dnode(current);
1334     lock_dnode(target);
1335     if (oldparent)
1336         lock_dnode(oldparent);
1337     if (newparent)
1338         lock_dnode(newparent);
1339
1340     if (!llist_empty(&target->children)) {
1341         errno = ENOTEMPTY;
1342         unlock_dnode(target);
1343         goto cleanup;
1344     }
1345
1346     if ((errno =
1347            current->inode->ops->rename(current->inode, current, target))) {
1348         unlock_dnode(target);
1349         goto cleanup;
1350     }
1351
1352     // re-position current
1353     hstrcpy(&current->name, &target->name);
1354     vfs_dcache_rehash(newparent, current);
1355
1356     // detach target
1357     vfs_d_free(target);
1358
1359     unlock_dnode(target);
1360
1361 cleanup:
1362     unlock_dnode(current);
1363     if (oldparent)
1364         unlock_dnode(oldparent);
1365     if (newparent)
1366         unlock_dnode(newparent);
1367
1368     return errno;
1369 }
1370
1371 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1372 {
1373     struct v_dnode *cur, *target_parent, *target;
1374     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1375     int errno = 0;
1376
1377     if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1378         goto done;
1379     }
1380
1381     if ((errno = vfs_walk(
1382            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1383         goto done;
1384     }
1385
1386     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1387     if (errno == ENOENT) {
1388         target = vfs_d_alloc(target_parent, &name);
1389         vfs_dcache_add(target_parent, target);
1390     } else if (errno) {
1391         goto done;
1392     }
1393
1394     if (!target) {
1395         errno = ENOMEM;
1396         goto done;
1397     }
1398
1399     errno = vfs_do_rename(cur, target);
1400
1401 done:
1402     vfree(name.value);
1403     return DO_STATUS(errno);
1404 }