lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
   4  * @brief Lunaix virtual file system - an abstraction layer for all file system.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
  14 /*
  15  TODO vfs & device todos checklist
  16
  17     It is overseen by Twilight Sparkle ;)
  18
  19  1. Get inodes hooked into lru (CHECKED)
  20  2. Get dnodes hooked into lru (CHECKED)
  21  3. Get inodes properly hashed so they can be reused by underlying fs (CHECKED)
  22  4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
  23         [good idea] or a constructor/destructor pattern in cake allocator ?
  24  5. (mount) Figure out a way to identify a busy mount point before unmount
  25             maybe a unified mount_point structure that maintains a reference
  26             counter on any dnodes within the subtree? Such a counter will only
  27             increment if a file is opened or a dnode is being used as working
  28             directory, and decrement conversely. (CHECKED)
  29  6. (mount) Ability to track all mount points (including sub-mounts)
  30             so we can be confident to clean up everything when we
  31             unmount. (CHECKED)
  32  7. (mount) Figure out a way to acquire the device represented by a dnode.
  33             so it can be used to mount. (e.g. we wish to get `struct device*`
  34             out of the dnode at /dev/sda)
  35             [tip] we should pay attention at twifs and add a private_data field
  36             under struct v_dnode? (CHECKED)
  37  8. (mount) Then, we should refactor on mount/unmount mechanism. (CHECKED)
  38  9. (mount) (future) Ability to mount any thing? e.g. Linux can mount a disk
  39                     image file using a so called "loopback" pseudo device. Maybe
  40                     we can do similar thing in Lunaix? A block device emulation
  41                     above the regular file when we mount it on.
  42  10. (device) device number (dev_t) allocation
  43             [good idea] <class>:<subclass>:<uniq_id> composition
  44 */
  45
  46 #include <klibc/string.h>
  47 #include <lunaix/dirent.h>
  48 #include <lunaix/foptions.h>
  49 #include <lunaix/fs.h>
  50 #include <lunaix/mm/cake.h>
  51 #include <lunaix/mm/page.h>
  52 #include <lunaix/mm/valloc.h>
  53 #include <lunaix/process.h>
  54 #include <lunaix/spike.h>
  55 #include <lunaix/syscall.h>
  56
  57 #include <lunaix/fs/twifs.h>
  58
  59 static struct cake_pile* dnode_pile;
  60 static struct cake_pile* inode_pile;
  61 static struct cake_pile* file_pile;
  62 static struct cake_pile* superblock_pile;
  63 static struct cake_pile* fd_pile;
  64
  65 struct v_dnode* vfs_sysroot;
  66 static struct hbucket* dnode_cache;
  67
  68 struct lru_zone *dnode_lru, *inode_lru;
  69
  70 struct hstr vfs_ddot = HSTR("..", 2);
  71 struct hstr vfs_dot = HSTR(".", 1);
  72 struct hstr vfs_empty = HSTR("", 0);
  73
  74 struct v_superblock*
  75 vfs_sb_alloc();
  76
  77 void
  78 vfs_sb_free(struct v_superblock* sb);
  79
  80 static int
  81 __vfs_try_evict_dnode(struct lru_node* obj);
  82
  83 static int
  84 __vfs_try_evict_inode(struct lru_node* obj);
  85
  86 void
  87 vfs_init()
  88 {
  89     // 为他们专门创建一个蛋糕堆，而不使用valloc，这样我们可以最小化内碎片的产生
  90     dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
  91     inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
  92     file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
  93     fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
  94     superblock_pile =
  95       cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
  96
  97     dnode_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
  98
  99     dnode_lru = lru_new_zone(__vfs_try_evict_dnode);
 100     inode_lru = lru_new_zone(__vfs_try_evict_inode);
 101
 102     hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
 103     hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
 104
 105     // 创建一个根dnode。
 106     vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
 107     vfs_sysroot->parent = vfs_sysroot;
 108     atomic_fetch_add(&vfs_sysroot->ref_count, 1);
 109 }
 110
 111 inline struct hbucket*
 112 __dcache_hash(struct v_dnode* parent, uint32_t* hash)
 113 {
 114     uint32_t _hash = *hash;
 115     // 确保低位更加随机
 116     _hash = _hash ^ (_hash >> VFS_HASHBITS);
 117     // 与parent的指针值做加法，来减小碰撞的可能性。
 118     _hash += (uint32_t)parent;
 119     *hash = _hash;
 120     return &dnode_cache[_hash & VFS_HASH_MASK];
 121 }
 122
 123 struct v_dnode*
 124 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 125 {
 126     if (!str->len || HSTR_EQ(str, &vfs_dot))
 127         return parent;
 128
 129     if (HSTR_EQ(str, &vfs_ddot)) {
 130         return parent->parent;
 131     }
 132
 133     uint32_t hash = str->hash;
 134     struct hbucket* slot = __dcache_hash(parent, &hash);
 135
 136     struct v_dnode *pos, *n;
 137     hashtable_bucket_foreach(slot, pos, n, hash_list)
 138     {
 139         if (pos->name.hash == hash) {
 140             return pos;
 141         }
 142     }
 143     return NULL;
 144 }
 145
 146 void
 147 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 148 {
 149     assert(parent);
 150
 151     atomic_fetch_add(&dnode->ref_count, 1);
 152     dnode->parent = parent;
 153     llist_append(&parent->children, &dnode->siblings);
 154
 155     struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash);
 156     hlist_add(&bucket->head, &dnode->hash_list);
 157 }
 158
 159 void
 160 vfs_dcache_remove(struct v_dnode* dnode)
 161 {
 162     assert(dnode);
 163     assert(dnode->ref_count == 1);
 164
 165     llist_delete(&dnode->siblings);
 166     llist_delete(&dnode->aka_list);
 167     hlist_delete(&dnode->hash_list);
 168
 169     dnode->parent = NULL;
 170     atomic_fetch_sub(&dnode->ref_count, 1);
 171 }
 172
 173 void
 174 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
 175 {
 176     assert(new_parent);
 177
 178     hstr_rehash(&dnode->name, HSTR_FULL_HASH);
 179     vfs_dcache_remove(dnode);
 180     vfs_dcache_add(new_parent, dnode);
 181 }
 182
 183 int
 184 vfs_open(struct v_dnode* dnode, struct v_file** file)
 185 {
 186     if (!dnode->inode || !dnode->inode->ops->open) {
 187         return ENOTSUP;
 188     }
 189
 190     struct v_inode* inode = dnode->inode;
 191
 192     lock_inode(inode);
 193
 194     struct v_file* vfile = cake_grab(file_pile);
 195     memset(vfile, 0, sizeof(*vfile));
 196
 197     vfile->dnode = dnode;
 198     vfile->inode = inode;
 199     vfile->ref_count = ATOMIC_VAR_INIT(1);
 200     vfile->ops = inode->default_fops;
 201
 202     if ((inode->itype & VFS_IFFILE) && !inode->pg_cache) {
 203         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 204         pcache_init(pcache);
 205         pcache->master = inode;
 206         inode->pg_cache = pcache;
 207     }
 208
 209     int errno = inode->ops->open(inode, vfile);
 210     if (errno) {
 211         cake_release(file_pile, vfile);
 212     } else {
 213         atomic_fetch_add(&dnode->ref_count, 1);
 214         inode->open_count++;
 215         mnt_mkbusy(dnode->mnt);
 216
 217         *file = vfile;
 218     }
 219
 220     unlock_inode(inode);
 221
 222     return errno;
 223 }
 224
 225 void
 226 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
 227 {
 228     if (assign_to->inode) {
 229         llist_delete(&assign_to->aka_list);
 230         assign_to->inode->link_count--;
 231     }
 232     llist_append(&inode->aka_dnodes, &assign_to->aka_list);
 233     assign_to->inode = inode;
 234     inode->link_count++;
 235 }
 236
 237 int
 238 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 239 {
 240     int errno;
 241
 242     if ((errno = vfs_check_writable(to_link))) {
 243         return errno;
 244     }
 245
 246     lock_inode(to_link->inode);
 247     if (to_link->super_block->root != name->super_block->root) {
 248         errno = EXDEV;
 249     } else if (!to_link->inode->ops->link) {
 250         errno = ENOTSUP;
 251     } else if (!(errno = to_link->inode->ops->link(to_link->inode, name))) {
 252         vfs_assign_inode(name, to_link->inode);
 253     }
 254     unlock_inode(to_link->inode);
 255
 256     return errno;
 257 }
 258
 259 int
 260 vfs_pclose(struct v_file* file, pid_t pid)
 261 {
 262     int errno = 0;
 263     if (file->ref_count > 1) {
 264         atomic_fetch_sub(&file->ref_count, 1);
 265     } else if (!(errno = file->ops->close(file))) {
 266         atomic_fetch_sub(&file->dnode->ref_count, 1);
 267         file->inode->open_count--;
 268
 269         // Prevent dead lock.
 270         // This happened when process is terminated while blocking on read.
 271         // In that case, the process is still holding the inode lock and it will
 272         // never get released.
 273         /*
 274          * The unlocking should also include ownership check.
 275          *
 276          * To see why, consider two process both open the same file both with
 277          * fd=x.
 278          *      Process A: busy on reading x
 279          *      Process B: do nothing with x
 280          * Assuming that, after a very short time, process B get terminated
 281          * while process A is still busy in it's reading business. By this
 282          * design, the inode lock of this file x is get released by B rather
 283          * than A. And this will cause a probable race condition on A if other
 284          * process is writing to this file later after B exit.
 285          */
 286         if (mutex_on_hold(&file->inode->lock)) {
 287             mutex_unlock_for(&file->inode->lock, pid);
 288         }
 289         mnt_chillax(file->dnode->mnt);
 290
 291         pcache_commit_all(file->inode);
 292         cake_release(file_pile, file);
 293     }
 294     return errno;
 295 }
 296
 297 int
 298 vfs_close(struct v_file* file)
 299 {
 300     return vfs_pclose(file, __current->pid);
 301 }
 302
 303 void
 304 vfs_free_fd(struct v_fd* fd)
 305 {
 306     cake_release(fd_pile, fd);
 307 }
 308
 309 int
 310 vfs_fsync(struct v_file* file)
 311 {
 312     int errno;
 313     if ((errno = vfs_check_writable(file->dnode))) {
 314         return errno;
 315     }
 316
 317     lock_inode(file->inode);
 318
 319     pcache_commit_all(file->inode);
 320
 321     errno = ENOTSUP;
 322     if (file->ops->sync) {
 323         errno = file->ops->sync(file);
 324     }
 325
 326     unlock_inode(file->inode);
 327
 328     return errno;
 329 }
 330
 331 int
 332 vfs_alloc_fdslot(int* fd)
 333 {
 334     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 335         if (!__current->fdtable->fds[i]) {
 336             *fd = i;
 337             return 0;
 338         }
 339     }
 340     return EMFILE;
 341 }
 342
 343 struct v_superblock*
 344 vfs_sb_alloc()
 345 {
 346     struct v_superblock* sb = cake_grab(superblock_pile);
 347     memset(sb, 0, sizeof(*sb));
 348     llist_init_head(&sb->sb_list);
 349     sb->i_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 350     return sb;
 351 }
 352
 353 void
 354 vfs_sb_free(struct v_superblock* sb)
 355 {
 356     vfree(sb->i_cache);
 357     cake_release(superblock_pile, sb);
 358 }
 359
 360 static int
 361 __vfs_try_evict_dnode(struct lru_node* obj)
 362 {
 363     struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
 364
 365     if (!dnode->ref_count) {
 366         vfs_d_free(dnode);
 367         return 1;
 368     }
 369     return 0;
 370 }
 371
 372 static int
 373 __vfs_try_evict_inode(struct lru_node* obj)
 374 {
 375     struct v_inode* inode = container_of(obj, struct v_inode, lru);
 376
 377     if (!inode->link_count && !inode->open_count) {
 378         vfs_i_free(inode);
 379         return 1;
 380     }
 381     return 0;
 382 }
 383
 384 struct v_dnode*
 385 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
 386 {
 387     struct v_dnode* dnode = cake_grab(dnode_pile);
 388     if (!dnode) {
 389         lru_evict_half(dnode_lru);
 390
 391         if (!(dnode = cake_grab(dnode_pile))) {
 392             return NULL;
 393         }
 394     }
 395
 396     memset(dnode, 0, sizeof(*dnode));
 397     llist_init_head(&dnode->children);
 398     llist_init_head(&dnode->siblings);
 399     llist_init_head(&dnode->aka_list);
 400     mutex_init(&dnode->lock);
 401
 402     dnode->ref_count = ATOMIC_VAR_INIT(0);
 403     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 404
 405     hstrcpy(&dnode->name, name);
 406
 407     if (parent) {
 408         dnode->super_block = parent->super_block;
 409         dnode->mnt = parent->mnt;
 410     }
 411
 412     lru_use_one(dnode_lru, &dnode->lru);
 413
 414     return dnode;
 415 }
 416
 417 void
 418 vfs_d_free(struct v_dnode* dnode)
 419 {
 420     assert(dnode->ref_count == 1);
 421
 422     if (dnode->inode) {
 423         assert(dnode->inode->link_count > 0);
 424         dnode->inode->link_count--;
 425     }
 426
 427     vfs_dcache_remove(dnode);
 428     // Make sure the children de-referencing their parent.
 429     // With lru presented, the eviction will be propagated over the entire
 430     // detached subtree eventually
 431     struct v_dnode *pos, *n;
 432     llist_for_each(pos, n, &dnode->children, siblings)
 433     {
 434         vfs_dcache_remove(pos);
 435     }
 436
 437     vfree(dnode->name.value);
 438     cake_release(dnode_pile, dnode);
 439 }
 440
 441 struct v_inode*
 442 vfs_i_find(struct v_superblock* sb, uint32_t i_id)
 443 {
 444     struct hbucket* slot = &sb->i_cache[i_id & VFS_HASH_MASK];
 445     struct v_inode *pos, *n;
 446     hashtable_bucket_foreach(slot, pos, n, hash_list)
 447     {
 448         if (pos->id == i_id) {
 449             lru_use_one(inode_lru, &pos->lru);
 450             return pos;
 451         }
 452     }
 453
 454     return NULL;
 455 }
 456
 457 void
 458 vfs_i_addhash(struct v_inode* inode)
 459 {
 460     struct hbucket* slot = &inode->sb->i_cache[inode->id & VFS_HASH_MASK];
 461
 462     hlist_delete(&inode->hash_list);
 463     hlist_add(&slot->head, &inode->hash_list);
 464 }
 465
 466 struct v_inode*
 467 vfs_i_alloc(struct v_superblock* sb)
 468 {
 469     assert(sb->ops.init_inode);
 470
 471     struct v_inode* inode;
 472     if (!(inode = cake_grab(inode_pile))) {
 473         lru_evict_half(inode_lru);
 474         if (!(inode = cake_grab(inode_pile))) {
 475             return NULL;
 476         }
 477     }
 478
 479     memset(inode, 0, sizeof(*inode));
 480     mutex_init(&inode->lock);
 481     llist_init_head(&inode->xattrs);
 482     llist_init_head(&inode->aka_dnodes);
 483
 484     sb->ops.init_inode(sb, inode);
 485
 486     inode->sb = sb;
 487     inode->ctime = clock_unixtime();
 488     inode->atime = inode->ctime;
 489     inode->mtime = inode->ctime;
 490
 491 done:
 492     lru_use_one(inode_lru, &inode->lru);
 493     return inode;
 494 }
 495
 496 void
 497 vfs_i_free(struct v_inode* inode)
 498 {
 499     if (inode->pg_cache) {
 500         pcache_release(inode->pg_cache);
 501         vfree(inode->pg_cache);
 502     }
 503     inode->ops->sync(inode);
 504     hlist_delete(&inode->hash_list);
 505     cake_release(inode_pile, inode);
 506 }
 507
 508 /* ---- System call definition and support ---- */
 509
 510 #define FLOCATE_CREATE_EMPTY 1
 511
 512 int
 513 vfs_getfd(int fd, struct v_fd** fd_s)
 514 {
 515     if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) {
 516         return 0;
 517     }
 518     return EBADF;
 519 }
 520
 521 int
 522 __vfs_try_locate_file(const char* path,
 523                       struct v_dnode** fdir,
 524                       struct v_dnode** file,
 525                       int options)
 526 {
 527     char name_str[VFS_NAME_MAXLEN];
 528     struct hstr name = HSTR(name_str, 0);
 529     int errno;
 530
 531     name_str[0] = 0;
 532     if ((errno = vfs_walk_proc(path, fdir, &name, VFS_WALK_PARENT))) {
 533         return errno;
 534     }
 535
 536     errno = vfs_walk(*fdir, name.value, file, NULL, 0);
 537     if (errno != ENOENT || !(options & FLOCATE_CREATE_EMPTY)) {
 538         return errno;
 539     }
 540
 541     struct v_dnode* parent = *fdir;
 542     struct v_dnode* file_new = vfs_d_alloc(parent, &name);
 543
 544     if (!file_new) {
 545         return ENOMEM;
 546     }
 547
 548     lock_dnode(parent);
 549
 550     if (!(errno = parent->inode->ops->create(parent->inode, file_new))) {
 551         vfs_dcache_add(parent, file_new);
 552         *file = file_new;
 553     } else {
 554         vfs_d_free(file_new);
 555     }
 556
 557     unlock_dnode(parent);
 558
 559     return errno;
 560 }
 561
 562 int
 563 vfs_do_open(const char* path, int options)
 564 {
 565     int errno, fd;
 566     struct v_dnode *dentry, *file;
 567     struct v_file* ofile = 0;
 568
 569     errno = __vfs_try_locate_file(
 570       path, &dentry, &file, (options & FO_CREATE) ? FLOCATE_CREATE_EMPTY : 0);
 571
 572     if (errno || (errno = vfs_open(file, &ofile))) {
 573         return errno;
 574     }
 575
 576     struct v_inode* o_inode = ofile->inode;
 577
 578     if (!errno && !(errno = vfs_alloc_fdslot(&fd))) {
 579         struct v_fd* fd_s = cake_grab(fd_pile);
 580         memset(fd_s, 0, sizeof(*fd_s));
 581
 582         ofile->f_pos = ofile->inode->fsize & -((options & FO_APPEND) != 0);
 583         fd_s->file = ofile;
 584         fd_s->flags = options;
 585         __current->fdtable->fds[fd] = fd_s;
 586         return fd;
 587     }
 588
 589     return errno;
 590 }
 591
 592 __DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
 593 {
 594     int errno = vfs_do_open(path, options);
 595     return DO_STATUS_OR_RETURN(errno);
 596 }
 597
 598 __DEFINE_LXSYSCALL1(int, close, int, fd)
 599 {
 600     struct v_fd* fd_s;
 601     int errno = 0;
 602     if ((errno = vfs_getfd(fd, &fd_s))) {
 603         goto done_err;
 604     }
 605
 606     if ((errno = vfs_close(fd_s->file))) {
 607         goto done_err;
 608     }
 609
 610     cake_release(fd_pile, fd_s);
 611     __current->fdtable->fds[fd] = 0;
 612
 613 done_err:
 614     return DO_STATUS(errno);
 615 }
 616
 617 void
 618 __vfs_readdir_callback(struct dir_context* dctx,
 619                        const char* name,
 620                        const int len,
 621                        const int dtype)
 622 {
 623     struct dirent* dent = (struct dirent*)dctx->cb_data;
 624     strncpy(dent->d_name, name, DIRENT_NAME_MAX_LEN);
 625     dent->d_nlen = len;
 626     dent->d_type = dtype;
 627 }
 628
 629 __DEFINE_LXSYSCALL2(int, readdir, int, fd, struct dirent*, dent)
 630 {
 631     struct v_fd* fd_s;
 632     int errno;
 633
 634     if ((errno = vfs_getfd(fd, &fd_s))) {
 635         goto done;
 636     }
 637
 638     struct v_inode* inode = fd_s->file->inode;
 639
 640     lock_inode(inode);
 641
 642     if (!(inode->itype & VFS_IFDIR)) {
 643         errno = ENOTDIR;
 644     } else {
 645         struct dir_context dctx =
 646           (struct dir_context){ .cb_data = dent,
 647                                 .index = dent->d_offset,
 648                                 .read_complete_callback =
 649                                   __vfs_readdir_callback };
 650         errno = 1;
 651         if (dent->d_offset == 0) {
 652             __vfs_readdir_callback(&dctx, vfs_dot.value, vfs_dot.len, DT_DIR);
 653         } else if (dent->d_offset == 1) {
 654             __vfs_readdir_callback(&dctx, vfs_ddot.value, vfs_ddot.len, DT_DIR);
 655         } else {
 656             dctx.index -= 2;
 657             if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
 658                 unlock_inode(inode);
 659                 goto done;
 660             }
 661         }
 662         dent->d_offset++;
 663     }
 664
 665     unlock_inode(inode);
 666
 667 done:
 668     return DO_STATUS_OR_RETURN(errno);
 669 }
 670
 671 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 672 {
 673     int errno = 0;
 674     struct v_fd* fd_s;
 675     if ((errno = vfs_getfd(fd, &fd_s))) {
 676         goto done;
 677     }
 678
 679     struct v_file* file = fd_s->file;
 680     if ((file->inode->itype & VFS_IFDIR)) {
 681         errno = EISDIR;
 682         goto done;
 683     }
 684
 685     lock_inode(file->inode);
 686
 687     file->inode->atime = clock_unixtime();
 688
 689     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 690         errno = file->ops->read(file->inode, buf, count, file->f_pos);
 691     } else {
 692         errno = pcache_read(file->inode, buf, count, file->f_pos);
 693     }
 694
 695     if (errno > 0) {
 696         file->f_pos += errno;
 697         unlock_inode(file->inode);
 698         return errno;
 699     }
 700
 701     unlock_inode(file->inode);
 702
 703 done:
 704     return DO_STATUS(errno);
 705 }
 706
 707 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
 708 {
 709     int errno = 0;
 710     struct v_fd* fd_s;
 711     if ((errno = vfs_getfd(fd, &fd_s))) {
 712         goto done;
 713     }
 714
 715     struct v_file* file = fd_s->file;
 716
 717     if ((errno = vfs_check_writable(file->dnode))) {
 718         goto done;
 719     }
 720
 721     if ((file->inode->itype & VFS_IFDIR)) {
 722         errno = EISDIR;
 723         goto done;
 724     }
 725
 726     lock_inode(file->inode);
 727
 728     file->inode->mtime = clock_unixtime();
 729
 730     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 731         errno = file->ops->write(file->inode, buf, count, file->f_pos);
 732     } else {
 733         errno = pcache_write(file->inode, buf, count, file->f_pos);
 734     }
 735
 736     if (errno > 0) {
 737         file->f_pos += errno;
 738         unlock_inode(file->inode);
 739         return errno;
 740     }
 741
 742     unlock_inode(file->inode);
 743
 744 done:
 745     return DO_STATUS(errno);
 746 }
 747
 748 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
 749 {
 750     int errno = 0;
 751     struct v_fd* fd_s;
 752     if ((errno = vfs_getfd(fd, &fd_s))) {
 753         goto done;
 754     }
 755
 756     struct v_file* file = fd_s->file;
 757
 758     if (!file->ops->seek) {
 759         errno = ENOTSUP;
 760         goto done;
 761     }
 762
 763     lock_inode(file->inode);
 764
 765     int overflow = 0;
 766     int fpos = file->f_pos;
 767     switch (options) {
 768         case FSEEK_CUR:
 769             overflow = __builtin_sadd_overflow((int)file->f_pos, offset, &fpos);
 770             break;
 771         case FSEEK_END:
 772             overflow =
 773               __builtin_sadd_overflow((int)file->inode->fsize, offset, &fpos);
 774             break;
 775         case FSEEK_SET:
 776             fpos = offset;
 777             break;
 778     }
 779     if (overflow) {
 780         errno = EOVERFLOW;
 781     } else if (!(errno = file->ops->seek(file->inode, fpos))) {
 782         file->f_pos = fpos;
 783     }
 784
 785     unlock_inode(file->inode);
 786
 787 done:
 788     return DO_STATUS(errno);
 789 }
 790
 791 int
 792 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
 793 {
 794     if (!dnode) {
 795         return 0;
 796     }
 797
 798     if (depth > 64) {
 799         return ENAMETOOLONG;
 800     }
 801
 802     size_t len = 0;
 803
 804     if (dnode->parent != dnode) {
 805         len = vfs_get_path(dnode->parent, buf, size, depth + 1);
 806     }
 807
 808     if (len >= size) {
 809         return len;
 810     }
 811
 812     if (!len || buf[len - 1] != VFS_PATH_DELIM) {
 813         buf[len++] = VFS_PATH_DELIM;
 814     }
 815
 816     size_t cpy_size = MIN(dnode->name.len, size - len);
 817     strncpy(buf + len, dnode->name.value, cpy_size);
 818     len += cpy_size;
 819
 820     return len;
 821 }
 822
 823 int
 824 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
 825 {
 826     const char* link;
 827     struct v_inode* inode = dnode->inode;
 828     if (inode->ops->read_symlink) {
 829         lock_inode(inode);
 830
 831         int errno = inode->ops->read_symlink(inode, &link);
 832         strncpy(buf, link, size);
 833
 834         unlock_inode(inode);
 835         return errno;
 836     }
 837     return 0;
 838 }
 839
 840 int
 841 vfs_get_dtype(int itype)
 842 {
 843     switch (itype) {
 844         case VFS_IFDIR:
 845             return DT_DIR;
 846         case VFS_IFSYMLINK:
 847             return DT_SYMLINK;
 848         default:
 849             return DT_PIPE;
 850     }
 851 }
 852
 853 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
 854 {
 855     int errno;
 856     struct v_fd* fd_s;
 857     if ((errno = vfs_getfd(fd, &fd_s))) {
 858         goto done;
 859     }
 860
 861     struct v_dnode* dnode;
 862     errno = vfs_get_path(fd_s->file->dnode, buf, size, 0);
 863
 864     if (errno >= 0) {
 865         return errno;
 866     }
 867
 868 done:
 869     return DO_STATUS(errno);
 870 }
 871
 872 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
 873 {
 874     int errno;
 875     struct v_dnode* dnode;
 876     if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 877         errno = vfs_readlink(dnode, buf, size);
 878     }
 879
 880     if (errno >= 0) {
 881         return errno;
 882     }
 883
 884     return DO_STATUS(errno);
 885 }
 886
 887 __DEFINE_LXSYSCALL4(int,
 888                     readlinkat,
 889                     int,
 890                     dirfd,
 891                     const char*,
 892                     pathname,
 893                     char*,
 894                     buf,
 895                     size_t,
 896                     size)
 897 {
 898     int errno;
 899     struct v_fd* fd_s;
 900     if ((errno = vfs_getfd(dirfd, &fd_s))) {
 901         goto done;
 902     }
 903
 904     struct v_dnode* dnode;
 905     if (!(errno = vfs_walk(
 906             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 907         errno = vfs_readlink(fd_s->file->dnode, buf, size);
 908     }
 909
 910     if (errno >= 0) {
 911         return errno;
 912     }
 913
 914 done:
 915     return DO_STATUS(errno);
 916 }
 917
 918 /*
 919     NOTE
 920     When we perform an operation that could affect the layout of
 921     a directory (e.g., rename, mkdir, rmdir), we must lock the parent dir
 922     whenever possible. This blocks any ongoing path walk from reaching
 923     it and hence avoids exposing any partial state.
 924 */
 925
 926 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
 927 {
 928     int errno;
 929     struct v_dnode* dnode;
 930     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
 931         return DO_STATUS(errno);
 932     }
 933
 934     lock_dnode(dnode);
 935
 936     if ((errno = vfs_check_writable(dnode))) {
 937         goto done;
 938     }
 939
 940     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
 941         errno = EROFS;
 942         goto done;
 943     }
 944
 945     if (dnode->ref_count > 1 || dnode->inode->open_count) {
 946         errno = EBUSY;
 947         goto done;
 948     }
 949
 950     if (!llist_empty(&dnode->children)) {
 951         errno = ENOTEMPTY;
 952         goto done;
 953     }
 954
 955     struct v_dnode* parent = dnode->parent;
 956
 957     if (!parent) {
 958         errno = EINVAL;
 959         goto done;
 960     }
 961
 962     lock_dnode(parent);
 963     lock_inode(parent->inode);
 964
 965     if ((dnode->inode->itype & VFS_IFDIR)) {
 966         errno = parent->inode->ops->rmdir(parent->inode, dnode);
 967         if (!errno) {
 968             vfs_dcache_remove(dnode);
 969         }
 970     } else {
 971         errno = ENOTDIR;
 972     }
 973
 974     unlock_inode(parent->inode);
 975     unlock_dnode(parent);
 976
 977 done:
 978     unlock_dnode(dnode);
 979     return DO_STATUS(errno);
 980 }
 981
 982 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
 983 {
 984     int errno = 0;
 985     struct v_dnode *parent, *dir;
 986     char name_value[VFS_NAME_MAXLEN];
 987     struct hstr name = HHSTR(name_value, 0, 0);
 988
 989     if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
 990         goto done;
 991     }
 992
 993     if ((errno = vfs_check_writable(parent))) {
 994         goto done;
 995     }
 996
 997     if (!(dir = vfs_d_alloc(parent, &name))) {
 998         errno = ENOMEM;
 999         goto done;
1000     }
1001
1002     lock_dnode(parent);
1003     lock_inode(parent->inode);
1004
1005     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
1006         errno = ENOTSUP;
1007     } else if (!parent->inode->ops->mkdir) {
1008         errno = ENOTSUP;
1009     } else if (!(parent->inode->itype & VFS_IFDIR)) {
1010         errno = ENOTDIR;
1011     } else if (!(errno = parent->inode->ops->mkdir(parent->inode, dir))) {
1012         vfs_dcache_add(parent, dir);
1013         goto cleanup;
1014     }
1015
1016     vfs_d_free(dir);
1017
1018 cleanup:
1019     unlock_inode(parent->inode);
1020     unlock_dnode(parent);
1021 done:
1022     return DO_STATUS(errno);
1023 }
1024
1025 int
1026 __vfs_do_unlink(struct v_dnode* dnode)
1027 {
1028     int errno;
1029     struct v_inode* inode = dnode->inode;
1030
1031     if (dnode->ref_count > 1) {
1032         return EBUSY;
1033     }
1034
1035     if ((errno = vfs_check_writable(dnode))) {
1036         return errno;
1037     }
1038
1039     lock_inode(inode);
1040
1041     if (inode->open_count) {
1042         errno = EBUSY;
1043     } else if (!(inode->itype & VFS_IFDIR)) {
1044         // The underlying unlink implementation should handle
1045         //  symlink case
1046         errno = inode->ops->unlink(inode);
1047         if (!errno) {
1048             vfs_d_free(dnode);
1049         }
1050     } else {
1051         errno = EISDIR;
1052     }
1053
1054     unlock_inode(inode);
1055
1056     return errno;
1057 }
1058
1059 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1060 {
1061     int errno;
1062     struct v_dnode* dnode;
1063     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1064         goto done;
1065     }
1066
1067     errno = __vfs_do_unlink(dnode);
1068
1069 done:
1070     return DO_STATUS(errno);
1071 }
1072
1073 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1074 {
1075     int errno;
1076     struct v_fd* fd_s;
1077     if ((errno = vfs_getfd(fd, &fd_s))) {
1078         goto done;
1079     }
1080
1081     struct v_dnode* dnode;
1082     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1083         errno = __vfs_do_unlink(dnode);
1084     }
1085
1086 done:
1087     return DO_STATUS(errno);
1088 }
1089
1090 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1091 {
1092     int errno;
1093     struct v_dnode *dentry, *to_link, *name_dentry, *name_file;
1094
1095     errno = __vfs_try_locate_file(oldpath, &dentry, &to_link, 0);
1096     if (!errno) {
1097         errno = __vfs_try_locate_file(
1098           newpath, &name_dentry, &name_file, FLOCATE_CREATE_EMPTY);
1099         if (!errno) {
1100             errno = EEXIST;
1101         } else if (name_file) {
1102             errno = vfs_link(to_link, name_file);
1103         }
1104     }
1105     return DO_STATUS(errno);
1106 }
1107
1108 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1109 {
1110     int errno;
1111     struct v_fd* fd_s;
1112
1113     if (!(errno = vfs_getfd(fildes, &fd_s))) {
1114         errno = vfs_fsync(fd_s->file);
1115     }
1116
1117     return DO_STATUS(errno);
1118 }
1119
1120 int
1121 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1122 {
1123     int errno = 0;
1124     struct v_fd* copied = cake_grab(fd_pile);
1125
1126     memcpy(copied, old, sizeof(struct v_fd));
1127
1128     atomic_fetch_add(&old->file->ref_count, 1);
1129
1130     *new = copied;
1131
1132     return errno;
1133 }
1134
1135 int
1136 vfs_dup2(int oldfd, int newfd)
1137 {
1138     if (newfd == oldfd) {
1139         return newfd;
1140     }
1141
1142     int errno;
1143     struct v_fd *oldfd_s, *newfd_s;
1144     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1145         goto done;
1146     }
1147
1148     if (!TEST_FD(newfd)) {
1149         errno = EBADF;
1150         goto done;
1151     }
1152
1153     newfd_s = __current->fdtable->fds[newfd];
1154     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1155         goto done;
1156     }
1157
1158     if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1159         __current->fdtable->fds[newfd] = newfd_s;
1160         return newfd;
1161     }
1162
1163 done:
1164     return DO_STATUS(errno);
1165 }
1166
1167 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1168 {
1169     return vfs_dup2(oldfd, newfd);
1170 }
1171
1172 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1173 {
1174     int errno, newfd;
1175     struct v_fd *oldfd_s, *newfd_s;
1176     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1177         goto done;
1178     }
1179
1180     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1181         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1182         __current->fdtable->fds[newfd] = newfd_s;
1183         return newfd;
1184     }
1185
1186 done:
1187     return DO_STATUS(errno);
1188 }
1189
1190 __DEFINE_LXSYSCALL2(int,
1191                     symlink,
1192                     const char*,
1193                     pathname,
1194                     const char*,
1195                     link_target)
1196 {
1197     int errno;
1198     struct v_dnode* dnode;
1199     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1200         goto done;
1201     }
1202
1203     if (errno = vfs_check_writable(dnode)) {
1204         goto done;
1205     }
1206
1207     if (!dnode->inode->ops->set_symlink) {
1208         errno = ENOTSUP;
1209         goto done;
1210     }
1211
1212     lock_inode(dnode->inode);
1213
1214     errno = dnode->inode->ops->set_symlink(dnode->inode, link_target);
1215
1216     unlock_inode(dnode->inode);
1217
1218 done:
1219     return DO_STATUS(errno);
1220 }
1221
1222 void
1223 vfs_ref_dnode(struct v_dnode* dnode)
1224 {
1225     atomic_fetch_add(&dnode->ref_count, 1);
1226     mnt_mkbusy(dnode->mnt);
1227 }
1228
1229 void
1230 vfs_unref_dnode(struct v_dnode* dnode)
1231 {
1232     atomic_fetch_sub(&dnode->ref_count, 1);
1233     mnt_chillax(dnode->mnt);
1234 }
1235
1236 int
1237 vfs_do_chdir(struct proc_info* proc, struct v_dnode* dnode)
1238 {
1239     int errno = 0;
1240
1241     lock_dnode(dnode);
1242
1243     if (!(dnode->inode->itype & VFS_IFDIR)) {
1244         errno = ENOTDIR;
1245         goto done;
1246     }
1247
1248     if (proc->cwd) {
1249         vfs_unref_dnode(proc->cwd);
1250     }
1251
1252     vfs_ref_dnode(dnode);
1253     proc->cwd = dnode;
1254
1255     unlock_dnode(dnode);
1256
1257 done:
1258     return errno;
1259 }
1260
1261 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1262 {
1263     struct v_dnode* dnode;
1264     int errno = 0;
1265
1266     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1267         goto done;
1268     }
1269
1270     errno = vfs_do_chdir(__current, dnode);
1271
1272 done:
1273     return DO_STATUS(errno);
1274 }
1275
1276 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1277 {
1278     struct v_fd* fd_s;
1279     int errno = 0;
1280
1281     if ((errno = vfs_getfd(fd, &fd_s))) {
1282         goto done;
1283     }
1284
1285     errno = vfs_do_chdir(__current, fd_s->file->dnode);
1286
1287 done:
1288     return DO_STATUS(errno);
1289 }
1290
1291 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1292 {
1293     int errno = 0;
1294     char* ret_ptr = 0;
1295     if (size < 2) {
1296         errno = ERANGE;
1297         goto done;
1298     }
1299
1300     size_t len = 0;
1301
1302     if (!__current->cwd) {
1303         *buf = VFS_PATH_DELIM;
1304         len = 1;
1305     } else {
1306         len = vfs_get_path(__current->cwd, buf, size, 0);
1307         if (len == size) {
1308             errno = ERANGE;
1309             goto done;
1310         }
1311     }
1312
1313     buf[len + 1] = '\0';
1314
1315     ret_ptr = buf;
1316
1317 done:
1318     __current->k_status = errno;
1319     return ret_ptr;
1320 }
1321
1322 int
1323 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1324 {
1325     int errno = 0;
1326     if (current->inode->id == target->inode->id) {
1327         // hard link
1328         return 0;
1329     }
1330
1331     if (errno = vfs_check_writable(current)) {
1332         return errno;
1333     }
1334
1335     if (current->ref_count > 1 || target->ref_count > 1) {
1336         return EBUSY;
1337     }
1338
1339     if (current->super_block != target->super_block) {
1340         return EXDEV;
1341     }
1342
1343     struct v_dnode* oldparent = current->parent;
1344     struct v_dnode* newparent = target->parent;
1345
1346     lock_dnode(current);
1347     lock_dnode(target);
1348     if (oldparent)
1349         lock_dnode(oldparent);
1350     if (newparent)
1351         lock_dnode(newparent);
1352
1353     if (!llist_empty(&target->children)) {
1354         errno = ENOTEMPTY;
1355         unlock_dnode(target);
1356         goto cleanup;
1357     }
1358
1359     if ((errno =
1360            current->inode->ops->rename(current->inode, current, target))) {
1361         unlock_dnode(target);
1362         goto cleanup;
1363     }
1364
1365     // re-position current
1366     hstrcpy(&current->name, &target->name);
1367     vfs_dcache_rehash(newparent, current);
1368
1369     // detach target
1370     vfs_d_free(target);
1371
1372     unlock_dnode(target);
1373
1374 cleanup:
1375     unlock_dnode(current);
1376     if (oldparent)
1377         unlock_dnode(oldparent);
1378     if (newparent)
1379         unlock_dnode(newparent);
1380
1381     return errno;
1382 }
1383
1384 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1385 {
1386     struct v_dnode *cur, *target_parent, *target;
1387     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1388     int errno = 0;
1389
1390     if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1391         goto done;
1392     }
1393
1394     if ((errno = vfs_walk(
1395            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1396         goto done;
1397     }
1398
1399     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1400     if (errno == ENOENT) {
1401         target = vfs_d_alloc(target_parent, &name);
1402         vfs_dcache_add(target_parent, target);
1403     } else if (errno) {
1404         goto done;
1405     }
1406
1407     if (!target) {
1408         errno = ENOMEM;
1409         goto done;
1410     }
1411
1412     errno = vfs_do_rename(cur, target);
1413
1414 done:
1415     vfree(name.value);
1416     return DO_STATUS(errno);
1417 }