lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
   4  * @brief Lunaix virtual file system - an abstraction layer for all file systems.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
  14 /*
  15  TODO vfs & device todos checklist
  16
  17     It is overseen by Twilight Sparkle ;)
  18
  19  1. Get inodes hooked into lru (CHECKED)
  20  2. Get dnodes hooked into lru (CHECKED)
  21  3. Get inodes properly hashed so they can be reused by underlying fs (CHECKED)
  22  4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
  23         [good idea] or a constructor/destructor pattern in cake allocator ?
  24  5. (mount) Figure out a way to identify a busy mount point before unmount
  25             maybe a unified mount_point structure that maintains a reference
  26             counter on any dnodes within the subtree? Such a counter will only
  27             increment if a file is opened or a dnode is being used as working
  28             directory, and decrement conversely. (CHECKED)
  29  6. (mount) Ability to track all mount points (including sub-mounts)
  30             so we can be confident to clean up everything when we
  31             unmount. (CHECKED)
  32  7. (mount) Figure out a way to acquire the device represented by a dnode.
  33             so it can be used to mount. (e.g. we wish to get `struct device*`
  34             out of the dnode at /dev/sda)
  35             [tip] we should pay attention at twifs and add a private_data field
  36             under struct v_dnode? (CHECKED)
  37  8. (mount) Then, we should refactor on mount/unmount mechanism. (CHECKED)
  38  9. (mount) (future) Ability to mount anything? e.g. Linux can mount a disk
  39                     image file using a so-called "loopback" pseudo device. Maybe
  40                     we can do a similar thing in Lunaix? A block device emulation
  41                     above the regular file when we mount it.
  42  10. (device) device number (dev_t) allocation
  43             [good idea] <class>:<subclass>:<uniq_id> composition
  44 */
  45
  46 #include <klibc/string.h>
  47 #include <lunaix/foptions.h>
  48 #include <lunaix/fs.h>
  49 #include <lunaix/mm/cake.h>
  50 #include <lunaix/mm/page.h>
  51 #include <lunaix/mm/valloc.h>
  52 #include <lunaix/process.h>
  53 #include <lunaix/spike.h>
  54 #include <lunaix/syscall.h>
  55 #include <lunaix/syscall_utils.h>
  56
  57 #include <lunaix/fs/twifs.h>
  58
  59 #include <usr/sys/dirent_defs.h>
  60
// Dedicated cake piles, one per VFS object type; all are created in vfs_init.
static struct cake_pile* dnode_pile;
static struct cake_pile* inode_pile;
static struct cake_pile* file_pile;
static struct cake_pile* superblock_pile;
static struct cake_pile* fd_pile;

// Root dnode of the VFS tree; vfs_init makes it its own parent.
struct v_dnode* vfs_sysroot;
// Hash table of cached dnodes (VFS_HASHTABLE_SIZE buckets, see vfs_init).
static struct hbucket* dnode_cache;

// LRU zones for dnodes and inodes; their eviction callbacks are declared below.
struct lru_zone *dnode_lru, *inode_lru;

// Pre-built names for the special path components "..", "." and "".
struct hstr vfs_ddot = HSTR("..", 2);
struct hstr vfs_dot = HSTR(".", 1);
struct hstr vfs_empty = HSTR("", 0);

// Forward declarations for functions defined later in this file.
struct v_superblock*
vfs_sb_alloc();

void
vfs_sb_free(struct v_superblock* sb);

// LRU eviction callback for dnode_lru.
static int
__vfs_try_evict_dnode(struct lru_node* obj);

// LRU eviction callback for inode_lru.
static int
__vfs_try_evict_inode(struct lru_node* obj);
  87
/**
 * @brief Initialise the VFS layer: object piles, the dnode hash table, the
 *        LRU zones, the hashes of "." and "..", and the system root dnode.
 */
void
vfs_init()
{
    // Create dedicated cake piles for these objects instead of using valloc,
    // so that internal fragmentation is kept to a minimum.
    dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
    inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
    file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
    fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
    superblock_pile =
      cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);

    dnode_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));

    dnode_lru = lru_new_zone(__vfs_try_evict_dnode);
    inode_lru = lru_new_zone(__vfs_try_evict_inode);

    hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
    hstr_rehash(&vfs_dot, HSTR_FULL_HASH);

    // Create the root dnode. It has no real parent, so it points to itself.
    vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
    vfs_sysroot->parent = vfs_sysroot;
    // Take a reference on the root (its ref_count starts at 0 in vfs_d_alloc).
    atomic_fetch_add(&vfs_sysroot->ref_count, 1);
}
 112
 113 inline struct hbucket*
 114 __dcache_hash(struct v_dnode* parent, u32_t* hash)
 115 {
 116     u32_t _hash = *hash;
 117     // 确保低位更加随机
 118     _hash = _hash ^ (_hash >> VFS_HASHBITS);
 119     // 与parent的指针值做加法，来减小碰撞的可能性。
 120     _hash += (u32_t)parent;
 121     *hash = _hash;
 122     return &dnode_cache[_hash & VFS_HASH_MASK];
 123 }
 124
 125 struct v_dnode*
 126 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 127 {
 128     if (!str->len || HSTR_EQ(str, &vfs_dot))
 129         return parent;
 130
 131     if (HSTR_EQ(str, &vfs_ddot)) {
 132         return parent->parent;
 133     }
 134
 135     u32_t hash = str->hash;
 136     struct hbucket* slot = __dcache_hash(parent, &hash);
 137
 138     struct v_dnode *pos, *n;
 139     hashtable_bucket_foreach(slot, pos, n, hash_list)
 140     {
 141         if (pos->name.hash == hash) {
 142             return pos;
 143         }
 144     }
 145     return NULL;
 146 }
 147
 148 void
 149 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 150 {
 151     assert(parent);
 152
 153     atomic_fetch_add(&dnode->ref_count, 1);
 154     dnode->parent = parent;
 155     llist_append(&parent->children, &dnode->siblings);
 156
 157     struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash);
 158     hlist_add(&bucket->head, &dnode->hash_list);
 159 }
 160
 161 void
 162 vfs_dcache_remove(struct v_dnode* dnode)
 163 {
 164     assert(dnode);
 165     assert(dnode->ref_count == 1);
 166
 167     llist_delete(&dnode->siblings);
 168     llist_delete(&dnode->aka_list);
 169     hlist_delete(&dnode->hash_list);
 170
 171     dnode->parent = NULL;
 172     atomic_fetch_sub(&dnode->ref_count, 1);
 173 }
 174
/**
 * @brief Move a cached dnode under a new parent.
 *
 * The name hash is recomputed first, then the dnode is removed from the cache
 * and re-added under `new_parent`.
 *
 * @param new_parent the new parent dnode (must not be NULL)
 * @param dnode      the dnode to move
 */
void
vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
{
    assert(new_parent);

    hstr_rehash(&dnode->name, HSTR_FULL_HASH);
    vfs_dcache_remove(dnode);
    vfs_dcache_add(new_parent, dnode);
}
 184
 185 int
 186 vfs_open(struct v_dnode* dnode, struct v_file** file)
 187 {
 188     if (!dnode->inode || !dnode->inode->ops->open) {
 189         return ENOTSUP;
 190     }
 191
 192     struct v_inode* inode = dnode->inode;
 193
 194     lock_inode(inode);
 195
 196     struct v_file* vfile = cake_grab(file_pile);
 197     memset(vfile, 0, sizeof(*vfile));
 198
 199     vfile->dnode = dnode;
 200     vfile->inode = inode;
 201     vfile->ref_count = ATOMIC_VAR_INIT(1);
 202     vfile->ops = inode->default_fops;
 203
 204     if ((inode->itype & VFS_IFFILE) && !inode->pg_cache) {
 205         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 206         pcache_init(pcache);
 207         pcache->master = inode;
 208         inode->pg_cache = pcache;
 209     }
 210
 211     int errno = inode->ops->open(inode, vfile);
 212     if (errno) {
 213         cake_release(file_pile, vfile);
 214     } else {
 215         atomic_fetch_add(&dnode->ref_count, 1);
 216         inode->open_count++;
 217         mnt_mkbusy(dnode->mnt);
 218
 219         *file = vfile;
 220     }
 221
 222     unlock_inode(inode);
 223
 224     return errno;
 225 }
 226
 227 void
 228 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
 229 {
 230     if (assign_to->inode) {
 231         llist_delete(&assign_to->aka_list);
 232         assign_to->inode->link_count--;
 233     }
 234     llist_append(&inode->aka_dnodes, &assign_to->aka_list);
 235     assign_to->inode = inode;
 236     inode->link_count++;
 237 }
 238
 239 int
 240 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 241 {
 242     int errno;
 243
 244     if ((errno = vfs_check_writable(to_link))) {
 245         return errno;
 246     }
 247
 248     lock_inode(to_link->inode);
 249     if (to_link->super_block->root != name->super_block->root) {
 250         errno = EXDEV;
 251     } else if (!to_link->inode->ops->link) {
 252         errno = ENOTSUP;
 253     } else if (!(errno = to_link->inode->ops->link(to_link->inode, name))) {
 254         vfs_assign_inode(name, to_link->inode);
 255     }
 256     unlock_inode(to_link->inode);
 257
 258     return errno;
 259 }
 260
/**
 * @brief Close `file` on behalf of process `pid`.
 *
 * If the file is still shared (ref_count > 1) only the reference is dropped.
 * Otherwise the file's `close` operation runs and, when it succeeds, the
 * dnode reference and the inode open count are released, the page cache is
 * committed and the v_file is returned to its pile.
 *
 * @param file file to close
 * @param pid  process for which the inode lock is released, if it is held
 * @return 0 on success, otherwise the error from the `close` operation
 */
int
vfs_pclose(struct v_file* file, pid_t pid)
{
    int errno = 0;
    if (file->ref_count > 1) {
        atomic_fetch_sub(&file->ref_count, 1);
    } else if (!(errno = file->ops->close(file))) {
        atomic_fetch_sub(&file->dnode->ref_count, 1);
        file->inode->open_count--;

        /*
         * Prevent a deadlock.
         * This happens when a process is terminated while blocked in a read:
         * the process still holds the inode lock and would never release it.
         * The unlock must also include an ownership check.
         *
         * To see why, consider two processes that both have the same file
         * open as fd=x.
         *      Process A: busy reading x
         *      Process B: does nothing with x
         * Assume that, shortly afterwards, process B is terminated while
         * process A is still busy reading. Without the ownership check the
         * inode lock of x would be released by B rather than A, which opens
         * a race condition on A if another process writes to the file after
         * B has exited.
         */
        if (mutex_on_hold(&file->inode->lock)) {
            mutex_unlock_for(&file->inode->lock, pid);
        }
        mnt_chillax(file->dnode->mnt);

        pcache_commit_all(file->inode);
        cake_release(file_pile, file);
    }
    return errno;
}
 298
/**
 * @brief Close `file` on behalf of the current process (see vfs_pclose).
 *
 * @param file file to close
 * @return the result of vfs_pclose
 */
int
vfs_close(struct v_file* file)
{
    return vfs_pclose(file, __current->pid);
}
 304
/**
 * @brief Return a v_fd structure to the fd pile.
 *
 * Only the structure itself is released; the file it refers to is untouched.
 *
 * @param fd descriptor structure to release
 */
void
vfs_free_fd(struct v_fd* fd)
{
    cake_release(fd_pile, fd);
}
 310
 311 int
 312 vfs_fsync(struct v_file* file)
 313 {
 314     int errno;
 315     if ((errno = vfs_check_writable(file->dnode))) {
 316         return errno;
 317     }
 318
 319     lock_inode(file->inode);
 320
 321     pcache_commit_all(file->inode);
 322
 323     errno = ENOTSUP;
 324     if (file->ops->sync) {
 325         errno = file->ops->sync(file);
 326     }
 327
 328     unlock_inode(file->inode);
 329
 330     return errno;
 331 }
 332
 333 int
 334 vfs_alloc_fdslot(int* fd)
 335 {
 336     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 337         if (!__current->fdtable->fds[i]) {
 338             *fd = i;
 339             return 0;
 340         }
 341     }
 342     return EMFILE;
 343 }
 344
 345 struct v_superblock*
 346 vfs_sb_alloc()
 347 {
 348     struct v_superblock* sb = cake_grab(superblock_pile);
 349     memset(sb, 0, sizeof(*sb));
 350     llist_init_head(&sb->sb_list);
 351     sb->i_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 352     return sb;
 353 }
 354
/**
 * @brief Release a super block and its inode hash table.
 *
 * @param sb super block obtained from vfs_sb_alloc
 */
void
vfs_sb_free(struct v_superblock* sb)
{
    vfree(sb->i_cache);
    cake_release(superblock_pile, sb);
}
 361
 362 static int
 363 __vfs_try_evict_dnode(struct lru_node* obj)
 364 {
 365     struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
 366
 367     if (!dnode->ref_count) {
 368         vfs_d_free(dnode);
 369         return 1;
 370     }
 371     return 0;
 372 }
 373
 374 static int
 375 __vfs_try_evict_inode(struct lru_node* obj)
 376 {
 377     struct v_inode* inode = container_of(obj, struct v_inode, lru);
 378
 379     if (!inode->link_count && !inode->open_count) {
 380         vfs_i_free(inode);
 381         return 1;
 382     }
 383     return 0;
 384 }
 385
 386 struct v_dnode*
 387 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
 388 {
 389     struct v_dnode* dnode = cake_grab(dnode_pile);
 390     if (!dnode) {
 391         lru_evict_half(dnode_lru);
 392
 393         if (!(dnode = cake_grab(dnode_pile))) {
 394             return NULL;
 395         }
 396     }
 397
 398     memset(dnode, 0, sizeof(*dnode));
 399     llist_init_head(&dnode->children);
 400     llist_init_head(&dnode->siblings);
 401     llist_init_head(&dnode->aka_list);
 402     mutex_init(&dnode->lock);
 403
 404     dnode->ref_count = ATOMIC_VAR_INIT(0);
 405     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 406
 407     hstrcpy(&dnode->name, name);
 408
 409     if (parent) {
 410         dnode->super_block = parent->super_block;
 411         dnode->mnt = parent->mnt;
 412     }
 413
 414     lru_use_one(dnode_lru, &dnode->lru);
 415
 416     return dnode;
 417 }
 418
 419 void
 420 vfs_d_free(struct v_dnode* dnode)
 421 {
 422     assert(dnode->ref_count == 1);
 423
 424     if (dnode->inode) {
 425         assert(dnode->inode->link_count > 0);
 426         dnode->inode->link_count--;
 427     }
 428
 429     vfs_dcache_remove(dnode);
 430     // Make sure the children de-referencing their parent.
 431     // With lru presented, the eviction will be propagated over the entire
 432     // detached subtree eventually
 433     struct v_dnode *pos, *n;
 434     llist_for_each(pos, n, &dnode->children, siblings)
 435     {
 436         vfs_dcache_remove(pos);
 437     }
 438
 439     vfree(dnode->name.value);
 440     cake_release(dnode_pile, dnode);
 441 }
 442
 443 struct v_inode*
 444 vfs_i_find(struct v_superblock* sb, u32_t i_id)
 445 {
 446     struct hbucket* slot = &sb->i_cache[i_id & VFS_HASH_MASK];
 447     struct v_inode *pos, *n;
 448     hashtable_bucket_foreach(slot, pos, n, hash_list)
 449     {
 450         if (pos->id == i_id) {
 451             lru_use_one(inode_lru, &pos->lru);
 452             return pos;
 453         }
 454     }
 455
 456     return NULL;
 457 }
 458
 459 void
 460 vfs_i_addhash(struct v_inode* inode)
 461 {
 462     struct hbucket* slot = &inode->sb->i_cache[inode->id & VFS_HASH_MASK];
 463
 464     hlist_delete(&inode->hash_list);
 465     hlist_add(&slot->head, &inode->hash_list);
 466 }
 467
 468 struct v_inode*
 469 vfs_i_alloc(struct v_superblock* sb)
 470 {
 471     assert(sb->ops.init_inode);
 472
 473     struct v_inode* inode;
 474     if (!(inode = cake_grab(inode_pile))) {
 475         lru_evict_half(inode_lru);
 476         if (!(inode = cake_grab(inode_pile))) {
 477             return NULL;
 478         }
 479     }
 480
 481     memset(inode, 0, sizeof(*inode));
 482     mutex_init(&inode->lock);
 483     llist_init_head(&inode->xattrs);
 484     llist_init_head(&inode->aka_dnodes);
 485
 486     sb->ops.init_inode(sb, inode);
 487
 488     inode->sb = sb;
 489     inode->ctime = clock_unixtime();
 490     inode->atime = inode->ctime;
 491     inode->mtime = inode->ctime;
 492
 493 done:
 494     lru_use_one(inode_lru, &inode->lru);
 495     return inode;
 496 }
 497
 498 void
 499 vfs_i_free(struct v_inode* inode)
 500 {
 501     if (inode->pg_cache) {
 502         pcache_release(inode->pg_cache);
 503         vfree(inode->pg_cache);
 504     }
 505     // we don't need to sync inode.
 506     // If an inode can be free, then it must be properly closed.
 507     // Hence it must be synced already!
 508     if (inode->destruct) {
 509         inode->destruct(inode);
 510     }
 511     hlist_delete(&inode->hash_list);
 512     cake_release(inode_pile, inode);
 513 }
 514
 515 /* ---- System call definition and support ---- */
 516
// Option for __vfs_try_locate_file: when the final path component does not
// exist (ENOENT), create it through the parent inode's `create` operation.
#define FLOCATE_CREATE_EMPTY 1
 518
 519 int
 520 vfs_getfd(int fd, struct v_fd** fd_s)
 521 {
 522     if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) {
 523         return 0;
 524     }
 525     return EBADF;
 526 }
 527
 528 int
 529 __vfs_try_locate_file(const char* path,
 530                       struct v_dnode** fdir,
 531                       struct v_dnode** file,
 532                       int options)
 533 {
 534     char name_str[VFS_NAME_MAXLEN];
 535     struct hstr name = HSTR(name_str, 0);
 536     int errno;
 537
 538     name_str[0] = 0;
 539     if ((errno = vfs_walk_proc(path, fdir, &name, VFS_WALK_PARENT))) {
 540         return errno;
 541     }
 542
 543     errno = vfs_walk(*fdir, name.value, file, NULL, 0);
 544     if (errno != ENOENT || !(options & FLOCATE_CREATE_EMPTY)) {
 545         return errno;
 546     }
 547
 548     struct v_dnode* parent = *fdir;
 549     struct v_dnode* file_new = vfs_d_alloc(parent, &name);
 550
 551     if (!file_new) {
 552         return ENOMEM;
 553     }
 554
 555     lock_dnode(parent);
 556
 557     if (!(errno = parent->inode->ops->create(parent->inode, file_new))) {
 558         vfs_dcache_add(parent, file_new);
 559         *file = file_new;
 560     } else {
 561         vfs_d_free(file_new);
 562     }
 563
 564     unlock_dnode(parent);
 565
 566     return errno;
 567 }
 568
 569 int
 570 vfs_do_open(const char* path, int options)
 571 {
 572     int errno, fd;
 573     struct v_dnode *dentry, *file;
 574     struct v_file* ofile = NULL;
 575
 576     errno = __vfs_try_locate_file(
 577       path, &dentry, &file, (options & FO_CREATE) ? FLOCATE_CREATE_EMPTY : 0);
 578
 579     if (!errno && !(errno = vfs_alloc_fdslot(&fd))) {
 580
 581         if (errno || (errno = vfs_open(file, &ofile))) {
 582             return errno;
 583         }
 584
 585         struct v_fd* fd_s = cake_grab(fd_pile);
 586         memset(fd_s, 0, sizeof(*fd_s));
 587
 588         ofile->f_pos = ofile->inode->fsize & -((options & FO_APPEND) != 0);
 589         fd_s->file = ofile;
 590         fd_s->flags = options;
 591         __current->fdtable->fds[fd] = fd_s;
 592         return fd;
 593     }
 594
 595     return errno;
 596 }
 597
// open(path, options) system call: thin wrapper around vfs_do_open whose
// result is passed through DO_STATUS_OR_RETURN.
__DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
{
    int errno = vfs_do_open(path, options);
    return DO_STATUS_OR_RETURN(errno);
}
 603
 604 __DEFINE_LXSYSCALL1(int, close, int, fd)
 605 {
 606     struct v_fd* fd_s;
 607     int errno = 0;
 608     if ((errno = vfs_getfd(fd, &fd_s))) {
 609         goto done_err;
 610     }
 611
 612     if ((errno = vfs_close(fd_s->file))) {
 613         goto done_err;
 614     }
 615
 616     cake_release(fd_pile, fd_s);
 617     __current->fdtable->fds[fd] = 0;
 618
 619 done_err:
 620     return DO_STATUS(errno);
 621 }
 622
 623 void
 624 __vfs_readdir_callback(struct dir_context* dctx,
 625                        const char* name,
 626                        const int len,
 627                        const int dtype)
 628 {
 629     struct lx_dirent* dent = (struct lx_dirent*)dctx->cb_data;
 630     strncpy(dent->d_name, name, DIRENT_NAME_MAX_LEN);
 631     dent->d_nlen = len;
 632     dent->d_type = dtype;
 633 }
 634
 635 __DEFINE_LXSYSCALL2(int, sys_readdir, int, fd, struct lx_dirent*, dent)
 636 {
 637     struct v_fd* fd_s;
 638     int errno;
 639
 640     if ((errno = vfs_getfd(fd, &fd_s))) {
 641         goto done;
 642     }
 643
 644     struct v_inode* inode = fd_s->file->inode;
 645
 646     lock_inode(inode);
 647
 648     if (!(inode->itype & VFS_IFDIR)) {
 649         errno = ENOTDIR;
 650     } else {
 651         struct dir_context dctx =
 652           (struct dir_context){ .cb_data = dent,
 653                                 .index = dent->d_offset,
 654                                 .read_complete_callback =
 655                                   __vfs_readdir_callback };
 656         errno = 1;
 657         if (dent->d_offset == 0) {
 658             __vfs_readdir_callback(&dctx, vfs_dot.value, vfs_dot.len, DT_DIR);
 659         } else if (dent->d_offset == 1) {
 660             __vfs_readdir_callback(&dctx, vfs_ddot.value, vfs_ddot.len, DT_DIR);
 661         } else {
 662             dctx.index -= 2;
 663             if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
 664                 unlock_inode(inode);
 665                 goto done;
 666             }
 667         }
 668         dent->d_offset++;
 669     }
 670
 671     unlock_inode(inode);
 672
 673 done:
 674     return DO_STATUS_OR_RETURN(errno);
 675 }
 676
 677 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 678 {
 679     int errno = 0;
 680     struct v_fd* fd_s;
 681     if ((errno = vfs_getfd(fd, &fd_s))) {
 682         goto done;
 683     }
 684
 685     struct v_file* file = fd_s->file;
 686     if ((file->inode->itype & VFS_IFDIR)) {
 687         errno = EISDIR;
 688         goto done;
 689     }
 690
 691     lock_inode(file->inode);
 692
 693     file->inode->atime = clock_unixtime();
 694
 695     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 696         errno = file->ops->read(file->inode, buf, count, file->f_pos);
 697     } else {
 698         errno = pcache_read(file->inode, buf, count, file->f_pos);
 699     }
 700
 701     if (errno > 0) {
 702         file->f_pos += errno;
 703         unlock_inode(file->inode);
 704         return errno;
 705     }
 706
 707     unlock_inode(file->inode);
 708
 709 done:
 710     return DO_STATUS(errno);
 711 }
 712
 713 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
 714 {
 715     int errno = 0;
 716     struct v_fd* fd_s;
 717     if ((errno = vfs_getfd(fd, &fd_s))) {
 718         goto done;
 719     }
 720
 721     struct v_file* file = fd_s->file;
 722
 723     if ((errno = vfs_check_writable(file->dnode))) {
 724         goto done;
 725     }
 726
 727     if ((file->inode->itype & VFS_IFDIR)) {
 728         errno = EISDIR;
 729         goto done;
 730     }
 731
 732     lock_inode(file->inode);
 733
 734     file->inode->mtime = clock_unixtime();
 735
 736     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 737         errno = file->ops->write(file->inode, buf, count, file->f_pos);
 738     } else {
 739         errno = pcache_write(file->inode, buf, count, file->f_pos);
 740     }
 741
 742     if (errno > 0) {
 743         file->f_pos += errno;
 744         unlock_inode(file->inode);
 745         return errno;
 746     }
 747
 748     unlock_inode(file->inode);
 749
 750 done:
 751     return DO_STATUS(errno);
 752 }
 753
 754 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
 755 {
 756     int errno = 0;
 757     struct v_fd* fd_s;
 758     if ((errno = vfs_getfd(fd, &fd_s))) {
 759         goto done;
 760     }
 761
 762     struct v_file* file = fd_s->file;
 763
 764     if (!file->ops->seek) {
 765         errno = ENOTSUP;
 766         goto done;
 767     }
 768
 769     lock_inode(file->inode);
 770
 771     int overflow = 0;
 772     int fpos = file->f_pos;
 773     switch (options) {
 774         case FSEEK_CUR:
 775             overflow = __builtin_sadd_overflow((int)file->f_pos, offset, &fpos);
 776             break;
 777         case FSEEK_END:
 778             overflow =
 779               __builtin_sadd_overflow((int)file->inode->fsize, offset, &fpos);
 780             break;
 781         case FSEEK_SET:
 782             fpos = offset;
 783             break;
 784     }
 785     if (overflow) {
 786         errno = EOVERFLOW;
 787     } else if (!(errno = file->ops->seek(file->inode, fpos))) {
 788         file->f_pos = fpos;
 789     }
 790
 791     unlock_inode(file->inode);
 792
 793 done:
 794     return DO_STATUS(errno);
 795 }
 796
 797 int
 798 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
 799 {
 800     if (!dnode) {
 801         return 0;
 802     }
 803
 804     if (depth > 64) {
 805         return ENAMETOOLONG;
 806     }
 807
 808     size_t len = 0;
 809
 810     if (dnode->parent != dnode) {
 811         len = vfs_get_path(dnode->parent, buf, size, depth + 1);
 812     }
 813
 814     if (len >= size) {
 815         return len;
 816     }
 817
 818     if (!len || buf[len - 1] != VFS_PATH_DELIM) {
 819         buf[len++] = VFS_PATH_DELIM;
 820     }
 821
 822     size_t cpy_size = MIN(dnode->name.len, size - len);
 823     strncpy(buf + len, dnode->name.value, cpy_size);
 824     len += cpy_size;
 825
 826     return len;
 827 }
 828
 829 int
 830 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
 831 {
 832     const char* link;
 833     struct v_inode* inode = dnode->inode;
 834     if (inode->ops->read_symlink) {
 835         lock_inode(inode);
 836
 837         int errno = inode->ops->read_symlink(inode, &link);
 838         strncpy(buf, link, size);
 839
 840         unlock_inode(inode);
 841         return errno;
 842     }
 843     return 0;
 844 }
 845
 846 int
 847 vfs_get_dtype(int itype)
 848 {
 849     switch (itype) {
 850         case VFS_IFDIR:
 851             return DT_DIR;
 852         case VFS_IFSYMLINK:
 853             return DT_SYMLINK;
 854         default:
 855             return DT_PIPE;
 856     }
 857 }
 858
 859 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
 860 {
 861     int errno;
 862     struct v_fd* fd_s;
 863     if ((errno = vfs_getfd(fd, &fd_s))) {
 864         goto done;
 865     }
 866
 867     struct v_dnode* dnode;
 868     errno = vfs_get_path(fd_s->file->dnode, buf, size, 0);
 869
 870     if (errno >= 0) {
 871         return errno;
 872     }
 873
 874 done:
 875     return DO_STATUS(errno);
 876 }
 877
 878 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
 879 {
 880     int errno;
 881     struct v_dnode* dnode;
 882     if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 883         errno = vfs_readlink(dnode, buf, size);
 884     }
 885
 886     if (errno >= 0) {
 887         return errno;
 888     }
 889
 890     return DO_STATUS(errno);
 891 }
 892
 893 __DEFINE_LXSYSCALL4(int,
 894                     readlinkat,
 895                     int,
 896                     dirfd,
 897                     const char*,
 898                     pathname,
 899                     char*,
 900                     buf,
 901                     size_t,
 902                     size)
 903 {
 904     int errno;
 905     struct v_fd* fd_s;
 906     if ((errno = vfs_getfd(dirfd, &fd_s))) {
 907         goto done;
 908     }
 909
 910     struct v_dnode* dnode;
 911     if (!(errno = vfs_walk(
 912             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 913         errno = vfs_readlink(fd_s->file->dnode, buf, size);
 914     }
 915
 916     if (errno >= 0) {
 917         return errno;
 918     }
 919
 920 done:
 921     return DO_STATUS(errno);
 922 }
 923
 924 /*
 925     NOTE
 926     When we perform an operation that could affect the layout of a
 927     directory (i.e., rename, mkdir, rmdir), we must lock the parent dir
 928     whenever possible. This blocks any ongoing path walk from reaching
 929     it, and hence avoids exposing any partial state.
 930 */
 931
 932 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
 933 {
 934     int errno;
 935     struct v_dnode* dnode;
 936     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
 937         return DO_STATUS(errno);
 938     }
 939
 940     lock_dnode(dnode);
 941
 942     if ((errno = vfs_check_writable(dnode))) {
 943         goto done;
 944     }
 945
 946     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
 947         errno = EROFS;
 948         goto done;
 949     }
 950
 951     if (dnode->ref_count > 1 || dnode->inode->open_count) {
 952         errno = EBUSY;
 953         goto done;
 954     }
 955
 956     if (!llist_empty(&dnode->children)) {
 957         errno = ENOTEMPTY;
 958         goto done;
 959     }
 960
 961     struct v_dnode* parent = dnode->parent;
 962
 963     if (!parent) {
 964         errno = EINVAL;
 965         goto done;
 966     }
 967
 968     lock_dnode(parent);
 969     lock_inode(parent->inode);
 970
 971     if ((dnode->inode->itype & VFS_IFDIR)) {
 972         errno = parent->inode->ops->rmdir(parent->inode, dnode);
 973         if (!errno) {
 974             vfs_dcache_remove(dnode);
 975         }
 976     } else {
 977         errno = ENOTDIR;
 978     }
 979
 980     unlock_inode(parent->inode);
 981     unlock_dnode(parent);
 982
 983 done:
 984     unlock_dnode(dnode);
 985     return DO_STATUS(errno);
 986 }
 987
 988 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
 989 {
 990     int errno = 0;
 991     struct v_dnode *parent, *dir;
 992     char name_value[VFS_NAME_MAXLEN];
 993     struct hstr name = HHSTR(name_value, 0, 0);
 994
 995     if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
 996         goto done;
 997     }
 998
 999     if ((errno = vfs_check_writable(parent))) {
1000         goto done;
1001     }
1002
1003     if (!(dir = vfs_d_alloc(parent, &name))) {
1004         errno = ENOMEM;
1005         goto done;
1006     }
1007
1008     lock_dnode(parent);
1009     lock_inode(parent->inode);
1010
1011     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
1012         errno = ENOTSUP;
1013     } else if (!parent->inode->ops->mkdir) {
1014         errno = ENOTSUP;
1015     } else if (!(parent->inode->itype & VFS_IFDIR)) {
1016         errno = ENOTDIR;
1017     } else if (!(errno = parent->inode->ops->mkdir(parent->inode, dir))) {
1018         vfs_dcache_add(parent, dir);
1019         goto cleanup;
1020     }
1021
1022     vfs_d_free(dir);
1023
1024 cleanup:
1025     unlock_inode(parent->inode);
1026     unlock_dnode(parent);
1027 done:
1028     return DO_STATUS(errno);
1029 }
1030
1031 int
1032 __vfs_do_unlink(struct v_dnode* dnode)
1033 {
1034     int errno;
1035     struct v_inode* inode = dnode->inode;
1036
1037     if (dnode->ref_count > 1) {
1038         return EBUSY;
1039     }
1040
1041     if ((errno = vfs_check_writable(dnode))) {
1042         return errno;
1043     }
1044
1045     lock_inode(inode);
1046
1047     if (inode->open_count) {
1048         errno = EBUSY;
1049     } else if (!(inode->itype & VFS_IFDIR)) {
1050         // The underlying unlink implementation should handle
1051         //  symlink case
1052         errno = inode->ops->unlink(inode);
1053         if (!errno) {
1054             vfs_d_free(dnode);
1055         }
1056     } else {
1057         errno = EISDIR;
1058     }
1059
1060     unlock_inode(inode);
1061
1062     return errno;
1063 }
1064
1065 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1066 {
1067     int errno;
1068     struct v_dnode* dnode;
1069     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1070         goto done;
1071     }
1072
1073     errno = __vfs_do_unlink(dnode);
1074
1075 done:
1076     return DO_STATUS(errno);
1077 }
1078
1079 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1080 {
1081     int errno;
1082     struct v_fd* fd_s;
1083     if ((errno = vfs_getfd(fd, &fd_s))) {
1084         goto done;
1085     }
1086
1087     struct v_dnode* dnode;
1088     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1089         errno = __vfs_do_unlink(dnode);
1090     }
1091
1092 done:
1093     return DO_STATUS(errno);
1094 }
1095
1096 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1097 {
1098     int errno;
1099     struct v_dnode *dentry, *to_link, *name_dentry, *name_file;
1100
1101     errno = __vfs_try_locate_file(oldpath, &dentry, &to_link, 0);
1102     if (!errno) {
1103         errno = __vfs_try_locate_file(
1104           newpath, &name_dentry, &name_file, FLOCATE_CREATE_EMPTY);
1105         if (!errno) {
1106             errno = EEXIST;
1107         } else if (name_file) {
1108             errno = vfs_link(to_link, name_file);
1109         }
1110     }
1111     return DO_STATUS(errno);
1112 }
1113
1114 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1115 {
1116     int errno;
1117     struct v_fd* fd_s;
1118
1119     if (!(errno = vfs_getfd(fildes, &fd_s))) {
1120         errno = vfs_fsync(fd_s->file);
1121     }
1122
1123     return DO_STATUS(errno);
1124 }
1125
1126 int
1127 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1128 {
1129     int errno = 0;
1130     struct v_fd* copied = cake_grab(fd_pile);
1131
1132     memcpy(copied, old, sizeof(struct v_fd));
1133
1134     atomic_fetch_add(&old->file->ref_count, 1);
1135
1136     *new = copied;
1137
1138     return errno;
1139 }
1140
1141 int
1142 vfs_dup2(int oldfd, int newfd)
1143 {
1144     if (newfd == oldfd) {
1145         return newfd;
1146     }
1147
1148     int errno;
1149     struct v_fd *oldfd_s, *newfd_s;
1150     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1151         goto done;
1152     }
1153
1154     if (!TEST_FD(newfd)) {
1155         errno = EBADF;
1156         goto done;
1157     }
1158
1159     newfd_s = __current->fdtable->fds[newfd];
1160     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1161         goto done;
1162     }
1163
1164     if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1165         __current->fdtable->fds[newfd] = newfd_s;
1166         return newfd;
1167     }
1168
1169 done:
1170     return DO_STATUS(errno);
1171 }
1172
1173 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1174 {
1175     return vfs_dup2(oldfd, newfd);
1176 }
1177
1178 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1179 {
1180     int errno, newfd;
1181     struct v_fd *oldfd_s, *newfd_s;
1182     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1183         goto done;
1184     }
1185
1186     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1187         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1188         __current->fdtable->fds[newfd] = newfd_s;
1189         return newfd;
1190     }
1191
1192 done:
1193     return DO_STATUS(errno);
1194 }
1195
1196 __DEFINE_LXSYSCALL2(int,
1197                     symlink,
1198                     const char*,
1199                     pathname,
1200                     const char*,
1201                     link_target)
1202 {
1203     int errno;
1204     struct v_dnode* dnode;
1205     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1206         goto done;
1207     }
1208
1209     if (errno = vfs_check_writable(dnode)) {
1210         goto done;
1211     }
1212
1213     if (!dnode->inode->ops->set_symlink) {
1214         errno = ENOTSUP;
1215         goto done;
1216     }
1217
1218     lock_inode(dnode->inode);
1219
1220     errno = dnode->inode->ops->set_symlink(dnode->inode, link_target);
1221
1222     unlock_inode(dnode->inode);
1223
1224 done:
1225     return DO_STATUS(errno);
1226 }
1227
1228 void
1229 vfs_ref_dnode(struct v_dnode* dnode)
1230 {
1231     atomic_fetch_add(&dnode->ref_count, 1);
1232     mnt_mkbusy(dnode->mnt);
1233 }
1234
1235 void
1236 vfs_unref_dnode(struct v_dnode* dnode)
1237 {
1238     atomic_fetch_sub(&dnode->ref_count, 1);
1239     mnt_chillax(dnode->mnt);
1240 }
1241
1242 int
1243 vfs_do_chdir(struct proc_info* proc, struct v_dnode* dnode)
1244 {
1245     int errno = 0;
1246
1247     lock_dnode(dnode);
1248
1249     if (!(dnode->inode->itype & VFS_IFDIR)) {
1250         errno = ENOTDIR;
1251         goto done;
1252     }
1253
1254     if (proc->cwd) {
1255         vfs_unref_dnode(proc->cwd);
1256     }
1257
1258     vfs_ref_dnode(dnode);
1259     proc->cwd = dnode;
1260
1261     unlock_dnode(dnode);
1262
1263 done:
1264     return errno;
1265 }
1266
1267 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1268 {
1269     struct v_dnode* dnode;
1270     int errno = 0;
1271
1272     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1273         goto done;
1274     }
1275
1276     errno = vfs_do_chdir(__current, dnode);
1277
1278 done:
1279     return DO_STATUS(errno);
1280 }
1281
1282 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1283 {
1284     struct v_fd* fd_s;
1285     int errno = 0;
1286
1287     if ((errno = vfs_getfd(fd, &fd_s))) {
1288         goto done;
1289     }
1290
1291     errno = vfs_do_chdir(__current, fd_s->file->dnode);
1292
1293 done:
1294     return DO_STATUS(errno);
1295 }
1296
1297 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1298 {
1299     int errno = 0;
1300     char* ret_ptr = 0;
1301     if (size < 2) {
1302         errno = ERANGE;
1303         goto done;
1304     }
1305
1306     size_t len = 0;
1307
1308     if (!__current->cwd) {
1309         *buf = VFS_PATH_DELIM;
1310         len = 1;
1311     } else {
1312         len = vfs_get_path(__current->cwd, buf, size, 0);
1313         if (len == size) {
1314             errno = ERANGE;
1315             goto done;
1316         }
1317     }
1318
1319     buf[len + 1] = '\0';
1320
1321     ret_ptr = buf;
1322
1323 done:
1324     __current->k_status = errno;
1325     return ret_ptr;
1326 }
1327
1328 int
1329 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1330 {
1331     int errno = 0;
1332     if (current->inode->id == target->inode->id) {
1333         // hard link
1334         return 0;
1335     }
1336
1337     if (errno = vfs_check_writable(current)) {
1338         return errno;
1339     }
1340
1341     if (current->ref_count > 1 || target->ref_count > 1) {
1342         return EBUSY;
1343     }
1344
1345     if (current->super_block != target->super_block) {
1346         return EXDEV;
1347     }
1348
1349     struct v_dnode* oldparent = current->parent;
1350     struct v_dnode* newparent = target->parent;
1351
1352     lock_dnode(current);
1353     lock_dnode(target);
1354     if (oldparent)
1355         lock_dnode(oldparent);
1356     if (newparent)
1357         lock_dnode(newparent);
1358
1359     if (!llist_empty(&target->children)) {
1360         errno = ENOTEMPTY;
1361         unlock_dnode(target);
1362         goto cleanup;
1363     }
1364
1365     if ((errno =
1366            current->inode->ops->rename(current->inode, current, target))) {
1367         unlock_dnode(target);
1368         goto cleanup;
1369     }
1370
1371     // re-position current
1372     hstrcpy(&current->name, &target->name);
1373     vfs_dcache_rehash(newparent, current);
1374
1375     // detach target
1376     vfs_d_free(target);
1377
1378     unlock_dnode(target);
1379
1380 cleanup:
1381     unlock_dnode(current);
1382     if (oldparent)
1383         unlock_dnode(oldparent);
1384     if (newparent)
1385         unlock_dnode(newparent);
1386
1387     return errno;
1388 }
1389
1390 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1391 {
1392     struct v_dnode *cur, *target_parent, *target;
1393     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1394     int errno = 0;
1395
1396     if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1397         goto done;
1398     }
1399
1400     if ((errno = vfs_walk(
1401            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1402         goto done;
1403     }
1404
1405     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1406     if (errno == ENOENT) {
1407         target = vfs_d_alloc(target_parent, &name);
1408         vfs_dcache_add(target_parent, target);
1409     } else if (errno) {
1410         goto done;
1411     }
1412
1413     if (!target) {
1414         errno = ENOMEM;
1415         goto done;
1416     }
1417
1418     errno = vfs_do_rename(cur, target);
1419
1420 done:
1421     vfree(name.value);
1422     return DO_STATUS(errno);
1423 }