lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
   4  * @brief Lunaix virtual file system - an abstraction layer for all file system.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
  14 /*
  15  TODO vfs & device todos checklist
  16
  17     It is overseen by Twilight Sparkle ;)
  18
  19  1. Get inodes hooked into lru (CHECKED)
  20  2. Get dnodes hooked into lru (CHECKED)
 3. Get inodes properly hashed so they can be reused by the underlying fs (CHECKED)
  22  4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
  23         [good idea] or a constructor/destructor pattern in cake allocator ?
 5. (mount) Figure out a way to identify a busy mount point before unmount
            maybe a unified mount_point structure that maintains a reference
            counter on any dnodes within the subtree? Such a counter will only
            increment if a file is opened or a dnode is being used as working
            directory, and decrement conversely. (CHECKED)
  29  6. (mount) Ability to track all mount points (including sub-mounts)
  30             so we can be confident to clean up everything when we
  31             unmount. (CHECKED)
  32  7. (mount) Figure out a way to acquire the device represented by a dnode.
  33             so it can be used to mount. (e.g. we wish to get `struct device*`
  34             out of the dnode at /dev/sda)
  35             [tip] we should pay attention at twifs and add a private_data field
  36             under struct v_dnode? (CHECKED)
  37  8. (mount) Then, we should refactor on mount/unmount mechanism. (CHECKED)
 9. (mount) (future) Ability to mount anything? e.g. Linux can mount a disk
                    image file using a so-called "loopback" pseudo device. Maybe
                    we can do a similar thing in Lunaix? A block device emulation
                    above the regular file when we mount it on.
  42  10. (device) device number (dev_t) allocation
  43             [good idea] <class>:<subclass>:<uniq_id> composition
  44 */
  45
  46 #include <klibc/string.h>
  47 #include <lunaix/dirent.h>
  48 #include <lunaix/foptions.h>
  49 #include <lunaix/fs.h>
  50 #include <lunaix/mm/cake.h>
  51 #include <lunaix/mm/page.h>
  52 #include <lunaix/mm/valloc.h>
  53 #include <lunaix/process.h>
  54 #include <lunaix/spike.h>
  55 #include <lunaix/syscall.h>
  56
  57 #include <lunaix/fs/twifs.h>
  58
// Cake piles backing each kind of VFS object; all are created in vfs_init().
static struct cake_pile* dnode_pile;
static struct cake_pile* inode_pile;
static struct cake_pile* file_pile;
static struct cake_pile* superblock_pile;
static struct cake_pile* fd_pile;

// Root dnode of the VFS tree; vfs_init() makes it its own parent.
struct v_dnode* vfs_sysroot;
// Hash table of cached dnodes; buckets are selected by __dcache_hash().
static struct hbucket* dnode_cache;

// LRU zones for dnodes and inodes, created in vfs_init() with
// __vfs_try_evict_dnode() / __vfs_try_evict_inode() as their callbacks.
struct lru_zone *dnode_lru, *inode_lru;

// Well-known path components. vfs_init() (re)computes the hashes of ".."
// and "."; the empty name is used for the root dnode.
struct hstr vfs_ddot = HSTR("..", 2);
struct hstr vfs_dot = HSTR(".", 1);
struct hstr vfs_empty = HSTR("", 0);
  73
  74 struct v_superblock*
  75 vfs_sb_alloc();
  76
  77 void
  78 vfs_sb_free(struct v_superblock* sb);
  79
  80 static int
  81 __vfs_try_evict_dnode(struct lru_node* obj);
  82
  83 static int
  84 __vfs_try_evict_inode(struct lru_node* obj);
  85
  86 void
  87 vfs_init()
  88 {
  89     // 为他们专门创建一个蛋糕堆，而不使用valloc，这样我们可以最小化内碎片的产生
  90     dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
  91     inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
  92     file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
  93     fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
  94     superblock_pile =
  95       cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
  96
  97     dnode_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
  98
  99     dnode_lru = lru_new_zone(__vfs_try_evict_dnode);
 100     inode_lru = lru_new_zone(__vfs_try_evict_inode);
 101
 102     hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
 103     hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
 104
 105     // 创建一个根dnode。
 106     vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
 107     vfs_sysroot->parent = vfs_sysroot;
 108     atomic_fetch_add(&vfs_sysroot->ref_count, 1);
 109 }
 110
 111 inline struct hbucket*
 112 __dcache_hash(struct v_dnode* parent, u32_t* hash)
 113 {
 114     u32_t _hash = *hash;
 115     // 确保低位更加随机
 116     _hash = _hash ^ (_hash >> VFS_HASHBITS);
 117     // 与parent的指针值做加法，来减小碰撞的可能性。
 118     _hash += (u32_t)parent;
 119     *hash = _hash;
 120     return &dnode_cache[_hash & VFS_HASH_MASK];
 121 }
 122
 123 struct v_dnode*
 124 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 125 {
 126     if (!str->len || HSTR_EQ(str, &vfs_dot))
 127         return parent;
 128
 129     if (HSTR_EQ(str, &vfs_ddot)) {
 130         return parent->parent;
 131     }
 132
 133     u32_t hash = str->hash;
 134     struct hbucket* slot = __dcache_hash(parent, &hash);
 135
 136     struct v_dnode *pos, *n;
 137     hashtable_bucket_foreach(slot, pos, n, hash_list)
 138     {
 139         if (pos->name.hash == hash) {
 140             return pos;
 141         }
 142     }
 143     return NULL;
 144 }
 145
 146 void
 147 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 148 {
 149     assert(parent);
 150
 151     atomic_fetch_add(&dnode->ref_count, 1);
 152     dnode->parent = parent;
 153     llist_append(&parent->children, &dnode->siblings);
 154
 155     struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash);
 156     hlist_add(&bucket->head, &dnode->hash_list);
 157 }
 158
 159 void
 160 vfs_dcache_remove(struct v_dnode* dnode)
 161 {
 162     assert(dnode);
 163     assert(dnode->ref_count == 1);
 164
 165     llist_delete(&dnode->siblings);
 166     llist_delete(&dnode->aka_list);
 167     hlist_delete(&dnode->hash_list);
 168
 169     dnode->parent = NULL;
 170     atomic_fetch_sub(&dnode->ref_count, 1);
 171 }
 172
 173 void
 174 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
 175 {
 176     assert(new_parent);
 177
 178     hstr_rehash(&dnode->name, HSTR_FULL_HASH);
 179     vfs_dcache_remove(dnode);
 180     vfs_dcache_add(new_parent, dnode);
 181 }
 182
 183 int
 184 vfs_open(struct v_dnode* dnode, struct v_file** file)
 185 {
 186     if (!dnode->inode || !dnode->inode->ops->open) {
 187         return ENOTSUP;
 188     }
 189
 190     struct v_inode* inode = dnode->inode;
 191
 192     lock_inode(inode);
 193
 194     struct v_file* vfile = cake_grab(file_pile);
 195     memset(vfile, 0, sizeof(*vfile));
 196
 197     vfile->dnode = dnode;
 198     vfile->inode = inode;
 199     vfile->ref_count = ATOMIC_VAR_INIT(1);
 200     vfile->ops = inode->default_fops;
 201
 202     if ((inode->itype & VFS_IFFILE) && !inode->pg_cache) {
 203         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 204         pcache_init(pcache);
 205         pcache->master = inode;
 206         inode->pg_cache = pcache;
 207     }
 208
 209     int errno = inode->ops->open(inode, vfile);
 210     if (errno) {
 211         cake_release(file_pile, vfile);
 212     } else {
 213         atomic_fetch_add(&dnode->ref_count, 1);
 214         inode->open_count++;
 215         mnt_mkbusy(dnode->mnt);
 216
 217         *file = vfile;
 218     }
 219
 220     unlock_inode(inode);
 221
 222     return errno;
 223 }
 224
 225 void
 226 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
 227 {
 228     if (assign_to->inode) {
 229         llist_delete(&assign_to->aka_list);
 230         assign_to->inode->link_count--;
 231     }
 232     llist_append(&inode->aka_dnodes, &assign_to->aka_list);
 233     assign_to->inode = inode;
 234     inode->link_count++;
 235 }
 236
 237 int
 238 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 239 {
 240     int errno;
 241
 242     if ((errno = vfs_check_writable(to_link))) {
 243         return errno;
 244     }
 245
 246     lock_inode(to_link->inode);
 247     if (to_link->super_block->root != name->super_block->root) {
 248         errno = EXDEV;
 249     } else if (!to_link->inode->ops->link) {
 250         errno = ENOTSUP;
 251     } else if (!(errno = to_link->inode->ops->link(to_link->inode, name))) {
 252         vfs_assign_inode(name, to_link->inode);
 253     }
 254     unlock_inode(to_link->inode);
 255
 256     return errno;
 257 }
 258
 259 int
 260 vfs_pclose(struct v_file* file, pid_t pid)
 261 {
 262     int errno = 0;
 263     if (file->ref_count > 1) {
 264         atomic_fetch_sub(&file->ref_count, 1);
 265     } else if (!(errno = file->ops->close(file))) {
 266         atomic_fetch_sub(&file->dnode->ref_count, 1);
 267         file->inode->open_count--;
 268
 269         /*
 270          * Prevent dead lock.
 271          * This happened when process is terminated while blocking on read.
 272          * In that case, the process is still holding the inode lock and it
 273              will never get released.
 274          * The unlocking should also include ownership check.
 275          *
 276          * To see why, consider two process both open the same file both with
 277          * fd=x.
 278          *      Process A: busy on reading x
 279          *      Process B: do nothing with x
 280          * Assuming that, after a very short time, process B get terminated
 281          * while process A is still busy in it's reading business. By this
 282          * design, the inode lock of this file x is get released by B rather
 283          * than A. And this will cause a probable race condition on A if other
 284          * process is writing to this file later after B exit.
 285          */
 286         if (mutex_on_hold(&file->inode->lock)) {
 287             mutex_unlock_for(&file->inode->lock, pid);
 288         }
 289         mnt_chillax(file->dnode->mnt);
 290
 291         pcache_commit_all(file->inode);
 292         cake_release(file_pile, file);
 293     }
 294     return errno;
 295 }
 296
 297 int
 298 vfs_close(struct v_file* file)
 299 {
 300     return vfs_pclose(file, __current->pid);
 301 }
 302
 303 void
 304 vfs_free_fd(struct v_fd* fd)
 305 {
 306     cake_release(fd_pile, fd);
 307 }
 308
 309 int
 310 vfs_fsync(struct v_file* file)
 311 {
 312     int errno;
 313     if ((errno = vfs_check_writable(file->dnode))) {
 314         return errno;
 315     }
 316
 317     lock_inode(file->inode);
 318
 319     pcache_commit_all(file->inode);
 320
 321     errno = ENOTSUP;
 322     if (file->ops->sync) {
 323         errno = file->ops->sync(file);
 324     }
 325
 326     unlock_inode(file->inode);
 327
 328     return errno;
 329 }
 330
 331 int
 332 vfs_alloc_fdslot(int* fd)
 333 {
 334     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 335         if (!__current->fdtable->fds[i]) {
 336             *fd = i;
 337             return 0;
 338         }
 339     }
 340     return EMFILE;
 341 }
 342
 343 struct v_superblock*
 344 vfs_sb_alloc()
 345 {
 346     struct v_superblock* sb = cake_grab(superblock_pile);
 347     memset(sb, 0, sizeof(*sb));
 348     llist_init_head(&sb->sb_list);
 349     sb->i_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
 350     return sb;
 351 }
 352
 353 void
 354 vfs_sb_free(struct v_superblock* sb)
 355 {
 356     vfree(sb->i_cache);
 357     cake_release(superblock_pile, sb);
 358 }
 359
 360 static int
 361 __vfs_try_evict_dnode(struct lru_node* obj)
 362 {
 363     struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
 364
 365     if (!dnode->ref_count) {
 366         vfs_d_free(dnode);
 367         return 1;
 368     }
 369     return 0;
 370 }
 371
 372 static int
 373 __vfs_try_evict_inode(struct lru_node* obj)
 374 {
 375     struct v_inode* inode = container_of(obj, struct v_inode, lru);
 376
 377     if (!inode->link_count && !inode->open_count) {
 378         vfs_i_free(inode);
 379         return 1;
 380     }
 381     return 0;
 382 }
 383
 384 struct v_dnode*
 385 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
 386 {
 387     struct v_dnode* dnode = cake_grab(dnode_pile);
 388     if (!dnode) {
 389         lru_evict_half(dnode_lru);
 390
 391         if (!(dnode = cake_grab(dnode_pile))) {
 392             return NULL;
 393         }
 394     }
 395
 396     memset(dnode, 0, sizeof(*dnode));
 397     llist_init_head(&dnode->children);
 398     llist_init_head(&dnode->siblings);
 399     llist_init_head(&dnode->aka_list);
 400     mutex_init(&dnode->lock);
 401
 402     dnode->ref_count = ATOMIC_VAR_INIT(0);
 403     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 404
 405     hstrcpy(&dnode->name, name);
 406
 407     if (parent) {
 408         dnode->super_block = parent->super_block;
 409         dnode->mnt = parent->mnt;
 410     }
 411
 412     lru_use_one(dnode_lru, &dnode->lru);
 413
 414     return dnode;
 415 }
 416
 417 void
 418 vfs_d_free(struct v_dnode* dnode)
 419 {
 420     assert(dnode->ref_count == 1);
 421
 422     if (dnode->inode) {
 423         assert(dnode->inode->link_count > 0);
 424         dnode->inode->link_count--;
 425     }
 426
 427     vfs_dcache_remove(dnode);
 428     // Make sure the children de-referencing their parent.
 429     // With lru presented, the eviction will be propagated over the entire
 430     // detached subtree eventually
 431     struct v_dnode *pos, *n;
 432     llist_for_each(pos, n, &dnode->children, siblings)
 433     {
 434         vfs_dcache_remove(pos);
 435     }
 436
 437     vfree(dnode->name.value);
 438     cake_release(dnode_pile, dnode);
 439 }
 440
 441 struct v_inode*
 442 vfs_i_find(struct v_superblock* sb, u32_t i_id)
 443 {
 444     struct hbucket* slot = &sb->i_cache[i_id & VFS_HASH_MASK];
 445     struct v_inode *pos, *n;
 446     hashtable_bucket_foreach(slot, pos, n, hash_list)
 447     {
 448         if (pos->id == i_id) {
 449             lru_use_one(inode_lru, &pos->lru);
 450             return pos;
 451         }
 452     }
 453
 454     return NULL;
 455 }
 456
 457 void
 458 vfs_i_addhash(struct v_inode* inode)
 459 {
 460     struct hbucket* slot = &inode->sb->i_cache[inode->id & VFS_HASH_MASK];
 461
 462     hlist_delete(&inode->hash_list);
 463     hlist_add(&slot->head, &inode->hash_list);
 464 }
 465
 466 struct v_inode*
 467 vfs_i_alloc(struct v_superblock* sb)
 468 {
 469     assert(sb->ops.init_inode);
 470
 471     struct v_inode* inode;
 472     if (!(inode = cake_grab(inode_pile))) {
 473         lru_evict_half(inode_lru);
 474         if (!(inode = cake_grab(inode_pile))) {
 475             return NULL;
 476         }
 477     }
 478
 479     memset(inode, 0, sizeof(*inode));
 480     mutex_init(&inode->lock);
 481     llist_init_head(&inode->xattrs);
 482     llist_init_head(&inode->aka_dnodes);
 483
 484     sb->ops.init_inode(sb, inode);
 485
 486     inode->sb = sb;
 487     inode->ctime = clock_unixtime();
 488     inode->atime = inode->ctime;
 489     inode->mtime = inode->ctime;
 490
 491 done:
 492     lru_use_one(inode_lru, &inode->lru);
 493     return inode;
 494 }
 495
 496 void
 497 vfs_i_free(struct v_inode* inode)
 498 {
 499     if (inode->pg_cache) {
 500         pcache_release(inode->pg_cache);
 501         vfree(inode->pg_cache);
 502     }
 503     // we don't need to sync inode.
 504     // If an inode can be free, then it must be properly closed.
 505     // Hence it must be synced already!
 506     if (inode->destruct) {
 507         inode->destruct(inode);
 508     }
 509     hlist_delete(&inode->hash_list);
 510     cake_release(inode_pile, inode);
 511 }
 512
 513 /* ---- System call definition and support ---- */
 514
 515 #define FLOCATE_CREATE_EMPTY 1
 516
 517 int
 518 vfs_getfd(int fd, struct v_fd** fd_s)
 519 {
 520     if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) {
 521         return 0;
 522     }
 523     return EBADF;
 524 }
 525
 526 int
 527 __vfs_try_locate_file(const char* path,
 528                       struct v_dnode** fdir,
 529                       struct v_dnode** file,
 530                       int options)
 531 {
 532     char name_str[VFS_NAME_MAXLEN];
 533     struct hstr name = HSTR(name_str, 0);
 534     int errno;
 535
 536     name_str[0] = 0;
 537     if ((errno = vfs_walk_proc(path, fdir, &name, VFS_WALK_PARENT))) {
 538         return errno;
 539     }
 540
 541     errno = vfs_walk(*fdir, name.value, file, NULL, 0);
 542     if (errno != ENOENT || !(options & FLOCATE_CREATE_EMPTY)) {
 543         return errno;
 544     }
 545
 546     struct v_dnode* parent = *fdir;
 547     struct v_dnode* file_new = vfs_d_alloc(parent, &name);
 548
 549     if (!file_new) {
 550         return ENOMEM;
 551     }
 552
 553     lock_dnode(parent);
 554
 555     if (!(errno = parent->inode->ops->create(parent->inode, file_new))) {
 556         vfs_dcache_add(parent, file_new);
 557         *file = file_new;
 558     } else {
 559         vfs_d_free(file_new);
 560     }
 561
 562     unlock_dnode(parent);
 563
 564     return errno;
 565 }
 566
 567 int
 568 vfs_do_open(const char* path, int options)
 569 {
 570     int errno, fd;
 571     struct v_dnode *dentry, *file;
 572     struct v_file* ofile = NULL;
 573
 574     errno = __vfs_try_locate_file(
 575       path, &dentry, &file, (options & FO_CREATE) ? FLOCATE_CREATE_EMPTY : 0);
 576
 577     if (!errno && !(errno = vfs_alloc_fdslot(&fd))) {
 578
 579         if (errno || (errno = vfs_open(file, &ofile))) {
 580             return errno;
 581         }
 582
 583         struct v_fd* fd_s = cake_grab(fd_pile);
 584         memset(fd_s, 0, sizeof(*fd_s));
 585
 586         ofile->f_pos = ofile->inode->fsize & -((options & FO_APPEND) != 0);
 587         fd_s->file = ofile;
 588         fd_s->flags = options;
 589         __current->fdtable->fds[fd] = fd_s;
 590         return fd;
 591     }
 592
 593     return errno;
 594 }
 595
 596 __DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
 597 {
 598     int errno = vfs_do_open(path, options);
 599     return DO_STATUS_OR_RETURN(errno);
 600 }
 601
 602 __DEFINE_LXSYSCALL1(int, close, int, fd)
 603 {
 604     struct v_fd* fd_s;
 605     int errno = 0;
 606     if ((errno = vfs_getfd(fd, &fd_s))) {
 607         goto done_err;
 608     }
 609
 610     if ((errno = vfs_close(fd_s->file))) {
 611         goto done_err;
 612     }
 613
 614     cake_release(fd_pile, fd_s);
 615     __current->fdtable->fds[fd] = 0;
 616
 617 done_err:
 618     return DO_STATUS(errno);
 619 }
 620
 621 void
 622 __vfs_readdir_callback(struct dir_context* dctx,
 623                        const char* name,
 624                        const int len,
 625                        const int dtype)
 626 {
 627     struct dirent* dent = (struct dirent*)dctx->cb_data;
 628     strncpy(dent->d_name, name, DIRENT_NAME_MAX_LEN);
 629     dent->d_nlen = len;
 630     dent->d_type = dtype;
 631 }
 632
 633 __DEFINE_LXSYSCALL2(int, readdir, int, fd, struct dirent*, dent)
 634 {
 635     struct v_fd* fd_s;
 636     int errno;
 637
 638     if ((errno = vfs_getfd(fd, &fd_s))) {
 639         goto done;
 640     }
 641
 642     struct v_inode* inode = fd_s->file->inode;
 643
 644     lock_inode(inode);
 645
 646     if (!(inode->itype & VFS_IFDIR)) {
 647         errno = ENOTDIR;
 648     } else {
 649         struct dir_context dctx =
 650           (struct dir_context){ .cb_data = dent,
 651                                 .index = dent->d_offset,
 652                                 .read_complete_callback =
 653                                   __vfs_readdir_callback };
 654         errno = 1;
 655         if (dent->d_offset == 0) {
 656             __vfs_readdir_callback(&dctx, vfs_dot.value, vfs_dot.len, DT_DIR);
 657         } else if (dent->d_offset == 1) {
 658             __vfs_readdir_callback(&dctx, vfs_ddot.value, vfs_ddot.len, DT_DIR);
 659         } else {
 660             dctx.index -= 2;
 661             if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
 662                 unlock_inode(inode);
 663                 goto done;
 664             }
 665         }
 666         dent->d_offset++;
 667     }
 668
 669     unlock_inode(inode);
 670
 671 done:
 672     return DO_STATUS_OR_RETURN(errno);
 673 }
 674
 675 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 676 {
 677     int errno = 0;
 678     struct v_fd* fd_s;
 679     if ((errno = vfs_getfd(fd, &fd_s))) {
 680         goto done;
 681     }
 682
 683     struct v_file* file = fd_s->file;
 684     if ((file->inode->itype & VFS_IFDIR)) {
 685         errno = EISDIR;
 686         goto done;
 687     }
 688
 689     lock_inode(file->inode);
 690
 691     file->inode->atime = clock_unixtime();
 692
 693     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 694         errno = file->ops->read(file->inode, buf, count, file->f_pos);
 695     } else {
 696         errno = pcache_read(file->inode, buf, count, file->f_pos);
 697     }
 698
 699     if (errno > 0) {
 700         file->f_pos += errno;
 701         unlock_inode(file->inode);
 702         return errno;
 703     }
 704
 705     unlock_inode(file->inode);
 706
 707 done:
 708     return DO_STATUS(errno);
 709 }
 710
 711 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
 712 {
 713     int errno = 0;
 714     struct v_fd* fd_s;
 715     if ((errno = vfs_getfd(fd, &fd_s))) {
 716         goto done;
 717     }
 718
 719     struct v_file* file = fd_s->file;
 720
 721     if ((errno = vfs_check_writable(file->dnode))) {
 722         goto done;
 723     }
 724
 725     if ((file->inode->itype & VFS_IFDIR)) {
 726         errno = EISDIR;
 727         goto done;
 728     }
 729
 730     lock_inode(file->inode);
 731
 732     file->inode->mtime = clock_unixtime();
 733
 734     if ((file->inode->itype & VFS_IFSEQDEV) || (fd_s->flags & FO_DIRECT)) {
 735         errno = file->ops->write(file->inode, buf, count, file->f_pos);
 736     } else {
 737         errno = pcache_write(file->inode, buf, count, file->f_pos);
 738     }
 739
 740     if (errno > 0) {
 741         file->f_pos += errno;
 742         unlock_inode(file->inode);
 743         return errno;
 744     }
 745
 746     unlock_inode(file->inode);
 747
 748 done:
 749     return DO_STATUS(errno);
 750 }
 751
 752 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
 753 {
 754     int errno = 0;
 755     struct v_fd* fd_s;
 756     if ((errno = vfs_getfd(fd, &fd_s))) {
 757         goto done;
 758     }
 759
 760     struct v_file* file = fd_s->file;
 761
 762     if (!file->ops->seek) {
 763         errno = ENOTSUP;
 764         goto done;
 765     }
 766
 767     lock_inode(file->inode);
 768
 769     int overflow = 0;
 770     int fpos = file->f_pos;
 771     switch (options) {
 772         case FSEEK_CUR:
 773             overflow = __builtin_sadd_overflow((int)file->f_pos, offset, &fpos);
 774             break;
 775         case FSEEK_END:
 776             overflow =
 777               __builtin_sadd_overflow((int)file->inode->fsize, offset, &fpos);
 778             break;
 779         case FSEEK_SET:
 780             fpos = offset;
 781             break;
 782     }
 783     if (overflow) {
 784         errno = EOVERFLOW;
 785     } else if (!(errno = file->ops->seek(file->inode, fpos))) {
 786         file->f_pos = fpos;
 787     }
 788
 789     unlock_inode(file->inode);
 790
 791 done:
 792     return DO_STATUS(errno);
 793 }
 794
 795 int
 796 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
 797 {
 798     if (!dnode) {
 799         return 0;
 800     }
 801
 802     if (depth > 64) {
 803         return ENAMETOOLONG;
 804     }
 805
 806     size_t len = 0;
 807
 808     if (dnode->parent != dnode) {
 809         len = vfs_get_path(dnode->parent, buf, size, depth + 1);
 810     }
 811
 812     if (len >= size) {
 813         return len;
 814     }
 815
 816     if (!len || buf[len - 1] != VFS_PATH_DELIM) {
 817         buf[len++] = VFS_PATH_DELIM;
 818     }
 819
 820     size_t cpy_size = MIN(dnode->name.len, size - len);
 821     strncpy(buf + len, dnode->name.value, cpy_size);
 822     len += cpy_size;
 823
 824     return len;
 825 }
 826
 827 int
 828 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
 829 {
 830     const char* link;
 831     struct v_inode* inode = dnode->inode;
 832     if (inode->ops->read_symlink) {
 833         lock_inode(inode);
 834
 835         int errno = inode->ops->read_symlink(inode, &link);
 836         strncpy(buf, link, size);
 837
 838         unlock_inode(inode);
 839         return errno;
 840     }
 841     return 0;
 842 }
 843
 844 int
 845 vfs_get_dtype(int itype)
 846 {
 847     switch (itype) {
 848         case VFS_IFDIR:
 849             return DT_DIR;
 850         case VFS_IFSYMLINK:
 851             return DT_SYMLINK;
 852         default:
 853             return DT_PIPE;
 854     }
 855 }
 856
 857 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
 858 {
 859     int errno;
 860     struct v_fd* fd_s;
 861     if ((errno = vfs_getfd(fd, &fd_s))) {
 862         goto done;
 863     }
 864
 865     struct v_dnode* dnode;
 866     errno = vfs_get_path(fd_s->file->dnode, buf, size, 0);
 867
 868     if (errno >= 0) {
 869         return errno;
 870     }
 871
 872 done:
 873     return DO_STATUS(errno);
 874 }
 875
 876 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
 877 {
 878     int errno;
 879     struct v_dnode* dnode;
 880     if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 881         errno = vfs_readlink(dnode, buf, size);
 882     }
 883
 884     if (errno >= 0) {
 885         return errno;
 886     }
 887
 888     return DO_STATUS(errno);
 889 }
 890
 891 __DEFINE_LXSYSCALL4(int,
 892                     readlinkat,
 893                     int,
 894                     dirfd,
 895                     const char*,
 896                     pathname,
 897                     char*,
 898                     buf,
 899                     size_t,
 900                     size)
 901 {
 902     int errno;
 903     struct v_fd* fd_s;
 904     if ((errno = vfs_getfd(dirfd, &fd_s))) {
 905         goto done;
 906     }
 907
 908     struct v_dnode* dnode;
 909     if (!(errno = vfs_walk(
 910             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 911         errno = vfs_readlink(fd_s->file->dnode, buf, size);
 912     }
 913
 914     if (errno >= 0) {
 915         return errno;
 916     }
 917
 918 done:
 919     return DO_STATUS(errno);
 920 }
 921
 922 /*
    NOTE
    When we perform an operation that could affect the layout of a
    directory (e.g., rename, mkdir, rmdir), we must lock the parent dir
    whenever possible. This blocks any ongoing path walk from reaching
    it and hence avoids exposing any partial state.
 928 */
 929
 930 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
 931 {
 932     int errno;
 933     struct v_dnode* dnode;
 934     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
 935         return DO_STATUS(errno);
 936     }
 937
 938     lock_dnode(dnode);
 939
 940     if ((errno = vfs_check_writable(dnode))) {
 941         goto done;
 942     }
 943
 944     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
 945         errno = EROFS;
 946         goto done;
 947     }
 948
 949     if (dnode->ref_count > 1 || dnode->inode->open_count) {
 950         errno = EBUSY;
 951         goto done;
 952     }
 953
 954     if (!llist_empty(&dnode->children)) {
 955         errno = ENOTEMPTY;
 956         goto done;
 957     }
 958
 959     struct v_dnode* parent = dnode->parent;
 960
 961     if (!parent) {
 962         errno = EINVAL;
 963         goto done;
 964     }
 965
 966     lock_dnode(parent);
 967     lock_inode(parent->inode);
 968
 969     if ((dnode->inode->itype & VFS_IFDIR)) {
 970         errno = parent->inode->ops->rmdir(parent->inode, dnode);
 971         if (!errno) {
 972             vfs_dcache_remove(dnode);
 973         }
 974     } else {
 975         errno = ENOTDIR;
 976     }
 977
 978     unlock_inode(parent->inode);
 979     unlock_dnode(parent);
 980
 981 done:
 982     unlock_dnode(dnode);
 983     return DO_STATUS(errno);
 984 }
 985
 986 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
 987 {
 988     int errno = 0;
 989     struct v_dnode *parent, *dir;
 990     char name_value[VFS_NAME_MAXLEN];
 991     struct hstr name = HHSTR(name_value, 0, 0);
 992
 993     if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
 994         goto done;
 995     }
 996
 997     if ((errno = vfs_check_writable(parent))) {
 998         goto done;
 999     }
1000
1001     if (!(dir = vfs_d_alloc(parent, &name))) {
1002         errno = ENOMEM;
1003         goto done;
1004     }
1005
1006     lock_dnode(parent);
1007     lock_inode(parent->inode);
1008
1009     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
1010         errno = ENOTSUP;
1011     } else if (!parent->inode->ops->mkdir) {
1012         errno = ENOTSUP;
1013     } else if (!(parent->inode->itype & VFS_IFDIR)) {
1014         errno = ENOTDIR;
1015     } else if (!(errno = parent->inode->ops->mkdir(parent->inode, dir))) {
1016         vfs_dcache_add(parent, dir);
1017         goto cleanup;
1018     }
1019
1020     vfs_d_free(dir);
1021
1022 cleanup:
1023     unlock_inode(parent->inode);
1024     unlock_dnode(parent);
1025 done:
1026     return DO_STATUS(errno);
1027 }
1028
1029 int
1030 __vfs_do_unlink(struct v_dnode* dnode)
1031 {
1032     int errno;
1033     struct v_inode* inode = dnode->inode;
1034
1035     if (dnode->ref_count > 1) {
1036         return EBUSY;
1037     }
1038
1039     if ((errno = vfs_check_writable(dnode))) {
1040         return errno;
1041     }
1042
1043     lock_inode(inode);
1044
1045     if (inode->open_count) {
1046         errno = EBUSY;
1047     } else if (!(inode->itype & VFS_IFDIR)) {
1048         // The underlying unlink implementation should handle
1049         //  symlink case
1050         errno = inode->ops->unlink(inode);
1051         if (!errno) {
1052             vfs_d_free(dnode);
1053         }
1054     } else {
1055         errno = EISDIR;
1056     }
1057
1058     unlock_inode(inode);
1059
1060     return errno;
1061 }
1062
1063 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1064 {
1065     int errno;
1066     struct v_dnode* dnode;
1067     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1068         goto done;
1069     }
1070
1071     errno = __vfs_do_unlink(dnode);
1072
1073 done:
1074     return DO_STATUS(errno);
1075 }
1076
1077 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1078 {
1079     int errno;
1080     struct v_fd* fd_s;
1081     if ((errno = vfs_getfd(fd, &fd_s))) {
1082         goto done;
1083     }
1084
1085     struct v_dnode* dnode;
1086     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1087         errno = __vfs_do_unlink(dnode);
1088     }
1089
1090 done:
1091     return DO_STATUS(errno);
1092 }
1093
1094 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1095 {
1096     int errno;
1097     struct v_dnode *dentry, *to_link, *name_dentry, *name_file;
1098
1099     errno = __vfs_try_locate_file(oldpath, &dentry, &to_link, 0);
1100     if (!errno) {
1101         errno = __vfs_try_locate_file(
1102           newpath, &name_dentry, &name_file, FLOCATE_CREATE_EMPTY);
1103         if (!errno) {
1104             errno = EEXIST;
1105         } else if (name_file) {
1106             errno = vfs_link(to_link, name_file);
1107         }
1108     }
1109     return DO_STATUS(errno);
1110 }
1111
1112 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1113 {
1114     int errno;
1115     struct v_fd* fd_s;
1116
1117     if (!(errno = vfs_getfd(fildes, &fd_s))) {
1118         errno = vfs_fsync(fd_s->file);
1119     }
1120
1121     return DO_STATUS(errno);
1122 }
1123
1124 int
1125 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1126 {
1127     int errno = 0;
1128     struct v_fd* copied = cake_grab(fd_pile);
1129
1130     memcpy(copied, old, sizeof(struct v_fd));
1131
1132     atomic_fetch_add(&old->file->ref_count, 1);
1133
1134     *new = copied;
1135
1136     return errno;
1137 }
1138
1139 int
1140 vfs_dup2(int oldfd, int newfd)
1141 {
1142     if (newfd == oldfd) {
1143         return newfd;
1144     }
1145
1146     int errno;
1147     struct v_fd *oldfd_s, *newfd_s;
1148     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1149         goto done;
1150     }
1151
1152     if (!TEST_FD(newfd)) {
1153         errno = EBADF;
1154         goto done;
1155     }
1156
1157     newfd_s = __current->fdtable->fds[newfd];
1158     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1159         goto done;
1160     }
1161
1162     if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1163         __current->fdtable->fds[newfd] = newfd_s;
1164         return newfd;
1165     }
1166
1167 done:
1168     return DO_STATUS(errno);
1169 }
1170
1171 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1172 {
1173     return vfs_dup2(oldfd, newfd);
1174 }
1175
1176 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1177 {
1178     int errno, newfd;
1179     struct v_fd *oldfd_s, *newfd_s;
1180     if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1181         goto done;
1182     }
1183
1184     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1185         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1186         __current->fdtable->fds[newfd] = newfd_s;
1187         return newfd;
1188     }
1189
1190 done:
1191     return DO_STATUS(errno);
1192 }
1193
1194 __DEFINE_LXSYSCALL2(int,
1195                     symlink,
1196                     const char*,
1197                     pathname,
1198                     const char*,
1199                     link_target)
1200 {
1201     int errno;
1202     struct v_dnode* dnode;
1203     if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1204         goto done;
1205     }
1206
1207     if (errno = vfs_check_writable(dnode)) {
1208         goto done;
1209     }
1210
1211     if (!dnode->inode->ops->set_symlink) {
1212         errno = ENOTSUP;
1213         goto done;
1214     }
1215
1216     lock_inode(dnode->inode);
1217
1218     errno = dnode->inode->ops->set_symlink(dnode->inode, link_target);
1219
1220     unlock_inode(dnode->inode);
1221
1222 done:
1223     return DO_STATUS(errno);
1224 }
1225
/**
 * Take a reference on `dnode`: atomically increment its ref_count, then pass
 * its mount (dnode->mnt) to mnt_mkbusy.
 *
 * @param dnode dnode being referenced
 */
void
vfs_ref_dnode(struct v_dnode* dnode)
{
    atomic_fetch_add(&dnode->ref_count, 1);
    // NOTE(review): presumably marks the owning mount point as in use so it
    // cannot be unmounted - confirm against mnt_mkbusy.
    mnt_mkbusy(dnode->mnt);
}
1232
/**
 * Drop a reference on `dnode`: atomically decrement its ref_count, then pass
 * its mount (dnode->mnt) to mnt_chillax. Counterpart of vfs_ref_dnode.
 *
 * @param dnode dnode being released
 */
void
vfs_unref_dnode(struct v_dnode* dnode)
{
    atomic_fetch_sub(&dnode->ref_count, 1);
    // NOTE(review): presumably undoes one mnt_mkbusy on the owning mount
    // point - confirm against mnt_chillax.
    mnt_chillax(dnode->mnt);
}
1239
1240 int
1241 vfs_do_chdir(struct proc_info* proc, struct v_dnode* dnode)
1242 {
1243     int errno = 0;
1244
1245     lock_dnode(dnode);
1246
1247     if (!(dnode->inode->itype & VFS_IFDIR)) {
1248         errno = ENOTDIR;
1249         goto done;
1250     }
1251
1252     if (proc->cwd) {
1253         vfs_unref_dnode(proc->cwd);
1254     }
1255
1256     vfs_ref_dnode(dnode);
1257     proc->cwd = dnode;
1258
1259     unlock_dnode(dnode);
1260
1261 done:
1262     return errno;
1263 }
1264
1265 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1266 {
1267     struct v_dnode* dnode;
1268     int errno = 0;
1269
1270     if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1271         goto done;
1272     }
1273
1274     errno = vfs_do_chdir(__current, dnode);
1275
1276 done:
1277     return DO_STATUS(errno);
1278 }
1279
1280 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1281 {
1282     struct v_fd* fd_s;
1283     int errno = 0;
1284
1285     if ((errno = vfs_getfd(fd, &fd_s))) {
1286         goto done;
1287     }
1288
1289     errno = vfs_do_chdir(__current, fd_s->file->dnode);
1290
1291 done:
1292     return DO_STATUS(errno);
1293 }
1294
1295 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1296 {
1297     int errno = 0;
1298     char* ret_ptr = 0;
1299     if (size < 2) {
1300         errno = ERANGE;
1301         goto done;
1302     }
1303
1304     size_t len = 0;
1305
1306     if (!__current->cwd) {
1307         *buf = VFS_PATH_DELIM;
1308         len = 1;
1309     } else {
1310         len = vfs_get_path(__current->cwd, buf, size, 0);
1311         if (len == size) {
1312             errno = ERANGE;
1313             goto done;
1314         }
1315     }
1316
1317     buf[len + 1] = '\0';
1318
1319     ret_ptr = buf;
1320
1321 done:
1322     __current->k_status = errno;
1323     return ret_ptr;
1324 }
1325
1326 int
1327 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1328 {
1329     int errno = 0;
1330     if (current->inode->id == target->inode->id) {
1331         // hard link
1332         return 0;
1333     }
1334
1335     if (errno = vfs_check_writable(current)) {
1336         return errno;
1337     }
1338
1339     if (current->ref_count > 1 || target->ref_count > 1) {
1340         return EBUSY;
1341     }
1342
1343     if (current->super_block != target->super_block) {
1344         return EXDEV;
1345     }
1346
1347     struct v_dnode* oldparent = current->parent;
1348     struct v_dnode* newparent = target->parent;
1349
1350     lock_dnode(current);
1351     lock_dnode(target);
1352     if (oldparent)
1353         lock_dnode(oldparent);
1354     if (newparent)
1355         lock_dnode(newparent);
1356
1357     if (!llist_empty(&target->children)) {
1358         errno = ENOTEMPTY;
1359         unlock_dnode(target);
1360         goto cleanup;
1361     }
1362
1363     if ((errno =
1364            current->inode->ops->rename(current->inode, current, target))) {
1365         unlock_dnode(target);
1366         goto cleanup;
1367     }
1368
1369     // re-position current
1370     hstrcpy(&current->name, &target->name);
1371     vfs_dcache_rehash(newparent, current);
1372
1373     // detach target
1374     vfs_d_free(target);
1375
1376     unlock_dnode(target);
1377
1378 cleanup:
1379     unlock_dnode(current);
1380     if (oldparent)
1381         unlock_dnode(oldparent);
1382     if (newparent)
1383         unlock_dnode(newparent);
1384
1385     return errno;
1386 }
1387
1388 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1389 {
1390     struct v_dnode *cur, *target_parent, *target;
1391     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1392     int errno = 0;
1393
1394     if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1395         goto done;
1396     }
1397
1398     if ((errno = vfs_walk(
1399            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1400         goto done;
1401     }
1402
1403     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1404     if (errno == ENOENT) {
1405         target = vfs_d_alloc(target_parent, &name);
1406         vfs_dcache_add(target_parent, target);
1407     } else if (errno) {
1408         goto done;
1409     }
1410
1411     if (!target) {
1412         errno = ENOMEM;
1413         goto done;
1414     }
1415
1416     errno = vfs_do_rename(cur, target);
1417
1418 done:
1419     vfree(name.value);
1420     return DO_STATUS(errno);
1421 }