lunaix-os/kernel/fs/vfs.c

   1 /**
   2  * @file vfs.c
   3  * @author Lunaixsky (zelong56@gmail.com)
   4  * @brief Lunaix virtual file system - an abstraction layer for all file system.
   5  * @version 0.1
   6  * @date 2022-07-24
   7  *
   8  * @copyright Copyright (c) 2022
   9  *
  10  */
  11
  12 // Welcome to The Mountain O'Shit! :)
  13
  14 /*
  15  TODO vfs & device todos checklist
  16
  17     It is overseen by Twilight Sparkle ;)
  18
  19  1. Get inodes hooked into lru
  20  2. Get dnodes hooked into lru
  21  3. Get inodes properly hashed so they can be reused by the underlying fs (CHECKED)
  22  4. (lru) Add a callback function (or destructor) for eviction.
  23         [good idea] or a constructor/destructor pattern in cake allocator ?
  24  5. (mount) Figure out a way to identify a busy mount point before unmount
  25             maybe a unified mount_point structure that maintains a reference
  26             counter on any dnodes within the subtree? Such a counter will only
  27             increment if a file is opened or a dnode is being used as working
  28             directory, and decrement conversely.
  29  6. (mount) Ability to track all mount points (including sub-mounts)
  30             so we can be confident to clean up everything when we unmount.
  31  7. (mount) Figure out a way to acquire the device represented by a dnode.
  32             so it can be used to mount. (e.g. we wish to get `struct device*`
  33             out of the dnode at /dev/sda)
  34             [tip] we should pay attention at twifs and add a private_data field
  35             under struct v_dnode?
  36  8. (mount) Then, we should refactor on mount/unmount mechanism.
  37  9. (mount) (future) Ability to mount any thing? e.g. Linux can mount a disk
  38                     image file using a so called "loopback" pseudo device. Maybe
  39                     we can do similar thing in Lunaix? A block device emulation
  40                     above the regular file when we mount it on.
  41  10. (device) device number (dev_t) allocation
  42             [good idea] <class>:<subclass>:<uniq_id> composition
  43 */
  44
  45 #include <klibc/string.h>
  46 #include <lunaix/dirent.h>
  47 #include <lunaix/foptions.h>
  48 #include <lunaix/fs.h>
  49 #include <lunaix/mm/cake.h>
  50 #include <lunaix/mm/page.h>
  51 #include <lunaix/mm/valloc.h>
  52 #include <lunaix/process.h>
  53 #include <lunaix/spike.h>
  54 #include <lunaix/syscall.h>
  55
  56 #include <lunaix/fs/twifs.h>
  57
  58 #define PATH_DELIM '/'
  59 #define HASHTABLE_BITS 10
  60 #define HASHTABLE_SIZE (1 << HASHTABLE_BITS)
  61 #define HASH_MASK (HASHTABLE_SIZE - 1)
  62 #define HASHBITS (32 - HASHTABLE_BITS)
  63
  64 #define lock_inode(inode) mutex_lock(&inode->lock)
  65 #define unlock_inode(inode) mutex_unlock(&inode->lock)
  66
  67 #define lock_dnode(dnode) mutex_lock(&dnode->lock)
  68 #define unlock_dnode(dnode) mutex_unlock(&dnode->lock)
  69
  70 static struct cake_pile* dnode_pile;
  71 static struct cake_pile* inode_pile;
  72 static struct cake_pile* file_pile;
  73 static struct cake_pile* superblock_pile;
  74 static struct cake_pile* fd_pile;
  75
  76 static struct v_superblock* root_sb;
  77 static struct hbucket *dnode_cache, *inode_cache;
  78
  79 struct hstr vfs_ddot = HSTR("..", 2);
  80 struct hstr vfs_dot = HSTR(".", 1);
  81 struct hstr vfs_empty = HSTR("", 0);
  82
  83 struct v_superblock*
  84 vfs_sb_alloc();
  85
  86 void
  87 vfs_sb_free(struct v_superblock* sb);
  88
  89 void
  90 vfs_init()
  91 {
  92     // 为他们专门创建一个蛋糕堆，而不使用valloc，这样我们可以最小化内碎片的产生
  93     dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
  94     inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
  95     file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
  96     fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
  97     superblock_pile =
  98       cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
  99
 100     dnode_cache = vzalloc(HASHTABLE_SIZE * sizeof(struct hbucket));
 101     inode_cache = vzalloc(HASHTABLE_SIZE * sizeof(struct hbucket));
 102
 103     hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
 104     hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
 105
 106     // 创建一个根superblock，用来蕴含我们的根目录。
 107     root_sb = vfs_sb_alloc();
 108     root_sb->root = vfs_d_alloc();
 109     root_sb->root->inode = vfs_i_alloc(root_sb, 0);
 110 }
 111
 112 inline struct hbucket*
 113 __dcache_hash(struct v_dnode* parent, uint32_t* hash)
 114 {
 115     uint32_t _hash = *hash;
 116     // 与parent的指针值做加法，来减小碰撞的可能性。
 117     _hash += (uint32_t)parent;
 118     // 确保低位更加随机
 119     _hash = _hash ^ (_hash >> HASHBITS);
 120     *hash = _hash;
 121     return &dnode_cache[_hash & HASH_MASK];
 122 }
 123
 124 struct v_dnode*
 125 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
 126 {
 127     if (!str->len || HSTR_EQ(str, &vfs_dot))
 128         return parent;
 129
 130     if (HSTR_EQ(str, &vfs_ddot)) {
 131         return parent->parent ? parent->parent : parent;
 132     }
 133
 134     uint32_t hash = str->hash;
 135     struct hbucket* slot = __dcache_hash(parent, &hash);
 136
 137     struct v_dnode *pos, *n;
 138     hashtable_bucket_foreach(slot, pos, n, hash_list)
 139     {
 140         if (pos->name.hash == hash) {
 141             return pos;
 142         }
 143     }
 144     return NULL;
 145 }
 146
 147 void
 148 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
 149 {
 150     atomic_fetch_add(&dnode->ref_count, 1);
 151     dnode->parent = parent;
 152     llist_append(&parent->children, &dnode->siblings);
 153     struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash);
 154     hlist_add(&bucket->head, &dnode->hash_list);
 155 }
 156
 157 void
 158 vfs_dcache_remove(struct v_dnode* dnode)
 159 {
 160     assert(dnode->ref_count == 1);
 161
 162     llist_delete(&dnode->siblings);
 163     hlist_delete(&dnode->hash_list);
 164
 165     dnode->parent = NULL;
 166     atomic_fetch_sub(&dnode->ref_count, 1);
 167 }
 168
 169 void
 170 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
 171 {
 172     hstr_rehash(&dnode->name, HSTR_FULL_HASH);
 173     vfs_dcache_remove(dnode);
 174     vfs_dcache_add(new_parent, dnode);
 175 }
 176
 177 int
 178 __vfs_walk(struct v_dnode* start,
 179            const char* path,
 180            struct v_dnode** dentry,
 181            struct hstr* component,
 182            int walk_options)
 183 {
 184     int errno = 0;
 185     int i = 0, j = 0;
 186
 187     if (path[0] == PATH_DELIM || !start) {
 188         if ((walk_options & VFS_WALK_FSRELATIVE) && start) {
 189             start = start->super_block->root;
 190         } else {
 191             start = root_sb->root;
 192         }
 193         i++;
 194     }
 195
 196     struct v_dnode* dnode;
 197     struct v_dnode* current_level = start;
 198
 199     char name_content[VFS_NAME_MAXLEN];
 200     struct hstr name = HSTR(name_content, 0);
 201
 202     char current = path[i++], lookahead;
 203     while (current) {
 204         lookahead = path[i++];
 205         if (current != PATH_DELIM) {
 206             if (j >= VFS_NAME_MAXLEN - 1) {
 207                 return ENAMETOOLONG;
 208             }
 209             if (!VFS_VALID_CHAR(current)) {
 210                 return EINVAL;
 211             }
 212             name_content[j++] = current;
 213             if (lookahead) {
 214                 goto cont;
 215             }
 216         }
 217
 218         // handling cases like /^.*(\/+).*$/
 219         if (lookahead == PATH_DELIM) {
 220             goto cont;
 221         }
 222
 223         lock_dnode(current_level);
 224
 225         name_content[j] = 0;
 226         name.len = j;
 227         hstr_rehash(&name, HSTR_FULL_HASH);
 228
 229         if (!lookahead && (walk_options & VFS_WALK_PARENT)) {
 230             if (component) {
 231                 component->hash = name.hash;
 232                 component->len = j;
 233                 strcpy(component->value, name_content);
 234             }
 235             unlock_dnode(current_level);
 236             break;
 237         }
 238
 239         dnode = vfs_dcache_lookup(current_level, &name);
 240
 241         if (!dnode) {
 242             dnode = vfs_d_alloc();
 243
 244             hstrcpy(&dnode->name, &name);
 245
 246             lock_inode(current_level->inode);
 247
 248             errno =
 249               current_level->inode->ops.dir_lookup(current_level->inode, dnode);
 250
 251             if (errno == ENOENT && (walk_options & VFS_WALK_MKPARENT)) {
 252                 if (!current_level->inode->ops.mkdir) {
 253                     errno = ENOTSUP;
 254                 } else {
 255                     errno = current_level->inode->ops.mkdir(
 256                       current_level->inode, dnode);
 257                 }
 258             }
 259
 260             unlock_inode(current_level->inode);
 261
 262             if (errno) {
 263                 unlock_dnode(current_level);
 264                 vfree(dnode->name.value);
 265                 goto error;
 266             }
 267
 268             vfs_dcache_add(current_level, dnode);
 269         }
 270
 271         unlock_dnode(current_level);
 272
 273         j = 0;
 274         current_level = dnode;
 275     cont:
 276         current = lookahead;
 277     };
 278
 279     *dentry = current_level;
 280     return 0;
 281
 282 error:
 283     vfs_d_free(dnode);
 284     *dentry = NULL;
 285     return errno;
 286 }
 287
 288 #define VFS_MAX_SYMLINK 16
 289
 290 int
 291 vfs_walk(struct v_dnode* start,
 292          const char* path,
 293          struct v_dnode** dentry,
 294          struct hstr* component,
 295          int options)
 296 {
 297     struct v_dnode* interim;
 298     const char* pathname = path;
 299     int errno = __vfs_walk(start, path, &interim, component, options);
 300     int counter = 0;
 301
 302     while (!errno) {
 303         if (counter >= VFS_MAX_SYMLINK) {
 304             errno = ELOOP;
 305             continue;
 306         }
 307         if ((interim->inode->itype & VFS_IFSYMLINK) &&
 308             !(options & VFS_WALK_NOFOLLOW) &&
 309             interim->inode->ops.read_symlink) {
 310             errno = interim->inode->ops.read_symlink(interim->inode, &pathname);
 311             if (errno) {
 312                 break;
 313             }
 314         } else {
 315             break;
 316         }
 317         errno = __vfs_walk(start, pathname, &interim, component, options);
 318         counter++;
 319     }
 320
 321     *dentry = errno ? 0 : interim;
 322
 323     return errno;
 324 }
 325
 326 int
 327 vfs_mount(const char* target, const char* fs_name, struct device* device)
 328 {
 329     int errno;
 330     struct v_dnode* mnt;
 331
 332     if (!(errno = vfs_walk(__current->cwd, target, &mnt, NULL, 0))) {
 333         errno = vfs_mount_at(fs_name, device, mnt);
 334     }
 335
 336     return errno;
 337 }
 338
 339 int
 340 vfs_unmount(const char* target)
 341 {
 342     int errno;
 343     struct v_dnode* mnt;
 344
 345     if (!(errno = vfs_walk(__current->cwd, target, &mnt, NULL, 0))) {
 346         errno = vfs_unmount_at(mnt);
 347     }
 348
 349     return errno;
 350 }
 351
 352 int
 353 vfs_mount_at(const char* fs_name,
 354              struct device* device,
 355              struct v_dnode* mnt_point)
 356 {
 357     if (!(mnt_point->inode->itype & VFS_IFDIR)) {
 358         return ENOTDIR;
 359     }
 360
 361     struct filesystem* fs = fsm_get(fs_name);
 362     if (!fs) {
 363         return ENODEV;
 364     }
 365
 366     struct v_superblock* sb = vfs_sb_alloc();
 367     sb->dev = device;
 368     sb->fs_id = fs->fs_id;
 369
 370     int errno = 0;
 371     if (!(errno = fs->mount(sb, mnt_point))) {
 372         sb->fs = fs;
 373         sb->root = mnt_point;
 374         mnt_point->super_block = sb;
 375         llist_append(&root_sb->sb_list, &sb->sb_list);
 376     }
 377
 378     return errno;
 379 }
 380
 381 int
 382 vfs_unmount_at(struct v_dnode* mnt_point)
 383 {
 384     // FIXME deal with the detached dcache subtree
 385     int errno = 0;
 386     struct v_superblock* sb = mnt_point->super_block;
 387     if (!sb) {
 388         return EINVAL;
 389     }
 390
 391     if (sb->root != mnt_point) {
 392         return EINVAL;
 393     }
 394
 395     if (!(errno = sb->fs->unmount(sb))) {
 396         struct v_dnode* fs_root = sb->root;
 397         vfs_dcache_remove(fs_root);
 398
 399         llist_delete(&sb->sb_list);
 400         vfs_sb_free(sb);
 401         vfs_d_free(fs_root);
 402     }
 403     return errno;
 404 }
 405
 406 int
 407 vfs_open(struct v_dnode* dnode, struct v_file** file)
 408 {
 409     if (!dnode->inode || !dnode->inode->ops.open) {
 410         return ENOTSUP;
 411     }
 412
 413     struct v_inode* inode = dnode->inode;
 414     struct v_file* vfile = cake_grab(file_pile);
 415     memset(vfile, 0, sizeof(*vfile));
 416
 417     vfile->dnode = dnode;
 418     vfile->inode = inode;
 419     vfile->ref_count = ATOMIC_VAR_INIT(1);
 420     vfile->ops = inode->default_fops;
 421
 422     if ((inode->itype & VFS_IFFILE) && !inode->pg_cache) {
 423         struct pcache* pcache = vzalloc(sizeof(struct pcache));
 424         pcache_init(pcache);
 425         pcache->master = inode;
 426         inode->pg_cache = pcache;
 427     }
 428
 429     int errno = inode->ops.open(inode, vfile);
 430     if (errno) {
 431         cake_release(file_pile, vfile);
 432     } else {
 433         atomic_fetch_add(&dnode->ref_count, 1);
 434         inode->open_count++;
 435
 436         *file = vfile;
 437     }
 438
 439     return errno;
 440 }
 441
 442 int
 443 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
 444 {
 445     int errno;
 446
 447     lock_inode(to_link->inode);
 448     if (to_link->super_block->root != name->super_block->root) {
 449         errno = EXDEV;
 450     } else if (!to_link->inode->ops.link) {
 451         errno = ENOTSUP;
 452     } else if (!(errno = to_link->inode->ops.link(to_link->inode, name))) {
 453         name->inode = to_link->inode;
 454         to_link->inode->link_count++;
 455     }
 456     unlock_inode(to_link->inode);
 457
 458     return errno;
 459 }
 460
 461 int
 462 vfs_close(struct v_file* file)
 463 {
 464     int errno = 0;
 465     if (!file->ops.close || !(errno = file->ops.close(file))) {
 466         atomic_fetch_sub(&file->dnode->ref_count, 1);
 467         file->inode->open_count--;
 468
 469         pcache_commit_all(file->inode);
 470         cake_release(file_pile, file);
 471     }
 472     return errno;
 473 }
 474
 475 int
 476 vfs_fsync(struct v_file* file)
 477 {
 478     lock_inode(file->inode);
 479
 480     int errno = ENOTSUP;
 481     pcache_commit_all(file->inode);
 482     if (file->ops.sync) {
 483         errno = file->ops.sync(file->inode);
 484     }
 485
 486     unlock_inode(file->inode);
 487
 488     return errno;
 489 }
 490
 491 int
 492 vfs_alloc_fdslot(int* fd)
 493 {
 494     for (size_t i = 0; i < VFS_MAX_FD; i++) {
 495         if (!__current->fdtable->fds[i]) {
 496             *fd = i;
 497             return 0;
 498         }
 499     }
 500     return EMFILE;
 501 }
 502
 503 struct v_superblock*
 504 vfs_sb_alloc()
 505 {
 506     struct v_superblock* sb = cake_grab(superblock_pile);
 507     memset(sb, 0, sizeof(*sb));
 508     llist_init_head(&sb->sb_list);
 509     return sb;
 510 }
 511
 512 void
 513 vfs_sb_free(struct v_superblock* sb)
 514 {
 515     cake_release(superblock_pile, sb);
 516 }
 517
 518 struct v_dnode*
 519 vfs_d_alloc()
 520 {
 521     struct v_dnode* dnode = cake_grab(dnode_pile);
 522     memset(dnode, 0, sizeof(*dnode));
 523     llist_init_head(&dnode->children);
 524     llist_init_head(&dnode->siblings);
 525     mutex_init(&dnode->lock);
 526
 527     dnode->ref_count = ATOMIC_VAR_INIT(0);
 528     dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
 529
 530     return dnode;
 531 }
 532
 533 void
 534 vfs_d_free(struct v_dnode* dnode)
 535 {
 536     if (dnode->ref_count) {
 537         // it can be only freed if no one is refering
 538         return;
 539     }
 540     if (dnode->inode && dnode->inode->link_count) {
 541         dnode->inode->link_count--;
 542     }
 543
 544     // Make sure the children de-referencing their parent.
 545     // With lru presented, the eviction will be propagated over the entire
 546     // detached subtree eventually
 547     struct v_dnode *pos, *n;
 548     llist_for_each(pos, n, &dnode->children, siblings)
 549     {
 550         vfs_dcache_remove(pos);
 551     }
 552
 553     vfree(dnode->name.value);
 554     cake_release(dnode_pile, dnode);
 555 }
 556
 557 struct v_inode*
 558 vfs_i_alloc(dev_t device_id, uint32_t inode_id)
 559 {
 560     // 我们这里假设每个文件系统与设备是一一对应（毕竟一个分区不可能有两个不同的文件系统）
 561     // 而每个文件系统所产生的 v_inode 缓存必须要和其他文件系统产生的区分开来。
 562     // 这也就是说，每个 v_inode 的 id
 563     // 必须要由设备ID，和该虚拟inode缓存所对应的物理inode
 564     // 相对于其所在的文件系统的id，进行组成！
 565     inode_id = hash_32(inode_id ^ device_id, HASH_SIZE_BITS);
 566     inode_id = (inode_id >> HASHBITS) ^ inode_id;
 567
 568     struct hbucket* slot = &inode_cache[inode_id & HASH_MASK];
 569     struct v_inode *pos, *n;
 570     hashtable_bucket_foreach(slot, pos, n, hash_list)
 571     {
 572         if (pos->id == inode_id) {
 573             return pos;
 574         }
 575     }
 576
 577     pos = cake_grab(inode_pile);
 578     memset(pos, 0, sizeof(*pos));
 579
 580     pos->id = inode_id;
 581     pos->link_count = 1;
 582
 583     mutex_init(&pos->lock);
 584
 585     hlist_add(&slot->head, &pos->hash_list);
 586
 587     return pos;
 588 }
 589
 590 void
 591 vfs_i_free(struct v_inode* inode)
 592 {
 593     hlist_delete(&inode->hash_list);
 594     cake_release(inode_pile, inode);
 595 }
 596
 597 /* ---- System call definition and support ---- */
 598
 599 #define FLOCATE_CREATE_EMPTY 1
 600
 601 #define DO_STATUS(errno) SYSCALL_ESTATUS(__current->k_status = errno)
 602 #define DO_STATUS_OR_RETURN(errno) ({ errno < 0 ? DO_STATUS(errno) : errno; })
 603
 604 #define TEST_FD(fd) (fd >= 0 && fd < VFS_MAX_FD)
 605
 606 int
 607 __vfs_getfd(int fd, struct v_fd** fd_s)
 608 {
 609     if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) {
 610         return 0;
 611     }
 612     return EBADF;
 613 }
 614
 615 int
 616 __vfs_try_locate_file(const char* path,
 617                       struct v_dnode** fdir,
 618                       struct v_dnode** file,
 619                       int options)
 620 {
 621     char name_str[VFS_NAME_MAXLEN];
 622     struct hstr name = HSTR(name_str, 0);
 623     int errno;
 624     if ((errno =
 625            vfs_walk(__current->cwd, path, fdir, &name, VFS_WALK_PARENT))) {
 626         return errno;
 627     }
 628
 629     errno = vfs_walk(*fdir, name.value, file, NULL, 0);
 630     if (errno != ENOENT || !(options & FLOCATE_CREATE_EMPTY)) {
 631         return errno;
 632     }
 633
 634     struct v_dnode* parent = *fdir;
 635     struct v_dnode* file_new = vfs_d_alloc();
 636     hstrcpy(&file_new->name, &name);
 637
 638     if (!(errno = parent->inode->ops.create(parent->inode, file_new))) {
 639         *file = file_new;
 640
 641         vfs_dcache_add(parent, file_new);
 642         llist_append(&parent->children, &file_new->siblings);
 643     } else {
 644         vfs_d_free(file_new);
 645     }
 646
 647     return errno;
 648 }
 649
 650 int
 651 vfs_do_open(const char* path, int options)
 652 {
 653     int errno, fd;
 654     struct v_dnode *dentry, *file;
 655     struct v_file* ofile = 0;
 656
 657     errno = __vfs_try_locate_file(
 658       path, &dentry, &file, (options & FO_CREATE) ? FLOCATE_CREATE_EMPTY : 0);
 659
 660     if (errno || (errno = vfs_open(file, &ofile))) {
 661         return errno;
 662     }
 663
 664     struct v_inode* o_inode = ofile->inode;
 665     if (!(o_inode->itype & VFS_IFSEQDEV) && !(options & FO_DIRECT)) {
 666         // XXX Change here accordingly when signature of pcache_r/w changed.
 667         ofile->ops.read = pcache_read;
 668         ofile->ops.write = pcache_write;
 669     }
 670
 671     if (!errno && !(errno = vfs_alloc_fdslot(&fd))) {
 672         struct v_fd* fd_s = vzalloc(sizeof(*fd_s));
 673         ofile->f_pos = ofile->inode->fsize & -((options & FO_APPEND) != 0);
 674         fd_s->file = ofile;
 675         fd_s->flags = options;
 676         __current->fdtable->fds[fd] = fd_s;
 677         return fd;
 678     }
 679
 680     return errno;
 681 }
 682
 683 __DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
 684 {
 685     int errno = vfs_do_open(path, options);
 686     return DO_STATUS_OR_RETURN(errno);
 687 }
 688
 689 __DEFINE_LXSYSCALL1(int, close, int, fd)
 690 {
 691     struct v_fd* fd_s;
 692     int errno = 0;
 693     if ((errno = __vfs_getfd(fd, &fd_s))) {
 694         goto done_err;
 695     }
 696
 697     if (fd_s->file->ref_count > 1) {
 698         fd_s->file->ref_count--;
 699     } else if ((errno = vfs_close(fd_s->file))) {
 700         goto done_err;
 701     }
 702
 703     vfree(fd_s);
 704     __current->fdtable->fds[fd] = 0;
 705
 706 done_err:
 707     return DO_STATUS(errno);
 708 }
 709
 710 void
 711 __vfs_readdir_callback(struct dir_context* dctx,
 712                        const char* name,
 713                        const int len,
 714                        const int dtype)
 715 {
 716     struct dirent* dent = (struct dirent*)dctx->cb_data;
 717     strncpy(dent->d_name, name, DIRENT_NAME_MAX_LEN);
 718     dent->d_nlen = len;
 719     dent->d_type = dtype;
 720 }
 721
 722 __DEFINE_LXSYSCALL2(int, readdir, int, fd, struct dirent*, dent)
 723 {
 724     struct v_fd* fd_s;
 725     int errno;
 726
 727     if ((errno = __vfs_getfd(fd, &fd_s))) {
 728         goto done;
 729     }
 730
 731     struct v_inode* inode = fd_s->file->inode;
 732
 733     lock_inode(inode);
 734
 735     if (!(fd_s->file->inode->itype & VFS_IFDIR)) {
 736         errno = ENOTDIR;
 737     } else {
 738         struct dir_context dctx =
 739           (struct dir_context){ .cb_data = dent,
 740                                 .index = dent->d_offset,
 741                                 .read_complete_callback =
 742                                   __vfs_readdir_callback };
 743         if (dent->d_offset == 0) {
 744             __vfs_readdir_callback(&dctx, vfs_dot.value, vfs_dot.len, 0);
 745         } else if (dent->d_offset == 1) {
 746             __vfs_readdir_callback(&dctx, vfs_ddot.value, vfs_ddot.len, 0);
 747         } else {
 748             dctx.index -= 2;
 749             if ((errno = fd_s->file->ops.readdir(inode, &dctx))) {
 750                 unlock_inode(inode);
 751                 goto done;
 752             }
 753         }
 754         errno = 0;
 755         dent->d_offset++;
 756     }
 757
 758     unlock_inode(inode);
 759
 760 done:
 761     return DO_STATUS(errno);
 762 }
 763
 764 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
 765 {
 766     int errno = 0;
 767     struct v_fd* fd_s;
 768     if ((errno = __vfs_getfd(fd, &fd_s))) {
 769         goto done;
 770     }
 771
 772     struct v_file* file = fd_s->file;
 773     if ((file->inode->itype & VFS_IFDIR)) {
 774         errno = EISDIR;
 775         goto done;
 776     }
 777
 778     lock_inode(file->inode);
 779
 780     file->inode->atime = clock_unixtime();
 781
 782     __SYSCALL_INTERRUPTIBLE(
 783       { errno = file->ops.read(file->inode, buf, count, file->f_pos); })
 784
 785     unlock_inode(file->inode);
 786
 787     if (errno > 0) {
 788         file->f_pos += errno;
 789         return errno;
 790     }
 791
 792 done:
 793     return DO_STATUS(errno);
 794 }
 795
 796 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
 797 {
 798     int errno = 0;
 799     struct v_fd* fd_s;
 800     if ((errno = __vfs_getfd(fd, &fd_s))) {
 801         goto done;
 802     }
 803
 804     struct v_file* file = fd_s->file;
 805     if ((file->inode->itype & VFS_IFDIR)) {
 806         errno = EISDIR;
 807         goto done;
 808     }
 809
 810     lock_inode(file->inode);
 811
 812     file->inode->mtime = clock_unixtime();
 813
 814     __SYSCALL_INTERRUPTIBLE(
 815       { errno = file->ops.write(file->inode, buf, count, file->f_pos); })
 816
 817     unlock_inode(file->inode);
 818
 819     if (errno > 0) {
 820         file->f_pos += errno;
 821         return errno;
 822     }
 823
 824 done:
 825     return DO_STATUS(errno);
 826 }
 827
 828 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
 829 {
 830     int errno = 0;
 831     struct v_fd* fd_s;
 832     if ((errno = __vfs_getfd(fd, &fd_s))) {
 833         goto done;
 834     }
 835
 836     struct v_file* file = fd_s->file;
 837
 838     lock_inode(file->inode);
 839
 840     size_t fpos = file->f_pos;
 841     switch (options) {
 842         case FSEEK_CUR:
 843             fpos = (size_t)((int)file->f_pos + offset);
 844             break;
 845         case FSEEK_END:
 846             fpos = (size_t)((int)file->inode->fsize + offset);
 847             break;
 848         case FSEEK_SET:
 849             fpos = offset;
 850             break;
 851     }
 852     if (!file->ops.seek || !(errno = file->ops.seek(file->inode, fpos))) {
 853         file->f_pos = fpos;
 854     }
 855
 856     unlock_inode(file->inode);
 857
 858 done:
 859     return DO_STATUS(errno);
 860 }
 861
 862 int
 863 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
 864 {
 865     if (!dnode) {
 866         return 0;
 867     }
 868
 869     if (depth > 64) {
 870         return ELOOP;
 871     }
 872
 873     size_t len = vfs_get_path(dnode->parent, buf, size, depth + 1);
 874
 875     if (len >= size) {
 876         return len;
 877     }
 878
 879     size_t cpy_size = MIN(dnode->name.len, size - len);
 880     strncpy(buf + len, dnode->name.value, cpy_size);
 881     len += cpy_size;
 882
 883     if (len < size) {
 884         buf[len++] = PATH_DELIM;
 885     }
 886
 887     return len;
 888 }
 889
 890 int
 891 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
 892 {
 893     const char* link;
 894     struct v_inode* inode = dnode->inode;
 895     if (inode->ops.read_symlink) {
 896         lock_inode(inode);
 897
 898         int errno = inode->ops.read_symlink(inode, &link);
 899         strncpy(buf, link, size);
 900
 901         unlock_inode(inode);
 902         return errno;
 903     }
 904     return 0;
 905 }
 906
 907 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
 908 {
 909     int errno;
 910     struct v_fd* fd_s;
 911     if ((errno = __vfs_getfd(fd, &fd_s))) {
 912         goto done;
 913     }
 914
 915     struct v_dnode* dnode;
 916     errno = vfs_get_path(fd_s->file->dnode, buf, size, 0);
 917
 918     if (errno >= 0) {
 919         return errno;
 920     }
 921
 922 done:
 923     return DO_STATUS(errno);
 924 }
 925
 926 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
 927 {
 928     int errno;
 929     struct v_dnode* dnode;
 930     if (!(errno =
 931             vfs_walk(__current->cwd, path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 932         errno = vfs_readlink(dnode, buf, size);
 933     }
 934
 935     if (errno >= 0) {
 936         return errno;
 937     }
 938
 939     return DO_STATUS(errno);
 940 }
 941
 942 __DEFINE_LXSYSCALL4(int,
 943                     readlinkat,
 944                     int,
 945                     dirfd,
 946                     const char*,
 947                     pathname,
 948                     char*,
 949                     buf,
 950                     size_t,
 951                     size)
 952 {
 953     int errno;
 954     struct v_fd* fd_s;
 955     if ((errno = __vfs_getfd(dirfd, &fd_s))) {
 956         goto done;
 957     }
 958
 959     struct v_dnode* dnode;
 960     if (!(errno = vfs_walk(
 961             fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
 962         errno = vfs_readlink(fd_s->file->dnode, buf, size);
 963     }
 964
 965     if (errno >= 0) {
 966         return errno;
 967     }
 968
 969 done:
 970     return DO_STATUS(errno);
 971 }
 972
 973 /*
 974     NOTE
 975     When we perform an operation that could affect the layout of a
 976     directory (i.e., rename, mkdir, rmdir), we must lock the parent dir
 977     whenever possible. This blocks any ongoing path walk from reaching
 978     it, and hence avoids exposing any partial state.
 979 */
 980
 981 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
 982 {
 983     int errno;
 984     struct v_dnode* dnode;
 985     if ((errno = vfs_walk(__current->cwd, pathname, &dnode, NULL, 0))) {
 986         return DO_STATUS(errno);
 987     }
 988
 989     lock_dnode(dnode);
 990
 991     if (dnode->parent)
 992         lock_dnode(dnode->parent);
 993
 994     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
 995         errno = EROFS;
 996         goto done;
 997     }
 998
 999     if (dnode->ref_count || dnode->inode->open_count) {
1000         errno = EBUSY;
1001         goto done;
1002     }
1003
1004     if (!llist_empty(&dnode->children)) {
1005         errno = ENOTEMPTY;
1006         goto done;
1007     }
1008
1009     lock_inode(dnode->inode);
1010
1011     if ((dnode->inode->itype & VFS_IFDIR)) {
1012         errno = dnode->inode->ops.rmdir(dnode->inode);
1013         if (!errno) {
1014             vfs_dcache_remove(dnode);
1015             unlock_inode(dnode->inode);
1016             vfs_d_free(dnode);
1017
1018             goto done;
1019         }
1020     } else {
1021         errno = ENOTDIR;
1022     }
1023
1024     unlock_inode(dnode->inode);
1025
1026 done:
1027     unlock_dnode(dnode);
1028     if (dnode->parent)
1029         unlock_dnode(dnode->parent);
1030     return DO_STATUS(errno);
1031 }
1032
1033 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
1034 {
1035     struct v_dnode *parent, *dir = vfs_d_alloc();
1036     int errno =
1037       vfs_walk(__current->cwd, path, &parent, &dir->name, VFS_WALK_PARENT);
1038     if (errno) {
1039         goto done;
1040     }
1041
1042     lock_dnode(parent);
1043     lock_inode(parent->inode);
1044
1045     if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
1046         errno = ENOTSUP;
1047     } else if (!parent->inode->ops.mkdir) {
1048         errno = ENOTSUP;
1049     } else if (!(parent->inode->itype & VFS_IFDIR)) {
1050         errno = ENOTDIR;
1051     } else if (!(errno = parent->inode->ops.mkdir(parent->inode, dir))) {
1052         llist_append(&parent->children, &dir->siblings);
1053         goto cleanup;
1054     }
1055
1056     vfs_d_free(dir);
1057
1058 cleanup:
1059     unlock_inode(parent->inode);
1060     unlock_dnode(parent);
1061 done:
1062     return DO_STATUS(errno);
1063 }
1064
1065 int
1066 __vfs_do_unlink(struct v_dnode* dnode)
1067 {
1068     struct v_inode* inode = dnode->inode;
1069
1070     if (dnode->ref_count > 1) {
1071         return EBUSY;
1072     }
1073
1074     lock_inode(inode);
1075
1076     int errno;
1077     if (inode->open_count) {
1078         errno = EBUSY;
1079     } else if (!(inode->itype & VFS_IFDIR)) {
1080         // The underlying unlink implementation should handle
1081         //  symlink case
1082         errno = inode->ops.unlink(inode);
1083         if (!errno) {
1084             inode->link_count--;
1085             vfs_dcache_remove(dnode);
1086             vfs_d_free(dnode);
1087         }
1088     } else {
1089         errno = EISDIR;
1090     }
1091
1092     unlock_inode(inode);
1093
1094     return errno;
1095 }
1096
1097 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1098 {
1099     int errno;
1100     struct v_dnode* dnode;
1101     if ((errno = vfs_walk(__current->cwd, pathname, &dnode, NULL, 0))) {
1102         goto done;
1103     }
1104     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
1105         errno = EROFS;
1106         goto done;
1107     }
1108
1109     errno = __vfs_do_unlink(dnode);
1110
1111 done:
1112     return DO_STATUS(errno);
1113 }
1114
1115 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1116 {
1117     int errno;
1118     struct v_fd* fd_s;
1119     if ((errno = __vfs_getfd(fd, &fd_s))) {
1120         goto done;
1121     }
1122
1123     struct v_dnode* dnode;
1124     if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1125         errno = __vfs_do_unlink(dnode);
1126     }
1127
1128 done:
1129     return DO_STATUS(errno);
1130 }
1131
1132 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1133 {
1134     int errno;
1135     struct v_dnode *dentry, *to_link, *name_dentry, *name_file;
1136
1137     errno = __vfs_try_locate_file(oldpath, &dentry, &to_link, 0);
1138     if (!errno) {
1139         errno = __vfs_try_locate_file(
1140           newpath, &name_dentry, &name_file, FLOCATE_CREATE_EMPTY);
1141         if (!errno) {
1142             errno = EEXIST;
1143         } else if (name_file) {
1144             errno = vfs_link(to_link, name_file);
1145         }
1146     }
1147     return DO_STATUS(errno);
1148 }
1149
1150 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1151 {
1152     int errno;
1153     struct v_fd* fd_s;
1154     if (!(errno = __vfs_getfd(fildes, &fd_s))) {
1155         errno = vfs_fsync(fd_s->file);
1156     }
1157
1158     return DO_STATUS(errno);
1159 }
1160
1161 int
1162 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1163 {
1164     int errno = 0;
1165     struct v_fd* copied = cake_grab(fd_pile);
1166
1167     memcpy(copied, old, sizeof(struct v_fd));
1168
1169     atomic_fetch_add(&old->file->ref_count, 1);
1170
1171     *new = copied;
1172
1173     return errno;
1174 }
1175
1176 int
1177 vfs_dup2(int oldfd, int newfd)
1178 {
1179     if (newfd == oldfd) {
1180         return newfd;
1181     }
1182
1183     int errno;
1184     struct v_fd *oldfd_s, *newfd_s;
1185     if ((errno = __vfs_getfd(oldfd, &oldfd_s))) {
1186         goto done;
1187     }
1188
1189     if (!TEST_FD(newfd)) {
1190         errno = EBADF;
1191         goto done;
1192     }
1193
1194     newfd_s = __current->fdtable->fds[newfd];
1195     if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1196         goto done;
1197     }
1198
1199     if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1200         __current->fdtable->fds[newfd] = newfd_s;
1201         return newfd;
1202     }
1203
1204 done:
1205     return DO_STATUS(errno);
1206 }
1207
1208 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1209 {
1210     return vfs_dup2(oldfd, newfd);
1211 }
1212
1213 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1214 {
1215     int errno, newfd;
1216     struct v_fd *oldfd_s, *newfd_s;
1217     if ((errno = __vfs_getfd(oldfd, &oldfd_s))) {
1218         goto done;
1219     }
1220
1221     if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1222         !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1223         __current->fdtable->fds[newfd] = newfd_s;
1224         return newfd;
1225     }
1226
1227 done:
1228     return DO_STATUS(errno);
1229 }
1230
1231 __DEFINE_LXSYSCALL2(int,
1232                     symlink,
1233                     const char*,
1234                     pathname,
1235                     const char*,
1236                     link_target)
1237 {
1238     int errno;
1239     struct v_dnode* dnode;
1240     if ((errno = vfs_walk(__current->cwd, pathname, &dnode, NULL, 0))) {
1241         goto done;
1242     }
1243     if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
1244         errno = EROFS;
1245         goto done;
1246     }
1247     if (!dnode->inode->ops.symlink) {
1248         errno = ENOTSUP;
1249         goto done;
1250     }
1251
1252     lock_inode(dnode->inode);
1253
1254     errno = dnode->inode->ops.symlink(dnode->inode, link_target);
1255
1256     unlock_inode(dnode->inode);
1257
1258 done:
1259     return DO_STATUS(errno);
1260 }
1261
1262 int
1263 __vfs_do_chdir(struct v_dnode* dnode)
1264 {
1265     int errno = 0;
1266
1267     lock_dnode(dnode);
1268
1269     if (!(dnode->inode->itype & VFS_IFDIR)) {
1270         errno = ENOTDIR;
1271         goto done;
1272     }
1273
1274     if (__current->cwd) {
1275         atomic_fetch_add(&__current->cwd->ref_count, 1);
1276     }
1277
1278     atomic_fetch_sub(&dnode->ref_count, 1);
1279     __current->cwd = dnode;
1280
1281     unlock_dnode(dnode);
1282
1283 done:
1284     return errno;
1285 }
1286
1287 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1288 {
1289     struct v_dnode* dnode;
1290     int errno = 0;
1291
1292     if ((errno = vfs_walk(__current->cwd, path, &dnode, NULL, 0))) {
1293         goto done;
1294     }
1295
1296     errno = __vfs_do_chdir(dnode);
1297
1298 done:
1299     return DO_STATUS(errno);
1300 }
1301
1302 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1303 {
1304     struct v_fd* fd_s;
1305     int errno = 0;
1306
1307     if ((errno = __vfs_getfd(fd, &fd_s))) {
1308         goto done;
1309     }
1310
1311     errno = __vfs_do_chdir(fd_s->file->dnode);
1312
1313 done:
1314     return DO_STATUS(errno);
1315 }
1316
1317 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1318 {
1319     int errno = 0;
1320     char* ret_ptr = 0;
1321     if (size < 2) {
1322         errno = ERANGE;
1323         goto done;
1324     }
1325
1326     size_t len = 0;
1327
1328     if (!__current->cwd) {
1329         *buf = PATH_DELIM;
1330         len = 1;
1331     } else {
1332         len = vfs_get_path(__current->cwd, buf, size, 0);
1333         if (len == size) {
1334             errno = ERANGE;
1335             goto done;
1336         }
1337     }
1338
1339     buf[len + 1] = '\0';
1340
1341     ret_ptr = buf;
1342
1343 done:
1344     __current->k_status = errno;
1345     return ret_ptr;
1346 }
1347
1348 int
1349 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1350 {
1351     if (current->inode->id == target->inode->id) {
1352         // hard link
1353         return 0;
1354     }
1355
1356     if (current->ref_count > 1 || target->ref_count > 1) {
1357         return EBUSY;
1358     }
1359
1360     if (current->super_block != target->super_block) {
1361         return EXDEV;
1362     }
1363
1364     int errno = 0;
1365
1366     struct v_dnode* oldparent = current->parent;
1367     struct v_dnode* newparent = target->parent;
1368
1369     lock_dnode(current);
1370     lock_dnode(target);
1371     if (oldparent)
1372         lock_dnode(oldparent);
1373     if (newparent)
1374         lock_dnode(newparent);
1375
1376     if (!llist_empty(&target->children)) {
1377         errno = ENOTEMPTY;
1378         unlock_dnode(target);
1379         goto cleanup;
1380     }
1381
1382     if ((errno = current->inode->ops.rename(current->inode, current, target))) {
1383         unlock_dnode(target);
1384         goto cleanup;
1385     }
1386
1387     // re-position current
1388     hstrcpy(&current->name, &target->name);
1389     vfs_dcache_rehash(newparent, current);
1390
1391     // detach target
1392     vfs_dcache_remove(target);
1393
1394     unlock_dnode(target);
1395
1396 cleanup:
1397     unlock_dnode(current);
1398     if (oldparent)
1399         unlock_dnode(oldparent);
1400     if (newparent)
1401         unlock_dnode(newparent);
1402
1403     return errno;
1404 }
1405
1406 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1407 {
1408     struct v_dnode *cur, *target_parent, *target;
1409     struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1410     int errno = 0;
1411
1412     if ((errno = vfs_walk(__current->cwd, oldpath, &cur, NULL, 0))) {
1413         goto done;
1414     }
1415
1416     if ((errno = vfs_walk(
1417            __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1418         goto done;
1419     }
1420
1421     errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1422     if (errno == ENOENT) {
1423         target = vfs_d_alloc();
1424         hstrcpy(&target->name, &name);
1425     } else if (errno) {
1426         goto done;
1427     }
1428
1429     if (!(errno = vfs_do_rename(cur, target))) {
1430         vfs_d_free(target);
1431     }
1432
1433 done:
1434     vfree(name.value);
1435     return DO_STATUS(errno);
1436 }
1437
1438 __DEFINE_LXSYSCALL3(int,
1439                     mount,
1440                     const char*,
1441                     source,
1442                     const char*,
1443                     target,
1444                     const char*,
1445                     fstype)
1446 {
1447     struct v_dnode *dev, *mnt;
1448     int errno = 0;
1449
1450     if ((errno = vfs_walk(__current->cwd, source, &dev, NULL, 0))) {
1451         goto done;
1452     }
1453
1454     if ((errno = vfs_walk(__current->cwd, target, &mnt, NULL, 0))) {
1455         goto done;
1456     }
1457
1458     if (!(dev->inode->itype & VFS_IFVOLDEV)) {
1459         errno = ENOTDEV;
1460         goto done;
1461     }
1462
1463     if (mnt->ref_count > 1) {
1464         errno = EBUSY;
1465         goto done;
1466     }
1467
1468     // FIXME should not touch the underlying fs!
1469     struct device* device =
1470       (struct device*)((struct twifs_node*)dev->inode->data)->data;
1471
1472     errno = vfs_mount_at(fstype, device, mnt);
1473
1474 done:
1475     return DO_STATUS(errno);
1476 }
1477
1478 __DEFINE_LXSYSCALL1(int, unmount, const char*, target)
1479 {
1480     return vfs_unmount(target);
1481 }