3 * @author Lunaixsky (zelong56@gmail.com)
4 * @brief Lunaix virtual file system - an abstraction layer for all file system.
8 * @copyright Copyright (c) 2022
12 // Welcome to The Mountain O'Shit! :)
15 TODO vfs & device todos checklist
17 It is overseen by Twilight Sparkle ;)
19 1. Get inodes hooked into lru (CHECKED)
20 2. Get dnodes hooked into lru (CHECKED)
21 3. Get inodes properly hashed so they can be reused by underlying fs (CHECKED)
22 4. (lru) Add a callback function (or destructor) for eviction. (CHECKED)
23 [good idea] or a constructor/destructor pattern in cake allocator ?
24 5. (mount) Figure out a way to identify a busy mount point before unmount
25 maybe a unified mount_point structure that maintain a referencing
26 counter on any dnodes within the subtree? Such a counter will only
27 increment if a file is opened or a dnode is being used as working
28 directory and decrementing conversely. (CHECKED)
29 6. (mount) Ability to track all mount points (including sub-mounts)
30 so we can be confident to clean up everything when we
32 7. (mount) Figure out a way to acquire the device represented by a dnode.
33 so it can be used to mount. (e.g. we wish to get `struct device*`
34 out of the dnode at /dev/sda)
35 [tip] we should pay attention at twifs and add a private_data field
36 under struct v_dnode? (CHECKED)
37 8. (mount) Then, we should refactor on mount/unmount mechanism. (CHECKED)
38 9. (mount) (future) Ability to mount anything? e.g. Linux can mount a disk
39 image file using a so called "loopback" pseudo device. Maybe
40 we can do similar thing in Lunaix? A block device emulation
41 above the regular file when we mount it on.
42 10. (device) device number (dev_t) allocation
43 [good idea] <class>:<subclass>:<uniq_id> composition (CHECKED)
46 #include <klibc/string.h>
47 #include <lunaix/foptions.h>
48 #include <lunaix/fs.h>
49 #include <lunaix/mm/cake.h>
50 #include <lunaix/mm/valloc.h>
51 #include <lunaix/process.h>
52 #include <lunaix/spike.h>
53 #include <lunaix/syscall.h>
54 #include <lunaix/syscall_utils.h>
56 #include <lunaix/fs/twifs.h>
58 #include <usr/lunaix/dirent.h>
60 #define INODE_ACCESSED 0
61 #define INODE_MODIFY 1
63 static struct cake_pile* dnode_pile;
64 static struct cake_pile* inode_pile;
65 static struct cake_pile* file_pile;
66 static struct cake_pile* superblock_pile;
67 static struct cake_pile* fd_pile;
69 struct v_dnode* vfs_sysroot = NULL;
71 struct lru_zone *dnode_lru, *inode_lru;
73 struct hstr vfs_ddot = HSTR("..", 2);
74 struct hstr vfs_dot = HSTR(".", 1);
75 struct hstr vfs_empty = HSTR("", 0);
78 __vfs_try_evict_dnode(struct lru_node* obj);
81 __vfs_try_evict_inode(struct lru_node* obj);
86 // 为他们专门创建一个蛋糕堆,而不使用valloc,这样我们可以最小化内碎片的产生
87 dnode_pile = cake_new_pile("dnode_cache", sizeof(struct v_dnode), 1, 0);
88 inode_pile = cake_new_pile("inode_cache", sizeof(struct v_inode), 1, 0);
89 file_pile = cake_new_pile("file_cache", sizeof(struct v_file), 1, 0);
90 fd_pile = cake_new_pile("fd_cache", sizeof(struct v_fd), 1, 0);
92 cake_new_pile("sb_cache", sizeof(struct v_superblock), 1, 0);
94 dnode_lru = lru_new_zone("vfs_dnode", __vfs_try_evict_dnode);
95 inode_lru = lru_new_zone("vfs_inode", __vfs_try_evict_inode);
97 hstr_rehash(&vfs_ddot, HSTR_FULL_HASH);
98 hstr_rehash(&vfs_dot, HSTR_FULL_HASH);
101 vfs_sysroot = vfs_d_alloc(NULL, &vfs_empty);
102 vfs_sysroot->parent = vfs_sysroot;
104 vfs_ref_dnode(vfs_sysroot);
105 lru_remove(dnode_lru, &vfs_sysroot->lru);
109 vfs_vncache_init(struct vncache* cache)
111 cache->pool = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket));
112 rwlock_init(&cache->lock);
116 vfs_vncache_free(struct vncache* cache)
118 // clear all other reader/writer
119 rwlock_begin_write(&cache->lock);
122 // already freed, so as the lock
126 vfs_vncache_add(struct vncache* cache, size_t key, struct hlist_node* node)
128 struct hbucket* slot;
130 cache_atomic_write(cache,
132 slot = &cache->pool[key & VFS_HASH_MASK];
134 hlist_add(&slot->head, node);
138 static inline struct hbucket*
139 __dcache_hash_nolock(struct v_dnode* parent, u32_t* hash)
141 struct v_superblock* sb;
142 struct hbucket* d_cache;
145 sb = parent->super_block;
148 _hash = _hash ^ (_hash >> VFS_HASHBITS);
149 _hash += (u32_t)__ptr(parent);
152 return &sb->d_cache.pool[_hash & VFS_HASH_MASK];
156 __sync_inode_nolock(struct v_inode* inode)
158 pcache_commit_all(inode);
161 if (inode->ops->sync) {
162 errno = inode->ops->sync(inode);
169 vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str)
172 struct hbucket* slot;
173 struct v_dnode *pos, *n;
174 struct vncache *dcache;
176 if (!str->len || HSTR_EQ(str, &vfs_dot))
179 if (HSTR_EQ(str, &vfs_ddot)) {
180 return parent->parent;
184 dcache = dnode_cache(parent);
186 vncache_lock_read(dcache);
188 slot = __dcache_hash_nolock(parent, &hash);
189 hashtable_bucket_foreach(slot, pos, n, hash_list)
191 if (pos->name.hash != hash || pos->parent != parent) {
195 vncache_unlock_read(dcache);
199 vncache_unlock_read(dcache);
204 __vfs_touch_inode(struct v_inode* inode, const int type)
206 if (type == INODE_MODIFY) {
207 inode->mtime = clock_unixtime();
210 else if (type == INODE_ACCESSED) {
211 inode->atime = clock_unixtime();
214 lru_use_one(inode_lru, &inode->lru);
218 vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode)
220 struct hbucket* bucket;
221 struct vncache* cache;
224 assert(locked_node(parent));
226 dnode->ref_count = 1;
227 dnode->parent = parent;
228 llist_append(&parent->children, &dnode->siblings);
230 cache_atomic_write(dnode_cache(parent),
232 bucket = __dcache_hash_nolock(parent, &dnode->name.hash);
233 hlist_add(&bucket->head, &dnode->hash_list);
238 vfs_dcache_remove(struct v_dnode* dnode)
241 assert(dnode->ref_count == 1);
243 llist_delete(&dnode->siblings);
244 llist_delete(&dnode->aka_list);
245 lru_remove(dnode_lru, &dnode->lru);
247 cache_atomic_write(dnode_cache(dnode),
249 hlist_delete(&dnode->hash_list);
252 dnode->parent = NULL;
253 dnode->ref_count = 0;
257 vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode)
260 assert(locked_node(new_parent));
264 hstr_rehash(&dnode->name, HSTR_FULL_HASH);
265 vfs_dcache_remove(dnode);
266 vfs_dcache_add(new_parent, dnode);
271 vfs_open(struct v_dnode* dnode, struct v_file** file)
273 struct v_inode* inode = dnode->inode;
275 if (!inode || !inode->ops->open) {
281 struct v_file* vfile = cake_grab(file_pile);
282 memset(vfile, 0, sizeof(*vfile));
284 vfile->dnode = dnode;
285 vfile->inode = inode;
286 vfile->ref_count = 1;
287 vfile->ops = inode->default_fops;
289 if (check_regfile_node(inode) && !inode->pg_cache) {
290 struct pcache* pcache = vzalloc(sizeof(struct pcache));
292 pcache->master = inode;
293 inode->pg_cache = pcache;
296 int errno = inode->ops->open(inode, vfile);
298 cake_release(file_pile, vfile);
300 vfs_ref_dnode(dnode);
312 vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode)
314 lock_dnode(assign_to);
316 if (assign_to->inode) {
317 llist_delete(&assign_to->aka_list);
318 assign_to->inode->link_count--;
321 llist_append(&inode->aka_dnodes, &assign_to->aka_list);
322 assign_to->inode = inode;
325 unlock_dnode(assign_to);
329 vfs_link(struct v_dnode* to_link, struct v_dnode* name)
332 struct v_inode* inode;
334 inode = to_link->inode;
336 if ((errno = vfs_check_writable(to_link))) {
342 if (to_link->super_block->root != name->super_block->root) {
344 } else if (!inode->ops->link) {
346 } else if (!(errno = inode->ops->link(inode, name))) {
347 vfs_assign_inode(name, inode);
356 vfs_pclose(struct v_file* file, pid_t pid)
358 struct v_inode* inode;
363 if (vfs_check_duped_file(file)) {
364 vfs_unref_file(file);
370 * This happened when process is terminated while blocking on read.
371 * In that case, the process is still holding the inode lock and it
372 will never get released.
373 * The unlocking should also include ownership check.
375 * To see why, consider two process both open the same file both with
377 * Process A: busy on reading x
378 * Process B: do nothing with x
379 * Assuming that, after a very short time, process B get terminated
380 * while process A is still busy in it's reading business. By this
381 * design, the inode lock of this file x is get released by B rather
382 * than A. And this will cause a probable race condition on A if other
383 * process is writing to this file later after B exit.
385 mutex_unlock_for(&inode->lock, pid);
387 // now regain lock for inode syncing
391 if ((errno = file->ops->close(file))) {
395 vfs_unref_dnode(file->dnode);
396 cake_release(file_pile, file);
398 pcache_commit_all(inode);
401 if (!inode->open_count) {
402 __sync_inode_nolock(inode);
411 vfs_close(struct v_file* file)
413 return vfs_pclose(file, __current->pid);
417 vfs_free_fd(struct v_fd* fd)
419 cake_release(fd_pile, fd);
423 vfs_isync(struct v_inode* inode)
427 int errno = __sync_inode_nolock(inode);
435 vfs_fsync(struct v_file* file)
438 if ((errno = vfs_check_writable(file->dnode))) {
442 return vfs_isync(file->inode);
446 vfs_alloc_fdslot(int* fd)
448 struct v_fdtable* fdtab;
450 fdtab = __current->fdtable;
453 for (size_t i = 0; i < VFS_MAX_FD; i++) {
454 if (__current->fdtable->fds[i]) {
459 unlock_fdtable(fdtab);
463 unlock_fdtable(fdtab);
470 struct v_superblock* sb = cake_grab(superblock_pile);
471 memset(sb, 0, sizeof(*sb));
472 llist_init_head(&sb->sb_list);
474 vfs_vncache_init(&sb->i_cache);
475 vfs_vncache_init(&sb->d_cache);
482 vfs_sb_ref(struct v_superblock* sb)
488 vfs_sb_unref(struct v_superblock* sb)
490 assert(sb->ref_count);
493 if (likely(sb->ref_count)) {
497 if (sb->ops.release) {
501 vfs_vncache_free(&sb->i_cache);
502 vfs_vncache_free(&sb->d_cache);
504 cake_release(superblock_pile, sb);
508 __dnode_evictable(struct v_dnode* dnode)
510 return dnode->ref_count == 1
511 && llist_empty(&dnode->children);
515 __vfs_try_evict_dnode(struct lru_node* obj)
517 struct v_dnode* dnode = container_of(obj, struct v_dnode, lru);
519 if (mutex_on_hold(&dnode->lock))
522 if (!__dnode_evictable(dnode)) {
531 __vfs_try_evict_inode(struct lru_node* obj)
533 struct v_inode* inode = container_of(obj, struct v_inode, lru);
535 if (!inode->link_count && !inode->open_count) {
543 vfs_d_alloc(struct v_dnode* parent, struct hstr* name)
545 struct v_dnode* dnode = cake_grab(dnode_pile);
547 lru_evict_half(dnode_lru);
549 if (!(dnode = cake_grab(dnode_pile))) {
554 memset(dnode, 0, sizeof(*dnode));
555 llist_init_head(&dnode->children);
556 llist_init_head(&dnode->siblings);
557 llist_init_head(&dnode->aka_list);
558 mutex_init(&dnode->lock);
560 dnode->name = HHSTR(vzalloc(VFS_NAME_MAXLEN), 0, 0);
562 hstrcpy(&dnode->name, name);
565 vfs_d_assign_sb(dnode, parent->super_block);
566 dnode->mnt = parent->mnt;
569 lru_use_one(dnode_lru, &dnode->lru);
575 vfs_d_free(struct v_dnode* dnode)
577 assert(dnode->ref_count == 1);
580 assert(dnode->inode->link_count > 0);
581 dnode->inode->link_count--;
584 vfs_dcache_remove(dnode);
586 // Make sure the children de-referencing their parent.
587 // With lru presented, the eviction will be propagated over the entire
588 // detached subtree eventually
589 struct v_dnode *pos, *n;
590 llist_for_each(pos, n, &dnode->children, siblings)
592 vfs_dcache_remove(pos);
595 if (dnode->destruct) {
596 dnode->destruct(dnode);
599 vfs_sb_unref(dnode->super_block);
601 vfree((void*)dnode->name.value);
602 cake_release(dnode_pile, dnode);
606 vfs_i_find(struct v_superblock* sb, u32_t i_id)
608 struct hbucket* slot;
609 struct v_inode *pos, *n, *found = NULL;
611 cache_atomic_read(&sb->i_cache,
613 slot = &sb->i_cache.pool[i_id & VFS_HASH_MASK];
615 hashtable_bucket_foreach(slot, pos, n, hash_list)
617 if (pos->id != i_id) {
621 lru_use_one(inode_lru, &pos->lru);
631 vfs_i_addhash(struct v_inode* inode)
633 vfs_vncache_add(inode_cache(inode), inode->id, &inode->hash_list);
637 vfs_i_alloc(struct v_superblock* sb)
639 assert(sb->ops.init_inode);
641 struct v_inode* inode;
642 if (!(inode = cake_grab(inode_pile))) {
643 lru_evict_half(inode_lru);
644 if (!(inode = cake_grab(inode_pile))) {
649 memset(inode, 0, sizeof(*inode));
650 mutex_init(&inode->lock);
651 llist_init_head(&inode->xattrs);
652 llist_init_head(&inode->aka_dnodes);
654 sb->ops.init_inode(sb, inode);
656 inode->ctime = clock_unixtime();
657 inode->atime = inode->ctime;
658 inode->mtime = inode->ctime;
660 vfs_i_assign_sb(inode, sb);
661 lru_use_one(inode_lru, &inode->lru);
667 vfs_i_free(struct v_inode* inode)
669 if (inode->pg_cache) {
670 pcache_release(inode->pg_cache);
671 vfree(inode->pg_cache);
674 // we don't need to sync inode.
675 // If an inode can be free, then it must be properly closed.
676 // Hence it must be synced already!
677 if (inode->destruct) {
678 inode->destruct(inode);
681 vfs_sb_unref(inode->sb);
683 hlist_delete(&inode->hash_list);
684 lru_remove(inode_lru, &inode->lru);
686 cake_release(inode_pile, inode);
689 /* ---- System call definition and support ---- */
691 // make a new name when not exists
692 #define FLOC_MAYBE_MKNAME 1
694 // name must be non-exist and made.
695 #define FLOC_MKNAME 2
698 #define FLOC_NOFOLLOW 4
701 vfs_getfd(int fd, struct v_fd** fd_s)
703 struct v_fdtable* fdtab;
709 fdtab = __current->fdtable;
712 *fd_s = __current->fdtable->fds[fd];
713 unlock_fdtable(fdtab);
715 return !*fd_s ? EBADF : 0;
719 __vfs_mknod(struct v_inode* parent, struct v_dnode* dnode,
720 unsigned int itype, dev_t* dev)
724 errno = parent->ops->create(parent, dnode, itype);
732 struct file_locator {
734 struct v_dnode* file;
739 * @brief unlock the file locator (floc) if possible.
740 * If the file to be located if not exists, and
741 * any FLOC_*MKNAME flag is set, then the parent
742 * dnode will be locked until the file has been properly
743 * finalised by subsequent logic.
748 __floc_try_unlock(struct file_locator* floc)
752 unlock_dnode(floc->dir);
757 __vfs_try_locate_file(const char* path,
758 struct file_locator* floc,
761 char name_str[VFS_NAME_MAXLEN];
762 struct v_dnode *fdir, *file;
763 struct hstr name = HSTR(name_str, 0);
764 int errno, woption = 0;
766 if ((options & FLOC_NOFOLLOW)) {
767 woption |= VFS_WALK_NOFOLLOW;
768 options &= ~FLOC_NOFOLLOW;
773 errno = vfs_walk_proc(path, &fdir, &name, woption | VFS_WALK_PARENT);
780 errno = vfs_walk(fdir, name.value, &file, NULL, woption);
782 if (errno && errno != ENOENT) {
786 if (!errno && (options & FLOC_MKNAME)) {
792 // the file present, no need to hold the directory lock
802 errno = vfs_check_writable(fdir);
809 file = vfs_d_alloc(fdir, &name);
816 vfs_dcache_add(fdir, file);
831 __check_unlinkable(struct v_dnode* dnode)
834 bool wr_self, wr_parent;
835 struct v_dnode* parent;
837 parent = dnode->parent;
838 acl = dnode->inode->acl;
840 wr_self = check_allow_write(dnode->inode);
841 wr_parent = check_allow_write(parent->inode);
843 if (!fsacl_test(acl, svtx)) {
847 if (current_euid() == dnode->inode->uid) {
851 return wr_self && wr_parent;
855 vfs_do_open(const char* path, int options)
857 int errno, fd, loptions = 0;
858 struct v_dnode *dentry, *file;
859 struct v_file* ofile = NULL;
860 struct file_locator floc;
861 struct v_inode* inode;
863 if ((options & FO_CREATE)) {
864 loptions |= FLOC_MAYBE_MKNAME;
865 } else if ((options & FO_NOFOLLOW)) {
866 loptions |= FLOC_NOFOLLOW;
869 errno = __vfs_try_locate_file(path, &floc, loptions);
871 if (errno || (errno = vfs_alloc_fdslot(&fd))) {
879 errno = __vfs_mknod(dentry->inode, file, VFS_IFFILE, NULL);
882 __floc_try_unlock(&floc);
886 __floc_try_unlock(&floc);
890 if ((errno = vfs_open(file, &ofile))) {
894 inode = ofile->inode;
897 struct v_fd* fd_s = cake_grab(fd_pile);
898 memset(fd_s, 0, sizeof(*fd_s));
900 if ((options & O_TRUNC)) {
901 file->inode->fsize = 0;
904 if (vfs_get_dtype(inode->itype) == DT_DIR) {
909 fd_s->flags = options;
910 __current->fdtable->fds[fd] = fd_s;
917 __DEFINE_LXSYSCALL2(int, open, const char*, path, int, options)
919 int errno = vfs_do_open(path, options);
920 return DO_STATUS_OR_RETURN(errno);
923 __DEFINE_LXSYSCALL1(int, close, int, fd)
927 if ((errno = vfs_getfd(fd, &fd_s))) {
931 if ((errno = vfs_close(fd_s->file))) {
935 cake_release(fd_pile, fd_s);
936 __current->fdtable->fds[fd] = 0;
939 return DO_STATUS(errno);
943 __vfs_readdir_callback(struct dir_context* dctx,
948 struct lx_dirent* dent = (struct lx_dirent*)dctx->cb_data;
949 strncpy(dent->d_name, name, MIN(len, DIRENT_NAME_MAX_LEN));
951 dent->d_type = dtype;
954 __DEFINE_LXSYSCALL2(int, sys_readdir, int, fd, struct lx_dirent*, dent)
959 if ((errno = vfs_getfd(fd, &fd_s))) {
963 struct v_inode* inode = fd_s->file->inode;
967 if (!check_directory_node(inode)) {
972 if (!check_allow_read(inode)) {
977 struct dir_context dctx = (struct dir_context) {
979 .read_complete_callback = __vfs_readdir_callback
982 if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) {
993 return DO_STATUS_OR_RETURN(errno);
996 __DEFINE_LXSYSCALL3(int, read, int, fd, void*, buf, size_t, count)
1000 struct v_inode* inode;
1002 if ((errno = vfs_getfd(fd, &fd_s))) {
1006 struct v_file* file = fd_s->file;
1007 if (check_directory_node(file->inode)) {
1012 if (!check_allow_read(file->inode)) {
1017 inode = file->inode;
1020 __vfs_touch_inode(inode, INODE_ACCESSED);
1022 if (check_seqdev_node(inode) || (fd_s->flags & FO_DIRECT)) {
1023 errno = file->ops->read(inode, buf, count, file->f_pos);
1025 errno = pcache_read(inode, buf, count, file->f_pos);
1029 file->f_pos += errno;
1030 unlock_inode(inode);
1034 unlock_inode(inode);
1037 return DO_STATUS(errno);
1040 __DEFINE_LXSYSCALL3(int, write, int, fd, void*, buf, size_t, count)
1044 if ((errno = vfs_getfd(fd, &fd_s))) {
1048 struct v_inode* inode;
1049 struct v_file* file = fd_s->file;
1051 if ((errno = vfs_check_writable(file->dnode))) {
1055 if (check_directory_node(file->inode)) {
1060 inode = file->inode;
1063 __vfs_touch_inode(inode, INODE_MODIFY);
1064 if ((fd_s->flags & O_APPEND)) {
1065 file->f_pos = inode->fsize;
1068 if (check_seqdev_node(inode) || (fd_s->flags & FO_DIRECT)) {
1069 errno = file->ops->write(inode, buf, count, file->f_pos);
1071 errno = pcache_write(inode, buf, count, file->f_pos);
1075 file->f_pos += errno;
1076 inode->fsize = MAX(inode->fsize, file->f_pos);
1078 unlock_inode(inode);
1082 unlock_inode(inode);
1085 return DO_STATUS(errno);
1088 __DEFINE_LXSYSCALL3(int, lseek, int, fd, int, offset, int, options)
1092 if ((errno = vfs_getfd(fd, &fd_s))) {
1096 struct v_file* file = fd_s->file;
1097 struct v_inode* inode = file->inode;
1099 if (!file->ops->seek) {
1104 if (!check_allow_read(inode)) {
1112 int fpos = file->f_pos;
1114 if (vfs_get_dtype(inode->itype) == DT_DIR) {
1115 options = (options != FSEEK_END) ? options : FSEEK_SET;
1120 overflow = sadd_of((int)file->f_pos, offset, &fpos);
1123 overflow = sadd_of((int)inode->fsize, offset, &fpos);
1134 errno = file->ops->seek(file, fpos);
1137 unlock_inode(inode);
1140 return DO_STATUS(errno);
1144 vfs_get_path(struct v_dnode* dnode, char* buf, size_t size, int depth)
1151 return ENAMETOOLONG;
1156 if (dnode->parent != dnode) {
1157 len = vfs_get_path(dnode->parent, buf, size, depth + 1);
1164 if (!len || buf[len - 1] != VFS_PATH_DELIM) {
1165 buf[len++] = VFS_PATH_DELIM;
1168 size_t cpy_size = MIN(dnode->name.len, size - len);
1169 strncpy(buf + len, dnode->name.value, cpy_size);
1176 vfs_readlink(struct v_dnode* dnode, char* buf, size_t size)
1179 struct v_inode* inode = dnode->inode;
1181 if (!check_symlink_node(inode)) {
1185 if (!inode->ops->read_symlink) {
1189 if (!check_allow_read(inode)) {
1195 int errno = inode->ops->read_symlink(inode, &link);
1197 strncpy(buf, link, MIN(size, (size_t)errno));
1200 unlock_inode(inode);
1205 vfs_get_dtype(int itype)
1207 int dtype = DT_FILE;
1208 if (check_itype(itype, VFS_IFSYMLINK)) {
1209 dtype |= DT_SYMLINK;
1212 if (check_itype(itype, VFS_IFDIR)) {
1225 struct v_fdtable* fdtab;
1227 fdtab = vzalloc(sizeof(struct v_fdtable));
1228 mutex_init(&fdtab->lock);
1234 fdtable_copy(struct v_fdtable* dest, struct v_fdtable* src)
1239 for (size_t i = 0; i < VFS_MAX_FD; i++) {
1240 struct v_fd* fd = src->fds[i];
1243 vfs_dup_fd(fd, &dest->fds[i]);
1246 unlock_fdtable(dest);
1247 unlock_fdtable(src);
1251 fdtable_free(struct v_fdtable* table)
1253 assert(!mutex_on_hold(&table->lock));
1258 __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size)
1262 if ((errno = vfs_getfd(fd, &fd_s))) {
1266 struct v_dnode* dnode;
1268 dnode = fd_s->file->dnode;
1271 errno = vfs_get_path(dnode, buf, size, 0);
1272 unlock_dnode(dnode);
1275 return DO_STATUS(errno);
1278 __DEFINE_LXSYSCALL3(int, readlink, const char*, path, char*, buf, size_t, size)
1281 struct v_dnode* dnode;
1282 if (!(errno = vfs_walk_proc(path, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
1283 errno = vfs_readlink(dnode, buf, size);
1290 return DO_STATUS(errno);
1293 __DEFINE_LXSYSCALL4(
1294 int, readlinkat, int, dirfd, const char*, pathname, char*, buf, size_t, size)
1298 if ((errno = vfs_getfd(dirfd, &fd_s))) {
1302 pathname = pathname ? pathname : "";
1304 struct v_dnode* dnode;
1305 if (!(errno = vfs_walk(
1306 fd_s->file->dnode, pathname, &dnode, NULL, VFS_WALK_NOFOLLOW))) {
1307 errno = vfs_readlink(fd_s->file->dnode, buf, size);
1315 return DO_STATUS(errno);
1320 When we perform an operation that could affect the layout of a
1321 directory (i.e., rename, mkdir, rmdir), we must lock the parent dir
1322 whenever possible. This will block any ongoing path walking from reaching
1323 it, and hence avoid any partial state.
1326 __DEFINE_LXSYSCALL1(int, rmdir, const char*, pathname)
1329 struct v_dnode* dnode;
1330 if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1331 return DO_STATUS(errno);
1336 if (!__check_unlinkable(dnode)) {
1341 if ((errno = vfs_check_writable(dnode))) {
1345 if ((dnode->super_block->fs->types & FSTYPE_ROFS)) {
1350 if (dnode->ref_count > 1 || dnode->inode->open_count) {
1355 if (!llist_empty(&dnode->children)) {
1360 struct v_dnode* parent = dnode->parent;
1368 lock_inode(parent->inode);
1370 if (check_directory_node(dnode->inode)) {
1371 errno = parent->inode->ops->rmdir(parent->inode, dnode);
1373 vfs_dcache_remove(dnode);
1379 unlock_inode(parent->inode);
1380 unlock_dnode(parent);
1383 unlock_dnode(dnode);
1384 return DO_STATUS(errno);
1387 __DEFINE_LXSYSCALL1(int, mkdir, const char*, path)
1391 struct v_inode* inode;
1392 struct v_dnode *parent, *dir;
1393 char name_value[VFS_NAME_MAXLEN];
1395 name = HHSTR(name_value, 0, 0);
1397 if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) {
1401 if (!(errno = vfs_walk(parent, name_value, &dir, NULL, 0))) {
1406 if ((errno = vfs_check_writable(parent))) {
1410 if (!(dir = vfs_d_alloc(parent, &name))) {
1415 inode = parent->inode;
1420 if ((parent->super_block->fs->types & FSTYPE_ROFS)) {
1422 } else if (!inode->ops->mkdir) {
1424 } else if (!check_directory_node(inode)) {
1426 } else if (!(errno = inode->ops->mkdir(inode, dir))) {
1427 vfs_dcache_add(parent, dir);
1434 unlock_inode(inode);
1435 unlock_dnode(parent);
1437 return DO_STATUS(errno);
1441 __vfs_do_unlink(struct v_dnode* dnode)
1444 struct v_inode* inode = dnode->inode;
1446 if (dnode->ref_count > 1) {
1450 if (!__check_unlinkable(dnode)) {
1454 if ((errno = vfs_check_writable(dnode))) {
1460 if (inode->open_count) {
1462 } else if (!check_directory_node(inode)) {
1463 errno = inode->ops->unlink(inode, dnode);
1471 unlock_inode(inode);
1476 __DEFINE_LXSYSCALL1(int, unlink, const char*, pathname)
1479 struct v_dnode* dnode;
1480 if ((errno = vfs_walk_proc(pathname, &dnode, NULL, 0))) {
1484 errno = __vfs_do_unlink(dnode);
1487 return DO_STATUS(errno);
1490 __DEFINE_LXSYSCALL2(int, unlinkat, int, fd, const char*, pathname)
1494 if ((errno = vfs_getfd(fd, &fd_s))) {
1498 struct v_dnode* dnode;
1499 if (!(errno = vfs_walk(fd_s->file->dnode, pathname, &dnode, NULL, 0))) {
1500 errno = __vfs_do_unlink(dnode);
1504 return DO_STATUS(errno);
1507 __DEFINE_LXSYSCALL2(int, link, const char*, oldpath, const char*, newpath)
1510 struct file_locator floc;
1511 struct v_dnode *to_link, *name_file;
1513 errno = __vfs_try_locate_file(oldpath, &floc, 0);
1518 __floc_try_unlock(&floc);
1520 to_link = floc.file;
1521 errno = __vfs_try_locate_file(newpath, &floc, FLOC_MKNAME);
1526 name_file = floc.file;
1527 errno = vfs_link(to_link, name_file);
1529 vfs_d_free(name_file);
1533 __floc_try_unlock(&floc);
1534 return DO_STATUS(errno);
1537 __DEFINE_LXSYSCALL1(int, fsync, int, fildes)
1542 if (!(errno = vfs_getfd(fildes, &fd_s))) {
1543 errno = vfs_fsync(fd_s->file);
1546 return DO_STATUS(errno);
1550 vfs_dup_fd(struct v_fd* old, struct v_fd** new)
1553 struct v_fd* copied = cake_grab(fd_pile);
1555 memcpy(copied, old, sizeof(struct v_fd));
1557 vfs_ref_file(old->file);
1565 vfs_dup2(int oldfd, int newfd)
1568 struct v_fdtable* fdtab;
1569 struct v_fd *oldfd_s, *newfd_s;
1571 if (newfd == oldfd) {
1575 if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1579 if (!TEST_FD(newfd)) {
1584 fdtab = __current->fdtable;
1585 lock_fdtable(fdtab);
1587 newfd_s = fdtab->fds[newfd];
1588 if (newfd_s && (errno = vfs_close(newfd_s->file))) {
1589 goto unlock_and_done;
1592 if ((errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1593 goto unlock_and_done;
1596 fdtab->fds[newfd] = newfd_s;
1598 unlock_fdtable(fdtab);
1602 unlock_fdtable(fdtab);
1605 return DO_STATUS(errno);
1608 __DEFINE_LXSYSCALL2(int, dup2, int, oldfd, int, newfd)
1610 return vfs_dup2(oldfd, newfd);
1613 __DEFINE_LXSYSCALL1(int, dup, int, oldfd)
1616 struct v_fd *oldfd_s, *newfd_s;
1617 if ((errno = vfs_getfd(oldfd, &oldfd_s))) {
1621 if (!(errno = vfs_alloc_fdslot(&newfd)) &&
1622 !(errno = vfs_dup_fd(oldfd_s, &newfd_s))) {
1623 __current->fdtable->fds[newfd] = newfd_s;
1628 return DO_STATUS(errno);
1631 __DEFINE_LXSYSCALL2(
1632 int, symlink, const char*, pathname, const char*, link_target)
1635 struct file_locator floc;
1636 struct v_dnode *file;
1637 struct v_inode *f_ino;
1639 errno = __vfs_try_locate_file(pathname, &floc, FLOC_MKNAME);
1645 errno = __vfs_mknod(floc.dir->inode, file, VFS_IFSYMLINK, NULL);
1651 f_ino = file->inode;
1655 errno = vfs_check_writable(file);
1660 if (!f_ino->ops->set_symlink) {
1667 errno = f_ino->ops->set_symlink(f_ino, link_target);
1669 unlock_inode(f_ino);
1672 __floc_try_unlock(&floc);
1673 return DO_STATUS(errno);
1677 vfs_do_chdir_nolock(struct proc_info* proc, struct v_dnode* dnode)
1679 if (!check_directory_node(dnode->inode)) {
1684 vfs_unref_dnode(proc->cwd);
1687 vfs_ref_dnode(dnode);
1694 vfs_do_chdir(struct proc_info* proc, struct v_dnode* dnode)
1700 errno = vfs_do_chdir_nolock(proc, dnode);
1702 unlock_dnode(dnode);
1707 __DEFINE_LXSYSCALL1(int, chdir, const char*, path)
1709 struct v_dnode* dnode;
1712 if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1716 errno = vfs_do_chdir((struct proc_info*)__current, dnode);
1719 return DO_STATUS(errno);
1722 __DEFINE_LXSYSCALL1(int, fchdir, int, fd)
1727 if ((errno = vfs_getfd(fd, &fd_s))) {
1731 errno = vfs_do_chdir((struct proc_info*)__current, fd_s->file->dnode);
1734 return DO_STATUS(errno);
1738 __DEFINE_LXSYSCALL1(int, chroot, const char*, path)
1741 struct v_dnode* dnode;
1742 if ((errno = vfs_walk_proc(path, &dnode, NULL, 0))) {
1748 errno = vfs_do_chdir_nolock(__current, dnode);
1750 unlock_dnode(dnode);
1754 __current->root = dnode;
1756 unlock_dnode(dnode);
1759 return DO_STATUS(errno);
1762 __DEFINE_LXSYSCALL2(char*, getcwd, char*, buf, size_t, size)
1773 if (!__current->cwd) {
1774 *buf = VFS_PATH_DELIM;
1777 len = vfs_get_path(__current->cwd, buf, size, 0);
1789 syscall_result(errno);
1794 vfs_do_rename(struct v_dnode* current, struct v_dnode* target)
1797 if (current->inode->id == target->inode->id) {
1802 if ((errno = vfs_check_writable(current))) {
1806 if (current->ref_count > 1 || target->ref_count > 1) {
1810 if (current->super_block != target->super_block) {
1814 struct v_dnode* oldparent = current->parent;
1815 struct v_dnode* newparent = target->parent;
1817 lock_dnode(current);
1821 lock_dnode(oldparent);
1823 lock_dnode(newparent);
1825 if (!llist_empty(&target->children)) {
1827 unlock_dnode(target);
1832 current->inode->ops->rename(current->inode, current, target))) {
1833 unlock_dnode(target);
1837 // re-position current
1838 hstrcpy(¤t->name, &target->name);
1839 vfs_dcache_rehash(newparent, current);
1844 unlock_dnode(target);
1847 unlock_dnode(current);
1850 unlock_dnode(oldparent);
1852 unlock_dnode(newparent);
1857 __DEFINE_LXSYSCALL2(int, rename, const char*, oldpath, const char*, newpath)
1859 struct v_dnode *cur, *target_parent, *target;
1860 struct hstr name = HSTR(valloc(VFS_NAME_MAXLEN), 0);
1863 if ((errno = vfs_walk_proc(oldpath, &cur, NULL, 0))) {
1867 if ((errno = vfs_walk(
1868 __current->cwd, newpath, &target_parent, &name, VFS_WALK_PARENT))) {
1872 errno = vfs_walk(target_parent, name.value, &target, NULL, 0);
1873 if (errno == ENOENT) {
1874 target = vfs_d_alloc(target_parent, &name);
1875 vfs_dcache_add(target_parent, target);
1885 errno = vfs_do_rename(cur, target);
1888 vfree((void*)name.value);
1889 return DO_STATUS(errno);
1892 __DEFINE_LXSYSCALL2(int, fstat, int, fd, struct file_stat*, stat)
1897 if ((errno = vfs_getfd(fd, &fds))) {
1901 struct v_inode* vino = fds->file->inode;
1902 struct device* fdev = vino->sb->dev;
1904 stat->st_ino = vino->id;
1905 stat->st_blocks = vino->lb_usage;
1906 stat->st_size = vino->fsize;
1907 stat->st_blksize = vino->sb->blksize;
1908 stat->st_nlink = vino->link_count;
1909 stat->st_uid = vino->uid;
1910 stat->st_gid = vino->gid;
1912 stat->st_ctim = vino->ctime;
1913 stat->st_atim = vino->atime;
1914 stat->st_mtim = vino->mtime;
1916 stat->st_mode = (vino->itype << 16) | vino->acl;
1918 stat->st_ioblksize = PAGE_SIZE;
1920 if (check_device_node(vino)) {
1921 struct device* rdev = resolve_device(vino->data);
1927 stat->st_rdev = (dev_t){.meta = rdev->ident.fn_grp,
1928 .unique = rdev->ident.unique,
1929 .index = dev_uid(rdev) };
1933 stat->st_dev = (dev_t){.meta = fdev->ident.fn_grp,
1934 .unique = fdev->ident.unique,
1935 .index = dev_uid(fdev) };
1939 return DO_STATUS(errno);
1942 __DEFINE_LXSYSCALL4(int, fchmodat, int, fd,
1943 const char*, path, int, mode, int, flags)
1946 struct v_dnode *dnode;
1947 struct v_inode* inode;
1949 errno = vfs_walkat(fd, path, flags, &dnode);
1954 errno = vfs_check_writable(dnode);
1959 inode = dnode->inode;
1962 if (!current_is_root()) {
1963 mode = mode & FSACL_RWXMASK;
1967 __vfs_touch_inode(inode, INODE_MODIFY);
1969 unlock_inode(inode);
1972 return DO_STATUS(errno);
1975 __DEFINE_LXSYSCALL5(int, fchownat, int, fd,
1976 const char*, path, uid_t, uid, gid_t, gid, int, flags)
1979 struct v_dnode *dnode;
1980 struct v_inode *inode;
1982 errno = vfs_walkat(fd, path, flags, &dnode);
1987 errno = vfs_check_writable(dnode);
1992 inode = dnode->inode;
1997 __vfs_touch_inode(inode, INODE_MODIFY);
1999 unlock_inode(inode);
2002 return DO_STATUS(errno);
2005 __DEFINE_LXSYSCALL4(int, faccessat, int, fd,
2006 const char*, path, int, amode, int, flags)
2009 struct v_dnode *dnode;
2010 struct v_inode *inode;
2011 struct user_scope* uscope;
2016 errno = vfs_walkat(fd, path, flags, &dnode);
2021 if ((flags & AT_EACCESS)) {
2022 tuid = current_euid();
2023 tgid = current_egid();
2026 uscope = current_user_scope();
2027 tuid = uscope->ruid;
2028 tgid = uscope->rgid;
2031 inode = dnode->inode;
2035 acl &= check_acl_between(inode->uid, inode->gid, tuid, tgid);
2041 return DO_STATUS(errno);