From ec4ff182252b6d7b3cb81f4fa783c5348a6c30fd Mon Sep 17 00:00:00 2001 From: Lunaixsky Date: Sun, 4 May 2025 19:33:51 +0100 Subject: [PATCH] Fix file system racing and ext2 directory insertion (#58) * fix: ext2 directory insertion; racing on inode create use nesting lock for v_inode and v_dnode locking. hold the lock on the parent directory upon creating inode prevent contention across multiple process that partakes similar actions fix the edge case of dirent insertion with data block boundary crossing. fix the incorrect calculation between fsblock index and ext2 data block/inode index. new test case: fragfile - random file spammer * add locks to the shared objects of ext2 fs * fix missing locks in the vfs subsystem * add locks to dnode and inode cache * add locks to fdtable for resolving inter-threads contentions * scale up the fragfile sample size --- lunaix-os/includes/lunaix/ds/mutex.h | 5 +- lunaix-os/includes/lunaix/ds/rwlock.h | 3 + lunaix-os/includes/lunaix/ds/spinlock.h | 3 + lunaix-os/includes/lunaix/fs.h | 75 ++++- lunaix-os/kernel/ds/mutex.c | 19 +- lunaix-os/kernel/fs/LConfig | 2 + lunaix-os/kernel/fs/ext2/LConfig | 11 + lunaix-os/kernel/fs/ext2/alloc.c | 36 ++- lunaix-os/kernel/fs/ext2/dir.c | 177 +++++++----- lunaix-os/kernel/fs/ext2/ext2.h | 122 +++++++-- lunaix-os/kernel/fs/ext2/file.c | 5 +- lunaix-os/kernel/fs/ext2/group.c | 23 +- lunaix-os/kernel/fs/ext2/inode.c | 154 ++++++++--- lunaix-os/kernel/fs/ext2/mount.c | 5 + lunaix-os/kernel/fs/mount.c | 4 +- lunaix-os/kernel/fs/vfs.c | 350 ++++++++++++++++++------ lunaix-os/kernel/process/fork.c | 14 +- lunaix-os/usr/LBuild | 2 + lunaix-os/usr/fragfile.c | 71 +++++ 19 files changed, 806 insertions(+), 275 deletions(-) create mode 100644 lunaix-os/kernel/fs/ext2/LConfig create mode 100644 lunaix-os/usr/fragfile.c diff --git a/lunaix-os/includes/lunaix/ds/mutex.h b/lunaix-os/includes/lunaix/ds/mutex.h index 510f1fb..304cc6d 100644 --- a/lunaix-os/includes/lunaix/ds/mutex.h +++ 
b/lunaix-os/includes/lunaix/ds/mutex.h @@ -6,7 +6,7 @@ typedef struct mutex_s { - atomic_ulong lk; + atomic_uint lk; pid_t owner; } mutex_t; @@ -37,4 +37,7 @@ mutex_unlock_nested(mutex_t* mutex); void mutex_unlock_for(mutex_t* mutex, pid_t pid); +bool +mutex_trylock(mutex_t* mutex); + #endif /* __LUNAIX_MUTEX_H */ diff --git a/lunaix-os/includes/lunaix/ds/rwlock.h b/lunaix-os/includes/lunaix/ds/rwlock.h index 7dda9ad..ac0d039 100644 --- a/lunaix-os/includes/lunaix/ds/rwlock.h +++ b/lunaix-os/includes/lunaix/ds/rwlock.h @@ -13,6 +13,9 @@ typedef struct rwlock_s waitq_t waiting_writers; } rwlock_t; +void +rwlock_init(rwlock_t* rwlock); + void rwlock_begin_read(rwlock_t* rwlock); diff --git a/lunaix-os/includes/lunaix/ds/spinlock.h b/lunaix-os/includes/lunaix/ds/spinlock.h index 557f310..cea310a 100644 --- a/lunaix-os/includes/lunaix/ds/spinlock.h +++ b/lunaix-os/includes/lunaix/ds/spinlock.h @@ -8,6 +8,9 @@ struct spinlock volatile bool flag; }; +#define DEFINE_SPINLOCK(name) \ + struct spinlock name = { .flag = false } + typedef struct spinlock spinlock_t; /* diff --git a/lunaix-os/includes/lunaix/fs.h b/lunaix-os/includes/lunaix/fs.h index bba939c..003f410 100644 --- a/lunaix-os/includes/lunaix/fs.h +++ b/lunaix-os/includes/lunaix/fs.h @@ -1,8 +1,6 @@ #ifndef __LUNAIX_VFS_H #define __LUNAIX_VFS_H -#include -#include #include #include #include @@ -10,6 +8,10 @@ #include #include #include +#include + +#include +#include #include #include #include @@ -62,20 +64,25 @@ ('0' <= (chr) && (chr) <= '9') || (chr) == '.' 
|| (chr) == '_' || \ (chr) == '-' || (chr) == ':') -#define unlock_inode(inode) mutex_unlock(&inode->lock) +#define unlock_inode(inode) mutex_unlock_nested(&inode->lock) #define lock_inode(inode) \ ({ \ - mutex_lock(&inode->lock); \ + mutex_lock_nested(&inode->lock); \ lru_use_one(inode_lru, &inode->lru); \ }) -#define unlock_dnode(dnode) mutex_unlock(&dnode->lock) +#define unlock_dnode(dnode) mutex_unlock_nested(&dnode->lock) #define lock_dnode(dnode) \ ({ \ - mutex_lock(&dnode->lock); \ + mutex_lock_nested(&dnode->lock); \ lru_use_one(dnode_lru, &dnode->lru); \ }) +#define dnode_atomic(dnode, ops) \ + do { lock_dnode(dnode); ops; unlock_dnode(dnode); } while(0) + +#define locked_node(node) mutex_on_hold(&(node)->lock) + #define assert_fs(cond) assert_p(cond, "FS") #define fail_fs(msg) fail_p(msg, "FS") @@ -117,6 +124,28 @@ struct fs_iter struct filesystem* fs; }; +struct vncache +{ + struct hbucket* pool; + rwlock_t lock; +}; +#define cache_atomic_read(cache, ops) \ + do { \ + rwlock_begin_read(&(cache)->lock); \ + ops; \ + rwlock_end_read(&(cache)->lock); \ + } while (0) + +#define cache_atomic_write(cache, ops) \ + do { \ + rwlock_begin_write(&(cache)->lock); \ + ops; \ + rwlock_end_write(&(cache)->lock); \ + } while (0) + +#define dnode_cache(dnode) (&(dnode)->super_block->d_cache) +#define inode_cache(inode) (&(inode)->sb->i_cache) + struct v_superblock { struct llist_header sb_list; @@ -124,8 +153,8 @@ struct v_superblock struct v_dnode* root; struct filesystem* fs; struct blkbuf_cache* blks; - struct hbucket* i_cache; - struct hbucket* d_cache; + struct vncache i_cache; + struct vncache d_cache; void* data; unsigned int ref_count; @@ -305,7 +334,10 @@ struct v_dnode struct v_fdtable { struct v_fd* fds[VFS_MAX_FD]; + mutex_t lock; // inter-threads contention }; +#define lock_fdtable(fdtab) mutex_lock(&(fdtab)->lock) +#define unlock_fdtable(fdtab) mutex_unlock(&(fdtab)->lock) struct pcache { @@ -364,6 +396,21 @@ fsm_itend(struct fs_iter* iterator) 
iterator->fs = NULL; } +void +vfs_vncache_init(struct vncache* cache); + +void +vfs_vncache_free(struct vncache* cache); + +void +vfs_vncache_add(struct vncache* cache, size_t key, struct hlist_node* node); + +#define vncache_lock_read(cache) rwlock_begin_read(&(cache)->lock); +#define vncache_unlock_read(cache) rwlock_end_read(&(cache)->lock); + +#define vncache_lock_write(cache) rwlock_begin_write(&(cache)->lock); +#define vncache_unlock_write(cache) rwlock_end_write(&(cache)->lock); + void vfs_init(); @@ -655,6 +702,18 @@ void xattr_addcache(struct v_inode* inode, struct v_xattr_entry* xattr); +/* --- fdtable --- */ + +struct v_fdtable* +fdtable_create(); + +void +fdtable_copy(struct v_fdtable* dest, struct v_fdtable* src); + +void +fdtable_free(struct v_fdtable* table); + + /* --- misc stuff --- */ #define check_itype(to_check, itype) \ diff --git a/lunaix-os/kernel/ds/mutex.c b/lunaix-os/kernel/ds/mutex.c index 41e131d..4aac422 100644 --- a/lunaix-os/kernel/ds/mutex.c +++ b/lunaix-os/kernel/ds/mutex.c @@ -2,6 +2,12 @@ #include #include +#define __do_lock(mutext) \ + ({ \ + atomic_fetch_add(&mutex->lk, 1);\ + mutex->owner = __current->pid; \ + }) + static inline bool must_inline __mutex_check_owner(mutex_t* mutex) { @@ -15,8 +21,7 @@ __mutext_lock(mutex_t* mutex) preempt_current(); } - atomic_fetch_add(&mutex->lk, 1); - mutex->owner = __current->pid; + __do_lock(mutex); } static inline void must_inline @@ -32,6 +37,16 @@ mutex_lock(mutex_t* mutex) __mutext_lock(mutex); } +bool +mutex_trylock(mutex_t* mutex) +{ + if (atomic_load(&mutex->lk)) + return false; + + __do_lock(mutex); + return true; +} + void mutex_unlock(mutex_t* mutex) { diff --git a/lunaix-os/kernel/fs/LConfig b/lunaix-os/kernel/fs/LConfig index 194e186..880d6c6 100644 --- a/lunaix-os/kernel/fs/LConfig +++ b/lunaix-os/kernel/fs/LConfig @@ -19,3 +19,5 @@ def file_system(): type(bool) default(True) + +include("ext2") \ No newline at end of file diff --git a/lunaix-os/kernel/fs/ext2/LConfig 
b/lunaix-os/kernel/fs/ext2/LConfig new file mode 100644 index 0000000..f5cb0a6 --- /dev/null +++ b/lunaix-os/kernel/fs/ext2/LConfig @@ -0,0 +1,11 @@ + +@Collection("ext2") +def ext2_fs(): + add_to_collection(file_system) + + @Term("Debug Messages") + def ext2_debug_msg(): + type(bool) + default(False) + + return v(fs_ext2) \ No newline at end of file diff --git a/lunaix-os/kernel/fs/ext2/alloc.c b/lunaix-os/kernel/fs/ext2/alloc.c index 2996141..62dfb6b 100644 --- a/lunaix-os/kernel/fs/ext2/alloc.c +++ b/lunaix-os/kernel/fs/ext2/alloc.c @@ -4,12 +4,20 @@ static inline unsigned int __ext2_global_slot_alloc(struct v_superblock* vsb, int type_sel, struct ext2_gdesc** gd_out) { + int alloc; struct ext2_sbinfo* sb; struct ext2_gdesc *pos; struct llist_header *header; + alloc = ALLOC_FAIL; sb = EXT2_SB(vsb); + + ext2sb_lock(sb); header = &sb->free_list_sel[type_sel]; + + // we have used up all avaliable inodes/blocks + if (llist_empty(header)) + goto done; if (type_sel == GDESC_INO_SEL) { pos = list_entry(header->next, struct ext2_gdesc, free_grps_ino); @@ -18,12 +26,14 @@ __ext2_global_slot_alloc(struct v_superblock* vsb, int type_sel, pos = list_entry(header->next, struct ext2_gdesc, free_grps_blk); } - int alloc = ext2gd_alloc_slot(pos, type_sel); + alloc = ext2gd_alloc_slot(pos, type_sel); if (valid_bmp_slot(alloc)) { *gd_out = pos; } +done: + ext2sb_unlock(sb); return alloc; } @@ -45,13 +55,15 @@ ext2gd_alloc_slot(struct ext2_gdesc* gd, int type_sel) struct ext2_bmp* bmp; struct ext2_sbinfo *sb; int alloc; + + ext2gd_lock(gd); sb = gd->sb; bmp = &gd->bmps[type_sel]; - alloc = ext2bmp_alloc_one(bmp); + alloc = ext2bmp_alloc_nolock(bmp); if (alloc < 0) { - return alloc; + goto done; } if (!ext2bmp_check_free(bmp)) { @@ -66,8 +78,11 @@ ext2gd_alloc_slot(struct ext2_gdesc* gd, int type_sel) sb->raw->s_free_blk_cnt--; } - fsblock_dirty(gd->buf); - fsblock_dirty(sb->buf); + ext2gd_schedule_sync(gd); + ext2sb_schedule_sync(sb); + +done: + ext2gd_unlock(gd); return alloc; } 
@@ -77,7 +92,9 @@ ext2gd_free_slot(struct ext2_gdesc* gd, int type_sel, int slot) struct llist_header *free_ent, *free_list; struct ext2_sbinfo *sb; - ext2bmp_free_one(&gd->bmps[type_sel], slot); + ext2gd_lock(gd); + + ext2bmp_free_nolock(&gd->bmps[type_sel], slot); sb = gd->sb; free_ent = &gd->free_list_sel[slot]; @@ -86,6 +103,7 @@ ext2gd_free_slot(struct ext2_gdesc* gd, int type_sel, int slot) llist_append(free_list, free_ent); } + // FIXME might need arch-depedent impl for atomic operations if (type_sel == GDESC_INO_SEL) { gd->info->bg_free_ino_cnt++; sb->raw->s_free_ino_cnt++; @@ -94,6 +112,8 @@ ext2gd_free_slot(struct ext2_gdesc* gd, int type_sel, int slot) sb->raw->s_free_blk_cnt++; } - fsblock_dirty(gd->buf); - fsblock_dirty(sb->buf); + ext2gd_schedule_sync(gd); + ext2sb_schedule_sync(sb); + + ext2gd_unlock(gd); } \ No newline at end of file diff --git a/lunaix-os/kernel/fs/ext2/dir.c b/lunaix-os/kernel/fs/ext2/dir.c index a6af51e..9058c8a 100644 --- a/lunaix-os/kernel/fs/ext2/dir.c +++ b/lunaix-os/kernel/fs/ext2/dir.c @@ -57,98 +57,138 @@ done: _ret: fsblock_put(prev_buf); ext2dr_itend(&iter); + return itstate_sel(&iter, errno); } -static size_t +static inline size_t __dirent_realsize(struct ext2b_dirent* dirent) { return sizeof(*dirent) - sizeof(dirent->name) + dirent->name_len; } -#define DIRENT_SLOT_MID 0 -#define DIRENT_SLOT_LAST 1 -#define DIRENT_SLOT_EMPTY 2 +#define DIRENT_INSERT 0 +#define DIRENT_APPEND 1 + +#define DIRENT_ALIGNMENT sizeof(int) + +struct dirent_locator +{ + size_t search_size; + + int state; + struct ext2_dnode result; + size_t new_prev_reclen; + size_t db_pos; +}; + + +static inline void must_inline +__init_locator(struct dirent_locator* loc, size_t search_size) +{ + *loc = (struct dirent_locator) { .search_size = search_size }; +} static int -__find_free_dirent_slot(struct v_inode* inode, size_t size, - struct ext2_dnode* e_dnode_out, size_t *reclen) +__find_free_dirent_slot(struct v_inode* inode, struct dirent_locator* loc) { - 
struct ext2_iterator iter; + struct ext2_iterator dbit; struct ext2b_dirent *dir = NULL; + struct ext2_dnode* result; + bbuf_t prev_buf = bbuf_null; bool found = false; - ext2db_itbegin(&iter, inode); - - size_t sz = 0; + size_t sz = 0, aligned = 0; unsigned int rec = 0, total_rec = 0; + unsigned int dir_size; + + aligned = ROUNDUP(loc->search_size, DIRENT_ALIGNMENT); + result = &loc->result; + + ext2db_itbegin(&dbit, inode, DBIT_MODE_BLOCK); - while (!found && ext2db_itnext(&iter)) + while (!found && ext2db_itnext(&dbit)) { rec = 0; do { - dir = (struct ext2b_dirent*)offset(iter.data, rec); + dir = (struct ext2b_dirent*)offset(dbit.data, rec); sz = dir->rec_len - __dirent_realsize(dir); - sz = ROUNDDOWN(sz, 4); - if (sz >= size) { + sz = ROUNDDOWN(sz, DIRENT_ALIGNMENT); + if ((signed)sz >= (signed)aligned) { found = true; break; } rec += dir->rec_len; total_rec += dir->rec_len; - } while(rec < iter.blksz); + } while(rec < dbit.blksz); if (likely(prev_buf)) { fsblock_put(prev_buf); } - prev_buf = fsblock_take(iter.sel_buf); + prev_buf = fsblock_take(dbit.sel_buf); } + ext2_debug("dr_find_slot: found=%d, blk_off=%d, off=%d, gap=%d, blk=%d/%d", + found, rec, total_rec, sz, dbit.pos - 1, dbit.end_pos); + + loc->db_pos = dbit.pos - 1; + if (blkbuf_nullbuf(prev_buf)) { // this dir is brand new - return DIRENT_SLOT_EMPTY; + loc->state = DIRENT_APPEND; + goto done; } - e_dnode_out->prev = (struct ext2_dnode_sub) { + dir_size = ROUNDUP(__dirent_realsize(dir), 4); + loc->new_prev_reclen = dir_size; + + result->prev = (struct ext2_dnode_sub) { .buf = fsblock_take(prev_buf), .dirent = dir }; if (!found) { // if prev is the last, and no more space left behind. 
- assert_fs(rec == iter.blksz); + assert_fs(rec == dbit.blksz); - e_dnode_out->self.buf = bbuf_null; - ext2db_itend(&iter); - return itstate_sel(&iter, DIRENT_SLOT_LAST); - } + result->self.buf = bbuf_null; + ext2db_itend(&dbit); - unsigned int dir_size; - - dir_size = ROUNDUP(__dirent_realsize(dir), 4); - *reclen = dir_size; + loc->state = DIRENT_APPEND; + goto done; + } - rec = total_rec + dir_size; - dir = (struct ext2b_dirent*)offset(iter.data, rec); + rec += dir_size; + dir = (struct ext2b_dirent*)offset(dbit.data, rec); - e_dnode_out->self = (struct ext2_dnode_sub) { - .buf = fsblock_take(iter.sel_buf), + result->self = (struct ext2_dnode_sub) { + .buf = fsblock_take(dbit.sel_buf), .dirent = dir }; - ext2db_itend(&iter); - return DIRENT_SLOT_MID; + ext2db_itend(&dbit); + + loc->state = DIRENT_INSERT; + +done: + return itstate_sel(&dbit, 0); } static inline void -__destruct_ext2_dnode(struct ext2_dnode* e_dno) +__release_dnode_blocks(struct ext2_dnode* e_dno) { fsblock_put(e_dno->prev.buf); fsblock_put(e_dno->self.buf); +} + +static inline void +__destruct_ext2_dnode(struct ext2_dnode* e_dno) +{ + __release_dnode_blocks(e_dno); vfree(e_dno); } @@ -452,54 +492,35 @@ ext2dr_insert(struct v_inode* this, struct ext2b_dirent* dirent, { int errno; size_t size, new_reclen, old_reclen; - struct ext2_inode* e_self; struct ext2_dnode* e_dno; struct ext2b_dirent* prev_dirent; + struct dirent_locator locator; bbuf_t buf; - e_self = EXT2_INO(this); - e_dno = vzalloc(sizeof(*e_dno)); - size = __dirent_realsize(dirent); - errno = __find_free_dirent_slot(this, size, e_dno, &new_reclen); + __init_locator(&locator, size); + + errno = __find_free_dirent_slot(this, &locator); if (errno < 0) { goto failed; } - if (errno == DIRENT_SLOT_EMPTY) { - if ((errno = ext2db_acquire(this, 0, &buf))) { + e_dno = &locator.result; + new_reclen = locator.new_prev_reclen; + old_reclen = fsapi_block_size(this->sb); + + if (locator.state != DIRENT_INSERT) + { + if ((errno = ext2db_acquire(this, 
locator.db_pos, &buf))) goto failed; - } this->fsize += fsapi_block_size(this->sb); ext2ino_update(this); - old_reclen = fsapi_block_size(this->sb); e_dno->self.buf = buf; - e_dno->self.dirent = blkbuf_data(buf); - - goto place_dir; + e_dno->self.dirent = block_buffer(buf, struct ext2b_dirent); } - prev_dirent = e_dno->prev.dirent; - old_reclen = prev_dirent->rec_len; - - if (errno == DIRENT_SLOT_LAST) { - // prev is last record - if ((errno = ext2db_alloc(this, &buf))) { - goto failed; - } - - this->fsize += fsapi_block_size(this->sb); - ext2ino_update(this); - - new_reclen = __dirent_realsize(prev_dirent); - new_reclen = ROUNDUP(new_reclen, sizeof(int)); - e_dno->self = (struct ext2_dnode_sub) { - .buf = buf, - .dirent = block_buffer(buf, struct ext2b_dirent) - }; - } /* --- +--------+ --- @@ -519,17 +540,29 @@ ext2dr_insert(struct v_inode* this, struct ext2b_dirent* dirent, +--------+ */ - old_reclen -= new_reclen; - prev_dirent->rec_len = new_reclen; - fsblock_dirty(e_dno->prev.buf); + else + { + prev_dirent = e_dno->prev.dirent; + old_reclen = prev_dirent->rec_len; + old_reclen -= new_reclen; + + prev_dirent->rec_len = new_reclen; + fsblock_dirty(e_dno->prev.buf); + } + + ext2_debug("dr_insert: state=%d, blk=%d, prev_rlen=%d, new_rlen=%d", + locator.state, locator.db_pos, new_reclen, old_reclen); -place_dir: - dirent->rec_len = ROUNDUP(old_reclen, sizeof(int)); + assert_fs(new_reclen > 0); + assert_fs(old_reclen > 0); + + dirent->rec_len = old_reclen; + memcpy(e_dno->self.dirent, dirent, size); fsblock_dirty(e_dno->self.buf); if (!e_dno_out) { - __destruct_ext2_dnode(e_dno); + __release_dnode_blocks(e_dno); } else { *e_dno_out = e_dno; @@ -538,7 +571,7 @@ place_dir: return errno; failed: - __destruct_ext2_dnode(e_dno); + __release_dnode_blocks(e_dno); return errno; } diff --git a/lunaix-os/kernel/fs/ext2/ext2.h b/lunaix-os/kernel/fs/ext2/ext2.h index 1f12315..8a6e852 100644 --- a/lunaix-os/kernel/fs/ext2/ext2.h +++ b/lunaix-os/kernel/fs/ext2/ext2.h @@ -6,6 
+6,14 @@ #include #include #include +#include + +#ifdef CONFIG_EXT2_DEBUG_MSG +# include +# define ext2_debug(fmt, ...) kprintf_v("ext2", fmt, ##__VA_ARGS__) +#else +# define ext2_debug(fmt, ...) +#endif #define FEAT_COMPRESSION 0b00000001 #define FEAT_RESIZE_INO 0b00000010 @@ -202,6 +210,8 @@ struct ext2_sbinfo struct llist_header gds; GDESC_FREE_LISTS; }; + + mutex_t lock; }; #define EXT2_SB(vsb) (fsapi_impl_data(vsb, struct ext2_sbinfo)) @@ -234,6 +244,8 @@ struct ext2_gdesc struct ext2_sbinfo* sb; bbuf_t buf; bcobj_t cache_ref; + + mutex_t lock; }; /* @@ -269,7 +281,8 @@ struct ext2_inode bbuf_t buf; // partial inotab that holds this inode unsigned int inds_lgents; // log2(# of block in an indirection level) unsigned int ino_id; - size_t indirect_blocks; + size_t nr_fsblks; + size_t nr_indblks; size_t isize; struct ext2b_inode* ino; // raw ext2 inode @@ -290,6 +303,8 @@ struct ext2_inode // prefetched block for 1st order of indirection bbuf_t ind_ord1; char* symlink; + + // No lock required, it shares lock context with v_inode. }; #define EXT2_INO(v_inode) (fsapi_impl_data(v_inode, struct ext2_inode)) @@ -303,6 +318,8 @@ struct ext2_dnode { struct ext2_dnode_sub self; struct ext2_dnode_sub prev; + + // No lock required, it shares lock context with v_dnode. 
}; #define EXT2_DNO(v_dnode) (fsapi_impl_data(v_dnode, struct ext2_dnode)) @@ -336,7 +353,6 @@ struct ext2_iterator struct ext2_file { struct ext2_iterator iter; - struct ext2_inode* b_ino; }; #define EXT2_FILE(v_file) (fsapi_impl_data(v_file, struct ext2_file)) @@ -371,6 +387,27 @@ ext2_feature(struct v_superblock* vsb, unsigned int feat) return !!(EXT2_SB(vsb)->all_feature & feat); } +/* ************ Superblock ************ */ + +static inline void +ext2sb_schedule_sync(struct ext2_sbinfo* sb) +{ + fsblock_dirty(sb->buf); +} + +static inline void must_inline +ext2sb_lock(struct ext2_sbinfo* sb) +{ + mutex_lock(&sb->lock); +} + +static inline void must_inline +ext2sb_unlock(struct ext2_sbinfo* sb) +{ + mutex_unlock(&sb->lock); +} + + /* ************ Inodes ************ */ void @@ -405,8 +442,20 @@ ext2ino_linkto(struct ext2_inode* e_ino, struct ext2b_dirent* dirent) fsblock_dirty(e_ino->buf); } +static inline void +ext2ino_schedule_sync(struct ext2_inode* ino) +{ + fsblock_dirty(ino->buf); +} + + +/* ************* Data blocks ************* */ + +#define DBIT_MODE_ISIZE 0 +#define DBIT_MODE_BLOCK 1 + void -ext2db_itbegin(struct ext2_iterator* iter, struct v_inode* inode); +ext2db_itbegin(struct ext2_iterator* iter, struct v_inode* inode, int mode); void ext2db_itend(struct ext2_iterator* iter); @@ -486,14 +535,33 @@ void ext2gd_release_gdt(struct v_superblock* vsb); int -ext2gd_take(struct v_superblock* vsb, +ext2gd_take_at(struct v_superblock* vsb, unsigned int index, struct ext2_gdesc** out); +static inline struct ext2_gdesc* +ext2gd_take(struct ext2_gdesc* gd) { + bcache_refonce(gd->cache_ref); + + return gd; +} + static inline void ext2gd_put(struct ext2_gdesc* gd) { bcache_return(gd->cache_ref); } +static inline void must_inline +ext2gd_lock(struct ext2_gdesc* gd) +{ + mutex_lock(&gd->lock); +} + +static inline void must_inline +ext2gd_unlock(struct ext2_gdesc* gd) +{ + mutex_unlock(&gd->lock); +} + /* ************ Directory ************ */ @@ -599,23 +667,6 
@@ ext2_get_symlink(struct v_inode *this, const char **path_out); int ext2_set_symlink(struct v_inode *this, const char *target); -/* *********** Bitmap *********** */ - -void -ext2bmp_init(struct ext2_bmp* e_bmp, bbuf_t bmp_buf, unsigned int nr_bits); - -bool -ext2bmp_check_free(struct ext2_bmp* e_bmp); - -int -ext2bmp_alloc_one(struct ext2_bmp* e_bmp); - -void -ext2bmp_free_one(struct ext2_bmp* e_bmp, unsigned int pos); - -void -ext2bmp_discard(struct ext2_bmp* e_bmp); - /* *********** Allocations *********** */ #define ALLOC_FAIL -1 @@ -656,6 +707,13 @@ ext2gd_free_block(struct ext2_gdesc* gd, int slot) ext2gd_free_slot(gd, GDESC_BLK_SEL, slot); } +static inline void +ext2gd_schedule_sync(struct ext2_gdesc* gd) +{ + fsblock_dirty(gd->buf); + fsblock_dirty(gd->ino_bmp.raw); + fsblock_dirty(gd->blk_bmp.raw); +} /** * @brief Allocate a free inode @@ -705,4 +763,26 @@ int ext2db_alloc_slot(struct v_superblock* vsb, struct ext2_gdesc** gd_out); +/* *********** Bitmap *********** */ + +void +ext2bmp_init(struct ext2_bmp* e_bmp, bbuf_t bmp_buf, unsigned int nr_bits); + +int +ext2bmp_alloc_nolock(struct ext2_bmp* e_bmp); + +void +ext2bmp_free_nolock(struct ext2_bmp* e_bmp, unsigned int pos); + +void +ext2bmp_discard_nolock(struct ext2_bmp* e_bmp); + +static inline bool +ext2bmp_check_free(struct ext2_bmp* e_bmp) +{ + assert(e_bmp->raw); + + return valid_bmp_slot(e_bmp->next_free); +} + #endif /* __LUNAIX_EXT2_H */ diff --git a/lunaix-os/kernel/fs/ext2/file.c b/lunaix-os/kernel/fs/ext2/file.c index 0fe4d3d..bd14785 100644 --- a/lunaix-os/kernel/fs/ext2/file.c +++ b/lunaix-os/kernel/fs/ext2/file.c @@ -12,7 +12,6 @@ ext2_open_inode(struct v_inode* inode, struct v_file* file) struct ext2_file* e_file; e_file = valloc(sizeof(*e_file)); - e_file->b_ino = EXT2_INO(inode); file->data = e_file; @@ -96,7 +95,7 @@ ext2_inode_read(struct v_inode *inode, blksz = e_sb->block_size; end = fpos + len; - ext2db_itbegin(&iter, inode); + ext2db_itbegin(&iter, inode, DBIT_MODE_ISIZE); 
ext2db_itffw(&iter, fpos / blksz); while (fpos < end && ext2db_itnext(&iter)) { @@ -134,7 +133,7 @@ ext2_inode_read_page(struct v_inode *inode, void *buffer, size_t fpos) n = PAGE_SIZE / e_sb->block_size; transfer_sz = MIN(PAGE_SIZE, e_sb->block_size); - ext2db_itbegin(&iter, inode); + ext2db_itbegin(&iter, inode, DBIT_MODE_ISIZE); ext2db_itffw(&iter, blk_start); while (n-- && ext2db_itnext(&iter)) diff --git a/lunaix-os/kernel/fs/ext2/group.c b/lunaix-os/kernel/fs/ext2/group.c index b77d6b7..d11e510 100644 --- a/lunaix-os/kernel/fs/ext2/group.c +++ b/lunaix-os/kernel/fs/ext2/group.c @@ -89,8 +89,8 @@ __try_load_bitmap(struct v_superblock* vsb, struct ext2_sbinfo* ext2sb; struct ext2_bmp* bmp; struct llist_header* flist, *flist_entry; + unsigned int bmp_blk_id, bmp_size; bbuf_t buf; - unsigned int blk_id, bmp_blk_id, bmp_size; ext2sb = EXT2_SB(vsb); @@ -111,8 +111,7 @@ __try_load_bitmap(struct v_superblock* vsb, flist = &ext2sb->free_list_sel[type]; flist_entry = &gd->free_list_sel[type]; - blk_id = ext2_datablock(vsb, bmp_blk_id); - buf = fsblock_get(vsb, blk_id); + buf = fsblock_get(vsb, bmp_blk_id); if (blkbuf_errbuf(buf)) { return false; } @@ -127,7 +126,7 @@ __try_load_bitmap(struct v_superblock* vsb, } int -ext2gd_take(struct v_superblock* vsb, +ext2gd_take_at(struct v_superblock* vsb, unsigned int index, struct ext2_gdesc** out) { bbuf_t part, buf; @@ -170,6 +169,8 @@ ext2gd_take(struct v_superblock* vsb, .ino_base = index * ext2sb->raw->s_ino_per_grp }; + mutex_init(&gd->lock); + *out = gd; if (!ext2sb->read_only) { @@ -236,16 +237,8 @@ ext2bmp_init(struct ext2_bmp* e_bmp, bbuf_t bmp_buf, unsigned int nr_bits) __ext2bmp_update_next_free_cell(e_bmp); } -bool -ext2bmp_check_free(struct ext2_bmp* e_bmp) -{ - assert(e_bmp->raw); - - return valid_bmp_slot(e_bmp->next_free); -} - int -ext2bmp_alloc_one(struct ext2_bmp* e_bmp) +ext2bmp_alloc_nolock(struct ext2_bmp* e_bmp) { assert(e_bmp->raw); @@ -276,7 +269,7 @@ ext2bmp_alloc_one(struct ext2_bmp* e_bmp) } void 
-ext2bmp_free_one(struct ext2_bmp* e_bmp, unsigned int pos) +ext2bmp_free_nolock(struct ext2_bmp* e_bmp, unsigned int pos) { assert(e_bmp->raw); @@ -292,7 +285,7 @@ ext2bmp_free_one(struct ext2_bmp* e_bmp, unsigned int pos) } void -ext2bmp_discard(struct ext2_bmp* e_bmp) +ext2bmp_discard_nolock(struct ext2_bmp* e_bmp) { assert(e_bmp->raw); diff --git a/lunaix-os/kernel/fs/ext2/inode.c b/lunaix-os/kernel/fs/ext2/inode.c index 5871d07..0956b1e 100644 --- a/lunaix-os/kernel/fs/ext2/inode.c +++ b/lunaix-os/kernel/fs/ext2/inode.c @@ -128,8 +128,65 @@ __btlb_flushall(struct ext2_inode* e_inode) } } +/** + * Obtain the number of indirect blocks (blocks that contain + * pointers to next level blocks) among the `fsblks` blocks of an inode. + * + * Let N be the number of ids that a data block can hold, + * then the total number of data blocks assigned (reserved) + * to the inode: + * + * i_blocks = 12 + (N + 1) + (N^2 + N + 1) + (N^3 + N^2 + N + 1) + */ +static int +__get_nr_indblks(struct ext2_sbinfo* sb, size_t fsblks) +{ + ssize_t blks; + int nr_ents; + int nr_inds = 0, n, acc_nr; + + blks = (ssize_t)fsblks; + nr_ents = sb->block_size / sizeof(int); + acc_nr = 1; + + if (blks <= 12) + return 0; + + blks -= 12; + + if (blks > 0) // order-1 indirection + { + n = MIN(ICEIL(blks, nr_ents), acc_nr); + blks -= n * nr_ents; + + nr_inds += 1; + acc_nr *= nr_ents; + } + + if (blks > 0) // order-2 indirection + { + n = MIN(ICEIL(blks, nr_ents), acc_nr); + blks -= n * nr_ents; + + nr_inds += n + 1; + acc_nr *= nr_ents; + } + + if (blks > 0) // order-3 indirection + { + n = MIN(ICEIL(blks, nr_ents), acc_nr); + blks -= n * nr_ents; + + nr_inds += n + ICEIL(n, nr_ents) + 1; + } + + assert_fs(blks <= 0); + + return nr_inds; +} + void -ext2db_itbegin(struct ext2_iterator* iter, struct v_inode* inode) +ext2db_itbegin(struct ext2_iterator* iter, struct v_inode* inode, int mode) { struct ext2_inode* e_ino; @@ -138,8 +195,12 @@ ext2db_itbegin(struct ext2_iterator* iter, struct v_inode* inode) .pos = 0, .inode = inode, .blksz = 
inode->sb->blksize, - .end_pos = ICEIL(e_ino->isize, inode->sb->blksize) }; + + if (mode == DBIT_MODE_ISIZE) + iter->end_pos = ICEIL(e_ino->isize, inode->sb->blksize); + else + iter->end_pos = e_ino->nr_fsblks - e_ino->nr_indblks; } void @@ -186,14 +247,13 @@ ext2db_itnext(struct ext2_iterator* iter) fsblock_put(iter->sel_buf); } - buf = ext2db_get(iter->inode, iter->pos); + buf = ext2db_get(iter->inode, iter->pos++); iter->sel_buf = buf; if (!buf || !ext2_itcheckbuf(iter)) { return false; } - iter->pos++; iter->data = blkbuf_data(buf); return true; @@ -331,7 +391,7 @@ __get_group_desc(struct v_superblock* vsb, int ino, sb = EXT2_SB(vsb); blkgrp_id = to_fsblock_id(ino) / sb->raw->s_ino_per_grp; - return ext2gd_take(vsb, blkgrp_id, gd_out); + return ext2gd_take_at(vsb, blkgrp_id, gd_out); } static struct ext2b_inode* @@ -371,7 +431,7 @@ __create_inode(struct v_superblock* vsb, struct ext2_gdesc* gd, int ino_index) struct ext2b_inode* b_inode; struct ext2_inode* inode; unsigned int ind_ents; - size_t inds_blks; + size_t nr_linked; sb = gd->sb; b_inode = __get_raw_inode(vsb, gd, &ino_tab, ino_index); @@ -383,7 +443,7 @@ __create_inode(struct v_superblock* vsb, struct ext2_gdesc* gd, int ino_index) inode->btlb = vzalloc(sizeof(struct ext2_btlb)); inode->buf = ino_tab; inode->ino = b_inode; - inode->blk_grp = gd; + inode->blk_grp = ext2gd_take(gd); inode->isize = b_inode->i_size; if (ext2_feature(vsb, FEAT_LARGE_FILE)) { @@ -391,11 +451,11 @@ __create_inode(struct v_superblock* vsb, struct ext2_gdesc* gd, int ino_index) } if (b_inode->i_blocks) { - inds_blks = (size_t)b_inode->i_blocks; - inds_blks -= ICEIL(inode->isize, 512); - inds_blks /= (sb->block_size / 512); + nr_linked = (size_t)b_inode->i_blocks; + nr_linked /= (sb->block_size / 512); - inode->indirect_blocks = inds_blks; + inode->nr_fsblks = nr_linked; + inode->nr_indblks = __get_nr_indblks(sb, nr_linked); } ind_ents = sb->block_size / sizeof(int); @@ -404,6 +464,8 @@ __create_inode(struct v_superblock* vsb, 
struct ext2_gdesc* gd, int ino_index) inode->inds_lgents = ilog2(ind_ents); inode->ino_id = gd->ino_base + to_ext2ino_id(ino_index); + ext2_debug("ino(%d): isize=%lu, nr_blk=%lu, nr_inds=%lu", + inode->ino_id, inode->isize, inode->nr_fsblks, inode->nr_indblks); return inode; } @@ -530,7 +592,7 @@ __free_block_at(struct v_superblock *vsb, unsigned int block_pos) sb = EXT2_SB(vsb); gd_index = block_pos / sb->raw->s_blk_per_grp; - if ((errno = ext2gd_take(vsb, gd_index, &gd))) { + if ((errno = ext2gd_take_at(vsb, gd_index, &gd))) { return errno; } @@ -654,7 +716,9 @@ __update_inode_size(struct v_inode* inode, size_t size) { struct ext2b_inode* b_ino; struct ext2_inode* e_ino; + struct ext2_sbinfo* sb; + sb = EXT2_SB(inode->sb); e_ino = EXT2_INO(inode); b_ino = e_ino->ino; @@ -668,8 +732,8 @@ __update_inode_size(struct v_inode* inode, size_t size) b_ino->i_size = size; } - b_ino->i_blocks = ICEIL(size, 512); - b_ino->i_blocks += e_ino->indirect_blocks; + b_ino->i_blocks = e_ino->nr_fsblks * (sb->block_size / 512); + fsblock_dirty(e_ino->buf); } int @@ -738,6 +802,9 @@ ext2_link(struct v_inode* this, struct v_dnode* new_name) new_name->data = e_dno; vfs_assign_inode(new_name, this); + // linking a dnode to parent could result new data block allocated + ext2_sync_inode(parent); + done: return errno; } @@ -760,6 +827,8 @@ ext2_unlink(struct v_inode* this, struct v_dnode* name) return errno; } + // unlink a dnode from parent will not free the allocated data blocks + // rather, it leads to fragmentation return ext2ino_free(this); } @@ -784,6 +853,9 @@ __walkstate_set_stack(struct walk_state* state, int depth, state->stack.indices[depth] = index; } +#define WALKMODE_ALLOC 0b01 +#define WALKMODE_NOBTLB 0b10 + /** * @brief Walk the indrection chain given the position of data block * relative to the inode. 
Upon completed, walk_state will be @@ -794,20 +866,26 @@ __walkstate_set_stack(struct walk_state* state, int depth, * (i.e., a leaf block), then the state is the indirect block that * containing the ID of that leaf block. * - * If `resolve` is set, it will resolve any absence encountered - * during the walk by allocating and chaining indirect block. - * It require the file system is mounted writable. + * Two modes can be specified to alter the walk process: + * + * WALKMODE_ALLOC + * resolve any absence encountered + * during the walk by allocating and chaining indirect block + * + * WALKMODE_NOBTLB + * Ignore the cached result, always perform a complete walk. + * This does not by pass the cache entirely, lower level caches + * like block buffer (blkio request cache) will be used transparently * * @param inode inode to walk * @param pos flattened data block position to be located * @param state contain the walk result - * @param resolve whether to auto allocate the indirection structure during - * walk if `pos` is not exist. 
+ * @param mode walk mode * @return int */ static int __walk_indirects(struct v_inode* inode, unsigned int pos, - struct walk_state* state, bool resolve, bool full_walk) + struct walk_state* state, int mode) { int errno; int inds, stride, shifts, level; @@ -816,12 +894,13 @@ __walk_indirects(struct v_inode* inode, unsigned int pos, struct ext2b_inode* b_inode; struct v_superblock* vsb; bbuf_t table, next_table; + bool alloc; e_inode = EXT2_INO(inode); b_inode = e_inode->ino; vsb = inode->sb; level = 0; - resolve = resolve && !EXT2_SB(vsb)->read_only; + alloc = (mode & WALKMODE_ALLOC) && !EXT2_SB(vsb)->read_only; if (pos < 12) { index = pos; @@ -847,7 +926,7 @@ __walk_indirects(struct v_inode* inode, unsigned int pos, } // bTLB cache the last level indirect block - if (!full_walk && (table = __btlb_hit(e_inode, pos))) { + if (!(mode & WALKMODE_NOBTLB) && (table = __btlb_hit(e_inode, pos))) { level = inds; index = pos & ((1 << stride) - 1); slotref = &block_buffer(table, u32_t)[index]; @@ -867,7 +946,7 @@ __walk_indirects(struct v_inode* inode, unsigned int pos, next = *slotref; if (!next) { - if (!resolve) { + if (!alloc) { goto _return; } @@ -876,7 +955,6 @@ __walk_indirects(struct v_inode* inode, unsigned int pos, return errno; } - e_inode->indirect_blocks++; *slotref = fsblock_id(next_table); fsblock_dirty(table); } @@ -894,7 +972,6 @@ __walk_indirects(struct v_inode* inode, unsigned int pos, assert(shifts >= 0); index = (pos & mask) >> shifts; - slotref = &block_buffer(table, u32_t)[index]; shifts -= stride; @@ -927,7 +1004,7 @@ ext2db_get(struct v_inode* inode, unsigned int data_pos) ext2walk_init_state(&state); - errno = __walk_indirects(inode, data_pos, &state, false, false); + errno = __walk_indirects(inode, data_pos, &state, 0); if (errno) { return (bbuf_t)INVL_BUFFER; } @@ -953,7 +1030,7 @@ ext2db_acquire(struct v_inode* inode, unsigned int data_pos, bbuf_t* out) ext2walk_init_state(&state); - errno = __walk_indirects(inode, data_pos, &state, true, false); 
+ errno = __walk_indirects(inode, data_pos, &state, WALKMODE_ALLOC); if (errno) { return errno; } @@ -987,36 +1064,36 @@ done: int ext2db_alloc(struct v_inode* inode, bbuf_t* out) { - int free_ino_idx; + int next_free; struct ext2_gdesc* gd; struct ext2_inode* e_inode; struct v_superblock* vsb; - free_ino_idx = ALLOC_FAIL; + next_free = ALLOC_FAIL; e_inode = EXT2_INO(inode); vsb = inode->sb; gd = e_inode->blk_grp; - free_ino_idx = ext2gd_alloc_block(gd); + next_free = ext2gd_alloc_block(gd); // locality alloc failed, try entire fs - if (!valid_bmp_slot(free_ino_idx)) { - free_ino_idx = ext2db_alloc_slot(vsb, &gd); + if (!valid_bmp_slot(next_free)) { + next_free = ext2db_alloc_slot(vsb, &gd); } - if (!valid_bmp_slot(free_ino_idx)) { + if (!valid_bmp_slot(next_free)) { return EDQUOT; } - free_ino_idx += gd->base; - free_ino_idx = ext2_datablock(vsb, free_ino_idx); - free_ino_idx = to_ext2ino_id(free_ino_idx); + next_free += gd->base; + next_free = ext2_datablock(vsb, next_free); - bbuf_t buf = fsblock_get(vsb, free_ino_idx); + bbuf_t buf = fsblock_get(vsb, next_free); if (blkbuf_errbuf(buf)) { return EIO; } + e_inode->nr_fsblks++; *out = buf; return 0; } @@ -1067,7 +1144,6 @@ ext2ino_resizing(struct v_inode* inode, size_t new_size) } __update_inode_size(inode, new_size); - fsblock_dirty(e_ino->buf); if (check_symlink_node(inode)) { return 0; @@ -1080,7 +1156,7 @@ ext2ino_resizing(struct v_inode* inode, size_t new_size) ext2walk_init_state(&state); pos = new_size / fsapi_block_size(inode->sb); - errno = __walk_indirects(inode, pos, &state, false, true); + errno = __walk_indirects(inode, pos, &state, WALKMODE_NOBTLB); if (errno) { return errno; } diff --git a/lunaix-os/kernel/fs/ext2/mount.c b/lunaix-os/kernel/fs/ext2/mount.c index 05fbdce..0d328e7 100644 --- a/lunaix-os/kernel/fs/ext2/mount.c +++ b/lunaix-os/kernel/fs/ext2/mount.c @@ -170,6 +170,8 @@ ext2_mount(struct v_superblock* vsb, struct v_dnode* mnt) ext2sb->raw = rawsb; ext2sb->all_feature = 
__translate_feature(rawsb); + mutex_init(&ext2sb->lock); + fsapi_set_vsb_ops(vsb, &vsb_ops); fsapi_complete_vsb_setup(vsb, ext2sb); @@ -188,6 +190,9 @@ ext2_mount(struct v_superblock* vsb, struct v_dnode* mnt) ext2sb->raw = offset(blkbuf_data(buf), EXT2_BASE_BLKSZ); } + ext2sb->raw->s_mnt_cnt++; + ext2sb->raw->s_mtime = clock_unixtime(); + ext2sb->buf = buf; vfree(rawsb); return 0; diff --git a/lunaix-os/kernel/fs/mount.c b/lunaix-os/kernel/fs/mount.c index b62b77f..1dee55a 100644 --- a/lunaix-os/kernel/fs/mount.c +++ b/lunaix-os/kernel/fs/mount.c @@ -86,8 +86,8 @@ __vfs_do_unmount(struct v_mount* mnt) // detached the inodes from cache, and let lru policy to recycle them for (size_t i = 0; i < VFS_HASHTABLE_SIZE; i++) { - __detach_node_cache_ref(&sb->i_cache[i]); - __detach_node_cache_ref(&sb->d_cache[i]); + __detach_node_cache_ref(&sb->i_cache.pool[i]); + __detach_node_cache_ref(&sb->d_cache.pool[i]); } struct v_dnode *pos, *next; diff --git a/lunaix-os/kernel/fs/vfs.c b/lunaix-os/kernel/fs/vfs.c index 32014d3..2402cc5 100644 --- a/lunaix-os/kernel/fs/vfs.c +++ b/lunaix-os/kernel/fs/vfs.c @@ -102,21 +102,54 @@ vfs_init() vfs_sysroot->parent = vfs_sysroot; vfs_ref_dnode(vfs_sysroot); + lru_remove(dnode_lru, &vfs_sysroot->lru); +} + +void +vfs_vncache_init(struct vncache* cache) +{ + cache->pool = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket)); + rwlock_init(&cache->lock); +} + +void +vfs_vncache_free(struct vncache* cache) +{ + // clear all other reader/writer + rwlock_begin_write(&cache->lock); + vfree(cache->pool); + + // already freed, so as the lock +} + +void +vfs_vncache_add(struct vncache* cache, size_t key, struct hlist_node* node) +{ + struct hbucket* slot; + + cache_atomic_write(cache, + { + slot = &cache->pool[key & VFS_HASH_MASK]; + hlist_delete(node); + hlist_add(&slot->head, node); + }); } static inline struct hbucket* -__dcache_hash(struct v_dnode* parent, u32_t* hash) +__dcache_hash_nolock(struct v_dnode* parent, u32_t* hash) { + struct 
v_superblock* sb; struct hbucket* d_cache; u32_t _hash; + + sb = parent->super_block; - d_cache = parent->super_block->d_cache; _hash = *hash; _hash = _hash ^ (_hash >> VFS_HASHBITS); _hash += (u32_t)__ptr(parent); *hash = _hash; - return &d_cache[_hash & VFS_HASH_MASK]; + return &sb->d_cache.pool[_hash & VFS_HASH_MASK]; } static inline int @@ -135,6 +168,11 @@ __sync_inode_nolock(struct v_inode* inode) struct v_dnode* vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str) { + u32_t hash; + struct hbucket* slot; + struct v_dnode *pos, *n; + struct vncache *dcache; + if (!str->len || HSTR_EQ(str, &vfs_dot)) return parent; @@ -142,16 +180,23 @@ vfs_dcache_lookup(struct v_dnode* parent, struct hstr* str) return parent->parent; } - u32_t hash = str->hash; - struct hbucket* slot = __dcache_hash(parent, &hash); + hash = str->hash; + dcache = dnode_cache(parent); + + vncache_lock_read(dcache); - struct v_dnode *pos, *n; + slot = __dcache_hash_nolock(parent, &hash); hashtable_bucket_foreach(slot, pos, n, hash_list) { - if (pos->name.hash == hash && pos->parent == parent) { - return pos; + if (pos->name.hash != hash || pos->parent != parent) { + continue; } + + vncache_unlock_read(dcache); + return pos; } + + vncache_unlock_read(dcache); return NULL; } @@ -172,14 +217,21 @@ __vfs_touch_inode(struct v_inode* inode, const int type) void vfs_dcache_add(struct v_dnode* parent, struct v_dnode* dnode) { + struct hbucket* bucket; + struct vncache* cache; + assert(parent); + assert(locked_node(parent)); dnode->ref_count = 1; dnode->parent = parent; llist_append(&parent->children, &dnode->siblings); - struct hbucket* bucket = __dcache_hash(parent, &dnode->name.hash); - hlist_add(&bucket->head, &dnode->hash_list); + cache_atomic_write(dnode_cache(parent), + { + bucket = __dcache_hash_nolock(parent, &dnode->name.hash); + hlist_add(&bucket->head, &dnode->hash_list); + }); } void @@ -190,7 +242,12 @@ vfs_dcache_remove(struct v_dnode* dnode) llist_delete(&dnode->siblings); 
llist_delete(&dnode->aka_list); - hlist_delete(&dnode->hash_list); + lru_remove(dnode_lru, &dnode->lru); + + cache_atomic_write(dnode_cache(dnode), + { + hlist_delete(&dnode->hash_list); + }); dnode->parent = NULL; dnode->ref_count = 0; @@ -200,10 +257,14 @@ void vfs_dcache_rehash(struct v_dnode* new_parent, struct v_dnode* dnode) { assert(new_parent); + assert(locked_node(new_parent)); - hstr_rehash(&dnode->name, HSTR_FULL_HASH); - vfs_dcache_remove(dnode); - vfs_dcache_add(new_parent, dnode); + dnode_atomic(dnode, + { + hstr_rehash(&dnode->name, HSTR_FULL_HASH); + vfs_dcache_remove(dnode); + vfs_dcache_add(new_parent, dnode); + }); } int @@ -250,6 +311,8 @@ vfs_open(struct v_dnode* dnode, struct v_file** file) void vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode) { + lock_dnode(assign_to); + if (assign_to->inode) { llist_delete(&assign_to->aka_list); assign_to->inode->link_count--; @@ -258,26 +321,33 @@ vfs_assign_inode(struct v_dnode* assign_to, struct v_inode* inode) llist_append(&inode->aka_dnodes, &assign_to->aka_list); assign_to->inode = inode; inode->link_count++; + + unlock_dnode(assign_to); } int vfs_link(struct v_dnode* to_link, struct v_dnode* name) { int errno; + struct v_inode* inode; + + inode = to_link->inode; if ((errno = vfs_check_writable(to_link))) { return errno; } - lock_inode(to_link->inode); + lock_inode(inode); + if (to_link->super_block->root != name->super_block->root) { errno = EXDEV; - } else if (!to_link->inode->ops->link) { + } else if (!inode->ops->link) { errno = ENOTSUP; - } else if (!(errno = to_link->inode->ops->link(to_link->inode, name))) { - vfs_assign_inode(name, to_link->inode); + } else if (!(errno = inode->ops->link(inode, name))) { + vfs_assign_inode(name, inode); } - unlock_inode(to_link->inode); + + unlock_inode(inode); return errno; } @@ -290,6 +360,11 @@ vfs_pclose(struct v_file* file, pid_t pid) inode = file->inode; + if (vfs_check_duped_file(file)) { + vfs_unref_file(file); + return 0; + } + /* * 
Prevent dead lock. * This happened when process is terminated while blocking on read. @@ -307,13 +382,11 @@ vfs_pclose(struct v_file* file, pid_t pid) * than A. And this will cause a probable race condition on A if other * process is writing to this file later after B exit. */ - mutex_unlock_for(&inode->lock, pid); - - if (vfs_check_duped_file(file)) { - vfs_unref_file(file); - return 0; - } + + // now regain lock for inode syncing + + lock_inode(inode); if ((errno = file->ops->close(file))) { goto done; @@ -322,17 +395,6 @@ vfs_pclose(struct v_file* file, pid_t pid) vfs_unref_dnode(file->dnode); cake_release(file_pile, file); - /* - if the current inode is not being locked by other - threads that does not share same open context, - then we can try to do sync opportunistically - */ - if (mutex_on_hold(&inode->lock)) { - goto done; - } - - lock_inode(inode); - pcache_commit_all(inode); inode->open_count--; @@ -340,9 +402,8 @@ vfs_pclose(struct v_file* file, pid_t pid) __sync_inode_nolock(inode); } - unlock_inode(inode); - done: + unlock_inode(inode); return errno; } @@ -384,12 +445,22 @@ vfs_fsync(struct v_file* file) int vfs_alloc_fdslot(int* fd) { + struct v_fdtable* fdtab; + + fdtab = __current->fdtable; + lock_fdtable(fdtab); + for (size_t i = 0; i < VFS_MAX_FD; i++) { - if (!__current->fdtable->fds[i]) { - *fd = i; - return 0; + if (__current->fdtable->fds[i]) { + continue; } + + *fd = i; + unlock_fdtable(fdtab); + return 0; } + + unlock_fdtable(fdtab); return EMFILE; } @@ -399,9 +470,9 @@ vfs_sb_alloc() struct v_superblock* sb = cake_grab(superblock_pile); memset(sb, 0, sizeof(*sb)); llist_init_head(&sb->sb_list); - - sb->i_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket)); - sb->d_cache = vzalloc(VFS_HASHTABLE_SIZE * sizeof(struct hbucket)); + + vfs_vncache_init(&sb->i_cache); + vfs_vncache_init(&sb->d_cache); sb->ref_count = 1; return sb; @@ -427,25 +498,36 @@ vfs_sb_unref(struct v_superblock* sb) sb->ops.release(sb); } - vfree(sb->i_cache); - 
vfree(sb->d_cache); - + vfs_vncache_free(&sb->i_cache); + vfs_vncache_free(&sb->d_cache); + cake_release(superblock_pile, sb); } -static int +static inline bool +__dnode_evictable(struct v_dnode* dnode) +{ + return dnode->ref_count == 1 + && llist_empty(&dnode->children); +} + +static bool __vfs_try_evict_dnode(struct lru_node* obj) { struct v_dnode* dnode = container_of(obj, struct v_dnode, lru); - if (!dnode->ref_count) { - vfs_d_free(dnode); - return 1; + if (mutex_on_hold(&dnode->lock)) + return false; + + if (!__dnode_evictable(dnode)) { + return false; } - return 0; + + vfs_d_free(dnode); + return true; } -static int +static bool __vfs_try_evict_inode(struct lru_node* obj) { struct v_inode* inode = container_of(obj, struct v_inode, lru); @@ -493,13 +575,14 @@ void vfs_d_free(struct v_dnode* dnode) { assert(dnode->ref_count == 1); - + if (dnode->inode) { assert(dnode->inode->link_count > 0); dnode->inode->link_count--; } vfs_dcache_remove(dnode); + // Make sure the children de-referencing their parent. 
// With lru presented, the eviction will be propagated over the entire // detached subtree eventually @@ -514,6 +597,7 @@ vfs_d_free(struct v_dnode* dnode) } vfs_sb_unref(dnode->super_block); + vfree((void*)dnode->name.value); cake_release(dnode_pile, dnode); } @@ -521,26 +605,32 @@ vfs_d_free(struct v_dnode* dnode) struct v_inode* vfs_i_find(struct v_superblock* sb, u32_t i_id) { - struct hbucket* slot = &sb->i_cache[i_id & VFS_HASH_MASK]; - struct v_inode *pos, *n; - hashtable_bucket_foreach(slot, pos, n, hash_list) + struct hbucket* slot; + struct v_inode *pos, *n, *found = NULL; + + cache_atomic_read(&sb->i_cache, { - if (pos->id == i_id) { + slot = &sb->i_cache.pool[i_id & VFS_HASH_MASK]; + + hashtable_bucket_foreach(slot, pos, n, hash_list) + { + if (pos->id != i_id) { + continue; + } + lru_use_one(inode_lru, &pos->lru); - return pos; + found = pos; + break; } - } + }); - return NULL; + return found; } void vfs_i_addhash(struct v_inode* inode) { - struct hbucket* slot = &inode->sb->i_cache[inode->id & VFS_HASH_MASK]; - - hlist_delete(&inode->hash_list); - hlist_add(&slot->head, &inode->hash_list); + vfs_vncache_add(inode_cache(inode), inode->id, &inode->hash_list); } struct v_inode* @@ -569,6 +659,7 @@ vfs_i_alloc(struct v_superblock* sb) vfs_i_assign_sb(inode, sb); lru_use_one(inode_lru, &inode->lru); + return inode; } @@ -579,6 +670,7 @@ vfs_i_free(struct v_inode* inode) pcache_release(inode->pg_cache); vfree(inode->pg_cache); } + // we don't need to sync inode. // If an inode can be free, then it must be properly closed. // Hence it must be synced already! 
@@ -587,7 +679,10 @@ vfs_i_free(struct v_inode* inode) } vfs_sb_unref(inode->sb); + hlist_delete(&inode->hash_list); + lru_remove(inode_lru, &inode->lru); + cake_release(inode_pile, inode); } @@ -605,10 +700,19 @@ vfs_i_free(struct v_inode* inode) int vfs_getfd(int fd, struct v_fd** fd_s) { - if (TEST_FD(fd) && (*fd_s = __current->fdtable->fds[fd])) { - return 0; + struct v_fdtable* fdtab; + + if (!TEST_FD(fd)) { + return EBADF; } - return EBADF; + + fdtab = __current->fdtable; + + lock_fdtable(fdtab); + *fd_s = __current->fdtable->fds[fd]; + unlock_fdtable(fdtab); + + return !*fd_s ? EBADF : 0; } static int @@ -671,27 +775,33 @@ __vfs_try_locate_file(const char* path, return errno; } + lock_dnode(fdir); + errno = vfs_walk(fdir, name.value, &file, NULL, woption); if (errno && errno != ENOENT) { - goto done; + goto error; + } + + if (!errno && (options & FLOC_MKNAME)) { + errno = EEXIST; + goto error; } if (!errno) { - if ((options & FLOC_MKNAME)) { - errno = EEXIST; - } + // the file present, no need to hold the directory lock + unlock_dnode(fdir); goto done; } // errno == ENOENT if (!options) { - goto done; + goto error; } errno = vfs_check_writable(fdir); if (errno) { - goto done; + goto error; } floc->fresh = true; @@ -699,17 +809,20 @@ __vfs_try_locate_file(const char* path, file = vfs_d_alloc(fdir, &name); if (!file) { - return ENOMEM; + errno = ENOMEM; + goto error; } - lock_dnode(fdir); - vfs_dcache_add(fdir, file); done: floc->dir = fdir; floc->file = file; + + return errno; +error: + unlock_dnode(fdir); return errno; } @@ -869,6 +982,7 @@ __DEFINE_LXSYSCALL2(int, sys_readdir, int, fd, struct lx_dirent*, dent) if ((errno = fd_s->file->ops->readdir(fd_s->file, &dctx)) != 1) { goto unlock; } + dent->d_offset++; fd_s->file->f_pos++; @@ -1105,6 +1219,42 @@ vfs_get_dtype(int itype) return dtype; } +struct v_fdtable* +fdtable_create() +{ + struct v_fdtable* fdtab; + + fdtab = vzalloc(sizeof(struct v_fdtable)); + mutex_init(&fdtab->lock); + + return fdtab; +} + 
+void +fdtable_copy(struct v_fdtable* dest, struct v_fdtable* src) +{ + lock_fdtable(dest); + lock_fdtable(src); + + for (size_t i = 0; i < VFS_MAX_FD; i++) { + struct v_fd* fd = src->fds[i]; + if (!fd) + continue; + vfs_dup_fd(fd, &dest->fds[i]); + } + + unlock_fdtable(dest); + unlock_fdtable(src); +} + +void +fdtable_free(struct v_fdtable* table) +{ + assert(!mutex_on_hold(&table->lock)); + + vfree(table); +} + __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size) { int errno; @@ -1114,11 +1264,12 @@ __DEFINE_LXSYSCALL3(int, realpathat, int, fd, char*, buf, size_t, size) } struct v_dnode* dnode; - errno = vfs_get_path(fd_s->file->dnode, buf, size, 0); - if (errno >= 0) { - return errno; - } + dnode = fd_s->file->dnode; + + lock_dnode(dnode); + errno = vfs_get_path(dnode, buf, size, 0); + unlock_dnode(dnode); done: return DO_STATUS(errno); @@ -1235,10 +1386,13 @@ done: __DEFINE_LXSYSCALL1(int, mkdir, const char*, path) { - int errno = 0; + int errno; + struct hstr name; + struct v_inode* inode; struct v_dnode *parent, *dir; char name_value[VFS_NAME_MAXLEN]; - struct hstr name = HHSTR(name_value, 0, 0); + + name = HHSTR(name_value, 0, 0); if ((errno = vfs_walk_proc(path, &parent, &name, VFS_WALK_PARENT))) { goto done; @@ -1258,7 +1412,7 @@ __DEFINE_LXSYSCALL1(int, mkdir, const char*, path) goto done; } - struct v_inode* inode = parent->inode; + inode = parent->inode; lock_dnode(parent); lock_inode(inode); @@ -1410,12 +1564,14 @@ vfs_dup_fd(struct v_fd* old, struct v_fd** new) int vfs_dup2(int oldfd, int newfd) { + int errno; + struct v_fdtable* fdtab; + struct v_fd *oldfd_s, *newfd_s; + if (newfd == oldfd) { return newfd; } - int errno; - struct v_fd *oldfd_s, *newfd_s; if ((errno = vfs_getfd(oldfd, &oldfd_s))) { goto done; } @@ -1425,16 +1581,26 @@ vfs_dup2(int oldfd, int newfd) goto done; } - newfd_s = __current->fdtable->fds[newfd]; + fdtab = __current->fdtable; + lock_fdtable(fdtab); + + newfd_s = fdtab->fds[newfd]; if (newfd_s && (errno = 
vfs_close(newfd_s->file))) { - goto done; + goto unlock_and_done; } - if (!(errno = vfs_dup_fd(oldfd_s, &newfd_s))) { - __current->fdtable->fds[newfd] = newfd_s; - return newfd; + if ((errno = vfs_dup_fd(oldfd_s, &newfd_s))) { + goto unlock_and_done; } + fdtab->fds[newfd] = newfd_s; + + unlock_fdtable(fdtab); + return newfd; + +unlock_and_done: + unlock_fdtable(fdtab); + done: return DO_STATUS(errno); } @@ -1650,6 +1816,7 @@ vfs_do_rename(struct v_dnode* current, struct v_dnode* target) lock_dnode(current); lock_dnode(target); + if (oldparent) lock_dnode(oldparent); if (newparent) @@ -1678,6 +1845,7 @@ vfs_do_rename(struct v_dnode* current, struct v_dnode* target) cleanup: unlock_dnode(current); + if (oldparent) unlock_dnode(oldparent); if (newparent) diff --git a/lunaix-os/kernel/process/fork.c b/lunaix-os/kernel/process/fork.c index 674e5c6..d68a8f8 100644 --- a/lunaix-os/kernel/process/fork.c +++ b/lunaix-os/kernel/process/fork.c @@ -45,18 +45,6 @@ region_maybe_cow(struct mm_region* region) tlb_flush_vmr_all(region); } -static inline void -__dup_fdtable(struct proc_info* pcb) -{ - for (size_t i = 0; i < VFS_MAX_FD; i++) { - struct v_fd* fd = __current->fdtable->fds[i]; - if (!fd) - continue; - vfs_dup_fd(fd, &pcb->fdtable->fds[i]); - } -} - - static void __dup_kernel_stack(struct thread* thread, ptr_t vm_mnt) { @@ -172,7 +160,7 @@ dup_proc() vfs_ref_dnode(pcb->cwd); } - __dup_fdtable(pcb); + fdtable_copy(pcb->fdtable, __current->fdtable); uscope_copy(&pcb->uscope, current_user_scope()); struct proc_mm* mm = vmspace(pcb); diff --git a/lunaix-os/usr/LBuild b/lunaix-os/usr/LBuild index 4b0617c..320bbeb 100644 --- a/lunaix-os/usr/LBuild +++ b/lunaix-os/usr/LBuild @@ -9,6 +9,8 @@ sources([ "maze", "mkdir", "rm", + "testfork", + "fragfile", ]) compile_opts([ diff --git a/lunaix-os/usr/fragfile.c b/lunaix-os/usr/fragfile.c new file mode 100644 index 0000000..a67b94a --- /dev/null +++ b/lunaix-os/usr/fragfile.c @@ -0,0 +1,71 @@ +#include +#include +#include +#include 
+#include + +static char alphabets[] = "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "01234567890"; + +#define NR_BUFSIZE 4096 +#define NR_NAME_LEN 8 +#define NR_REPEAT 5 + +int main() +{ + unsigned int buf[NR_BUFSIZE]; + char name[NR_NAME_LEN + 1]; + int fd = open("/dev/rand", O_RDONLY); + + if (mkdir("testdir") && errno != EEXIST) + { + printf("Unable to mkdir %d\n", errno); + _exit(1); + } + + if (chdir("testdir")) + { + printf("Unable to chdir %d\n", errno); + _exit(1); + } + + int nr_total = NR_REPEAT * NR_BUFSIZE / NR_NAME_LEN; + + int cnt = 0; + for (int i = 0; i < NR_REPEAT; i++) + { + int n = read(fd, buf, sizeof(buf)); + int j = 0, k = 0; + while (j < n / (int)sizeof(buf[0])) { + name[k++] = alphabets[buf[j++] % (sizeof(alphabets) - 1)]; + + if (k < NR_NAME_LEN) { + continue; + } + + k = 0; + cnt++; + name[NR_NAME_LEN] = 0; + + printf("[%04d/%04d] creating: %s\r", cnt, nr_total, name); + int fd2 = open(name, O_RDONLY | O_CREAT); + + if (fd2 < 0) + { + printf("\n"); + if (errno == EDQUOT) { + printf("Out of quota\n"); + return 0; + } + + printf("Unable to open %d\n", errno); + continue; + } + + close(fd2); + } + } + printf("\n"); + return 0; +} \ No newline at end of file -- 2.27.0