#include <lunaix/fs/api.h>
#include <lunaix/mm/valloc.h>

#include <klibc/string.h>
static struct v_inode_ops ext2_inode_ops = {
    .dir_lookup = ext2dr_lookup,
    .open = ext2_open_inode,
    .read_symlink = ext2_get_symlink,
    .set_symlink = ext2_set_symlink,
    .rename = ext2_rename,
    .unlink = ext2_unlink,
    .create = ext2_create,
    .sync = ext2_sync_inode
};
static struct v_file_ops ext2_file_ops = {
    .close = ext2_close_inode,
    .read = ext2_inode_read,
    .read_page = ext2_inode_read_page,
    .write = ext2_inode_write,
    .write_page = ext2_inode_write_page,
    .readdir = ext2dr_read,
    .seek = ext2_seek_inode,
    .sync = ext2_file_sync
};
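/*
 * bTLB tag encoding (a sketch of the scheme below): the tag is the
 * data-block position with its low inds_lgents bits dropped, plus the
 * most significant bit (msbiti) set as a "valid" flag. Assuming 1024
 * entries per indirect block (inds_lgents == 10), positions
 * 0x2800..0x2bff all share tag (0xa | (1 << msbiti)), so one cached
 * last-level indirect block serves 1024 consecutive leaf lookups.
 */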
#define to_tag(e_ino, val) \
    (((val) >> (e_ino)->inds_lgents) | (1 << msbiti))
#define valid_tag(tag) ((tag) & (1 << msbiti))
__btlb_insert(struct ext2_inode* e_inode, unsigned int blkid, bbuf_t buf)
    struct ext2_btlb* btlb;
    struct ext2_btlb_entry* btlbe = NULL;
    unsigned int cap_sel;

    btlb = e_inode->btlb;

    if (unlikely(!blkid)) {
        return;
    }

    for (int i = 0; i < BTLB_SETS; i++)
    {
        if (valid_tag(btlb->buffer[i].tag)) {
            continue;
        }

        btlbe = &btlb->buffer[i];
        break;
    }
    if (!btlbe) {
        /*
         * We have triggered a capacity miss. Since most file
         * operations are heavily linear with strong locality, we
         * place our bet on that and skip the whole overhead of an
         * LRU eviction scheme: a trivial random eviction does the
         * job just fine.
         */
        cap_sel = hash_32(blkid, ilog2(BTLB_SETS));
        btlbe = &btlb->buffer[cap_sel];

        fsblock_put(btlbe->block);
    }

    btlbe->tag = to_tag(e_inode, blkid);
    btlbe->block = fsblock_take(buf);
__btlb_hit(struct ext2_inode* e_inode, unsigned int blkid)
    struct ext2_btlb* btlb;
    struct ext2_btlb_entry* btlbe = NULL;
    unsigned int in_tag, ref_cnts;

    btlb = e_inode->btlb;
    in_tag = to_tag(e_inode, blkid);

    for (int i = 0; i < BTLB_SETS; i++)
    {
        btlbe = &btlb->buffer[i];
        if (btlbe->tag != in_tag) {
            continue;
        }

        ref_cnts = blkbuf_refcounts(btlbe->block);
        if (unlikely(ref_cnts == 1)) {
            // we hold the only reference: the entry is stale,
            // drop it and report a miss
            fsblock_put(btlbe->block);
            btlbe->block = bbuf_null;
            btlbe->tag = 0;
            return bbuf_null;
        }

        return fsblock_take(btlbe->block);
    }

    return bbuf_null;
__btlb_flushall(struct ext2_inode* e_inode)
    struct ext2_btlb* btlb;
    struct ext2_btlb_entry* btlbe = NULL;

    btlb = e_inode->btlb;

    for (int i = 0; i < BTLB_SETS; i++)
    {
        btlbe = &btlb->buffer[i];
        if (!valid_tag(btlbe->tag)) {
            continue;
        }

        fsblock_put(btlbe->block);
        btlbe->tag = 0;
        btlbe->block = bbuf_null;
    }
ext2db_itbegin(struct ext2_iterator* iter, struct v_inode* inode)
    struct ext2_inode* e_ino;

    e_ino = EXT2_INO(inode);
    *iter = (struct ext2_iterator){
        .inode = inode,
        .blksz = inode->sb->blksize,
        .end_pos = ICEIL(e_ino->isize, inode->sb->blksize)
    };
ext2db_itreset(struct ext2_iterator* iter)
    if (likely(iter->sel_buf)) {
        fsblock_put(iter->sel_buf);
        iter->sel_buf = NULL;
    }
ext2db_itffw(struct ext2_iterator* iter, int count)
ext2db_itend(struct ext2_iterator* iter)
    if (likely(iter->sel_buf)) {
        fsblock_put(iter->sel_buf);
        iter->sel_buf = NULL;
    }
ext2db_itnext(struct ext2_iterator* iter)
    bbuf_t buf;

    if (unlikely(iter->has_error)) {
        return false;
    }

    if (unlikely(iter->pos > iter->end_pos)) {
        return false;
    }

    if (likely(iter->sel_buf)) {
        fsblock_put(iter->sel_buf);
    }

    buf = ext2db_get(iter->inode, iter->pos);
    iter->sel_buf = buf;

    if (!buf || !ext2_itcheckbuf(iter)) {
        return false;
    }

    iter->data = blkbuf_data(buf);
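/*
 * A sketch of how the iterator API above is meant to be driven by a
 * caller scanning every data block of an inode (consume() is a
 * hypothetical callback; error handling elided):
 *
 *     struct ext2_iterator iter;
 *
 *     ext2db_itbegin(&iter, inode);
 *     while (ext2db_itnext(&iter)) {
 *         consume(iter.data, iter.blksz);
 *     }
 *     ext2db_itend(&iter);
 */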
ext2ino_init(struct v_superblock* vsb, struct v_inode* inode)
    // Placeholder, to make vsb happy
__destruct_ext2_inode(struct ext2_inode* e_inode)
    __btlb_flushall(e_inode);

    fsblock_put(e_inode->ind_ord1);
    fsblock_put(e_inode->buf);

    ext2gd_put(e_inode->blk_grp);

    vfree_safe(e_inode->symlink);
    vfree(e_inode->btlb);
ext2_destruct_inode(struct v_inode* inode)
    struct ext2_inode* e_inode;

    e_inode = EXT2_INO(inode);

    __destruct_ext2_inode(e_inode);
__ext2ino_fill_common(struct v_inode* inode, ino_t ino_id)
    fsapi_inode_setid(inode, ino_id, ino_id);
    fsapi_inode_setfops(inode, &ext2_file_ops);
    fsapi_inode_setops(inode, &ext2_inode_ops);
    fsapi_inode_setdector(inode, ext2_destruct_inode);
__translate_vfs_itype(unsigned int v_itype)
    unsigned int e_itype = IMODE_IFREG;

    if (v_itype == VFS_IFFILE) {
        e_itype = IMODE_IFREG;
    }
    else if (check_itype(v_itype, VFS_IFDIR)) {
        e_itype = IMODE_IFDIR;
        e_itype |= IMODE_UEX;
    }
    else if (check_itype(v_itype, VFS_IFSEQDEV)) {
        e_itype = IMODE_IFCHR;
    }
    else if (check_itype(v_itype, VFS_IFVOLDEV)) {
        e_itype = IMODE_IFBLK;
    }

    if (check_itype(v_itype, VFS_IFSYMLINK)) {
        e_itype |= IMODE_IFLNK;
    }

    // FIXME: keep this until we have our own user manager
    e_itype |= (IMODE_URD | IMODE_GRD | IMODE_ORD);

    return e_itype;
ext2ino_fill(struct v_inode* inode, ino_t ino_id)
    int errno;
    struct ext2_sbinfo* sb;
    struct ext2_inode* e_ino;
    struct v_superblock* vsb;
    struct ext2b_inode* b_ino;
    unsigned int type = VFS_IFFILE;

    vsb = inode->sb;
    if ((errno = ext2ino_get(vsb, ino_id, &e_ino))) {
        return errno;
    }

    b_ino = e_ino->ino;
    ino_id = e_ino->ino_id;

    fsapi_inode_setsize(inode, e_ino->isize);
    fsapi_inode_settime(inode, b_ino->i_ctime,
                        b_ino->i_mtime, b_ino->i_atime);

    __ext2ino_fill_common(inode, ino_id);

    if (check_itype(b_ino->i_mode, IMODE_IFLNK)) {
        type = VFS_IFSYMLINK;
    }
    else if (check_itype(b_ino->i_mode, IMODE_IFDIR)) {
        type = VFS_IFDIR;
    }
    else if (check_itype(b_ino->i_mode, IMODE_IFCHR)) {
        type = VFS_IFSEQDEV;
    }
    else if (check_itype(b_ino->i_mode, IMODE_IFBLK)) {
        type = VFS_IFVOLDEV;
    }

    fsapi_inode_settype(inode, type);

    fsapi_inode_complete(inode, e_ino);

    return 0;
__get_group_desc(struct v_superblock* vsb, int ino,
                 struct ext2_gdesc** gd_out)
    unsigned int blkgrp_id;
    struct ext2_sbinfo* sb;

    sb = EXT2_SB(vsb);

    blkgrp_id = to_fsblock_id(ino) / sb->raw->s_ino_per_grp;
    return ext2gd_take(vsb, blkgrp_id, gd_out);
static struct ext2b_inode*
__get_raw_inode(struct v_superblock* vsb, struct ext2_gdesc* gd,
                bbuf_t* buf_out, int ino_index)
    bbuf_t ino_tab;
    struct ext2_sbinfo* sb;
    struct ext2b_inode* b_inode;
    unsigned int ino_tab_sel, ino_tab_off, tab_partlen;

    sb = EXT2_SB(vsb);

    tab_partlen = sb->block_size / sb->raw->s_ino_size;
    ino_tab_sel = ino_index / tab_partlen;
    ino_tab_off = ino_index % tab_partlen;

    ino_tab = fsblock_get(vsb, gd->info->bg_ino_tab + ino_tab_sel);
    if (blkbuf_errbuf(ino_tab)) {
        return NULL;
    }

    b_inode = (struct ext2b_inode*)blkbuf_data(ino_tab);
    b_inode = &b_inode[ino_tab_off];

    *buf_out = ino_tab;
    return b_inode;
static struct ext2_inode*
__create_inode(struct v_superblock* vsb, struct ext2_gdesc* gd, int ino_index)
    bbuf_t ino_tab;
    size_t inds_blks;
    struct ext2_sbinfo* sb;
    struct ext2b_inode* b_inode;
    struct ext2_inode* inode;
    unsigned int ind_ents;

    sb = EXT2_SB(vsb);
    b_inode = __get_raw_inode(vsb, gd, &ino_tab, ino_index);
    if (!b_inode) {
        return NULL;
    }

    inode = vzalloc(sizeof(*inode));
    inode->btlb = vzalloc(sizeof(struct ext2_btlb));
    inode->buf = ino_tab;
    inode->ino = b_inode;
    inode->blk_grp = gd;

    inode->isize = b_inode->i_size;
    if (ext2_feature(vsb, FEAT_LARGE_FILE)) {
        inode->isize |= (size_t)((u64_t)(b_inode->i_size_h32) << 32);
    }

    if (b_inode->i_blocks) {
        // deduce the number of indirect blocks: i_blocks counts all
        // 512-byte sectors backing the file, data and indirection alike
        inds_blks = (size_t)b_inode->i_blocks;
        inds_blks -= ICEIL(inode->isize, 512);
        inds_blks /= (sb->block_size / 512);

        inode->indirect_blocks = inds_blks;
    }

    ind_ents = sb->block_size / sizeof(int);
    assert(is_pot(ind_ents));

    inode->inds_lgents = ilog2(ind_ents);
    inode->ino_id = gd->ino_base + to_ext2ino_id(ino_index);

    return inode;
ext2ino_get_fast(struct v_superblock* vsb,
                 unsigned int ino, struct ext2_fast_inode* fast_ino)
    int errno;
    bbuf_t ino_tab;
    struct ext2_gdesc* gd;
    struct ext2_sbinfo* sb;
    struct ext2b_inode* b_inode;
    unsigned int ino_rel_id;

    errno = __get_group_desc(vsb, ino, &gd);
    if (errno) {
        return errno;
    }

    sb = EXT2_SB(vsb);
    ino_rel_id = to_fsblock_id(ino) % sb->raw->s_ino_per_grp;
    b_inode = __get_raw_inode(vsb, gd, &ino_tab, ino_rel_id);

    fast_ino->buf = ino_tab;
    fast_ino->ino = b_inode;

    return 0;
ext2ino_get(struct v_superblock* vsb,
            unsigned int ino, struct ext2_inode** out)
    int errno;
    struct ext2_sbinfo* sb;
    struct ext2_inode* inode;
    struct ext2_gdesc* gd;
    struct ext2b_inode* b_inode;
    unsigned int ino_rel_id;
    unsigned int tab_partlen;
    unsigned int ind_ents, prima_ind;

    sb = EXT2_SB(vsb);

    if ((errno = __get_group_desc(vsb, ino, &gd))) {
        return errno;
    }

    ino_rel_id = to_fsblock_id(ino) % sb->raw->s_ino_per_grp;
    inode = __create_inode(vsb, gd, ino_rel_id);

    b_inode = inode->ino;
    prima_ind = b_inode->i_block.ind1;

    // preload the first-order indirect block, if present
    if (prima_ind) {
        inode->ind_ord1 = fsblock_get(vsb, prima_ind);
        if (blkbuf_errbuf(inode->ind_ord1)) {
            return EIO;
        }
    }

    *out = inode;
    return 0;
ext2ino_alloc(struct v_superblock* vsb,
              struct ext2_inode* hint, struct ext2_inode** out)
    int free_ino_idx;
    struct ext2_gdesc* gd;
    struct ext2_inode* inode;

    free_ino_idx = ALLOC_FAIL;
    if (hint) {
        gd = hint->blk_grp;
        free_ino_idx = ext2gd_alloc_inode(gd);
    }

    // locality hinted alloc failed, try the entire fs
    if (!valid_bmp_slot(free_ino_idx)) {
        free_ino_idx = ext2ino_alloc_slot(vsb, &gd);
    }

    if (!valid_bmp_slot(free_ino_idx)) {
        return ENOSPC;
    }

    inode = __create_inode(vsb, gd, free_ino_idx);
    if (!inode) {
        ext2gd_free_inode(gd, free_ino_idx);
        return ENOMEM;
    }

    memset(inode->ino, 0, sizeof(*inode->ino));
    fsblock_dirty(inode->buf);

    *out = inode;
    return 0;
__free_block_at(struct v_superblock *vsb, unsigned int block_pos)
    int errno;
    unsigned int gd_index;
    struct ext2_gdesc* gd;
    struct ext2_sbinfo * sb;

    sb = EXT2_SB(vsb);

    block_pos = ext2_datablock(vsb, block_pos);
    gd_index = block_pos / sb->raw->s_blk_per_grp;

    if ((errno = ext2gd_take(vsb, gd_index, &gd))) {
        return errno;
    }

    assert(block_pos >= gd->base);
    ext2gd_free_block(gd, block_pos - gd->base);

    ext2gd_put(gd);
    return 0;
__free_recursive_from(struct v_superblock *vsb, struct ext2_inode* inode,
                      struct walk_stack* stack, int depth)
    int errno = 0;
    int idx, len;
    bbuf_t tab;
    u32_t* db_tab;

    int ind_entries = 1 << inode->inds_lgents;
    int max_len[] = { 15, ind_entries, ind_entries, ind_entries };

    u32_t* tables = stack->tables;
    u32_t* indices = stack->indices;

    if (depth > MAX_INDS_DEPTH || !tables[depth]) {
        return 0;
    }

    idx = indices[depth];
    len = max_len[depth];
    tab = fsblock_get(vsb, ext2_datablock(vsb, tables[depth]));

    if (blkbuf_errbuf(tab)) {
        return EIO;
    }

    db_tab = blkbuf_data(tab);
    if (!depth) {
        // depth 0 is the inode itself: index into its i_block array
        int offset = offsetof(struct ext2b_inode, i_block_arr);
        db_tab = offset(db_tab, offset);
    }

    for (; idx < len; idx++)
    {
        u32_t db_id = db_tab[idx];
        if (!db_id) {
            continue;
        }

        if (depth >= MAX_INDS_DEPTH) {
            __free_block_at(vsb, db_id);
            continue;
        }

        tables[depth + 1] = db_id;
        errno = __free_recursive_from(vsb, inode, stack, depth + 1);
        if (errno) {
            break;
        }

        // deeper levels restart from slot 0 for subsequent subtrees
        indices[depth + 1] = 0;

        // the children are gone, now free the indirect block itself
        __free_block_at(vsb, db_id);
    }

    fsblock_put(tab);
    return errno;
ext2ino_free(struct v_inode* inode)
    unsigned int ino_slot;
    struct ext2_inode* e_ino;
    struct ext2_gdesc* e_gd;
    struct ext2b_inode* b_ino;
    struct ext2_sbinfo* sb;

    sb = EXT2_SB(inode->sb);
    e_ino = EXT2_INO(inode);
    b_ino = e_ino->ino;
    e_gd = e_ino->blk_grp;

    assert_fs(b_ino->i_lnk_cnt > 0);
    fsblock_dirty(e_ino->buf);

    b_ino->i_lnk_cnt--;
    if (b_ino->i_lnk_cnt >= 1) {
        return 0;
    }

    // last link dropped: release the data, then the inode itself
    ext2ino_resizing(inode, 0);

    ino_slot = e_ino->ino_id;
    ino_slot = to_fsblock_id(ino_slot - e_gd->base);
    ext2gd_free_inode(e_ino->blk_grp, ino_slot);

    __destruct_ext2_inode(e_ino);

    return 0;
__update_inode_access_metadata(struct ext2b_inode* b_ino,
                               struct v_inode* inode)
    b_ino->i_ctime = inode->ctime;
    b_ino->i_atime = inode->atime;
    b_ino->i_mtime = inode->mtime;
__update_inode_size(struct v_inode* inode, size_t size)
    struct ext2b_inode* b_ino;
    struct ext2_inode* e_ino;

    e_ino = EXT2_INO(inode);
    b_ino = e_ino->ino;
    e_ino->isize = size;

    if (ext2_feature(inode->sb, FEAT_LARGE_FILE)) {
        b_ino->i_size_l32 = (unsigned int)size;
        b_ino->i_size_h32 = (unsigned int)((u64_t)size >> 32);
    }
    else {
        b_ino->i_size = size;
    }

    // i_blocks is in 512-byte sectors: data sectors plus the sectors
    // occupied by the indirect blocks
    b_ino->i_blocks = ICEIL(size, 512);
    b_ino->i_blocks += e_ino->indirect_blocks
                        * (fsapi_block_size(inode->sb) / 512);
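/*
 * A sketch of the bookkeeping above: a 6000-byte file on 1 KiB blocks
 * that owns one indirect block gets ICEIL(6000, 512) = 12 data sectors
 * plus 1 * (1024 / 512) = 2 indirection sectors, so i_blocks = 14.
 */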
ext2ino_make(struct v_superblock* vsb, unsigned int itype,
             struct ext2_inode* hint, struct v_inode** out)
    int errno;
    struct ext2_inode* e_ino;
    struct ext2b_inode* b_ino;
    struct v_inode* inode;

    errno = ext2ino_alloc(vsb, hint, &e_ino);
    if (errno) {
        return errno;
    }

    b_ino = e_ino->ino;
    inode = vfs_i_alloc(vsb);

    __ext2ino_fill_common(inode, e_ino->ino_id);

    __update_inode_access_metadata(b_ino, inode);
    b_ino->i_mode = __translate_vfs_itype(itype);

    fsapi_inode_settype(inode, itype);
    fsapi_inode_complete(inode, e_ino);

    *out = inode;
    return 0;
ext2_create(struct v_inode* this, struct v_dnode* dnode, unsigned int itype)
    int errno;
    struct v_inode* created;

    errno = ext2ino_make(this->sb, itype, EXT2_INO(this), &created);
    if (errno) {
        return errno;
    }

    return ext2_link(created, dnode);
ext2_link(struct v_inode* this, struct v_dnode* new_name)
    int errno;
    struct v_inode* parent;
    struct ext2_inode* e_ino;
    struct ext2_dnode* e_dno;
    struct ext2b_dirent dirent;

    e_ino = EXT2_INO(this);
    parent = fsapi_dnode_parent(new_name);

    ext2dr_setup_dirent(&dirent, this, &new_name->name);
    ext2ino_linkto(e_ino, &dirent);

    errno = ext2dr_insert(parent, &dirent, &e_dno);
    if (errno) {
        return errno;
    }

    new_name->data = e_dno;
    vfs_assign_inode(new_name, this);

    return 0;
ext2_unlink(struct v_inode* this, struct v_dnode* name)
    int errno;
    struct ext2_inode* e_ino;
    struct ext2_dnode* e_dno;

    e_ino = EXT2_INO(this);
    e_dno = EXT2_DNO(name);

    assert_fs(e_dno->self.dirent->inode == e_ino->ino_id);

    errno = ext2dr_remove(e_dno);
    if (errno) {
        return errno;
    }

    return ext2ino_free(this);
ext2ino_update(struct v_inode* inode)
    struct ext2_inode* e_ino;

    e_ino = EXT2_INO(inode);
    __update_inode_access_metadata(e_ino->ino, inode);

    fsblock_dirty(e_ino->buf);
/* ******************* Data Blocks ******************* */
__walkstate_set_stack(struct walk_state* state, int depth,
                      bbuf_t tab, unsigned int index)
    state->stack.tables[depth] = fsblock_id(tab);
    state->stack.indices[depth] = index;
/**
 * @brief Walk the indirection chain given the position of a data
 * block relative to the inode. Upon completion, walk_state is
 * populated with the result. On error, walk_state is untouched.
 *
 * Note, the result is always one level above the stopping point.
 * That is, if pos points directly to a file-content block (i.e., a
 * leaf block), then the state holds the indirect block that contains
 * the ID of that leaf block.
 *
 * If `resolve` is set, any absence encountered during the walk is
 * resolved by allocating and chaining in indirect blocks. This
 * requires the file system to be mounted writable.
 *
 * @param inode inode to walk
 * @param pos flattened data block position to be located
 * @param state contains the walk result
 * @param resolve whether to auto-allocate the indirection structure
 *                during the walk if `pos` does not exist
 */
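/*
 * A sketch of the fan-out, assuming 4 KiB blocks (1024 u32 entries
 * per indirect block, so inds_lgents == 10): the 12 direct slots in
 * i_block cover the first 12 positions, a single indirect block the
 * next 1024, a double indirect chain the next 1024^2, and a triple
 * indirect chain the next 1024^3. In every case the returned state
 * references the table that holds the leaf's block ID, per the note
 * above.
 */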
__walk_indirects(struct v_inode* inode, unsigned int pos,
                 struct walk_state* state, bool resolve, bool full_walk)
    int errno = 0;
    int inds, stride, shifts, level;
    unsigned int *slotref, index, next, mask;
    struct ext2_inode* e_inode;
    struct ext2b_inode* b_inode;
    struct v_superblock* vsb;
    bbuf_t table, next_table;

    e_inode = EXT2_INO(inode);
    b_inode = e_inode->ino;
    vsb = inode->sb;

    // resolving requires a writable mount
    resolve = resolve && !EXT2_SB(vsb)->read_only;
    // direct blocks: the slot lives within the inode itself
    if (pos < 12) {
        inds = 0;
        level = 0;
        index = pos;
        slotref = &b_inode->i_block_arr[pos];
        table = fsblock_take(e_inode->buf);
        goto done;
    }
    stride = e_inode->inds_lgents;
    if (!(pos >> stride)) {
        inds = 1;
    }
    else if (!(pos >> (stride * 2))) {
        inds = 2;
    }
    else if (!(pos >> (stride * 3))) {
        inds = 3;
    }
    else {
        fail("unrealistic block pos");
    }
    // the bTLB caches the last-level indirect block
    if (!full_walk && (table = __btlb_hit(e_inode, pos))) {
        level = inds - 1;
        index = pos & ((1 << stride) - 1);
        slotref = &block_buffer(table, u32_t)[index];
        goto done;
    }
    shifts = stride * (inds - 1);
    mask = ((1 << stride) - 1) << shifts;

    index = 12 + inds - 1;
    slotref = &b_inode->i_block.inds[inds - 1];
    table = fsblock_take(e_inode->buf);
    level = 0;
    for (; level < inds; level++)
    {
        __walkstate_set_stack(state, level, table, index);

        next = *slotref;
        if (!next) {
            if (!resolve) {
                break;
            }

            if ((errno = ext2db_alloc(inode, &next_table))) {
                break;
            }

            // chain the fresh indirect block into the parent slot
            e_inode->indirect_blocks++;
            *slotref = fsblock_id(next_table);
            fsblock_dirty(table);
        }
        else {
            next_table = fsblock_get(vsb, next);
        }

        fsblock_put(table);
        table = next_table;

        if (blkbuf_errbuf(table)) {
            errno = EIO;
            break;
        }

        index = (pos & mask) >> shifts;
        slotref = &block_buffer(table, u32_t)[index];

        mask = mask >> stride;
        shifts -= stride;
    }

    __btlb_insert(e_inode, pos, table);
done:
    assert(blkbuf_refcounts(table) >= 1);

    state->slot_ref = slotref;
    state->table = table;
    state->level = level;
    state->indirections = inds;

    __walkstate_set_stack(state, level, table, index);

    return errno;
ext2db_get(struct v_inode* inode, unsigned int data_pos)
    int errno;
    unsigned int blkid;
    struct walk_state state;

    ext2walk_init_state(&state);

    errno = __walk_indirects(inode, data_pos, &state, false, false);
    if (errno) {
        return (bbuf_t)INVL_BUFFER;
    }

    blkid = *state.slot_ref;

    ext2walk_free_state(&state);

    if (!blkid) {
        // the position is a hole, nothing is mapped there
        return bbuf_null;
    }

    return fsblock_get(inode->sb, blkid);
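/*
 * Unlike ext2db_get() above, ext2db_acquire() below resolves holes:
 * it walks with resolve=true so missing indirect blocks get chained
 * in, and if the leaf slot itself is still empty it allocates a fresh
 * data block and records it in the indirect table.
 */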
ext2db_acquire(struct v_inode* inode, unsigned int data_pos, bbuf_t* out)
    int errno;
    bbuf_t buf;
    unsigned int block_id;
    struct walk_state state;

    ext2walk_init_state(&state);

    errno = __walk_indirects(inode, data_pos, &state, true, false);
    if (errno) {
        return errno;
    }

    block_id = *state.slot_ref;
    if (block_id) {
        buf = fsblock_get(inode->sb, block_id);
    }
    else {
        // a hole: back it with a freshly allocated block
        errno = ext2db_alloc(inode, &buf);
        if (errno) {
            ext2walk_free_state(&state);
            return errno;
        }

        *state.slot_ref = fsblock_id(buf);
        fsblock_dirty(state.table);
    }

    ext2walk_free_state(&state);

    if (blkbuf_errbuf(buf)) {
        return EIO;
    }

    *out = buf;
    return 0;
ext2db_alloc(struct v_inode* inode, bbuf_t* out)
    int free_ino_idx;
    struct ext2_gdesc* gd;
    struct ext2_inode* e_inode;
    struct v_superblock* vsb;

    free_ino_idx = ALLOC_FAIL;
    e_inode = EXT2_INO(inode);
    vsb = inode->sb;

    gd = e_inode->blk_grp;
    free_ino_idx = ext2gd_alloc_block(gd);

    // locality alloc failed, try the entire fs
    if (!valid_bmp_slot(free_ino_idx)) {
        free_ino_idx = ext2db_alloc_slot(vsb, &gd);
    }

    if (!valid_bmp_slot(free_ino_idx)) {
        return ENOSPC;
    }

    free_ino_idx += gd->base;
    free_ino_idx = ext2_datablock(vsb, free_ino_idx);
    free_ino_idx = to_ext2ino_id(free_ino_idx);

    bbuf_t buf = fsblock_get(vsb, free_ino_idx);
    if (blkbuf_errbuf(buf)) {
        return EIO;
    }

    *out = buf;
    return 0;
ext2db_free_pos(struct v_inode* inode, unsigned int block_pos)
    struct ext2_inode* e_inode;
    struct ext2_gdesc* gd;

    e_inode = EXT2_INO(inode);
    gd = e_inode->blk_grp;

    assert(block_pos >= gd->base);

    block_pos -= gd->base;

    ext2gd_free_block(gd, block_pos);
ext2db_free(struct v_inode* inode, bbuf_t buf)
    assert(blkbuf_not_shared(buf));

    ext2db_free_pos(inode, blkbuf_id(buf));
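/*
 * ext2ino_resizing() below shrinks (or logically grows) an inode. On
 * a shrink it walks to the data-block position of the new end and
 * recursively frees everything from that point down the indirection
 * chain. For example, ext2ino_free() above truncates with:
 *
 *     ext2ino_resizing(inode, 0);   // releases all data blocks
 */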
ext2ino_resizing(struct v_inode* inode, size_t new_size)
    int errno;
    size_t oldsize;
    unsigned int pos;
    struct walk_state state;
    struct ext2_inode* e_ino;
    struct ext2b_inode* b_ino;

    e_ino = EXT2_INO(inode);
    b_ino = e_ino->ino;
    oldsize = e_ino->isize;

    if (oldsize == new_size) {
        return 0;
    }

    __update_inode_size(inode, new_size);
    fsblock_dirty(e_ino->buf);

    if (check_symlink_node(inode)) {
        // symlink target lives inline, no data blocks to touch
        return 0;
    }

    if (oldsize < new_size) {
        // growing: the new blocks are allocated lazily on access
        return 0;
    }

    ext2walk_init_state(&state);

    pos = new_size / fsapi_block_size(inode->sb);
    errno = __walk_indirects(inode, pos, &state, false, true);
    if (errno) {
        return errno;
    }

    errno = __free_recursive_from(inode->sb, e_ino, &state.stack, 0);

    ext2walk_free_state(&state);

    return errno;