1 #include <lunaix/fs/api.h>
2 #include <lunaix/mm/valloc.h>
4 #include <klibc/string.h>
/* VFS inode-operation table shared by every ext2 inode
 * (installed via fsapi_inode_setops in __ext2ino_fill_common). */
static struct v_inode_ops ext2_inode_ops = {
    .dir_lookup = ext2dr_lookup,
    .open = ext2_open_inode,
    .read_symlink = ext2_get_symlink,
    .set_symlink = ext2_set_symlink,
    .rename = ext2_rename,
    .unlink = ext2_unlink,
    .create = ext2_create,
    .sync = ext2_sync_inode
/* VFS file-operation table shared by every ext2 inode
 * (installed via fsapi_inode_setfops in __ext2ino_fill_common). */
static struct v_file_ops ext2_file_ops = {
    .close = ext2_close_inode,
    .read = ext2_inode_read,
    .read_page = ext2_inode_read_page,
    .write = ext2_inode_write,
    .write_page = ext2_inode_write_page,
    .readdir = ext2dr_read,
    .seek = ext2_seek_inode,
    .sync = ext2_file_sync
/*
 * bTLB tag helpers: fold the block position down by the per-inode
 * log2(entries-per-indirect-block) shift, then set the bit selected
 * by `msbiti` as a "valid" marker so a real tag can never be zero.
 * NOTE(review): `msbiti` is defined outside this view — presumably
 * the index of the MSB of the tag type; confirm in full source.
 */
#define to_tag(e_ino, val) \
    (((val) >> (e_ino)->inds_lgents) | (1 << msbiti))
#define valid_tag(tag) ((tag) & (1 << msbiti))
/*
 * Cache a (blkid -> indirect-block buffer) mapping in the inode's
 * block-TLB. The cache takes its own reference on `buf`.
 * NOTE(review): lines are elided in this view; only the surviving
 * fragment is shown below.
 */
__btlb_insert(struct ext2_inode* e_inode, unsigned int blkid, bbuf_t buf)
    struct ext2_btlb* btlb;
    struct ext2_btlb_entry* btlbe = NULL;

    // block id 0 is not a cacheable position
    if (unlikely(!blkid)) {

    // scan the ways for a usable slot
    for (int i = 0; i < BTLB_SETS; i++)
        if (valid_tag(btlb->buffer[i].tag)) {
            btlbe = &btlb->buffer[i];

    /*
     * We have triggered a capacity miss. Since most file operations
     * are heavily linear with strong locality, we place our bet on
     * that and avoid the whole overhead of LRU eviction; a trivial
     * random (hash-selected) eviction does the job fine.
     */
    cap_sel = hash_32(blkid, ilog2(BTLB_SETS));
    btlbe = &btlb->buffer[cap_sel];

    // drop the evicted entry's buffer reference before reuse
    fsblock_put(btlbe->block);

    btlbe->tag = to_tag(e_inode, blkid);
    btlbe->block = fsblock_take(buf);
/*
 * Probe the inode's bTLB for `blkid`. On a hit the cached indirect
 * block is returned with an extra reference taken; entries are also
 * checked via blkbuf_refcounts and can be reset to bbuf_null
 * (invalidation path — intervening lines elided in this view).
 */
__btlb_hit(struct ext2_inode* e_inode, unsigned int blkid)
    struct ext2_btlb* btlb;
    struct ext2_btlb_entry* btlbe = NULL;
    unsigned int in_tag, ref_cnts;

    in_tag = to_tag(e_inode, blkid);   // tag to match against

    for (int i = 0; i < BTLB_SETS; i++)
        btlbe = &btlb->buffer[i];

        // tag mismatch: not our entry
        if (btlbe->tag != in_tag) {

        ref_cnts = blkbuf_refcounts(btlbe->block);
        btlbe->block = bbuf_null;

        // hit: hand back an owned reference
        return fsblock_take(btlbe->block);
/* Release every valid bTLB entry held by this inode
 * (used by __destruct_ext2_inode). */
__btlb_flushall(struct ext2_inode* e_inode)
    struct ext2_btlb* btlb;
    struct ext2_btlb_entry* btlbe = NULL;

    btlb = e_inode->btlb;

    for (int i = 0; i < BTLB_SETS; i++)
        btlbe = &btlb->buffer[i];
        // skip slots that hold no valid mapping
        if (!valid_tag(btlbe->tag)) {

        // drop the cached buffer reference
        fsblock_put(btlbe->block);
/*
 * Initialize a data-block iterator over `inode`'s content.
 * end_pos is the number of blocks covering isize (ceiling division).
 */
ext2db_itbegin(struct ext2_iterator* iter, struct v_inode* inode)
    struct ext2_inode* e_ino;

    e_ino = EXT2_INO(inode);
    *iter = (struct ext2_iterator){
        .blksz = inode->sb->blksize,
        .end_pos = ICEIL(e_ino->isize, inode->sb->blksize)
/* Reset the iterator, releasing the currently-selected buffer if any. */
ext2db_itreset(struct ext2_iterator* iter)
    if (likely(iter->sel_buf)) {
        fsblock_put(iter->sel_buf);
        iter->sel_buf = NULL;
/* Fast-forward the iterator by `count` positions
 * (body elided in this view). */
ext2db_itffw(struct ext2_iterator* iter, int count)
/* Terminate iteration: release the selected buffer if one is held. */
ext2db_itend(struct ext2_iterator* iter)
    if (likely(iter->sel_buf)) {
        fsblock_put(iter->sel_buf);
        iter->sel_buf = NULL;
/*
 * Advance the iterator: fetch the block at iter->pos via ext2db_get
 * and expose its payload through iter->data.
 */
ext2db_itnext(struct ext2_iterator* iter)
    // a previous step already failed; do not continue
    if (unlikely(iter->has_error)) {

    // past the last block covering the file size
    if (unlikely(iter->pos > iter->end_pos)) {

    // release the previously selected buffer
    if (likely(iter->sel_buf)) {
        fsblock_put(iter->sel_buf);

    buf = ext2db_get(iter->inode, iter->pos);

    // fetch failed or buffer did not pass the iterator's check
    if (!buf || !ext2_itcheckbuf(iter)) {

    iter->data = blkbuf_data(buf);
/* Superblock hook invoked for inode initialization. */
ext2ino_init(struct v_superblock* vsb, struct v_inode* inode)
    // Placeholder, to make vsb happy
/*
 * Tear down an ext2_inode: flush the bTLB, drop held block buffers,
 * release the group-descriptor reference, and free owned memory.
 */
__destruct_ext2_inode(struct ext2_inode* e_inode)
    __btlb_flushall(e_inode);

    fsblock_put(e_inode->ind_ord1);   // first-order indirect block buffer
    fsblock_put(e_inode->buf);        // inode-table block buffer

    ext2gd_put(e_inode->blk_grp);

    vfree_safe(e_inode->symlink);     // _safe: tolerates NULL, presumably
    vfree(e_inode->btlb);
/* VFS destructor hook: release the ext2-private inode state. */
ext2_destruct_inode(struct v_inode* inode)
    struct ext2_inode* e_inode;

    e_inode = EXT2_INO(inode);

    __destruct_ext2_inode(e_inode);
/* Wire the ids, op tables, and destructor common to all ext2 inodes. */
__ext2ino_fill_common(struct v_inode* inode, ino_t ino_id)
    fsapi_inode_setid(inode, ino_id, ino_id);
    fsapi_inode_setfops(inode, &ext2_file_ops);
    fsapi_inode_setops(inode, &ext2_inode_ops);
    fsapi_inode_setdector(inode, ext2_destruct_inode);
/*
 * Map a VFS inode-type bitset onto the on-disk ext2 i_mode type bits.
 * Defaults to a regular file when no recognized type bit is set.
 */
__translate_vfs_itype(unsigned int v_itype)
    unsigned int e_itype = IMODE_IFREG;

    if (v_itype == VFS_IFFILE) {
        e_itype = IMODE_IFREG;

    else if (check_itype(v_itype, VFS_IFDIR)) {
        e_itype = IMODE_IFDIR;
        e_itype |= IMODE_UEX;   // owner-execute so the dir can be traversed

    else if (check_itype(v_itype, VFS_IFSEQDEV)) {
        e_itype = IMODE_IFCHR;  // sequential device -> character device

    else if (check_itype(v_itype, VFS_IFVOLDEV)) {
        e_itype = IMODE_IFBLK;  // volume device -> block device

    // symlink is an additional flag on top of the base type
    if (check_itype(v_itype, VFS_IFSYMLINK)) {
        e_itype |= IMODE_IFLNK;

    // FIXME we keep this until we have our own user manager
    e_itype |= (IMODE_URD | IMODE_GRD | IMODE_ORD);
/*
 * Populate a VFS inode from its on-disk ext2 record identified by
 * `ino_id`: size, timestamps, access bits, owner, type and callbacks.
 */
ext2ino_fill(struct v_inode* inode, ino_t ino_id)
    struct ext2_sbinfo* sb;
    struct ext2_inode* e_ino;
    struct v_superblock* vsb;
    struct ext2b_inode* b_ino;
    unsigned int type = VFS_IFFILE;   // default unless i_mode says otherwise

    if ((errno = ext2ino_get(vsb, ino_id, &e_ino))) {

    ino_id = e_ino->ino_id;

    fsapi_inode_setsize(inode, e_ino->isize);

    fsapi_inode_settime(inode, b_ino->i_ctime,

    fsapi_inode_setaccess(inode, b_ino->i_mode & IMODE_ACL_MASK);
    fsapi_inode_setowner(inode, b_ino->i_uid,

    __ext2ino_fill_common(inode, ino_id);

    // translate the on-disk i_mode type bits back into a VFS type
    if (check_itype(b_ino->i_mode, IMODE_IFLNK)) {
        type = VFS_IFSYMLINK;

    else if (check_itype(b_ino->i_mode, IMODE_IFDIR)) {

    else if (check_itype(b_ino->i_mode, IMODE_IFCHR)) {

    else if (check_itype(b_ino->i_mode, IMODE_IFBLK)) {

    fsapi_inode_settype(inode, type);

    fsapi_inode_complete(inode, e_ino);
/*
 * Resolve and take a reference on the block-group descriptor that
 * owns inode number `ino`.
 * NOTE(review): to_fsblock_id presumably converts the 1-based on-disk
 * inode id to a 0-based index — confirm against full source.
 */
__get_group_desc(struct v_superblock* vsb, int ino,
                 struct ext2_gdesc** gd_out)
    unsigned int blkgrp_id;
    struct ext2_sbinfo* sb;

    blkgrp_id = to_fsblock_id(ino) / sb->raw->s_ino_per_grp;
    return ext2gd_take(vsb, blkgrp_id, gd_out);
/*
 * Locate the raw on-disk inode record for entry `ino_index` within
 * group `gd`'s inode table. Returns a pointer into the table block's
 * buffer; the buffer itself is handed back through buf_out (elided
 * lines in this view).
 */
static struct ext2b_inode*
__get_raw_inode(struct v_superblock* vsb, struct ext2_gdesc* gd,
                bbuf_t* buf_out, int ino_index)
    struct ext2_sbinfo* sb;
    struct ext2b_inode* b_inode;
    unsigned int ino_tab_sel, ino_tab_off, tab_partlen;

    // number of inode records per filesystem block
    tab_partlen = sb->block_size / sb->raw->s_ino_size;
    ino_tab_sel = ino_index / tab_partlen;   // which table block
    ino_tab_off = ino_index % tab_partlen;   // record within that block

    ino_tab = fsblock_get(vsb, gd->info->bg_ino_tab + ino_tab_sel);
    if (blkbuf_errbuf(ino_tab)) {

    b_inode = (struct ext2b_inode*)blkbuf_data(ino_tab);
    b_inode = &b_inode[ino_tab_off];
/*
 * Construct the in-memory ext2_inode for table entry `ino_index` of
 * group `gd`: load the raw record, derive the 64-bit size (large-file
 * feature), the indirect-block count, and indirection geometry.
 */
static struct ext2_inode*
__create_inode(struct v_superblock* vsb, struct ext2_gdesc* gd, int ino_index)
    struct ext2_sbinfo* sb;
    struct ext2b_inode* b_inode;
    struct ext2_inode* inode;
    unsigned int ind_ents;

    b_inode = __get_raw_inode(vsb, gd, &ino_tab, ino_index);

    inode = vzalloc(sizeof(*inode));
    inode->btlb = vzalloc(sizeof(struct ext2_btlb));
    inode->buf = ino_tab;   // keep the inode-table buffer pinned
    inode->ino = b_inode;

    inode->isize = b_inode->i_size;

    // large-file feature: i_size_h32 carries the upper 32 bits of size
    if (ext2_feature(vsb, FEAT_LARGE_FILE)) {
        inode->isize |= (size_t)((u64_t)(b_inode->i_size_h32) << 32);

    /*
     * i_blocks counts 512-byte sectors including indirection blocks;
     * subtract the data sectors to recover the indirect-block count
     * in filesystem-block units.
     */
    if (b_inode->i_blocks) {
        inds_blks = (size_t)b_inode->i_blocks;
        inds_blks -= ICEIL(inode->isize, 512);
        inds_blks /= (sb->block_size / 512);

    inode->indirect_blocks = inds_blks;

    // entries per indirect block; must be a power of two so we can shift
    ind_ents = sb->block_size / sizeof(int);
    assert(is_pot(ind_ents));

    inode->inds_lgents = ilog2(ind_ents);
    inode->ino_id = gd->ino_base + to_ext2ino_id(ino_index);
/*
 * Lightweight inode fetch: resolve only the raw on-disk record and
 * its backing buffer into `fast_ino`, skipping the full ext2_inode
 * construction done by ext2ino_get.
 */
ext2ino_get_fast(struct v_superblock* vsb,
                 unsigned int ino, struct ext2_fast_inode* fast_ino)
    struct ext2_gdesc* gd;
    struct ext2_sbinfo* sb;
    struct ext2b_inode* b_inode;
    unsigned int ino_rel_id;

    errno = __get_group_desc(vsb, ino, &gd);

    // index of the inode within its own group
    ino_rel_id = to_fsblock_id(ino) % sb->raw->s_ino_per_grp;
    b_inode = __get_raw_inode(vsb, gd, &ino_tab, ino_rel_id);

    fast_ino->buf = ino_tab;
    fast_ino->ino = b_inode;
/*
 * Full inode fetch: build the in-memory ext2_inode for `ino` and
 * pre-load its first-order indirect block when one is referenced.
 */
ext2ino_get(struct v_superblock* vsb,
            unsigned int ino, struct ext2_inode** out)
    struct ext2_sbinfo* sb;
    struct ext2_inode* inode;
    struct ext2_gdesc* gd;
    struct ext2b_inode* b_inode;
    unsigned int ino_rel_id;
    unsigned int tab_partlen;
    unsigned int ind_ents, prima_ind;

    if ((errno = __get_group_desc(vsb, ino, &gd))) {

    // index within the owning group, then materialize the inode
    ino_rel_id = to_fsblock_id(ino) % sb->raw->s_ino_per_grp;
    inode = __create_inode(vsb, gd, ino_rel_id);

    b_inode = inode->ino;
    prima_ind = b_inode->i_block.ind1;   // primary (single) indirect block id

    inode->ind_ord1 = fsblock_get(vsb, prima_ind);
    if (blkbuf_errbuf(inode->ind_ord1)) {
/*
 * Allocate a fresh inode, preferring the hint's block group for
 * locality and falling back to a whole-fs scan. The new on-disk
 * record is zeroed and its buffer marked dirty.
 */
ext2ino_alloc(struct v_superblock* vsb,
              struct ext2_inode* hint, struct ext2_inode** out)
    struct ext2_gdesc* gd;
    struct ext2_inode* inode;

    free_ino_idx = ALLOC_FAIL;

    free_ino_idx = ext2gd_alloc_inode(gd);

    // locality hinted alloc failed, try entire fs
    if (!valid_bmp_slot(free_ino_idx)) {
        free_ino_idx = ext2ino_alloc_slot(vsb, &gd);

    // still nothing: filesystem is out of inodes
    if (!valid_bmp_slot(free_ino_idx)) {

    inode = __create_inode(vsb, gd, free_ino_idx);

    // construction failed: return the bitmap slot we just claimed
    ext2gd_free_inode(gd, free_ino_idx);

    memset(inode->ino, 0, sizeof(*inode->ino));
    fsblock_dirty(inode->buf);
/*
 * Return data block `block_pos` to the bitmap of whichever group
 * owns it (group resolved from the position itself).
 */
__free_block_at(struct v_superblock *vsb, unsigned int block_pos)
    struct ext2_gdesc* gd;
    struct ext2_sbinfo * sb;

    block_pos = ext2_datablock(vsb, block_pos);

    // owning group of this block position
    gd_index = block_pos / sb->raw->s_blk_per_grp;

    if ((errno = ext2gd_take(vsb, gd_index, &gd))) {

    assert(block_pos >= gd->base);
    ext2gd_free_block(gd, block_pos - gd->base);
/*
 * Recursively free the data blocks reachable from the walk stack,
 * one indirection level at a time. Depth 0 operates on the inode's
 * 15-slot i_block array; deeper levels walk full indirect tables.
 * NOTE(review): "recurisve" is a typo for "recursive"; renaming
 * requires updating the caller (ext2ino_resizing) as well.
 */
__free_recurisve_from(struct v_superblock *vsb, struct ext2_inode* inode,
                      struct walk_stack* stack, int depth)
    int ind_entries = 1 << inode->inds_lgents;
    // level 0: the 15 i_block slots; deeper levels: full tables
    int max_len[] = { 15, ind_entries, ind_entries, ind_entries };

    u32_t* tables = stack->tables;
    u32_t* indices = stack->indices;

    // stop past the recursion limit or when no table exists here
    if (depth > MAX_INDS_DEPTH || !tables[depth]) {

    idx = indices[depth];
    len = max_len[depth];
    tab = fsblock_get(vsb, ext2_datablock(vsb, tables[depth]));

    if (blkbuf_errbuf(tab)) {

    db_tab = blkbuf_data(tab);

    // at depth 0 the "table" is the inode record: skip to i_block_arr
    int offset = offsetof(struct ext2b_inode, i_block_arr);
    db_tab = offset(db_tab, offset);

    for (; idx < len; idx++)
        u32_t db_id = db_tab[idx];

        // leaf level: no further descent
        if (depth >= MAX_INDS_DEPTH) {

        // NOTE(review): writes tables[depth] before recursing with
        // depth + 1 — verify index against full source
        tables[depth] = db_id;
        errno = __free_recurisve_from(vsb, inode, stack, depth + 1);

        __free_block_at(vsb, db_id);
/*
 * Drop one link from the inode. When no links remain (elided branch),
 * the file is truncated to zero, the inode slot is returned to its
 * group bitmap, and the in-memory state is destroyed.
 */
ext2ino_free(struct v_inode* inode)
    unsigned int ino_slot;
    struct ext2_inode* e_ino;
    struct ext2_gdesc* e_gd;
    struct ext2b_inode* b_ino;
    struct ext2_sbinfo* sb;

    sb = EXT2_SB(inode->sb);
    e_ino = EXT2_INO(inode);

    e_gd = e_ino->blk_grp;

    // freeing an inode that already has no links is a fs invariant break
    assert_fs(b_ino->i_lnk_cnt > 0);
    fsblock_dirty(e_ino->buf);

    // other links still reference this inode
    if (b_ino->i_lnk_cnt >= 1) {

    ext2ino_resizing(inode, 0);   // release all data blocks

    ino_slot = e_ino->ino_id;
    ino_slot = to_fsblock_id(ino_slot - e_gd->base);
    ext2gd_free_inode(e_ino->blk_grp, ino_slot);

    __destruct_ext2_inode(e_ino);
/* Copy the VFS timestamps into the on-disk inode record. */
__update_inode_access_metadata(struct ext2b_inode* b_ino,
                               struct v_inode* inode)
    b_ino->i_ctime = inode->ctime;
    b_ino->i_atime = inode->atime;
    b_ino->i_mtime = inode->mtime;
/*
 * Write `size` back to the on-disk record — split into low/high
 * 32-bit halves when the large-file feature is on — and refresh
 * i_blocks (512-byte sectors: data plus indirection blocks).
 */
__update_inode_size(struct v_inode* inode, size_t size)
    struct ext2b_inode* b_ino;
    struct ext2_inode* e_ino;

    e_ino = EXT2_INO(inode);

    if (ext2_feature(inode->sb, FEAT_LARGE_FILE)) {
        b_ino->i_size_l32 = (unsigned int)size;
        b_ino->i_size_h32 = (unsigned int)((u64_t)size >> 32);

    b_ino->i_size = size;

    // sector count covers file data plus the indirection overhead
    b_ino->i_blocks = ICEIL(size, 512);
    b_ino->i_blocks += e_ino->indirect_blocks;
/*
 * Allocate and fully initialize a new VFS inode of `itype`, using
 * `hint` for allocation locality; result handed back via *out.
 */
ext2ino_make(struct v_superblock* vsb, unsigned int itype,
             struct ext2_inode* hint, struct v_inode** out)
    struct ext2_inode* e_ino;
    struct ext2b_inode* b_ino;
    struct v_inode* inode;

    errno = ext2ino_alloc(vsb, hint, &e_ino);

    inode = vfs_i_alloc(vsb);

    __ext2ino_fill_common(inode, e_ino->ino_id);

    __update_inode_access_metadata(b_ino, inode);
    b_ino->i_mode = __translate_vfs_itype(itype);

    fsapi_inode_settype(inode, itype);
    fsapi_inode_complete(inode, e_ino);
/*
 * VFS create hook: make a new inode of `itype` (hinted to the
 * parent's group for locality), then link it under `dnode`.
 */
ext2_create(struct v_inode* this, struct v_dnode* dnode, unsigned int itype)
    struct v_inode* created;

    errno = ext2ino_make(this->sb, itype, EXT2_INO(this), &created);

    return ext2_link(created, dnode);
/*
 * VFS link hook: create a directory entry named `new_name` in the
 * parent pointing at `this`, record the link on the inode, and bind
 * the dnode to the inode.
 */
ext2_link(struct v_inode* this, struct v_dnode* new_name)
    struct v_inode* parent;
    struct ext2_inode* e_ino;
    struct ext2_dnode* e_dno;
    struct ext2b_dirent dirent;

    e_ino = EXT2_INO(this);
    parent = fsapi_dnode_parent(new_name);

    // build the on-disk dirent and attach it to the inode
    ext2dr_setup_dirent(&dirent, this, &new_name->name);
    ext2ino_linkto(e_ino, &dirent);

    errno = ext2dr_insert(parent, &dirent, &e_dno);

    new_name->data = e_dno;
    vfs_assign_inode(new_name, this);
/*
 * VFS unlink hook: remove the directory entry for `name`, then drop
 * the inode link (ext2ino_free reclaims storage at zero links).
 */
ext2_unlink(struct v_inode* this, struct v_dnode* name)
    struct ext2_inode* e_ino;
    struct ext2_dnode* e_dno;

    e_ino = EXT2_INO(this);
    e_dno = EXT2_DNO(name);

    // the dirent being removed must reference this very inode
    assert_fs(e_dno->self.dirent->inode == e_ino->ino_id);

    errno = ext2dr_remove(e_dno);

    return ext2ino_free(this);
/* Sync access metadata to the on-disk record and mark its buffer dirty. */
ext2ino_update(struct v_inode* inode)
    struct ext2_inode* e_ino;

    e_ino = EXT2_INO(inode);
    __update_inode_access_metadata(e_ino->ino, inode);

    fsblock_dirty(e_ino->buf);
777 /* ******************* Data Blocks ******************* */
/* Record (table block id, index) for level `depth` in the walk stack. */
__walkstate_set_stack(struct walk_state* state, int depth,
                      bbuf_t tab, unsigned int index)
    state->stack.tables[depth] = fsblock_id(tab);
    state->stack.indices[depth] = index;
 * @brief Walk the indirection chain given the position of a data block
 * relative to the inode. Upon completion, walk_state will be
 * populated with the result. On error, walk_state is untouched.
 *
 * Note, the result will always be one level above the stopping level.
 * That means, if your pos points directly to a file-content block
 * (i.e., a leaf block), then the state is the indirect block
 * containing the ID of that leaf block.
 *
 * If `resolve` is set, it will resolve any absence encountered
 * during the walk by allocating and chaining indirect blocks.
 * It requires the file system to be mounted writable.
 *
 * @param inode inode to walk
 * @param pos flattened data block position to be located
 * @param state contains the walk result
 * @param resolve whether to auto-allocate the indirection structure during
 * the walk if `pos` does not exist.
/*
 * Resolve `pos` through the inode's indirection levels (contract in
 * the doc comment above). Uses the per-inode bTLB to shortcut to the
 * last-level indirect block when `full_walk` is not requested.
 * NOTE(review): many lines elided in this view; fragment only.
 */
__walk_indirects(struct v_inode* inode, unsigned int pos,
                 struct walk_state* state, bool resolve, bool full_walk)
    int inds, stride, shifts, level;
    unsigned int *slotref, index, next, mask;
    struct ext2_inode* e_inode;
    struct ext2b_inode* b_inode;
    struct v_superblock* vsb;
    bbuf_t table, next_table;

    e_inode = EXT2_INO(inode);
    b_inode = e_inode->ino;

    // resolving allocates blocks; demote on read-only mounts
    resolve = resolve && !EXT2_SB(vsb)->read_only;

    // direct case: the slot lives in the inode record itself
    slotref = &b_inode->i_block_arr[pos];
    table = fsblock_take(e_inode->buf);

    // classify pos by how many indirection levels it needs
    stride = e_inode->inds_lgents;
    if (!(pos >> stride)) {

    else if (!(pos >> (stride * 2))) {

    else if (!(pos >> (stride * 3))) {

    fail("unrealistic block pos");

    // bTLB cache the last level indirect block
    if (!full_walk && (table = __btlb_hit(e_inode, pos))) {
        index = pos & ((1 << stride) - 1);
        slotref = &block_buffer(table, u32_t)[index];

    // per-level extraction mask, widest level first
    shifts = stride * (inds - 1);
    mask = ((1 << stride) - 1) << shifts;

    index = 12 + inds - 1;   // ind slots follow the 12 direct slots
    slotref = &b_inode->i_block.inds[inds - 1];
    table = fsblock_take(e_inode->buf);

    for (; level < inds; level++)
        __walkstate_set_stack(state, level, table, index);

        // absent link: allocate and chain a fresh indirect block
        if ((errno = ext2db_alloc(inode, &next_table))) {

        e_inode->indirect_blocks++;
        *slotref = fsblock_id(next_table);
        fsblock_dirty(table);   // parent table now references the new block

        next_table = fsblock_get(vsb, next);

        if (blkbuf_errbuf(table)) {

        // slot index for this level within the current table
        index = (pos & mask) >> shifts;

        slotref = &block_buffer(table, u32_t)[index];

        mask = mask >> stride;

    // remember the last-level table for future linear accesses
    __btlb_insert(e_inode, pos, table);

    assert(blkbuf_refcounts(table) >= 1);

    state->slot_ref = slotref;
    state->table = table;
    state->level = level;
    state->indirections = inds;

    __walkstate_set_stack(state, level, table, index);
/*
 * Fetch the data-block buffer at file-relative position `data_pos`,
 * or INVL_BUFFER when the indirection walk fails.
 */
ext2db_get(struct v_inode* inode, unsigned int data_pos)
    struct walk_state state;

    ext2walk_init_state(&state);

    // read-only walk: do not materialize missing indirection
    errno = __walk_indirects(inode, data_pos, &state, false, false);

    return (bbuf_t)INVL_BUFFER;

    blkid = *state.slot_ref;   // block id stored in the located slot

    ext2walk_free_state(&state);

    return fsblock_get(inode->sb, blkid);
/*
 * Like ext2db_get, but allocates the data block (and, via
 * resolve=true, any missing indirection) when absent, recording the
 * new block id in the located slot.
 */
ext2db_acquire(struct v_inode* inode, unsigned int data_pos, bbuf_t* out)
    unsigned int block_id;
    struct walk_state state;

    ext2walk_init_state(&state);

    // resolving walk: chain missing indirect blocks on the way
    errno = __walk_indirects(inode, data_pos, &state, true, false);

    block_id = *state.slot_ref;

    buf = fsblock_get(inode->sb, block_id);

    // slot was empty: allocate a fresh data block
    errno = ext2db_alloc(inode, &buf);

    ext2walk_free_state(&state);

    *state.slot_ref = fsblock_id(buf);
    fsblock_dirty(state.table);   // table now references the new block

    ext2walk_free_state(&state);

    if (blkbuf_errbuf(buf)) {
/*
 * Allocate one data block, preferring the inode's own block group and
 * falling back to a whole-fs scan; return its buffer through *out.
 * NOTE(review): the local is named free_ino_idx but holds a *block*
 * bitmap slot here.
 */
ext2db_alloc(struct v_inode* inode, bbuf_t* out)
    struct ext2_gdesc* gd;
    struct ext2_inode* e_inode;
    struct v_superblock* vsb;

    free_ino_idx = ALLOC_FAIL;
    e_inode = EXT2_INO(inode);

    gd = e_inode->blk_grp;
    free_ino_idx = ext2gd_alloc_block(gd);

    // locality alloc failed, try entire fs
    if (!valid_bmp_slot(free_ino_idx)) {
        free_ino_idx = ext2db_alloc_slot(vsb, &gd);

    // still nothing: filesystem is out of blocks
    if (!valid_bmp_slot(free_ino_idx)) {

    // translate the group-relative slot into an absolute block id
    free_ino_idx += gd->base;
    free_ino_idx = ext2_datablock(vsb, free_ino_idx);
    free_ino_idx = to_ext2ino_id(free_ino_idx);

    bbuf_t buf = fsblock_get(vsb, free_ino_idx);
    if (blkbuf_errbuf(buf)) {
/*
 * Return block `block_pos` to the bitmap of the inode's own block
 * group (contrast __free_block_at, which resolves the owning group
 * from the position).
 */
ext2db_free_pos(struct v_inode* inode, unsigned int block_pos)
    struct ext2_inode* e_inode;
    struct ext2_gdesc* gd;

    e_inode = EXT2_INO(inode);
    gd = e_inode->blk_grp;

    // the position must belong to this inode's group
    assert(block_pos >= gd->base);

    block_pos -= gd->base;

    ext2gd_free_block(gd, block_pos);
/* Free the block backing `buf`; caller must hold the only reference. */
ext2db_free(struct v_inode* inode, bbuf_t buf)
    assert(blkbuf_not_shared(buf));

    ext2db_free_pos(inode, blkbuf_id(buf));
/*
 * Resize the inode to `new_size`: update the on-disk size fields; on
 * shrink, walk (full_walk) to the first block past the new end and
 * recursively free everything after it. Tail of this function lies
 * beyond this view.
 */
ext2ino_resizing(struct v_inode* inode, size_t new_size)
    struct walk_state state;
    struct ext2_inode* e_ino;
    struct ext2b_inode* b_ino;

    e_ino = EXT2_INO(inode);

    oldsize = e_ino->isize;

    // nothing to do when the size is unchanged
    if (oldsize == new_size) {

    __update_inode_size(inode, new_size);
    fsblock_dirty(e_ino->buf);

    // NOTE(review): symlinks presumably store their target inline in
    // the inode record, needing no block bookkeeping — confirm
    if (check_symlink_node(inode)) {

    // growing: data blocks are materialized lazily elsewhere
    if (oldsize < new_size) {

    ext2walk_init_state(&state);

    // locate the first block at/after the new end (full walk)
    pos = new_size / fsapi_block_size(inode->sb);
    errno = __walk_indirects(inode, pos, &state, false, true);

    errno = __free_recurisve_from(inode->sb, e_ino, &state.stack, 0);

    ext2walk_free_state(&state);