2 * Copyright (c) 2015 Grzegorz Kostka (kostka.grzegorz@gmail.com)
3 * Copyright (c) 2015 Kaho Ng (ngkaho1234@gmail.com)
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 #include "ext4_blockdev.h"
30 #include "ext4_super.h"
31 #include "ext4_balloc.h"
32 #include "ext4_debug.h"
39 #include "ext4_extent.h"
42 * used by extent splitting.
44 #define EXT4_EXT_MARK_UNWRIT1 0x02 /* mark first half unwritten */
45 #define EXT4_EXT_MARK_UNWRIT2 0x04 /* mark second half unwritten */
46 #define EXT4_EXT_DATA_VALID1 0x08 /* first half contains valid data */
47 #define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */
48 #define EXT4_EXT_NO_COMBINE 0x20 /* do not combine two extents */
/* Locate the tail record (checksum area) that sits immediately after
 * the last possible entry of the extent node headed by @eh. */
static struct ext4_extent_tail *
find_ext4_extent_tail(struct ext4_extent_header *eh)
{
	char *base = (char *)eh;

	return (struct ext4_extent_tail *)(base + EXT4_EXTENT_TAIL_OFFSET(eh));
}
57 static struct ext4_extent_header *ext_inode_hdr(struct ext4_inode *inode)
59 return (struct ext4_extent_header *)inode->blocks;
62 static struct ext4_extent_header *ext_block_hdr(struct ext4_block *block)
64 return (struct ext4_extent_header *)block->data;
67 static uint16_t ext_depth(struct ext4_inode *inode)
69 return to_le16(ext_inode_hdr(inode)->depth);
72 static uint16_t ext4_ext_get_actual_len(struct ext4_extent *ext)
74 return (to_le16(ext->block_count) <= EXT_INIT_MAX_LEN
75 ? to_le16(ext->block_count)
76 : (to_le16(ext->block_count) - EXT_INIT_MAX_LEN));
79 static void ext4_ext_mark_initialized(struct ext4_extent *ext)
81 ext->block_count = to_le16(ext4_ext_get_actual_len(ext));
84 static void ext4_ext_mark_unwritten(struct ext4_extent *ext)
86 ext->block_count |= to_le16(EXT_INIT_MAX_LEN);
89 static int ext4_ext_is_unwritten(struct ext4_extent *ext)
91 /* Extent with ee_len of 0x8000 is treated as an initialized extent */
92 return (to_le16(ext->block_count) > EXT_INIT_MAX_LEN);
97 * combine low and high parts of physical block number into ext4_fsblk_t
99 static ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
103 block = to_le32(ex->start_lo);
104 block |= ((ext4_fsblk_t)to_le16(ex->start_hi) << 31) << 1;
110 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
112 static ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_index *ix)
116 block = to_le32(ix->leaf_lo);
117 block |= ((ext4_fsblk_t)to_le16(ix->leaf_hi) << 31) << 1;
122 * ext4_ext_store_pblock:
123 * stores a large physical block number into an extent struct,
124 * breaking it into parts
126 static void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
128 ex->start_lo = to_le32((unsigned long)(pb & 0xffffffff));
129 ex->start_hi = to_le16((unsigned long)((pb >> 31) >> 1) & 0xffff);
133 * ext4_idx_store_pblock:
134 * stores a large physical block number into an index struct,
135 * breaking it into parts
137 static void ext4_idx_store_pblock(struct ext4_extent_index *ix, ext4_fsblk_t pb)
139 ix->leaf_lo = to_le32((unsigned long)(pb & 0xffffffff));
140 ix->leaf_hi = to_le16((unsigned long)((pb >> 31) >> 1) & 0xffff);
143 static int ext4_allocate_single_block(struct ext4_inode_ref *inode_ref,
144 ext4_fsblk_t goal __unused,
145 ext4_fsblk_t *blockp)
147 return ext4_balloc_alloc_block(inode_ref, (uint32_t *)blockp);
150 static ext4_fsblk_t ext4_new_meta_blocks(struct ext4_inode_ref *inode_ref,
152 uint32_t flags __unused,
153 uint32_t *count, int *errp)
155 ext4_fsblk_t block = 0;
157 *errp = ext4_allocate_single_block(inode_ref, goal, &block);
163 static void ext4_ext_free_blocks(struct ext4_inode_ref *inode_ref,
164 ext4_fsblk_t block, uint32_t count,
165 uint32_t flags __unused)
168 for (i = 0; i < count; i++)
169 ext4_balloc_free_block(inode_ref, (uint32_t)block + i);
172 static size_t ext4_ext_space_block(struct ext4_inode_ref *inode_ref)
175 uint32_t block_size = ext4_sb_get_block_size(&inode_ref->fs->sb);
177 size = (block_size - sizeof(struct ext4_extent_header)) /
178 sizeof(struct ext4_extent);
182 static size_t ext4_ext_space_block_idx(struct ext4_inode_ref *inode_ref)
185 uint32_t block_size = ext4_sb_get_block_size(&inode_ref->fs->sb);
187 size = (block_size - sizeof(struct ext4_extent_header)) /
188 sizeof(struct ext4_extent_index);
192 static size_t ext4_ext_space_root(struct ext4_inode_ref *inode_ref)
196 size = sizeof(inode_ref->inode->blocks);
197 size -= sizeof(struct ext4_extent_header);
198 size /= sizeof(struct ext4_extent);
202 static size_t ext4_ext_space_root_idx(struct ext4_inode_ref *inode_ref)
206 size = sizeof(inode_ref->inode->blocks);
207 size -= sizeof(struct ext4_extent_header);
208 size /= sizeof(struct ext4_extent_index);
212 static size_t ext4_ext_max_entries(struct ext4_inode_ref *inode_ref,
217 if (depth == ext_depth(inode_ref->inode)) {
219 max = ext4_ext_space_root(inode_ref);
221 max = ext4_ext_space_root_idx(inode_ref);
224 max = ext4_ext_space_block(inode_ref);
226 max = ext4_ext_space_block_idx(inode_ref);
/*
 * Pick a goal physical block for a new allocation near logical block
 * @block, based on the deepest extent recorded in @path.
 * NOTE(review): this listing elides several lines of the original
 * (the rest of the parameter list, an `if (ex)` guard, and the final
 * fallback return after "use inode's group") — treat the body below as
 * incomplete and verify against the full source.
 */
static ext4_fsblk_t ext4_ext_find_goal(struct ext4_extent_path *path,
	uint32_t depth = path->depth;
	struct ext4_extent *ex;

	/*
	 * Try to predict block placement assuming that we are
	 * filling in a file which will eventually be
	 * non-sparse --- i.e., in the case of libbfd writing
	 * an ELF object sections out-of-order but in a way
	 * the eventually results in a contiguous object or
	 * executable file, or some database extending a table
	 * space file. However, this is actually somewhat
	 * non-ideal if we are writing a sparse file such as
	 * qemu or KVM writing a raw image file that is going
	 * to stay fairly sparse, since it will end up
	 * fragmenting the file system's free space. Maybe we
	 * should have some hueristics or some way to allow
	 * userspace to pass a hint to file system,
	 * especially if the latter case turns out to be
	 */
	ex = path[depth].extent;
		/* Extrapolate from the found extent: same distance in the
		 * physical space as in the logical space. */
		ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
		ext4_lblk_t ext_block = to_le32(ex->first_block);

		if (block > ext_block)
			return ext_pblk + (block - ext_block);
		return ext_pblk - (ext_block - block);

	/* it looks like index is empty;
	 * try to find starting block from index itself */
	if (path[depth].block.lb_id)
		return path[depth].block.lb_id;

	/* OK. use inode's group */
278 * Allocation for a meta data block
280 static ext4_fsblk_t ext4_ext_new_meta_block(struct ext4_inode_ref *inode_ref,
281 struct ext4_extent_path *path,
282 struct ext4_extent *ex, int *err,
285 ext4_fsblk_t goal, newblock;
287 goal = ext4_ext_find_goal(path, to_le32(ex->first_block));
288 newblock = ext4_new_meta_blocks(inode_ref, goal, flags, NULL, err);
292 static int ext4_ext_dirty(struct ext4_inode_ref *inode_ref,
293 struct ext4_extent_path *path)
295 if (path->block.lb_id)
296 path->block.dirty = true;
298 inode_ref->dirty = true;
/*
 * Release every block buffer held by @path back to the block cache.
 * @keep_other: presumably limits how many levels are dropped — TODO
 * confirm; the lines computing the loop bound from @keep_other are
 * elided in this listing, along with the local declarations.
 */
static void ext4_ext_drop_refs(struct ext4_inode_ref *inode_ref,
			       struct ext4_extent_path *path, bool keep_other)
	for (i = 0; i <= depth; i++, path++)
		if (path->block.lb_id) {
			/* Hand the buffer back to the block device cache. */
			ext4_block_set(inode_ref->fs->bdev, &path->block);
322 * Temporarily we don't need to support checksum.
324 static uint32_t ext4_ext_block_csum(struct ext4_inode_ref *inode_ref __unused,
325 struct ext4_extent_header *eh __unused)
327 /*TODO: should we add crc32 here ?*/
328 /*return ext4_crc32c(inode->i_csum, eh, EXT4_EXTENT_TAIL_OFFSET(eh));*/
332 static void ext4_extent_block_csum_set(struct ext4_inode_ref *inode_ref,
333 struct ext4_extent_header *eh)
335 struct ext4_extent_tail *tail;
337 tail = find_ext4_extent_tail(eh);
338 tail->et_checksum = ext4_ext_block_csum(inode_ref, eh);
/*
 * Check that whether the basic information inside the extent header
 * is sane: magic, depth, max/used entry counts, and the tail checksum.
 * On failure logs the reason and offending block number.
 * NOTE(review): the goto/error-return plumbing between the checks is
 * elided in this listing; only the surviving statements appear below.
 */
static int ext4_ext_check(struct ext4_inode_ref *inode_ref,
			  struct ext4_extent_header *eh, uint16_t depth,
			  ext4_fsblk_t pblk __unused)
	struct ext4_extent_tail *tail;
	const char *error_msg;

	/* On-disk magic must be EXT4_EXTENT_MAGIC. */
	if (to_le16(eh->magic) != EXT4_EXTENT_MAGIC) {
		error_msg = "invalid magic";
	/* Node depth must match where we are in the downward walk. */
	if (to_le16(eh->depth) != depth) {
		error_msg = "unexpected eh_depth";
	/* A node that can hold zero entries is corrupt. */
	if (eh->max_entries_count == 0) {
		error_msg = "invalid eh_max";
	/* Used entries may never exceed capacity. */
	if (to_le16(eh->entries_count) > to_le16(eh->max_entries_count)) {
		error_msg = "invalid eh_entries";
	tail = find_ext4_extent_tail(eh);
	if (tail->et_checksum != ext4_ext_block_csum(inode_ref, eh)) {
		/* FIXME: Warning: extent checksum damaged? */
	ext4_dbg(DEBUG_EXTENT, "Bad extents B+ tree block: %s. "
				"Blocknr: %" PRId64 "\n",
/*
 * Read one extent-tree node at @pblk into @bh and validate its header
 * against the expected @depth; on validation failure the buffer is
 * returned to the cache. NOTE(review): the error-check branches between
 * the calls are elided in this listing.
 */
static int read_extent_tree_block(struct ext4_inode_ref *inode_ref,
				  ext4_fsblk_t pblk, int32_t depth,
				  struct ext4_block *bh,
				  uint32_t flags __unused)
	err = ext4_block_get(inode_ref->fs->bdev, bh, pblk);
	/* Sanity-check magic/depth/entry counts of the freshly read node. */
	err = ext4_ext_check(inode_ref, ext_block_hdr(bh), depth, pblk);
	/* Check failed: give the buffer back before reporting the error. */
	ext4_block_set(inode_ref->fs->bdev, bh);
/*
 * ext4_ext_binsearch_idx:
 * binary search for the closest index of the given block
 * the header must be checked before calling this
 * NOTE(review): the while-loop body (midpoint computation, pointer
 * narrowing) and the final `path->index = l - 1` style assignment are
 * elided in this listing.
 */
static void ext4_ext_binsearch_idx(struct ext4_extent_path *path,
	struct ext4_extent_header *eh = path->header;
	struct ext4_extent_index *r, *l, *m;

	/* Search [FIRST+1, LAST]; result is the last index whose
	 * first_block <= block. */
	l = EXT_FIRST_INDEX(eh) + 1;
	r = EXT_LAST_INDEX(eh);
		if (block < to_le32(m->first_block))
/*
 * ext4_ext_binsearch:
 * binary search for closest extent of the given block
 * the header must be checked before calling this
 * NOTE(review): the while-loop body (midpoint computation, pointer
 * narrowing) is elided in this listing.
 */
static void ext4_ext_binsearch(struct ext4_extent_path *path, ext4_lblk_t block)
	struct ext4_extent_header *eh = path->header;
	struct ext4_extent *r, *l, *m;

	if (eh->entries_count == 0) {
		/*
		 * this leaf is empty:
		 * we get such a leaf in split/add case
		 */

	/* Search [FIRST+1, LAST]; result is the last extent whose
	 * first_block <= block. */
	l = EXT_FIRST_EXTENT(eh) + 1;
	r = EXT_LAST_EXTENT(eh);
		if (block < to_le32(m->first_block))

	path->extent = l - 1;
463 #define EXT4_EXT_PATH_INC_DEPTH 1
/*
 * Walk the extent tree of @inode_ref from the root down to the leaf
 * covering logical block @block, (re)populating *@orig_path with one
 * entry per level. A caller-supplied path is reused when deep enough,
 * otherwise a new one is allocated (with room for one extra level to
 * accommodate a future depth increase).
 * NOTE(review): this listing elides many lines (loop headers, error
 * returns, closing braces); annotations cover only the surviving code.
 */
static int ext4_find_extent(struct ext4_inode_ref *inode_ref, ext4_lblk_t block,
			    struct ext4_extent_path **orig_path, uint32_t flags)
	struct ext4_extent_header *eh;
	struct ext4_block bh = EXT4_BLOCK_ZERO();
	ext4_fsblk_t buf_block = 0;
	struct ext4_extent_path *path = *orig_path;
	int32_t depth, ppos = 0;

	/* Root node is embedded in the inode. */
	eh = ext_inode_hdr(inode_ref->inode);
	depth = ext_depth(inode_ref->inode);

	/* Drop stale buffer refs; reallocate if the old path is too shallow. */
	ext4_ext_drop_refs(inode_ref, path, 0);
	if (depth > path[0].maxdepth) {
		*orig_path = path = NULL;

	int32_t path_depth = depth + EXT4_EXT_PATH_INC_DEPTH;
	/* account possible depth increase */
	path = calloc(1, sizeof(struct ext4_extent_path) *
	path[0].maxdepth = path_depth;

	/* walk through the tree */
		/* Interior node: pick the child index covering @block. */
		ext4_ext_binsearch_idx(path + ppos, block);
		path[ppos].p_block = ext4_idx_pblock(path[ppos].index);
		path[ppos].depth = i;
		path[ppos].extent = NULL;
		buf_block = path[ppos].p_block;

		/* Only (re)read the child when it is not already cached
		 * in this path slot. */
		if (!path[ppos].block.lb_id ||
		    path[ppos].block.lb_id != buf_block) {
			ret = read_extent_tree_block(inode_ref, buf_block, i,
				ext4_block_set(inode_ref->fs->bdev, &bh);

		eh = ext_block_hdr(&bh);
		path[ppos].block = bh;
		path[ppos].header = eh;

	/* Leaf level reached. */
	path[ppos].depth = i;
	path[ppos].extent = NULL;
	path[ppos].index = NULL;

	ext4_ext_binsearch(path + ppos, block);
	/* if not an empty leaf */
	if (path[ppos].extent)
		path[ppos].p_block = ext4_ext_pblock(path[ppos].extent);

	/* Error path: release every buffer taken so far. */
	ext4_ext_drop_refs(inode_ref, path, 0);
551 static void ext4_ext_init_header(struct ext4_inode_ref *inode_ref,
552 struct ext4_extent_header *eh, int32_t depth)
554 eh->entries_count = 0;
555 eh->max_entries_count = to_le16(ext4_ext_max_entries(inode_ref, depth));
556 eh->magic = to_le16(EXT4_EXTENT_MAGIC);
/*
 * Split the node at @path[at] after the current entry, moving all
 * trailing entries into a freshly allocated sibling block whose number
 * is returned through @sibling / @new_bh.
 * Be cautious, the buffer_head returned is not yet mark dirtied.
 * NOTE(review): this listing elides the leaf/index branch structure,
 * error checks and the success return; only surviving lines appear.
 */
static int ext4_ext_split_node(struct ext4_inode_ref *inode_ref,
			       struct ext4_extent_path *path, int32_t at,
			       struct ext4_extent *newext,
			       ext4_fsblk_t *sibling, struct ext4_block *new_bh)
	ext4_fsblk_t newblock;
	struct ext4_block bh = EXT4_BLOCK_ZERO();
	int32_t depth = ext_depth(inode_ref->inode);

	ext4_assert(sibling);

	/* FIXME: currently we split at the point after the current extent. */
	newblock = ext4_ext_new_meta_block(inode_ref, path, newext, &ret, 0);

	/* For write access.# */
	ret = ext4_block_get(inode_ref->fs->bdev, &bh, newblock);

	/* Leaf case: move the extents after the current one. */
	/* start copy from next extent */
	ptrdiff_t m = EXT_MAX_EXTENT(path[at].header) - path[at].extent;
	struct ext4_extent_header *neh;
	neh = ext_block_hdr(&bh);
	ext4_ext_init_header(inode_ref, neh, 0);

	struct ext4_extent *ex;
	ex = EXT_FIRST_EXTENT(neh);
	memmove(ex, path[at].extent + 1,
		sizeof(struct ext4_extent) * m);
		to_le16(to_le16(neh->entries_count) + m);
	path[at].header->entries_count = to_le16(
		to_le16(path[at].header->entries_count) - m);
	ret = ext4_ext_dirty(inode_ref, path + at);

	/* Index case: move the indexes after the current one. */
	ptrdiff_t m = EXT_MAX_INDEX(path[at].header) - path[at].index;
	struct ext4_extent_header *neh;
	neh = ext_block_hdr(&bh);
	ext4_ext_init_header(inode_ref, neh, depth - at);

	struct ext4_extent_index *ix;
	ix = EXT_FIRST_INDEX(neh);
	/* FIXME(review): element size should be
	 * sizeof(struct ext4_extent_index); this only works because both
	 * structs happen to be 12 bytes on disk. */
	memmove(ix, path[at].index + 1,
		sizeof(struct ext4_extent) * m);
		to_le16(to_le16(neh->entries_count) + m);
	path[at].header->entries_count = to_le16(
		to_le16(path[at].header->entries_count) - m);
	ret = ext4_ext_dirty(inode_ref, path + at);

	/* Error path: release the buffer and free the allocated block. */
	ext4_block_set(inode_ref->fs->bdev, &bh);
	ext4_ext_free_blocks(inode_ref, newblock, 1, 0);
637 static ext4_lblk_t ext4_ext_block_index(struct ext4_extent_header *eh)
640 return to_le32(EXT_FIRST_INDEX(eh)->first_block);
642 return to_le32(EXT_FIRST_EXTENT(eh)->first_block);
645 #define EXT_INODE_HDR_NEED_GROW 0x1
647 struct ext_split_trans {
649 struct ext4_extent_path path;
/*
 * Insert (insert_index, insert_block) into the interior node at
 * @path[at]. If the node is full it is split first and the new sibling
 * is recorded in @spt so the caller can link it one level up; if the
 * node is the full root, EXT_INODE_HDR_NEED_GROW is returned instead.
 * NOTE(review): this listing elides error returns, several braces and
 * the corruption checks' bodies; only surviving lines are annotated.
 */
static int ext4_ext_insert_index(struct ext4_inode_ref *inode_ref,
				 struct ext4_extent_path *path, int32_t at,
				 struct ext4_extent *newext,
				 ext4_lblk_t insert_index,
				 ext4_fsblk_t insert_block,
				 struct ext_split_trans *spt)
	struct ext4_extent_index *ix;
	struct ext4_extent_path *curp = path + at;
	struct ext4_block bh = EXT4_BLOCK_ZERO();
	struct ext4_extent_header *eh;

	/* Duplicate index: refuse. */
	if (curp->index && insert_index == to_le32(curp->index->first_block))

	/* Node full: split it (non-root) or request a depth increase. */
	if (to_le16(curp->header->entries_count) ==
	    to_le16(curp->header->max_entries_count)) {
		struct ext4_extent_header *neh;
		err = ext4_ext_split_node(inode_ref, path, at, newext,
		neh = ext_block_hdr(&bh);
		if (insert_index > to_le32(curp->index->first_block)) {
			/* Make decision which node should be used to
			 * insert the index.*/
			if (to_le16(neh->entries_count) >
			    to_le16(curp->header->entries_count)) {
				/* Stay in the (emptier) original node. */
				ix = EXT_LAST_INDEX(eh) + 1;
				/* Move to the new sibling node. */
				ix = EXT_FIRST_INDEX(eh);
			ix = EXT_LAST_INDEX(eh);
		/* Root is full: caller must grow the tree. */
		err = EXT_INODE_HDR_NEED_GROW;

	/* Pick the slot inside the chosen node. */
	if (curp->index == NULL) {
		ix = EXT_FIRST_INDEX(eh);
	} else if (insert_index > to_le32(curp->index->first_block)) {
		ix = curp->index + 1;

	/* Shift trailing indexes one slot right to open a hole. */
	len = EXT_LAST_INDEX(eh) - ix + 1;
	ext4_assert(len >= 0);
	memmove(ix + 1, ix, len * sizeof(struct ext4_extent_index));

	if (ix > EXT_MAX_INDEX(eh)) {
	ix->first_block = to_le32(insert_index);
	ext4_idx_store_pblock(ix, insert_block);
	eh->entries_count = to_le16(to_le16(eh->entries_count) + 1);

	if (ix > EXT_LAST_INDEX(eh)) {
	if (eh == curp->header)
		err = ext4_ext_dirty(inode_ref, curp);

	/* Error path: drop the sibling buffer. */
	ext4_block_set(inode_ref->fs->bdev, &bh);
	} else if (bh.lb_id) {
		/* If we got a sibling leaf. */
		/* Record the sibling in the split transaction so the caller
		 * can link it at the parent level. */
		spt->path.p_block = ext4_idx_pblock(ix);
		spt->path.depth = to_le16(eh->depth);
		spt->path.maxdepth = 0;
		spt->path.extent = NULL;
		spt->path.index = ix;
		spt->path.header = eh;
		spt->path.block = bh;

	/*
	 * If newext->ee_block can be included into the
	 * sibling, switch the active path to it.
	 */
	if (to_le32(newext->first_block) >=
	    ext4_ext_block_index(ext_block_hdr(&bh)))
	curp->p_block = ext4_idx_pblock(ix);
	curp->p_block = ext4_idx_pblock(ix);
/*
 * ext4_ext_correct_indexes:
 * if leaf gets modified and modified extent is first in the leaf,
 * then we have to correct all indexes above.
 * NOTE(review): this listing elides the loop headers (the walk upward
 * over k) and several returns; only surviving lines are annotated.
 */
static int ext4_ext_correct_indexes(struct ext4_inode_ref *inode_ref,
				    struct ext4_extent_path *path)
	struct ext4_extent_header *eh;
	int32_t depth = ext_depth(inode_ref->inode);
	struct ext4_extent *ex;

	eh = path[depth].header;
	ex = path[depth].extent;

	if (ex == NULL || eh == NULL) {

	/* there is no tree at all */

	if (ex != EXT_FIRST_EXTENT(eh)) {
		/* we correct tree if first leaf got modified only */

	/*
	 * TODO: we need correction if border is smaller than current one
	 */
	border = path[depth].extent->first_block;
	path[k].index->first_block = border;
	err = ext4_ext_dirty(inode_ref, path + k);

	/* change all left-side indexes */
	if (path[k + 1].index != EXT_FIRST_INDEX(path[k + 1].header))
	path[k].index->first_block = border;
	err = ext4_ext_dirty(inode_ref, path + k);
832 static bool ext4_ext_can_prepend(struct ext4_extent *ex1,
833 struct ext4_extent *ex2)
835 if (ext4_ext_pblock(ex2) + ext4_ext_get_actual_len(ex2) !=
836 ext4_ext_pblock(ex1))
839 if (ext4_ext_is_unwritten(ex1)) {
840 if (ext4_ext_get_actual_len(ex1) +
841 ext4_ext_get_actual_len(ex2) >
842 EXT_UNWRITTEN_MAX_LEN)
844 } else if (ext4_ext_get_actual_len(ex1) + ext4_ext_get_actual_len(ex2) >
848 if (to_le32(ex2->first_block) + ext4_ext_get_actual_len(ex2) !=
849 to_le32(ex1->first_block))
855 static bool ext4_ext_can_append(struct ext4_extent *ex1,
856 struct ext4_extent *ex2)
858 if (ext4_ext_pblock(ex1) + ext4_ext_get_actual_len(ex1) !=
859 ext4_ext_pblock(ex2))
862 if (ext4_ext_is_unwritten(ex1)) {
863 if (ext4_ext_get_actual_len(ex1) +
864 ext4_ext_get_actual_len(ex2) >
865 EXT_UNWRITTEN_MAX_LEN)
867 } else if (ext4_ext_get_actual_len(ex1) + ext4_ext_get_actual_len(ex2) >
871 if (to_le32(ex1->first_block) + ext4_ext_get_actual_len(ex1) !=
872 to_le32(ex2->first_block))
878 static int ext4_ext_insert_leaf(struct ext4_inode_ref *inode_ref,
879 struct ext4_extent_path *path, int32_t at,
880 struct ext4_extent *newext,
881 struct ext_split_trans *spt, uint32_t flags)
883 struct ext4_extent_path *curp = path + at;
884 struct ext4_extent *ex = curp->extent;
885 struct ext4_block bh = EXT4_BLOCK_ZERO();
889 struct ext4_extent_header *eh = NULL;
892 to_le32(newext->first_block) == to_le32(curp->extent->first_block))
895 if (!(flags & EXT4_EXT_NO_COMBINE)) {
896 if (curp->extent && ext4_ext_can_append(curp->extent, newext)) {
897 unwritten = ext4_ext_is_unwritten(curp->extent);
898 curp->extent->block_count =
899 to_le16(ext4_ext_get_actual_len(curp->extent) +
900 ext4_ext_get_actual_len(newext));
902 ext4_ext_mark_unwritten(curp->extent);
903 err = ext4_ext_dirty(inode_ref, curp);
908 ext4_ext_can_prepend(curp->extent, newext)) {
909 unwritten = ext4_ext_is_unwritten(curp->extent);
910 curp->extent->first_block = newext->first_block;
911 curp->extent->block_count =
912 to_le16(ext4_ext_get_actual_len(curp->extent) +
913 ext4_ext_get_actual_len(newext));
915 ext4_ext_mark_unwritten(curp->extent);
916 err = ext4_ext_dirty(inode_ref, curp);
921 if (to_le16(curp->header->entries_count) ==
922 to_le16(curp->header->max_entries_count)) {
924 struct ext4_extent_header *neh;
925 err = ext4_ext_split_node(inode_ref, path, at, newext,
930 neh = ext_block_hdr(&bh);
931 if (to_le32(newext->first_block) >
932 to_le32(curp->extent->first_block)) {
933 if (to_le16(neh->entries_count) >
934 to_le16(curp->header->entries_count)) {
937 ex = EXT_LAST_EXTENT(eh) + 1;
940 ex = EXT_FIRST_EXTENT(eh);
945 ex = EXT_LAST_EXTENT(eh);
948 err = EXT_INODE_HDR_NEED_GROW;
953 if (curp->extent == NULL) {
954 ex = EXT_FIRST_EXTENT(eh);
956 } else if (to_le32(newext->first_block) >
957 to_le32(curp->extent->first_block)) {
959 ex = curp->extent + 1;
966 len = EXT_LAST_EXTENT(eh) - ex + 1;
967 ext4_assert(len >= 0);
969 memmove(ex + 1, ex, len * sizeof(struct ext4_extent));
971 if (ex > EXT_MAX_EXTENT(eh)) {
976 ex->first_block = newext->first_block;
977 ex->block_count = newext->block_count;
978 ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
979 eh->entries_count = to_le16(to_le16(eh->entries_count) + 1);
981 if (ex > EXT_LAST_EXTENT(eh)) {
986 if (eh == curp->header) {
987 err = ext4_ext_correct_indexes(inode_ref, path);
990 err = ext4_ext_dirty(inode_ref, curp);
997 ext4_block_set(inode_ref->fs->bdev, &bh);
1000 } else if (bh.lb_id) {
1001 /* If we got a sibling leaf. */
1004 spt->path.p_block = ext4_ext_pblock(ex);
1005 spt->path.depth = to_le16(eh->depth);
1006 spt->path.maxdepth = 0;
1007 spt->path.extent = ex;
1008 spt->path.index = NULL;
1009 spt->path.header = eh;
1010 spt->path.block = bh;
1013 * If newext->ee_block can be included into the
1016 if (to_le32(newext->first_block) >=
1017 ext4_ext_block_index(ext_block_hdr(&bh)))
1021 curp->p_block = ext4_ext_pblock(ex);
1027 curp->p_block = ext4_ext_pblock(ex);
/*
 * ext4_ext_grow_indepth:
 * implements tree growing procedure:
 * - allocates new block
 * - moves top-level data (index block or leaf) into the new block
 * - initializes new top-level, creating index that points to the
 *   just created block
 * NOTE(review): the rest of the parameter list (a flags argument) and
 * several error-check branches are elided in this listing.
 */
static int ext4_ext_grow_indepth(struct ext4_inode_ref *inode_ref,
	struct ext4_extent_header *neh;
	struct ext4_block bh = EXT4_BLOCK_ZERO();
	ext4_fsblk_t newblock, goal = 0;

	/* Try to prepend new index to old one */
	if (ext_depth(inode_ref->inode))
		goal = ext4_idx_pblock(
		    EXT_FIRST_INDEX(ext_inode_hdr(inode_ref->inode)));

	newblock = ext4_new_meta_blocks(inode_ref, goal, flags, NULL, &err);

	err = ext4_block_get(inode_ref->fs->bdev, &bh, newblock);
		/* Buffer unavailable: undo the allocation. */
		ext4_ext_free_blocks(inode_ref, newblock, 1, 0);

	/* move top-level index/leaf into new block */
	memmove(bh.data, inode_ref->inode->blocks,
		sizeof(inode_ref->inode->blocks));

	/* set size of new block */
	neh = ext_block_hdr(&bh);
	/* old root could have indexes or leaves
	 * so calculate e_max right way */
	if (ext_depth(inode_ref->inode))
		neh->max_entries_count =
		    to_le16(ext4_ext_space_block_idx(inode_ref));
		neh->max_entries_count =
		    to_le16(ext4_ext_space_block(inode_ref));

	neh->magic = to_le16(EXT4_EXTENT_MAGIC);
	ext4_extent_block_csum_set(inode_ref, neh);

	/* Update top-level index: num,max,pointer */
	neh = ext_inode_hdr(inode_ref->inode);
	neh->entries_count = to_le16(1);
	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
	if (neh->depth == 0) {
		/* Root extent block becomes index block */
		neh->max_entries_count =
		    to_le16(ext4_ext_space_root_idx(inode_ref));
		EXT_FIRST_INDEX(neh)
		    ->first_block = EXT_FIRST_EXTENT(neh)->first_block;
	neh->depth = to_le16(to_le16(neh->depth) + 1);

	inode_ref->dirty = true;
	ext4_block_set(inode_ref->fs->bdev, &bh);
/*
 * Debug helper: dump each level of @path (depth, physical block, and
 * the extent/index offsets inside the node). NOTE(review): the loop
 * header and the offset computations' surrounding expressions are
 * elided in this listing.
 */
__unused static void print_path(struct ext4_extent_path *path)
	int32_t i = path->depth;
		/* Offset of the current extent within its leaf, if any. */
		    ? (path->extent - EXT_FIRST_EXTENT(path->header))
		/* Offset of the current index within its node, if any. */
		    ? (path->index - EXT_FIRST_INDEX(path->header))

		ext4_dbg(DEBUG_EXTENT,
			 "depth %" PRId32 ", p_block: %" PRIu64 ","
			 "p_ext offset: %td, p_idx offset: %td\n",
			 i, path->p_block, a, b);
/*
 * Swap the path entry at level (depth - level) for the one produced by
 * a split transaction, releasing the old buffer reference first.
 * NOTE(review): the tail of the parameter list (the `level` argument)
 * is elided in this listing — confirm against the full source.
 */
static void ext4_ext_replace_path(struct ext4_inode_ref *inode_ref,
				  struct ext4_extent_path *path,
				  struct ext_split_trans *spt, int32_t depth,
	int32_t i = depth - level;

	/* keep_other=1: drop only this level's buffer reference. */
	ext4_ext_drop_refs(inode_ref, path + i, 1);
	path[i] = spt->path;
/*
 * Insert @newext into the tree, splitting nodes upward as needed. The
 * first pass inserts into the leaf; each split propagates a new index
 * to the parent via a per-level ext_split_trans array. If even the
 * root is full, the tree is grown one level and the walk is retried.
 * NOTE(review): this listing elides the main loop header, error
 * returns and cleanup labels; only surviving lines are annotated.
 */
static int ext4_ext_insert_extent(struct ext4_inode_ref *inode_ref,
				  struct ext4_extent_path **ppath,
				  struct ext4_extent *newext, uint32_t flags)
	int32_t i, depth, level;
	ext4_fsblk_t ptr = 0;
	struct ext4_extent_path *path = *ppath;
	struct ext_split_trans *spt = NULL;
	struct ext_split_trans newblock;

	memset(&newblock, 0, sizeof(newblock));

	/* Find the deepest level that still has a free index slot; that
	 * bounds how many splits (levels) we may need to record. */
	depth = ext_depth(inode_ref->inode);
	for (i = depth, level = 0; i >= 0; i--, level++)
		if (EXT_HAS_FREE_INDEX(path + i))

	spt = calloc(1, sizeof(struct ext_split_trans) * (level));

	depth = ext_depth(inode_ref->inode);
		/* Level 0 of the walk: insert the extent into the leaf. */
		ret = ext4_ext_insert_leaf(inode_ref, path, depth - i,
					   newext, &newblock, flags);
		/* Higher levels: link the sibling produced one level below. */
		ret = ext4_ext_insert_index(
		    inode_ref, path, depth - i, newext,
		    ext4_ext_block_index(
			ext_block_hdr(&spt[i - 1].path.block)),
		    spt[i - 1].ptr, &newblock);

	if (ret && ret != EXT_INODE_HDR_NEED_GROW)
	else if (spt && ptr && !ret) {
		/* Prepare for the next iteration after splitting. */

	} while (ptr != 0 && i <= depth);

	if (ret == EXT_INODE_HDR_NEED_GROW) {
		/* Even the root was full: add a level and retry. */
		ret = ext4_ext_grow_indepth(inode_ref, 0);

		ret = ext4_find_extent(inode_ref, to_le32(newext->first_block),

	/* Failure: unwind — free sibling blocks and their buffers. */
	ext4_ext_drop_refs(inode_ref, path, 0);

	while (--level >= 0 && spt) {
		if (spt[level].ptr) {
			ext4_ext_free_blocks(inode_ref, spt[level].ptr,
			ext4_ext_drop_refs(inode_ref, &spt[level].path,

	/* Success: adopt or drop each recorded sibling path. */
	while (--level >= 0 && spt) {
		if (spt[level].switch_to)
			ext4_ext_replace_path(inode_ref, path, spt,
		else if (spt[level].ptr)
			ext4_ext_drop_refs(inode_ref, &spt[level].path,
1234 static void ext4_ext_remove_blocks(struct ext4_inode_ref *inode_ref,
1235 struct ext4_extent *ex, ext4_lblk_t from,
1238 ext4_lblk_t len = to - from + 1;
1241 num = from - to_le32(ex->first_block);
1242 start = ext4_ext_pblock(ex) + num;
1243 ext4_dbg(DEBUG_EXTENT,
1244 "Freeing %" PRIu32 " at %" PRIu64 ", %" PRIu32 "\n", from,
1247 ext4_ext_free_blocks(inode_ref, start, len, 0);
/*
 * Remove the index pointed to by @path at level @depth and free the
 * child block it referenced; then propagate the new first_block of
 * this level to the parent indexes where needed.
 * NOTE(review): the loop headers and error returns are elided in this
 * listing; only surviving lines are annotated.
 */
static int ext4_ext_remove_idx(struct ext4_inode_ref *inode_ref,
			       struct ext4_extent_path *path, int32_t depth)
	/* free index block */
	leaf = ext4_idx_pblock(path[i].index);

	/* Close the gap left by the removed index. */
	if (path[i].index != EXT_LAST_INDEX(path[i].header)) {
		ptrdiff_t len = EXT_LAST_INDEX(path[i].header) - path[i].index;
		memmove(path[i].index, path[i].index + 1,
			len * sizeof(struct ext4_extent_index));

	path[i].header->entries_count =
	    to_le16(to_le16(path[i].header->entries_count) - 1);
	err = ext4_ext_dirty(inode_ref, path + i);

	ext4_dbg(DEBUG_EXTENT, "IDX: Freeing %" PRIu32 " at %" PRIu64 ", %d\n",
		 to_le32(path[i].index->first_block), leaf, 1);
	ext4_ext_free_blocks(inode_ref, leaf, 1, 0);

	/* Walk upward fixing first_block while the removed entry was the
	 * first of its node. */
	if (path[i].index != EXT_FIRST_INDEX(path[i].header))
	path[i - 1].index->first_block = path[i].index->first_block;
	err = ext4_ext_dirty(inode_ref, path + i - 1);
/*
 * Remove all blocks in [@from, @to] from the leaf addressed by @path:
 * trims or deletes each affected extent, frees the backing blocks,
 * compacts the leaf, and removes the leaf from its parent when it
 * becomes empty.
 * NOTE(review): several lines (local declarations, the `if (start <
 * from)` branch header, loop tail) are elided in this listing.
 */
static int ext4_ext_remove_leaf(struct ext4_inode_ref *inode_ref,
				struct ext4_extent_path *path, ext4_lblk_t from,
	int32_t depth = ext_depth(inode_ref->inode);
	struct ext4_extent *ex = path[depth].extent;
	struct ext4_extent *start_ex, *ex2 = NULL;
	struct ext4_extent_header *eh = path[depth].header;
	uint16_t new_entries;

	new_entries = to_le16(eh->entries_count);
	while (ex <= EXT_LAST_EXTENT(path[depth].header) &&
	       to_le32(ex->first_block) <= to) {
		int32_t new_len = 0;
		ext4_fsblk_t start, new_start;
		new_start = start = to_le32(ex->first_block);
		len = ext4_ext_get_actual_len(ex);
			/* Keep the head of the extent that lies before @from. */
			len -= from - start;
			new_len = from - start;
		if (start + len - 1 > to) {
			/* Keep the tail of the extent that lies after @to. */
			len -= start + len - 1 - to;
			new_len = start + len - 1 - to;
			new_start += to + 1;

		ext4_ext_remove_blocks(inode_ref, ex, start, start + len - 1);
		ex->first_block = to_le32(new_start);
			/* Preserve the unwritten flag across the length
			 * rewrite. */
			unwritten = ext4_ext_is_unwritten(ex);
			ex->block_count = to_le16(new_len);
				ext4_ext_mark_unwritten(ex);

	/* FIXME(review): memmove length is an element count, not a byte
	 * count — it should be
	 * (EXT_LAST_EXTENT(eh) - ex2 + 1) * sizeof(struct ext4_extent).
	 * As written this under-copies and corrupts the leaf. */
	if (ex2 <= EXT_LAST_EXTENT(eh))
		memmove(start_ex, ex2, EXT_LAST_EXTENT(eh) - ex2 + 1);

	eh->entries_count = to_le16(new_entries);
	ext4_ext_dirty(inode_ref, path + depth);
	if (path[depth].extent == EXT_FIRST_EXTENT(eh) && eh->entries_count)
		err = ext4_ext_correct_indexes(inode_ref, path);

	/* if this leaf is free, then we should
	 * remove it from index block above */
	if (err == EOK && eh->entries_count == 0 && path[depth].block.lb_id)
		err = ext4_ext_remove_idx(inode_ref, path, depth - 1);
1358 static int ext4_ext_more_to_rm(struct ext4_extent_path *path, ext4_lblk_t to)
1360 if (!to_le16(path->header->entries_count))
1363 if (path->index > EXT_LAST_INDEX(path->header))
1366 if (to_le32(path->index->first_block) > to)
/*
 * Public entry: remove all blocks in [@from, @to] from the extent tree
 * of @inode_ref. Descends to the leaf containing @from, removes leaf
 * ranges, and iteratively walks sibling subtrees via the path's index
 * pointers, pruning emptied interior nodes on the way back up.
 * NOTE(review): this listing elides the main loop headers, several
 * returns and closing braces; only surviving lines are annotated.
 */
int ext4_ext_remove_space(struct ext4_inode_ref *inode_ref, ext4_lblk_t from,
	struct ext4_extent_path *path = NULL;
	int32_t depth = ext_depth(inode_ref->inode);

	ret = ext4_find_extent(inode_ref, from, &path, 0);

	/* Nothing to do when @from is not inside any extent. */
	if (!path[depth].extent ||
	    !IN_RANGE(from, to_le32(path[depth].extent->first_block),
		      ext4_ext_get_actual_len(path[depth].extent))) {

		/* Leaf level: clamp the removal range to this leaf and
		 * strip the affected extents. */
		struct ext4_extent_header *eh;
		struct ext4_extent *first_ex, *last_ex;
		ext4_lblk_t leaf_from, leaf_to;
		eh = path[i].header;
		ext4_assert(to_le16(eh->entries_count) > 0);
		first_ex = EXT_FIRST_EXTENT(eh);
		last_ex = EXT_LAST_EXTENT(eh);
		leaf_from = to_le32(first_ex->first_block);
		leaf_to = to_le32(last_ex->first_block) +
			  ext4_ext_get_actual_len(last_ex) - 1;
		if (leaf_from < from)

		ext4_ext_remove_leaf(inode_ref, path, leaf_from,
		ext4_ext_drop_refs(inode_ref, path + i, 0);

		/* Interior level: descend into the next child subtree or
		 * prune this node if it emptied out. */
		struct ext4_extent_header *eh;
		eh = path[i].header;
		if (ext4_ext_more_to_rm(path + i, to)) {
			struct ext4_block bh = EXT4_BLOCK_ZERO();
			if (path[i + 1].block.lb_id)
				ext4_ext_drop_refs(inode_ref,

			ret = read_extent_tree_block(
			    inode_ref, ext4_idx_pblock(path[i].index),
			    depth - i - 1, &bh, 0);

			    ext4_idx_pblock(path[i].index);
			path[i + 1].block = bh;
			path[i + 1].header = ext_block_hdr(&bh);
			path[i + 1].depth = depth - i - 1;
				path[i + 1].extent = EXT_FIRST_EXTENT(
				    path[i + 1].header);
				EXT_FIRST_INDEX(path[i + 1].header);

			if (!eh->entries_count && i > 0) {
				ret = ext4_ext_remove_idx(inode_ref,
				ext4_block_set(inode_ref->fs->bdev,

	/* TODO: flexible tree reduction should be here */
	if (path->header->entries_count == 0) {
		/*
		 * truncate to zero freed all the tree,
		 * so we need to correct eh_depth
		 */
		ext_inode_hdr(inode_ref->inode)->depth = 0;
		ext_inode_hdr(inode_ref->inode)->max_entries_count =
		    to_le16(ext4_ext_space_root(inode_ref));
		ret = ext4_ext_dirty(inode_ref, path);

	ext4_ext_drop_refs(inode_ref, path, 0);
/*
 * Split the extent covering logical block @split into two at @split,
 * optionally marking either half unwritten per @split_flag
 * (EXT4_EXT_MARK_UNWRIT1/2). Used by extent splitting.
 * NOTE(review): several error-check lines, labels and the final return
 * are elided in this listing; only surviving lines are annotated.
 */
static int ext4_ext_split_extent_at(struct ext4_inode_ref *inode_ref,
				    struct ext4_extent_path **ppath,
				    ext4_lblk_t split, uint32_t split_flag)
	struct ext4_extent *ex, newex;
	ext4_fsblk_t newblock;
	ext4_lblk_t ee_block;
	int32_t depth = ext_depth(inode_ref->inode);

	ex = (*ppath)[depth].extent;
	ee_block = to_le32(ex->first_block);
	ee_len = ext4_ext_get_actual_len(ex);
	/* Physical block where the second half starts. */
	newblock = split - ee_block + ext4_ext_pblock(ex);

	if (split == ee_block) {
		/*
		 * case b: block @split is the block that the extent begins with
		 * then we just change the state of the extent, and splitting
		 */
		if (split_flag & EXT4_EXT_MARK_UNWRIT2)
			ext4_ext_mark_unwritten(ex);
			ext4_ext_mark_initialized(ex);

		err = ext4_ext_dirty(inode_ref, *ppath + depth);

	/* General case: shrink the original to [ee_block, split) ... */
	ex->block_count = to_le16(split - ee_block);
	if (split_flag & EXT4_EXT_MARK_UNWRIT1)
		ext4_ext_mark_unwritten(ex);

	err = ext4_ext_dirty(inode_ref, *ppath + depth);

	/* ... and insert the second half [split, ee_block + ee_len). */
	newex.first_block = to_le32(split);
	newex.block_count = to_le16(ee_len - (split - ee_block));
	ext4_ext_store_pblock(&newex, newblock);
	if (split_flag & EXT4_EXT_MARK_UNWRIT2)
		ext4_ext_mark_unwritten(&newex);
	err = ext4_ext_insert_extent(inode_ref, ppath, &newex,
				     EXT4_EXT_NO_COMBINE);
		goto restore_extent_len;

	/* Insert failed: restore the original extent length. */
	ex->block_count = to_le16(ee_len);
	err = ext4_ext_dirty(inode_ref, *ppath + depth);
1534 static int ext4_ext_convert_to_initialized(struct ext4_inode_ref *inode_ref,
1535 struct ext4_extent_path **ppath,
1536 ext4_lblk_t split, uint32_t blocks)
1538 int32_t depth = ext_depth(inode_ref->inode), err = EOK;
1539 struct ext4_extent *ex = (*ppath)[depth].extent;
1541 ext4_assert(to_le32(ex->first_block) <= split);
1543 if (split + blocks ==
1544 to_le32(ex->first_block) + ext4_ext_get_actual_len(ex)) {
1545 /* split and initialize right part */
1546 err = ext4_ext_split_extent_at(inode_ref, ppath, split,
1547 EXT4_EXT_MARK_UNWRIT1);
1548 } else if (to_le32(ex->first_block) == split) {
1549 /* split and initialize left part */
1550 err = ext4_ext_split_extent_at(inode_ref, ppath, split + blocks,
1551 EXT4_EXT_MARK_UNWRIT2);
1553 /* split 1 extent to 3 and initialize the 2nd */
1554 err = ext4_ext_split_extent_at(inode_ref, ppath, split + blocks,
1555 EXT4_EXT_MARK_UNWRIT1 |
1556 EXT4_EXT_MARK_UNWRIT2);
1558 err = ext4_ext_split_extent_at(inode_ref, ppath, split,
1559 EXT4_EXT_MARK_UNWRIT1);
1566 int ext4_ext_tree_init(struct ext4_inode_ref *inode_ref)
1568 struct ext4_extent_header *eh;
1570 eh = ext_inode_hdr(inode_ref->inode);
1572 eh->entries_count = 0;
1573 eh->magic = to_le16(EXT4_EXTENT_MAGIC);
1574 eh->max_entries_count = to_le16(ext4_ext_space_root(inode_ref));
1575 inode_ref->dirty = true;
/*
 * ext4_ext_next_allocated_block:
 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
 * NOTE: it considers block number from index entry as
 * allocated block. Thus, index entries have to be consistent
 * with the leaves.
 */
/* Fully parenthesized so the cast cannot associate with neighbouring
 * operators at the macro's expansion site (CERT PRE01-C). */
#define EXT_MAX_BLOCKS ((ext4_lblk_t)-1)
1588 static ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_extent_path *path)
1592 depth = path->depth;
1594 if (depth == 0 && path->extent == NULL)
1595 return EXT_MAX_BLOCKS;
1597 while (depth >= 0) {
1598 if (depth == path->depth) {
1600 if (path[depth].extent &&
1601 path[depth].extent !=
1602 EXT_LAST_EXTENT(path[depth].header))
1604 path[depth].extent[1].first_block);
1607 if (path[depth].index !=
1608 EXT_LAST_INDEX(path[depth].header))
1610 path[depth].index[1].first_block);
1615 return EXT_MAX_BLOCKS;
1618 static int ext4_ext_zero_unwritten_range(struct ext4_inode_ref *inode_ref,
1620 uint32_t blocks_count)
1624 uint32_t block_size = ext4_sb_get_block_size(&inode_ref->fs->sb);
1625 for (i = 0; i < blocks_count; i++) {
1626 uint32_t block_u32 = (uint32_t)block + (uint32_t)i;
1627 struct ext4_block bh = EXT4_BLOCK_ZERO();
1628 err = ext4_block_get(inode_ref->fs->bdev, &bh, block_u32);
1632 memset(bh.data, 0, block_size);
1634 err = ext4_block_set(inode_ref->fs->bdev, &bh);
1641 int ext4_ext_get_blocks(struct ext4_inode_ref *inode_ref, ext4_fsblk_t iblock,
1642 uint32_t max_blocks, ext4_fsblk_t *result, bool create,
1643 uint32_t *blocks_count)
1645 struct ext4_extent_path *path = NULL;
1646 struct ext4_extent newex, *ex;
1650 uint32_t allocated = 0;
1651 ext4_fsblk_t next, newblock;
1659 /* find extent for this block */
1660 err = ext4_find_extent(inode_ref, iblock, &path, 0);
1666 depth = ext_depth(inode_ref->inode);
1669 * consistent leaf must not be empty
1670 * this situations is possible, though, _during_ tree modification
1671 * this is why assert can't be put in ext4_ext_find_extent()
1673 if ((ex = path[depth].extent)) {
1674 ext4_lblk_t ee_block = to_le32(ex->first_block);
1675 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
1676 uint16_t ee_len = ext4_ext_get_actual_len(ex);
1677 /* if found exent covers block, simple return it */
1678 if (IN_RANGE(iblock, ee_block, ee_len)) {
1679 /* number of remain blocks in the extent */
1680 allocated = ee_len - (iblock - ee_block);
1682 if (ext4_ext_is_unwritten(ex)) {
1684 uint32_t zero_range;
1685 zero_range = allocated;
1686 if (zero_range > max_blocks)
1687 zero_range = max_blocks;
1689 newblock = iblock - ee_block + ee_start;
1690 err = ext4_ext_zero_unwritten_range(
1691 inode_ref, newblock, zero_range);
1695 err = ext4_ext_convert_to_initialized(
1696 inode_ref, &path, iblock,
1705 newblock = iblock - ee_block + ee_start;
1712 * requested block isn't allocated yet
1713 * we couldn't try to create block if create flag is zero
1719 /* find next allocated block so that we know how many
1720 * blocks we can allocate without ovelapping next extent */
1721 next = ext4_ext_next_allocated_block(path);
1722 allocated = next - iblock;
1723 if (allocated > max_blocks)
1724 allocated = max_blocks;
1726 /* allocate new block */
1727 goal = ext4_ext_find_goal(path, iblock);
1728 newblock = ext4_new_meta_blocks(inode_ref, goal, 0, &allocated, &err);
1732 /* try to insert new extent into found leaf and return */
1733 newex.first_block = to_le32(iblock);
1734 ext4_ext_store_pblock(&newex, newblock);
1735 newex.block_count = to_le16(allocated);
1736 err = ext4_ext_insert_extent(inode_ref, &path, &newex, 0);
1738 /* free data blocks we just allocated */
1739 ext4_ext_free_blocks(inode_ref, ext4_ext_pblock(&newex),
1740 to_le16(newex.block_count), 0);
1744 /* previous routine could use block we allocated */
1745 newblock = ext4_ext_pblock(&newex);
1748 if (allocated > max_blocks)
1749 allocated = max_blocks;
1755 *blocks_count = allocated;
1759 ext4_ext_drop_refs(inode_ref, path, 0);