ext4_journal: initialize block tag with 0 before writing to it.
[lwext4.git] / lwext4 / ext4_journal.c
index aeb5565a72473d9a36a8996ecb0938c84e899d86..e9171296412d2857440e4e8247cb3470a314c6dd 100644 (file)
@@ -39,6 +39,7 @@
 #include "ext4_types.h"
 #include "ext4_fs.h"
 #include "ext4_super.h"
+#include "ext4_journal.h"
 #include "ext4_errno.h"
 #include "ext4_blockdev.h"
 #include "ext4_crc32c.h"
 #include <string.h>
 #include <stdlib.h>
 
+/**@brief  Revoke entry during journal replay.*/
 struct revoke_entry {
+       /**@brief  Block number not to be replayed.*/
        ext4_fsblk_t block;
+
+       /**@brief  For any transaction id smaller
+        *         than trans_id, records of @block
+        *         in those transactions should not
+        *         be replayed.*/
        uint32_t trans_id;
+
+       /**@brief  Revoke tree node.*/
        RB_ENTRY(revoke_entry) revoke_node;
 };
 
+/**@brief  Valid journal replay information.*/
 struct recover_info {
+       /**@brief  Starting transaction id.*/
        uint32_t start_trans_id;
+
+       /**@brief  Ending transaction id.*/
        uint32_t last_trans_id;
+
+       /**@brief  Used as internal argument.*/
        uint32_t this_trans_id;
+
+       /**@brief  RB-Tree storing revoke entries.*/
        RB_HEAD(jbd_revoke, revoke_entry) revoke_root;
 };
 
+/**@brief  Journal replay internal arguments.*/
 struct replay_arg {
+       /**@brief  Journal replay information.*/
        struct recover_info *info;
+
+       /**@brief  Current block we are on.*/
        uint32_t *this_block;
+
+       /**@brief  Current trans_id we are on.*/
        uint32_t this_trans_id;
 };
 
@@ -83,11 +107,11 @@ RB_GENERATE_INTERNAL(jbd_revoke, revoke_entry, revoke_node,
 #define jbd_alloc_revoke_entry() calloc(1, sizeof(struct revoke_entry))
 #define jbd_free_revoke_entry(addr) free(addr)
 
-int jbd_inode_bmap(struct jbd_fs *jbd_fs,
-                  ext4_lblk_t iblock,
-                  ext4_fsblk_t *fblock);
-
-int jbd_sb_write(struct jbd_fs *jbd_fs, struct jbd_sb *s)
+/**@brief  Write jbd superblock to disk.
+ * @param  jbd_fs jbd filesystem
+ * @param  s jbd superblock
+ * @return standard error code*/
+static int jbd_sb_write(struct jbd_fs *jbd_fs, struct jbd_sb *s)
 {
        int rc;
        struct ext4_fs *fs = jbd_fs->inode_ref.fs;
@@ -102,7 +126,11 @@ int jbd_sb_write(struct jbd_fs *jbd_fs, struct jbd_sb *s)
                                     EXT4_SUPERBLOCK_SIZE);
 }
 
-int jbd_sb_read(struct jbd_fs *jbd_fs, struct jbd_sb *s)
+/**@brief  Read jbd superblock from disk.
+ * @param  jbd_fs jbd filesystem
+ * @param  s jbd superblock
+ * @return standard error code*/
+static int jbd_sb_read(struct jbd_fs *jbd_fs, struct jbd_sb *s)
 {
        int rc;
        struct ext4_fs *fs = jbd_fs->inode_ref.fs;
@@ -117,6 +145,9 @@ int jbd_sb_read(struct jbd_fs *jbd_fs, struct jbd_sb *s)
                                    EXT4_SUPERBLOCK_SIZE);
 }
 
+/**@brief  Verify jbd superblock.
+ * @param  sb jbd superblock
+ * @return true if jbd superblock is valid */
 static bool jbd_verify_sb(struct jbd_sb *sb)
 {
        struct jbd_bhdr *header = &sb->header;
@@ -130,6 +161,9 @@ static bool jbd_verify_sb(struct jbd_sb *sb)
        return true;
 }
 
+/**@brief  Write back dirty jbd superblock to disk.
+ * @param  jbd_fs jbd filesystem
+ * @return standard error code*/
 static int jbd_write_sb(struct jbd_fs *jbd_fs)
 {
        int rc = EOK;
@@ -143,6 +177,10 @@ static int jbd_write_sb(struct jbd_fs *jbd_fs)
        return rc;
 }
 
+/**@brief  Get reference to jbd filesystem.
+ * @param  fs Filesystem to load journal of
+ * @param  jbd_fs jbd filesystem
+ * @return standard error code*/
 int jbd_get_fs(struct ext4_fs *fs,
               struct jbd_fs *jbd_fs)
 {
@@ -150,6 +188,9 @@ int jbd_get_fs(struct ext4_fs *fs,
        uint32_t journal_ino;
 
        memset(jbd_fs, 0, sizeof(struct jbd_fs));
+       /* See if there is journal inode on this filesystem.*/
+       /* FIXME: detection of the existence of a journal bdev is
+        *        missing.*/
        journal_ino = ext4_get32(&fs->sb, journal_inode_number);
 
        rc = ext4_fs_get_inode_ref(fs,
@@ -174,6 +215,9 @@ int jbd_get_fs(struct ext4_fs *fs,
        return rc;
 }
 
+/**@brief  Put reference of jbd filesystem.
+ * @param  jbd_fs jbd filesystem
+ * @return standard error code*/
 int jbd_put_fs(struct jbd_fs *jbd_fs)
 {
        int rc = EOK;
@@ -183,6 +227,11 @@ int jbd_put_fs(struct jbd_fs *jbd_fs)
        return rc;
 }
 
+/**@brief  Data block lookup helper.
+ * @param  jbd_fs jbd filesystem
+ * @param  iblock block index
+ * @param  fblock logical block address
+ * @return standard error code*/
 int jbd_inode_bmap(struct jbd_fs *jbd_fs,
                   ext4_lblk_t iblock,
                   ext4_fsblk_t *fblock)
@@ -195,13 +244,21 @@ int jbd_inode_bmap(struct jbd_fs *jbd_fs,
        return rc;
 }
 
-int jbd_block_get(struct jbd_fs *jbd_fs,
+/**@brief   jbd block get function (through cache).
+ * @param   jbd_fs jbd filesystem
+ * @param   block block descriptor
+ * @param   fblock jbd logical block address
+ * @return  standard error code*/
+static int jbd_block_get(struct jbd_fs *jbd_fs,
                  struct ext4_block *block,
                  ext4_fsblk_t fblock)
 {
        /* TODO: journal device. */
        int rc;
        ext4_lblk_t iblock = (ext4_lblk_t)fblock;
+
+       /* Lookup the logical block address of
+        * fblock.*/
        rc = jbd_inode_bmap(jbd_fs, iblock,
                            &fblock);
        if (rc != EOK)
@@ -209,10 +266,21 @@ int jbd_block_get(struct jbd_fs *jbd_fs,
 
        struct ext4_blockdev *bdev = jbd_fs->inode_ref.fs->bdev;
        rc = ext4_block_get(bdev, block, fblock);
+
+       /* If succeeded, mark buffer as BC_FLUSH to indicate
+        * that data should be written to disk immediately.*/
+       if (rc == EOK)
+               ext4_bcache_set_flag(block->buf, BC_FLUSH);
+
        return rc;
 }
 
-int jbd_block_get_noread(struct jbd_fs *jbd_fs,
+/**@brief   jbd block get function (through cache, don't read).
+ * @param   jbd_fs jbd filesystem
+ * @param   block block descriptor
+ * @param   fblock jbd logical block address
+ * @return  standard error code*/
+static int jbd_block_get_noread(struct jbd_fs *jbd_fs,
                         struct ext4_block *block,
                         ext4_fsblk_t fblock)
 {
@@ -226,29 +294,41 @@ int jbd_block_get_noread(struct jbd_fs *jbd_fs,
 
        struct ext4_blockdev *bdev = jbd_fs->inode_ref.fs->bdev;
        rc = ext4_block_get_noread(bdev, block, fblock);
+       if (rc == EOK)
+               ext4_bcache_set_flag(block->buf, BC_FLUSH);
+
        return rc;
 }
 
-int jbd_block_set(struct jbd_fs *jbd_fs,
+/**@brief   jbd block set procedure (through cache).
+ * @param   jbd_fs jbd filesystem
+ * @param   block block descriptor
+ * @return  standard error code*/
+static int jbd_block_set(struct jbd_fs *jbd_fs,
                  struct ext4_block *block)
 {
        return ext4_block_set(jbd_fs->inode_ref.fs->bdev,
                              block);
 }
 
-/*
- * helper functions to deal with 32 or 64bit block numbers.
- */
-int jbd_tag_bytes(struct jbd_fs *jbd_fs)
+/**@brief  helper functions to calculate
+ *         block tag size, not including UUID part.
+ * @param  jbd_fs jbd filesystem
+ * @return tag size in bytes*/
+static int jbd_tag_bytes(struct jbd_fs *jbd_fs)
 {
        int size;
 
+       /* It is very easy to deal with the case in which
+        * JBD_FEATURE_INCOMPAT_CSUM_V3 is enabled.*/
        if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
                                     JBD_FEATURE_INCOMPAT_CSUM_V3))
                return sizeof(struct jbd_block_tag3);
 
        size = sizeof(struct jbd_block_tag);
 
+       /* If JBD_FEATURE_INCOMPAT_CSUM_V2 is enabled,
+        * add 2 bytes to size.*/
        if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
                                     JBD_FEATURE_INCOMPAT_CSUM_V2))
                size += sizeof(uint16_t);
@@ -257,18 +337,35 @@ int jbd_tag_bytes(struct jbd_fs *jbd_fs)
                                     JBD_FEATURE_INCOMPAT_64BIT))
                return size;
 
+       /* If block number is 4 bytes in size,
+        * subtract 4 bytes from size */
        return size - sizeof(uint32_t);
 }
 
-/**@brief: tag information. */
+/**@brief  Tag information. */
 struct tag_info {
+       /**@brief  Tag size in bytes, including UUID part.*/
        int tag_bytes;
+
+       /**@brief  block number stored in this tag.*/
        ext4_fsblk_t block;
+
+       /**@brief  whether UUID part exists or not.*/
        bool uuid_exist;
+
+       /**@brief  UUID content if UUID part exists.*/
        uint8_t uuid[UUID_SIZE];
+
+       /**@brief  Is this the last tag? */
        bool last_tag;
 };
 
+/**@brief  Extract information from a block tag.
+ * @param  __tag pointer to the block tag
+ * @param  tag_bytes block tag size of this jbd filesystem
+ * @param  remain_buf_size remaining size in buffer containing the block tag
+ * @param  tag_info information of this tag.
+ * @return  EOK on success, otherwise return EINVAL.*/
 static int
 jbd_extract_block_tag(struct jbd_fs *jbd_fs,
                      void *__tag,
@@ -281,6 +378,7 @@ jbd_extract_block_tag(struct jbd_fs *jbd_fs,
        tag_info->uuid_exist = false;
        tag_info->last_tag = false;
 
+       /* See whether it is possible to hold a valid block tag.*/
        if (remain_buf_size - tag_bytes < 0)
                return EINVAL;
 
@@ -297,6 +395,7 @@ jbd_extract_block_tag(struct jbd_fs *jbd_fs,
                        tag_info->block = 0;
 
                if (!(jbd_get32(tag, flags) & JBD_FLAG_SAME_UUID)) {
+                       /* See whether it is possible to hold UUID part.*/
                        if (remain_buf_size - tag_bytes < UUID_SIZE)
                                return EINVAL;
 
@@ -321,6 +420,7 @@ jbd_extract_block_tag(struct jbd_fs *jbd_fs,
                        tag_info->block = 0;
 
                if (!(jbd_get16(tag, flags) & JBD_FLAG_SAME_UUID)) {
+                       /* See whether it is possible to hold UUID part.*/
                        if (remain_buf_size - tag_bytes < UUID_SIZE)
                                return EINVAL;
 
@@ -337,6 +437,11 @@ jbd_extract_block_tag(struct jbd_fs *jbd_fs,
        return EOK;
 }
 
+/**@brief  Write information to a block tag.
+ * @param  __tag pointer to the block tag
+ * @param  remain_buf_size remaining size in buffer containing the block tag
+ * @param  tag_info information of this tag.
+ * @return  EOK on success, otherwise return EINVAL.*/
 static int
 jbd_write_block_tag(struct jbd_fs *jbd_fs,
                    void *__tag,
@@ -348,27 +453,30 @@ jbd_write_block_tag(struct jbd_fs *jbd_fs,
 
        tag_info->tag_bytes = tag_bytes;
 
+       /* See whether it is possible to hold a valid block tag.*/
        if (remain_buf_size - tag_bytes < 0)
                return EINVAL;
 
        if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
                                     JBD_FEATURE_INCOMPAT_CSUM_V3)) {
                struct jbd_block_tag3 *tag = __tag;
+               memset(tag, 0, sizeof(struct jbd_block_tag3));
                jbd_set32(tag, blocknr, tag_info->block);
                if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
                                             JBD_FEATURE_INCOMPAT_64BIT))
                        jbd_set32(tag, blocknr_high, tag_info->block >> 32);
 
-               if (!tag_info->uuid_exist) {
+               if (tag_info->uuid_exist) {
+                       /* See whether it is possible to hold UUID part.*/
                        if (remain_buf_size - tag_bytes < UUID_SIZE)
                                return EINVAL;
 
                        uuid_start = (char *)tag + tag_bytes;
                        tag_info->tag_bytes += UUID_SIZE;
                        memcpy(uuid_start, tag_info->uuid, UUID_SIZE);
+               } else
                        jbd_set32(tag, flags,
                                  jbd_get32(tag, flags) | JBD_FLAG_SAME_UUID);
-               }
 
                if (tag_info->last_tag)
                        jbd_set32(tag, flags,
@@ -376,21 +484,23 @@ jbd_write_block_tag(struct jbd_fs *jbd_fs,
 
        } else {
                struct jbd_block_tag *tag = __tag;
+               memset(tag, 0, sizeof(struct jbd_block_tag));
                jbd_set32(tag, blocknr, tag_info->block);
                if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
                                             JBD_FEATURE_INCOMPAT_64BIT))
                        jbd_set32(tag, blocknr_high, tag_info->block >> 32);
 
-               if (!tag_info->uuid_exist) {
+               if (tag_info->uuid_exist) {
+                       /* See whether it is possible to hold UUID part.*/
                        if (remain_buf_size - tag_bytes < UUID_SIZE)
                                return EINVAL;
 
                        uuid_start = (char *)tag + tag_bytes;
                        tag_info->tag_bytes += UUID_SIZE;
                        memcpy(uuid_start, tag_info->uuid, UUID_SIZE);
+               } else
                        jbd_set16(tag, flags,
                                  jbd_get16(tag, flags) | JBD_FLAG_SAME_UUID);
-               }
 
                if (tag_info->last_tag)
                        jbd_set16(tag, flags,
@@ -400,6 +510,13 @@ jbd_write_block_tag(struct jbd_fs *jbd_fs,
        return EOK;
 }
 
+/**@brief  Iterate all block tags in a block.
+ * @param  jbd_fs jbd filesystem
+ * @param  __tag_start pointer to the block
+ * @param  tag_tbl_size size of the block
+ * @param  func callback routine to indicate that
+ *         a block tag is found
+ * @param  arg additional argument to be passed to func */
 static void
 jbd_iterate_block_table(struct jbd_fs *jbd_fs,
                        void *__tag_start,
@@ -415,6 +532,7 @@ jbd_iterate_block_table(struct jbd_fs *jbd_fs,
        tag_start = __tag_start;
        tag_ptr = tag_start;
 
+       /* Cut off the size of block tail storing checksum. */
        if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
                                     JBD_FEATURE_INCOMPAT_CSUM_V2) ||
            JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
@@ -434,6 +552,7 @@ jbd_iterate_block_table(struct jbd_fs *jbd_fs,
                if (func)
                        func(jbd_fs, tag_info.block, tag_info.uuid, arg);
 
+               /* Stop the iteration when we reach the last tag. */
                if (tag_info.last_tag)
                        break;
 
@@ -465,6 +584,9 @@ jbd_revoke_entry_lookup(struct recover_info *info, ext4_fsblk_t block)
        return RB_FIND(jbd_revoke, &info->revoke_root, &tmp);
 }
 
+/**@brief  Replay a block in a transaction.
+ * @param  jbd_fs jbd filesystem
+ * @param  block  block address to be replayed.*/
 static void jbd_replay_block_tags(struct jbd_fs *jbd_fs,
                                  ext4_fsblk_t block,
                                  uint8_t *uuid __unused,
@@ -480,6 +602,8 @@ static void jbd_replay_block_tags(struct jbd_fs *jbd_fs,
 
        (*this_block)++;
 
+       /* We replay this block only if the current transaction id
+        * is equal to or greater than that in the revoke entry.*/
        revoke_entry = jbd_revoke_entry_lookup(info, block);
        if (revoke_entry &&
            arg->this_trans_id < revoke_entry->trans_id)
@@ -493,6 +617,7 @@ static void jbd_replay_block_tags(struct jbd_fs *jbd_fs,
        if (r != EOK)
                return;
 
+       /* We need special treatment for ext4 superblock. */
        if (block) {
                r = ext4_block_get_noread(fs->bdev, &ext4_block, block);
                if (r != EOK) {
@@ -530,12 +655,18 @@ static void jbd_replay_block_tags(struct jbd_fs *jbd_fs,
        return;
 }
 
+/**@brief  Add block address to revoke tree, along with
+ *         its transaction id.
+ * @param  info  journal replay info
+ * @param  block  block address to be revoked.*/
 static void jbd_add_revoke_block_tags(struct recover_info *info,
                                      ext4_fsblk_t block)
 {
        struct revoke_entry *revoke_entry;
 
        ext4_dbg(DEBUG_JBD, "Add block %" PRIu64 " to revoke tree\n", block);
+       /* If the revoke entry with respect to the block address
+        * exists already, update its transaction id.*/
        revoke_entry = jbd_revoke_entry_lookup(info, block);
        if (revoke_entry) {
                revoke_entry->trans_id = info->this_trans_id;
@@ -573,7 +704,10 @@ do {                                                                       \
 #define ACTION_REVOKE 1
 #define ACTION_RECOVER 2
 
-
+/**@brief  Add entries in a revoke block to revoke tree.
+ * @param  jbd_fs jbd filesystem
+ * @param  header revoke block header
+ * @param  info  journal replay info*/
 static void jbd_build_revoke_tree(struct jbd_fs *jbd_fs,
                                  struct jbd_bhdr *header,
                                  struct recover_info *info)
@@ -582,6 +716,8 @@ static void jbd_build_revoke_tree(struct jbd_fs *jbd_fs,
        struct jbd_revoke_header *revoke_hdr =
                (struct jbd_revoke_header *)header;
        uint32_t i, nr_entries, record_len = 4;
+
+       /* On a 64bit jbd filesystem, each revoke record is 8 bytes long. */
        if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
                                     JBD_FEATURE_INCOMPAT_64BIT))
                record_len = 8;
@@ -630,9 +766,14 @@ static void jbd_replay_descriptor_block(struct jbd_fs *jbd_fs,
                                arg);
 }
 
-int jbd_iterate_log(struct jbd_fs *jbd_fs,
-                   struct recover_info *info,
-                   int action)
+/**@brief  The core routine of journal replay.
+ * @param  jbd_fs jbd filesystem
+ * @param  info  journal replay info
+ * @param  action action needed to be taken
+ * @return standard error code*/
+static int jbd_iterate_log(struct jbd_fs *jbd_fs,
+                          struct recover_info *info,
+                          int action)
 {
        int r = EOK;
        bool log_end = false;
@@ -640,6 +781,7 @@ int jbd_iterate_log(struct jbd_fs *jbd_fs,
        uint32_t start_trans_id, this_trans_id;
        uint32_t start_block, this_block;
 
+       /* We start iterating valid blocks in the whole journal.*/
        start_trans_id = this_trans_id = jbd_get32(sb, sequence);
        start_block = this_block = jbd_get32(sb, start);
 
@@ -649,6 +791,10 @@ int jbd_iterate_log(struct jbd_fs *jbd_fs,
        while (!log_end) {
                struct ext4_block block;
                struct jbd_bhdr *header;
+               /* If we are not scanning for the last
+                * valid transaction in the journal,
+                * we will stop when we reach the end of
+                * the journal.*/
                if (action != ACTION_SCAN)
                        if (this_trans_id > info->last_trans_id) {
                                log_end = true;
@@ -660,12 +806,19 @@ int jbd_iterate_log(struct jbd_fs *jbd_fs,
                        break;
 
                header = (struct jbd_bhdr *)block.data;
+               /* This block does not have a valid magic number,
+                * so we have reached the end of the journal.*/
                if (jbd_get32(header, magic) != JBD_MAGIC_NUMBER) {
                        jbd_block_set(jbd_fs, &block);
                        log_end = true;
                        continue;
                }
 
+               /* If the transaction id we found is not expected,
+                * we may have reached the end of the journal.
+                *
+                * If we are not scanning the journal, something
+                * bad might have taken place. :-( */
                if (jbd_get32(header, sequence) != this_trans_id) {
                        if (action != ACTION_SCAN)
                                r = EIO;
@@ -697,6 +850,9 @@ int jbd_iterate_log(struct jbd_fs *jbd_fs,
                        ext4_dbg(DEBUG_JBD, "Commit block: %" PRIu32", "
                                            "trans_id: %" PRIu32"\n",
                                            this_block, this_trans_id);
+                       /* This is the end of a transaction,
+                        * we may now proceed to the next transaction.
+                        */
                        this_trans_id++;
                        break;
                case JBD_REVOKE_BLOCK:
@@ -722,6 +878,7 @@ int jbd_iterate_log(struct jbd_fs *jbd_fs,
        }
        ext4_dbg(DEBUG_JBD, "End of journal.\n");
        if (r == EOK && action == ACTION_SCAN) {
+               /* We have finished scanning the journal. */
                info->start_trans_id = start_trans_id;
                if (this_trans_id > start_trans_id)
                        info->last_trans_id = this_trans_id - 1;
@@ -732,6 +889,9 @@ int jbd_iterate_log(struct jbd_fs *jbd_fs,
        return r;
 }
 
+/**@brief  Replay journal.
+ * @param  jbd_fs jbd filesystem
+ * @return standard error code*/
 int jbd_recover(struct jbd_fs *jbd_fs)
 {
        int r;
@@ -752,6 +912,10 @@ int jbd_recover(struct jbd_fs *jbd_fs)
 
        r = jbd_iterate_log(jbd_fs, &info, ACTION_RECOVER);
        if (r == EOK) {
+               /* If we successfully replay the journal,
+                * clear EXT4_FINCOM_RECOVER flag on the
+                * ext4 superblock, and set the start of
+                * journal to 0.*/
                uint32_t features_incompatible =
                        ext4_get32(&jbd_fs->inode_ref.fs->sb,
                                   features_incompatible);
@@ -761,12 +925,14 @@ int jbd_recover(struct jbd_fs *jbd_fs)
                           features_incompatible,
                           features_incompatible);
                jbd_fs->dirty = true;
+               r = ext4_sb_write(jbd_fs->inode_ref.fs->bdev,
+                                 &jbd_fs->inode_ref.fs->sb);
        }
        jbd_destroy_revoke_tree(&info);
        return r;
 }
 
-void jbd_journal_write_sb(struct jbd_journal *journal)
+static void jbd_journal_write_sb(struct jbd_journal *journal)
 {
        struct jbd_fs *jbd_fs = journal->jbd_fs;
        jbd_set32(&jbd_fs->sb, start, journal->start);
@@ -774,9 +940,26 @@ void jbd_journal_write_sb(struct jbd_journal *journal)
        jbd_fs->dirty = true;
 }
 
+/**@brief  Start accessing the journal.
+ * @param  jbd_fs jbd filesystem
+ * @param  journal current journal session
+ * @return standard error code*/
 int jbd_journal_start(struct jbd_fs *jbd_fs,
                      struct jbd_journal *journal)
 {
+       int r;
+       uint32_t features_incompatible =
+                       ext4_get32(&jbd_fs->inode_ref.fs->sb,
+                                  features_incompatible);
+       features_incompatible |= EXT4_FINCOM_RECOVER;
+       ext4_set32(&jbd_fs->inode_ref.fs->sb,
+                       features_incompatible,
+                       features_incompatible);
+       r = ext4_sb_write(jbd_fs->inode_ref.fs->bdev,
+                       &jbd_fs->inode_ref.fs->sb);
+       if (r != EOK)
+               return r;
+
        journal->first = jbd_get32(&jbd_fs->sb, first);
        journal->start = journal->first;
        journal->last = journal->first;
@@ -786,28 +969,69 @@ int jbd_journal_start(struct jbd_fs *jbd_fs,
        journal->block_size = jbd_get32(&jbd_fs->sb, blocksize);
 
        TAILQ_INIT(&journal->trans_queue);
+       TAILQ_INIT(&journal->cp_queue);
        journal->jbd_fs = jbd_fs;
        jbd_journal_write_sb(journal);
        return jbd_write_sb(jbd_fs);
 }
 
+/**@brief  Stop accessing the journal.
+ * @param  journal current journal session
+ * @return standard error code*/
 int jbd_journal_stop(struct jbd_journal *journal)
 {
+       int r;
+       struct jbd_fs *jbd_fs = journal->jbd_fs;
+       uint32_t features_incompatible;
+
+       /* Commit all the transactions to the journal.*/
+       jbd_journal_commit_all(journal);
+       /* Make sure that journalled content has reached
+        * the disk.*/
+       ext4_block_cache_flush(jbd_fs->inode_ref.fs->bdev);
+
+       features_incompatible =
+               ext4_get32(&jbd_fs->inode_ref.fs->sb,
+                          features_incompatible);
+       features_incompatible &= ~EXT4_FINCOM_RECOVER;
+       ext4_set32(&jbd_fs->inode_ref.fs->sb,
+                       features_incompatible,
+                       features_incompatible);
+       r = ext4_sb_write(jbd_fs->inode_ref.fs->bdev,
+                       &jbd_fs->inode_ref.fs->sb);
+       if (r != EOK)
+               return r;
+
        journal->start = 0;
        journal->trans_id = 0;
        jbd_journal_write_sb(journal);
        return jbd_write_sb(journal->jbd_fs);
 }
 
+/**@brief  Allocate a block in the journal.
+ * @param  journal current journal session
+ * @param  trans transaction
+ * @return allocated block address*/
 static uint32_t jbd_journal_alloc_block(struct jbd_journal *journal,
                                        struct jbd_trans *trans)
 {
-       uint32_t start_block = journal->last++;
+       uint32_t start_block;
+
+       start_block = journal->last++;
        trans->alloc_blocks++;
        wrap(&journal->jbd_fs->sb, journal->last);
+       
+       /* If there is no space left, flush all journalled
+        * blocks to disk first.*/
+       if (journal->last == journal->start)
+               ext4_block_cache_flush(journal->jbd_fs->inode_ref.fs->bdev);
+
        return start_block;
 }
 
+/**@brief  Allocate a new transaction
+ * @param  journal current journal session
+ * @return transaction allocated*/
 struct jbd_trans *
 jbd_journal_new_trans(struct jbd_journal *journal)
 {
@@ -827,10 +1051,27 @@ static void jbd_trans_end_write(struct ext4_bcache *bc __unused,
                          int res,
                          void *arg);
 
+/**@brief  Add block to a transaction and gain
+ *         access to it before making any modifications.
+ * @param  trans transaction
+ * @param  block block descriptor
+ * @return standard error code*/
 int jbd_trans_add_block(struct jbd_trans *trans,
                        struct ext4_block *block)
 {
-       struct jbd_buf *buf = calloc(1, sizeof(struct jbd_buf));
+       struct jbd_buf *buf;
+       struct ext4_fs *fs =
+               trans->journal->jbd_fs->inode_ref.fs;
+
+       /* If the buffer has already been modified, we should
+        * flush dirty data in this buffer to disk.*/
+       if (ext4_bcache_test_flag(block->buf, BC_DIRTY)) {
+               /* XXX: i don't want to know whether the call
+                * succeeds or not. */
+               ext4_block_flush_buf(fs->bdev, block->buf);
+       }
+
+       buf = calloc(1, sizeof(struct jbd_buf));
        if (!buf)
                return ENOMEM;
 
@@ -838,14 +1079,20 @@ int jbd_trans_add_block(struct jbd_trans *trans,
        buf->block = *block;
        ext4_bcache_inc_ref(block->buf);
 
+       /* If the content reaches the disk, notify us
+        * so that we may do a checkpoint. */
        block->buf->end_write = jbd_trans_end_write;
-       block->buf->end_write_arg = trans;
+       block->buf->end_write_arg = buf;
 
        trans->data_cnt++;
        LIST_INSERT_HEAD(&trans->buf_list, buf, buf_node);
        return EOK;
 }
 
+/**@brief  Add block to be revoked to a transaction
+ * @param  trans transaction
+ * @param  lba logical block address
+ * @return standard error code*/
 int jbd_trans_revoke_block(struct jbd_trans *trans,
                           ext4_fsblk_t lba)
 {
@@ -859,16 +1106,24 @@ int jbd_trans_revoke_block(struct jbd_trans *trans,
        return EOK;
 }
 
+/**@brief  Free a transaction
+ * @param  journal current journal session
+ * @param  trans transaction
+ * @param  abort discard all the modifications on the block?
+ * @note   this function does not return a value.*/
 void jbd_journal_free_trans(struct jbd_journal *journal,
                            struct jbd_trans *trans,
                            bool abort)
 {
        struct jbd_buf *jbd_buf, *tmp;
        struct jbd_revoke_rec *rec, *tmp2;
+       struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
        LIST_FOREACH_SAFE(jbd_buf, &trans->buf_list, buf_node,
                          tmp) {
-               if (abort)
-                       ext4_block_set(journal->jbd_fs->bdev, &jbd_buf->block);
+               if (abort) {
+                       ext4_bcache_clear_dirty(jbd_buf->block.buf);
+                       ext4_block_set(fs->bdev, &jbd_buf->block);
+               }
 
                LIST_REMOVE(jbd_buf, buf_node);
                free(jbd_buf);
@@ -882,6 +1137,9 @@ void jbd_journal_free_trans(struct jbd_journal *journal,
        free(trans);
 }
 
+/**@brief  Write commit block for a transaction
+ * @param  trans transaction
+ * @return standard error code*/
 static int jbd_trans_write_commit_block(struct jbd_trans *trans)
 {
        int rc;
@@ -897,9 +1155,9 @@ static int jbd_trans_write_commit_block(struct jbd_trans *trans)
                return rc;
 
        header = (struct jbd_commit_header *)commit_block.data;
-       header->header.magic = JBD_MAGIC_NUMBER;
-       header->header.blocktype = JBD_COMMIT_BLOCK;
-       header->header.sequence = trans->trans_id;
+       jbd_set32(&header->header, magic, JBD_MAGIC_NUMBER);
+       jbd_set32(&header->header, blocktype, JBD_COMMIT_BLOCK);
+       jbd_set32(&header->header, sequence, trans->trans_id);
 
        ext4_bcache_set_dirty(commit_block.buf);
        rc = jbd_block_set(journal->jbd_fs, &commit_block);
@@ -909,6 +1167,10 @@ static int jbd_trans_write_commit_block(struct jbd_trans *trans)
        return EOK;
 }
 
+/**@brief  Write descriptor block for a transaction
+ * @param  journal current journal session
+ * @param  trans transaction
+ * @return standard error code*/
 static int jbd_journal_prepare(struct jbd_journal *journal,
                               struct jbd_trans *trans)
 {
@@ -917,33 +1179,47 @@ static int jbd_journal_prepare(struct jbd_journal *journal,
        uint32_t desc_iblock = 0;
        uint32_t data_iblock = 0;
        char *tag_start = NULL, *tag_ptr = NULL;
-       struct jbd_buf *jbd_buf;
+       struct jbd_buf *jbd_buf, *tmp;
        struct ext4_block desc_block, data_block;
+       struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
 
-       LIST_FOREACH(jbd_buf, &trans->buf_list, buf_node) {
+       LIST_FOREACH_SAFE(jbd_buf, &trans->buf_list, buf_node, tmp) {
                struct tag_info tag_info;
                bool uuid_exist = false;
+               if (!ext4_bcache_test_flag(jbd_buf->block.buf,
+                                          BC_DIRTY)) {
+                       /* The buffer has not been modified, just release
+                        * that jbd_buf. */
+                       ext4_block_set(fs->bdev, &jbd_buf->block);
+                       LIST_REMOVE(jbd_buf, buf_node);
+                       free(jbd_buf);
+                       continue;
+               }
 again:
                if (!desc_iblock) {
                        struct jbd_bhdr *bhdr;
                        desc_iblock = jbd_journal_alloc_block(journal, trans);
                        rc = jbd_block_get_noread(journal->jbd_fs,
                                           &desc_block, desc_iblock);
-                       if (!rc)
+                       if (rc != EOK)
                                break;
 
                        ext4_bcache_set_dirty(desc_block.buf);
 
                        bhdr = (struct jbd_bhdr *)desc_block.data;
-                       bhdr->magic = JBD_MAGIC_NUMBER;
-                       bhdr->blocktype = JBD_DESCRIPTOR_BLOCK;
-                       bhdr->sequence = trans->trans_id;
+                       jbd_set32(bhdr, magic, JBD_MAGIC_NUMBER);
+                       jbd_set32(bhdr, blocktype, JBD_DESCRIPTOR_BLOCK);
+                       jbd_set32(bhdr, sequence, trans->trans_id);
 
                        tag_start = (char *)(bhdr + 1);
                        tag_ptr = tag_start;
                        uuid_exist = true;
                        tag_tbl_size = journal->block_size -
                                sizeof(struct jbd_bhdr);
+
+                       if (!trans->start_iblock)
+                               trans->start_iblock = desc_iblock;
+
                }
                tag_info.block = jbd_buf->block.lb_id;
                tag_info.uuid_exist = uuid_exist;
@@ -990,6 +1266,10 @@ again:
        return rc;
 }
 
+/**@brief  Write revoke block for a transaction
+ * @param  journal current journal session
+ * @param  trans transaction
+ * @return standard error code*/
 static int
 jbd_journal_prepare_revoke(struct jbd_journal *journal,
                           struct jbd_trans *trans)
@@ -1015,25 +1295,30 @@ again:
                        desc_iblock = jbd_journal_alloc_block(journal, trans);
                        rc = jbd_block_get_noread(journal->jbd_fs,
                                           &desc_block, desc_iblock);
-                       if (!rc) {
+                       if (rc != EOK) {
                                break;
                        }
 
                        ext4_bcache_set_dirty(desc_block.buf);
 
                        bhdr = (struct jbd_bhdr *)desc_block.data;
-                       bhdr->magic = JBD_MAGIC_NUMBER;
-                       bhdr->blocktype = JBD_REVOKE_BLOCK;
-                       bhdr->sequence = trans->trans_id;
+                       jbd_set32(bhdr, magic, JBD_MAGIC_NUMBER);
+                       jbd_set32(bhdr, blocktype, JBD_REVOKE_BLOCK);
+                       jbd_set32(bhdr, sequence, trans->trans_id);
                        
                        header = (struct jbd_revoke_header *)bhdr;
                        blocks_entry = (char *)(header + 1);
                        tag_tbl_size = journal->block_size -
                                sizeof(struct jbd_revoke_header);
+
+                       if (!trans->start_iblock)
+                               trans->start_iblock = desc_iblock;
+
                }
 
                if (tag_tbl_size < record_len) {
-                       header->count = journal->block_size - tag_tbl_size;
+                       jbd_set32(header, count,
+                                 journal->block_size - tag_tbl_size);
                        jbd_block_set(journal->jbd_fs, &desc_block);
                        desc_iblock = 0;
                        header = NULL;
@@ -1055,7 +1340,8 @@ again:
        }
        if (rc == EOK && desc_iblock) {
                if (header != NULL)
-                       header->count = journal->block_size - tag_tbl_size;
+                       jbd_set32(header, count,
+                                 journal->block_size - tag_tbl_size);
 
                jbd_block_set(journal->jbd_fs, &desc_block);
        }
@@ -1063,6 +1349,9 @@ again:
        return rc;
 }
 
+/**@brief  Submit the transaction to transaction queue.
+ * @param  journal current journal session
+ * @param  trans transaction*/
 void
 jbd_journal_submit_trans(struct jbd_journal *journal,
                         struct jbd_trans *trans)
@@ -1072,75 +1361,163 @@ jbd_journal_submit_trans(struct jbd_journal *journal,
                          trans_node);
 }
 
+/**@brief  Put references of block descriptors in a transaction.
+ * @param  journal current journal session
+ * @param  trans transaction*/
 void jbd_journal_cp_trans(struct jbd_journal *journal, struct jbd_trans *trans)
 {
        struct jbd_buf *jbd_buf, *tmp;
+       struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
        LIST_FOREACH_SAFE(jbd_buf, &trans->buf_list, buf_node,
                        tmp) {
-               ext4_block_set(journal->jbd_fs->bdev, &jbd_buf->block);
+               struct ext4_block block = jbd_buf->block;
+               ext4_block_set(fs->bdev, &block);
        }
 }
 
+/**@brief  Update the start block of the journal when
+ *         all the contents in a transaction reach the disk.*/
 static void jbd_trans_end_write(struct ext4_bcache *bc __unused,
-                         struct ext4_buf *buf __unused,
+                         struct ext4_buf *buf,
                          int res,
                          void *arg)
 {
-       struct jbd_trans *trans = arg;
+       struct jbd_buf *jbd_buf = arg;
+       struct jbd_trans *trans = jbd_buf->trans;
        struct jbd_journal *journal = trans->journal;
+       bool first_in_queue =
+               trans == TAILQ_FIRST(&journal->cp_queue);
        if (res != EOK)
                trans->error = res;
 
+       /* Clear the end_write and end_write_arg fields. */
+       buf->end_write = NULL;
+       buf->end_write_arg = NULL;
+
        trans->written_cnt++;
        if (trans->written_cnt == trans->data_cnt) {
                TAILQ_REMOVE(&journal->cp_queue, trans, trans_node);
-               journal->start += trans->alloc_blocks;
-               journal->trans_id = ++trans->trans_id;
-               jbd_journal_write_sb(journal);
+
+               if (first_in_queue) {
+                       journal->start = trans->start_iblock +
+                               trans->alloc_blocks;
+                       wrap(&journal->jbd_fs->sb, journal->start);
+                       journal->trans_id = trans->trans_id + 1;
+               }
                jbd_journal_free_trans(journal, trans, false);
 
-               if ((trans = TAILQ_FIRST(&journal->cp_queue))) {
-                       jbd_journal_cp_trans(journal, trans);
+               if (first_in_queue) {
+                       while ((trans = TAILQ_FIRST(&journal->cp_queue))) {
+                               if (!trans->data_cnt) {
+                                       TAILQ_REMOVE(&journal->cp_queue,
+                                                    trans,
+                                                    trans_node);
+                                       journal->start = trans->start_iblock +
+                                               trans->alloc_blocks;
+                                       wrap(&journal->jbd_fs->sb, journal->start);
+                                       journal->trans_id = trans->trans_id + 1;
+                                       jbd_journal_free_trans(journal,
+                                                              trans, false);
+                               } else {
+                                       journal->start = trans->start_iblock;
+                                       wrap(&journal->jbd_fs->sb, journal->start);
+                                       journal->trans_id = trans->trans_id;
+                                       break;
+                               }
+                       }
+                       jbd_journal_write_sb(journal);
+                       jbd_write_sb(journal->jbd_fs);
                }
        }
 }
 
-/*
- * XXX: one should disable cache writeback first.
- */
-void jbd_journal_commit_one(struct jbd_journal *journal)
+/**@brief  Commit a transaction to the journal immediately.
+ * @param  journal current journal session
+ * @param  trans transaction
+ * @return standard error code*/
+int jbd_journal_commit_trans(struct jbd_journal *journal,
+                            struct jbd_trans *trans)
 {
        int rc = EOK;
        uint32_t last = journal->last;
-       struct jbd_trans *trans;
-       if ((trans = TAILQ_FIRST(&journal->trans_queue))) {
-               TAILQ_REMOVE(&journal->trans_queue, trans, trans_node);
 
-               trans->trans_id = journal->alloc_trans_id;
-               rc = jbd_journal_prepare(journal, trans);
-               if (rc != EOK)
-                       goto Finish;
+       trans->trans_id = journal->alloc_trans_id;
+       rc = jbd_journal_prepare(journal, trans);
+       if (rc != EOK)
+               goto Finish;
 
-               rc = jbd_journal_prepare_revoke(journal, trans);
-               if (rc != EOK)
-                       goto Finish;
+       rc = jbd_journal_prepare_revoke(journal, trans);
+       if (rc != EOK)
+               goto Finish;
 
-               rc = jbd_trans_write_commit_block(trans);
-               if (rc != EOK)
-                       goto Finish;
+       if (LIST_EMPTY(&trans->buf_list) &&
+           LIST_EMPTY(&trans->revoke_list)) {
+               /* Since both the buffer list and the revoke entry
+                * list are empty, we do not treat trans as a
+                * complete transaction and just return EOK. */
+               jbd_journal_free_trans(journal, trans, false);
+               goto Finish;
+       }
 
-               journal->alloc_trans_id++;
-               TAILQ_INSERT_TAIL(&journal->cp_queue, trans,
-                         trans_node);
-               if (trans == TAILQ_FIRST(&journal->cp_queue)) {
+       rc = jbd_trans_write_commit_block(trans);
+       if (rc != EOK)
+               goto Finish;
+
+       journal->alloc_trans_id++;
+       if (TAILQ_EMPTY(&journal->cp_queue)) {
+               if (trans->data_cnt) {
+                       journal->start = trans->start_iblock;
+                       wrap(&journal->jbd_fs->sb, journal->start);
+                       journal->trans_id = trans->trans_id;
+                       jbd_journal_write_sb(journal);
+                       jbd_write_sb(journal->jbd_fs);
+                       TAILQ_INSERT_TAIL(&journal->cp_queue, trans,
+                                       trans_node);
                        jbd_journal_cp_trans(journal, trans);
+               } else {
+                       journal->start = trans->start_iblock +
+                               trans->alloc_blocks;
+                       wrap(&journal->jbd_fs->sb, journal->start);
+                       journal->trans_id = trans->trans_id + 1;
+                       jbd_journal_write_sb(journal);
+                       jbd_journal_free_trans(journal, trans, false);
                }
+       } else {
+               TAILQ_INSERT_TAIL(&journal->cp_queue, trans,
+                               trans_node);
+               if (trans->data_cnt)
+                       jbd_journal_cp_trans(journal, trans);
+
        }
 Finish:
        if (rc != EOK) {
                journal->last = last;
                jbd_journal_free_trans(journal, trans, true);
        }
+       return rc;
+}
+
+/**@brief  Commit one transaction on transaction queue
+ *         to the journal.
+ * @param  journal current journal session.*/
+void jbd_journal_commit_one(struct jbd_journal *journal)
+{
+       struct jbd_trans *trans;
+
+       if ((trans = TAILQ_FIRST(&journal->trans_queue))) {
+               TAILQ_REMOVE(&journal->trans_queue, trans, trans_node);
+               jbd_journal_commit_trans(journal, trans);
+       }
+}
+
+/**@brief  Commit all the transactions on transaction queue
+ *         to the journal.
+ * @param  journal current journal session.*/
+void jbd_journal_commit_all(struct jbd_journal *journal)
+{
+       while (!TAILQ_EMPTY(&journal->trans_queue)) {
+               jbd_journal_commit_one(journal);
+       }
 }
 
 /**