ext4_xattr: better handling on some corner error case
[lwext4.git] / src / ext4_journal.c
1 /*
2  * Copyright (c) 2015 Grzegorz Kostka (kostka.grzegorz@gmail.com)
3  * Copyright (c) 2015 Kaho Ng (ngkaho1234@gmail.com)
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * - Redistributions of source code must retain the above copyright
11  *   notice, this list of conditions and the following disclaimer.
12  * - Redistributions in binary form must reproduce the above copyright
13  *   notice, this list of conditions and the following disclaimer in the
14  *   documentation and/or other materials provided with the distribution.
15  * - The name of the author may not be used to endorse or promote products
16  *   derived from this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29
30 /** @addtogroup lwext4
31  * @{
32  */
33 /**
34  * @file  ext4_journal.c
35  * @brief Journal handle functions
36  */
37
38 #include "ext4_config.h"
39 #include "ext4_types.h"
40 #include "ext4_misc.h"
41 #include "ext4_errno.h"
42 #include "ext4_debug.h"
43
44 #include "ext4_fs.h"
45 #include "ext4_super.h"
46 #include "ext4_journal.h"
47 #include "ext4_blockdev.h"
48 #include "ext4_crc32.h"
49 #include "ext4_journal.h"
50
51 #include <string.h>
52 #include <stdlib.h>
53
54 /**@brief  Revoke entry during journal replay.*/
55 struct revoke_entry {
56         /**@brief  Block number not to be replayed.*/
57         ext4_fsblk_t block;
58
59         /**@brief  For any transaction id smaller
60          *         than trans_id, records of @block
61          *         in those transactions should not
62          *         be replayed.*/
63         uint32_t trans_id;
64
65         /**@brief  Revoke tree node.*/
66         RB_ENTRY(revoke_entry) revoke_node;
67 };
68
69 /**@brief  Valid journal replay information.*/
70 struct recover_info {
71         /**@brief  Starting transaction id.*/
72         uint32_t start_trans_id;
73
74         /**@brief  Ending transaction id.*/
75         uint32_t last_trans_id;
76
77         /**@brief  Used as internal argument.*/
78         uint32_t this_trans_id;
79
80         /**@brief  No of transactions went through.*/
81         uint32_t trans_cnt;
82
83         /**@brief  RB-Tree storing revoke entries.*/
84         RB_HEAD(jbd_revoke, revoke_entry) revoke_root;
85 };
86
/**@brief  Arguments handed to the journal replay callbacks.*/
struct replay_arg {
	/**@brief  Replay information shared by the whole recovery.*/
	struct recover_info *info;

	/**@brief  Points at the journal block index currently
	 *         being processed.*/
	uint32_t *this_block;

	/**@brief  Id of the transaction currently being processed.*/
	uint32_t this_trans_id;
};
98
/* Make sure we wrap around the log correctly!
 * When @var has reached (or passed) the journal's maxlen it is pulled
 * back by (maxlen - first).
 * @var must be a modifiable lvalue; it is evaluated more than once, so
 * it must not carry side effects. It is parenthesized so that
 * expressions such as *ptr expand safely. */
#define wrap(sb, var)							\
do {									\
	if ((var) >= jbd_get32((sb), maxlen))				\
		(var) -= (jbd_get32((sb), maxlen) -			\
			  jbd_get32((sb), first));			\
} while (0)
105
106
107 static int
108 jbd_revoke_entry_cmp(struct revoke_entry *a, struct revoke_entry *b)
109 {
110         if (a->block > b->block)
111                 return 1;
112         else if (a->block < b->block)
113                 return -1;
114         return 0;
115 }
116
117 static int
118 jbd_block_rec_cmp(struct jbd_block_rec *a, struct jbd_block_rec *b)
119 {
120         if (a->lba > b->lba)
121                 return 1;
122         else if (a->lba < b->lba)
123                 return -1;
124         return 0;
125 }
126
127 static int
128 jbd_revoke_rec_cmp(struct jbd_revoke_rec *a, struct jbd_revoke_rec *b)
129 {
130         if (a->lba > b->lba)
131                 return 1;
132         else if (a->lba < b->lba)
133                 return -1;
134         return 0;
135 }
136
/* Generate the (static inline) red-black tree routines for the three
 * trees used in this file. Each tree is ordered by the comparator
 * passed in its RB_GENERATE_INTERNAL() invocation. */
RB_GENERATE_INTERNAL(jbd_revoke, revoke_entry, revoke_node,
                     jbd_revoke_entry_cmp, static inline)
RB_GENERATE_INTERNAL(jbd_block, jbd_block_rec, block_rec_node,
                     jbd_block_rec_cmp, static inline)
RB_GENERATE_INTERNAL(jbd_revoke_tree, jbd_revoke_rec, revoke_node,
                     jbd_revoke_rec_cmp, static inline)

/* Allocation helpers for struct revoke_entry. calloc() returns zeroed
 * memory, or NULL on failure - callers are expected to check. */
#define jbd_alloc_revoke_entry() calloc(1, sizeof(struct revoke_entry))
#define jbd_free_revoke_entry(addr) free(addr)
146
147 static int jbd_has_csum(struct jbd_sb *jbd_sb)
148 {
149         if (JBD_HAS_INCOMPAT_FEATURE(jbd_sb, JBD_FEATURE_INCOMPAT_CSUM_V2))
150                 return 2;
151
152         if (JBD_HAS_INCOMPAT_FEATURE(jbd_sb, JBD_FEATURE_INCOMPAT_CSUM_V3))
153                 return 3;
154
155         return 0;
156 }
157
#if CONFIG_META_CSUM_ENABLE
/**@brief  Compute the crc32c checksum of a jbd superblock.
 *         The checksum field is zeroed for the duration of the
 *         computation and restored afterwards.
 * @param  jbd_sb jbd superblock
 * @return computed checksum, or 0 if the journal has no
 *         checksum feature enabled*/
static uint32_t jbd_sb_csum(struct jbd_sb *jbd_sb)
{
	uint32_t saved;
	uint32_t crc;

	if (!jbd_has_csum(jbd_sb))
		return 0;

	saved = jbd_sb->checksum;
	jbd_set32(jbd_sb, checksum, 0);

	/* The checksum covers the whole superblock. */
	crc = ext4_crc32c(EXT4_CRC32_INIT, jbd_sb, JBD_SUPERBLOCK_SIZE);

	jbd_sb->checksum = saved;
	return crc;
}
#else
#define jbd_sb_csum(...) 0
#endif
176
177 static void jbd_sb_csum_set(struct jbd_sb *jbd_sb)
178 {
179         if (!jbd_has_csum(jbd_sb))
180                 return;
181
182         jbd_set32(jbd_sb, checksum, jbd_sb_csum(jbd_sb));
183 }
184
#if CONFIG_META_CSUM_ENABLE
/**@brief  Check the checksum stored in a jbd superblock.
 * @param  jbd_sb jbd superblock
 * @return true if the journal has no checksum feature or the stored
 *         checksum matches the computed one*/
static bool
jbd_verify_sb_csum(struct jbd_sb *jbd_sb)
{
	uint32_t computed;
	uint32_t stored;

	if (!jbd_has_csum(jbd_sb))
		return true;

	computed = jbd_sb_csum(jbd_sb);
	stored = jbd_get32(jbd_sb, checksum);
	return computed == stored;
}
#else
#define jbd_verify_sb_csum(...) true
#endif
197
#if CONFIG_META_CSUM_ENABLE
/**@brief  Compute the crc32c checksum of a journal metadata block.
 *         The checksum in the block tail (last bytes of the block)
 *         is zeroed during the computation and restored afterwards.
 * @param  jbd_fs jbd filesystem
 * @param  bhdr start of the metadata block
 * @return computed checksum, or 0 if the journal has no
 *         checksum feature enabled*/
static uint32_t jbd_meta_csum(struct jbd_fs *jbd_fs,
			      struct jbd_bhdr *bhdr)
{
	struct jbd_block_tail *tail;
	uint32_t block_size;
	uint32_t saved;
	uint32_t crc;

	if (!jbd_has_csum(&jbd_fs->sb))
		return 0;

	block_size = jbd_get32(&jbd_fs->sb, blocksize);
	tail = (struct jbd_block_tail *)((char *)bhdr + block_size -
					 sizeof(struct jbd_block_tail));
	saved = tail->checksum;
	tail->checksum = 0;

	/* Seed with the uuid stored in the jbd superblock... */
	crc = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid,
			  sizeof(jbd_fs->sb.uuid));
	/* ...then run over the whole block. */
	crc = ext4_crc32c(crc, bhdr, block_size);

	tail->checksum = saved;
	return crc;
}
#else
#define jbd_meta_csum(...) 0
#endif
225
226 static void jbd_meta_csum_set(struct jbd_fs *jbd_fs,
227                               struct jbd_bhdr *bhdr)
228 {
229         uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize);
230         struct jbd_block_tail *tail = (struct jbd_block_tail *)
231                                 ((char *)bhdr + block_size -
232                                 sizeof(struct jbd_block_tail));
233         if (!jbd_has_csum(&jbd_fs->sb))
234                 return;
235
236         tail->checksum = to_be32(jbd_meta_csum(jbd_fs, bhdr));
237 }
238
#if CONFIG_META_CSUM_ENABLE
/**@brief  Check the checksum stored in the tail of a journal
 *         metadata block.
 * @param  jbd_fs jbd filesystem
 * @param  bhdr start of the metadata block
 * @return true if the journal has no checksum feature or the stored
 *         checksum matches the computed one*/
static bool
jbd_verify_meta_csum(struct jbd_fs *jbd_fs,
		     struct jbd_bhdr *bhdr)
{
	struct jbd_block_tail *tail;
	uint32_t block_size;

	if (!jbd_has_csum(&jbd_fs->sb))
		return true;

	block_size = jbd_get32(&jbd_fs->sb, blocksize);
	tail = (struct jbd_block_tail *)((char *)bhdr + block_size -
					 sizeof(struct jbd_block_tail));

	return jbd_meta_csum(jbd_fs, bhdr) == to_be32(tail->checksum);
}
#else
#define jbd_verify_meta_csum(...) true
#endif
256
#if CONFIG_META_CSUM_ENABLE
/**@brief  Compute the crc32c checksum of a commit block.
 *         chksum_type, chksum_size and chksum[0] are zeroed during
 *         the computation and restored afterwards.
 * @param  jbd_fs jbd filesystem
 * @param  header start of the commit block
 * @return computed checksum, or 0 if the journal has no
 *         checksum feature enabled*/
static uint32_t jbd_commit_csum(struct jbd_fs *jbd_fs,
			      struct jbd_commit_header *header)
{
	uint32_t saved_type;
	uint32_t saved_size;
	uint32_t saved_csum;
	uint32_t block_size;
	uint32_t crc;

	if (!jbd_has_csum(&jbd_fs->sb))
		return 0;

	saved_type = header->chksum_type;
	saved_size = header->chksum_size;
	saved_csum = header->chksum[0];
	block_size = jbd_get32(&jbd_fs->sb, blocksize);

	header->chksum_type = 0;
	header->chksum_size = 0;
	header->chksum[0] = 0;

	/* Seed with the uuid stored in the jbd superblock... */
	crc = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid,
			  sizeof(jbd_fs->sb.uuid));
	/* ...then run over the whole block. */
	crc = ext4_crc32c(crc, header, block_size);

	header->chksum_type = saved_type;
	header->chksum_size = saved_size;
	header->chksum[0] = saved_csum;
	return crc;
}
#else
#define jbd_commit_csum(...) 0
#endif
288
289 static void jbd_commit_csum_set(struct jbd_fs *jbd_fs,
290                               struct jbd_commit_header *header)
291 {
292         if (!jbd_has_csum(&jbd_fs->sb))
293                 return;
294
295         header->chksum_type = 0;
296         header->chksum_size = 0;
297         header->chksum[0] = jbd_commit_csum(jbd_fs, header);
298 }
299
#if CONFIG_META_CSUM_ENABLE
/**@brief  Check the checksum stored in a commit block.
 * @param  jbd_fs jbd filesystem
 * @param  header start of the commit block
 * @return true if the journal has no checksum feature or chksum[0]
 *         matches the big-endian form of the computed checksum*/
static bool jbd_verify_commit_csum(struct jbd_fs *jbd_fs,
				   struct jbd_commit_header *header)
{
	uint32_t expected;

	if (!jbd_has_csum(&jbd_fs->sb))
		return true;

	expected = to_be32(jbd_commit_csum(jbd_fs, header));
	return header->chksum[0] == expected;
}
#else
#define jbd_verify_commit_csum(...) true
#endif
313
#if CONFIG_META_CSUM_ENABLE
/**@brief  Compute the checksum of one journalled block.
 * @param  jbd_fs jbd filesystem
 * @param  buf block content (one journal block in size)
 * @param  csum running checksum; only used on the
 *         JBD_FEATURE_COMPAT_CHECKSUM path
 * @param  sequence sequence number mixed into the crc32c checksum
 * @return computed checksum, or 0 when no checksum feature applies
 *
 * NOTE(review): the second test passes a COMPAT flag to
 *       JBD_HAS_INCOMPAT_FEATURE() - confirm this is intended.
 * NOTE(review): @sequence is hashed exactly as passed in; confirm
 *       the caller supplies it in the expected byte order.*/
static uint32_t jbd_block_csum(struct jbd_fs *jbd_fs, const void *buf,
			       uint32_t csum,
			       uint32_t sequence)
{
	uint32_t block_size;
	uint32_t crc;

	if (jbd_has_csum(&jbd_fs->sb)) {
		block_size = jbd_get32(&jbd_fs->sb, blocksize);
		/* crc32c over: superblock uuid, sequence no., block. */
		crc = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid,
				  sizeof(jbd_fs->sb.uuid));
		crc = ext4_crc32c(crc, &sequence, sizeof(uint32_t));
		return ext4_crc32c(crc, buf, block_size);
	}

	if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
				     JBD_FEATURE_COMPAT_CHECKSUM)) {
		block_size = jbd_get32(&jbd_fs->sb, blocksize);
		/* crc32 over the whole block, continuing from @csum. */
		return ext4_crc32(csum, buf, block_size);
	}

	return 0;
}
#else
#define jbd_block_csum(...) 0
#endif
348
349 static void jbd_block_tag_csum_set(struct jbd_fs *jbd_fs, void *__tag,
350                                    uint32_t checksum)
351 {
352         int ver = jbd_has_csum(&jbd_fs->sb);
353         if (!ver)
354                 return;
355
356         if (ver == 2) {
357                 struct jbd_block_tag *tag = __tag;
358                 tag->checksum = (uint16_t)to_be32(checksum);
359         } else {
360                 struct jbd_block_tag3 *tag = __tag;
361                 tag->checksum = to_be32(checksum);
362         }
363 }
364
365 /**@brief  Write jbd superblock to disk.
366  * @param  jbd_fs jbd filesystem
367  * @param  s jbd superblock
368  * @return standard error code*/
369 static int jbd_sb_write(struct jbd_fs *jbd_fs, struct jbd_sb *s)
370 {
371         int rc;
372         struct ext4_fs *fs = jbd_fs->inode_ref.fs;
373         uint64_t offset;
374         ext4_fsblk_t fblock;
375         rc = jbd_inode_bmap(jbd_fs, 0, &fblock);
376         if (rc != EOK)
377                 return rc;
378
379         jbd_sb_csum_set(s);
380         offset = fblock * ext4_sb_get_block_size(&fs->sb);
381         return ext4_block_writebytes(fs->bdev, offset, s,
382                                      EXT4_SUPERBLOCK_SIZE);
383 }
384
385 /**@brief  Read jbd superblock from disk.
386  * @param  jbd_fs jbd filesystem
387  * @param  s jbd superblock
388  * @return standard error code*/
389 static int jbd_sb_read(struct jbd_fs *jbd_fs, struct jbd_sb *s)
390 {
391         int rc;
392         struct ext4_fs *fs = jbd_fs->inode_ref.fs;
393         uint64_t offset;
394         ext4_fsblk_t fblock;
395         rc = jbd_inode_bmap(jbd_fs, 0, &fblock);
396         if (rc != EOK)
397                 return rc;
398
399         offset = fblock * ext4_sb_get_block_size(&fs->sb);
400         return ext4_block_readbytes(fs->bdev, offset, s,
401                                     EXT4_SUPERBLOCK_SIZE);
402 }
403
404 /**@brief  Verify jbd superblock.
405  * @param  sb jbd superblock
406  * @return true if jbd superblock is valid */
407 static bool jbd_verify_sb(struct jbd_sb *sb)
408 {
409         struct jbd_bhdr *header = &sb->header;
410         if (jbd_get32(header, magic) != JBD_MAGIC_NUMBER)
411                 return false;
412
413         if (jbd_get32(header, blocktype) != JBD_SUPERBLOCK &&
414             jbd_get32(header, blocktype) != JBD_SUPERBLOCK_V2)
415                 return false;
416
417         return jbd_verify_sb_csum(sb);
418 }
419
420 /**@brief  Write back dirty jbd superblock to disk.
421  * @param  jbd_fs jbd filesystem
422  * @return standard error code*/
423 static int jbd_write_sb(struct jbd_fs *jbd_fs)
424 {
425         int rc = EOK;
426         if (jbd_fs->dirty) {
427                 rc = jbd_sb_write(jbd_fs, &jbd_fs->sb);
428                 if (rc != EOK)
429                         return rc;
430
431                 jbd_fs->dirty = false;
432         }
433         return rc;
434 }
435
436 /**@brief  Get reference to jbd filesystem.
437  * @param  fs Filesystem to load journal of
438  * @param  jbd_fs jbd filesystem
439  * @return standard error code*/
440 int jbd_get_fs(struct ext4_fs *fs,
441                struct jbd_fs *jbd_fs)
442 {
443         int rc;
444         uint32_t journal_ino;
445
446         memset(jbd_fs, 0, sizeof(struct jbd_fs));
447         /* See if there is journal inode on this filesystem.*/
448         /* FIXME: detection on existance ofbkejournal bdev is
449          *        missing.*/
450         journal_ino = ext4_get32(&fs->sb, journal_inode_number);
451
452         rc = ext4_fs_get_inode_ref(fs,
453                                    journal_ino,
454                                    &jbd_fs->inode_ref);
455         if (rc != EOK) {
456                 memset(jbd_fs, 0, sizeof(struct jbd_fs));
457                 return rc;
458         }
459         rc = jbd_sb_read(jbd_fs, &jbd_fs->sb);
460         if (rc != EOK) {
461                 memset(jbd_fs, 0, sizeof(struct jbd_fs));
462                 ext4_fs_put_inode_ref(&jbd_fs->inode_ref);
463                 return rc;
464         }
465         if (!jbd_verify_sb(&jbd_fs->sb)) {
466                 memset(jbd_fs, 0, sizeof(struct jbd_fs));
467                 ext4_fs_put_inode_ref(&jbd_fs->inode_ref);
468                 rc = EIO;
469         }
470
471         if (rc == EOK)
472                 jbd_fs->bdev = fs->bdev;
473
474         return rc;
475 }
476
477 /**@brief  Put reference of jbd filesystem.
478  * @param  jbd_fs jbd filesystem
479  * @return standard error code*/
480 int jbd_put_fs(struct jbd_fs *jbd_fs)
481 {
482         int rc = EOK;
483         rc = jbd_write_sb(jbd_fs);
484
485         ext4_fs_put_inode_ref(&jbd_fs->inode_ref);
486         return rc;
487 }
488
489 /**@brief  Data block lookup helper.
490  * @param  jbd_fs jbd filesystem
491  * @param  iblock block index
492  * @param  fblock logical block address
493  * @return standard error code*/
494 int jbd_inode_bmap(struct jbd_fs *jbd_fs,
495                    ext4_lblk_t iblock,
496                    ext4_fsblk_t *fblock)
497 {
498         int rc = ext4_fs_get_inode_dblk_idx(
499                         &jbd_fs->inode_ref,
500                         iblock,
501                         fblock,
502                         false);
503         return rc;
504 }
505
506 /**@brief   jbd block get function (through cache).
507  * @param   jbd_fs jbd filesystem
508  * @param   block block descriptor
509  * @param   fblock jbd logical block address
510  * @return  standard error code*/
511 static int jbd_block_get(struct jbd_fs *jbd_fs,
512                   struct ext4_block *block,
513                   ext4_fsblk_t fblock)
514 {
515         /* TODO: journal device. */
516         int rc;
517         struct ext4_blockdev *bdev = jbd_fs->bdev;
518         ext4_lblk_t iblock = (ext4_lblk_t)fblock;
519
520         /* Lookup the logical block address of
521          * fblock.*/
522         rc = jbd_inode_bmap(jbd_fs, iblock,
523                             &fblock);
524         if (rc != EOK)
525                 return rc;
526
527         rc = ext4_block_get(bdev, block, fblock);
528
529         /* If succeeded, mark buffer as BC_FLUSH to indicate
530          * that data should be written to disk immediately.*/
531         if (rc == EOK) {
532                 ext4_bcache_set_flag(block->buf, BC_FLUSH);
533                 /* As we don't want to occupy too much space
534                  * in block cache, we set this buffer BC_TMP.*/
535                 ext4_bcache_set_flag(block->buf, BC_TMP);
536         }
537
538         return rc;
539 }
540
541 /**@brief   jbd block get function (through cache, don't read).
542  * @param   jbd_fs jbd filesystem
543  * @param   block block descriptor
544  * @param   fblock jbd logical block address
545  * @return  standard error code*/
546 static int jbd_block_get_noread(struct jbd_fs *jbd_fs,
547                          struct ext4_block *block,
548                          ext4_fsblk_t fblock)
549 {
550         /* TODO: journal device. */
551         int rc;
552         struct ext4_blockdev *bdev = jbd_fs->bdev;
553         ext4_lblk_t iblock = (ext4_lblk_t)fblock;
554         rc = jbd_inode_bmap(jbd_fs, iblock,
555                             &fblock);
556         if (rc != EOK)
557                 return rc;
558
559         rc = ext4_block_get_noread(bdev, block, fblock);
560         if (rc == EOK)
561                 ext4_bcache_set_flag(block->buf, BC_FLUSH);
562
563         return rc;
564 }
565
566 /**@brief   jbd block set procedure (through cache).
567  * @param   jbd_fs jbd filesystem
568  * @param   block block descriptor
569  * @return  standard error code*/
570 static int jbd_block_set(struct jbd_fs *jbd_fs,
571                   struct ext4_block *block)
572 {
573         struct ext4_blockdev *bdev = jbd_fs->bdev;
574         return ext4_block_set(bdev, block);
575 }
576
577 /**@brief  helper functions to calculate
578  *         block tag size, not including UUID part.
579  * @param  jbd_fs jbd filesystem
580  * @return tag size in bytes*/
581 static int jbd_tag_bytes(struct jbd_fs *jbd_fs)
582 {
583         int size;
584
585         /* It is very easy to deal with the case which
586          * JBD_FEATURE_INCOMPAT_CSUM_V3 is enabled.*/
587         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
588                                      JBD_FEATURE_INCOMPAT_CSUM_V3))
589                 return sizeof(struct jbd_block_tag3);
590
591         size = sizeof(struct jbd_block_tag);
592
593         /* If JBD_FEATURE_INCOMPAT_CSUM_V2 is enabled,
594          * add 2 bytes to size.*/
595         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
596                                      JBD_FEATURE_INCOMPAT_CSUM_V2))
597                 size += sizeof(uint16_t);
598
599         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
600                                      JBD_FEATURE_INCOMPAT_64BIT))
601                 return size;
602
603         /* If block number is 4 bytes in size,
604          * minus 4 bytes from size */
605         return size - sizeof(uint32_t);
606 }
607
608 /**@brief  Tag information. */
609 struct tag_info {
610         /**@brief  Tag size in bytes, including UUID part.*/
611         int tag_bytes;
612
613         /**@brief  block number stored in this tag.*/
614         ext4_fsblk_t block;
615
616         /**@brief  whether UUID part exists or not.*/
617         bool uuid_exist;
618
619         /**@brief  UUID content if UUID part exists.*/
620         uint8_t uuid[UUID_SIZE];
621
622         /**@brief  Is this the last tag? */
623         bool last_tag;
624
625         /**@brief  crc32c checksum. */
626         uint32_t checksum;
627 };
628
629 /**@brief  Extract information from a block tag.
630  * @param  __tag pointer to the block tag
631  * @param  tag_bytes block tag size of this jbd filesystem
632  * @param  remaining size in buffer containing the block tag
633  * @param  tag_info information of this tag.
634  * @return  EOK when succeed, otherwise return EINVAL.*/
635 static int
636 jbd_extract_block_tag(struct jbd_fs *jbd_fs,
637                       void *__tag,
638                       int tag_bytes,
639                       int32_t remain_buf_size,
640                       struct tag_info *tag_info)
641 {
642         char *uuid_start;
643         tag_info->tag_bytes = tag_bytes;
644         tag_info->uuid_exist = false;
645         tag_info->last_tag = false;
646
647         /* See whether it is possible to hold a valid block tag.*/
648         if (remain_buf_size - tag_bytes < 0)
649                 return EINVAL;
650
651         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
652                                      JBD_FEATURE_INCOMPAT_CSUM_V3)) {
653                 struct jbd_block_tag3 *tag = __tag;
654                 tag_info->block = jbd_get32(tag, blocknr);
655                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
656                                              JBD_FEATURE_INCOMPAT_64BIT))
657                          tag_info->block |=
658                                  (uint64_t)jbd_get32(tag, blocknr_high) << 32;
659
660                 if (jbd_get32(tag, flags) & JBD_FLAG_ESCAPE)
661                         tag_info->block = 0;
662
663                 if (!(jbd_get32(tag, flags) & JBD_FLAG_SAME_UUID)) {
664                         /* See whether it is possible to hold UUID part.*/
665                         if (remain_buf_size - tag_bytes < UUID_SIZE)
666                                 return EINVAL;
667
668                         uuid_start = (char *)tag + tag_bytes;
669                         tag_info->uuid_exist = true;
670                         tag_info->tag_bytes += UUID_SIZE;
671                         memcpy(tag_info->uuid, uuid_start, UUID_SIZE);
672                 }
673
674                 if (jbd_get32(tag, flags) & JBD_FLAG_LAST_TAG)
675                         tag_info->last_tag = true;
676
677         } else {
678                 struct jbd_block_tag *tag = __tag;
679                 tag_info->block = jbd_get32(tag, blocknr);
680                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
681                                              JBD_FEATURE_INCOMPAT_64BIT))
682                          tag_info->block |=
683                                  (uint64_t)jbd_get32(tag, blocknr_high) << 32;
684
685                 if (jbd_get16(tag, flags) & JBD_FLAG_ESCAPE)
686                         tag_info->block = 0;
687
688                 if (!(jbd_get16(tag, flags) & JBD_FLAG_SAME_UUID)) {
689                         /* See whether it is possible to hold UUID part.*/
690                         if (remain_buf_size - tag_bytes < UUID_SIZE)
691                                 return EINVAL;
692
693                         uuid_start = (char *)tag + tag_bytes;
694                         tag_info->uuid_exist = true;
695                         tag_info->tag_bytes += UUID_SIZE;
696                         memcpy(tag_info->uuid, uuid_start, UUID_SIZE);
697                 }
698
699                 if (jbd_get16(tag, flags) & JBD_FLAG_LAST_TAG)
700                         tag_info->last_tag = true;
701
702         }
703         return EOK;
704 }
705
706 /**@brief  Write information to a block tag.
707  * @param  __tag pointer to the block tag
708  * @param  remaining size in buffer containing the block tag
709  * @param  tag_info information of this tag.
710  * @return  EOK when succeed, otherwise return EINVAL.*/
711 static int
712 jbd_write_block_tag(struct jbd_fs *jbd_fs,
713                     void *__tag,
714                     int32_t remain_buf_size,
715                     struct tag_info *tag_info)
716 {
717         char *uuid_start;
718         int tag_bytes = jbd_tag_bytes(jbd_fs);
719
720         tag_info->tag_bytes = tag_bytes;
721
722         /* See whether it is possible to hold a valid block tag.*/
723         if (remain_buf_size - tag_bytes < 0)
724                 return EINVAL;
725
726         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
727                                      JBD_FEATURE_INCOMPAT_CSUM_V3)) {
728                 struct jbd_block_tag3 *tag = __tag;
729                 memset(tag, 0, sizeof(struct jbd_block_tag3));
730                 jbd_set32(tag, blocknr, (uint32_t)tag_info->block);
731                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
732                                              JBD_FEATURE_INCOMPAT_64BIT))
733                         jbd_set32(tag, blocknr_high, tag_info->block >> 32);
734
735                 if (tag_info->uuid_exist) {
736                         /* See whether it is possible to hold UUID part.*/
737                         if (remain_buf_size - tag_bytes < UUID_SIZE)
738                                 return EINVAL;
739
740                         uuid_start = (char *)tag + tag_bytes;
741                         tag_info->tag_bytes += UUID_SIZE;
742                         memcpy(uuid_start, tag_info->uuid, UUID_SIZE);
743                 } else
744                         jbd_set32(tag, flags,
745                                   jbd_get32(tag, flags) | JBD_FLAG_SAME_UUID);
746
747                 jbd_block_tag_csum_set(jbd_fs, __tag, tag_info->checksum);
748
749                 if (tag_info->last_tag)
750                         jbd_set32(tag, flags,
751                                   jbd_get32(tag, flags) | JBD_FLAG_LAST_TAG);
752
753         } else {
754                 struct jbd_block_tag *tag = __tag;
755                 memset(tag, 0, sizeof(struct jbd_block_tag));
756                 jbd_set32(tag, blocknr, (uint32_t)tag_info->block);
757                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
758                                              JBD_FEATURE_INCOMPAT_64BIT))
759                         jbd_set32(tag, blocknr_high, tag_info->block >> 32);
760
761                 if (tag_info->uuid_exist) {
762                         /* See whether it is possible to hold UUID part.*/
763                         if (remain_buf_size - tag_bytes < UUID_SIZE)
764                                 return EINVAL;
765
766                         uuid_start = (char *)tag + tag_bytes;
767                         tag_info->tag_bytes += UUID_SIZE;
768                         memcpy(uuid_start, tag_info->uuid, UUID_SIZE);
769                 } else
770                         jbd_set16(tag, flags,
771                                   jbd_get16(tag, flags) | JBD_FLAG_SAME_UUID);
772
773                 jbd_block_tag_csum_set(jbd_fs, __tag, tag_info->checksum);
774
775                 if (tag_info->last_tag)
776                         jbd_set16(tag, flags,
777                                   jbd_get16(tag, flags) | JBD_FLAG_LAST_TAG);
778
779         }
780         return EOK;
781 }
782
783 /**@brief  Iterate all block tags in a block.
784  * @param  jbd_fs jbd filesystem
785  * @param  __tag_start pointer to the block
786  * @param  tag_tbl_size size of the block
787  * @param  func callback routine to indicate that
788  *         a block tag is found
789  * @param  arg additional argument to be passed to func */
790 static void
791 jbd_iterate_block_table(struct jbd_fs *jbd_fs,
792                         void *__tag_start,
793                         int32_t tag_tbl_size,
794                         void (*func)(struct jbd_fs * jbd_fs,
795                                         ext4_fsblk_t block,
796                                         uint8_t *uuid,
797                                         void *arg),
798                         void *arg)
799 {
800         char *tag_start, *tag_ptr;
801         int tag_bytes = jbd_tag_bytes(jbd_fs);
802         tag_start = __tag_start;
803         tag_ptr = tag_start;
804
805         /* Cut off the size of block tail storing checksum. */
806         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
807                                      JBD_FEATURE_INCOMPAT_CSUM_V2) ||
808             JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
809                                      JBD_FEATURE_INCOMPAT_CSUM_V3))
810                 tag_tbl_size -= sizeof(struct jbd_block_tail);
811
812         while (tag_tbl_size) {
813                 struct tag_info tag_info;
814                 int rc = jbd_extract_block_tag(jbd_fs,
815                                       tag_ptr,
816                                       tag_bytes,
817                                       tag_tbl_size,
818                                       &tag_info);
819                 if (rc != EOK)
820                         break;
821
822                 if (func)
823                         func(jbd_fs, tag_info.block, tag_info.uuid, arg);
824
825                 /* Stop the iteration when we reach the last tag. */
826                 if (tag_info.last_tag)
827                         break;
828
829                 tag_ptr += tag_info.tag_bytes;
830                 tag_tbl_size -= tag_info.tag_bytes;
831         }
832 }
833
834 static void jbd_display_block_tags(struct jbd_fs *jbd_fs,
835                                    ext4_fsblk_t block,
836                                    uint8_t *uuid,
837                                    void *arg)
838 {
839         uint32_t *iblock = arg;
840         ext4_dbg(DEBUG_JBD, "Block in block_tag: %" PRIu64 "\n", block);
841         (*iblock)++;
842         wrap(&jbd_fs->sb, *iblock);
843         (void)jbd_fs;
844         (void)uuid;
845         return;
846 }
847
848 static struct revoke_entry *
849 jbd_revoke_entry_lookup(struct recover_info *info, ext4_fsblk_t block)
850 {
851         struct revoke_entry tmp = {
852                 .block = block
853         };
854
855         return RB_FIND(jbd_revoke, &info->revoke_root, &tmp);
856 }
857
858 /**@brief  Replay a block in a transaction.
859  * @param  jbd_fs jbd filesystem
860  * @param  block  block address to be replayed.*/
861 static void jbd_replay_block_tags(struct jbd_fs *jbd_fs,
862                                   ext4_fsblk_t block,
863                                   uint8_t *uuid __unused,
864                                   void *__arg)
865 {
866         int r;
867         struct replay_arg *arg = __arg;
868         struct recover_info *info = arg->info;
869         uint32_t *this_block = arg->this_block;
870         struct revoke_entry *revoke_entry;
871         struct ext4_block journal_block, ext4_block;
872         struct ext4_fs *fs = jbd_fs->inode_ref.fs;
873
874         (*this_block)++;
875         wrap(&jbd_fs->sb, *this_block);
876
877         /* We replay this block only if the current transaction id
878          * is equal or greater than that in revoke entry.*/
879         revoke_entry = jbd_revoke_entry_lookup(info, block);
880         if (revoke_entry &&
881             arg->this_trans_id < revoke_entry->trans_id)
882                 return;
883
884         ext4_dbg(DEBUG_JBD,
885                  "Replaying block in block_tag: %" PRIu64 "\n",
886                  block);
887
888         r = jbd_block_get(jbd_fs, &journal_block, *this_block);
889         if (r != EOK)
890                 return;
891
892         /* We need special treatment for ext4 superblock. */
893         if (block) {
894                 r = ext4_block_get_noread(fs->bdev, &ext4_block, block);
895                 if (r != EOK) {
896                         jbd_block_set(jbd_fs, &journal_block);
897                         return;
898                 }
899
900                 memcpy(ext4_block.data,
901                         journal_block.data,
902                         jbd_get32(&jbd_fs->sb, blocksize));
903
904                 ext4_bcache_set_dirty(ext4_block.buf);
905                 ext4_block_set(fs->bdev, &ext4_block);
906         } else {
907                 uint16_t mount_count, state;
908                 mount_count = ext4_get16(&fs->sb, mount_count);
909                 state = ext4_get16(&fs->sb, state);
910
911                 memcpy(&fs->sb,
912                         journal_block.data + EXT4_SUPERBLOCK_OFFSET,
913                         EXT4_SUPERBLOCK_SIZE);
914
915                 /* Mark system as mounted */
916                 ext4_set16(&fs->sb, state, state);
917                 r = ext4_sb_write(fs->bdev, &fs->sb);
918                 if (r != EOK)
919                         return;
920
921                 /*Update mount count*/
922                 ext4_set16(&fs->sb, mount_count, mount_count);
923         }
924
925         jbd_block_set(jbd_fs, &journal_block);
926         
927         return;
928 }
929
930 /**@brief  Add block address to revoke tree, along with
931  *         its transaction id.
932  * @param  info  journal replay info
933  * @param  block  block address to be replayed.*/
934 static void jbd_add_revoke_block_tags(struct recover_info *info,
935                                       ext4_fsblk_t block)
936 {
937         struct revoke_entry *revoke_entry;
938
939         ext4_dbg(DEBUG_JBD, "Add block %" PRIu64 " to revoke tree\n", block);
940         /* If the revoke entry with respect to the block address
941          * exists already, update its transaction id.*/
942         revoke_entry = jbd_revoke_entry_lookup(info, block);
943         if (revoke_entry) {
944                 revoke_entry->trans_id = info->this_trans_id;
945                 return;
946         }
947
948         revoke_entry = jbd_alloc_revoke_entry();
949         ext4_assert(revoke_entry);
950         revoke_entry->block = block;
951         revoke_entry->trans_id = info->this_trans_id;
952         RB_INSERT(jbd_revoke, &info->revoke_root, revoke_entry);
953
954         return;
955 }
956
957 static void jbd_destroy_revoke_tree(struct recover_info *info)
958 {
959         while (!RB_EMPTY(&info->revoke_root)) {
960                 struct revoke_entry *revoke_entry =
961                         RB_MIN(jbd_revoke, &info->revoke_root);
962                 ext4_assert(revoke_entry);
963                 RB_REMOVE(jbd_revoke, &info->revoke_root, revoke_entry);
964                 jbd_free_revoke_entry(revoke_entry);
965         }
966 }
967
968
/* Passes that jbd_iterate_log() can make over the journal. */
#define ACTION_SCAN	0	/* count transactions, find first/last id */
#define ACTION_REVOKE	1	/* collect revoke blocks into the revoke tree */
#define ACTION_RECOVER	2	/* replay the blocks listed in descriptor blocks */
972
973 /**@brief  Add entries in a revoke block to revoke tree.
974  * @param  jbd_fs jbd filesystem
975  * @param  header revoke block header
976  * @param  recover_info  journal replay info*/
977 static void jbd_build_revoke_tree(struct jbd_fs *jbd_fs,
978                                   struct jbd_bhdr *header,
979                                   struct recover_info *info)
980 {
981         char *blocks_entry;
982         struct jbd_revoke_header *revoke_hdr =
983                 (struct jbd_revoke_header *)header;
984         uint32_t i, nr_entries, record_len = 4;
985
986         /* If we are working on a 64bit jbd filesystem, */
987         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
988                                      JBD_FEATURE_INCOMPAT_64BIT))
989                 record_len = 8;
990
991         nr_entries = (jbd_get32(revoke_hdr, count) -
992                         sizeof(struct jbd_revoke_header)) /
993                         record_len;
994
995         blocks_entry = (char *)(revoke_hdr + 1);
996
997         for (i = 0;i < nr_entries;i++) {
998                 if (record_len == 8) {
999                         uint64_t *blocks =
1000                                 (uint64_t *)blocks_entry;
1001                         jbd_add_revoke_block_tags(info, to_be64(*blocks));
1002                 } else {
1003                         uint32_t *blocks =
1004                                 (uint32_t *)blocks_entry;
1005                         jbd_add_revoke_block_tags(info, to_be32(*blocks));
1006                 }
1007                 blocks_entry += record_len;
1008         }
1009 }
1010
1011 static void jbd_debug_descriptor_block(struct jbd_fs *jbd_fs,
1012                                        struct jbd_bhdr *header,
1013                                        uint32_t *iblock)
1014 {
1015         jbd_iterate_block_table(jbd_fs,
1016                                 header + 1,
1017                                 jbd_get32(&jbd_fs->sb, blocksize) -
1018                                         sizeof(struct jbd_bhdr),
1019                                 jbd_display_block_tags,
1020                                 iblock);
1021 }
1022
1023 static void jbd_replay_descriptor_block(struct jbd_fs *jbd_fs,
1024                                         struct jbd_bhdr *header,
1025                                         struct replay_arg *arg)
1026 {
1027         jbd_iterate_block_table(jbd_fs,
1028                                 header + 1,
1029                                 jbd_get32(&jbd_fs->sb, blocksize) -
1030                                         sizeof(struct jbd_bhdr),
1031                                 jbd_replay_block_tags,
1032                                 arg);
1033 }
1034
1035 /**@brief  The core routine of journal replay.
1036  * @param  jbd_fs jbd filesystem
1037  * @param  recover_info  journal replay info
1038  * @param  action action needed to be taken
1039  * @return standard error code*/
1040 static int jbd_iterate_log(struct jbd_fs *jbd_fs,
1041                            struct recover_info *info,
1042                            int action)
1043 {
1044         int r = EOK;
1045         bool log_end = false;
1046         struct jbd_sb *sb = &jbd_fs->sb;
1047         uint32_t start_trans_id, this_trans_id;
1048         uint32_t start_block, this_block;
1049
1050         /* We start iterating valid blocks in the whole journal.*/
1051         start_trans_id = this_trans_id = jbd_get32(sb, sequence);
1052         start_block = this_block = jbd_get32(sb, start);
1053         if (action == ACTION_SCAN)
1054                 info->trans_cnt = 0;
1055         else if (!info->trans_cnt)
1056                 log_end = true;
1057
1058         ext4_dbg(DEBUG_JBD, "Start of journal at trans id: %" PRIu32 "\n",
1059                             start_trans_id);
1060
1061         while (!log_end) {
1062                 struct ext4_block block;
1063                 struct jbd_bhdr *header;
1064                 /* If we are not scanning for the last
1065                  * valid transaction in the journal,
1066                  * we will stop when we reach the end of
1067                  * the journal.*/
1068                 if (action != ACTION_SCAN)
1069                         if (this_trans_id > info->last_trans_id) {
1070                                 log_end = true;
1071                                 continue;
1072                         }
1073
1074                 r = jbd_block_get(jbd_fs, &block, this_block);
1075                 if (r != EOK)
1076                         break;
1077
1078                 header = (struct jbd_bhdr *)block.data;
1079                 /* This block does not have a valid magic number,
1080                  * so we have reached the end of the journal.*/
1081                 if (jbd_get32(header, magic) != JBD_MAGIC_NUMBER) {
1082                         jbd_block_set(jbd_fs, &block);
1083                         log_end = true;
1084                         continue;
1085                 }
1086
1087                 /* If the transaction id we found is not expected,
1088                  * we may have reached the end of the journal.
1089                  *
1090                  * If we are not scanning the journal, something
1091                  * bad might have taken place. :-( */
1092                 if (jbd_get32(header, sequence) != this_trans_id) {
1093                         if (action != ACTION_SCAN)
1094                                 r = EIO;
1095
1096                         jbd_block_set(jbd_fs, &block);
1097                         log_end = true;
1098                         continue;
1099                 }
1100
1101                 switch (jbd_get32(header, blocktype)) {
1102                 case JBD_DESCRIPTOR_BLOCK:
1103                         if (!jbd_verify_meta_csum(jbd_fs, header)) {
1104                                 ext4_dbg(DEBUG_JBD,
1105                                         DBG_WARN "Descriptor block checksum failed."
1106                                                 "Journal block: %" PRIu32"\n",
1107                                                 this_block);
1108                                 log_end = true;
1109                                 break;
1110                         }
1111                         ext4_dbg(DEBUG_JBD, "Descriptor block: %" PRIu32", "
1112                                             "trans_id: %" PRIu32"\n",
1113                                             this_block, this_trans_id);
1114                         if (action == ACTION_RECOVER) {
1115                                 struct replay_arg replay_arg;
1116                                 replay_arg.info = info;
1117                                 replay_arg.this_block = &this_block;
1118                                 replay_arg.this_trans_id = this_trans_id;
1119
1120                                 jbd_replay_descriptor_block(jbd_fs,
1121                                                 header, &replay_arg);
1122                         } else
1123                                 jbd_debug_descriptor_block(jbd_fs,
1124                                                 header, &this_block);
1125
1126                         break;
1127                 case JBD_COMMIT_BLOCK:
1128                         if (!jbd_verify_commit_csum(jbd_fs,
1129                                         (struct jbd_commit_header *)header)) {
1130                                 ext4_dbg(DEBUG_JBD,
1131                                         DBG_WARN "Commit block checksum failed."
1132                                                 "Journal block: %" PRIu32"\n",
1133                                                 this_block);
1134                                 log_end = true;
1135                                 break;
1136                         }
1137                         ext4_dbg(DEBUG_JBD, "Commit block: %" PRIu32", "
1138                                             "trans_id: %" PRIu32"\n",
1139                                             this_block, this_trans_id);
1140                         /* This is the end of a transaction,
1141                          * we may now proceed to the next transaction.
1142                          */
1143                         this_trans_id++;
1144                         info->trans_cnt++;
1145                         break;
1146                 case JBD_REVOKE_BLOCK:
1147                         if (!jbd_verify_meta_csum(jbd_fs, header)) {
1148                                 ext4_dbg(DEBUG_JBD,
1149                                         DBG_WARN "Revoke block checksum failed."
1150                                                 "Journal block: %" PRIu32"\n",
1151                                                 this_block);
1152                                 log_end = true;
1153                                 break;
1154                         }
1155                         ext4_dbg(DEBUG_JBD, "Revoke block: %" PRIu32", "
1156                                             "trans_id: %" PRIu32"\n",
1157                                             this_block, this_trans_id);
1158                         if (action == ACTION_REVOKE) {
1159                                 info->this_trans_id = this_trans_id;
1160                                 jbd_build_revoke_tree(jbd_fs,
1161                                                 header, info);
1162                         }
1163                         break;
1164                 default:
1165                         log_end = true;
1166                         break;
1167                 }
1168                 jbd_block_set(jbd_fs, &block);
1169                 this_block++;
1170                 wrap(sb, this_block);
1171                 if (this_block == start_block)
1172                         log_end = true;
1173
1174         }
1175         ext4_dbg(DEBUG_JBD, "End of journal.\n");
1176         if (r == EOK && action == ACTION_SCAN) {
1177                 /* We have finished scanning the journal. */
1178                 info->start_trans_id = start_trans_id;
1179                 if (this_trans_id > start_trans_id)
1180                         info->last_trans_id = this_trans_id - 1;
1181                 else
1182                         info->last_trans_id = this_trans_id;
1183         }
1184
1185         return r;
1186 }
1187
1188 /**@brief  Replay journal.
1189  * @param  jbd_fs jbd filesystem
1190  * @return standard error code*/
1191 int jbd_recover(struct jbd_fs *jbd_fs)
1192 {
1193         int r;
1194         struct recover_info info;
1195         struct jbd_sb *sb = &jbd_fs->sb;
1196         if (!sb->start)
1197                 return EOK;
1198
1199         RB_INIT(&info.revoke_root);
1200
1201         r = jbd_iterate_log(jbd_fs, &info, ACTION_SCAN);
1202         if (r != EOK)
1203                 return r;
1204
1205         r = jbd_iterate_log(jbd_fs, &info, ACTION_REVOKE);
1206         if (r != EOK)
1207                 return r;
1208
1209         r = jbd_iterate_log(jbd_fs, &info, ACTION_RECOVER);
1210         if (r == EOK) {
1211                 /* If we successfully replay the journal,
1212                  * clear EXT4_FINCOM_RECOVER flag on the
1213                  * ext4 superblock, and set the start of
1214                  * journal to 0.*/
1215                 uint32_t features_incompatible =
1216                         ext4_get32(&jbd_fs->inode_ref.fs->sb,
1217                                    features_incompatible);
1218                 jbd_set32(&jbd_fs->sb, start, 0);
1219                 features_incompatible &= ~EXT4_FINCOM_RECOVER;
1220                 ext4_set32(&jbd_fs->inode_ref.fs->sb,
1221                            features_incompatible,
1222                            features_incompatible);
1223                 jbd_fs->dirty = true;
1224                 r = ext4_sb_write(jbd_fs->bdev,
1225                                   &jbd_fs->inode_ref.fs->sb);
1226         }
1227         jbd_destroy_revoke_tree(&info);
1228         return r;
1229 }
1230
1231 static void jbd_journal_write_sb(struct jbd_journal *journal)
1232 {
1233         struct jbd_fs *jbd_fs = journal->jbd_fs;
1234         jbd_set32(&jbd_fs->sb, start, journal->start);
1235         jbd_set32(&jbd_fs->sb, sequence, journal->trans_id);
1236         jbd_fs->dirty = true;
1237 }
1238
1239 /**@brief  Start accessing the journal.
1240  * @param  jbd_fs jbd filesystem
1241  * @param  journal current journal session
1242  * @return standard error code*/
1243 int jbd_journal_start(struct jbd_fs *jbd_fs,
1244                       struct jbd_journal *journal)
1245 {
1246         int r;
1247         uint32_t features_incompatible =
1248                         ext4_get32(&jbd_fs->inode_ref.fs->sb,
1249                                    features_incompatible);
1250         struct ext4_block block = EXT4_BLOCK_ZERO();
1251         features_incompatible |= EXT4_FINCOM_RECOVER;
1252         ext4_set32(&jbd_fs->inode_ref.fs->sb,
1253                         features_incompatible,
1254                         features_incompatible);
1255         r = ext4_sb_write(jbd_fs->bdev,
1256                         &jbd_fs->inode_ref.fs->sb);
1257         if (r != EOK)
1258                 return r;
1259
1260         journal->first = jbd_get32(&jbd_fs->sb, first);
1261         journal->start = journal->first;
1262         journal->last = journal->first;
1263         journal->trans_id = 1;
1264         journal->alloc_trans_id = 1;
1265
1266         journal->block_size = jbd_get32(&jbd_fs->sb, blocksize);
1267
1268         r = jbd_block_get_noread(jbd_fs,
1269                          &block,
1270                          journal->start);
1271         if (r != EOK) {
1272                 memset(journal, 0, sizeof(struct jbd_journal));
1273                 return r;
1274         }
1275         memset(block.data, 0, journal->block_size);
1276         ext4_bcache_set_dirty(block.buf);
1277         r = jbd_block_set(jbd_fs, &block);
1278         if (r != EOK) {
1279                 memset(journal, 0, sizeof(struct jbd_journal));
1280                 return r;
1281         }
1282
1283         TAILQ_INIT(&journal->cp_queue);
1284         RB_INIT(&journal->block_rec_root);
1285         journal->jbd_fs = jbd_fs;
1286         jbd_journal_write_sb(journal);
1287         r = jbd_write_sb(jbd_fs);
1288         if (r != EOK)
1289                 return r;
1290
1291         jbd_fs->bdev->journal = journal;
1292         return EOK;
1293 }
1294
1295 static void jbd_trans_end_write(struct ext4_bcache *bc __unused,
1296                           struct ext4_buf *buf __unused,
1297                           int res,
1298                           void *arg);
1299
1300 /*
1301  * This routine is only suitable to committed transactions. */
1302 static void jbd_journal_flush_trans(struct jbd_trans *trans)
1303 {
1304         struct jbd_buf *jbd_buf, *tmp;
1305         struct jbd_journal *journal = trans->journal;
1306         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1307         void *tmp_data = malloc(journal->block_size);
1308         ext4_assert(tmp_data);
1309
1310         TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node,
1311                         tmp) {
1312                 struct ext4_buf *buf;
1313                 struct ext4_block block;
1314                 /* The buffer is not yet flushed. */
1315                 buf = ext4_bcache_find_get(fs->bdev->bc, &block,
1316                                            jbd_buf->block_rec->lba);
1317                 if (!(buf && ext4_bcache_test_flag(buf, BC_UPTODATE) &&
1318                       jbd_buf->block_rec->trans == trans)) {
1319                         int r;
1320                         struct ext4_block jbd_block = EXT4_BLOCK_ZERO();
1321                         ext4_assert(jbd_block_get(journal->jbd_fs,
1322                                                 &jbd_block,
1323                                                 jbd_buf->jbd_lba) == EOK);
1324                         memcpy(tmp_data, jbd_block.data,
1325                                         journal->block_size);
1326                         ext4_block_set(fs->bdev, &jbd_block);
1327                         r = ext4_blocks_set_direct(fs->bdev, tmp_data,
1328                                         jbd_buf->block_rec->lba, 1);
1329                         jbd_trans_end_write(fs->bdev->bc, buf, r, jbd_buf);
1330                 } else
1331                         ext4_block_flush_buf(fs->bdev, buf);
1332
1333                 if (buf)
1334                         ext4_block_set(fs->bdev, &block);
1335         }
1336
1337         free(tmp_data);
1338 }
1339
1340 static void
1341 jbd_journal_skip_pure_revoke(struct jbd_journal *journal,
1342                              struct jbd_trans *trans)
1343 {
1344         journal->start = trans->start_iblock +
1345                 trans->alloc_blocks;
1346         wrap(&journal->jbd_fs->sb, journal->start);
1347         journal->trans_id = trans->trans_id + 1;
1348         jbd_journal_free_trans(journal,
1349                         trans, false);
1350         jbd_journal_write_sb(journal);
1351 }
1352
1353 void
1354 jbd_journal_purge_cp_trans(struct jbd_journal *journal,
1355                            bool flush,
1356                            bool once)
1357 {
1358         struct jbd_trans *trans;
1359         while ((trans = TAILQ_FIRST(&journal->cp_queue))) {
1360                 if (!trans->data_cnt) {
1361                         TAILQ_REMOVE(&journal->cp_queue,
1362                                         trans,
1363                                         trans_node);
1364                         jbd_journal_skip_pure_revoke(journal, trans);
1365                 } else {
1366                         if (trans->data_cnt ==
1367                                         trans->written_cnt) {
1368                                 journal->start =
1369                                         trans->start_iblock +
1370                                         trans->alloc_blocks;
1371                                 wrap(&journal->jbd_fs->sb,
1372                                                 journal->start);
1373                                 journal->trans_id =
1374                                         trans->trans_id + 1;
1375                                 TAILQ_REMOVE(&journal->cp_queue,
1376                                                 trans,
1377                                                 trans_node);
1378                                 jbd_journal_free_trans(journal,
1379                                                 trans,
1380                                                 false);
1381                                 jbd_journal_write_sb(journal);
1382                         } else if (!flush) {
1383                                 journal->start =
1384                                         trans->start_iblock;
1385                                 wrap(&journal->jbd_fs->sb,
1386                                                 journal->start);
1387                                 journal->trans_id =
1388                                         trans->trans_id;
1389                                 jbd_journal_write_sb(journal);
1390                                 break;
1391                         } else
1392                                 jbd_journal_flush_trans(trans);
1393                 }
1394                 if (once)
1395                         break;
1396         }
1397 }
1398
1399 /**@brief  Stop accessing the journal.
1400  * @param  journal current journal session
1401  * @return standard error code*/
1402 int jbd_journal_stop(struct jbd_journal *journal)
1403 {
1404         int r;
1405         struct jbd_fs *jbd_fs = journal->jbd_fs;
1406         uint32_t features_incompatible;
1407
1408         /* Make sure that journalled content have reached
1409          * the disk.*/
1410         jbd_journal_purge_cp_trans(journal, true, false);
1411
1412         /* There should be no block record in this journal
1413          * session. */
1414         if (!RB_EMPTY(&journal->block_rec_root))
1415                 ext4_dbg(DEBUG_JBD,
1416                          DBG_WARN "There are still block records "
1417                                   "in this journal session!\n");
1418
1419         features_incompatible =
1420                 ext4_get32(&jbd_fs->inode_ref.fs->sb,
1421                            features_incompatible);
1422         features_incompatible &= ~EXT4_FINCOM_RECOVER;
1423         ext4_set32(&jbd_fs->inode_ref.fs->sb,
1424                         features_incompatible,
1425                         features_incompatible);
1426         r = ext4_sb_write(jbd_fs->bdev,
1427                         &jbd_fs->inode_ref.fs->sb);
1428         if (r != EOK)
1429                 return r;
1430
1431         journal->start = 0;
1432         journal->trans_id = 0;
1433         jbd_journal_write_sb(journal);
1434         return jbd_write_sb(journal->jbd_fs);
1435 }
1436
1437 /**@brief  Allocate a block in the journal.
1438  * @param  journal current journal session
1439  * @param  trans transaction
1440  * @return allocated block address*/
1441 static uint32_t jbd_journal_alloc_block(struct jbd_journal *journal,
1442                                         struct jbd_trans *trans)
1443 {
1444         uint32_t start_block;
1445
1446         start_block = journal->last++;
1447         trans->alloc_blocks++;
1448         wrap(&journal->jbd_fs->sb, journal->last);
1449         
1450         /* If there is no space left, flush all journalled
1451          * blocks to disk first.*/
1452         if (journal->last == journal->start)
1453                 jbd_journal_purge_cp_trans(journal, true, false);
1454
1455         return start_block;
1456 }
1457
1458 static struct jbd_block_rec *
1459 jbd_trans_block_rec_lookup(struct jbd_journal *journal,
1460                            ext4_fsblk_t lba)
1461 {
1462         struct jbd_block_rec tmp = {
1463                 .lba = lba
1464         };
1465
1466         return RB_FIND(jbd_block,
1467                        &journal->block_rec_root,
1468                        &tmp);
1469 }
1470
1471 static void
1472 jbd_trans_change_ownership(struct jbd_block_rec *block_rec,
1473                            struct jbd_trans *new_trans)
1474 {
1475         LIST_REMOVE(block_rec, tbrec_node);
1476         if (new_trans) {
1477                 /* Now this block record belongs to this transaction. */
1478                 LIST_INSERT_HEAD(&new_trans->tbrec_list, block_rec, tbrec_node);
1479         }
1480         block_rec->trans = new_trans;
1481 }
1482
1483 static inline struct jbd_block_rec *
1484 jbd_trans_insert_block_rec(struct jbd_trans *trans,
1485                            ext4_fsblk_t lba)
1486 {
1487         struct jbd_block_rec *block_rec;
1488         block_rec = jbd_trans_block_rec_lookup(trans->journal, lba);
1489         if (block_rec) {
1490                 jbd_trans_change_ownership(block_rec, trans);
1491                 return block_rec;
1492         }
1493         block_rec = calloc(1, sizeof(struct jbd_block_rec));
1494         if (!block_rec)
1495                 return NULL;
1496
1497         block_rec->lba = lba;
1498         block_rec->trans = trans;
1499         TAILQ_INIT(&block_rec->dirty_buf_queue);
1500         LIST_INSERT_HEAD(&trans->tbrec_list, block_rec, tbrec_node);
1501         RB_INSERT(jbd_block, &trans->journal->block_rec_root, block_rec);
1502         return block_rec;
1503 }
1504
1505 /*
1506  * This routine will do the dirty works.
1507  */
1508 static void
1509 jbd_trans_finish_callback(struct jbd_journal *journal,
1510                           const struct jbd_trans *trans,
1511                           struct jbd_block_rec *block_rec,
1512                           bool abort,
1513                           bool revoke)
1514 {
1515         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1516         if (block_rec->trans != trans)
1517                 return;
1518
1519         if (!abort) {
1520                 struct jbd_buf *jbd_buf, *tmp;
1521                 TAILQ_FOREACH_SAFE(jbd_buf,
1522                                 &block_rec->dirty_buf_queue,
1523                                 dirty_buf_node,
1524                                 tmp) {
1525                         jbd_trans_end_write(fs->bdev->bc,
1526                                         NULL,
1527                                         EOK,
1528                                         jbd_buf);
1529                 }
1530         } else {
1531                 /*
1532                  * We have to roll back data if the block is going to be
1533                  * aborted.
1534                  */
1535                 struct jbd_buf *jbd_buf;
1536                 struct ext4_block jbd_block = EXT4_BLOCK_ZERO(),
1537                                   block = EXT4_BLOCK_ZERO();
1538                 jbd_buf = TAILQ_LAST(&block_rec->dirty_buf_queue,
1539                                 jbd_buf_dirty);
1540                 if (jbd_buf) {
1541                         if (!revoke) {
1542                                 ext4_assert(ext4_block_get_noread(fs->bdev,
1543                                                         &block,
1544                                                         block_rec->lba) == EOK);
1545                                 ext4_assert(jbd_block_get(journal->jbd_fs,
1546                                                         &jbd_block,
1547                                                         jbd_buf->jbd_lba) == EOK);
1548                                 memcpy(block.data, jbd_block.data,
1549                                                 journal->block_size);
1550
1551                                 jbd_trans_change_ownership(block_rec,
1552                                                 jbd_buf->trans);
1553
1554                                 block.buf->end_write = jbd_trans_end_write;
1555                                 block.buf->end_write_arg = jbd_buf;
1556
1557                                 ext4_bcache_set_flag(jbd_block.buf, BC_TMP);
1558                                 ext4_bcache_set_dirty(block.buf);
1559
1560                                 ext4_block_set(fs->bdev, &jbd_block);
1561                                 ext4_block_set(fs->bdev, &block);
1562                                 return;
1563                         } else {
1564                                 /* The revoked buffer is yet written. */
1565                                 jbd_trans_change_ownership(block_rec,
1566                                                 jbd_buf->trans);
1567                         }
1568                 }
1569         }
1570 }
1571
1572 static inline void
1573 jbd_trans_remove_block_rec(struct jbd_journal *journal,
1574                            struct jbd_block_rec *block_rec,
1575                            struct jbd_trans *trans)
1576 {
1577         /* If this block record doesn't belong to this transaction,
1578          * give up.*/
1579         if (block_rec->trans == trans) {
1580                 LIST_REMOVE(block_rec, tbrec_node);
1581                 RB_REMOVE(jbd_block,
1582                                 &journal->block_rec_root,
1583                                 block_rec);
1584                 free(block_rec);
1585         }
1586 }
1587
1588 /**@brief  Add block to a transaction and mark it dirty.
1589  * @param  trans transaction
1590  * @param  block block descriptor
1591  * @return standard error code*/
1592 int jbd_trans_set_block_dirty(struct jbd_trans *trans,
1593                               struct ext4_block *block)
1594 {
1595         struct jbd_buf *jbd_buf;
1596         struct jbd_revoke_rec *rec, tmp_rec = {
1597                 .lba = block->lb_id
1598         };
1599         struct jbd_block_rec *block_rec;
1600
1601         if (block->buf->end_write == jbd_trans_end_write) {
1602                 jbd_buf = block->buf->end_write_arg;
1603                 if (jbd_buf && jbd_buf->trans == trans)
1604                         return EOK;
1605         }
1606         jbd_buf = calloc(1, sizeof(struct jbd_buf));
1607         if (!jbd_buf)
1608                 return ENOMEM;
1609
1610         if ((block_rec = jbd_trans_insert_block_rec(trans,
1611                                         block->lb_id)) == NULL) {
1612                 free(jbd_buf);
1613                 return ENOMEM;
1614         }
1615
1616         TAILQ_INSERT_TAIL(&block_rec->dirty_buf_queue,
1617                         jbd_buf,
1618                         dirty_buf_node);
1619
1620         jbd_buf->block_rec = block_rec;
1621         jbd_buf->trans = trans;
1622         jbd_buf->block = *block;
1623         ext4_bcache_inc_ref(block->buf);
1624
1625         /* If the content reach the disk, notify us
1626          * so that we may do a checkpoint. */
1627         block->buf->end_write = jbd_trans_end_write;
1628         block->buf->end_write_arg = jbd_buf;
1629
1630         trans->data_cnt++;
1631         TAILQ_INSERT_HEAD(&trans->buf_queue, jbd_buf, buf_node);
1632
1633         ext4_bcache_set_dirty(block->buf);
1634         rec = RB_FIND(jbd_revoke_tree,
1635                         &trans->revoke_root,
1636                         &tmp_rec);
1637         if (rec)
1638                 RB_REMOVE(jbd_revoke_tree, &trans->revoke_root,
1639                           rec);
1640
1641         return EOK;
1642 }
1643
1644 /**@brief  Add block to be revoked to a transaction
1645  * @param  trans transaction
1646  * @param  lba logical block address
1647  * @return standard error code*/
1648 int jbd_trans_revoke_block(struct jbd_trans *trans,
1649                            ext4_fsblk_t lba)
1650 {
1651         struct jbd_revoke_rec *rec =
1652                 calloc(1, sizeof(struct jbd_revoke_rec));
1653         if (!rec)
1654                 return ENOMEM;
1655
1656         rec->lba = lba;
1657         RB_INSERT(jbd_revoke_tree, &trans->revoke_root, rec);
1658         return EOK;
1659 }
1660
1661 /**@brief  Try to add block to be revoked to a transaction.
1662  *         If @lba still remains in an transaction on checkpoint
1663  *         queue, add @lba as a revoked block to the transaction.
1664  * @param  trans transaction
1665  * @param  lba logical block address
1666  * @return standard error code*/
1667 int jbd_trans_try_revoke_block(struct jbd_trans *trans,
1668                                ext4_fsblk_t lba)
1669 {
1670         struct jbd_journal *journal = trans->journal;
1671         struct jbd_block_rec *block_rec =
1672                 jbd_trans_block_rec_lookup(journal, lba);
1673
1674         if (block_rec) {
1675                 if (block_rec->trans == trans) {
1676                         struct jbd_buf *jbd_buf =
1677                                 TAILQ_LAST(&block_rec->dirty_buf_queue,
1678                                         jbd_buf_dirty);
1679                         /* If there are still unwritten buffers. */
1680                         if (TAILQ_FIRST(&block_rec->dirty_buf_queue) !=
1681                             jbd_buf)
1682                                 jbd_trans_revoke_block(trans, lba);
1683
1684                 } else
1685                         jbd_trans_revoke_block(trans, lba);
1686         }
1687
1688         return EOK;
1689 }
1690
1691 /**@brief  Free a transaction
1692  * @param  journal current journal session
1693  * @param  trans transaction
1694  * @param  abort discard all the modifications on the block?
1695  * @return standard error code*/
1696 void jbd_journal_free_trans(struct jbd_journal *journal,
1697                             struct jbd_trans *trans,
1698                             bool abort)
1699 {
1700         struct jbd_buf *jbd_buf, *tmp;
1701         struct jbd_revoke_rec *rec, *tmp2;
1702         struct jbd_block_rec *block_rec, *tmp3;
1703         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1704         TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node,
1705                           tmp) {
1706                 block_rec = jbd_buf->block_rec;
1707                 if (abort) {
1708                         jbd_buf->block.buf->end_write = NULL;
1709                         jbd_buf->block.buf->end_write_arg = NULL;
1710                         ext4_bcache_clear_dirty(jbd_buf->block.buf);
1711                         ext4_block_set(fs->bdev, &jbd_buf->block);
1712                 }
1713
1714                 TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue,
1715                         jbd_buf,
1716                         dirty_buf_node);
1717                 jbd_trans_finish_callback(journal,
1718                                 trans,
1719                                 block_rec,
1720                                 abort,
1721                                 false);
1722                 TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
1723                 free(jbd_buf);
1724         }
1725         RB_FOREACH_SAFE(rec, jbd_revoke_tree, &trans->revoke_root,
1726                           tmp2) {
1727                 RB_REMOVE(jbd_revoke_tree, &trans->revoke_root, rec);
1728                 free(rec);
1729         }
1730         LIST_FOREACH_SAFE(block_rec, &trans->tbrec_list, tbrec_node,
1731                           tmp3) {
1732                 jbd_trans_remove_block_rec(journal, block_rec, trans);
1733         }
1734
1735         free(trans);
1736 }
1737
1738 /**@brief  Write commit block for a transaction
1739  * @param  trans transaction
1740  * @return standard error code*/
1741 static int jbd_trans_write_commit_block(struct jbd_trans *trans)
1742 {
1743         int rc;
1744         struct ext4_block block;
1745         struct jbd_commit_header *header;
1746         uint32_t commit_iblock, orig_commit_iblock;
1747         struct jbd_journal *journal = trans->journal;
1748
1749         commit_iblock = jbd_journal_alloc_block(journal, trans);
1750         rc = jbd_block_get_noread(journal->jbd_fs, &block, commit_iblock);
1751         if (rc != EOK)
1752                 return rc;
1753
1754         header = (struct jbd_commit_header *)block.data;
1755         jbd_set32(&header->header, magic, JBD_MAGIC_NUMBER);
1756         jbd_set32(&header->header, blocktype, JBD_COMMIT_BLOCK);
1757         jbd_set32(&header->header, sequence, trans->trans_id);
1758
1759         if (JBD_HAS_INCOMPAT_FEATURE(&journal->jbd_fs->sb,
1760                                 JBD_FEATURE_COMPAT_CHECKSUM)) {
1761                 jbd_set32(header, chksum_type, JBD_CRC32_CHKSUM);
1762                 jbd_set32(header, chksum_size, JBD_CRC32_CHKSUM_SIZE);
1763                 jbd_set32(header, chksum[0], trans->data_csum);
1764         }
1765         jbd_commit_csum_set(journal->jbd_fs, header);
1766         ext4_bcache_set_dirty(block.buf);
1767         ext4_bcache_set_flag(block.buf, BC_TMP);
1768         rc = jbd_block_set(journal->jbd_fs, &block);
1769         if (rc != EOK)
1770                 return rc;
1771
1772         orig_commit_iblock = commit_iblock;
1773         commit_iblock++;
1774         wrap(&journal->jbd_fs->sb, commit_iblock);
1775
1776         /* To prevent accidental reference to stale journalling metadata. */
1777         if (orig_commit_iblock < commit_iblock) {
1778                 rc = jbd_block_get_noread(journal->jbd_fs, &block, commit_iblock);
1779                 if (rc != EOK)
1780                         return rc;
1781
1782                 memset(block.data, 0, journal->block_size);
1783                 ext4_bcache_set_dirty(block.buf);
1784                 ext4_bcache_set_flag(block.buf, BC_TMP);
1785                 rc = jbd_block_set(journal->jbd_fs, &block);
1786         }
1787
1788         return rc;
1789 }
1790
1791 /**@brief  Write descriptor block for a transaction
1792  * @param  journal current journal session
1793  * @param  trans transaction
1794  * @return standard error code*/
1795 static int jbd_journal_prepare(struct jbd_journal *journal,
1796                                struct jbd_trans *trans)
1797 {
1798         int rc = EOK, i = 0;
1799         struct ext4_block desc_block = EXT4_BLOCK_ZERO(),
1800                           data_block = EXT4_BLOCK_ZERO();
1801         int32_t tag_tbl_size = 0;
1802         uint32_t desc_iblock = 0;
1803         uint32_t data_iblock = 0;
1804         char *tag_start = NULL, *tag_ptr = NULL;
1805         struct jbd_buf *jbd_buf, *tmp;
1806         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1807         uint32_t checksum = EXT4_CRC32_INIT;
1808         struct jbd_bhdr *bhdr = NULL;
1809         void *data;
1810
1811         /* Try to remove any non-dirty buffers from the tail of
1812          * buf_queue. */
1813         TAILQ_FOREACH_REVERSE_SAFE(jbd_buf, &trans->buf_queue,
1814                         jbd_trans_buf, buf_node, tmp) {
1815                 struct jbd_revoke_rec tmp_rec = {
1816                         .lba = jbd_buf->block_rec->lba
1817                 };
1818                 /* We stop the iteration when we find a dirty buffer. */
1819                 if (ext4_bcache_test_flag(jbd_buf->block.buf,
1820                                         BC_DIRTY))
1821                         break;
1822         
1823                 TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue,
1824                         jbd_buf,
1825                         dirty_buf_node);
1826
1827                 jbd_buf->block.buf->end_write = NULL;
1828                 jbd_buf->block.buf->end_write_arg = NULL;
1829                 jbd_trans_finish_callback(journal,
1830                                 trans,
1831                                 jbd_buf->block_rec,
1832                                 true,
1833                                 RB_FIND(jbd_revoke_tree,
1834                                         &trans->revoke_root,
1835                                         &tmp_rec));
1836                 jbd_trans_remove_block_rec(journal,
1837                                         jbd_buf->block_rec, trans);
1838                 trans->data_cnt--;
1839
1840                 ext4_block_set(fs->bdev, &jbd_buf->block);
1841                 TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
1842                 free(jbd_buf);
1843         }
1844
1845         TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node, tmp) {
1846                 struct tag_info tag_info;
1847                 bool uuid_exist = false;
1848                 struct jbd_revoke_rec tmp_rec = {
1849                         .lba = jbd_buf->block_rec->lba
1850                 };
1851                 if (!ext4_bcache_test_flag(jbd_buf->block.buf,
1852                                            BC_DIRTY)) {
1853                         TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue,
1854                                         jbd_buf,
1855                                         dirty_buf_node);
1856
1857                         jbd_buf->block.buf->end_write = NULL;
1858                         jbd_buf->block.buf->end_write_arg = NULL;
1859
1860                         /* The buffer has not been modified, just release
1861                          * that jbd_buf. */
1862                         jbd_trans_finish_callback(journal,
1863                                         trans,
1864                                         jbd_buf->block_rec,
1865                                         true,
1866                                         RB_FIND(jbd_revoke_tree,
1867                                                 &trans->revoke_root,
1868                                                 &tmp_rec));
1869                         jbd_trans_remove_block_rec(journal,
1870                                         jbd_buf->block_rec, trans);
1871                         trans->data_cnt--;
1872
1873                         ext4_block_set(fs->bdev, &jbd_buf->block);
1874                         TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
1875                         free(jbd_buf);
1876                         continue;
1877                 }
1878                 checksum = jbd_block_csum(journal->jbd_fs,
1879                                           jbd_buf->block.data,
1880                                           checksum,
1881                                           trans->trans_id);
1882 again:
1883                 if (!desc_iblock) {
1884                         desc_iblock = jbd_journal_alloc_block(journal, trans);
1885                         rc = jbd_block_get_noread(journal->jbd_fs, &desc_block, desc_iblock);
1886                         if (rc != EOK)
1887                                 break;
1888
1889                         bhdr = (struct jbd_bhdr *)desc_block.data;
1890                         jbd_set32(bhdr, magic, JBD_MAGIC_NUMBER);
1891                         jbd_set32(bhdr, blocktype, JBD_DESCRIPTOR_BLOCK);
1892                         jbd_set32(bhdr, sequence, trans->trans_id);
1893
1894                         tag_start = (char *)(bhdr + 1);
1895                         tag_ptr = tag_start;
1896                         uuid_exist = true;
1897                         tag_tbl_size = journal->block_size -
1898                                 sizeof(struct jbd_bhdr);
1899
1900                         if (jbd_has_csum(&journal->jbd_fs->sb))
1901                                 tag_tbl_size -= sizeof(struct jbd_block_tail);
1902
1903                         if (!trans->start_iblock)
1904                                 trans->start_iblock = desc_iblock;
1905
1906                         ext4_bcache_set_dirty(desc_block.buf);
1907                         ext4_bcache_set_flag(desc_block.buf, BC_TMP);
1908                 }
1909                 tag_info.block = jbd_buf->block.lb_id;
1910                 tag_info.uuid_exist = uuid_exist;
1911                 if (i == trans->data_cnt - 1)
1912                         tag_info.last_tag = true;
1913                 else
1914                         tag_info.last_tag = false;
1915
1916                 tag_info.checksum = checksum;
1917
1918                 if (uuid_exist)
1919                         memcpy(tag_info.uuid, journal->jbd_fs->sb.uuid,
1920                                         UUID_SIZE);
1921
1922                 rc = jbd_write_block_tag(journal->jbd_fs,
1923                                 tag_ptr,
1924                                 tag_tbl_size,
1925                                 &tag_info);
1926                 if (rc != EOK) {
1927                         jbd_meta_csum_set(journal->jbd_fs, bhdr);
1928                         desc_iblock = 0;
1929                         rc = jbd_block_set(journal->jbd_fs, &desc_block);
1930                         if (rc != EOK)
1931                                 break;
1932
1933                         goto again;
1934                 }
1935
1936                 data_iblock = jbd_journal_alloc_block(journal, trans);
1937                 rc = jbd_block_get_noread(journal->jbd_fs, &data_block, data_iblock);
1938                 if (rc != EOK) {
1939                         desc_iblock = 0;
1940                         ext4_bcache_clear_dirty(desc_block.buf);
1941                         jbd_block_set(journal->jbd_fs, &desc_block);
1942                         break;
1943                 }
1944
1945                 data = data_block.data;
1946                 memcpy(data, jbd_buf->block.data,
1947                         journal->block_size);
1948                 ext4_bcache_set_dirty(data_block.buf);
1949                 ext4_bcache_set_flag(data_block.buf, BC_TMP);
1950                 rc = jbd_block_set(journal->jbd_fs, &data_block);
1951                 if (rc != EOK) {
1952                         desc_iblock = 0;
1953                         ext4_bcache_clear_dirty(desc_block.buf);
1954                         jbd_block_set(journal->jbd_fs, &desc_block);
1955                         break;
1956                 }
1957                 jbd_buf->jbd_lba = data_iblock;
1958
1959                 tag_ptr += tag_info.tag_bytes;
1960                 tag_tbl_size -= tag_info.tag_bytes;
1961
1962                 i++;
1963         }
1964         if (rc == EOK && desc_iblock) {
1965                 jbd_meta_csum_set(journal->jbd_fs,
1966                                 (struct jbd_bhdr *)bhdr);
1967                 trans->data_csum = checksum;
1968                 rc = jbd_block_set(journal->jbd_fs, &desc_block);
1969         }
1970
1971         return rc;
1972 }
1973
1974 /**@brief  Write revoke block for a transaction
1975  * @param  journal current journal session
1976  * @param  trans transaction
1977  * @return standard error code*/
1978 static int
1979 jbd_journal_prepare_revoke(struct jbd_journal *journal,
1980                            struct jbd_trans *trans)
1981 {
1982         int rc = EOK, i = 0;
1983         struct ext4_block desc_block = EXT4_BLOCK_ZERO();
1984         int32_t tag_tbl_size = 0;
1985         uint32_t desc_iblock = 0;
1986         char *blocks_entry = NULL;
1987         struct jbd_revoke_rec *rec, *tmp;
1988         struct jbd_revoke_header *header = NULL;
1989         int32_t record_len = 4;
1990         struct jbd_bhdr *bhdr = NULL;
1991
1992         if (JBD_HAS_INCOMPAT_FEATURE(&journal->jbd_fs->sb,
1993                                      JBD_FEATURE_INCOMPAT_64BIT))
1994                 record_len = 8;
1995
1996         RB_FOREACH_SAFE(rec, jbd_revoke_tree, &trans->revoke_root,
1997                           tmp) {
1998 again:
1999                 if (!desc_iblock) {
2000                         desc_iblock = jbd_journal_alloc_block(journal, trans);
2001                         rc = jbd_block_get_noread(journal->jbd_fs, &desc_block,
2002                                                   desc_iblock);
2003                         if (rc != EOK)
2004                                 break;
2005
2006                         bhdr = (struct jbd_bhdr *)desc_block.data;
2007                         jbd_set32(bhdr, magic, JBD_MAGIC_NUMBER);
2008                         jbd_set32(bhdr, blocktype, JBD_REVOKE_BLOCK);
2009                         jbd_set32(bhdr, sequence, trans->trans_id);
2010                         
2011                         header = (struct jbd_revoke_header *)bhdr;
2012                         blocks_entry = (char *)(header + 1);
2013                         tag_tbl_size = journal->block_size -
2014                                 sizeof(struct jbd_revoke_header);
2015
2016                         if (jbd_has_csum(&journal->jbd_fs->sb))
2017                                 tag_tbl_size -= sizeof(struct jbd_block_tail);
2018
2019                         if (!trans->start_iblock)
2020                                 trans->start_iblock = desc_iblock;
2021
2022                         ext4_bcache_set_dirty(desc_block.buf);
2023                         ext4_bcache_set_flag(desc_block.buf, BC_TMP);
2024                 }
2025
2026                 if (tag_tbl_size < record_len) {
2027                         jbd_set32(header, count,
2028                                   journal->block_size - tag_tbl_size);
2029                         jbd_meta_csum_set(journal->jbd_fs, bhdr);
2030                         bhdr = NULL;
2031                         desc_iblock = 0;
2032                         header = NULL;
2033                         rc = jbd_block_set(journal->jbd_fs, &desc_block);
2034                         if (rc != EOK)
2035                                 break;
2036
2037                         goto again;
2038                 }
2039                 if (record_len == 8) {
2040                         uint64_t *blocks =
2041                                 (uint64_t *)blocks_entry;
2042                         *blocks = to_be64(rec->lba);
2043                 } else {
2044                         uint32_t *blocks =
2045                                 (uint32_t *)blocks_entry;
2046                         *blocks = to_be32((uint32_t)rec->lba);
2047                 }
2048                 blocks_entry += record_len;
2049                 tag_tbl_size -= record_len;
2050
2051                 i++;
2052         }
2053         if (rc == EOK && desc_iblock) {
2054                 if (header != NULL)
2055                         jbd_set32(header, count,
2056                                   journal->block_size - tag_tbl_size);
2057
2058                 jbd_meta_csum_set(journal->jbd_fs, bhdr);
2059                 rc = jbd_block_set(journal->jbd_fs, &desc_block);
2060         }
2061
2062         return rc;
2063 }
2064
2065 /**@brief  Put references of block descriptors in a transaction.
2066  * @param  journal current journal session
2067  * @param  trans transaction*/
2068 void jbd_journal_cp_trans(struct jbd_journal *journal, struct jbd_trans *trans)
2069 {
2070         struct jbd_buf *jbd_buf, *tmp;
2071         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
2072         TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node,
2073                         tmp) {
2074                 struct ext4_block block = jbd_buf->block;
2075                 ext4_block_set(fs->bdev, &block);
2076         }
2077 }
2078
2079 /**@brief  Update the start block of the journal when
2080  *         all the contents in a transaction reach the disk.*/
2081 static void jbd_trans_end_write(struct ext4_bcache *bc __unused,
2082                           struct ext4_buf *buf,
2083                           int res,
2084                           void *arg)
2085 {
2086         struct jbd_buf *jbd_buf = arg;
2087         struct jbd_trans *trans = jbd_buf->trans;
2088         struct jbd_block_rec *block_rec = jbd_buf->block_rec;
2089         struct jbd_journal *journal = trans->journal;
2090         bool first_in_queue =
2091                 trans == TAILQ_FIRST(&journal->cp_queue);
2092         if (res != EOK)
2093                 trans->error = res;
2094
2095         TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
2096         TAILQ_REMOVE(&block_rec->dirty_buf_queue,
2097                         jbd_buf,
2098                         dirty_buf_node);
2099
2100         jbd_trans_finish_callback(journal,
2101                         trans,
2102                         jbd_buf->block_rec,
2103                         false,
2104                         false);
2105         if (block_rec->trans == trans && buf) {
2106                 /* Clear the end_write and end_write_arg fields. */
2107                 buf->end_write = NULL;
2108                 buf->end_write_arg = NULL;
2109         }
2110
2111         free(jbd_buf);
2112
2113         trans->written_cnt++;
2114         if (trans->written_cnt == trans->data_cnt) {
2115                 /* If it is the first transaction on checkpoint queue,
2116                  * we will shift the start of the journal to the next
2117                  * transaction, and remove subsequent written
2118                  * transactions from checkpoint queue until we find
2119                  * an unwritten one. */
2120                 if (first_in_queue) {
2121                         journal->start = trans->start_iblock +
2122                                 trans->alloc_blocks;
2123                         wrap(&journal->jbd_fs->sb, journal->start);
2124                         journal->trans_id = trans->trans_id + 1;
2125                         TAILQ_REMOVE(&journal->cp_queue, trans, trans_node);
2126                         jbd_journal_free_trans(journal, trans, false);
2127
2128                         jbd_journal_purge_cp_trans(journal, false, true);
2129                         jbd_journal_write_sb(journal);
2130                         jbd_write_sb(journal->jbd_fs);
2131                 }
2132         }
2133 }
2134
2135 /**@brief  Commit a transaction to the journal immediately.
2136  * @param  journal current journal session
2137  * @param  trans transaction
2138  * @return standard error code*/
2139 static int __jbd_journal_commit_trans(struct jbd_journal *journal,
2140                                       struct jbd_trans *trans)
2141 {
2142         int rc = EOK;
2143         uint32_t last = journal->last;
2144         struct jbd_revoke_rec *rec, *tmp;
2145
2146         trans->trans_id = journal->alloc_trans_id;
2147         rc = jbd_journal_prepare(journal, trans);
2148         if (rc != EOK)
2149                 goto Finish;
2150
2151         rc = jbd_journal_prepare_revoke(journal, trans);
2152         if (rc != EOK)
2153                 goto Finish;
2154
2155         if (TAILQ_EMPTY(&trans->buf_queue) &&
2156             RB_EMPTY(&trans->revoke_root)) {
2157                 /* Since there are no entries in both buffer list
2158                  * and revoke entry list, we do not consider trans as
2159                  * complete transaction and just return EOK.*/
2160                 jbd_journal_free_trans(journal, trans, false);
2161                 goto Finish;
2162         }
2163
2164         rc = jbd_trans_write_commit_block(trans);
2165         if (rc != EOK)
2166                 goto Finish;
2167
2168         journal->alloc_trans_id++;
2169
2170         /* Complete the checkpoint of buffers which are revoked. */
2171         RB_FOREACH_SAFE(rec, jbd_revoke_tree, &trans->revoke_root,
2172                         tmp) {
2173                 struct jbd_block_rec *block_rec =
2174                         jbd_trans_block_rec_lookup(journal, rec->lba);
2175                 struct jbd_buf *jbd_buf = NULL;
2176                 if (block_rec)
2177                         jbd_buf = TAILQ_LAST(&block_rec->dirty_buf_queue,
2178                                         jbd_buf_dirty);
2179                 if (jbd_buf) {
2180                         struct ext4_buf *buf;
2181                         struct ext4_block block = EXT4_BLOCK_ZERO();
2182                         /*
2183                          * We do this to reset the ext4_buf::end_write and
2184                          * ext4_buf::end_write_arg fields so that the checkpoint
2185                          * callback won't be triggered again.
2186                          */
2187                         buf = ext4_bcache_find_get(journal->jbd_fs->bdev->bc,
2188                                         &block,
2189                                         jbd_buf->block_rec->lba);
2190                         jbd_trans_end_write(journal->jbd_fs->bdev->bc,
2191                                         buf,
2192                                         EOK,
2193                                         jbd_buf);
2194                         if (buf)
2195                                 ext4_block_set(journal->jbd_fs->bdev, &block);
2196                 }
2197         }
2198
2199         if (TAILQ_EMPTY(&journal->cp_queue)) {
2200                 /*
2201                  * This transaction is going to be the first object in the
2202                  * checkpoint queue.
2203                  * When the first transaction in checkpoint queue is completely
2204                  * written to disk, we shift the tail of the log to right.
2205                  */
2206                 if (trans->data_cnt) {
2207                         journal->start = trans->start_iblock;
2208                         wrap(&journal->jbd_fs->sb, journal->start);
2209                         journal->trans_id = trans->trans_id;
2210                         jbd_journal_write_sb(journal);
2211                         jbd_write_sb(journal->jbd_fs);
2212                         TAILQ_INSERT_TAIL(&journal->cp_queue, trans,
2213                                         trans_node);
2214                         jbd_journal_cp_trans(journal, trans);
2215                 } else {
2216                         journal->start = trans->start_iblock +
2217                                 trans->alloc_blocks;
2218                         wrap(&journal->jbd_fs->sb, journal->start);
2219                         journal->trans_id = trans->trans_id + 1;
2220                         jbd_journal_write_sb(journal);
2221                         jbd_journal_free_trans(journal, trans, false);
2222                 }
2223         } else {
2224                 /* No need to do anything to the JBD superblock. */
2225                 TAILQ_INSERT_TAIL(&journal->cp_queue, trans,
2226                                 trans_node);
2227                 if (trans->data_cnt)
2228                         jbd_journal_cp_trans(journal, trans);
2229         }
2230 Finish:
2231         if (rc != EOK && rc != ENOSPC) {
2232                 journal->last = last;
2233                 jbd_journal_free_trans(journal, trans, true);
2234         }
2235         return rc;
2236 }
2237
2238 /**@brief  Allocate a new transaction
2239  * @param  journal current journal session
2240  * @return transaction allocated*/
2241 struct jbd_trans *
2242 jbd_journal_new_trans(struct jbd_journal *journal)
2243 {
2244         struct jbd_trans *trans = NULL;
2245         trans = calloc(1, sizeof(struct jbd_trans));
2246         if (!trans)
2247                 return NULL;
2248
2249         /* We will assign a trans_id to this transaction,
2250          * once it has been committed.*/
2251         trans->journal = journal;
2252         trans->data_csum = EXT4_CRC32_INIT;
2253         trans->error = EOK;
2254         TAILQ_INIT(&trans->buf_queue);
2255         return trans;
2256 }
2257
/**@brief  Commit a transaction to the journal immediately.
 *         Thin public wrapper around __jbd_journal_commit_trans().
 * @param  journal current journal session
 * @param  trans transaction
 * @return standard error code*/
int jbd_journal_commit_trans(struct jbd_journal *journal,
                             struct jbd_trans *trans)
{
        return __jbd_journal_commit_trans(journal, trans);
}
2269
2270 /**
2271  * @}
2272  */