fbdf23308aa5052134be703949b1aa57ead8b781
[lwext4.git] / src / ext4_journal.c
1 /*
2  * Copyright (c) 2015 Grzegorz Kostka (kostka.grzegorz@gmail.com)
3  * Copyright (c) 2015 Kaho Ng (ngkaho1234@gmail.com)
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * - Redistributions of source code must retain the above copyright
11  *   notice, this list of conditions and the following disclaimer.
12  * - Redistributions in binary form must reproduce the above copyright
13  *   notice, this list of conditions and the following disclaimer in the
14  *   documentation and/or other materials provided with the distribution.
15  * - The name of the author may not be used to endorse or promote products
16  *   derived from this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29
30 /** @addtogroup lwext4
31  * @{
32  */
33 /**
34  * @file  ext4_journal.c
35  * @brief Journal handle functions
36  */
37
38 #include <ext4_config.h>
39 #include <ext4_types.h>
40 #include <ext4_misc.h>
41 #include <ext4_errno.h>
42 #include <ext4_debug.h>
43
44 #include <ext4_fs.h>
45 #include <ext4_super.h>
46 #include <ext4_journal.h>
47 #include <ext4_blockdev.h>
48 #include <ext4_crc32.h>
49 #include <ext4_journal.h>
50
51 #include <string.h>
52 #include <stdlib.h>
53
/**@brief  Revoke entry used during journal replay.
 *
 * Entries are linked into the jbd_revoke tree rooted at
 * recover_info::revoke_root and are ordered by @block
 * (see jbd_revoke_entry_cmp()).*/
struct revoke_entry {
        /**@brief  Block number not to be replayed.*/
        ext4_fsblk_t block;

        /**@brief  Revocation boundary: records of @block that
         *         belong to a transaction whose id is smaller
         *         than trans_id should not be replayed.*/
        uint32_t trans_id;

        /**@brief  Revoke tree node.*/
        RB_ENTRY(revoke_entry) revoke_node;
};
68
/**@brief  Valid journal replay information.*/
struct recover_info {
        /**@brief  Starting transaction id.*/
        uint32_t start_trans_id;

        /**@brief  Ending transaction id.*/
        uint32_t last_trans_id;

        /**@brief  Used as an internal argument.*/
        uint32_t this_trans_id;

        /**@brief  Number of transactions gone through.*/
        uint32_t trans_cnt;

        /**@brief  RB-Tree storing revoke entries
         *         (struct revoke_entry, tree name jbd_revoke).*/
        RB_HEAD(jbd_revoke, revoke_entry) revoke_root;
};
86
/**@brief  Journal replay internal arguments.
 *
 * Passed through the opaque callback argument to
 * jbd_replay_block_tags().*/
struct replay_arg {
        /**@brief  Journal replay information.*/
        struct recover_info *info;

        /**@brief  Points at the index of the journal block we are
         *         currently on; the replay callback advances it.*/
        uint32_t *this_block;

        /**@brief  Current trans_id we are on.*/
        uint32_t this_trans_id;
};
98
/*
 * Keep a journal block index inside the log: once @var has reached
 * (or passed) maxlen, step it back by (maxlen - first).
 * @var must be a modifiable lvalue.
 */
#define wrap(sb, var)                                                   \
do {                                                                    \
        if ((var) >= jbd_get32((sb), maxlen))                           \
                (var) -= jbd_get32((sb), maxlen) -                      \
                         jbd_get32((sb), first);                        \
} while (0)
105
/**@brief  Signed distance between two transaction ids.
 * @param  x transaction id to measure
 * @param  y transaction id used as reference
 * @return x - y, computed in 32-bit unsigned arithmetic and then
 *         converted to a signed value */
static inline int32_t
trans_id_diff(uint32_t x, uint32_t y)
{
        return (int32_t)(x - y);
}
112
113 static int
114 jbd_revoke_entry_cmp(struct revoke_entry *a, struct revoke_entry *b)
115 {
116         if (a->block > b->block)
117                 return 1;
118         else if (a->block < b->block)
119                 return -1;
120         return 0;
121 }
122
123 static int
124 jbd_block_rec_cmp(struct jbd_block_rec *a, struct jbd_block_rec *b)
125 {
126         if (a->lba > b->lba)
127                 return 1;
128         else if (a->lba < b->lba)
129                 return -1;
130         return 0;
131 }
132
133 static int
134 jbd_revoke_rec_cmp(struct jbd_revoke_rec *a, struct jbd_revoke_rec *b)
135 {
136         if (a->lba > b->lba)
137                 return 1;
138         else if (a->lba < b->lba)
139                 return -1;
140         return 0;
141 }
142
/* Instantiate the tree routines (as static inline functions) for the
 * three trees used in this file, each ordered by the comparator named
 * in its invocation:
 *  - jbd_revoke:      struct revoke_entry, linked through revoke_node
 *  - jbd_block:       struct jbd_block_rec, linked through block_rec_node
 *  - jbd_revoke_tree: struct jbd_revoke_rec, linked through revoke_node */
RB_GENERATE_INTERNAL(jbd_revoke, revoke_entry, revoke_node,
                     jbd_revoke_entry_cmp, static inline)
RB_GENERATE_INTERNAL(jbd_block, jbd_block_rec, block_rec_node,
                     jbd_block_rec_cmp, static inline)
RB_GENERATE_INTERNAL(jbd_revoke_tree, jbd_revoke_rec, revoke_node,
                     jbd_revoke_rec_cmp, static inline)
149
/* Allocation helpers for struct revoke_entry. A new entry is obtained
 * with ext4_calloc() (presumably zero-filled like calloc() - confirm
 * against the ext4_calloc implementation) and released with ext4_free(). */
#define jbd_alloc_revoke_entry() ext4_calloc(1, sizeof(struct revoke_entry))
#define jbd_free_revoke_entry(addr) ext4_free(addr)
152
153 static int jbd_has_csum(struct jbd_sb *jbd_sb)
154 {
155         if (JBD_HAS_INCOMPAT_FEATURE(jbd_sb, JBD_FEATURE_INCOMPAT_CSUM_V2))
156                 return 2;
157
158         if (JBD_HAS_INCOMPAT_FEATURE(jbd_sb, JBD_FEATURE_INCOMPAT_CSUM_V3))
159                 return 3;
160
161         return 0;
162 }
163
#if CONFIG_META_CSUM_ENABLE
/**@brief  Compute the checksum of the jbd superblock.
 * @param  jbd_sb jbd superblock; its checksum field is zeroed while
 *         hashing and restored before returning
 * @return crc32c of the superblock, or 0 if the journal has no
 *         checksum feature */
static uint32_t jbd_sb_csum(struct jbd_sb *jbd_sb)
{
        uint32_t saved_csum;
        uint32_t crc;

        if (!jbd_has_csum(jbd_sb))
                return 0;

        /* The stored checksum must read as zero while it is hashed. */
        saved_csum = jbd_sb->checksum;
        jbd_set32(jbd_sb, checksum, 0);

        /* Calculate crc32c checksum against the whole superblock */
        crc = ext4_crc32c(EXT4_CRC32_INIT, jbd_sb, JBD_SUPERBLOCK_SIZE);

        jbd_sb->checksum = saved_csum;
        return crc;
}
#else
#define jbd_sb_csum(...) 0
#endif
182
183 static void jbd_sb_csum_set(struct jbd_sb *jbd_sb)
184 {
185         if (!jbd_has_csum(jbd_sb))
186                 return;
187
188         jbd_set32(jbd_sb, checksum, jbd_sb_csum(jbd_sb));
189 }
190
#if CONFIG_META_CSUM_ENABLE
/**@brief  Check the checksum stored in the jbd superblock.
 * @param  jbd_sb jbd superblock
 * @return true if the journal has no checksum feature or the stored
 *         checksum matches the computed one */
static bool
jbd_verify_sb_csum(struct jbd_sb *jbd_sb)
{
        uint32_t computed;
        uint32_t stored;

        if (!jbd_has_csum(jbd_sb))
                return true;

        computed = jbd_sb_csum(jbd_sb);
        stored = jbd_get32(jbd_sb, checksum);
        return computed == stored;
}
#else
#define jbd_verify_sb_csum(...) true
#endif
203
#if CONFIG_META_CSUM_ENABLE
/**@brief  Compute the checksum of a journal metadata block.
 * @param  jbd_fs jbd filesystem
 * @param  bhdr start of the block; the block tail occupying its last
 *         bytes has its checksum zeroed while hashing and restored
 *         before returning
 * @return crc32c over the jbd superblock uuid followed by the whole
 *         block, or 0 if the journal has no checksum feature */
static uint32_t jbd_meta_csum(struct jbd_fs *jbd_fs,
                              struct jbd_bhdr *bhdr)
{
        uint32_t block_size;
        uint32_t saved_csum;
        uint32_t crc;
        char *block_end;
        struct jbd_block_tail *tail;

        if (!jbd_has_csum(&jbd_fs->sb))
                return 0;

        block_size = jbd_get32(&jbd_fs->sb, blocksize);
        block_end = (char *)bhdr + block_size;
        tail = (struct jbd_block_tail *)
                        (block_end - sizeof(struct jbd_block_tail));

        saved_csum = tail->checksum;
        tail->checksum = 0;

        /* First calculate crc32c checksum against fs uuid */
        crc = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid,
                          sizeof(jbd_fs->sb.uuid));
        /* Calculate crc32c checksum against the whole block */
        crc = ext4_crc32c(crc, bhdr, block_size);

        tail->checksum = saved_csum;
        return crc;
}
#else
#define jbd_meta_csum(...) 0
#endif
231
232 static void jbd_meta_csum_set(struct jbd_fs *jbd_fs,
233                               struct jbd_bhdr *bhdr)
234 {
235         uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize);
236         struct jbd_block_tail *tail = (struct jbd_block_tail *)
237                                 ((char *)bhdr + block_size -
238                                 sizeof(struct jbd_block_tail));
239         if (!jbd_has_csum(&jbd_fs->sb))
240                 return;
241
242         tail->checksum = to_be32(jbd_meta_csum(jbd_fs, bhdr));
243 }
244
#if CONFIG_META_CSUM_ENABLE
/**@brief  Check the checksum stored in the tail of a journal
 *         metadata block.
 * @param  jbd_fs jbd filesystem
 * @param  bhdr start of the block
 * @return true if the journal has no checksum feature or the stored
 *         checksum matches the computed one */
static bool
jbd_verify_meta_csum(struct jbd_fs *jbd_fs,
                     struct jbd_bhdr *bhdr)
{
        uint32_t block_size;
        char *block_end;
        struct jbd_block_tail *tail;

        if (!jbd_has_csum(&jbd_fs->sb))
                return true;

        block_size = jbd_get32(&jbd_fs->sb, blocksize);
        block_end = (char *)bhdr + block_size;
        tail = (struct jbd_block_tail *)
                        (block_end - sizeof(struct jbd_block_tail));

        return jbd_meta_csum(jbd_fs, bhdr) == to_be32(tail->checksum);
}
#else
#define jbd_verify_meta_csum(...) true
#endif
262
#if CONFIG_META_CSUM_ENABLE
/**@brief  Compute the checksum of a commit block.
 * @param  jbd_fs jbd filesystem
 * @param  header start of the commit block; chksum_type, chksum_size
 *         and chksum[0] are zeroed while hashing and restored before
 *         returning
 * @return crc32c over the jbd superblock uuid followed by the whole
 *         block, or 0 if the journal has no checksum feature */
static uint32_t jbd_commit_csum(struct jbd_fs *jbd_fs,
                              struct jbd_commit_header *header)
{
        uint32_t saved_type;
        uint32_t saved_size;
        uint32_t saved_csum;
        uint32_t block_size;
        uint32_t crc;

        if (!jbd_has_csum(&jbd_fs->sb))
                return 0;

        block_size = jbd_get32(&jbd_fs->sb, blocksize);

        /* Blank the checksum related fields for the computation. */
        saved_type = header->chksum_type;
        saved_size = header->chksum_size;
        saved_csum = header->chksum[0];
        header->chksum_type = 0;
        header->chksum_size = 0;
        header->chksum[0] = 0;

        /* First calculate crc32c checksum against fs uuid */
        crc = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid,
                          sizeof(jbd_fs->sb.uuid));
        /* Calculate crc32c checksum against the whole block */
        crc = ext4_crc32c(crc, header, block_size);

        header->chksum_type = saved_type;
        header->chksum_size = saved_size;
        header->chksum[0] = saved_csum;

        return crc;
}
#else
#define jbd_commit_csum(...) 0
#endif
294
295 static void jbd_commit_csum_set(struct jbd_fs *jbd_fs,
296                               struct jbd_commit_header *header)
297 {
298         if (!jbd_has_csum(&jbd_fs->sb))
299                 return;
300
301         header->chksum_type = 0;
302         header->chksum_size = 0;
303         header->chksum[0] = jbd_commit_csum(jbd_fs, header);
304 }
305
#if CONFIG_META_CSUM_ENABLE
/**@brief  Check the checksum stored in a commit block.
 * @param  jbd_fs jbd filesystem
 * @param  header start of the commit block
 * @return true if the journal has no checksum feature or chksum[0]
 *         equals the computed checksum converted with to_be32() */
static bool jbd_verify_commit_csum(struct jbd_fs *jbd_fs,
                                   struct jbd_commit_header *header)
{
        uint32_t expected;

        if (!jbd_has_csum(&jbd_fs->sb))
                return true;

        expected = to_be32(jbd_commit_csum(jbd_fs, header));
        return header->chksum[0] == expected;
}
#else
#define jbd_verify_commit_csum(...) true
#endif
319
#if CONFIG_META_CSUM_ENABLE
/**@brief  Compute the checksum of a logged block.
 *
 * With CSUM_V2/V3 the result is a crc32c chained over the uuid of the
 * jbd superblock, the transaction sequence number in big-endian byte
 * order and the block content. Otherwise, if the legacy checksum
 * feature test below matches, crc32 of the block is chained onto @csum.
 *
 * NOTE: We only make use of @csum parameter when
 *       JBD_FEATURE_COMPAT_CHECKSUM is enabled.
 * @param  jbd_fs jbd filesystem
 * @param  buf block content; blocksize bytes are hashed
 * @param  csum running crc32 value (legacy path only)
 * @param  sequence transaction id, in host byte order
 * @return the checksum, or 0 if no checksum feature applies */
static uint32_t jbd_block_csum(struct jbd_fs *jbd_fs, const void *buf,
                               uint32_t csum,
                               uint32_t sequence)
{
        uint32_t checksum = 0;

        if (jbd_has_csum(&jbd_fs->sb)) {
                /* The sequence number is hashed in its on-disk
                 * (big-endian) form so the result does not depend on
                 * the byte order of the host. */
                uint32_t order_seq = to_be32(sequence);
                uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize);
                /* First calculate crc32c checksum against fs uuid */
                checksum = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid,
                                       sizeof(jbd_fs->sb.uuid));
                /* Then calculate crc32c checksum against sequence no. */
                checksum = ext4_crc32c(checksum, &order_seq,
                                sizeof(uint32_t));
                /* Calculate crc32c checksum against the whole block */
                checksum = ext4_crc32c(checksum, buf,
                                block_size);
        } else if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
                                     JBD_FEATURE_COMPAT_CHECKSUM)) {
                /* NOTE(review): a COMPAT flag is tested against the
                 * INCOMPAT feature word here; this looks like it should
                 * use the COMPAT feature test - confirm and fix where
                 * the feature macros are defined/visible. */
                uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize);
                /* Chain crc32 of the whole block onto @csum */
                checksum = ext4_crc32(csum, buf,
                                block_size);
        }
        return checksum;
}
#else
#define jbd_block_csum(...) 0
#endif
354
355 static void jbd_block_tag_csum_set(struct jbd_fs *jbd_fs, void *__tag,
356                                    uint32_t checksum)
357 {
358         int ver = jbd_has_csum(&jbd_fs->sb);
359         if (!ver)
360                 return;
361
362         if (ver == 2) {
363                 struct jbd_block_tag *tag = __tag;
364                 tag->checksum = (uint16_t)to_be32(checksum);
365         } else {
366                 struct jbd_block_tag3 *tag = __tag;
367                 tag->checksum = to_be32(checksum);
368         }
369 }
370
371 /**@brief  Write jbd superblock to disk.
372  * @param  jbd_fs jbd filesystem
373  * @param  s jbd superblock
374  * @return standard error code*/
375 static int jbd_sb_write(struct jbd_fs *jbd_fs, struct jbd_sb *s)
376 {
377         int rc;
378         struct ext4_fs *fs = jbd_fs->inode_ref.fs;
379         uint64_t offset;
380         ext4_fsblk_t fblock;
381         rc = jbd_inode_bmap(jbd_fs, 0, &fblock);
382         if (rc != EOK)
383                 return rc;
384
385         jbd_sb_csum_set(s);
386         offset = fblock * ext4_sb_get_block_size(&fs->sb);
387         return ext4_block_writebytes(fs->bdev, offset, s,
388                                      EXT4_SUPERBLOCK_SIZE);
389 }
390
391 /**@brief  Read jbd superblock from disk.
392  * @param  jbd_fs jbd filesystem
393  * @param  s jbd superblock
394  * @return standard error code*/
395 static int jbd_sb_read(struct jbd_fs *jbd_fs, struct jbd_sb *s)
396 {
397         int rc;
398         struct ext4_fs *fs = jbd_fs->inode_ref.fs;
399         uint64_t offset;
400         ext4_fsblk_t fblock;
401         rc = jbd_inode_bmap(jbd_fs, 0, &fblock);
402         if (rc != EOK)
403                 return rc;
404
405         offset = fblock * ext4_sb_get_block_size(&fs->sb);
406         return ext4_block_readbytes(fs->bdev, offset, s,
407                                     EXT4_SUPERBLOCK_SIZE);
408 }
409
410 /**@brief  Verify jbd superblock.
411  * @param  sb jbd superblock
412  * @return true if jbd superblock is valid */
413 static bool jbd_verify_sb(struct jbd_sb *sb)
414 {
415         struct jbd_bhdr *header = &sb->header;
416         if (jbd_get32(header, magic) != JBD_MAGIC_NUMBER)
417                 return false;
418
419         if (jbd_get32(header, blocktype) != JBD_SUPERBLOCK &&
420             jbd_get32(header, blocktype) != JBD_SUPERBLOCK_V2)
421                 return false;
422
423         return jbd_verify_sb_csum(sb);
424 }
425
426 /**@brief  Write back dirty jbd superblock to disk.
427  * @param  jbd_fs jbd filesystem
428  * @return standard error code*/
429 static int jbd_write_sb(struct jbd_fs *jbd_fs)
430 {
431         int rc = EOK;
432         if (jbd_fs->dirty) {
433                 rc = jbd_sb_write(jbd_fs, &jbd_fs->sb);
434                 if (rc != EOK)
435                         return rc;
436
437                 jbd_fs->dirty = false;
438         }
439         return rc;
440 }
441
442 /**@brief  Get reference to jbd filesystem.
443  * @param  fs Filesystem to load journal of
444  * @param  jbd_fs jbd filesystem
445  * @return standard error code*/
446 int jbd_get_fs(struct ext4_fs *fs,
447                struct jbd_fs *jbd_fs)
448 {
449         int rc;
450         uint32_t journal_ino;
451
452         memset(jbd_fs, 0, sizeof(struct jbd_fs));
453         /* See if there is journal inode on this filesystem.*/
454         /* FIXME: detection on existance ofbkejournal bdev is
455          *        missing.*/
456         journal_ino = ext4_get32(&fs->sb, journal_inode_number);
457
458         rc = ext4_fs_get_inode_ref(fs,
459                                    journal_ino,
460                                    &jbd_fs->inode_ref);
461         if (rc != EOK)
462                 return rc;
463
464         rc = jbd_sb_read(jbd_fs, &jbd_fs->sb);
465         if (rc != EOK)
466                 goto Error;
467
468         if (!jbd_verify_sb(&jbd_fs->sb)) {
469                 rc = EIO;
470                 goto Error;
471         }
472
473         if (rc == EOK)
474                 jbd_fs->bdev = fs->bdev;
475
476         return rc;
477 Error:
478         ext4_fs_put_inode_ref(&jbd_fs->inode_ref);
479         memset(jbd_fs, 0, sizeof(struct jbd_fs));
480
481         return rc;
482 }
483
484 /**@brief  Put reference of jbd filesystem.
485  * @param  jbd_fs jbd filesystem
486  * @return standard error code*/
487 int jbd_put_fs(struct jbd_fs *jbd_fs)
488 {
489         int rc = EOK;
490         rc = jbd_write_sb(jbd_fs);
491
492         ext4_fs_put_inode_ref(&jbd_fs->inode_ref);
493         return rc;
494 }
495
496 /**@brief  Data block lookup helper.
497  * @param  jbd_fs jbd filesystem
498  * @param  iblock block index
499  * @param  fblock logical block address
500  * @return standard error code*/
501 int jbd_inode_bmap(struct jbd_fs *jbd_fs,
502                    ext4_lblk_t iblock,
503                    ext4_fsblk_t *fblock)
504 {
505         int rc = ext4_fs_get_inode_dblk_idx(
506                         &jbd_fs->inode_ref,
507                         iblock,
508                         fblock,
509                         false);
510         return rc;
511 }
512
513 /**@brief   jbd block get function (through cache).
514  * @param   jbd_fs jbd filesystem
515  * @param   block block descriptor
516  * @param   fblock jbd logical block address
517  * @return  standard error code*/
518 static int jbd_block_get(struct jbd_fs *jbd_fs,
519                   struct ext4_block *block,
520                   ext4_fsblk_t fblock)
521 {
522         /* TODO: journal device. */
523         int rc;
524         struct ext4_blockdev *bdev = jbd_fs->bdev;
525         ext4_lblk_t iblock = (ext4_lblk_t)fblock;
526
527         /* Lookup the logical block address of
528          * fblock.*/
529         rc = jbd_inode_bmap(jbd_fs, iblock,
530                             &fblock);
531         if (rc != EOK)
532                 return rc;
533
534         rc = ext4_block_get(bdev, block, fblock);
535
536         /* If succeeded, mark buffer as BC_FLUSH to indicate
537          * that data should be written to disk immediately.*/
538         if (rc == EOK) {
539                 ext4_bcache_set_flag(block->buf, BC_FLUSH);
540                 /* As we don't want to occupy too much space
541                  * in block cache, we set this buffer BC_TMP.*/
542                 ext4_bcache_set_flag(block->buf, BC_TMP);
543         }
544
545         return rc;
546 }
547
548 /**@brief   jbd block get function (through cache, don't read).
549  * @param   jbd_fs jbd filesystem
550  * @param   block block descriptor
551  * @param   fblock jbd logical block address
552  * @return  standard error code*/
553 static int jbd_block_get_noread(struct jbd_fs *jbd_fs,
554                          struct ext4_block *block,
555                          ext4_fsblk_t fblock)
556 {
557         /* TODO: journal device. */
558         int rc;
559         struct ext4_blockdev *bdev = jbd_fs->bdev;
560         ext4_lblk_t iblock = (ext4_lblk_t)fblock;
561         rc = jbd_inode_bmap(jbd_fs, iblock,
562                             &fblock);
563         if (rc != EOK)
564                 return rc;
565
566         rc = ext4_block_get_noread(bdev, block, fblock);
567         if (rc == EOK)
568                 ext4_bcache_set_flag(block->buf, BC_FLUSH);
569
570         return rc;
571 }
572
573 /**@brief   jbd block set procedure (through cache).
574  * @param   jbd_fs jbd filesystem
575  * @param   block block descriptor
576  * @return  standard error code*/
577 static int jbd_block_set(struct jbd_fs *jbd_fs,
578                   struct ext4_block *block)
579 {
580         struct ext4_blockdev *bdev = jbd_fs->bdev;
581         return ext4_block_set(bdev, block);
582 }
583
584 /**@brief  helper functions to calculate
585  *         block tag size, not including UUID part.
586  * @param  jbd_fs jbd filesystem
587  * @return tag size in bytes*/
588 static int jbd_tag_bytes(struct jbd_fs *jbd_fs)
589 {
590         int size;
591
592         /* It is very easy to deal with the case which
593          * JBD_FEATURE_INCOMPAT_CSUM_V3 is enabled.*/
594         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
595                                      JBD_FEATURE_INCOMPAT_CSUM_V3))
596                 return sizeof(struct jbd_block_tag3);
597
598         size = sizeof(struct jbd_block_tag);
599
600         /* If JBD_FEATURE_INCOMPAT_CSUM_V2 is enabled,
601          * add 2 bytes to size.*/
602         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
603                                      JBD_FEATURE_INCOMPAT_CSUM_V2))
604                 size += sizeof(uint16_t);
605
606         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
607                                      JBD_FEATURE_INCOMPAT_64BIT))
608                 return size;
609
610         /* If block number is 4 bytes in size,
611          * minus 4 bytes from size */
612         return size - sizeof(uint32_t);
613 }
614
/**@brief  Tag information: the decoded form of one block tag, as
 *         filled by jbd_extract_block_tag() and consumed by
 *         jbd_write_block_tag(). */
struct tag_info {
        /**@brief  Tag size in bytes, including UUID part.*/
        int tag_bytes;

        /**@brief  Block number stored in this tag.*/
        ext4_fsblk_t block;

        /**@brief  Are the first 4 bytes of the block equal to
         *         JBD_MAGIC_NUMBER? (mirrors JBD_FLAG_ESCAPE) */
        bool is_escape;

        /**@brief  Whether UUID part exists or not.*/
        bool uuid_exist;

        /**@brief  UUID content if UUID part exists.*/
        uint8_t uuid[UUID_SIZE];

        /**@brief  Is this the last tag? */
        bool last_tag;

        /**@brief  crc32c checksum. Written into the tag by
         *         jbd_write_block_tag(); jbd_extract_block_tag()
         *         does not fill it in. */
        uint32_t checksum;
};
639
640 /**@brief  Extract information from a block tag.
641  * @param  __tag pointer to the block tag
642  * @param  tag_bytes block tag size of this jbd filesystem
643  * @param  remaining size in buffer containing the block tag
644  * @param  tag_info information of this tag.
645  * @return  EOK when succeed, otherwise return EINVAL.*/
646 static int
647 jbd_extract_block_tag(struct jbd_fs *jbd_fs,
648                       void *__tag,
649                       int tag_bytes,
650                       int32_t remain_buf_size,
651                       struct tag_info *tag_info)
652 {
653         char *uuid_start;
654         tag_info->tag_bytes = tag_bytes;
655         tag_info->uuid_exist = false;
656         tag_info->last_tag = false;
657         tag_info->is_escape = false;
658
659         /* See whether it is possible to hold a valid block tag.*/
660         if (remain_buf_size - tag_bytes < 0)
661                 return EINVAL;
662
663         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
664                                      JBD_FEATURE_INCOMPAT_CSUM_V3)) {
665                 struct jbd_block_tag3 *tag = __tag;
666                 tag_info->block = jbd_get32(tag, blocknr);
667                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
668                                              JBD_FEATURE_INCOMPAT_64BIT))
669                          tag_info->block |=
670                                  (uint64_t)jbd_get32(tag, blocknr_high) << 32;
671
672                 if (jbd_get32(tag, flags) & JBD_FLAG_ESCAPE)
673                         tag_info->is_escape = true;
674
675                 if (!(jbd_get32(tag, flags) & JBD_FLAG_SAME_UUID)) {
676                         /* See whether it is possible to hold UUID part.*/
677                         if (remain_buf_size - tag_bytes < UUID_SIZE)
678                                 return EINVAL;
679
680                         uuid_start = (char *)tag + tag_bytes;
681                         tag_info->uuid_exist = true;
682                         tag_info->tag_bytes += UUID_SIZE;
683                         memcpy(tag_info->uuid, uuid_start, UUID_SIZE);
684                 }
685
686                 if (jbd_get32(tag, flags) & JBD_FLAG_LAST_TAG)
687                         tag_info->last_tag = true;
688
689         } else {
690                 struct jbd_block_tag *tag = __tag;
691                 tag_info->block = jbd_get32(tag, blocknr);
692                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
693                                              JBD_FEATURE_INCOMPAT_64BIT))
694                          tag_info->block |=
695                                  (uint64_t)jbd_get32(tag, blocknr_high) << 32;
696
697                 if (jbd_get16(tag, flags) & JBD_FLAG_ESCAPE)
698                         tag_info->is_escape = true;
699
700                 if (!(jbd_get16(tag, flags) & JBD_FLAG_SAME_UUID)) {
701                         /* See whether it is possible to hold UUID part.*/
702                         if (remain_buf_size - tag_bytes < UUID_SIZE)
703                                 return EINVAL;
704
705                         uuid_start = (char *)tag + tag_bytes;
706                         tag_info->uuid_exist = true;
707                         tag_info->tag_bytes += UUID_SIZE;
708                         memcpy(tag_info->uuid, uuid_start, UUID_SIZE);
709                 }
710
711                 if (jbd_get16(tag, flags) & JBD_FLAG_LAST_TAG)
712                         tag_info->last_tag = true;
713
714         }
715         return EOK;
716 }
717
718 /**@brief  Write information to a block tag.
719  * @param  __tag pointer to the block tag
720  * @param  remaining size in buffer containing the block tag
721  * @param  tag_info information of this tag.
722  * @return  EOK when succeed, otherwise return EINVAL.*/
723 static int
724 jbd_write_block_tag(struct jbd_fs *jbd_fs,
725                     void *__tag,
726                     int32_t remain_buf_size,
727                     struct tag_info *tag_info)
728 {
729         char *uuid_start;
730         int tag_bytes = jbd_tag_bytes(jbd_fs);
731
732         tag_info->tag_bytes = tag_bytes;
733
734         /* See whether it is possible to hold a valid block tag.*/
735         if (remain_buf_size - tag_bytes < 0)
736                 return EINVAL;
737
738         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
739                                      JBD_FEATURE_INCOMPAT_CSUM_V3)) {
740                 struct jbd_block_tag3 *tag = __tag;
741                 memset(tag, 0, sizeof(struct jbd_block_tag3));
742                 jbd_set32(tag, blocknr, (uint32_t)tag_info->block);
743                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
744                                              JBD_FEATURE_INCOMPAT_64BIT))
745                         jbd_set32(tag, blocknr_high, tag_info->block >> 32);
746
747                 if (tag_info->uuid_exist) {
748                         /* See whether it is possible to hold UUID part.*/
749                         if (remain_buf_size - tag_bytes < UUID_SIZE)
750                                 return EINVAL;
751
752                         uuid_start = (char *)tag + tag_bytes;
753                         tag_info->tag_bytes += UUID_SIZE;
754                         memcpy(uuid_start, tag_info->uuid, UUID_SIZE);
755                 } else
756                         jbd_set32(tag, flags,
757                                   jbd_get32(tag, flags) | JBD_FLAG_SAME_UUID);
758
759                 jbd_block_tag_csum_set(jbd_fs, __tag, tag_info->checksum);
760
761                 if (tag_info->last_tag)
762                         jbd_set32(tag, flags,
763                                   jbd_get32(tag, flags) | JBD_FLAG_LAST_TAG);
764
765                 if (tag_info->is_escape)
766                         jbd_set32(tag, flags,
767                                   jbd_get32(tag, flags) | JBD_FLAG_ESCAPE);
768
769         } else {
770                 struct jbd_block_tag *tag = __tag;
771                 memset(tag, 0, sizeof(struct jbd_block_tag));
772                 jbd_set32(tag, blocknr, (uint32_t)tag_info->block);
773                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
774                                              JBD_FEATURE_INCOMPAT_64BIT))
775                         jbd_set32(tag, blocknr_high, tag_info->block >> 32);
776
777                 if (tag_info->uuid_exist) {
778                         /* See whether it is possible to hold UUID part.*/
779                         if (remain_buf_size - tag_bytes < UUID_SIZE)
780                                 return EINVAL;
781
782                         uuid_start = (char *)tag + tag_bytes;
783                         tag_info->tag_bytes += UUID_SIZE;
784                         memcpy(uuid_start, tag_info->uuid, UUID_SIZE);
785                 } else
786                         jbd_set16(tag, flags,
787                                   jbd_get16(tag, flags) | JBD_FLAG_SAME_UUID);
788
789                 jbd_block_tag_csum_set(jbd_fs, __tag, tag_info->checksum);
790
791                 if (tag_info->last_tag)
792                         jbd_set16(tag, flags,
793                                   jbd_get16(tag, flags) | JBD_FLAG_LAST_TAG);
794
795
796                 if (tag_info->is_escape)
797                         jbd_set16(tag, flags,
798                                   jbd_get16(tag, flags) | JBD_FLAG_ESCAPE);
799
800         }
801         return EOK;
802 }
803
804 /**@brief  Iterate all block tags in a block.
805  * @param  jbd_fs jbd filesystem
806  * @param  __tag_start pointer to the block
807  * @param  tag_tbl_size size of the block
808  * @param  func callback routine to indicate that
809  *         a block tag is found
810  * @param  arg additional argument to be passed to func */
811 static void
812 jbd_iterate_block_table(struct jbd_fs *jbd_fs,
813                         void *__tag_start,
814                         int32_t tag_tbl_size,
815                         void (*func)(struct jbd_fs * jbd_fs,
816                                      struct tag_info *tag_info,
817                                      void *arg),
818                         void *arg)
819 {
820         char *tag_start, *tag_ptr;
821         int tag_bytes = jbd_tag_bytes(jbd_fs);
822         tag_start = __tag_start;
823         tag_ptr = tag_start;
824
825         /* Cut off the size of block tail storing checksum. */
826         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
827                                      JBD_FEATURE_INCOMPAT_CSUM_V2) ||
828             JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
829                                      JBD_FEATURE_INCOMPAT_CSUM_V3))
830                 tag_tbl_size -= sizeof(struct jbd_block_tail);
831
832         while (tag_tbl_size) {
833                 struct tag_info tag_info;
834                 int rc = jbd_extract_block_tag(jbd_fs,
835                                       tag_ptr,
836                                       tag_bytes,
837                                       tag_tbl_size,
838                                       &tag_info);
839                 if (rc != EOK)
840                         break;
841
842                 if (func)
843                         func(jbd_fs, &tag_info, arg);
844
845                 /* Stop the iteration when we reach the last tag. */
846                 if (tag_info.last_tag)
847                         break;
848
849                 tag_ptr += tag_info.tag_bytes;
850                 tag_tbl_size -= tag_info.tag_bytes;
851         }
852 }
853
854 static void jbd_display_block_tags(struct jbd_fs *jbd_fs,
855                                    struct tag_info *tag_info,
856                                    void *arg)
857 {
858         uint32_t *iblock = arg;
859         ext4_dbg(DEBUG_JBD, "Block in block_tag: %" PRIu64 "\n", tag_info->block);
860         (*iblock)++;
861         wrap(&jbd_fs->sb, *iblock);
862         (void)jbd_fs;
863         return;
864 }
865
866 static struct revoke_entry *
867 jbd_revoke_entry_lookup(struct recover_info *info, ext4_fsblk_t block)
868 {
869         struct revoke_entry tmp = {
870                 .block = block
871         };
872
873         return RB_FIND(jbd_revoke, &info->revoke_root, &tmp);
874 }
875
876 /**@brief  Replay a block in a transaction.
877  * @param  jbd_fs jbd filesystem
878  * @param  tag_info tag_info of the logged block.*/
879 static void jbd_replay_block_tags(struct jbd_fs *jbd_fs,
880                                   struct tag_info *tag_info,
881                                   void *__arg)
882 {
883         int r;
884         struct replay_arg *arg = __arg;
885         struct recover_info *info = arg->info;
886         uint32_t *this_block = arg->this_block;
887         struct revoke_entry *revoke_entry;
888         struct ext4_block journal_block, ext4_block;
889         struct ext4_fs *fs = jbd_fs->inode_ref.fs;
890
891         (*this_block)++;
892         wrap(&jbd_fs->sb, *this_block);
893
894         /* We replay this block only if the current transaction id
895          * is equal or greater than that in revoke entry.*/
896         revoke_entry = jbd_revoke_entry_lookup(info, tag_info->block);
897         if (revoke_entry &&
898             trans_id_diff(arg->this_trans_id, revoke_entry->trans_id) <= 0)
899                 return;
900
901         ext4_dbg(DEBUG_JBD,
902                  "Replaying block in block_tag: %" PRIu64 "\n",
903                  tag_info->block);
904
905         r = jbd_block_get(jbd_fs, &journal_block, *this_block);
906         if (r != EOK)
907                 return;
908
909         /* We need special treatment for ext4 superblock. */
910         if (tag_info->block) {
911                 r = ext4_block_get_noread(fs->bdev, &ext4_block, tag_info->block);
912                 if (r != EOK) {
913                         jbd_block_set(jbd_fs, &journal_block);
914                         return;
915                 }
916
917                 memcpy(ext4_block.data,
918                         journal_block.data,
919                         jbd_get32(&jbd_fs->sb, blocksize));
920
921                 if (tag_info->is_escape)
922                         ((struct jbd_bhdr *)ext4_block.data)->magic =
923                                         to_be32(JBD_MAGIC_NUMBER);
924
925                 ext4_bcache_set_dirty(ext4_block.buf);
926                 ext4_block_set(fs->bdev, &ext4_block);
927         } else {
928                 uint16_t mount_count, state;
929                 mount_count = ext4_get16(&fs->sb, mount_count);
930                 state = ext4_get16(&fs->sb, state);
931
932                 memcpy(&fs->sb,
933                         journal_block.data + EXT4_SUPERBLOCK_OFFSET,
934                         EXT4_SUPERBLOCK_SIZE);
935
936                 /* Mark system as mounted */
937                 ext4_set16(&fs->sb, state, state);
938                 r = ext4_sb_write(fs->bdev, &fs->sb);
939                 if (r != EOK)
940                         return;
941
942                 /*Update mount count*/
943                 ext4_set16(&fs->sb, mount_count, mount_count);
944         }
945
946         jbd_block_set(jbd_fs, &journal_block);
947         
948         return;
949 }
950
951 /**@brief  Add block address to revoke tree, along with
952  *         its transaction id.
953  * @param  info  journal replay info
954  * @param  block  block address to be replayed.*/
955 static void jbd_add_revoke_block_tags(struct recover_info *info,
956                                       ext4_fsblk_t block)
957 {
958         struct revoke_entry *revoke_entry;
959
960         ext4_dbg(DEBUG_JBD, "Add block %" PRIu64 " to revoke tree\n", block);
961         /* If the revoke entry with respect to the block address
962          * exists already, update its transaction id.*/
963         revoke_entry = jbd_revoke_entry_lookup(info, block);
964         if (revoke_entry) {
965                 revoke_entry->trans_id = info->this_trans_id;
966                 return;
967         }
968
969         revoke_entry = jbd_alloc_revoke_entry();
970         ext4_assert(revoke_entry);
971         revoke_entry->block = block;
972         revoke_entry->trans_id = info->this_trans_id;
973         RB_INSERT(jbd_revoke, &info->revoke_root, revoke_entry);
974
975         return;
976 }
977
978 static void jbd_destroy_revoke_tree(struct recover_info *info)
979 {
980         while (!RB_EMPTY(&info->revoke_root)) {
981                 struct revoke_entry *revoke_entry =
982                         RB_MIN(jbd_revoke, &info->revoke_root);
983                 ext4_assert(revoke_entry);
984                 RB_REMOVE(jbd_revoke, &info->revoke_root, revoke_entry);
985                 jbd_free_revoke_entry(revoke_entry);
986         }
987 }
988
989
/* Pass selectors accepted by jbd_iterate_log(). */
/* Count the committed transactions and record the last transaction id. */
#define ACTION_SCAN 0
/* Feed the revoke blocks of the log into the revoke tree. */
#define ACTION_REVOKE 1
/* Replay the blocks listed in the descriptor blocks of the log. */
#define ACTION_RECOVER 2
993
994 /**@brief  Add entries in a revoke block to revoke tree.
995  * @param  jbd_fs jbd filesystem
996  * @param  header revoke block header
997  * @param  recover_info  journal replay info*/
998 static void jbd_build_revoke_tree(struct jbd_fs *jbd_fs,
999                                   struct jbd_bhdr *header,
1000                                   struct recover_info *info)
1001 {
1002         char *blocks_entry;
1003         struct jbd_revoke_header *revoke_hdr =
1004                 (struct jbd_revoke_header *)header;
1005         uint32_t i, nr_entries, record_len = 4;
1006
1007         /* If we are working on a 64bit jbd filesystem, */
1008         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
1009                                      JBD_FEATURE_INCOMPAT_64BIT))
1010                 record_len = 8;
1011
1012         nr_entries = (jbd_get32(revoke_hdr, count) -
1013                         sizeof(struct jbd_revoke_header)) /
1014                         record_len;
1015
1016         blocks_entry = (char *)(revoke_hdr + 1);
1017
1018         for (i = 0;i < nr_entries;i++) {
1019                 if (record_len == 8) {
1020                         uint64_t *blocks =
1021                                 (uint64_t *)blocks_entry;
1022                         jbd_add_revoke_block_tags(info, to_be64(*blocks));
1023                 } else {
1024                         uint32_t *blocks =
1025                                 (uint32_t *)blocks_entry;
1026                         jbd_add_revoke_block_tags(info, to_be32(*blocks));
1027                 }
1028                 blocks_entry += record_len;
1029         }
1030 }
1031
1032 static void jbd_debug_descriptor_block(struct jbd_fs *jbd_fs,
1033                                        struct jbd_bhdr *header,
1034                                        uint32_t *iblock)
1035 {
1036         jbd_iterate_block_table(jbd_fs,
1037                                 header + 1,
1038                                 jbd_get32(&jbd_fs->sb, blocksize) -
1039                                         sizeof(struct jbd_bhdr),
1040                                 jbd_display_block_tags,
1041                                 iblock);
1042 }
1043
1044 static void jbd_replay_descriptor_block(struct jbd_fs *jbd_fs,
1045                                         struct jbd_bhdr *header,
1046                                         struct replay_arg *arg)
1047 {
1048         jbd_iterate_block_table(jbd_fs,
1049                                 header + 1,
1050                                 jbd_get32(&jbd_fs->sb, blocksize) -
1051                                         sizeof(struct jbd_bhdr),
1052                                 jbd_replay_block_tags,
1053                                 arg);
1054 }
1055
/**@brief  The core routine of journal replay: walk the log from the
 *         start block and sequence number stored in the journal
 *         superblock and handle each descriptor, commit and revoke
 *         block according to action.
 * @param  jbd_fs jbd filesystem
 * @param  info  journal replay info; ACTION_SCAN fills trans_cnt,
 *         start_trans_id and last_trans_id, the other actions read
 *         trans_cnt and last_trans_id
 * @param  action ACTION_SCAN, ACTION_REVOKE or ACTION_RECOVER
 * @return standard error code (EIO when a pass other than ACTION_SCAN
 *         meets an unexpected transaction id, or the error returned
 *         by jbd_block_get)*/
static int jbd_iterate_log(struct jbd_fs *jbd_fs,
                           struct recover_info *info,
                           int action)
{
        int r = EOK;
        bool log_end = false;
        struct jbd_sb *sb = &jbd_fs->sb;
        uint32_t start_trans_id, this_trans_id;
        uint32_t start_block, this_block;

        /* We start iterating valid blocks in the whole journal.*/
        start_trans_id = this_trans_id = jbd_get32(sb, sequence);
        start_block = this_block = jbd_get32(sb, start);
        /* A scan recounts the transactions; the other passes have
         * nothing to do when the scan found none. */
        if (action == ACTION_SCAN)
                info->trans_cnt = 0;
        else if (!info->trans_cnt)
                log_end = true;

        ext4_dbg(DEBUG_JBD, "Start of journal at trans id: %" PRIu32 "\n",
                            start_trans_id);

        while (!log_end) {
                struct ext4_block block;
                struct jbd_bhdr *header;
                /* If we are not scanning for the last
                 * valid transaction in the journal,
                 * we will stop when we reach the end of
                 * the journal.*/
                if (action != ACTION_SCAN)
                        if (trans_id_diff(this_trans_id, info->last_trans_id) > 0) {
                                log_end = true;
                                continue;
                        }

                r = jbd_block_get(jbd_fs, &block, this_block);
                if (r != EOK)
                        break;

                header = (struct jbd_bhdr *)block.data;
                /* This block does not have a valid magic number,
                 * so we have reached the end of the journal.*/
                if (jbd_get32(header, magic) != JBD_MAGIC_NUMBER) {
                        jbd_block_set(jbd_fs, &block);
                        log_end = true;
                        continue;
                }

                /* If the transaction id we found is not expected,
                 * we may have reached the end of the journal.
                 *
                 * If we are not scanning the journal, something
                 * bad might have taken place. :-( */
                if (jbd_get32(header, sequence) != this_trans_id) {
                        if (action != ACTION_SCAN)
                                r = EIO;

                        jbd_block_set(jbd_fs, &block);
                        log_end = true;
                        continue;
                }

                switch (jbd_get32(header, blocktype)) {
                case JBD_DESCRIPTOR_BLOCK:
                        if (!jbd_verify_meta_csum(jbd_fs, header)) {
                                ext4_dbg(DEBUG_JBD,
                                        DBG_WARN "Descriptor block checksum failed."
                                                "Journal block: %" PRIu32"\n",
                                                this_block);
                                log_end = true;
                                break;
                        }
                        ext4_dbg(DEBUG_JBD, "Descriptor block: %" PRIu32", "
                                            "trans_id: %" PRIu32"\n",
                                            this_block, this_trans_id);
                        /* Both helpers advance this_block past the
                         * blocks described by the tags; only the
                         * recover pass writes them back. */
                        if (action == ACTION_RECOVER) {
                                struct replay_arg replay_arg;
                                replay_arg.info = info;
                                replay_arg.this_block = &this_block;
                                replay_arg.this_trans_id = this_trans_id;

                                jbd_replay_descriptor_block(jbd_fs,
                                                header, &replay_arg);
                        } else
                                jbd_debug_descriptor_block(jbd_fs,
                                                header, &this_block);

                        break;
                case JBD_COMMIT_BLOCK:
                        if (!jbd_verify_commit_csum(jbd_fs,
                                        (struct jbd_commit_header *)header)) {
                                ext4_dbg(DEBUG_JBD,
                                        DBG_WARN "Commit block checksum failed."
                                                "Journal block: %" PRIu32"\n",
                                                this_block);
                                log_end = true;
                                break;
                        }
                        ext4_dbg(DEBUG_JBD, "Commit block: %" PRIu32", "
                                            "trans_id: %" PRIu32"\n",
                                            this_block, this_trans_id);
                        /*
                         * This is the end of a transaction,
                         * we may now proceed to the next transaction.
                         */
                        this_trans_id++;
                        if (action == ACTION_SCAN)
                                info->trans_cnt++;
                        break;
                case JBD_REVOKE_BLOCK:
                        if (!jbd_verify_meta_csum(jbd_fs, header)) {
                                ext4_dbg(DEBUG_JBD,
                                        DBG_WARN "Revoke block checksum failed."
                                                "Journal block: %" PRIu32"\n",
                                                this_block);
                                log_end = true;
                                break;
                        }
                        ext4_dbg(DEBUG_JBD, "Revoke block: %" PRIu32", "
                                            "trans_id: %" PRIu32"\n",
                                            this_block, this_trans_id);
                        /* Revoke records are collected only during
                         * the revoke pass. */
                        if (action == ACTION_REVOKE) {
                                info->this_trans_id = this_trans_id;
                                jbd_build_revoke_tree(jbd_fs,
                                                header, info);
                        }
                        break;
                default:
                        /* Any other block type ends the walk. */
                        log_end = true;
                        break;
                }
                /* Release the block and move on; stop once the cursor
                 * has wrapped around to the block we started from. */
                jbd_block_set(jbd_fs, &block);
                this_block++;
                wrap(sb, this_block);
                if (this_block == start_block)
                        log_end = true;

        }
        ext4_dbg(DEBUG_JBD, "End of journal.\n");
        if (r == EOK && action == ACTION_SCAN) {
                /* We have finished scanning the journal. */
                info->start_trans_id = start_trans_id;
                /* this_trans_id was bumped after each commit block, so
                 * when it moved past the start id the last transaction
                 * seen is the one before it. */
                if (trans_id_diff(this_trans_id, start_trans_id) > 0)
                        info->last_trans_id = this_trans_id - 1;
                else
                        info->last_trans_id = this_trans_id;
        }

        return r;
}
1210
1211 /**@brief  Replay journal.
1212  * @param  jbd_fs jbd filesystem
1213  * @return standard error code*/
1214 int jbd_recover(struct jbd_fs *jbd_fs)
1215 {
1216         int r;
1217         struct recover_info info;
1218         struct jbd_sb *sb = &jbd_fs->sb;
1219         if (!sb->start)
1220                 return EOK;
1221
1222         RB_INIT(&info.revoke_root);
1223
1224         r = jbd_iterate_log(jbd_fs, &info, ACTION_SCAN);
1225         if (r != EOK)
1226                 return r;
1227
1228         r = jbd_iterate_log(jbd_fs, &info, ACTION_REVOKE);
1229         if (r != EOK)
1230                 return r;
1231
1232         r = jbd_iterate_log(jbd_fs, &info, ACTION_RECOVER);
1233         if (r == EOK) {
1234                 /* If we successfully replay the journal,
1235                  * clear EXT4_FINCOM_RECOVER flag on the
1236                  * ext4 superblock, and set the start of
1237                  * journal to 0.*/
1238                 uint32_t features_incompatible =
1239                         ext4_get32(&jbd_fs->inode_ref.fs->sb,
1240                                    features_incompatible);
1241                 jbd_set32(&jbd_fs->sb, start, 0);
1242                 jbd_set32(&jbd_fs->sb, sequence, info.last_trans_id);
1243                 features_incompatible &= ~EXT4_FINCOM_RECOVER;
1244                 ext4_set32(&jbd_fs->inode_ref.fs->sb,
1245                            features_incompatible,
1246                            features_incompatible);
1247                 jbd_fs->dirty = true;
1248                 r = ext4_sb_write(jbd_fs->bdev,
1249                                   &jbd_fs->inode_ref.fs->sb);
1250         }
1251         jbd_destroy_revoke_tree(&info);
1252         return r;
1253 }
1254
1255 static void jbd_journal_write_sb(struct jbd_journal *journal)
1256 {
1257         struct jbd_fs *jbd_fs = journal->jbd_fs;
1258         jbd_set32(&jbd_fs->sb, start, journal->start);
1259         jbd_set32(&jbd_fs->sb, sequence, journal->trans_id);
1260         jbd_fs->dirty = true;
1261 }
1262
1263 /**@brief  Start accessing the journal.
1264  * @param  jbd_fs jbd filesystem
1265  * @param  journal current journal session
1266  * @return standard error code*/
1267 int jbd_journal_start(struct jbd_fs *jbd_fs,
1268                       struct jbd_journal *journal)
1269 {
1270         int r;
1271         uint32_t features_incompatible =
1272                         ext4_get32(&jbd_fs->inode_ref.fs->sb,
1273                                    features_incompatible);
1274         features_incompatible |= EXT4_FINCOM_RECOVER;
1275         ext4_set32(&jbd_fs->inode_ref.fs->sb,
1276                         features_incompatible,
1277                         features_incompatible);
1278         r = ext4_sb_write(jbd_fs->bdev,
1279                         &jbd_fs->inode_ref.fs->sb);
1280         if (r != EOK)
1281                 return r;
1282
1283         journal->first = jbd_get32(&jbd_fs->sb, first);
1284         journal->start = journal->first;
1285         journal->last = journal->first;
1286         /*
1287          * To invalidate any stale records we need to start from
1288          * the checkpoint transaction ID of the previous journalling session
1289          * plus 1.
1290          */
1291         journal->trans_id = jbd_get32(&jbd_fs->sb, sequence) + 1;
1292         journal->alloc_trans_id = journal->trans_id;
1293
1294         journal->block_size = jbd_get32(&jbd_fs->sb, blocksize);
1295
1296         TAILQ_INIT(&journal->cp_queue);
1297         RB_INIT(&journal->block_rec_root);
1298         journal->jbd_fs = jbd_fs;
1299         jbd_journal_write_sb(journal);
1300         r = jbd_write_sb(jbd_fs);
1301         if (r != EOK)
1302                 return r;
1303
1304         jbd_fs->bdev->journal = journal;
1305         return EOK;
1306 }
1307
/* Forward declaration: this routine is installed as a buffer's
 * end_write callback and is also called directly by the functions
 * below, so it must be declared before them. */
static void jbd_trans_end_write(struct ext4_bcache *bc __unused,
                          struct ext4_buf *buf __unused,
                          int res,
                          void *arg);
1312
1313 /*
1314  * This routine is only suitable to committed transactions. */
1315 static void jbd_journal_flush_trans(struct jbd_trans *trans)
1316 {
1317         struct jbd_buf *jbd_buf, *tmp;
1318         struct jbd_journal *journal = trans->journal;
1319         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1320         void *tmp_data = ext4_malloc(journal->block_size);
1321         ext4_assert(tmp_data);
1322
1323         TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node,
1324                         tmp) {
1325                 struct ext4_buf *buf;
1326                 struct ext4_block block;
1327                 /* The buffer is not yet flushed. */
1328                 buf = ext4_bcache_find_get(fs->bdev->bc, &block,
1329                                            jbd_buf->block_rec->lba);
1330                 if (!(buf && ext4_bcache_test_flag(buf, BC_UPTODATE) &&
1331                       jbd_buf->block_rec->trans == trans)) {
1332                         int r;
1333                         struct ext4_block jbd_block = EXT4_BLOCK_ZERO();
1334                         ext4_assert(jbd_block_get(journal->jbd_fs,
1335                                                 &jbd_block,
1336                                                 jbd_buf->jbd_lba) == EOK);
1337                         memcpy(tmp_data, jbd_block.data,
1338                                         journal->block_size);
1339                         ext4_block_set(fs->bdev, &jbd_block);
1340                         r = ext4_blocks_set_direct(fs->bdev, tmp_data,
1341                                         jbd_buf->block_rec->lba, 1);
1342                         jbd_trans_end_write(fs->bdev->bc, buf, r, jbd_buf);
1343                 } else
1344                         ext4_block_flush_buf(fs->bdev, buf);
1345
1346                 if (buf)
1347                         ext4_block_set(fs->bdev, &block);
1348         }
1349
1350         ext4_free(tmp_data);
1351 }
1352
1353 static void
1354 jbd_journal_skip_pure_revoke(struct jbd_journal *journal,
1355                              struct jbd_trans *trans)
1356 {
1357         journal->start = trans->start_iblock +
1358                 trans->alloc_blocks;
1359         wrap(&journal->jbd_fs->sb, journal->start);
1360         journal->trans_id = trans->trans_id + 1;
1361         jbd_journal_free_trans(journal,
1362                         trans, false);
1363         jbd_journal_write_sb(journal);
1364 }
1365
1366 void
1367 jbd_journal_purge_cp_trans(struct jbd_journal *journal,
1368                            bool flush,
1369                            bool once)
1370 {
1371         struct jbd_trans *trans;
1372         while ((trans = TAILQ_FIRST(&journal->cp_queue))) {
1373                 if (!trans->data_cnt) {
1374                         TAILQ_REMOVE(&journal->cp_queue,
1375                                         trans,
1376                                         trans_node);
1377                         jbd_journal_skip_pure_revoke(journal, trans);
1378                 } else {
1379                         if (trans->data_cnt ==
1380                                         trans->written_cnt) {
1381                                 journal->start =
1382                                         trans->start_iblock +
1383                                         trans->alloc_blocks;
1384                                 wrap(&journal->jbd_fs->sb,
1385                                                 journal->start);
1386                                 journal->trans_id =
1387                                         trans->trans_id + 1;
1388                                 TAILQ_REMOVE(&journal->cp_queue,
1389                                                 trans,
1390                                                 trans_node);
1391                                 jbd_journal_free_trans(journal,
1392                                                 trans,
1393                                                 false);
1394                                 jbd_journal_write_sb(journal);
1395                         } else if (!flush) {
1396                                 journal->start =
1397                                         trans->start_iblock;
1398                                 wrap(&journal->jbd_fs->sb,
1399                                                 journal->start);
1400                                 journal->trans_id =
1401                                         trans->trans_id;
1402                                 jbd_journal_write_sb(journal);
1403                                 break;
1404                         } else
1405                                 jbd_journal_flush_trans(trans);
1406                 }
1407                 if (once)
1408                         break;
1409         }
1410 }
1411
1412 /**@brief  Stop accessing the journal.
1413  * @param  journal current journal session
1414  * @return standard error code*/
1415 int jbd_journal_stop(struct jbd_journal *journal)
1416 {
1417         int r;
1418         struct jbd_fs *jbd_fs = journal->jbd_fs;
1419         uint32_t features_incompatible;
1420
1421         /* Make sure that journalled content have reached
1422          * the disk.*/
1423         jbd_journal_purge_cp_trans(journal, true, false);
1424
1425         /* There should be no block record in this journal
1426          * session. */
1427         if (!RB_EMPTY(&journal->block_rec_root))
1428                 ext4_dbg(DEBUG_JBD,
1429                          DBG_WARN "There are still block records "
1430                                   "in this journal session!\n");
1431
1432         features_incompatible =
1433                 ext4_get32(&jbd_fs->inode_ref.fs->sb,
1434                            features_incompatible);
1435         features_incompatible &= ~EXT4_FINCOM_RECOVER;
1436         ext4_set32(&jbd_fs->inode_ref.fs->sb,
1437                         features_incompatible,
1438                         features_incompatible);
1439         r = ext4_sb_write(jbd_fs->bdev,
1440                         &jbd_fs->inode_ref.fs->sb);
1441         if (r != EOK)
1442                 return r;
1443
1444         journal->start = 0;
1445         journal->trans_id = 0;
1446         jbd_journal_write_sb(journal);
1447         return jbd_write_sb(journal->jbd_fs);
1448 }
1449
1450 /**@brief  Allocate a block in the journal.
1451  * @param  journal current journal session
1452  * @param  trans transaction
1453  * @return allocated block address*/
1454 static uint32_t jbd_journal_alloc_block(struct jbd_journal *journal,
1455                                         struct jbd_trans *trans)
1456 {
1457         uint32_t start_block;
1458
1459         start_block = journal->last++;
1460         trans->alloc_blocks++;
1461         wrap(&journal->jbd_fs->sb, journal->last);
1462         
1463         /* If there is no space left, flush just one journalled
1464          * transaction.*/
1465         if (journal->last == journal->start) {
1466                 jbd_journal_purge_cp_trans(journal, true, true);
1467                 ext4_assert(journal->last != journal->start);
1468         }
1469
1470         return start_block;
1471 }
1472
1473 static struct jbd_block_rec *
1474 jbd_trans_block_rec_lookup(struct jbd_journal *journal,
1475                            ext4_fsblk_t lba)
1476 {
1477         struct jbd_block_rec tmp = {
1478                 .lba = lba
1479         };
1480
1481         return RB_FIND(jbd_block,
1482                        &journal->block_rec_root,
1483                        &tmp);
1484 }
1485
1486 static void
1487 jbd_trans_change_ownership(struct jbd_block_rec *block_rec,
1488                            struct jbd_trans *new_trans)
1489 {
1490         LIST_REMOVE(block_rec, tbrec_node);
1491         if (new_trans) {
1492                 /* Now this block record belongs to this transaction. */
1493                 LIST_INSERT_HEAD(&new_trans->tbrec_list, block_rec, tbrec_node);
1494         }
1495         block_rec->trans = new_trans;
1496 }
1497
1498 static inline struct jbd_block_rec *
1499 jbd_trans_insert_block_rec(struct jbd_trans *trans,
1500                            ext4_fsblk_t lba)
1501 {
1502         struct jbd_block_rec *block_rec;
1503         block_rec = jbd_trans_block_rec_lookup(trans->journal, lba);
1504         if (block_rec) {
1505                 jbd_trans_change_ownership(block_rec, trans);
1506                 return block_rec;
1507         }
1508         block_rec = ext4_calloc(1, sizeof(struct jbd_block_rec));
1509         if (!block_rec)
1510                 return NULL;
1511
1512         block_rec->lba = lba;
1513         block_rec->trans = trans;
1514         TAILQ_INIT(&block_rec->dirty_buf_queue);
1515         LIST_INSERT_HEAD(&trans->tbrec_list, block_rec, tbrec_node);
1516         RB_INSERT(jbd_block, &trans->journal->block_rec_root, block_rec);
1517         return block_rec;
1518 }
1519
1520 /*
1521  * This routine will do the dirty works.
1522  */
1523 static void
1524 jbd_trans_finish_callback(struct jbd_journal *journal,
1525                           const struct jbd_trans *trans,
1526                           struct jbd_block_rec *block_rec,
1527                           bool abort,
1528                           bool revoke)
1529 {
1530         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1531         if (block_rec->trans != trans)
1532                 return;
1533
1534         if (!abort) {
1535                 struct jbd_buf *jbd_buf, *tmp;
1536                 TAILQ_FOREACH_SAFE(jbd_buf,
1537                                 &block_rec->dirty_buf_queue,
1538                                 dirty_buf_node,
1539                                 tmp) {
1540                         jbd_trans_end_write(fs->bdev->bc,
1541                                         NULL,
1542                                         EOK,
1543                                         jbd_buf);
1544                 }
1545         } else {
1546                 /*
1547                  * We have to roll back data if the block is going to be
1548                  * aborted.
1549                  */
1550                 struct jbd_buf *jbd_buf;
1551                 struct ext4_block jbd_block = EXT4_BLOCK_ZERO(),
1552                                   block = EXT4_BLOCK_ZERO();
1553                 jbd_buf = TAILQ_LAST(&block_rec->dirty_buf_queue,
1554                                 jbd_buf_dirty);
1555                 if (jbd_buf) {
1556                         if (!revoke) {
1557                                 ext4_assert(ext4_block_get_noread(fs->bdev,
1558                                                         &block,
1559                                                         block_rec->lba) == EOK);
1560                                 ext4_assert(jbd_block_get(journal->jbd_fs,
1561                                                         &jbd_block,
1562                                                         jbd_buf->jbd_lba) == EOK);
1563                                 memcpy(block.data, jbd_block.data,
1564                                                 journal->block_size);
1565
1566                                 jbd_trans_change_ownership(block_rec,
1567                                                 jbd_buf->trans);
1568
1569                                 block.buf->end_write = jbd_trans_end_write;
1570                                 block.buf->end_write_arg = jbd_buf;
1571
1572                                 ext4_bcache_set_flag(jbd_block.buf, BC_TMP);
1573                                 ext4_bcache_set_dirty(block.buf);
1574
1575                                 ext4_block_set(fs->bdev, &jbd_block);
1576                                 ext4_block_set(fs->bdev, &block);
1577                                 return;
1578                         } else {
1579                                 /* The revoked buffer is yet written. */
1580                                 jbd_trans_change_ownership(block_rec,
1581                                                 jbd_buf->trans);
1582                         }
1583                 }
1584         }
1585 }
1586
1587 static inline void
1588 jbd_trans_remove_block_rec(struct jbd_journal *journal,
1589                            struct jbd_block_rec *block_rec,
1590                            struct jbd_trans *trans)
1591 {
1592         /* If this block record doesn't belong to this transaction,
1593          * give up.*/
1594         if (block_rec->trans == trans) {
1595                 LIST_REMOVE(block_rec, tbrec_node);
1596                 RB_REMOVE(jbd_block,
1597                                 &journal->block_rec_root,
1598                                 block_rec);
1599                 ext4_free(block_rec);
1600         }
1601 }
1602
1603 /**@brief  Add block to a transaction and mark it dirty.
1604  * @param  trans transaction
1605  * @param  block block descriptor
1606  * @return standard error code*/
1607 int jbd_trans_set_block_dirty(struct jbd_trans *trans,
1608                               struct ext4_block *block)
1609 {
1610         struct jbd_buf *jbd_buf;
1611         struct jbd_revoke_rec *rec, tmp_rec = {
1612                 .lba = block->lb_id
1613         };
1614         struct jbd_block_rec *block_rec;
1615
1616         if (block->buf->end_write == jbd_trans_end_write) {
1617                 jbd_buf = block->buf->end_write_arg;
1618                 if (jbd_buf && jbd_buf->trans == trans)
1619                         return EOK;
1620         }
1621         jbd_buf = ext4_calloc(1, sizeof(struct jbd_buf));
1622         if (!jbd_buf)
1623                 return ENOMEM;
1624
1625         if ((block_rec = jbd_trans_insert_block_rec(trans,
1626                                         block->lb_id)) == NULL) {
1627                 ext4_free(jbd_buf);
1628                 return ENOMEM;
1629         }
1630
1631         TAILQ_INSERT_TAIL(&block_rec->dirty_buf_queue,
1632                         jbd_buf,
1633                         dirty_buf_node);
1634
1635         jbd_buf->block_rec = block_rec;
1636         jbd_buf->trans = trans;
1637         jbd_buf->block = *block;
1638         ext4_bcache_inc_ref(block->buf);
1639
1640         /* If the content reach the disk, notify us
1641          * so that we may do a checkpoint. */
1642         block->buf->end_write = jbd_trans_end_write;
1643         block->buf->end_write_arg = jbd_buf;
1644
1645         trans->data_cnt++;
1646         TAILQ_INSERT_HEAD(&trans->buf_queue, jbd_buf, buf_node);
1647
1648         ext4_bcache_set_dirty(block->buf);
1649         rec = RB_FIND(jbd_revoke_tree,
1650                         &trans->revoke_root,
1651                         &tmp_rec);
1652         if (rec) {
1653                 RB_REMOVE(jbd_revoke_tree, &trans->revoke_root,
1654                           rec);
1655                 ext4_free(rec);
1656         }
1657
1658         return EOK;
1659 }
1660
1661 /**@brief  Add block to be revoked to a transaction
1662  * @param  trans transaction
1663  * @param  lba logical block address
1664  * @return standard error code*/
1665 int jbd_trans_revoke_block(struct jbd_trans *trans,
1666                            ext4_fsblk_t lba)
1667 {
1668         struct jbd_revoke_rec tmp_rec = {
1669                 .lba = lba
1670         }, *rec;
1671         rec = RB_FIND(jbd_revoke_tree,
1672                       &trans->revoke_root,
1673                       &tmp_rec);
1674         if (rec)
1675                 return EOK;
1676
1677         rec = ext4_calloc(1, sizeof(struct jbd_revoke_rec));
1678         if (!rec)
1679                 return ENOMEM;
1680
1681         rec->lba = lba;
1682         RB_INSERT(jbd_revoke_tree, &trans->revoke_root, rec);
1683         return EOK;
1684 }
1685
1686 /**@brief  Try to add block to be revoked to a transaction.
1687  *         If @lba still remains in an transaction on checkpoint
1688  *         queue, add @lba as a revoked block to the transaction.
1689  * @param  trans transaction
1690  * @param  lba logical block address
1691  * @return standard error code*/
1692 int jbd_trans_try_revoke_block(struct jbd_trans *trans,
1693                                ext4_fsblk_t lba)
1694 {
1695         struct jbd_journal *journal = trans->journal;
1696         struct jbd_block_rec *block_rec =
1697                 jbd_trans_block_rec_lookup(journal, lba);
1698
1699         if (block_rec) {
1700                 if (block_rec->trans == trans) {
1701                         struct jbd_buf *jbd_buf =
1702                                 TAILQ_LAST(&block_rec->dirty_buf_queue,
1703                                         jbd_buf_dirty);
1704                         /* If there are still unwritten buffers. */
1705                         if (TAILQ_FIRST(&block_rec->dirty_buf_queue) !=
1706                             jbd_buf)
1707                                 jbd_trans_revoke_block(trans, lba);
1708
1709                 } else
1710                         jbd_trans_revoke_block(trans, lba);
1711         }
1712
1713         return EOK;
1714 }
1715
1716 /**@brief  Free a transaction
1717  * @param  journal current journal session
1718  * @param  trans transaction
1719  * @param  abort discard all the modifications on the block?
1720  * @return standard error code*/
1721 void jbd_journal_free_trans(struct jbd_journal *journal,
1722                             struct jbd_trans *trans,
1723                             bool abort)
1724 {
1725         struct jbd_buf *jbd_buf, *tmp;
1726         struct jbd_revoke_rec *rec, *tmp2;
1727         struct jbd_block_rec *block_rec, *tmp3;
1728         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1729         TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node,
1730                           tmp) {
1731                 block_rec = jbd_buf->block_rec;
1732                 if (abort) {
1733                         jbd_buf->block.buf->end_write = NULL;
1734                         jbd_buf->block.buf->end_write_arg = NULL;
1735                         ext4_bcache_clear_dirty(jbd_buf->block.buf);
1736                         ext4_block_set(fs->bdev, &jbd_buf->block);
1737                 }
1738
1739                 TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue,
1740                         jbd_buf,
1741                         dirty_buf_node);
1742                 jbd_trans_finish_callback(journal,
1743                                 trans,
1744                                 block_rec,
1745                                 abort,
1746                                 false);
1747                 TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
1748                 ext4_free(jbd_buf);
1749         }
1750         RB_FOREACH_SAFE(rec, jbd_revoke_tree, &trans->revoke_root,
1751                           tmp2) {
1752                 RB_REMOVE(jbd_revoke_tree, &trans->revoke_root, rec);
1753                 ext4_free(rec);
1754         }
1755         LIST_FOREACH_SAFE(block_rec, &trans->tbrec_list, tbrec_node,
1756                           tmp3) {
1757                 jbd_trans_remove_block_rec(journal, block_rec, trans);
1758         }
1759
1760         ext4_free(trans);
1761 }
1762
1763 /**@brief  Write commit block for a transaction
1764  * @param  trans transaction
1765  * @return standard error code*/
1766 static int jbd_trans_write_commit_block(struct jbd_trans *trans)
1767 {
1768         int rc;
1769         struct ext4_block block;
1770         struct jbd_commit_header *header;
1771         uint32_t commit_iblock;
1772         struct jbd_journal *journal = trans->journal;
1773
1774         commit_iblock = jbd_journal_alloc_block(journal, trans);
1775
1776         rc = jbd_block_get_noread(journal->jbd_fs, &block, commit_iblock);
1777         if (rc != EOK)
1778                 return rc;
1779
1780         header = (struct jbd_commit_header *)block.data;
1781         jbd_set32(&header->header, magic, JBD_MAGIC_NUMBER);
1782         jbd_set32(&header->header, blocktype, JBD_COMMIT_BLOCK);
1783         jbd_set32(&header->header, sequence, trans->trans_id);
1784
1785         if (JBD_HAS_INCOMPAT_FEATURE(&journal->jbd_fs->sb,
1786                                 JBD_FEATURE_COMPAT_CHECKSUM)) {
1787                 jbd_set32(header, chksum_type, JBD_CRC32_CHKSUM);
1788                 jbd_set32(header, chksum_size, JBD_CRC32_CHKSUM_SIZE);
1789                 jbd_set32(header, chksum[0], trans->data_csum);
1790         }
1791         jbd_commit_csum_set(journal->jbd_fs, header);
1792         ext4_bcache_set_dirty(block.buf);
1793         ext4_bcache_set_flag(block.buf, BC_TMP);
1794         rc = jbd_block_set(journal->jbd_fs, &block);
1795         return rc;
1796 }
1797
1798 /**@brief  Write descriptor block for a transaction
1799  * @param  journal current journal session
1800  * @param  trans transaction
1801  * @return standard error code*/
1802 static int jbd_journal_prepare(struct jbd_journal *journal,
1803                                struct jbd_trans *trans)
1804 {
1805         int rc = EOK, i = 0;
1806         struct ext4_block desc_block = EXT4_BLOCK_ZERO(),
1807                           data_block = EXT4_BLOCK_ZERO();
1808         int32_t tag_tbl_size = 0;
1809         uint32_t desc_iblock = 0;
1810         uint32_t data_iblock = 0;
1811         char *tag_start = NULL, *tag_ptr = NULL;
1812         struct jbd_buf *jbd_buf, *tmp;
1813         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1814         uint32_t checksum = EXT4_CRC32_INIT;
1815         struct jbd_bhdr *bhdr = NULL;
1816         void *data;
1817
1818         /* Try to remove any non-dirty buffers from the tail of
1819          * buf_queue. */
1820         TAILQ_FOREACH_REVERSE_SAFE(jbd_buf, &trans->buf_queue,
1821                         jbd_trans_buf, buf_node, tmp) {
1822                 struct jbd_revoke_rec tmp_rec = {
1823                         .lba = jbd_buf->block_rec->lba
1824                 };
1825                 /* We stop the iteration when we find a dirty buffer. */
1826                 if (ext4_bcache_test_flag(jbd_buf->block.buf,
1827                                         BC_DIRTY))
1828                         break;
1829         
1830                 TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue,
1831                         jbd_buf,
1832                         dirty_buf_node);
1833
1834                 jbd_buf->block.buf->end_write = NULL;
1835                 jbd_buf->block.buf->end_write_arg = NULL;
1836                 jbd_trans_finish_callback(journal,
1837                                 trans,
1838                                 jbd_buf->block_rec,
1839                                 true,
1840                                 RB_FIND(jbd_revoke_tree,
1841                                         &trans->revoke_root,
1842                                         &tmp_rec));
1843                 jbd_trans_remove_block_rec(journal,
1844                                         jbd_buf->block_rec, trans);
1845                 trans->data_cnt--;
1846
1847                 ext4_block_set(fs->bdev, &jbd_buf->block);
1848                 TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
1849                 ext4_free(jbd_buf);
1850         }
1851
1852         TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node, tmp) {
1853                 struct tag_info tag_info;
1854                 bool uuid_exist = false;
1855                 bool is_escape = false;
1856                 struct jbd_revoke_rec tmp_rec = {
1857                         .lba = jbd_buf->block_rec->lba
1858                 };
1859                 if (!ext4_bcache_test_flag(jbd_buf->block.buf,
1860                                            BC_DIRTY)) {
1861                         TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue,
1862                                         jbd_buf,
1863                                         dirty_buf_node);
1864
1865                         jbd_buf->block.buf->end_write = NULL;
1866                         jbd_buf->block.buf->end_write_arg = NULL;
1867
1868                         /* The buffer has not been modified, just release
1869                          * that jbd_buf. */
1870                         jbd_trans_finish_callback(journal,
1871                                         trans,
1872                                         jbd_buf->block_rec,
1873                                         true,
1874                                         RB_FIND(jbd_revoke_tree,
1875                                                 &trans->revoke_root,
1876                                                 &tmp_rec));
1877                         jbd_trans_remove_block_rec(journal,
1878                                         jbd_buf->block_rec, trans);
1879                         trans->data_cnt--;
1880
1881                         ext4_block_set(fs->bdev, &jbd_buf->block);
1882                         TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
1883                         ext4_free(jbd_buf);
1884                         continue;
1885                 }
1886                 checksum = jbd_block_csum(journal->jbd_fs,
1887                                           jbd_buf->block.data,
1888                                           checksum,
1889                                           trans->trans_id);
1890                 if (((struct jbd_bhdr *)jbd_buf->block.data)->magic ==
1891                                 to_be32(JBD_MAGIC_NUMBER))
1892                         is_escape = true;
1893
1894 again:
1895                 if (!desc_iblock) {
1896                         desc_iblock = jbd_journal_alloc_block(journal, trans);
1897                         rc = jbd_block_get_noread(journal->jbd_fs, &desc_block, desc_iblock);
1898                         if (rc != EOK)
1899                                 break;
1900
1901                         bhdr = (struct jbd_bhdr *)desc_block.data;
1902                         jbd_set32(bhdr, magic, JBD_MAGIC_NUMBER);
1903                         jbd_set32(bhdr, blocktype, JBD_DESCRIPTOR_BLOCK);
1904                         jbd_set32(bhdr, sequence, trans->trans_id);
1905
1906                         tag_start = (char *)(bhdr + 1);
1907                         tag_ptr = tag_start;
1908                         uuid_exist = true;
1909                         tag_tbl_size = journal->block_size -
1910                                 sizeof(struct jbd_bhdr);
1911
1912                         if (jbd_has_csum(&journal->jbd_fs->sb))
1913                                 tag_tbl_size -= sizeof(struct jbd_block_tail);
1914
1915                         if (!trans->start_iblock)
1916                                 trans->start_iblock = desc_iblock;
1917
1918                         ext4_bcache_set_dirty(desc_block.buf);
1919                         ext4_bcache_set_flag(desc_block.buf, BC_TMP);
1920                 }
1921                 tag_info.block = jbd_buf->block.lb_id;
1922                 tag_info.uuid_exist = uuid_exist;
1923                 tag_info.is_escape = is_escape;
1924                 if (i == trans->data_cnt - 1)
1925                         tag_info.last_tag = true;
1926                 else
1927                         tag_info.last_tag = false;
1928
1929                 tag_info.checksum = checksum;
1930
1931                 if (uuid_exist)
1932                         memcpy(tag_info.uuid, journal->jbd_fs->sb.uuid,
1933                                         UUID_SIZE);
1934
1935                 rc = jbd_write_block_tag(journal->jbd_fs,
1936                                 tag_ptr,
1937                                 tag_tbl_size,
1938                                 &tag_info);
1939                 if (rc != EOK) {
1940                         jbd_meta_csum_set(journal->jbd_fs, bhdr);
1941                         desc_iblock = 0;
1942                         rc = jbd_block_set(journal->jbd_fs, &desc_block);
1943                         if (rc != EOK)
1944                                 break;
1945
1946                         goto again;
1947                 }
1948
1949                 data_iblock = jbd_journal_alloc_block(journal, trans);
1950                 rc = jbd_block_get_noread(journal->jbd_fs, &data_block, data_iblock);
1951                 if (rc != EOK) {
1952                         desc_iblock = 0;
1953                         ext4_bcache_clear_dirty(desc_block.buf);
1954                         jbd_block_set(journal->jbd_fs, &desc_block);
1955                         break;
1956                 }
1957
1958                 data = data_block.data;
1959                 memcpy(data, jbd_buf->block.data,
1960                         journal->block_size);
1961                 if (is_escape)
1962                         ((struct jbd_bhdr *)data)->magic = 0;
1963
1964                 ext4_bcache_set_dirty(data_block.buf);
1965                 ext4_bcache_set_flag(data_block.buf, BC_TMP);
1966                 rc = jbd_block_set(journal->jbd_fs, &data_block);
1967                 if (rc != EOK) {
1968                         desc_iblock = 0;
1969                         ext4_bcache_clear_dirty(desc_block.buf);
1970                         jbd_block_set(journal->jbd_fs, &desc_block);
1971                         break;
1972                 }
1973                 jbd_buf->jbd_lba = data_iblock;
1974
1975                 tag_ptr += tag_info.tag_bytes;
1976                 tag_tbl_size -= tag_info.tag_bytes;
1977
1978                 i++;
1979         }
1980         if (rc == EOK && desc_iblock) {
1981                 jbd_meta_csum_set(journal->jbd_fs,
1982                                 (struct jbd_bhdr *)bhdr);
1983                 trans->data_csum = checksum;
1984                 rc = jbd_block_set(journal->jbd_fs, &desc_block);
1985         }
1986
1987         return rc;
1988 }
1989
1990 /**@brief  Write revoke block for a transaction
1991  * @param  journal current journal session
1992  * @param  trans transaction
1993  * @return standard error code*/
1994 static int
1995 jbd_journal_prepare_revoke(struct jbd_journal *journal,
1996                            struct jbd_trans *trans)
1997 {
1998         int rc = EOK, i = 0;
1999         struct ext4_block desc_block = EXT4_BLOCK_ZERO();
2000         int32_t tag_tbl_size = 0;
2001         uint32_t desc_iblock = 0;
2002         char *blocks_entry = NULL;
2003         struct jbd_revoke_rec *rec, *tmp;
2004         struct jbd_revoke_header *header = NULL;
2005         int32_t record_len = 4;
2006         struct jbd_bhdr *bhdr = NULL;
2007
2008         if (JBD_HAS_INCOMPAT_FEATURE(&journal->jbd_fs->sb,
2009                                      JBD_FEATURE_INCOMPAT_64BIT))
2010                 record_len = 8;
2011
2012         RB_FOREACH_SAFE(rec, jbd_revoke_tree, &trans->revoke_root,
2013                           tmp) {
2014 again:
2015                 if (!desc_iblock) {
2016                         desc_iblock = jbd_journal_alloc_block(journal, trans);
2017                         rc = jbd_block_get_noread(journal->jbd_fs, &desc_block,
2018                                                   desc_iblock);
2019                         if (rc != EOK)
2020                                 break;
2021
2022                         bhdr = (struct jbd_bhdr *)desc_block.data;
2023                         jbd_set32(bhdr, magic, JBD_MAGIC_NUMBER);
2024                         jbd_set32(bhdr, blocktype, JBD_REVOKE_BLOCK);
2025                         jbd_set32(bhdr, sequence, trans->trans_id);
2026                         
2027                         header = (struct jbd_revoke_header *)bhdr;
2028                         blocks_entry = (char *)(header + 1);
2029                         tag_tbl_size = journal->block_size -
2030                                 sizeof(struct jbd_revoke_header);
2031
2032                         if (jbd_has_csum(&journal->jbd_fs->sb))
2033                                 tag_tbl_size -= sizeof(struct jbd_block_tail);
2034
2035                         if (!trans->start_iblock)
2036                                 trans->start_iblock = desc_iblock;
2037
2038                         ext4_bcache_set_dirty(desc_block.buf);
2039                         ext4_bcache_set_flag(desc_block.buf, BC_TMP);
2040                 }
2041
2042                 if (tag_tbl_size < record_len) {
2043                         jbd_set32(header, count,
2044                                   journal->block_size - tag_tbl_size);
2045                         jbd_meta_csum_set(journal->jbd_fs, bhdr);
2046                         bhdr = NULL;
2047                         desc_iblock = 0;
2048                         header = NULL;
2049                         rc = jbd_block_set(journal->jbd_fs, &desc_block);
2050                         if (rc != EOK)
2051                                 break;
2052
2053                         goto again;
2054                 }
2055                 if (record_len == 8) {
2056                         uint64_t *blocks =
2057                                 (uint64_t *)blocks_entry;
2058                         *blocks = to_be64(rec->lba);
2059                 } else {
2060                         uint32_t *blocks =
2061                                 (uint32_t *)blocks_entry;
2062                         *blocks = to_be32((uint32_t)rec->lba);
2063                 }
2064                 blocks_entry += record_len;
2065                 tag_tbl_size -= record_len;
2066
2067                 i++;
2068         }
2069         if (rc == EOK && desc_iblock) {
2070                 if (header != NULL)
2071                         jbd_set32(header, count,
2072                                   journal->block_size - tag_tbl_size);
2073
2074                 jbd_meta_csum_set(journal->jbd_fs, bhdr);
2075                 rc = jbd_block_set(journal->jbd_fs, &desc_block);
2076         }
2077
2078         return rc;
2079 }
2080
2081 /**@brief  Put references of block descriptors in a transaction.
2082  * @param  journal current journal session
2083  * @param  trans transaction*/
2084 void jbd_journal_cp_trans(struct jbd_journal *journal, struct jbd_trans *trans)
2085 {
2086         struct jbd_buf *jbd_buf, *tmp;
2087         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
2088         TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node,
2089                         tmp) {
2090                 struct ext4_block block = jbd_buf->block;
2091                 ext4_block_set(fs->bdev, &block);
2092         }
2093 }
2094
2095 /**@brief  Update the start block of the journal when
2096  *         all the contents in a transaction reach the disk.*/
2097 static void jbd_trans_end_write(struct ext4_bcache *bc __unused,
2098                           struct ext4_buf *buf,
2099                           int res,
2100                           void *arg)
2101 {
2102         struct jbd_buf *jbd_buf = arg;
2103         struct jbd_trans *trans = jbd_buf->trans;
2104         struct jbd_block_rec *block_rec = jbd_buf->block_rec;
2105         struct jbd_journal *journal = trans->journal;
2106         bool first_in_queue =
2107                 trans == TAILQ_FIRST(&journal->cp_queue);
2108         if (res != EOK)
2109                 trans->error = res;
2110
2111         TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
2112         TAILQ_REMOVE(&block_rec->dirty_buf_queue,
2113                         jbd_buf,
2114                         dirty_buf_node);
2115
2116         jbd_trans_finish_callback(journal,
2117                         trans,
2118                         jbd_buf->block_rec,
2119                         false,
2120                         false);
2121         if (block_rec->trans == trans && buf) {
2122                 /* Clear the end_write and end_write_arg fields. */
2123                 buf->end_write = NULL;
2124                 buf->end_write_arg = NULL;
2125         }
2126
2127         ext4_free(jbd_buf);
2128
2129         trans->written_cnt++;
2130         if (trans->written_cnt == trans->data_cnt) {
2131                 /* If it is the first transaction on checkpoint queue,
2132                  * we will shift the start of the journal to the next
2133                  * transaction, and remove subsequent written
2134                  * transactions from checkpoint queue until we find
2135                  * an unwritten one. */
2136                 if (first_in_queue) {
2137                         journal->start = trans->start_iblock +
2138                                 trans->alloc_blocks;
2139                         wrap(&journal->jbd_fs->sb, journal->start);
2140                         journal->trans_id = trans->trans_id + 1;
2141                         TAILQ_REMOVE(&journal->cp_queue, trans, trans_node);
2142                         jbd_journal_free_trans(journal, trans, false);
2143
2144                         jbd_journal_purge_cp_trans(journal, false, true);
2145                         jbd_journal_write_sb(journal);
2146                         jbd_write_sb(journal->jbd_fs);
2147                 }
2148         }
2149 }
2150
2151 /**@brief  Commit a transaction to the journal immediately.
2152  * @param  journal current journal session
2153  * @param  trans transaction
2154  * @return standard error code*/
2155 static int __jbd_journal_commit_trans(struct jbd_journal *journal,
2156                                       struct jbd_trans *trans)
2157 {
2158         int rc = EOK;
2159         uint32_t last = journal->last;
2160         struct jbd_revoke_rec *rec, *tmp;
2161
2162         trans->trans_id = journal->alloc_trans_id;
2163         rc = jbd_journal_prepare(journal, trans);
2164         if (rc != EOK)
2165                 goto Finish;
2166
2167         rc = jbd_journal_prepare_revoke(journal, trans);
2168         if (rc != EOK)
2169                 goto Finish;
2170
2171         if (TAILQ_EMPTY(&trans->buf_queue) &&
2172             RB_EMPTY(&trans->revoke_root)) {
2173                 /* Since there are no entries in both buffer list
2174                  * and revoke entry list, we do not consider trans as
2175                  * complete transaction and just return EOK.*/
2176                 jbd_journal_free_trans(journal, trans, false);
2177                 goto Finish;
2178         }
2179
2180         rc = jbd_trans_write_commit_block(trans);
2181         if (rc != EOK)
2182                 goto Finish;
2183
2184         journal->alloc_trans_id++;
2185
2186         /* Complete the checkpoint of buffers which are revoked. */
2187         RB_FOREACH_SAFE(rec, jbd_revoke_tree, &trans->revoke_root,
2188                         tmp) {
2189                 struct jbd_block_rec *block_rec =
2190                         jbd_trans_block_rec_lookup(journal, rec->lba);
2191                 struct jbd_buf *jbd_buf = NULL;
2192                 if (block_rec)
2193                         jbd_buf = TAILQ_LAST(&block_rec->dirty_buf_queue,
2194                                         jbd_buf_dirty);
2195                 if (jbd_buf) {
2196                         struct ext4_buf *buf;
2197                         struct ext4_block block = EXT4_BLOCK_ZERO();
2198                         /*
2199                          * We do this to reset the ext4_buf::end_write and
2200                          * ext4_buf::end_write_arg fields so that the checkpoint
2201                          * callback won't be triggered again.
2202                          */
2203                         buf = ext4_bcache_find_get(journal->jbd_fs->bdev->bc,
2204                                         &block,
2205                                         jbd_buf->block_rec->lba);
2206                         jbd_trans_end_write(journal->jbd_fs->bdev->bc,
2207                                         buf,
2208                                         EOK,
2209                                         jbd_buf);
2210                         if (buf)
2211                                 ext4_block_set(journal->jbd_fs->bdev, &block);
2212                 }
2213         }
2214
2215         if (TAILQ_EMPTY(&journal->cp_queue)) {
2216                 /*
2217                  * This transaction is going to be the first object in the
2218                  * checkpoint queue.
2219                  * When the first transaction in checkpoint queue is completely
2220                  * written to disk, we shift the tail of the log to right.
2221                  */
2222                 if (trans->data_cnt) {
2223                         journal->start = trans->start_iblock;
2224                         wrap(&journal->jbd_fs->sb, journal->start);
2225                         journal->trans_id = trans->trans_id;
2226                         jbd_journal_write_sb(journal);
2227                         jbd_write_sb(journal->jbd_fs);
2228                         TAILQ_INSERT_TAIL(&journal->cp_queue, trans,
2229                                         trans_node);
2230                         jbd_journal_cp_trans(journal, trans);
2231                 } else {
2232                         journal->start = trans->start_iblock +
2233                                 trans->alloc_blocks;
2234                         wrap(&journal->jbd_fs->sb, journal->start);
2235                         journal->trans_id = trans->trans_id + 1;
2236                         jbd_journal_write_sb(journal);
2237                         jbd_journal_free_trans(journal, trans, false);
2238                 }
2239         } else {
2240                 /* No need to do anything to the JBD superblock. */
2241                 TAILQ_INSERT_TAIL(&journal->cp_queue, trans,
2242                                 trans_node);
2243                 if (trans->data_cnt)
2244                         jbd_journal_cp_trans(journal, trans);
2245         }
2246 Finish:
2247         if (rc != EOK && rc != ENOSPC) {
2248                 journal->last = last;
2249                 jbd_journal_free_trans(journal, trans, true);
2250         }
2251         return rc;
2252 }
2253
2254 /**@brief  Allocate a new transaction
2255  * @param  journal current journal session
2256  * @return transaction allocated*/
2257 struct jbd_trans *
2258 jbd_journal_new_trans(struct jbd_journal *journal)
2259 {
2260         struct jbd_trans *trans = NULL;
2261         trans = ext4_calloc(1, sizeof(struct jbd_trans));
2262         if (!trans)
2263                 return NULL;
2264
2265         /* We will assign a trans_id to this transaction,
2266          * once it has been committed.*/
2267         trans->journal = journal;
2268         trans->data_csum = EXT4_CRC32_INIT;
2269         trans->error = EOK;
2270         TAILQ_INIT(&trans->buf_queue);
2271         return trans;
2272 }
2273
/**@brief  Commit a transaction to the journal immediately.
 *         Public entry point; forwards to __jbd_journal_commit_trans().
 * @param  journal current journal session
 * @param  trans transaction
 * @return standard error code*/
int jbd_journal_commit_trans(struct jbd_journal *journal,
			     struct jbd_trans *trans)
{
	return __jbd_journal_commit_trans(journal, trans);
}
2285
2286 /**
2287  * @}
2288  */