ext4_journal: modify the first 4 bytes of the logged block when needed
[lwext4.git] / src / ext4_journal.c
1 /*
2  * Copyright (c) 2015 Grzegorz Kostka (kostka.grzegorz@gmail.com)
3  * Copyright (c) 2015 Kaho Ng (ngkaho1234@gmail.com)
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * - Redistributions of source code must retain the above copyright
11  *   notice, this list of conditions and the following disclaimer.
12  * - Redistributions in binary form must reproduce the above copyright
13  *   notice, this list of conditions and the following disclaimer in the
14  *   documentation and/or other materials provided with the distribution.
15  * - The name of the author may not be used to endorse or promote products
16  *   derived from this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29
30 /** @addtogroup lwext4
31  * @{
32  */
33 /**
34  * @file  ext4_journal.c
35  * @brief Journal handle functions
36  */
37
38 #include "ext4_config.h"
39 #include "ext4_types.h"
40 #include "ext4_misc.h"
41 #include "ext4_errno.h"
42 #include "ext4_debug.h"
43
44 #include "ext4_fs.h"
45 #include "ext4_super.h"
46 #include "ext4_journal.h"
47 #include "ext4_blockdev.h"
48 #include "ext4_crc32.h"
49 #include "ext4_journal.h"
50
51 #include <string.h>
52 #include <stdlib.h>
53
/**@brief  Revoke entry during journal replay.
 *         Entries are nodes of the jbd_revoke red-black tree and are
 *         ordered by @block (see jbd_revoke_entry_cmp()).*/
struct revoke_entry {
        /**@brief  Block number not to be replayed.*/
        ext4_fsblk_t block;

        /**@brief  For any transaction id smaller
         *         than trans_id, records of @block
         *         in those transactions should not
         *         be replayed.*/
        uint32_t trans_id;

        /**@brief  Revoke tree node (linkage inside the jbd_revoke tree).*/
        RB_ENTRY(revoke_entry) revoke_node;
};
68
/**@brief  Valid journal replay information.*/
struct recover_info {
        /**@brief  Starting transaction id.*/
        uint32_t start_trans_id;

        /**@brief  Ending transaction id.*/
        uint32_t last_trans_id;

        /**@brief  Used as internal argument.*/
        uint32_t this_trans_id;

        /**@brief  Number of transactions gone through.*/
        uint32_t trans_cnt;

        /**@brief  RB-Tree storing revoke entries
         *         (struct revoke_entry, keyed by block number).*/
        RB_HEAD(jbd_revoke, revoke_entry) revoke_root;
};
86
/**@brief  Journal replay internal arguments.
 *         Passed as the opaque argument to the block tag callbacks.*/
struct replay_arg {
        /**@brief  Journal replay information.*/
        struct recover_info *info;

        /**@brief  Pointer to the index of the block we are currently on.
         *         The tag callbacks advance it (with wrap-around) for
         *         every tag they visit.*/
        uint32_t *this_block;

        /**@brief  Current trans_id we are on.*/
        uint32_t this_trans_id;
};
98
/* Make sure we wrap around the log correctly!
 *
 * If @var has reached (or passed) the `maxlen` field of the jbd superblock
 * @sb, bring it back into range by subtracting the size of the log area,
 * i.e. (maxlen - first).
 *
 * NOTE: @var is evaluated more than once, so it must be an lvalue
 *       expression without side effects (e.g. `*iblock`). */
#define wrap(sb, var)                                                   \
do {                                                                    \
        if ((var) >= jbd_get32((sb), maxlen))                           \
                (var) -= (jbd_get32((sb), maxlen) -                     \
                          jbd_get32((sb), first));                      \
} while (0)
105
/**@brief  Signed distance between two transaction ids.
 * @param  x first transaction id
 * @param  y second transaction id
 * @return x - y, computed modulo 2^32 and reinterpreted as a signed
 *         32-bit value, so that ids which have wrapped around still
 *         compare sensibly.*/
static inline int32_t
trans_id_diff(uint32_t x, uint32_t y)
{
        return (int32_t)(x - y);
}
112
113 static int
114 jbd_revoke_entry_cmp(struct revoke_entry *a, struct revoke_entry *b)
115 {
116         if (a->block > b->block)
117                 return 1;
118         else if (a->block < b->block)
119                 return -1;
120         return 0;
121 }
122
123 static int
124 jbd_block_rec_cmp(struct jbd_block_rec *a, struct jbd_block_rec *b)
125 {
126         if (a->lba > b->lba)
127                 return 1;
128         else if (a->lba < b->lba)
129                 return -1;
130         return 0;
131 }
132
133 static int
134 jbd_revoke_rec_cmp(struct jbd_revoke_rec *a, struct jbd_revoke_rec *b)
135 {
136         if (a->lba > b->lba)
137                 return 1;
138         else if (a->lba < b->lba)
139                 return -1;
140         return 0;
141 }
142
/* Instantiate the red-black tree routines (as `static inline` functions)
 * for the three trees used in this file. Each tree is ordered by the
 * comparison function given as the fourth argument:
 *  - jbd_revoke:      struct revoke_entry,   ordered by block
 *  - jbd_block:       struct jbd_block_rec,  ordered by lba
 *  - jbd_revoke_tree: struct jbd_revoke_rec, ordered by lba */
RB_GENERATE_INTERNAL(jbd_revoke, revoke_entry, revoke_node,
                     jbd_revoke_entry_cmp, static inline)
RB_GENERATE_INTERNAL(jbd_block, jbd_block_rec, block_rec_node,
                     jbd_block_rec_cmp, static inline)
RB_GENERATE_INTERNAL(jbd_revoke_tree, jbd_revoke_rec, revoke_node,
                     jbd_revoke_rec_cmp, static inline)
149
/* Allocation helpers for struct revoke_entry.
 * jbd_alloc_revoke_entry() requests one entry through ext4_calloc();
 * jbd_free_revoke_entry() hands it back through ext4_free(). */
#define jbd_alloc_revoke_entry() ext4_calloc(1, sizeof(struct revoke_entry))
#define jbd_free_revoke_entry(addr) ext4_free(addr)
152
153 static int jbd_has_csum(struct jbd_sb *jbd_sb)
154 {
155         if (JBD_HAS_INCOMPAT_FEATURE(jbd_sb, JBD_FEATURE_INCOMPAT_CSUM_V2))
156                 return 2;
157
158         if (JBD_HAS_INCOMPAT_FEATURE(jbd_sb, JBD_FEATURE_INCOMPAT_CSUM_V3))
159                 return 3;
160
161         return 0;
162 }
163
#if CONFIG_META_CSUM_ENABLE
/**@brief  Compute the crc32c checksum of the jbd superblock.
 *         The checksum field is zeroed while the checksum is computed
 *         and restored to its previous content afterwards.
 * @param  jbd_sb jbd superblock
 * @return computed checksum, or 0 when the journal has no metadata
 *         checksum feature enabled.*/
static uint32_t jbd_sb_csum(struct jbd_sb *jbd_sb)
{
        uint32_t saved;
        uint32_t crc;

        if (!jbd_has_csum(jbd_sb))
                return 0;

        saved = jbd_sb->checksum;
        jbd_set32(jbd_sb, checksum, 0);
        /* Calculate crc32c checksum against the whole superblock */
        crc = ext4_crc32c(EXT4_CRC32_INIT, jbd_sb, JBD_SUPERBLOCK_SIZE);
        jbd_sb->checksum = saved;

        return crc;
}
#else
#define jbd_sb_csum(...) 0
#endif
182
183 static void jbd_sb_csum_set(struct jbd_sb *jbd_sb)
184 {
185         if (!jbd_has_csum(jbd_sb))
186                 return;
187
188         jbd_set32(jbd_sb, checksum, jbd_sb_csum(jbd_sb));
189 }
190
#if CONFIG_META_CSUM_ENABLE
/**@brief  Check the checksum stored in the jbd superblock.
 * @param  jbd_sb jbd superblock
 * @return true when the journal has no metadata checksum feature, or
 *         when the stored checksum matches the computed one.*/
static bool
jbd_verify_sb_csum(struct jbd_sb *jbd_sb)
{
        uint32_t stored;
        uint32_t computed;

        if (!jbd_has_csum(jbd_sb))
                return true;

        stored = jbd_get32(jbd_sb, checksum);
        computed = jbd_sb_csum(jbd_sb);
        return computed == stored;
}
#else
#define jbd_verify_sb_csum(...) true
#endif
203
#if CONFIG_META_CSUM_ENABLE
/**@brief  Compute the crc32c checksum of a journal metadata block.
 *         The checksum covers the journal UUID followed by the whole
 *         block; the checksum field of the block tail is zeroed during
 *         the computation and restored afterwards.
 * @param  jbd_fs jbd filesystem
 * @param  bhdr start of the block (one journal block in size)
 * @return computed checksum, or 0 when the journal has no metadata
 *         checksum feature enabled.*/
static uint32_t jbd_meta_csum(struct jbd_fs *jbd_fs,
                              struct jbd_bhdr *bhdr)
{
        uint32_t blk_size;
        uint32_t saved;
        uint32_t crc;
        struct jbd_block_tail *tail;

        if (!jbd_has_csum(&jbd_fs->sb))
                return 0;

        blk_size = jbd_get32(&jbd_fs->sb, blocksize);
        /* The tail occupies the last bytes of the block. */
        tail = (struct jbd_block_tail *)
                ((char *)bhdr + blk_size - sizeof(struct jbd_block_tail));

        saved = tail->checksum;
        tail->checksum = 0;

        /* First calculate crc32c checksum against fs uuid */
        crc = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid,
                          sizeof(jbd_fs->sb.uuid));
        /* Calculate crc32c checksum against the whole block */
        crc = ext4_crc32c(crc, bhdr, blk_size);

        tail->checksum = saved;
        return crc;
}
#else
#define jbd_meta_csum(...) 0
#endif
231
232 static void jbd_meta_csum_set(struct jbd_fs *jbd_fs,
233                               struct jbd_bhdr *bhdr)
234 {
235         uint32_t block_size = jbd_get32(&jbd_fs->sb, blocksize);
236         struct jbd_block_tail *tail = (struct jbd_block_tail *)
237                                 ((char *)bhdr + block_size -
238                                 sizeof(struct jbd_block_tail));
239         if (!jbd_has_csum(&jbd_fs->sb))
240                 return;
241
242         tail->checksum = to_be32(jbd_meta_csum(jbd_fs, bhdr));
243 }
244
#if CONFIG_META_CSUM_ENABLE
/**@brief  Check the checksum stored in the tail of a journal metadata
 *         block.
 * @param  jbd_fs jbd filesystem
 * @param  bhdr start of the block (one journal block in size)
 * @return true when the journal has no metadata checksum feature, or
 *         when the stored checksum matches the computed one.*/
static bool
jbd_verify_meta_csum(struct jbd_fs *jbd_fs,
                     struct jbd_bhdr *bhdr)
{
        uint32_t blk_size;
        uint32_t stored;
        struct jbd_block_tail *tail;

        if (!jbd_has_csum(&jbd_fs->sb))
                return true;

        blk_size = jbd_get32(&jbd_fs->sb, blocksize);
        /* The tail occupies the last bytes of the block. */
        tail = (struct jbd_block_tail *)
                ((char *)bhdr + blk_size - sizeof(struct jbd_block_tail));

        /* Convert the stored (big-endian) value before comparing. */
        stored = to_be32(tail->checksum);
        return jbd_meta_csum(jbd_fs, bhdr) == stored;
}
#else
#define jbd_verify_meta_csum(...) true
#endif
262
#if CONFIG_META_CSUM_ENABLE
/**@brief  Compute the crc32c checksum of a commit block.
 *         The checksum covers the journal UUID followed by the whole
 *         block; chksum_type, chksum_size and chksum[0] are zeroed
 *         during the computation and restored afterwards.
 * @param  jbd_fs jbd filesystem
 * @param  header start of the commit block (one journal block in size)
 * @return computed checksum, or 0 when the journal has no metadata
 *         checksum feature enabled.*/
static uint32_t jbd_commit_csum(struct jbd_fs *jbd_fs,
                              struct jbd_commit_header *header)
{
        uint32_t saved_type;
        uint32_t saved_size;
        uint32_t saved_csum;
        uint32_t blk_size;
        uint32_t crc;

        if (!jbd_has_csum(&jbd_fs->sb))
                return 0;

        /* Remember the fields that must not take part in the checksum. */
        saved_type = header->chksum_type;
        saved_size = header->chksum_size;
        saved_csum = header->chksum[0];
        blk_size = jbd_get32(&jbd_fs->sb, blocksize);

        header->chksum_type = 0;
        header->chksum_size = 0;
        header->chksum[0] = 0;

        /* First calculate crc32c checksum against fs uuid */
        crc = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid,
                          sizeof(jbd_fs->sb.uuid));
        /* Calculate crc32c checksum against the whole block */
        crc = ext4_crc32c(crc, header, blk_size);

        header->chksum_type = saved_type;
        header->chksum_size = saved_size;
        header->chksum[0] = saved_csum;

        return crc;
}
#else
#define jbd_commit_csum(...) 0
#endif
294
295 static void jbd_commit_csum_set(struct jbd_fs *jbd_fs,
296                               struct jbd_commit_header *header)
297 {
298         if (!jbd_has_csum(&jbd_fs->sb))
299                 return;
300
301         header->chksum_type = 0;
302         header->chksum_size = 0;
303         header->chksum[0] = jbd_commit_csum(jbd_fs, header);
304 }
305
#if CONFIG_META_CSUM_ENABLE
/**@brief  Check the checksum stored in a commit block.
 * @param  jbd_fs jbd filesystem
 * @param  header start of the commit block (one journal block in size)
 * @return true when the journal has no metadata checksum feature, or
 *         when chksum[0] equals the big-endian form of the computed
 *         checksum.*/
static bool jbd_verify_commit_csum(struct jbd_fs *jbd_fs,
                                   struct jbd_commit_header *header)
{
        uint32_t expected;

        if (!jbd_has_csum(&jbd_fs->sb))
                return true;

        expected = to_be32(jbd_commit_csum(jbd_fs, header));
        return header->chksum[0] == expected;
}
#else
#define jbd_verify_commit_csum(...) true
#endif
319
#if CONFIG_META_CSUM_ENABLE
/**@brief  Compute the checksum of a logged block.
 *         With a metadata checksum feature (V2/V3) enabled, this is the
 *         crc32c of the journal UUID, @sequence and the block content.
 *         Otherwise, if the second feature test below succeeds, it is
 *         the crc32 of the block content seeded with @csum.
 * @param  jbd_fs jbd filesystem
 * @param  buf block content (one journal block in size)
 * @param  csum seed of the crc32 computation.
 *         NOTE: only used in the JBD_FEATURE_COMPAT_CHECKSUM case.
 * @param  sequence sequence number mixed into the crc32c checksum; its
 *         bytes are hashed exactly as they are laid out in memory
 * @return computed checksum, or 0 when neither case applies.*/
static uint32_t jbd_block_csum(struct jbd_fs *jbd_fs, const void *buf,
                               uint32_t csum,
                               uint32_t sequence)
{
        uint32_t blk_size;
        uint32_t crc;

        if (jbd_has_csum(&jbd_fs->sb)) {
                blk_size = jbd_get32(&jbd_fs->sb, blocksize);
                /* First calculate crc32c checksum against fs uuid */
                crc = ext4_crc32c(EXT4_CRC32_INIT, jbd_fs->sb.uuid,
                                  sizeof(jbd_fs->sb.uuid));
                /* Then calculate crc32c checksum against sequence no. */
                crc = ext4_crc32c(crc, &sequence, sizeof(uint32_t));
                /* Calculate crc32c checksum against the whole block */
                return ext4_crc32c(crc, buf, blk_size);
        }

        /* NOTE(review): a COMPAT feature bit is tested with the INCOMPAT
         * feature macro here; this looks unintended - confirm against
         * the feature macros declared in the journal headers. */
        if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
                                     JBD_FEATURE_COMPAT_CHECKSUM)) {
                blk_size = jbd_get32(&jbd_fs->sb, blocksize);
                /* Calculate crc32 checksum against the whole block */
                return ext4_crc32(csum, buf, blk_size);
        }

        return 0;
}
#else
#define jbd_block_csum(...) 0
#endif
354
355 static void jbd_block_tag_csum_set(struct jbd_fs *jbd_fs, void *__tag,
356                                    uint32_t checksum)
357 {
358         int ver = jbd_has_csum(&jbd_fs->sb);
359         if (!ver)
360                 return;
361
362         if (ver == 2) {
363                 struct jbd_block_tag *tag = __tag;
364                 tag->checksum = (uint16_t)to_be32(checksum);
365         } else {
366                 struct jbd_block_tag3 *tag = __tag;
367                 tag->checksum = to_be32(checksum);
368         }
369 }
370
371 /**@brief  Write jbd superblock to disk.
372  * @param  jbd_fs jbd filesystem
373  * @param  s jbd superblock
374  * @return standard error code*/
375 static int jbd_sb_write(struct jbd_fs *jbd_fs, struct jbd_sb *s)
376 {
377         int rc;
378         struct ext4_fs *fs = jbd_fs->inode_ref.fs;
379         uint64_t offset;
380         ext4_fsblk_t fblock;
381         rc = jbd_inode_bmap(jbd_fs, 0, &fblock);
382         if (rc != EOK)
383                 return rc;
384
385         jbd_sb_csum_set(s);
386         offset = fblock * ext4_sb_get_block_size(&fs->sb);
387         return ext4_block_writebytes(fs->bdev, offset, s,
388                                      EXT4_SUPERBLOCK_SIZE);
389 }
390
391 /**@brief  Read jbd superblock from disk.
392  * @param  jbd_fs jbd filesystem
393  * @param  s jbd superblock
394  * @return standard error code*/
395 static int jbd_sb_read(struct jbd_fs *jbd_fs, struct jbd_sb *s)
396 {
397         int rc;
398         struct ext4_fs *fs = jbd_fs->inode_ref.fs;
399         uint64_t offset;
400         ext4_fsblk_t fblock;
401         rc = jbd_inode_bmap(jbd_fs, 0, &fblock);
402         if (rc != EOK)
403                 return rc;
404
405         offset = fblock * ext4_sb_get_block_size(&fs->sb);
406         return ext4_block_readbytes(fs->bdev, offset, s,
407                                     EXT4_SUPERBLOCK_SIZE);
408 }
409
410 /**@brief  Verify jbd superblock.
411  * @param  sb jbd superblock
412  * @return true if jbd superblock is valid */
413 static bool jbd_verify_sb(struct jbd_sb *sb)
414 {
415         struct jbd_bhdr *header = &sb->header;
416         if (jbd_get32(header, magic) != JBD_MAGIC_NUMBER)
417                 return false;
418
419         if (jbd_get32(header, blocktype) != JBD_SUPERBLOCK &&
420             jbd_get32(header, blocktype) != JBD_SUPERBLOCK_V2)
421                 return false;
422
423         return jbd_verify_sb_csum(sb);
424 }
425
426 /**@brief  Write back dirty jbd superblock to disk.
427  * @param  jbd_fs jbd filesystem
428  * @return standard error code*/
429 static int jbd_write_sb(struct jbd_fs *jbd_fs)
430 {
431         int rc = EOK;
432         if (jbd_fs->dirty) {
433                 rc = jbd_sb_write(jbd_fs, &jbd_fs->sb);
434                 if (rc != EOK)
435                         return rc;
436
437                 jbd_fs->dirty = false;
438         }
439         return rc;
440 }
441
442 /**@brief  Get reference to jbd filesystem.
443  * @param  fs Filesystem to load journal of
444  * @param  jbd_fs jbd filesystem
445  * @return standard error code*/
446 int jbd_get_fs(struct ext4_fs *fs,
447                struct jbd_fs *jbd_fs)
448 {
449         int rc;
450         uint32_t journal_ino;
451
452         memset(jbd_fs, 0, sizeof(struct jbd_fs));
453         /* See if there is journal inode on this filesystem.*/
454         /* FIXME: detection on existance ofbkejournal bdev is
455          *        missing.*/
456         journal_ino = ext4_get32(&fs->sb, journal_inode_number);
457
458         rc = ext4_fs_get_inode_ref(fs,
459                                    journal_ino,
460                                    &jbd_fs->inode_ref);
461         if (rc != EOK) {
462                 memset(jbd_fs, 0, sizeof(struct jbd_fs));
463                 return rc;
464         }
465         rc = jbd_sb_read(jbd_fs, &jbd_fs->sb);
466         if (rc != EOK) {
467                 memset(jbd_fs, 0, sizeof(struct jbd_fs));
468                 ext4_fs_put_inode_ref(&jbd_fs->inode_ref);
469                 return rc;
470         }
471         if (!jbd_verify_sb(&jbd_fs->sb)) {
472                 memset(jbd_fs, 0, sizeof(struct jbd_fs));
473                 ext4_fs_put_inode_ref(&jbd_fs->inode_ref);
474                 rc = EIO;
475         }
476
477         if (rc == EOK)
478                 jbd_fs->bdev = fs->bdev;
479
480         return rc;
481 }
482
483 /**@brief  Put reference of jbd filesystem.
484  * @param  jbd_fs jbd filesystem
485  * @return standard error code*/
486 int jbd_put_fs(struct jbd_fs *jbd_fs)
487 {
488         int rc = EOK;
489         rc = jbd_write_sb(jbd_fs);
490
491         ext4_fs_put_inode_ref(&jbd_fs->inode_ref);
492         return rc;
493 }
494
495 /**@brief  Data block lookup helper.
496  * @param  jbd_fs jbd filesystem
497  * @param  iblock block index
498  * @param  fblock logical block address
499  * @return standard error code*/
500 int jbd_inode_bmap(struct jbd_fs *jbd_fs,
501                    ext4_lblk_t iblock,
502                    ext4_fsblk_t *fblock)
503 {
504         int rc = ext4_fs_get_inode_dblk_idx(
505                         &jbd_fs->inode_ref,
506                         iblock,
507                         fblock,
508                         false);
509         return rc;
510 }
511
512 /**@brief   jbd block get function (through cache).
513  * @param   jbd_fs jbd filesystem
514  * @param   block block descriptor
515  * @param   fblock jbd logical block address
516  * @return  standard error code*/
517 static int jbd_block_get(struct jbd_fs *jbd_fs,
518                   struct ext4_block *block,
519                   ext4_fsblk_t fblock)
520 {
521         /* TODO: journal device. */
522         int rc;
523         struct ext4_blockdev *bdev = jbd_fs->bdev;
524         ext4_lblk_t iblock = (ext4_lblk_t)fblock;
525
526         /* Lookup the logical block address of
527          * fblock.*/
528         rc = jbd_inode_bmap(jbd_fs, iblock,
529                             &fblock);
530         if (rc != EOK)
531                 return rc;
532
533         rc = ext4_block_get(bdev, block, fblock);
534
535         /* If succeeded, mark buffer as BC_FLUSH to indicate
536          * that data should be written to disk immediately.*/
537         if (rc == EOK) {
538                 ext4_bcache_set_flag(block->buf, BC_FLUSH);
539                 /* As we don't want to occupy too much space
540                  * in block cache, we set this buffer BC_TMP.*/
541                 ext4_bcache_set_flag(block->buf, BC_TMP);
542         }
543
544         return rc;
545 }
546
547 /**@brief   jbd block get function (through cache, don't read).
548  * @param   jbd_fs jbd filesystem
549  * @param   block block descriptor
550  * @param   fblock jbd logical block address
551  * @return  standard error code*/
552 static int jbd_block_get_noread(struct jbd_fs *jbd_fs,
553                          struct ext4_block *block,
554                          ext4_fsblk_t fblock)
555 {
556         /* TODO: journal device. */
557         int rc;
558         struct ext4_blockdev *bdev = jbd_fs->bdev;
559         ext4_lblk_t iblock = (ext4_lblk_t)fblock;
560         rc = jbd_inode_bmap(jbd_fs, iblock,
561                             &fblock);
562         if (rc != EOK)
563                 return rc;
564
565         rc = ext4_block_get_noread(bdev, block, fblock);
566         if (rc == EOK)
567                 ext4_bcache_set_flag(block->buf, BC_FLUSH);
568
569         return rc;
570 }
571
572 /**@brief   jbd block set procedure (through cache).
573  * @param   jbd_fs jbd filesystem
574  * @param   block block descriptor
575  * @return  standard error code*/
576 static int jbd_block_set(struct jbd_fs *jbd_fs,
577                   struct ext4_block *block)
578 {
579         struct ext4_blockdev *bdev = jbd_fs->bdev;
580         return ext4_block_set(bdev, block);
581 }
582
583 /**@brief  helper functions to calculate
584  *         block tag size, not including UUID part.
585  * @param  jbd_fs jbd filesystem
586  * @return tag size in bytes*/
587 static int jbd_tag_bytes(struct jbd_fs *jbd_fs)
588 {
589         int size;
590
591         /* It is very easy to deal with the case which
592          * JBD_FEATURE_INCOMPAT_CSUM_V3 is enabled.*/
593         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
594                                      JBD_FEATURE_INCOMPAT_CSUM_V3))
595                 return sizeof(struct jbd_block_tag3);
596
597         size = sizeof(struct jbd_block_tag);
598
599         /* If JBD_FEATURE_INCOMPAT_CSUM_V2 is enabled,
600          * add 2 bytes to size.*/
601         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
602                                      JBD_FEATURE_INCOMPAT_CSUM_V2))
603                 size += sizeof(uint16_t);
604
605         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
606                                      JBD_FEATURE_INCOMPAT_64BIT))
607                 return size;
608
609         /* If block number is 4 bytes in size,
610          * minus 4 bytes from size */
611         return size - sizeof(uint32_t);
612 }
613
/**@brief  Tag information.
 *         In-memory (host byte order) description of one block tag,
 *         filled by jbd_extract_block_tag() and consumed by
 *         jbd_write_block_tag().*/
struct tag_info {
        /**@brief  Tag size in bytes, including UUID part.*/
        int tag_bytes;

        /**@brief  block number stored in this tag.*/
        ext4_fsblk_t block;

        /**@brief  Are the first 4 bytes of the block equal to
         *         JBD_MAGIC_NUMBER? (JBD_FLAG_ESCAPE) */
        bool is_escape;

        /**@brief  whether UUID part exists or not
         *         (i.e. JBD_FLAG_SAME_UUID is NOT set).*/
        bool uuid_exist;

        /**@brief  UUID content if UUID part exists.*/
        uint8_t uuid[UUID_SIZE];

        /**@brief  Is this the last tag? (JBD_FLAG_LAST_TAG) */
        bool last_tag;

        /**@brief  crc32c checksum. */
        uint32_t checksum;
};
638
639 /**@brief  Extract information from a block tag.
640  * @param  __tag pointer to the block tag
641  * @param  tag_bytes block tag size of this jbd filesystem
642  * @param  remaining size in buffer containing the block tag
643  * @param  tag_info information of this tag.
644  * @return  EOK when succeed, otherwise return EINVAL.*/
645 static int
646 jbd_extract_block_tag(struct jbd_fs *jbd_fs,
647                       void *__tag,
648                       int tag_bytes,
649                       int32_t remain_buf_size,
650                       struct tag_info *tag_info)
651 {
652         char *uuid_start;
653         tag_info->tag_bytes = tag_bytes;
654         tag_info->uuid_exist = false;
655         tag_info->last_tag = false;
656         tag_info->is_escape = false;
657
658         /* See whether it is possible to hold a valid block tag.*/
659         if (remain_buf_size - tag_bytes < 0)
660                 return EINVAL;
661
662         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
663                                      JBD_FEATURE_INCOMPAT_CSUM_V3)) {
664                 struct jbd_block_tag3 *tag = __tag;
665                 tag_info->block = jbd_get32(tag, blocknr);
666                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
667                                              JBD_FEATURE_INCOMPAT_64BIT))
668                          tag_info->block |=
669                                  (uint64_t)jbd_get32(tag, blocknr_high) << 32;
670
671                 if (jbd_get32(tag, flags) & JBD_FLAG_ESCAPE)
672                         tag_info->is_escape = true;
673
674                 if (!(jbd_get32(tag, flags) & JBD_FLAG_SAME_UUID)) {
675                         /* See whether it is possible to hold UUID part.*/
676                         if (remain_buf_size - tag_bytes < UUID_SIZE)
677                                 return EINVAL;
678
679                         uuid_start = (char *)tag + tag_bytes;
680                         tag_info->uuid_exist = true;
681                         tag_info->tag_bytes += UUID_SIZE;
682                         memcpy(tag_info->uuid, uuid_start, UUID_SIZE);
683                 }
684
685                 if (jbd_get32(tag, flags) & JBD_FLAG_LAST_TAG)
686                         tag_info->last_tag = true;
687
688         } else {
689                 struct jbd_block_tag *tag = __tag;
690                 tag_info->block = jbd_get32(tag, blocknr);
691                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
692                                              JBD_FEATURE_INCOMPAT_64BIT))
693                          tag_info->block |=
694                                  (uint64_t)jbd_get32(tag, blocknr_high) << 32;
695
696                 if (jbd_get16(tag, flags) & JBD_FLAG_ESCAPE)
697                         tag_info->is_escape = true;
698
699                 if (!(jbd_get16(tag, flags) & JBD_FLAG_SAME_UUID)) {
700                         /* See whether it is possible to hold UUID part.*/
701                         if (remain_buf_size - tag_bytes < UUID_SIZE)
702                                 return EINVAL;
703
704                         uuid_start = (char *)tag + tag_bytes;
705                         tag_info->uuid_exist = true;
706                         tag_info->tag_bytes += UUID_SIZE;
707                         memcpy(tag_info->uuid, uuid_start, UUID_SIZE);
708                 }
709
710                 if (jbd_get16(tag, flags) & JBD_FLAG_LAST_TAG)
711                         tag_info->last_tag = true;
712
713         }
714         return EOK;
715 }
716
717 /**@brief  Write information to a block tag.
718  * @param  __tag pointer to the block tag
719  * @param  remaining size in buffer containing the block tag
720  * @param  tag_info information of this tag.
721  * @return  EOK when succeed, otherwise return EINVAL.*/
722 static int
723 jbd_write_block_tag(struct jbd_fs *jbd_fs,
724                     void *__tag,
725                     int32_t remain_buf_size,
726                     struct tag_info *tag_info)
727 {
728         char *uuid_start;
729         int tag_bytes = jbd_tag_bytes(jbd_fs);
730
731         tag_info->tag_bytes = tag_bytes;
732
733         /* See whether it is possible to hold a valid block tag.*/
734         if (remain_buf_size - tag_bytes < 0)
735                 return EINVAL;
736
737         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
738                                      JBD_FEATURE_INCOMPAT_CSUM_V3)) {
739                 struct jbd_block_tag3 *tag = __tag;
740                 memset(tag, 0, sizeof(struct jbd_block_tag3));
741                 jbd_set32(tag, blocknr, (uint32_t)tag_info->block);
742                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
743                                              JBD_FEATURE_INCOMPAT_64BIT))
744                         jbd_set32(tag, blocknr_high, tag_info->block >> 32);
745
746                 if (tag_info->uuid_exist) {
747                         /* See whether it is possible to hold UUID part.*/
748                         if (remain_buf_size - tag_bytes < UUID_SIZE)
749                                 return EINVAL;
750
751                         uuid_start = (char *)tag + tag_bytes;
752                         tag_info->tag_bytes += UUID_SIZE;
753                         memcpy(uuid_start, tag_info->uuid, UUID_SIZE);
754                 } else
755                         jbd_set32(tag, flags,
756                                   jbd_get32(tag, flags) | JBD_FLAG_SAME_UUID);
757
758                 jbd_block_tag_csum_set(jbd_fs, __tag, tag_info->checksum);
759
760                 if (tag_info->last_tag)
761                         jbd_set32(tag, flags,
762                                   jbd_get32(tag, flags) | JBD_FLAG_LAST_TAG);
763
764                 if (tag_info->is_escape)
765                         jbd_set32(tag, flags,
766                                   jbd_get32(tag, flags) | JBD_FLAG_ESCAPE);
767
768         } else {
769                 struct jbd_block_tag *tag = __tag;
770                 memset(tag, 0, sizeof(struct jbd_block_tag));
771                 jbd_set32(tag, blocknr, (uint32_t)tag_info->block);
772                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
773                                              JBD_FEATURE_INCOMPAT_64BIT))
774                         jbd_set32(tag, blocknr_high, tag_info->block >> 32);
775
776                 if (tag_info->uuid_exist) {
777                         /* See whether it is possible to hold UUID part.*/
778                         if (remain_buf_size - tag_bytes < UUID_SIZE)
779                                 return EINVAL;
780
781                         uuid_start = (char *)tag + tag_bytes;
782                         tag_info->tag_bytes += UUID_SIZE;
783                         memcpy(uuid_start, tag_info->uuid, UUID_SIZE);
784                 } else
785                         jbd_set16(tag, flags,
786                                   jbd_get16(tag, flags) | JBD_FLAG_SAME_UUID);
787
788                 jbd_block_tag_csum_set(jbd_fs, __tag, tag_info->checksum);
789
790                 if (tag_info->last_tag)
791                         jbd_set16(tag, flags,
792                                   jbd_get16(tag, flags) | JBD_FLAG_LAST_TAG);
793
794
795                 if (tag_info->is_escape)
796                         jbd_set16(tag, flags,
797                                   jbd_get16(tag, flags) | JBD_FLAG_ESCAPE);
798
799         }
800         return EOK;
801 }
802
803 /**@brief  Iterate all block tags in a block.
804  * @param  jbd_fs jbd filesystem
805  * @param  __tag_start pointer to the block
806  * @param  tag_tbl_size size of the block
807  * @param  func callback routine to indicate that
808  *         a block tag is found
809  * @param  arg additional argument to be passed to func */
810 static void
811 jbd_iterate_block_table(struct jbd_fs *jbd_fs,
812                         void *__tag_start,
813                         int32_t tag_tbl_size,
814                         void (*func)(struct jbd_fs * jbd_fs,
815                                      struct tag_info *tag_info,
816                                      void *arg),
817                         void *arg)
818 {
819         char *tag_start, *tag_ptr;
820         int tag_bytes = jbd_tag_bytes(jbd_fs);
821         tag_start = __tag_start;
822         tag_ptr = tag_start;
823
824         /* Cut off the size of block tail storing checksum. */
825         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
826                                      JBD_FEATURE_INCOMPAT_CSUM_V2) ||
827             JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
828                                      JBD_FEATURE_INCOMPAT_CSUM_V3))
829                 tag_tbl_size -= sizeof(struct jbd_block_tail);
830
831         while (tag_tbl_size) {
832                 struct tag_info tag_info;
833                 int rc = jbd_extract_block_tag(jbd_fs,
834                                       tag_ptr,
835                                       tag_bytes,
836                                       tag_tbl_size,
837                                       &tag_info);
838                 if (rc != EOK)
839                         break;
840
841                 if (func)
842                         func(jbd_fs, &tag_info, arg);
843
844                 /* Stop the iteration when we reach the last tag. */
845                 if (tag_info.last_tag)
846                         break;
847
848                 tag_ptr += tag_info.tag_bytes;
849                 tag_tbl_size -= tag_info.tag_bytes;
850         }
851 }
852
853 static void jbd_display_block_tags(struct jbd_fs *jbd_fs,
854                                    struct tag_info *tag_info,
855                                    void *arg)
856 {
857         uint32_t *iblock = arg;
858         ext4_dbg(DEBUG_JBD, "Block in block_tag: %" PRIu64 "\n", tag_info->block);
859         (*iblock)++;
860         wrap(&jbd_fs->sb, *iblock);
861         (void)jbd_fs;
862         return;
863 }
864
865 static struct revoke_entry *
866 jbd_revoke_entry_lookup(struct recover_info *info, ext4_fsblk_t block)
867 {
868         struct revoke_entry tmp = {
869                 .block = block
870         };
871
872         return RB_FIND(jbd_revoke, &info->revoke_root, &tmp);
873 }
874
875 /**@brief  Replay a block in a transaction.
876  * @param  jbd_fs jbd filesystem
877  * @param  tag_info tag_info of the logged block.*/
878 static void jbd_replay_block_tags(struct jbd_fs *jbd_fs,
879                                   struct tag_info *tag_info,
880                                   void *__arg)
881 {
882         int r;
883         struct replay_arg *arg = __arg;
884         struct recover_info *info = arg->info;
885         uint32_t *this_block = arg->this_block;
886         struct revoke_entry *revoke_entry;
887         struct ext4_block journal_block, ext4_block;
888         struct ext4_fs *fs = jbd_fs->inode_ref.fs;
889
890         (*this_block)++;
891         wrap(&jbd_fs->sb, *this_block);
892
893         /* We replay this block only if the current transaction id
894          * is equal or greater than that in revoke entry.*/
895         revoke_entry = jbd_revoke_entry_lookup(info, tag_info->block);
896         if (revoke_entry &&
897             trans_id_diff(arg->this_trans_id, revoke_entry->trans_id) <= 0)
898                 return;
899
900         ext4_dbg(DEBUG_JBD,
901                  "Replaying block in block_tag: %" PRIu64 "\n",
902                  tag_info->block);
903
904         r = jbd_block_get(jbd_fs, &journal_block, *this_block);
905         if (r != EOK)
906                 return;
907
908         /* We need special treatment for ext4 superblock. */
909         if (tag_info->block) {
910                 r = ext4_block_get_noread(fs->bdev, &ext4_block, tag_info->block);
911                 if (r != EOK) {
912                         jbd_block_set(jbd_fs, &journal_block);
913                         return;
914                 }
915
916                 memcpy(ext4_block.data,
917                         journal_block.data,
918                         jbd_get32(&jbd_fs->sb, blocksize));
919
920                 if (tag_info->is_escape)
921                         ((struct jbd_bhdr *)ext4_block.data)->magic =
922                                         to_be32(JBD_MAGIC_NUMBER);
923
924                 ext4_bcache_set_dirty(ext4_block.buf);
925                 ext4_block_set(fs->bdev, &ext4_block);
926         } else {
927                 uint16_t mount_count, state;
928                 mount_count = ext4_get16(&fs->sb, mount_count);
929                 state = ext4_get16(&fs->sb, state);
930
931                 memcpy(&fs->sb,
932                         journal_block.data + EXT4_SUPERBLOCK_OFFSET,
933                         EXT4_SUPERBLOCK_SIZE);
934
935                 /* Mark system as mounted */
936                 ext4_set16(&fs->sb, state, state);
937                 r = ext4_sb_write(fs->bdev, &fs->sb);
938                 if (r != EOK)
939                         return;
940
941                 /*Update mount count*/
942                 ext4_set16(&fs->sb, mount_count, mount_count);
943         }
944
945         jbd_block_set(jbd_fs, &journal_block);
946         
947         return;
948 }
949
950 /**@brief  Add block address to revoke tree, along with
951  *         its transaction id.
952  * @param  info  journal replay info
953  * @param  block  block address to be replayed.*/
954 static void jbd_add_revoke_block_tags(struct recover_info *info,
955                                       ext4_fsblk_t block)
956 {
957         struct revoke_entry *revoke_entry;
958
959         ext4_dbg(DEBUG_JBD, "Add block %" PRIu64 " to revoke tree\n", block);
960         /* If the revoke entry with respect to the block address
961          * exists already, update its transaction id.*/
962         revoke_entry = jbd_revoke_entry_lookup(info, block);
963         if (revoke_entry) {
964                 revoke_entry->trans_id = info->this_trans_id;
965                 return;
966         }
967
968         revoke_entry = jbd_alloc_revoke_entry();
969         ext4_assert(revoke_entry);
970         revoke_entry->block = block;
971         revoke_entry->trans_id = info->this_trans_id;
972         RB_INSERT(jbd_revoke, &info->revoke_root, revoke_entry);
973
974         return;
975 }
976
977 static void jbd_destroy_revoke_tree(struct recover_info *info)
978 {
979         while (!RB_EMPTY(&info->revoke_root)) {
980                 struct revoke_entry *revoke_entry =
981                         RB_MIN(jbd_revoke, &info->revoke_root);
982                 ext4_assert(revoke_entry);
983                 RB_REMOVE(jbd_revoke, &info->revoke_root, revoke_entry);
984                 jbd_free_revoke_entry(revoke_entry);
985         }
986 }
987
988
/* Passes over the journal accepted by jbd_iterate_log(). */
/* Count the transactions and record the first and last transaction id. */
#define ACTION_SCAN 0
/* Feed the revoke blocks into the revoke tree. */
#define ACTION_REVOKE 1
/* Replay the blocks listed in the descriptor blocks. */
#define ACTION_RECOVER 2
992
993 /**@brief  Add entries in a revoke block to revoke tree.
994  * @param  jbd_fs jbd filesystem
995  * @param  header revoke block header
996  * @param  recover_info  journal replay info*/
997 static void jbd_build_revoke_tree(struct jbd_fs *jbd_fs,
998                                   struct jbd_bhdr *header,
999                                   struct recover_info *info)
1000 {
1001         char *blocks_entry;
1002         struct jbd_revoke_header *revoke_hdr =
1003                 (struct jbd_revoke_header *)header;
1004         uint32_t i, nr_entries, record_len = 4;
1005
1006         /* If we are working on a 64bit jbd filesystem, */
1007         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
1008                                      JBD_FEATURE_INCOMPAT_64BIT))
1009                 record_len = 8;
1010
1011         nr_entries = (jbd_get32(revoke_hdr, count) -
1012                         sizeof(struct jbd_revoke_header)) /
1013                         record_len;
1014
1015         blocks_entry = (char *)(revoke_hdr + 1);
1016
1017         for (i = 0;i < nr_entries;i++) {
1018                 if (record_len == 8) {
1019                         uint64_t *blocks =
1020                                 (uint64_t *)blocks_entry;
1021                         jbd_add_revoke_block_tags(info, to_be64(*blocks));
1022                 } else {
1023                         uint32_t *blocks =
1024                                 (uint32_t *)blocks_entry;
1025                         jbd_add_revoke_block_tags(info, to_be32(*blocks));
1026                 }
1027                 blocks_entry += record_len;
1028         }
1029 }
1030
1031 static void jbd_debug_descriptor_block(struct jbd_fs *jbd_fs,
1032                                        struct jbd_bhdr *header,
1033                                        uint32_t *iblock)
1034 {
1035         jbd_iterate_block_table(jbd_fs,
1036                                 header + 1,
1037                                 jbd_get32(&jbd_fs->sb, blocksize) -
1038                                         sizeof(struct jbd_bhdr),
1039                                 jbd_display_block_tags,
1040                                 iblock);
1041 }
1042
1043 static void jbd_replay_descriptor_block(struct jbd_fs *jbd_fs,
1044                                         struct jbd_bhdr *header,
1045                                         struct replay_arg *arg)
1046 {
1047         jbd_iterate_block_table(jbd_fs,
1048                                 header + 1,
1049                                 jbd_get32(&jbd_fs->sb, blocksize) -
1050                                         sizeof(struct jbd_bhdr),
1051                                 jbd_replay_block_tags,
1052                                 arg);
1053 }
1054
1055 /**@brief  The core routine of journal replay.
1056  * @param  jbd_fs jbd filesystem
1057  * @param  recover_info  journal replay info
1058  * @param  action action needed to be taken
1059  * @return standard error code*/
1060 static int jbd_iterate_log(struct jbd_fs *jbd_fs,
1061                            struct recover_info *info,
1062                            int action)
1063 {
1064         int r = EOK;
1065         bool log_end = false;
1066         struct jbd_sb *sb = &jbd_fs->sb;
1067         uint32_t start_trans_id, this_trans_id;
1068         uint32_t start_block, this_block;
1069
1070         /* We start iterating valid blocks in the whole journal.*/
1071         start_trans_id = this_trans_id = jbd_get32(sb, sequence);
1072         start_block = this_block = jbd_get32(sb, start);
1073         if (action == ACTION_SCAN)
1074                 info->trans_cnt = 0;
1075         else if (!info->trans_cnt)
1076                 log_end = true;
1077
1078         ext4_dbg(DEBUG_JBD, "Start of journal at trans id: %" PRIu32 "\n",
1079                             start_trans_id);
1080
1081         while (!log_end) {
1082                 struct ext4_block block;
1083                 struct jbd_bhdr *header;
1084                 /* If we are not scanning for the last
1085                  * valid transaction in the journal,
1086                  * we will stop when we reach the end of
1087                  * the journal.*/
1088                 if (action != ACTION_SCAN)
1089                         if (trans_id_diff(this_trans_id, info->last_trans_id) > 0) {
1090                                 log_end = true;
1091                                 continue;
1092                         }
1093
1094                 r = jbd_block_get(jbd_fs, &block, this_block);
1095                 if (r != EOK)
1096                         break;
1097
1098                 header = (struct jbd_bhdr *)block.data;
1099                 /* This block does not have a valid magic number,
1100                  * so we have reached the end of the journal.*/
1101                 if (jbd_get32(header, magic) != JBD_MAGIC_NUMBER) {
1102                         jbd_block_set(jbd_fs, &block);
1103                         log_end = true;
1104                         continue;
1105                 }
1106
1107                 /* If the transaction id we found is not expected,
1108                  * we may have reached the end of the journal.
1109                  *
1110                  * If we are not scanning the journal, something
1111                  * bad might have taken place. :-( */
1112                 if (jbd_get32(header, sequence) != this_trans_id) {
1113                         if (action != ACTION_SCAN)
1114                                 r = EIO;
1115
1116                         jbd_block_set(jbd_fs, &block);
1117                         log_end = true;
1118                         continue;
1119                 }
1120
1121                 switch (jbd_get32(header, blocktype)) {
1122                 case JBD_DESCRIPTOR_BLOCK:
1123                         if (!jbd_verify_meta_csum(jbd_fs, header)) {
1124                                 ext4_dbg(DEBUG_JBD,
1125                                         DBG_WARN "Descriptor block checksum failed."
1126                                                 "Journal block: %" PRIu32"\n",
1127                                                 this_block);
1128                                 log_end = true;
1129                                 break;
1130                         }
1131                         ext4_dbg(DEBUG_JBD, "Descriptor block: %" PRIu32", "
1132                                             "trans_id: %" PRIu32"\n",
1133                                             this_block, this_trans_id);
1134                         if (action == ACTION_RECOVER) {
1135                                 struct replay_arg replay_arg;
1136                                 replay_arg.info = info;
1137                                 replay_arg.this_block = &this_block;
1138                                 replay_arg.this_trans_id = this_trans_id;
1139
1140                                 jbd_replay_descriptor_block(jbd_fs,
1141                                                 header, &replay_arg);
1142                         } else
1143                                 jbd_debug_descriptor_block(jbd_fs,
1144                                                 header, &this_block);
1145
1146                         break;
1147                 case JBD_COMMIT_BLOCK:
1148                         if (!jbd_verify_commit_csum(jbd_fs,
1149                                         (struct jbd_commit_header *)header)) {
1150                                 ext4_dbg(DEBUG_JBD,
1151                                         DBG_WARN "Commit block checksum failed."
1152                                                 "Journal block: %" PRIu32"\n",
1153                                                 this_block);
1154                                 log_end = true;
1155                                 break;
1156                         }
1157                         ext4_dbg(DEBUG_JBD, "Commit block: %" PRIu32", "
1158                                             "trans_id: %" PRIu32"\n",
1159                                             this_block, this_trans_id);
1160                         /* This is the end of a transaction,
1161                          * we may now proceed to the next transaction.
1162                          */
1163                         this_trans_id++;
1164                         info->trans_cnt++;
1165                         break;
1166                 case JBD_REVOKE_BLOCK:
1167                         if (!jbd_verify_meta_csum(jbd_fs, header)) {
1168                                 ext4_dbg(DEBUG_JBD,
1169                                         DBG_WARN "Revoke block checksum failed."
1170                                                 "Journal block: %" PRIu32"\n",
1171                                                 this_block);
1172                                 log_end = true;
1173                                 break;
1174                         }
1175                         ext4_dbg(DEBUG_JBD, "Revoke block: %" PRIu32", "
1176                                             "trans_id: %" PRIu32"\n",
1177                                             this_block, this_trans_id);
1178                         if (action == ACTION_REVOKE) {
1179                                 info->this_trans_id = this_trans_id;
1180                                 jbd_build_revoke_tree(jbd_fs,
1181                                                 header, info);
1182                         }
1183                         break;
1184                 default:
1185                         log_end = true;
1186                         break;
1187                 }
1188                 jbd_block_set(jbd_fs, &block);
1189                 this_block++;
1190                 wrap(sb, this_block);
1191                 if (this_block == start_block)
1192                         log_end = true;
1193
1194         }
1195         ext4_dbg(DEBUG_JBD, "End of journal.\n");
1196         if (r == EOK && action == ACTION_SCAN) {
1197                 /* We have finished scanning the journal. */
1198                 info->start_trans_id = start_trans_id;
1199                 if (trans_id_diff(this_trans_id, start_trans_id) > 0)
1200                         info->last_trans_id = this_trans_id - 1;
1201                 else
1202                         info->last_trans_id = this_trans_id;
1203         }
1204
1205         return r;
1206 }
1207
1208 /**@brief  Replay journal.
1209  * @param  jbd_fs jbd filesystem
1210  * @return standard error code*/
1211 int jbd_recover(struct jbd_fs *jbd_fs)
1212 {
1213         int r;
1214         struct recover_info info;
1215         struct jbd_sb *sb = &jbd_fs->sb;
1216         if (!sb->start)
1217                 return EOK;
1218
1219         RB_INIT(&info.revoke_root);
1220
1221         r = jbd_iterate_log(jbd_fs, &info, ACTION_SCAN);
1222         if (r != EOK)
1223                 return r;
1224
1225         r = jbd_iterate_log(jbd_fs, &info, ACTION_REVOKE);
1226         if (r != EOK)
1227                 return r;
1228
1229         r = jbd_iterate_log(jbd_fs, &info, ACTION_RECOVER);
1230         if (r == EOK) {
1231                 /* If we successfully replay the journal,
1232                  * clear EXT4_FINCOM_RECOVER flag on the
1233                  * ext4 superblock, and set the start of
1234                  * journal to 0.*/
1235                 uint32_t features_incompatible =
1236                         ext4_get32(&jbd_fs->inode_ref.fs->sb,
1237                                    features_incompatible);
1238                 jbd_set32(&jbd_fs->sb, start, 0);
1239                 features_incompatible &= ~EXT4_FINCOM_RECOVER;
1240                 ext4_set32(&jbd_fs->inode_ref.fs->sb,
1241                            features_incompatible,
1242                            features_incompatible);
1243                 jbd_fs->dirty = true;
1244                 r = ext4_sb_write(jbd_fs->bdev,
1245                                   &jbd_fs->inode_ref.fs->sb);
1246         }
1247         jbd_destroy_revoke_tree(&info);
1248         return r;
1249 }
1250
1251 static void jbd_journal_write_sb(struct jbd_journal *journal)
1252 {
1253         struct jbd_fs *jbd_fs = journal->jbd_fs;
1254         jbd_set32(&jbd_fs->sb, start, journal->start);
1255         jbd_set32(&jbd_fs->sb, sequence, journal->trans_id);
1256         jbd_fs->dirty = true;
1257 }
1258
1259 /**@brief  Start accessing the journal.
1260  * @param  jbd_fs jbd filesystem
1261  * @param  journal current journal session
1262  * @return standard error code*/
1263 int jbd_journal_start(struct jbd_fs *jbd_fs,
1264                       struct jbd_journal *journal)
1265 {
1266         int r;
1267         uint32_t features_incompatible =
1268                         ext4_get32(&jbd_fs->inode_ref.fs->sb,
1269                                    features_incompatible);
1270         struct ext4_block block = EXT4_BLOCK_ZERO();
1271         features_incompatible |= EXT4_FINCOM_RECOVER;
1272         ext4_set32(&jbd_fs->inode_ref.fs->sb,
1273                         features_incompatible,
1274                         features_incompatible);
1275         r = ext4_sb_write(jbd_fs->bdev,
1276                         &jbd_fs->inode_ref.fs->sb);
1277         if (r != EOK)
1278                 return r;
1279
1280         journal->first = jbd_get32(&jbd_fs->sb, first);
1281         journal->start = journal->first;
1282         journal->last = journal->first;
1283         journal->trans_id = 1;
1284         journal->alloc_trans_id = 1;
1285
1286         journal->block_size = jbd_get32(&jbd_fs->sb, blocksize);
1287
1288         r = jbd_block_get_noread(jbd_fs,
1289                          &block,
1290                          journal->start);
1291         if (r != EOK) {
1292                 memset(journal, 0, sizeof(struct jbd_journal));
1293                 return r;
1294         }
1295         memset(block.data, 0, journal->block_size);
1296         ext4_bcache_set_dirty(block.buf);
1297         r = jbd_block_set(jbd_fs, &block);
1298         if (r != EOK) {
1299                 memset(journal, 0, sizeof(struct jbd_journal));
1300                 return r;
1301         }
1302
1303         TAILQ_INIT(&journal->cp_queue);
1304         RB_INIT(&journal->block_rec_root);
1305         journal->jbd_fs = jbd_fs;
1306         jbd_journal_write_sb(journal);
1307         r = jbd_write_sb(jbd_fs);
1308         if (r != EOK)
1309                 return r;
1310
1311         jbd_fs->bdev->journal = journal;
1312         return EOK;
1313 }
1314
/* Forward declaration. The routines below call this function directly
 * and also install it as the end_write callback of a buffer, with the
 * jbd_buf as callback argument. */
static void jbd_trans_end_write(struct ext4_bcache *bc __unused,
                          struct ext4_buf *buf __unused,
                          int res,
                          void *arg);
1319
1320 /*
1321  * This routine is only suitable to committed transactions. */
1322 static void jbd_journal_flush_trans(struct jbd_trans *trans)
1323 {
1324         struct jbd_buf *jbd_buf, *tmp;
1325         struct jbd_journal *journal = trans->journal;
1326         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1327         void *tmp_data = ext4_malloc(journal->block_size);
1328         ext4_assert(tmp_data);
1329
1330         TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node,
1331                         tmp) {
1332                 struct ext4_buf *buf;
1333                 struct ext4_block block;
1334                 /* The buffer is not yet flushed. */
1335                 buf = ext4_bcache_find_get(fs->bdev->bc, &block,
1336                                            jbd_buf->block_rec->lba);
1337                 if (!(buf && ext4_bcache_test_flag(buf, BC_UPTODATE) &&
1338                       jbd_buf->block_rec->trans == trans)) {
1339                         int r;
1340                         struct ext4_block jbd_block = EXT4_BLOCK_ZERO();
1341                         ext4_assert(jbd_block_get(journal->jbd_fs,
1342                                                 &jbd_block,
1343                                                 jbd_buf->jbd_lba) == EOK);
1344                         memcpy(tmp_data, jbd_block.data,
1345                                         journal->block_size);
1346                         ext4_block_set(fs->bdev, &jbd_block);
1347                         r = ext4_blocks_set_direct(fs->bdev, tmp_data,
1348                                         jbd_buf->block_rec->lba, 1);
1349                         jbd_trans_end_write(fs->bdev->bc, buf, r, jbd_buf);
1350                 } else
1351                         ext4_block_flush_buf(fs->bdev, buf);
1352
1353                 if (buf)
1354                         ext4_block_set(fs->bdev, &block);
1355         }
1356
1357         ext4_free(tmp_data);
1358 }
1359
1360 static void
1361 jbd_journal_skip_pure_revoke(struct jbd_journal *journal,
1362                              struct jbd_trans *trans)
1363 {
1364         journal->start = trans->start_iblock +
1365                 trans->alloc_blocks;
1366         wrap(&journal->jbd_fs->sb, journal->start);
1367         journal->trans_id = trans->trans_id + 1;
1368         jbd_journal_free_trans(journal,
1369                         trans, false);
1370         jbd_journal_write_sb(journal);
1371 }
1372
1373 void
1374 jbd_journal_purge_cp_trans(struct jbd_journal *journal,
1375                            bool flush,
1376                            bool once)
1377 {
1378         struct jbd_trans *trans;
1379         while ((trans = TAILQ_FIRST(&journal->cp_queue))) {
1380                 if (!trans->data_cnt) {
1381                         TAILQ_REMOVE(&journal->cp_queue,
1382                                         trans,
1383                                         trans_node);
1384                         jbd_journal_skip_pure_revoke(journal, trans);
1385                 } else {
1386                         if (trans->data_cnt ==
1387                                         trans->written_cnt) {
1388                                 journal->start =
1389                                         trans->start_iblock +
1390                                         trans->alloc_blocks;
1391                                 wrap(&journal->jbd_fs->sb,
1392                                                 journal->start);
1393                                 journal->trans_id =
1394                                         trans->trans_id + 1;
1395                                 TAILQ_REMOVE(&journal->cp_queue,
1396                                                 trans,
1397                                                 trans_node);
1398                                 jbd_journal_free_trans(journal,
1399                                                 trans,
1400                                                 false);
1401                                 jbd_journal_write_sb(journal);
1402                         } else if (!flush) {
1403                                 journal->start =
1404                                         trans->start_iblock;
1405                                 wrap(&journal->jbd_fs->sb,
1406                                                 journal->start);
1407                                 journal->trans_id =
1408                                         trans->trans_id;
1409                                 jbd_journal_write_sb(journal);
1410                                 break;
1411                         } else
1412                                 jbd_journal_flush_trans(trans);
1413                 }
1414                 if (once)
1415                         break;
1416         }
1417 }
1418
1419 /**@brief  Stop accessing the journal.
1420  * @param  journal current journal session
1421  * @return standard error code*/
1422 int jbd_journal_stop(struct jbd_journal *journal)
1423 {
1424         int r;
1425         struct jbd_fs *jbd_fs = journal->jbd_fs;
1426         uint32_t features_incompatible;
1427
1428         /* Make sure that journalled content have reached
1429          * the disk.*/
1430         jbd_journal_purge_cp_trans(journal, true, false);
1431
1432         /* There should be no block record in this journal
1433          * session. */
1434         if (!RB_EMPTY(&journal->block_rec_root))
1435                 ext4_dbg(DEBUG_JBD,
1436                          DBG_WARN "There are still block records "
1437                                   "in this journal session!\n");
1438
1439         features_incompatible =
1440                 ext4_get32(&jbd_fs->inode_ref.fs->sb,
1441                            features_incompatible);
1442         features_incompatible &= ~EXT4_FINCOM_RECOVER;
1443         ext4_set32(&jbd_fs->inode_ref.fs->sb,
1444                         features_incompatible,
1445                         features_incompatible);
1446         r = ext4_sb_write(jbd_fs->bdev,
1447                         &jbd_fs->inode_ref.fs->sb);
1448         if (r != EOK)
1449                 return r;
1450
1451         journal->start = 0;
1452         journal->trans_id = 0;
1453         jbd_journal_write_sb(journal);
1454         return jbd_write_sb(journal->jbd_fs);
1455 }
1456
1457 /**@brief  Allocate a block in the journal.
1458  * @param  journal current journal session
1459  * @param  trans transaction
1460  * @return allocated block address*/
1461 static uint32_t jbd_journal_alloc_block(struct jbd_journal *journal,
1462                                         struct jbd_trans *trans)
1463 {
1464         uint32_t start_block;
1465
1466         start_block = journal->last++;
1467         trans->alloc_blocks++;
1468         wrap(&journal->jbd_fs->sb, journal->last);
1469         
1470         /* If there is no space left, flush all journalled
1471          * blocks to disk first.*/
1472         if (journal->last == journal->start)
1473                 jbd_journal_purge_cp_trans(journal, true, false);
1474
1475         return start_block;
1476 }
1477
1478 static struct jbd_block_rec *
1479 jbd_trans_block_rec_lookup(struct jbd_journal *journal,
1480                            ext4_fsblk_t lba)
1481 {
1482         struct jbd_block_rec tmp = {
1483                 .lba = lba
1484         };
1485
1486         return RB_FIND(jbd_block,
1487                        &journal->block_rec_root,
1488                        &tmp);
1489 }
1490
1491 static void
1492 jbd_trans_change_ownership(struct jbd_block_rec *block_rec,
1493                            struct jbd_trans *new_trans)
1494 {
1495         LIST_REMOVE(block_rec, tbrec_node);
1496         if (new_trans) {
1497                 /* Now this block record belongs to this transaction. */
1498                 LIST_INSERT_HEAD(&new_trans->tbrec_list, block_rec, tbrec_node);
1499         }
1500         block_rec->trans = new_trans;
1501 }
1502
1503 static inline struct jbd_block_rec *
1504 jbd_trans_insert_block_rec(struct jbd_trans *trans,
1505                            ext4_fsblk_t lba)
1506 {
1507         struct jbd_block_rec *block_rec;
1508         block_rec = jbd_trans_block_rec_lookup(trans->journal, lba);
1509         if (block_rec) {
1510                 jbd_trans_change_ownership(block_rec, trans);
1511                 return block_rec;
1512         }
1513         block_rec = ext4_calloc(1, sizeof(struct jbd_block_rec));
1514         if (!block_rec)
1515                 return NULL;
1516
1517         block_rec->lba = lba;
1518         block_rec->trans = trans;
1519         TAILQ_INIT(&block_rec->dirty_buf_queue);
1520         LIST_INSERT_HEAD(&trans->tbrec_list, block_rec, tbrec_node);
1521         RB_INSERT(jbd_block, &trans->journal->block_rec_root, block_rec);
1522         return block_rec;
1523 }
1524
1525 /*
1526  * This routine will do the dirty works.
1527  */
1528 static void
1529 jbd_trans_finish_callback(struct jbd_journal *journal,
1530                           const struct jbd_trans *trans,
1531                           struct jbd_block_rec *block_rec,
1532                           bool abort,
1533                           bool revoke)
1534 {
1535         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1536         if (block_rec->trans != trans)
1537                 return;
1538
1539         if (!abort) {
1540                 struct jbd_buf *jbd_buf, *tmp;
1541                 TAILQ_FOREACH_SAFE(jbd_buf,
1542                                 &block_rec->dirty_buf_queue,
1543                                 dirty_buf_node,
1544                                 tmp) {
1545                         jbd_trans_end_write(fs->bdev->bc,
1546                                         NULL,
1547                                         EOK,
1548                                         jbd_buf);
1549                 }
1550         } else {
1551                 /*
1552                  * We have to roll back data if the block is going to be
1553                  * aborted.
1554                  */
1555                 struct jbd_buf *jbd_buf;
1556                 struct ext4_block jbd_block = EXT4_BLOCK_ZERO(),
1557                                   block = EXT4_BLOCK_ZERO();
1558                 jbd_buf = TAILQ_LAST(&block_rec->dirty_buf_queue,
1559                                 jbd_buf_dirty);
1560                 if (jbd_buf) {
1561                         if (!revoke) {
1562                                 ext4_assert(ext4_block_get_noread(fs->bdev,
1563                                                         &block,
1564                                                         block_rec->lba) == EOK);
1565                                 ext4_assert(jbd_block_get(journal->jbd_fs,
1566                                                         &jbd_block,
1567                                                         jbd_buf->jbd_lba) == EOK);
1568                                 memcpy(block.data, jbd_block.data,
1569                                                 journal->block_size);
1570
1571                                 jbd_trans_change_ownership(block_rec,
1572                                                 jbd_buf->trans);
1573
1574                                 block.buf->end_write = jbd_trans_end_write;
1575                                 block.buf->end_write_arg = jbd_buf;
1576
1577                                 ext4_bcache_set_flag(jbd_block.buf, BC_TMP);
1578                                 ext4_bcache_set_dirty(block.buf);
1579
1580                                 ext4_block_set(fs->bdev, &jbd_block);
1581                                 ext4_block_set(fs->bdev, &block);
1582                                 return;
1583                         } else {
1584                                 /* The revoked buffer is yet written. */
1585                                 jbd_trans_change_ownership(block_rec,
1586                                                 jbd_buf->trans);
1587                         }
1588                 }
1589         }
1590 }
1591
1592 static inline void
1593 jbd_trans_remove_block_rec(struct jbd_journal *journal,
1594                            struct jbd_block_rec *block_rec,
1595                            struct jbd_trans *trans)
1596 {
1597         /* If this block record doesn't belong to this transaction,
1598          * give up.*/
1599         if (block_rec->trans == trans) {
1600                 LIST_REMOVE(block_rec, tbrec_node);
1601                 RB_REMOVE(jbd_block,
1602                                 &journal->block_rec_root,
1603                                 block_rec);
1604                 ext4_free(block_rec);
1605         }
1606 }
1607
1608 /**@brief  Add block to a transaction and mark it dirty.
1609  * @param  trans transaction
1610  * @param  block block descriptor
1611  * @return standard error code*/
1612 int jbd_trans_set_block_dirty(struct jbd_trans *trans,
1613                               struct ext4_block *block)
1614 {
1615         struct jbd_buf *jbd_buf;
1616         struct jbd_revoke_rec *rec, tmp_rec = {
1617                 .lba = block->lb_id
1618         };
1619         struct jbd_block_rec *block_rec;
1620
1621         if (block->buf->end_write == jbd_trans_end_write) {
1622                 jbd_buf = block->buf->end_write_arg;
1623                 if (jbd_buf && jbd_buf->trans == trans)
1624                         return EOK;
1625         }
1626         jbd_buf = ext4_calloc(1, sizeof(struct jbd_buf));
1627         if (!jbd_buf)
1628                 return ENOMEM;
1629
1630         if ((block_rec = jbd_trans_insert_block_rec(trans,
1631                                         block->lb_id)) == NULL) {
1632                 ext4_free(jbd_buf);
1633                 return ENOMEM;
1634         }
1635
1636         TAILQ_INSERT_TAIL(&block_rec->dirty_buf_queue,
1637                         jbd_buf,
1638                         dirty_buf_node);
1639
1640         jbd_buf->block_rec = block_rec;
1641         jbd_buf->trans = trans;
1642         jbd_buf->block = *block;
1643         ext4_bcache_inc_ref(block->buf);
1644
1645         /* If the content reach the disk, notify us
1646          * so that we may do a checkpoint. */
1647         block->buf->end_write = jbd_trans_end_write;
1648         block->buf->end_write_arg = jbd_buf;
1649
1650         trans->data_cnt++;
1651         TAILQ_INSERT_HEAD(&trans->buf_queue, jbd_buf, buf_node);
1652
1653         ext4_bcache_set_dirty(block->buf);
1654         rec = RB_FIND(jbd_revoke_tree,
1655                         &trans->revoke_root,
1656                         &tmp_rec);
1657         if (rec) {
1658                 RB_REMOVE(jbd_revoke_tree, &trans->revoke_root,
1659                           rec);
1660                 ext4_free(rec);
1661         }
1662
1663         return EOK;
1664 }
1665
1666 /**@brief  Add block to be revoked to a transaction
1667  * @param  trans transaction
1668  * @param  lba logical block address
1669  * @return standard error code*/
1670 int jbd_trans_revoke_block(struct jbd_trans *trans,
1671                            ext4_fsblk_t lba)
1672 {
1673         struct jbd_revoke_rec tmp_rec = {
1674                 .lba = lba
1675         }, *rec;
1676         rec = RB_FIND(jbd_revoke_tree,
1677                       &trans->revoke_root,
1678                       &tmp_rec);
1679         if (rec)
1680                 return EOK;
1681
1682         rec = ext4_calloc(1, sizeof(struct jbd_revoke_rec));
1683         if (!rec)
1684                 return ENOMEM;
1685
1686         rec->lba = lba;
1687         RB_INSERT(jbd_revoke_tree, &trans->revoke_root, rec);
1688         return EOK;
1689 }
1690
1691 /**@brief  Try to add block to be revoked to a transaction.
1692  *         If @lba still remains in an transaction on checkpoint
1693  *         queue, add @lba as a revoked block to the transaction.
1694  * @param  trans transaction
1695  * @param  lba logical block address
1696  * @return standard error code*/
1697 int jbd_trans_try_revoke_block(struct jbd_trans *trans,
1698                                ext4_fsblk_t lba)
1699 {
1700         struct jbd_journal *journal = trans->journal;
1701         struct jbd_block_rec *block_rec =
1702                 jbd_trans_block_rec_lookup(journal, lba);
1703
1704         if (block_rec) {
1705                 if (block_rec->trans == trans) {
1706                         struct jbd_buf *jbd_buf =
1707                                 TAILQ_LAST(&block_rec->dirty_buf_queue,
1708                                         jbd_buf_dirty);
1709                         /* If there are still unwritten buffers. */
1710                         if (TAILQ_FIRST(&block_rec->dirty_buf_queue) !=
1711                             jbd_buf)
1712                                 jbd_trans_revoke_block(trans, lba);
1713
1714                 } else
1715                         jbd_trans_revoke_block(trans, lba);
1716         }
1717
1718         return EOK;
1719 }
1720
1721 /**@brief  Free a transaction
1722  * @param  journal current journal session
1723  * @param  trans transaction
1724  * @param  abort discard all the modifications on the block?
1725  * @return standard error code*/
1726 void jbd_journal_free_trans(struct jbd_journal *journal,
1727                             struct jbd_trans *trans,
1728                             bool abort)
1729 {
1730         struct jbd_buf *jbd_buf, *tmp;
1731         struct jbd_revoke_rec *rec, *tmp2;
1732         struct jbd_block_rec *block_rec, *tmp3;
1733         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1734         TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node,
1735                           tmp) {
1736                 block_rec = jbd_buf->block_rec;
1737                 if (abort) {
1738                         jbd_buf->block.buf->end_write = NULL;
1739                         jbd_buf->block.buf->end_write_arg = NULL;
1740                         ext4_bcache_clear_dirty(jbd_buf->block.buf);
1741                         ext4_block_set(fs->bdev, &jbd_buf->block);
1742                 }
1743
1744                 TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue,
1745                         jbd_buf,
1746                         dirty_buf_node);
1747                 jbd_trans_finish_callback(journal,
1748                                 trans,
1749                                 block_rec,
1750                                 abort,
1751                                 false);
1752                 TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
1753                 ext4_free(jbd_buf);
1754         }
1755         RB_FOREACH_SAFE(rec, jbd_revoke_tree, &trans->revoke_root,
1756                           tmp2) {
1757                 RB_REMOVE(jbd_revoke_tree, &trans->revoke_root, rec);
1758                 ext4_free(rec);
1759         }
1760         LIST_FOREACH_SAFE(block_rec, &trans->tbrec_list, tbrec_node,
1761                           tmp3) {
1762                 jbd_trans_remove_block_rec(journal, block_rec, trans);
1763         }
1764
1765         ext4_free(trans);
1766 }
1767
1768 /**@brief  Write commit block for a transaction
1769  * @param  trans transaction
1770  * @return standard error code*/
1771 static int jbd_trans_write_commit_block(struct jbd_trans *trans)
1772 {
1773         int rc;
1774         struct ext4_block block;
1775         struct jbd_commit_header *header;
1776         uint32_t commit_iblock, orig_commit_iblock;
1777         struct jbd_journal *journal = trans->journal;
1778
1779         commit_iblock = jbd_journal_alloc_block(journal, trans);
1780         orig_commit_iblock = commit_iblock;
1781         commit_iblock++;
1782         wrap(&journal->jbd_fs->sb, commit_iblock);
1783
1784         /* To prevent accidental reference to stale journalling metadata. */
1785         if (orig_commit_iblock < commit_iblock) {
1786                 rc = jbd_block_get_noread(journal->jbd_fs, &block, commit_iblock);
1787                 if (rc != EOK)
1788                         return rc;
1789
1790                 memset(block.data, 0, journal->block_size);
1791                 ext4_bcache_set_dirty(block.buf);
1792                 ext4_bcache_set_flag(block.buf, BC_TMP);
1793                 rc = jbd_block_set(journal->jbd_fs, &block);
1794                 if (rc != EOK)
1795                         return rc;
1796         }
1797
1798         rc = jbd_block_get_noread(journal->jbd_fs, &block, orig_commit_iblock);
1799         if (rc != EOK)
1800                 return rc;
1801
1802         header = (struct jbd_commit_header *)block.data;
1803         jbd_set32(&header->header, magic, JBD_MAGIC_NUMBER);
1804         jbd_set32(&header->header, blocktype, JBD_COMMIT_BLOCK);
1805         jbd_set32(&header->header, sequence, trans->trans_id);
1806
1807         if (JBD_HAS_INCOMPAT_FEATURE(&journal->jbd_fs->sb,
1808                                 JBD_FEATURE_COMPAT_CHECKSUM)) {
1809                 jbd_set32(header, chksum_type, JBD_CRC32_CHKSUM);
1810                 jbd_set32(header, chksum_size, JBD_CRC32_CHKSUM_SIZE);
1811                 jbd_set32(header, chksum[0], trans->data_csum);
1812         }
1813         jbd_commit_csum_set(journal->jbd_fs, header);
1814         ext4_bcache_set_dirty(block.buf);
1815         ext4_bcache_set_flag(block.buf, BC_TMP);
1816         rc = jbd_block_set(journal->jbd_fs, &block);
1817         return rc;
1818 }
1819
1820 /**@brief  Write descriptor block for a transaction
1821  * @param  journal current journal session
1822  * @param  trans transaction
1823  * @return standard error code*/
1824 static int jbd_journal_prepare(struct jbd_journal *journal,
1825                                struct jbd_trans *trans)
1826 {
1827         int rc = EOK, i = 0;
1828         struct ext4_block desc_block = EXT4_BLOCK_ZERO(),
1829                           data_block = EXT4_BLOCK_ZERO();
1830         int32_t tag_tbl_size = 0;
1831         uint32_t desc_iblock = 0;
1832         uint32_t data_iblock = 0;
1833         char *tag_start = NULL, *tag_ptr = NULL;
1834         struct jbd_buf *jbd_buf, *tmp;
1835         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1836         uint32_t checksum = EXT4_CRC32_INIT;
1837         struct jbd_bhdr *bhdr = NULL;
1838         void *data;
1839
1840         /* Try to remove any non-dirty buffers from the tail of
1841          * buf_queue. */
1842         TAILQ_FOREACH_REVERSE_SAFE(jbd_buf, &trans->buf_queue,
1843                         jbd_trans_buf, buf_node, tmp) {
1844                 struct jbd_revoke_rec tmp_rec = {
1845                         .lba = jbd_buf->block_rec->lba
1846                 };
1847                 /* We stop the iteration when we find a dirty buffer. */
1848                 if (ext4_bcache_test_flag(jbd_buf->block.buf,
1849                                         BC_DIRTY))
1850                         break;
1851         
1852                 TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue,
1853                         jbd_buf,
1854                         dirty_buf_node);
1855
1856                 jbd_buf->block.buf->end_write = NULL;
1857                 jbd_buf->block.buf->end_write_arg = NULL;
1858                 jbd_trans_finish_callback(journal,
1859                                 trans,
1860                                 jbd_buf->block_rec,
1861                                 true,
1862                                 RB_FIND(jbd_revoke_tree,
1863                                         &trans->revoke_root,
1864                                         &tmp_rec));
1865                 jbd_trans_remove_block_rec(journal,
1866                                         jbd_buf->block_rec, trans);
1867                 trans->data_cnt--;
1868
1869                 ext4_block_set(fs->bdev, &jbd_buf->block);
1870                 TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
1871                 ext4_free(jbd_buf);
1872         }
1873
1874         TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node, tmp) {
1875                 struct tag_info tag_info;
1876                 bool uuid_exist = false;
1877                 bool is_escape = false;
1878                 struct jbd_revoke_rec tmp_rec = {
1879                         .lba = jbd_buf->block_rec->lba
1880                 };
1881                 if (!ext4_bcache_test_flag(jbd_buf->block.buf,
1882                                            BC_DIRTY)) {
1883                         TAILQ_REMOVE(&jbd_buf->block_rec->dirty_buf_queue,
1884                                         jbd_buf,
1885                                         dirty_buf_node);
1886
1887                         jbd_buf->block.buf->end_write = NULL;
1888                         jbd_buf->block.buf->end_write_arg = NULL;
1889
1890                         /* The buffer has not been modified, just release
1891                          * that jbd_buf. */
1892                         jbd_trans_finish_callback(journal,
1893                                         trans,
1894                                         jbd_buf->block_rec,
1895                                         true,
1896                                         RB_FIND(jbd_revoke_tree,
1897                                                 &trans->revoke_root,
1898                                                 &tmp_rec));
1899                         jbd_trans_remove_block_rec(journal,
1900                                         jbd_buf->block_rec, trans);
1901                         trans->data_cnt--;
1902
1903                         ext4_block_set(fs->bdev, &jbd_buf->block);
1904                         TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
1905                         ext4_free(jbd_buf);
1906                         continue;
1907                 }
1908                 checksum = jbd_block_csum(journal->jbd_fs,
1909                                           jbd_buf->block.data,
1910                                           checksum,
1911                                           trans->trans_id);
1912                 if (((struct jbd_bhdr *)jbd_buf->block.data)->magic ==
1913                                 to_be32(JBD_MAGIC_NUMBER))
1914                         is_escape = true;
1915
1916 again:
1917                 if (!desc_iblock) {
1918                         desc_iblock = jbd_journal_alloc_block(journal, trans);
1919                         rc = jbd_block_get_noread(journal->jbd_fs, &desc_block, desc_iblock);
1920                         if (rc != EOK)
1921                                 break;
1922
1923                         bhdr = (struct jbd_bhdr *)desc_block.data;
1924                         jbd_set32(bhdr, magic, JBD_MAGIC_NUMBER);
1925                         jbd_set32(bhdr, blocktype, JBD_DESCRIPTOR_BLOCK);
1926                         jbd_set32(bhdr, sequence, trans->trans_id);
1927
1928                         tag_start = (char *)(bhdr + 1);
1929                         tag_ptr = tag_start;
1930                         uuid_exist = true;
1931                         tag_tbl_size = journal->block_size -
1932                                 sizeof(struct jbd_bhdr);
1933
1934                         if (jbd_has_csum(&journal->jbd_fs->sb))
1935                                 tag_tbl_size -= sizeof(struct jbd_block_tail);
1936
1937                         if (!trans->start_iblock)
1938                                 trans->start_iblock = desc_iblock;
1939
1940                         ext4_bcache_set_dirty(desc_block.buf);
1941                         ext4_bcache_set_flag(desc_block.buf, BC_TMP);
1942                 }
1943                 tag_info.block = jbd_buf->block.lb_id;
1944                 tag_info.uuid_exist = uuid_exist;
1945                 tag_info.is_escape = is_escape;
1946                 if (i == trans->data_cnt - 1)
1947                         tag_info.last_tag = true;
1948                 else
1949                         tag_info.last_tag = false;
1950
1951                 tag_info.checksum = checksum;
1952
1953                 if (uuid_exist)
1954                         memcpy(tag_info.uuid, journal->jbd_fs->sb.uuid,
1955                                         UUID_SIZE);
1956
1957                 rc = jbd_write_block_tag(journal->jbd_fs,
1958                                 tag_ptr,
1959                                 tag_tbl_size,
1960                                 &tag_info);
1961                 if (rc != EOK) {
1962                         jbd_meta_csum_set(journal->jbd_fs, bhdr);
1963                         desc_iblock = 0;
1964                         rc = jbd_block_set(journal->jbd_fs, &desc_block);
1965                         if (rc != EOK)
1966                                 break;
1967
1968                         goto again;
1969                 }
1970
1971                 data_iblock = jbd_journal_alloc_block(journal, trans);
1972                 rc = jbd_block_get_noread(journal->jbd_fs, &data_block, data_iblock);
1973                 if (rc != EOK) {
1974                         desc_iblock = 0;
1975                         ext4_bcache_clear_dirty(desc_block.buf);
1976                         jbd_block_set(journal->jbd_fs, &desc_block);
1977                         break;
1978                 }
1979
1980                 data = data_block.data;
1981                 memcpy(data, jbd_buf->block.data,
1982                         journal->block_size);
1983                 if (is_escape)
1984                         ((struct jbd_bhdr *)data)->magic = 0;
1985
1986                 ext4_bcache_set_dirty(data_block.buf);
1987                 ext4_bcache_set_flag(data_block.buf, BC_TMP);
1988                 rc = jbd_block_set(journal->jbd_fs, &data_block);
1989                 if (rc != EOK) {
1990                         desc_iblock = 0;
1991                         ext4_bcache_clear_dirty(desc_block.buf);
1992                         jbd_block_set(journal->jbd_fs, &desc_block);
1993                         break;
1994                 }
1995                 jbd_buf->jbd_lba = data_iblock;
1996
1997                 tag_ptr += tag_info.tag_bytes;
1998                 tag_tbl_size -= tag_info.tag_bytes;
1999
2000                 i++;
2001         }
2002         if (rc == EOK && desc_iblock) {
2003                 jbd_meta_csum_set(journal->jbd_fs,
2004                                 (struct jbd_bhdr *)bhdr);
2005                 trans->data_csum = checksum;
2006                 rc = jbd_block_set(journal->jbd_fs, &desc_block);
2007         }
2008
2009         return rc;
2010 }
2011
2012 /**@brief  Write revoke block for a transaction
2013  * @param  journal current journal session
2014  * @param  trans transaction
2015  * @return standard error code*/
2016 static int
2017 jbd_journal_prepare_revoke(struct jbd_journal *journal,
2018                            struct jbd_trans *trans)
2019 {
2020         int rc = EOK, i = 0;
2021         struct ext4_block desc_block = EXT4_BLOCK_ZERO();
2022         int32_t tag_tbl_size = 0;
2023         uint32_t desc_iblock = 0;
2024         char *blocks_entry = NULL;
2025         struct jbd_revoke_rec *rec, *tmp;
2026         struct jbd_revoke_header *header = NULL;
2027         int32_t record_len = 4;
2028         struct jbd_bhdr *bhdr = NULL;
2029
2030         if (JBD_HAS_INCOMPAT_FEATURE(&journal->jbd_fs->sb,
2031                                      JBD_FEATURE_INCOMPAT_64BIT))
2032                 record_len = 8;
2033
2034         RB_FOREACH_SAFE(rec, jbd_revoke_tree, &trans->revoke_root,
2035                           tmp) {
2036 again:
2037                 if (!desc_iblock) {
2038                         desc_iblock = jbd_journal_alloc_block(journal, trans);
2039                         rc = jbd_block_get_noread(journal->jbd_fs, &desc_block,
2040                                                   desc_iblock);
2041                         if (rc != EOK)
2042                                 break;
2043
2044                         bhdr = (struct jbd_bhdr *)desc_block.data;
2045                         jbd_set32(bhdr, magic, JBD_MAGIC_NUMBER);
2046                         jbd_set32(bhdr, blocktype, JBD_REVOKE_BLOCK);
2047                         jbd_set32(bhdr, sequence, trans->trans_id);
2048                         
2049                         header = (struct jbd_revoke_header *)bhdr;
2050                         blocks_entry = (char *)(header + 1);
2051                         tag_tbl_size = journal->block_size -
2052                                 sizeof(struct jbd_revoke_header);
2053
2054                         if (jbd_has_csum(&journal->jbd_fs->sb))
2055                                 tag_tbl_size -= sizeof(struct jbd_block_tail);
2056
2057                         if (!trans->start_iblock)
2058                                 trans->start_iblock = desc_iblock;
2059
2060                         ext4_bcache_set_dirty(desc_block.buf);
2061                         ext4_bcache_set_flag(desc_block.buf, BC_TMP);
2062                 }
2063
2064                 if (tag_tbl_size < record_len) {
2065                         jbd_set32(header, count,
2066                                   journal->block_size - tag_tbl_size);
2067                         jbd_meta_csum_set(journal->jbd_fs, bhdr);
2068                         bhdr = NULL;
2069                         desc_iblock = 0;
2070                         header = NULL;
2071                         rc = jbd_block_set(journal->jbd_fs, &desc_block);
2072                         if (rc != EOK)
2073                                 break;
2074
2075                         goto again;
2076                 }
2077                 if (record_len == 8) {
2078                         uint64_t *blocks =
2079                                 (uint64_t *)blocks_entry;
2080                         *blocks = to_be64(rec->lba);
2081                 } else {
2082                         uint32_t *blocks =
2083                                 (uint32_t *)blocks_entry;
2084                         *blocks = to_be32((uint32_t)rec->lba);
2085                 }
2086                 blocks_entry += record_len;
2087                 tag_tbl_size -= record_len;
2088
2089                 i++;
2090         }
2091         if (rc == EOK && desc_iblock) {
2092                 if (header != NULL)
2093                         jbd_set32(header, count,
2094                                   journal->block_size - tag_tbl_size);
2095
2096                 jbd_meta_csum_set(journal->jbd_fs, bhdr);
2097                 rc = jbd_block_set(journal->jbd_fs, &desc_block);
2098         }
2099
2100         return rc;
2101 }
2102
2103 /**@brief  Put references of block descriptors in a transaction.
2104  * @param  journal current journal session
2105  * @param  trans transaction*/
2106 void jbd_journal_cp_trans(struct jbd_journal *journal, struct jbd_trans *trans)
2107 {
2108         struct jbd_buf *jbd_buf, *tmp;
2109         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
2110         TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node,
2111                         tmp) {
2112                 struct ext4_block block = jbd_buf->block;
2113                 ext4_block_set(fs->bdev, &block);
2114         }
2115 }
2116
2117 /**@brief  Update the start block of the journal when
2118  *         all the contents in a transaction reach the disk.*/
2119 static void jbd_trans_end_write(struct ext4_bcache *bc __unused,
2120                           struct ext4_buf *buf,
2121                           int res,
2122                           void *arg)
2123 {
2124         struct jbd_buf *jbd_buf = arg;
2125         struct jbd_trans *trans = jbd_buf->trans;
2126         struct jbd_block_rec *block_rec = jbd_buf->block_rec;
2127         struct jbd_journal *journal = trans->journal;
2128         bool first_in_queue =
2129                 trans == TAILQ_FIRST(&journal->cp_queue);
2130         if (res != EOK)
2131                 trans->error = res;
2132
2133         TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
2134         TAILQ_REMOVE(&block_rec->dirty_buf_queue,
2135                         jbd_buf,
2136                         dirty_buf_node);
2137
2138         jbd_trans_finish_callback(journal,
2139                         trans,
2140                         jbd_buf->block_rec,
2141                         false,
2142                         false);
2143         if (block_rec->trans == trans && buf) {
2144                 /* Clear the end_write and end_write_arg fields. */
2145                 buf->end_write = NULL;
2146                 buf->end_write_arg = NULL;
2147         }
2148
2149         ext4_free(jbd_buf);
2150
2151         trans->written_cnt++;
2152         if (trans->written_cnt == trans->data_cnt) {
2153                 /* If it is the first transaction on checkpoint queue,
2154                  * we will shift the start of the journal to the next
2155                  * transaction, and remove subsequent written
2156                  * transactions from checkpoint queue until we find
2157                  * an unwritten one. */
2158                 if (first_in_queue) {
2159                         journal->start = trans->start_iblock +
2160                                 trans->alloc_blocks;
2161                         wrap(&journal->jbd_fs->sb, journal->start);
2162                         journal->trans_id = trans->trans_id + 1;
2163                         TAILQ_REMOVE(&journal->cp_queue, trans, trans_node);
2164                         jbd_journal_free_trans(journal, trans, false);
2165
2166                         jbd_journal_purge_cp_trans(journal, false, true);
2167                         jbd_journal_write_sb(journal);
2168                         jbd_write_sb(journal->jbd_fs);
2169                 }
2170         }
2171 }
2172
2173 /**@brief  Commit a transaction to the journal immediately.
2174  * @param  journal current journal session
2175  * @param  trans transaction
2176  * @return standard error code*/
2177 static int __jbd_journal_commit_trans(struct jbd_journal *journal,
2178                                       struct jbd_trans *trans)
2179 {
2180         int rc = EOK;
2181         uint32_t last = journal->last;
2182         struct jbd_revoke_rec *rec, *tmp;
2183
2184         trans->trans_id = journal->alloc_trans_id;
2185         rc = jbd_journal_prepare(journal, trans);
2186         if (rc != EOK)
2187                 goto Finish;
2188
2189         rc = jbd_journal_prepare_revoke(journal, trans);
2190         if (rc != EOK)
2191                 goto Finish;
2192
2193         if (TAILQ_EMPTY(&trans->buf_queue) &&
2194             RB_EMPTY(&trans->revoke_root)) {
2195                 /* Since there are no entries in both buffer list
2196                  * and revoke entry list, we do not consider trans as
2197                  * complete transaction and just return EOK.*/
2198                 jbd_journal_free_trans(journal, trans, false);
2199                 goto Finish;
2200         }
2201
2202         rc = jbd_trans_write_commit_block(trans);
2203         if (rc != EOK)
2204                 goto Finish;
2205
2206         journal->alloc_trans_id++;
2207
2208         /* Complete the checkpoint of buffers which are revoked. */
2209         RB_FOREACH_SAFE(rec, jbd_revoke_tree, &trans->revoke_root,
2210                         tmp) {
2211                 struct jbd_block_rec *block_rec =
2212                         jbd_trans_block_rec_lookup(journal, rec->lba);
2213                 struct jbd_buf *jbd_buf = NULL;
2214                 if (block_rec)
2215                         jbd_buf = TAILQ_LAST(&block_rec->dirty_buf_queue,
2216                                         jbd_buf_dirty);
2217                 if (jbd_buf) {
2218                         struct ext4_buf *buf;
2219                         struct ext4_block block = EXT4_BLOCK_ZERO();
2220                         /*
2221                          * We do this to reset the ext4_buf::end_write and
2222                          * ext4_buf::end_write_arg fields so that the checkpoint
2223                          * callback won't be triggered again.
2224                          */
2225                         buf = ext4_bcache_find_get(journal->jbd_fs->bdev->bc,
2226                                         &block,
2227                                         jbd_buf->block_rec->lba);
2228                         jbd_trans_end_write(journal->jbd_fs->bdev->bc,
2229                                         buf,
2230                                         EOK,
2231                                         jbd_buf);
2232                         if (buf)
2233                                 ext4_block_set(journal->jbd_fs->bdev, &block);
2234                 }
2235         }
2236
2237         if (TAILQ_EMPTY(&journal->cp_queue)) {
2238                 /*
2239                  * This transaction is going to be the first object in the
2240                  * checkpoint queue.
2241                  * When the first transaction in checkpoint queue is completely
2242                  * written to disk, we shift the tail of the log to right.
2243                  */
2244                 if (trans->data_cnt) {
2245                         journal->start = trans->start_iblock;
2246                         wrap(&journal->jbd_fs->sb, journal->start);
2247                         journal->trans_id = trans->trans_id;
2248                         jbd_journal_write_sb(journal);
2249                         jbd_write_sb(journal->jbd_fs);
2250                         TAILQ_INSERT_TAIL(&journal->cp_queue, trans,
2251                                         trans_node);
2252                         jbd_journal_cp_trans(journal, trans);
2253                 } else {
2254                         journal->start = trans->start_iblock +
2255                                 trans->alloc_blocks;
2256                         wrap(&journal->jbd_fs->sb, journal->start);
2257                         journal->trans_id = trans->trans_id + 1;
2258                         jbd_journal_write_sb(journal);
2259                         jbd_journal_free_trans(journal, trans, false);
2260                 }
2261         } else {
2262                 /* No need to do anything to the JBD superblock. */
2263                 TAILQ_INSERT_TAIL(&journal->cp_queue, trans,
2264                                 trans_node);
2265                 if (trans->data_cnt)
2266                         jbd_journal_cp_trans(journal, trans);
2267         }
2268 Finish:
2269         if (rc != EOK && rc != ENOSPC) {
2270                 journal->last = last;
2271                 jbd_journal_free_trans(journal, trans, true);
2272         }
2273         return rc;
2274 }
2275
2276 /**@brief  Allocate a new transaction
2277  * @param  journal current journal session
2278  * @return transaction allocated*/
2279 struct jbd_trans *
2280 jbd_journal_new_trans(struct jbd_journal *journal)
2281 {
2282         struct jbd_trans *trans = NULL;
2283         trans = ext4_calloc(1, sizeof(struct jbd_trans));
2284         if (!trans)
2285                 return NULL;
2286
2287         /* We will assign a trans_id to this transaction,
2288          * once it has been committed.*/
2289         trans->journal = journal;
2290         trans->data_csum = EXT4_CRC32_INIT;
2291         trans->error = EOK;
2292         TAILQ_INIT(&trans->buf_queue);
2293         return trans;
2294 }
2295
/**@brief  Commit a transaction to the journal immediately.
 *         Public entry point forwarding to __jbd_journal_commit_trans().
 * @param  journal current journal session
 * @param  trans transaction
 * @return standard error code*/
int jbd_journal_commit_trans(struct jbd_journal *journal,
			     struct jbd_trans *trans)
{
	return __jbd_journal_commit_trans(journal, trans);
}
2307
2308 /**
2309  * @}
2310  */