f291722c76d54a18c67b7079760526ca88e99bb4
[lwext4.git] / lwext4 / ext4_journal.c
1 /*
2  * Copyright (c) 2015 Grzegorz Kostka (kostka.grzegorz@gmail.com)
3  * Copyright (c) 2015 Kaho Ng (ngkaho1234@gmail.com)
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * - Redistributions of source code must retain the above copyright
11  *   notice, this list of conditions and the following disclaimer.
12  * - Redistributions in binary form must reproduce the above copyright
13  *   notice, this list of conditions and the following disclaimer in the
14  *   documentation and/or other materials provided with the distribution.
15  * - The name of the author may not be used to endorse or promote products
16  *   derived from this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29
30 /** @addtogroup lwext4
31  * @{
32  */
33 /**
34  * @file  ext4_journal.c
35  * @brief Journal handle functions
36  */
37
38 #include "ext4_config.h"
39 #include "ext4_types.h"
40 #include "ext4_fs.h"
41 #include "ext4_super.h"
42 #include "ext4_journal.h"
43 #include "ext4_errno.h"
44 #include "ext4_blockdev.h"
45 #include "ext4_crc32c.h"
46 #include "ext4_debug.h"
47 #include "tree.h"
48
49 #include <string.h>
50 #include <stdlib.h>
51
/**@brief  Revoke entry during journal replay.
 *         Entries live in the jbd_revoke RB-tree and are ordered
 *         by their block number (see jbd_revoke_entry_cmp).*/
struct revoke_entry {
        /**@brief  Block number not to be replayed.*/
        ext4_fsblk_t block;

        /**@brief  For any transaction id smaller
         *         than trans_id, records of @block
         *         in those transactions should not
         *         be replayed.*/
        uint32_t trans_id;

        /**@brief  Revoke tree node.*/
        RB_ENTRY(revoke_entry) revoke_node;
};
66
/**@brief  Valid journal replay information.*/
struct recover_info {
        /**@brief  Starting transaction id.*/
        uint32_t start_trans_id;

        /**@brief  Ending transaction id.*/
        uint32_t last_trans_id;

        /**@brief  Used as internal argument: the transaction id
         *         recorded into revoke entries when they are added
         *         or updated.*/
        uint32_t this_trans_id;

        /**@brief  RB-Tree storing revoke entries.*/
        RB_HEAD(jbd_revoke, revoke_entry) revoke_root;
};
81
/**@brief  Journal replay internal arguments, handed to the
 *         per-tag replay callback through its opaque argument.*/
struct replay_arg {
        /**@brief  Journal replay information.*/
        struct recover_info *info;

        /**@brief  Current block we are on. The replay callback
         *         increments the pointed-to value once per tag.*/
        uint32_t *this_block;

        /**@brief  Current trans_id we are on.*/
        uint32_t this_trans_id;
};
93
94 static int
95 jbd_revoke_entry_cmp(struct revoke_entry *a, struct revoke_entry *b)
96 {
97         if (a->block > b->block)
98                 return 1;
99         else if (a->block < b->block)
100                 return -1;
101         return 0;
102 }
103
104 static int
105 jbd_block_rec_cmp(struct jbd_block_rec *a, struct jbd_block_rec *b)
106 {
107         if (a->lba > b->lba)
108                 return 1;
109         else if (a->lba < b->lba)
110                 return -1;
111         return 0;
112 }
113
/* Instantiate the red-black tree routines as static inline functions:
 *  - jbd_revoke: tree of struct revoke_entry linked through revoke_node,
 *    ordered by jbd_revoke_entry_cmp;
 *  - jbd_block: tree of struct jbd_block_rec linked through
 *    block_rec_node, ordered by jbd_block_rec_cmp. */
RB_GENERATE_INTERNAL(jbd_revoke, revoke_entry, revoke_node,
                     jbd_revoke_entry_cmp, static inline)
RB_GENERATE_INTERNAL(jbd_block, jbd_block_rec, block_rec_node,
                     jbd_block_rec_cmp, static inline)
118
/* Revoke entries are zero-filled heap objects: allocation goes through
 * calloc() (so it evaluates to NULL on failure) and release through
 * free(). */
#define jbd_alloc_revoke_entry() calloc(1, sizeof(struct revoke_entry))
#define jbd_free_revoke_entry(addr) free(addr)
121
122 /**@brief  Write jbd superblock to disk.
123  * @param  jbd_fs jbd filesystem
124  * @param  s jbd superblock
125  * @return standard error code*/
126 static int jbd_sb_write(struct jbd_fs *jbd_fs, struct jbd_sb *s)
127 {
128         int rc;
129         struct ext4_fs *fs = jbd_fs->inode_ref.fs;
130         uint64_t offset;
131         ext4_fsblk_t fblock;
132         rc = jbd_inode_bmap(jbd_fs, 0, &fblock);
133         if (rc != EOK)
134                 return rc;
135
136         offset = fblock * ext4_sb_get_block_size(&fs->sb);
137         return ext4_block_writebytes(fs->bdev, offset, s,
138                                      EXT4_SUPERBLOCK_SIZE);
139 }
140
141 /**@brief  Read jbd superblock from disk.
142  * @param  jbd_fs jbd filesystem
143  * @param  s jbd superblock
144  * @return standard error code*/
145 static int jbd_sb_read(struct jbd_fs *jbd_fs, struct jbd_sb *s)
146 {
147         int rc;
148         struct ext4_fs *fs = jbd_fs->inode_ref.fs;
149         uint64_t offset;
150         ext4_fsblk_t fblock;
151         rc = jbd_inode_bmap(jbd_fs, 0, &fblock);
152         if (rc != EOK)
153                 return rc;
154
155         offset = fblock * ext4_sb_get_block_size(&fs->sb);
156         return ext4_block_readbytes(fs->bdev, offset, s,
157                                     EXT4_SUPERBLOCK_SIZE);
158 }
159
160 /**@brief  Verify jbd superblock.
161  * @param  sb jbd superblock
162  * @return true if jbd superblock is valid */
163 static bool jbd_verify_sb(struct jbd_sb *sb)
164 {
165         struct jbd_bhdr *header = &sb->header;
166         if (jbd_get32(header, magic) != JBD_MAGIC_NUMBER)
167                 return false;
168
169         if (jbd_get32(header, blocktype) != JBD_SUPERBLOCK &&
170             jbd_get32(header, blocktype) != JBD_SUPERBLOCK_V2)
171                 return false;
172
173         return true;
174 }
175
176 /**@brief  Write back dirty jbd superblock to disk.
177  * @param  jbd_fs jbd filesystem
178  * @return standard error code*/
179 static int jbd_write_sb(struct jbd_fs *jbd_fs)
180 {
181         int rc = EOK;
182         if (jbd_fs->dirty) {
183                 rc = jbd_sb_write(jbd_fs, &jbd_fs->sb);
184                 if (rc != EOK)
185                         return rc;
186
187                 jbd_fs->dirty = false;
188         }
189         return rc;
190 }
191
192 /**@brief  Get reference to jbd filesystem.
193  * @param  fs Filesystem to load journal of
194  * @param  jbd_fs jbd filesystem
195  * @return standard error code*/
196 int jbd_get_fs(struct ext4_fs *fs,
197                struct jbd_fs *jbd_fs)
198 {
199         int rc;
200         uint32_t journal_ino;
201
202         memset(jbd_fs, 0, sizeof(struct jbd_fs));
203         /* See if there is journal inode on this filesystem.*/
204         /* FIXME: detection on existance ofbkejournal bdev is
205          *        missing.*/
206         journal_ino = ext4_get32(&fs->sb, journal_inode_number);
207
208         rc = ext4_fs_get_inode_ref(fs,
209                                    journal_ino,
210                                    &jbd_fs->inode_ref);
211         if (rc != EOK) {
212                 memset(jbd_fs, 0, sizeof(struct jbd_fs));
213                 return rc;
214         }
215         rc = jbd_sb_read(jbd_fs, &jbd_fs->sb);
216         if (rc != EOK) {
217                 memset(jbd_fs, 0, sizeof(struct jbd_fs));
218                 ext4_fs_put_inode_ref(&jbd_fs->inode_ref);
219                 return rc;
220         }
221         if (!jbd_verify_sb(&jbd_fs->sb)) {
222                 memset(jbd_fs, 0, sizeof(struct jbd_fs));
223                 ext4_fs_put_inode_ref(&jbd_fs->inode_ref);
224                 rc = EIO;
225         }
226
227         return rc;
228 }
229
230 /**@brief  Put reference of jbd filesystem.
231  * @param  jbd_fs jbd filesystem
232  * @return standard error code*/
233 int jbd_put_fs(struct jbd_fs *jbd_fs)
234 {
235         int rc = EOK;
236         rc = jbd_write_sb(jbd_fs);
237
238         ext4_fs_put_inode_ref(&jbd_fs->inode_ref);
239         return rc;
240 }
241
242 /**@brief  Data block lookup helper.
243  * @param  jbd_fs jbd filesystem
244  * @param  iblock block index
245  * @param  fblock logical block address
246  * @return standard error code*/
247 int jbd_inode_bmap(struct jbd_fs *jbd_fs,
248                    ext4_lblk_t iblock,
249                    ext4_fsblk_t *fblock)
250 {
251         int rc = ext4_fs_get_inode_dblk_idx(
252                         &jbd_fs->inode_ref,
253                         iblock,
254                         fblock,
255                         false);
256         return rc;
257 }
258
259 /**@brief   jbd block get function (through cache).
260  * @param   jbd_fs jbd filesystem
261  * @param   block block descriptor
262  * @param   fblock jbd logical block address
263  * @return  standard error code*/
264 static int jbd_block_get(struct jbd_fs *jbd_fs,
265                   struct ext4_block *block,
266                   ext4_fsblk_t fblock)
267 {
268         /* TODO: journal device. */
269         int rc;
270         ext4_lblk_t iblock = (ext4_lblk_t)fblock;
271
272         /* Lookup the logical block address of
273          * fblock.*/
274         rc = jbd_inode_bmap(jbd_fs, iblock,
275                             &fblock);
276         if (rc != EOK)
277                 return rc;
278
279         struct ext4_blockdev *bdev = jbd_fs->inode_ref.fs->bdev;
280         rc = ext4_block_get(bdev, block, fblock);
281
282         /* If succeeded, mark buffer as BC_FLUSH to indicate
283          * that data should be written to disk immediately.*/
284         if (rc == EOK)
285                 ext4_bcache_set_flag(block->buf, BC_FLUSH);
286
287         return rc;
288 }
289
290 /**@brief   jbd block get function (through cache, don't read).
291  * @param   jbd_fs jbd filesystem
292  * @param   block block descriptor
293  * @param   fblock jbd logical block address
294  * @return  standard error code*/
295 static int jbd_block_get_noread(struct jbd_fs *jbd_fs,
296                          struct ext4_block *block,
297                          ext4_fsblk_t fblock)
298 {
299         /* TODO: journal device. */
300         int rc;
301         ext4_lblk_t iblock = (ext4_lblk_t)fblock;
302         rc = jbd_inode_bmap(jbd_fs, iblock,
303                             &fblock);
304         if (rc != EOK)
305                 return rc;
306
307         struct ext4_blockdev *bdev = jbd_fs->inode_ref.fs->bdev;
308         rc = ext4_block_get_noread(bdev, block, fblock);
309         if (rc == EOK)
310                 ext4_bcache_set_flag(block->buf, BC_FLUSH);
311
312         return rc;
313 }
314
315 /**@brief   jbd block set procedure (through cache).
316  * @param   jbd_fs jbd filesystem
317  * @param   block block descriptor
318  * @return  standard error code*/
319 static int jbd_block_set(struct jbd_fs *jbd_fs,
320                   struct ext4_block *block)
321 {
322         return ext4_block_set(jbd_fs->inode_ref.fs->bdev,
323                               block);
324 }
325
326 /**@brief  helper functions to calculate
327  *         block tag size, not including UUID part.
328  * @param  jbd_fs jbd filesystem
329  * @return tag size in bytes*/
330 static int jbd_tag_bytes(struct jbd_fs *jbd_fs)
331 {
332         int size;
333
334         /* It is very easy to deal with the case which
335          * JBD_FEATURE_INCOMPAT_CSUM_V3 is enabled.*/
336         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
337                                      JBD_FEATURE_INCOMPAT_CSUM_V3))
338                 return sizeof(struct jbd_block_tag3);
339
340         size = sizeof(struct jbd_block_tag);
341
342         /* If JBD_FEATURE_INCOMPAT_CSUM_V2 is enabled,
343          * add 2 bytes to size.*/
344         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
345                                      JBD_FEATURE_INCOMPAT_CSUM_V2))
346                 size += sizeof(uint16_t);
347
348         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
349                                      JBD_FEATURE_INCOMPAT_64BIT))
350                 return size;
351
352         /* If block number is 4 bytes in size,
353          * minus 4 bytes from size */
354         return size - sizeof(uint32_t);
355 }
356
/**@brief  Tag information: the decoded (or to-be-encoded) form of
 *         one on-disk block tag. */
struct tag_info {
        /**@brief  Tag size in bytes, including UUID part.*/
        int tag_bytes;

        /**@brief  block number stored in this tag.*/
        ext4_fsblk_t block;

        /**@brief  whether UUID part exists or not.*/
        bool uuid_exist;

        /**@brief  UUID content if UUID part exists.
         *         Not filled in by jbd_extract_block_tag() when
         *         uuid_exist is false.*/
        uint8_t uuid[UUID_SIZE];

        /**@brief  Is this the last tag? */
        bool last_tag;
};
374
375 /**@brief  Extract information from a block tag.
376  * @param  __tag pointer to the block tag
377  * @param  tag_bytes block tag size of this jbd filesystem
378  * @param  remaining size in buffer containing the block tag
379  * @param  tag_info information of this tag.
380  * @return  EOK when succeed, otherwise return EINVAL.*/
381 static int
382 jbd_extract_block_tag(struct jbd_fs *jbd_fs,
383                       void *__tag,
384                       int tag_bytes,
385                       int32_t remain_buf_size,
386                       struct tag_info *tag_info)
387 {
388         char *uuid_start;
389         tag_info->tag_bytes = tag_bytes;
390         tag_info->uuid_exist = false;
391         tag_info->last_tag = false;
392
393         /* See whether it is possible to hold a valid block tag.*/
394         if (remain_buf_size - tag_bytes < 0)
395                 return EINVAL;
396
397         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
398                                      JBD_FEATURE_INCOMPAT_CSUM_V3)) {
399                 struct jbd_block_tag3 *tag = __tag;
400                 tag_info->block = jbd_get32(tag, blocknr);
401                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
402                                              JBD_FEATURE_INCOMPAT_64BIT))
403                          tag_info->block |=
404                                  (uint64_t)jbd_get32(tag, blocknr_high) << 32;
405
406                 if (jbd_get32(tag, flags) & JBD_FLAG_ESCAPE)
407                         tag_info->block = 0;
408
409                 if (!(jbd_get32(tag, flags) & JBD_FLAG_SAME_UUID)) {
410                         /* See whether it is possible to hold UUID part.*/
411                         if (remain_buf_size - tag_bytes < UUID_SIZE)
412                                 return EINVAL;
413
414                         uuid_start = (char *)tag + tag_bytes;
415                         tag_info->uuid_exist = true;
416                         tag_info->tag_bytes += UUID_SIZE;
417                         memcpy(tag_info->uuid, uuid_start, UUID_SIZE);
418                 }
419
420                 if (jbd_get32(tag, flags) & JBD_FLAG_LAST_TAG)
421                         tag_info->last_tag = true;
422
423         } else {
424                 struct jbd_block_tag *tag = __tag;
425                 tag_info->block = jbd_get32(tag, blocknr);
426                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
427                                              JBD_FEATURE_INCOMPAT_64BIT))
428                          tag_info->block |=
429                                  (uint64_t)jbd_get32(tag, blocknr_high) << 32;
430
431                 if (jbd_get16(tag, flags) & JBD_FLAG_ESCAPE)
432                         tag_info->block = 0;
433
434                 if (!(jbd_get16(tag, flags) & JBD_FLAG_SAME_UUID)) {
435                         /* See whether it is possible to hold UUID part.*/
436                         if (remain_buf_size - tag_bytes < UUID_SIZE)
437                                 return EINVAL;
438
439                         uuid_start = (char *)tag + tag_bytes;
440                         tag_info->uuid_exist = true;
441                         tag_info->tag_bytes += UUID_SIZE;
442                         memcpy(tag_info->uuid, uuid_start, UUID_SIZE);
443                 }
444
445                 if (jbd_get16(tag, flags) & JBD_FLAG_LAST_TAG)
446                         tag_info->last_tag = true;
447
448         }
449         return EOK;
450 }
451
452 /**@brief  Write information to a block tag.
453  * @param  __tag pointer to the block tag
454  * @param  remaining size in buffer containing the block tag
455  * @param  tag_info information of this tag.
456  * @return  EOK when succeed, otherwise return EINVAL.*/
457 static int
458 jbd_write_block_tag(struct jbd_fs *jbd_fs,
459                     void *__tag,
460                     int32_t remain_buf_size,
461                     struct tag_info *tag_info)
462 {
463         char *uuid_start;
464         int tag_bytes = jbd_tag_bytes(jbd_fs);
465
466         tag_info->tag_bytes = tag_bytes;
467
468         /* See whether it is possible to hold a valid block tag.*/
469         if (remain_buf_size - tag_bytes < 0)
470                 return EINVAL;
471
472         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
473                                      JBD_FEATURE_INCOMPAT_CSUM_V3)) {
474                 struct jbd_block_tag3 *tag = __tag;
475                 memset(tag, 0, sizeof(struct jbd_block_tag3));
476                 jbd_set32(tag, blocknr, tag_info->block);
477                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
478                                              JBD_FEATURE_INCOMPAT_64BIT))
479                         jbd_set32(tag, blocknr_high, tag_info->block >> 32);
480
481                 if (tag_info->uuid_exist) {
482                         /* See whether it is possible to hold UUID part.*/
483                         if (remain_buf_size - tag_bytes < UUID_SIZE)
484                                 return EINVAL;
485
486                         uuid_start = (char *)tag + tag_bytes;
487                         tag_info->tag_bytes += UUID_SIZE;
488                         memcpy(uuid_start, tag_info->uuid, UUID_SIZE);
489                 } else
490                         jbd_set32(tag, flags,
491                                   jbd_get32(tag, flags) | JBD_FLAG_SAME_UUID);
492
493                 if (tag_info->last_tag)
494                         jbd_set32(tag, flags,
495                                   jbd_get32(tag, flags) | JBD_FLAG_LAST_TAG);
496
497         } else {
498                 struct jbd_block_tag *tag = __tag;
499                 memset(tag, 0, sizeof(struct jbd_block_tag));
500                 jbd_set32(tag, blocknr, tag_info->block);
501                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
502                                              JBD_FEATURE_INCOMPAT_64BIT))
503                         jbd_set32(tag, blocknr_high, tag_info->block >> 32);
504
505                 if (tag_info->uuid_exist) {
506                         /* See whether it is possible to hold UUID part.*/
507                         if (remain_buf_size - tag_bytes < UUID_SIZE)
508                                 return EINVAL;
509
510                         uuid_start = (char *)tag + tag_bytes;
511                         tag_info->tag_bytes += UUID_SIZE;
512                         memcpy(uuid_start, tag_info->uuid, UUID_SIZE);
513                 } else
514                         jbd_set16(tag, flags,
515                                   jbd_get16(tag, flags) | JBD_FLAG_SAME_UUID);
516
517                 if (tag_info->last_tag)
518                         jbd_set16(tag, flags,
519                                   jbd_get16(tag, flags) | JBD_FLAG_LAST_TAG);
520
521         }
522         return EOK;
523 }
524
525 /**@brief  Iterate all block tags in a block.
526  * @param  jbd_fs jbd filesystem
527  * @param  __tag_start pointer to the block
528  * @param  tag_tbl_size size of the block
529  * @param  func callback routine to indicate that
530  *         a block tag is found
531  * @param  arg additional argument to be passed to func */
532 static void
533 jbd_iterate_block_table(struct jbd_fs *jbd_fs,
534                         void *__tag_start,
535                         int32_t tag_tbl_size,
536                         void (*func)(struct jbd_fs * jbd_fs,
537                                         ext4_fsblk_t block,
538                                         uint8_t *uuid,
539                                         void *arg),
540                         void *arg)
541 {
542         char *tag_start, *tag_ptr;
543         int tag_bytes = jbd_tag_bytes(jbd_fs);
544         tag_start = __tag_start;
545         tag_ptr = tag_start;
546
547         /* Cut off the size of block tail storing checksum. */
548         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
549                                      JBD_FEATURE_INCOMPAT_CSUM_V2) ||
550             JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
551                                      JBD_FEATURE_INCOMPAT_CSUM_V3))
552                 tag_tbl_size -= sizeof(struct jbd_block_tail);
553
554         while (tag_tbl_size) {
555                 struct tag_info tag_info;
556                 int rc = jbd_extract_block_tag(jbd_fs,
557                                       tag_ptr,
558                                       tag_bytes,
559                                       tag_tbl_size,
560                                       &tag_info);
561                 if (rc != EOK)
562                         break;
563
564                 if (func)
565                         func(jbd_fs, tag_info.block, tag_info.uuid, arg);
566
567                 /* Stop the iteration when we reach the last tag. */
568                 if (tag_info.last_tag)
569                         break;
570
571                 tag_ptr += tag_info.tag_bytes;
572                 tag_tbl_size -= tag_info.tag_bytes;
573         }
574 }
575
576 static void jbd_display_block_tags(struct jbd_fs *jbd_fs,
577                                    ext4_fsblk_t block,
578                                    uint8_t *uuid,
579                                    void *arg)
580 {
581         uint32_t *iblock = arg;
582         ext4_dbg(DEBUG_JBD, "Block in block_tag: %" PRIu64 "\n", block);
583         (*iblock)++;
584         (void)jbd_fs;
585         (void)uuid;
586         return;
587 }
588
589 static struct revoke_entry *
590 jbd_revoke_entry_lookup(struct recover_info *info, ext4_fsblk_t block)
591 {
592         struct revoke_entry tmp = {
593                 .block = block
594         };
595
596         return RB_FIND(jbd_revoke, &info->revoke_root, &tmp);
597 }
598
599 /**@brief  Replay a block in a transaction.
600  * @param  jbd_fs jbd filesystem
601  * @param  block  block address to be replayed.*/
602 static void jbd_replay_block_tags(struct jbd_fs *jbd_fs,
603                                   ext4_fsblk_t block,
604                                   uint8_t *uuid __unused,
605                                   void *__arg)
606 {
607         int r;
608         struct replay_arg *arg = __arg;
609         struct recover_info *info = arg->info;
610         uint32_t *this_block = arg->this_block;
611         struct revoke_entry *revoke_entry;
612         struct ext4_block journal_block, ext4_block;
613         struct ext4_fs *fs = jbd_fs->inode_ref.fs;
614
615         (*this_block)++;
616
617         /* We replay this block only if the current transaction id
618          * is equal or greater than that in revoke entry.*/
619         revoke_entry = jbd_revoke_entry_lookup(info, block);
620         if (revoke_entry &&
621             arg->this_trans_id < revoke_entry->trans_id)
622                 return;
623
624         ext4_dbg(DEBUG_JBD,
625                  "Replaying block in block_tag: %" PRIu64 "\n",
626                  block);
627
628         r = jbd_block_get(jbd_fs, &journal_block, *this_block);
629         if (r != EOK)
630                 return;
631
632         /* We need special treatment for ext4 superblock. */
633         if (block) {
634                 r = ext4_block_get_noread(fs->bdev, &ext4_block, block);
635                 if (r != EOK) {
636                         jbd_block_set(jbd_fs, &journal_block);
637                         return;
638                 }
639
640                 memcpy(ext4_block.data,
641                         journal_block.data,
642                         jbd_get32(&jbd_fs->sb, blocksize));
643
644                 ext4_bcache_set_dirty(ext4_block.buf);
645                 ext4_block_set(fs->bdev, &ext4_block);
646         } else {
647                 uint16_t mount_count, state;
648                 mount_count = ext4_get16(&fs->sb, mount_count);
649                 state = ext4_get16(&fs->sb, state);
650
651                 memcpy(&fs->sb,
652                         journal_block.data + EXT4_SUPERBLOCK_OFFSET,
653                         EXT4_SUPERBLOCK_SIZE);
654
655                 /* Mark system as mounted */
656                 ext4_set16(&fs->sb, state, state);
657                 r = ext4_sb_write(fs->bdev, &fs->sb);
658                 if (r != EOK)
659                         return;
660
661                 /*Update mount count*/
662                 ext4_set16(&fs->sb, mount_count, mount_count);
663         }
664
665         jbd_block_set(jbd_fs, &journal_block);
666         
667         return;
668 }
669
670 /**@brief  Add block address to revoke tree, along with
671  *         its transaction id.
672  * @param  info  journal replay info
673  * @param  block  block address to be replayed.*/
674 static void jbd_add_revoke_block_tags(struct recover_info *info,
675                                       ext4_fsblk_t block)
676 {
677         struct revoke_entry *revoke_entry;
678
679         ext4_dbg(DEBUG_JBD, "Add block %" PRIu64 " to revoke tree\n", block);
680         /* If the revoke entry with respect to the block address
681          * exists already, update its transaction id.*/
682         revoke_entry = jbd_revoke_entry_lookup(info, block);
683         if (revoke_entry) {
684                 revoke_entry->trans_id = info->this_trans_id;
685                 return;
686         }
687
688         revoke_entry = jbd_alloc_revoke_entry();
689         ext4_assert(revoke_entry);
690         revoke_entry->block = block;
691         revoke_entry->trans_id = info->this_trans_id;
692         RB_INSERT(jbd_revoke, &info->revoke_root, revoke_entry);
693
694         return;
695 }
696
697 static void jbd_destroy_revoke_tree(struct recover_info *info)
698 {
699         while (!RB_EMPTY(&info->revoke_root)) {
700                 struct revoke_entry *revoke_entry =
701                         RB_MIN(jbd_revoke, &info->revoke_root);
702                 ext4_assert(revoke_entry);
703                 RB_REMOVE(jbd_revoke, &info->revoke_root, revoke_entry);
704                 jbd_free_revoke_entry(revoke_entry);
705         }
706 }
707
/* Make sure we wrap around the log correctly!
 * If var has reached or passed the superblock's maxlen, pull it back
 * by (maxlen - first). var is evaluated more than once and is
 * assigned to, so it must be a plain lvalue without side effects. */
#define wrap(sb, var)                                           \
do {                                                                    \
        if (var >= jbd_get32((sb), maxlen))                                     \
                var -= (jbd_get32((sb), maxlen) - jbd_get32((sb), first));      \
} while (0)
714
/* Actions selectable for jbd_iterate_log():
 *  - ACTION_SCAN: walk the log without stopping at
 *    info->last_trans_id (used to find where the log ends);
 *  - ACTION_REVOKE: presumably the pass that collects revoke
 *    records - NOTE(review): confirm against jbd_iterate_log();
 *  - ACTION_RECOVER: replay the blocks named by descriptor blocks. */
#define ACTION_SCAN 0
#define ACTION_REVOKE 1
#define ACTION_RECOVER 2
718
719 /**@brief  Add entries in a revoke block to revoke tree.
720  * @param  jbd_fs jbd filesystem
721  * @param  header revoke block header
722  * @param  recover_info  journal replay info*/
723 static void jbd_build_revoke_tree(struct jbd_fs *jbd_fs,
724                                   struct jbd_bhdr *header,
725                                   struct recover_info *info)
726 {
727         char *blocks_entry;
728         struct jbd_revoke_header *revoke_hdr =
729                 (struct jbd_revoke_header *)header;
730         uint32_t i, nr_entries, record_len = 4;
731
732         /* If we are working on a 64bit jbd filesystem, */
733         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
734                                      JBD_FEATURE_INCOMPAT_64BIT))
735                 record_len = 8;
736
737         nr_entries = (jbd_get32(revoke_hdr, count) -
738                         sizeof(struct jbd_revoke_header)) /
739                         record_len;
740
741         blocks_entry = (char *)(revoke_hdr + 1);
742
743         for (i = 0;i < nr_entries;i++) {
744                 if (record_len == 8) {
745                         uint64_t *blocks =
746                                 (uint64_t *)blocks_entry;
747                         jbd_add_revoke_block_tags(info, to_be64(*blocks));
748                 } else {
749                         uint32_t *blocks =
750                                 (uint32_t *)blocks_entry;
751                         jbd_add_revoke_block_tags(info, to_be32(*blocks));
752                 }
753                 blocks_entry += record_len;
754         }
755 }
756
757 static void jbd_debug_descriptor_block(struct jbd_fs *jbd_fs,
758                                        struct jbd_bhdr *header,
759                                        uint32_t *iblock)
760 {
761         jbd_iterate_block_table(jbd_fs,
762                                 header + 1,
763                                 jbd_get32(&jbd_fs->sb, blocksize) -
764                                         sizeof(struct jbd_bhdr),
765                                 jbd_display_block_tags,
766                                 iblock);
767 }
768
769 static void jbd_replay_descriptor_block(struct jbd_fs *jbd_fs,
770                                         struct jbd_bhdr *header,
771                                         struct replay_arg *arg)
772 {
773         jbd_iterate_block_table(jbd_fs,
774                                 header + 1,
775                                 jbd_get32(&jbd_fs->sb, blocksize) -
776                                         sizeof(struct jbd_bhdr),
777                                 jbd_replay_block_tags,
778                                 arg);
779 }
780
781 /**@brief  The core routine of journal replay.
782  * @param  jbd_fs jbd filesystem
783  * @param  recover_info  journal replay info
784  * @param  action action needed to be taken
785  * @return standard error code*/
786 static int jbd_iterate_log(struct jbd_fs *jbd_fs,
787                            struct recover_info *info,
788                            int action)
789 {
790         int r = EOK;
791         bool log_end = false;
792         struct jbd_sb *sb = &jbd_fs->sb;
793         uint32_t start_trans_id, this_trans_id;
794         uint32_t start_block, this_block;
795
796         /* We start iterating valid blocks in the whole journal.*/
797         start_trans_id = this_trans_id = jbd_get32(sb, sequence);
798         start_block = this_block = jbd_get32(sb, start);
799
800         ext4_dbg(DEBUG_JBD, "Start of journal at trans id: %" PRIu32 "\n",
801                             start_trans_id);
802
803         while (!log_end) {
804                 struct ext4_block block;
805                 struct jbd_bhdr *header;
806                 /* If we are not scanning for the last
807                  * valid transaction in the journal,
808                  * we will stop when we reach the end of
809                  * the journal.*/
810                 if (action != ACTION_SCAN)
811                         if (this_trans_id > info->last_trans_id) {
812                                 log_end = true;
813                                 continue;
814                         }
815
816                 r = jbd_block_get(jbd_fs, &block, this_block);
817                 if (r != EOK)
818                         break;
819
820                 header = (struct jbd_bhdr *)block.data;
821                 /* This block does not have a valid magic number,
822                  * so we have reached the end of the journal.*/
823                 if (jbd_get32(header, magic) != JBD_MAGIC_NUMBER) {
824                         jbd_block_set(jbd_fs, &block);
825                         log_end = true;
826                         continue;
827                 }
828
829                 /* If the transaction id we found is not expected,
830                  * we may have reached the end of the journal.
831                  *
832                  * If we are not scanning the journal, something
833                  * bad might have taken place. :-( */
834                 if (jbd_get32(header, sequence) != this_trans_id) {
835                         if (action != ACTION_SCAN)
836                                 r = EIO;
837
838                         jbd_block_set(jbd_fs, &block);
839                         log_end = true;
840                         continue;
841                 }
842
843                 switch (jbd_get32(header, blocktype)) {
844                 case JBD_DESCRIPTOR_BLOCK:
845                         ext4_dbg(DEBUG_JBD, "Descriptor block: %" PRIu32", "
846                                             "trans_id: %" PRIu32"\n",
847                                             this_block, this_trans_id);
848                         if (action == ACTION_RECOVER) {
849                                 struct replay_arg replay_arg;
850                                 replay_arg.info = info;
851                                 replay_arg.this_block = &this_block;
852                                 replay_arg.this_trans_id = this_trans_id;
853
854                                 jbd_replay_descriptor_block(jbd_fs,
855                                                 header, &replay_arg);
856                         } else
857                                 jbd_debug_descriptor_block(jbd_fs,
858                                                 header, &this_block);
859
860                         break;
861                 case JBD_COMMIT_BLOCK:
862                         ext4_dbg(DEBUG_JBD, "Commit block: %" PRIu32", "
863                                             "trans_id: %" PRIu32"\n",
864                                             this_block, this_trans_id);
865                         /* This is the end of a transaction,
866                          * we may now proceed to the next transaction.
867                          */
868                         this_trans_id++;
869                         break;
870                 case JBD_REVOKE_BLOCK:
871                         ext4_dbg(DEBUG_JBD, "Revoke block: %" PRIu32", "
872                                             "trans_id: %" PRIu32"\n",
873                                             this_block, this_trans_id);
874                         if (action == ACTION_REVOKE) {
875                                 info->this_trans_id = this_trans_id;
876                                 jbd_build_revoke_tree(jbd_fs,
877                                                 header, info);
878                         }
879                         break;
880                 default:
881                         log_end = true;
882                         break;
883                 }
884                 jbd_block_set(jbd_fs, &block);
885                 this_block++;
886                 wrap(sb, this_block);
887                 if (this_block == start_block)
888                         log_end = true;
889
890         }
891         ext4_dbg(DEBUG_JBD, "End of journal.\n");
892         if (r == EOK && action == ACTION_SCAN) {
893                 /* We have finished scanning the journal. */
894                 info->start_trans_id = start_trans_id;
895                 if (this_trans_id > start_trans_id)
896                         info->last_trans_id = this_trans_id - 1;
897                 else
898                         info->last_trans_id = this_trans_id;
899         }
900
901         return r;
902 }
903
904 /**@brief  Replay journal.
905  * @param  jbd_fs jbd filesystem
906  * @return standard error code*/
907 int jbd_recover(struct jbd_fs *jbd_fs)
908 {
909         int r;
910         struct recover_info info;
911         struct jbd_sb *sb = &jbd_fs->sb;
912         if (!sb->start)
913                 return EOK;
914
915         RB_INIT(&info.revoke_root);
916
917         r = jbd_iterate_log(jbd_fs, &info, ACTION_SCAN);
918         if (r != EOK)
919                 return r;
920
921         r = jbd_iterate_log(jbd_fs, &info, ACTION_REVOKE);
922         if (r != EOK)
923                 return r;
924
925         r = jbd_iterate_log(jbd_fs, &info, ACTION_RECOVER);
926         if (r == EOK) {
927                 /* If we successfully replay the journal,
928                  * clear EXT4_FINCOM_RECOVER flag on the
929                  * ext4 superblock, and set the start of
930                  * journal to 0.*/
931                 uint32_t features_incompatible =
932                         ext4_get32(&jbd_fs->inode_ref.fs->sb,
933                                    features_incompatible);
934                 jbd_set32(&jbd_fs->sb, start, 0);
935                 features_incompatible &= ~EXT4_FINCOM_RECOVER;
936                 ext4_set32(&jbd_fs->inode_ref.fs->sb,
937                            features_incompatible,
938                            features_incompatible);
939                 jbd_fs->dirty = true;
940                 r = ext4_sb_write(jbd_fs->inode_ref.fs->bdev,
941                                   &jbd_fs->inode_ref.fs->sb);
942         }
943         jbd_destroy_revoke_tree(&info);
944         return r;
945 }
946
947 static void jbd_journal_write_sb(struct jbd_journal *journal)
948 {
949         struct jbd_fs *jbd_fs = journal->jbd_fs;
950         jbd_set32(&jbd_fs->sb, start, journal->start);
951         jbd_set32(&jbd_fs->sb, sequence, journal->trans_id);
952         jbd_fs->dirty = true;
953 }
954
955 /**@brief  Start accessing the journal.
956  * @param  jbd_fs jbd filesystem
957  * @param  journal current journal session
958  * @return standard error code*/
959 int jbd_journal_start(struct jbd_fs *jbd_fs,
960                       struct jbd_journal *journal)
961 {
962         int r;
963         uint32_t features_incompatible =
964                         ext4_get32(&jbd_fs->inode_ref.fs->sb,
965                                    features_incompatible);
966         features_incompatible |= EXT4_FINCOM_RECOVER;
967         ext4_set32(&jbd_fs->inode_ref.fs->sb,
968                         features_incompatible,
969                         features_incompatible);
970         r = ext4_sb_write(jbd_fs->inode_ref.fs->bdev,
971                         &jbd_fs->inode_ref.fs->sb);
972         if (r != EOK)
973                 return r;
974
975         journal->first = jbd_get32(&jbd_fs->sb, first);
976         journal->start = journal->first;
977         journal->last = journal->first;
978         journal->trans_id = 1;
979         journal->alloc_trans_id = 1;
980
981         journal->block_size = jbd_get32(&jbd_fs->sb, blocksize);
982
983         TAILQ_INIT(&journal->trans_queue);
984         TAILQ_INIT(&journal->cp_queue);
985         RB_INIT(&journal->block_rec_root);
986         journal->jbd_fs = jbd_fs;
987         jbd_journal_write_sb(journal);
988         return jbd_write_sb(jbd_fs);
989 }
990
991 static void jbd_journal_flush_trans(struct jbd_trans *trans)
992 {
993         struct jbd_buf *jbd_buf, *tmp;
994         struct jbd_journal *journal = trans->journal;
995         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
996         LIST_FOREACH_SAFE(jbd_buf, &trans->buf_list, buf_node,
997                         tmp) {
998                 struct ext4_block block = jbd_buf->block;
999                 ext4_block_flush_buf(fs->bdev, block.buf);
1000         }
1001 }
1002
1003 static void
1004 jbd_journal_skip_pure_revoke(struct jbd_journal *journal,
1005                              struct jbd_trans *trans)
1006 {
1007         journal->start = trans->start_iblock +
1008                 trans->alloc_blocks;
1009         wrap(&journal->jbd_fs->sb, journal->start);
1010         journal->trans_id = trans->trans_id + 1;
1011         jbd_journal_free_trans(journal,
1012                         trans, false);
1013         jbd_journal_write_sb(journal);
1014 }
1015
1016 static void jbd_journal_flush_all_trans(struct jbd_journal *journal)
1017 {
1018         struct jbd_trans *trans;
1019         while ((trans = TAILQ_FIRST(&journal->cp_queue))) {
1020                 if (!trans->data_cnt) {
1021                         TAILQ_REMOVE(&journal->cp_queue,
1022                                         trans,
1023                                         trans_node);
1024                         jbd_journal_skip_pure_revoke(journal, trans);
1025                 } else
1026                         jbd_journal_flush_trans(trans);
1027
1028         }
1029 }
1030
1031 /**@brief  Stop accessing the journal.
1032  * @param  journal current journal session
1033  * @return standard error code*/
1034 int jbd_journal_stop(struct jbd_journal *journal)
1035 {
1036         int r;
1037         struct jbd_fs *jbd_fs = journal->jbd_fs;
1038         uint32_t features_incompatible;
1039
1040         /* Commit all the transactions to the journal.*/
1041         jbd_journal_commit_all(journal);
1042
1043         /* Make sure that journalled content have reached
1044          * the disk.*/
1045         jbd_journal_flush_all_trans(journal);
1046
1047         /* There should be no block record in this journal
1048          * session. */
1049         if (!RB_EMPTY(&journal->block_rec_root))
1050                 ext4_dbg(DEBUG_JBD,
1051                          DBG_WARN "There are still block records "
1052                                   "in this journal session!\n");
1053
1054         features_incompatible =
1055                 ext4_get32(&jbd_fs->inode_ref.fs->sb,
1056                            features_incompatible);
1057         features_incompatible &= ~EXT4_FINCOM_RECOVER;
1058         ext4_set32(&jbd_fs->inode_ref.fs->sb,
1059                         features_incompatible,
1060                         features_incompatible);
1061         r = ext4_sb_write(jbd_fs->inode_ref.fs->bdev,
1062                         &jbd_fs->inode_ref.fs->sb);
1063         if (r != EOK)
1064                 return r;
1065
1066         journal->start = 0;
1067         journal->trans_id = 0;
1068         jbd_journal_write_sb(journal);
1069         return jbd_write_sb(journal->jbd_fs);
1070 }
1071
1072 /**@brief  Allocate a block in the journal.
1073  * @param  journal current journal session
1074  * @param  trans transaction
1075  * @return allocated block address*/
1076 static uint32_t jbd_journal_alloc_block(struct jbd_journal *journal,
1077                                         struct jbd_trans *trans)
1078 {
1079         uint32_t start_block;
1080
1081         start_block = journal->last++;
1082         trans->alloc_blocks++;
1083         wrap(&journal->jbd_fs->sb, journal->last);
1084         
1085         /* If there is no space left, flush all journalled
1086          * blocks to disk first.*/
1087         if (journal->last == journal->start)
1088                 jbd_journal_flush_all_trans(journal);
1089
1090         return start_block;
1091 }
1092
1093 /**@brief  Allocate a new transaction
1094  * @param  journal current journal session
1095  * @return transaction allocated*/
1096 struct jbd_trans *
1097 jbd_journal_new_trans(struct jbd_journal *journal)
1098 {
1099         struct jbd_trans *trans = calloc(1, sizeof(struct jbd_trans));
1100         if (!trans)
1101                 return NULL;
1102
1103         /* We will assign a trans_id to this transaction,
1104          * once it has been committed.*/
1105         trans->journal = journal;
1106         trans->error = EOK;
1107         return trans;
1108 }
1109
1110 static void jbd_trans_end_write(struct ext4_bcache *bc __unused,
1111                           struct ext4_buf *buf __unused,
1112                           int res,
1113                           void *arg);
1114
1115 /**@brief  gain access to it before making any modications.
1116  * @param  journal current journal session
1117  * @param  trans transaction
1118  * @param  block descriptor
1119  * @return standard error code.*/
1120 int jbd_trans_get_access(struct jbd_journal *journal,
1121                          struct jbd_trans *trans,
1122                          struct ext4_block *block)
1123 {
1124         int r = EOK;
1125         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1126
1127         /* If the buffer has already been modified, we should
1128          * flush dirty data in this buffer to disk.*/
1129         if (ext4_bcache_test_flag(block->buf, BC_DIRTY) &&
1130             block->buf->end_write == jbd_trans_end_write &&
1131             block->buf->end_write_arg != trans) {
1132                 r = ext4_block_flush_buf(fs->bdev, block->buf);
1133         }
1134         return r;
1135 }
1136
1137 static struct jbd_block_rec *
1138 jbd_trans_block_rec_lookup(struct jbd_journal *journal,
1139                            ext4_fsblk_t lba)
1140 {
1141         struct jbd_block_rec tmp = {
1142                 .lba = lba
1143         };
1144
1145         return RB_FIND(jbd_block,
1146                        &journal->block_rec_root,
1147                        &tmp);
1148 }
1149
1150 static inline struct jbd_block_rec *
1151 jbd_trans_insert_block_rec(struct jbd_trans *trans,
1152                            ext4_fsblk_t lba,
1153                            struct ext4_buf *buf)
1154 {
1155         struct jbd_block_rec *block_rec;
1156         block_rec = jbd_trans_block_rec_lookup(trans->journal, lba);
1157         if (block_rec) {
1158                 /* Data should be flushed to disk already. */
1159                 ext4_assert(!block_rec->buf);
1160                 /* Now this block record belongs to this transaction. */
1161                 block_rec->trans = trans;
1162                 return block_rec;
1163         }
1164         block_rec = calloc(1, sizeof(struct jbd_block_rec));
1165         if (!block_rec)
1166                 return NULL;
1167
1168         block_rec->lba = lba;
1169         block_rec->buf = buf;
1170         block_rec->trans = trans;
1171         RB_INSERT(jbd_block, &trans->journal->block_rec_root, block_rec);
1172         return block_rec;
1173 }
1174
1175 static inline void
1176 jbd_trans_remove_block_rec(struct jbd_journal *journal,
1177                            struct jbd_buf *jbd_buf)
1178 {
1179         struct jbd_block_rec *block_rec = jbd_buf->block_rec;
1180         /* If this block record doesn't belong to this transaction,
1181          * give up.*/
1182         if (block_rec->trans == jbd_buf->trans) {
1183                 RB_REMOVE(jbd_block,
1184                                 &journal->block_rec_root,
1185                                 block_rec);
1186                 free(block_rec);
1187         }
1188 }
1189
1190 /**@brief  Add block to a transaction and mark it dirty.
1191  * @param  trans transaction
1192  * @param  block block descriptor
1193  * @return standard error code*/
1194 int jbd_trans_set_block_dirty(struct jbd_trans *trans,
1195                               struct ext4_block *block)
1196 {
1197         struct jbd_buf *buf;
1198
1199         if (!ext4_bcache_test_flag(block->buf, BC_DIRTY) &&
1200             block->buf->end_write != jbd_trans_end_write) {
1201                 struct jbd_block_rec *block_rec;
1202                 buf = calloc(1, sizeof(struct jbd_buf));
1203                 if (!buf)
1204                         return ENOMEM;
1205
1206                 if ((block_rec = jbd_trans_insert_block_rec(trans,
1207                                         block->lb_id,
1208                                         block->buf)) == NULL) {
1209                         free(buf);
1210                         return ENOMEM;
1211                 }
1212
1213                 buf->block_rec = block_rec;
1214                 buf->trans = trans;
1215                 buf->block = *block;
1216                 ext4_bcache_inc_ref(block->buf);
1217
1218                 /* If the content reach the disk, notify us
1219                  * so that we may do a checkpoint. */
1220                 block->buf->end_write = jbd_trans_end_write;
1221                 block->buf->end_write_arg = buf;
1222
1223                 trans->data_cnt++;
1224                 LIST_INSERT_HEAD(&trans->buf_list, buf, buf_node);
1225
1226                 ext4_bcache_set_dirty(block->buf);
1227         }
1228         return EOK;
1229 }
1230
1231 /**@brief  Add block to be revoked to a transaction
1232  * @param  trans transaction
1233  * @param  lba logical block address
1234  * @return standard error code*/
1235 int jbd_trans_revoke_block(struct jbd_trans *trans,
1236                            ext4_fsblk_t lba)
1237 {
1238         struct jbd_revoke_rec *rec =
1239                 calloc(1, sizeof(struct jbd_revoke_rec));
1240         if (!rec)
1241                 return ENOMEM;
1242
1243         rec->lba = lba;
1244         LIST_INSERT_HEAD(&trans->revoke_list, rec, revoke_node);
1245         return EOK;
1246 }
1247
1248 /**@brief  Try to add block to be revoked to a transaction.
1249  *         If @lba still remains in an transaction on checkpoint
1250  *         queue, add @lba as a revoked block to the transaction.
1251  * @param  trans transaction
1252  * @param  lba logical block address
1253  * @return standard error code*/
1254 int jbd_trans_try_revoke_block(struct jbd_trans *trans,
1255                                ext4_fsblk_t lba)
1256 {
1257         int r = EOK;
1258         struct jbd_journal *journal = trans->journal;
1259         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1260         struct jbd_block_rec *block_rec =
1261                 jbd_trans_block_rec_lookup(journal, lba);
1262
1263         /* Make sure we don't flush any buffers belong to this transaction. */
1264         if (block_rec && block_rec->trans != trans) {
1265                 /* If the buffer has not been flushed yet, flush it now. */
1266                 if (block_rec->buf) {
1267                         r = ext4_block_flush_buf(fs->bdev, block_rec->buf);
1268                         if (r != EOK)
1269                                 return r;
1270
1271                 }
1272
1273                 jbd_trans_revoke_block(trans, lba);
1274         }
1275
1276         return EOK;
1277 }
1278
1279 /**@brief  Free a transaction
1280  * @param  journal current journal session
1281  * @param  trans transaction
1282  * @param  abort discard all the modifications on the block?
1283  * @return standard error code*/
1284 void jbd_journal_free_trans(struct jbd_journal *journal,
1285                             struct jbd_trans *trans,
1286                             bool abort)
1287 {
1288         struct jbd_buf *jbd_buf, *tmp;
1289         struct jbd_revoke_rec *rec, *tmp2;
1290         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1291         LIST_FOREACH_SAFE(jbd_buf, &trans->buf_list, buf_node,
1292                           tmp) {
1293                 if (abort) {
1294                         jbd_buf->block.buf->end_write = NULL;
1295                         jbd_buf->block.buf->end_write_arg = NULL;
1296                         ext4_bcache_clear_dirty(jbd_buf->block.buf);
1297                         ext4_block_set(fs->bdev, &jbd_buf->block);
1298                 }
1299
1300                 jbd_trans_remove_block_rec(journal, jbd_buf);
1301                 LIST_REMOVE(jbd_buf, buf_node);
1302                 free(jbd_buf);
1303         }
1304         LIST_FOREACH_SAFE(rec, &trans->revoke_list, revoke_node,
1305                           tmp2) {
1306                 LIST_REMOVE(rec, revoke_node);
1307                 free(rec);
1308         }
1309
1310         free(trans);
1311 }
1312
1313 /**@brief  Write commit block for a transaction
1314  * @param  trans transaction
1315  * @return standard error code*/
1316 static int jbd_trans_write_commit_block(struct jbd_trans *trans)
1317 {
1318         int rc;
1319         struct jbd_commit_header *header;
1320         uint32_t commit_iblock = 0;
1321         struct ext4_block commit_block;
1322         struct jbd_journal *journal = trans->journal;
1323
1324         commit_iblock = jbd_journal_alloc_block(journal, trans);
1325         rc = jbd_block_get_noread(journal->jbd_fs,
1326                         &commit_block, commit_iblock);
1327         if (rc != EOK)
1328                 return rc;
1329
1330         header = (struct jbd_commit_header *)commit_block.data;
1331         jbd_set32(&header->header, magic, JBD_MAGIC_NUMBER);
1332         jbd_set32(&header->header, blocktype, JBD_COMMIT_BLOCK);
1333         jbd_set32(&header->header, sequence, trans->trans_id);
1334
1335         ext4_bcache_set_dirty(commit_block.buf);
1336         rc = jbd_block_set(journal->jbd_fs, &commit_block);
1337         if (rc != EOK)
1338                 return rc;
1339
1340         return EOK;
1341 }
1342
1343 /**@brief  Write descriptor block for a transaction
1344  * @param  journal current journal session
1345  * @param  trans transaction
1346  * @return standard error code*/
1347 static int jbd_journal_prepare(struct jbd_journal *journal,
1348                                struct jbd_trans *trans)
1349 {
1350         int rc = EOK, i = 0;
1351         int32_t tag_tbl_size;
1352         uint32_t desc_iblock = 0;
1353         uint32_t data_iblock = 0;
1354         char *tag_start = NULL, *tag_ptr = NULL;
1355         struct jbd_buf *jbd_buf, *tmp;
1356         struct ext4_block desc_block, data_block;
1357         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1358
1359         LIST_FOREACH_SAFE(jbd_buf, &trans->buf_list, buf_node, tmp) {
1360                 struct tag_info tag_info;
1361                 bool uuid_exist = false;
1362                 if (!ext4_bcache_test_flag(jbd_buf->block.buf,
1363                                            BC_DIRTY)) {
1364                         /* The buffer has not been modified, just release
1365                          * that jbd_buf. */
1366                         jbd_buf->block.buf->end_write = NULL;
1367                         jbd_buf->block.buf->end_write_arg = NULL;
1368                         ext4_block_set(fs->bdev, &jbd_buf->block);
1369                         LIST_REMOVE(jbd_buf, buf_node);
1370                         free(jbd_buf);
1371                         continue;
1372                 }
1373 again:
1374                 if (!desc_iblock) {
1375                         struct jbd_bhdr *bhdr;
1376                         desc_iblock = jbd_journal_alloc_block(journal, trans);
1377                         rc = jbd_block_get_noread(journal->jbd_fs,
1378                                            &desc_block, desc_iblock);
1379                         if (rc != EOK)
1380                                 break;
1381
1382                         ext4_bcache_set_dirty(desc_block.buf);
1383
1384                         bhdr = (struct jbd_bhdr *)desc_block.data;
1385                         jbd_set32(bhdr, magic, JBD_MAGIC_NUMBER);
1386                         jbd_set32(bhdr, blocktype, JBD_DESCRIPTOR_BLOCK);
1387                         jbd_set32(bhdr, sequence, trans->trans_id);
1388
1389                         tag_start = (char *)(bhdr + 1);
1390                         tag_ptr = tag_start;
1391                         uuid_exist = true;
1392                         tag_tbl_size = journal->block_size -
1393                                 sizeof(struct jbd_bhdr);
1394
1395                         if (!trans->start_iblock)
1396                                 trans->start_iblock = desc_iblock;
1397
1398                 }
1399                 tag_info.block = jbd_buf->block.lb_id;
1400                 tag_info.uuid_exist = uuid_exist;
1401                 if (i == trans->data_cnt - 1)
1402                         tag_info.last_tag = true;
1403
1404                 if (uuid_exist)
1405                         memcpy(tag_info.uuid, journal->jbd_fs->sb.uuid,
1406                                         UUID_SIZE);
1407
1408                 rc = jbd_write_block_tag(journal->jbd_fs,
1409                                 tag_ptr,
1410                                 tag_tbl_size,
1411                                 &tag_info);
1412                 if (rc != EOK) {
1413                         jbd_block_set(journal->jbd_fs, &desc_block);
1414                         desc_iblock = 0;
1415                         goto again;
1416                 }
1417
1418                 data_iblock = jbd_journal_alloc_block(journal, trans);
1419                 rc = jbd_block_get_noread(journal->jbd_fs,
1420                                 &data_block, data_iblock);
1421                 if (rc != EOK)
1422                         break;
1423
1424                 ext4_bcache_set_dirty(data_block.buf);
1425
1426                 memcpy(data_block.data, jbd_buf->block.data,
1427                         journal->block_size);
1428
1429                 rc = jbd_block_set(journal->jbd_fs, &data_block);
1430                 if (rc != EOK)
1431                         break;
1432
1433                 tag_ptr += tag_info.tag_bytes;
1434                 tag_tbl_size -= tag_info.tag_bytes;
1435
1436                 i++;
1437         }
1438         if (rc == EOK && desc_iblock)
1439                 jbd_block_set(journal->jbd_fs, &desc_block);
1440
1441         return rc;
1442 }
1443
1444 /**@brief  Write revoke block for a transaction
1445  * @param  journal current journal session
1446  * @param  trans transaction
1447  * @return standard error code*/
1448 static int
1449 jbd_journal_prepare_revoke(struct jbd_journal *journal,
1450                            struct jbd_trans *trans)
1451 {
1452         int rc = EOK, i = 0;
1453         int32_t tag_tbl_size;
1454         uint32_t desc_iblock = 0;
1455         char *blocks_entry = NULL;
1456         struct jbd_revoke_rec *rec, *tmp;
1457         struct ext4_block desc_block;
1458         struct jbd_revoke_header *header = NULL;
1459         int32_t record_len = 4;
1460
1461         if (JBD_HAS_INCOMPAT_FEATURE(&journal->jbd_fs->sb,
1462                                      JBD_FEATURE_INCOMPAT_64BIT))
1463                 record_len = 8;
1464
1465         LIST_FOREACH_SAFE(rec, &trans->revoke_list, revoke_node,
1466                           tmp) {
1467 again:
1468                 if (!desc_iblock) {
1469                         struct jbd_bhdr *bhdr;
1470                         desc_iblock = jbd_journal_alloc_block(journal, trans);
1471                         rc = jbd_block_get_noread(journal->jbd_fs,
1472                                            &desc_block, desc_iblock);
1473                         if (rc != EOK) {
1474                                 break;
1475                         }
1476
1477                         ext4_bcache_set_dirty(desc_block.buf);
1478
1479                         bhdr = (struct jbd_bhdr *)desc_block.data;
1480                         jbd_set32(bhdr, magic, JBD_MAGIC_NUMBER);
1481                         jbd_set32(bhdr, blocktype, JBD_REVOKE_BLOCK);
1482                         jbd_set32(bhdr, sequence, trans->trans_id);
1483                         
1484                         header = (struct jbd_revoke_header *)bhdr;
1485                         blocks_entry = (char *)(header + 1);
1486                         tag_tbl_size = journal->block_size -
1487                                 sizeof(struct jbd_revoke_header);
1488
1489                         if (!trans->start_iblock)
1490                                 trans->start_iblock = desc_iblock;
1491
1492                 }
1493
1494                 if (tag_tbl_size < record_len) {
1495                         jbd_set32(header, count,
1496                                   journal->block_size - tag_tbl_size);
1497                         jbd_block_set(journal->jbd_fs, &desc_block);
1498                         desc_iblock = 0;
1499                         header = NULL;
1500                         goto again;
1501                 }
1502                 if (record_len == 8) {
1503                         uint64_t *blocks =
1504                                 (uint64_t *)blocks_entry;
1505                         *blocks = to_be64(rec->lba);
1506                 } else {
1507                         uint32_t *blocks =
1508                                 (uint32_t *)blocks_entry;
1509                         *blocks = to_be32(rec->lba);
1510                 }
1511                 blocks_entry += record_len;
1512                 tag_tbl_size -= record_len;
1513
1514                 i++;
1515         }
1516         if (rc == EOK && desc_iblock) {
1517                 if (header != NULL)
1518                         jbd_set32(header, count,
1519                                   journal->block_size - tag_tbl_size);
1520
1521                 jbd_block_set(journal->jbd_fs, &desc_block);
1522         }
1523
1524         return rc;
1525 }
1526
1527 /**@brief  Submit the transaction to transaction queue.
1528  * @param  journal current journal session
1529  * @param  trans transaction*/
1530 void
1531 jbd_journal_submit_trans(struct jbd_journal *journal,
1532                          struct jbd_trans *trans)
1533 {
1534         TAILQ_INSERT_TAIL(&journal->trans_queue,
1535                           trans,
1536                           trans_node);
1537 }
1538
1539 /**@brief  Put references of block descriptors in a transaction.
1540  * @param  journal current journal session
1541  * @param  trans transaction*/
1542 void jbd_journal_cp_trans(struct jbd_journal *journal, struct jbd_trans *trans)
1543 {
1544         struct jbd_buf *jbd_buf, *tmp;
1545         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1546         LIST_FOREACH_SAFE(jbd_buf, &trans->buf_list, buf_node,
1547                         tmp) {
1548                 struct ext4_block block = jbd_buf->block;
1549                 ext4_block_set(fs->bdev, &block);
1550         }
1551 }
1552
1553 /**@brief  Update the start block of the journal when
1554  *         all the contents in a transaction reach the disk.*/
1555 static void jbd_trans_end_write(struct ext4_bcache *bc __unused,
1556                           struct ext4_buf *buf,
1557                           int res,
1558                           void *arg)
1559 {
1560         struct jbd_buf *jbd_buf = arg;
1561         struct jbd_trans *trans = jbd_buf->trans;
1562         struct jbd_journal *journal = trans->journal;
1563         bool first_in_queue =
1564                 trans == TAILQ_FIRST(&journal->cp_queue);
1565         if (res != EOK)
1566                 trans->error = res;
1567
1568         LIST_REMOVE(jbd_buf, buf_node);
1569         jbd_buf->block_rec->buf = NULL;
1570         jbd_trans_remove_block_rec(journal, jbd_buf);
1571         free(jbd_buf);
1572
1573         /* Clear the end_write and end_write_arg fields. */
1574         buf->end_write = NULL;
1575         buf->end_write_arg = NULL;
1576
1577         trans->written_cnt++;
1578         if (trans->written_cnt == trans->data_cnt) {
1579                 TAILQ_REMOVE(&journal->cp_queue, trans, trans_node);
1580
1581                 if (first_in_queue) {
1582                         journal->start = trans->start_iblock +
1583                                 trans->alloc_blocks;
1584                         wrap(&journal->jbd_fs->sb, journal->start);
1585                         journal->trans_id = trans->trans_id + 1;
1586                 }
1587                 jbd_journal_free_trans(journal, trans, false);
1588
1589                 if (first_in_queue) {
1590                         while ((trans = TAILQ_FIRST(&journal->cp_queue))) {
1591                                 if (!trans->data_cnt) {
1592                                         TAILQ_REMOVE(&journal->cp_queue,
1593                                                      trans,
1594                                                      trans_node);
1595                                         jbd_journal_skip_pure_revoke(journal,
1596                                                                      trans);
1597                                 } else {
1598                                         journal->start = trans->start_iblock;
1599                                         wrap(&journal->jbd_fs->sb, journal->start);
1600                                         journal->trans_id = trans->trans_id;
1601                                         break;
1602                                 }
1603                         }
1604                         jbd_journal_write_sb(journal);
1605                         jbd_write_sb(journal->jbd_fs);
1606                 }
1607         }
1608 }
1609
1610 /**@brief  Commit a transaction to the journal immediately.
1611  * @param  journal current journal session
1612  * @param  trans transaction
1613  * @return standard error code*/
1614 int jbd_journal_commit_trans(struct jbd_journal *journal,
1615                              struct jbd_trans *trans)
1616 {
1617         int rc = EOK;
1618         uint32_t last = journal->last;
1619
1620         trans->trans_id = journal->alloc_trans_id;
1621         rc = jbd_journal_prepare(journal, trans);
1622         if (rc != EOK)
1623                 goto Finish;
1624
1625         rc = jbd_journal_prepare_revoke(journal, trans);
1626         if (rc != EOK)
1627                 goto Finish;
1628
1629         if (LIST_EMPTY(&trans->buf_list) &&
1630             LIST_EMPTY(&trans->revoke_list)) {
1631                 /* Since there are no entries in both buffer list
1632                  * and revoke entry list, we do not consider trans as
1633                  * complete transaction and just return EOK.*/
1634                 jbd_journal_free_trans(journal, trans, false);
1635                 goto Finish;
1636         }
1637
1638         rc = jbd_trans_write_commit_block(trans);
1639         if (rc != EOK)
1640                 goto Finish;
1641
1642         journal->alloc_trans_id++;
1643         if (TAILQ_EMPTY(&journal->cp_queue)) {
1644                 if (trans->data_cnt) {
1645                         journal->start = trans->start_iblock;
1646                         wrap(&journal->jbd_fs->sb, journal->start);
1647                         journal->trans_id = trans->trans_id;
1648                         jbd_journal_write_sb(journal);
1649                         jbd_write_sb(journal->jbd_fs);
1650                         TAILQ_INSERT_TAIL(&journal->cp_queue, trans,
1651                                         trans_node);
1652                         jbd_journal_cp_trans(journal, trans);
1653                 } else {
1654                         journal->start = trans->start_iblock +
1655                                 trans->alloc_blocks;
1656                         wrap(&journal->jbd_fs->sb, journal->start);
1657                         journal->trans_id = trans->trans_id + 1;
1658                         jbd_journal_write_sb(journal);
1659                         jbd_journal_free_trans(journal, trans, false);
1660                 }
1661         } else {
1662                 TAILQ_INSERT_TAIL(&journal->cp_queue, trans,
1663                                 trans_node);
1664                 if (trans->data_cnt)
1665                         jbd_journal_cp_trans(journal, trans);
1666
1667         }
1668 Finish:
1669         if (rc != EOK) {
1670                 journal->last = last;
1671                 jbd_journal_free_trans(journal, trans, true);
1672         }
1673         return rc;
1674 }
1675
1676 /**@brief  Commit one transaction on transaction queue
1677  *         to the journal.
1678  * @param  journal current journal session.*/
1679 void jbd_journal_commit_one(struct jbd_journal *journal)
1680 {
1681         struct jbd_trans *trans;
1682
1683         if ((trans = TAILQ_FIRST(&journal->trans_queue))) {
1684                 TAILQ_REMOVE(&journal->trans_queue, trans, trans_node);
1685                 jbd_journal_commit_trans(journal, trans);
1686         }
1687 }
1688
1689 /**@brief  Commit all the transactions on transaction queue
1690  *         to the journal.
1691  * @param  journal current journal session.*/
1692 void jbd_journal_commit_all(struct jbd_journal *journal)
1693 {
1694         while (!TAILQ_EMPTY(&journal->trans_queue)) {
1695                 jbd_journal_commit_one(journal);
1696         }
1697 }
1698
1699 /**
1700  * @}
1701  */