ext4_journal: improve transaction handling
[lwext4.git] / lwext4 / ext4_journal.c
1 /*
2  * Copyright (c) 2015 Grzegorz Kostka (kostka.grzegorz@gmail.com)
3  * Copyright (c) 2015 Kaho Ng (ngkaho1234@gmail.com)
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * - Redistributions of source code must retain the above copyright
11  *   notice, this list of conditions and the following disclaimer.
12  * - Redistributions in binary form must reproduce the above copyright
13  *   notice, this list of conditions and the following disclaimer in the
14  *   documentation and/or other materials provided with the distribution.
15  * - The name of the author may not be used to endorse or promote products
16  *   derived from this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29
30 /** @addtogroup lwext4
31  * @{
32  */
33 /**
34  * @file  ext4_journal.c
35  * @brief Journal handle functions
36  */
37
38 #include "ext4_config.h"
39 #include "ext4_types.h"
40 #include "ext4_fs.h"
41 #include "ext4_super.h"
42 #include "ext4_journal.h"
43 #include "ext4_errno.h"
44 #include "ext4_blockdev.h"
45 #include "ext4_crc32c.h"
46 #include "ext4_debug.h"
47
48 #include <string.h>
49 #include <stdlib.h>
50
51 /**@brief  Revoke entry during journal replay.*/
52 struct revoke_entry {
53         /**@brief  Block number not to be replayed.*/
54         ext4_fsblk_t block;
55
56         /**@brief  For any transaction id smaller
57          *         than trans_id, records of @block
58          *         in those transactions should not
59          *         be replayed.*/
60         uint32_t trans_id;
61
62         /**@brief  Revoke tree node.*/
63         RB_ENTRY(revoke_entry) revoke_node;
64 };
65
66 /**@brief  Valid journal replay information.*/
67 struct recover_info {
68         /**@brief  Starting transaction id.*/
69         uint32_t start_trans_id;
70
71         /**@brief  Ending transaction id.*/
72         uint32_t last_trans_id;
73
74         /**@brief  Used as internal argument.*/
75         uint32_t this_trans_id;
76
77         /**@brief  RB-Tree storing revoke entries.*/
78         RB_HEAD(jbd_revoke, revoke_entry) revoke_root;
79 };
80
/**@brief  Argument bundle handed to the per-tag replay callback.*/
struct replay_arg {
	/**@brief  Replay state (revoke tree, transaction range).*/
	struct recover_info *info;

	/**@brief  Journal block cursor; advanced by the callback
	 *         for every tag it visits.*/
	uint32_t *this_block;

	/**@brief  Id of the transaction being replayed.*/
	uint32_t this_trans_id;
};
92
93 static int
94 jbd_revoke_entry_cmp(struct revoke_entry *a, struct revoke_entry *b)
95 {
96         if (a->block > b->block)
97                 return 1;
98         else if (a->block < b->block)
99                 return -1;
100         return 0;
101 }
102
103 static int
104 jbd_block_rec_cmp(struct jbd_block_rec *a, struct jbd_block_rec *b)
105 {
106         if (a->lba > b->lba)
107                 return 1;
108         else if (a->lba < b->lba)
109                 return -1;
110         return 0;
111 }
112
113 RB_GENERATE_INTERNAL(jbd_revoke, revoke_entry, revoke_node,
114                      jbd_revoke_entry_cmp, static inline)
115 RB_GENERATE_INTERNAL(jbd_block, jbd_block_rec, block_rec_node,
116                      jbd_block_rec_cmp, static inline)
117
118 #define jbd_alloc_revoke_entry() calloc(1, sizeof(struct revoke_entry))
119 #define jbd_free_revoke_entry(addr) free(addr)
120
121 /**@brief  Write jbd superblock to disk.
122  * @param  jbd_fs jbd filesystem
123  * @param  s jbd superblock
124  * @return standard error code*/
125 static int jbd_sb_write(struct jbd_fs *jbd_fs, struct jbd_sb *s)
126 {
127         int rc;
128         struct ext4_fs *fs = jbd_fs->inode_ref.fs;
129         uint64_t offset;
130         ext4_fsblk_t fblock;
131         rc = jbd_inode_bmap(jbd_fs, 0, &fblock);
132         if (rc != EOK)
133                 return rc;
134
135         offset = fblock * ext4_sb_get_block_size(&fs->sb);
136         return ext4_block_writebytes(fs->bdev, offset, s,
137                                      EXT4_SUPERBLOCK_SIZE);
138 }
139
140 /**@brief  Read jbd superblock from disk.
141  * @param  jbd_fs jbd filesystem
142  * @param  s jbd superblock
143  * @return standard error code*/
144 static int jbd_sb_read(struct jbd_fs *jbd_fs, struct jbd_sb *s)
145 {
146         int rc;
147         struct ext4_fs *fs = jbd_fs->inode_ref.fs;
148         uint64_t offset;
149         ext4_fsblk_t fblock;
150         rc = jbd_inode_bmap(jbd_fs, 0, &fblock);
151         if (rc != EOK)
152                 return rc;
153
154         offset = fblock * ext4_sb_get_block_size(&fs->sb);
155         return ext4_block_readbytes(fs->bdev, offset, s,
156                                     EXT4_SUPERBLOCK_SIZE);
157 }
158
159 /**@brief  Verify jbd superblock.
160  * @param  sb jbd superblock
161  * @return true if jbd superblock is valid */
162 static bool jbd_verify_sb(struct jbd_sb *sb)
163 {
164         struct jbd_bhdr *header = &sb->header;
165         if (jbd_get32(header, magic) != JBD_MAGIC_NUMBER)
166                 return false;
167
168         if (jbd_get32(header, blocktype) != JBD_SUPERBLOCK &&
169             jbd_get32(header, blocktype) != JBD_SUPERBLOCK_V2)
170                 return false;
171
172         return true;
173 }
174
175 /**@brief  Write back dirty jbd superblock to disk.
176  * @param  jbd_fs jbd filesystem
177  * @return standard error code*/
178 static int jbd_write_sb(struct jbd_fs *jbd_fs)
179 {
180         int rc = EOK;
181         if (jbd_fs->dirty) {
182                 rc = jbd_sb_write(jbd_fs, &jbd_fs->sb);
183                 if (rc != EOK)
184                         return rc;
185
186                 jbd_fs->dirty = false;
187         }
188         return rc;
189 }
190
191 /**@brief  Get reference to jbd filesystem.
192  * @param  fs Filesystem to load journal of
193  * @param  jbd_fs jbd filesystem
194  * @return standard error code*/
195 int jbd_get_fs(struct ext4_fs *fs,
196                struct jbd_fs *jbd_fs)
197 {
198         int rc;
199         uint32_t journal_ino;
200
201         memset(jbd_fs, 0, sizeof(struct jbd_fs));
202         /* See if there is journal inode on this filesystem.*/
203         /* FIXME: detection on existance ofbkejournal bdev is
204          *        missing.*/
205         journal_ino = ext4_get32(&fs->sb, journal_inode_number);
206
207         rc = ext4_fs_get_inode_ref(fs,
208                                    journal_ino,
209                                    &jbd_fs->inode_ref);
210         if (rc != EOK) {
211                 memset(jbd_fs, 0, sizeof(struct jbd_fs));
212                 return rc;
213         }
214         rc = jbd_sb_read(jbd_fs, &jbd_fs->sb);
215         if (rc != EOK) {
216                 memset(jbd_fs, 0, sizeof(struct jbd_fs));
217                 ext4_fs_put_inode_ref(&jbd_fs->inode_ref);
218                 return rc;
219         }
220         if (!jbd_verify_sb(&jbd_fs->sb)) {
221                 memset(jbd_fs, 0, sizeof(struct jbd_fs));
222                 ext4_fs_put_inode_ref(&jbd_fs->inode_ref);
223                 rc = EIO;
224         }
225
226         return rc;
227 }
228
229 /**@brief  Put reference of jbd filesystem.
230  * @param  jbd_fs jbd filesystem
231  * @return standard error code*/
232 int jbd_put_fs(struct jbd_fs *jbd_fs)
233 {
234         int rc = EOK;
235         rc = jbd_write_sb(jbd_fs);
236
237         ext4_fs_put_inode_ref(&jbd_fs->inode_ref);
238         return rc;
239 }
240
241 /**@brief  Data block lookup helper.
242  * @param  jbd_fs jbd filesystem
243  * @param  iblock block index
244  * @param  fblock logical block address
245  * @return standard error code*/
246 int jbd_inode_bmap(struct jbd_fs *jbd_fs,
247                    ext4_lblk_t iblock,
248                    ext4_fsblk_t *fblock)
249 {
250         int rc = ext4_fs_get_inode_dblk_idx(
251                         &jbd_fs->inode_ref,
252                         iblock,
253                         fblock,
254                         false);
255         return rc;
256 }
257
258 /**@brief   jbd block get function (through cache).
259  * @param   jbd_fs jbd filesystem
260  * @param   block block descriptor
261  * @param   fblock jbd logical block address
262  * @return  standard error code*/
263 static int jbd_block_get(struct jbd_fs *jbd_fs,
264                   struct ext4_block *block,
265                   ext4_fsblk_t fblock)
266 {
267         /* TODO: journal device. */
268         int rc;
269         ext4_lblk_t iblock = (ext4_lblk_t)fblock;
270
271         /* Lookup the logical block address of
272          * fblock.*/
273         rc = jbd_inode_bmap(jbd_fs, iblock,
274                             &fblock);
275         if (rc != EOK)
276                 return rc;
277
278         struct ext4_blockdev *bdev = jbd_fs->inode_ref.fs->bdev;
279         rc = ext4_block_get(bdev, block, fblock);
280
281         /* If succeeded, mark buffer as BC_FLUSH to indicate
282          * that data should be written to disk immediately.*/
283         if (rc == EOK)
284                 ext4_bcache_set_flag(block->buf, BC_FLUSH);
285
286         return rc;
287 }
288
289 /**@brief   jbd block get function (through cache, don't read).
290  * @param   jbd_fs jbd filesystem
291  * @param   block block descriptor
292  * @param   fblock jbd logical block address
293  * @return  standard error code*/
294 static int jbd_block_get_noread(struct jbd_fs *jbd_fs,
295                          struct ext4_block *block,
296                          ext4_fsblk_t fblock)
297 {
298         /* TODO: journal device. */
299         int rc;
300         ext4_lblk_t iblock = (ext4_lblk_t)fblock;
301         rc = jbd_inode_bmap(jbd_fs, iblock,
302                             &fblock);
303         if (rc != EOK)
304                 return rc;
305
306         struct ext4_blockdev *bdev = jbd_fs->inode_ref.fs->bdev;
307         rc = ext4_block_get_noread(bdev, block, fblock);
308         if (rc == EOK)
309                 ext4_bcache_set_flag(block->buf, BC_FLUSH);
310
311         return rc;
312 }
313
314 /**@brief   jbd block set procedure (through cache).
315  * @param   jbd_fs jbd filesystem
316  * @param   block block descriptor
317  * @return  standard error code*/
318 static int jbd_block_set(struct jbd_fs *jbd_fs,
319                   struct ext4_block *block)
320 {
321         return ext4_block_set(jbd_fs->inode_ref.fs->bdev,
322                               block);
323 }
324
325 /**@brief  helper functions to calculate
326  *         block tag size, not including UUID part.
327  * @param  jbd_fs jbd filesystem
328  * @return tag size in bytes*/
329 static int jbd_tag_bytes(struct jbd_fs *jbd_fs)
330 {
331         int size;
332
333         /* It is very easy to deal with the case which
334          * JBD_FEATURE_INCOMPAT_CSUM_V3 is enabled.*/
335         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
336                                      JBD_FEATURE_INCOMPAT_CSUM_V3))
337                 return sizeof(struct jbd_block_tag3);
338
339         size = sizeof(struct jbd_block_tag);
340
341         /* If JBD_FEATURE_INCOMPAT_CSUM_V2 is enabled,
342          * add 2 bytes to size.*/
343         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
344                                      JBD_FEATURE_INCOMPAT_CSUM_V2))
345                 size += sizeof(uint16_t);
346
347         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
348                                      JBD_FEATURE_INCOMPAT_64BIT))
349                 return size;
350
351         /* If block number is 4 bytes in size,
352          * minus 4 bytes from size */
353         return size - sizeof(uint32_t);
354 }
355
356 /**@brief  Tag information. */
357 struct tag_info {
358         /**@brief  Tag size in bytes, including UUID part.*/
359         int tag_bytes;
360
361         /**@brief  block number stored in this tag.*/
362         ext4_fsblk_t block;
363
364         /**@brief  whether UUID part exists or not.*/
365         bool uuid_exist;
366
367         /**@brief  UUID content if UUID part exists.*/
368         uint8_t uuid[UUID_SIZE];
369
370         /**@brief  Is this the last tag? */
371         bool last_tag;
372 };
373
374 /**@brief  Extract information from a block tag.
375  * @param  __tag pointer to the block tag
376  * @param  tag_bytes block tag size of this jbd filesystem
377  * @param  remaining size in buffer containing the block tag
378  * @param  tag_info information of this tag.
379  * @return  EOK when succeed, otherwise return EINVAL.*/
380 static int
381 jbd_extract_block_tag(struct jbd_fs *jbd_fs,
382                       void *__tag,
383                       int tag_bytes,
384                       int32_t remain_buf_size,
385                       struct tag_info *tag_info)
386 {
387         char *uuid_start;
388         tag_info->tag_bytes = tag_bytes;
389         tag_info->uuid_exist = false;
390         tag_info->last_tag = false;
391
392         /* See whether it is possible to hold a valid block tag.*/
393         if (remain_buf_size - tag_bytes < 0)
394                 return EINVAL;
395
396         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
397                                      JBD_FEATURE_INCOMPAT_CSUM_V3)) {
398                 struct jbd_block_tag3 *tag = __tag;
399                 tag_info->block = jbd_get32(tag, blocknr);
400                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
401                                              JBD_FEATURE_INCOMPAT_64BIT))
402                          tag_info->block |=
403                                  (uint64_t)jbd_get32(tag, blocknr_high) << 32;
404
405                 if (jbd_get32(tag, flags) & JBD_FLAG_ESCAPE)
406                         tag_info->block = 0;
407
408                 if (!(jbd_get32(tag, flags) & JBD_FLAG_SAME_UUID)) {
409                         /* See whether it is possible to hold UUID part.*/
410                         if (remain_buf_size - tag_bytes < UUID_SIZE)
411                                 return EINVAL;
412
413                         uuid_start = (char *)tag + tag_bytes;
414                         tag_info->uuid_exist = true;
415                         tag_info->tag_bytes += UUID_SIZE;
416                         memcpy(tag_info->uuid, uuid_start, UUID_SIZE);
417                 }
418
419                 if (jbd_get32(tag, flags) & JBD_FLAG_LAST_TAG)
420                         tag_info->last_tag = true;
421
422         } else {
423                 struct jbd_block_tag *tag = __tag;
424                 tag_info->block = jbd_get32(tag, blocknr);
425                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
426                                              JBD_FEATURE_INCOMPAT_64BIT))
427                          tag_info->block |=
428                                  (uint64_t)jbd_get32(tag, blocknr_high) << 32;
429
430                 if (jbd_get16(tag, flags) & JBD_FLAG_ESCAPE)
431                         tag_info->block = 0;
432
433                 if (!(jbd_get16(tag, flags) & JBD_FLAG_SAME_UUID)) {
434                         /* See whether it is possible to hold UUID part.*/
435                         if (remain_buf_size - tag_bytes < UUID_SIZE)
436                                 return EINVAL;
437
438                         uuid_start = (char *)tag + tag_bytes;
439                         tag_info->uuid_exist = true;
440                         tag_info->tag_bytes += UUID_SIZE;
441                         memcpy(tag_info->uuid, uuid_start, UUID_SIZE);
442                 }
443
444                 if (jbd_get16(tag, flags) & JBD_FLAG_LAST_TAG)
445                         tag_info->last_tag = true;
446
447         }
448         return EOK;
449 }
450
451 /**@brief  Write information to a block tag.
452  * @param  __tag pointer to the block tag
453  * @param  remaining size in buffer containing the block tag
454  * @param  tag_info information of this tag.
455  * @return  EOK when succeed, otherwise return EINVAL.*/
456 static int
457 jbd_write_block_tag(struct jbd_fs *jbd_fs,
458                     void *__tag,
459                     int32_t remain_buf_size,
460                     struct tag_info *tag_info)
461 {
462         char *uuid_start;
463         int tag_bytes = jbd_tag_bytes(jbd_fs);
464
465         tag_info->tag_bytes = tag_bytes;
466
467         /* See whether it is possible to hold a valid block tag.*/
468         if (remain_buf_size - tag_bytes < 0)
469                 return EINVAL;
470
471         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
472                                      JBD_FEATURE_INCOMPAT_CSUM_V3)) {
473                 struct jbd_block_tag3 *tag = __tag;
474                 memset(tag, 0, sizeof(struct jbd_block_tag3));
475                 jbd_set32(tag, blocknr, tag_info->block);
476                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
477                                              JBD_FEATURE_INCOMPAT_64BIT))
478                         jbd_set32(tag, blocknr_high, tag_info->block >> 32);
479
480                 if (tag_info->uuid_exist) {
481                         /* See whether it is possible to hold UUID part.*/
482                         if (remain_buf_size - tag_bytes < UUID_SIZE)
483                                 return EINVAL;
484
485                         uuid_start = (char *)tag + tag_bytes;
486                         tag_info->tag_bytes += UUID_SIZE;
487                         memcpy(uuid_start, tag_info->uuid, UUID_SIZE);
488                 } else
489                         jbd_set32(tag, flags,
490                                   jbd_get32(tag, flags) | JBD_FLAG_SAME_UUID);
491
492                 if (tag_info->last_tag)
493                         jbd_set32(tag, flags,
494                                   jbd_get32(tag, flags) | JBD_FLAG_LAST_TAG);
495
496         } else {
497                 struct jbd_block_tag *tag = __tag;
498                 memset(tag, 0, sizeof(struct jbd_block_tag));
499                 jbd_set32(tag, blocknr, tag_info->block);
500                 if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
501                                              JBD_FEATURE_INCOMPAT_64BIT))
502                         jbd_set32(tag, blocknr_high, tag_info->block >> 32);
503
504                 if (tag_info->uuid_exist) {
505                         /* See whether it is possible to hold UUID part.*/
506                         if (remain_buf_size - tag_bytes < UUID_SIZE)
507                                 return EINVAL;
508
509                         uuid_start = (char *)tag + tag_bytes;
510                         tag_info->tag_bytes += UUID_SIZE;
511                         memcpy(uuid_start, tag_info->uuid, UUID_SIZE);
512                 } else
513                         jbd_set16(tag, flags,
514                                   jbd_get16(tag, flags) | JBD_FLAG_SAME_UUID);
515
516                 if (tag_info->last_tag)
517                         jbd_set16(tag, flags,
518                                   jbd_get16(tag, flags) | JBD_FLAG_LAST_TAG);
519
520         }
521         return EOK;
522 }
523
524 /**@brief  Iterate all block tags in a block.
525  * @param  jbd_fs jbd filesystem
526  * @param  __tag_start pointer to the block
527  * @param  tag_tbl_size size of the block
528  * @param  func callback routine to indicate that
529  *         a block tag is found
530  * @param  arg additional argument to be passed to func */
531 static void
532 jbd_iterate_block_table(struct jbd_fs *jbd_fs,
533                         void *__tag_start,
534                         int32_t tag_tbl_size,
535                         void (*func)(struct jbd_fs * jbd_fs,
536                                         ext4_fsblk_t block,
537                                         uint8_t *uuid,
538                                         void *arg),
539                         void *arg)
540 {
541         char *tag_start, *tag_ptr;
542         int tag_bytes = jbd_tag_bytes(jbd_fs);
543         tag_start = __tag_start;
544         tag_ptr = tag_start;
545
546         /* Cut off the size of block tail storing checksum. */
547         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
548                                      JBD_FEATURE_INCOMPAT_CSUM_V2) ||
549             JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
550                                      JBD_FEATURE_INCOMPAT_CSUM_V3))
551                 tag_tbl_size -= sizeof(struct jbd_block_tail);
552
553         while (tag_tbl_size) {
554                 struct tag_info tag_info;
555                 int rc = jbd_extract_block_tag(jbd_fs,
556                                       tag_ptr,
557                                       tag_bytes,
558                                       tag_tbl_size,
559                                       &tag_info);
560                 if (rc != EOK)
561                         break;
562
563                 if (func)
564                         func(jbd_fs, tag_info.block, tag_info.uuid, arg);
565
566                 /* Stop the iteration when we reach the last tag. */
567                 if (tag_info.last_tag)
568                         break;
569
570                 tag_ptr += tag_info.tag_bytes;
571                 tag_tbl_size -= tag_info.tag_bytes;
572         }
573 }
574
575 static void jbd_display_block_tags(struct jbd_fs *jbd_fs,
576                                    ext4_fsblk_t block,
577                                    uint8_t *uuid,
578                                    void *arg)
579 {
580         uint32_t *iblock = arg;
581         ext4_dbg(DEBUG_JBD, "Block in block_tag: %" PRIu64 "\n", block);
582         (*iblock)++;
583         (void)jbd_fs;
584         (void)uuid;
585         return;
586 }
587
588 static struct revoke_entry *
589 jbd_revoke_entry_lookup(struct recover_info *info, ext4_fsblk_t block)
590 {
591         struct revoke_entry tmp = {
592                 .block = block
593         };
594
595         return RB_FIND(jbd_revoke, &info->revoke_root, &tmp);
596 }
597
598 /**@brief  Replay a block in a transaction.
599  * @param  jbd_fs jbd filesystem
600  * @param  block  block address to be replayed.*/
601 static void jbd_replay_block_tags(struct jbd_fs *jbd_fs,
602                                   ext4_fsblk_t block,
603                                   uint8_t *uuid __unused,
604                                   void *__arg)
605 {
606         int r;
607         struct replay_arg *arg = __arg;
608         struct recover_info *info = arg->info;
609         uint32_t *this_block = arg->this_block;
610         struct revoke_entry *revoke_entry;
611         struct ext4_block journal_block, ext4_block;
612         struct ext4_fs *fs = jbd_fs->inode_ref.fs;
613
614         (*this_block)++;
615
616         /* We replay this block only if the current transaction id
617          * is equal or greater than that in revoke entry.*/
618         revoke_entry = jbd_revoke_entry_lookup(info, block);
619         if (revoke_entry &&
620             arg->this_trans_id < revoke_entry->trans_id)
621                 return;
622
623         ext4_dbg(DEBUG_JBD,
624                  "Replaying block in block_tag: %" PRIu64 "\n",
625                  block);
626
627         r = jbd_block_get(jbd_fs, &journal_block, *this_block);
628         if (r != EOK)
629                 return;
630
631         /* We need special treatment for ext4 superblock. */
632         if (block) {
633                 r = ext4_block_get_noread(fs->bdev, &ext4_block, block);
634                 if (r != EOK) {
635                         jbd_block_set(jbd_fs, &journal_block);
636                         return;
637                 }
638
639                 memcpy(ext4_block.data,
640                         journal_block.data,
641                         jbd_get32(&jbd_fs->sb, blocksize));
642
643                 ext4_bcache_set_dirty(ext4_block.buf);
644                 ext4_block_set(fs->bdev, &ext4_block);
645         } else {
646                 uint16_t mount_count, state;
647                 mount_count = ext4_get16(&fs->sb, mount_count);
648                 state = ext4_get16(&fs->sb, state);
649
650                 memcpy(&fs->sb,
651                         journal_block.data + EXT4_SUPERBLOCK_OFFSET,
652                         EXT4_SUPERBLOCK_SIZE);
653
654                 /* Mark system as mounted */
655                 ext4_set16(&fs->sb, state, state);
656                 r = ext4_sb_write(fs->bdev, &fs->sb);
657                 if (r != EOK)
658                         return;
659
660                 /*Update mount count*/
661                 ext4_set16(&fs->sb, mount_count, mount_count);
662         }
663
664         jbd_block_set(jbd_fs, &journal_block);
665         
666         return;
667 }
668
669 /**@brief  Add block address to revoke tree, along with
670  *         its transaction id.
671  * @param  info  journal replay info
672  * @param  block  block address to be replayed.*/
673 static void jbd_add_revoke_block_tags(struct recover_info *info,
674                                       ext4_fsblk_t block)
675 {
676         struct revoke_entry *revoke_entry;
677
678         ext4_dbg(DEBUG_JBD, "Add block %" PRIu64 " to revoke tree\n", block);
679         /* If the revoke entry with respect to the block address
680          * exists already, update its transaction id.*/
681         revoke_entry = jbd_revoke_entry_lookup(info, block);
682         if (revoke_entry) {
683                 revoke_entry->trans_id = info->this_trans_id;
684                 return;
685         }
686
687         revoke_entry = jbd_alloc_revoke_entry();
688         ext4_assert(revoke_entry);
689         revoke_entry->block = block;
690         revoke_entry->trans_id = info->this_trans_id;
691         RB_INSERT(jbd_revoke, &info->revoke_root, revoke_entry);
692
693         return;
694 }
695
696 static void jbd_destroy_revoke_tree(struct recover_info *info)
697 {
698         while (!RB_EMPTY(&info->revoke_root)) {
699                 struct revoke_entry *revoke_entry =
700                         RB_MIN(jbd_revoke, &info->revoke_root);
701                 ext4_assert(revoke_entry);
702                 RB_REMOVE(jbd_revoke, &info->revoke_root, revoke_entry);
703                 jbd_free_revoke_entry(revoke_entry);
704         }
705 }
706
/* Fold a journal block index that reached maxlen back to the start of
 * the log area: the distance (maxlen - first) is subtracted from it.
 * `var` must be a modifiable lvalue. */
#define wrap(sb, var)							\
do {									\
	if ((var) >= jbd_get32((sb), maxlen))				\
		(var) -= (jbd_get32((sb), maxlen) -			\
			  jbd_get32((sb), first));			\
} while (0)

/* Actions understood by the journal iteration routine. */
#define ACTION_SCAN 0
#define ACTION_REVOKE 1
#define ACTION_RECOVER 2
717
718 /**@brief  Add entries in a revoke block to revoke tree.
719  * @param  jbd_fs jbd filesystem
720  * @param  header revoke block header
721  * @param  recover_info  journal replay info*/
722 static void jbd_build_revoke_tree(struct jbd_fs *jbd_fs,
723                                   struct jbd_bhdr *header,
724                                   struct recover_info *info)
725 {
726         char *blocks_entry;
727         struct jbd_revoke_header *revoke_hdr =
728                 (struct jbd_revoke_header *)header;
729         uint32_t i, nr_entries, record_len = 4;
730
731         /* If we are working on a 64bit jbd filesystem, */
732         if (JBD_HAS_INCOMPAT_FEATURE(&jbd_fs->sb,
733                                      JBD_FEATURE_INCOMPAT_64BIT))
734                 record_len = 8;
735
736         nr_entries = (jbd_get32(revoke_hdr, count) -
737                         sizeof(struct jbd_revoke_header)) /
738                         record_len;
739
740         blocks_entry = (char *)(revoke_hdr + 1);
741
742         for (i = 0;i < nr_entries;i++) {
743                 if (record_len == 8) {
744                         uint64_t *blocks =
745                                 (uint64_t *)blocks_entry;
746                         jbd_add_revoke_block_tags(info, to_be64(*blocks));
747                 } else {
748                         uint32_t *blocks =
749                                 (uint32_t *)blocks_entry;
750                         jbd_add_revoke_block_tags(info, to_be32(*blocks));
751                 }
752                 blocks_entry += record_len;
753         }
754 }
755
756 static void jbd_debug_descriptor_block(struct jbd_fs *jbd_fs,
757                                        struct jbd_bhdr *header,
758                                        uint32_t *iblock)
759 {
760         jbd_iterate_block_table(jbd_fs,
761                                 header + 1,
762                                 jbd_get32(&jbd_fs->sb, blocksize) -
763                                         sizeof(struct jbd_bhdr),
764                                 jbd_display_block_tags,
765                                 iblock);
766 }
767
768 static void jbd_replay_descriptor_block(struct jbd_fs *jbd_fs,
769                                         struct jbd_bhdr *header,
770                                         struct replay_arg *arg)
771 {
772         jbd_iterate_block_table(jbd_fs,
773                                 header + 1,
774                                 jbd_get32(&jbd_fs->sb, blocksize) -
775                                         sizeof(struct jbd_bhdr),
776                                 jbd_replay_block_tags,
777                                 arg);
778 }
779
780 /**@brief  The core routine of journal replay.
781  * @param  jbd_fs jbd filesystem
782  * @param  recover_info  journal replay info
783  * @param  action action needed to be taken
784  * @return standard error code*/
785 static int jbd_iterate_log(struct jbd_fs *jbd_fs,
786                            struct recover_info *info,
787                            int action)
788 {
789         int r = EOK;
790         bool log_end = false;
791         struct jbd_sb *sb = &jbd_fs->sb;
792         uint32_t start_trans_id, this_trans_id;
793         uint32_t start_block, this_block;
794
795         /* We start iterating valid blocks in the whole journal.*/
796         start_trans_id = this_trans_id = jbd_get32(sb, sequence);
797         start_block = this_block = jbd_get32(sb, start);
798
799         ext4_dbg(DEBUG_JBD, "Start of journal at trans id: %" PRIu32 "\n",
800                             start_trans_id);
801
802         while (!log_end) {
803                 struct ext4_block block;
804                 struct jbd_bhdr *header;
805                 /* If we are not scanning for the last
806                  * valid transaction in the journal,
807                  * we will stop when we reach the end of
808                  * the journal.*/
809                 if (action != ACTION_SCAN)
810                         if (this_trans_id > info->last_trans_id) {
811                                 log_end = true;
812                                 continue;
813                         }
814
815                 r = jbd_block_get(jbd_fs, &block, this_block);
816                 if (r != EOK)
817                         break;
818
819                 header = (struct jbd_bhdr *)block.data;
820                 /* This block does not have a valid magic number,
821                  * so we have reached the end of the journal.*/
822                 if (jbd_get32(header, magic) != JBD_MAGIC_NUMBER) {
823                         jbd_block_set(jbd_fs, &block);
824                         log_end = true;
825                         continue;
826                 }
827
828                 /* If the transaction id we found is not expected,
829                  * we may have reached the end of the journal.
830                  *
831                  * If we are not scanning the journal, something
832                  * bad might have taken place. :-( */
833                 if (jbd_get32(header, sequence) != this_trans_id) {
834                         if (action != ACTION_SCAN)
835                                 r = EIO;
836
837                         jbd_block_set(jbd_fs, &block);
838                         log_end = true;
839                         continue;
840                 }
841
842                 switch (jbd_get32(header, blocktype)) {
843                 case JBD_DESCRIPTOR_BLOCK:
844                         ext4_dbg(DEBUG_JBD, "Descriptor block: %" PRIu32", "
845                                             "trans_id: %" PRIu32"\n",
846                                             this_block, this_trans_id);
847                         if (action == ACTION_RECOVER) {
848                                 struct replay_arg replay_arg;
849                                 replay_arg.info = info;
850                                 replay_arg.this_block = &this_block;
851                                 replay_arg.this_trans_id = this_trans_id;
852
853                                 jbd_replay_descriptor_block(jbd_fs,
854                                                 header, &replay_arg);
855                         } else
856                                 jbd_debug_descriptor_block(jbd_fs,
857                                                 header, &this_block);
858
859                         break;
860                 case JBD_COMMIT_BLOCK:
861                         ext4_dbg(DEBUG_JBD, "Commit block: %" PRIu32", "
862                                             "trans_id: %" PRIu32"\n",
863                                             this_block, this_trans_id);
864                         /* This is the end of a transaction,
865                          * we may now proceed to the next transaction.
866                          */
867                         this_trans_id++;
868                         break;
869                 case JBD_REVOKE_BLOCK:
870                         ext4_dbg(DEBUG_JBD, "Revoke block: %" PRIu32", "
871                                             "trans_id: %" PRIu32"\n",
872                                             this_block, this_trans_id);
873                         if (action == ACTION_REVOKE) {
874                                 info->this_trans_id = this_trans_id;
875                                 jbd_build_revoke_tree(jbd_fs,
876                                                 header, info);
877                         }
878                         break;
879                 default:
880                         log_end = true;
881                         break;
882                 }
883                 jbd_block_set(jbd_fs, &block);
884                 this_block++;
885                 wrap(sb, this_block);
886                 if (this_block == start_block)
887                         log_end = true;
888
889         }
890         ext4_dbg(DEBUG_JBD, "End of journal.\n");
891         if (r == EOK && action == ACTION_SCAN) {
892                 /* We have finished scanning the journal. */
893                 info->start_trans_id = start_trans_id;
894                 if (this_trans_id > start_trans_id)
895                         info->last_trans_id = this_trans_id - 1;
896                 else
897                         info->last_trans_id = this_trans_id;
898         }
899
900         return r;
901 }
902
903 /**@brief  Replay journal.
904  * @param  jbd_fs jbd filesystem
905  * @return standard error code*/
906 int jbd_recover(struct jbd_fs *jbd_fs)
907 {
908         int r;
909         struct recover_info info;
910         struct jbd_sb *sb = &jbd_fs->sb;
911         if (!sb->start)
912                 return EOK;
913
914         RB_INIT(&info.revoke_root);
915
916         r = jbd_iterate_log(jbd_fs, &info, ACTION_SCAN);
917         if (r != EOK)
918                 return r;
919
920         r = jbd_iterate_log(jbd_fs, &info, ACTION_REVOKE);
921         if (r != EOK)
922                 return r;
923
924         r = jbd_iterate_log(jbd_fs, &info, ACTION_RECOVER);
925         if (r == EOK) {
926                 /* If we successfully replay the journal,
927                  * clear EXT4_FINCOM_RECOVER flag on the
928                  * ext4 superblock, and set the start of
929                  * journal to 0.*/
930                 uint32_t features_incompatible =
931                         ext4_get32(&jbd_fs->inode_ref.fs->sb,
932                                    features_incompatible);
933                 jbd_set32(&jbd_fs->sb, start, 0);
934                 features_incompatible &= ~EXT4_FINCOM_RECOVER;
935                 ext4_set32(&jbd_fs->inode_ref.fs->sb,
936                            features_incompatible,
937                            features_incompatible);
938                 jbd_fs->dirty = true;
939                 r = ext4_sb_write(jbd_fs->inode_ref.fs->bdev,
940                                   &jbd_fs->inode_ref.fs->sb);
941         }
942         jbd_destroy_revoke_tree(&info);
943         return r;
944 }
945
946 static void jbd_journal_write_sb(struct jbd_journal *journal)
947 {
948         struct jbd_fs *jbd_fs = journal->jbd_fs;
949         jbd_set32(&jbd_fs->sb, start, journal->start);
950         jbd_set32(&jbd_fs->sb, sequence, journal->trans_id);
951         jbd_fs->dirty = true;
952 }
953
954 /**@brief  Start accessing the journal.
955  * @param  jbd_fs jbd filesystem
956  * @param  journal current journal session
957  * @return standard error code*/
958 int jbd_journal_start(struct jbd_fs *jbd_fs,
959                       struct jbd_journal *journal)
960 {
961         int r;
962         uint32_t features_incompatible =
963                         ext4_get32(&jbd_fs->inode_ref.fs->sb,
964                                    features_incompatible);
965         features_incompatible |= EXT4_FINCOM_RECOVER;
966         ext4_set32(&jbd_fs->inode_ref.fs->sb,
967                         features_incompatible,
968                         features_incompatible);
969         r = ext4_sb_write(jbd_fs->inode_ref.fs->bdev,
970                         &jbd_fs->inode_ref.fs->sb);
971         if (r != EOK)
972                 return r;
973
974         journal->first = jbd_get32(&jbd_fs->sb, first);
975         journal->start = journal->first;
976         journal->last = journal->first;
977         journal->trans_id = 1;
978         journal->alloc_trans_id = 1;
979
980         journal->block_size = jbd_get32(&jbd_fs->sb, blocksize);
981
982         TAILQ_INIT(&journal->trans_queue);
983         TAILQ_INIT(&journal->cp_queue);
984         RB_INIT(&journal->block_rec_root);
985         journal->jbd_fs = jbd_fs;
986         jbd_journal_write_sb(journal);
987         return jbd_write_sb(jbd_fs);
988 }
989
990 static void jbd_journal_flush_trans(struct jbd_trans *trans)
991 {
992         struct jbd_buf *jbd_buf, *tmp;
993         struct jbd_journal *journal = trans->journal;
994         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
995         TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node,
996                         tmp) {
997                 struct ext4_block block = jbd_buf->block;
998                 ext4_block_flush_buf(fs->bdev, block.buf);
999         }
1000 }
1001
1002 static void
1003 jbd_journal_skip_pure_revoke(struct jbd_journal *journal,
1004                              struct jbd_trans *trans)
1005 {
1006         journal->start = trans->start_iblock +
1007                 trans->alloc_blocks;
1008         wrap(&journal->jbd_fs->sb, journal->start);
1009         journal->trans_id = trans->trans_id + 1;
1010         jbd_journal_free_trans(journal,
1011                         trans, false);
1012         jbd_journal_write_sb(journal);
1013 }
1014
1015 static void jbd_journal_flush_all_trans(struct jbd_journal *journal)
1016 {
1017         struct jbd_trans *trans, *tmp;
1018         TAILQ_FOREACH_SAFE(trans, &journal->cp_queue, trans_node,
1019                           tmp) {
1020                 if (!trans->data_cnt) {
1021                         TAILQ_REMOVE(&journal->cp_queue,
1022                                         trans,
1023                                         trans_node);
1024                         jbd_journal_skip_pure_revoke(journal, trans);
1025                 } else {
1026                         ext4_assert(trans->data_cnt != trans->written_cnt);
1027                         jbd_journal_flush_trans(trans);
1028                 }
1029         }
1030 }
1031
1032 /**@brief  Stop accessing the journal.
1033  * @param  journal current journal session
1034  * @return standard error code*/
1035 int jbd_journal_stop(struct jbd_journal *journal)
1036 {
1037         int r;
1038         struct jbd_fs *jbd_fs = journal->jbd_fs;
1039         uint32_t features_incompatible;
1040
1041         /* Commit all the transactions to the journal.*/
1042         jbd_journal_commit_all(journal);
1043
1044         /* Make sure that journalled content have reached
1045          * the disk.*/
1046         jbd_journal_flush_all_trans(journal);
1047
1048         /* There should be no block record in this journal
1049          * session. */
1050         if (!RB_EMPTY(&journal->block_rec_root))
1051                 ext4_dbg(DEBUG_JBD,
1052                          DBG_WARN "There are still block records "
1053                                   "in this journal session!\n");
1054
1055         features_incompatible =
1056                 ext4_get32(&jbd_fs->inode_ref.fs->sb,
1057                            features_incompatible);
1058         features_incompatible &= ~EXT4_FINCOM_RECOVER;
1059         ext4_set32(&jbd_fs->inode_ref.fs->sb,
1060                         features_incompatible,
1061                         features_incompatible);
1062         r = ext4_sb_write(jbd_fs->inode_ref.fs->bdev,
1063                         &jbd_fs->inode_ref.fs->sb);
1064         if (r != EOK)
1065                 return r;
1066
1067         journal->start = 0;
1068         journal->trans_id = 0;
1069         jbd_journal_write_sb(journal);
1070         return jbd_write_sb(journal->jbd_fs);
1071 }
1072
1073 /**@brief  Allocate a block in the journal.
1074  * @param  journal current journal session
1075  * @param  trans transaction
1076  * @return allocated block address*/
1077 static uint32_t jbd_journal_alloc_block(struct jbd_journal *journal,
1078                                         struct jbd_trans *trans)
1079 {
1080         uint32_t start_block;
1081
1082         start_block = journal->last++;
1083         trans->alloc_blocks++;
1084         wrap(&journal->jbd_fs->sb, journal->last);
1085         
1086         /* If there is no space left, flush all journalled
1087          * blocks to disk first.*/
1088         if (journal->last == journal->start)
1089                 jbd_journal_flush_all_trans(journal);
1090
1091         return start_block;
1092 }
1093
1094 /**@brief  Allocate a new transaction
1095  * @param  journal current journal session
1096  * @return transaction allocated*/
1097 struct jbd_trans *
1098 jbd_journal_new_trans(struct jbd_journal *journal)
1099 {
1100         struct jbd_trans *trans = calloc(1, sizeof(struct jbd_trans));
1101         if (!trans)
1102                 return NULL;
1103
1104         /* We will assign a trans_id to this transaction,
1105          * once it has been committed.*/
1106         trans->journal = journal;
1107         trans->error = EOK;
1108         TAILQ_INIT(&trans->buf_queue);
1109         return trans;
1110 }
1111
1112 static void jbd_trans_end_write(struct ext4_bcache *bc __unused,
1113                           struct ext4_buf *buf __unused,
1114                           int res,
1115                           void *arg);
1116
1117 /**@brief  gain access to it before making any modications.
1118  * @param  journal current journal session
1119  * @param  trans transaction
1120  * @param  block descriptor
1121  * @return standard error code.*/
1122 int jbd_trans_get_access(struct jbd_journal *journal,
1123                          struct jbd_trans *trans,
1124                          struct ext4_block *block)
1125 {
1126         int r = EOK;
1127         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1128         struct jbd_buf *jbd_buf = block->buf->end_write_arg;
1129
1130         /* If the buffer has already been modified, we should
1131          * flush dirty data in this buffer to disk.*/
1132         if (ext4_bcache_test_flag(block->buf, BC_DIRTY) &&
1133             block->buf->end_write == jbd_trans_end_write) {
1134                 ext4_assert(jbd_buf);
1135                 if (jbd_buf->trans != trans)
1136                         r = ext4_block_flush_buf(fs->bdev, block->buf);
1137
1138         }
1139         return r;
1140 }
1141
1142 static struct jbd_block_rec *
1143 jbd_trans_block_rec_lookup(struct jbd_journal *journal,
1144                            ext4_fsblk_t lba)
1145 {
1146         struct jbd_block_rec tmp = {
1147                 .lba = lba
1148         };
1149
1150         return RB_FIND(jbd_block,
1151                        &journal->block_rec_root,
1152                        &tmp);
1153 }
1154
1155 static inline struct jbd_block_rec *
1156 jbd_trans_insert_block_rec(struct jbd_trans *trans,
1157                            ext4_fsblk_t lba,
1158                            struct ext4_buf *buf)
1159 {
1160         struct jbd_block_rec *block_rec;
1161         block_rec = jbd_trans_block_rec_lookup(trans->journal, lba);
1162         if (block_rec) {
1163                 LIST_REMOVE(block_rec, tbrec_node);
1164                 /* Data should be flushed to disk already. */
1165                 ext4_assert(!block_rec->buf);
1166                 /* Now this block record belongs to this transaction. */
1167                 LIST_INSERT_HEAD(&trans->tbrec_list, block_rec, tbrec_node);
1168                 block_rec->trans = trans;
1169                 return block_rec;
1170         }
1171         block_rec = calloc(1, sizeof(struct jbd_block_rec));
1172         if (!block_rec)
1173                 return NULL;
1174
1175         block_rec->lba = lba;
1176         block_rec->buf = buf;
1177         block_rec->trans = trans;
1178         LIST_INSERT_HEAD(&trans->tbrec_list, block_rec, tbrec_node);
1179         RB_INSERT(jbd_block, &trans->journal->block_rec_root, block_rec);
1180         return block_rec;
1181 }
1182
1183 static inline void
1184 jbd_trans_remove_block_rec(struct jbd_journal *journal,
1185                            struct jbd_block_rec *block_rec,
1186                            struct jbd_trans *trans)
1187 {
1188         /* If this block record doesn't belong to this transaction,
1189          * give up.*/
1190         if (block_rec->trans == trans) {
1191                 LIST_REMOVE(block_rec, tbrec_node);
1192                 RB_REMOVE(jbd_block,
1193                                 &journal->block_rec_root,
1194                                 block_rec);
1195                 free(block_rec);
1196         }
1197 }
1198
1199 /**@brief  Add block to a transaction and mark it dirty.
1200  * @param  trans transaction
1201  * @param  block block descriptor
1202  * @return standard error code*/
1203 int jbd_trans_set_block_dirty(struct jbd_trans *trans,
1204                               struct ext4_block *block)
1205 {
1206         struct jbd_buf *buf;
1207
1208         if (!ext4_bcache_test_flag(block->buf, BC_DIRTY) &&
1209             block->buf->end_write != jbd_trans_end_write) {
1210                 struct jbd_block_rec *block_rec;
1211                 buf = calloc(1, sizeof(struct jbd_buf));
1212                 if (!buf)
1213                         return ENOMEM;
1214
1215                 if ((block_rec = jbd_trans_insert_block_rec(trans,
1216                                         block->lb_id,
1217                                         block->buf)) == NULL) {
1218                         free(buf);
1219                         return ENOMEM;
1220                 }
1221
1222                 buf->block_rec = block_rec;
1223                 buf->trans = trans;
1224                 buf->block = *block;
1225                 ext4_bcache_inc_ref(block->buf);
1226
1227                 /* If the content reach the disk, notify us
1228                  * so that we may do a checkpoint. */
1229                 block->buf->end_write = jbd_trans_end_write;
1230                 block->buf->end_write_arg = buf;
1231
1232                 trans->data_cnt++;
1233                 TAILQ_INSERT_HEAD(&trans->buf_queue, buf, buf_node);
1234
1235                 ext4_bcache_set_dirty(block->buf);
1236         }
1237         return EOK;
1238 }
1239
1240 /**@brief  Add block to be revoked to a transaction
1241  * @param  trans transaction
1242  * @param  lba logical block address
1243  * @return standard error code*/
1244 int jbd_trans_revoke_block(struct jbd_trans *trans,
1245                            ext4_fsblk_t lba)
1246 {
1247         struct jbd_revoke_rec *rec =
1248                 calloc(1, sizeof(struct jbd_revoke_rec));
1249         if (!rec)
1250                 return ENOMEM;
1251
1252         rec->lba = lba;
1253         LIST_INSERT_HEAD(&trans->revoke_list, rec, revoke_node);
1254         return EOK;
1255 }
1256
1257 /**@brief  Try to add block to be revoked to a transaction.
1258  *         If @lba still remains in an transaction on checkpoint
1259  *         queue, add @lba as a revoked block to the transaction.
1260  * @param  trans transaction
1261  * @param  lba logical block address
1262  * @return standard error code*/
1263 int jbd_trans_try_revoke_block(struct jbd_trans *trans,
1264                                ext4_fsblk_t lba)
1265 {
1266         int r = EOK;
1267         struct jbd_journal *journal = trans->journal;
1268         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1269         struct jbd_block_rec *block_rec =
1270                 jbd_trans_block_rec_lookup(journal, lba);
1271
1272         /* Make sure we don't flush any buffers belong to this transaction. */
1273         if (block_rec && block_rec->trans != trans) {
1274                 /* If the buffer has not been flushed yet, flush it now. */
1275                 if (block_rec->buf) {
1276                         r = ext4_block_flush_buf(fs->bdev, block_rec->buf);
1277                         if (r != EOK)
1278                                 return r;
1279
1280                 }
1281
1282                 jbd_trans_revoke_block(trans, lba);
1283         }
1284
1285         return EOK;
1286 }
1287
1288 /**@brief  Free a transaction
1289  * @param  journal current journal session
1290  * @param  trans transaction
1291  * @param  abort discard all the modifications on the block?
1292  * @return standard error code*/
1293 void jbd_journal_free_trans(struct jbd_journal *journal,
1294                             struct jbd_trans *trans,
1295                             bool abort)
1296 {
1297         struct jbd_buf *jbd_buf, *tmp;
1298         struct jbd_revoke_rec *rec, *tmp2;
1299         struct jbd_block_rec *block_rec, *tmp3;
1300         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1301         TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node,
1302                           tmp) {
1303                 if (abort) {
1304                         jbd_buf->block.buf->end_write = NULL;
1305                         jbd_buf->block.buf->end_write_arg = NULL;
1306                         ext4_bcache_clear_dirty(jbd_buf->block.buf);
1307                         ext4_block_set(fs->bdev, &jbd_buf->block);
1308                 }
1309
1310                 TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
1311                 free(jbd_buf);
1312         }
1313         LIST_FOREACH_SAFE(rec, &trans->revoke_list, revoke_node,
1314                           tmp2) {
1315                 LIST_REMOVE(rec, revoke_node);
1316                 free(rec);
1317         }
1318         LIST_FOREACH_SAFE(block_rec, &trans->tbrec_list, tbrec_node,
1319                           tmp3) {
1320                 jbd_trans_remove_block_rec(journal, block_rec, trans);
1321         }
1322
1323         free(trans);
1324 }
1325
1326 /**@brief  Write commit block for a transaction
1327  * @param  trans transaction
1328  * @return standard error code*/
1329 static int jbd_trans_write_commit_block(struct jbd_trans *trans)
1330 {
1331         int rc;
1332         struct jbd_commit_header *header;
1333         uint32_t commit_iblock = 0;
1334         struct ext4_block commit_block;
1335         struct jbd_journal *journal = trans->journal;
1336
1337         commit_iblock = jbd_journal_alloc_block(journal, trans);
1338         rc = jbd_block_get_noread(journal->jbd_fs,
1339                         &commit_block, commit_iblock);
1340         if (rc != EOK)
1341                 return rc;
1342
1343         header = (struct jbd_commit_header *)commit_block.data;
1344         jbd_set32(&header->header, magic, JBD_MAGIC_NUMBER);
1345         jbd_set32(&header->header, blocktype, JBD_COMMIT_BLOCK);
1346         jbd_set32(&header->header, sequence, trans->trans_id);
1347
1348         ext4_bcache_set_dirty(commit_block.buf);
1349         rc = jbd_block_set(journal->jbd_fs, &commit_block);
1350         if (rc != EOK)
1351                 return rc;
1352
1353         return EOK;
1354 }
1355
1356 /**@brief  Write descriptor block for a transaction
1357  * @param  journal current journal session
1358  * @param  trans transaction
1359  * @return standard error code*/
1360 static int jbd_journal_prepare(struct jbd_journal *journal,
1361                                struct jbd_trans *trans)
1362 {
1363         int rc = EOK, i = 0;
1364         int32_t tag_tbl_size;
1365         uint32_t desc_iblock = 0;
1366         uint32_t data_iblock = 0;
1367         char *tag_start = NULL, *tag_ptr = NULL;
1368         struct jbd_buf *jbd_buf, *tmp;
1369         struct ext4_block desc_block, data_block;
1370         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1371
1372         /* Try to remove any non-dirty buffers from the tail of
1373          * buf_queue. */
1374         TAILQ_FOREACH_REVERSE_SAFE(jbd_buf, &trans->buf_queue,
1375                         jbd_trans_buf, buf_node, tmp) {
1376                 /* We stop the iteration when we find a dirty buffer. */
1377                 if (ext4_bcache_test_flag(jbd_buf->block.buf,
1378                                         BC_DIRTY))
1379                         break;
1380
1381                 /* The buffer has not been modified, just release
1382                  * that jbd_buf. */
1383                 jbd_trans_remove_block_rec(journal,
1384                                 jbd_buf->block_rec, trans);
1385                 trans->data_cnt--;
1386
1387                 jbd_buf->block.buf->end_write = NULL;
1388                 jbd_buf->block.buf->end_write_arg = NULL;
1389                 ext4_block_set(fs->bdev, &jbd_buf->block);
1390                 TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
1391                 free(jbd_buf);
1392         }
1393
1394         TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node, tmp) {
1395                 struct tag_info tag_info;
1396                 bool uuid_exist = false;
1397                 if (!ext4_bcache_test_flag(jbd_buf->block.buf,
1398                                            BC_DIRTY)) {
1399                         /* The buffer has not been modified, just release
1400                          * that jbd_buf. */
1401                         jbd_trans_remove_block_rec(journal,
1402                                         jbd_buf->block_rec, trans);
1403                         trans->data_cnt--;
1404
1405                         jbd_buf->block.buf->end_write = NULL;
1406                         jbd_buf->block.buf->end_write_arg = NULL;
1407                         ext4_block_set(fs->bdev, &jbd_buf->block);
1408                         TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
1409                         free(jbd_buf);
1410                         continue;
1411                 }
1412 again:
1413                 if (!desc_iblock) {
1414                         struct jbd_bhdr *bhdr;
1415                         desc_iblock = jbd_journal_alloc_block(journal, trans);
1416                         rc = jbd_block_get_noread(journal->jbd_fs,
1417                                            &desc_block, desc_iblock);
1418                         if (rc != EOK)
1419                                 break;
1420
1421                         ext4_bcache_set_dirty(desc_block.buf);
1422
1423                         bhdr = (struct jbd_bhdr *)desc_block.data;
1424                         jbd_set32(bhdr, magic, JBD_MAGIC_NUMBER);
1425                         jbd_set32(bhdr, blocktype, JBD_DESCRIPTOR_BLOCK);
1426                         jbd_set32(bhdr, sequence, trans->trans_id);
1427
1428                         tag_start = (char *)(bhdr + 1);
1429                         tag_ptr = tag_start;
1430                         uuid_exist = true;
1431                         tag_tbl_size = journal->block_size -
1432                                 sizeof(struct jbd_bhdr);
1433
1434                         if (!trans->start_iblock)
1435                                 trans->start_iblock = desc_iblock;
1436
1437                 }
1438                 tag_info.block = jbd_buf->block.lb_id;
1439                 tag_info.uuid_exist = uuid_exist;
1440                 if (i == trans->data_cnt - 1)
1441                         tag_info.last_tag = true;
1442                 else
1443                         tag_info.last_tag = false;
1444
1445                 if (uuid_exist)
1446                         memcpy(tag_info.uuid, journal->jbd_fs->sb.uuid,
1447                                         UUID_SIZE);
1448
1449                 rc = jbd_write_block_tag(journal->jbd_fs,
1450                                 tag_ptr,
1451                                 tag_tbl_size,
1452                                 &tag_info);
1453                 if (rc != EOK) {
1454                         jbd_block_set(journal->jbd_fs, &desc_block);
1455                         desc_iblock = 0;
1456                         goto again;
1457                 }
1458
1459                 data_iblock = jbd_journal_alloc_block(journal, trans);
1460                 rc = jbd_block_get_noread(journal->jbd_fs,
1461                                 &data_block, data_iblock);
1462                 if (rc != EOK)
1463                         break;
1464
1465                 ext4_bcache_set_dirty(data_block.buf);
1466
1467                 memcpy(data_block.data, jbd_buf->block.data,
1468                         journal->block_size);
1469
1470                 rc = jbd_block_set(journal->jbd_fs, &data_block);
1471                 if (rc != EOK)
1472                         break;
1473
1474                 tag_ptr += tag_info.tag_bytes;
1475                 tag_tbl_size -= tag_info.tag_bytes;
1476
1477                 i++;
1478         }
1479         if (rc == EOK && desc_iblock)
1480                 jbd_block_set(journal->jbd_fs, &desc_block);
1481
1482         return rc;
1483 }
1484
1485 /**@brief  Write revoke block for a transaction
1486  * @param  journal current journal session
1487  * @param  trans transaction
1488  * @return standard error code*/
1489 static int
1490 jbd_journal_prepare_revoke(struct jbd_journal *journal,
1491                            struct jbd_trans *trans)
1492 {
1493         int rc = EOK, i = 0;
1494         int32_t tag_tbl_size;
1495         uint32_t desc_iblock = 0;
1496         char *blocks_entry = NULL;
1497         struct jbd_revoke_rec *rec, *tmp;
1498         struct ext4_block desc_block;
1499         struct jbd_revoke_header *header = NULL;
1500         int32_t record_len = 4;
1501
1502         if (JBD_HAS_INCOMPAT_FEATURE(&journal->jbd_fs->sb,
1503                                      JBD_FEATURE_INCOMPAT_64BIT))
1504                 record_len = 8;
1505
1506         LIST_FOREACH_SAFE(rec, &trans->revoke_list, revoke_node,
1507                           tmp) {
1508 again:
1509                 if (!desc_iblock) {
1510                         struct jbd_bhdr *bhdr;
1511                         desc_iblock = jbd_journal_alloc_block(journal, trans);
1512                         rc = jbd_block_get_noread(journal->jbd_fs,
1513                                            &desc_block, desc_iblock);
1514                         if (rc != EOK) {
1515                                 break;
1516                         }
1517
1518                         ext4_bcache_set_dirty(desc_block.buf);
1519
1520                         bhdr = (struct jbd_bhdr *)desc_block.data;
1521                         jbd_set32(bhdr, magic, JBD_MAGIC_NUMBER);
1522                         jbd_set32(bhdr, blocktype, JBD_REVOKE_BLOCK);
1523                         jbd_set32(bhdr, sequence, trans->trans_id);
1524                         
1525                         header = (struct jbd_revoke_header *)bhdr;
1526                         blocks_entry = (char *)(header + 1);
1527                         tag_tbl_size = journal->block_size -
1528                                 sizeof(struct jbd_revoke_header);
1529
1530                         if (!trans->start_iblock)
1531                                 trans->start_iblock = desc_iblock;
1532
1533                 }
1534
1535                 if (tag_tbl_size < record_len) {
1536                         jbd_set32(header, count,
1537                                   journal->block_size - tag_tbl_size);
1538                         jbd_block_set(journal->jbd_fs, &desc_block);
1539                         desc_iblock = 0;
1540                         header = NULL;
1541                         goto again;
1542                 }
1543                 if (record_len == 8) {
1544                         uint64_t *blocks =
1545                                 (uint64_t *)blocks_entry;
1546                         *blocks = to_be64(rec->lba);
1547                 } else {
1548                         uint32_t *blocks =
1549                                 (uint32_t *)blocks_entry;
1550                         *blocks = to_be32(rec->lba);
1551                 }
1552                 blocks_entry += record_len;
1553                 tag_tbl_size -= record_len;
1554
1555                 i++;
1556         }
1557         if (rc == EOK && desc_iblock) {
1558                 if (header != NULL)
1559                         jbd_set32(header, count,
1560                                   journal->block_size - tag_tbl_size);
1561
1562                 jbd_block_set(journal->jbd_fs, &desc_block);
1563         }
1564
1565         return rc;
1566 }
1567
1568 /**@brief  Submit the transaction to transaction queue.
1569  * @param  journal current journal session
1570  * @param  trans transaction*/
1571 void
1572 jbd_journal_submit_trans(struct jbd_journal *journal,
1573                          struct jbd_trans *trans)
1574 {
1575         TAILQ_INSERT_TAIL(&journal->trans_queue,
1576                           trans,
1577                           trans_node);
1578 }
1579
1580 /**@brief  Put references of block descriptors in a transaction.
1581  * @param  journal current journal session
1582  * @param  trans transaction*/
1583 void jbd_journal_cp_trans(struct jbd_journal *journal, struct jbd_trans *trans)
1584 {
1585         struct jbd_buf *jbd_buf, *tmp;
1586         struct ext4_fs *fs = journal->jbd_fs->inode_ref.fs;
1587         TAILQ_FOREACH_SAFE(jbd_buf, &trans->buf_queue, buf_node,
1588                         tmp) {
1589                 struct ext4_block block = jbd_buf->block;
1590                 ext4_block_set(fs->bdev, &block);
1591         }
1592 }
1593
1594 /**@brief  Update the start block of the journal when
1595  *         all the contents in a transaction reach the disk.*/
1596 static void jbd_trans_end_write(struct ext4_bcache *bc __unused,
1597                           struct ext4_buf *buf,
1598                           int res,
1599                           void *arg)
1600 {
1601         struct jbd_buf *jbd_buf = arg;
1602         struct jbd_trans *trans = jbd_buf->trans;
1603         struct jbd_journal *journal = trans->journal;
1604         bool first_in_queue =
1605                 trans == TAILQ_FIRST(&journal->cp_queue);
1606         if (res != EOK)
1607                 trans->error = res;
1608
1609         TAILQ_REMOVE(&trans->buf_queue, jbd_buf, buf_node);
1610         jbd_buf->block_rec->buf = NULL;
1611         free(jbd_buf);
1612
1613         /* Clear the end_write and end_write_arg fields. */
1614         buf->end_write = NULL;
1615         buf->end_write_arg = NULL;
1616
1617         trans->written_cnt++;
1618         if (trans->written_cnt == trans->data_cnt) {
1619                 /* If it is the first transaction on checkpoint queue,
1620                  * we will shift the start of the journal to the next
1621                  * transaction, and remove subsequent written
1622                  * transactions from checkpoint queue until we find
1623                  * an unwritten one. */
1624                 if (first_in_queue) {
1625                         journal->start = trans->start_iblock +
1626                                 trans->alloc_blocks;
1627                         wrap(&journal->jbd_fs->sb, journal->start);
1628                         journal->trans_id = trans->trans_id + 1;
1629                         TAILQ_REMOVE(&journal->cp_queue, trans, trans_node);
1630                         jbd_journal_free_trans(journal, trans, false);
1631
1632                         while ((trans = TAILQ_FIRST(&journal->cp_queue))) {
1633                                 if (!trans->data_cnt) {
1634                                         TAILQ_REMOVE(&journal->cp_queue,
1635                                                      trans,
1636                                                      trans_node);
1637                                         jbd_journal_skip_pure_revoke(journal,
1638                                                                      trans);
1639                                 } else {
1640                                         if (trans->data_cnt ==
1641                                             trans->written_cnt) {
1642                                                 journal->start =
1643                                                         trans->start_iblock +
1644                                                         trans->alloc_blocks;
1645                                                 wrap(&journal->jbd_fs->sb,
1646                                                         journal->start);
1647                                                 journal->trans_id =
1648                                                         trans->trans_id + 1;
1649                                                 TAILQ_REMOVE(&journal->cp_queue,
1650                                                              trans,
1651                                                              trans_node);
1652                                                 jbd_journal_free_trans(journal,
1653                                                                 trans,
1654                                                                 false);
1655                                         } else {
1656                                                 journal->start =
1657                                                         trans->start_iblock;
1658                                                 wrap(&journal->jbd_fs->sb,
1659                                                         journal->start);
1660                                                 journal->trans_id =
1661                                                         trans->trans_id;
1662                                                 break;
1663                                         }
1664                                 }
1665                         }
1666                         jbd_journal_write_sb(journal);
1667                         jbd_write_sb(journal->jbd_fs);
1668                 }
1669         }
1670 }
1671
1672 /**@brief  Commit a transaction to the journal immediately.
1673  * @param  journal current journal session
1674  * @param  trans transaction
1675  * @return standard error code*/
1676 int jbd_journal_commit_trans(struct jbd_journal *journal,
1677                              struct jbd_trans *trans)
1678 {
1679         int rc = EOK;
1680         uint32_t last = journal->last;
1681
1682         trans->trans_id = journal->alloc_trans_id;
1683         rc = jbd_journal_prepare(journal, trans);
1684         if (rc != EOK)
1685                 goto Finish;
1686
1687         rc = jbd_journal_prepare_revoke(journal, trans);
1688         if (rc != EOK)
1689                 goto Finish;
1690
1691         if (TAILQ_EMPTY(&trans->buf_queue) &&
1692             LIST_EMPTY(&trans->revoke_list)) {
1693                 /* Since there are no entries in both buffer list
1694                  * and revoke entry list, we do not consider trans as
1695                  * complete transaction and just return EOK.*/
1696                 jbd_journal_free_trans(journal, trans, false);
1697                 goto Finish;
1698         }
1699
1700         rc = jbd_trans_write_commit_block(trans);
1701         if (rc != EOK)
1702                 goto Finish;
1703
1704         journal->alloc_trans_id++;
1705         if (TAILQ_EMPTY(&journal->cp_queue)) {
1706                 if (trans->data_cnt) {
1707                         journal->start = trans->start_iblock;
1708                         wrap(&journal->jbd_fs->sb, journal->start);
1709                         journal->trans_id = trans->trans_id;
1710                         jbd_journal_write_sb(journal);
1711                         jbd_write_sb(journal->jbd_fs);
1712                         TAILQ_INSERT_TAIL(&journal->cp_queue, trans,
1713                                         trans_node);
1714                         jbd_journal_cp_trans(journal, trans);
1715                 } else {
1716                         journal->start = trans->start_iblock +
1717                                 trans->alloc_blocks;
1718                         wrap(&journal->jbd_fs->sb, journal->start);
1719                         journal->trans_id = trans->trans_id + 1;
1720                         jbd_journal_write_sb(journal);
1721                         jbd_journal_free_trans(journal, trans, false);
1722                 }
1723         } else {
1724                 TAILQ_INSERT_TAIL(&journal->cp_queue, trans,
1725                                 trans_node);
1726                 if (trans->data_cnt)
1727                         jbd_journal_cp_trans(journal, trans);
1728
1729         }
1730 Finish:
1731         if (rc != EOK) {
1732                 journal->last = last;
1733                 jbd_journal_free_trans(journal, trans, true);
1734         }
1735         return rc;
1736 }
1737
1738 /**@brief  Commit one transaction on transaction queue
1739  *         to the journal.
1740  * @param  journal current journal session.*/
1741 void jbd_journal_commit_one(struct jbd_journal *journal)
1742 {
1743         struct jbd_trans *trans;
1744
1745         if ((trans = TAILQ_FIRST(&journal->trans_queue))) {
1746                 TAILQ_REMOVE(&journal->trans_queue, trans, trans_node);
1747                 jbd_journal_commit_trans(journal, trans);
1748         }
1749 }
1750
1751 /**@brief  Commit all the transactions on transaction queue
1752  *         to the journal.
1753  * @param  journal current journal session.*/
1754 void jbd_journal_commit_all(struct jbd_journal *journal)
1755 {
1756         while (!TAILQ_EMPTY(&journal->trans_queue)) {
1757                 jbd_journal_commit_one(journal);
1758         }
1759 }
1760
1761 /**
1762  * @}
1763  */