diff --git a/mmyolo/datasets/transforms/mix_img_transforms.py b/mmyolo/datasets/transforms/mix_img_transforms.py
index 71abf45ddb..fa1cf2dd71 100644
--- a/mmyolo/datasets/transforms/mix_img_transforms.py
+++ b/mmyolo/datasets/transforms/mix_img_transforms.py
@@ -195,15 +195,15 @@ class Mosaic(BaseMixImageTransform):
                         mosaic transform
                            center_x
                 +------------------------------+
-                |       pad        |  pad      |
-                |      +-----------+           |
+                |       pad        |           |
+                |      +-----------+    pad    |
                 |      |           |           |
-                |      |  image1   |--------+  |
-                |      |           |        |  |
-                |      |           | image2 |  |
-     center_y   |----+-------------+-----------|
+                |      |  image1   +-----------+
+                |      |           |           |
+                |      |           |   image2  |
+     center_y   |----+-+-----------+-----------+
                 |    |   cropped   |           |
-                |pad |   image3    |  image4   |
+                |pad |   image3    |   image4  |
                 |    |             |           |
                 +----|-------------+-----------+
                      |             |
@@ -473,11 +473,33 @@ class Mosaic9(BaseMixImageTransform):
     one output image. The output image is composed of the parts from each sub-
     image.
 
-     The mosaic transform steps are as follows:
-
-         1. Get the center image according to the index, and randomly
-            sample another 8 images from the custom dataset.
-         2. Randomly offset the image after Mosaic
+    .. code:: text
+
+                +-------------------------------+------------+
+                | pad           |      pad      |            |
+                |    +----------+               |            |
+                |    |          +---------------+  top_right |
+                |    |          |      top      |   image2   |
+                |    | top_left |     image1    |            |
+                |    |  image8  o--------+------+--------+---+
+                |    |          |        |               |   |
+                +----+----------+        |     right     |pad|
+                |               | center |     image3    |   |
+                |     left      | image0 +---------------+---|
+                |    image7     |        |               |   |
+            +---+-----------+---+--------+               |   |
+            |   |  cropped  |            |  bottom_right |pad|
+            |   |bottom_left|            |    image4     |   |
+            |   |  image6   |   bottom   |               |   |
+            +---|-----------+   image5   +---------------+---|
+                |    pad    |            |        pad        |
+                +-----------+------------+-------------------+
+
+    The mosaic transform steps are as follows:
+
+        1. Get the center image according to the index, and randomly
+           sample another 8 images from the custom dataset.
+        2. Randomly offset the image after Mosaic
 
     Required Keys:
 
@@ -744,7 +764,5 @@ class YOLOv5MixUp(BaseMixImageTransform):
 
-    .. code:: text
-
-     The mixup transform steps are as follows:
+    The mixup transform steps are as follows:
 
         1. Another random image is picked by dataset.
         2. Randomly obtain the fusion ratio from the beta distribution,
@@ -787,7 +807,7 @@ class YOLOv5MixUp(BaseMixImageTransform):
             when the cache is full. If set to False, use FIFO popping method.
             Defaults to True.
         max_refetch (int): The maximum number of iterations. If the number of
-            iterations is greater than `max_iters`, but gt_bbox is still
+            iterations is greater than `max_refetch`, but gt_bbox is still
             empty, then the iteration is terminated. Defaults to 15.
     """
 
@@ -872,20 +892,20 @@ class YOLOXMixUp(BaseMixImageTransform):
     .. code:: text
 
                          mixup transform
-                +------------------------------+
+                +---------------+--------------+
                 | mixup image   |              |
                 |      +--------|--------+     |
                 |      |        |        |     |
-                |---------------+        |     |
+                +---------------+        |     |
                 |      |                 |     |
                 |      |      image      |     |
                 |      |                 |     |
                 |      |                 |     |
-                |      |-----------------+     |
+                |      +-----------------+     |
                 |             pad              |
                 +------------------------------+
 
-     The mixup transform steps are as follows:
+    The mixup transform steps are as follows:
 
         1. Another random image is picked by dataset and embedded in
            the top left patch(after padding and resizing)
@@ -935,7 +955,7 @@ class YOLOXMixUp(BaseMixImageTransform):
             when the cache is full. If set to False, use FIFO popping method.
             Defaults to True.
         max_refetch (int): The maximum number of iterations. If the number of
-            iterations is greater than `max_iters`, but gt_bbox is still
+            iterations is greater than `max_refetch`, but gt_bbox is still
             empty, then the iteration is terminated. Defaults to 15.
     """
 
@@ -1096,6 +1116,6 @@ def __repr__(self) -> str:
         repr_str += f'ratio_range={self.ratio_range}, '
         repr_str += f'flip_ratio={self.flip_ratio}, '
         repr_str += f'pad_val={self.pad_val}, '
-        repr_str += f'max_iters={self.max_iters}, '
+        repr_str += f'max_refetch={self.max_refetch}, '
         repr_str += f'bbox_clip_border={self.bbox_clip_border})'
         return repr_str