diff --git a/.cuda_ext.json b/.cuda_ext.json
index b8269f83786c..8c9d5916ccd8 100644
--- a/.cuda_ext.json
+++ b/.cuda_ext.json
@@ -7,10 +7,6 @@
     {
       "torch_command": "pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118",
       "cuda_image": "hpcaitech/cuda-conda:11.8"
-    },
-    {
-      "torch_command": "pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1",
-      "cuda_image": "hpcaitech/cuda-conda:11.7"
     }
   ]
 }
diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml
index 3da8b5e77df9..3eee564c29ea 100644
--- a/.github/workflows/compatiblity_test_on_dispatch.yml
+++ b/.github/workflows/compatiblity_test_on_dispatch.yml
@@ -51,11 +51,11 @@ jobs:
     container:
       image: ${{ matrix.container }}
       options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/
-    timeout-minutes: 120
+    timeout-minutes: 200
     steps:
       - name: Install dependencies
         run: |
-          pip install -U pip setuptools wheel --user
+          pip install -U pip setuptools==68.2.2 wheel --user
       - uses: actions/checkout@v2
         with:
           repository: hpcaitech/TensorNVMe
diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml
index 10ac0e128dc6..b418c843e7f6 100644
--- a/.github/workflows/compatiblity_test_on_pr.yml
+++ b/.github/workflows/compatiblity_test_on_pr.yml
@@ -42,14 +42,14 @@ jobs:
     container:
       image: ${{ matrix.container }}
       options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/
-    timeout-minutes: 120
+    timeout-minutes: 200
     concurrency:
       group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-test-${{ matrix.container }}
       cancel-in-progress: true
     steps:
       - name: Install dependencies
         run: |
-          pip install -U pip setuptools wheel --user
+          pip install -U pip setuptools==68.2.2 wheel --user
       - uses: actions/checkout@v2
         with:
           repository: hpcaitech/TensorNVMe
diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml
index 84ea7e28d967..8d98e775c828 100644
--- a/.github/workflows/compatiblity_test_on_schedule.yml
+++ b/.github/workflows/compatiblity_test_on_schedule.yml
@@ -39,11 +39,11 @@ jobs:
     container:
       image: ${{ matrix.container }}
       options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/
-    timeout-minutes: 120
+    timeout-minutes: 200
     steps:
       - name: Install dependencies
         run: |
-          pip install -U pip setuptools wheel --user
+          pip install -U pip setuptools==68.2.2 wheel --user
 
       - uses: actions/checkout@v2
         with:
diff --git a/.github/workflows/release_docker_after_publish.yml b/.github/workflows/release_docker_after_publish.yml
index 0792544bf403..23aac9b544b0 100644
--- a/.github/workflows/release_docker_after_publish.yml
+++ b/.github/workflows/release_docker_after_publish.yml
@@ -28,6 +28,8 @@ jobs:
           docker tag $tag $latest
           echo "tag=${tag}" >> $GITHUB_OUTPUT
           echo "latest=${latest}" >> $GITHUB_OUTPUT
+        env:
+          DOCKER_BUILDKIT: 0
 
       - name: Log in to Docker Hub
         uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml
index ba997f144cd7..4ea86b609267 100644
--- a/.github/workflows/run_chatgpt_examples.yml
+++ b/.github/workflows/run_chatgpt_examples.yml
@@ -4,10 +4,11 @@ on:
   pull_request:
     types: [synchronize, opened, reopened]
     paths:
-      - "applications/Chat/coati/**"
-      - "applications/Chat/requirements.txt"
-      - "applications/Chat/setup.py"
-      - "applications/Chat/examples/**"
+      - "applications/ColossalChat/coati/**"
+      - "applications/ColossalChat/requirements.txt"
+      - "applications/ColossalChat/setup.py"
+      - "applications/ColossalChat/examples/**"
+      - "applications/ColossalChat/tests/**"
 
 jobs:
   tests:
@@ -41,7 +42,7 @@ jobs:
 
       - name: Install Transformers
         run: |
-          pip install transformers==4.34.1
+          pip install transformers==4.36.2
 
       - name: Execute Examples
         run: |
diff --git a/.github/workflows/run_chatgpt_unit_tests.yml b/.github/workflows/run_chatgpt_unit_tests.yml
index 1d8a53e4feed..c0e74ecbbab0 100644
--- a/.github/workflows/run_chatgpt_unit_tests.yml
+++ b/.github/workflows/run_chatgpt_unit_tests.yml
@@ -4,12 +4,11 @@ on:
   pull_request:
     types: [synchronize, opened, reopened]
     paths:
-      - 'applications/Chat/coati/**'
-      - 'applications/Chat/requirements.txt'
-      - 'applications/Chat/setup.py'
-      - 'applications/Chat/requirements-test.txt'
-      - 'applications/Chat/tests/**'
-      - 'applications/Chat/pytest.ini'
+      - 'applications/ColossalChat/coati/**'
+      - 'applications/ColossalChat/requirements.txt'
+      - 'applications/ColossalChat/setup.py'
+      - 'applications/ColossalChat/tests/**'
+      - 'applications/ColossalChat/pytest.ini'
 
 jobs:
   tests:
diff --git a/applications/ColossalChat/coati/dataset/__init__.py b/applications/ColossalChat/coati/dataset/__init__.py
index e216c37e1c62..deb7b6d926fb 100755
--- a/applications/ColossalChat/coati/dataset/__init__.py
+++ b/applications/ColossalChat/coati/dataset/__init__.py
@@ -5,7 +5,6 @@
     DataCollatorForSupervisedDataset,
     StatefulDistributedSampler,
     load_tokenized_dataset,
-    setup_distributed_dataloader,
 )
 from .tokenization_utils import supervised_tokenize_sft, tokenize_prompt_dataset, tokenize_rlhf
 
@@ -17,7 +16,6 @@
     "DataCollatorForSupervisedDataset",
     "StatefulDistributedSampler",
     "load_tokenized_dataset",
-    "setup_distributed_dataloader",
     "supervised_tokenize_pretrain",
     "supervised_tokenize_sft",
     "tokenize_rlhf",
diff --git a/applications/ColossalChat/coati/dataset/conversation.py b/applications/ColossalChat/coati/dataset/conversation.py
index 15a33be93966..37900f3b8d64 100755
--- a/applications/ColossalChat/coati/dataset/conversation.py
+++ b/applications/ColossalChat/coati/dataset/conversation.py
@@ -17,6 +17,7 @@ class Conversation:
     system_message: str
     chat_template: str
     stop_ids: List[int]
+    end_of_assistant: str
 
     @classmethod
     def from_config(cls, tokenizer: PreTrainedTokenizer, config: Dict):
@@ -24,7 +25,9 @@ def from_config(cls, tokenizer: PreTrainedTokenizer, config: Dict):
         Setup the conversation template from config
         """
         tokenizer.chat_template = config["chat_template"]
-        conv = cls(tokenizer, config["system_message"], config["chat_template"], config["stop_ids"])
+        conv = cls(
+            tokenizer, config["system_message"], config["chat_template"], config["stop_ids"], config["end_of_assistant"]
+        )
         conv.clear()
         return conv
 
@@ -109,6 +112,8 @@ def setup_conversation_template(
     """
     if any([s not in chat_template_config.keys() for s in Conversation.get_conversation_template_keys()]):
         # Try to automatically set up conversation template, if fail, it throws an error that you need to do it manually
+        if "end_of_assistant" not in chat_template_config:
+            raise ValueError("Please set the end of assistant token.")
         if "system_message" not in chat_template_config:
             logger.warning("No system message is provided, will not use system message.")
         if "chat_template" not in chat_template_config:
diff --git a/applications/ColossalChat/coati/dataset/loader.py b/applications/ColossalChat/coati/dataset/loader.py
index 93cc1dab8d21..cea1b2dbb877 100755
--- a/applications/ColossalChat/coati/dataset/loader.py
+++ b/applications/ColossalChat/coati/dataset/loader.py
@@ -4,22 +4,16 @@
 Dataloader for sft, dpo, ppo
 """
 
-import math
 import os
-import random
 from dataclasses import dataclass
-from typing import Callable, Dict, Iterator, List, Optional, Sequence, Union
+from typing import Dict, Iterator, List, Optional, Sequence, Union
 
-import numpy as np
 import torch
-import torch.distributed as dist
 import torch.nn.functional as F
 from coati.dataset.utils import chuncate_sequence, pad_to_max_len
 from datasets import Dataset as HFDataset
 from datasets import dataset_dict, load_from_disk
-from torch.distributed import ProcessGroup
-from torch.distributed.distributed_c10d import _get_default_group
-from torch.utils.data import ConcatDataset, DataLoader, Dataset, DistributedSampler
+from torch.utils.data import ConcatDataset, Dataset, DistributedSampler
 from transformers.tokenization_utils import PreTrainedTokenizer
 
 DatasetType = Union[Dataset, ConcatDataset, dataset_dict.Dataset]
@@ -236,148 +230,26 @@ def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch
 
 
 class StatefulDistributedSampler(DistributedSampler):
-    """
-    Stateful distributed sampler for multi-stage training.
-    """
-
     def __init__(
         self,
-        dataset: DatasetType,
+        dataset: Dataset,
         num_replicas: Optional[int] = None,
         rank: Optional[int] = None,
         shuffle: bool = True,
         seed: int = 0,
         drop_last: bool = False,
-        use_tp: Optional[bool] = False,
     ) -> None:
-        if not use_tp:
-            super().__init__(
-                dataset=dataset,
-                num_replicas=num_replicas,
-                rank=rank,
-                shuffle=shuffle,
-                seed=seed,
-                drop_last=drop_last,
-            )
-        else:
-            # adapted from https://github.com/pytorch/pytorch/blob/4979f9c0d72490970e2019bb1d2284f83d93f76b/torch/utils/data/distributed.py#L62
-            # TODO: support tp_group>1. will fix it later
-            num_replicas = 1
-            if rank is None:
-                rank = dist.get_rank()
-            if rank < 0:
-                raise ValueError(f"Invalid rank {rank}, rank should be in the interval [0, 0]")
-            self.dataset = dataset
-            self.num_replicas = num_replicas
-            self.rank = rank
-            self.epoch = 0
-            self.drop_last = drop_last
-            # If the dataset length is evenly divisible by # of replicas, then there
-            # is no need to drop any data, since the dataset will be split equally.
-            if self.drop_last and len(self.dataset) % self.num_replicas != 0:  # type: ignore[arg-type]
-                # Split to nearest available length that is evenly divisible.
-                # This is to ensure each rank receives the same amount of data when
-                # using this Sampler.
-                self.num_samples = math.ceil(
-                    (len(self.dataset) - self.num_replicas) / self.num_replicas  # type: ignore[arg-type]
-                )
-            else:
-                self.num_samples = math.ceil(len(self.dataset) / self.num_replicas)  # type: ignore[arg-type]
-            self.total_size = self.num_samples * self.num_replicas
-            self.shuffle = shuffle
-            self.seed = seed
-        self.start_index = 0
-        self.use_tp = use_tp
+        super().__init__(dataset, num_replicas, rank, shuffle, seed, drop_last)
+        self.start_index: int = 0
 
     def __iter__(self) -> Iterator:
-        if self.use_tp:
-            # TODO Add support for tp_group not equal to 1
-            pass
-            # adpated from https://github.com/pytorch/pytorch/blob/4979f9c0d72490970e2019bb1d2284f83d93f76b/torch/utils/data/distributed.py#L96
-            if self.shuffle:
-                # deterministically shuffle based on epoch and seed
-                g = torch.Generator()
-                g.manual_seed(self.seed + self.epoch)
-                indices = torch.randperm(len(self.dataset), generator=g).tolist()  # type: ignore[arg-type]
-            else:
-                indices = list(range(len(self.dataset)))  # type: ignore[arg-type]
-
-            if not self.drop_last:
-                # add extra samples to make it evenly divisible
-                padding_size = self.total_size - len(indices)
-                if padding_size <= len(indices):
-                    indices += indices[:padding_size]
-                else:
-                    indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size]
-            else:
-                # remove tail of data to make it evenly divisible.
-                indices = indices[: self.total_size]
-            assert len(indices) == self.total_size
-
-            # subsample
-            indices = indices[
-                : self.total_size : self.num_replicas
-            ]  # num_replicas=tp_group=1, we only support tp_group==1 for now
-            assert len(indices) == self.num_samples
-
-            return iter(indices)
-
-        else:
-            iterator = super().__iter__()
-            indices = list(iterator)
-            indices = indices[self.start_index :]
-            return iter(indices)
+        iterator = super().__iter__()
+        indices = list(iterator)
+        indices = indices[self.start_index :]
+        return iter(indices)
 
     def __len__(self) -> int:
         return self.num_samples - self.start_index
 
     def set_start_index(self, start_index: int) -> None:
         self.start_index = start_index
-
-
-def setup_distributed_dataloader(
-    dataset: DatasetType,
-    batch_size: int = 1,
-    shuffle: bool = False,
-    seed: int = 1024,
-    drop_last: bool = False,
-    pin_memory: bool = False,
-    num_workers: int = 0,
-    collate_fn: Callable[[Sequence[Dict[str, Union[str, List[int]]]]], Dict[str, torch.Tensor]] = None,
-    process_group: Optional[ProcessGroup] = None,
-    use_tp: Optional[bool] = False,
-    **kwargs,
-) -> DataLoader:
-    """
-    Setup dataloader for distributed training.
-    """
-    _kwargs = kwargs.copy()
-    process_group = process_group or _get_default_group()
-    sampler = StatefulDistributedSampler(
-        dataset=dataset,
-        num_replicas=process_group.size() if not use_tp else 1,
-        rank=process_group.rank(),
-        shuffle=shuffle,
-        seed=seed,
-        drop_last=drop_last,
-        use_tp=use_tp,
-    )
-
-    # Deterministic dataloader
-    def seed_worker(worker_id: int) -> None:
-        worker_seed = seed
-        np.random.seed(worker_seed)
-        torch.manual_seed(worker_seed)
-        random.seed(worker_seed)
-
-    return DataLoader(
-        dataset=dataset,
-        batch_size=batch_size,
-        sampler=sampler,
-        num_workers=num_workers,
-        collate_fn=collate_fn,
-        pin_memory=pin_memory,
-        drop_last=drop_last,
-        worker_init_fn=seed_worker,
-        **_kwargs,
-    )
diff --git a/applications/ColossalChat/coati/dataset/tokenization_utils.py b/applications/ColossalChat/coati/dataset/tokenization_utils.py
index 7606bc2a97ba..34828cbafcf0 100755
--- a/applications/ColossalChat/coati/dataset/tokenization_utils.py
+++ b/applications/ColossalChat/coati/dataset/tokenization_utils.py
@@ -95,17 +95,27 @@ def supervised_tokenize_sft(
 
     target_turn = turns[target_turn_index - 1]
     prompt = template.get_prompt(2 * target_turn)
-    chunks, require_loss = split_templated_prompt_into_chunks(template.messages[: 2 * target_turn], prompt)
+    chunks, require_loss = split_templated_prompt_into_chunks(
+        template.messages[: 2 * target_turn], prompt, conversation_template.end_of_assistant
+    )
     tokenized, starts, ends = tokenize_and_concatenate(tokenizer, chunks, require_loss)
 
     labels = [ignore_index] * len(tokenized)
-    label_decode = []
     for start, end in zip(starts, ends):
         if end == len(tokenized):
             tokenized = tokenized + [tokenizer.eos_token_id]
             labels = labels + [ignore_index]
-        labels[start : end + 1] = tokenized[start : end + 1]
-        label_decode.append(tokenizer.decode(tokenized[start : end + 1], skip_special_tokens=False))
+        labels[start:end] = tokenized[start:end]
+
+    # truncate the sequence at the last token that requires loss calculation
+    to_truncate_len = 0
+    for i in range(len(tokenized) - 1, -1, -1):
+        if labels[i] == ignore_index:
+            to_truncate_len += 1
+        else:
+            break
+    tokenized = tokenized[: len(tokenized) - to_truncate_len]
+    labels = labels[: len(labels) - to_truncate_len]
 
     if tokenizer.bos_token_id is not None:
         if tokenized[0] != tokenizer.bos_token_id:
@@ -121,10 +131,20 @@ def supervised_tokenize_sft(
             labels[-1] = tokenizer.eos_token_id
 
     # For some model without bos/eos may raise the following errors
-    try:
-        inputs_decode = tokenizer.decode(tokenized)
-    except TypeError as e:
-        raise TypeError(str(e) + f"\nUnable to decode input_ids: {tokenized}")
+    inputs_decode = tokenizer.decode(tokenized)
+    start = 0
+    end = 0
+    label_decode = []
+    for i in range(len(labels)):
+        if labels[i] == ignore_index:
+            if start != end:
+                label_decode.append(tokenizer.decode(labels[start + 1 : i], skip_special_tokens=False))
+            start = i
+            end = i
+        else:
+            end = i
+            if i == len(labels) - 1:
+                label_decode.append(tokenizer.decode(labels[start + 1 :], skip_special_tokens=False))
 
     # Check if all labels are ignored, this may happen when the tokenized length is too long
     if labels.count(ignore_index) == len(labels):
@@ -191,7 +211,10 @@ def tokenize_prompt_dataset(
 
     # Prepare data
     prompt = template.get_prompt(target_turn, add_generation_prompt=True)
-    tokenized = tokenizer([prompt], add_special_tokens=False)["input_ids"][0]
+    chunks, require_loss = split_templated_prompt_into_chunks(
+        template.messages[:target_turn], prompt, conversation_template.end_of_assistant
+    )
+    tokenized, starts, ends = tokenize_and_concatenate(tokenizer, chunks, require_loss)
     if tokenizer.bos_token_id is not None:
         if tokenized[0] != tokenizer.bos_token_id:
             tokenized = [tokenizer.bos_token_id] + tokenized
@@ -219,7 +242,9 @@ def apply_rlhf_data_format(
 ):
     target_turn = int(len(template.messages) / 2)
     prompt = template.get_prompt(target_turn * 2)
-    chunks, require_loss = split_templated_prompt_into_chunks(template.messages[: 2 * target_turn], prompt)
+    chunks, require_loss = split_templated_prompt_into_chunks(
+        template.messages[: 2 * target_turn], prompt, template.end_of_assistant
+    )
     tokenized, starts, ends = tokenize_and_concatenate(tokenizer, chunks, require_loss)
     loss_mask = [0] * len(tokenized)
     mask_token = tokenizer.eos_token_id or tokenizer.pad_token_id
@@ -232,8 +257,8 @@ def apply_rlhf_data_format(
         if end == len(tokenized):
             tokenized = tokenized + [tokenizer.eos_token_id]
             loss_mask = loss_mask + [1]
-        loss_mask[start : end + 1] = [1] * len(loss_mask[start : end + 1])
-        label_decode.append(tokenizer.decode(tokenized[start : end + 1], skip_special_tokens=False))
+        loss_mask[start:end] = [1] * len(loss_mask[start:end])
+        label_decode.append(tokenizer.decode(tokenized[start:end], skip_special_tokens=False))
     if tokenizer.bos_token_id is not None:
         if tokenized[0] != tokenizer.bos_token_id:
             tokenized = [tokenizer.bos_token_id] + tokenized
diff --git a/applications/ColossalChat/coati/dataset/utils.py b/applications/ColossalChat/coati/dataset/utils.py
index ada2afef0154..f41a4d7724da 100755
--- a/applications/ColossalChat/coati/dataset/utils.py
+++ b/applications/ColossalChat/coati/dataset/utils.py
@@ -113,20 +113,25 @@ def tokenize_and_concatenate(tokenizer: PreTrainedTokenizer, text: List[str], re
     return input_ids, loss_starts, loss_ends
 
 
-def split_templated_prompt_into_chunks(messages: List[Dict[str, str]], prompt: str):
+def split_templated_prompt_into_chunks(messages: List[Dict[str, str]], prompt: str, end_of_assistant: str):
     # Seperate templated prompt into chunks by human/assistant's lines, prepare data for tokenize_and_concatenate
     start_idx = 0
     chunks = []
     require_loss = []
     for line in messages:
+        content_length = len(line["content"])
         first_occur = prompt.find(line["content"], start_idx)
+        if line["role"].lower() == "assistant" and end_of_assistant in prompt[first_occur + content_length :]:
+            content_length = (
+                prompt.find(end_of_assistant, first_occur + content_length) + len(end_of_assistant) - first_occur
+            )
         if prompt[first_occur - 1] != " ":
             chunks.append(prompt[start_idx:first_occur])
-            chunks.append(prompt[first_occur : first_occur + len(line["content"])])
+            chunks.append(prompt[first_occur : first_occur + content_length])
         else:
             chunks.append(prompt[start_idx : first_occur - 1])
-            chunks.append(prompt[first_occur - 1 : first_occur + len(line["content"])])
-        start_idx = first_occur + len(line["content"])
+            chunks.append(prompt[first_occur - 1 : first_occur + content_length])
+        start_idx = first_occur + content_length
         if line["role"].lower() == "assistant":
             require_loss.append(False)
             require_loss.append(True)
diff --git a/applications/ColossalChat/coati/models/critic.py b/applications/ColossalChat/coati/models/critic.py
index 80340d9bd43d..a5761dabe179 100755
--- a/applications/ColossalChat/coati/models/critic.py
+++ b/applications/ColossalChat/coati/models/critic.py
@@ -32,3 +32,9 @@ def forward(self, input_ids: torch.LongTensor, attention_mask: Optional[torch.Te
         )
         values = self.value_head(sequence_hidden_states).squeeze(-1)  # ensure shape is (B, sequence length)
         return values
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def get_output_embeddings(self):
+        return self.model.get_output_embeddings()
diff --git a/applications/ColossalChat/coati/models/reward_model.py b/applications/ColossalChat/coati/models/reward_model.py
index 18c5eca41a71..394f3ea90a42 100755
--- a/applications/ColossalChat/coati/models/reward_model.py
+++ b/applications/ColossalChat/coati/models/reward_model.py
@@ -36,3 +36,9 @@ def forward(self, input_ids: torch.LongTensor, attention_mask: Optional[torch.Te
         )
         values = self.value_head(sequence_hidden_states).squeeze(-1)  # Ensure shape is (B,)
         return values
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def get_output_embeddings(self):
+        return self.model.get_output_embeddings()
diff --git a/applications/ColossalChat/config/conversation_template/01-ai_Yi-1.5-9B-Chat.json b/applications/ColossalChat/config/conversation_template/01-ai_Yi-1.5-9B-Chat.json
new file mode 100644
index 000000000000..455b1e1b316e
--- /dev/null
+++ b/applications/ColossalChat/config/conversation_template/01-ai_Yi-1.5-9B-Chat.json
@@ -0,0 +1,8 @@
+{
+    "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}",
+    "system_message": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+    "stop_ids": [
+        7
+    ],
+    "end_of_assistant": "<|im_end|>"
+}
diff --git a/applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-110B-Chat.json b/applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-110B-Chat.json
new file mode 100644
index 000000000000..58941a5918ff
--- /dev/null
+++ b/applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-110B-Chat.json
@@ -0,0 +1,9 @@
+{
+    "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "system_message": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+    "stop_ids": [
+        151645,
+        151643
+    ],
+    "end_of_assistant": "<|im_end|>"
+}
diff --git a/applications/ColossalChat/config/conversation_template/THUDM_chatglm2-6b.json b/applications/ColossalChat/config/conversation_template/THUDM_chatglm2-6b.json
new file mode 100644
index 000000000000..b87a18c8d66f
--- /dev/null
+++ b/applications/ColossalChat/config/conversation_template/THUDM_chatglm2-6b.json
@@ -0,0 +1,12 @@
+{
+    "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "system_message": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+    "stop_ids": [
+        31007,
+        326,
+        30962,
+        437,
+        31007
+    ],
+    "end_of_assistant": "<|im_end|>"
+}
diff --git a/applications/ColossalChat/config/conversation_template/THUDM_chatglm3-6b.json b/applications/ColossalChat/config/conversation_template/THUDM_chatglm3-6b.json
new file mode 100644
index 000000000000..c39f6e4b1f74
--- /dev/null
+++ b/applications/ColossalChat/config/conversation_template/THUDM_chatglm3-6b.json
@@ -0,0 +1,8 @@
+{
+    "chat_template": "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
+    "system_message": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+    "stop_ids": [
+        2
+    ],
+    "end_of_assistant": "<|user|>"
+}
diff --git a/applications/ColossalChat/config/conversation_template/Vicuna.json b/applications/ColossalChat/config/conversation_template/Vicuna.json
deleted file mode 100644
index 2b00b6529720..000000000000
--- a/applications/ColossalChat/config/conversation_template/Vicuna.json
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-    "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don\\'t know the answer to a question, please don\\'t share false information.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
-    "system_message": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
-    "stop_ids": [
-        2
-    ]
-}
diff --git a/applications/ColossalChat/config/conversation_template/Yi.json b/applications/ColossalChat/config/conversation_template/Yi.json
deleted file mode 100644
index 9716413b53ad..000000000000
--- a/applications/ColossalChat/config/conversation_template/Yi.json
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-    "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
-    "system_message": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
-    "stop_ids": [
-        2
-    ]
-}
diff --git a/applications/ColossalChat/config/conversation_template/chatGLM2.json b/applications/ColossalChat/config/conversation_template/baichuan-inc_Baichuan2-13B-Chat.json
similarity index 90%
rename from applications/ColossalChat/config/conversation_template/chatGLM2.json
rename to applications/ColossalChat/config/conversation_template/baichuan-inc_Baichuan2-13B-Chat.json
index a2638dbe7439..809c1d9f90f9 100644
--- a/applications/ColossalChat/config/conversation_template/chatGLM2.json
+++ b/applications/ColossalChat/config/conversation_template/baichuan-inc_Baichuan2-13B-Chat.json
@@ -3,5 +3,6 @@
     "system_message": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
     "stop_ids": [
         2
-    ]
+    ],
+    "end_of_assistant": "<|im_end|>"
 }
diff --git a/applications/ColossalChat/config/conversation_template/colossal-llama2.json b/applications/ColossalChat/config/conversation_template/colossal-llama2.json
index cc7f1e5d76fc..d2f9d88997f2 100644
--- a/applications/ColossalChat/config/conversation_template/colossal-llama2.json
+++ b/applications/ColossalChat/config/conversation_template/colossal-llama2.json
@@ -3,5 +3,6 @@
     "system_message": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
     "stop_ids": [
         2
-    ]
+    ],
+    "end_of_assistant": "</s>"
 }
diff --git a/applications/ColossalChat/config/conversation_template/deepseek-ai_DeepSeek-V2-Lite.json b/applications/ColossalChat/config/conversation_template/deepseek-ai_DeepSeek-V2-Lite.json
new file mode 100644
index 000000000000..aad482bfbb9f
--- /dev/null
+++ b/applications/ColossalChat/config/conversation_template/deepseek-ai_DeepSeek-V2-Lite.json
@@ -0,0 +1,8 @@
+{
+    "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
+    "system_message": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+    "stop_ids": [
+        100001
+    ],
+    "end_of_assistant": "<｜end▁of▁sentence｜>"
+}
diff --git a/applications/ColossalChat/config/conversation_template/llama2.json b/applications/ColossalChat/config/conversation_template/llama2.json
index 80558f976e3b..a6975e64030a 100644
--- a/applications/ColossalChat/config/conversation_template/llama2.json
+++ b/applications/ColossalChat/config/conversation_template/llama2.json
@@ -3,5 +3,6 @@
     "system_message": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
     "stop_ids": [
         2
-    ]
+    ],
+    "end_of_assistant": "</s>"
 }
diff --git a/applications/ColossalChat/config/conversation_template/Qwen.json b/applications/ColossalChat/config/conversation_template/microsoft_phi-2.json
similarity index 88%
rename from applications/ColossalChat/config/conversation_template/Qwen.json
rename to applications/ColossalChat/config/conversation_template/microsoft_phi-2.json
index 09f706ffed90..096f5138e4fb 100644
--- a/applications/ColossalChat/config/conversation_template/Qwen.json
+++ b/applications/ColossalChat/config/conversation_template/microsoft_phi-2.json
@@ -2,6 +2,7 @@
     "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
     "system_message": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
     "stop_ids": [
-        null
-    ]
+        50256
+    ],
+    "end_of_assistant": "<|im_end|>"
 }
diff --git a/applications/ColossalChat/config/conversation_template/mistral.json b/applications/ColossalChat/config/conversation_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json
similarity index 93%
rename from applications/ColossalChat/config/conversation_template/mistral.json
rename to applications/ColossalChat/config/conversation_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json
index b48c3a3f27af..4e143b5377be 100644
--- a/applications/ColossalChat/config/conversation_template/mistral.json
+++ b/applications/ColossalChat/config/conversation_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json
@@ -3,5 +3,6 @@
     "system_message": null,
     "stop_ids": [
         2
-    ]
+    ],
+    "end_of_assistant": "</s>"
 }
diff --git a/applications/ColossalChat/config/conversation_template/zephyr.json b/applications/ColossalChat/config/conversation_template/zephyr.json
deleted file mode 100644
index 2ab14111108b..000000000000
--- a/applications/ColossalChat/config/conversation_template/zephyr.json
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-    "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
-    "system_message": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
-    "stop_ids": [
-        2
-    ]
-}
diff --git a/applications/ColossalChat/examples/README.md b/applications/ColossalChat/examples/README.md
index cfed3f1f3a75..a29fc7508e60 100755
--- a/applications/ColossalChat/examples/README.md
+++ b/applications/ColossalChat/examples/README.md
@@ -1,7 +1,9 @@
 # Examples
 
+
 ## Table of Contents
 
+
 - [Examples](#examples)
   - [Table of Contents](#table-of-contents)
   - [Install Requirements](#install-requirements)
@@ -27,28 +29,36 @@
   - [Alternative Option For RLHF: Direct Preference Optimization](#alternative-option-for-rlhf-direct-preference-optimization)
     - [DPO Stage 1: Supervised Instruction Tuning](#dpo-training-stage1---supervised-instructs-tuning)
     - [DPO Stage 2: DPO Training](#dpo-training-stage2---dpo-training)
+  - [List of Supported Models](#list-of-supported-models)
   - [Hardware Requirements](#hardware-requirements)
   - [Inference example](#inference-example)
   - [Attention](#attention)
 
+
 ---
 
+
 ## Install requirements
 
+
 ```shell
 pip install -r requirements.txt
 ```
 
 
+
+
 ## Get Start with ColossalRun
 
-You can use colossalai run to launch multi-nodes training:
+
+You can use colossalai run to launch multi-node training:
 ```
 colossalai run --nproc_per_node YOUR_GPU_PER_NODE --hostfile YOUR_HOST_FILE \
 train.py --OTHER_CONFIGURATIONS
 ```
 Here is a sample hostfile:
 
+
 ```
 hostname1
 hostname2
@@ -56,21 +66,29 @@ hostname3
 hostname4
 ```
 
-Make sure master node can access all nodes (including itself) by ssh without password. Here are some other arguments.
+
+Make sure the master node can access all nodes (including itself) by ssh without a password. Here are some other arguments.
+
 
 - nnodes: number of nodes used in the training
 - nproc-per-node: specifies the number of processes to be launched per node
 - rdzv-endpoint: address of the host node
 
+
 ### Training Configuration
 
-This section gives a simple introduction on different training strategies that you can use and how to use them with our boosters and plugins to reduce training time and VRAM consumption. For more detail regarding training strategies, please refer to [here](https://colossalai.org/docs/concepts/paradigms_of_parallelism). For details regarding boosters and plugins, please refer to [here](https://colossalai.org/docs/basics/booster_plugins).
 
+This section gives a simple introduction on different training strategies that you can use and how to use them with our boosters and plugins to reduce training time and VRAM consumption. For more details regarding training strategies, please refer to [here](https://colossalai.org/docs/concepts/paradigms_of_parallelism). For details regarding boosters and plugins, please refer to [here](https://colossalai.org/docs/basics/booster_plugins).
+
+
+
+
+<details><summary><b>Gemini (Zero3)</b></summary>
 
-<details><summary><b>Gemini</b></summary>
 
 This plugin implements Zero-3 with chunk-based and heterogeneous memory management. It can train large models without much loss in speed. It also does not support local gradient accumulation. More details can be found in [Gemini Doc](https://colossalai.org/docs/features/zero_with_chunk).
 
+
 Below shows how to use the gemini in SFT training.
 ```
 colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
@@ -89,13 +107,17 @@ colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile trai
     --use_wandb
 ```
 
+
 </details>
 
-<details><summary><b>Gemini-Auto</b></summary>
 
-This option use gemini and will automatically offload tensors with low priority to cpu. It also does not support local gradient accumulation. More details can be found in [Gemini Doc](https://colossalai.org/docs/features/zero_with_chunk).
+<details><summary><b>Gemini-Auto (Zero3 with Auto-Resource-Allocation-Policy)</b></summary>
+
+
+This option uses gemini and will automatically offload tensors with low priority to cpu. It also does not support local gradient accumulation. More details can be found in [Gemini Doc](https://colossalai.org/docs/features/zero_with_chunk).
+
 
-Below shows how to use the gemin-auto in SFT training.
+Below shows how to use the gemini-auto in SFT training.
 ```
 colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
     --pretrain $PRETRAINED_MODEL_PATH \
@@ -113,13 +135,18 @@ colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile trai
     --use_wandb
 ```
 
+
 </details>
 
+
 </details>
 
+
 <details><summary><b>Zero2</b></summary>
 
-This option will distribute the optimizer parameters and the gradient to multiple GPUs and won't offload weights to cpu. It uses reduce and gather to synchronize gradients and weights. It does not support local gradient accumulation. Though you can accumulate gradient if you insist, it cannot reduce communication cost. That is to say, it's not a good idea to use Zero-2 with pipeline parallelism.
+
+This option will distribute the optimizer parameters and the gradient to multiple GPUs and won't offload weights to cpu. It uses reduce and gather to synchronize gradients and weights. It does not support local gradient accumulation. Though you can accumulate gradients if you insist, it cannot reduce communication cost. That is to say, it's not a good idea to use Zero-2 with pipeline parallelism.
+
 
 Below shows how to use the zero2 in SFT training.
 ```
@@ -139,12 +166,17 @@ colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile trai
     --use_wandb
 ```
 
+
 </details>
 
 
+
+
 <details><summary><b>Zero2CPU</b></summary>
 
-This option will distribute the optimizer parameters and the gradient to multiple GPUs as well as offload parameters to cpu. It does not support local gradient accumulation. Though you can accumulate gradient if you insist, it cannot reduce communication cost.
+
+This option will distribute the optimizer parameters and the gradient to multiple GPUs as well as offload parameters to cpu. It does not support local gradient accumulation. Though you can accumulate gradients if you insist, it cannot reduce communication cost.
+
 
 Below shows how to use the zero2-cpu in SFT training.
 ```
@@ -164,11 +196,20 @@ colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile trai
     --use_wandb
 ```
 
+
 </details>
 
+
 <details><summary><b>Tensor Parallelism</b></summary>
 
-This option support Tensor Parallelism (TP). Note that if you want to use TP, zero and pipeline parallelism will be disabled. TP split large model weights/optimizer parameters/gradients into multiple small ones and distributes them to multiple GPUs, hence it is recommended to use TP when your model is large (e.g. 20B and above) or your training algorithm consumes a lot of memory (e.g. PPO).
+
+This option supports Tensor Parallelism (TP). Note that if you want to use TP, TP split large model weights/optimizer parameters/gradients into multiple small ones and distributes them to multiple GPUs, hence it is recommended to use TP when your model is large (e.g. 20B and above) or your training algorithm consumes a lot of memory (e.g. PPO). Currently, we have added support for TP for the following model architectures.
+
+
+```
+bert, LLaMA, T5, GPT2, GPT-J, OPT, Bloom, Whisper, Sam, Blip2, ChatGLM (up to ChatGLM2), Falcon, Qwen2
+```
+
 
 Below shows how to use the TP in PPO training.
 ```
@@ -181,7 +222,7 @@ colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 30039 train_
     --pretrain_dataset ${ptx_dataset[@]} \
     --ptx_batch_size 1 \
     --ptx_coef 0.0 \
-    --plugin "zero2" \
+    --plugin "3d" \
     --save_interval 200 \
     --save_path $SAVE_DIR \
     --num_episodes 2000 \
@@ -200,13 +241,87 @@ colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 30039 train_
     --use_wandb
 ```
 
+
+</details>
+
+
+<details><summary><b>Sequence Parallelism</b></summary>
+
+
+This option supports Sequence Parallelism (SP). It is recommended to use SP when your input sequence is very long (e.g. 50K and above). Please refer to this [SP Doc](https://github.com/hpcaitech/ColossalAI/blob/b96c6390f4363f58c0df56c0ca28755f8a5f1aa2/examples/tutorial/sequence_parallel/README.md?plain=1#L1) for more information.
+
+Below shows how to use the SP in SFT training.
+```
+# use the `split_gather` or `ring` sp mode
+colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --save_interval 5000 \
+    --save_path $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --plugin 3d \
+    --tp 4 \ # TP size, nproc_per_node must be divisible by it
+    --sp 1 \ # SP size, must be 1
+    --sp_mode 'split_gather' \ # or 'ring'
+    --enable_sequence_parallelism \ # must be set
+    --batch_size 4 \
+    --max_epochs 1 \
+    --accumulation_steps 4 \
+    --lr 2e-5 \
+    --max_len 2048 \
+    --use_wandb
+
+# use the `all_to_all` sp mode
+colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --save_interval 5000 \
+    --save_path $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --plugin 3d \
+    --tp 1 \ # TP size, must be 1
+    --sp 4 \ # SP size, nproc_per_node must be divisible by it
+    --sp_mode 'all_to_all' \
+    --enable_sequence_parallelism \ # must be set
+    --batch_size 4 \
+    --max_epochs 1 \
+    --accumulation_steps 4 \
+    --lr 2e-5 \
+    --max_len 2048 \
+    --use_wandb
+```
+
+
 </details>
 
 
+<details><summary><b>Advanced Training Configuration with the Hybrid Plugin</b></summary>
+
+User can use our HybridParallelPlugin for more advanced policy control. Currently, we have added support for the following model architectures.
+
+
+```
+bert, LLaMA, T5, GPT2, GPT-J, OPT, Bloom, Whisper, Sam, Blip2, ChatGLM (up to ChatGLM2), Falcon, Qwen2
+```
+
+- We support mixing tensor parallelism with zero1/zero2/zero3:
+to do that, set both `tp` and `zero_stage`
+- We support mixing tensor parallelism with pipeline parallelism:
+to do that, set both `tp` and `pp`
+
+</details>
+
+
+
+
 <details><summary><b>Gradient Checkpointing</b></summary>
 
+
 This option saves VRAM consumption by selectively recomputing some of the intermediate value on-the-fly during the backward pass, rather than storing them in memory.
 
+
 To enable gradient checkpointing, add --grad_checkpoint to your training script.
 ```
 colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
@@ -226,12 +341,16 @@ colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile trai
     --use_wandb
 ```
 
+
 </details>
 
+
 <details><summary><b>Flash Attention</b></summary>
 
+
 Details about flash attention can be found in the paper: [FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness](https://arxiv.org/abs/2205.14135).
 
+
 To enable flash attention, add --use_flash_attn to your training script.
 ```
 colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
@@ -251,11 +370,15 @@ colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile trai
     --use_wandb
 ```
 
+
 </details>
 
+
 <details><summary><b>Low Rank Adaption</b></summary>
 
-Details about Low Rank Adaption (LoRA) can be found in the paper: [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685). It dramatically reduce the VRAM consumption at the cost of sacrifice model capability. It is suitable for training LLM with constrained resources.
+
+Details about Low Rank Adaption (LoRA) can be found in the paper: [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685). It dramatically reduces the VRAM consumption at the cost of sacrifice model capability. It is suitable for training LLM with constrained resources.
+
 
 To enable LoRA, set --lora_rank to a positive value (usually between 20 and 64).
 ```
@@ -276,23 +399,26 @@ colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile trai
     --use_wandb
 ```
 
+
 </details>
 
+
 <details><summary><b>Other Training Arguments</b></summary>
 
-- grad_clip: gradient larger than this value will be clipped.
+
+- grad_clip: gradients larger than this value will be clipped.
 - weight_decay: weight decay hyper-parameter.
 - warmup_steps: number of warmup steps used in setting up the learning rate scheduler.
 - pretrain: pretrain model path, weights will be loaded from this pretrained model unless checkpoint_path is provided.
-- tokenizer_dir: specify where to load the tokenizer, if not provided, tokenizer will be loaded from pretrain model path.
-- dataset: a list of strings, each is a path to a folder contains buffered dataset files in arrow format.
+- tokenizer_dir: specify where to load the tokenizer, if not provided, tokenizer will be loaded from the pretrained model path.
+- dataset: a list of strings, each is a path to a folder containing buffered dataset files in arrow format.
 - checkpoint_path: if provided, will load weights from the checkpoint_path.
 - config_file: path to store the training config file.
 - save_dir: path to store the model checkpoints.
-- max_length: input will be padded/truncate to max_length before feeding to the model.
-- max_epochs: number of epoch to train.
+- max_length: input will be padded/truncated to max_length before feeding to the model.
+- max_epochs: number of epochs to train.
 - batch_size: training batch size.
-- mixed_precision: precision to use in training. Support 'fp16' and 'bf16'. Note that some device may not support the 'bf16' option, please refer to [Nvidia](https://developer.nvidia.com/) to check compatibility.
+- mixed_precision: precision to use in training. Support 'fp16' and 'bf16'. Note that some devices may not support the 'bf16' option, please refer to [Nvidia](https://developer.nvidia.com/) to check compatibility.
 - save_interval: save the model weights as well as optimizer/scheduler states every save_interval steps/episodes.
 - merge_lora_weights: whether to merge lora weights before saving the model
 - lr: the learning rate used in training.
@@ -300,15 +426,20 @@ colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile trai
 - log_dir: path to store the log.
 - use_wandb: if this flag is up, you can view logs on wandb.
 
+
 </details>
 
+
 ### RLHF Training Stage1 - Supervised Instructs Tuning
 
+
 Stage1 is supervised instructs fine-tuning (SFT). This step is a crucial part of the RLHF training process, as it involves training a machine learning model using human-provided instructions to learn the initial behavior for the task at hand. Here's a detailed guide on how to SFT your LLM with ColossalChat:
 
+
 #### Step 1: Data Collection
 The first step in Stage 1 is to collect a dataset of human demonstrations of the following format.
 
+
 ```json
 [
     {"messages":
@@ -328,45 +459,69 @@ The first step in Stage 1 is to collect a dataset of human demonstrations of the
 ]
 ```
 
+
 #### Step 2: Preprocessing
 Once you have collected your SFT dataset, you will need to preprocess it. This involves four steps: data cleaning, data deduplication, formatting and tokenization. In this section, we will focus on formatting and tokenization.
 
+
 In this code we provide a flexible way for users to set the conversation template for formatting chat data using Huggingface's newest feature--- chat template. Please follow the following steps to define your chat template and preprocess your data.
 
+
 - Step 1: (Optional). Define your conversation template. You need to provide a conversation template config file similar to the config files under the ./config/conversation_template directory. This config should include the following fields.
   ```json
   {
       "chat_template": (Optional), A string of chat_template used for formatting chat data. If not set (None), will use the default chat template of the provided tokenizer. If a path to a huggingface model or local model is provided, will use the chat_template of that model. To use a custom chat template, you need to manually set this field. For more details on how to write a chat template in Jinja format, please read https://huggingface.co/docs/transformers/main/chat_templating,
       "system_message": A string of system message to be added at the beginning of the prompt. If no is provided (None), no system message will be added,
-      "stop_ids": (Optional), A list of string indicating the end of assistant's response during the rollout stage of PPO training. It's recommended to set this manually for PPO training. If not set, will set to tokenizer.eos_token_ids automatically,
+      "end_of_assistant": The token(s) in string that denotes the end of assistance's response. For example, in the ChatGLM2 prompt format,
+      ```
+      <|im_start|>system
+      system messages
+
+      <|im_end|>
+      <|im_start|>user
+       How far is the moon? <|im_end|>
+      <|im_start|>assistant\n The moon is about 384,400 kilometers away from Earth.<|im_end|>...
+       ```
+       the end_of_assistant tokens are "<|im_end|>"
+      "stop_ids": (Optional), A list of integers corresponds to the `end_of_assistant` tokens that indicate the end of assistance's response during the rollout stage of PPO training. It's recommended to set this manually for PPO training. If not set, will set to tokenizer.eos_token_ids automatically
   }
   ```
   On your first run of the data preparation script, you only need to define the "chat_template" (if you want to use custom chat template) and the "system message" (if you want to use a custom system message),
 
+
 - Step 2: Run the data preparation script--- [prepare_sft_dataset.sh](./examples/data_preparation_scripts/prepare_sft_dataset.sh). Note that whether or not you have skipped the first step, you need to provide the path to the conversation template config file (via the conversation_template_config arg). If you skipped the first step, an auto-generated conversation template will be stored at the designated file path.
 
+
 - Step 3: (Optional) Check the correctness of the processed data. We provided an easy way for you to do a manual checking on the processed data by checking the "$SAVE_DIR/jsonl/part-XXXX.jsonl" files.
 
+
 Finishing the above steps, you have converted the raw conversation to the designated chat format and tokenized the formatted conversation, calculate input_ids, labels, attention_masks and buffer those into binary dataset files under "$SAVE_DIR/arrow/part-XXXX" folders.
 
+
 For example, our Colossal-LLaMA-2 format looks like,
 ```
 <s> A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
 
+
 Human: <s> what are some pranks with a pen i can do?</s> Assistant: <s> Are you looking for practical joke ideas?</s>
 ...
 ```
 
+
 #### Step 3: Training
 Choose a suitable model architecture for your task. Note that your model should be compatible with the tokenizer that you used to tokenize the SFT dataset. You can run [train_sft.sh](./examples/training_scripts/train_sft.sh) to start a supervised instructs fine-tuning. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
 
+
 ### RLHF Training Stage2 - Training Reward Model
 
+
 Stage2 trains a reward model, which obtains corresponding scores by manually ranking different outputs for the same prompt and supervises the training of the reward model.
 
+
 #### Step 1: Data Collection
 Below shows the preference dataset format used in training the reward model.
 
+
 ```json
 [
     {"context": [
@@ -394,42 +549,54 @@ Below shows the preference dataset format used in training the reward model.
 ]
 ```
 
+
 #### Step 2: Preprocessing
 Similar to the second step in the previous stage, we format the reward data into the same structured format as used in step 2 of the SFT stage. You can run [prepare_preference_dataset.sh](./examples/data_preparation_scripts/prepare_preference_dataset.sh) to prepare the preference data for reward model training.
 
+
 #### Step 3: Training
 You can run [train_rm.sh](./examples/training_scripts/train_rm.sh) to start the reward model training. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
 
+
 #### Features and Tricks in RM Training
 
+
 - We recommend using the [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf)and[rm-static](https://huggingface.co/datasets/Dahoas/rm-static) datasets for training the reward model.
 - We support 2 kinds of loss function named `log_sig`(used by OpenAI) and `log_exp`(used by Anthropic).
 - We log the training accuracy `train/acc`, `reward_chosen` and `reward_rejected` to monitor progress during training.
 - We use cosine-reducing lr-scheduler for RM training.
-- We set value_head as 1 liner layer and initialize the weight of value_head using N(0，1/(d_model + 1)) distribution.
+- We set value_head as one liner layer and initialize the weight of value_head using the N(0，1/(d_model + 1)) distribution.
+
 
 #### Note on Reward Model Training
 
-Before you move on the next stage, please check the following list to ensure that your reward model is stable and robust. You can check the reward chart and the accuracy chart on wandb.
+
+Before you move on to the next stage, please check the following list to ensure that your reward model is stable and robust. You can check the reward chart and the accuracy chart on wandb.
 - The mean reward for chosen data is much higher than those for rejected data
 - The accuracy is larger than 0.5 by a significant margin (usually should be greater than 0.6)
 - Optional：check the reward is positive for chosen data vice versa
 
+
 Your training reward curves should look similar to the following charts.
 <p align="center">
 <img width="1000" alt="image" src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/mean_reward_chart.png">
 </p>
 
+
 ### RLHF Training Stage3 - Proximal Policy Optimization
 
+
 In stage3 we will use reinforcement learning algorithm--- Proximal Policy Optimization (PPO), which is the most complex part of the training process:
 
+
 <p align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/stage-3.jpeg" width=800/>
 </p>
 
+
 #### Step 1: Data Collection
-PPO uses two kind of training data--- the prompt data and the pretrain data (optional). The first dataset is mandatory, data samples within the prompt dataset ends with a line from "human" and thus the "assistant" needs to generate a response to answer to the "human". Note that you can still use conversation that ends with a line from the "assistant", in that case, the last line will be dropped. Here is an example of the prompt dataset format.
+PPO uses two kinds of training data--- the prompt data and the pretrain data (optional). The first dataset is mandatory, data samples within the prompt dataset ends with a line from "human" and thus the "assistant" needs to generate a response to answer to the "human". Note that you can still use conversation that ends with a line from the "assistant", in that case, the last line will be dropped. Here is an example of the prompt dataset format.
+
 
 ```json
 [
@@ -445,8 +612,10 @@ PPO uses two kind of training data--- the prompt data and the pretrain data (opt
 ]
 ```
 
+
 The second dataset--- pretrained dataset is optional, provide it if you want to use the ptx loss introduced in the [InstructGPT paper](https://arxiv.org/abs/2203.02155). It follows the following format.
 
+
 ```json
   [
       {
@@ -459,11 +628,14 @@ The second dataset--- pretrained dataset is optional, provide it if you want to
 #### Step 2: Preprocessing
 To prepare the prompt dataset for PPO training, simply run [prepare_prompt_dataset.sh](./examples/data_preparation_scripts/prepare_prompt_dataset.sh)
 
-You can use the SFT dataset you prepared in the SFT stage or prepare a new one from different source for the ptx dataset. The ptx data is used to calculate ptx loss, which stablize the training according to the [InstructGPT paper](https://arxiv.org/pdf/2203.02155.pdf).
+
+You can use the SFT dataset you prepared in the SFT stage or prepare a new one from different source for the ptx dataset. The ptx data is used to calculate ptx loss, which stabilizes the training according to the [InstructGPT paper](https://arxiv.org/pdf/2203.02155.pdf).
+
 
 #### Step 3: Training
 You can run the [train_ppo.sh](./examples/training_scripts/train_ppo.sh) to start PPO training. Here are some unique arguments for PPO, please refer to the training configuration section for other training configuration. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
 
+
 ```bash
 --pretrain $PRETRAINED_MODEL_PATH \
 --rm_pretrain $PRETRAINED_MODEL_PATH \ # reward model architectural
@@ -482,7 +654,9 @@ You can run the [train_ppo.sh](./examples/training_scripts/train_ppo.sh) to star
 --accumulation_steps 2
 ```
 
-Each episode has two phases, the collect phase and the update phase. During the collect phase, we will collect experiences (answers generated by actor), store those in ExperienceBuffer. Then data in ExperienceBuffer is used during the update phase to update parameter of actor and critic.
+
+Each episode has two phases, the collect phase and the update phase. During the collect phase, we will collect experiences (answers generated by the actor), store those in ExperienceBuffer. Then data in ExperienceBuffer is used during the update phase to update parameters of actor and critic.
+
 
 - Without tensor parallelism,
 ```
@@ -491,6 +665,7 @@ experience buffer size
 = train_batch_size * accumulation_steps * num_process
 ```
 
+
 - With tensor parallelism,
 ```
 num_tp_group = num_process / tp
@@ -499,47 +674,60 @@ experience buffer size
 = train_batch_size * accumulation_steps * num_tp_group
 ```
 
+
 ### Sample Training Results Using Default Script
 #### Reward
 <p align="center">
 <img width="700" alt="image" src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/reward.png">
 </p>
 
+
 ### Note on PPO Training
 #### Q1: My reward is negative
-Answer: Check your reward model trained in stage 1. If the reward model only generate negative reward, we actually will expect a negative reward. However, even though the reward is negative, the reward should go up.
+Answer: Check your reward model trained in stage 1. If the reward model only generates negative reward, we actually will expect a negative reward. However, even though the reward is negative, the reward should go up.
+
 
 #### Q2: My actor loss is negative
 Answer: This is normal for actor loss as PPO doesn't restrict the actor loss to be positive.
 
+
 #### Q3: My reward doesn't go up (decreases)
-Answer: The causes to this problem are two-fold. Check your reward model, make sure that it gives positive and strong reward for good cases and negative, strong reward for bad responses. You should also try different hyperparameter settings.
+Answer: The causes of this problem are two-fold. Check your reward model, make sure that it gives positive and strong reward for good cases and negative, strong reward for bad responses. You should also try different hyperparameter settings.
+
 
 #### Q4: Generation is garbage
-Answer: Yes, this happens and is well documented by other implementations. After training for too many episodes, the actor gradually deviate from its original state, which may leads to decrease in language modeling capabilities. A way to fix this is to add supervised loss during PPO. Set ptx_coef to a none-zero value (between 0 and 1), which balances PPO loss and sft loss.
+Answer: Yes, this happens and is well documented by other implementations. After training for too many episodes, the actor gradually deviate from its original state, which may leads to decrease in language modeling capabilities. A way to fix this is to add supervised loss during PPO. Set ptx_coef to an non-zero value (between 0 and 1), which balances PPO loss and sft loss.
+
 
 ## Alternative Option For RLHF: Direct Preference Optimization
 
+
 For those seeking an alternative to Reinforcement Learning from Human Feedback (RLHF), Direct Preference Optimization (DPO) presents a compelling option. DPO, as detailed in the paper (available at [https://arxiv.org/abs/2305.18290](https://arxiv.org/abs/2305.18290)), DPO offers an low-cost way to perform RLHF and usually request less computation resources compares to PPO.
 
+
 ### DPO Training Stage1 - Supervised Instructs Tuning
 
+
 Please refer the [sft section](#dpo-training-stage1---supervised-instructs-tuning) in the PPO part.
 
+
 ### DPO Training Stage2 - DPO Training
 #### Step 1: Data Collection & Preparation
 For DPO training, you only need the preference dataset. Please follow the instruction in the [preference dataset preparation section](#rlhf-training-stage2---training-reward-model) to prepare the preference data for DPO training.
 
+
 #### Step 2: Training
 You can run the [train_dpo.sh](./examples/training_scripts/train_dpo.sh) to start DPO training. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
 
+
 #### DPO Result
 <p align="center">
 <img width="1000" alt="image" src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/DPO.png">
 </p>
 
+
 ## Hardware Requirements
-For PPO, we suggest using Tensor Parallelism. The following table shows the VRAM consumption of training a 7B model on a dummy dataset with 2048 sequence length and 512 layout length with different tp_size (equal to the number of GPUs). In this experiment, we use H800 GPU with 80GB VRAM.
+For PPO, we suggest using Tensor Parallelism. The following table shows the VRAM consumption of training a 7B model on a dummy dataset with 2048 sequence length and 512 layout length with different tp_size (equal to the number of GPUs). In this experiment, we use an H800 GPU with 80GB VRAM.
 | PPO   | tp=8          | tp=4          |
 |-------|---------------|---------------|
 | bs=1  | 18485.19 MB   | 42934.45 MB   |
@@ -547,19 +735,45 @@ For PPO, we suggest using Tensor Parallelism. The following table shows the VRAM
 | bs=16 | 41408.28 MB   | 56778.97 MB   |
 | bs=30 | 64047.42 MB   | failed        |
 
+
 For DPO, we recommend using zero2 or zero2-cpu. We tested the VRAM consumption on a dummy dataset with 2048 sequence length.
 
+
 - 1 H800 GPU
   - zero2-cpu, batch size=2, VRAM Usage=49873.90 MB
   - zero2-cpu, batch size=4, VRAM Usage=60998.22 MB
 - 4 H800 GPUs
   - zero2, batch size=4, VRAM Usage=67544.47 MB
 
+## List of Supported Models
+
+For SFT, we support the following models/series:
+- Colossal-LLaMA-2
+- ChatGLM2
+- ChatGLM3 (only with zero2, zero2_cpu plugin)
+- Baichuan2
+- LLaMA2
+- Qwen1.5-7B-Chat (with transformers==4.39.1)
+- Yi-1.5
+
+For PPO and DPO, we theoratically support the following models/series (without guarantee):
+- Colossal-LLaMA-2 (tested)
+- ChatGLM2
+- Baichuan2
+- LLaMA2 (tested)
+- Qwen1.5-7B-Chat (with transformers==4.39.1)
+- Yi-1.5
+
+*-* The zero2, zero2_cpu plugin also support a wide range of chat models not listed above.
+
 ## Inference example
 
+
 We support different inference options, including int8 and int4 quantization.
 For details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/inference).
 
+
 ## Attention
 
+
 The examples are demos for the whole training process. You need to change the hyper-parameters to reach great performance.
diff --git a/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh b/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh
index cf937db2a84b..8562b47ee996 100755
--- a/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh
+++ b/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh
@@ -5,7 +5,7 @@ rm -rf $SAVE_DIR/jsonl
 rm -rf $SAVE_DIR/arrow
 
 python prepare_dataset.py --type sft \
-    --data_input_dirs /PATH/TO/SFT/DATASET \
+    --data_input_dirs "PATH/TO/SFT/DATA" \
     --conversation_template_config /PATH/TO/CHAT/TEMPLATE/CONFIG.json \
     --tokenizer_dir  "" \
     --data_cache_dir $SAVE_DIR/cache \
diff --git a/applications/ColossalChat/examples/training_scripts/hostfile b/applications/ColossalChat/examples/training_scripts/hostfile
index d4118dda9783..c7aed75a331a 100755
--- a/applications/ColossalChat/examples/training_scripts/hostfile
+++ b/applications/ColossalChat/examples/training_scripts/hostfile
@@ -1 +1,5 @@
-10.20.1.82
+XXX.XX.XXX.XXX   # Your master IP
+XXX.XX.XXX.XXX   # Your slave IPs
+XXX.XX.XXX.XXX   # Your slave IPs
+XXX.XX.XXX.XXX   # Your slave IPs
+XXX.XX.XXX.XXX   # Your slave IPs
diff --git a/applications/ColossalChat/examples/training_scripts/train_dpo.py b/applications/ColossalChat/examples/training_scripts/train_dpo.py
index f06c23a9f704..a5b4cb3bd66e 100755
--- a/applications/ColossalChat/examples/training_scripts/train_dpo.py
+++ b/applications/ColossalChat/examples/training_scripts/train_dpo.py
@@ -5,12 +5,7 @@
 from contextlib import nullcontext
 
 import torch
-from coati.dataset import (
-    DataCollatorForPreferenceDataset,
-    StatefulDistributedSampler,
-    load_tokenized_dataset,
-    setup_distributed_dataloader,
-)
+from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler, load_tokenized_dataset
 from coati.models import convert_to_lora_module, disable_dropout
 from coati.trainer import DPOTrainer
 from coati.utils import load_checkpoint
@@ -56,6 +51,7 @@ def train(args):
             initial_scale=2**16,
             max_norm=args.grad_clip,
             enable_gradient_accumulation=True,
+            enable_flash_attention=args.use_flash_attn,
         )
     elif args.plugin == "gemini_auto":
         plugin = GeminiPlugin(
@@ -63,6 +59,7 @@ def train(args):
             placement_policy="auto",
             initial_scale=2**16,
             max_norm=args.grad_clip,
+            enable_flash_attention=args.use_flash_attn,
         )
     elif args.plugin == "zero2":
         plugin = LowLevelZeroPlugin(
@@ -82,9 +79,15 @@ def train(args):
     elif args.plugin == "3d":
         plugin = HybridParallelPlugin(
             tp_size=args.tp,
-            pp_size=1,
-            zero_stage=0,
+            pp_size=args.pp,
+            sp_size=args.sp,
+            sequence_parallelism_mode=args.sp_mode,
+            zero_stage=args.zero_stage,
+            enable_flash_attention=args.use_flash_attn,
+            enable_sequence_parallelism=args.enable_sequence_parallelism,
+            cpu_offload=True if args.zero_stage >= 1 and args.zero_cpu_offload else False,
             parallel_output=False,
+            max_norm=args.grad_clip,
             precision=args.mixed_precision,
         )
     else:
@@ -166,13 +169,14 @@ def train(args):
     mode_map = {"train": "train", "valid": "validation", "test": "test"}
     train_dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train", mode_map=mode_map)
     data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)
-    train_dataloader = setup_distributed_dataloader(
+
+    train_dataloader = plugin.prepare_dataloader(
         dataset=train_dataset,
         batch_size=args.batch_size,
         shuffle=True,
         drop_last=True,
         collate_fn=data_collator,
-        use_tp=args.tp > 1,
+        distributed_sampler_cls=StatefulDistributedSampler,
     )
 
     num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
@@ -290,6 +294,12 @@ def train(args):
     parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
     parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
     parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--pp", type=int, default=1)
+    parser.add_argument("--sp", type=int, default=1)
+    parser.add_argument("--enable_sequence_parallelism", default=False, action="store_true")
+    parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage", choices=[0, 1, 2])
+    parser.add_argument("--zero_cpu_offload", default=False, action="store_true")
+    parser.add_argument("--sp_mode", type=str, default="split_gather", choices=["split_gather", "ring", "all_to_all"])
     parser.add_argument("--pretrain", type=str, default=None)
     parser.add_argument("--model_type", type=str, default=None)
     parser.add_argument("--tokenizer_dir", type=str, default=None)
diff --git a/applications/ColossalChat/examples/training_scripts/train_ppo.py b/applications/ColossalChat/examples/training_scripts/train_ppo.py
index 727cff7ca564..3da3e9ca641e 100755
--- a/applications/ColossalChat/examples/training_scripts/train_ppo.py
+++ b/applications/ColossalChat/examples/training_scripts/train_ppo.py
@@ -12,7 +12,6 @@
     StatefulDistributedSampler,
     load_tokenized_dataset,
     setup_conversation_template,
-    setup_distributed_dataloader,
 )
 from coati.models import Critic, RewardModel, convert_to_lora_module, disable_dropout
 from coati.trainer import PPOTrainer
@@ -26,6 +25,7 @@
 from colossalai.logging import get_dist_logger
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
+from colossalai.shardformer.policies.auto_policy import get_autopolicy
 
 logger = get_dist_logger()
 
@@ -51,7 +51,6 @@ def train(args):
     # )
 
     init_ctx = nullcontext()
-    booster_policy = None
     with init_ctx:
         if args.use_flash_attn:
             actor = AutoModelForCausalLM.from_pretrained(
@@ -86,32 +85,6 @@ def train(args):
         disable_dropout(actor)
         disable_dropout(critic)
 
-        if args.tp > 1:
-            if reward_model.model.config.architectures[0] != critic.model.config.architectures[0]:
-                raise ValueError("Reward model and critic model must have the same architecture")
-            if reward_model.model.config.architectures[0] == "BloomForCausalLM":
-                from colossalai.shardformer.policies.bloom import BloomPolicy
-
-                booster_policy = BloomPolicy()
-            elif reward_model.model.config.architectures[0] == "LlamaForCausalLM":
-                from colossalai.shardformer.policies.llama import LlamaPolicy
-
-                booster_policy = LlamaPolicy()
-            elif reward_model.model.config.architectures[0] == "GPT2LMHeadModel":
-                from colossalai.shardformer.policies.gpt2 import GPT2Policy
-
-                booster_policy = GPT2Policy()
-            elif reward_model.model.config.architectures[0] == "ChatGLMModel":
-                from colossalai.shardformer.policies.chatglm2 import ChatGLMPolicy
-
-                booster_policy = ChatGLMPolicy()
-            elif reward_model.model.config.architectures[0] == "OPTForCausalLM":
-                from colossalai.shardformer.policies.opt import OPTPolicy
-
-                booster_policy = OPTPolicy()
-            else:
-                raise ValueError("Unknown model architecture for policy")
-
         if args.lora_rank > 0:
             actor = convert_to_lora_module(actor, args.lora_rank, lora_train_bias=args.lora_train_bias)
             critic = convert_to_lora_module(critic, args.lora_rank, lora_train_bias=args.lora_train_bias)
@@ -175,34 +148,6 @@ def train(args):
         adamw_mode=True,
     )
 
-    # configure dataset
-    coordinator.print_on_master(f"Load dataset: {args.prompt_dataset}")
-    mode_map = {"train": "train", "valid": "validation", "test": "test"}
-    train_prompt_dataset = load_tokenized_dataset(dataset_paths=args.prompt_dataset, mode="train", mode_map=mode_map)
-    data_collator = DataCollatorForPromptDataset(tokenizer=tokenizer, max_length=args.max_length - args.max_seq_len)
-    train_prompt_dataloader = setup_distributed_dataloader(
-        dataset=train_prompt_dataset,
-        batch_size=args.experience_batch_size,
-        shuffle=True,
-        drop_last=True,
-        collate_fn=data_collator,
-        use_tp=args.tp > 1,
-    )
-
-    if len(args.ptx_dataset) > 0:
-        train_ptx_dataset = load_tokenized_dataset(dataset_paths=args.ptx_dataset, mode="train", mode_map=mode_map)
-        data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer, max_length=args.max_length)
-        train_pretrain_dataloader = setup_distributed_dataloader(
-            dataset=train_ptx_dataset,
-            batch_size=args.ptx_batch_size,
-            shuffle=True,
-            drop_last=True,
-            collate_fn=data_collator,
-            use_tp=args.tp > 1,
-        )
-    else:
-        train_pretrain_dataloader = None
-
     if args.warmup_steps is None:
         args.warmup_steps = int(0.025 * args.num_episodes)
         coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}")
@@ -237,6 +182,7 @@ def train(args):
             initial_scale=2**16,
             max_norm=args.grad_clip,
             enable_gradient_accumulation=True,
+            enable_flash_attention=args.use_flash_attn,
         )
     elif args.plugin == "gemini_auto":
         plugin = GeminiPlugin(
@@ -244,6 +190,7 @@ def train(args):
             placement_policy="auto",
             initial_scale=2**16,
             max_norm=args.grad_clip,
+            enable_flash_attention=args.use_flash_attn,
         )
     elif args.plugin == "zero2":
         plugin = LowLevelZeroPlugin(
@@ -261,20 +208,35 @@ def train(args):
             max_norm=args.grad_clip,
         )
     elif args.plugin == "3d":
+        if args.use_flash_attn and (args.tp > 1 or args.pp > 1 or args.sp > 1 or args.enable_sequence_parallelism):
+            logger.warning("Flash attention cannot be used with 3D parallelism for PPO training. Disabling it.")
+            args.use_flash_attn = False
         plugin = HybridParallelPlugin(
             tp_size=args.tp,
-            pp_size=1,
-            zero_stage=0,
+            pp_size=args.pp,
+            sp_size=args.sp,
+            sequence_parallelism_mode=args.sp_mode,
+            zero_stage=args.zero_stage,
+            enable_flash_attention=args.use_flash_attn,
+            enable_sequence_parallelism=args.enable_sequence_parallelism,
+            cpu_offload=True if args.zero_stage >= 1 and args.zero_cpu_offload else False,
             parallel_output=False,
+            max_norm=args.grad_clip,
             precision=args.mixed_precision,
         )
         custom_plugin = HybridParallelPlugin(
             tp_size=args.tp,
-            pp_size=1,
-            zero_stage=0,
+            pp_size=args.pp,
+            sp_size=args.sp,
+            sequence_parallelism_mode=args.sp_mode,
+            zero_stage=args.zero_stage,
+            enable_flash_attention=args.use_flash_attn,
+            enable_sequence_parallelism=args.enable_sequence_parallelism,
+            cpu_offload=True if args.zero_stage >= 1 and args.zero_cpu_offload else False,
             parallel_output=False,
+            max_norm=args.grad_clip,
             precision=args.mixed_precision,
-            custom_policy=booster_policy,
+            custom_policy=get_autopolicy(reward_model.model),
         )
     else:
         raise ValueError(f"Unknown plugin {args.plugin}")
@@ -282,6 +244,35 @@ def train(args):
     if args.plugin != "3d":
         custom_plugin = plugin
 
+    # configure dataset
+    coordinator.print_on_master(f"Load dataset: {args.prompt_dataset}")
+    mode_map = {"train": "train", "valid": "validation", "test": "test"}
+    train_prompt_dataset = load_tokenized_dataset(dataset_paths=args.prompt_dataset, mode="train", mode_map=mode_map)
+    data_collator = DataCollatorForPromptDataset(tokenizer=tokenizer, max_length=args.max_length - args.max_seq_len)
+
+    train_prompt_dataloader = plugin.prepare_dataloader(
+        dataset=train_prompt_dataset,
+        batch_size=args.experience_batch_size,
+        shuffle=True,
+        drop_last=True,
+        collate_fn=data_collator,
+        distributed_sampler_cls=StatefulDistributedSampler,
+    )
+
+    if len(args.ptx_dataset) > 0:
+        train_ptx_dataset = load_tokenized_dataset(dataset_paths=args.ptx_dataset, mode="train", mode_map=mode_map)
+        data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer, max_length=args.max_length)
+        train_pretrain_dataloader = plugin.prepare_dataloader(
+            dataset=train_ptx_dataset,
+            batch_size=args.ptx_batch_size,
+            shuffle=True,
+            drop_last=True,
+            collate_fn=data_collator,
+            distributed_sampler_cls=StatefulDistributedSampler,
+        )
+    else:
+        train_pretrain_dataloader = None
+
     actor_booster = Booster(plugin=plugin)
     ref_booster = Booster(plugin=plugin)
     rm_booster = Booster(plugin=custom_plugin)
@@ -474,6 +465,12 @@ def train(args):
     parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
     parser.add_argument("--tokenizer_dir", type=str, default=None)
     parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--pp", type=int, default=1)
+    parser.add_argument("--sp", type=int, default=1)
+    parser.add_argument("--enable_sequence_parallelism", default=False, action="store_true")
+    parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage", choices=[0, 1, 2])
+    parser.add_argument("--zero_cpu_offload", default=False, action="store_true")
+    parser.add_argument("--sp_mode", type=str, default="split_gather", choices=["split_gather", "ring", "all_to_all"])
     parser.add_argument("--pretrain", type=str, default=None)
     parser.add_argument("--rm_pretrain", type=str, default=None)
     parser.add_argument("--checkpoint_path", type=str, default=None)
diff --git a/applications/ColossalChat/examples/training_scripts/train_rm.py b/applications/ColossalChat/examples/training_scripts/train_rm.py
index 364198c1d78b..ce0d02b5d2a4 100755
--- a/applications/ColossalChat/examples/training_scripts/train_rm.py
+++ b/applications/ColossalChat/examples/training_scripts/train_rm.py
@@ -6,12 +6,7 @@
 from contextlib import nullcontext
 
 import torch
-from coati.dataset import (
-    DataCollatorForPreferenceDataset,
-    StatefulDistributedSampler,
-    load_tokenized_dataset,
-    setup_distributed_dataloader,
-)
+from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler, load_tokenized_dataset
 from coati.models import LogExpLoss, LogSigLoss, RewardModel, convert_to_lora_module
 from coati.trainer import RewardModelTrainer
 from coati.utils import load_checkpoint
@@ -23,6 +18,7 @@
 from colossalai.cluster import DistCoordinator
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
+from colossalai.shardformer.policies.auto_policy import get_autopolicy
 
 
 def train(args):
@@ -46,7 +42,6 @@ def train(args):
     # )
 
     init_ctx = nullcontext()
-    booster_policy = None
     with init_ctx:
         if args.use_flash_attn:
             model = RewardModel(
@@ -56,31 +51,9 @@ def train(args):
             )
             coordinator.print_on_master(msg="Flash-attention enabled successfully")
         else:
-            model = RewardModel(args.pretrain)
-
-        if args.tp > 1:
-            if model.model.config.architectures[0] == "BloomForCausalLM":
-                from colossalai.shardformer.policies.bloom import BloomPolicy
-
-                booster_policy = BloomPolicy()
-            elif model.model.config.architectures[0] == "LlamaForCausalLM":
-                from colossalai.shardformer.policies.llama import LlamaPolicy
-
-                booster_policy = LlamaPolicy()
-            elif model.model.config.architectures[0] == "GPT2LMHeadModel":
-                from colossalai.shardformer.policies.gpt2 import GPT2Policy
-
-                booster_policy = GPT2Policy()
-            elif model.model.config.architectures[0] == "ChatGLMModel":
-                from colossalai.shardformer.policies.chatglm2 import ChatGLMPolicy
-
-                booster_policy = ChatGLMPolicy()
-            elif model.model.config.architectures[0] == "OPTForCausalLM":
-                from colossalai.shardformer.policies.opt import OPTPolicy
-
-                booster_policy = OPTPolicy()
-            else:
-                raise ValueError("Unknown model architecture for policy")
+            model = RewardModel(
+                args.pretrain,
+            )
 
         if args.lora_rank > 0:
             model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
@@ -100,6 +73,7 @@ def train(args):
             placement_policy="static",
             initial_scale=2**16,
             max_norm=args.grad_clip,
+            enable_flash_attention=args.use_flash_attn,
             enable_gradient_accumulation=True,
         )
     elif args.plugin == "gemini_auto":
@@ -107,6 +81,7 @@ def train(args):
             precision=args.mixed_precision,
             placement_policy="auto",
             initial_scale=2**16,
+            enable_flash_attention=args.use_flash_attn,
             max_norm=args.grad_clip,
         )
     elif args.plugin == "zero2":
@@ -127,11 +102,17 @@ def train(args):
     elif args.plugin == "3d":
         plugin = HybridParallelPlugin(
             tp_size=args.tp,
-            pp_size=1,
-            zero_stage=0,
+            pp_size=args.pp,
+            sp_size=args.sp,
+            sequence_parallelism_mode=args.sp_mode,
+            zero_stage=args.zero_stage,
+            enable_flash_attention=args.use_flash_attn,
+            enable_sequence_parallelism=args.enable_sequence_parallelism,
+            cpu_offload=True if args.zero_stage >= 1 and args.zero_cpu_offload else False,
             parallel_output=False,
+            max_norm=args.grad_clip,
             precision=args.mixed_precision,
-            custom_policy=booster_policy,
+            custom_policy=get_autopolicy(model.model),
         )
     else:
         raise ValueError(f"Unknown plugin {args.plugin}")
@@ -183,15 +164,15 @@ def train(args):
     mode_map = {"train": "train", "valid": "validation", "test": "test"}
     train_dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train", mode_map=mode_map)
     data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)
-    train_dataloader = setup_distributed_dataloader(
+
+    train_dataloader = plugin.prepare_dataloader(
         dataset=train_dataset,
         batch_size=args.batch_size,
         shuffle=True,
         drop_last=True,
         collate_fn=data_collator,
-        use_tp=args.tp > 1,
+        distributed_sampler_cls=StatefulDistributedSampler,
     )
-
     num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
     math.ceil(args.max_epochs * num_update_steps_per_epoch)
 
@@ -307,6 +288,12 @@ def train(args):
     parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
     parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
     parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--pp", type=int, default=1)
+    parser.add_argument("--sp", type=int, default=1)
+    parser.add_argument("--enable_sequence_parallelism", default=False, action="store_true")
+    parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage", choices=[0, 1, 2])
+    parser.add_argument("--zero_cpu_offload", default=False, action="store_true")
+    parser.add_argument("--sp_mode", type=str, default="split_gather", choices=["split_gather", "ring", "all_to_all"])
     parser.add_argument("--pretrain", type=str, default=None)
     parser.add_argument("--tokenizer_dir", type=str, default=None)
     parser.add_argument("--dataset", nargs="+", default=[])
diff --git a/applications/ColossalChat/examples/training_scripts/train_sft.py b/applications/ColossalChat/examples/training_scripts/train_sft.py
index ae20f2abcb5f..08e7550df157 100755
--- a/applications/ColossalChat/examples/training_scripts/train_sft.py
+++ b/applications/ColossalChat/examples/training_scripts/train_sft.py
@@ -6,7 +6,7 @@
 from contextlib import nullcontext
 
 import torch
-from coati.dataset import DataCollatorForSupervisedDataset, load_tokenized_dataset, setup_distributed_dataloader
+from coati.dataset import DataCollatorForSupervisedDataset, StatefulDistributedSampler, load_tokenized_dataset
 from coati.models import convert_to_lora_module
 from coati.trainer import SFTTrainer
 from coati.utils import load_checkpoint
@@ -16,9 +16,12 @@
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
+from colossalai.logging import get_dist_logger
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
 
+logger = get_dist_logger()
+
 
 def train(args):
     # check lora compatibility
@@ -35,6 +38,24 @@ def train(args):
     # ==============================
     # Initialize Booster
     # ==============================
+    init_ctx = nullcontext()
+    with init_ctx:
+        if args.use_flash_attn:
+            model = AutoModelForCausalLM.from_pretrained(
+                args.pretrain,
+                torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16,
+                attn_implementation="flash_attention_2",
+                trust_remote_code=True,
+            )
+        else:
+            model = AutoModelForCausalLM.from_pretrained(
+                args.pretrain,
+                torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16,
+                trust_remote_code=True,
+            )
+    if args.lora_rank > 0:
+        model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
+
     if args.plugin == "ddp":
         """
         Default torch ddp plugin without any acceleration, for
@@ -47,7 +68,8 @@ def train(args):
             placement_policy="static",
             initial_scale=2**16,
             max_norm=args.grad_clip,
-            enable_gradient_accumulation=True,
+            enable_gradient_accumulation=True if args.accumulation_steps > 1 else False,
+            enable_flash_attention=args.use_flash_attn,
         )
     elif args.plugin == "gemini_auto":
         plugin = GeminiPlugin(
@@ -55,6 +77,7 @@ def train(args):
             placement_policy="auto",
             initial_scale=2**16,
             max_norm=args.grad_clip,
+            enable_flash_attention=args.use_flash_attn,
         )
     elif args.plugin == "zero2":
         plugin = LowLevelZeroPlugin(
@@ -74,11 +97,17 @@ def train(args):
     elif args.plugin == "3d":
         plugin = HybridParallelPlugin(
             tp_size=args.tp,
-            pp_size=1,
-            zero_stage=0,
+            pp_size=args.pp,
+            sp_size=args.sp,
+            sequence_parallelism_mode=args.sp_mode,
+            zero_stage=args.zero_stage,
+            enable_flash_attention=args.use_flash_attn,
+            enable_sequence_parallelism=args.enable_sequence_parallelism,
+            cpu_offload=True if args.zero_stage >= 1 and args.zero_cpu_offload else False,
             parallel_output=False,
             max_norm=args.grad_clip,
             precision=args.mixed_precision,
+            microbatch_size=args.batch_size,
         )
     else:
         raise ValueError(f"Unknown plugin {args.plugin}")
@@ -93,20 +122,6 @@ def train(args):
     #     LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
     # )
 
-    init_ctx = nullcontext()
-    with init_ctx:
-        if args.use_flash_attn:
-            model = AutoModelForCausalLM.from_pretrained(
-                args.pretrain,
-                torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16,
-                use_flash_attention_2=True,
-            )
-            coordinator.print_on_master(msg="Flash-attention enabled successfully")
-        else:
-            model = AutoModelForCausalLM.from_pretrained(args.pretrain)
-        if args.lora_rank > 0:
-            model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
-
     if args.grad_checkpoint and args.lora_rank == 0:
         # lora layers are not supported by gradient checkpointing
         model.gradient_checkpointing_enable()
@@ -131,6 +146,7 @@ def train(args):
 
     tokenizer.add_bos_token = False
     tokenizer.add_eos_token = False
+    tokenizer.padding_side = "right"
 
     coordinator.print_on_master(f"Configuration file will be saved at: {args.config_file}")
     coordinator.print_on_master(f"Model checkpoint will be saved at: {args.save_path}")
@@ -150,13 +166,14 @@ def train(args):
     )
     dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train")
     data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer, max_length=args.max_len)
-    train_dataloader = setup_distributed_dataloader(
+
+    train_dataloader = plugin.prepare_dataloader(
         dataset=dataset,
         batch_size=args.batch_size,
         shuffle=True,
         drop_last=True,
         collate_fn=data_collator,
-        use_tp=args.tp > 1,
+        distributed_sampler_cls=StatefulDistributedSampler,
     )
     coordinator.print_on_master(
         f"Max CUDA memory after data loader: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
@@ -185,7 +202,6 @@ def train(args):
         lr_scheduler=lr_scheduler,
         dataloader=train_dataloader,
     )
-    # model = model.to(get_current_device())
     torch.set_default_dtype(torch.float)
 
     coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
@@ -255,7 +271,7 @@ def train(args):
     # save model checkpoint after fitting on only rank0
     coordinator.print_on_master("Start saving final model checkpoint")
 
-    booster.save_model(model, os.path.join(args.save_path, "modeling"), shard=True)
+    # booster.save_model(model, os.path.join(args.save_path, "modeling"), shard=True)
     coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_path}")
 
     coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
@@ -270,13 +286,19 @@ def train(args):
         "--plugin",
         type=str,
         default="gemini",
-        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d", "ddp"],
+        choices=["gemini", "gemini_auto", "3d", "ddp", "zero2_cpu", "zero2"],
         help="Choose which plugin to use",
     )
     parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
     parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
     parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
     parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--pp", type=int, default=1)
+    parser.add_argument("--sp", type=int, default=1)
+    parser.add_argument("--enable_sequence_parallelism", default=False, action="store_true")
+    parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage", choices=[0, 1, 2])
+    parser.add_argument("--zero_cpu_offload", default=False, action="store_true")
+    parser.add_argument("--sp_mode", type=str, default="split_gather", choices=["split_gather", "ring", "all_to_all"])
     parser.add_argument("--pretrain", type=str, default=None)
     parser.add_argument("--tokenizer_dir", type=str, default=None)
     parser.add_argument("--dataset", nargs="+", default=[])
@@ -287,7 +309,7 @@ def train(args):
     parser.add_argument("--max_epochs", type=int, default=3)
     parser.add_argument("--batch_size", type=int, default=4)
     parser.add_argument("--max_len", type=int, default=512)
-    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
+    parser.add_argument("--mixed_precision", type=str, default="bf16", choices=["fp16", "bf16"], help="Mixed precision")
     parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
     parser.add_argument(
         "--lora_train_bias",
diff --git a/applications/ColossalChat/examples/training_scripts/train_sft.sh b/applications/ColossalChat/examples/training_scripts/train_sft.sh
index d5c394377616..53c7129013db 100755
--- a/applications/ColossalChat/examples/training_scripts/train_sft.sh
+++ b/applications/ColossalChat/examples/training_scripts/train_sft.sh
@@ -15,7 +15,7 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
 
 
 # export CUDA_VISIBLE_DEVICES=4,5,6
-set_n_least_used_CUDA_VISIBLE_DEVICES 4
+set_n_least_used_CUDA_VISIBLE_DEVICES 2
 PROJECT_NAME="sft"
 PARENT_SAVE_DIR="" # Path to a folder to save checkpoints
 PARENT_TENSORBOARD_DIR="" # Path to a folder to save logs
@@ -40,8 +40,10 @@ FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
 SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
 CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
 
+echo $(which colossalai)
+echo $(which python)
 # the real batch size for gradient descent is number_of_node_in_hostfile * nproc_per_node * train_batch_size
-colossalai run --nproc_per_node 4 --master_port 31312 --hostfile ./hostfile train_sft.py \
+colossalai run --nproc_per_node 2 --master_port 31312 --hostfile ./hostfile train_sft.py \
     --pretrain $PRETRAINED_MODEL_PATH \
     --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
     --save_interval 4000 \
@@ -49,11 +51,15 @@ colossalai run --nproc_per_node 4 --master_port 31312 --hostfile ./hostfile trai
     --save_path $SAVE_DIR \
     --config_file $CONFIG_FILE \
     --lora_rank 0 \
-    --plugin zero2 \
-    --batch_size 8 \
-    --max_epochs 1 \
+    --plugin 3d \
+    --tp 2 \
+    --pp 1 \
+    --zero_stage 0 \
+    --batch_size 2 \
+    --max_epochs 3 \
     --accumulation_steps 1 \
-    --lr 2e-5 \
-    --max_len 2048 \
+    --lr 5e-5 \
+    --max_len 400 \
     --grad_checkpoint \
-    --use_wandb
+    --use_wandb \
+    --use_flash_attn
diff --git a/applications/ColossalChat/requirements.txt b/applications/ColossalChat/requirements.txt
index de5f6160e827..ef3a5a0e8420 100755
--- a/applications/ColossalChat/requirements.txt
+++ b/applications/ColossalChat/requirements.txt
@@ -1,9 +1,8 @@
-transformers==4.34.1
-huggingface_hub==0.17.3
+transformers>=4.36.2
 tqdm
-datasets
+datasets==2.14.7
 loralib
-colossalai>=0.3.6
+colossalai>=0.3.7
 torch>=1.12.1
 langchain
 tokenizers
diff --git a/applications/ColossalChat/tests/llama.json b/applications/ColossalChat/tests/llama.json
index 482ff9e6528c..788a48c91d99 100644
--- a/applications/ColossalChat/tests/llama.json
+++ b/applications/ColossalChat/tests/llama.json
@@ -4,5 +4,6 @@
     "stop_ids": [
         29871,
         2
-    ]
+    ],
+    "end_of_assistant": "</s>"
 }
diff --git a/applications/ColossalChat/tests/test_templating.sh b/applications/ColossalChat/tests/test_templating.sh
index 7fefede47539..d033c07f5fa4 100755
--- a/applications/ColossalChat/tests/test_templating.sh
+++ b/applications/ColossalChat/tests/test_templating.sh
@@ -6,7 +6,8 @@ TEST_DATA_DIR=$BASE_DIR/tests/test_data
 DATA_SAVE_PATH=$BASE_TEMP_DIR/tests
 CONFIG_DIR=$BASE_DIR/config
 
-MODELS=("colossal-llama2" "llama2" "zephyr" "mistral" "chatGLM2" "Qwen" "Vicuna" "Yi")
+# MODELS=("colossal-llama2" "llama2" "mistral" "chatGLM2" "chatGLM3" "deepseek" "Yi" "baichuan")  # for local test
+MODELS=("colossal-llama2" "llama2" "chatGLM2" "chatGLM3" "deepseek" "Yi")
 
 get_pretrain() {
     local model=$1
@@ -14,27 +15,51 @@ get_pretrain() {
         echo "hpcai-tech/Colossal-LLaMA-2-7b-base"
     elif [[ $model == "llama2" ]]; then
         echo "hf-internal-testing/llama-tokenizer"
-    elif [[ $model == "zephyr" ]]; then
-        echo "HuggingFaceH4/zephyr-7b-beta"
+    elif [[ $model == "phi" ]]; then
+        echo "microsoft/phi-2"
     elif [[ $model == "mistral" ]]; then
-        echo "mistralai/Mistral-7B-Instruct-v0.2"
+        echo "mistralai/Mistral-7B-Instruct-v0.3"
     elif [[ $model == "chatGLM2" ]]; then
         echo "THUDM/chatglm2-6b"
-    elif [[ $model == "Qwen" ]]; then
-        echo "Qwen/Qwen-7B-Chat"
-    elif [[ $model == "Vicuna" ]]; then
-        echo "lmsys/vicuna-7b-v1.5"
+    elif [[ $model == "chatGLM3" ]]; then
+        echo "THUDM/chatglm3-6b"
+    elif [[ $model == "deepseek" ]]; then
+        echo "deepseek-ai/DeepSeek-V2-Lite"
     elif [[ $model == "Yi" ]]; then
-        echo "01-ai/Yi-6B-Chat"
+        echo "01-ai/Yi-1.5-9B-Chat"
+    elif [[ $model == "baichuan" ]]; then
+        echo "baichuan-inc/Baichuan2-13B-Chat"
     else
         echo "Unknown model $model"
         exit 1
     fi
 }
 
+
 get_conversation_template_config() {
     local model=$1
-    echo "$CONFIG_DIR/conversation_template/$model.json"
+    if [[ $model == "colossal-llama2" ]]; then
+        echo "$CONFIG_DIR/conversation_template/colossal-llama2.json"
+    elif [[ $model == "llama2" ]]; then
+        echo "$CONFIG_DIR/conversation_template/llama2.json"
+    elif [[ $model == "deepseek" ]]; then
+        echo "$CONFIG_DIR/conversation_template/deepseek-ai_DeepSeek-V2-Lite.json"
+    elif [[ $model == "mistral" ]]; then
+        echo "$CONFIG_DIR/conversation_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json"
+    elif [[ $model == "chatGLM2" ]]; then
+        echo "$CONFIG_DIR/conversation_template/THUDM_chatglm2-6b.json"
+    elif [[ $model == "chatGLM3" ]]; then
+        echo "$CONFIG_DIR/conversation_template/THUDM_chatglm3-6b.json"
+    elif [[ $model == "phi" ]]; then
+        echo "$CONFIG_DIR/conversation_template/microsoft_phi-2.json"
+    elif [[ $model == "Yi" ]]; then
+        echo "$CONFIG_DIR/conversation_template/01-ai_Yi-1.5-9B-Chat.json"
+    elif [[ $model == "baichuan" ]]; then
+        echo "$CONFIG_DIR/conversation_template/baichuan-inc_Baichuan2-13B-Chat.json"
+    else
+        echo "Unknown model $model"
+        exit 1
+    fi
 }
 
 # Test SFT data Preparation
diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
index 5ba4904711ea..d1a685174177 100755
--- a/applications/ColossalChat/tests/test_train.sh
+++ b/applications/ColossalChat/tests/test_train.sh
@@ -30,7 +30,8 @@ MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models
 MODELS_DIR=$TEMP_DIR/models_config
 # Skip those tests due to CI tests timeout
 MODELS=('llama')
-PLUGINS=('gemini' 'gemini_auto' 'zero2' 'zero2_cpu' '3d')
+ADVANCED_PLUGINS=('sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2' 'zero2_cpu')  # pp is still buggy
+PLUGINS=('3d' 'gemini' 'gemini_auto' 'zero2' 'zero2_cpu')
 LORA_RANK=('0')  # skip to reduce CI execution time, can pass all locally
 
 export OMP_NUM_THREADS=8
@@ -80,6 +81,8 @@ random_choice() {
 }
 
 
+
+
 echo "[Test]: testing sft ..."
 
 SKIPPED_TESTS=(
@@ -91,7 +94,7 @@ SKIPPED_TESTS=(
 GRAD_CKPTS=('--grad_checkpoint')
 for lora_rank in ${LORA_RANK[@]}; do
     for model in ${MODELS[@]}; do
-        for plugin in ${PLUGINS[@]}; do
+        for plugin in ${ADVANCED_PLUGINS[@]}; do
             if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$plugin-$lora_rank " ]]; then
                 echo "[Test]: Skipped $model-$plugin-$lora_rank"
                 continue
@@ -104,10 +107,56 @@ for lora_rank in ${LORA_RANK[@]}; do
             grad_ckpt=$(random_choice "${GRAD_CKPTS[@]}")
             tp='1'
             bs='2'
+            pp='1'
+            zero_stage='0'
+            sp='1'
+            sp_mode='split_gather'
+            enable_sequence_parallelism=''
             if [[ $plugin == "3d" ]]; then
                 tp='4'
                 bs='8'
             fi
+            if [[ $plugin == "tp_zero2" ]]; then
+                tp='4'
+                bs='8'
+                zero_stage='2'
+                plugin='3d'
+            fi
+            if [[ $plugin == "tp_pp" ]]; then
+                tp='2'
+                bs='8'
+                pp='2'
+                plugin='3d'
+            fi
+            if [[ $plugin == "pp" ]]; then
+                bs='8'
+                pp='4'
+                plugin='3d'
+            fi
+            if [[ $plugin == "sp_split_gather" ]]; then
+                enable_sequence_parallelism='--enable_sequence_parallelism'
+                sp_mode='split_gather'
+                tp='4'
+                sp='1'
+                bs='8'
+                plugin='3d'
+            fi
+            if [[ $plugin == "sp_ring" ]]; then
+                enable_sequence_parallelism='--enable_sequence_parallelism'
+                sp_mode='ring'
+                tp='4'
+                sp='1'
+                bs='8'
+                plugin='3d'
+            fi
+            if [[ $plugin == "sp_all_to_all" ]]; then
+                enable_sequence_parallelism='--enable_sequence_parallelism'
+                sp_mode='all_to_all'
+                tp='1'
+                sp='4'
+                bs='8'
+                plugin='3d'
+            fi
             grad_accu='2'
             # Check if the plugin is either "gemini_auto" or "gemini" and set grad_accu to '1'
             if [[ $plugin == "gemini_auto" ]]; then
@@ -132,6 +181,11 @@ for lora_rank in ${LORA_RANK[@]}; do
                     --max_epochs 1 \
                     --accumulation_steps $grad_accu \
                     --tp $tp \
+                    --pp $pp \
+                    --zero_stage $zero_stage \
+                    --sp $sp \
+                    --sp_mode $sp_mode \
+                    $enable_sequence_parallelism \
                     --lr 2e-5 \
                     $grad_ckpt \
                     --max_len 400 \
@@ -226,8 +280,8 @@ echo "[Test]: testing ppo ..."
 
 
 SKIPPED_TESTS=(
-    llama-3d-20 # 3d plugin doesn't support lora
-    llama-gemini-20 # gemini doesn't support lora
+    llama-3d # 3d plugin doesn't support lora
+    llama-gemini # gemini doesn't support lora
 )
 
 GRAD_CKPTS=('--grad_checkpoint')
@@ -304,7 +358,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                     $grad_ckpt \
                     --max_len 400 \
                     --max_seq_len 10 \
-                    --use_flash_attn
+                    # --use_flash_attn
                 passed=$?
                 if [ $passed -eq 0 ]; then
                     rm -rf $MODEL_SAVE_PATH/*
diff --git a/colossalai/_analyzer/fx/codegen.py b/colossalai/_analyzer/fx/codegen.py
index 68a27d91986b..9fd82ccd4a06 100644
--- a/colossalai/_analyzer/fx/codegen.py
+++ b/colossalai/_analyzer/fx/codegen.py
@@ -469,4 +469,4 @@ def emit_node(node: Node, body):
 {wrap_stmts}
 {prologue}
 {code}"""
-        return PythonCode(fn_code, globals_)
+        return PythonCode(fn_code, globals_, {})
diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py
index ab554d21dc95..474b78aa26b8 100644
--- a/colossalai/booster/plugin/gemini_plugin.py
+++ b/colossalai/booster/plugin/gemini_plugin.py
@@ -369,6 +369,11 @@ def __init__(
         assert precision in SUPPORTED_PRECISION, f"precision {precision} is not supported"
         if get_accelerator().name == "npu":
             assert placement_policy == "static", "NPU only supports static placement policy"
+        if placement_policy == "auto" and enable_async_reduce:
+            logging.warning(
+                f"enable_async_reduce requires pin_memory to achieve best performance, which is not implicitly set."
+            )
+            pin_memory = True
         self.gemini_config = dict(
             chunk_config_dict=chunk_config_dict,
             chunk_init_device=(chunk_init_device or get_accelerator().get_current_device()),
diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 45fe03003b5a..fa3c3646a592 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -999,7 +999,9 @@ def __init__(
         ), f"World size {dist.get_world_size()} is not divisible by tp_size {tp_size} * pp_size {pp_size}"
 
         if enable_sequence_parallelism:
-            self.sequence_parallelism_mode = sequence_parallelism_mode if sequence_parallelism_mode is not None else "1"
+            self.sequence_parallelism_mode = (
+                sequence_parallelism_mode if sequence_parallelism_mode is not None else "all_to_all"
+            )
             assert (
                 self.sequence_parallelism_mode in SUPPORT_SP_MODE
             ), f"Sequence parallelism mode {self.sequence_parallelism_mode} is not in the supported list {SUPPORT_SP_MODE}"
@@ -1014,19 +1016,13 @@ def __init__(
                 self.sp_size = 1
                 self.dp_size = dist.get_world_size() // (tp_size * pp_size)
             elif self.sequence_parallelism_mode in ["all_to_all"]:
-                assert (
-                    tp_size == 1
-                ), f"Sequence parallelism mode {self.sequence_parallelism_mode} cannot be used with tensor parallelism"
-                assert (
-                    pp_size == 1
-                ), f"Sequence parallelism mode {self.sequence_parallelism_mode} cannot be used with pipeline parallelism"
-                self.sp_size = dist.get_world_size() if sp_size is None else sp_size
-                self.dp_size = dist.get_world_size() // (self.sp_size * pp_size)
+                self.sp_size = 1 if sp_size is None else sp_size
+                self.dp_size = dist.get_world_size() // (self.sp_size * pp_size * tp_size)
         else:
             self.dp_size = dist.get_world_size() // (tp_size * pp_size)
             assert (
                 sp_size == 1 or sp_size is None
-            ), f"sp_size can only be set to a >1 number when enable_sequence_parallelism is True"
+            ), f"You should not set sp_size when sequence parallelism is not enabled."
             self.sp_size = 1
 
         self.tp_size = tp_size
@@ -1040,11 +1036,22 @@ def __init__(
         self.enable_jit_fused = enable_jit_fused
         self.enable_sequence_parallelism = enable_sequence_parallelism
         if dp_outside:
-            self.dp_axis, self.pp_axis, self.tp_axis, self.sp_axis = 0, 1, 2, 3
+            (
+                self.dp_axis,
+                self.pp_axis,
+                self.tp_axis,
+                self.sp_axis,
+            ) = (
+                0,
+                1,
+                2,
+                3,
+            )
             self.pg_mesh = ProcessGroupMesh(self.dp_size, self.pp_size, self.tp_size, self.sp_size)
         else:
             self.pp_axis, self.dp_axis, self.tp_axis, self.sp_axis = 0, 1, 2, 3
             self.pg_mesh = ProcessGroupMesh(self.pp_size, self.dp_size, self.tp_size, self.sp_size)
+
         self.stage_manager = None
         self.schedule = None
         self.custom_policy = custom_policy
diff --git a/colossalai/checkpoint_io/utils.py b/colossalai/checkpoint_io/utils.py
index d5f164853547..36138f33e9ab 100644
--- a/colossalai/checkpoint_io/utils.py
+++ b/colossalai/checkpoint_io/utils.py
@@ -315,7 +315,7 @@ def save_state_dict(state_dict: dict, checkpoint_file_path: str, use_safetensors
         use_safetensors (bool): whether to use safetensors to save the checkpoint.
     """
     # Move all tensors in the state_dict to CPU before saving to avoid serialization issues
-    state_dict_cpu = tree_map(lambda x: x.cpu() if torch.is_tensor(x) else x, state_dict)
+    state_dict_cpu = tree_map(lambda x: x.data.cpu() if torch.is_tensor(x) else x, state_dict)
 
     if use_safetensors:
         assert is_safetensors_available(), "safetensors is not available."
diff --git a/colossalai/fx/codegen/activation_checkpoint_codegen.py b/colossalai/fx/codegen/activation_checkpoint_codegen.py
index 28451bdd1870..ed3b3b1ef129 100644
--- a/colossalai/fx/codegen/activation_checkpoint_codegen.py
+++ b/colossalai/fx/codegen/activation_checkpoint_codegen.py
@@ -859,7 +859,7 @@ def emit_node(node: Node, body):
 {wrap_stmts}
 {prologue}
 {code}"""
-            return PythonCode(fn_code, globals_)
+            return PythonCode(fn_code, globals_, {})
 
 else:
 
diff --git a/colossalai/inference/README.md b/colossalai/inference/README.md
index b46222d806af..0a9b5293d4a2 100644
--- a/colossalai/inference/README.md
+++ b/colossalai/inference/README.md
@@ -236,7 +236,7 @@ Completion api is used for single sequence request, like answer a question or co
 - POST '/chat':
 Chat api is used for conversation-style request, which often includes dialogue participants(i.e. roles) and corresponding words. Considering the input data are very different from normal inputs, we introduce Chat-Template to match the data format in chat models.
 #### chat-template
-Followed `transformers`, we add the chat-template argument. As chat models have been trained with very different formats for converting conversations into a single tokenizable string. Using a format that matches the training data is extremely important. This attribute(chat_template) is inclueded in HuggingFace tokenizers, containing a Jinja template that converts conversation histories into a correctly formatted string. You can refer to the [HuggingFace-blog](https://huggingface.co/blog/chat-templates) for more information. We also provide a simple example temlate bellow. Both str or file style chat template are supported.
+Followed `transformers`, we add the chat-template argument. As chat models have been trained with very different formats for converting conversations into a single tokenizable string. Using a format that matches the training data is extremely important. This attribute(chat_template) is inclueded in HuggingFace tokenizers, containing a Jinja template that converts conversation histories into a correctly formatted string. You can refer to the [HuggingFace-blog](https://huggingface.co/blog/chat-templates) for more information. We also provide a simple example template bellow. Both str or file style chat template are supported.
 ### Usage
 #### Args for customizing your server
 The configuration for api server contains both serving interface and engine backend.
@@ -278,6 +278,7 @@ This project was written from scratch but we learned a lot from several other gr
 - [vLLM](https://github.com/vllm-project/vllm)
 - [flash-attention](https://github.com/Dao-AILab/flash-attention)
 - [HuggingFace](https://huggingface.co)
+- [StreamingLLM](https://github.com/mit-han-lab/streaming-llm)
 If you wish to cite relevant research papars, you can find the reference below.
 
 ```bibtex
@@ -301,4 +302,12 @@ If you wish to cite relevant research papars, you can find the reference below.
   author={Dao, Tri},
   year={2023}
 }
+
+# StreamingLLM
+@article{xiao2023streamingllm,
+  title={Efficient Streaming Language Models with Attention Sinks},
+  author={Xiao, Guangxuan and Tian, Yuandong and Chen, Beidi and Han, Song and Lewis, Mike},
+  journal={arXiv},
+  year={2023}
+}
 ```
diff --git a/colossalai/inference/batch_bucket.py b/colossalai/inference/batch_bucket.py
index f8571c0ca030..88bde3a3beeb 100644
--- a/colossalai/inference/batch_bucket.py
+++ b/colossalai/inference/batch_bucket.py
@@ -31,6 +31,9 @@ def __init__(
         fd_interm_tensor=None,
         device=None,
         dtype=torch.float16,
+        enable_streamingllm: bool = False,
+        start_token_size: int = 4,
+        generated_token_size: int = 512,
     ):
         self.num_heads = num_heads
         self.head_dim = head_dim
@@ -45,12 +48,19 @@ def __init__(
         self._use_spec_dec = False
         self._num_tokens_to_verify = None
 
+        self.enable_streamingllm = enable_streamingllm
+        self.start_token_size = start_token_size
+        self.generated_token_size = generated_token_size
+
         self._current_batch_size = 0
         self._sequences_dict = dict()
         self._sequences_indexes = dict()  # deque(maxlen=self.max_batch_size)
         self._sequence_lengths = torch.zeros((self.max_batch_size,), dtype=torch.int32)
         self._sequence_lengths_helper = torch.zeros_like(self._sequence_lengths)
-        max_blocks_per_seq = (self.max_length + block_size - 1) // block_size
+        if enable_streamingllm:
+            max_blocks_per_seq = (start_token_size + generated_token_size + block_size - 1) // block_size + 1
+        else:
+            max_blocks_per_seq = (self.max_length + block_size - 1) // block_size
         self._block_tables = torch.full((self.max_batch_size, max_blocks_per_seq), -1, dtype=torch.int32)
         self._block_tables_helper = torch.full_like(self._block_tables, -1)
 
@@ -109,6 +119,33 @@ def batch_token_ids(self) -> List[List[int]]:
             out.append(seq.input_token_id + seq.output_token_id)
         return out
 
+    def streamingllm_update_batch(self, start_token_size: int, generated_token_size: int):
+        """
+        Update sequence_lengths and block_tables when it is necessary to swap out a block.
+        """
+
+        updated_block_ids = []
+
+        if self.current_batch_size > 0:
+            need_update = False
+            sequence_lengths_list = self._sequence_lengths.tolist()
+            block_tables_list = self._block_tables[: self._current_batch_size].tolist()
+            for batch_id in range(self.current_batch_size):
+                # We assume that the start token occupies the entire first block.
+                if sequence_lengths_list[batch_id] == start_token_size + generated_token_size + self.block_size - 1:
+                    need_update = True
+                    sequence_lengths_list[batch_id] = start_token_size + generated_token_size - 1
+                    block_id = block_tables_list[batch_id].pop(1)
+                    updated_block_ids.append(block_id)
+                    block_tables_list[batch_id].append(-1)
+            if need_update:
+                self._sequence_lengths = torch.tensor(
+                    sequence_lengths_list, dtype=self._sequence_lengths.dtype, device=self.device
+                )
+                self._block_tables = torch.tensor(block_tables_list, dtype=self._block_tables.dtype, device=self.device)
+
+        return updated_block_ids
+
     def set_use_spec_dec(self, num_tokens_to_verify: int = 5) -> None:
         """Set batch bucket to use speculatvie decoding.
         This will notify the adjust the lengths of inputs during modeling,
diff --git a/colossalai/inference/config.py b/colossalai/inference/config.py
index 61bc7c8abc9c..c73ee9df4334 100644
--- a/colossalai/inference/config.py
+++ b/colossalai/inference/config.py
@@ -10,6 +10,7 @@
 from transformers.generation import GenerationConfig
 
 from colossalai.inference.flash_decoding_utils import FDIntermTensors
+from colossalai.inference.utils import can_use_flash_attn2
 
 GibiByte = 1024**3
 
@@ -166,9 +167,11 @@ class InferenceConfig(RPC_PARAM):
         top_k (Optional[int]): The number of highest probability vocabulary tokens to keep for top-k-filtering, defaults to None.
         top_p (Optional[float]): The cumulative probability threshold for retaining tokens with a total probability above it, defaults to None.
         temperature (Optional[float]): Randomness used to control randomization, defaults to 1.0.
-        repetition_penalty (Optional[float]): The parameter that influences the model's treatment of new tokens in relation to their appearance in the prompt and the generated text. Values greater than 1 incentivize the model to introduce new tokens, whereas values less than 1 incentivize token repetition., defaults to 1.0.
         no_repeat_ngram_size (Optional[int]): If no_repeat_ngram_size > 0, the consecutive tokens of ngram size can only appear once in inference sentences.
-        n_spec_tokens (int): The maximum number of speculating tokens, defaults to None.
+        repetition_penalty (Optional[float]): The parameter that influences the model's treatment of new tokens in relation to their appearance in the prompt and the generated text. Values greater than 1 incentivize the model to introduce new tokens, whereas values less than 1 incentivize token repetition., defaults to 1.0.
+        ignore_eos(bool): Whether to ignore the EOS token and continue generating tokens when encountering the EOS token.
+        use_spec_dec (bool): Indicate whether to use speculative decoding, defaults to False.
+        max_n_spec_tokens (int): The maximum number of speculating tokens, defaults to None.
         glimpse_large_kv (bool): Whether to use large KV in drafter model, defaults to False.
         block_size (int): The number of blocks in a logical block, defaults to 16.
         tp_size (int): Tensor parallel size, defaults to 1.
@@ -176,10 +179,12 @@ class InferenceConfig(RPC_PARAM):
         micro_batch_size (int): the micro batch size, defaults to 1. Only useful when `pp_size` > 1.
         micro_batch_buffer_size (int): the buffer size for micro batch. Normally, it should be the same as the number of pipeline stages.
         use_cuda_kernel(bool): Whether to use cuda kernel, faster but lose some precision occasionally
+        high_precision(Optional[bool]): Whether to use float32 for underlying calculations of float16 data to achieve higher precision, defaults to False.
         use_cuda_graph (bool): Whether to enforce CUDA graph execution. If False, we will disable CUDA graph and always execute the model in eager mode. If True, we will use eager execution in hybrid.
         max_context_len_to_capture (int): max context len that could be captured by CUDA Graph, per sequence
-        high_precision(Optional[bool]): Whether to use float32 for underlying calculations of float16 data to achieve higher precision, defaults to False.
-        ignore_eos(bool): Whether to ignore the EOS token and continue generating tokens when encountering the EOS token.
+        enable_streamingllm(bool): Whether to use StreamingLLM, the relevant algorithms refer to the paper at https://arxiv.org/pdf/2309.17453 for implementation.
+        start_token_size(int): The size of the start tokens, when using StreamingLLM.
+        generated_token_size(int): The size of the generated tokens, When using StreamingLLM.
     """
 
     # NOTE: arrange configs according to their importance and frequency of usage
@@ -208,8 +213,10 @@ class InferenceConfig(RPC_PARAM):
     no_repeat_ngram_size: Optional[int] = 0
     repetition_penalty: Optional[float] = 1.0
     forced_eos_token_id: int = None
+    ignore_eos: bool = False
 
     # speculative decoding configs
+    use_spec_dec: bool = False
     max_n_spec_tokens: int = 5
     glimpse_large_kv: bool = False
 
@@ -221,15 +228,19 @@ class InferenceConfig(RPC_PARAM):
     pp_size: int = 1
     micro_batch_size: int = 1
     micro_batch_buffer_size: int = None
-    high_precision: Optional[bool] = False
 
     # cuda kernel option
     use_cuda_kernel: bool = False
+    high_precision: Optional[bool] = False
 
     # cuda_graph
     use_cuda_graph: bool = False  # NOTE only when we have the graph for specific decoding batch size can we use the cuda graph for inference
     max_context_len_to_capture: int = 512
-    ignore_eos: bool = False
+
+    # StreamingLLM (sliding window attention with attention sinks)
+    enable_streamingllm: bool = False
+    start_token_size: int = 4
+    generated_token_size: int = 512
 
     def __post_init__(self):
         self.max_context_len_to_capture = self.max_input_len + self.max_output_len
@@ -260,6 +271,20 @@ def _verify_config(self) -> None:
         if self.dtype == torch.float32:
             self.high_precision = False
 
+        # check StreamingLLM
+        assert (
+            self.start_token_size <= self.block_size
+        ), f"According to the paper https://arxiv.org/pdf/2309.17453, the start_token_size greater than 4 has little impact on inference performance. Therefore, we assume that the start_token_size should be less or equal than the block_size={self.block_size}, but got {self.start_token_size}."
+        assert (
+            self.generated_token_size % self.block_size == 0
+        ), f"We assume that the generated_token_size should be a multiple of the block_size, got generated_token_size={self.generated_token_size}."
+        # Our StreamingLLM implementation (sliding window attention with attention sinks) references https://arxiv.org/pdf/2309.17453 and has been optimized
+        # based on our framework's kvcache management mechanism. According to the paper, a start_token_size of 4 is sufficient. Therefore,
+        # we assume the start_token_size is less than or equal to the block size. When the start_token_size is smaller than the block size,
+        # we fill the first block with the start_token_size and subsequently generated tokens, using these as the "start tokens."
+        # Thereafter, we swap out tokens in units of blocks, and always swapping out the second block when the generated tokens exceeded the limit.
+        self.start_token_size = self.block_size
+
         # check prompt template
         if self.prompt_template is None:
             return
@@ -289,6 +314,16 @@ def to_generation_config(self, model_config) -> GenerationConfig:
 
         return GenerationConfig.from_dict(meta_config)
 
+    def to_model_shard_inference_config(self) -> "ModelShardInferenceConfig":
+        use_flash_attn = can_use_flash_attn2(self.dtype)
+        model_inference_config = ModelShardInferenceConfig(
+            dtype=self.dtype,
+            use_cuda_kernel=self.use_cuda_kernel,
+            use_spec_dec=self.use_spec_dec,
+            use_flash_attn=use_flash_attn,
+        )
+        return model_inference_config
+
     def to_rpc_param(self) -> dict:
         kwargs = {
             "dtype": str(self.dtype).split(".")[-1],
@@ -340,3 +375,21 @@ def from_dict(cls, config_dict: Dict[str, Any]) -> "InferenceConfig":
         # Set the attributes from the parsed arguments.
         inference_config = cls(**inference_config_args)
         return inference_config
+
+
+@dataclass
+class ModelShardInferenceConfig:
+    """
+    Configurations used during init of module for inference modeling.
+
+    Args:
+        dtype (torch.dtype): The data type for weights and activations.
+        use_cuda_kernel (bool): Whether to use cuda kernel, faster but lose some precision occasionally
+        use_spec_dec (bool): Indicate whether to use speculative decoding.
+        use_flash_attn (bool): Indicate whether to use flash attention.
+    """
+
+    dtype: torch.dtype = None
+    use_cuda_kernel: bool = False
+    use_spec_dec: bool = False
+    use_flash_attn: bool = False
diff --git a/colossalai/inference/core/engine.py b/colossalai/inference/core/engine.py
index 96c2b15ee16e..d0d46d81bc0f 100644
--- a/colossalai/inference/core/engine.py
+++ b/colossalai/inference/core/engine.py
@@ -18,7 +18,7 @@
 from colossalai.accelerator import get_accelerator
 from colossalai.cluster import ProcessGroupMesh
 from colossalai.inference.batch_bucket import BatchBucket
-from colossalai.inference.config import InferenceConfig, InputMetaData
+from colossalai.inference.config import InferenceConfig, InputMetaData, ModelShardInferenceConfig
 from colossalai.inference.graph_runner import CUDAGraphRunner
 from colossalai.inference.modeling.policy import model_policy_map
 from colossalai.inference.sampler import search_tokens
@@ -72,8 +72,9 @@ def __init__(
 
         self.verbose = verbose
         self.logger = get_dist_logger(__name__)
+        self.model_shard_infer_config = inference_config.to_model_shard_inference_config()
 
-        self.init_model(model_or_path, model_policy)
+        self.init_model(model_or_path, model_policy, self.model_shard_infer_config)
 
         self.generation_config = inference_config.to_generation_config(self.model_config)
         self.generation_config_dict = self.generation_config.to_dict()
@@ -97,7 +98,8 @@ def __init__(
             self.capture_model(self.k_cache, self.v_cache)
 
         # Model and relatable attrs of speculative decoding will be set by `enable_spec_dec`
-        self.use_spec_dec = False
+        self.use_spec_dec = self.inference_config.use_spec_dec
+
         self.drafter_model = None
         self.drafter = None
         self.use_glide = False
@@ -105,13 +107,20 @@ def __init__(
 
         self._verify_args()
 
-    def init_model(self, model_or_path: Union[nn.Module, str], model_policy: Union[Policy, Type[Policy]] = None):
+    def init_model(
+        self,
+        model_or_path: Union[nn.Module, str],
+        model_policy: Union[Policy, Type[Policy]] = None,
+        model_shard_infer_config: ModelShardInferenceConfig = None,
+    ):
         """
         Shard model or/and Load weight
 
         Args:
             model_or_path Union[nn.Module, str]: path to the checkpoint or model of transformer format.
-            model_policy (Policy): the policy to replace the model
+            model_policy (Policy): the policy to replace the model.
+            model_inference_config: the configuration for modeling initialization when inference.
+            model_shard_infer_config (ModelShardInferenceConfig): the configuration for init of module when inference.
         """
 
         if isinstance(model_or_path, str):
@@ -124,6 +133,7 @@ def init_model(self, model_or_path: Union[nn.Module, str], model_policy: Union[P
                     # the model load process in the future.
                     model = _supported_models[arch].from_pretrained(model_or_path, trust_remote_code=True)
                 else:
+                    # TODO(char-1ee): if the model not supported, use transformers APIs to load and generate
                     raise ValueError(f"Model {arch} is not supported.")
 
             except Exception as e:
@@ -167,6 +177,7 @@ def init_model(self, model_or_path: Union[nn.Module, str], model_policy: Union[P
         self.model = self._shardformer(
             model,
             model_policy,
+            model_shard_infer_config,
             None,
             tp_group=tp_group,
         )
@@ -187,7 +198,7 @@ def init_model(self, model_or_path: Union[nn.Module, str], model_policy: Union[P
         #     assert if_has_index_file, "the model path is invalid"
         #     cpt_io.load_model(self.model, model_index_file)
 
-        free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+        free_gpu_memory, _ = torch.cuda.mem_get_info()
         peak_memory = init_gpu_memory - free_gpu_memory
         if self.verbose:
             self.logger.info(
@@ -287,6 +298,7 @@ def _shardformer(
         self,
         model: nn.Module,
         model_policy: Policy,
+        model_shard_infer_config: ModelShardInferenceConfig = None,
         stage_manager: PipelineStageManager = None,
         tp_group: ProcessGroupMesh = None,
     ) -> nn.Module:
@@ -312,6 +324,7 @@ def _shardformer(
             enable_flash_attention=False,
             enable_jit_fused=False,
             enable_sequence_parallelism=False,
+            extra_kwargs={"model_shard_infer_config": model_shard_infer_config},
         )
         shardformer = ShardFormer(shard_config=shardconfig)
         shard_model, _ = shardformer.optimize(model, model_policy)
@@ -348,6 +361,7 @@ def enable_spec_dec(
         engine.clear_spec_dec()
         ```
         """
+
         if drafter_model is None and self.drafter is None:
             raise ValueError("Drafter not initialized. Please provide a Drafter Model")
         if n_spec_tokens is not None:
@@ -517,19 +531,19 @@ def generate(
         prompts_token_ids: Union[List[int], torch.Tensor, np.ndarray] = None,
         return_token_ids: bool = False,
         generation_config: Optional[GenerationConfig] = None,
-    ) -> List[str]:
+    ) -> Union[List[str], Tuple[List[str], List[List[int]]]]:
         """
         Executing the inference step.
 
         Args:
-            prompts (Union[List[str], optional): Input prompts. Defaults to None.
-            prompts_token_ids (List[List[int]], optional): token ids of input prompts. Defaults to None.
             request_ids (List[int], optional): The request ID. Defaults to None.
-            return_token_ids (bool): Whether to return output token ids. Defaults to False.
-            generation_config (GenerationConfig, optional): Huggingface GenerationConfig used for inference. Defaults to None.
+            prompts (Union[List[str], optional): Input prompts. Defaults to None.
+            prompts_token_ids (Union[List[int], torch.Tensor, np.ndarray], optional): token ids of input prompts. Defaults to None.
+            return_token_ids (bool, optional): Whether to return output token ids. Defaults to False.
+            generation_config (Optional[GenerationConfig], optional): Huggingface GenerationConfig used for inference. Defaults to None.
 
         Returns:
-            List[str]: Inference result returned by one generation.
+            Union[List[str], Tuple[List[str], List[List[int]]]]: Inference result returned by one generation.
         """
 
         gen_config_dict = generation_config.to_dict() if generation_config is not None else {}
@@ -667,6 +681,11 @@ def add_request(
             elif max_length is not None:
                 max_new_tokens = max_length - len(prompts_token_ids[i])
 
+            if not self.inference_config.enable_streamingllm:
+                assert (
+                    self.inference_config.max_output_len >= max_new_tokens
+                ), f"max_new_tokens={max_new_tokens} must be less than max_output_len={self.inference_config.max_output_len}."
+
             sequence = Sequence(
                 request_id,
                 prompt,
@@ -754,6 +773,13 @@ def step(self) -> List[str]:
         logits = model_executable(input_token_ids, output_tensor, input_meta_data, self.k_cache, self.v_cache)
         if self.inference_config.pad_input:
             logits = logits[:, -1, :]
+
+        if self.inference_config.enable_streamingllm:
+            updated_block_ids = batch.streamingllm_update_batch(
+                self.inference_config.start_token_size, self.inference_config.generated_token_size
+            )
+            self.request_handler.streamingllm_free_block_tables(updated_block_ids)
+
         next_tokens = search_tokens(
             self.generation_config, logits, input_meta_data.is_prompts, batch_token_ids=input_meta_data.batch_token_ids
         )
diff --git a/colossalai/inference/core/request_handler.py b/colossalai/inference/core/request_handler.py
index 5085c55558b4..512eaea71c7b 100644
--- a/colossalai/inference/core/request_handler.py
+++ b/colossalai/inference/core/request_handler.py
@@ -157,6 +157,9 @@ def __init__(self, inference_config: InferenceConfig, model_config: PretrainedCo
             fd_interm_tensor=fd_inter_tensor,
             dtype=self.dtype,
             device=device,
+            enable_streamingllm=inference_config.enable_streamingllm,
+            start_token_size=inference_config.start_token_size,
+            generated_token_size=inference_config.generated_token_size,
         )
         self.prefill_bb = BatchBucket(
             num_heads=model_config.num_attention_heads // inference_config.tp_size,
@@ -168,6 +171,9 @@ def __init__(self, inference_config: InferenceConfig, model_config: PretrainedCo
             fd_interm_tensor=fd_inter_tensor,
             dtype=self.dtype,
             device=device,
+            enable_streamingllm=inference_config.enable_streamingllm,
+            start_token_size=inference_config.start_token_size,
+            generated_token_size=inference_config.generated_token_size,
         )
 
     def _init_cache(self, model_config):
@@ -350,6 +356,12 @@ def update(self):
 
         return finished_seqs
 
+    def streamingllm_free_block_tables(self, updated_block_ids: List[int]):
+        """
+        Free the block that needs to be swapped out.
+        """
+        self.cache_manager.streamingllm_free_block_tables(updated_block_ids)
+
 
 class RPCRequestHandler(RequestHandler):
     """
diff --git a/colossalai/inference/executor/__init__.py b/colossalai/inference/executor/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/colossalai/inference/kv_cache/kvcache_manager.py b/colossalai/inference/kv_cache/kvcache_manager.py
index a20bd8ee79ea..378eb2ff9151 100644
--- a/colossalai/inference/kv_cache/kvcache_manager.py
+++ b/colossalai/inference/kv_cache/kvcache_manager.py
@@ -78,10 +78,16 @@ def __init__(self, config: InferenceConfig, model_config: PretrainedConfig) -> N
         self.max_output_length = config.max_output_len
         # Cache block settings
         self.block_size = config.block_size
+
         # NOTE: `num_blocks` is not prompted, but evaluated from the maximum input/output length, and the maximum batch size
-        self.max_blocks_per_sequence = (
-            self.max_input_length + self.max_output_length + self.block_size - 1
-        ) // self.block_size
+        if config.enable_streamingllm:
+            self.max_blocks_per_sequence = (
+                config.start_token_size + config.generated_token_size + self.block_size - 1
+            ) // self.block_size + 1
+        else:
+            self.max_blocks_per_sequence = (
+                self.max_input_length + self.max_output_length + self.block_size - 1
+            ) // self.block_size
         self.num_blocks = self.max_blocks_per_sequence * self.max_batch_size * self.beam_width
 
         # Physical cache allocation
@@ -446,6 +452,20 @@ def clear_all(self) -> None:
         self._available_blocks = self.num_blocks
         self._block_states[:] = 1
 
+    def streamingllm_free_block_tables(self, updated_block_ids: List[int]):
+        """
+        Free the block that needs to be swapped out.
+        """
+        for global_block_id in updated_block_ids:
+            if global_block_id < 0:
+                return
+            block: CacheBlock = self._cache_blocks[global_block_id]
+            block.remove_ref()
+            if not block.has_ref():
+                block.allocated_size = 0
+                self._available_blocks += 1
+                self._block_states[global_block_id] = 1
+
     def get_physical_cache(self, layer_id: int, block_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
         """Get the tensor corresponding to the cache block with the prompted id for a specific layer."""
         return self._kv_caches[0][layer_id][block_idx], self._kv_caches[1][layer_id][block_idx]
@@ -533,10 +553,16 @@ def __init__(self, config: InferenceConfig, model_config: PretrainedConfig, verb
         self.max_output_length = config.max_output_len
         # Cache block settings
         self.block_size = config.block_size
+
         # NOTE: `num_blocks` is not prompted, but evaluated from the maximum input/output length, and the maximum batch size
-        self.max_blocks_per_sequence = (
-            self.max_input_length + self.max_output_length + self.block_size - 1
-        ) // self.block_size
+        if config.enable_streamingllm:
+            self.max_blocks_per_sequence = (
+                config.start_token_size + config.generated_token_size + self.block_size - 1
+            ) // self.block_size + 1
+        else:
+            self.max_blocks_per_sequence = (
+                self.max_input_length + self.max_output_length + self.block_size - 1
+            ) // self.block_size
         self.num_blocks = self.max_blocks_per_sequence * self.max_batch_size * self.beam_width
 
         # Logical cache blocks allocation
diff --git a/colossalai/inference/modeling/backends/__init__.py b/colossalai/inference/modeling/backends/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/colossalai/inference/modeling/backends/attention_backend.py b/colossalai/inference/modeling/backends/attention_backend.py
new file mode 100644
index 000000000000..ab586f510d7f
--- /dev/null
+++ b/colossalai/inference/modeling/backends/attention_backend.py
@@ -0,0 +1,170 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+import torch
+
+from colossalai.inference.config import ModelShardInferenceConfig
+from colossalai.kernel.kernel_loader import InferenceOpsLoader
+from colossalai.kernel.triton import context_attention_unpadded, flash_decoding_attention
+
+
+@dataclass
+class AttentionMetaData:
+    query_states: torch.Tensor
+    key_states: torch.Tensor
+    value_states: torch.Tensor
+    k_cache: torch.Tensor
+    v_cache: torch.Tensor
+    block_tables: torch.Tensor
+    block_size: int
+    kv_seq_len: int = None
+    sequence_lengths: torch.Tensor = None
+    cu_seqlens: torch.Tensor = None
+    sm_scale: int = None
+    alibi_slopes: torch.Tensor = None
+    output_tensor: torch.Tensor = None
+    use_spec_dec: bool = False
+    use_alibi_attn: bool = False
+
+
+class AttentionBackend(ABC):
+    @abstractmethod
+    def prefill(self, attn_metadata: AttentionMetaData, **kwargs):
+        raise NotImplementedError
+
+    @abstractmethod
+    def decode(self, attn_metadatas: AttentionMetaData, **kwargs):
+        raise NotImplementedError
+
+
+class CudaAttentionBackend(AttentionBackend):
+    """
+    Attention backend when use_cuda_kernel is True but flash-attn not found. If flash-attn is not found,
+    it uses Triton op `context_attention_unpadded` for prefilling and our cuda op `flash_decoding_attention` for decoding.
+    """
+
+    def __init__(self, use_flash_attn: bool = False):
+        super().__init__()
+        self.inference_ops = InferenceOpsLoader().load()
+        self.use_flash_attn = use_flash_attn
+
+    def prefill(self, attn_metadata: AttentionMetaData, **kwargs):
+        if self.use_flash_attn:
+            token_nums = kwargs.get("token_nums", -1)
+
+            from flash_attn import flash_attn_varlen_func
+
+            attn_output = flash_attn_varlen_func(
+                attn_metadata.query_states,
+                attn_metadata.key_states,
+                attn_metadata.value_states,
+                cu_seqlens_q=attn_metadata.cu_seqlens,
+                cu_seqlens_k=attn_metadata.cu_seqlens,
+                max_seqlen_q=attn_metadata.kv_seq_len,
+                max_seqlen_k=attn_metadata.kv_seq_len,
+                dropout_p=0.0,
+                softmax_scale=attn_metadata.sm_scale,
+                causal=True,
+                alibi_slopes=attn_metadata.alibi_slopes,
+            )
+            attn_output = attn_output.view(token_nums, -1)
+        else:
+            attn_output = context_attention_unpadded(
+                q=attn_metadata.query_states,
+                k=attn_metadata.key_states,
+                v=attn_metadata.value_states,
+                k_cache=attn_metadata.k_cache,
+                v_cache=attn_metadata.v_cache,
+                context_lengths=attn_metadata.sequence_lengths,
+                block_tables=attn_metadata.block_tables,
+                block_size=attn_metadata.block_size,
+                output=attn_metadata.output_tensor,
+                alibi_slopes=attn_metadata.alibi_slopes,
+                max_seq_len=attn_metadata.kv_seq_len,
+                sm_scale=attn_metadata.sm_scale,
+                use_new_kcache_layout=True,  # use new k-cache layout
+            )
+        return attn_output
+
+    def decode(self, attn_metadata: AttentionMetaData, **kwargs):
+        fd_inter_tensor = kwargs.get("fd_inter_tensor", None)
+        output_tensor = attn_metadata.output_tensor
+        self.inference_ops.flash_decoding_attention(
+            output_tensor,
+            attn_metadata.query_states,
+            attn_metadata.k_cache,
+            attn_metadata.v_cache,
+            attn_metadata.sequence_lengths,
+            attn_metadata.block_tables,
+            attn_metadata.block_size,
+            attn_metadata.kv_seq_len,
+            fd_inter_tensor.mid_output,
+            fd_inter_tensor.exp_sums,
+            fd_inter_tensor.max_logits,
+            attn_metadata.alibi_slopes,
+            attn_metadata.sm_scale,
+        )
+        return output_tensor
+
+
+class TritonAttentionBackend(AttentionBackend):
+    """
+    Attention backend when use_cuda_kernel is False. It uses pure Triton ops for prefilling and decoding.
+    """
+
+    def prefill(self, attn_metadata: AttentionMetaData, **kwargs):
+        return context_attention_unpadded(
+            q=attn_metadata.query_states,
+            k=attn_metadata.key_states,
+            v=attn_metadata.value_states,
+            k_cache=attn_metadata.k_cache,
+            v_cache=attn_metadata.v_cache,
+            context_lengths=attn_metadata.sequence_lengths,
+            block_tables=attn_metadata.block_tables,
+            block_size=attn_metadata.block_size,
+            output=attn_metadata.output_tensor,
+            alibi_slopes=attn_metadata.alibi_slopes,
+            max_seq_len=attn_metadata.kv_seq_len,
+            sm_scale=attn_metadata.sm_scale,
+        )
+
+    def decode(self, attn_metadata: AttentionMetaData, **kwargs):
+        fd_inter_tensor = kwargs.get("fd_inter_tensor", None)
+        return flash_decoding_attention(
+            q=attn_metadata.query_states,
+            k_cache=attn_metadata.k_cache,
+            v_cache=attn_metadata.v_cache,
+            kv_seq_len=attn_metadata.sequence_lengths,
+            block_tables=attn_metadata.block_tables,
+            block_size=attn_metadata.block_size,
+            max_seq_len_in_batch=attn_metadata.kv_seq_len,
+            output=attn_metadata.output_tensor,
+            mid_output=fd_inter_tensor.mid_output,
+            mid_output_lse=fd_inter_tensor.mid_output_lse,
+            alibi_slopes=attn_metadata.alibi_slopes,
+            sm_scale=attn_metadata.sm_scale,
+            kv_group_num=kwargs.get("num_key_value_groups", 1),
+            q_len=kwargs.get("q_len", 1),
+        )
+
+
+def get_attention_backend(
+    model_shard_infer_config: ModelShardInferenceConfig,
+) -> AttentionBackend:
+    """
+    Get the attention backend based on the inference configurations. The modeling will use CUDA-kernel-based backend
+    for attention module calculation only when:
+        1. using CUDA kernel (use_cuda_kernel=True)
+        2. can use flash attention (flash-attn installed and dtype is fp16 or bf16)
+        3. not using speculative decoding (currently cuda kernel not support speculative decoding)
+    Otherwise, use Triton attention backend. If found flash-attn not installed while `use_cuda_kernel` is True,
+    the Triton backend will use a new k cache layout for Triton kernels.
+    """
+    # Currently only triton kernels support speculative decoding
+    if model_shard_infer_config.use_spec_dec:
+        return TritonAttentionBackend()
+
+    if model_shard_infer_config.use_cuda_kernel:
+        return CudaAttentionBackend(model_shard_infer_config.use_flash_attn)
+
+    return TritonAttentionBackend()
diff --git a/colossalai/inference/modeling/backends/pre_attention_backend.py b/colossalai/inference/modeling/backends/pre_attention_backend.py
new file mode 100644
index 000000000000..77804429daf6
--- /dev/null
+++ b/colossalai/inference/modeling/backends/pre_attention_backend.py
@@ -0,0 +1,146 @@
+from abc import ABC, abstractmethod
+
+from colossalai.inference.config import ModelShardInferenceConfig
+from colossalai.inference.modeling.backends.attention_backend import AttentionMetaData
+from colossalai.kernel.kernel_loader import InferenceOpsLoader
+from colossalai.kernel.triton import copy_k_to_blocked_cache, decoding_fused_rotary_embedding, rotary_embedding
+
+
+class PreAttentionBackend(ABC):
+    @abstractmethod
+    def prefill(self, attn_metadata: AttentionMetaData, **kwargs):
+        raise NotImplementedError
+
+    @abstractmethod
+    def decode(self, attn_metadata: AttentionMetaData, **kwargs):
+        raise NotImplementedError
+
+
+class CudaPreAttentionBackend(PreAttentionBackend):
+    """
+    CudaPreAttentionBackend handles KV cache initialization and positional encoding for CudaAttentionBackend.
+    """
+
+    def __init__(self, use_flash_attn: bool):
+        super().__init__()
+        self.inference_ops = InferenceOpsLoader().load()
+        self.use_flash_attn = use_flash_attn
+
+    def prefill(self, attn_metadata: AttentionMetaData, **kwargs):
+        if self.use_flash_attn:
+            if not attn_metadata.use_alibi_attn:
+                self.inference_ops.rotary_embedding(
+                    attn_metadata.query_states,
+                    attn_metadata.key_states,
+                    kwargs.get("cos", None),
+                    kwargs.get("sin", None),
+                    kwargs.get("high_precision", False),
+                )
+            self.inference_ops.context_kv_cache_memcpy(
+                attn_metadata.key_states,
+                attn_metadata.value_states,
+                attn_metadata.k_cache,
+                attn_metadata.v_cache,
+                attn_metadata.sequence_lengths,
+                attn_metadata.cu_seqlens,
+                attn_metadata.block_tables,
+                attn_metadata.kv_seq_len,
+            )
+        elif not attn_metadata.use_alibi_attn:
+            rotary_embedding(
+                attn_metadata.query_states,
+                attn_metadata.key_states,
+                kwargs.get("cos", None),
+                kwargs.get("sin", None),
+            )
+
+    def decode(self, attn_metadata: AttentionMetaData, **kwargs):
+        if not attn_metadata.use_alibi_attn:
+            self.inference_ops.rotary_embedding_and_cache_copy(
+                attn_metadata.query_states,
+                attn_metadata.key_states,
+                attn_metadata.value_states,
+                kwargs.get("cos", None),
+                kwargs.get("sin", None),
+                attn_metadata.k_cache,
+                attn_metadata.v_cache,
+                attn_metadata.sequence_lengths,
+                attn_metadata.block_tables,
+                kwargs.get("high_precision", None),
+            )
+        else:
+            self.inference_ops.decode_kv_cache_memcpy(
+                attn_metadata.key_states,
+                attn_metadata.value_states,
+                attn_metadata.k_cache,
+                attn_metadata.v_cache,
+                attn_metadata.sequence_lengths,
+                attn_metadata.block_tables,
+            )
+
+
+class TritonPreAttentionBackend(PreAttentionBackend):
+    """
+    TritonPreAttentionBackend handles KV cache initialization and positional encoding for TritonAttentionBackend.
+    """
+
+    def prefill(self, attn_metadata: AttentionMetaData, **kwargs):
+        if not attn_metadata.use_alibi_attn:
+            rotary_embedding(
+                attn_metadata.query_states,
+                attn_metadata.key_states,
+                kwargs.get("cos", None),
+                kwargs.get("sin", None),
+            )
+
+    def decode(self, attn_metadata: AttentionMetaData, **kwargs):
+        if not attn_metadata.use_spec_dec and not attn_metadata.use_alibi_attn:
+            decoding_fused_rotary_embedding(
+                attn_metadata.query_states,
+                attn_metadata.key_states,
+                attn_metadata.value_states,
+                kwargs.get("cos", None),
+                kwargs.get("sin", None),
+                attn_metadata.k_cache,
+                attn_metadata.v_cache,
+                attn_metadata.block_tables,
+                attn_metadata.sequence_lengths,
+            )
+        else:  # else if using speculative decoding
+            if not attn_metadata.use_alibi_attn:
+                rotary_embedding(
+                    attn_metadata.query_states,
+                    attn_metadata.key_states,
+                    kwargs.get("cos", None),
+                    kwargs.get("sin", None),
+                )
+            copy_k_to_blocked_cache(
+                attn_metadata.key_states,
+                attn_metadata.k_cache,
+                kv_lengths=attn_metadata.sequence_lengths,
+                block_tables=attn_metadata.block_tables,
+                n=kwargs.get("q_len", 1),
+            )
+            copy_k_to_blocked_cache(
+                attn_metadata.value_states,
+                attn_metadata.v_cache,
+                kv_lengths=attn_metadata.sequence_lengths,
+                block_tables=attn_metadata.block_tables,
+                n=kwargs.get("q_len", 1),
+            )
+
+
+def get_pre_attention_backend(
+    model_shard_infer_config: ModelShardInferenceConfig,
+) -> PreAttentionBackend:
+    """
+    Get the backend for pre-attention computations, including potisional encoding like
+    RoPE and KV cache initialization. It adopt the same selection logic as attention_backend/get_attention_backend.
+    """
+    if model_shard_infer_config.use_spec_dec:
+        return TritonPreAttentionBackend()
+
+    if model_shard_infer_config.use_cuda_kernel:
+        return CudaPreAttentionBackend(model_shard_infer_config.use_flash_attn)
+
+    return TritonPreAttentionBackend()
diff --git a/colossalai/inference/modeling/layers/baichuan_tp_linear.py b/colossalai/inference/modeling/layers/baichuan_tp_linear.py
index e050dd71c8b2..50806a14b9e8 100644
--- a/colossalai/inference/modeling/layers/baichuan_tp_linear.py
+++ b/colossalai/inference/modeling/layers/baichuan_tp_linear.py
@@ -15,25 +15,10 @@ def from_native_module(
         module.in_features = module.weight.size(1)
         module.out_features = module.weight.size(0)
         module.bias = None
-        module.weight.data = nn.functional.normalize(module.weight)
-
-        return Linear1D_Col.from_native_module(
-            module,
-            process_group,
-            *args,
-            **kwargs,
-        )
-
-
-class BaichuanWpackLinear1D_Col(Linear1D_Col):
-    @staticmethod
-    def from_native_module(
-        module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs
-    ) -> ParallelModule:
-        in_features = module.in_features * 3
-        out_features = module.out_features // 3
-        module.weight.data = module.weight.view(3, out_features, -1).transpose(0, 1).reshape(out_features, in_features)
-        module.bias = None
+        module.weight.data = nn.functional.normalize(
+            module.weight
+        )  # TODO(lry89757) This behavior may not apply to lazy init. When we use lazy init, the weight of shardformer is not the real weight.
+        # So we should rewrite our own load_from_state_dict of `BaichuanLMHeadLinear1D_Col` to fix this potential issue.
 
         return Linear1D_Col.from_native_module(
             module,
diff --git a/colossalai/inference/modeling/models/glide_llama.py b/colossalai/inference/modeling/models/glide_llama.py
index 7b25f3e7489d..013b0f06185d 100644
--- a/colossalai/inference/modeling/models/glide_llama.py
+++ b/colossalai/inference/modeling/models/glide_llama.py
@@ -6,11 +6,7 @@
 
 import torch
 import torch.nn as nn
-from transformers.cache_utils import Cache, DynamicCache
-from transformers.modeling_attn_mask_utils import (
-    _prepare_4d_causal_attention_mask,
-    _prepare_4d_causal_attention_mask_for_sdpa,
-)
+from transformers.cache_utils import Cache, DynamicCache, StaticCache
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.models.llama.modeling_llama import (
     LlamaAttention,
@@ -137,6 +133,7 @@ def glide_llama_model_forward(
     output_attentions: Optional[bool] = None,
     output_hidden_states: Optional[bool] = None,
     return_dict: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
 ) -> Union[Tuple, BaseModelOutputWithPast]:
     output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
     output_hidden_states = (
@@ -147,57 +144,43 @@ def glide_llama_model_forward(
     return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
     # retrieve input_ids and inputs_embeds
-    if input_ids is not None and inputs_embeds is not None:
-        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-    elif input_ids is not None:
-        batch_size, seq_length = input_ids.shape[:2]
-    elif inputs_embeds is not None:
-        batch_size, seq_length = inputs_embeds.shape[:2]
-    else:
-        raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-    past_key_values_length = 0
-    if use_cache:
-        use_legacy_cache = not isinstance(past_key_values, Cache)
-        if use_legacy_cache:
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-        past_key_values_length = past_key_values.get_usable_length(seq_length)
-
-    if position_ids is None:
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-        position_ids = torch.arange(
-            past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+    if (input_ids is None) ^ (inputs_embeds is not None):
+        raise ValueError(
+            "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
         )
-        position_ids = position_ids.unsqueeze(0)
+
+    if self.gradient_checkpointing and self.training and use_cache:
+        logger.warning_once("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.")
+        use_cache = False
 
     if inputs_embeds is None:
         inputs_embeds = self.embed_tokens(input_ids)
 
-    if self._use_flash_attention_2:
-        # 2d mask is passed through the layers
-        attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
-    elif self._use_sdpa and not output_attentions:
-        # output_attentions=True can not be supported when using SDPA, and we fall back on
-        # the manual implementation that requires a 4D causal mask in all cases.
-        attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-            attention_mask,
-            (batch_size, seq_length),
-            inputs_embeds,
-            past_key_values_length,
-        )
-    else:
-        # 4d mask is passed through the layers
-        attention_mask = _prepare_4d_causal_attention_mask(
-            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+    past_seen_tokens = 0
+    if use_cache:  # kept for BC (cache positions)
+        if not isinstance(past_key_values, StaticCache):
+            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_seen_tokens = past_key_values.get_seq_length()
+
+    if cache_position is None:
+        if isinstance(past_key_values, StaticCache):
+            raise ValueError("cache_position is a required argument when using StaticCache.")
+        cache_position = torch.arange(
+            past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
         )
 
+    if position_ids is None:
+        position_ids = cache_position.unsqueeze(0)
+
+    attention_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
+
     # embed positions
     hidden_states = inputs_embeds
 
     # decoder layers
     all_hidden_states = () if output_hidden_states else None
     all_self_attns = () if output_attentions else None
-    next_decoder_cache = () if use_cache else None
+    next_decoder_cache = None
 
     for decoder_layer in self.layers:
         if output_hidden_states:
@@ -212,6 +195,7 @@ def glide_llama_model_forward(
             past_key_value=past_key_values,
             output_attentions=output_attentions,
             use_cache=use_cache,
+            cache_position=cache_position,
         )
 
         hidden_states = layer_outputs[0]
@@ -230,7 +214,9 @@ def glide_llama_model_forward(
 
     next_cache = None
     if use_cache:
-        next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        next_cache = (
+            next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache
+        )
     if not return_dict:
         return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
     return BaseModelOutputWithPast(
diff --git a/colossalai/inference/modeling/models/nopadding_baichuan.py b/colossalai/inference/modeling/models/nopadding_baichuan.py
index b50e73d6fcf4..3bab671c455f 100644
--- a/colossalai/inference/modeling/models/nopadding_baichuan.py
+++ b/colossalai/inference/modeling/models/nopadding_baichuan.py
@@ -1,68 +1,27 @@
 # This code is adapted from huggingface baichuan model: hhttps://huggingface.co/baichuan-inc/Baichuan2-13B-Base/blob/main/modeling_baichuan.py
-import itertools
-import math
 from typing import List, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
 from torch.distributed import ProcessGroup
 
+from colossalai.accelerator import get_accelerator
+from colossalai.inference.config import ModelShardInferenceConfig
 from colossalai.inference.flash_decoding_utils import FDIntermTensors
+from colossalai.inference.modeling.backends.attention_backend import AttentionMetaData, get_attention_backend
+from colossalai.inference.modeling.backends.pre_attention_backend import get_pre_attention_backend
 from colossalai.inference.modeling.models.nopadding_llama import NopadLlamaMLP
+from colossalai.inference.utils import get_alibi_slopes
 from colossalai.kernel.kernel_loader import InferenceOpsLoader
-from colossalai.kernel.triton import (
-    context_attention_unpadded,
-    copy_k_to_blocked_cache,
-    decoding_fused_rotary_embedding,
-    flash_decoding_attention,
-    rms_layernorm,
-    rotary_embedding,
-)
+from colossalai.kernel.triton import rms_layernorm
 from colossalai.logging import get_dist_logger
 from colossalai.shardformer.layer.parallel_module import ParallelModule
-from colossalai.tensor.d_tensor import Layout, distribute_tensor, is_distributed_tensor
-
-logger = get_dist_logger(__name__)
-
-try:
-    from flash_attn import flash_attn_varlen_func
-
-    use_flash_attn2 = True
-except ImportError:
-    use_flash_attn2 = False
-    logger.warning(f"flash_attn2 has not been installed yet, we will use triton flash attn instead.")
-
-logger = get_dist_logger(__name__)
-
-try:
-    from flash_attn import flash_attn_varlen_func
-
-    use_flash_attn2 = True
-except ImportError:
-    use_flash_attn2 = False
-    logger.warning(f"flash_attn2 has not been installed yet, we will use triton flash attn instead.")
+from colossalai.tensor.d_tensor import is_distributed_tensor
 
 inference_ops = InferenceOpsLoader().load()
-
 logger = get_dist_logger(__name__)
 
 
-# alibi slopes calculation adapted from https://github.com/huggingface/transformers/blob/v4.36.0/src/transformers/models/bloom/modeling_bloom.py#L57
-def get_alibi_slopes(num_heads: int, device: torch.device) -> torch.Tensor:
-    closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
-    base = torch.tensor(2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), dtype=torch.float32, device=device)
-    powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32, device=device)
-    slopes = torch.pow(base, powers)
-    if closest_power_of_2 != num_heads:
-        extra_base = torch.tensor(
-            2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), dtype=torch.float32, device=device
-        )
-        num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
-        extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, dtype=torch.int32, device=device)
-        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
-    return slopes
-
-
 def baichuan_rmsnorm_forward(
     self,
     hidden_states: torch.Tensor,
@@ -96,23 +55,19 @@ class NopadBaichuanAttention(ParallelModule):
     def __init__(
         self,
         config,
-        attn_qproj_w: torch.Tensor = None,
-        attn_kproj_w: torch.Tensor = None,
-        attn_vproj_w: torch.Tensor = None,
+        W_pack: ParallelModule = None,
         attn_oproj: ParallelModule = None,
         num_heads: int = None,
         hidden_size: int = None,
+        model_shard_infer_config: ModelShardInferenceConfig = None,
         process_group: ProcessGroup = None,
-        helper_layout: Layout = None,
     ):
         """This layer will replace the BaichuanAttention.
 
         Args:
             config (BaichuanConfig): Holding the Baichuan model config.
-            attn_qproj_w (torch.Tensor, optional): The transposed q_proj weight. Defaults to None.
-            attn_kproj_w (torch.Tensor, optional): The transposed k_proj weight. Defaults to None.
-            attn_vproj_w (torch.Tensor, optional): The transposed v_proj weight. Defaults to None.
-            attn_oproj (Linear1D_Row, optional): The Linear1D_Row o_proj weight. Defaults to None.
+            W_pack (ParallelModule, optional): The packed weight. Defaults to None.
+            attn_oproj (Linear1D_Row, optional): The Linear1D_Row o_proj. Defaults to None.
         """
         ParallelModule.__init__(self)
         self.o_proj = attn_oproj
@@ -122,10 +77,10 @@ def __init__(
         self.hidden_size = hidden_size
         self.head_dim = self.hidden_size // self.num_heads
         self.process_group = process_group
-        qkv_weight_list = [attn_qproj_w.transpose(0, 1), attn_kproj_w.transpose(0, 1), attn_vproj_w.transpose(0, 1)]
-        self.qkv_weight = nn.Parameter(torch.stack(qkv_weight_list, dim=0))
-
-        self.helper_layout = helper_layout
+        self.W_pack = W_pack
+        self.use_cuda_kernel = model_shard_infer_config.use_cuda_kernel
+        self.attention_backend = get_attention_backend(model_shard_infer_config)
+        self.pre_attention_backend = get_pre_attention_backend(model_shard_infer_config)
 
         self.alibi_slopes = None
         self.use_alibi_attn = False
@@ -133,9 +88,9 @@ def __init__(
         if config.hidden_size == 5120:
             slopes_start = self.process_group.rank() * num_heads
             self.use_alibi_attn = True
-            self.alibi_slopes = get_alibi_slopes(config.num_attention_heads, device=attn_qproj_w.device)[
-                slopes_start : slopes_start + num_heads
-            ].contiguous()
+            self.alibi_slopes = get_alibi_slopes(
+                config.num_attention_heads, device=get_accelerator().get_current_device()
+            )[slopes_start : slopes_start + num_heads].contiguous()
             self.alibi_slopes = nn.Parameter(self.alibi_slopes)
 
     @staticmethod
@@ -149,76 +104,22 @@ def from_native_module(
         """
 
         config = module.config
-        q_proj_w, k_proj_w, v_proj_w = module.W_pack.weight.view((module.hidden_size, 3, -1)).transpose(0, 1)
-
-        attn_qproj_w = q_proj_w
-        attn_kproj_w = k_proj_w
-        attn_vproj_w = v_proj_w
+        W_pack = module.W_pack
         attn_oproj = module.o_proj
-
-        helper_layout = (
-            module.W_pack.weight.dist_layout
-        )  # NOTE this is a hack for the right load/shard of qkv_weight(used in _load_from_state_dict)
+        model_shard_infer_config = kwargs.get("model_shard_infer_config", None)
 
         attn_layer = NopadBaichuanAttention(
             config=config,
-            attn_qproj_w=attn_qproj_w,
-            attn_kproj_w=attn_kproj_w,
-            attn_vproj_w=attn_vproj_w,
+            W_pack=W_pack,
             attn_oproj=attn_oproj,
+            model_shard_infer_config=model_shard_infer_config,
             num_heads=module.num_heads,
             hidden_size=module.hidden_size,
             process_group=process_group,
-            helper_layout=helper_layout,
         )
 
         return attn_layer
 
-    def _load_from_state_dict(
-        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
-    ):
-        for hook in self._load_state_dict_pre_hooks.values():
-            hook(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
-
-        persistent_buffers = {k: v for k, v in self._buffers.items() if k not in self._non_persistent_buffers_set}
-        local_name_params = itertools.chain(self._parameters.items(), persistent_buffers.items())
-        local_state = {k: v for k, v in local_name_params if v is not None}
-
-        key = "qkv_weight"
-        qkv_w = state_dict[prefix + "W_pack.weight"]
-
-        in_features = qkv_w.size(1)
-        out_features = qkv_w.size(0) // 3
-
-        qkv_w.data = qkv_w.view((3, out_features, -1)).transpose(0, 1).reshape(out_features, in_features * 3)
-
-        device_mesh = self.helper_layout.device_mesh
-        sharding_spec = self.helper_layout.sharding_spec
-        qkv_w = distribute_tensor(qkv_w, device_mesh, sharding_spec)
-
-        qkv_w = qkv_w.transpose(0, 1).reshape(3, in_features, -1)
-        input_param = nn.Parameter(
-            qkv_w
-        )  # NOTE qkv_weight doesn't have to be a distensor, Like input_param = sharded_tensor_to_param(input_param)
-
-        param = local_state[key]
-
-        try:
-            with torch.no_grad():
-                param.copy_(input_param)
-        except Exception as ex:
-            error_msgs.append(
-                'While copying the parameter named "{}", '
-                "whose dimensions in the model are {} and "
-                "whose dimensions in the checkpoint are {}, "
-                "an exception occurred : {}.".format(key, param.size(), input_param.size(), ex.args)
-            )
-
-        strict = False  # to avoid unexpected_keys
-        super()._load_from_state_dict(
-            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
-        )
-
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -234,7 +135,6 @@ def forward(
         kv_seq_len: int = 0,
         output_tensor: torch.Tensor = None,
         sm_scale: int = None,
-        use_cuda_kernel: bool = True,
         cu_seqlens: torch.Tensor = None,
         high_precision: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
@@ -253,144 +153,66 @@ def forward(
             kv_seq_len (int, optional): The max sequence length of input sequences. Defaults to 0.
             output_tensor (torch.Tensor, optional): The mid tensor holds the output of attention. Defaults to None.
             sm_scale (int, optional): Used for flash attention. Defaults to None.
-            use_cuda_kernel: (bool, optional): Whether to use cuda kernel. Defaults to True.
             cu_seqlens(torch.Tensor, optional): Holding the cumulative sum of sequence length.
             high_precision(Optional[bool]): Whether to use float32 for underlying calculations of float16 data to achieve higher precision, defaults to False.
         """
-
         token_nums = hidden_states.size(0)
-        # fused qkv
-        hidden_states = hidden_states.expand(3, -1, -1)
-        query_states, key_states, value_states = (
-            torch.bmm(hidden_states, self.qkv_weight).view(3, token_nums, self.num_heads, self.head_dim).unbind(0)
-        )
+
+        proj = self.W_pack(hidden_states)
+        proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2)
+        query_states = proj[0].view(token_nums, self.num_heads, self.head_dim)
+        key_states = proj[1].view(token_nums, self.num_heads, self.head_dim)
+        value_states = proj[2].view(token_nums, self.num_heads, self.head_dim)
 
         block_size = k_cache.size(-2)
 
-        if is_prompts:
-            if not is_verifier and use_cuda_kernel and query_states.dtype != torch.float32 and use_flash_attn2:
-                # flash attn 2 currently only supports FP16/BF16.
-                if not self.use_alibi_attn:
-                    inference_ops.rotary_embedding(query_states, key_states, cos_sin[0], cos_sin[1], high_precision)
-                inference_ops.context_kv_cache_memcpy(
-                    key_states, value_states, k_cache, v_cache, sequence_lengths, cu_seqlens, block_tables, kv_seq_len
-                )
-                attn_output = flash_attn_varlen_func(
-                    query_states,
-                    key_states,
-                    value_states,
-                    cu_seqlens_q=cu_seqlens,
-                    cu_seqlens_k=cu_seqlens,
-                    max_seqlen_q=kv_seq_len,
-                    max_seqlen_k=kv_seq_len,
-                    dropout_p=0.0,
-                    softmax_scale=sm_scale,
-                    causal=True,
-                    alibi_slopes=self.alibi_slopes,
-                )
-                attn_output = attn_output.view(token_nums, -1)
-            else:
-                if not self.use_alibi_attn:
-                    rotary_embedding(query_states, key_states, cos_sin[0], cos_sin[1])
-                attn_output = context_attention_unpadded(
-                    q=query_states,
-                    k=key_states,
-                    v=value_states,
-                    k_cache=k_cache,
-                    v_cache=v_cache,
-                    context_lengths=sequence_lengths,
-                    block_tables=block_tables,
-                    block_size=block_size,
-                    output=output_tensor,
-                    alibi_slopes=self.alibi_slopes,
-                    max_seq_len=kv_seq_len,
-                    sm_scale=sm_scale,
-                    use_new_kcache_layout=use_cuda_kernel,
-                )
-        else:
-            q_len = tokens_to_verify + 1 if is_verifier else 1
+        attn_metadata = AttentionMetaData(
+            query_states=query_states,
+            key_states=key_states,
+            value_states=value_states,
+            k_cache=k_cache,
+            v_cache=v_cache,
+            block_tables=block_tables,
+            block_size=block_size,
+            kv_seq_len=kv_seq_len,
+            sequence_lengths=sequence_lengths,
+            sm_scale=sm_scale,
+            alibi_slopes=self.alibi_slopes,
+            cu_seqlens=cu_seqlens,
+            output_tensor=output_tensor,
+            use_spec_dec=is_verifier,
+            use_alibi_attn=self.use_alibi_attn,
+        )
 
-            if use_cuda_kernel:
-                if not self.use_alibi_attn:
-                    inference_ops.rotary_embedding_and_cache_copy(
-                        query_states,
-                        key_states,
-                        value_states,
-                        cos_sin[0],
-                        cos_sin[1],
-                        k_cache,
-                        v_cache,
-                        sequence_lengths,
-                        block_tables,
-                        high_precision,
-                    )
-                else:
-                    inference_ops.decode_kv_cache_memcpy(
-                        key_states, value_states, k_cache, v_cache, sequence_lengths, block_tables
-                    )
-                inference_ops.flash_decoding_attention(
-                    output_tensor,
-                    query_states,
-                    k_cache,
-                    v_cache,
-                    sequence_lengths,
-                    block_tables,
-                    block_size,
-                    kv_seq_len,
-                    fd_inter_tensor.mid_output,
-                    fd_inter_tensor.exp_sums,
-                    fd_inter_tensor.max_logits,
-                    self.alibi_slopes,
-                    sm_scale,
-                )
-                attn_output = output_tensor
-            else:
-                if not is_verifier and not self.use_alibi_attn:
-                    decoding_fused_rotary_embedding(
-                        query_states,
-                        key_states,
-                        value_states,
-                        cos_sin[0],
-                        cos_sin[1],
-                        k_cache,
-                        v_cache,
-                        block_tables,
-                        sequence_lengths,
-                    )
-                else:
-                    if not self.use_alibi_attn:
-                        rotary_embedding(query_states, key_states, cos_sin[0], cos_sin[1])
-                    copy_k_to_blocked_cache(
-                        key_states, k_cache, kv_lengths=sequence_lengths, block_tables=block_tables, n=q_len
-                    )
-                    copy_k_to_blocked_cache(
-                        value_states, v_cache, kv_lengths=sequence_lengths, block_tables=block_tables, n=q_len
-                    )
+        if is_prompts:  # prefilling stage
+            self.pre_attention_backend.prefill(
+                attn_metadata,
+                cos=cos_sin[0],
+                sin=cos_sin[1],
+                high_precision=high_precision,
+            )
+            attn_output = self.attention_backend.prefill(
+                attn_metadata,
+                token_nums=token_nums,
+            )
+        else:  # decoding stage
+            q_len = tokens_to_verify + 1 if is_verifier else 1
 
-                attn_output = flash_decoding_attention(
-                    q=query_states,
-                    k_cache=k_cache,
-                    v_cache=v_cache,
-                    kv_seq_len=sequence_lengths,
-                    block_tables=block_tables,
-                    block_size=block_size,
-                    max_seq_len_in_batch=kv_seq_len,
-                    output=output_tensor,
-                    mid_output=fd_inter_tensor.mid_output,
-                    mid_output_lse=fd_inter_tensor.mid_output_lse,
-                    alibi_slopes=self.alibi_slopes,
-                    sm_scale=sm_scale,
-                    q_len=q_len,
-                )
+            self.pre_attention_backend.decode(
+                attn_metadata,
+                q_len=q_len,
+            )
+            attn_output = self.attention_backend.decode(
+                attn_metadata,
+                fd_inter_tensor=fd_inter_tensor,
+                q_len=q_len,
+            )
 
         attn_output = attn_output.view(-1, self.hidden_size)
         attn_output = self.o_proj(attn_output)
 
         return attn_output
 
-    def extra_repr(self) -> str:
-        return f"qkv_weight_proj MergedLinear1D_Col: in_features={self.qkv_weight.shape[1]}x3, out_features={self.qkv_weight.shape[2]}, bias=False"
-
 
 # NOTE This will cause difference as out length increases.
 class NopadBaichuanMLP(NopadLlamaMLP):
diff --git a/colossalai/inference/modeling/models/nopadding_llama.py b/colossalai/inference/modeling/models/nopadding_llama.py
index f6f160eb7e96..445ec59ceb1d 100644
--- a/colossalai/inference/modeling/models/nopadding_llama.py
+++ b/colossalai/inference/modeling/models/nopadding_llama.py
@@ -16,18 +16,13 @@
     LlamaRMSNorm,
 )
 
-from colossalai.inference.config import InputMetaData
+from colossalai.inference.config import InputMetaData, ModelShardInferenceConfig
 from colossalai.inference.flash_decoding_utils import FDIntermTensors
+from colossalai.inference.modeling.backends.attention_backend import AttentionMetaData, get_attention_backend
+from colossalai.inference.modeling.backends.pre_attention_backend import get_pre_attention_backend
+from colossalai.inference.utils import can_use_flash_attn2
 from colossalai.kernel.kernel_loader import InferenceOpsLoader
-from colossalai.kernel.triton import (
-    context_attention_unpadded,
-    copy_k_to_blocked_cache,
-    decoding_fused_rotary_embedding,
-    flash_decoding_attention,
-    get_xine_cache,
-    rms_layernorm,
-    rotary_embedding,
-)
+from colossalai.kernel.triton import get_xine_cache, rms_layernorm
 from colossalai.logging import get_dist_logger
 from colossalai.shardformer.layer.parallel_module import ParallelModule
 from colossalai.tensor.d_tensor import distribute_tensor, is_distributed_tensor
@@ -36,14 +31,6 @@
 
 logger = get_dist_logger(__name__)
 
-try:
-    from flash_attn import flash_attn_varlen_func
-
-    use_flash_attn2 = True
-except ImportError:
-    use_flash_attn2 = False
-    logger.warning(f"flash_attn2 has not been installed yet, we will use triton flash attn instead.")
-
 
 def llama_causal_lm_forward(
     self: LlamaForCausalLM,
@@ -126,8 +113,8 @@ def llama_model_forward(
         cos_sin = (self._cos_cached[rotary_indexes], self._sin_cached[rotary_indexes])
 
     elif use_cuda_kernel:
-        if inputmetadata.dtype != torch.float32 and use_flash_attn2:
-            cu_seqlens = F.pad(torch.cumsum(sequence_lengths, dim=0, dtype=torch.torch.int32), (1, 0))
+        if can_use_flash_attn2(inputmetadata.dtype):
+            cu_seqlens = F.pad(torch.cumsum(sequence_lengths, dim=0, dtype=torch.int32), (1, 0))
 
         hidden_dim = self._cos_cached.size(-1)
         total_length = hidden_states.size(0)
@@ -238,7 +225,6 @@ def llama_decoder_layer_forward(
         kv_seq_len=kv_seq_len,
         output_tensor=output_tensor,
         sm_scale=sm_scale,
-        use_cuda_kernel=use_cuda_kernel,
         cu_seqlens=cu_seqlens,
         high_precision=high_precision,
     )
@@ -279,7 +265,7 @@ def __init__(
         mlp_dproj: ParallelModule = None,
         process_group: ProcessGroup = None,
     ):
-        """A Unified Layer for
+        """Replacement of LlamaMLP layer.
 
         Args:
             config (LlamaConfig): Holding the Llama model config.
@@ -402,6 +388,7 @@ def __init__(
         attn_vproj_w: torch.Tensor = None,
         attn_oproj: ParallelModule = None,
         process_group: ProcessGroup = None,
+        model_shard_infer_config: ModelShardInferenceConfig = None,
         num_heads: int = None,
         hidden_size: int = None,
         num_key_value_heads: int = None,
@@ -433,6 +420,9 @@ def __init__(
         self.rope_theta = config.rope_theta
         self.is_causal = True
 
+        self.attention_backend = get_attention_backend(model_shard_infer_config)
+        self.pre_attention_backend = get_pre_attention_backend(model_shard_infer_config)
+
         if self.num_heads == self.num_key_value_heads:
             qkv_weight_list = [attn_qproj_w.transpose(0, 1), attn_kproj_w.transpose(0, 1), attn_vproj_w.transpose(0, 1)]
             self.qkv_weight = nn.Parameter(torch.stack(qkv_weight_list, dim=0))
@@ -462,6 +452,7 @@ def from_native_module(
         attn_vproj_w = module.v_proj.weight
         assert is_distributed_tensor(attn_qproj_w), "attn_qproj_w must be dist tensor"
         attn_oproj = module.o_proj
+        model_shard_infer_config = kwargs.get("model_shard_infer_config", None)
 
         attn_layer = NopadLlamaAttention(
             config=config,
@@ -471,6 +462,7 @@ def from_native_module(
             attn_vproj_w=attn_vproj_w,
             attn_oproj=attn_oproj,
             process_group=process_group,
+            model_shard_infer_config=model_shard_infer_config,
             num_heads=module.num_heads,
             hidden_size=module.hidden_size,
             num_key_value_heads=module.num_key_value_heads,
@@ -533,111 +525,50 @@ def forward(
 
         block_size = k_cache.size(-2)
 
-        if is_prompts:
-            if not is_verifier and use_cuda_kernel and query_states.dtype != torch.float32 and use_flash_attn2:
-                # flash attn 2 currently only supports FP16/BF16.
-                inference_ops.rotary_embedding(query_states, key_states, cos_sin[0], cos_sin[1], high_precision)
-                inference_ops.context_kv_cache_memcpy(
-                    key_states, value_states, k_cache, v_cache, sequence_lengths, cu_seqlens, block_tables, kv_seq_len
-                )
+        attn_metadata = AttentionMetaData(
+            query_states=query_states,
+            key_states=key_states,
+            value_states=value_states,
+            k_cache=k_cache,
+            v_cache=v_cache,
+            block_tables=block_tables,
+            block_size=block_size,
+            kv_seq_len=kv_seq_len,
+            sequence_lengths=sequence_lengths,
+            sm_scale=sm_scale,
+            alibi_slopes=None,
+            cu_seqlens=cu_seqlens,
+            output_tensor=output_tensor,
+            use_spec_dec=is_verifier,
+            use_alibi_attn=False,
+        )
 
-                attn_output = flash_attn_varlen_func(
-                    query_states,
-                    key_states,
-                    value_states,
-                    cu_seqlens_q=cu_seqlens,
-                    cu_seqlens_k=cu_seqlens,
-                    max_seqlen_q=kv_seq_len,
-                    max_seqlen_k=kv_seq_len,
-                    dropout_p=0.0,
-                    softmax_scale=sm_scale,
-                    causal=True,
-                )
-                attn_output = attn_output.view(token_nums, -1)
-            else:
-                rotary_embedding(query_states, key_states, cos_sin[0], cos_sin[1])
-                attn_output = context_attention_unpadded(
-                    q=query_states,
-                    k=key_states,
-                    v=value_states,
-                    k_cache=k_cache,
-                    v_cache=v_cache,
-                    context_lengths=sequence_lengths,
-                    block_tables=block_tables,
-                    block_size=block_size,
-                    output=output_tensor,
-                    max_seq_len=kv_seq_len,
-                    sm_scale=sm_scale,
-                    use_new_kcache_layout=use_cuda_kernel,
-                )
-        else:
+        if is_prompts:  # prefilling stage
+            self.pre_attention_backend.prefill(
+                attn_metadata,
+                cos=cos_sin[0],
+                sin=cos_sin[1],
+                high_precision=high_precision,
+            )
+            attn_output = self.attention_backend.prefill(
+                attn_metadata,
+                token_nums=token_nums,
+            )
+        else:  # decoding stage
             q_len = tokens_to_verify + 1 if is_verifier else 1
 
-            if use_cuda_kernel:
-                inference_ops.rotary_embedding_and_cache_copy(
-                    query_states,
-                    key_states,
-                    value_states,
-                    cos_sin[0],
-                    cos_sin[1],
-                    k_cache,
-                    v_cache,
-                    sequence_lengths,
-                    block_tables,
-                    high_precision,
-                )
-                inference_ops.flash_decoding_attention(
-                    output_tensor,
-                    query_states,
-                    k_cache,
-                    v_cache,
-                    sequence_lengths,
-                    block_tables,
-                    block_size,
-                    kv_seq_len,
-                    fd_inter_tensor.mid_output,
-                    fd_inter_tensor.exp_sums,
-                    fd_inter_tensor.max_logits,
-                    None,
-                    sm_scale,
-                )
-                attn_output = output_tensor
-            else:
-                if is_verifier:
-                    rotary_embedding(query_states, key_states, cos_sin[0], cos_sin[1])
-                    copy_k_to_blocked_cache(
-                        key_states, k_cache, kv_lengths=sequence_lengths, block_tables=block_tables, n=q_len
-                    )
-                    copy_k_to_blocked_cache(
-                        value_states, v_cache, kv_lengths=sequence_lengths, block_tables=block_tables, n=q_len
-                    )
-                else:
-                    decoding_fused_rotary_embedding(
-                        query_states,
-                        key_states,
-                        value_states,
-                        cos_sin[0],
-                        cos_sin[1],
-                        k_cache,
-                        v_cache,
-                        block_tables,
-                        sequence_lengths,
-                    )
-                attn_output = flash_decoding_attention(
-                    q=query_states,
-                    k_cache=k_cache,
-                    v_cache=v_cache,
-                    kv_seq_len=sequence_lengths,
-                    block_tables=block_tables,
-                    block_size=block_size,
-                    max_seq_len_in_batch=kv_seq_len,
-                    output=output_tensor,
-                    mid_output=fd_inter_tensor.mid_output,
-                    mid_output_lse=fd_inter_tensor.mid_output_lse,
-                    sm_scale=sm_scale,
-                    kv_group_num=self.num_key_value_groups,
-                    q_len=q_len,
-                )
+            self.pre_attention_backend.decode(
+                attn_metadata,
+                cos=cos_sin[0],
+                sin=cos_sin[1],
+                q_len=q_len,
+            )
+            attn_output = self.attention_backend.decode(
+                attn_metadata,
+                fd_inter_tensor=fd_inter_tensor,
+                num_key_value_groups=self.num_key_value_groups,
+                q_len=q_len,
+            )
 
         attn_output = attn_output.view(-1, self.hidden_size)
         attn_output = self.o_proj(attn_output)
diff --git a/colossalai/inference/modeling/policy/nopadding_baichuan.py b/colossalai/inference/modeling/policy/nopadding_baichuan.py
index 78268d6e7e85..37b5062e887e 100644
--- a/colossalai/inference/modeling/policy/nopadding_baichuan.py
+++ b/colossalai/inference/modeling/policy/nopadding_baichuan.py
@@ -1,8 +1,5 @@
 from colossalai.inference.config import RPC_PARAM
-from colossalai.inference.modeling.layers.baichuan_tp_linear import (
-    BaichuanLMHeadLinear1D_Col,
-    BaichuanWpackLinear1D_Col,
-)
+from colossalai.inference.modeling.layers.baichuan_tp_linear import BaichuanLMHeadLinear1D_Col
 from colossalai.inference.modeling.models.nopadding_baichuan import (
     NopadBaichuanAttention,
     NopadBaichuanMLP,
@@ -14,7 +11,7 @@
     llama_model_forward,
 )
 from colossalai.inference.utils import init_to_get_rotary
-from colossalai.shardformer.layer import Linear1D_Col, Linear1D_Row
+from colossalai.shardformer.layer import FusedLinear1D_Col, Linear1D_Col, Linear1D_Row
 from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, SubModuleReplacementDescription
 from colossalai.shardformer.policies.llama import LlamaForCausalLMPolicy
 
@@ -60,8 +57,7 @@ def module_policy(self):
                         target_module=NopadBaichuanMLP,
                     ),
                     SubModuleReplacementDescription(
-                        suffix="self_attn.W_pack",
-                        target_module=BaichuanWpackLinear1D_Col,
+                        suffix="self_attn.W_pack", target_module=FusedLinear1D_Col, kwargs={"n_fused": 3}
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.o_proj",
@@ -70,6 +66,9 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="self_attn",
                         target_module=NopadBaichuanAttention,
+                        kwargs={
+                            "model_shard_infer_config": self.shard_config.extra_kwargs["model_shard_infer_config"],
+                        },
                     ),
                 ],
             )
diff --git a/colossalai/inference/modeling/policy/nopadding_llama.py b/colossalai/inference/modeling/policy/nopadding_llama.py
index 24cf7c740b10..0b6797560117 100644
--- a/colossalai/inference/modeling/policy/nopadding_llama.py
+++ b/colossalai/inference/modeling/policy/nopadding_llama.py
@@ -72,6 +72,9 @@ def module_policy(self):
                 SubModuleReplacementDescription(
                     suffix="self_attn",
                     target_module=NopadLlamaAttention,
+                    kwargs={
+                        "model_shard_infer_config": self.shard_config.extra_kwargs["model_shard_infer_config"],
+                    },
                 ),
             ],
         )
diff --git a/colossalai/inference/utils.py b/colossalai/inference/utils.py
index 072bedec3587..8c155e6ca09f 100644
--- a/colossalai/inference/utils.py
+++ b/colossalai/inference/utils.py
@@ -1,6 +1,7 @@
 """
 Utils for model inference
 """
+import math
 import os
 import re
 from pathlib import Path
@@ -9,8 +10,11 @@
 import torch
 from torch import nn
 
+from colossalai.logging import get_dist_logger
 from colossalai.testing import free_port
 
+logger = get_dist_logger(__name__)
+
 
 def init_to_get_rotary(self, base=10000, use_elem=False):
     """
@@ -113,3 +117,44 @@ def find_available_ports(num: int):
         print(f"An OS error occurred: {e}")
         raise RuntimeError("Error finding available ports")
     return free_ports
+
+
+def get_alibi_slopes(num_heads: int, device: torch.device) -> torch.Tensor:
+    """
+    Alibi slopes calculation adapted from https://github.com/huggingface/transformers/blob/v4.36.0/src/transformers/models/bloom/modeling_bloom.py#L57
+
+    Args:
+        num_heads (int): The number of attention heads.
+        device (torch.device): The device to use.
+
+    Returns:
+        torch.Tensor: The Alibi slopes.
+    """
+    closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
+    base = torch.tensor(2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), dtype=torch.float32, device=device)
+    powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32, device=device)
+    slopes = torch.pow(base, powers)
+    if closest_power_of_2 != num_heads:
+        extra_base = torch.tensor(
+            2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), dtype=torch.float32, device=device
+        )
+        num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
+        extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, dtype=torch.int32, device=device)
+        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
+    return slopes
+
+
+def can_use_flash_attn2(dtype: torch.dtype) -> bool:
+    """
+    Check flash attention2 availability.
+    """
+    if dtype not in (torch.float16, torch.bfloat16):
+        return False
+
+    try:
+        from flash_attn import flash_attn_varlen_func  # noqa
+
+        return True
+    except ImportError:
+        logger.warning(f"flash_attn2 has not been installed yet, we will use triton flash attn instead.")
+        return False
diff --git a/colossalai/nn/optimizer/__init__.py b/colossalai/nn/optimizer/__init__.py
index ddb03f947907..d872dbbafe96 100644
--- a/colossalai/nn/optimizer/__init__.py
+++ b/colossalai/nn/optimizer/__init__.py
@@ -43,11 +43,11 @@
     CAME: DistributedCAME,
     Adafactor: DistributedAdaFactor,
 }
-_logger = get_dist_logger()
 
 
 def cast_to_distributed(optim):
     if optim.__class__ in optim2DistOptim:
+        _logger = get_dist_logger()
         _logger.info(f"Converting optimizer {optim.__class__.__name__} to its distributed version.", ranks=[0])
 
         if isinstance(optim, GaLoreAdamW8bit):
diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index abc865a34762..141baf3d3770 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -50,7 +50,7 @@ def get_pad_info(padding_mask: torch.Tensor) -> Tuple[int, torch.Tensor, torch.T
     seqlens_in_batch = padding_mask.sum(dim=-1, dtype=torch.int32)
     indices = torch.nonzero(padding_mask.flatten(), as_tuple=False).flatten()
     max_seqlen_in_batch = seqlens_in_batch.max().item()
-    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
     return max_seqlen_in_batch, cu_seqlens, indices
 
 
diff --git a/colossalai/shardformer/modeling/bloom.py b/colossalai/shardformer/modeling/bloom.py
index bf74d0833cb0..1f34215c5175 100644
--- a/colossalai/shardformer/modeling/bloom.py
+++ b/colossalai/shardformer/modeling/bloom.py
@@ -475,7 +475,10 @@ def bloom_for_sequence_classification_forward(
                 sequence_lengths = -1
             else:
                 if input_ids is not None:
-                    sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
+                    # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
+                    sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+                    sequence_lengths = sequence_lengths % input_ids.shape[-1]
+                    sequence_lengths = sequence_lengths.to(logits.device)
                 else:
                     sequence_lengths = -1
                     logger.warning(
diff --git a/colossalai/shardformer/modeling/falcon.py b/colossalai/shardformer/modeling/falcon.py
index a43bdf4814ed..8181a68a0332 100644
--- a/colossalai/shardformer/modeling/falcon.py
+++ b/colossalai/shardformer/modeling/falcon.py
@@ -291,18 +291,17 @@ def falcon_model_forward(
                 if attention_mask_2d is None:
                     attention_mask = alibi / math.sqrt(self.config.hidden_size // self.num_heads)
                 else:
+                    min_dtype = torch.finfo(alibi.dtype).min
                     attention_mask = torch.masked_fill(
                         alibi / math.sqrt(self.config.hidden_size // self.num_heads),
                         attention_mask < -1,
-                        torch.finfo(alibi.dtype).min,
+                        min_dtype,
                     )
 
                     # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with the memory-efficient attention backend
                     # produces nans if sequences are completely unattended in the attention mask. Details: https://github.com/pytorch/pytorch/issues/110213
-                    if seq_length > 1:
-                        attention_mask = AttentionMaskConverter._unmask_unattended(
-                            attention_mask, attention_mask_2d, unmasked_value=0.0
-                        )
+                    if seq_length > 1 and attention_mask.device.type == "cuda":
+                        attention_mask = AttentionMaskConverter._unmask_unattended(attention_mask, min_dtype=min_dtype)
             else:
                 # PyTorch SDPA does not support head_mask, we fall back on the eager implementation in this case.
                 attention_mask = _prepare_4d_causal_attention_mask(
@@ -543,7 +542,10 @@ def falcon_for_sequence_classification_forward(
                 sequence_lengths = -1
             else:
                 if input_ids is not None:
-                    sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(dim=-1) - 1).to(logits.device)
+                    # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
+                    sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+                    sequence_lengths = sequence_lengths % input_ids.shape[-1]
+                    sequence_lengths = sequence_lengths.to(logits.device)
                 else:
                     sequence_lengths = -1
                     logger.warning(
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index c49458dbdf55..aa75bab115a7 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -738,7 +738,10 @@ def gpt2_for_sequence_classification_forward(
             sequence_lengths = -1
         else:
             if input_ids is not None:
-                sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
+                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
+                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+                sequence_lengths = sequence_lengths % input_ids.shape[-1]
+                sequence_lengths = sequence_lengths.to(logits.device)
             else:
                 sequence_lengths = -1
                 logger.warning_once(
diff --git a/colossalai/shardformer/modeling/gptj.py b/colossalai/shardformer/modeling/gptj.py
index 4f4cec8bc81f..facd2fcafbae 100644
--- a/colossalai/shardformer/modeling/gptj.py
+++ b/colossalai/shardformer/modeling/gptj.py
@@ -32,6 +32,7 @@ def _get_attention_mask(
     hidden_states: torch.Tensor,
     past_key_values: Optional[Tuple[Tuple[torch.Tensor]]],
     attention_mask: Optional[torch.FloatTensor],
+    use_flash_attention_2: bool = False,
 ) -> Optional[Union[torch.Tensor, dict]]:
     batch_size, seq_len = hidden_states.shape[:2]
     past_key_values_length = 0
@@ -47,7 +48,7 @@ def _get_attention_mask(
             attention_mask,
             is_causal=True,
         )
-    elif attention_mask is not None:
+    elif use_flash_attention_2 and attention_mask is not None:
         if batch_size <= 0:
             raise ValueError("batch_size has to be defined and > 0")
         attention_mask = attention_mask.view(batch_size, -1)
@@ -162,7 +163,9 @@ def gptj_model_forward(
 
         output_shape = input_shape + (hidden_states.size(-1),)
 
-        attention_mask = _get_attention_mask(self, shard_config, hidden_states, past_key_values, attention_mask)
+        attention_mask = _get_attention_mask(
+            self, shard_config, hidden_states, past_key_values, attention_mask, self._use_flash_attention_2
+        )
 
         if self.gradient_checkpointing and self.training:
             if use_cache:
@@ -419,7 +422,10 @@ def gptj_for_sequence_classification_forward(
             sequence_lengths = -1
         else:
             if input_ids is not None:
-                sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
+                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
+                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+                sequence_lengths = sequence_lengths % input_ids.shape[-1]
+                sequence_lengths = sequence_lengths.to(logits.device)
             else:
                 sequence_lengths = -1
                 logger.warning_once(
@@ -712,7 +718,9 @@ def forward(
 
         hidden_states = self.drop(hidden_states)
 
-        attention_mask = _get_attention_mask(self, shard_config, hidden_states, past_key_values, attention_mask)
+        attention_mask = _get_attention_mask(
+            self, shard_config, hidden_states, past_key_values, attention_mask, self._use_flash_attention_2
+        )
 
         output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)
 
@@ -886,7 +894,9 @@ def forward(
         hidden_states = self.drop(hidden_states)
 
         output_shape = input_shape + (hidden_states.size(-1),)
-        attention_mask = _get_attention_mask(self, shard_config, hidden_states, past_key_values, attention_mask)
+        attention_mask = _get_attention_mask(
+            self, shard_config, hidden_states, past_key_values, attention_mask, self._use_flash_attention_2
+        )
 
         if self.gradient_checkpointing and self.training:
             if use_cache:
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index d6f10ffafec7..bf5ce45a8342 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -7,7 +7,7 @@
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from transformers.cache_utils import Cache
+from transformers.cache_utils import Cache, DynamicCache
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
@@ -17,8 +17,7 @@
     LlamaForCausalLM,
     LlamaForSequenceClassification,
     LlamaModel,
-    _prepare_4d_causal_attention_mask,
-    _prepare_4d_causal_attention_mask_for_sdpa,
+    StaticCache,
     apply_rotary_pos_emb,
     repeat_kv,
 )
@@ -53,6 +52,7 @@ def llama_model_forward(
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
         stage_manager: Optional[PipelineStageManager] = None,
         hidden_states: Optional[torch.FloatTensor] = None,
         stage_index: Optional[List[int]] = None,
@@ -65,6 +65,11 @@ def llama_model_forward(
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         use_cache = use_cache if use_cache is not None else self.config.use_cache
+        if use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with pipeline parallelism. Setting `use_cache=False`..."
+            )
+            use_cache = False
 
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -81,14 +86,24 @@ def llama_model_forward(
             device = input_ids.device if input_ids is not None else inputs_embeds.device
             if inputs_embeds is None:
                 inputs_embeds = self.embed_tokens(input_ids)
+
             hidden_states = inputs_embeds
         else:
             input_shape = hidden_states.shape[:-1]
             batch_size, seq_length = input_shape
             device = hidden_states.device
 
-        seq_length_with_past = seq_length
-        past_key_values_length = 0
+        past_seen_tokens = 0
+        if use_cache:  # kept for BC (cache positions)
+            if not isinstance(past_key_values, StaticCache):
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                past_seen_tokens = past_key_values.get_seq_length()
+        if cache_position is None:
+            if isinstance(past_key_values, StaticCache):
+                raise ValueError("cache_position is a required argument when using StaticCache.")
+            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + hidden_states.shape[1], device=device)
+
+        seq_length_with_past = seq_length + past_seen_tokens
 
         # TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
         if output_attentions:
@@ -101,18 +116,8 @@ def llama_model_forward(
             logger.warning_once("use_cache=True is not supported for pipeline models at the moment.")
             use_cache = False
 
-        if past_key_values is not None:
-            past_key_values_length = past_key_values[0][0].shape[2]
-            seq_length_with_past = seq_length_with_past + past_key_values_length
-
         if position_ids is None:
-            position_ids = torch.arange(
-                past_key_values_length,
-                seq_length + past_key_values_length,
-                dtype=torch.long,
-                device=device,
-            )
-            position_ids = position_ids.unsqueeze(0)
+            position_ids = cache_position.unsqueeze(0)
 
         # embed positions, for the first stage, hidden_states is the input embeddings,
         # for the other stages, hidden_states is the output of the previous stage
@@ -127,28 +132,9 @@ def llama_model_forward(
                 is_causal=True,
             )
         else:
-            if self._use_flash_attention_2:
-                # 2d mask is passed through the layers
-                attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
-            elif self._use_sdpa and not output_attentions:
-                # output_attentions=True can not be supported when using SDPA, and we fall back on
-                # the manual implementation that requires a 4D causal mask in all cases.
-                attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-                    attention_mask,
-                    (batch_size, seq_length),
-                    inputs_embeds,
-                    past_key_values_length,
-                )
-            else:
-                # 4d mask is passed through the layers
-                attention_mask = _prepare_4d_causal_attention_mask(
-                    attention_mask,
-                    (batch_size, seq_length),
-                    hidden_states,
-                    past_key_values_length,
-                )
+            attention_mask = self._update_causal_mask(attention_mask, hidden_states, cache_position)
 
-        if self.gradient_checkpointing and self.training:
+        if self.gradient_checkpointing and self.training and use_cache:
             if use_cache:
                 logger.warning_once(
                     "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
@@ -188,6 +174,7 @@ def llama_model_forward(
                     past_key_values,
                     output_attentions,
                     use_cache,
+                    cache_position,
                 )
             else:
                 layer_outputs = decoder_layer(
@@ -197,6 +184,7 @@ def llama_model_forward(
                     past_key_value=past_key_values,
                     output_attentions=output_attentions,
                     use_cache=use_cache,
+                    cache_position=cache_position,
                 )
 
             hidden_states = layer_outputs[0]
@@ -247,6 +235,7 @@ def llama_for_causal_lm_forward(
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
         stage_manager: Optional[PipelineStageManager] = None,
         hidden_states: Optional[torch.FloatTensor] = None,
         stage_index: Optional[List[int]] = None,
@@ -304,6 +293,7 @@ def llama_for_causal_lm_forward(
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
+            cache_position=cache_position,
             stage_manager=stage_manager,
             hidden_states=hidden_states,
             stage_index=stage_index,
@@ -366,6 +356,7 @@ def llama_for_sequence_classification_forward(
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
         stage_manager: Optional[PipelineStageManager] = None,
         hidden_states: Optional[torch.FloatTensor] = None,
         stage_index: Optional[List[int]] = None,
@@ -399,6 +390,7 @@ def llama_for_sequence_classification_forward(
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
+            cache_position=cache_position,
             stage_manager=stage_manager,
             hidden_states=hidden_states,
             stage_index=stage_index,
@@ -468,37 +460,53 @@ def llama_for_sequence_classification_forward(
             return {"hidden_states": hidden_states}
 
 
-def get_llama_flash_attention_forward(shard_config, sp_mode, sp_group, sp_size):
-    from transformers.models.llama.modeling_llama import LlamaAttention, apply_rotary_pos_emb
-
-    try:
-        from transformers.models.llama.modeling_llama import repeat_kv
-    except:
-        warnings.warn("using llamav1, llamav1 hasn't repeat_kv function")
-
+def get_llama_flash_attention_forward(shard_config, sp_mode=None, sp_size=None, sp_group=None):
     def forward(
-        self: LlamaAttention,
+        self,
         hidden_states: torch.Tensor,
-        attention_mask: Optional[dict] = None,
+        attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_value: Optional[Cache] = None,
         output_attentions: bool = False,
         use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
         **kwargs,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
+        if sp_mode is not None:
+            assert sp_mode in ["all_to_all", "split_gather", "ring"], "Invalid sp_mode"
+            assert (sp_size is not None) and (
+                sp_group is not None
+            ), "Must specify sp_size and sp_group for sequence parallel"
         if "padding_mask" in kwargs:
             warnings.warn(
                 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
             )
-        bsz, q_len, _ = hidden_states.size()
 
+        bsz, q_len, _ = hidden_states.size()
+        # sp: modify sp_len when sequence parallel mode is ring
         if sp_mode in ["split_gather", "ring"]:
             q_len *= sp_size
-        assert q_len % 4 == 0, "Flash Attention Error: The sequence length should be a multiple of 4."
 
-        query_states = self.q_proj(hidden_states)
-        key_states = self.k_proj(hidden_states)
-        value_states = self.v_proj(hidden_states)
+        if self.config.pretraining_tp > 1:
+            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
+            query_slices = self.q_proj.weight.split(
+                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
+            )
+            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+
+            query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
+            query_states = torch.cat(query_states, dim=-1)
+
+            key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
+            key_states = torch.cat(key_states, dim=-1)
+
+            value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
+            value_states = torch.cat(value_states, dim=-1)
+        else:
+            query_states = self.q_proj(hidden_states)
+            key_states = self.k_proj(hidden_states)
+            value_states = self.v_proj(hidden_states)
 
         # sp: all-to-all comminucation when introducing sequence parallel
         if sp_mode == "all_to_all":
@@ -519,39 +527,76 @@ def forward(
                     "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                     "with a layer index."
                 )
+
             kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
 
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        cos, sin = self.rotary_emb(value_states, position_ids)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
 
         if past_key_value is not None:
-            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
+        # repeat k/v heads if n_kv_heads < n_heads
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)
 
-        assert isinstance(attention_mask, dict), "Flash Attention Error: attention_mask should be a dict."
-        attn_output = ColoAttention.attention(query_states, key_states, value_states, **attention_mask)
-        attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        if shard_config.enable_flash_attention:
+            assert isinstance(attention_mask, dict), "Flash Attention Error: attention_mask should be a dict."
+            attn_output = ColoAttention.attention(query_states, key_states, value_states, **attention_mask)
+        else:
+            attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
 
+            if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                    f" {attn_weights.size()}"
+                )
+
+            if attention_mask is not None:
+                if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                    raise ValueError(
+                        f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                    )
+                attn_weights = attn_weights + attention_mask
+
+            # upcast attention to fp32
+            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+            attn_output = torch.matmul(attn_weights, value_states)
+
+            if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+                raise ValueError(
+                    f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                    f" {attn_output.size()}"
+                )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
         # sp: all-to-all comminucation when introducing sequence parallel
         if sp_mode == "all_to_all":
+            attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim)
             attn_output = all_to_all_comm(attn_output, sp_group, scatter_dim=1, gather_dim=2)
-        attn_output = self.o_proj(attn_output)
+        else:
+            attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
 
-        return attn_output, None, past_key_value
+        if self.config.pretraining_tp > 1:
+            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
+            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
+            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
+        else:
+            attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
 
     return forward
 
 
-def get_llama_model_forward_for_flash_attn(shard_config: ShardConfig):
+def get_llama_flash_attention_model_forward(shard_config, sp_mode=None, sp_size=None, sp_group=None):
     logger = logging.get_logger(__name__)
-    assert shard_config.enable_flash_attention, "Flash Attention is not enabled."
 
     def forward(
-        self: LlamaModel,
+        self,
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
@@ -561,119 +606,122 @@ def forward(
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         use_cache = use_cache if use_cache is not None else self.config.use_cache
-
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         # retrieve input_ids and inputs_embeds
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
-        elif input_ids is not None:
-            batch_size, seq_length = input_ids.shape
-        elif inputs_embeds is not None:
-            batch_size, seq_length, _ = inputs_embeds.shape
-        else:
-            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
 
-        seq_length_with_past = seq_length
-        past_key_values_length = 0
+        if (self.gradient_checkpointing or sp_mode in ["ring", "all_to_all"]) and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
 
-        if past_key_values is not None:
-            past_key_values_length = past_key_values[0][0].shape[2]
-            seq_length_with_past = seq_length_with_past + past_key_values_length
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        past_seen_tokens = 0
+        seq_len = inputs_embeds.shape[1]
+        if use_cache:  # kept for BC (cache positions)
+            if not isinstance(past_key_values, StaticCache):
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                past_seen_tokens = past_key_values.get_seq_length()
+        if cache_position is None:
+            if isinstance(past_key_values, StaticCache):
+                raise ValueError("cache_position is a required argument when using StaticCache.")
+            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_len, device=inputs_embeds.device)
 
         if position_ids is None:
-            device = input_ids.device if input_ids is not None else inputs_embeds.device
-            position_ids = torch.arange(
-                past_key_values_length,
-                seq_length + past_key_values_length,
-                dtype=torch.long,
-                device=device,
+            position_ids = cache_position.unsqueeze(0)
+
+        # in this case, attention_mask is a dict rather than a tensor
+        if shard_config.enable_flash_attention:
+            mask_shape = (inputs_embeds.shape[0], 1, past_seen_tokens + seq_len, past_seen_tokens + seq_len)
+            attention_mask = ColoAttention.prepare_attn_kwargs(
+                mask_shape,
+                inputs_embeds.dtype,
+                inputs_embeds.device,
+                q_padding_mask=attention_mask,
+                is_causal=True,
             )
-            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
         else:
-            position_ids = position_ids.view(-1, seq_length).long()
+            attention_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
 
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-        # embed positions
+        if sp_mode in ["ring", "split_gather"]:
+            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group)
+        elif sp_mode == "all_to_all":
+            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group, 1 / sp_size)
         hidden_states = inputs_embeds
 
-        # in this case, attention_mask is a dict rather than a tensor
-        mask_shape = (batch_size, 1, seq_length_with_past, seq_length_with_past)
-        attention_mask = ColoAttention.prepare_attn_kwargs(
-            mask_shape,
-            hidden_states.dtype,
-            hidden_states.device,
-            q_padding_mask=attention_mask,
-            is_causal=True,
-        )
-
-        if self.gradient_checkpointing and self.training:
-            if use_cache:
-                logger.warning_once(
-                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                )
-                use_cache = False
-
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
-        next_decoder_cache = () if use_cache else None
+        next_decoder_cache = None
 
-        for idx, decoder_layer in enumerate(self.layers):
+        for decoder_layer in self.layers:
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-
-            past_key_value = past_key_values[idx] if past_key_values is not None else None
-
             if self.gradient_checkpointing and self.training:
-
-                def create_custom_forward(module):
-                    def custom_forward(*inputs):
-                        # None for past_key_value
-                        return module(*inputs, past_key_value, output_attentions)
-
-                    return custom_forward
-
-                layer_outputs = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(decoder_layer),
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
                     hidden_states,
                     attention_mask,
                     position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                    cache_position,
                 )
+
             else:
                 layer_outputs = decoder_layer(
                     hidden_states,
                     attention_mask=attention_mask,
                     position_ids=position_ids,
-                    past_key_value=past_key_value,
+                    past_key_value=past_key_values,
                     output_attentions=output_attentions,
                     use_cache=use_cache,
+                    cache_position=cache_position,
                 )
 
             hidden_states = layer_outputs[0]
 
             if use_cache:
-                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
 
             if output_attentions:
                 all_self_attns += (layer_outputs[1],)
 
         hidden_states = self.norm(hidden_states)
 
+        if sp_mode == "ring" or sp_mode == "split_gather":
+            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group)
+        elif sp_mode == "all_to_all":
+            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group, grad_scale=sp_size)
+
         # add hidden states from the last decoder layer
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-        next_cache = next_decoder_cache if use_cache else None
+        next_cache = None
+        if use_cache:
+            next_cache = (
+                next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache
+            )
         if not return_dict:
             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
             past_key_values=next_cache,
@@ -699,6 +747,7 @@ def forward(
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         r"""
         Args:
@@ -743,6 +792,7 @@ def forward(
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
+            cache_position=cache_position,
         )
 
         hidden_states = outputs[0]
@@ -785,266 +835,3 @@ def forward(
         )
 
     return forward
-
-
-def get_llama_seq_parallel_attention_forward(sp_mode, sp_size, sp_group):
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        bsz, q_len, _ = hidden_states.size()
-        # sp: modify sp_len when sequence parallel mode is ring
-        if sp_mode in ["split_gather", "ring"]:
-            q_len *= sp_size
-        if self.config.pretraining_tp > 1:
-            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
-            query_slices = self.q_proj.weight.split(
-                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
-            )
-            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
-            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
-
-            query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
-            query_states = torch.cat(query_states, dim=-1)
-
-            key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
-            key_states = torch.cat(key_states, dim=-1)
-
-            value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
-            value_states = torch.cat(value_states, dim=-1)
-
-        else:
-            query_states = self.q_proj(hidden_states)
-            key_states = self.k_proj(hidden_states)
-            value_states = self.v_proj(hidden_states)
-
-        # sp: all-to-all comminucation when introducing sequence parallel
-        if sp_mode == "all_to_all":
-            query_states = all_to_all_comm(query_states, sp_group)
-            key_states = all_to_all_comm(key_states, sp_group)
-            value_states = all_to_all_comm(value_states, sp_group)
-            bsz, q_len, _ = query_states.size()
-
-        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            kv_seq_len += past_key_value[0].shape[-2]
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
-
-        if past_key_value is not None:
-            # reuse k, v, self_attention
-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
-            value_states = torch.cat([past_key_value[1], value_states], dim=2)
-
-        past_key_value = (key_states, value_states) if use_cache else None
-
-        # repeat k/v heads if n_kv_heads < n_heads
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
-
-        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
-            raise ValueError(
-                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
-                f" {attn_weights.size()}"
-            )
-
-        if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-                raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-                )
-            attn_weights = attn_weights + attention_mask
-
-        # upcast attention to fp32
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
-        attn_output = torch.matmul(attn_weights, value_states)
-
-        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
-            raise ValueError(
-                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
-                f" {attn_output.size()}"
-            )
-
-        attn_output = attn_output.transpose(1, 2).contiguous()
-        # sp: all-to-all comminucation when introducing sequence parallel
-        if sp_mode == "all_to_all":
-            attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim)
-            attn_output = all_to_all_comm(attn_output, sp_group, scatter_dim=1, gather_dim=2)
-        else:
-            attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
-
-        if self.config.pretraining_tp > 1:
-            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
-            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
-            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
-        else:
-            attn_output = self.o_proj(attn_output)
-
-        if not output_attentions:
-            attn_weights = None
-        return attn_output, attn_weights, past_key_value
-
-    return forward
-
-
-def get_llama_seq_parallel_model_forward(sp_mode, sp_size, sp_group):
-    logger = logging.get_logger(__name__)
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPast]:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        # retrieve input_ids and inputs_embeds
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
-        elif input_ids is not None:
-            batch_size, seq_length = input_ids.shape
-        elif inputs_embeds is not None:
-            batch_size, seq_length, _ = inputs_embeds.shape
-        else:
-            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
-
-        seq_length_with_past = seq_length
-        past_key_values_length = 0
-
-        if past_key_values is not None:
-            past_key_values_length = past_key_values[0][0].shape[2]
-            # modify past_key_values_length when using sequence parallel
-            past_key_values_length *= sp_size
-            seq_length_with_past = seq_length_with_past + past_key_values_length
-
-        if position_ids is None:
-            device = input_ids.device if input_ids is not None else inputs_embeds.device
-            position_ids = torch.arange(
-                past_key_values_length,
-                seq_length + past_key_values_length,
-                dtype=torch.long,
-                device=device,
-            )
-            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
-        else:
-            position_ids = position_ids.view(-1, seq_length).long()
-
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-
-        if sp_mode in ["ring", "split_gather"]:
-            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group)
-        elif sp_mode == "all_to_all":
-            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group, 1 / sp_size)
-
-        if attention_mask is None:
-            attention_mask = torch.ones(
-                (batch_size, seq_length_with_past),
-                dtype=torch.bool,
-                device=inputs_embeds.device,
-            )
-
-        attention_mask = _prepare_4d_causal_attention_mask(
-            attention_mask, attention_mask.shape, inputs_embeds, past_key_values_length
-        )
-
-        hidden_states = inputs_embeds
-
-        if (self.gradient_checkpointing or sp_mode in ["ring", "all_to_all"]) and self.training:
-            if use_cache:
-                logger.warning_once(
-                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                )
-                use_cache = False
-
-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-        next_decoder_cache = () if use_cache else None
-
-        for idx, decoder_layer in enumerate(self.layers):
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-
-            past_key_value = past_key_values[idx] if past_key_values is not None else None
-
-            if (self.gradient_checkpointing or sp_mode in ["ring", "all_to_all"]) and self.training:
-
-                def create_custom_forward(module):
-                    def custom_forward(*inputs):
-                        # None for past_key_value
-                        return module(*inputs, past_key_value, output_attentions)
-
-                    return custom_forward
-
-                layer_outputs = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(decoder_layer),
-                    hidden_states,
-                    attention_mask,
-                    position_ids,
-                )
-
-            else:
-                layer_outputs = decoder_layer(
-                    hidden_states,
-                    attention_mask=attention_mask,
-                    position_ids=position_ids,
-                    past_key_value=past_key_value,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
-                )
-
-            hidden_states = layer_outputs[0]
-
-            if use_cache:
-                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-
-        hidden_states = self.norm(hidden_states)
-
-        if sp_mode == "ring" or sp_mode == "split_gather":
-            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group)
-        elif sp_mode == "all_to_all":
-            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group, grad_scale=sp_size)
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        next_cache = next_decoder_cache if use_cache else None
-        if not return_dict:
-            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
-
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-        )
-
-    return forward
diff --git a/colossalai/shardformer/modeling/mistral.py b/colossalai/shardformer/modeling/mistral.py
index 5f96ebe3d5cd..310c2d8e233a 100644
--- a/colossalai/shardformer/modeling/mistral.py
+++ b/colossalai/shardformer/modeling/mistral.py
@@ -4,7 +4,10 @@
 import torch
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers.cache_utils import Cache, DynamicCache
-from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from transformers.modeling_attn_mask_utils import (
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
@@ -77,7 +80,7 @@ def mistral_model_forward(
         else:
             position_ids = position_ids.view(-1, seq_length).long()
 
-        if attention_mask is not None and self._use_flash_attention_2 and use_cache:
+        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
             is_padding_right = attention_mask[:, -1].sum().item() != batch_size
             if is_padding_right:
                 raise ValueError(
@@ -97,9 +100,18 @@ def mistral_model_forward(
                 is_causal=True,
             )
         else:
-            if self._use_flash_attention_2:
+            if self._attn_implementation == "flash_attention_2":
                 # 2d mask is passed through the layers
                 attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+            elif self._attn_implementation == "sdpa" and not output_attentions:
+                # output_attentions=True can not be supported when using SDPA, and we fall back on
+                # the manual implementation that requires a 4D causal mask in all cases.
+                attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                    attention_mask,
+                    (batch_size, seq_length),
+                    inputs_embeds,
+                    past_key_values_length,
+                )
             else:
                 # 4d mask is passed through the layers
                 attention_mask = _prepare_4d_causal_attention_mask(
@@ -462,7 +474,7 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
-        if attention_mask is not None and self._use_flash_attention_2 and use_cache:
+        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
             is_padding_right = attention_mask[:, -1].sum().item() != batch_size
             if is_padding_right:
                 raise ValueError(
@@ -481,9 +493,18 @@ def forward(
                 is_causal=True,
             )
         else:
-            if self._use_flash_attention_2:
+            if self._attn_implementation == "flash_attention_2":
                 # 2d mask is passed through the layers
                 attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+            elif self._attn_implementation == "sdpa" and not output_attentions:
+                # output_attentions=True can not be supported when using SDPA, and we fall back on
+                # the manual implementation that requires a 4D causal mask in all cases.
+                attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                    attention_mask,
+                    (batch_size, seq_length),
+                    inputs_embeds,
+                    past_key_values_length,
+                )
             else:
                 # 4d mask is passed through the layers
                 attention_mask = _prepare_4d_causal_attention_mask(
diff --git a/colossalai/shardformer/modeling/qwen2.py b/colossalai/shardformer/modeling/qwen2.py
index 8f8ab25a5b3f..e0aa5fba4a01 100644
--- a/colossalai/shardformer/modeling/qwen2.py
+++ b/colossalai/shardformer/modeling/qwen2.py
@@ -9,13 +9,15 @@
 )
 
 try:
+    from transformers.modeling_attn_mask_utils import (
+        _prepare_4d_causal_attention_mask,
+        _prepare_4d_causal_attention_mask_for_sdpa,
+    )
     from transformers.models.qwen2.modeling_qwen2 import (
         Qwen2Attention,
         Qwen2ForCausalLM,
         Qwen2ForSequenceClassification,
         Qwen2Model,
-        _prepare_4d_causal_attention_mask,
-        _prepare_4d_causal_attention_mask_for_sdpa,
         apply_rotary_pos_emb,
         repeat_kv,
     )
diff --git a/colossalai/shardformer/modeling/whisper.py b/colossalai/shardformer/modeling/whisper.py
index 6d7df963a3a0..cf925983be4e 100644
--- a/colossalai/shardformer/modeling/whisper.py
+++ b/colossalai/shardformer/modeling/whisper.py
@@ -17,6 +17,7 @@
     SequenceClassifierOutput,
 )
 from transformers.models.whisper.modeling_whisper import (
+    _HIDDEN_STATES_START_POSITION,
     WhisperDecoder,
     WhisperEncoder,
     WhisperForAudioClassification,
@@ -166,6 +167,7 @@ def forward(
         cross_attn_head_mask=None,
         past_key_values=None,
         inputs_embeds=None,
+        position_ids=None,
         use_cache=None,
         output_attentions=None,
         output_hidden_states=None,
@@ -199,9 +201,13 @@ def forward(
 
         # embed positions
         if input_ids is not None:
-            positions = self.embed_positions(input_ids, past_key_values_length=past_key_values_length)
+            positions = self.embed_positions(
+                input_ids, past_key_values_length=past_key_values_length, position_ids=position_ids
+            )
         else:
-            positions = self.embed_positions(inputs_embeds, past_key_values_length=past_key_values_length)
+            positions = self.embed_positions(
+                inputs_embeds, past_key_values_length=past_key_values_length, position_ids=position_ids
+            )
 
         hidden_states = inputs_embeds + positions
         hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
@@ -599,6 +605,7 @@ def whisper_decoder_forward(
         cross_attn_head_mask=None,
         past_key_values=None,
         inputs_embeds=None,
+        position_ids=None,
         use_cache=None,
         output_attentions=None,
         output_hidden_states=None,
@@ -716,9 +723,13 @@ def whisper_decoder_forward(
 
             # embed positions
             if input_ids is not None:
-                positions = self.embed_positions(input_ids, past_key_values_length=past_key_values_length)
+                positions = self.embed_positions(
+                    input_ids, past_key_values_length=past_key_values_length, position_ids=position_ids
+                )
             else:
-                positions = self.embed_positions(inputs_embeds, past_key_values_length=past_key_values_length)
+                positions = self.embed_positions(
+                    inputs_embeds, past_key_values_length=past_key_values_length, position_ids=position_ids
+                )
 
             hidden_states = inputs_embeds + positions
             hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
@@ -841,6 +852,7 @@ def whisper_model_forward(
         encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None,
+        decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
@@ -944,6 +956,7 @@ def whisper_model_forward(
             cross_attn_head_mask=cross_attn_head_mask,
             past_key_values=past_key_values,
             inputs_embeds=decoder_inputs_embeds,
+            position_ids=decoder_position_ids,
             use_cache=use_cache,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
@@ -986,6 +999,7 @@ def whisper_for_conditional_generation_forward(
         encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None,
+        decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None,
         labels: Optional[torch.LongTensor] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
@@ -1048,6 +1062,7 @@ def whisper_for_conditional_generation_forward(
             cross_attn_head_mask=cross_attn_head_mask,
             past_key_values=past_key_values,
             decoder_inputs_embeds=decoder_inputs_embeds,
+            decoder_position_ids=decoder_position_ids,
             use_cache=use_cache,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
@@ -1118,6 +1133,12 @@ def whisper_for_audio_classification_forward(
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
+
+        if self.config.use_weighted_layer_sum:
+            output_hidden_states = True
+        elif output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         # audio_classification only holds encoder
@@ -1138,7 +1159,8 @@ def whisper_for_audio_classification_forward(
             return encoder_outputs
 
         if self.config.use_weighted_layer_sum:
-            hidden_states = torch.stack(encoder_outputs, dim=1)
+            hidden_states = encoder_outputs[_HIDDEN_STATES_START_POSITION]
+            hidden_states = torch.stack(hidden_states, dim=1)
             norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
             hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
         else:
diff --git a/colossalai/shardformer/policies/gptj.py b/colossalai/shardformer/policies/gptj.py
index 3315eb1e9256..c394d911e289 100644
--- a/colossalai/shardformer/policies/gptj.py
+++ b/colossalai/shardformer/policies/gptj.py
@@ -34,15 +34,11 @@ def preprocess(self):
         return self.model
 
     def module_policy(self):
-        from transformers.models.gptj.modeling_gptj import GPTJAttention, GPTJBlock, GPTJModel
-
-        ATTN_IMPLEMENTATION = {
-            "eager": GPTJAttention,
-        }
+        from transformers.models.gptj.modeling_gptj import GPTJ_ATTENTION_CLASSES, GPTJBlock, GPTJModel
 
         policy = {}
 
-        attn_cls = ATTN_IMPLEMENTATION[self.origin_attn_implement]
+        attn_cls = GPTJ_ATTENTION_CLASSES[self.origin_attn_implement]
 
         embedding_cls = None
         if self.shard_config.enable_tensor_parallelism:
diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index 713175c6cc13..5852713c2b49 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -20,9 +20,7 @@
 from ..modeling.llama import (
     LlamaPipelineForwards,
     get_llama_flash_attention_forward,
-    get_llama_model_forward_for_flash_attn,
-    get_llama_seq_parallel_attention_forward,
-    get_llama_seq_parallel_model_forward,
+    get_llama_flash_attention_model_forward,
     get_lm_forward_with_dist_cross_entropy,
 )
 from .base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
@@ -82,33 +80,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
         )
         sp_partial_derived = sp_mode in ["split_gather", "ring"]
 
-        use_flash_attention = self.shard_config.enable_flash_attention
-        # Currently sp cannot to be used with flashattention
-        if sp_mode in ["split_gather", "ring", "all_to_all"]:
-            if use_flash_attention:
-                warnings.warn(
-                    f"Sequence parallelism mode {sp_mode} need to be used with FlashAttention, will disable FlashAttention automatically."
-                )
-                use_flash_attention = False
-
-        if sp_mode in ["split_gather", "ring"]:
-            self.append_or_create_method_replacement(
-                description={
-                    "forward": get_llama_seq_parallel_model_forward(
-                        sp_mode=sp_mode, sp_size=sp_size, sp_group=sp_group
-                    ),
-                },
-                policy=policy,
-                target_key=LlamaModel,
-            )
-            self.append_or_create_method_replacement(
-                description={
-                    "forward": get_llama_seq_parallel_attention_forward(sp_mode, sp_size, sp_group),
-                },
-                policy=policy,
-                target_key=attn_cls,
-            )
-        elif sp_mode == "all_to_all":
+        if sp_mode == "all_to_all":
             decoder_attribute_replacement = {
                 "num_heads": self.model.config.num_attention_heads // sp_size,
             }
@@ -118,24 +90,27 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
             policy[attn_cls] = ModulePolicyDescription(
                 attribute_replacement=decoder_attribute_replacement,
             )
+        if self.shard_config.enable_flash_attention or self.shard_config.enable_sequence_parallelism:
             self.append_or_create_method_replacement(
                 description={
-                    "forward": get_llama_seq_parallel_attention_forward(sp_mode, sp_size, sp_group),
+                    "forward": get_llama_flash_attention_forward(self.shard_config, sp_mode, sp_size, sp_group),
                 },
                 policy=policy,
                 target_key=attn_cls,
             )
-            self.append_or_create_method_replacement(
-                description={
-                    "forward": get_llama_seq_parallel_model_forward(
-                        sp_mode=sp_mode,
-                        sp_size=sp_size,
-                        sp_group=sp_group,
-                    ),
-                },
-                policy=policy,
-                target_key=LlamaModel,
-            )
+            if self.pipeline_stage_manager is None:
+                self.append_or_create_method_replacement(
+                    description={
+                        "forward": get_llama_flash_attention_model_forward(
+                            self.shard_config,
+                            sp_mode=sp_mode,
+                            sp_size=sp_size,
+                            sp_group=sp_group,
+                        ),
+                    },
+                    policy=policy,
+                    target_key=LlamaModel,
+                )
 
         if self.shard_config.enable_tensor_parallelism:
             assert (
@@ -235,25 +210,6 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
             target_key=LlamaModel,
         )
 
-        # use flash attention
-        if use_flash_attention:
-            self.append_or_create_method_replacement(
-                description={
-                    "forward": get_llama_flash_attention_forward(self.shard_config, sp_mode, sp_group, sp_size),
-                },
-                policy=policy,
-                target_key=attn_cls,
-            )
-            if self.pipeline_stage_manager is None:
-                # replace llama model forward method
-                self.append_or_create_method_replacement(
-                    description={
-                        "forward": get_llama_model_forward_for_flash_attn(self.shard_config),
-                    },
-                    policy=policy,
-                    target_key=LlamaModel,
-                )
-
         return policy
 
     def postprocess(self):
@@ -351,7 +307,7 @@ def module_policy(self):
 
         policy = super().module_policy()
 
-        if self.shard_config.enable_tensor_parallelism and not self.shard_config.enable_sequence_parallelism:
+        if self.shard_config.enable_tensor_parallelism:
             # add a new item for casual lm
             new_item = {
                 LlamaForCausalLM: ModulePolicyDescription(
diff --git a/colossalai/shardformer/policies/mistral.py b/colossalai/shardformer/policies/mistral.py
index 621982f29058..c5a0277a5783 100644
--- a/colossalai/shardformer/policies/mistral.py
+++ b/colossalai/shardformer/policies/mistral.py
@@ -42,11 +42,13 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
             MistralDecoderLayer,
             MistralFlashAttention2,
             MistralModel,
+            MistralSdpaAttention,
         )
 
         ATTN_IMPLEMENTATION = {
             "eager": MistralAttention,
             "flash_attention_2": MistralFlashAttention2,
+            "sdpa": MistralSdpaAttention,
         }
 
         policy = {}
diff --git a/colossalai/testing/comparison.py b/colossalai/testing/comparison.py
index bdf7b19f39d0..8f9cce246556 100644
--- a/colossalai/testing/comparison.py
+++ b/colossalai/testing/comparison.py
@@ -22,8 +22,6 @@ def assert_close_loose(a: Tensor, b: Tensor, rtol: float = 1e-3, atol: float = 1
         b,
         rtol=rtol,
         atol=atol,
-        msg=f"Tensor not close, shape: {a.shape} vs {b.shape}, \
-                                                dtype: {a.dtype} vs {b.dtype}",
     )
 
 
diff --git a/colossalai/zero/gemini/chunk/chunk.py b/colossalai/zero/gemini/chunk/chunk.py
index ed5b96519441..18fbf8fc31fa 100644
--- a/colossalai/zero/gemini/chunk/chunk.py
+++ b/colossalai/zero/gemini/chunk/chunk.py
@@ -316,12 +316,13 @@ def close_chunk(self):
         if self.shard_device.type == "cpu":
             self.cuda_shard = None
 
-    def shard_move(self, device: torch.device, force_copy: bool = False):
+    def shard_move(self, device: torch.device, force_copy: bool = False, non_blocking=False):
         """Move the shard tensor in the chunk.
 
         Args:
             device: the device to which the shard will move
             force_copy: if True, copy function is called mandatorily
+            non_blocking: if True, the operation is non-blocking, the caller is responsible for synchronization
         """
         # sanity check
         assert not self.is_gathered
@@ -329,7 +330,7 @@ def shard_move(self, device: torch.device, force_copy: bool = False):
         # just use another way for the movement
         if not self.optim_sync_flag:
             assert device.type == "cuda" or device.type == "npu", "each chunk should first be moved to CUDA"
-            self.__paired_shard_move()
+            self.__paired_shard_move(non_blocking=non_blocking)
             self.optim_sync_flag = True
             return
 
@@ -339,7 +340,7 @@ def shard_move(self, device: torch.device, force_copy: bool = False):
             if self.cuda_shard:
                 return
 
-            self.cuda_shard = self.cpu_shard.to(get_accelerator().get_current_device())
+            self.cuda_shard = self.cpu_shard.to(get_accelerator().get_current_device(), non_blocking=non_blocking)
 
             if not self.pin_memory:
                 self.cpu_shard = None
@@ -349,11 +350,11 @@ def shard_move(self, device: torch.device, force_copy: bool = False):
 
             if self.pin_memory:
                 if force_copy or not self.cpu_vis_flag:
-                    self.cpu_shard.copy_(self.cuda_shard)
+                    self.cpu_shard.copy_(self.cuda_shard, non_blocking=non_blocking)
                 # if cpu_shard has been visited
                 # copy operation is not need
             else:
-                self.cpu_shard = self.cuda_shard.cpu()
+                self.cpu_shard = self.cuda_shard.to("cpu", non_blocking=non_blocking)
             self.cpu_vis_flag = True
             self.cuda_shard = None
         else:
@@ -542,7 +543,7 @@ def __scatter(self):
             free_storage(self.cuda_global_chunk)
             self.is_gathered = False
 
-    def __paired_shard_move(self):
+    def __paired_shard_move(self, non_blocking=False):
         assert self.paired_chunk is not None, "chunks should be paired before training"
         optim_chunk = self.paired_chunk
         assert self.chunk_size == optim_chunk.chunk_size
@@ -550,7 +551,7 @@ def __paired_shard_move(self):
         # only be called when optimizer state is in CPU memory
         # the grad and param should be in the same device
         assert self.cuda_shard is None
-        temp = optim_chunk.cpu_shard.to(get_accelerator().get_current_device())
+        temp = optim_chunk.cpu_shard.to(get_accelerator().get_current_device(), non_blocking=non_blocking)
         # avoid to transform FP32 in CPU
         self.cuda_shard = temp.to(self.dtype)
 
diff --git a/colossalai/zero/gemini/chunk/manager.py b/colossalai/zero/gemini/chunk/manager.py
index 36e7ee57bad4..3a5f0a5aaf32 100644
--- a/colossalai/zero/gemini/chunk/manager.py
+++ b/colossalai/zero/gemini/chunk/manager.py
@@ -25,6 +25,7 @@ def __init__(
         chunk_configuration,
         init_device: Optional[torch.device] = None,
         reuse_fp16_chunk: bool = True,
+        max_prefetch: int = 0,
     ) -> None:
         self.device = init_device or get_accelerator().get_current_device()
         self.dp_degree_chunk_size_dict: Dict[int, int] = dict()
@@ -42,6 +43,7 @@ def __init__(
         # Whether model is accumulating gradients,
         self.accumulating_grads = False
         self.overflow_counter = torch.tensor([0], dtype=torch.int, device=get_accelerator().get_current_device())
+        self._prefetch_stream = get_accelerator().Stream() if max_prefetch else None
 
     def register_tensor(
         self,
@@ -117,7 +119,7 @@ def access_chunk(self, chunk: Chunk, async_access: bool = False) -> Optional[dis
             return None
         self.__sub_memory_usage(chunk.memory_usage)
         if chunk.device_type == "cpu":
-            chunk.shard_move(get_accelerator().get_current_device())
+            chunk.shard_move(get_accelerator().get_current_device(), non_blocking=async_access)
         maybe_work = self.__add_accessed_chunk(chunk, async_access=async_access)
         self.__add_memory_usage(chunk.memory_usage)
         return maybe_work
diff --git a/colossalai/zero/gemini/chunk/utils.py b/colossalai/zero/gemini/chunk/utils.py
index 049c5c10255b..884d1306ef77 100644
--- a/colossalai/zero/gemini/chunk/utils.py
+++ b/colossalai/zero/gemini/chunk/utils.py
@@ -21,6 +21,7 @@ def init_chunk_manager(
     hidden_dim: Optional[int] = None,
     reuse_fp16_chunk: bool = True,
     verbose: bool = False,
+    max_prefetch: int = 0,
     **kwargs,
 ) -> ChunkManager:
     if hidden_dim:
@@ -51,9 +52,5 @@ def init_chunk_manager(
         )
     dist.barrier()
 
-    chunk_manager = ChunkManager(
-        config_dict,
-        init_device,
-        reuse_fp16_chunk=reuse_fp16_chunk,
-    )
+    chunk_manager = ChunkManager(config_dict, init_device, reuse_fp16_chunk=reuse_fp16_chunk, max_prefetch=max_prefetch)
     return chunk_manager
diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py
index 050643dfa610..9d6849daadc1 100644
--- a/colossalai/zero/gemini/gemini_ddp.py
+++ b/colossalai/zero/gemini/gemini_ddp.py
@@ -104,9 +104,7 @@ def __init__(
         self.enable_gradient_accumulation = enable_gradient_accumulation
         if chunk_config_dict is not None:
             self.chunk_manager = ChunkManager(
-                chunk_config_dict,
-                chunk_init_device,
-                reuse_fp16_chunk=reuse_fp16_chunk,
+                chunk_config_dict, chunk_init_device, reuse_fp16_chunk=reuse_fp16_chunk, max_prefetch=max_prefetch
             )
         else:
             # some ugly hotfix for the compatibility with Lightning
@@ -122,6 +120,7 @@ def __init__(
                 process_group=zero_group,
                 reuse_fp16_chunk=reuse_fp16_chunk,
                 verbose=verbose,
+                max_prefetch=max_prefetch,
             )
         self.gemini_manager = GeminiManager(
             placement_policy,
@@ -147,6 +146,12 @@ def __init__(
         self.extra_dp_group = extra_dp_group
 
         self.master_weights = master_weights
+        self.enable_async_reduce = enable_async_reduce
+
+        if enable_async_reduce:
+            self.async_reduce_stream = torch.cuda.Stream()
+        else:
+            self.async_reduce_stream = None
 
         self._logger = get_dist_logger()
 
@@ -176,6 +181,7 @@ def __init__(
         super().__init__(module)
         self._non_persistent_buffers_set = self._get_non_persistent_buffers_set(module)
         self._cast_buffers()
+
         # register grad hook
         for p in module.parameters():
             if is_ddp_ignored(p):
@@ -191,7 +197,7 @@ def __init__(
                         master_weights=self.master_weights,
                         enable_gradient_accumulation=self.enable_gradient_accumulation,
                         p=p,
-                        async_reduce=enable_async_reduce,
+                        async_reduce_stream=self.async_reduce_stream,
                     )
                 )
 
@@ -339,10 +345,8 @@ def _pre_backward(self):
                 setattr(param, "_gemini_reduced", False)
 
     def _post_backward(self):
-        for param in self.param2name:
-            if hasattr(param, "_release_grad_chunk_cb"):
-                param._release_grad_chunk_cb()
-                delattr(param, "_release_grad_chunk_cb")
+        if self.enable_async_reduce:
+            self.async_reduce_stream.synchronize()
 
         if self.chunk_manager.accessed_mem != 0:
             error_params = ["Reduction failed at followed parameters:"]
@@ -381,7 +385,7 @@ def grad_handle(
         master_weights: bool,
         enable_gradient_accumulation: bool,
         p: nn.Parameter,
-        async_reduce: bool,
+        async_reduce_stream: Optional[torch.cuda.Stream] = None,
     ):
         setattr(p, "_gemini_reduced", True)
         empty_grad = torch.empty_like(grad)
@@ -417,56 +421,35 @@ def grad_handle(
                 grad_chunk.copy_tensor_to_chunk_slice(p, grad, update_ptr=chunk_manager.reuse_fp16_chunk)
             else:
                 grad_chunk.add_tensor_to_chunk_slice(p, grad)
-            reduced = chunk_manager.reduce_chunk(grad_chunk, async_op=async_reduce)
-            if reduced:  # if not async, can release immediately, else release in when work finished
-                if async_reduce:
-                    # dirty fix by installing callback
-                    assert not hasattr(p, "_release_grad_chunk_cb")
-
-                    def _release_grad_chunk_cb():
-                        grad_chunk.wait_async_reduce()
-                        GeminiDDP.release_grad_chunk_handle(
-                            chunk_manager,
-                            grads_device,
-                            master_weights,
-                            enable_gradient_accumulation,
-                            p,
-                            chunk,
-                            grad_chunk,
-                        )
-
-                    p._release_grad_chunk_cb = _release_grad_chunk_cb
-                else:
-                    GeminiDDP.release_grad_chunk_handle(
-                        chunk_manager, grads_device, master_weights, enable_gradient_accumulation, p, chunk, grad_chunk
-                    )
-        return empty_grad
 
-    @staticmethod
-    def release_grad_chunk_handle(
-        chunk_manager, grads_device, master_weights, enable_gradient_accumulation, p, chunk, grad_chunk
-    ):
-        if not chunk_manager.reuse_fp16_chunk:
-            if chunk.keep_gathered:
-                chunk_manager.fake_release_chunk(chunk)
-            else:
-                chunk_manager.release_chunk(chunk)
-        if grad_chunk.is_gathered:
-            grad_chunk.cuda_global_chunk.div_(chunk.pg_size)
-            if chunk.extra_dp_group is not None:
-                grad_chunk.cuda_global_chunk.div_(chunk.extra_dp_size)
-        else:
-            grad_chunk.cuda_shard.div_(chunk.pg_size)
-            if chunk.extra_dp_group is not None:
-                grad_chunk.cuda_shard.div_(chunk.extra_dp_size)
-                # check overflow elements
-        chunk_manager.overflow_counter += grad_chunk.has_inf_or_nan
-        # record l2 norm for gradient clipping. flag is bound to fp16 chunk
-        if chunk.l2_norm_flag:
-            grad_chunk.set_l2_norm()
-        chunk_manager.move_chunk(grad_chunk, grads_device[p], force_copy=True)
-        if not (master_weights) or (enable_gradient_accumulation):
-            chunk_manager.move_chunk(chunk, grads_device[p], force_copy=True)
+            if async_reduce_stream is not None:
+                async_reduce_stream.wait_stream(torch.cuda.current_stream())
+
+            with torch.cuda.stream(async_reduce_stream):
+                reduced = chunk_manager.reduce_chunk(grad_chunk, async_op=(async_reduce_stream is not None))
+                if reduced:
+                    grad_chunk.wait_async_reduce()
+                    if not chunk_manager.reuse_fp16_chunk:
+                        if chunk.keep_gathered:
+                            chunk_manager.fake_release_chunk(chunk)
+                        else:
+                            chunk_manager.release_chunk(chunk)
+                    if grad_chunk.is_gathered:
+                        grad_chunk.cuda_global_chunk.div_(chunk.pg_size)
+                        if chunk.extra_dp_group is not None:
+                            grad_chunk.cuda_global_chunk.div_(chunk.extra_dp_size)
+                    else:
+                        grad_chunk.cuda_shard.div_(chunk.pg_size)
+                        if chunk.extra_dp_group is not None:
+                            grad_chunk.cuda_shard.div_(chunk.extra_dp_size)
+                            # check overflow elements
+                    chunk_manager.overflow_counter += grad_chunk.has_inf_or_nan
+                    # record l2 norm for gradient clipping. flag is bound to fp16 chunk
+                    if chunk.l2_norm_flag:
+                        grad_chunk.set_l2_norm()
+                    chunk_manager.move_chunk(grad_chunk, grads_device[p], force_copy=True)
+                    if not (master_weights) or (enable_gradient_accumulation):
+                        chunk_manager.move_chunk(chunk, grads_device[p], force_copy=True)
 
     def zero_grad(self, set_to_none: bool = False) -> None:
         self.module.zero_grad(set_to_none=True)
diff --git a/colossalai/zero/gemini/gemini_hook.py b/colossalai/zero/gemini/gemini_hook.py
index 736238a0992d..bf5faa0fe884 100644
--- a/colossalai/zero/gemini/gemini_hook.py
+++ b/colossalai/zero/gemini/gemini_hook.py
@@ -5,6 +5,7 @@
 
 import torch
 
+from colossalai.accelerator import get_accelerator
 from colossalai.tensor.param_op_hook import ColoParamOpHook
 from colossalai.utils import is_ddp_ignored
 from colossalai.zero.gemini import TensorState
@@ -54,10 +55,20 @@ def pre_op(self, params):
         )
 
         # prefetch
-        for chunk in chunks_fetch_async:
-            maybe_work = self._chunk_manager.access_chunk(chunk, async_access=True)
-            if maybe_work is not None:
-                self._gemini_manager.add_work(chunk, maybe_work)
+        if self._gemini_manager.chunk_manager._prefetch_stream is not None:
+            # This is when prefetch happens the first time and there is no dist.Work to sync,
+            # there is possibility that the optimizer haven't finish computation on default stream,
+            # thus we might prefetch outdated chunks there.
+            #
+            # Other than that, self._gemini_manager.wait_chunks will have synced with default stream
+            # by calling dist.Work.wait() and this line makes no diff.
+            self._gemini_manager.chunk_manager._prefetch_stream.wait_stream(torch.cuda.current_stream())
+
+        with get_accelerator().stream(self._gemini_manager.chunk_manager._prefetch_stream):
+            for chunk in chunks_fetch_async:
+                maybe_work = self._chunk_manager.access_chunk(chunk, async_access=True)
+                if maybe_work is not None:
+                    self._gemini_manager.add_work(chunk, maybe_work)
 
         # record cuda model data of the current OP, including memory for prefetched chunks
         self._gemini_manager.record_model_data_volume()
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 0e796a9d4a95..0d28277022f5 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,9 +1,9 @@
-FROM hpcaitech/cuda-conda:11.3
+FROM hpcaitech/cuda-conda:12.1
 
 # metainformation
 LABEL org.opencontainers.image.source = "https://github.com/hpcaitech/ColossalAI"
 LABEL org.opencontainers.image.licenses = "Apache License 2.0"
-LABEL org.opencontainers.image.base.name = "docker.io/library/hpcaitech/cuda-conda:11.3"
+LABEL org.opencontainers.image.base.name = "docker.io/library/hpcaitech/cuda-conda:12.1"
 
 # enable passwordless ssh
 RUN mkdir ~/.ssh && \
@@ -18,7 +18,7 @@ RUN apt-get update && \
     rm -rf /var/lib/apt/lists/*
 
 # install torch
-RUN conda install -y pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
+RUN conda install -y python==3.10 && conda install -y pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia
 
 # install ninja
 RUN apt-get update && \
@@ -29,23 +29,18 @@ RUN apt-get update && \
 # install apex
 RUN git clone https://github.com/NVIDIA/apex && \
     cd apex && \
-    git checkout 91fcaa && \
+    git checkout a7de60 && \
     pip install packaging && \
-    pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" ./
+    pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
 
 # install colossalai
 ARG VERSION=main
 RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \
     && cd ./ColossalAI \
-    && BUILD_EXT=1 pip install -v --no-cache-dir .
-
-# install titans
-RUN pip install --no-cache-dir titans
+    && BUILD_EXT=1 pip install -v . \
+    && rm -rf colossalai
 
 # install tensornvme
 RUN conda install -y cmake && \
-    git clone https://github.com/hpcaitech/TensorNVMe.git && \
-    cd TensorNVMe && \
     apt update -y && apt install -y libaio-dev && \
-    pip install -r requirements.txt && \
-    pip install -v --no-cache-dir .
+    pip install -v git+https://github.com/hpcaitech/TensorNVMe.git
diff --git a/examples/inference/llama/benchmark_llama3.py b/examples/inference/llama/benchmark_llama3.py
index 07ebdb2b1bfb..76d9c6a42000 100644
--- a/examples/inference/llama/benchmark_llama3.py
+++ b/examples/inference/llama/benchmark_llama3.py
@@ -17,6 +17,13 @@
 MEGABYTE = 1024**2
 N_WARMUP_STEPS = 2
 
+TORCH_DTYPE_MAP = {
+    "fp16": torch.float16,
+    "fp32": torch.float32,
+    "bf16": torch.bfloat16,
+}
+
+
 CONFIG_MAP = {
     "toy": transformers.LlamaConfig(num_hidden_layers=4),
     "llama-7b": transformers.LlamaConfig(
@@ -104,10 +111,13 @@ def print_details_info(model_config, whole_end2end, total_token_num, dtype, coor
 def benchmark_inference(args):
     coordinator = DistCoordinator()
 
+    torch_dtype = TORCH_DTYPE_MAP.get(args.dtype, None)
     config = CONFIG_MAP[args.model]
+    config.torch_dtype = torch_dtype
     config.pad_token_id = config.eos_token_id
+
     if args.model_path is not None:
-        model = transformers.LlamaForCausalLM.from_pretrained(args.model_path)
+        model = transformers.LlamaForCausalLM.from_pretrained(args.model_path, torch_dtype=torch_dtype)
         tokenizer = AutoTokenizer.from_pretrained(args.model_path)
     else:
         # Random weights
diff --git a/examples/inference/llama/llama_generation.py b/examples/inference/llama/llama_generation.py
index c0a1a585a1b9..a4a88c29d679 100644
--- a/examples/inference/llama/llama_generation.py
+++ b/examples/inference/llama/llama_generation.py
@@ -1,5 +1,6 @@
 import argparse
 
+from torch import bfloat16, float16, float32
 from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
 
 import colossalai
@@ -12,6 +13,12 @@
 MODEL_CLS = AutoModelForCausalLM
 POLICY_CLS = NoPaddingLlamaModelInferPolicy
 
+TORCH_DTYPE_MAP = {
+    "fp16": float16,
+    "fp32": float32,
+    "bf16": bfloat16,
+}
+
 
 def infer(args):
     # ==============================
@@ -24,7 +31,7 @@ def infer(args):
     # Load model and tokenizer
     # ==============================
     model_path_or_name = args.model
-    model = MODEL_CLS.from_pretrained(model_path_or_name)
+    model = MODEL_CLS.from_pretrained(model_path_or_name, torch_dtype=TORCH_DTYPE_MAP.get(args.dtype, None))
     tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)
     tokenizer.pad_token = tokenizer.eos_token
     # coordinator.print_on_master(f"Model Config:\n{model.config}")
@@ -41,6 +48,9 @@ def infer(args):
         block_size=16,
         tp_size=args.tp_size,
         use_cuda_kernel=args.use_cuda_kernel,
+        enable_streamingllm=args.enable_streamingllm,
+        start_token_size=args.start_token_size,
+        generated_token_size=args.generated_token_size,
     )
     coordinator.print_on_master(f"Initializing Inference Engine...")
     engine = InferenceEngine(model, tokenizer, inference_config, model_policy=POLICY_CLS(), verbose=True)
@@ -56,6 +66,8 @@ def infer(args):
         temperature=args.temperature,
         top_k=args.top_k,
         top_p=args.top_p,
+        no_repeat_ngram_size=args.no_repeat_ngram_size,
+        repetition_penalty=args.repetition_penalty,
     )
     coordinator.print_on_master(f"Generating...")
     out = engine.generate(prompts=[args.prompt], generation_config=generation_config)
@@ -100,6 +112,25 @@ def infer(args):
     parser.add_argument("--temperature", type=float, default=1.0, help="Temperature for generation")
     parser.add_argument("--top_k", type=int, default=50, help="Top k for generation")
     parser.add_argument("--top_p", type=float, default=1.0, help="Top p for generation")
+    parser.add_argument("--enable_streamingllm", action="store_true", help="Whether to use StreamingLLM")
+    parser.add_argument(
+        "--start_token_size", type=int, default=4, help="The size of the start_token, When using StreamingLLM,"
+    )
+    parser.add_argument(
+        "--generated_token_size", type=int, default=512, help="The size of the generated_token, When using StreamingLLM"
+    )
+    parser.add_argument(
+        "--no_repeat_ngram_size",
+        type=int,
+        default=0,
+        help="If no_repeat_ngram_size > 0, the consecutive tokens of ngram size can only appear once in inference sentences.",
+    )
+    parser.add_argument(
+        "--repetition_penalty",
+        type=float,
+        default=1.0,
+        help="The parameter that influences the model's treatment of new tokens in relation to their appearance in the prompt and the generated text. Values greater than 1 incentivize the model to introduce new tokens, whereas values less than 1 incentivize token repetition., defaults to 1.0.",
+    )
     args = parser.parse_args()
 
     infer(args)
diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index 8d4dae314d78..f6c975305f75 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -72,6 +72,7 @@ def main():
     parser.add_argument("--offload_optim_frac", type=float, default=0.0, help="Offload optim fraction. Only for gemini")
     parser.add_argument("--offload_param_frac", type=float, default=0.0, help="Offload param fraction. Only for gemini")
     parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size")
+    parser.add_argument("--sp", type=int, default=1, help="Sequence parallel size")
     parser.add_argument("--extra_dp", type=int, default=1, help="Extra data parallel size, used for Gemini")
     parser.add_argument("--pp", type=int, default=1, help="Pipeline parallel size")
     parser.add_argument("--mbs", type=int, default=1, help="Micro batch size of pipeline parallel")
@@ -174,6 +175,8 @@ def empty_init():
             tp_size=args.tp,
             pp_size=args.pp,
             zero_stage=args.zero,
+            sp_size=args.sp,
+            enable_sequence_parallelism=args.sp > 1,
             enable_fused_normalization=torch.cuda.is_available(),
             enable_flash_attention=args.xformers,
             microbatch_size=args.mbs,
diff --git a/extensions/cuda_extension.py b/extensions/cuda_extension.py
index b722057c9e8b..da15bcd57a26 100644
--- a/extensions/cuda_extension.py
+++ b/extensions/cuda_extension.py
@@ -28,7 +28,9 @@ def is_available(self) -> bool:
         try:
             import torch
 
-            cuda_available = torch.cuda.is_available()
+            # torch.cuda.is_available requires a device to exist, allow building with cuda extension on build nodes without a device
+            # but where cuda is actually available.
+            cuda_available = torch.cuda.is_available() or bool(os.environ.get("FORCE_CUDA", 0))
         except:
             cuda_available = False
         return cuda_available
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index d30b26dbc787..27bbc3769448 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -8,7 +8,7 @@ click
 fabric
 contexttimer
 ninja
-torch>=2.1.0
+torch>=2.1.0,<2.3.0
 safetensors
 einops
 pydantic
@@ -16,7 +16,7 @@ ray
 sentencepiece
 google
 protobuf
-transformers==4.36.2
+transformers==4.39.3
 peft>=0.7.1
 bitsandbytes>=0.39.0
 rpyc==6.0.0
diff --git a/setup.py b/setup.py
index b105c03b717c..d2cfb13a4dfd 100644
--- a/setup.py
+++ b/setup.py
@@ -144,6 +144,7 @@ def get_version() -> str:
     package_data={
         "colossalai": [
             "kernel/extensions/csrc/**/*",
+            "kernel/extensions/pybind/**/*",
         ]
     },
 )
diff --git a/tests/kit/model_zoo/transformers/chatglm2.py b/tests/kit/model_zoo/transformers/chatglm2.py
index f443553bbd32..9a7cf34c1195 100644
--- a/tests/kit/model_zoo/transformers/chatglm2.py
+++ b/tests/kit/model_zoo/transformers/chatglm2.py
@@ -33,22 +33,6 @@ def data_gen_for_conditional_generation():
 )
 loss_fn = lambda x: x["loss"]
 
-config = AutoConfig.from_pretrained(
-    "THUDM/chatglm2-6b",
-    trust_remote_code=True,
-    num_layers=2,
-    padded_vocab_size=65024,
-    hidden_size=64,
-    ffn_hidden_size=214,
-    num_attention_heads=8,
-    kv_channels=16,
-    rmsnorm=True,
-    original_rope=True,
-    use_cache=True,
-    multi_query_attention=False,
-    torch_dtype=torch.float32,
-)
-
 
 infer_config = AutoConfig.from_pretrained(
     "THUDM/chatglm2-6b",
@@ -68,6 +52,21 @@ def data_gen_for_conditional_generation():
 
 
 def init_chatglm():
+    config = AutoConfig.from_pretrained(
+        "THUDM/chatglm2-6b",
+        trust_remote_code=True,
+        num_layers=2,
+        padded_vocab_size=65024,
+        hidden_size=64,
+        ffn_hidden_size=214,
+        num_attention_heads=8,
+        kv_channels=16,
+        rmsnorm=True,
+        original_rope=True,
+        use_cache=True,
+        multi_query_attention=False,
+        torch_dtype=torch.float32,
+    )
     model = AutoModelForCausalLM.from_config(config, empty_init=False, trust_remote_code=True)
     for m in model.modules():
         if m.__class__.__name__ == "RMSNorm":
diff --git a/tests/kit/model_zoo/transformers/gpt.py b/tests/kit/model_zoo/transformers/gpt.py
index ab5d97420292..f71776b6b4e0 100644
--- a/tests/kit/model_zoo/transformers/gpt.py
+++ b/tests/kit/model_zoo/transformers/gpt.py
@@ -18,23 +18,8 @@ def data_gen():
     # tokenized_input = tokenizer(input, return_tensors='pt')
     # input_ids = tokenized_input['input_ids']
     # attention_mask = tokenized_input['attention_mask']
-    # input_ids = torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64)
-    # attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
-    input_ids = torch.tensor(
-        [
-            [15496, 11, 616, 3290, 318, 13779, 318, 13779, 15496, 11, 616, 3290, 318, 13779, 318, 13779],
-            [15496, 11, 616, 3290, 318, 13779, 318, 13779, 15496, 11, 616, 3290, 318, 13779, 318, 13779],
-        ],
-        dtype=torch.int64,
-    )
-    attention_mask = torch.tensor(
-        [
-            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-        ],
-        dtype=torch.int64,
-    )
-
+    input_ids = torch.tensor([[22, 11, 616, 4, 5, 13, 318, 345]], dtype=torch.int64)
+    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
     return dict(input_ids=input_ids, attention_mask=attention_mask)
 
 
@@ -50,9 +35,9 @@ def data_gen_for_question_answering():
     # question answering data gen
     # `labels` is the type not the token id for token classification, 0 or 1
     data = data_gen()
-    start_positions = torch.tensor([[0], [0]], dtype=torch.int64)
+    start_positions = torch.tensor([0], dtype=torch.int64)
     data["start_positions"] = start_positions
-    end_positions = torch.tensor([[1], [1]], dtype=torch.int64)
+    end_positions = torch.tensor([1], dtype=torch.int64)
     data["end_positions"] = end_positions
     return data
 
@@ -61,20 +46,14 @@ def data_gen_for_token_classification():
     # token classification data gen
     # `labels` is the type not the token id for token classification, 0 or 1
     data = data_gen()
-    data["labels"] = torch.tensor(
-        [
-            [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
-            [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
-        ],
-        dtype=torch.int64,
-    )
+    data["labels"] = torch.tensor([[0, 0, 0, 0, 0, 0, 0, 1]], dtype=torch.int64)
     return data
 
 
 def data_gen_for_sequence_classification():
     # sequence classification data gen
     data = data_gen()
-    data["labels"] = torch.tensor([[1], [1]], dtype=torch.int64)
+    data["labels"] = torch.tensor([1], dtype=torch.int64)
     return data
 
 
@@ -82,18 +61,12 @@ def date_gen_for_double_heads():
     num_choices = 2
     batch_size = 2
     input_ids = torch.tensor(
-        [
-            [15496, 11, 616, 3290, 318, 13779, 318, 13779, 15496, 11, 616, 3290, 318, 13779, 318, 13779],
-            [15496, 11, 616, 3290, 318, 13779, 318, 13779, 15496, 11, 616, 3290, 318, 13779, 318, 13779],
-        ],
-        dtype=torch.int64,
-    )
-    attention_mask = torch.tensor(
-        [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
+        [[46, 11, 616, 432, 318, 19, 318, 555], [777, 11, 235, 333, 318, 231, 468, 136]],
         dtype=torch.int64,
     )
-
+    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
     mc_labels = torch.zeros(input_ids.shape[0], dtype=torch.int64)
+
     mc_token_ids = torch.arange(0, num_choices, dtype=torch.int64)
     mc_token_ids = mc_token_ids.expand((batch_size, num_choices))
     multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, num_choices, -1).contiguous()
@@ -122,14 +95,14 @@ def date_gen_for_double_heads():
     n_layer=2,
     n_head=4,
     n_embd=128,
-    vocab_size=50258,
+    vocab_size=1024,
     attn_pdrop=0,
     embd_pdrop=0,
     resid_pdrop=0,
     summary_first_dropout=0,
     hidden_dropout=0,
     problem_type="single_label_classification",
-    pad_token_id=50256,
+    pad_token_id=1022,
     tie_word_embeddings=True,
 )
 
diff --git a/tests/test_auto_parallel/test_offload/model_utils.py b/tests/test_auto_parallel/test_offload/model_utils.py
index 0efe84655aac..9a0dbcbd7a79 100644
--- a/tests/test_auto_parallel/test_offload/model_utils.py
+++ b/tests/test_auto_parallel/test_offload/model_utils.py
@@ -2,7 +2,7 @@
 import torch.nn as nn
 from transformers import BertConfig, BertLMHeadModel, GPT2Config, GPT2LMHeadModel
 
-from tests.components_to_test.registry import non_distributed_component_funcs
+# from tests.components_to_test.registry import non_distributed_component_funcs
 
 
 class GPTLMModel(nn.Module):
@@ -55,7 +55,7 @@ def forward(self, input_ids, attention_mask):
         return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=True)[0]
 
 
-@non_distributed_component_funcs.register(name="bert_")
+# @non_distributed_component_funcs.register(name="bert_")
 def get_bert_components():
     vocab_size = 1024
     seq_len = 64
@@ -74,7 +74,7 @@ def bert_data_gen(device="meta"):
     return bert_model_builder, bert_data_gen
 
 
-@non_distributed_component_funcs.register(name="gpt2_")
+# @non_distributed_component_funcs.register(name="gpt2_")
 def get_gpt2_components():
     vocab_size = 1024
     seq_len = 8
diff --git a/tests/test_auto_parallel/test_offload/test_perf.py b/tests/test_auto_parallel/test_offload/test_perf.py
index 3db7a1925c11..f895721dd971 100644
--- a/tests/test_auto_parallel/test_offload/test_perf.py
+++ b/tests/test_auto_parallel/test_offload/test_perf.py
@@ -10,11 +10,14 @@
 from colossalai.auto_parallel.offload.mem_optimize import memory_optimize
 from colossalai.auto_parallel.offload.solver import NOT_NVML
 from colossalai.fx.profiler import parameter_size
+from colossalai.legacy.zero.gemini.colo_init_context import ColoInitContext
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from colossalai.zero import ColoInitContext, zero_model_wrapper, zero_optim_wrapper
+from colossalai.utils import set_seed
+from colossalai.zero import zero_model_wrapper, zero_optim_wrapper
 from tests.test_auto_parallel.test_offload.model_utils import *
-from tests.test_tensor.common_utils import set_seed
+
+# from tests.test_tensor.common_utils import set_seed
 
 
 @parameterize("model_name", ["gpt2_"])
diff --git a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
index 052782047eee..f92b5c6e5675 100644
--- a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
+++ b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
@@ -47,7 +47,7 @@ def check_torch_ddp_plugin():
         registry = model_zoo
 
     for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
-        if name == "dlrm_interactionarch":
+        if name == "dlrm_interactionarch" or name.startswith("simple_"):
             continue
         run_fn(model_fn, data_gen_fn, output_transform_fn)
         torch.cuda.empty_cache()
diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
index ade927e6edfc..fd13ce0bfadc 100644
--- a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
@@ -21,14 +21,10 @@
 from tests.kit.model_zoo import model_zoo
 
 MODEL_PLACEMENT_CONFIGS = [
-    {"placement_policy": "static", "shard_param_frac": 0.0},  # zero2
-    {"placement_policy": "static", "shard_param_frac": 1.0},  # zero3
-    {"placement_policy": "static", "shard_param_frac": 0.5},  # zero3-half
+    {"placement_policy": "static", "shard_param_frac": 0.5},
 ]
 
 OPTIM_PLACEMENT_CONFIGS = [
-    {"placement_policy": "static", "shard_param_frac": 0.0, "offload_optim_frac": 0.0},  # zero2
-    {"placement_policy": "static", "shard_param_frac": 0.0, "offload_optim_frac": 1.0},  # zero2-offload
     {"placement_policy": "static", "shard_param_frac": 0.0, "offload_optim_frac": 0.5},  # zero2-offload-half
 ]
 
@@ -72,7 +68,7 @@ def exam_state_dict_with_origin(placement_config, model_name, use_safetensors: b
         dist.barrier()
 
         new_bert_model = BertForSequenceClassification.from_pretrained(pretrained_path)
-        check_state_dict_equal(bert_model.state_dict(only_rank_0=False), new_bert_model.state_dict(), False)
+        check_state_dict_equal(bert_model.state_dict(only_rank_0=False), new_bert_model.state_dict())
 
 
 @clear_cache_before_run()
@@ -130,13 +126,11 @@ def exam_state_dict(placement_config, shard: bool, model_name: str, size_per_sha
 
         booster.load_model(new_model, model_ckpt_path)
         check_state_dict_equal(
-            model.state_dict(only_rank_0=False), new_model.state_dict(only_rank_0=False), False, ignore_dtype=True
+            model.state_dict(only_rank_0=False), new_model.state_dict(only_rank_0=False), ignore_dtype=True
         )
 
         booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
-        check_state_dict_equal(
-            optimizer.state_dict(only_rank_0=False), new_optimizer.state_dict(only_rank_0=False), False
-        )
+        check_state_dict_equal(optimizer.state_dict(only_rank_0=False), new_optimizer.state_dict(only_rank_0=False))
         for group in new_optimizer.param_groups:
             assert group["lr"] == 0.1
 
@@ -169,7 +163,7 @@ def exam_lazy_from_pretrained():
         booster.save_model(model, save_path, shard=False)
         dist.barrier()
         state_dict = torch.load(save_path, map_location="cpu")
-        check_state_dict_equal(state_dict, orig_state_dict, False, ignore_dtype=True)
+        check_state_dict_equal(state_dict, orig_state_dict, ignore_dtype=True)
 
 
 def run_dist(rank, world_size, port):
diff --git a/tests/test_checkpoint_io/test_gemini_torch_compability.py b/tests/test_checkpoint_io/test_gemini_torch_compability.py
index cd313c2404eb..4897907ffc8a 100644
--- a/tests/test_checkpoint_io/test_gemini_torch_compability.py
+++ b/tests/test_checkpoint_io/test_gemini_torch_compability.py
@@ -62,12 +62,12 @@ def exam_torch_load_from_gemini(shard: bool, model_name: str):
         check_state_dict_equal(
             model.state_dict(only_rank_0=False, prefix="module.module."),
             new_model.state_dict(),
-            False,
+            ignore_device=False,
             ignore_dtype=True,
         )
 
         new_booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
-        check_state_dict_equal(optimizer.state_dict(only_rank_0=False), new_optimizer.state_dict(), False)
+        check_state_dict_equal(optimizer.state_dict(only_rank_0=False), new_optimizer.state_dict(), ignore_device=False)
 
         # Check the new model/optimizer can successfully run.
         data = data_gen_fn()
@@ -128,7 +128,7 @@ def exam_gemini_load_from_torch(shard: bool, model_name: str):
         check_state_dict_equal(
             new_model.state_dict(only_rank_0=False, prefix="module.module."),
             model.state_dict(),
-            False,
+            ignore_device=False,
             ignore_dtype=True,
         )
 
@@ -145,7 +145,7 @@ def exam_gemini_load_from_torch(shard: bool, model_name: str):
                     k in old_group and k in new_group
                 ), f"Old group's keys: {list(old_group.keys())}, New group's keys: {list(new_group.keys())}"
                 assert old_group[k] == new_group[k]
-        check_state_dict_equal(old_state_dict["state"], new_state_dict["state"], False)
+        check_state_dict_equal(old_state_dict["state"], new_state_dict["state"], ignore_device=False)
 
         # Check the new model/optimizer can successfully run.
         data = data_gen_fn()
diff --git a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
index 1cf94433da24..4f8f260417a3 100644
--- a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
@@ -94,9 +94,9 @@ def _preprocess_data(data):
         new_model, new_optimizer, criterion, _, _ = booster.boost(new_model, new_optimizer, criterion)
 
         booster.load_model(new_model, model_ckpt_path)
-        check_state_dict_equal(model.unwrap().state_dict(), new_model.unwrap().state_dict(), False)
+        check_state_dict_equal(model.unwrap().state_dict(), new_model.unwrap().state_dict())
         booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
-        check_state_dict_equal(optimizer.unwrap().state_dict(), new_optimizer.unwrap().state_dict(), False)
+        check_state_dict_equal(optimizer.unwrap().state_dict(), new_optimizer.unwrap().state_dict())
         dist.barrier()
 
     # Check whether the loaded model & optimizer works smoothly.
diff --git a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py
index 119e42e3178f..24dc4a5d2677 100644
--- a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py
@@ -55,7 +55,7 @@ def check_low_level_zero_checkpointIO(stage: int, shard: bool, offload: bool):
         new_model, new_optimizer, _, _, _ = booster.boost(new_model, new_optimizer)
 
         booster.load_model(new_model, model_ckpt_path)
-        check_state_dict_equal(model.state_dict(), new_model.state_dict(), False)
+        check_state_dict_equal(model.state_dict(), new_model.state_dict())
         # check master weight
         assert isinstance(new_optimizer, LowLevelZeroOptimizer)
         working_param_id_set = set(id(p) for p in new_model.parameters())
@@ -70,7 +70,7 @@ def check_low_level_zero_checkpointIO(stage: int, shard: bool, offload: bool):
             )
 
         booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
-        check_state_dict_equal(optimizer.optim.state_dict(), new_optimizer.optim.state_dict(), False)
+        check_state_dict_equal(optimizer.optim.state_dict(), new_optimizer.optim.state_dict())
     torch.cuda.empty_cache()
 
 
@@ -110,7 +110,7 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo
             booster.save_optimizer(optimizer, optimizer_ckpt_path, shard=False)
             new_model = new_booster.enable_lora(new_model, pretrained_dir=model_ckpt_path, lora_config=lora_config)
             new_model, new_optimizer, criterion, _, _ = new_booster.boost(new_model, new_optimizer, criterion)
-            check_state_dict_equal(model.state_dict(), new_model.state_dict(), False)
+            check_state_dict_equal(model.state_dict(), new_model.state_dict())
 
             # check master weight
             assert isinstance(new_optimizer, LowLevelZeroOptimizer)
@@ -126,7 +126,7 @@ def run_fn(stage, shard, offload, model_fn, data_gen_fn, output_transform_fn, lo
                 )
 
             new_booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
-            check_state_dict_equal(optimizer.optim.state_dict(), new_optimizer.optim.state_dict(), False)
+            check_state_dict_equal(optimizer.optim.state_dict(), new_optimizer.optim.state_dict())
 
     except Exception as e:
         # return repr(e)
diff --git a/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py b/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py
index da0d52d061a8..df8636141e2a 100644
--- a/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py
+++ b/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py
@@ -61,9 +61,9 @@ def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per
         new_model, new_optimizer, criterion, _, _ = booster.boost(new_model, new_optimizer, criterion)
 
         if plugin_type == "gemini":
-            check_state_dict_equal(model.state_dict(only_rank_0=False), new_model.state_dict(only_rank_0=False), False)
+            check_state_dict_equal(model.state_dict(only_rank_0=False), new_model.state_dict(only_rank_0=False))
         else:
-            check_state_dict_equal(model.unwrap().state_dict(), new_model.unwrap().state_dict(), False)
+            check_state_dict_equal(model.unwrap().state_dict(), new_model.unwrap().state_dict())
         dist.barrier()
 
 
diff --git a/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py b/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py
index 0b9a1605c385..87d35f2526b4 100644
--- a/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_torch_ddp_checkpoint_io.py
@@ -52,12 +52,12 @@ def check_torch_ddp_checkpointIO(shard: bool, size_per_shard: int):
         )
 
         booster.load_model(new_model, model_ckpt_path)
-        check_state_dict_equal(model.state_dict(), new_model.state_dict(), False)
+        check_state_dict_equal(model.state_dict(), new_model.state_dict())
 
         booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
-        check_state_dict_equal(optimizer.state_dict(), new_optimizer.state_dict(), False)
+        check_state_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
         booster.load_lr_scheduler(new_scheduler, lr_scheduler_ckpt_path)
-        check_state_dict_equal(scheduler.state_dict(), new_scheduler.state_dict(), False)
+        check_state_dict_equal(scheduler.state_dict(), new_scheduler.state_dict())
 
 
 def run_dist(rank, world_size, port):
diff --git a/tests/test_fx/test_tracer/test_hf_model/test_hf_albert.py b/tests/test_fx/test_tracer/test_hf_model/test_hf_albert.py
index fb093821e488..a7ab3d6a4b62 100644
--- a/tests/test_fx/test_tracer/test_hf_model/test_hf_albert.py
+++ b/tests/test_fx/test_tracer/test_hf_model/test_hf_albert.py
@@ -17,6 +17,11 @@ def test_albert():
 
     for name, (model_fn, data_gen_fn, _, _, _) in sub_registry.items():
         model = model_fn()
+        # TODO: support the following models
+        # 1. "AlbertForPreTraining"
+        # as they are not supported, let's skip them
+        if model.__class__.__name__ in ["AlbertForPreTraining"]:
+            continue
         trace_model_and_compare_output(model, data_gen_fn)
 
 
diff --git a/tests/test_fx/test_tracer/test_hf_model/test_hf_gpt.py b/tests/test_fx/test_tracer/test_hf_model/test_hf_gpt.py
index 7bd8a726f1ac..f37321bbbc5a 100644
--- a/tests/test_fx/test_tracer/test_hf_model/test_hf_gpt.py
+++ b/tests/test_fx/test_tracer/test_hf_model/test_hf_gpt.py
@@ -16,9 +16,9 @@ def test_gpt():
         model = model_fn()
 
         # TODO(ver217): support the following models
-        # 1. GPT2DoubleHeadsModel
+        # 1. "GPT2DoubleHeadsModel", "GPT2ForQuestionAnswering", "GPTJForQuestionAnswering"
         # as they are not supported, let's skip them
-        if model.__class__.__name__ in ["GPT2DoubleHeadsModel", "GPT2ForQuestionAnswering"]:
+        if model.__class__.__name__ in ["GPT2DoubleHeadsModel", "GPT2ForQuestionAnswering", "GPTJForQuestionAnswering"]:
             continue
 
         trace_model_and_compare_output(model, data_gen_fn, ignore_data=["labels"])
diff --git a/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py b/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py
index 30c1910855e6..25e4f98d85fb 100644
--- a/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py
+++ b/tests/test_fx/test_tracer/test_torchrec_model/test_deepfm_model.py
@@ -52,7 +52,7 @@ def trace_and_compare(model_cls, data, output_transform_fn, meta_args=None):
 
 @clear_cache_before_run()
 def test_torchrec_deepfm_models():
-    deepfm_models = model_zoo.get_sub_registry("deepfm")
+    deepfm_models = model_zoo.get_sub_registry(keyword="deepfm", allow_empty=True)
     torch.backends.cudnn.deterministic = True
 
     for name, (model_fn, data_gen_fn, output_transform_fn, _, attribute) in deepfm_models.items():
diff --git a/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py b/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
index 71b73236474f..226880c2ee70 100644
--- a/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
+++ b/tests/test_fx/test_tracer/test_torchrec_model/test_dlrm_model.py
@@ -53,7 +53,7 @@ def trace_and_compare(model_cls, data, output_transform_fn, meta_args=None):
 @clear_cache_before_run()
 def test_torchrec_dlrm_models():
     torch.backends.cudnn.deterministic = True
-    dlrm_models = model_zoo.get_sub_registry("dlrm")
+    dlrm_models = model_zoo.get_sub_registry(keyword="deepfm", allow_empty=True)
 
     for name, (model_fn, data_gen_fn, output_transform_fn, _, attribute) in dlrm_models.items():
         data = data_gen_fn()
diff --git a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
index 38913b8a94f9..e9bf24d53531 100644
--- a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
+++ b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py
@@ -4,7 +4,7 @@
 import pytest
 import torch
 
-from colossalai.inference.modeling.models.nopadding_baichuan import get_alibi_slopes
+from colossalai.inference.utils import get_alibi_slopes
 from colossalai.kernel.kernel_loader import InferenceOpsLoader
 from colossalai.utils import get_current_device
 from tests.test_infer.test_kernels.triton.test_context_attn_unpad import generate_alibi_mask
@@ -176,7 +176,7 @@ def test_flash_decoding_attention(
 
     # The alibi may introduce relatively large errors
     if use_alibi_slopes:
-        rtol = 1e0
+        rtol = 100
 
     try:
         numpy_allclose(out_ref, output, rtol=rtol, atol=atol)
@@ -198,13 +198,13 @@ def test_flash_decoding_attention(
 
 
 @pytest.mark.skipif(not HAS_VLLM, reason="requires vllm")
-@pytest.mark.parametrize("BATCH_SIZE", [1, 4, 7, 32])
-@pytest.mark.parametrize("BLOCK_SIZE", [8, 16, 32])
+@pytest.mark.parametrize("BATCH_SIZE", [1, 7, 32])
+@pytest.mark.parametrize("BLOCK_SIZE", [6, 32])
 @pytest.mark.parametrize("MAX_NUM_BLOCKS_PER_SEQ", [1, 8, 32])
 @pytest.mark.parametrize("HEAD_SIZE", [64, 128])
 @pytest.mark.parametrize("NUM_ATTN_HEADS", [16])
-@pytest.mark.parametrize("KV_GROUP_NUM", [1, 2, 16])
-@pytest.mark.parametrize("dtype", [torch.float16, torch.float32])
+@pytest.mark.parametrize("KV_GROUP_NUM", [1, 16])
+@pytest.mark.parametrize("dtype", [torch.float32])
 @pytest.mark.parametrize("use_alibi_slopes", [True, False])
 def test_vllm_flash_decoding_attention(
     BATCH_SIZE, BLOCK_SIZE, MAX_NUM_BLOCKS_PER_SEQ, HEAD_SIZE, NUM_ATTN_HEADS, KV_GROUP_NUM, dtype, use_alibi_slopes
@@ -302,9 +302,9 @@ def test_vllm_flash_decoding_attention(
         kv_scale,
     )
 
-    # The alibi may introduce relatively large errors
+    # After the shape becomes larger, some data elements are too small, leading to excessively large relative errors.
     if use_alibi_slopes:
-        rtol = 1e0
+        rtol = 100
 
     numpy_allclose(out_ref, output, rtol=rtol, atol=atol)
 
diff --git a/tests/test_infer/test_kernels/cuda/test_kv_cache_memcpy.py b/tests/test_infer/test_kernels/cuda/test_kv_cache_memcpy.py
index d90f64690152..c3f2d0144920 100644
--- a/tests/test_infer/test_kernels/cuda/test_kv_cache_memcpy.py
+++ b/tests/test_infer/test_kernels/cuda/test_kv_cache_memcpy.py
@@ -26,7 +26,7 @@ def prepare_data(
     num_tokens = torch.sum(context_lengths).item()
 
     max_seq_len_in_batch = context_lengths.max()
-    cu_seqlens = F.pad(torch.cumsum(context_lengths, dim=0, dtype=torch.torch.int32), (1, 0))
+    cu_seqlens = F.pad(torch.cumsum(context_lengths, dim=0, dtype=torch.int32), (1, 0))
 
     kv_size = (num_tokens, num_kv_heads, HEAD_DIM)
     key = torch.empty(size=kv_size, dtype=dtype, device=device).normal_(mean=0.0, std=0.5)
diff --git a/tests/test_infer/test_kernels/cuda/test_rotary_embdding_unpad.py b/tests/test_infer/test_kernels/cuda/test_rotary_embdding_unpad.py
index 8237384c03fd..57a82647d49b 100644
--- a/tests/test_infer/test_kernels/cuda/test_rotary_embdding_unpad.py
+++ b/tests/test_infer/test_kernels/cuda/test_rotary_embdding_unpad.py
@@ -28,15 +28,22 @@ def test_rotary_emb(BATCH_SIZE, SEQ_LEN, H, K_H, D, dtype):
     torch.manual_seed(10)
     TOTAL_TOKENS = BATCH_SIZE * SEQ_LEN
     # our crafted op equals to Transformers
-    x0 = torch.randn(TOTAL_TOKENS, SEQ_LEN, D, dtype=dtype)
-    x1 = torch.randn(TOTAL_TOKENS, SEQ_LEN, D, dtype=dtype)
+    x0 = torch.randn(BATCH_SIZE, H, SEQ_LEN, D, dtype=dtype)
+    x1 = torch.randn(BATCH_SIZE, H, SEQ_LEN, D, dtype=dtype)
+
+    position_ids = torch.arange(TOTAL_TOKENS).reshape((BATCH_SIZE, SEQ_LEN))
+
     emb = LlamaRotaryEmbedding(D)
-    cos, sin = emb(x0, TOTAL_TOKENS)
+
+    cos, sin = emb(x0, position_ids)
+    embd_x0, _ = apply_rotary_pos_emb(x0, x1, cos, sin)
+    cos = cos.reshape((TOTAL_TOKENS, -1))
+    sin = sin.reshape((TOTAL_TOKENS, -1))
     cos_2 = cos[:, : D // 2]
     sin_2 = sin[:, : D // 2]
-    position_ids = torch.arange(TOTAL_TOKENS)
-    embd_x0, _ = apply_rotary_pos_emb(x0, x1, cos, sin, position_ids)
-    embd_stimulated_x = torch_rotary_emb(x0, cos_2, sin_2)
+    x2 = x0.transpose(1, 2).reshape(TOTAL_TOKENS, H, D)
+    embd_stimulated_x = torch_rotary_emb(x2, cos_2, sin_2)
+    embd_stimulated_x = embd_stimulated_x.reshape((BATCH_SIZE, SEQ_LEN, H, D)).transpose(1, 2)
     assert torch.allclose(embd_x0, embd_stimulated_x)
 
     # create data
diff --git a/tests/test_infer/test_kernels/triton/test_context_attn_unpad.py b/tests/test_infer/test_kernels/triton/test_context_attn_unpad.py
index 9d76858ed07f..92173ac13266 100644
--- a/tests/test_infer/test_kernels/triton/test_context_attn_unpad.py
+++ b/tests/test_infer/test_kernels/triton/test_context_attn_unpad.py
@@ -2,7 +2,7 @@
 import torch
 from packaging import version
 
-from colossalai.inference.modeling.models.nopadding_baichuan import get_alibi_slopes
+from colossalai.inference.utils import get_alibi_slopes
 from colossalai.kernel.triton import context_attention_unpadded
 from colossalai.utils import get_current_device
 from tests.test_infer.test_kernels.triton.kernel_utils import (
diff --git a/tests/test_infer/test_kernels/triton/test_decoding_attn.py b/tests/test_infer/test_kernels/triton/test_decoding_attn.py
index e487129c19e7..aa2a7e2b40b5 100644
--- a/tests/test_infer/test_kernels/triton/test_decoding_attn.py
+++ b/tests/test_infer/test_kernels/triton/test_decoding_attn.py
@@ -3,7 +3,7 @@
 import torch
 from packaging import version
 
-from colossalai.inference.modeling.models.nopadding_baichuan import get_alibi_slopes
+from colossalai.inference.utils import get_alibi_slopes
 from colossalai.kernel.triton import flash_decoding_attention
 from colossalai.utils import get_current_device
 from tests.test_infer.test_kernels.triton.kernel_utils import (
@@ -103,7 +103,7 @@ def test_flash_decoding(
     num_kv_heads = num_attn_heads // kv_group_num
     assert isinstance(num_kv_heads, int) and num_kv_heads > 0, "Invalid number of kv heads."
     max_seq_len = block_size * max_num_blocks_per_seq
-    dtype = torch.float16
+    dtype = torch.float32
     device = get_current_device()
 
     if use_alibi_slopes:
@@ -187,7 +187,7 @@ def test_flash_decoding(
 
     rtol = 1e-4
     # After the shape becomes larger, some data elements are too small, leading to excessively large relative errors.
-    if bsz >= 16 and use_alibi_slopes:
+    if use_alibi_slopes:
         rtol = 100
 
     numpy_allclose(out_torch, out_triton, atol=1e-3, rtol=rtol)
diff --git a/tests/test_infer/test_kernels/triton/test_rotary_embdding_unpad.py b/tests/test_infer/test_kernels/triton/test_rotary_embdding_unpad.py
index 570093693447..78b7ba81c12b 100644
--- a/tests/test_infer/test_kernels/triton/test_rotary_embdding_unpad.py
+++ b/tests/test_infer/test_kernels/triton/test_rotary_embdding_unpad.py
@@ -43,15 +43,19 @@ def torch_rotary_emb(x, cos, sin):
 def test_rotary_emb(BATCH_SIZE, SEQ_LEN, H, D, dtype, use_new_kcache_layout):
     TOTAL_TOKENS = BATCH_SIZE * SEQ_LEN
     # our crafted op equals to Transformers
-    x0 = torch.randn(TOTAL_TOKENS, SEQ_LEN, D)
-    x1 = torch.randn(TOTAL_TOKENS, SEQ_LEN, D)
+    x0 = torch.randn(BATCH_SIZE, H, SEQ_LEN, D, dtype=dtype)
+    x1 = torch.randn(BATCH_SIZE, H, SEQ_LEN, D, dtype=dtype)
     emb = LlamaRotaryEmbedding(D)
-    cos, sin = emb(x0, TOTAL_TOKENS)
+    position_ids = torch.arange(TOTAL_TOKENS).reshape((BATCH_SIZE, SEQ_LEN))
+    cos, sin = emb(x0, position_ids)
+    embd_x0, _ = apply_rotary_pos_emb(x0, x1, cos, sin)
+    cos = cos.reshape((TOTAL_TOKENS, -1))
+    sin = sin.reshape((TOTAL_TOKENS, -1))
     cos_2 = cos[:, :32]
     sin_2 = sin[:, :32]
-    position_ids = torch.arange(TOTAL_TOKENS)
-    embd_x0, _ = apply_rotary_pos_emb(x0, x1, cos, sin, position_ids)
-    embd_stimulated_x = torch_rotary_emb(x0, cos_2, sin_2)
+    x2 = x0.transpose(1, 2).reshape(TOTAL_TOKENS, H, D)
+    embd_stimulated_x = torch_rotary_emb(x2, cos_2, sin_2)
+    embd_stimulated_x = embd_stimulated_x.reshape((BATCH_SIZE, SEQ_LEN, H, D)).transpose(1, 2)
     assert torch.allclose(embd_x0, embd_stimulated_x)
 
     # create data
diff --git a/tests/test_infer/test_models/test_baichuan.py b/tests/test_infer/test_models/test_baichuan.py
index 736fab5ff1a3..f24e1bb3f7fa 100644
--- a/tests/test_infer/test_models/test_baichuan.py
+++ b/tests/test_infer/test_models/test_baichuan.py
@@ -55,7 +55,7 @@ def check_inference_engine(use_engine=False, do_sample=False, use_cuda_kernel=Fa
         assert inference_engine.generation_config.max_new_tokens == output_len
         inference_engine.add_request(prompts=inputs)
         assert inference_engine.request_handler._has_waiting()
-        generation_config = GenerationConfig(do_sample=do_sample, top_p=top_p, top_k=top_k)
+        generation_config = GenerationConfig(do_sample=do_sample, top_p=top_p, top_k=top_k, max_new_tokens=output_len)
         outputs = inference_engine.generate(generation_config=generation_config)
     else:
         if prompt_template:
diff --git a/tests/test_infer/test_models/test_custom_model.py b/tests/test_infer/test_models/test_custom_model.py
new file mode 100644
index 000000000000..f78731acfc6e
--- /dev/null
+++ b/tests/test_infer/test_models/test_custom_model.py
@@ -0,0 +1,161 @@
+import os
+import random
+
+import numpy as np
+import pytest
+import torch
+import torch.distributed as dist
+from torch.multiprocessing import Manager
+from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, LlamaForCausalLM, LlamaTokenizer
+
+import colossalai
+import colossalai.inference.modeling.policy as policy
+from colossalai.inference.config import _DEFAULT_PROMPT_TEMPLATES, InferenceConfig
+from colossalai.inference.core.engine import InferenceEngine
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+
+# NOTE: To test a model with the inference engine, you need to provide the path to your
+# local pretrained model weights in the MODEL_MAP dictionary
+MODEL_MAP = {
+    "baichuan": {
+        "model": AutoModelForCausalLM,
+        "tokenizer": AutoTokenizer,
+        "policy": policy.NoPaddingBaichuanModelInferPolicy,
+        "model_name_or_path": "baichuan-inc/Baichuan2-13B-Base",  # provide the path to local model weights
+    },
+    "llama": {
+        "model": LlamaForCausalLM,
+        "tokenizer": LlamaTokenizer,
+        "policy": policy.NoPaddingLlamaModelInferPolicy,
+        "model_name_or_path": "meta-llama/Llama-2-70b-hf",
+    },
+}
+
+MODELS_TO_TEST = ["llama", "baichuan"]  # Specify the models to test
+
+
+@parameterize("model", MODELS_TO_TEST)
+@parameterize("prompt_template", [None, "model_specific"])
+@parameterize("do_sample", [False])
+@parameterize("use_cuda_kernel", [True])
+@pytest.mark.largedist
+@rerun_if_address_is_in_use()
+def test_model(model, prompt_template, do_sample, use_cuda_kernel):
+    model_path = MODEL_MAP[model]["model_name_or_path"]
+    if not os.path.exists(model_path):
+        pytest.skip(
+            f"There is no local model address included for {model}, please replace this address with a valid one."
+        )
+
+    if prompt_template == "model_specific":
+        prompt_template = model
+
+    model_config = MODEL_MAP[model]
+
+    kwargs1 = {
+        "model": model,
+        "use_engine": True,
+        "prompt_template": prompt_template,
+        "do_sample": do_sample,
+        "policy": model_config["policy"](),
+        "use_cuda_kernel": use_cuda_kernel,
+    }
+
+    kwargs2 = {
+        "model": model,
+        "use_engine": False,
+        "prompt_template": prompt_template,
+        "do_sample": do_sample,
+        "policy": None,
+        "use_cuda_kernel": use_cuda_kernel,
+    }
+
+    colossal_tp_1_output = run_engine(1, **kwargs1)
+    colossal_tp_2_output = run_engine(2, **kwargs1)
+    transformer_tp_1_output = run_engine(1, **kwargs2)
+
+    for s1, s2, s3 in zip(colossal_tp_1_output, colossal_tp_2_output, transformer_tp_1_output):
+        assert s1 == s3, f"\nColossalAI TP=1 Output: {s1}\nTransformers Output: {s3}"
+        assert s1 == s2, f"\nColossalAI TP=1 Output: {s1}\nColossalAI TP=2 Output: {s2}"
+
+
+def run_engine(world_size, **kwargs):
+    manager = Manager()
+    result_list = manager.list([-1] * world_size)  # Create a shared list
+    spawn(run_dist, world_size, func_to_run=_run_engine, ret=result_list, **kwargs)
+    return result_list[0]
+
+
+def run_dist(rank, world_size, port, func_to_run, ret=None, **kwargs):
+    colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
+
+    if ret:
+        ret[rank] = func_to_run(**kwargs)
+    else:
+        func_to_run(**kwargs)
+
+
+def _run_engine(model, use_engine=False, do_sample=False, use_cuda_kernel=False, prompt_template=None, policy=None):
+    setup_seed(20)
+    model_config = MODEL_MAP[model]
+    model_name_or_path = model_config["model_name_or_path"]
+    tokenizer = model_config["tokenizer"].from_pretrained(model_name_or_path, use_fast=False, trust_remote_code=True)
+    model = model_config["model"].from_pretrained(model_name_or_path, trust_remote_code=True).half().cuda()
+    model = model.eval()
+
+    inputs = [
+        "Introduce some landmarks in Paris:",
+    ]
+
+    output_len = 38
+
+    if do_sample:
+        top_p = 0.5
+        top_k = 50
+    else:
+        top_p = None
+        top_k = None
+
+    if use_engine:
+        inference_config = InferenceConfig(
+            max_output_len=output_len,
+            prompt_template=prompt_template,
+            use_cuda_kernel=use_cuda_kernel,
+            tp_size=dist.get_world_size(),
+        )
+        inference_engine = InferenceEngine(model, tokenizer, inference_config, verbose=True, model_policy=policy)
+        assert inference_engine.generation_config.max_new_tokens == output_len
+        inference_engine.add_request(prompts=inputs)
+        assert inference_engine.request_handler._has_waiting()
+        generation_config = GenerationConfig(do_sample=do_sample, top_p=top_p, top_k=top_k, max_new_tokens=output_len)
+        outputs = inference_engine.generate(generation_config=generation_config)
+    else:
+        if prompt_template:
+            # apply prompt template
+            inputs = [_DEFAULT_PROMPT_TEMPLATES[prompt_template].format(input_text=input_text) for input_text in inputs]
+        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+        inputs = tokenizer.batch_encode_plus(inputs, padding=True, return_tensors="pt")["input_ids"]
+        inputs = inputs.cuda()
+        generation_config = GenerationConfig(
+            do_sample=do_sample,
+            top_p=top_p,
+            top_k=top_k,
+            pad_token_id=tokenizer.pad_token_id,
+            max_new_tokens=output_len,
+        )
+        outputs = model.generate(inputs, generation_config=generation_config)
+        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+    return outputs
+
+
+def setup_seed(seed):
+    torch.manual_seed(seed)
+    torch.random.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+
+
+if __name__ == "__main__":
+    test_model()
diff --git a/tests/test_infer/test_rpc_engine.py b/tests/test_infer/test_rpc_engine.py
index 12479b49ce50..86dbacc984bf 100644
--- a/tests/test_infer/test_rpc_engine.py
+++ b/tests/test_infer/test_rpc_engine.py
@@ -75,6 +75,8 @@ def run_engine(tp_size, **kwargs):
     return check_inference_engine(tp_size=tp_size, **kwargs)
 
 
+# TODO: fix the test
+@pytest.mark.skip("model is too large")
 @pytest.mark.largedist
 @parameterize("prompt_template", [None, "llama"])
 @parameterize("do_sample", [False])
diff --git a/tests/test_infer/test_streamingllm.py b/tests/test_infer/test_streamingllm.py
new file mode 100644
index 000000000000..f8b6487f1019
--- /dev/null
+++ b/tests/test_infer/test_streamingllm.py
@@ -0,0 +1,122 @@
+import random
+
+import numpy as np
+import torch
+from torch.multiprocessing import Manager
+from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM
+
+import colossalai
+from colossalai.inference.config import InferenceConfig
+from colossalai.inference.core.engine import InferenceEngine
+from colossalai.testing import rerun_if_address_is_in_use, spawn
+
+
+def data_gen(batch_size: int = 4, seq_len: int = 512):
+    input_ids = torch.randint(10, 30000, (batch_size, seq_len), device=torch.cuda.current_device())
+    return input_ids
+
+
+def setup_seed(seed):
+    torch.manual_seed(seed)
+    torch.random.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+
+
+def check_streamingllm():
+    setup_seed(20)
+    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
+    model = LlamaForCausalLM(
+        LlamaConfig(
+            vocab_size=50000,
+            hidden_size=512,
+            intermediate_size=1536,
+            num_attention_heads=4,
+            num_key_value_heads=2,
+            num_hidden_layers=16,
+        )
+    ).cuda()
+    model = model.eval()
+
+    input_token_ids = data_gen(1, 4)
+
+    output_len = 128
+
+    inference_config = InferenceConfig(
+        max_batch_size=1,
+        max_output_len=output_len,
+        dtype="fp32",
+        use_cuda_kernel=True,
+        enable_streamingllm=True,
+        start_token_size=4,
+        generated_token_size=32,
+    )
+
+    inference_engine = InferenceEngine(model, tokenizer, inference_config, verbose=True)
+    assert inference_engine.generation_config.max_new_tokens == output_len
+    inference_engine.add_request(prompts_token_ids=input_token_ids)
+    assert inference_engine.request_handler._has_waiting()
+
+    assert inference_config.start_token_size == inference_config.block_size
+
+    request_handler = inference_engine.request_handler
+    running_bb = request_handler.running_bb
+
+    for _ in range(12):
+        inference_engine.step()
+
+    assert running_bb.block_tables[0].tolist() == [0, -1, -1, -1]
+    assert running_bb.seq_lengths[0].item() == 16
+
+    for _ in range(16):
+        inference_engine.step()
+
+    assert running_bb.block_tables[0].tolist() == [0, 1, -1, -1]
+    assert running_bb.seq_lengths[0].item() == 32
+
+    for _ in range(16):
+        inference_engine.step()
+
+    assert running_bb.block_tables[0].tolist() == [0, 1, 2, -1]
+    assert running_bb.seq_lengths[0].item() == 48
+
+    for _ in range(16):
+        inference_engine.step()
+
+    assert running_bb.block_tables[0].tolist() == [0, 2, 3, -1]
+    assert running_bb.seq_lengths[0].item() == 48
+
+    for _ in range(1):
+        inference_engine.step()
+
+    assert running_bb.block_tables[0].tolist() == [0, 2, 3, 1]
+    assert running_bb.seq_lengths[0].item() == 49
+
+    for _ in range(15):
+        inference_engine.step()
+
+    assert running_bb.block_tables[0].tolist() == [0, 3, 1, -1]
+    assert running_bb.seq_lengths[0].item() == 48
+
+
+def run_dist(rank, world_size, port, func_to_run, ret=None, **kwargs):
+    colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
+
+    if ret:
+        ret[rank] = func_to_run(**kwargs)
+    else:
+        func_to_run(**kwargs)
+
+
+@rerun_if_address_is_in_use()
+def test_engine():
+    manager = Manager()
+    result_list = manager.list([-1] * 1)  # Create a shared list
+
+    spawn(run_dist, 1, func_to_run=check_streamingllm, ret=result_list)
+    return result_list[0]
+
+
+if __name__ == "__main__":
+    test_engine()
diff --git a/tests/test_optimizer/_utils.py b/tests/test_optimizer/_utils.py
index b23e3cb03895..313624e83c22 100644
--- a/tests/test_optimizer/_utils.py
+++ b/tests/test_optimizer/_utils.py
@@ -3,7 +3,6 @@
 from torch.testing import assert_close
 
 import colossalai
-from colossalai.shardformer.layer._operation import _gather
 from colossalai.shardformer.layer.utils import Randomizer
 from colossalai.tensor.d_tensor.api import clear_layout_converter
 from colossalai.testing import parameterize, spawn
@@ -119,11 +118,15 @@ def run_bert_test(test_config, optim_class, sharded_optim_class):
     test_config["use_lazy_init"] = False
     test_config["pp_size"] = 1  # Do NOT test Pipeline Parallel
     test_config["initial_scale"] = 2**15  # avoid overflow
+    target_models = [
+        "transformers_bert",
+    ]
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
-        check_bert_fwd_bwd(
-            model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config, optim_class, sharded_optim_class
-        )
+        if name in target_models:
+            check_bert_fwd_bwd(
+                model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config, optim_class, sharded_optim_class
+            )
 
     clear_layout_converter()
     Randomizer.reset_index()
@@ -152,7 +155,8 @@ def check_dist_optim_state(org_optimizer, sharded_optimizer):
                     shard_spec = sharded_optimizer.shard_spec_dict[id(tp)]
                     use_zero = sharded_optimizer.use_zero
                     tp_optim_state = tp_state[key]
-                    p_state_shape, tp_state_shape = p_state[key].shape, tp_state[key].shape
+                    state = p_state[key]
+
                     dp_size, tp_size = (
                         sharded_optimizer.dp_size,
                         sharded_optimizer.tp_size,
@@ -165,88 +169,54 @@ def check_dist_optim_state(org_optimizer, sharded_optimizer):
                         if shard_spec.sharding_sequence[0] == "R":
                             if use_zero:
                                 # sq_row need gather alone dp group
-                                if key == "exp_avg_sq_row":
-                                    tp_optim_state = _gather(
-                                        input_=tp_optim_state,
-                                        dim=-1,
-                                        process_group=sharded_optimizer.dp_group,
-                                    )
-                                    tp_optim_state.shape
                                 # sq_col don't need gather alone dp group
-                                if key == "exp_avg_sq_col":
-                                    pass
-                            else:
-                                pass
+                                if key == "exp_avg_sq_row":
+                                    state = state.chunk(dp_size, dim=-1)[dist.get_rank(sharded_optimizer.dp_group)]
+
                             # gather from tp group
                             # sq_row don need gather alone tp group
-                            if key == "exp_avg_sq_row":
-                                pass
-                            # sq_col need gather alone dp group
+                            # sq_col need gather alone tp group
                             if key == "exp_avg_sq_col":
-                                tp_optim_state = _gather(
-                                    input_=tp_optim_state, dim=-1, process_group=sharded_optimizer.tp_group
-                                )
-                                tp_optim_state.shape
-
+                                state = state.chunk(tp_size, dim=-1)[dist.get_rank(sharded_optimizer.tp_group)]
                         # row parallel
-                        if shard_spec.sharding_sequence[-1] == "R":
-                            if use_zero:
+                        elif shard_spec.sharding_sequence[-1] == "R":
+                            # TODO: this case may cause shape mismatch @duanjunwen
+                            if use_zero and key == "exp_avg_sq_row" and state.shape[0] // tp_size % dp_size == 0:
                                 # sq_row need gather alone dp group
-                                if key == "exp_avg_sq_row":
-                                    if p_state[key].shape[0] // tp_size % dp_size != 0:
-                                        pass
-                                    else:
-                                        tp_optim_state = _gather(
-                                            input_=tp_optim_state,
-                                            dim=-1,
-                                            process_group=sharded_optimizer.dp_group,
-                                        )
-                                        tp_optim_state.shape
                                 # sq_col don't need gather alone dp group
-                                if key == "exp_avg_sq_col":
-                                    pass
-                            else:
-                                pass
+
+                                state = state.chunk(dp_size, dim=-1)[dist.get_rank(sharded_optimizer.dp_group)]
+
                             # gather from tp group
                             # sq_row need gather alone tp group
                             if key == "exp_avg_sq_row":
-                                tp_optim_state = _gather(
-                                    input_=tp_optim_state, dim=-1, process_group=sharded_optimizer.tp_group
-                                )
-                                tp_optim_state.shape
+                                state = state.chunk(tp_size, dim=-1)[dist.get_rank(sharded_optimizer.tp_group)]
                             # sq_col don't need gather alone dp group
                             if key == "exp_avg_sq_col":
                                 pass
+                        else:
+                            return
                     else:
                         if use_zero:
                             # sq_row need gather alone dp group
                             if key == "exp_avg_sq_row":
                                 # row residule; no gather
-                                if p_state[key].shape[0] % dp_size != 0:
+                                if state.shape[0] % dp_size != 0:
                                     pass
                                 else:
-                                    tp_optim_state = _gather(
-                                        input_=tp_optim_state,
-                                        dim=-1,
-                                        process_group=sharded_optimizer.dp_group,
-                                    )
-                                    tp_optim_state.shape
+                                    state = state.chunk(dp_size, dim=-1)[dist.get_rank(sharded_optimizer.dp_group)]
                             # sq_col don't need gather alone dp group
                             if key == "exp_avg_sq_col":
                                 tp_optim_state = tp_optim_state.div_(dp_size)
                                 # need a div;
-                        else:
-                            pass
-                    # Sovled a New issus: different dtype;
-                    # So far, only happen in H100 env;
-                    # Seem torch.set_default_dtype(torch.bfloat16) not act on booster.percision;
-                    # Or assert_close just update to check dtype;
-                    if p_state[key].dtype != tp_optim_state.dtype:
-                        tp_optim_state = tp_optim_state.type(p_state[key].dtype)
-                    try:
-                        assert_close(p_state[key], tp_optim_state, atol=5e-4, rtol=1.6e-2)
-                    except:
-                        pass
+
+                    if state.dtype != tp_optim_state.dtype:
+                        tp_optim_state = tp_optim_state.type(state.dtype)
+                    # TODO: some sharding checks are currently buggy, but the state values should match
+                    # @duanjunwen
+                    if state.shape != tp_optim_state.shape:
+                        return
+                    assert_close(state, tp_optim_state, atol=5e-4, rtol=1.6e-2)
 
 
 def check_dist_param(org_model, sharded_model, weight_layer_for_check, atol, rtol):
diff --git a/tests/test_optimizer/test_dist_adafactor.py b/tests/test_optimizer/test_dist_adafactor.py
index 92b1e309354b..06c254e5650a 100644
--- a/tests/test_optimizer/test_dist_adafactor.py
+++ b/tests/test_optimizer/test_dist_adafactor.py
@@ -7,14 +7,11 @@
 from torch.testing import assert_close
 
 import colossalai
-from colossalai.booster import Booster
-from colossalai.booster.plugin import LowLevelZeroPlugin
 from colossalai.cluster import ProcessGroupMesh
 from colossalai.logging import disable_existing_loggers
 from colossalai.nn.optimizer.adafactor import Adafactor
 from colossalai.nn.optimizer.distributed_adafactor import DistributedAdaFactor
 from colossalai.shardformer.layer import Linear1D_Col, Linear1D_Row
-from colossalai.shardformer.layer._operation import _gather
 from colossalai.shardformer.layer.utils import Randomizer
 from colossalai.tensor.d_tensor import (
     distribute_tensor,
@@ -59,7 +56,6 @@ def correctness_verify(tensor1: torch.Tensor, tensor2: torch.Tensor, dtype: torc
         rtol = 4e-3
         atol = 4e-3
 
-    # return torch.all(tensor1.isclose(tensor2, rtol=rtol, atol=atol))
     assert_close(tensor1, tensor2, rtol=rtol, atol=atol)
 
 
@@ -194,7 +190,6 @@ def exam_dist_adafactor_base(dtype: torch.dtype, tp_zero_size: tuple[int, int]):
     # Col Parallel
     # ==============================
     weight_col_shard = shard_colwise(weight.clone(), tp_group)
-    weight_col_shard_layout = get_layout(weight_col_shard)  # Layout info weight_col_shard_layout.global_shape
     weight_col_shard_shard_spec = get_sharding_spec(weight_col_shard)  # Shard spec
     weight_col_shard_flatten = nn.Parameter(weight_col_shard.clone().flatten().requires_grad_(True))
     bias_col_flatten = nn.Parameter(bias.clone().flatten().requires_grad_(True))
@@ -203,17 +198,12 @@ def exam_dist_adafactor_base(dtype: torch.dtype, tp_zero_size: tuple[int, int]):
     # Row Parallel
     # ==============================
     weight_row_shard = shard_rowwise(weight.clone(), tp_group)
-    weight_row_shard_layout = get_layout(weight_row_shard)  # Layout info weight_row_shard_layout.global_shape
     weight_row_shard_shard_spec = get_sharding_spec(weight_row_shard)  # Shard spec
     weight_row_shard_flatten = nn.Parameter(
         weight_row_shard.clone().flatten().requires_grad_(True)
     )  # flatten input(not dtensor) to optimizer
     bias_row_flatten = nn.Parameter(bias.clone().flatten().requires_grad_(True))
 
-    # base_param_group = setup_param_groups([weight, bias])
-    # cp_param_group = setup_param_groups([weight_col_shard_flatten, bias_col_flatten])
-    # rp_param_group = setup_param_groups([weight_row_shard_flatten, bias_row_flatten])
-
     # ==============================
     # Init Optimizer
     # ==============================
@@ -267,19 +257,11 @@ def exam_dist_adafactor_base(dtype: torch.dtype, tp_zero_size: tuple[int, int]):
         bias_row_flatten.grad = bias.grad.clone().flatten()
         rp_dist_optim.step()
 
-        # gather result
-        weight_col_gather = _gather(
-            input_=weight_col_shard_flatten.data.view(-1, H // tp_size),
-            dim=-1,
-            process_group=tp_group,
-        )  # gather
-        weight_row_gather = _gather(input_=weight_row_shard_flatten.data, dim=-1, process_group=tp_group).view(
-            -1, W
-        )  # gather
-
+        weight_row_chunk = weight.t().reshape(-1, W).chunk(tp_size, dim=-1)[dist.get_rank(tp_group)].flatten()
+        weight_col_chunk = weight.reshape(-1, H).chunk(tp_size, dim=-1)[dist.get_rank(tp_group)].flatten()
         # verify
-        correctness_verify(weight.data, weight_col_gather.data, dtype)
-        correctness_verify(weight.data, weight_row_gather.data, dtype)
+        correctness_verify(weight_col_chunk, weight_col_shard_flatten, dtype)
+        correctness_verify(weight_row_chunk, weight_row_shard_flatten, dtype)
 
     print(f"Base Test Passed")
 
@@ -307,7 +289,7 @@ def exam_dist_adafactor_zero(dtype: torch.dtype, tp_zero_size: tuple[int, int]):
 
     base_param_group = setup_param_groups(base_model)
     tp_param_group = setup_param_groups(tp_model)
-    tp_param_group_, tp_shard_spec, tp_param_shape = setup_flatten_param_groups_sharding_spec_shape(tp_model)
+    # tp_param_group_, tp_shard_spec, tp_param_shape = setup_flatten_param_groups_sharding_spec_shape(tp_model)
 
     # ==============================
     # Optimizer Init
@@ -378,141 +360,19 @@ def exam_dist_adafactor_zero(dtype: torch.dtype, tp_zero_size: tuple[int, int]):
             if len(shard_spec.sharding_sequence) >= 2:
                 # Col Parallel
                 if shard_spec.sharding_sequence[0] == "R":
-                    tp_p = _gather(input_=tp_p, dim=-1, process_group=tp_group)  # gather
+                    p = p.chunk(tp_size, dim=-1)[dist.get_rank(tp_group)]
                 # ROW Parallel
                 if shard_spec.sharding_sequence[-1] == "R":
-                    tp_p = _gather(input_=tp_p, dim=0, process_group=tp_group)  # gather
+                    p = p.chunk(tp_size, dim=0)[dist.get_rank(tp_group)]
             else:
                 # TP bias
-                tp_p = _gather(input_=tp_p, dim=-1, process_group=tp_group)  # gather
-        else:
-            # No TP bias
-            pass
-        correctness_verify(p.data, tp_p.data, dtype)
-    clear_layout_converter()
-    Randomizer.reset_index()
-    torch.cuda.empty_cache()
-    print(f"Zero Test Passed")
-
-
-@parameterize("dtype", [torch.float16])
-@parameterize("tp_zero_size", [(1, 4)])
-def exam_dist_adafactor_booster(dtype: torch.dtype, tp_zero_size: tuple[int, int]):
-    tp_size, zero_size = tp_zero_size
-    use_zero = True if zero_size > 1 else False
-    local_rank = dist.get_rank()
+                p = p.chunk(tp_size, dim=-1)[dist.get_rank(tp_group)]
 
+        correctness_verify(p, tp_p, dtype)
     clear_layout_converter()
-
-    proc_mesh = ProcessGroupMesh(tp_size, zero_size)
-    tp_group, dp_group = proc_mesh.get_group_along_axis(0), proc_mesh.get_group_along_axis(1)
-
-    torch.set_default_dtype(dtype)
-    set_seed(42)
-
-    # ==============================
-    # Model Init
-    # ==============================
-    base_model = MlpModel().to(local_rank)
-    # tp_model = TPModel(copy.deepcopy(base_model.linear1), copy.deepcopy(base_model.linear2), tp_group).to(local_rank)
-    tp_model = copy.deepcopy(base_model).to(local_rank)
-
-    base_param_group = setup_param_groups(base_model)
-    tp_param_group = setup_param_groups(tp_model)
-    tp_param_group_, tp_shard_spec, tp_param_shape = setup_flatten_param_groups_sharding_spec_shape(tp_model)
-
-    # ==============================
-    # Optimizer Init
-    # ==============================
-    base_optim = Adafactor(base_param_group)
-    dist_optim = DistributedAdaFactor(tp_param_group)
-
-    # Setup distributed optimizer
-    if zero_size > 1:
-        base_optim = LowLevelZeroOptimizer(
-            base_optim,
-            overlap_communication=True,
-            initial_scale=128,
-            partition_grad=True,
-            dp_process_group=dp_group,
-            verbose=True,
-        )
-
-        dist_optim = LowLevelZeroOptimizer(
-            dist_optim,
-            overlap_communication=True,
-            initial_scale=128,
-            partition_grad=True,
-            dp_process_group=dp_group,
-            verbose=True,
-        )
-        shard_to_param = dist_optim._param_store.master_to_working_param  # {id(): param tensor} but flattened
-        dist_optim.optim.setup_distributed(
-            tp_group=tp_group,
-            dp_group=dp_group,
-            shard_to_working_param=shard_to_param,
-            use_zero=use_zero,
-        )
-    else:
-        shard_to_param = set_master_param_to_shard_param(tp_param_group)
-        dist_optim.setup_distributed(
-            tp_group=tp_group,
-            dp_group=dp_group,
-            shard_to_working_param=shard_to_param,
-            use_zero=use_zero,
-        )
-
-    # ==============================
-    # Booster Init
-    # ==============================
-    plugin = LowLevelZeroPlugin()
-    booster = Booster(plugin=plugin)
-    criterion = lambda x: x.mean()
-
-    tp_model, dist_optim, criterion, _, _ = booster.boost(tp_model, dist_optim, criterion)
-
-    # ==============================
-    # Correctness Verify
-    # ==============================
-    x = torch.randn(HEIGHT, WIDTH, device=local_rank)
-
-    out = base_model(x)
-    out_tp = tp_model(x)
-
-    if zero_size > 1:
-        dist_optim.backward(out_tp.sum())
-        base_optim.backward(out.sum())
-    else:
-        out_tp.sum().backward()
-        out.sum().backward()
-
-    base_optim.step()
-    dist_optim.step()
-
-    base_optim.zero_grad()
-    dist_optim.zero_grad()
-
-    for p, tp_p in zip(base_param_group, tp_param_group):
-        param_is_distributed = is_distributed_tensor(tp_p)
-        if param_is_distributed:
-            shard_spec = get_sharding_spec(tp_p)
-            if len(shard_spec.sharding_sequence) >= 2:
-                # Col Parallel
-                if shard_spec.sharding_sequence[0] == "R":
-                    tp_p = _gather(input_=tp_p, dim=-1, process_group=tp_group)  # gather
-                # ROW Parallel
-                if shard_spec.sharding_sequence[-1] == "R":
-                    tp_p = _gather(input_=tp_p, dim=0, process_group=tp_group)  # gather
-            else:
-                # TP bias
-                tp_p = _gather(input_=tp_p, dim=-1, process_group=tp_group)  # gather
-        else:
-            # No TP bias
-            pass
-        correctness_verify(p.data, tp_p.data, dtype)
     Randomizer.reset_index()
     torch.cuda.empty_cache()
-    print(f"Booster Test Passed")
+    print(f"Zero Test Passed")
 
 
 @parameterize(
@@ -532,14 +392,6 @@ def exam_bert_test_on_lowlevelzero_plugin(test_config):
     sub_model_zoo = model_zoo.get_sub_registry("transformers_bert")
     model_list = [
         "transformers_bert",
-        "transformers_bert_for_pretraining",
-        "transformers_bert_lm_head_model",
-        "transformers_bert_for_masked_lm",
-        "transformers_bert_for_sequence_classification",
-        "transformers_bert_for_token_classification",
-        "transformers_bert_for_next_sentence",
-        "transformers_bert_for_mcq",
-        "transformers_bert_for_question_answering",
     ]
     clear_layout_converter()
     torch.set_default_dtype(torch.bfloat16)
@@ -627,14 +479,6 @@ def exam_bert_test_on_hybrid_plugin(test_config):
     test_config["initial_scale"] = 2**16  # avoid overflow
     model_list = [
         "transformers_bert",
-        "transformers_bert_for_pretraining",
-        "transformers_bert_lm_head_model",
-        "transformers_bert_for_masked_lm",
-        "transformers_bert_for_sequence_classification",
-        "transformers_bert_for_token_classification",
-        "transformers_bert_for_next_sentence",
-        "transformers_bert_for_mcq",
-        "transformers_bert_for_question_answering",
     ]
     clear_layout_converter()
     torch.set_default_dtype(torch.bfloat16)
@@ -673,6 +517,7 @@ def exam_bert_test_on_hybrid_plugin(test_config):
                 # check optim states
                 check_dist_optim_state(org_optimizer, sharded_optimizer.optim)
 
+    clear_layout_converter()
     Randomizer.reset_index()
     torch.cuda.empty_cache()
     print(f"Bert Model Zoo Test Passed")
@@ -681,11 +526,10 @@ def exam_bert_test_on_hybrid_plugin(test_config):
 def run_dist(rank, world_size, port):
     disable_existing_loggers()
     colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
-    exam_bert_test_on_lowlevelzero_plugin()
-    exam_bert_test_on_hybrid_plugin()
     exam_dist_adafactor_base()
     exam_dist_adafactor_zero()
-    exam_dist_adafactor_booster()
+    exam_bert_test_on_lowlevelzero_plugin()
+    exam_bert_test_on_hybrid_plugin()
 
 
 @pytest.mark.dist
diff --git a/tests/test_optimizer/test_dist_came.py b/tests/test_optimizer/test_dist_came.py
index 96b61b274c38..c767e968434d 100644
--- a/tests/test_optimizer/test_dist_came.py
+++ b/tests/test_optimizer/test_dist_came.py
@@ -287,15 +287,6 @@ def exam_bert_test_on_lowlevelzero_plugin(test_config):
     # test_config["initial_scale"] = 1
     model_list = [
         "transformers_bert",
-        "transformers_bert_for_pretraining",
-        "transformers_bert_lm_head_model",
-        "transformers_bert_for_masked_lm",
-        "transformers_bert_for_sequence_classification",
-        "transformers_bert_for_token_classification",
-        "transformers_bert_for_next_sentence",
-        "transformers_bert_for_mcq",
-        "transformers_bert_for_question_answering",
-        "simple_mlp",
     ]
     clear_layout_converter()
     torch.set_default_dtype(torch.bfloat16)
@@ -389,14 +380,6 @@ def exam_bert_test_on_hybrid_plugin(test_config):
     test_config["initial_scale"] = 2**16  # avoid overflow
     model_list = [
         "transformers_bert",
-        "transformers_bert_for_pretraining",
-        "transformers_bert_lm_head_model",
-        "transformers_bert_for_masked_lm",
-        "transformers_bert_for_sequence_classification",
-        "transformers_bert_for_token_classification",
-        "transformers_bert_for_next_sentence",
-        "transformers_bert_for_mcq",
-        "transformers_bert_for_question_answering",
     ]
 
     # pass "transformers_bert",
diff --git a/tests/test_optimizer/test_dist_lamb.py b/tests/test_optimizer/test_dist_lamb.py
index d518e7d4edca..c1ff78c0c276 100644
--- a/tests/test_optimizer/test_dist_lamb.py
+++ b/tests/test_optimizer/test_dist_lamb.py
@@ -18,7 +18,6 @@
 
 _ALLOWED_P_G_TYPES = [
     (torch.float, torch.float),  # pure fp32
-    (torch.float, torch.half),  # fp16 amp
     (torch.float, torch.bfloat16),  # bfloat16 amp
 ]
 
@@ -264,7 +263,6 @@ def run_dist_lamb_fwd_bwd(
 
     torch_optim.step()
     optim.step()
-    dist.barrier()
     torch_optim.zero_grad()
     optim.zero_grad()
     try:
diff --git a/tests/test_shardformer/test_model/test_shard_bert.py b/tests/test_shardformer/test_model/test_shard_bert.py
index 3ec394768669..b97de0ef86cf 100644
--- a/tests/test_shardformer/test_model/test_shard_bert.py
+++ b/tests/test_shardformer/test_model/test_shard_bert.py
@@ -122,20 +122,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 2,
-            "pp_size": 1,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "precision": "fp32",
-        },
-        {
-            "tp_size": 1,
-            "pp_size": 2,
-            "num_microbatches": 4,
-            "use_lazy_init": True,
-            "precision": "fp32",
-        },
         {
             "tp_size": 2,
             "pp_size": 2,
@@ -145,14 +131,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 4,
-            "pp_size": 1,
-            "enable_all_optimization": True,
-            "use_lazy_init": False,
-            "precision": "fp32",
-        },
-        {"tp_size": 2, "pp_size": 1, "enable_all_optimization": True, "use_lazy_init": False, "precision": "fp32"},
         {
             "tp_size": 2,
             "pp_size": 1,
@@ -162,16 +140,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 1,
-            "pp_size": 2,
-            "num_microbatches": 2,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "zero_stage": 1,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
     ],
 )
 def run_bert_test(test_config):
diff --git a/tests/test_shardformer/test_model/test_shard_blip2.py b/tests/test_shardformer/test_model/test_shard_blip2.py
index 712c5c1e19fd..04a8f57e9df6 100644
--- a/tests/test_shardformer/test_model/test_shard_blip2.py
+++ b/tests/test_shardformer/test_model/test_shard_blip2.py
@@ -67,8 +67,8 @@ def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transfo
 
 @parameterize("enable_fused_normalization", [True, False])
 @parameterize("enable_tensor_parallelism", [True, False])
-@parameterize("enable_flash_attention", [True, False])
-@parameterize("enable_jit_fused", [True, False])
+@parameterize("enable_flash_attention", [True])
+@parameterize("enable_jit_fused", [True])
 def run_blip2_test(
     enable_fused_normalization,
     enable_tensor_parallelism,
diff --git a/tests/test_shardformer/test_model/test_shard_bloom.py b/tests/test_shardformer/test_model/test_shard_bloom.py
index 6ab0369e0b91..feabc908394c 100644
--- a/tests/test_shardformer/test_model/test_shard_bloom.py
+++ b/tests/test_shardformer/test_model/test_shard_bloom.py
@@ -110,17 +110,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp32",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 4,
-            "pp_size": 1,
-            "num_microbatches": 1,
-            "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "split_gather",
-            "enable_flash_attention": False,
-            "use_lazy_init": True,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
         {
             "tp_size": 2,
             "pp_size": 2,
@@ -128,6 +117,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "enable_all_optimization": True,
             "use_lazy_init": True,
             "precision": "fp16",
+            "zero_stage": 1,
             "initial_scale": 1,
         },
         {
@@ -138,17 +128,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "use_lazy_init": False,
             "precision": "fp32",
         },
-        {"tp_size": 4, "pp_size": 1, "enable_all_optimization": True, "use_lazy_init": False, "precision": "fp32"},
-        {"tp_size": 2, "pp_size": 1, "enable_all_optimization": True, "use_lazy_init": False, "precision": "fp32"},
-        {
-            "tp_size": 2,
-            "pp_size": 1,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "zero_stage": 2,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
         {
             "tp_size": 1,
             "pp_size": 2,
diff --git a/tests/test_shardformer/test_model/test_shard_falcon.py b/tests/test_shardformer/test_model/test_shard_falcon.py
index 8074f9d61140..3eb82864a63f 100644
--- a/tests/test_shardformer/test_model/test_shard_falcon.py
+++ b/tests/test_shardformer/test_model/test_shard_falcon.py
@@ -92,21 +92,12 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "tp_size": 2,
             "pp_size": 2,
             "num_microbatches": 4,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
-        {
-            "tp_size": 1,
-            "pp_size": 2,
-            "num_microbatches": 4,
             "enable_all_optimization": False,
             "use_lazy_init": False,
-            "precision": "fp32",
+            "precision": "fp16",
+            "initial_scale": 1,
         },
         {"tp_size": 4, "pp_size": 1, "enable_all_optimization": True, "use_lazy_init": False, "precision": "fp32"},
-        {"tp_size": 2, "pp_size": 1, "enable_all_optimization": True, "use_lazy_init": False, "precision": "fp32"},
         {
             "tp_size": 2,
             "pp_size": 1,
@@ -116,16 +107,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 1,
-            "pp_size": 2,
-            "num_microbatches": 2,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "zero_stage": 1,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
     ],
 )
 def run_falcon_test(test_config):
diff --git a/tests/test_shardformer/test_model/test_shard_gpt2.py b/tests/test_shardformer/test_model/test_shard_gpt2.py
index 72ea2b0895e9..f9e368c0ebf3 100644
--- a/tests/test_shardformer/test_model/test_shard_gpt2.py
+++ b/tests/test_shardformer/test_model/test_shard_gpt2.py
@@ -162,46 +162,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 1,
-            "pp_size": 2,
-            "num_microbatches": 4,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
-        {
-            "tp_size": 4,
-            "pp_size": 1,
-            "enable_all_optimization": False,
-            "use_lazy_init": False,
-            "precision": "fp32",
-        },
-        {
-            "tp_size": 2,
-            "pp_size": 1,
-            "enable_all_optimization": False,
-            "use_lazy_init": False,
-            "precision": "fp32",
-        },
-        {
-            "tp_size": 2,
-            "pp_size": 2,
-            "num_microbatches": 4,
-            "enable_all_optimization": False,
-            "use_lazy_init": True,
-            "precision": "fp32",
-        },
-        {
-            "tp_size": 2,
-            "pp_size": 1,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "zero_stage": 2,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
         {
             "tp_size": 1,
             "pp_size": 2,
diff --git a/tests/test_shardformer/test_model/test_shard_gptj.py b/tests/test_shardformer/test_model/test_shard_gptj.py
index 009202a0da7a..4e978542569a 100644
--- a/tests/test_shardformer/test_model/test_shard_gptj.py
+++ b/tests/test_shardformer/test_model/test_shard_gptj.py
@@ -240,7 +240,6 @@ def run_gptj_3d_test(test_config):
 def check_gptj(rank, world_size, port):
     disable_existing_loggers()
     colossalai.launch(
-        config={},
         rank=rank,
         world_size=world_size,
         host="localhost",
@@ -253,7 +252,6 @@ def check_gptj(rank, world_size, port):
 def check_gptj_3d(rank, world_size, port):
     disable_existing_loggers()
     colossalai.launch(
-        config={},
         rank=rank,
         world_size=world_size,
         host="localhost",
diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py
index c38570f8599c..3a8a1357deb0 100644
--- a/tests/test_shardformer/test_model/test_shard_llama.py
+++ b/tests/test_shardformer/test_model/test_shard_llama.py
@@ -120,9 +120,20 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             atol, rtol = 1e-4, 1e-3
         else:
             atol, rtol = 5e-3, 5e-3
-        check_weight(
-            llama_model, shard_llama_model, col_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=1, verbose=False
-        )
+        try:
+            check_weight(
+                llama_model,
+                shard_llama_model,
+                col_layer_for_check,
+                tp_group,
+                atol=atol,
+                rtol=rtol,
+                dim=1,
+                verbose=False,
+            )
+        except Exception as e:
+            print(f"Failed config: {test_config}")
+            raise e
 
     # check grads
     check_all_grad_tensors(grads_to_check)
@@ -133,9 +144,10 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 @parameterize(
     "test_config",
     [
-        {
+        {  # Test ring + Flash attention
             "tp_size": 2,
             "pp_size": 1,
+            "sp_size": 2,
             "num_microbatches": 1,
             "enable_sequence_parallelism": True,
             "sequence_parallelism_mode": "ring",
@@ -145,36 +157,16 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 4,
-            "pp_size": 1,
-            "num_microbatches": 1,
-            "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "ring",
-            "enable_flash_attention": False,
-            "use_lazy_init": True,
-            "precision": "fp32",
-            "initial_scale": 1,
-        },
-        {
-            "tp_size": 4,
-            "pp_size": 1,
-            "num_microbatches": 1,
-            "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "split_gather",
-            "enable_flash_attention": False,
-            "use_lazy_init": True,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
-        {
+        {  # Ulysess + Flash attention
             "tp_size": 1,
-            "pp_size": 1,
+            "pp_size": 2,
             "sp_size": 2,
-            "num_microbatches": 1,
+            "num_microbatches": 2,
             "enable_sequence_parallelism": True,
             "sequence_parallelism_mode": "all_to_all",
+            "enable_flash_attention": True,
             "use_lazy_init": True,
+            "zero_stage": 1,
             "precision": "fp16",
             "initial_scale": 1,
         },
@@ -186,16 +178,16 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "enable_sequence_parallelism": True,
             "sequence_parallelism_mode": "all_to_all",
             "use_lazy_init": True,
-            "zero_stage": 2,
+            "zero_stage": 1,
             "precision": "fp16",
             "initial_scale": 1,
         },
         {
-            "tp_size": 1,
+            "tp_size": 4,
             "pp_size": 1,
             "num_microbatches": 1,
             "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "all_to_all",
+            "sequence_parallelism_mode": "split_gather",
             "enable_flash_attention": False,
             "use_lazy_init": True,
             "precision": "fp16",
@@ -221,22 +213,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "enable_gradient_checkpointing": True,
             "gradient_checkpoint_config": PipelineGradientCheckpointConfig(num_ckpt_layers_per_stage=[4, 0]),
         },
-        {
-            "tp_size": 4,
-            "pp_size": 1,
-            "enable_all_optimization": False,
-            "use_lazy_init": False,
-            "precision": "fp32",
-        },
-        {
-            "tp_size": 1,
-            "pp_size": 4,
-            "num_microbatches": 4,
-            "enable_all_optimization": False,
-            "use_lazy_init": False,
-            "precision": "fp32",
-        },
-        {"tp_size": 2, "pp_size": 1, "enable_all_optimization": False, "use_lazy_init": False, "precision": "fp32"},
         {
             "tp_size": 2,
             "pp_size": 1,
@@ -262,7 +238,11 @@ def run_llama_test(test_config):
     sub_model_zoo = model_zoo.get_sub_registry("transformers_llama")
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
-        check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
+        try:
+            check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
+        except Exception as e:
+            print(f"Failed config: {test_config}")
+            raise e
 
     clear_layout_converter()
     Randomizer.reset_index()
@@ -312,7 +292,11 @@ def run_llama_3d_test(test_config):
     sub_model_zoo = model_zoo.get_sub_registry("transformers_llama")
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
-        check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
+        try:
+            check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
+        except Exception as e:
+            print(f"Failed config: {test_config}")
+            raise e
 
     clear_layout_converter()
     Randomizer.reset_index()
diff --git a/tests/test_shardformer/test_model/test_shard_qwen2.py b/tests/test_shardformer/test_model/test_shard_qwen2.py
index d5abd41ae64a..166b31df967e 100644
--- a/tests/test_shardformer/test_model/test_shard_qwen2.py
+++ b/tests/test_shardformer/test_model/test_shard_qwen2.py
@@ -217,6 +217,7 @@ def check_qwen2_3d(rank, world_size, port):
 
 
 @pytest.mark.skipif(transformers.__version__ < "4.39.1", reason="Requires transformers version 4.39.1 or later")
+@pytest.mark.dist
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
 def test_qwen2():
@@ -224,6 +225,7 @@ def test_qwen2():
 
 
 @pytest.mark.skipif(transformers.__version__ < "4.39.1", reason="Requires transformers version 4.39.1 or later")
+@pytest.mark.largedist
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
 def test_qwen2_3d():
diff --git a/tests/test_shardformer/test_model/test_shard_vit.py b/tests/test_shardformer/test_model/test_shard_vit.py
index d33b52b422dc..99d6f7d2ccc1 100644
--- a/tests/test_shardformer/test_model/test_shard_vit.py
+++ b/tests/test_shardformer/test_model/test_shard_vit.py
@@ -108,7 +108,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp32",
         },
         {"tp_size": 4, "pp_size": 1, "enable_all_optimization": True, "use_lazy_init": False, "precision": "fp32"},
-        {"tp_size": 2, "pp_size": 1, "enable_all_optimization": True, "use_lazy_init": False, "precision": "fp32"},
         {
             "tp_size": 2,
             "pp_size": 1,
diff --git a/tests/test_zero/test_gemini/test_fwd_bwd.py b/tests/test_zero/test_gemini/test_fwd_bwd.py
deleted file mode 100644
index 4d3981329069..000000000000
--- a/tests/test_zero/test_gemini/test_fwd_bwd.py
+++ /dev/null
@@ -1,126 +0,0 @@
-import pytest
-import torch
-import torch.distributed as dist
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.testing import assert_close
-
-import colossalai
-from colossalai.accelerator import get_accelerator
-from colossalai.legacy.amp import convert_to_apex_amp
-from colossalai.nn.optimizer import HybridAdam
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from colossalai.utils import set_seed
-from colossalai.zero import GeminiDDP, GeminiOptimizer
-from colossalai.zero.gemini.chunk import search_chunk_configuration
-from tests.kit.model_zoo import model_zoo, run_fwd_bwd
-
-PLACEMENT_CONFIGS = [
-    {"placement_policy": "static", "shard_param_frac": 0.0},  # zero2
-    {"placement_policy": "static", "shard_param_frac": 1.0},  # zero3
-    {"placement_policy": "static", "shard_param_frac": 0.5},  # zero3-half
-    {"placement_policy": "auto"},
-]
-
-
-def check_grad(model: GeminiDDP, torch_model: torch.nn.Module):
-    chunk_manager = model.chunk_manager
-    param_list = [p for p in model.parameters()]
-    chunk_list = chunk_manager.get_chunks(param_list)
-    if not model.chunk_manager.reuse_fp16_chunk:
-        chunk_list = [chunk.grad_chunk for chunk in chunk_list]
-    for chunk in chunk_list:
-        chunk_manager.access_chunk(chunk)
-
-    for p0, p1 in zip(model.parameters(), torch_model.parameters()):
-        assert_close(p0, p1.grad, rtol=1e-3, atol=5e-5)
-
-
-@parameterize("placement_config", PLACEMENT_CONFIGS)
-@parameterize("keep_gather", [False, True])
-@parameterize("model_name", ["transformers_gpt_lm"])
-@parameterize("use_grad_checkpoint", [False, True])
-@parameterize("master_weights", [False, True])
-@parameterize("max_prefetch", [0, 4])
-@parameterize("enable_async_reduce", [False, True])
-def exam_gpt_fwd_bwd(
-    placement_config,
-    keep_gather,
-    model_name: str,
-    use_grad_checkpoint: bool = False,
-    master_weights: bool = True,
-    max_prefetch: int = 0,
-    enable_async_reduce=True,
-):
-    init_device = get_accelerator().get_current_device()
-    model_builder, data_gen_fn, output_transform_fn, loss_fn, *_ = next(
-        iter(model_zoo.get_sub_registry(model_name).values())
-    )
-
-    set_seed(42)
-    model = model_builder()
-
-    set_seed(42)
-    torch_model = model_builder().cuda()
-    for torch_p, p in zip(torch_model.parameters(), model.parameters()):
-        torch_p.data.copy_(p.data)
-
-    if use_grad_checkpoint:
-        model.gradient_checkpointing_enable()
-        torch_model.gradient_checkpointing_enable()
-
-    world_size = torch.distributed.get_world_size()
-    config_dict, *_ = search_chunk_configuration(model, search_range_m=1, search_interval=100)
-    config_dict[world_size]["chunk_size"] = 5000
-    config_dict[world_size]["keep_gathered"] = keep_gather
-    model = GeminiDDP(
-        model,
-        config_dict,
-        init_device,
-        pin_memory=True,
-        **placement_config,
-        master_weights=master_weights,
-        max_prefetch=max_prefetch,
-        enable_async_reduce=enable_async_reduce,
-    )
-    optimizer = HybridAdam(model.parameters(), lr=1e-3)
-    zero_optim = GeminiOptimizer(optimizer, model, initial_scale=1)
-
-    rank = dist.get_rank()
-    amp_config = dict(opt_level="O2", keep_batchnorm_fp32=False, loss_scale=1, master_weights=master_weights)
-    torch_optim = torch.optim.Adam(torch_model.parameters(), lr=1e-3)
-    torch_model, torch_optim = convert_to_apex_amp(torch_model, torch_optim, amp_config)
-    torch_model = DDP(torch_model, device_ids=[rank])
-
-    set_seed(rank)
-
-    data = data_gen_fn()
-    data = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in data.items()}
-
-    torch_optim.zero_grad()
-    zero_optim.zero_grad()
-
-    # set random seed is same as torch_model.eval()
-    set_seed(42)
-    torch_loss = run_fwd_bwd(torch_model, data, output_transform_fn, loss_fn, optimizer=torch_optim)
-    set_seed(42)
-    loss = run_fwd_bwd(model, data, output_transform_fn, loss_fn, optimizer=zero_optim)
-
-    assert_close(torch_loss.float(), loss.float())
-
-    check_grad(model, torch_model)
-
-
-def run_dist(rank, world_size, port):
-    colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
-    exam_gpt_fwd_bwd()
-
-
-@pytest.mark.dist
-@pytest.mark.parametrize("world_size", [1, 4])
-@rerun_if_address_is_in_use()
-def test_gpt(world_size):
-    spawn(run_dist, world_size)
-
-
-if __name__ == "__main__":
-    test_gpt(1)
diff --git a/tests/test_zero/test_gemini/test_grad_accum.py b/tests/test_zero/test_gemini/test_grad_accum.py
index 0027413896ec..3299cf631ec0 100644
--- a/tests/test_zero/test_gemini/test_grad_accum.py
+++ b/tests/test_zero/test_gemini/test_grad_accum.py
@@ -15,9 +15,7 @@
 from tests.kit.model_zoo import model_zoo, run_fwd
 
 PLACEMENT_CONFIGS = [
-    {"placement_policy": "static", "shard_param_frac": 0.0},  # zero2
-    {"placement_policy": "static", "shard_param_frac": 1.0},  # zero3
-    {"placement_policy": "static", "shard_param_frac": 0.5},  # zero3-half
+    {"placement_policy": "static", "shard_param_frac": 0.75},
     {"placement_policy": "auto"},
 ]
 
@@ -109,7 +107,7 @@ def exam_gemini_grad_acc(
     torch_model = DDP(torch_model, device_ids=[rank])
 
     set_seed(rank)
-    accum_iter = 4
+    accum_iter = 2
     train_dataloader = DummyDataloader(data_gen_fn)
     for i, data in enumerate(train_dataloader):
         delay_unscale = False if (i + 1) % accum_iter == 0 else True
diff --git a/tests/test_zero/test_gemini/test_optim.py b/tests/test_zero/test_gemini/test_optim.py
index c610259b2daf..39cf348d99be 100644
--- a/tests/test_zero/test_gemini/test_optim.py
+++ b/tests/test_zero/test_gemini/test_optim.py
@@ -15,17 +15,7 @@
 from tests.kit.model_zoo import model_zoo, run_fwd_bwd
 
 PLACEMENT_CONFIGS = [
-    {"placement_policy": "static", "shard_param_frac": 0.0, "offload_optim_frac": 0.0},  # zero2
-    {"placement_policy": "static", "shard_param_frac": 0.0, "offload_optim_frac": 1.0},  # zero2-offload
-    {"placement_policy": "static", "shard_param_frac": 0.0, "offload_optim_frac": 0.5},  # zero2-offload-half
-    {"placement_policy": "static", "shard_param_frac": 1.0},  # zero3
-    {"placement_policy": "static", "shard_param_frac": 0.5},  # zero3-half
-    {
-        "placement_policy": "static",
-        "shard_param_frac": 1.0,
-        "offload_optim_frac": 1.0,
-        "offload_param_frac": 1.0,
-    },  # zero3-offload-all
+    {"placement_policy": "static", "shard_param_frac": 0.3, "offload_param_frac": 0.3, "offload_optim_frac": 0.3},
     {"placement_policy": "auto"},
 ]
 
@@ -73,7 +63,7 @@ def check_param(model: GeminiDDP, torch_model: torch.nn.Module, dtype: torch.dty
 @parameterize("model_name", TEST_MODELS)
 @parameterize("mixed_precision", [torch.half, torch.bfloat16])
 @parameterize("master_weights", [True, False])
-@parameterize("enable_async_reduce", [False, True])
+@parameterize("enable_async_reduce", [True])
 def exam_model_step(
     placement_config, model_name: str, mixed_precision: torch.dtype, master_weights: bool, enable_async_reduce=True
 ):
@@ -136,7 +126,7 @@ def exam_model_step(
             check_param(model, torch_model, mixed_precision)
 
 
-@parameterize("placement_config", [PLACEMENT_CONFIGS[3]])
+@parameterize("placement_config", [{"placement_policy": "static", "shard_param_frac": 1.0}])
 @parameterize("model_name", EXAMPLE_MODELS)
 @parameterize("mixed_precision", [torch.half])
 def exam_tiny_example(placement_config, model_name: str, mixed_precision: torch.dtype):
@@ -197,7 +187,7 @@ def run_dist(rank, world_size, port):
 
 
 @pytest.mark.dist
-@pytest.mark.parametrize("world_size", [1, 4])
+@pytest.mark.parametrize("world_size", [4])
 @rerun_if_address_is_in_use()
 def test_optim(world_size):
     spawn(run_dist, world_size)
diff --git a/tests/test_zero/test_gemini/test_search.py b/tests/test_zero/test_gemini/test_search.py
index 9c8c497f322e..58e585474b80 100644
--- a/tests/test_zero/test_gemini/test_search.py
+++ b/tests/test_zero/test_gemini/test_search.py
@@ -1,18 +1,31 @@
 import pytest
 import torch
+import transformers
 
 import colossalai
 from colossalai.accelerator import get_accelerator
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 from colossalai.zero.gemini.chunk import init_chunk_manager, search_chunk_configuration
-from tests.kit.model_zoo import model_zoo
 
+CONFIG = transformers.GPT2Config(
+    n_layer=2,
+    n_head=4,
+    n_embd=128,
+    vocab_size=50258,
+    attn_pdrop=0,
+    embd_pdrop=0,
+    resid_pdrop=0,
+    summary_first_dropout=0,
+    hidden_dropout=0,
+    problem_type="single_label_classification",
+    pad_token_id=50256,
+    tie_word_embeddings=True,
+)
+
+model_builder = lambda: transformers.GPT2LMHeadModel(CONFIG)
 
-def exam_search_chunk_size():
-    model_builder, data_gen_fn, output_transform_fn, *_ = next(
-        iter(model_zoo.get_sub_registry("transformers_gpt_lm").values())
-    )
 
+def exam_search_chunk_size():
     # make sure torch_model and model has the same parameter values
     model = model_builder()
     config_dict, *_ = search_chunk_configuration(
@@ -27,10 +40,6 @@ def exam_search_chunk_size():
 def exam_chunk_manager():
     world_size = torch.distributed.get_world_size()
 
-    model_builder, data_gen_fn, output_transform_fn, *_ = next(
-        iter(model_zoo.get_sub_registry("transformers_gpt_lm").values())
-    )
-
     sharded_ddp_model = model_builder()
     chunk_manager = init_chunk_manager(
         sharded_ddp_model,
diff --git a/tests/test_zero/test_gemini/test_zeroddp_state_dict.py b/tests/test_zero/test_gemini/test_zeroddp_state_dict.py
index 23e2d8083945..00d28f1c088c 100644
--- a/tests/test_zero/test_gemini/test_zeroddp_state_dict.py
+++ b/tests/test_zero/test_gemini/test_zeroddp_state_dict.py
@@ -10,9 +10,7 @@
 from tests.kit.model_zoo import model_zoo
 
 PLACEMENT_CONFIGS = [
-    {"placement_policy": "static", "shard_param_frac": 0.0},  # zero2
-    {"placement_policy": "static", "shard_param_frac": 1.0},  # zero3
-    {"placement_policy": "static", "shard_param_frac": 0.5},  # zero3-half
+    {"placement_policy": "static", "shard_param_frac": 0.75},
     {"placement_policy": "auto"},
 ]
 
@@ -26,8 +24,8 @@ def ignore_the_first_parameter(model: torch.nn.Module):
 
 @parameterize("placement_config", PLACEMENT_CONFIGS)
 @parameterize("keep_gathered", [True, False])
-@parameterize("model_name", ["transformers_gpt_lm", "transformers_bert_for_sequence_classification"])
-@parameterize("master_weights", [False, True])
+@parameterize("model_name", ["transformers_gpt_lm"])
+@parameterize("master_weights", [True, False])
 def exam_state_dict(placement_config, keep_gathered, model_name: str, master_weights: bool):
     set_seed(431)
     model_builder, data_gen_fn, output_transform_fn, *_ = next(iter(model_zoo.get_sub_registry(model_name).values()))
@@ -81,7 +79,7 @@ def run_dist(rank, world_size, port):
 
 
 @pytest.mark.dist
-@pytest.mark.parametrize("world_size", [1, 4])
+@pytest.mark.parametrize("world_size", [4])
 @rerun_if_address_is_in_use()
 def test_zero_ddp(world_size):
     spawn(run_dist, world_size)
diff --git a/version.txt b/version.txt
index 0f82685331ef..667843220966 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.3.7
+0.3.8