[shardformer]: support gpt-j, falcon and add interleaved pipeline for bert #5088

Merged
merged 27 commits into from
Nov 28, 2023
Changes from all commits
Commits
27 commits
7e81ba2
[shardformer] GPT-J policy dev 5 Sep
ppt0011 Sep 5, 2023
0204ef3
[shardformer] implement shard policy for base gpt-j model
ppt0011 Sep 6, 2023
472faa5
[shardformer] implement shard policy for base gpt-j model 06 Sep
ppt0011 Sep 6, 2023
a879ccb
[shardformer] implement policy for all GPT-J models and test
ppt0011 Sep 7, 2023
d0e6939
[shardformer] test GPT-J sharding policy
ppt0011 Sep 8, 2023
1f237cc
[shardformer] finished testing pp gpt-j sharding
ppt0011 Sep 9, 2023
c013a0d
[shardformer] clean up for pr
ppt0011 Sep 11, 2023
ec9cba2
[shardformer] support all GPT-J shard former techniques except lazy init
ppt0011 Sep 11, 2023
dde39a4
[shardformer] support interleaved pipeline parallel for bert finetune…
ppt0011 Sep 26, 2023
eb496e0
[shardformer] generlize interleave solution for non bert model 26 Sep
ppt0011 Sep 26, 2023
f631c3e
[shardformer] interleaved pipeline parallel for bert fine tune example
ppt0011 Sep 27, 2023
ad33868
[shardformer] refactor interleave implementation so that replaced for…
ppt0011 Sep 29, 2023
de53d0e
[shardformer] move layer attr to stage manager, and style changes
ppt0011 Oct 11, 2023
341effc
[shardformer] sync gptj config with hf due to flash attn head dim req…
ppt0011 Oct 12, 2023
a0b7877
Merge pull request #4825 from ppt0011/feature/shardformer
ppt0011 Oct 13, 2023
29ebe11
[shardformer] shardformer support falcon (#4883)
flybird11111 Oct 17, 2023
013fb01
[shardformer] increase micro batch size due to convergence issue
ppt0011 Oct 31, 2023
53fe53c
increase ci timeout time after discussion
ppt0011 Nov 1, 2023
0995bba
Merge pull request #4834 from ppt0011/bert-finetune
ppt0011 Nov 2, 2023
ae187e0
[shardformer]: fix interleaved pipeline for bert model (#5048)
CWHer Nov 17, 2023
9d5e04d
[hotfix]: disable seq parallel for gptj and falcon, and polish code (…
CWHer Nov 22, 2023
2e04af1
Add Mistral support for Shardformer (#5103)
eric8607242 Nov 24, 2023
c8420cd
[shardformer] add tests to mistral (#5105)
flybird11111 Nov 26, 2023
2cca8b7
[hotfix]: polish code and use packaging.version (#5112)
CWHer Nov 27, 2023
08e868e
[hotfix]: remove unused code and deprecate colo_init_context (#5117)
CWHer Nov 28, 2023
694adb3
[hotfix]: add memory utils to avoid import legacy module (#5123)
CWHer Nov 28, 2023
a12418c
[hotfix]: add NPU TODO mark (#5124)
CWHer Nov 28, 2023
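
The commits above add an interleaved ("virtual") pipeline schedule alongside the existing 1F1B schedule, plus shard policies for GPT-J, Falcon, and Mistral. As a purely illustrative sketch of what num_model_chunks controls in an interleaved schedule (this is not code from the PR, and the function name and layout are assumptions), one common round-robin layout assigns each pipeline stage several non-contiguous groups of layers:

# Illustrative only: a round-robin chunk layout often used for interleaved
# pipeline parallelism; not ColossalAI's actual implementation.
def interleaved_layer_groups(num_layers: int, pp_size: int, num_model_chunks: int, stage: int) -> list:
    """Return the layer index ranges owned by `stage` when the model is split
    into pp_size * num_model_chunks chunks and dealt out round-robin."""
    assert num_layers % (pp_size * num_model_chunks) == 0, "sketch assumes an even split"
    layers_per_chunk = num_layers // (pp_size * num_model_chunks)
    groups = []
    for chunk in range(num_model_chunks):
        start = (chunk * pp_size + stage) * layers_per_chunk
        groups.append(range(start, start + layers_per_chunk))
    return groups

# 16 layers, 4 pipeline stages, 2 model chunks:
# stage 0 owns layers 0-1 (chunk 0) and layers 8-9 (chunk 1).
print(interleaved_layer_groups(16, pp_size=4, num_model_chunks=2, stage=0))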
2 changes: 1 addition & 1 deletion .github/workflows/example_check_on_pr.yml
@@ -79,7 +79,7 @@ jobs:
    container:
      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
      options: --gpus all --rm -v /data/scratch/examples-data:/data/
-   timeout-minutes: 10
+   timeout-minutes: 20
    concurrency:
      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-example-${{ matrix.directory }}
      cancel-in-progress: true
33 changes: 29 additions & 4 deletions colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -20,7 +20,7 @@
from colossalai.checkpoint_io import CheckpointIO, HybridParallelCheckpointIO
from colossalai.cluster import ProcessGroupMesh
from colossalai.interface import ModelWrapper, OptimizerWrapper
-from colossalai.pipeline.schedule import OneForwardOneBackwardSchedule
+from colossalai.pipeline.schedule import InterleavedSchedule, OneForwardOneBackwardSchedule
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer import ShardConfig, ShardFormer
from colossalai.shardformer.policies.base_policy import Policy
@@ -317,6 +317,8 @@ class HybridParallelPlugin(PipelinePluginBase):
        communication_dtype (torch.dtype, optional): Communication dtype when using ZeRO. If not specified, the dtype of param will be used. Defaults to None.
        overlap_communication (bool, optional): Whether to overlap communication and computation when using ZeRO. Defaults to True.
        custom_policy (Policy, optional): Custom policy for Shardformer. Defaults to None.
+       pp_style (str, optional): The style for pipeline parallelism. Defaults to '1f1b'.
+       num_model_chunks (int, optional): The number of model chunks for interleaved pipeline parallelism. Defaults to 1.
    """

    def __init__(
@@ -352,6 +354,8 @@ def __init__(
        communication_dtype: Optional[torch.dtype] = None,
        overlap_communication: bool = True,
        custom_policy: Policy = None,
+       pp_style: str = "1f1b",
+       num_model_chunks: int = 1,
    ) -> None:
        super().__init__()
        assert (
@@ -378,17 +382,38 @@ def __init__(
        self.custom_policy = custom_policy
        assert zero_stage in (0, 1, 2)
        if self.pp_size > 1:
+           assert pp_style in ["1f1b", "interleaved"], "Unsupported pipeline parallelism style"
+           assert pp_style == "interleaved" or num_model_chunks == 1, "num_model_chunks must be 1 when using 1f1b"
            assert (
                num_microbatches is not None or microbatch_size is not None
            ), "num_microbatches or microbatch_size must be specified when using pipeline parallelism"
            assert self.zero_stage <= 1, "zero stage must be 0 or 1 when using pipeline parallelism"
-           self.stage_manager = PipelineStageManager(self.pg_mesh, PP_AXIS)
-           self.schedule = OneForwardOneBackwardSchedule(
-               self.stage_manager, num_microbatches=num_microbatches, microbatch_size=microbatch_size
+           self.stage_manager = PipelineStageManager(
+               self.pg_mesh,
+               pipeline_axis=PP_AXIS,
+               enable_interleave=pp_style == "interleaved",
+               num_model_chunks=num_model_chunks,
            )

+           if pp_style == "interleaved":
+               assert num_model_chunks > 1, "number of model chunks must be > 1 when using interleaved"
+               self.schedule = InterleavedSchedule(
+                   stage_manager=self.stage_manager,
+                   num_model_chunks=num_model_chunks,
+                   num_microbatch=num_microbatches,
+                   microbatch_size=microbatch_size,
+               )
+           elif pp_style == "1f1b":
+               self.schedule = OneForwardOneBackwardSchedule(
+                   self.stage_manager, num_microbatches=num_microbatches, microbatch_size=microbatch_size
+               )
+           else:
+               raise NotImplementedError()

        self.tp_group = self.pg_mesh.get_group_along_axis(TP_AXIS)
        self.dp_group = self.pg_mesh.get_group_along_axis(DP_AXIS)
        self.pp_group = self.pg_mesh.get_group_along_axis(PP_AXIS)

        self.shard_config = ShardConfig(
            tensor_parallel_process_group=self.tp_group,
            pipeline_stage_manager=self.stage_manager,
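
Taken together, these changes let the plugin build either schedule. A minimal usage sketch of the new arguments follows; the tp_size/pp_size values, the Booster wiring, and the commented boost() call are illustrative assumptions and not part of this diff:

# Hypothetical configuration showing the new pp_style / num_model_chunks knobs.
from colossalai.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin

plugin = HybridParallelPlugin(
    tp_size=2,                 # tensor parallel degree (pre-existing argument)
    pp_size=2,                 # pipeline parallel degree (pre-existing argument)
    pp_style="interleaved",    # new in this PR; "1f1b" remains the default
    num_model_chunks=2,        # must be > 1 when pp_style == "interleaved"
    num_microbatches=4,        # required whenever pp_size > 1
    zero_stage=1,              # pipeline parallelism allows zero stage 0 or 1 only
)
booster = Booster(plugin=plugin)
# model, optimizer, criterion, dataloader, lr_scheduler = booster.boost(
#     model, optimizer, criterion=criterion, dataloader=dataloader, lr_scheduler=lr_scheduler
# )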
3 changes: 3 additions & 0 deletions colossalai/legacy/zero/gemini/__init__.py
@@ -1,3 +1,4 @@
+from .colo_init_context import ColoInitContext, post_process_colo_init_ctx
from .ophooks import BaseOpHook, register_ophooks_recursively
from .stateful_tensor import StatefulTensor
from .stateful_tensor_mgr import StatefulTensorMgr
@@ -11,4 +12,6 @@
    "AutoTensorPlacementPolicy",
    "register_ophooks_recursively",
    "BaseOpHook",
+   "ColoInitContext",
+   "post_process_colo_init_ctx",
]
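
This re-export keeps the now-deprecated ColoInitContext importable from its legacy location. A minimal sketch of the updated import path; the context-manager usage is an assumption carried over from older ColossalAI examples rather than something shown in this diff:

# Assumed usage: ColoInitContext as a context manager, per earlier ColossalAI
# examples; this PR only moves and deprecates it, so prefer the non-legacy
# initialization path in new code.
from colossalai.legacy.zero.gemini import ColoInitContext, post_process_colo_init_ctx

with ColoInitContext():
    pass  # construct the model here so its parameters are created under the context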