diff --git a/compiler/include/byteir/Dialect/GPU/Passes.h b/compiler/include/byteir/Dialect/GPU/Passes.h index 18fcd7283..916e146fc 100644 --- a/compiler/include/byteir/Dialect/GPU/Passes.h +++ b/compiler/include/byteir/Dialect/GPU/Passes.h @@ -23,7 +23,6 @@ #include "byteir/Dialect/GPU/Transforms/GPUDistributeSharedMemoryCopy.h" #include "byteir/Dialect/GPU/Transforms/GPUDistributeToWarp.h" #include "byteir/Dialect/GPU/Transforms/GPUPackSharedMemoryAlloc.h" -#include "byteir/Dialect/GPU/Transforms/GPUPipelining.h" #include "byteir/Dialect/GPU/Transforms/GPUTensorCoreVectorization.h" #include "byteir/Dialect/GPU/Transforms/GPUVectorToGPU.h" #include "byteir/Dialect/GPU/Transforms/OptimizeVectorTransfer.h" diff --git a/compiler/include/byteir/Dialect/GPU/Passes.td b/compiler/include/byteir/Dialect/GPU/Passes.td index 215e4c2e1..42339a707 100644 --- a/compiler/include/byteir/Dialect/GPU/Passes.td +++ b/compiler/include/byteir/Dialect/GPU/Passes.td @@ -103,20 +103,6 @@ def GPUPackSharedMemoryAlloc : Pass<"gpu-pack-shared-memory-alloc", "func::FuncO let constructor = "mlir::createGPUPackSharedMemoryAllocPass()"; } -//===----------------------------------------------------------------------===// -// GPUPipelining -//===----------------------------------------------------------------------===// -def GPUPipelining : Pass<"gpu-pipelining", "func::FuncOp"> { - let summary = "Pipelining async copy and mma oprations to improve performance."; - let constructor = "mlir::createGPUPipeliningPass()"; - let options = [ - Option<"stages", "stages", "int64_t", /*default=*/"0", "the number of stages for pipelining">, - ]; - let dependentDialects = [ - "NVVM::NVVMDialect", - ]; -} - //===----------------------------------------------------------------------===// // GPUVectorToGPU //===----------------------------------------------------------------------===// diff --git a/compiler/include/byteir/Dialect/GPU/Transforms/GPUPipelining.h b/compiler/include/byteir/Dialect/GPU/Transforms/GPUPipelining.h deleted file mode 100644 index c041e8a6e..000000000 --- a/compiler/include/byteir/Dialect/GPU/Transforms/GPUPipelining.h +++ /dev/null @@ -1,36 +0,0 @@ -//===- GPUPipelining.h ---------------------------------------*--- C++-*-===// -// -// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// - -#ifndef BYTEIR_DIALECT_GPU_TRANSFORMS_GPUPIPELINING_H -#define BYTEIR_DIALECT_GPU_TRANSFORMS_GPUPIPELINING_H - -#include "mlir/Pass/Pass.h" -#include "llvm/ADT/StringRef.h" -#include - -namespace mlir { -namespace func { -class FuncOp; -} // namespace func - -/// Pipelining async copy and mma oprations to improve performance. 
-std::unique_ptr> -createGPUPipeliningPass(int64_t stages = 0); - -} // namespace mlir - -#endif // BYTEIR_DIALECT_GPU_TRANSFORMS_GPUPIPELINING_H \ No newline at end of file diff --git a/compiler/include/byteir/Dialect/MemRef/Transforms/MultiBufferExt.h b/compiler/include/byteir/Dialect/MemRef/Transforms/MultiBufferExt.h deleted file mode 100644 index 532dcee5b..000000000 --- a/compiler/include/byteir/Dialect/MemRef/Transforms/MultiBufferExt.h +++ /dev/null @@ -1,81 +0,0 @@ -//===- RemoveCopy.h -------------------------------------------*--- C++ -*-===// -// -// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// - -#ifndef BYTEIR_DIALECT_MEMREF_TRANSFORMS_MULTIBUFFEREXT_H -#define BYTEIR_DIALECT_MEMREF_TRANSFORMS_MULTIBUFFEREXT_H - -#include "mlir/Support/LogicalResult.h" -#include "llvm/ADT/STLFunctionalExtras.h" - -namespace mlir { -class OpBuilder; -class RewritePatternSet; -class RewriterBase; -class Value; -class ValueRange; - -namespace arith { -class WideIntEmulationConverter; -class NarrowTypeEmulationConverter; -} // namespace arith - -namespace memref { -class AllocOp; -class AllocaOp; -class DeallocOp; - -/// Transformation to do multi-buffering/array expansion to remove dependencies -/// on the temporary allocation between consecutive loop iterations. -/// It returns the new allocation if the original allocation was multi-buffered -/// and returns failure() otherwise. -/// When `skipOverrideAnalysis`, the pass will apply the transformation -/// without checking thwt the buffer is overrided at the beginning of each -/// iteration. This implies that user knows that there is no data carried across -/// loop iterations. Example: -/// ``` -/// %0 = memref.alloc() : memref<4x128xf32> -/// scf.for %iv = %c1 to %c1024 step %c3 { -/// memref.copy %1, %0 : memref<4x128xf32> to memref<4x128xf32> -/// "some_use"(%0) : (memref<4x128xf32>) -> () -/// } -/// ``` -/// into: -/// ``` -/// %0 = memref.alloc() : memref<5x4x128xf32> -/// scf.for %iv = %c1 to %c1024 step %c3 { -/// %s = arith.subi %iv, %c1 : index -/// %d = arith.divsi %s, %c3 : index -/// %i = arith.remsi %d, %c5 : index -/// %sv = memref.subview %0[%i, 0, 0] [1, 4, 128] [1, 1, 1] : -/// memref<5x4x128xf32> to memref<4x128xf32, strided<[128, 1], offset: ?>> -/// memref.copy %1, %sv : memref<4x128xf32> to memref<4x128xf32, strided<...>> -/// "some_use"(%sv) : (memref<4x128xf32, strided<...>) -> () -/// } -/// ``` -template -FailureOr multiBufferExt(RewriterBase &rewriter, - AllocOpType allocOp, unsigned multiplier, - bool skipOverrideAnalysis = false); -/// Call into `multiBuffer` with locally constructed IRRewriter. 
-template -FailureOr multiBufferExt(AllocOpType allocOp, unsigned multiplier, - bool skipOverrideAnalysis = false); - -} // namespace memref -} // namespace mlir - -#endif // BYTEIR_DIALECT_MEMREF_TRANSFORMS_MULTIBUFFEREXT_H \ No newline at end of file diff --git a/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt b/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt index fa17a80e3..e722f7265 100644 --- a/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt +++ b/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt @@ -5,7 +5,6 @@ add_mlir_dialect_library(ByteIRGPUPasses GPUDistributeToWarp.cpp GPUTensorCoreVectorization.cpp GPUPackSharedMemoryAlloc.cpp - GPUPipelining.cpp GPUVectorToGPU.cpp OptimizeVectorTransfer.cpp RemoveTrivialLoops.cpp diff --git a/compiler/lib/Dialect/GPU/Transforms/GPUPipelining.cpp b/compiler/lib/Dialect/GPU/Transforms/GPUPipelining.cpp deleted file mode 100644 index 1be80cf2f..000000000 --- a/compiler/lib/Dialect/GPU/Transforms/GPUPipelining.cpp +++ /dev/null @@ -1,196 +0,0 @@ -//===- GPUPipelining.cpp -------------------------------------*--- C++-*-===// -// -// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// - -#include "byteir/Dialect/GPU/Transforms/GPUPipelining.h" -#include "byteir/Dialect/GPU/Passes.h" -#include "byteir/Dialect/GPU/Transforms/Transforms.h" -#include "byteir/Dialect/GPU/Transforms/Utils.h" -#include "byteir/Dialect/Linalg/Transforms/Transforms.h" -#include "byteir/Dialect/MemRef/Transforms/MultiBufferExt.h" - -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/LLVMIR/NVVMDialect.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/MemRef/Transforms/Transforms.h" -#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" -#include "mlir/Dialect/SCF/Transforms/Patterns.h" -#include "mlir/IR/Builders.h" -#include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "llvm/Support/Debug.h" - -#include "PassDetail.h" - -#define DEBUG_TYPE "gpu-pipelining" - -using namespace mlir; - -namespace { - -/// Helper to recursively add operation dependencies within `block` to `dep` -/// set. 
-static void addDepOps(llvm::SmallDenseSet &dep, Operation *op, - Block *block) { - if (!dep.insert(op).second) - return; - for (Value operand : op->getOperands()) { - Operation *defOp = operand.getDefiningOp(); - if (defOp && defOp->getBlock() == block) - addDepOps(dep, defOp, block); - } -} - -static void -getPipelineStages(scf::ForOp forOp, - std::vector> &ops, - unsigned depth) { - SmallVector copyOps; - forOp.walk([&](linalg::CopyOp copyOp) { - if (hasMarker(copyOp, {getCopyToSharedMemoryAMarker(), - getCopyToSharedMemoryBMarker()})) { - copyOps.push_back(copyOp); - } - }); - - llvm::SmallDenseSet loadDep; - for (linalg::CopyOp copyOp : copyOps) { - addDepOps(loadDep, copyOp, forOp.getBody()); - } - - for (Operation &op : forOp.getBody()->getOperations()) { - if (!loadDep.count(&op) && !isa(op)) - ops.push_back(std::make_pair(&op, depth)); - } - for (Operation &op : forOp.getBody()->getOperations()) { - if (loadDep.count(&op)) - ops.push_back(std::make_pair(&op, 0)); - } -} - -static Operation *replaceLinalgMatmulWithIfOp(RewriterBase &rewriter, - Operation *op, Value pred) { - Location loc = op->getLoc(); - if (!isa(op)) - return op; - auto ifOp = rewriter.create(loc, op->getResultTypes(), pred, true); - // True branch. - op->moveBefore(&ifOp.getThenRegion().front(), - ifOp.getThenRegion().front().begin()); - rewriter.setInsertionPointAfter(op); - if (op->getNumResults() > 0) - rewriter.create(loc, op->getResults()); - return ifOp.getOperation(); -} - -struct GPUPipeliningPass : public GPUPipeliningBase { - GPUPipeliningPass(int64_t stages) : GPUPipeliningBase() { - this->stages = stages; - } - - void runOnOperation() override { - func::FuncOp funcOp = getOperation(); - stages = 3; - // step 1: collect all the alloc operations and do multi-buffering - SmallVector allocas; - // Collect all the alloc operations. - funcOp.walk([&](memref::AllocOp AllocOp) { - if (nvgpu::NVGPUDialect::hasSharedMemoryAddressSpace( - AllocOp.getType()) && - hasMarker(AllocOp, {getAllocSharedMemoryAMarker(), - getAllocSharedMemoryBMarker()})) { - allocas.push_back(AllocOp); - } - }); - assert(allocas.size() == 2 && "Only support 2 allocas for now"); - // Apply multi-buffering to all of them. - for (memref::AllocOp AllocOp : allocas) { - if (failed(memref::multiBufferExt(AllocOp, (unsigned int)stages, true))) { - // Error out and stop if any buffer cannot be multi buffered, as - // future software pipelining transformations will assume this - // happened. - AllocOp.emitOpError("cannot be multi-buffered"); - return signalPassFailure(); - } - } - - // step 2: find linalg.copy ops in scf.for and its dependencies - SmallVector forOps; - // Mark the loop with shared memory copy for pipelining. 
- funcOp.walk([&forOps](scf::ForOp forOp) { forOps.push_back(forOp); }); - - assert(forOps.size() == 1 && "Only support 1 loop in matmul"); - - scf::PipeliningOption options; - unsigned maxDepth = stages; - auto getSchedule = - [maxDepth](scf::ForOp forOp, - std::vector> &schedule) { - getPipelineStages(forOp, schedule, maxDepth); - }; - - // step 3: apply software pipelining - options.getScheduleFn = getSchedule; - options.supportDynamicLoops = false; - options.peelEpilogue = false; - options.predicateFn = replaceLinalgMatmulWithIfOp; - - RewritePatternSet patterns(&getContext()); - scf::populateSCFLoopPipeliningPatterns(patterns, options); - (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns)); - - // step 3: add nvvm commit_group and wait_group - // 3.1 find all the linalg.copy ops which do __byteir_load_matrix_a__ or - // __byteir_load_matrix_b__ - SmallVector copyOps; - funcOp.walk([&](linalg::CopyOp copyOp) { - if (hasMarker(copyOp, {getCopyToSharedMemoryAMarker(), - getCopyToSharedMemoryBMarker()})) { - copyOps.push_back(copyOp); - } - }); - // There is (stages + 1) * 2 copy ops in total - assert(copyOps.size() == (stages + 1) * 2 && - "Wrong linalg copy ops number after pipelining"); - OpBuilder b(funcOp.getContext()); - // As group = stages + 1, we need to add commit_group after every group - for (int64_t g = 0; g < stages + 1; g++) { - Operation *lastCopyInGroup = copyOps[g * 2 + 1]; - // if linalg.copy is inside a scf.if, we need to add commit_group after - // scf.if as we want to generate predicated copy - if (lastCopyInGroup->getParentOfType()) { - lastCopyInGroup = lastCopyInGroup->getParentOfType(); - } - b.setInsertionPointAfter(lastCopyInGroup); - b.create(funcOp.getLoc()); - } - // 3.2 find linalg.matmul and add wait_group before it - SmallVector matmulOps; - funcOp.walk( - [&](linalg::MatmulOp matmulOp) { matmulOps.push_back(matmulOp); }); - assert(matmulOps.size() == 1 && "Only support 1 matmul op in the loop"); - linalg::MatmulOp matmulOp = matmulOps[0]; - b.setInsertionPoint(matmulOp); - // wait first group done, stages - 1 prefetch groups can run in the pipeline - b.create(funcOp.getLoc(), stages - 1); - } -}; - -} // namespace - -std::unique_ptr> -mlir::createGPUPipeliningPass(int64_t stages) { - return std::make_unique(stages); -} \ No newline at end of file diff --git a/compiler/lib/Dialect/MemRef/CMakeLists.txt b/compiler/lib/Dialect/MemRef/CMakeLists.txt index c76cf1281..9304445ce 100644 --- a/compiler/lib/Dialect/MemRef/CMakeLists.txt +++ b/compiler/lib/Dialect/MemRef/CMakeLists.txt @@ -1,7 +1,6 @@ add_mlir_dialect_library(ByteIRMemRefPasses Transforms/ApplyMemRefAffineLayout.cpp Transforms/ExtractAddressComputation.cpp - Transforms/MultiBufferExt.cpp Transforms/RemoveCopy.cpp Transforms/SimplifyLinearizedIndex.cpp Transforms/SimplifyView.cpp diff --git a/compiler/lib/Dialect/MemRef/Transforms/MultiBufferExt.cpp b/compiler/lib/Dialect/MemRef/Transforms/MultiBufferExt.cpp deleted file mode 100644 index 639170d4d..000000000 --- a/compiler/lib/Dialect/MemRef/Transforms/MultiBufferExt.cpp +++ /dev/null @@ -1,283 +0,0 @@ -//===- MultiBufferExt.cpp -----------------------------------------*--- C++ -//-*-===// -// -// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// - -// Some code comes from mlir/lib/Dialect/Memref/Transforms/MultiBuffer.cpp of -// LLVM Project. -// Original license: -//===----------- MultiBuffering.cpp ---------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Arith/Utils/Utils.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/MemRef/Transforms/Passes.h" -#include "mlir/Dialect/MemRef/Transforms/Transforms.h" -#include "mlir/IR/AffineExpr.h" -#include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/Dominance.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/IR/ValueRange.h" -#include "mlir/Interfaces/LoopLikeInterface.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/Debug.h" - -using namespace mlir; - -#define DEBUG_TYPE "memref-multi-buffer-ext" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") -#define DBGSNL() (llvm::dbgs() << "\n") - -/// Return true if the op fully overwrite the given `buffer` value. -static bool overrideBuffer(Operation *op, Value buffer) { - auto memrefCopyOp = dyn_cast(op); - auto linalgCopyOp = dyn_cast(op); - if (memrefCopyOp) - return memrefCopyOp.getTarget() == buffer; - if (linalgCopyOp) - return linalgCopyOp.getDpsInitOperand(0)->get() == buffer; - return false; -} - -/// Replace the uses of `oldOp` with the given `val` and for subview uses -/// propagate the type change. Changing the memref type may require propagating -/// it through subview ops so we cannot just do a replaceAllUse but need to -/// propagate the type change and erase old subview ops. -static void replaceUsesAndPropagateType(RewriterBase &rewriter, - Operation *oldOp, Value val) { - SmallVector opsToDelete; - SmallVector operandsToReplace; - - // Save the operand to replace / delete later (avoid iterator invalidation). - // TODO: can we use an early_inc iterator? - for (OpOperand &use : oldOp->getUses()) { - // Non-subview ops will be replaced by `val`. - auto subviewUse = dyn_cast(use.getOwner()); - if (!subviewUse) { - operandsToReplace.push_back(&use); - continue; - } - - // `subview(old_op)` is replaced by a new `subview(val)`. - OpBuilder::InsertionGuard g(rewriter); - rewriter.setInsertionPoint(subviewUse); - Type newType = memref::SubViewOp::inferRankReducedResultType( - subviewUse.getType().getShape(), cast(val.getType()), - subviewUse.getStaticOffsets(), subviewUse.getStaticSizes(), - subviewUse.getStaticStrides()); - Value newSubview = rewriter.create( - subviewUse->getLoc(), cast(newType), val, - subviewUse.getMixedOffsets(), subviewUse.getMixedSizes(), - subviewUse.getMixedStrides()); - - // Ouch recursion ... is this really necessary? 
- replaceUsesAndPropagateType(rewriter, subviewUse, newSubview); - - opsToDelete.push_back(use.getOwner()); - } - - // Perform late replacement. - // TODO: can we use an early_inc iterator? - for (OpOperand *operand : operandsToReplace) { - Operation *op = operand->getOwner(); - rewriter.startRootUpdate(op); - operand->set(val); - rewriter.finalizeRootUpdate(op); - } - - // Perform late op erasure. - // TODO: can we use an early_inc iterator? - for (Operation *op : opsToDelete) - rewriter.eraseOp(op); -} - -namespace mlir { -namespace memref { - -// Transformation to do multi-buffering/array expansion to remove dependencies -// on the temporary allocation between consecutive loop iterations. -// Returns success if the transformation happened and failure otherwise. -// This is not a pattern as it requires propagating the new memref type to its -// uses and requires updating subview ops. -template -FailureOr -multiBufferExt(RewriterBase &rewriter, AllocOpType allocOp, - unsigned multiBufferingFactor, bool skipOverrideAnalysis) { - LLVM_DEBUG(DBGS() << "Start multibuffering: " << allocOp << "\n"); - DominanceInfo dom(allocOp->getParentOp()); - LoopLikeOpInterface candidateLoop; - for (Operation *user : allocOp->getUsers()) { - auto parentLoop = user->getParentOfType(); - if (!parentLoop) { - if (isa(user)) { - // Allow dealloc outside of any loop. - // TODO: The whole precondition function here is very brittle and will - // need to rethought an isolated into a cleaner analysis. - continue; - } - LLVM_DEBUG(DBGS() << "--no parent loop -> fail\n"); - LLVM_DEBUG(DBGS() << "----due to user: " << *user << "\n"); - return failure(); - } - if (!skipOverrideAnalysis) { - /// Make sure there is no loop-carried dependency on the allocation. - if (!overrideBuffer(user, allocOp.getResult())) { - LLVM_DEBUG(DBGS() << "--Skip user: found loop-carried dependence\n"); - continue; - } - // If this user doesn't dominate all the other users keep looking. - if (llvm::any_of(allocOp->getUsers(), [&](Operation *otherUser) { - return !dom.dominates(user, otherUser); - })) { - LLVM_DEBUG( - DBGS() << "--Skip user: does not dominate all other users\n"); - continue; - } - } else { - if (llvm::any_of(allocOp->getUsers(), [&](Operation *otherUser) { - return !isa(otherUser) && - !parentLoop->isProperAncestor(otherUser); - })) { - LLVM_DEBUG( - DBGS() - << "--Skip user: not all other users are in the parent loop\n"); - continue; - } - } - candidateLoop = parentLoop; - break; - } - - if (!candidateLoop) { - LLVM_DEBUG(DBGS() << "Skip alloc: no candidate loop\n"); - return failure(); - } - - std::optional inductionVar = candidateLoop.getSingleInductionVar(); - std::optional lowerBound = candidateLoop.getSingleLowerBound(); - std::optional singleStep = candidateLoop.getSingleStep(); - if (!inductionVar || !lowerBound || !singleStep || - !llvm::hasSingleElement(candidateLoop.getLoopRegions())) { - LLVM_DEBUG(DBGS() << "Skip alloc: no single iv, lb, step or region\n"); - return failure(); - } - - if (!dom.dominates(allocOp.getOperation(), candidateLoop)) { - LLVM_DEBUG(DBGS() << "Skip alloc: does not dominate candidate loop\n"); - return failure(); - } - - LLVM_DEBUG(DBGS() << "Start multibuffering loop: " << candidateLoop << "\n"); - - // 1. Construct the multi-buffered memref type. 
- ArrayRef originalShape = allocOp.getType().getShape(); - SmallVector multiBufferedShape{multiBufferingFactor}; - llvm::append_range(multiBufferedShape, originalShape); - LLVM_DEBUG(DBGS() << "--original type: " << allocOp.getType() << "\n"); - MemRefType mbMemRefType = MemRefType::Builder(allocOp.getType()) - .setShape(multiBufferedShape) - .setLayout(MemRefLayoutAttrInterface()); - LLVM_DEBUG(DBGS() << "--multi-buffered type: " << mbMemRefType << "\n"); - - // 2. Create the multi-buffered alloc. - Location loc = allocOp->getLoc(); - OpBuilder::InsertionGuard g(rewriter); - rewriter.setInsertionPoint(allocOp); - auto mbAlloc = rewriter.create(loc, mbMemRefType, ValueRange{}, - allocOp->getAttrs()); - LLVM_DEBUG(DBGS() << "--multi-buffered alloc: " << mbAlloc << "\n"); - - // 3. Within the loop, build the modular leading index (i.e. each loop - // iteration %iv accesses slice ((%iv - %lb) / %step) % %mb_factor). - rewriter.setInsertionPointToStart( - &candidateLoop.getLoopRegions().front()->front()); - Value ivVal = *inductionVar; - Value lbVal = getValueOrCreateConstantIndexOp(rewriter, loc, *lowerBound); - Value stepVal = getValueOrCreateConstantIndexOp(rewriter, loc, *singleStep); - AffineExpr iv, lb, step; - bindDims(rewriter.getContext(), iv, lb, step); - Value bufferIndex = affine::makeComposedAffineApply( - rewriter, loc, ((iv - lb).floorDiv(step)) % multiBufferingFactor, - {ivVal, lbVal, stepVal}); - LLVM_DEBUG(DBGS() << "--multi-buffered indexing: " << bufferIndex << "\n"); - - // 4. Build the subview accessing the particular slice, taking modular - // rotation into account. - int64_t mbMemRefTypeRank = mbMemRefType.getRank(); - IntegerAttr zero = rewriter.getIndexAttr(0); - IntegerAttr one = rewriter.getIndexAttr(1); - SmallVector offsets(mbMemRefTypeRank, zero); - SmallVector sizes(mbMemRefTypeRank, one); - SmallVector strides(mbMemRefTypeRank, one); - // Offset is [bufferIndex, 0 ... 0 ]. - offsets.front() = bufferIndex; - // Sizes is [1, original_size_0 ... original_size_n ]. - for (int64_t i = 0, e = originalShape.size(); i != e; ++i) - sizes[1 + i] = rewriter.getIndexAttr(originalShape[i]); - // Strides is [1, 1 ... 1 ]. - auto dstMemref = - cast(memref::SubViewOp::inferRankReducedResultType( - originalShape, mbMemRefType, offsets, sizes, strides)); - Value subview = rewriter.create(loc, dstMemref, mbAlloc, - offsets, sizes, strides); - LLVM_DEBUG(DBGS() << "--multi-buffered slice: " << subview << "\n"); - - // 5. Due to the recursive nature of replaceUsesAndPropagateType , we need to - // handle dealloc uses separately.. - for (OpOperand &use : llvm::make_early_inc_range(allocOp->getUses())) { - auto deallocOp = dyn_cast(use.getOwner()); - if (!deallocOp) - continue; - OpBuilder::InsertionGuard g(rewriter); - rewriter.setInsertionPoint(deallocOp); - auto newDeallocOp = - rewriter.create(deallocOp->getLoc(), mbAlloc); - (void)newDeallocOp; - LLVM_DEBUG(DBGS() << "----Created dealloc: " << newDeallocOp << "\n"); - rewriter.eraseOp(deallocOp); - } - - // 6. RAUW with the particular slice, taking modular rotation into account. - replaceUsesAndPropagateType(rewriter, allocOp, subview); - - // 7. Finally, erase the old allocOp. 
- rewriter.eraseOp(allocOp); - - return mbAlloc; -} - -template -FailureOr multiBufferExt(AllocOpType allocOp, - unsigned multiBufferingFactor, - bool skipOverrideAnalysis) { - IRRewriter rewriter(allocOp->getContext()); - return multiBufferExt(rewriter, allocOp, multiBufferingFactor, - skipOverrideAnalysis); -} - -template FailureOr multiBufferExt(memref::AllocOp, unsigned, - bool); -template FailureOr multiBufferExt(memref::AllocaOp, unsigned, - bool); -} // namespace memref -} // namespace mlir \ No newline at end of file diff --git a/compiler/lib/Pipelines/LinalgMemrefOpt.cpp b/compiler/lib/Pipelines/LinalgMemrefOpt.cpp index 7ca50249d..99c00e253 100644 --- a/compiler/lib/Pipelines/LinalgMemrefOpt.cpp +++ b/compiler/lib/Pipelines/LinalgMemrefOpt.cpp @@ -56,9 +56,7 @@ void addGemmOptPasses(OpPassManager &pm) { anchoredPM.addPass(createCanonicalizerPass()); anchoredPM.addPass(createCSEPass()); anchoredPM.addPass(createCanonicalizerPass()); - // anchoredPM.addPass(createGPUPipeliningPass()); - // anchoredPM.addPass(createCSEPass()); - // anchoredPM.addPass(createCanonicalizerPass()); + anchoredPM.addPass(createGPUDistributeToWarpPass()); anchoredPM.addPass(createRemoveTrivialLoopsPass()); anchoredPM.addPass(createGPUTensorCoreVectorizationPass()); @@ -87,7 +85,6 @@ void addGemmOptPasses(OpPassManager &pm) { pm.addPass(memref::createFoldMemRefAliasOpsPass()); } - // anchoredPM.addPass(createGPUPipeliningPass()); { OpPassManager anchoredPM(func::FuncOp::getOperationName()); anchoredPM.addPass(createGPUPackSharedMemoryAllocPass()); diff --git a/compiler/test/Dialect/GPU/gpu-pipelining.mlir b/compiler/test/Dialect/GPU/gpu-pipelining.mlir deleted file mode 100644 index b2f2475a7..000000000 --- a/compiler/test/Dialect/GPU/gpu-pipelining.mlir +++ /dev/null @@ -1,94 +0,0 @@ -// RUN: byteir-opt -gpu-pipelining="stages=3" -canonicalize --cse --verify-diagnostics %s | FileCheck %s - -#map = affine_map<(d0) -> (d0 * 128)> -module { - func.func private @Unknown0(%arg0: memref<5376x2048xf16>, %arg1: memref<2048x5376xf16>) -> memref<5376x5376xf16> attributes {__byteir_gemm_block_size__ = [64, 2, 1], __byteir_gemm_pipeline_depth__ = 3 : i64, __byteir_gemm_tile_config__ = [128, 128, 32], __byteir_matmul_epilogue_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c2048 = arith.constant 2048 : index - %c32 = arith.constant 32 : index - %alloc = memref.alloc() : memref<5376x5376xf16> - scf.forall (%arg2, %arg3) in (42, 42) { - %alloca = memref.alloca() {__byteir_alloca_accumulator__} : memref<128x128xf16, #gpu.address_space> - %alloca_0 = memref.alloca() {__byteir_alloca_matrix_b__} : memref<32x128xf16, #gpu.address_space> - %alloca_1 = memref.alloca() {__byteir_alloca_matrix_a__} : memref<128x32xf16, #gpu.address_space> - %0 = affine.apply #map(%arg2) - %1 = affine.apply #map(%arg3) - %subview = memref.subview %alloc[%0, %1] [128, 128] [1, 1] : memref<5376x5376xf16> to memref<128x128xf16, strided<[5376, 1], offset: ?>> - linalg.fill ins(%cst : f16) outs(%alloca : memref<128x128xf16, #gpu.address_space>) - scf.for %arg4 = %c0 to %c2048 step %c32 { - %subview_2 = memref.subview %arg0[%0, %arg4] [128, 32] [1, 1] : memref<5376x2048xf16> to memref<128x32xf16, strided<[2048, 1], offset: ?>> - %subview_3 = memref.subview %arg1[%arg4, %1] [32, 128] [1, 1] : memref<2048x5376xf16> to memref<32x128xf16, strided<[5376, 1], offset: ?>> - linalg.copy {__byteir_load_matrix_a__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%subview_2 : 
memref<128x32xf16, strided<[2048, 1], offset: ?>>) outs(%alloca_1 : memref<128x32xf16, #gpu.address_space>) - linalg.copy {__byteir_load_matrix_b__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%subview_3 : memref<32x128xf16, strided<[5376, 1], offset: ?>>) outs(%alloca_0 : memref<32x128xf16, #gpu.address_space>) - linalg.matmul {__byteir_gpu_tile_gemm_0, __byteir_mma__, __byteir_mma_level__ = "Threadblock", __byteir_target__ = "nv_sm_80"} ins(%alloca_1, %alloca_0 : memref<128x32xf16, #gpu.address_space>, memref<32x128xf16, #gpu.address_space>) outs(%alloca : memref<128x128xf16, #gpu.address_space>) - } - linalg.copy {__byteir_store_matrix_c__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%alloca : memref<128x128xf16, #gpu.address_space>) outs(%subview : memref<128x128xf16, strided<[5376, 1], offset: ?>>) - } {mapping = [#gpu.block, #gpu.block]} - return %alloc : memref<5376x5376xf16> - } -} - -// CHECK-LABEL: scf.forall (%arg2, %arg3) in (42, 42) { - -// init: -// CHECK: %[[ALLOCA:.*]] = memref.alloca() {__byteir_alloca_accumulator__} : memref<128x128xf16, #gpu.address_space> -// CHECK: %[[ALLOCA0:.*]] = memref.alloca() {__byteir_alloca_matrix_b__} : memref<3x32x128xf16, #gpu.address_space -// CHECK: %[[ALLOCA1:.*]] = memref.alloca() {__byteir_alloca_matrix_a__} : memref<3x128x32xf16, #gpu.address_space -// CHECK: %[[IDX0:.*]] = affine.apply #map(%{{.*}}) -// CHECK: %[[IDX1:.*]] = affine.apply #map(%{{.*}}) -// CHECK: %[[SUBVIEW:.*]] = memref.subview %[[ALLOC:.*]][%[[IDX0]], %[[IDX1]]] [128, 128] [1, 1] : memref<5376x5376xf16> to memref<128x128xf16, strided<[5376, 1], offset: ?>> -// CHECK: linalg.fill ins(%[[CST:.*]] : f16) outs(%[[ALLOCA]] : memref<128x128xf16, #gpu.address_space>) - -// prelogue0: -// CHECK: %[[SUBVIEW2:.*]] = memref.subview %[[ALLOCA1]][0, 0, 0] [1, 128, 32] [1, 1, 1] : memref<3x128x32xf16, #gpu.address_space> to memref<128x32xf16, strided<[32, 1]>, #gpu.address_space -// CHECK: %[[CAST2:.*]] = memref.cast %[[SUBVIEW2]] : memref<128x32xf16, strided<[32, 1]>, #gpu.address_space> to memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space -// CHECK: %[[SUBVIEW3:.*]] = memref.subview %[[ALLOCA0]][0, 0, 0] [1, 32, 128] [1, 1, 1] : memref<3x32x128xf16, #gpu.address_space> to memref<32x128xf16, strided<[128, 1]>, #gpu.address_space -// CHECK: %[[CAST3:.*]] = memref.cast %[[SUBVIEW3]] : memref<32x128xf16, strided<[128, 1]>, #gpu.address_space> to memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space -// CHECK: %[[SUBVIEW5:.*]] = memref.subview %arg0[%[[IDX0]], 0] [128, 32] [1, 1] : memref<5376x2048xf16> to memref<128x32xf16, strided<[2048, 1], offset: ?> -// CHECK: %[[SUBVIEW6:.*]] = memref.subview %arg1[0, %[[IDX1]]] [32, 128] [1, 1] : memref<2048x5376xf16> to memref<32x128xf16, strided<[5376, 1], offset: ?> -// CHECK: linalg.copy {__byteir_load_matrix_a__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[SUBVIEW5]] : memref<128x32xf16, strided<[2048, 1], offset: ?>>) outs(%[[SUBVIEW2]] : memref<128x32xf16, strided<[32, 1]>, #gpu.address_space>) -// CHECK: linalg.copy {__byteir_load_matrix_b__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[SUBVIEW6]] : memref<32x128xf16, strided<[5376, 1], offset: ?>>) outs(%[[SUBVIEW3]] : memref<32x128xf16, strided<[128, 1]>, #gpu.address_space>) -// CHECK: nvvm.cp.async.commit.group - -// prelogue1: -// CHECK: %[[SUBVIEW7:.*]] = memref.subview %[[ALLOCA1]][1, 0, 0] 
[1, 128, 32] [1, 1, 1] : memref<3x128x32xf16, #gpu.address_space> to memref<128x32xf16, strided<[32, 1], offset: 4096>, #gpu.address_space -// CHECK: %[[CAST4:.*]] = memref.cast %[[SUBVIEW7]] : memref<128x32xf16, strided<[32, 1], offset: 4096>, #gpu.address_space> to memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space -// CHECK: %[[SUBVIEW9:.*]] = memref.subview %[[ALLOCA0]][1, 0, 0] [1, 32, 128] [1, 1, 1] : memref<3x32x128xf16, #gpu.address_space> to memref<32x128xf16, strided<[128, 1], offset: 4096>, #gpu.address_space -// CHECK: %[[CAST5:.*]] = memref.cast %[[SUBVIEW9]] : memref<32x128xf16, strided<[128, 1], offset: 4096>, #gpu.address_space> to memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space -// CHECK: %[[SUBVIEW11:.*]] = memref.subview %arg0[%[[IDX0]], 32] [128, 32] [1, 1] : memref<5376x2048xf16> to memref<128x32xf16, strided<[2048, 1], offset: ?> -// CHECK: %[[SUBVIEW12:.*]] = memref.subview %arg1[32, %[[IDX1]]] [32, 128] [1, 1] : memref<2048x5376xf16> to memref<32x128xf16, strided<[5376, 1], offset: ?> -// CHECK: linalg.copy {__byteir_load_matrix_a__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[SUBVIEW11]] : memref<128x32xf16, strided<[2048, 1], offset: ?>>) outs(%[[SUBVIEW7]] : memref<128x32xf16, strided<[32, 1], offset: 4096>, #gpu.address_space>) -// CHECK: linalg.copy {__byteir_load_matrix_b__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[SUBVIEW12]] : memref<32x128xf16, strided<[5376, 1], offset: ?>>) outs(%[[SUBVIEW9]] : memref<32x128xf16, strided<[128, 1], offset: 4096>, #gpu.address_space>) -// CHECK: nvvm.cp.async.commit.group - -// prelogue2: -// CHECK: %[[SUBVIEW13:.*]] = memref.subview %[[ALLOCA1]][2, 0, 0] [1, 128, 32] [1, 1, 1] : memref<3x128x32xf16, #gpu.address_space> to memref<128x32xf16, strided<[32, 1], offset: 8192>, #gpu.address_space -// CHECK: %[[CAST6:.*]] = memref.cast %[[SUBVIEW13]] : memref<128x32xf16, strided<[32, 1], offset: 8192>, #gpu.address_space> to memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space -// CHECK: %[[SUBVIEW15:.*]] = memref.subview %[[ALLOCA0]][2, 0, 0] [1, 32, 128] [1, 1, 1] : memref<3x32x128xf16, #gpu.address_space> to memref<32x128xf16, strided<[128, 1], offset: 8192>, #gpu.address_space -// CHECK: %[[CAST7:.*]] = memref.cast %[[SUBVIEW15]] : memref<32x128xf16, strided<[128, 1], offset: 8192>, #gpu.address_space> to memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space -// CHECK: %[[SUBVIEW17:.*]] = memref.subview %arg0[%[[IDX0]], 64] [128, 32] [1, 1] : memref<5376x2048xf16> to memref<128x32xf16, strided<[2048, 1], offset: ?> -// CHECK: %[[SUBVIEW18:.*]] = memref.subview %arg1[64, %[[IDX1]]] [32, 128] [1, 1] : memref<2048x5376xf16> to memref<32x128xf16, strided<[5376, 1], offset: ?> -// CHECK: linalg.copy {__byteir_load_matrix_a__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[SUBVIEW17]] : memref<128x32xf16, strided<[2048, 1], offset: ?>>) outs(%[[SUBVIEW13]] : memref<128x32xf16, strided<[32, 1], offset: 8192>, #gpu.address_space>) -// CHECK: linalg.copy {__byteir_load_matrix_b__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[SUBVIEW18]] : memref<32x128xf16, strided<[5376, 1], offset: ?>>) outs(%[[SUBVIEW15]] : memref<32x128xf16, strided<[128, 1], offset: 8192>, #gpu.address_space>) -// CHECK: nvvm.cp.async.commit.group - -// kernel: -// CHECK: %[[CAST:.*]] = scf.for %arg4 = %c0 to %c2048 step %c32 
iter_args(%arg5 = %[[CAST2]], %arg6 = %[[CAST4]], %arg7 = %[[CAST6]], %arg8 = %[[CAST3]], %arg9 = %[[CAST5]], %arg10 = %[[CAST7]]) -> (memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space>, memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space>, memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space>, memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space>, memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space>, memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space>) { -// CHECK: nvvm.cp.async.wait.group 2 -// CHECK: linalg.matmul {__byteir_gpu_tile_gemm_0, __byteir_mma__, __byteir_mma_level__ = "Threadblock", __byteir_target__ = "nv_sm_80"} ins(%arg5, %arg8 : memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space>, memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space>) outs(%[[ALLOCA]] : memref<128x128xf16, #gpu.address_space>) - -// CHECK: %[[IDX5:.*]] = affine.apply #map1(%[[IDX4:.*]]) -// CHECK: %[[SUBVIEW19:.*]] = memref.subview %[[ALLOCA1]][%[[IDX5]], 0, 0] [1, 128, 32] [1, 1, 1] : memref<3x128x32xf16, #gpu.address_space> to memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space -// CHECK: %[[SUBVIEW20:.*]] = memref.subview %[[ALLOCA0]][%[[IDX5]], 0, 0] [1, 32, 128] [1, 1, 1] : memref<3x32x128xf16, #gpu.address_space> to memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space -// CHECK: %[[SUBVIEW21:.*]] = memref.subview %arg0[%[[IDX0]], %[[IDX8:.*]]] [128, 32] [1, 1] : memref<5376x2048xf16> to memref<128x32xf16, strided<[2048, 1], offset: ?> -// CHECK: %[[SUBVIEW22:.*]] = memref.subview %arg1[%[[IDX9:.*]], %[[IDX1]]] [32, 128] [1, 1] : memref<2048x5376xf16> to memref<32x128xf16, strided<[5376, 1], offset: ?> -// CHECK: scf.if %[[CMP:.*]] { -// CHECK: linalg.copy {__byteir_load_matrix_a__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[SUBVIEW21]] : memref<128x32xf16, strided<[2048, 1], offset: ?>>) outs(%[[SUBVIEW19]] : memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space>) -// CHECK: linalg.copy {__byteir_load_matrix_b__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[SUBVIEW22]] : memref<32x128xf16, strided<[5376, 1], offset: ?>>) outs(%[[SUBVIEW20]] : memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space>) -// CHECK: nvvm.cp.async.commit.group -// CHECK: scf.yield %arg6, %arg7, %[[SUBVIEW19]], %arg9, %arg10, %[[SUBVIEW20]] : memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space>, memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space>, memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space>, memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space>, memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space>, memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space -// CHECK: } - -// copy back to global memory: -// CHECK: linalg.copy {__byteir_store_matrix_c__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[ALLOCA]] : memref<128x128xf16, #gpu.address_space>) outs(%[[SUBVIEW]] : memref<128x128xf16, strided<[5376, 1], offset: ?>>)
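
For readers tracking what this change removes: the heart of the deleted GPUPipelining.cpp is the schedule callback handed to scf::populateSCFLoopPipeliningPatterns, which puts the A/B shared-memory copies (and every op they depend on inside the loop body) in stage 0 and the rest of the body (the threadblock matmul and its epilogue) `stages` iterations later. The sketch below is a condensed, non-authoritative restatement of that deleted code; hasMarker, getCopyToSharedMemoryAMarker/BMarker and replaceLinalgMatmulWithIfOp are the ByteIR helpers used in the file above and are assumed to be in scope, and pipelineMainLoop is just a hypothetical name for the driver step.

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/DenseSet.h"

using namespace mlir;

// Recursively collect, within `block`, every op that `op` depends on.
static void addDepOps(llvm::SmallDenseSet<Operation *> &dep, Operation *op,
                      Block *block) {
  if (!dep.insert(op).second)
    return;
  for (Value operand : op->getOperands())
    if (Operation *def = operand.getDefiningOp())
      if (def->getBlock() == block)
        addDepOps(dep, def, block);
}

// Stage assignment: the marked shared-memory copies and their backward slice
// go to stage 0, everything else to stage `depth`, so the cp.async copies for
// iteration i + depth overlap with the mma compute of iteration i.
static void getPipelineStages(
    scf::ForOp forOp, std::vector<std::pair<Operation *, unsigned>> &ops,
    unsigned depth) {
  llvm::SmallDenseSet<Operation *> loadDep;
  forOp.walk([&](linalg::CopyOp copyOp) {
    if (hasMarker(copyOp, {getCopyToSharedMemoryAMarker(),
                           getCopyToSharedMemoryBMarker()}))
      addDepOps(loadDep, copyOp, forOp.getBody());
  });
  for (Operation &op : forOp.getBody()->getOperations())
    if (!loadDep.count(&op) && !isa<scf::YieldOp>(op))
      ops.emplace_back(&op, depth);
  for (Operation &op : forOp.getBody()->getOperations())
    if (loadDep.count(&op))
      ops.emplace_back(&op, 0);
}

// Wire the schedule into the upstream SCF software pipeliner.
static LogicalResult pipelineMainLoop(func::FuncOp funcOp, unsigned stages) {
  scf::PipeliningOption options;
  options.getScheduleFn =
      [stages](scf::ForOp forOp,
               std::vector<std::pair<Operation *, unsigned>> &schedule) {
        getPipelineStages(forOp, schedule, /*depth=*/stages);
      };
  options.supportDynamicLoops = false;
  options.peelEpilogue = false; // epilogue iterations are predicated instead,
  // using the predicateFn defined in the deleted pass above.
  options.predicateFn = replaceLinalgMatmulWithIfOp;
  RewritePatternSet patterns(funcOp.getContext());
  scf::populateSCFLoopPipeliningPatterns(patterns, options);
  return applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
}

Around this core, the deleted pass first multi-buffered the two shared-memory allocs by a factor of `stages` (via the removed memref::multiBufferExt), then inserted nvvm.cp.async.commit.group after each A/B copy pair and nvvm.cp.async.wait.group(stages - 1) before the matmul. That is why the deleted gpu-pipelining.mlir test above expects 3x-sized shared-memory buffers, three prologue copy groups, and nvvm.cp.async.wait.group 2 in the kernel for stages = 3.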