diff --git a/compiler/include/byteir/Dialect/GPU/Passes.h b/compiler/include/byteir/Dialect/GPU/Passes.h index 18fcd7283..916e146fc 100644 --- a/compiler/include/byteir/Dialect/GPU/Passes.h +++ b/compiler/include/byteir/Dialect/GPU/Passes.h @@ -23,7 +23,6 @@ #include "byteir/Dialect/GPU/Transforms/GPUDistributeSharedMemoryCopy.h" #include "byteir/Dialect/GPU/Transforms/GPUDistributeToWarp.h" #include "byteir/Dialect/GPU/Transforms/GPUPackSharedMemoryAlloc.h" -#include "byteir/Dialect/GPU/Transforms/GPUPipelining.h" #include "byteir/Dialect/GPU/Transforms/GPUTensorCoreVectorization.h" #include "byteir/Dialect/GPU/Transforms/GPUVectorToGPU.h" #include "byteir/Dialect/GPU/Transforms/OptimizeVectorTransfer.h" diff --git a/compiler/include/byteir/Dialect/GPU/Passes.td b/compiler/include/byteir/Dialect/GPU/Passes.td index 215e4c2e1..42339a707 100644 --- a/compiler/include/byteir/Dialect/GPU/Passes.td +++ b/compiler/include/byteir/Dialect/GPU/Passes.td @@ -103,20 +103,6 @@ def GPUPackSharedMemoryAlloc : Pass<"gpu-pack-shared-memory-alloc", "func::FuncO let constructor = "mlir::createGPUPackSharedMemoryAllocPass()"; } -//===----------------------------------------------------------------------===// -// GPUPipelining -//===----------------------------------------------------------------------===// -def GPUPipelining : Pass<"gpu-pipelining", "func::FuncOp"> { - let summary = "Pipelining async copy and mma oprations to improve performance."; - let constructor = "mlir::createGPUPipeliningPass()"; - let options = [ - Option<"stages", "stages", "int64_t", /*default=*/"0", "the number of stages for pipelining">, - ]; - let dependentDialects = [ - "NVVM::NVVMDialect", - ]; -} - //===----------------------------------------------------------------------===// // GPUVectorToGPU //===----------------------------------------------------------------------===// diff --git a/compiler/include/byteir/Dialect/GPU/Transforms/GPUPipelining.h b/compiler/include/byteir/Dialect/GPU/Transforms/GPUPipelining.h deleted file mode 100644 index c041e8a6e..000000000 --- a/compiler/include/byteir/Dialect/GPU/Transforms/GPUPipelining.h +++ /dev/null @@ -1,36 +0,0 @@ -//===- GPUPipelining.h ---------------------------------------*--- C++-*-===// -// -// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// - -#ifndef BYTEIR_DIALECT_GPU_TRANSFORMS_GPUPIPELINING_H -#define BYTEIR_DIALECT_GPU_TRANSFORMS_GPUPIPELINING_H - -#include "mlir/Pass/Pass.h" -#include "llvm/ADT/StringRef.h" -#include - -namespace mlir { -namespace func { -class FuncOp; -} // namespace func - -/// Pipelining async copy and mma oprations to improve performance. 
-std::unique_ptr> -createGPUPipeliningPass(int64_t stages = 0); - -} // namespace mlir - -#endif // BYTEIR_DIALECT_GPU_TRANSFORMS_GPUPIPELINING_H \ No newline at end of file diff --git a/compiler/include/byteir/Dialect/MemRef/Transforms/MultiBufferExt.h b/compiler/include/byteir/Dialect/MemRef/Transforms/MultiBufferExt.h deleted file mode 100644 index 532dcee5b..000000000 --- a/compiler/include/byteir/Dialect/MemRef/Transforms/MultiBufferExt.h +++ /dev/null @@ -1,81 +0,0 @@ -//===- RemoveCopy.h -------------------------------------------*--- C++ -*-===// -// -// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// - -#ifndef BYTEIR_DIALECT_MEMREF_TRANSFORMS_MULTIBUFFEREXT_H -#define BYTEIR_DIALECT_MEMREF_TRANSFORMS_MULTIBUFFEREXT_H - -#include "mlir/Support/LogicalResult.h" -#include "llvm/ADT/STLFunctionalExtras.h" - -namespace mlir { -class OpBuilder; -class RewritePatternSet; -class RewriterBase; -class Value; -class ValueRange; - -namespace arith { -class WideIntEmulationConverter; -class NarrowTypeEmulationConverter; -} // namespace arith - -namespace memref { -class AllocOp; -class AllocaOp; -class DeallocOp; - -/// Transformation to do multi-buffering/array expansion to remove dependencies -/// on the temporary allocation between consecutive loop iterations. -/// It returns the new allocation if the original allocation was multi-buffered -/// and returns failure() otherwise. -/// When `skipOverrideAnalysis`, the pass will apply the transformation -/// without checking thwt the buffer is overrided at the beginning of each -/// iteration. This implies that user knows that there is no data carried across -/// loop iterations. Example: -/// ``` -/// %0 = memref.alloc() : memref<4x128xf32> -/// scf.for %iv = %c1 to %c1024 step %c3 { -/// memref.copy %1, %0 : memref<4x128xf32> to memref<4x128xf32> -/// "some_use"(%0) : (memref<4x128xf32>) -> () -/// } -/// ``` -/// into: -/// ``` -/// %0 = memref.alloc() : memref<5x4x128xf32> -/// scf.for %iv = %c1 to %c1024 step %c3 { -/// %s = arith.subi %iv, %c1 : index -/// %d = arith.divsi %s, %c3 : index -/// %i = arith.remsi %d, %c5 : index -/// %sv = memref.subview %0[%i, 0, 0] [1, 4, 128] [1, 1, 1] : -/// memref<5x4x128xf32> to memref<4x128xf32, strided<[128, 1], offset: ?>> -/// memref.copy %1, %sv : memref<4x128xf32> to memref<4x128xf32, strided<...>> -/// "some_use"(%sv) : (memref<4x128xf32, strided<...>) -> () -/// } -/// ``` -template -FailureOr multiBufferExt(RewriterBase &rewriter, - AllocOpType allocOp, unsigned multiplier, - bool skipOverrideAnalysis = false); -/// Call into `multiBuffer` with locally constructed IRRewriter. 
-template -FailureOr multiBufferExt(AllocOpType allocOp, unsigned multiplier, - bool skipOverrideAnalysis = false); - -} // namespace memref -} // namespace mlir - -#endif // BYTEIR_DIALECT_MEMREF_TRANSFORMS_MULTIBUFFEREXT_H \ No newline at end of file diff --git a/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt b/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt index fa17a80e3..e722f7265 100644 --- a/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt +++ b/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt @@ -5,7 +5,6 @@ add_mlir_dialect_library(ByteIRGPUPasses GPUDistributeToWarp.cpp GPUTensorCoreVectorization.cpp GPUPackSharedMemoryAlloc.cpp - GPUPipelining.cpp GPUVectorToGPU.cpp OptimizeVectorTransfer.cpp RemoveTrivialLoops.cpp diff --git a/compiler/lib/Dialect/GPU/Transforms/GPUPipelining.cpp b/compiler/lib/Dialect/GPU/Transforms/GPUPipelining.cpp deleted file mode 100644 index 1be80cf2f..000000000 --- a/compiler/lib/Dialect/GPU/Transforms/GPUPipelining.cpp +++ /dev/null @@ -1,196 +0,0 @@ -//===- GPUPipelining.cpp -------------------------------------*--- C++-*-===// -// -// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// - -#include "byteir/Dialect/GPU/Transforms/GPUPipelining.h" -#include "byteir/Dialect/GPU/Passes.h" -#include "byteir/Dialect/GPU/Transforms/Transforms.h" -#include "byteir/Dialect/GPU/Transforms/Utils.h" -#include "byteir/Dialect/Linalg/Transforms/Transforms.h" -#include "byteir/Dialect/MemRef/Transforms/MultiBufferExt.h" - -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/LLVMIR/NVVMDialect.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/MemRef/Transforms/Transforms.h" -#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" -#include "mlir/Dialect/SCF/Transforms/Patterns.h" -#include "mlir/IR/Builders.h" -#include "mlir/Transforms/GreedyPatternRewriteDriver.h" -#include "llvm/Support/Debug.h" - -#include "PassDetail.h" - -#define DEBUG_TYPE "gpu-pipelining" - -using namespace mlir; - -namespace { - -/// Helper to recursively add operation dependencies within `block` to `dep` -/// set. 
-static void addDepOps(llvm::SmallDenseSet &dep, Operation *op, - Block *block) { - if (!dep.insert(op).second) - return; - for (Value operand : op->getOperands()) { - Operation *defOp = operand.getDefiningOp(); - if (defOp && defOp->getBlock() == block) - addDepOps(dep, defOp, block); - } -} - -static void -getPipelineStages(scf::ForOp forOp, - std::vector> &ops, - unsigned depth) { - SmallVector copyOps; - forOp.walk([&](linalg::CopyOp copyOp) { - if (hasMarker(copyOp, {getCopyToSharedMemoryAMarker(), - getCopyToSharedMemoryBMarker()})) { - copyOps.push_back(copyOp); - } - }); - - llvm::SmallDenseSet loadDep; - for (linalg::CopyOp copyOp : copyOps) { - addDepOps(loadDep, copyOp, forOp.getBody()); - } - - for (Operation &op : forOp.getBody()->getOperations()) { - if (!loadDep.count(&op) && !isa(op)) - ops.push_back(std::make_pair(&op, depth)); - } - for (Operation &op : forOp.getBody()->getOperations()) { - if (loadDep.count(&op)) - ops.push_back(std::make_pair(&op, 0)); - } -} - -static Operation *replaceLinalgMatmulWithIfOp(RewriterBase &rewriter, - Operation *op, Value pred) { - Location loc = op->getLoc(); - if (!isa(op)) - return op; - auto ifOp = rewriter.create(loc, op->getResultTypes(), pred, true); - // True branch. - op->moveBefore(&ifOp.getThenRegion().front(), - ifOp.getThenRegion().front().begin()); - rewriter.setInsertionPointAfter(op); - if (op->getNumResults() > 0) - rewriter.create(loc, op->getResults()); - return ifOp.getOperation(); -} - -struct GPUPipeliningPass : public GPUPipeliningBase { - GPUPipeliningPass(int64_t stages) : GPUPipeliningBase() { - this->stages = stages; - } - - void runOnOperation() override { - func::FuncOp funcOp = getOperation(); - stages = 3; - // step 1: collect all the alloc operations and do multi-buffering - SmallVector allocas; - // Collect all the alloc operations. - funcOp.walk([&](memref::AllocOp AllocOp) { - if (nvgpu::NVGPUDialect::hasSharedMemoryAddressSpace( - AllocOp.getType()) && - hasMarker(AllocOp, {getAllocSharedMemoryAMarker(), - getAllocSharedMemoryBMarker()})) { - allocas.push_back(AllocOp); - } - }); - assert(allocas.size() == 2 && "Only support 2 allocas for now"); - // Apply multi-buffering to all of them. - for (memref::AllocOp AllocOp : allocas) { - if (failed(memref::multiBufferExt(AllocOp, (unsigned int)stages, true))) { - // Error out and stop if any buffer cannot be multi buffered, as - // future software pipelining transformations will assume this - // happened. - AllocOp.emitOpError("cannot be multi-buffered"); - return signalPassFailure(); - } - } - - // step 2: find linalg.copy ops in scf.for and its dependencies - SmallVector forOps; - // Mark the loop with shared memory copy for pipelining. 
- funcOp.walk([&forOps](scf::ForOp forOp) { forOps.push_back(forOp); }); - - assert(forOps.size() == 1 && "Only support 1 loop in matmul"); - - scf::PipeliningOption options; - unsigned maxDepth = stages; - auto getSchedule = - [maxDepth](scf::ForOp forOp, - std::vector> &schedule) { - getPipelineStages(forOp, schedule, maxDepth); - }; - - // step 3: apply software pipelining - options.getScheduleFn = getSchedule; - options.supportDynamicLoops = false; - options.peelEpilogue = false; - options.predicateFn = replaceLinalgMatmulWithIfOp; - - RewritePatternSet patterns(&getContext()); - scf::populateSCFLoopPipeliningPatterns(patterns, options); - (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns)); - - // step 3: add nvvm commit_group and wait_group - // 3.1 find all the linalg.copy ops which do __byteir_load_matrix_a__ or - // __byteir_load_matrix_b__ - SmallVector copyOps; - funcOp.walk([&](linalg::CopyOp copyOp) { - if (hasMarker(copyOp, {getCopyToSharedMemoryAMarker(), - getCopyToSharedMemoryBMarker()})) { - copyOps.push_back(copyOp); - } - }); - // There is (stages + 1) * 2 copy ops in total - assert(copyOps.size() == (stages + 1) * 2 && - "Wrong linalg copy ops number after pipelining"); - OpBuilder b(funcOp.getContext()); - // As group = stages + 1, we need to add commit_group after every group - for (int64_t g = 0; g < stages + 1; g++) { - Operation *lastCopyInGroup = copyOps[g * 2 + 1]; - // if linalg.copy is inside a scf.if, we need to add commit_group after - // scf.if as we want to generate predicated copy - if (lastCopyInGroup->getParentOfType()) { - lastCopyInGroup = lastCopyInGroup->getParentOfType(); - } - b.setInsertionPointAfter(lastCopyInGroup); - b.create(funcOp.getLoc()); - } - // 3.2 find linalg.matmul and add wait_group before it - SmallVector matmulOps; - funcOp.walk( - [&](linalg::MatmulOp matmulOp) { matmulOps.push_back(matmulOp); }); - assert(matmulOps.size() == 1 && "Only support 1 matmul op in the loop"); - linalg::MatmulOp matmulOp = matmulOps[0]; - b.setInsertionPoint(matmulOp); - // wait first group done, stages - 1 prefetch groups can run in the pipeline - b.create(funcOp.getLoc(), stages - 1); - } -}; - -} // namespace - -std::unique_ptr> -mlir::createGPUPipeliningPass(int64_t stages) { - return std::make_unique(stages); -} \ No newline at end of file diff --git a/compiler/lib/Dialect/MemRef/CMakeLists.txt b/compiler/lib/Dialect/MemRef/CMakeLists.txt index c76cf1281..9304445ce 100644 --- a/compiler/lib/Dialect/MemRef/CMakeLists.txt +++ b/compiler/lib/Dialect/MemRef/CMakeLists.txt @@ -1,7 +1,6 @@ add_mlir_dialect_library(ByteIRMemRefPasses Transforms/ApplyMemRefAffineLayout.cpp Transforms/ExtractAddressComputation.cpp - Transforms/MultiBufferExt.cpp Transforms/RemoveCopy.cpp Transforms/SimplifyLinearizedIndex.cpp Transforms/SimplifyView.cpp diff --git a/compiler/lib/Dialect/MemRef/Transforms/MultiBufferExt.cpp b/compiler/lib/Dialect/MemRef/Transforms/MultiBufferExt.cpp deleted file mode 100644 index 639170d4d..000000000 --- a/compiler/lib/Dialect/MemRef/Transforms/MultiBufferExt.cpp +++ /dev/null @@ -1,283 +0,0 @@ -//===- MultiBufferExt.cpp -----------------------------------------*--- C++ -//-*-===// -// -// Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// - -// Some code comes from mlir/lib/Dialect/Memref/Transforms/MultiBuffer.cpp of -// LLVM Project. -// Original license: -//===----------- MultiBuffering.cpp ---------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Arith/Utils/Utils.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/MemRef/Transforms/Passes.h" -#include "mlir/Dialect/MemRef/Transforms/Transforms.h" -#include "mlir/IR/AffineExpr.h" -#include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/Dominance.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/IR/ValueRange.h" -#include "mlir/Interfaces/LoopLikeInterface.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/Debug.h" - -using namespace mlir; - -#define DEBUG_TYPE "memref-multi-buffer-ext" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") -#define DBGSNL() (llvm::dbgs() << "\n") - -/// Return true if the op fully overwrite the given `buffer` value. -static bool overrideBuffer(Operation *op, Value buffer) { - auto memrefCopyOp = dyn_cast(op); - auto linalgCopyOp = dyn_cast(op); - if (memrefCopyOp) - return memrefCopyOp.getTarget() == buffer; - if (linalgCopyOp) - return linalgCopyOp.getDpsInitOperand(0)->get() == buffer; - return false; -} - -/// Replace the uses of `oldOp` with the given `val` and for subview uses -/// propagate the type change. Changing the memref type may require propagating -/// it through subview ops so we cannot just do a replaceAllUse but need to -/// propagate the type change and erase old subview ops. -static void replaceUsesAndPropagateType(RewriterBase &rewriter, - Operation *oldOp, Value val) { - SmallVector opsToDelete; - SmallVector operandsToReplace; - - // Save the operand to replace / delete later (avoid iterator invalidation). - // TODO: can we use an early_inc iterator? - for (OpOperand &use : oldOp->getUses()) { - // Non-subview ops will be replaced by `val`. - auto subviewUse = dyn_cast(use.getOwner()); - if (!subviewUse) { - operandsToReplace.push_back(&use); - continue; - } - - // `subview(old_op)` is replaced by a new `subview(val)`. - OpBuilder::InsertionGuard g(rewriter); - rewriter.setInsertionPoint(subviewUse); - Type newType = memref::SubViewOp::inferRankReducedResultType( - subviewUse.getType().getShape(), cast(val.getType()), - subviewUse.getStaticOffsets(), subviewUse.getStaticSizes(), - subviewUse.getStaticStrides()); - Value newSubview = rewriter.create( - subviewUse->getLoc(), cast(newType), val, - subviewUse.getMixedOffsets(), subviewUse.getMixedSizes(), - subviewUse.getMixedStrides()); - - // Ouch recursion ... is this really necessary? 
- replaceUsesAndPropagateType(rewriter, subviewUse, newSubview); - - opsToDelete.push_back(use.getOwner()); - } - - // Perform late replacement. - // TODO: can we use an early_inc iterator? - for (OpOperand *operand : operandsToReplace) { - Operation *op = operand->getOwner(); - rewriter.startRootUpdate(op); - operand->set(val); - rewriter.finalizeRootUpdate(op); - } - - // Perform late op erasure. - // TODO: can we use an early_inc iterator? - for (Operation *op : opsToDelete) - rewriter.eraseOp(op); -} - -namespace mlir { -namespace memref { - -// Transformation to do multi-buffering/array expansion to remove dependencies -// on the temporary allocation between consecutive loop iterations. -// Returns success if the transformation happened and failure otherwise. -// This is not a pattern as it requires propagating the new memref type to its -// uses and requires updating subview ops. -template -FailureOr -multiBufferExt(RewriterBase &rewriter, AllocOpType allocOp, - unsigned multiBufferingFactor, bool skipOverrideAnalysis) { - LLVM_DEBUG(DBGS() << "Start multibuffering: " << allocOp << "\n"); - DominanceInfo dom(allocOp->getParentOp()); - LoopLikeOpInterface candidateLoop; - for (Operation *user : allocOp->getUsers()) { - auto parentLoop = user->getParentOfType(); - if (!parentLoop) { - if (isa(user)) { - // Allow dealloc outside of any loop. - // TODO: The whole precondition function here is very brittle and will - // need to rethought an isolated into a cleaner analysis. - continue; - } - LLVM_DEBUG(DBGS() << "--no parent loop -> fail\n"); - LLVM_DEBUG(DBGS() << "----due to user: " << *user << "\n"); - return failure(); - } - if (!skipOverrideAnalysis) { - /// Make sure there is no loop-carried dependency on the allocation. - if (!overrideBuffer(user, allocOp.getResult())) { - LLVM_DEBUG(DBGS() << "--Skip user: found loop-carried dependence\n"); - continue; - } - // If this user doesn't dominate all the other users keep looking. - if (llvm::any_of(allocOp->getUsers(), [&](Operation *otherUser) { - return !dom.dominates(user, otherUser); - })) { - LLVM_DEBUG( - DBGS() << "--Skip user: does not dominate all other users\n"); - continue; - } - } else { - if (llvm::any_of(allocOp->getUsers(), [&](Operation *otherUser) { - return !isa(otherUser) && - !parentLoop->isProperAncestor(otherUser); - })) { - LLVM_DEBUG( - DBGS() - << "--Skip user: not all other users are in the parent loop\n"); - continue; - } - } - candidateLoop = parentLoop; - break; - } - - if (!candidateLoop) { - LLVM_DEBUG(DBGS() << "Skip alloc: no candidate loop\n"); - return failure(); - } - - std::optional inductionVar = candidateLoop.getSingleInductionVar(); - std::optional lowerBound = candidateLoop.getSingleLowerBound(); - std::optional singleStep = candidateLoop.getSingleStep(); - if (!inductionVar || !lowerBound || !singleStep || - !llvm::hasSingleElement(candidateLoop.getLoopRegions())) { - LLVM_DEBUG(DBGS() << "Skip alloc: no single iv, lb, step or region\n"); - return failure(); - } - - if (!dom.dominates(allocOp.getOperation(), candidateLoop)) { - LLVM_DEBUG(DBGS() << "Skip alloc: does not dominate candidate loop\n"); - return failure(); - } - - LLVM_DEBUG(DBGS() << "Start multibuffering loop: " << candidateLoop << "\n"); - - // 1. Construct the multi-buffered memref type. 
- ArrayRef originalShape = allocOp.getType().getShape(); - SmallVector multiBufferedShape{multiBufferingFactor}; - llvm::append_range(multiBufferedShape, originalShape); - LLVM_DEBUG(DBGS() << "--original type: " << allocOp.getType() << "\n"); - MemRefType mbMemRefType = MemRefType::Builder(allocOp.getType()) - .setShape(multiBufferedShape) - .setLayout(MemRefLayoutAttrInterface()); - LLVM_DEBUG(DBGS() << "--multi-buffered type: " << mbMemRefType << "\n"); - - // 2. Create the multi-buffered alloc. - Location loc = allocOp->getLoc(); - OpBuilder::InsertionGuard g(rewriter); - rewriter.setInsertionPoint(allocOp); - auto mbAlloc = rewriter.create(loc, mbMemRefType, ValueRange{}, - allocOp->getAttrs()); - LLVM_DEBUG(DBGS() << "--multi-buffered alloc: " << mbAlloc << "\n"); - - // 3. Within the loop, build the modular leading index (i.e. each loop - // iteration %iv accesses slice ((%iv - %lb) / %step) % %mb_factor). - rewriter.setInsertionPointToStart( - &candidateLoop.getLoopRegions().front()->front()); - Value ivVal = *inductionVar; - Value lbVal = getValueOrCreateConstantIndexOp(rewriter, loc, *lowerBound); - Value stepVal = getValueOrCreateConstantIndexOp(rewriter, loc, *singleStep); - AffineExpr iv, lb, step; - bindDims(rewriter.getContext(), iv, lb, step); - Value bufferIndex = affine::makeComposedAffineApply( - rewriter, loc, ((iv - lb).floorDiv(step)) % multiBufferingFactor, - {ivVal, lbVal, stepVal}); - LLVM_DEBUG(DBGS() << "--multi-buffered indexing: " << bufferIndex << "\n"); - - // 4. Build the subview accessing the particular slice, taking modular - // rotation into account. - int64_t mbMemRefTypeRank = mbMemRefType.getRank(); - IntegerAttr zero = rewriter.getIndexAttr(0); - IntegerAttr one = rewriter.getIndexAttr(1); - SmallVector offsets(mbMemRefTypeRank, zero); - SmallVector sizes(mbMemRefTypeRank, one); - SmallVector strides(mbMemRefTypeRank, one); - // Offset is [bufferIndex, 0 ... 0 ]. - offsets.front() = bufferIndex; - // Sizes is [1, original_size_0 ... original_size_n ]. - for (int64_t i = 0, e = originalShape.size(); i != e; ++i) - sizes[1 + i] = rewriter.getIndexAttr(originalShape[i]); - // Strides is [1, 1 ... 1 ]. - auto dstMemref = - cast(memref::SubViewOp::inferRankReducedResultType( - originalShape, mbMemRefType, offsets, sizes, strides)); - Value subview = rewriter.create(loc, dstMemref, mbAlloc, - offsets, sizes, strides); - LLVM_DEBUG(DBGS() << "--multi-buffered slice: " << subview << "\n"); - - // 5. Due to the recursive nature of replaceUsesAndPropagateType , we need to - // handle dealloc uses separately.. - for (OpOperand &use : llvm::make_early_inc_range(allocOp->getUses())) { - auto deallocOp = dyn_cast(use.getOwner()); - if (!deallocOp) - continue; - OpBuilder::InsertionGuard g(rewriter); - rewriter.setInsertionPoint(deallocOp); - auto newDeallocOp = - rewriter.create(deallocOp->getLoc(), mbAlloc); - (void)newDeallocOp; - LLVM_DEBUG(DBGS() << "----Created dealloc: " << newDeallocOp << "\n"); - rewriter.eraseOp(deallocOp); - } - - // 6. RAUW with the particular slice, taking modular rotation into account. - replaceUsesAndPropagateType(rewriter, allocOp, subview); - - // 7. Finally, erase the old allocOp. 
- rewriter.eraseOp(allocOp); - - return mbAlloc; -} - -template -FailureOr multiBufferExt(AllocOpType allocOp, - unsigned multiBufferingFactor, - bool skipOverrideAnalysis) { - IRRewriter rewriter(allocOp->getContext()); - return multiBufferExt(rewriter, allocOp, multiBufferingFactor, - skipOverrideAnalysis); -} - -template FailureOr multiBufferExt(memref::AllocOp, unsigned, - bool); -template FailureOr multiBufferExt(memref::AllocaOp, unsigned, - bool); -} // namespace memref -} // namespace mlir \ No newline at end of file diff --git a/compiler/lib/Pipelines/LinalgMemrefOpt.cpp b/compiler/lib/Pipelines/LinalgMemrefOpt.cpp index 7ca50249d..99c00e253 100644 --- a/compiler/lib/Pipelines/LinalgMemrefOpt.cpp +++ b/compiler/lib/Pipelines/LinalgMemrefOpt.cpp @@ -56,9 +56,7 @@ void addGemmOptPasses(OpPassManager &pm) { anchoredPM.addPass(createCanonicalizerPass()); anchoredPM.addPass(createCSEPass()); anchoredPM.addPass(createCanonicalizerPass()); - // anchoredPM.addPass(createGPUPipeliningPass()); - // anchoredPM.addPass(createCSEPass()); - // anchoredPM.addPass(createCanonicalizerPass()); + anchoredPM.addPass(createGPUDistributeToWarpPass()); anchoredPM.addPass(createRemoveTrivialLoopsPass()); anchoredPM.addPass(createGPUTensorCoreVectorizationPass()); @@ -87,7 +85,6 @@ void addGemmOptPasses(OpPassManager &pm) { pm.addPass(memref::createFoldMemRefAliasOpsPass()); } - // anchoredPM.addPass(createGPUPipeliningPass()); { OpPassManager anchoredPM(func::FuncOp::getOperationName()); anchoredPM.addPass(createGPUPackSharedMemoryAllocPass()); diff --git a/compiler/test/Dialect/GPU/gpu-pipelining.mlir b/compiler/test/Dialect/GPU/gpu-pipelining.mlir deleted file mode 100644 index b2f2475a7..000000000 --- a/compiler/test/Dialect/GPU/gpu-pipelining.mlir +++ /dev/null @@ -1,94 +0,0 @@ -// RUN: byteir-opt -gpu-pipelining="stages=3" -canonicalize --cse --verify-diagnostics %s | FileCheck %s - -#map = affine_map<(d0) -> (d0 * 128)> -module { - func.func private @Unknown0(%arg0: memref<5376x2048xf16>, %arg1: memref<2048x5376xf16>) -> memref<5376x5376xf16> attributes {__byteir_gemm_block_size__ = [64, 2, 1], __byteir_gemm_pipeline_depth__ = 3 : i64, __byteir_gemm_tile_config__ = [128, 128, 32], __byteir_matmul_epilogue_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c2048 = arith.constant 2048 : index - %c32 = arith.constant 32 : index - %alloc = memref.alloc() : memref<5376x5376xf16> - scf.forall (%arg2, %arg3) in (42, 42) { - %alloca = memref.alloca() {__byteir_alloca_accumulator__} : memref<128x128xf16, #gpu.address_space> - %alloca_0 = memref.alloca() {__byteir_alloca_matrix_b__} : memref<32x128xf16, #gpu.address_space> - %alloca_1 = memref.alloca() {__byteir_alloca_matrix_a__} : memref<128x32xf16, #gpu.address_space> - %0 = affine.apply #map(%arg2) - %1 = affine.apply #map(%arg3) - %subview = memref.subview %alloc[%0, %1] [128, 128] [1, 1] : memref<5376x5376xf16> to memref<128x128xf16, strided<[5376, 1], offset: ?>> - linalg.fill ins(%cst : f16) outs(%alloca : memref<128x128xf16, #gpu.address_space>) - scf.for %arg4 = %c0 to %c2048 step %c32 { - %subview_2 = memref.subview %arg0[%0, %arg4] [128, 32] [1, 1] : memref<5376x2048xf16> to memref<128x32xf16, strided<[2048, 1], offset: ?>> - %subview_3 = memref.subview %arg1[%arg4, %1] [32, 128] [1, 1] : memref<2048x5376xf16> to memref<32x128xf16, strided<[5376, 1], offset: ?>> - linalg.copy {__byteir_load_matrix_a__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%subview_2 : 
memref<128x32xf16, strided<[2048, 1], offset: ?>>) outs(%alloca_1 : memref<128x32xf16, #gpu.address_space>) - linalg.copy {__byteir_load_matrix_b__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%subview_3 : memref<32x128xf16, strided<[5376, 1], offset: ?>>) outs(%alloca_0 : memref<32x128xf16, #gpu.address_space>) - linalg.matmul {__byteir_gpu_tile_gemm_0, __byteir_mma__, __byteir_mma_level__ = "Threadblock", __byteir_target__ = "nv_sm_80"} ins(%alloca_1, %alloca_0 : memref<128x32xf16, #gpu.address_space>, memref<32x128xf16, #gpu.address_space>) outs(%alloca : memref<128x128xf16, #gpu.address_space>) - } - linalg.copy {__byteir_store_matrix_c__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%alloca : memref<128x128xf16, #gpu.address_space>) outs(%subview : memref<128x128xf16, strided<[5376, 1], offset: ?>>) - } {mapping = [#gpu.block, #gpu.block]} - return %alloc : memref<5376x5376xf16> - } -} - -// CHECK-LABEL: scf.forall (%arg2, %arg3) in (42, 42) { - -// init: -// CHECK: %[[ALLOCA:.*]] = memref.alloca() {__byteir_alloca_accumulator__} : memref<128x128xf16, #gpu.address_space> -// CHECK: %[[ALLOCA0:.*]] = memref.alloca() {__byteir_alloca_matrix_b__} : memref<3x32x128xf16, #gpu.address_space -// CHECK: %[[ALLOCA1:.*]] = memref.alloca() {__byteir_alloca_matrix_a__} : memref<3x128x32xf16, #gpu.address_space -// CHECK: %[[IDX0:.*]] = affine.apply #map(%{{.*}}) -// CHECK: %[[IDX1:.*]] = affine.apply #map(%{{.*}}) -// CHECK: %[[SUBVIEW:.*]] = memref.subview %[[ALLOC:.*]][%[[IDX0]], %[[IDX1]]] [128, 128] [1, 1] : memref<5376x5376xf16> to memref<128x128xf16, strided<[5376, 1], offset: ?>> -// CHECK: linalg.fill ins(%[[CST:.*]] : f16) outs(%[[ALLOCA]] : memref<128x128xf16, #gpu.address_space>) - -// prelogue0: -// CHECK: %[[SUBVIEW2:.*]] = memref.subview %[[ALLOCA1]][0, 0, 0] [1, 128, 32] [1, 1, 1] : memref<3x128x32xf16, #gpu.address_space> to memref<128x32xf16, strided<[32, 1]>, #gpu.address_space -// CHECK: %[[CAST2:.*]] = memref.cast %[[SUBVIEW2]] : memref<128x32xf16, strided<[32, 1]>, #gpu.address_space> to memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space -// CHECK: %[[SUBVIEW3:.*]] = memref.subview %[[ALLOCA0]][0, 0, 0] [1, 32, 128] [1, 1, 1] : memref<3x32x128xf16, #gpu.address_space> to memref<32x128xf16, strided<[128, 1]>, #gpu.address_space -// CHECK: %[[CAST3:.*]] = memref.cast %[[SUBVIEW3]] : memref<32x128xf16, strided<[128, 1]>, #gpu.address_space> to memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space -// CHECK: %[[SUBVIEW5:.*]] = memref.subview %arg0[%[[IDX0]], 0] [128, 32] [1, 1] : memref<5376x2048xf16> to memref<128x32xf16, strided<[2048, 1], offset: ?> -// CHECK: %[[SUBVIEW6:.*]] = memref.subview %arg1[0, %[[IDX1]]] [32, 128] [1, 1] : memref<2048x5376xf16> to memref<32x128xf16, strided<[5376, 1], offset: ?> -// CHECK: linalg.copy {__byteir_load_matrix_a__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[SUBVIEW5]] : memref<128x32xf16, strided<[2048, 1], offset: ?>>) outs(%[[SUBVIEW2]] : memref<128x32xf16, strided<[32, 1]>, #gpu.address_space>) -// CHECK: linalg.copy {__byteir_load_matrix_b__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[SUBVIEW6]] : memref<32x128xf16, strided<[5376, 1], offset: ?>>) outs(%[[SUBVIEW3]] : memref<32x128xf16, strided<[128, 1]>, #gpu.address_space>) -// CHECK: nvvm.cp.async.commit.group - -// prelogue1: -// CHECK: %[[SUBVIEW7:.*]] = memref.subview %[[ALLOCA1]][1, 0, 0] 
[1, 128, 32] [1, 1, 1] : memref<3x128x32xf16, #gpu.address_space> to memref<128x32xf16, strided<[32, 1], offset: 4096>, #gpu.address_space -// CHECK: %[[CAST4:.*]] = memref.cast %[[SUBVIEW7]] : memref<128x32xf16, strided<[32, 1], offset: 4096>, #gpu.address_space> to memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space -// CHECK: %[[SUBVIEW9:.*]] = memref.subview %[[ALLOCA0]][1, 0, 0] [1, 32, 128] [1, 1, 1] : memref<3x32x128xf16, #gpu.address_space> to memref<32x128xf16, strided<[128, 1], offset: 4096>, #gpu.address_space -// CHECK: %[[CAST5:.*]] = memref.cast %[[SUBVIEW9]] : memref<32x128xf16, strided<[128, 1], offset: 4096>, #gpu.address_space> to memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space -// CHECK: %[[SUBVIEW11:.*]] = memref.subview %arg0[%[[IDX0]], 32] [128, 32] [1, 1] : memref<5376x2048xf16> to memref<128x32xf16, strided<[2048, 1], offset: ?> -// CHECK: %[[SUBVIEW12:.*]] = memref.subview %arg1[32, %[[IDX1]]] [32, 128] [1, 1] : memref<2048x5376xf16> to memref<32x128xf16, strided<[5376, 1], offset: ?> -// CHECK: linalg.copy {__byteir_load_matrix_a__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[SUBVIEW11]] : memref<128x32xf16, strided<[2048, 1], offset: ?>>) outs(%[[SUBVIEW7]] : memref<128x32xf16, strided<[32, 1], offset: 4096>, #gpu.address_space>) -// CHECK: linalg.copy {__byteir_load_matrix_b__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[SUBVIEW12]] : memref<32x128xf16, strided<[5376, 1], offset: ?>>) outs(%[[SUBVIEW9]] : memref<32x128xf16, strided<[128, 1], offset: 4096>, #gpu.address_space>) -// CHECK: nvvm.cp.async.commit.group - -// prelogue2: -// CHECK: %[[SUBVIEW13:.*]] = memref.subview %[[ALLOCA1]][2, 0, 0] [1, 128, 32] [1, 1, 1] : memref<3x128x32xf16, #gpu.address_space> to memref<128x32xf16, strided<[32, 1], offset: 8192>, #gpu.address_space -// CHECK: %[[CAST6:.*]] = memref.cast %[[SUBVIEW13]] : memref<128x32xf16, strided<[32, 1], offset: 8192>, #gpu.address_space> to memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space -// CHECK: %[[SUBVIEW15:.*]] = memref.subview %[[ALLOCA0]][2, 0, 0] [1, 32, 128] [1, 1, 1] : memref<3x32x128xf16, #gpu.address_space> to memref<32x128xf16, strided<[128, 1], offset: 8192>, #gpu.address_space -// CHECK: %[[CAST7:.*]] = memref.cast %[[SUBVIEW15]] : memref<32x128xf16, strided<[128, 1], offset: 8192>, #gpu.address_space> to memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space -// CHECK: %[[SUBVIEW17:.*]] = memref.subview %arg0[%[[IDX0]], 64] [128, 32] [1, 1] : memref<5376x2048xf16> to memref<128x32xf16, strided<[2048, 1], offset: ?> -// CHECK: %[[SUBVIEW18:.*]] = memref.subview %arg1[64, %[[IDX1]]] [32, 128] [1, 1] : memref<2048x5376xf16> to memref<32x128xf16, strided<[5376, 1], offset: ?> -// CHECK: linalg.copy {__byteir_load_matrix_a__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[SUBVIEW17]] : memref<128x32xf16, strided<[2048, 1], offset: ?>>) outs(%[[SUBVIEW13]] : memref<128x32xf16, strided<[32, 1], offset: 8192>, #gpu.address_space>) -// CHECK: linalg.copy {__byteir_load_matrix_b__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[SUBVIEW18]] : memref<32x128xf16, strided<[5376, 1], offset: ?>>) outs(%[[SUBVIEW15]] : memref<32x128xf16, strided<[128, 1], offset: 8192>, #gpu.address_space>) -// CHECK: nvvm.cp.async.commit.group - -// kernel: -// CHECK: %[[CAST:.*]] = scf.for %arg4 = %c0 to %c2048 step %c32 
iter_args(%arg5 = %[[CAST2]], %arg6 = %[[CAST4]], %arg7 = %[[CAST6]], %arg8 = %[[CAST3]], %arg9 = %[[CAST5]], %arg10 = %[[CAST7]]) -> (memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space>, memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space>, memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space>, memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space>, memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space>, memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space>) { -// CHECK: nvvm.cp.async.wait.group 2 -// CHECK: linalg.matmul {__byteir_gpu_tile_gemm_0, __byteir_mma__, __byteir_mma_level__ = "Threadblock", __byteir_target__ = "nv_sm_80"} ins(%arg5, %arg8 : memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space>, memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space>) outs(%[[ALLOCA]] : memref<128x128xf16, #gpu.address_space>) - -// CHECK: %[[IDX5:.*]] = affine.apply #map1(%[[IDX4:.*]]) -// CHECK: %[[SUBVIEW19:.*]] = memref.subview %[[ALLOCA1]][%[[IDX5]], 0, 0] [1, 128, 32] [1, 1, 1] : memref<3x128x32xf16, #gpu.address_space> to memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space -// CHECK: %[[SUBVIEW20:.*]] = memref.subview %[[ALLOCA0]][%[[IDX5]], 0, 0] [1, 32, 128] [1, 1, 1] : memref<3x32x128xf16, #gpu.address_space> to memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space -// CHECK: %[[SUBVIEW21:.*]] = memref.subview %arg0[%[[IDX0]], %[[IDX8:.*]]] [128, 32] [1, 1] : memref<5376x2048xf16> to memref<128x32xf16, strided<[2048, 1], offset: ?> -// CHECK: %[[SUBVIEW22:.*]] = memref.subview %arg1[%[[IDX9:.*]], %[[IDX1]]] [32, 128] [1, 1] : memref<2048x5376xf16> to memref<32x128xf16, strided<[5376, 1], offset: ?> -// CHECK: scf.if %[[CMP:.*]] { -// CHECK: linalg.copy {__byteir_load_matrix_a__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[SUBVIEW21]] : memref<128x32xf16, strided<[2048, 1], offset: ?>>) outs(%[[SUBVIEW19]] : memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space>) -// CHECK: linalg.copy {__byteir_load_matrix_b__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[SUBVIEW22]] : memref<32x128xf16, strided<[5376, 1], offset: ?>>) outs(%[[SUBVIEW20]] : memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space>) -// CHECK: nvvm.cp.async.commit.group -// CHECK: scf.yield %arg6, %arg7, %[[SUBVIEW19]], %arg9, %arg10, %[[SUBVIEW20]] : memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space>, memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space>, memref<128x32xf16, strided<[32, 1], offset: ?>, #gpu.address_space>, memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space>, memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space>, memref<32x128xf16, strided<[128, 1], offset: ?>, #gpu.address_space -// CHECK: } - -// copy back to global memory: -// CHECK: linalg.copy {__byteir_store_matrix_c__, __internal_linalg_transform__ = "__byteir_copy_related_to_workgroup_memory__"} ins(%[[ALLOCA]] : memref<128x128xf16, #gpu.address_space>) outs(%[[SUBVIEW]] : memref<128x128xf16, strided<[5376, 1], offset: ?>>)
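
For readers tracking what this change removes: the heart of the deleted GPUPipelining.cpp is the schedule callback handed to scf::populateSCFLoopPipeliningPatterns, which puts the A/B shared-memory copies (and every op they depend on inside the loop body) in stage 0 and the rest of the body (the threadblock matmul and its epilogue) `stages` iterations later. The sketch below is a condensed, non-authoritative restatement of that deleted code; hasMarker, getCopyToSharedMemoryAMarker/BMarker and replaceLinalgMatmulWithIfOp are the ByteIR helpers used in the file above and are assumed to be in scope, and pipelineMainLoop is just a hypothetical name for the driver step.

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/DenseSet.h"

using namespace mlir;

// Recursively collect, within `block`, every op that `op` depends on.
static void addDepOps(llvm::SmallDenseSet<Operation *> &dep, Operation *op,
                      Block *block) {
  if (!dep.insert(op).second)
    return;
  for (Value operand : op->getOperands())
    if (Operation *def = operand.getDefiningOp())
      if (def->getBlock() == block)
        addDepOps(dep, def, block);
}

// Stage assignment: the marked shared-memory copies and their backward slice
// go to stage 0, everything else to stage `depth`, so the cp.async copies for
// iteration i + depth overlap with the mma compute of iteration i.
static void getPipelineStages(
    scf::ForOp forOp, std::vector<std::pair<Operation *, unsigned>> &ops,
    unsigned depth) {
  llvm::SmallDenseSet<Operation *> loadDep;
  forOp.walk([&](linalg::CopyOp copyOp) {
    if (hasMarker(copyOp, {getCopyToSharedMemoryAMarker(),
                           getCopyToSharedMemoryBMarker()}))
      addDepOps(loadDep, copyOp, forOp.getBody());
  });
  for (Operation &op : forOp.getBody()->getOperations())
    if (!loadDep.count(&op) && !isa<scf::YieldOp>(op))
      ops.emplace_back(&op, depth);
  for (Operation &op : forOp.getBody()->getOperations())
    if (loadDep.count(&op))
      ops.emplace_back(&op, 0);
}

// Wire the schedule into the upstream SCF software pipeliner.
static LogicalResult pipelineMainLoop(func::FuncOp funcOp, unsigned stages) {
  scf::PipeliningOption options;
  options.getScheduleFn =
      [stages](scf::ForOp forOp,
               std::vector<std::pair<Operation *, unsigned>> &schedule) {
        getPipelineStages(forOp, schedule, /*depth=*/stages);
      };
  options.supportDynamicLoops = false;
  options.peelEpilogue = false; // epilogue iterations are predicated instead,
  // using the predicateFn defined in the deleted pass above.
  options.predicateFn = replaceLinalgMatmulWithIfOp;
  RewritePatternSet patterns(funcOp.getContext());
  scf::populateSCFLoopPipeliningPatterns(patterns, options);
  return applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
}

Around this core, the deleted pass first multi-buffered the two shared-memory allocs by a factor of `stages` (via the removed memref::multiBufferExt), then inserted nvvm.cp.async.commit.group after each A/B copy pair and nvvm.cp.async.wait.group(stages - 1) before the matmul. That is why the deleted gpu-pipelining.mlir test above expects 3x-sized shared-memory buffers, three prologue copy groups, and nvvm.cp.async.wait.group 2 in the kernel for stages = 3.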