diff --git a/compiler/include/byteir/Dialect/GPU/Passes.h b/compiler/include/byteir/Dialect/GPU/Passes.h index 3e249b5e3..18fcd7283 100644 --- a/compiler/include/byteir/Dialect/GPU/Passes.h +++ b/compiler/include/byteir/Dialect/GPU/Passes.h @@ -18,6 +18,7 @@ #ifndef BYTEIR_DIALECT_GPU_PASSES_H #define BYTEIR_DIALECT_GPU_PASSES_H +#include "byteir/Dialect/GPU/Transforms/LegalizeGPULaunch.h" #include "byteir/Dialect/GPU/Transforms/GPUBlockSwizzle.h" #include "byteir/Dialect/GPU/Transforms/GPUDistributeSharedMemoryCopy.h" #include "byteir/Dialect/GPU/Transforms/GPUDistributeToWarp.h" diff --git a/compiler/include/byteir/Dialect/GPU/Passes.td b/compiler/include/byteir/Dialect/GPU/Passes.td index 776e06d49..215e4c2e1 100644 --- a/compiler/include/byteir/Dialect/GPU/Passes.td +++ b/compiler/include/byteir/Dialect/GPU/Passes.td @@ -127,4 +127,12 @@ def GPUVectorToGPU : Pass<"gpu-vector-to-gpu", "func::FuncOp"> { "nvgpu::NVGPUDialect", ]; } + +//===----------------------------------------------------------------------===// +// LegalizeGPULaunch +//===----------------------------------------------------------------------===// +def LegalizeGPULaunch : Pass<"legalize-gpu-launch", "func::FuncOp"> { + let summary = "Legalize GPU launch ops."; + let constructor = "mlir::createLegalizeGPULaunchPass()"; +} #endif // BYTEIR_DIALECT_GPU_PASSES diff --git a/compiler/include/byteir/Dialect/GPU/Transforms/LegalizeGPULaunch.h b/compiler/include/byteir/Dialect/GPU/Transforms/LegalizeGPULaunch.h new file mode 100644 index 000000000..8d07ec590 --- /dev/null +++ b/compiler/include/byteir/Dialect/GPU/Transforms/LegalizeGPULaunch.h @@ -0,0 +1,34 @@ +//===- LegalizeGPULaunch.h ---------------------------------*--- C++ -*-===// +// +// Copyright 2024 ByteDance Ltd. and/or its affiliates. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// + +#ifndef BYTEIR_DIALECT_GPU_TRANSFORMS_LEGALIZEGPULAUNCH_H +#define BYTEIR_DIALECT_GPU_TRANSFORMS_LEGALIZEGPULAUNCH_H + +#include "mlir/Pass/Pass.h" +#include "llvm/ADT/StringRef.h" +#include + +namespace mlir { +namespace func { +class FuncOp; +} // namespace func + +std::unique_ptr> createLegalizeGPULaunchPass(); + +} // namespace mlir + +#endif // BYTEIR_DIALECT_GPU_TRANSFORMS_LEGALIZEGPULAUNCH_H \ No newline at end of file diff --git a/compiler/lib/Conversion/FuncToByre/FuncToByre.cpp b/compiler/lib/Conversion/FuncToByre/FuncToByre.cpp index fda8a3af4..7af99d824 100644 --- a/compiler/lib/Conversion/FuncToByre/FuncToByre.cpp +++ b/compiler/lib/Conversion/FuncToByre/FuncToByre.cpp @@ -129,6 +129,14 @@ class ConvertGPULaunchFuncToByrePattern computeOp->setAttr("BlockSize.y", rewriter.getI32IntegerAttr(by)); computeOp->setAttr("BlockSize.z", rewriter.getI32IntegerAttr(bz)); + auto sharedMemorySize = launchOp.getDynamicSharedMemorySize(); + if (sharedMemorySize) { + auto sharedMemorySizeValue = + cast(sharedMemorySize.getDefiningOp()); + IntegerAttr smem = cast(sharedMemorySizeValue.getValue()); + computeOp->setAttr("DynamicSharedMemorySize", smem); + } + if (useBarePtrCallConv) { computeOp->setAttr(byre::getKernelCallConventionAttrName(), rewriter.getStringAttr("bare_ptr")); diff --git a/compiler/lib/Conversion/GPUToNVVM/GPUToNVVM.cpp b/compiler/lib/Conversion/GPUToNVVM/GPUToNVVM.cpp index c3e510cee..2c7c6cf96 100644 --- 
a/compiler/lib/Conversion/GPUToNVVM/GPUToNVVM.cpp +++ b/compiler/lib/Conversion/GPUToNVVM/GPUToNVVM.cpp @@ -68,9 +68,9 @@ using namespace mlir::NVVM; namespace { -void ConvertToDynamicSharedMemory(GPUModuleOp moduleOp) { +static void ConvertToDynamicSharedMemory(GPUModuleOp moduleOp) { SymbolTableCollection symbolTableCollection; - // Collect all the adressOfOps to static shared memory globals. + // Collect all the addressOfOps to static shared memory globals. SmallVector addressOfOps; moduleOp.walk([&](LLVM::AddressOfOp addressOfOp) { // Check that the global associated with this addressOfOp has shared memory @@ -80,17 +80,8 @@ void ConvertToDynamicSharedMemory(GPUModuleOp moduleOp) { }); if (addressOfOps.size() == 0) return; - OpBuilder builder(moduleOp); - builder.setInsertionPoint(&moduleOp.front()); - auto type = - LLVM::LLVMArrayType::get(IntegerType::get(builder.getContext(), 8), 0); - LLVM::GlobalOp global = builder.create( - moduleOp.getLoc(), type, /*isConstant=*/false, LLVM::Linkage::External, - "__dynamic_shared_memory__", Attribute(), - /*alignment=*/16, /*addr_space=*/3); + uint32_t numberOfBytes = 0; - // Replace the addressOfOps with correctly offseted pointers to dynamic - // shared memory. 
llvm::SmallDenseMap globalMemoryOffsetMap; for (auto addressOfOp : addressOfOps) { uint32_t offset = 0; @@ -107,6 +98,26 @@ void ConvertToDynamicSharedMemory(GPUModuleOp moduleOp) { DataLayout dataLayout = DataLayout::closest(addressOfOp); numberOfBytes = offset + dataLayout.getTypeSizeInBits(thisarray) / 8; } + } + + // Check if numberOfBytes is less than 48 * 1024 + if (numberOfBytes < 48 * 1024) { + return; + } + + OpBuilder builder(moduleOp); + builder.setInsertionPoint(&moduleOp.front()); + auto type = + LLVM::LLVMArrayType::get(IntegerType::get(builder.getContext(), 8), 0); + LLVM::GlobalOp global = builder.create( + moduleOp.getLoc(), type, /*isConstant=*/false, LLVM::Linkage::External, + "__dynamic_shared_memory__", Attribute(), + /*alignment=*/16, /*addr_space=*/3); + + // Replace the addressOfOps with correctly offset pointers to dynamic + // shared memory. + for (auto addressOfOp : addressOfOps) { + uint32_t offset = globalMemoryOffsetMap[addressOfOp.getGlobal(symbolTableCollection)]; auto loc = addressOfOp.getLoc(); builder.setInsertionPoint(addressOfOp); LLVM::AddressOfOp globalPtr = @@ -416,6 +427,7 @@ struct GPUToNVVMExtPass : public GPUToNVVMExtBase { } } }); + ConvertToDynamicSharedMemory(m); } }; diff --git a/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt b/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt index 8ff0dfe20..fa17a80e3 100644 --- a/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt +++ b/compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt @@ -1,4 +1,5 @@ add_mlir_dialect_library(ByteIRGPUPasses + LegalizeGPULaunch.cpp GPUBlockSwizzle.cpp GPUDistributeSharedMemoryCopy.cpp GPUDistributeToWarp.cpp diff --git a/compiler/lib/Dialect/GPU/Transforms/LegalizeGPULaunch.cpp b/compiler/lib/Dialect/GPU/Transforms/LegalizeGPULaunch.cpp new file mode 100644 index 000000000..2895dd7d1 --- /dev/null +++ b/compiler/lib/Dialect/GPU/Transforms/LegalizeGPULaunch.cpp @@ -0,0 +1,77 @@ +//===- LegalizeGPULaunch.cpp-*-===// +// +// Copyright 2022
ByteDance Ltd. and/or its affiliates. All rights reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// + +#include "byteir/Dialect/GPU/Transforms/LegalizeGPULaunch.h" +#include "byteir/Dialect/GPU/Transforms/Utils.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Visitors.h" +#include + +#include "PassDetail.h" + +using namespace llvm; +using namespace mlir; + +namespace { + +static int64_t getSharedMemorySizeInGPULaunch(gpu::LaunchOp op) { + int64_t sharedMemSizeInBytes = 0; + op->walk([&](memref::AllocaOp allocaOp) { + sharedMemSizeInBytes += + allocaOp.getType().getNumElements() * + allocaOp.getType().getElementType().getIntOrFloatBitWidth() / 8; + }); + op->walk([&](memref::AllocOp allocOp) { + sharedMemSizeInBytes += + allocOp.getType().getNumElements() * + allocOp.getType().getElementType().getIntOrFloatBitWidth() / 8; + }); + return sharedMemSizeInBytes; +} + +struct LegalizeGPULaunchPass + : public LegalizeGPULaunchBase { + LegalizeGPULaunchPass() : LegalizeGPULaunchBase() {} + void runOnOperation() override { + func::FuncOp funcOp = getOperation(); + OpBuilder builder(funcOp.getContext()); + auto launchOps = funcOp.getOps(); + for (auto launchOp : launchOps) { + int64_t sharedMemSize = 
getSharedMemorySizeInGPULaunch(launchOp); + if (sharedMemSize < 48 * 1024) // 48kB + continue; + builder.setInsertionPoint(launchOp); + Value sharedMemSizeValue = builder.create( + launchOp.getLoc(), builder.getI32IntegerAttr(sharedMemSize)); + if (!launchOp.getDynamicSharedMemorySizeMutable().empty()) { + continue; + } + launchOp.getDynamicSharedMemorySizeMutable().append( + ValueRange{sharedMemSizeValue}); + } + } +}; +} // namespace + +std::unique_ptr> +mlir::createLegalizeGPULaunchPass() { + return std::make_unique(); +} diff --git a/compiler/lib/Pipelines/GPU/GPUOpt.cpp b/compiler/lib/Pipelines/GPU/GPUOpt.cpp index 3426f2350..4179f88bc 100644 --- a/compiler/lib/Pipelines/GPU/GPUOpt.cpp +++ b/compiler/lib/Pipelines/GPU/GPUOpt.cpp @@ -108,13 +108,13 @@ void createReductionGPUOptPipelineImpl(OpPassManager &pm) { createGPUMappingForallTransform(pm, options); pm.addPass(createTransformDialectInterpreter(true)); - pm.addPass(createCSEPass()); - pm.addPass(createCanonicalizerPass()); - pm.addPass(createGpuLauchSinkIndexComputationsPass()); { OpPassManager anchoredPM(func::FuncOp::getOperationName()); + anchoredPM.addPass(createCSEPass()); + anchoredPM.addPass(createCanonicalizerPass()); + anchoredPM.addPass(createGpuLauchSinkIndexComputationsPass()); anchoredPM.addPass(createPromoteBuffersToStackPass( /*isSmallAlloc =*/[](Value value) { return value.getParentRegion()->getParentOfType(); @@ -132,13 +132,13 @@ void createGemmGPUOptPipelineImpl(OpPassManager &pm) { options.annotatePrefix = "__byteir_gpu_gemm_tile"; createGPUMappingForallTransform(pm, options); pm.addPass(createTransformDialectInterpreter(true)); - pm.addPass(createCSEPass()); - pm.addPass(createCanonicalizerPass()); - pm.addPass(createGpuLauchSinkIndexComputationsPass()); - { OpPassManager anchoredPM(func::FuncOp::getOperationName()); + anchoredPM.addPass(createCSEPass()); + anchoredPM.addPass(createCanonicalizerPass()); + anchoredPM.addPass(createGpuLauchSinkIndexComputationsPass()); + 
anchoredPM.addPass(createPromoteBuffersToStackPass( /*isSmallAlloc =*/[](Value value) { return value.getParentRegion()->getParentOfType(); @@ -147,14 +147,16 @@ void createGemmGPUOptPipelineImpl(OpPassManager &pm) { pm.addNestedPass(createAnchoredPipelinePass( getByteIRMatmulEpilogueFusionAttrName(), anchoredPM)); } - pm.addPass(createGpuKernelOutliningPass()); { OpPassManager anchoredPM(func::FuncOp::getOperationName()); + + anchoredPM.addPass(createLegalizeGPULaunchPass()); // anchoredPM.addPass(createSetSharedMemorySizePass()); pm.addNestedPass(createAnchoredPipelinePass( getByteIRMatmulEpilogueFusionAttrName(), anchoredPM)); } + pm.addPass(createGpuKernelOutliningPass()); } void createGPUOptPipelineImpl(OpPassManager &pm, const bool &useBarePtrCallConv,