Skip to content

Commit

Permalink
perf: merge the benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Nov 3, 2024
1 parent 1c09c3b commit 8a344d5
Show file tree
Hide file tree
Showing 7 changed files with 55 additions and 191 deletions.
14 changes: 11 additions & 3 deletions .buildkite/benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@ steps:
command: |
julia --project=benchmarks -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop([PackageSpec(path=pwd())])'
Pkg.develop([
PackageSpec(path=pwd()),
PackageSpec(path="lib/LuxLib"),
PackageSpec(path="lib/MLDataDevices"),
])'
julia --project=benchmarks -e 'println("--- :julia: Run Benchmarks")
include("benchmarks/runbenchmarks.jl")'
Expand All @@ -36,8 +40,12 @@ steps:
version: "1"
command: |
julia --project=benchmarks -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop([PackageSpec(path=pwd())])'
using Pkg;
Pkg.develop([
PackageSpec(path=pwd()),
PackageSpec(path="lib/LuxLib"),
PackageSpec(path="lib/MLDataDevices"),
])'
julia --project=benchmarks -e 'println("--- :julia: Add CUDA to benchmarks environment")
using Pkg
Expand Down
3 changes: 3 additions & 0 deletions benchmarks/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,14 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
LuxLib = "82251201-b29d-42c6-8e01-566dec8acb11"
MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
ThreadPinning = "811555cd-349b-4f26-b7bc-1f208b848042"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
55 changes: 41 additions & 14 deletions benchmarks/setup.jl
Original file line number Diff line number Diff line change
@@ -1,30 +1,42 @@
using ADTypes: ADTypes, AutoEnzyme, AutoZygote
using ADTypes
using Adapt: adapt
using Lux: Lux, BatchNorm, Chain, Conv, Dense, Dropout, FlattenLayer, MaxPool
using MLDataDevices: AbstractDevice, CPUDevice, CUDADevice, AMDGPUDevice
using NNlib: relu, gelu
using Lux
using LuxLib
using MLDataDevices
using MLDataDevices: AbstractDevice
using NNlib
using Random: Random
using StableRNGs: StableRNG

# AD Backends
using Enzyme: Enzyme
using Zygote: Zygote

# Helper Functions
# NOTE(review): pre-refactor copies of the device helpers (removed side of this
# diff hunk) — superseded by the un-`@inline`'d versions defined below.
# Per-device "wait for queued work" hooks; the CPU path needs no barrier.
@inline synchronize(::CPUDevice) = nothing
@inline synchronize(::AMDGPUDevice) = AMDGPU.synchronize()
@inline synchronize(::CUDADevice) = CUDA.synchronize()

# Free cached device memory between benchmark runs; on CPU this is a plain GC.
@inline reclaim(::CPUDevice) = GC.gc()
@inline reclaim(::AMDGPUDevice) = AMDGPU.HIP.reclaim()
@inline reclaim(::CUDADevice) = CUDA.reclaim()

# Scalar AD targets: reduce a forward pass to one number via sum of squares.
# `Lux.apply` returns an (output, state) tuple; only the output is reduced.
@inline sumabs2(model, x, p, st) = sum(abs2, first(Lux.apply(model, x, p, st)))
@inline sumabs2(model, x) = sum(abs2, model(x))
# Device barriers: ensure all queued work on the device has completed before a
# timing measurement is taken. The CPU needs no barrier.
# NOTE(review): the GPU methods call into AMDGPU/CUDA/Metal/oneAPI — assumed to
# be loaded by the benchmark runner for the selected backend; confirm there.
function synchronize(::CPUDevice)
    return nothing
end
function synchronize(::AMDGPUDevice)
    return AMDGPU.synchronize()
end
function synchronize(::CUDADevice)
    return CUDA.synchronize()
end
function synchronize(::MetalDevice)
    return Metal.synchronize()
end
function synchronize(::oneAPIDevice)
    return oneAPI.synchronize()
end

# Release cached device memory between benchmark groups so one benchmark's
# allocations do not skew the next. Metal and oneAPI expose no reclaim hook
# here, so those methods are deliberate no-ops (see commented calls).
function reclaim(::CPUDevice)
    return GC.gc()
end
function reclaim(::AMDGPUDevice)
    return AMDGPU.HIP.reclaim()
end
function reclaim(::CUDADevice)
    return CUDA.reclaim()
end
function reclaim(::MetalDevice)
    return nothing # Metal.reclaim()
end
function reclaim(::oneAPIDevice)
    return nothing # oneAPI.reclaim()
end

# Scalar targets for the AD benchmarks: collapse a forward pass into a single
# number with a sum-of-squares reduction.

# Lux layer: `Lux.apply` yields an (output, state) tuple; only the output
# contributes to the loss.
function sumabs2(layer::Lux.AbstractLuxLayer, x, ps, st)
    out, _ = Lux.apply(layer, x, ps, st)
    return sum(abs2, out)
end

# Arbitrary callable: squared-sum of whatever `fn` returns.
function sumabs2(fn::F, args...) where {F}
    return sum(abs2, fn(args...))
end

# Callable returning a tuple: squared-sum of its first element only.
function sumabs2first(fn::F, args...) where {F}
    return sum(abs2, first(fn(args...)))
end

# Map a benchmark-group label ("CPU", "AMDGPU", "CUDA", "Metal", "oneAPI") to
# the corresponding MLDataDevices device instance; any other label is an error.
function benchmark_group_to_backend(benchmark_group::String)
    if benchmark_group == "CPU"
        return CPUDevice()
    elseif benchmark_group == "AMDGPU"
        return AMDGPUDevice()
    elseif benchmark_group == "CUDA"
        return CUDADevice()
    elseif benchmark_group == "Metal"
        return MetalDevice()
    elseif benchmark_group == "oneAPI"
        return oneAPIDevice()
    else
        error("Unknown backend: $(benchmark_group)")
    end
end

Expand All @@ -39,12 +51,14 @@ end
# Main benchmark files
include("setups/layers.jl")
include("setups/models.jl")
include("setups/luxlib.jl")

function setup_benchmarks!(suite::BenchmarkGroup, backend::String, num_cpu_threads::Int64)
dev = benchmark_group_to_backend(backend)
cpu_or_gpu = backend == "CPU" ? "CPU" : "GPU"
final_backend = backend == "CPU" ? string(num_cpu_threads, " ", "thread(s)") : backend

# Model Benchmarks
setup_dense_benchmarks!(suite, cpu_or_gpu, final_backend, dev)

setup_conv_benchmarks!(suite, cpu_or_gpu, final_backend, dev)
Expand All @@ -54,6 +68,19 @@ function setup_benchmarks!(suite::BenchmarkGroup, backend::String, num_cpu_threa
setup_mlp_benchmarks!(suite, cpu_or_gpu, final_backend, dev)

setup_lenet_benchmarks!(suite, cpu_or_gpu, final_backend, dev)

# Layer Benchmarks
setup_dense_benchmarks!(suite, cpu_or_gpu, final_backend, dev)

setup_bias_activation_benchmarks!(suite, cpu_or_gpu, final_backend, dev)

setup_batchnorm_benchmarks!(suite, cpu_or_gpu, final_backend, dev)

setup_layernorm_benchmarks!(suite, cpu_or_gpu, final_backend, dev)

setup_groupnorm_benchmarks!(suite, cpu_or_gpu, final_backend, dev)

setup_batched_matmul_benchmarks!(suite, cpu_or_gpu, final_backend, dev)
end

function setup_forward_pass_benchmark!(suite::BenchmarkGroup, benchmark_name::String,
Expand Down
47 changes: 0 additions & 47 deletions lib/LuxLib/benchmarks/setup.jl → benchmarks/setups/luxlib.jl
Original file line number Diff line number Diff line change
@@ -1,50 +1,3 @@
using MLDataDevices, StableRNGs, Random
using NNlib
using Zygote

# NOTE(review): removed copy from lib/LuxLib/benchmarks/setup.jl (this commit
# merges it into benchmarks/setup.jl). Per-device wait-for-completion hooks.
synchronize(::CPUDevice) = nothing
synchronize(::AMDGPUDevice) = AMDGPU.synchronize()
synchronize(::CUDADevice) = CUDA.synchronize()
synchronize(::MetalDevice) = Metal.synchronize()
synchronize(::oneAPIDevice) = oneAPI.synchronize()

# NOTE(review): removed copy (merged into benchmarks/setup.jl). Frees cached
# device memory between runs; Metal/oneAPI have no hook, so they are no-ops.
reclaim(::CPUDevice) = GC.gc()
reclaim(::AMDGPUDevice) = AMDGPU.HIP.reclaim()
reclaim(::CUDADevice) = CUDA.reclaim()
reclaim(::MetalDevice) = nothing # Metal.reclaim()
reclaim(::oneAPIDevice) = nothing # oneAPI.reclaim()

# NOTE(review): removed copy (merged into benchmarks/setup.jl). Translates a
# benchmark-group label into a device object; unknown labels raise an error.
function benchmark_group_to_backend(benchmark_group::String)
    benchmark_group == "CPU" && return CPUDevice()
    benchmark_group == "AMDGPU" && return AMDGPUDevice()
    benchmark_group == "CUDA" && return CUDADevice()
    benchmark_group == "Metal" && return MetalDevice()
    benchmark_group == "oneAPI" && return oneAPIDevice()
    error("Unknown backend: $(benchmark_group)")
end

# NOTE(review): removed copies (merged into benchmarks/setup.jl).
# Squared-sum of a callable's result, used as a scalar AD target.
sumabs2(f::F, args...) where {F} = sum(abs2, f(args...))
# Variant for callables returning a tuple: reduces only the first element.
sumabs2first(f::F, args...) where {F} = sum(abs2, first(f(args...)))

# NOTE(review): removed copy from lib/LuxLib/benchmarks/setup.jl — its calls
# now live in the merged benchmarks/setup.jl harness.
# Registers every LuxLib primitive benchmark for `backend` into `suite`.
function setup_benchmarks!(suite::BenchmarkGroup, backend::String, num_cpu_threads::Int64)
    dev = benchmark_group_to_backend(backend)
    # Top-level grouping: everything that is not "CPU" is bucketed as "GPU".
    cpu_or_gpu = backend == "CPU" ? "CPU" : "GPU"
    # CPU results are keyed by thread count so runs at different -t settings
    # do not collide in the results tree.
    final_backend = backend == "CPU" ? string(num_cpu_threads, " ", "thread(s)") : backend

    setup_dense_benchmarks!(suite, cpu_or_gpu, final_backend, dev)

    setup_bias_activation_benchmarks!(suite, cpu_or_gpu, final_backend, dev)

    setup_batchnorm_benchmarks!(suite, cpu_or_gpu, final_backend, dev)

    setup_layernorm_benchmarks!(suite, cpu_or_gpu, final_backend, dev)

    setup_groupnorm_benchmarks!(suite, cpu_or_gpu, final_backend, dev)

    setup_batched_matmul_benchmarks!(suite, cpu_or_gpu, final_backend, dev)
end

# Dense
function dense_setup(N::Int, bias::Bool, dev::MLDataDevices.AbstractDevice)
rng = StableRNG(123)
x = randn(rng, Float32, N, 128) |> dev
Expand Down
12 changes: 0 additions & 12 deletions lib/LuxLib/benchmarks/Project.toml

This file was deleted.

57 changes: 0 additions & 57 deletions lib/LuxLib/benchmarks/aggregate.jl

This file was deleted.

58 changes: 0 additions & 58 deletions lib/LuxLib/benchmarks/runbenchmarks.jl

This file was deleted.

0 comments on commit 8a344d5

Please sign in to comment.