Skip to content

Commit

Permalink
HIP and Spack (#468)
Browse files Browse the repository at this point in the history
* HIP compatibility, remove need for hipify-perl

* split GPU N-body code into additional units

handle gpu-direct with interface compile_definition
removed duplicate cartesian multipole implementation
  • Loading branch information
sekelle committed Dec 20, 2024
1 parent 261adf7 commit cad2c6e
Show file tree
Hide file tree
Showing 27 changed files with 428 additions and 153 deletions.
43 changes: 33 additions & 10 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,41 @@ if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()

check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
find_package(CUDAToolkit)
set(CMAKE_CUDA_STANDARD 17)
option(CSTONE_WITH_CUDA "Enable building for NVIDIA GPUs" ON)
option(CSTONE_WITH_HIP "Enable building for AMD GPUs" ON)
option(CSTONE_WITH_GPU_AWARE_MPI "Enable CUDA-aware MPI communication" OFF)

if(GPU_DIRECT)
message(WARNING "Option GPU_DIRECT is deprecated and will be removed. Use -DCSTONE_WITH_GPU_AWARE_MPI=ON instead.")
set(CSTONE_WITH_GPU_AWARE_MPI ON)
endif()

if(CSTONE_WITH_CUDA)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
find_package(CUDAToolkit)
set(CMAKE_CUDA_STANDARD 17)
else()
message(STATUS "No CUDA support")
set(CSTONE_WITH_CUDA OFF)
endif()
endif()

if(CSTONE_WITH_HIP)
check_language(HIP)
if(CMAKE_HIP_COMPILER)
enable_language(HIP)
find_package(hip)
set(CMAKE_HIP_STANDARD 17)
else()
message(STATUS "No HIP support")
set(CSTONE_WITH_HIP OFF)
endif()
endif()

check_language(HIP)
if(CMAKE_HIP_COMPILER AND NOT CMAKE_CUDA_COMPILER)
enable_language(HIP)
find_package(hip)
set(CMAKE_HIP_STANDARD 17)
if(CSTONE_WITH_HIP AND CSTONE_WITH_CUDA)
message(FATAL_ERROR "CUDA and HIP cannot both be turned on")
endif()

add_subdirectory(include)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ CUDA version: 11.6 or later, HIP version 5.2 or later.

Example CMake invocation:
```shell
CC=mpicc CXX=mpicxx cmake -DCMAKE_CUDA_ARCHITECTURES=60,80,90 -DGPU_DIRECT=<ON/OFF> -DCMAKE_CUDA_FLAGS=-ccbin=mpicxx <GIT_SOURCE_DIR>
CC=mpicc CXX=mpicxx cmake -DCMAKE_CUDA_ARCHITECTURES=60;80;90 -DCSTONE_WITH_GPU_AWARE_MPI=<ON/OFF> -DCMAKE_CUDA_FLAGS=-ccbin=mpicxx <GIT_SOURCE_DIR>
```

GPU-direct (RDMA) MPI communication can be turned on or off by supplying `-D GPU_DIRECT=ON`. Default is `OFF`.
Expand Down
4 changes: 4 additions & 0 deletions include/cstone/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,8 @@ if (CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER)
$<TARGET_OBJECTS:traversal_obj>
$<TARGET_OBJECTS:source_center_gpu_obj>
$<TARGET_OBJECTS:gpu_utils_obj>)

if (CSTONE_WITH_GPU_AWARE_MPI)
target_compile_definitions(cstone_gpu INTERFACE CSTONE_HAVE_GPU_AWARE_MPI)
endif()
endif ()
14 changes: 14 additions & 0 deletions include/cstone/cuda/cub.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

#pragma once

#ifdef __HIPCC__

#include <hipcub/hipcub.hpp>

namespace cub = hipcub;

#else

#include <cub/cub.cuh>

#endif
61 changes: 61 additions & 0 deletions include/cstone/cuda/cuda_runtime.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/*! @file
* @brief CUDA/HIP runtime API compatiblity wrapper
*
* @author Sebastian Keller <[email protected]>
*/

#pragma once

#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)

#include <hip/hip_runtime.h>

#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaErrorInvalidValue hipErrorInvalidValue
#define cudaError_t hipError_t
#define cudaEventCreate hipEventCreate
#define cudaEventDestroy hipEventDestroy
#define cudaEventElapsedTime hipEventElapsedTime
#define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t
#define cudaFree hipFree
#define cudaFreeHost hipFreeHost
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorName hipGetErrorName
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaMalloc hipMalloc
#define cudaMallocHost hipMallocHost
#define cudaMallocManaged hipMallocManaged
#define cudaMemAttachGlobal hipMemAttachGlobal
#define cudaMemcpy hipMemcpy
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyFromSymbol hipMemcpyFromSymbol
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyToSymbol hipMemcpyToSymbol
#define cudaMemoryTypeDevice hipMemoryTypeDevice
#define cudaMemoryTypeManaged hipMemoryTypeManaged
#define cudaMemset hipMemset
#define cudaPointerAttributes hipPointerAttribute_t
#define cudaPointerGetAttributes hipPointerGetAttributes
#define cudaSetDevice hipSetDevice
#define cudaStreamCreate hipStreamCreate
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess

#define GPU_SYMBOL HIP_SYMBOL

#else

#include <cuda_runtime.h>

#define GPU_SYMBOL(x) x

#endif
3 changes: 2 additions & 1 deletion include/cstone/cuda/cuda_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
#pragma once

#include <type_traits>
#include <cuda_runtime.h>
#include <vector>
#include "cuda_runtime.hpp"

#include "device_vector.h"
#include "cuda_stubs.h"
Expand Down
5 changes: 4 additions & 1 deletion include/cstone/cuda/device_vector.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include <thrust/device_vector.h>
#include <thrust/fill.h>

#include "cstone/cuda/cuda_runtime.hpp"
#include "cstone/cuda/errorcheck.cuh"
#include "cstone/util/noinit_thrust.cuh"

#include "device_vector.h"
Expand Down Expand Up @@ -83,7 +85,7 @@ DeviceVector<T>::DeviceVector(const T* first, const T* last)
{
auto size = last - first;
impl_->resize(size);
cudaMemcpy(impl_->data(), first, size * sizeof(T), cudaMemcpyHostToDevice);
checkGpuErrors(cudaMemcpy(impl_->data(), first, size * sizeof(T), cudaMemcpyHostToDevice));
}

template<class T>
Expand Down Expand Up @@ -174,6 +176,7 @@ template class DeviceVector<util::array<int, 2>>;
template class DeviceVector<util::array<int, 3>>;
template class DeviceVector<util::array<unsigned, 1>>;
template class DeviceVector<util::array<uint64_t, 1>>;
template class DeviceVector<util::array<uint64_t, 2>>;
template class DeviceVector<util::array<unsigned, 2>>;
template class DeviceVector<util::array<float, 3>>;
template class DeviceVector<util::array<double, 3>>;
Expand Down
2 changes: 1 addition & 1 deletion include/cstone/cuda/errorcheck.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
#pragma once

#include <cstdio>
#include <cuda_runtime.h>
#include "cuda_runtime.hpp"

inline void checkErr(cudaError_t err, const char* filename, int lineno, const char* funcName)
{
Expand Down
3 changes: 1 addition & 2 deletions include/cstone/cuda/gpu_config.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@
#pragma once

#include <cstdint>
#include <cuda_runtime.h>

#include "cstone/cuda/cuda_runtime.hpp"
#include "cstone/cuda/errorcheck.cuh"

namespace cstone
Expand Down
9 changes: 4 additions & 5 deletions include/cstone/focus/rebalance_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@
* @author Sebastian Keller <[email protected]>
*/

#include <cub/cub.cuh>

#include "cstone/cuda/cub.hpp"
#include "cstone/cuda/errorcheck.cuh"
#include "cstone/focus/rebalance.hpp"
#include "cstone/focus/rebalance_gpu.h"
Expand Down Expand Up @@ -159,7 +158,7 @@ bool protectAncestorsGpu(const KeyType* prefixes,
protectAncestorsKernel<<<iceil(numNodes, numThreads), numThreads>>>(prefixes, parents, nodeOps, numNodes);

int numNodesModify;
checkGpuErrors(cudaMemcpyFromSymbol(&numNodesModify, nodeOpSum, sizeof(int)));
checkGpuErrors(cudaMemcpyFromSymbol(&numNodesModify, GPU_SYMBOL(nodeOpSum), sizeof(int)));

return numNodesModify == 0;
}
Expand Down Expand Up @@ -197,7 +196,7 @@ ResolutionStatus enforceKeysGpu(const KeyType* forcedKeys,
}

int status;
checkGpuErrors(cudaMemcpyFromSymbol(&status, enforceKeyStatus_device, sizeof(ResolutionStatus)));
checkGpuErrors(cudaMemcpyFromSymbol(&status, GPU_SYMBOL(enforceKeyStatus_device), sizeof(ResolutionStatus)));
return static_cast<ResolutionStatus>(status);
}

Expand All @@ -215,4 +214,4 @@ template ResolutionStatus enforceKeysGpu(const uint64_t* forcedKeys,
const TreeNodeIndex* parents,
TreeNodeIndex* nodeOps);

} // namespace cstone
} // namespace cstone
1 change: 1 addition & 0 deletions include/cstone/focus/source_center_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

#pragma once

#include "cstone/focus/source_center.hpp"
#include "cstone/tree/definitions.h"

namespace cstone
Expand Down
1 change: 1 addition & 0 deletions include/cstone/halos/gather_halos_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
* @author Sebastian Keller <[email protected]>
*/

#include "cstone/cuda/cuda_runtime.hpp"
#include "cstone/primitives/math.hpp"
#include "cstone/primitives/stl.hpp"
#include "cstone/util/array.hpp"
Expand Down
3 changes: 1 addition & 2 deletions include/cstone/primitives/mpi_cuda.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,12 @@
#pragma once

#include <vector>
#include <cuda_runtime.h>

#include "cstone/primitives/mpi_wrappers.hpp"
#include "cstone/util/noinit_alloc.hpp"
#include "cstone/cuda/errorcheck.cuh"

#ifdef USE_GPU_DIRECT
#ifdef CSTONE_HAVE_GPU_AWARE_MPI
constexpr inline bool useGpuDirect = true;
#else
constexpr inline bool useGpuDirect = false;
Expand Down
5 changes: 2 additions & 3 deletions include/cstone/primitives/primitives_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@
* @author Sebastian Keller <[email protected]>
*/

#include <cub/cub.cuh>

#include <thrust/binary_search.h>
#include <thrust/count.h>
#include <thrust/execution_policy.h>
Expand All @@ -39,6 +37,7 @@
#include <thrust/sequence.h>
#include <thrust/sort.h>

#include "cstone/cuda/cub.hpp"
#include "cstone/cuda/errorcheck.cuh"
#include "cstone/primitives/math.hpp"
#include "cstone/util/array.hpp"
Expand Down Expand Up @@ -297,7 +296,7 @@ void sortByKeyGpu(KeyType* first, KeyType* last, ValueType* values, KeyType* key
// Determine temporary device storage requirements
void* d_tempStorage = nullptr;
size_t tempStorageBytes = 0;
cub::DeviceRadixSort::SortPairs(d_tempStorage, tempStorageBytes, d_keys, d_values, numElements);
checkGpuErrors(cub::DeviceRadixSort::SortPairs(d_tempStorage, tempStorageBytes, d_keys, d_values, numElements));

// Allocate temporary storage
checkGpuErrors(cudaMalloc(&d_tempStorage, tempStorageBytes));
Expand Down
14 changes: 10 additions & 4 deletions include/cstone/primitives/warpscan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -205,8 +205,6 @@ __device__ __forceinline__ GpuConfig::ThreadMask lanemask_le()
*/
__device__ __forceinline__ int inclusiveSegscan(int value, int distance)
{
// distance should be less-equal the lane index
assert(distance <= (threadIdx.x & (GpuConfig::warpSize - 1)));
#pragma unroll
for (int i = 1; i < GpuConfig::warpSize; i *= 2)
{
Expand Down Expand Up @@ -242,7 +240,6 @@ __device__ __forceinline__ int inclusiveSegscanInt(const int packedValue, const
// distance = number of preceding lanes to include in scanned value
// e.g. if distance = 0, then no preceding lane value will be added to scannedValue
int distance = countLeadingZeros(flags & lanemask_le()) + laneIdx - (GpuConfig::warpSize - 1);
assert(distance >= 0);
int scannedValue = inclusiveSegscan(value, imin(distance, laneIdx));

// the lowest lane index for which packedValue was negative, warpSize if all were positive
Expand Down Expand Up @@ -299,4 +296,13 @@ __device__ __forceinline__ int spreadSeg8(int val)
return shflSync(val, laneIdx >> 3) + (laneIdx & 7);
}

} // namespace cstone
__device__ __forceinline__ float atomicMinFloat(float* addr, float value)
{
float old;
old = (value >= 0) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value)))
: __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));

return old;
}

} // namespace cstone
6 changes: 3 additions & 3 deletions include/cstone/traversal/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
if(CMAKE_HIP_COMPILER)
set_source_files_properties(collisions_gpu.cu PROPERTIES LANGUAGE HIP)
set_source_files_properties(collisions_gpu.cu groups_gpu.cu PROPERTIES LANGUAGE HIP)
endif()

if(CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER)
add_library(traversal_obj OBJECT collisions_gpu.cu)
add_library(traversal_obj OBJECT collisions_gpu.cu groups_gpu.cu)
target_include_directories(traversal_obj PRIVATE ${PROJECT_SOURCE_DIR}/include)
endif()
endif()
3 changes: 1 addition & 2 deletions include/cstone/traversal/find_neighbors.cuh
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
/*
* MIT License
*
* Copyright (c) 2021 CSCS, ETH Zurich
* 2021 University of Basel
* Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand Down
Loading

0 comments on commit cad2c6e

Please sign in to comment.