HIP and Spack (#468)

* HIP compatibility, remove need for hipify-perl * split GPU N-body code into additional units handle gpu-direct with interface compile_definition removed duplicate cartesian multipole implementation
sekelle · Dec 20, 2024 · cad2c6e · cad2c6e
1 parent 261adf7
commit cad2c6e
Show file tree

Hide file tree

Showing 27 changed files with 428 additions and 153 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -26,18 +26,41 @@ if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
   set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
-check_language(CUDA)
-if(CMAKE_CUDA_COMPILER)
-    enable_language(CUDA)
-    find_package(CUDAToolkit)
-    set(CMAKE_CUDA_STANDARD 17)
+option(CSTONE_WITH_CUDA "Enable building for NVIDIA GPUs" ON)
+option(CSTONE_WITH_HIP "Enable building for AMD GPUs" ON)
+option(CSTONE_WITH_GPU_AWARE_MPI "Enable CUDA-aware MPI communication" OFF)
+
+if(GPU_DIRECT)
+    message(WARNING "Option GPU_DIRECT is deprecated and will be removed. Use -DCSTONE_WITH_GPU_AWARE_MPI=ON instead.")
+    set(CSTONE_WITH_GPU_AWARE_MPI ON)
+endif()
+
+if(CSTONE_WITH_CUDA)
+    check_language(CUDA)
+    if(CMAKE_CUDA_COMPILER)
+        enable_language(CUDA)
+        find_package(CUDAToolkit)
+        set(CMAKE_CUDA_STANDARD 17)
+    else()
+        message(STATUS "No CUDA support")
+        set(CSTONE_WITH_CUDA OFF)
+    endif()
+endif()
+
+if(CSTONE_WITH_HIP)
+    check_language(HIP)
+    if(CMAKE_HIP_COMPILER)
+        enable_language(HIP)
+        find_package(hip)
+        set(CMAKE_HIP_STANDARD 17)
+    else()
+        message(STATUS "No HIP support")
+        set(CSTONE_WITH_HIP OFF)
+    endif()
 endif()
 
-check_language(HIP)
-if(CMAKE_HIP_COMPILER AND NOT CMAKE_CUDA_COMPILER)
-    enable_language(HIP)
-    find_package(hip)
-    set(CMAKE_HIP_STANDARD 17)
+if(CSTONE_WITH_HIP AND CSTONE_WITH_CUDA)
+    message(FATAL_ERROR "CUDA and HIP cannot both be turned on")
 endif()
 
 add_subdirectory(include)

diff --git a/README.md b/README.md
@@ -39,7 +39,7 @@ CUDA version: 11.6 or later, HIP version 5.2 or later.
 
 Example CMake invocation:
 ```shell
-CC=mpicc CXX=mpicxx cmake -DCMAKE_CUDA_ARCHITECTURES=60,80,90 -DGPU_DIRECT=<ON/OFF> -DCMAKE_CUDA_FLAGS=-ccbin=mpicxx <GIT_SOURCE_DIR>
+CC=mpicc CXX=mpicxx cmake -DCMAKE_CUDA_ARCHITECTURES=60;80;90 -DCSTONE_WITH_GPU_AWARE_MPI=<ON/OFF> -DCMAKE_CUDA_FLAGS=-ccbin=mpicxx <GIT_SOURCE_DIR>
 ```
 
 GPU-direct (RDMA) MPI communication can be turned on or off by supplying `-D GPU_DIRECT=ON`. Default is `OFF`.

diff --git a/include/cstone/CMakeLists.txt b/include/cstone/CMakeLists.txt
@@ -14,4 +14,8 @@ if (CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER)
                            $<TARGET_OBJECTS:traversal_obj>
                            $<TARGET_OBJECTS:source_center_gpu_obj>
                            $<TARGET_OBJECTS:gpu_utils_obj>)
+
+    if (CSTONE_WITH_GPU_AWARE_MPI)
+        target_compile_definitions(cstone_gpu INTERFACE CSTONE_HAVE_GPU_AWARE_MPI)
+    endif()
 endif ()
diff --git a/include/cstone/cuda/cub.hpp b/include/cstone/cuda/cub.hpp
@@ -0,0 +1,14 @@
+
+#pragma once
+
+#ifdef __HIPCC__
+
+#include <hipcub/hipcub.hpp>
+
+namespace cub = hipcub;
+
+#else
+
+#include <cub/cub.cuh>
+
+#endif
diff --git a/include/cstone/cuda/cuda_runtime.hpp b/include/cstone/cuda/cuda_runtime.hpp
@@ -0,0 +1,61 @@
+/*! @file
+ * @brief CUDA/HIP runtime API compatiblity wrapper
+ *
+ * @author Sebastian Keller <[email protected]>
+ */
+
+#pragma once
+
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
+
+#include <hip/hip_runtime.h>
+
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaErrorInvalidValue hipErrorInvalidValue
+#define cudaError_t hipError_t
+#define cudaEventCreate hipEventCreate
+#define cudaEventDestroy hipEventDestroy
+#define cudaEventElapsedTime hipEventElapsedTime
+#define cudaEventRecord hipEventRecord
+#define cudaEventSynchronize hipEventSynchronize
+#define cudaEvent_t hipEvent_t
+#define cudaFree hipFree
+#define cudaFreeHost hipFreeHost
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorName hipGetErrorName
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaMalloc hipMalloc
+#define cudaMallocHost hipMallocHost
+#define cudaMallocManaged hipMallocManaged
+#define cudaMemAttachGlobal hipMemAttachGlobal
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyFromSymbol hipMemcpyFromSymbol
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyToSymbol hipMemcpyToSymbol
+#define cudaMemoryTypeDevice hipMemoryTypeDevice
+#define cudaMemoryTypeManaged hipMemoryTypeManaged
+#define cudaMemset hipMemset
+#define cudaPointerAttributes hipPointerAttribute_t
+#define cudaPointerGetAttributes hipPointerGetAttributes
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreate hipStreamCreate
+#define cudaStreamDestroy hipStreamDestroy
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+
+#define GPU_SYMBOL HIP_SYMBOL
+
+#else
+
+#include <cuda_runtime.h>
+
+#define GPU_SYMBOL(x) x
+
+#endif
diff --git a/include/cstone/cuda/cuda_utils.cuh b/include/cstone/cuda/cuda_utils.cuh
@@ -7,7 +7,8 @@
 #pragma once
 
 #include <type_traits>
-#include <cuda_runtime.h>
+#include <vector>
+#include "cuda_runtime.hpp"
 
 #include "device_vector.h"
 #include "cuda_stubs.h"

diff --git a/include/cstone/cuda/device_vector.cu b/include/cstone/cuda/device_vector.cu
@@ -8,6 +8,8 @@
 #include <thrust/device_vector.h>
 #include <thrust/fill.h>
 
+#include "cstone/cuda/cuda_runtime.hpp"
+#include "cstone/cuda/errorcheck.cuh"
 #include "cstone/util/noinit_thrust.cuh"
 
 #include "device_vector.h"
@@ -83,7 +85,7 @@ DeviceVector<T>::DeviceVector(const T* first, const T* last)
 {
     auto size = last - first;
     impl_->resize(size);
-    cudaMemcpy(impl_->data(), first, size * sizeof(T), cudaMemcpyHostToDevice);
+    checkGpuErrors(cudaMemcpy(impl_->data(), first, size * sizeof(T), cudaMemcpyHostToDevice));
 }
 
 template<class T>
@@ -174,6 +176,7 @@ template class DeviceVector<util::array<int, 2>>;
 template class DeviceVector<util::array<int, 3>>;
 template class DeviceVector<util::array<unsigned, 1>>;
 template class DeviceVector<util::array<uint64_t, 1>>;
+template class DeviceVector<util::array<uint64_t, 2>>;
 template class DeviceVector<util::array<unsigned, 2>>;
 template class DeviceVector<util::array<float, 3>>;
 template class DeviceVector<util::array<double, 3>>;

diff --git a/include/cstone/cuda/errorcheck.cuh b/include/cstone/cuda/errorcheck.cuh
@@ -25,7 +25,7 @@
 #pragma once
 
 #include <cstdio>
-#include <cuda_runtime.h>
+#include "cuda_runtime.hpp"
 
 inline void checkErr(cudaError_t err, const char* filename, int lineno, const char* funcName)
 {

diff --git a/include/cstone/cuda/gpu_config.cuh b/include/cstone/cuda/gpu_config.cuh
@@ -32,8 +32,7 @@
 #pragma once
 
 #include <cstdint>
-#include <cuda_runtime.h>
-
+#include "cstone/cuda/cuda_runtime.hpp"
 #include "cstone/cuda/errorcheck.cuh"
 
 namespace cstone

diff --git a/include/cstone/focus/rebalance_gpu.cu b/include/cstone/focus/rebalance_gpu.cu
@@ -28,8 +28,7 @@
  * @author Sebastian Keller <[email protected]>
  */
 
-#include <cub/cub.cuh>
-
+#include "cstone/cuda/cub.hpp"
 #include "cstone/cuda/errorcheck.cuh"
 #include "cstone/focus/rebalance.hpp"
 #include "cstone/focus/rebalance_gpu.h"
@@ -159,7 +158,7 @@ bool protectAncestorsGpu(const KeyType* prefixes,
     protectAncestorsKernel<<<iceil(numNodes, numThreads), numThreads>>>(prefixes, parents, nodeOps, numNodes);
 
     int numNodesModify;
-    checkGpuErrors(cudaMemcpyFromSymbol(&numNodesModify, nodeOpSum, sizeof(int)));
+    checkGpuErrors(cudaMemcpyFromSymbol(&numNodesModify, GPU_SYMBOL(nodeOpSum), sizeof(int)));
 
     return numNodesModify == 0;
 }
@@ -197,7 +196,7 @@ ResolutionStatus enforceKeysGpu(const KeyType* forcedKeys,
     }
 
     int status;
-    checkGpuErrors(cudaMemcpyFromSymbol(&status, enforceKeyStatus_device, sizeof(ResolutionStatus)));
+    checkGpuErrors(cudaMemcpyFromSymbol(&status, GPU_SYMBOL(enforceKeyStatus_device), sizeof(ResolutionStatus)));
     return static_cast<ResolutionStatus>(status);
 }
 
@@ -215,4 +214,4 @@ template ResolutionStatus enforceKeysGpu(const uint64_t* forcedKeys,
                                          const TreeNodeIndex* parents,
                                          TreeNodeIndex* nodeOps);
 
-} // namespace cstone
+} // namespace cstone
diff --git a/include/cstone/focus/source_center_gpu.h b/include/cstone/focus/source_center_gpu.h
@@ -30,6 +30,7 @@
 
 #pragma once
 
+#include "cstone/focus/source_center.hpp"
 #include "cstone/tree/definitions.h"
 
 namespace cstone

diff --git a/include/cstone/halos/gather_halos_gpu.cu b/include/cstone/halos/gather_halos_gpu.cu
@@ -30,6 +30,7 @@
  * @author Sebastian Keller <[email protected]>
  */
 
+#include "cstone/cuda/cuda_runtime.hpp"
 #include "cstone/primitives/math.hpp"
 #include "cstone/primitives/stl.hpp"
 #include "cstone/util/array.hpp"

diff --git a/include/cstone/primitives/mpi_cuda.cuh b/include/cstone/primitives/mpi_cuda.cuh
@@ -32,13 +32,12 @@
 #pragma once
 
 #include <vector>
-#include <cuda_runtime.h>
 
 #include "cstone/primitives/mpi_wrappers.hpp"
 #include "cstone/util/noinit_alloc.hpp"
 #include "cstone/cuda/errorcheck.cuh"
 
-#ifdef USE_GPU_DIRECT
+#ifdef CSTONE_HAVE_GPU_AWARE_MPI
 constexpr inline bool useGpuDirect = true;
 #else
 constexpr inline bool useGpuDirect = false;

diff --git a/include/cstone/primitives/primitives_gpu.cu b/include/cstone/primitives/primitives_gpu.cu
@@ -29,8 +29,6 @@
  * @author Sebastian Keller <[email protected]>
  */
 
-#include <cub/cub.cuh>
-
 #include <thrust/binary_search.h>
 #include <thrust/count.h>
 #include <thrust/execution_policy.h>
@@ -39,6 +37,7 @@
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
 
+#include "cstone/cuda/cub.hpp"
 #include "cstone/cuda/errorcheck.cuh"
 #include "cstone/primitives/math.hpp"
 #include "cstone/util/array.hpp"
@@ -297,7 +296,7 @@ void sortByKeyGpu(KeyType* first, KeyType* last, ValueType* values, KeyType* key
     // Determine temporary device storage requirements
     void* d_tempStorage     = nullptr;
     size_t tempStorageBytes = 0;
-    cub::DeviceRadixSort::SortPairs(d_tempStorage, tempStorageBytes, d_keys, d_values, numElements);
+    checkGpuErrors(cub::DeviceRadixSort::SortPairs(d_tempStorage, tempStorageBytes, d_keys, d_values, numElements));
 
     // Allocate temporary storage
     checkGpuErrors(cudaMalloc(&d_tempStorage, tempStorageBytes));

diff --git a/include/cstone/primitives/warpscan.cuh b/include/cstone/primitives/warpscan.cuh
@@ -205,8 +205,6 @@ __device__ __forceinline__ GpuConfig::ThreadMask lanemask_le()
  */
 __device__ __forceinline__ int inclusiveSegscan(int value, int distance)
 {
-    // distance should be less-equal the lane index
-    assert(distance <= (threadIdx.x & (GpuConfig::warpSize - 1)));
 #pragma unroll
     for (int i = 1; i < GpuConfig::warpSize; i *= 2)
     {
@@ -242,7 +240,6 @@ __device__ __forceinline__ int inclusiveSegscanInt(const int packedValue, const
     // distance = number of preceding lanes to include in scanned value
     // e.g. if distance = 0, then no preceding lane value will be added to scannedValue
     int distance = countLeadingZeros(flags & lanemask_le()) + laneIdx - (GpuConfig::warpSize - 1);
-    assert(distance >= 0);
     int scannedValue = inclusiveSegscan(value, imin(distance, laneIdx));
 
     // the lowest lane index for which packedValue was negative, warpSize if all were positive
@@ -299,4 +296,13 @@ __device__ __forceinline__ int spreadSeg8(int val)
     return shflSync(val, laneIdx >> 3) + (laneIdx & 7);
 }
 
-} // namespace cstone
+__device__ __forceinline__ float atomicMinFloat(float* addr, float value)
+{
+    float old;
+    old = (value >= 0) ? __int_as_float(atomicMin((int*)addr, __float_as_int(value)))
+                       : __uint_as_float(atomicMax((unsigned int*)addr, __float_as_uint(value)));
+
+    return old;
+}
+
+} // namespace cstone
diff --git a/include/cstone/traversal/CMakeLists.txt b/include/cstone/traversal/CMakeLists.txt
@@ -1,8 +1,8 @@
 if(CMAKE_HIP_COMPILER)
-    set_source_files_properties(collisions_gpu.cu PROPERTIES LANGUAGE HIP)
+    set_source_files_properties(collisions_gpu.cu groups_gpu.cu PROPERTIES LANGUAGE HIP)
 endif()
 
 if(CMAKE_CUDA_COMPILER OR CMAKE_HIP_COMPILER)
-    add_library(traversal_obj OBJECT collisions_gpu.cu)
+    add_library(traversal_obj OBJECT collisions_gpu.cu groups_gpu.cu)
     target_include_directories(traversal_obj PRIVATE ${PROJECT_SOURCE_DIR}/include)
-endif()
+endif()
diff --git a/include/cstone/traversal/find_neighbors.cuh b/include/cstone/traversal/find_neighbors.cuh
@@ -1,8 +1,7 @@
 /*
  * MIT License
  *
- * Copyright (c) 2021 CSCS, ETH Zurich
- *               2021 University of Basel
+ * Copyright (c) 2024 CSCS, ETH Zurich, University of Basel, University of Zurich
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal