Merge pull request #34 from PROBIC/add-gpu-support
Add GPU accelerated abundance estimation
tmaklin authored Sep 11, 2024
2 parents 337d085 + 0781517 commit 1216451
Showing 5 changed files with 135 additions and 20 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
# Local files
build/
example/
external/

# Emacs autosaves
*.*~
48 changes: 40 additions & 8 deletions CMakeLists.txt
@@ -32,6 +32,27 @@ if(CMAKE_BUILD_WITH_FLTO)
endif()

## Check dependencies
### Torch
if(CMAKE_LIBTORCH_PATH)
message(STATUS "Torch libraries provided in: ${CMAKE_LIBTORCH_PATH}")
set(Torch_DIR ${CMAKE_LIBTORCH_PATH}/share/cmake/Torch)
endif()

find_package(Torch)
if (TORCH_FOUND)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
set(MSWEEP_TORCH_SUPPORT 1)
include_directories(${TORCH_INCLUDE_DIRS})
message(STATUS "Compiling mSWEEP with GPU and CPU support")
else()
set(MSWEEP_TORCH_SUPPORT 0)
if (DEFINED CMAKE_LIBTORCH_PATH)
message(FATAL_ERROR "Could not find libtorch in: ${CMAKE_LIBTORCH_PATH}")
else()
message(STATUS "Compiling mSWEEP with CPU support only")
endif()
endif()

find_package(OpenMP)
if (OPENMP_FOUND)
set(MSWEEP_OPENMP_SUPPORT 1)
@@ -251,7 +272,7 @@ if (DEFINED CMAKE_SEAMAT_HEADERS)
else()
FetchContent_Declare(seamat
GIT_REPOSITORY https://github.com/tmaklin/seamat.git
GIT_TAG v0.2.2
GIT_TAG v0.2.3
PREFIX "external"
SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/seamat"
BUILD_IN_SOURCE 0
@@ -263,26 +284,26 @@ else()
)
FetchContent_MakeAvailable(seamat)
set(CMAKE_SEAMAT_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/seamat/include ${CMAKE_CURRENT_BINARY_DIR}/_deps/seamat-build/include)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/mSWEEP_openmp_config.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/include/mSWEEP_openmp_config.hpp @ONLY)
endif()
include_directories(${CMAKE_SEAMAT_HEADERS})

## rcgpar
if (DEFINED CMAKE_RCGPAR_LIBRARY AND DEFINED CMAKE_RCGPAR_HEADERS AND DEFINED CMAKE_RCGUTILS_LIBRARY)
if (DEFINED CMAKE_RCGPAR_LIBRARIES AND DEFINED CMAKE_RCGPAR_HEADERS AND DEFINED CMAKE_RCGUTILS_LIBRARY)
message(STATUS "rcgpar headers provided in: ${CMAKE_RCGPAR_HEADERS}")
message(STATUS "rcgpar library provided in: ${CMAKE_RCGPAR_LIBRARY}")
message(STATUS "rcgpar library provided in: ${CMAKE_RCGPAR_LIBRARIES}")
message(STATUS "rcgutils library provided in: ${CMAKE_RCGUTILS_LIBRARY}")
else()
FetchContent_Declare(rcgpar
GIT_REPOSITORY https://github.com/tmaklin/rcgpar.git
GIT_TAG v1.1.3
GIT_TAG v1.2.1
PREFIX "external"
SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/rcgpar"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/rcgpar"
BUILD_IN_SOURCE 0
CMAKE_ARGS -D CMAKE_ENABLE_MPI_SUPPORT=${MSWEEP_MPI_SUPPORT}
-D CMAKE_SEAMAT_HEADERS=${CMAKE_SEAMAT_HEADERS}
-D CMAKE_BITMAGIC_HEADERS=${CMAKE_BITMAGIC_HEADERS}
-D CMAKE_LIBTORCH_PATH=${CMAKE_LIBTORCH_PATH}
-D CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-D "CMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
-D "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
@@ -291,12 +312,19 @@ else()
INSTALL_COMMAND ""
)
FetchContent_MakeAvailable(rcgpar)
add_dependencies(mSWEEP rcgomp rcgutils)

if (TORCH_FOUND)
add_dependencies(mSWEEP rcgomp rcggpu rcgutils)
set(CMAKE_RCGPAR_LIBRARIES "${CMAKE_CURRENT_BINARY_DIR}/lib/librcgomp.a" "${CMAKE_CURRENT_BINARY_DIR}/lib/librcggpu.a")
else()
add_dependencies(mSWEEP rcgomp rcgutils)
set(CMAKE_RCGPAR_LIBRARIES "${CMAKE_CURRENT_BINARY_DIR}/lib/librcgomp.a")
endif()
set(CMAKE_RCGPAR_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/rcgpar/include)
set(CMAKE_RCGPAR_LIBRARY "${CMAKE_CURRENT_BINARY_DIR}/lib/librcgomp.a")
set(CMAKE_RCGUTILS_LIBRARY "${CMAKE_CURRENT_BINARY_DIR}/lib/librcgutils.a")
endif()
target_link_libraries(mSWEEP ${CMAKE_RCGPAR_LIBRARY} ${CMAKE_RCGUTILS_LIBRARY})
target_link_libraries(mSWEEP ${CMAKE_RCGPAR_LIBRARIES})
target_link_libraries(mSWEEP ${CMAKE_RCGUTILS_LIBRARY})
include_directories(${CMAKE_RCGPAR_HEADERS})

## mGEMS
@@ -360,3 +388,7 @@ target_link_libraries(mSWEEP libmsweep)
if (OPENMP_FOUND)
target_link_libraries(mSWEEP OpenMP::OpenMP_CXX)
endif()

if (TORCH_FOUND)
target_link_libraries(mSWEEP "${TORCH_LIBRARIES}")
endif()
45 changes: 39 additions & 6 deletions README.md
@@ -12,7 +12,7 @@ In addition to mSWEEP, you will need to install [Themisto](https://github.com/al
## Conda
Install mSWEEP from bioconda with
```
conda install -y -c bioconda -c conda-forge -c defaults msweep
conda install -y -c bioconda -c conda-forge msweep
```

check that the installation succeeded by running
@@ -22,14 +22,17 @@ mSWEEP --help

## Precompiled binaries
Precompiled binaries are available for
* [Linux x86\_64 (mSWEEP-v2.1.0)](https://github.com/PROBIC/mSWEEP/releases/download/v2.1.0/mSWEEP-v2.1.0-x86_64-redhat-linux.tar.gz)
* [macOS arm64 (mSWEEP-v2.1.0)](https://github.com/PROBIC/mSWEEP/releases/download/v2.1.0/mSWEEP-v2.1.0-arm64-apple-darwin22.tar.gz)
* [macOS x86\_64 (mSWEEP-v2.1.0)](https://github.com/PROBIC/mSWEEP/releases/download/v2.1.0/mSWEEP-v2.1.0-x86_64-apple-darwin22.tar.gz)
* Linux x86\_64
* macOS arm64
* macOS x86\_64

## Compiling from source
from the [Releases](https://github.com/PROBIC/mSWEEP/releases) page.

## Building from source
### Requirements
- C++17 compliant compiler.
- cmake (v3.0 or newer)
- cmake (v3.11 or newer)
- git

#### Optional
- Compiler with OpenMP support.
@@ -49,10 +52,38 @@ enter the directory and run
> cmake ..
> make
```

This will compile the mSWEEP executable in `build/bin/mSWEEP`.
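
A quick way to check the freshly built binary is the same help command used above to verify the conda install (run from inside the `build` directory):
```
> ./bin/mSWEEP --help
```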

For more info on compiling mSWEEP from source, please see the [documentation on compiling mSWEEP](/docs/compilation.md).

### Enabling GPU acceleration
Compiling mSWEEP with GPU support requires installing
- [LibTorch](https://pytorch.org/get-started/locally/)
- ... and the CUDA Toolkit (for LibTorch with CUDA support)
- ... or ROCm (for LibTorch with ROCm support)

Then, build mSWEEP with
```
> mkdir build
> cd build
> cmake -DCMAKE_LIBTORCH_PATH=/absolute/path/to/libtorch ..
> make
```

where `/absolute/path/to/libtorch` should be the absolute (!) path to
the root of the LibTorch distribution.

Compiling mSWEEP with LibTorch support enables the `rcggpu` and
`emgpu` options for `--algorithm`, which direct the abundance
estimation to run on the GPU if one is available.
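
As a sketch, a GPU run using the EM algorithm in single precision could look like the following. The pseudoalignment and grouping arguments are placeholders that follow the general mSWEEP usage pattern (see the usage documentation); only `--algorithm` and `--emprecision` are introduced by this change:
```
> mSWEEP --themisto-1 pseudoalignments_1.aln.gz --themisto-2 pseudoalignments_2.aln.gz \
         -i cluster_indicators.txt -o result --algorithm emgpu --emprecision float
```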

Both algorithms can also be run on the CPU. Compared to the default algorithm, on the CPU
- `rcggpu` is faster but uses more memory.
- `emgpu` is slower but uses less memory.

See [docs/gpubenchmarks.md](/docs/gpubenchmarks.md) for more details.

# Usage
More information about using mSWEEP is available in the [usage documentation](/docs/README.md).

@@ -170,6 +201,8 @@ Estimation options:
--no-fit-model Do not estimate the abundances. Useful if only the likelihood matrix is required (default: false).
--max-iters Maximum number of iterations to run the abundance estimation optimizer for (default: 5000).
--tol Optimization terminates when the bound changes by less than the given tolerance (default: 0.000001).
--algorithm Which algorithm to use for abundance estimation (one of rcggpu, emgpu, rcgcpu (original mSWEEP); default: rcggpu).
--emprecision Precision to use for the emgpu algorithm (one of float, double; default: double).
Bootstrapping options:
--iters Number of times to rerun estimation with bootstrapped alignments (default: 0).
27 changes: 27 additions & 0 deletions docs/gpubenchmarks.md
@@ -0,0 +1,27 @@
# mSWEEP GPU comparisons

The following abundance estimations were performed starting from the efaec-1_1.aln.gz (2.9 GB) and efaec-1_2.aln.gz (2.8 GB) pseudoalignment files, which can be obtained by following this mGEMS [tutorial](https://github.com/PROBIC/mGEMS/blob/master/docs/TUTORIAL.md).

## Table 1: algorithm comparisons across HPC platforms in terms of time, iterations, and memory usage.

**Notes:**
- On Turso, A100 GPUs were used; other GPUs can also be used, but they have less memory and will most likely run slower.
- On LUMI, older versions of LibTorch and ROCm had to be used, which most likely affected the resulting times.
- Since the emgpu algorithm with the default tolerance of 1e-6 used all 5000 iterations in this case (which is rare), some results from running the algorithms with a looser tolerance of 1e-3 are also shown. This tolerance still seems to provide nearly identical results in less time (see Table 2 for a comparison of the results); an example command with the relaxed tolerance is sketched after these notes.
- Time was acquired from the time taken to execute [this line](https://github.com/Piketulus/mSWEEP-gpu/blob/4ca2acd510c9dfb5f0fed1d3cc3e383a3a7e8572/src/mSWEEP.cpp#L440).
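
As a sketch, relaxing the tolerance for the emgpu runs above only requires adding the `--tol` flag; the grouping argument below is a placeholder (see the mSWEEP usage documentation for the full set of input options):
```
> mSWEEP --themisto-1 efaec-1_1.aln.gz --themisto-2 efaec-1_2.aln.gz \
         -i cluster_indicators.txt --algorithm emgpu --tol 1e-3
```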

| **Platform** | **Algorithm** | **Tolerance** | **Time to Estimate Abundances (seconds)** | **Iterations** | **Max Memory Used (GB)** |
|--------------|------------------------|---------------|-------------------------------------------|----------------|--------------------------|
| Turso | rcgcpu (8 CPUs) | 1.00E-06 | 1856 | 205 | 22.7 |
| Turso | rcgcpu (32 CPUs) | 1.00E-06 | 634 | 215 | 23.3 |
| Turso | rcgcpu (80 CPUs) | 1.00E-06 | 485 | 215 | 24.4 |
| Turso | rcggpu | 1.00E-06 | 43 | 220 | 27.9 (on GPU) |
| Turso | rcggpu | 1.00E-03 | 33 | 155 | 27.9 (on GPU) |
| Turso | emgpu (double) | 1.00E-06 | 258 | 5000 | 14 (on GPU) |
| Turso | emgpu (double) | 1.00E-03 | 143 | 2605 | 14 (on GPU) |
| Turso | emgpu (float) | 1.00E-06 | 19 | 335 | 7 (on GPU) |
| LUMI | rcggpu | 1.00E-06 | 103 | 225 | 27.9 (on GPU) |
| LUMI | emgpu (double) | 1.00E-06 | 392 | 5000 | 14 (on GPU) |
| LUMI | emgpu (float) | 1.00E-06 | 57 | 300 | 7 (on GPU) |

**Note:** the emgpu algorithm has lower numerical precision, so its results will differ from those of the rcg algorithms.
34 changes: 28 additions & 6 deletions src/mSWEEP.cpp
@@ -123,7 +123,11 @@ void parse_args(int argc, char* argv[], cxxargs::Arguments &args) {
// Maximum iterations to run the optimizer for
args.add_long_argument<size_t>("max-iters", "Maximum number of iterations to run the abundance estimation optimizer for (default: 5000).", (size_t)5000);
// Tolerance for abundance estimation convergence
args.add_long_argument<double>("tol", "Optimization terminates when the bound changes by less than the given tolerance (default: 0.000001).\n\nBootstrapping options:", (double)0.000001);
args.add_long_argument<double>("tol", "Optimization terminates when the bound changes by less than the given tolerance (default: 0.000001).", (double)0.000001);
// Algorithm to use for abundance estimation
args.add_long_argument<std::string>("algorithm", "Which algorithm to use for abundance estimation (one of rcggpu, emgpu, rcgcpu (original mSWEEP); default: rcggpu).", "rcggpu");
// Precision for abundance estimation with emgpu algorithm
args.add_long_argument<std::string>("emprecision", "Precision to use for the emgpu algorithm (one of float, double; default: double).\n\nBootstrapping options:", "double");

// Number of iterations to run bootstrapping for
args.add_long_argument<size_t>("iters", "Number of times to rerun estimation with bootstrapped alignments (default: 0).", (size_t)0);
@@ -200,12 +204,21 @@ seamat::DenseMatrix<double> rcg_optl(const cxxargs::Arguments &args, const seama
int rank;
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
const seamat::DenseMatrix<double> &ec_probs = rcgpar::rcg_optl_mpi(ll_mat, log_ec_counts, prior_counts, args.value<double>("tol"), args.value<size_t>("max-iters"), (rank == 0 && args.value<bool>("verbose") ? log.stream() : of));
return ec_probs;

#else
// Only OpenMP parallelization (if enabled).
const seamat::DenseMatrix<double> &ec_probs = rcgpar::rcg_optl_omp(ll_mat, log_ec_counts, prior_counts, args.value<double>("tol"), args.value<size_t>("max-iters"), (args.value<bool>("verbose") ? log.stream() : of));
if (args.value<std::string>("algorithm") == "rcggpu") {
const seamat::DenseMatrix<double> &ec_probs = rcgpar::rcg_optl_torch(ll_mat, log_ec_counts, prior_counts, args.value<double>("tol"), args.value<size_t>("max-iters"), (args.value<bool>("verbose") ? log.stream() : of));
return ec_probs;
} else if (args.value<std::string>("algorithm") == "rcgcpu") {
const seamat::DenseMatrix<double> &ec_probs = rcgpar::rcg_optl_omp(ll_mat, log_ec_counts, prior_counts, args.value<double>("tol"), args.value<size_t>("max-iters"), (args.value<bool>("verbose") ? log.stream() : of));
return ec_probs;
} else {
const seamat::DenseMatrix<double> &ec_probs = rcgpar::em_torch(ll_mat, log_ec_counts, prior_counts, args.value<double>("tol"), args.value<size_t>("max-iters"), (args.value<bool>("verbose") ? log.stream() : of), args.value<std::string>("emprecision"));
return ec_probs;
}
#endif
return ec_probs;
}

int main (int argc, char *argv[]) {
@@ -451,7 +464,11 @@ int main (int argc, char *argv[]) {
// Run binning if requested and write results to files.
if (rank == 0) { // root performs the rest.
// Turn the probs into relative abundances
sample->store_abundances(rcgpar::mixture_components(sample->get_probs(), log_likelihoods->log_counts()));
if (args.value<std::string>("algorithm") == "rcgcpu") {
sample->store_abundances(rcgpar::mixture_components(sample->get_probs(), log_likelihoods->log_counts()));
} else {
sample->store_abundances(rcgpar::mixture_components_torch(sample->get_probs(), log_likelihoods->log_counts()));
}

if (args.value<size_t>("min-hits") > 0) {
for (size_t j = 0; j < reference->group_names(i).size(); ++j) {
@@ -542,8 +559,13 @@ int main (int argc, char *argv[]) {
finalize("Bootstrap iteration " + std::to_string(k) + "/" + std::to_string(args.value<size_t>("iters")) + " failed:\n " + std::string(e.what()) + "\nexiting\n", log, true);
return 1;
}
if (rank == 0)
sample->store_abundances(rcgpar::mixture_components(sample->get_probs(), resampled_counts));
if (rank == 0) {
if (args.value<std::string>("algorithm") == "rcgcpu") {
sample->store_abundances(rcgpar::mixture_components(sample->get_probs(), resampled_counts));
} else {
sample->store_abundances(rcgpar::mixture_components_torch(sample->get_probs(), resampled_counts));
}
}
}
}
}
