Merge pull request #34 from PROBIC/add-gpu-support
Add GPU accelerated abundance estimation
tmaklin authored Sep 11, 2024
2 parents 337d085 + 0781517 commit 1216451
Showing 5 changed files with 135 additions and 20 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
# Local files
build/
example/
external/

# Emacs autosaves
*.*~
48 changes: 40 additions & 8 deletions CMakeLists.txt
@@ -32,6 +32,27 @@ if(CMAKE_BUILD_WITH_FLTO)
endif()

## Check dependencies
### Torch
if(CMAKE_LIBTORCH_PATH)
message(STATUS "Torch libraries provided in: ${CMAKE_LIBTORCH_PATH}")
set(Torch_DIR ${CMAKE_LIBTORCH_PATH}/share/cmake/Torch)
endif()

find_package(Torch)
if (TORCH_FOUND)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
set(MSWEEP_TORCH_SUPPORT 1)
include_directories(${TORCH_INCLUDE_DIRS})
message(STATUS "Compiling mSWEEP with GPU and CPU support")
else()
set(MSWEEP_TORCH_SUPPORT 0)
if (DEFINED CMAKE_LIBTORCH_PATH)
message(FATAL_ERROR "Could not find libtorch in: ${CMAKE_LIBTORCH_PATH}")
else()
message(STATUS "Compiling mSWEEP with CPU support only")
endif()
endif()

find_package(OpenMP)
if (OPENMP_FOUND)
set(MSWEEP_OPENMP_SUPPORT 1)
@@ -251,7 +272,7 @@ if (DEFINED CMAKE_SEAMAT_HEADERS)
else()
FetchContent_Declare(seamat
GIT_REPOSITORY https://github.com/tmaklin/seamat.git
GIT_TAG v0.2.2
GIT_TAG v0.2.3
PREFIX "external"
SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/seamat"
BUILD_IN_SOURCE 0
@@ -263,26 +284,26 @@ else()
)
FetchContent_MakeAvailable(seamat)
set(CMAKE_SEAMAT_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/seamat/include ${CMAKE_CURRENT_BINARY_DIR}/_deps/seamat-build/include)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/mSWEEP_openmp_config.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/include/mSWEEP_openmp_config.hpp @ONLY)
endif()
include_directories(${CMAKE_SEAMAT_HEADERS})

## rcgpar
if (DEFINED CMAKE_RCGPAR_LIBRARY AND DEFINED CMAKE_RCGPAR_HEADERS AND DEFINED CMAKE_RCGUTILS_LIBRARY)
if (DEFINED CMAKE_RCGPAR_LIBRARIES AND DEFINED CMAKE_RCGPAR_HEADERS AND DEFINED CMAKE_RCGUTILS_LIBRARY)
message(STATUS "rcgpar headers provided in: ${CMAKE_RCGPAR_HEADERS}")
message(STATUS "rcgpar library provided in: ${CMAKE_RCGPAR_LIBRARY}")
message(STATUS "rcgpar library provided in: ${CMAKE_RCGPAR_LIBRARIES}")
message(STATUS "rcgutils library provided in: ${CMAKE_RCGUTILS_LIBRARY}")
else()
FetchContent_Declare(rcgpar
GIT_REPOSITORY https://github.com/tmaklin/rcgpar.git
GIT_TAG v1.1.3
GIT_TAG v1.2.1
PREFIX "external"
SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/rcgpar"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/rcgpar"
BUILD_IN_SOURCE 0
CMAKE_ARGS -D CMAKE_ENABLE_MPI_SUPPORT=${MSWEEP_MPI_SUPPORT}
-D CMAKE_SEAMAT_HEADERS=${CMAKE_SEAMAT_HEADERS}
-D CMAKE_BITMAGIC_HEADERS=${CMAKE_BITMAGIC_HEADERS}
-D CMAKE_LIBTORCH_PATH=${CMAKE_LIBTORCH_PATH}
-D CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-D "CMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
-D "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
@@ -291,12 +312,19 @@ else()
INSTALL_COMMAND ""
)
FetchContent_MakeAvailable(rcgpar)
add_dependencies(mSWEEP rcgomp rcgutils)

if (TORCH_FOUND)
add_dependencies(mSWEEP rcgomp rcggpu rcgutils)
set(CMAKE_RCGPAR_LIBRARIES "${CMAKE_CURRENT_BINARY_DIR}/lib/librcgomp.a" "${CMAKE_CURRENT_BINARY_DIR}/lib/librcggpu.a")
else()
add_dependencies(mSWEEP rcgomp rcgutils)
set(CMAKE_RCGPAR_LIBRARIES "${CMAKE_CURRENT_BINARY_DIR}/lib/librcgomp.a")
endif()
set(CMAKE_RCGPAR_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/rcgpar/include)
set(CMAKE_RCGPAR_LIBRARY "${CMAKE_CURRENT_BINARY_DIR}/lib/librcgomp.a")
set(CMAKE_RCGUTILS_LIBRARY "${CMAKE_CURRENT_BINARY_DIR}/lib/librcgutils.a")
endif()
target_link_libraries(mSWEEP ${CMAKE_RCGPAR_LIBRARY} ${CMAKE_RCGUTILS_LIBRARY})
target_link_libraries(mSWEEP ${CMAKE_RCGPAR_LIBRARIES})
target_link_libraries(mSWEEP ${CMAKE_RCGUTILS_LIBRARY})
include_directories(${CMAKE_RCGPAR_HEADERS})

## mGEMS
@@ -360,3 +388,7 @@ target_link_libraries(mSWEEP libmsweep)
if (OPENMP_FOUND)
target_link_libraries(mSWEEP OpenMP::OpenMP_CXX)
endif()

if (TORCH_FOUND)
target_link_libraries(mSWEEP "${TORCH_LIBRARIES}")
endif()
45 changes: 39 additions & 6 deletions README.md
@@ -12,7 +12,7 @@ In addition to mSWEEP, you will need to install [Themisto](https://github.com/al
## Conda
Install mSWEEP from bioconda with
```
conda install -y -c bioconda -c conda-forge -c defaults msweep
conda install -y -c bioconda -c conda-forge msweep
```

check that the installation succeeded by running
@@ -22,14 +22,17 @@ mSWEEP --help

## Precompiled binaries
Precompiled binaries are available for
* [Linux x86\_64 (mSWEEP-v2.1.0)](https://github.com/PROBIC/mSWEEP/releases/download/v2.1.0/mSWEEP-v2.1.0-x86_64-redhat-linux.tar.gz)
* [macOS arm64 (mSWEEP-v2.1.0)](https://github.com/PROBIC/mSWEEP/releases/download/v2.1.0/mSWEEP-v2.1.0-arm64-apple-darwin22.tar.gz)
* [macOS x86\_64 (mSWEEP-v2.1.0)](https://github.com/PROBIC/mSWEEP/releases/download/v2.1.0/mSWEEP-v2.1.0-x86_64-apple-darwin22.tar.gz)
* Linux x86\_64
* macOS arm64
* macOS x86\_64

## Compiling from source
from the [Releases](https://github.com/PROBIC/mSWEEP/releases) page.

## Building from source
### Requirements
- C++17 compliant compiler.
- cmake (v3.0 or newer)
- cmake (v3.11 or newer)
- git

#### Optional
- Compiler with OpenMP support.
@@ -49,10 +52,38 @@ enter the directory and run
> cmake ..
> make
```

This will compile the mSWEEP executable in `build/bin/mSWEEP`.
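
A quick way to check the freshly built binary is the same help command used above to verify the conda install (run from inside the `build` directory):
```
> ./bin/mSWEEP --help
```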

For more info on compiling mSWEEP from source, please see the [documentation on compiling mSWEEP](/docs/compilation.md).

### Enabling GPU acceleration
Compiling mSWEEP with GPU support requires installing
- [LibTorch](https://pytorch.org/get-started/locally/)
- ... and the CUDA Toolkit (for LibTorch with CUDA support)
- ... or ROCm (for LibTorch with ROCm support)

Then, build mSWEEP with
```
> mkdir build
> cd build
> cmake -DCMAKE_LIBTORCH_PATH=/absolute/path/to/libtorch ..
> make
```

where `/absolute/path/to/libtorch` should be the absolute (!) path to
the root of the LibTorch distribution.

Compiling mSWEEP with LibTorch support enables the `rcggpu` and
`emgpu` options for `--algorithm`, which direct the abundance
estimation to run on the GPU if one is available.
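
As a sketch, a GPU run using the EM algorithm in single precision could look like the following. The pseudoalignment and grouping arguments are placeholders that follow the general mSWEEP usage pattern (see the usage documentation); only `--algorithm` and `--emprecision` are introduced by this change:
```
> mSWEEP --themisto-1 pseudoalignments_1.aln.gz --themisto-2 pseudoalignments_2.aln.gz \
         -i cluster_indicators.txt -o result --algorithm emgpu --emprecision float
```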

Both algorithms can also be run on the CPU. Compared to the default algorithm, on the CPU
- `rcggpu` is faster but uses more memory.
- `emgpu` is slower but uses less memory.

See [docs/gpubenchmarks.md](/docs/gpubenchmarks.md) for more details.

# Usage
More information about using mSWEEP is available in the [usage documentation](/docs/README.md).

@@ -170,6 +201,8 @@ Estimation options:
--no-fit-model Do not estimate the abundances. Useful if only the likelihood matrix is required (default: false).
--max-iters Maximum number of iterations to run the abundance estimation optimizer for (default: 5000).
--tol Optimization terminates when the bound changes by less than the given tolerance (default: 0.000001).
--algorithm Which algorithm to use for abundance estimation (one of rcggpu, emgpu, rcgcpu (original mSWEEP); default: rcggpu).
--emprecision Precision to use for the emgpu algorithm (one of float, double; default: double).
Bootstrapping options:
--iters Number of times to rerun estimation with bootstrapped alignments (default: 0).
27 changes: 27 additions & 0 deletions docs/gpubenchmarks.md
@@ -0,0 +1,27 @@
# mSWEEP GPU comparisons

The following abundance estimations were performed starting from the efaec-1_1.aln.gz (2.9 GB) and efaec-1_2.aln.gz (2.8 GB) pseudoalignment files, which can be obtained by following this mGEMS [tutorial](https://github.com/PROBIC/mGEMS/blob/master/docs/TUTORIAL.md).

## Table 1: algorithm comparisons across HPC platforms in terms of time, iterations, and memory usage.

**Notes:**
- On Turso, A100 GPUs were used; other GPUs can also be used, but they have less memory and will most likely run slower.
- On LUMI, older versions of LibTorch and ROCm had to be used, which most likely affected the resulting times.
- Since the emgpu algorithm with the default tolerance of 1e-6 used all 5000 iterations in this case (which is rare), some results from running the algorithms with a looser tolerance of 1e-3 are also shown. This tolerance still seems to provide nearly identical results in less time (see Table 2 for a comparison of the results); an example command with the relaxed tolerance is sketched after these notes.
- Time was acquired from the time taken to execute [this line](https://github.com/Piketulus/mSWEEP-gpu/blob/4ca2acd510c9dfb5f0fed1d3cc3e383a3a7e8572/src/mSWEEP.cpp#L440).
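
As a sketch, relaxing the tolerance for the emgpu runs above only requires adding the `--tol` flag; the grouping argument below is a placeholder (see the mSWEEP usage documentation for the full set of input options):
```
> mSWEEP --themisto-1 efaec-1_1.aln.gz --themisto-2 efaec-1_2.aln.gz \
         -i cluster_indicators.txt --algorithm emgpu --tol 1e-3
```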

| **Platform** | **Algorithm** | **Tolerance** | **Time to Estimate Abundances (seconds)** | **Iterations** | **Max Memory Used (GB)** |
|--------------|------------------------|---------------|-------------------------------------------|----------------|--------------------------|
| Turso | rcgcpu (8 CPUs) | 1.00E-06 | 1856 | 205 | 22.7 |
| Turso | rcgcpu (32 CPUs) | 1.00E-06 | 634 | 215 | 23.3 |
| Turso | rcgcpu (80 CPUs) | 1.00E-06 | 485 | 215 | 24.4 |
| Turso | rcggpu | 1.00E-06 | 43 | 220 | 27.9 (on GPU) |
| Turso | rcggpu | 1.00E-03 | 33 | 155 | 27.9 (on GPU) |
| Turso | emgpu (double) | 1.00E-06 | 258 | 5000 | 14 (on GPU) |
| Turso | emgpu (double) | 1.00E-03 | 143 | 2605 | 14 (on GPU) |
| Turso | emgpu (float) | 1.00E-06 | 19 | 335 | 7 (on GPU) |
| LUMI | rcggpu | 1.00E-06 | 103 | 225 | 27.9 (on GPU) |
| LUMI | emgpu (double) | 1.00E-06 | 392 | 5000 | 14 (on GPU) |
| LUMI | emgpu (float) | 1.00E-06 | 57 | 300 | 7 (on GPU) |

**Note:** the emgpu algorithm has lower numerical precision, so its results will differ from those of the rcg algorithms.
34 changes: 28 additions & 6 deletions src/mSWEEP.cpp
@@ -123,7 +123,11 @@ void parse_args(int argc, char* argv[], cxxargs::Arguments &args) {
// Maximum iterations to run the optimizer for
args.add_long_argument<size_t>("max-iters", "Maximum number of iterations to run the abundance estimation optimizer for (default: 5000).", (size_t)5000);
// Tolerance for abundance estimation convergence
args.add_long_argument<double>("tol", "Optimization terminates when the bound changes by less than the given tolerance (default: 0.000001).\n\nBootstrapping options:", (double)0.000001);
args.add_long_argument<double>("tol", "Optimization terminates when the bound changes by less than the given tolerance (default: 0.000001).", (double)0.000001);
// Algorithm to use for abundance estimation
args.add_long_argument<std::string>("algorithm", "Which algorithm to use for abundance estimation (one of rcggpu, emgpu, rcgcpu (original mSWEEP); default: rcggpu).", "rcggpu");
// Precision for abundance estimation with emgpu algorithm
args.add_long_argument<std::string>("emprecision", "Precision to use for the emgpu algorithm (one of float, double; default: double).\n\nBootstrapping options:", "double");

// Number of iterations to run bootstrapping for
args.add_long_argument<size_t>("iters", "Number of times to rerun estimation with bootstrapped alignments (default: 0).", (size_t)0);
@@ -200,12 +204,21 @@ seamat::DenseMatrix<double> rcg_optl(const cxxargs::Arguments &args, const seama
int rank;
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
const seamat::DenseMatrix<double> &ec_probs = rcgpar::rcg_optl_mpi(ll_mat, log_ec_counts, prior_counts, args.value<double>("tol"), args.value<size_t>("max-iters"), (rank == 0 && args.value<bool>("verbose") ? log.stream() : of));
return ec_probs;

#else
// Only OpenMP parallelization (if enabled).
const seamat::DenseMatrix<double> &ec_probs = rcgpar::rcg_optl_omp(ll_mat, log_ec_counts, prior_counts, args.value<double>("tol"), args.value<size_t>("max-iters"), (args.value<bool>("verbose") ? log.stream() : of));
if (args.value<std::string>("algorithm") == "rcggpu") {
const seamat::DenseMatrix<double> &ec_probs = rcgpar::rcg_optl_torch(ll_mat, log_ec_counts, prior_counts, args.value<double>("tol"), args.value<size_t>("max-iters"), (args.value<bool>("verbose") ? log.stream() : of));
return ec_probs;
} else if (args.value<std::string>("algorithm") == "rcgcpu") {
const seamat::DenseMatrix<double> &ec_probs = rcgpar::rcg_optl_omp(ll_mat, log_ec_counts, prior_counts, args.value<double>("tol"), args.value<size_t>("max-iters"), (args.value<bool>("verbose") ? log.stream() : of));
return ec_probs;
} else {
const seamat::DenseMatrix<double> &ec_probs = rcgpar::em_torch(ll_mat, log_ec_counts, prior_counts, args.value<double>("tol"), args.value<size_t>("max-iters"), (args.value<bool>("verbose") ? log.stream() : of), args.value<std::string>("emprecision"));
return ec_probs;
}
#endif
return ec_probs;
}

int main (int argc, char *argv[]) {
@@ -451,7 +464,11 @@ int main (int argc, char *argv[]) {
// Run binning if requested and write results to files.
if (rank == 0) { // root performs the rest.
// Turn the probs into relative abundances
sample->store_abundances(rcgpar::mixture_components(sample->get_probs(), log_likelihoods->log_counts()));
if (args.value<std::string>("algorithm") == "rcgcpu") {
sample->store_abundances(rcgpar::mixture_components(sample->get_probs(), log_likelihoods->log_counts()));
} else {
sample->store_abundances(rcgpar::mixture_components_torch(sample->get_probs(), log_likelihoods->log_counts()));
}

if (args.value<size_t>("min-hits") > 0) {
for (size_t j = 0; j < reference->group_names(i).size(); ++j) {
@@ -542,8 +559,13 @@ int main (int argc, char *argv[]) {
finalize("Bootstrap iteration " + std::to_string(k) + "/" + std::to_string(args.value<size_t>("iters")) + " failed:\n " + std::string(e.what()) + "\nexiting\n", log, true);
return 1;
}
if (rank == 0)
sample->store_abundances(rcgpar::mixture_components(sample->get_probs(), resampled_counts));
if (rank == 0) {
if (args.value<std::string>("algorithm") == "rcgcpu") {
sample->store_abundances(rcgpar::mixture_components(sample->get_probs(), resampled_counts));
} else {
sample->store_abundances(rcgpar::mixture_components_torch(sample->get_probs(), resampled_counts));
}
}
}
}
}
