Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[EM] Compress dense ellpack. #10821

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ ipch
*.filters
*.user
*log
rmm_log.txt
Debug
*suo
.Rhistory
Expand Down
23 changes: 12 additions & 11 deletions src/common/cuda_pinned_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <cstddef> // for size_t
#include <limits> // for numeric_limits
#include <new> // for bad_array_new_length

#include "common.h"

Expand All @@ -28,14 +29,14 @@ struct PinnedAllocPolicy {
using size_type = std::size_t; // NOLINT: The type used for the size of the allocation
using value_type = T; // NOLINT: The type of the elements in the allocator

size_type max_size() const { // NOLINT
[[nodiscard]] constexpr size_type max_size() const { // NOLINT
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
throw std::bad_array_new_length{};
}

pointer result(nullptr);
dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
Expand All @@ -52,14 +53,14 @@ struct ManagedAllocPolicy {
using size_type = std::size_t; // NOLINT: The type used for the size of the allocation
using value_type = T; // NOLINT: The type of the elements in the allocator

size_type max_size() const { // NOLINT
[[nodiscard]] constexpr size_type max_size() const { // NOLINT
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
throw std::bad_array_new_length{};
}

pointer result(nullptr);
dh::safe_cuda(cudaMallocManaged(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
Expand All @@ -78,14 +79,14 @@ struct SamAllocPolicy {
using size_type = std::size_t; // NOLINT: The type used for the size of the allocation
using value_type = T; // NOLINT: The type of the elements in the allocator

size_type max_size() const { // NOLINT
[[nodiscard]] constexpr size_type max_size() const { // NOLINT
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
throw std::bad_array_new_length{};
}

size_type n_bytes = cnt * sizeof(value_type);
pointer result = reinterpret_cast<pointer>(std::malloc(n_bytes));
Expand Down Expand Up @@ -139,10 +140,10 @@ class CudaHostAllocatorImpl : public Policy<T> {
};

template <typename T>
using PinnedAllocator = CudaHostAllocatorImpl<T, PinnedAllocPolicy>; // NOLINT
using PinnedAllocator = CudaHostAllocatorImpl<T, PinnedAllocPolicy>;

template <typename T>
using ManagedAllocator = CudaHostAllocatorImpl<T, ManagedAllocPolicy>; // NOLINT
using ManagedAllocator = CudaHostAllocatorImpl<T, ManagedAllocPolicy>;

template <typename T>
using SamAllocator = CudaHostAllocatorImpl<T, SamAllocPolicy>;
Expand Down
26 changes: 17 additions & 9 deletions src/common/device_vector.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,10 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
pointer thrust_ptr;
if (use_cub_allocator_) {
T *raw_ptr{nullptr};
// NOLINTBEGIN(clang-analyzer-unix.BlockInCriticalSection)
auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
n * sizeof(T));
// NOLINTEND(clang-analyzer-unix.BlockInCriticalSection)
if (errc != cudaSuccess) {
detail::ThrowOOMError("Caching allocator", n * sizeof(T));
}
Expand Down Expand Up @@ -290,13 +292,13 @@ LoggingResource *GlobalLoggingResource();
/**
* @brief Container class that doesn't initialize the data when RMM is used.
*/
template <typename T>
class DeviceUVector {
template <typename T, bool is_caching>
class DeviceUVectorImpl {
private:
#if defined(XGBOOST_USE_RMM)
rmm::device_uvector<T> data_{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()};
#else
::dh::device_vector<T> data_;
std::conditional_t<is_caching, ::dh::caching_device_vector<T>, ::dh::device_vector<T>> data_;
#endif // defined(XGBOOST_USE_RMM)

public:
Expand All @@ -307,12 +309,12 @@ class DeviceUVector {
using const_reference = value_type const &; // NOLINT

public:
DeviceUVector() = default;
explicit DeviceUVector(std::size_t n) { this->resize(n); }
DeviceUVector(DeviceUVector const &that) = delete;
DeviceUVector &operator=(DeviceUVector const &that) = delete;
DeviceUVector(DeviceUVector &&that) = default;
DeviceUVector &operator=(DeviceUVector &&that) = default;
DeviceUVectorImpl() = default;
explicit DeviceUVectorImpl(std::size_t n) { this->resize(n); }
DeviceUVectorImpl(DeviceUVectorImpl const &that) = delete;
DeviceUVectorImpl &operator=(DeviceUVectorImpl const &that) = delete;
DeviceUVectorImpl(DeviceUVectorImpl &&that) = default;
DeviceUVectorImpl &operator=(DeviceUVectorImpl &&that) = default;

void resize(std::size_t n) { // NOLINT
#if defined(XGBOOST_USE_RMM)
Expand Down Expand Up @@ -356,4 +358,10 @@ class DeviceUVector {
[[nodiscard]] auto data() { return thrust::raw_pointer_cast(data_.data()); } // NOLINT
[[nodiscard]] auto data() const { return thrust::raw_pointer_cast(data_.data()); } // NOLINT
};

template <typename T>
using DeviceUVector = DeviceUVectorImpl<T, false>;

template <typename T>
using CachingDeviceUVector = DeviceUVectorImpl<T, true>;
} // namespace dh
19 changes: 9 additions & 10 deletions src/data/device_adapter.cuh
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
/**
* Copyright 2019-2023 by XGBoost Contributors
* Copyright 2019-2024, XGBoost Contributors
* \file device_adapter.cuh
*/
#ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_
#define XGBOOST_DATA_DEVICE_ADAPTER_H_
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
#include <thrust/logical.h> // for none_of

#include <cstddef> // for size_t
#include <cstddef> // for size_t
#include <limits>
#include <memory>
#include <string>

#include "../common/cuda_context.cuh"
#include "../common/device_helpers.cuh"
#include "../common/math.h"
#include "adapter.h"
#include "array_interface.h"

Expand Down Expand Up @@ -208,11 +207,12 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {

// Returns maximum row length
template <typename AdapterBatchT>
bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset, DeviceOrd device,
float missing) {
bst_idx_t GetRowCounts(Context const* ctx, const AdapterBatchT batch,
common::Span<bst_idx_t> offset, DeviceOrd device, float missing) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
IsValidFunctor is_valid(missing);
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
dh::safe_cuda(
cudaMemsetAsync(offset.data(), '\0', offset.size_bytes(), ctx->CUDACtx()->Stream()));

auto n_samples = batch.NumRows();
bst_feature_t n_features = batch.NumCols();
Expand All @@ -230,7 +230,7 @@ bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset
}

// Count elements per row
dh::LaunchN(n_samples * stride, [=] __device__(std::size_t idx) {
dh::LaunchN(n_samples * stride, ctx->CUDACtx()->Stream(), [=] __device__(std::size_t idx) {
bst_idx_t cnt{0};
auto [ridx, fbeg] = linalg::UnravelIndex(idx, n_samples, stride);
SPAN_CHECK(ridx < n_samples);
Expand All @@ -244,9 +244,8 @@ bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset
&offset[ridx]),
static_cast<unsigned long long>(cnt)); // NOLINT
});
dh::XGBCachingDeviceAllocator<char> alloc;
bst_idx_t row_stride =
dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
dh::Reduce(ctx->CUDACtx()->CTP(), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(),
static_cast<bst_idx_t>(0), thrust::maximum<bst_idx_t>());
return row_stride;
Expand Down
Loading
Loading