diff --git a/CMakeLists.txt b/CMakeLists.txt index 9f7a431c..83ae79bf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,6 +19,7 @@ include(pkgs/JSONSupport) include(pkgs/MPISupport) include(pkgs/OpenMPSupport) include(pkgs/CUDASupport) +include(pkgs/OneApiSupport) # Create gz_read executable add_executable(gz_read standard-suite/binary-traces/gz_read.cc) diff --git a/cmake/pkgs/OneApiSupport.cmake b/cmake/pkgs/OneApiSupport.cmake new file mode 100644 index 00000000..d31f6e8e --- /dev/null +++ b/cmake/pkgs/OneApiSupport.cmake @@ -0,0 +1,5 @@ +option(USE_ONEAPI "Enable support for OneApi" OFF) + +if (USE_ONEAPI) + add_definitions(-DUSE_ONEAPI) +endif() diff --git a/src/Spatter/CMakeLists.txt b/src/Spatter/CMakeLists.txt index 15300460..ed340464 100644 --- a/src/Spatter/CMakeLists.txt +++ b/src/Spatter/CMakeLists.txt @@ -6,8 +6,13 @@ if (USE_CUDA) set(CUDA_INCLUDE_FILES CudaBackend.hh) endif() +if (USE_ONEAPI) + set(ONEAPI_INCLUDE_FILES OneApiBackend.hh) +endif() + set(SPATTER_INCLUDE_FILES ${CUDA_INCLUDE_FILES} + ${ONEAPI_INCLUDE_FILES} Configuration.hh Input.hh JSONParser.hh @@ -17,6 +22,7 @@ set(SPATTER_INCLUDE_FILES Timer.hh ) +message(STATUS "SPATTER_INCLUDE_FILES=${SPATTER_INCLUDE_FILES}") add_library(Spatter STATIC ${SPATTER_INCLUDE_FILES} Configuration.cc diff --git a/src/Spatter/Configuration.cc b/src/Spatter/Configuration.cc index 8d9fe7e0..bd99fa5f 100644 --- a/src/Spatter/Configuration.cc +++ b/src/Spatter/Configuration.cc @@ -922,4 +922,209 @@ void Configuration::setup() { } #endif +#ifdef USE_ONEAPI +Configuration::Configuration(const size_t id, + const std::string name, const std::string kernel, + const aligned_vector<size_t> &pattern, + const aligned_vector<size_t> &pattern_gather, + const aligned_vector<size_t> &pattern_scatter, + aligned_vector<double> &sparse, size_t &sparse_size, + aligned_vector<double> &sparse_gather, size_t &sparse_gather_size, + aligned_vector<double> &sparse_scatter, size_t &sparse_scatter_size, + aligned_vector<double> &dense, size_t &dense_size, + aligned_vector<aligned_vector<double>> &dense_perthread, 
const size_t delta, + const size_t delta_gather, const size_t delta_scatter, const long int seed, + const size_t wrap, const size_t count, const size_t shared_mem, + const size_t local_work_size, const unsigned long nruns, + const bool aggregate, const bool atomic, const unsigned long verbosity) + : ConfigurationBase(id, name, kernel, pattern, pattern_gather, + pattern_scatter, sparse, sparse_size, sparse_gather, + sparse_gather_size, sparse_scatter, sparse_scatter_size, dense, + dense_size, dense_perthread, delta, delta_gather, delta_scatter, seed, + wrap, count, shared_mem, local_work_size, 1, nruns, aggregate, atomic, + verbosity) { + setup(); +} + +Configuration::~Configuration() { + sycl::free(dev_pattern, dpct::get_default_queue()); + sycl::free(dev_pattern_gather, dpct::get_default_queue()); + sycl::free(dev_pattern_scatter, dpct::get_default_queue()); + + sycl::free(dev_sparse, dpct::get_default_queue()); + sycl::free(dev_sparse_gather, dpct::get_default_queue()); + sycl::free(dev_sparse_scatter, dpct::get_default_queue()); + + sycl::free(dev_dense, dpct::get_default_queue()); +} + +int Configuration::run(bool timed, unsigned long run_id) { + return ConfigurationBase::run(timed, run_id); +} + +void Configuration::gather(bool timed, unsigned long run_id) { + size_t pattern_length = pattern.size(); + +#ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + + float time_ms = oneapi_gather_wrapper( + dev_pattern, dev_sparse, dev_dense, pattern_length, delta, wrap, count); + + dpct::get_current_device().queues_wait_and_throw(); + + if (timed) + time_seconds[run_id] = ((double)time_ms / 1000.0); +} + +void Configuration::scatter(bool timed, unsigned long run_id) { + size_t pattern_length = pattern.size(); + +#ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + + float time_ms = 0.0; + + if (atomic) + time_ms = oneapi_scatter_atomic_wrapper( + dev_pattern, dev_sparse, dev_dense, pattern_length, delta, wrap, count); + else + time_ms = oneapi_scatter_wrapper( + dev_pattern, dev_sparse, dev_dense, pattern_length, delta, wrap, count); + + dpct::get_current_device().queues_wait_and_throw(); + + if (timed) + time_seconds[run_id] = ((double)time_ms / 1000.0); +} + +void Configuration::scatter_gather( + bool timed, 
unsigned long run_id) { + assert(pattern_scatter.size() == pattern_gather.size()); + int pattern_length = static_cast<int>(pattern_scatter.size()); + +#ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + + float time_ms = 0.0; + + if (atomic) + time_ms = oneapi_scatter_gather_atomic_wrapper(dev_pattern_scatter, + dev_sparse_scatter, dev_pattern_gather, dev_sparse_gather, + pattern_length, delta_scatter, delta_gather, wrap, count); + else + time_ms = oneapi_scatter_gather_wrapper(dev_pattern_scatter, + dev_sparse_scatter, dev_pattern_gather, dev_sparse_gather, + pattern_length, delta_scatter, delta_gather, wrap, count); + + dpct::get_current_device().queues_wait_and_throw(); + + if (timed) + time_seconds[run_id] = ((double)time_ms / 1000.0); +} + +void Configuration::multi_gather( + bool timed, unsigned long run_id) { + int pattern_length = static_cast<int>(pattern_gather.size()); + +#ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + + float time_ms = oneapi_multi_gather_wrapper(dev_pattern, dev_pattern_gather, + dev_sparse, dev_dense, pattern_length, delta, wrap, count); + + dpct::get_current_device().queues_wait_and_throw(); + + if (timed) + time_seconds[run_id] = ((double)time_ms / 1000.0); +} + +void Configuration::multi_scatter( + bool timed, unsigned long run_id) { + int pattern_length = static_cast<int>(pattern_scatter.size()); + +#ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + + float time_ms = 0.0; + + if (atomic) + time_ms = + oneapi_multi_scatter_atomic_wrapper(dev_pattern, dev_pattern_scatter, + dev_sparse, dev_dense, pattern_length, delta, wrap, count); + else + time_ms = oneapi_multi_scatter_wrapper(dev_pattern, dev_pattern_scatter, + dev_sparse, dev_dense, pattern_length, delta, wrap, count); + + dpct::get_current_device().queues_wait_and_throw(); + + if (timed) + time_seconds[run_id] = ((double)time_ms / 1000.0); +} + +void Configuration::setup() { + ConfigurationBase::setup(); + + if (sparse.size() < sparse_size) { + sparse.resize(sparse_size); + + for (size_t i = 0; i < sparse.size(); ++i) + sparse[i] = rand(); + } + + if 
(sparse_gather.size() < sparse_gather_size) { + sparse_gather.resize(sparse_gather_size); + + for (size_t i = 0; i < sparse_gather.size(); ++i) + sparse_gather[i] = rand(); + } + + if (sparse_scatter.size() < sparse_scatter_size) { + sparse_scatter.resize(sparse_scatter_size); + + for (size_t i = 0; i < sparse_scatter.size(); ++i) + sparse_scatter[i] = rand(); + } + + if (dense.size() < dense_size) { + dense.resize(dense_size); + + for (size_t i = 0; i < dense.size(); ++i) + dense[i] = rand(); + } + + dev_pattern = sycl::malloc_device<size_t>(pattern.size(), dpct::get_default_queue()); + dev_pattern_gather = sycl::malloc_device<size_t>( pattern_gather.size(), dpct::get_default_queue()); + dev_pattern_scatter = sycl::malloc_device<size_t>( pattern_scatter.size(), dpct::get_default_queue()); + + dev_sparse = sycl::malloc_device<double>(sparse.size(), dpct::get_default_queue()); + dev_sparse_gather = sycl::malloc_device<double>( sparse_gather.size(), dpct::get_default_queue()); + dev_sparse_scatter = sycl::malloc_device<double>( sparse_scatter.size(), dpct::get_default_queue()); + dev_dense = sycl::malloc_device<double>(dense.size(), dpct::get_default_queue()); + + dpct::get_default_queue().memcpy(dev_pattern, pattern.data(), + sizeof(size_t) * pattern.size()).wait(); + dpct::get_default_queue().memcpy(dev_pattern_gather, pattern_gather.data(), + sizeof(size_t) * pattern_gather.size()).wait(); + dpct::get_default_queue().memcpy(dev_pattern_scatter, pattern_scatter.data(), + sizeof(size_t) * pattern_scatter.size()).wait(); + + dpct::get_default_queue().memcpy(dev_sparse, sparse.data(), + sizeof(double) * sparse.size()).wait(); + dpct::get_default_queue().memcpy(dev_sparse_gather, sparse_gather.data(), + sizeof(double) * sparse_gather.size()).wait(); + dpct::get_default_queue().memcpy(dev_sparse_scatter, sparse_scatter.data(), + sizeof(double) * sparse_scatter.size()).wait(); + dpct::get_default_queue().memcpy(dev_dense, dense.data(), + sizeof(double) * dense.size()).wait(); + + dpct::get_current_device().queues_wait_and_throw(); +} +#endif + } // namespace Spatter diff --git a/src/Spatter/Configuration.hh b/src/Spatter/Configuration.hh index 
0636e163..184ac9c4 100644 --- a/src/Spatter/Configuration.hh +++ b/src/Spatter/Configuration.hh @@ -42,6 +42,10 @@ inline void gpuAssert( } #endif +#ifdef USE_ONEAPI +#include "OneApiBackend.hh" +#endif + #include "AlignedAllocator.hh" #include "SpatterTypes.hh" #include "Timer.hh" @@ -230,6 +234,46 @@ public: }; #endif +#ifdef USE_ONEAPI +template <> class Configuration : public ConfigurationBase { +public: + Configuration(const size_t id, const std::string name, + const std::string kernel, const aligned_vector &pattern, + const aligned_vector &pattern_gather, + const aligned_vector &pattern_scatter, + aligned_vector &sparse, size_t &sparse_size, + aligned_vector &sparse_gather, size_t &sparse_gather_size, + aligned_vector &sparse_scatter, size_t &sparse_scatter_size, + aligned_vector &dense, size_t &dense_size, + aligned_vector> &dense_perthread, + const size_t delta, const size_t delta_gather, const size_t delta_scatter, + const long int seed, const size_t wrap, const size_t count, + const size_t shared_mem, const size_t local_work_size, + const unsigned long nruns, const bool aggregate, const bool atomic, + const unsigned long verbosity); + + ~Configuration(); + + int run(bool timed, unsigned long run_id); + void gather(bool timed, unsigned long run_id); + void scatter(bool timed, unsigned long run_id); + void scatter_gather(bool timed, unsigned long run_id); + void multi_gather(bool timed, unsigned long run_id); + void multi_scatter(bool timed, unsigned long run_id); + void setup(); + +public: + size_t *dev_pattern; + size_t *dev_pattern_gather; + size_t *dev_pattern_scatter; + + double *dev_sparse; + double *dev_sparse_gather; + double *dev_sparse_scatter; + + double *dev_dense; +}; +#endif } // namespace Spatter #endif diff --git a/src/Spatter/Input.hh b/src/Spatter/Input.hh index b0a2090a..677f4d6b 100644 --- a/src/Spatter/Input.hh +++ b/src/Spatter/Input.hh @@ -321,8 +321,10 @@ int parse_input(const int argc, char **argv, ClArgs &cl) { [](unsigned char c) 
{ return std::tolower(c); }); if ((backend.compare("serial") != 0) && - (backend.compare("openmp") != 0) && (backend.compare("cuda") != 0)) { - std::cerr << "Valid Backends are: serial, openmp, cuda" << std::endl; + (backend.compare("openmp") != 0) && + (backend.compare("cuda") != 0) && + (backend.compare("oneapi") != 0)) { + std::cerr << "Valid Backends are: serial, openmp, cuda, oneapi" << std::endl; return -1; } if (backend.compare("openmp") == 0) { @@ -335,6 +337,12 @@ int parse_input(const int argc, char **argv, ClArgs &cl) { #ifndef USE_CUDA std::cerr << "FAIL - CUDA Backend is not Enabled" << std::endl; return -1; +#endif + } + if (backend.compare("oneapi") == 0) { +#ifndef USE_ONEAPI + std::cerr << "FAIL - OneApi Backend is not Enabled" << std::endl; + return -1; #endif } break; @@ -485,12 +493,15 @@ int parse_input(const int argc, char **argv, ClArgs &cl) { // Set default backend if one was not specified if (backend.compare("") == 0) { backend = "serial"; - // Assume only one of USE_CUDA and USE_OPENMP can be true at once + // Assume only one of USE_OPENMP, USE_CUDA and USE_ONEAPI can be true at once #ifdef USE_OPENMP backend = "openmp"; #endif #ifdef USE_CUDA backend = "cuda"; +#endif +#ifdef USE_ONEAPI + backend = "oneapi"; #endif } @@ -626,6 +637,16 @@ int parse_input(const int argc, char **argv, ClArgs &cl) { cl.dense_perthread, delta, delta_gather, delta_scatter, seed, wrap, count, shared_mem, local_work_size, nruns, aggregate, atomic, verbosity); +#endif +#ifdef USE_ONEAPI + else if (backend.compare("oneapi") == 0) + c = std::make_unique>(0, + config_name, kernel, pattern, pattern_gather, pattern_scatter, + cl.sparse, cl.sparse_size, cl.sparse_gather, cl.sparse_gather_size, + cl.sparse_scatter, cl.sparse_scatter_size, cl.dense, cl.dense_size, + cl.dense_perthread, delta, delta_gather, delta_scatter, seed, wrap, + count, shared_mem, local_work_size, nruns, aggregate, atomic, + verbosity); #endif else { std::cerr << "Invalid Backend " << backend << 
std::endl; diff --git a/src/Spatter/JSONParser.cc b/src/Spatter/JSONParser.cc index 95d167ec..3b89f415 100644 --- a/src/Spatter/JSONParser.cc +++ b/src/Spatter/JSONParser.cc @@ -207,6 +207,17 @@ std::unique_ptr JSONParser::operator[]( data_[index]["wrap"], data_[index]["count"], shared_mem_, data_[index]["local-work-size"], data_[index]["nruns"], aggregate_, atomic_, verbosity_); +#endif +#ifdef USE_ONEAPI + else if (backend_.compare("oneapi") == 0) + c = std::make_unique>(index, + data_[index]["name"], data_[index]["kernel"], pattern, pattern_gather, + pattern_scatter, sparse, sparse_size, sparse_gather, sparse_gather_size, + sparse_scatter, sparse_scatter_size, dense, dense_size, dense_perthread, + delta, delta_gather, delta_scatter, data_[index]["seed"], + data_[index]["wrap"], data_[index]["count"], shared_mem_, + data_[index]["local-work-size"], data_[index]["nruns"], aggregate_, + atomic_, verbosity_); #endif else { std::cerr << "Invalid Backend " << backend_ << std::endl; diff --git a/src/Spatter/OneApiBackend.cc b/src/Spatter/OneApiBackend.cc new file mode 100644 index 00000000..2169361f --- /dev/null +++ b/src/Spatter/OneApiBackend.cc @@ -0,0 +1,641 @@ +#include +#include +#include + +#include "Configuration.hh" +#include + +#include + +#include + +void oneapi_gather(const size_t *pattern, const double *sparse, + double *dense, const size_t pattern_length, const size_t delta, + const size_t wrap, const size_t count, sycl::nd_item<3> item_ct1) { + size_t total_id = (size_t)((size_t)item_ct1.get_local_range(2) * + (size_t)item_ct1.get_group(2) + + (size_t)item_ct1.get_local_id(2)); + size_t j = total_id % pattern_length; // pat_idx + size_t i = total_id / pattern_length; // count_idx + + double x; + + if (i < count) { + // dense[j + pattern_length * (i % wrap)] = sparse[pattern[j] + delta * i]; + x = sparse[pattern[j] + delta * i]; + if (x == 0.5) + dense[0] = x; + } +} + +void oneapi_scatter(const size_t *pattern, double *sparse, + const double *dense, 
const size_t pattern_length, const size_t delta, + const size_t wrap, const size_t count, sycl::nd_item<3> item_ct1) { + size_t total_id = (size_t)((size_t)item_ct1.get_local_range(2) * + (size_t)item_ct1.get_group(2) + + (size_t)item_ct1.get_local_id(2)); + size_t j = total_id % pattern_length; // pat_idx + size_t i = total_id / pattern_length; // count_idx + + if (i < count) + sparse[pattern[j] + delta * i] = dense[j + pattern_length * (i % wrap)]; +} + +void oneapi_scatter_atomic(const size_t *pattern, double *sparse, + const double *dense, const size_t pattern_length, const size_t delta, + const size_t wrap, const size_t count, sycl::nd_item<3> item_ct1) { + size_t total_id = (size_t)((size_t)item_ct1.get_local_range(2) * + (size_t)item_ct1.get_group(2) + + (size_t)item_ct1.get_local_id(2)); + size_t j = total_id % pattern_length; // pat_idx + size_t i = total_id / pattern_length; // count_idx + + if (i < count) + dpct::atomic_exchange( + (unsigned long long int *)&sparse[pattern[j] + delta * i], + (unsigned long long)(sycl::bit_cast( + dense[j + pattern_length * (i % wrap)]))); +} + +void oneapi_scatter_gather(const size_t *pattern_scatter, + double *sparse_scatter, const size_t *pattern_gather, + const double *sparse_gather, const size_t pattern_length, + const size_t delta_scatter, const size_t delta_gather, const size_t wrap, + const size_t count, sycl::nd_item<3> item_ct1) { + size_t total_id = (size_t)((size_t)item_ct1.get_local_range(2) * + (size_t)item_ct1.get_group(2) + + (size_t)item_ct1.get_local_id(2)); + size_t j = total_id % pattern_length; // pat_idx + size_t i = total_id / pattern_length; // count_idx + + // printf("%lu, %lu, %lu\n", total_id, j, i); + if (i < count) + sparse_scatter[pattern_scatter[j] + delta_scatter * i] = + sparse_gather[pattern_gather[j] + delta_gather * i]; +} + +void oneapi_scatter_gather_atomic(const size_t *pattern_scatter, + double *sparse_scatter, const size_t *pattern_gather, + const double *sparse_gather, const 
size_t pattern_length, + const size_t delta_scatter, const size_t delta_gather, const size_t wrap, + const size_t count, sycl::nd_item<3> item_ct1) { + size_t total_id = (size_t)((size_t)item_ct1.get_local_range(2) * + (size_t)item_ct1.get_group(2) + + (size_t)item_ct1.get_local_id(2)); + size_t j = total_id % pattern_length; // pat_idx + size_t i = total_id / pattern_length; // count_idx + + // printf("%lu, %lu, %lu\n", total_id, j, i); + if (i < count) + dpct::atomic_exchange( + (unsigned long long int + *)&sparse_scatter[pattern_scatter[j] + delta_scatter * i], + (unsigned long long)(sycl::bit_cast( + sparse_gather[pattern_gather[j] + delta_gather * i]))); +} + +void oneapi_multi_gather(const size_t *pattern, + const size_t *pattern_gather, const double *sparse, double *dense, + const size_t pattern_length, const size_t delta, const size_t wrap, + const size_t count, sycl::nd_item<3> item_ct1) { + size_t total_id = (size_t)((size_t)item_ct1.get_local_range(2) * + (size_t)item_ct1.get_group(2) + + (size_t)item_ct1.get_local_id(2)); + size_t j = total_id % pattern_length; // pat_idx + size_t i = total_id / pattern_length; // count_idx + + double x; + + if (i < count) { + // dense[j + pattern_length * (i % wrap)] = + // sparse[pattern[pattern_gather[j]] + delta * i]; + x = sparse[pattern[pattern_gather[j]] + delta * i]; + if (x == 0.5) + dense[0] = x; + } +} + +void oneapi_multi_scatter(const size_t *pattern, + const size_t *pattern_scatter, double *sparse, const double *dense, + const size_t pattern_length, const size_t delta, const size_t wrap, + const size_t count, sycl::nd_item<3> item_ct1) { + size_t total_id = (size_t)((size_t)item_ct1.get_local_range(2) * + (size_t)item_ct1.get_group(2) + + (size_t)item_ct1.get_local_id(2)); + size_t j = total_id % pattern_length; // pat_idx + size_t i = total_id / pattern_length; // count_idx + + if (i < count) + sparse[pattern[pattern_scatter[j]] + delta * i] = + dense[j + pattern_length * (i % wrap)]; +} + +void 
oneapi_multi_scatter_atomic(const size_t *pattern, + const size_t *pattern_scatter, double *sparse, const double *dense, + const size_t pattern_length, const size_t delta, const size_t wrap, + const size_t count, sycl::nd_item<3> item_ct1) { + size_t total_id = (size_t)((size_t)item_ct1.get_local_range(2) * + (size_t)item_ct1.get_group(2) + + (size_t)item_ct1.get_local_id(2)); + size_t j = total_id % pattern_length; // pat_idx + size_t i = total_id / pattern_length; // count_idx + + if (i < count) + dpct::atomic_exchange( + (unsigned long long int + *)&sparse[pattern[pattern_scatter[j]] + delta * i], + (unsigned long long)(sycl::bit_cast( + dense[j + pattern_length * (i % wrap)]))); +} + +float oneapi_gather_wrapper(const size_t *pattern, const double *sparse, + double *dense, const size_t pattern_length, const size_t delta, + const size_t wrap, const size_t count) { + dpct::device_ext &dev_ct1 = dpct::get_current_device(); + sycl::queue &q_ct1 = dev_ct1.default_queue(); + dpct::event_ptr start, stop; + std::chrono::time_point start_ct1; + std::chrono::time_point stop_ct1; + + start = new sycl::event(); + stop = new sycl::event(); + + int threads_per_block = std::min(pattern_length, (size_t)1024); + int blocks_per_grid = + ((pattern_length * count) + threads_per_block - 1) / threads_per_block; + + dev_ct1.queues_wait_and_throw(); + /* + DPCT1012:8: Detected kernel execution time measurement pattern and generated + an initial code for time measurements in SYCL. You can change the way time is + measured depending on your goals. + */ + start_ct1 = std::chrono::steady_clock::now(); + + /* + DPCT1049:0: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
+ */ + *stop = q_ct1.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, blocks_per_grid) * + sycl::range<3>(1, 1, threads_per_block), + sycl::range<3>(1, 1, threads_per_block)), + [=](sycl::nd_item<3> item_ct1) { + oneapi_gather(pattern, sparse, dense, pattern_length, delta, wrap, count, + item_ct1); + }); + /* + DPCT1026:9: The call to cudaGetLastError was removed because this call is + redundant in SYCL. + */ + + /* + DPCT1012:10: Detected kernel execution time measurement pattern and generated + an initial code for time measurements in SYCL. You can change the way time is + measured depending on your goals. + */ + stop->wait(); + stop_ct1 = std::chrono::steady_clock::now(); + + float time_ms = 0; + time_ms = + std::chrono::duration(stop_ct1 - start_ct1).count(); + + dpct::destroy_event(start); + dpct::destroy_event(stop); + + return time_ms; +} + +float oneapi_scatter_wrapper(const size_t *pattern, double *sparse, + const double *dense, const size_t pattern_length, const size_t delta, + const size_t wrap, const size_t count) { + dpct::device_ext &dev_ct1 = dpct::get_current_device(); + sycl::queue &q_ct1 = dev_ct1.default_queue(); + dpct::event_ptr start, stop; + std::chrono::time_point start_ct1; + std::chrono::time_point stop_ct1; + + start = new sycl::event(); + stop = new sycl::event(); + + int threads_per_block = std::min(pattern_length, (size_t)1024); + int blocks_per_grid = + ((pattern_length * count) + threads_per_block - 1) / threads_per_block; + + dev_ct1.queues_wait_and_throw(); + /* + DPCT1012:11: Detected kernel execution time measurement pattern and generated + an initial code for time measurements in SYCL. You can change the way time is + measured depending on your goals. + */ + start_ct1 = std::chrono::steady_clock::now(); + + /* + DPCT1049:1: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
+ */ + *stop = q_ct1.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, blocks_per_grid) * + sycl::range<3>(1, 1, threads_per_block), + sycl::range<3>(1, 1, threads_per_block)), + [=](sycl::nd_item<3> item_ct1) { + oneapi_scatter(pattern, sparse, dense, pattern_length, delta, wrap, count, + item_ct1); + }); + /* + DPCT1026:12: The call to cudaGetLastError was removed because this call is + redundant in SYCL. + */ + + /* + DPCT1012:13: Detected kernel execution time measurement pattern and generated + an initial code for time measurements in SYCL. You can change the way time is + measured depending on your goals. + */ + stop->wait(); + stop_ct1 = std::chrono::steady_clock::now(); + + float time_ms = 0; + time_ms = + std::chrono::duration(stop_ct1 - start_ct1).count(); + + dpct::destroy_event(start); + dpct::destroy_event(stop); + + return time_ms; +} + +float oneapi_scatter_atomic_wrapper(const size_t *pattern, double *sparse, + const double *dense, const size_t pattern_length, const size_t delta, + const size_t wrap, const size_t count) { + dpct::device_ext &dev_ct1 = dpct::get_current_device(); + sycl::queue &q_ct1 = dev_ct1.default_queue(); + dpct::event_ptr start, stop; + std::chrono::time_point start_ct1; + std::chrono::time_point stop_ct1; + + start = new sycl::event(); + stop = new sycl::event(); + + int threads_per_block = std::min(pattern_length, (size_t)1024); + int blocks_per_grid = + ((pattern_length * count) + threads_per_block - 1) / threads_per_block; + + dev_ct1.queues_wait_and_throw(); + /* + DPCT1012:14: Detected kernel execution time measurement pattern and generated + an initial code for time measurements in SYCL. You can change the way time is + measured depending on your goals. + */ + start_ct1 = std::chrono::steady_clock::now(); + + /* + DPCT1049:2: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
+ */ + *stop = q_ct1.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, blocks_per_grid) * + sycl::range<3>(1, 1, threads_per_block), + sycl::range<3>(1, 1, threads_per_block)), + [=](sycl::nd_item<3> item_ct1) { + oneapi_scatter_atomic(pattern, sparse, dense, pattern_length, delta, wrap, + count, item_ct1); + }); + /* + DPCT1026:15: The call to cudaGetLastError was removed because this call is + redundant in SYCL. + */ + + /* + DPCT1012:16: Detected kernel execution time measurement pattern and generated + an initial code for time measurements in SYCL. You can change the way time is + measured depending on your goals. + */ + stop->wait(); + stop_ct1 = std::chrono::steady_clock::now(); + + float time_ms = 0; + time_ms = + std::chrono::duration(stop_ct1 - start_ct1).count(); + + dpct::destroy_event(start); + dpct::destroy_event(stop); + + return time_ms; +} + +float oneapi_scatter_gather_wrapper(const size_t *pattern_scatter, + double *sparse_scatter, const size_t *pattern_gather, + const double *sparse_gather, const size_t pattern_length, + const size_t delta_scatter, const size_t delta_gather, const size_t wrap, + const size_t count) { + dpct::device_ext &dev_ct1 = dpct::get_current_device(); + sycl::queue &q_ct1 = dev_ct1.default_queue(); + dpct::event_ptr start, stop; + std::chrono::time_point start_ct1; + std::chrono::time_point stop_ct1; + + start = new sycl::event(); + stop = new sycl::event(); + + int threads_per_block = std::min(pattern_length, (size_t)1024); + int blocks_per_grid = + ((pattern_length * count) + threads_per_block - 1) / threads_per_block; + + dev_ct1.queues_wait_and_throw(); + /* + DPCT1012:17: Detected kernel execution time measurement pattern and generated + an initial code for time measurements in SYCL. You can change the way time is + measured depending on your goals. + */ + start_ct1 = std::chrono::steady_clock::now(); + + /* + DPCT1049:3: The work-group size passed to the SYCL kernel may exceed the + limit. 
To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + *stop = q_ct1.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, blocks_per_grid) * + sycl::range<3>(1, 1, threads_per_block), + sycl::range<3>(1, 1, threads_per_block)), + [=](sycl::nd_item<3> item_ct1) { + oneapi_scatter_gather(pattern_scatter, sparse_scatter, pattern_gather, + sparse_gather, pattern_length, delta_scatter, delta_gather, wrap, + count, item_ct1); + }); + /* + DPCT1026:18: The call to cudaGetLastError was removed because this call is + redundant in SYCL. + */ + + /* + DPCT1012:19: Detected kernel execution time measurement pattern and generated + an initial code for time measurements in SYCL. You can change the way time is + measured depending on your goals. + */ + stop->wait(); + stop_ct1 = std::chrono::steady_clock::now(); + + float time_ms = 0; + time_ms = + std::chrono::duration(stop_ct1 - start_ct1).count(); + + dpct::destroy_event(start); + dpct::destroy_event(stop); + + return time_ms; +} + +float oneapi_scatter_gather_atomic_wrapper(const size_t *pattern_scatter, + double *sparse_scatter, const size_t *pattern_gather, + const double *sparse_gather, const size_t pattern_length, + const size_t delta_scatter, const size_t delta_gather, const size_t wrap, + const size_t count) { + dpct::device_ext &dev_ct1 = dpct::get_current_device(); + sycl::queue &q_ct1 = dev_ct1.default_queue(); + dpct::event_ptr start, stop; + std::chrono::time_point start_ct1; + std::chrono::time_point stop_ct1; + + start = new sycl::event(); + stop = new sycl::event(); + + int threads_per_block = std::min(pattern_length, (size_t)1024); + int blocks_per_grid = + ((pattern_length * count) + threads_per_block - 1) / threads_per_block; + + dev_ct1.queues_wait_and_throw(); + /* + DPCT1012:20: Detected kernel execution time measurement pattern and generated + an initial code for time measurements in SYCL. 
You can change the way time is + measured depending on your goals. + */ + start_ct1 = std::chrono::steady_clock::now(); + + /* + DPCT1049:4: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + *stop = q_ct1.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, blocks_per_grid) * + sycl::range<3>(1, 1, threads_per_block), + sycl::range<3>(1, 1, threads_per_block)), + [=](sycl::nd_item<3> item_ct1) { + oneapi_scatter_gather_atomic(pattern_scatter, sparse_scatter, + pattern_gather, sparse_gather, pattern_length, delta_scatter, + delta_gather, wrap, count, item_ct1); + }); + /* + DPCT1026:21: The call to cudaGetLastError was removed because this call is + redundant in SYCL. + */ + + /* + DPCT1012:22: Detected kernel execution time measurement pattern and generated + an initial code for time measurements in SYCL. You can change the way time is + measured depending on your goals. 
+ */ + stop->wait(); + stop_ct1 = std::chrono::steady_clock::now(); + + float time_ms = 0; + time_ms = + std::chrono::duration(stop_ct1 - start_ct1).count(); + + dpct::destroy_event(start); + dpct::destroy_event(stop); + + return time_ms; +} + +float oneapi_multi_gather_wrapper(const size_t *pattern, + const size_t *pattern_gather, const double *sparse, double *dense, + const size_t pattern_length, const size_t delta, const size_t wrap, + const size_t count) { + dpct::device_ext &dev_ct1 = dpct::get_current_device(); + sycl::queue &q_ct1 = dev_ct1.default_queue(); + dpct::event_ptr start, stop; + std::chrono::time_point start_ct1; + std::chrono::time_point stop_ct1; + + start = new sycl::event(); + stop = new sycl::event(); + + int threads_per_block = std::min(pattern_length, (size_t)1024); + int blocks_per_grid = + ((pattern_length * count) + threads_per_block - 1) / threads_per_block; + + dev_ct1.queues_wait_and_throw(); + /* + DPCT1012:23: Detected kernel execution time measurement pattern and generated + an initial code for time measurements in SYCL. You can change the way time is + measured depending on your goals. + */ + start_ct1 = std::chrono::steady_clock::now(); + + /* + DPCT1049:5: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + *stop = q_ct1.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, blocks_per_grid) * + sycl::range<3>(1, 1, threads_per_block), + sycl::range<3>(1, 1, threads_per_block)), + [=](sycl::nd_item<3> item_ct1) { + oneapi_multi_gather(pattern, pattern_gather, sparse, dense, + pattern_length, delta, wrap, count, item_ct1); + }); + /* + DPCT1026:24: The call to cudaGetLastError was removed because this call is + redundant in SYCL. + */ + + /* + DPCT1012:25: Detected kernel execution time measurement pattern and generated + an initial code for time measurements in SYCL. 
You can change the way time is + measured depending on your goals. + */ + stop->wait(); + stop_ct1 = std::chrono::steady_clock::now(); + + float time_ms = 0; + time_ms = + std::chrono::duration(stop_ct1 - start_ct1).count(); + + dpct::destroy_event(start); + dpct::destroy_event(stop); + + return time_ms; +} + +float oneapi_multi_scatter_wrapper(const size_t *pattern, + const size_t *pattern_scatter, double *sparse, const double *dense, + const size_t pattern_length, const size_t delta, const size_t wrap, + const size_t count) { + dpct::device_ext &dev_ct1 = dpct::get_current_device(); + sycl::queue &q_ct1 = dev_ct1.default_queue(); + dpct::event_ptr start, stop; + std::chrono::time_point start_ct1; + std::chrono::time_point stop_ct1; + + start = new sycl::event(); + stop = new sycl::event(); + + int threads_per_block = std::min(pattern_length, (size_t)1024); + int blocks_per_grid = + ((pattern_length * count) + threads_per_block - 1) / threads_per_block; + + dev_ct1.queues_wait_and_throw(); + /* + DPCT1012:26: Detected kernel execution time measurement pattern and generated + an initial code for time measurements in SYCL. You can change the way time is + measured depending on your goals. + */ + start_ct1 = std::chrono::steady_clock::now(); + + /* + DPCT1049:6: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + *stop = q_ct1.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, blocks_per_grid) * + sycl::range<3>(1, 1, threads_per_block), + sycl::range<3>(1, 1, threads_per_block)), + [=](sycl::nd_item<3> item_ct1) { + oneapi_multi_scatter(pattern, pattern_scatter, sparse, dense, + pattern_length, delta, wrap, count, item_ct1); + }); + /* + DPCT1026:27: The call to cudaGetLastError was removed because this call is + redundant in SYCL. 
+ */ + + /* + DPCT1012:28: Detected kernel execution time measurement pattern and generated + an initial code for time measurements in SYCL. You can change the way time is + measured depending on your goals. + */ + stop->wait(); + stop_ct1 = std::chrono::steady_clock::now(); + + float time_ms = 0; + time_ms = + std::chrono::duration(stop_ct1 - start_ct1).count(); + + dpct::destroy_event(start); + dpct::destroy_event(stop); + + return time_ms; +} + +float oneapi_multi_scatter_atomic_wrapper(const size_t *pattern, + const size_t *pattern_scatter, double *sparse, const double *dense, + const size_t pattern_length, const size_t delta, const size_t wrap, + const size_t count) { + dpct::device_ext &dev_ct1 = dpct::get_current_device(); + sycl::queue &q_ct1 = dev_ct1.default_queue(); + dpct::event_ptr start, stop; + std::chrono::time_point start_ct1; + std::chrono::time_point stop_ct1; + + start = new sycl::event(); + stop = new sycl::event(); + + int threads_per_block = std::min(pattern_length, (size_t)1024); + int blocks_per_grid = + ((pattern_length * count) + threads_per_block - 1) / threads_per_block; + + dev_ct1.queues_wait_and_throw(); + /* + DPCT1012:29: Detected kernel execution time measurement pattern and generated + an initial code for time measurements in SYCL. You can change the way time is + measured depending on your goals. + */ + start_ct1 = std::chrono::steady_clock::now(); + + /* + DPCT1049:7: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. 
+ */ + *stop = q_ct1.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, blocks_per_grid) * + sycl::range<3>(1, 1, threads_per_block), + sycl::range<3>(1, 1, threads_per_block)), + [=](sycl::nd_item<3> item_ct1) { + oneapi_multi_scatter_atomic(pattern, pattern_scatter, sparse, dense, + pattern_length, delta, wrap, count, item_ct1); + }); + /* + DPCT1026:30: The call to cudaGetLastError was removed because this call is + redundant in SYCL. + */ + + /* + DPCT1012:31: Detected kernel execution time measurement pattern and generated + an initial code for time measurements in SYCL. You can change the way time is + measured depending on your goals. + */ + stop->wait(); + stop_ct1 = std::chrono::steady_clock::now(); + + float time_ms = 0; + time_ms = + std::chrono::duration(stop_ct1 - start_ct1).count(); + + dpct::destroy_event(start); + dpct::destroy_event(stop); + + return time_ms; +} diff --git a/src/Spatter/OneApiBackend.hh b/src/Spatter/OneApiBackend.hh new file mode 100644 index 00000000..0f746680 --- /dev/null +++ b/src/Spatter/OneApiBackend.hh @@ -0,0 +1,37 @@ +#ifndef ONEAPI_BACKEND_HH +#define ONEAPI_BACKEND_HH + +#include + +float oneapi_gather_wrapper(const size_t *pattern, const double *sparse, + double *dense, const size_t pattern_length, const size_t delta, + const size_t wrap, const size_t count); +float oneapi_scatter_wrapper(const size_t *pattern, double *sparse, + const double *dense, const size_t pattern_length, const size_t delta, + const size_t wrap, const size_t count); +float oneapi_scatter_atomic_wrapper(const size_t *pattern, double *sparse, + const double *dense, const size_t pattern_length, const size_t delta, + const size_t wrap, const size_t count); +float oneapi_scatter_gather_wrapper(const size_t *pattern_scatter, + double *sparse_scatter, const size_t *pattern_gather, + const double *sparse_gather, const size_t pattern_length, + const size_t delta_scatter, const size_t delta_gather, const size_t wrap, + const size_t count); +float 
oneapi_scatter_gather_atomic_wrapper(const size_t *pattern_scatter,
+    double *sparse_scatter, const size_t *pattern_gather,
+    const double *sparse_gather, const size_t pattern_length,
+    const size_t delta_scatter, const size_t delta_gather, const size_t wrap,
+    const size_t count);
+float oneapi_multi_gather_wrapper(const size_t *pattern,
+    const size_t *pattern_gather, const double *sparse, double *dense,
+    const size_t pattern_length, const size_t delta, const size_t wrap,
+    const size_t count);
+float oneapi_multi_scatter_wrapper(const size_t *pattern,
+    const size_t *pattern_scatter, double *sparse, const double *dense,
+    const size_t pattern_length, const size_t delta, const size_t wrap,
+    const size_t count);
+float oneapi_multi_scatter_atomic_wrapper(const size_t *pattern,
+    const size_t *pattern_scatter, double *sparse, const double *dense,
+    const size_t pattern_length, const size_t delta, const size_t wrap,
+    const size_t count);
+#endif
diff --git a/src/Spatter/SpatterTypes.hh b/src/Spatter/SpatterTypes.hh
index a486514c..97e07c57 100644
--- a/src/Spatter/SpatterTypes.hh
+++ b/src/Spatter/SpatterTypes.hh
@@ -11,6 +11,7 @@ namespace Spatter {
 struct Serial {};
 struct OpenMP {};
 struct CUDA {};
+struct OneApi {};
 } // namespace Spatter
 
 #endif
diff --git a/src/main.cc b/src/main.cc
index a3b4cbe6..c9fa7e18 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -48,6 +48,10 @@ void print_build_info(Spatter::ClArgs &cl) {
 }
 #endif
 
+#ifdef USE_ONEAPI
+  std::cout << "oneapi configuration (to be added)" << std::endl;
+#endif
+
   std::cout << std::endl;
 }