From 83c3741bdc4e5444d25728c3cd05291bd381d3cd Mon Sep 17 00:00:00 2001 From: Matthew Michel <106704043+mmichel11@users.noreply.github.com> Date: Fri, 31 Jan 2025 15:33:36 -0600 Subject: [PATCH] Re-implement SYCL backend `parallel_for` to improve bandwidth utilization (#1976) Signed-off-by: Matthew Michel --- .../kt/internal/esimd_radix_sort_submitters.h | 2 +- .../internal/async_impl/async_impl_hetero.h | 18 +- .../oneapi/dpl/internal/binary_search_impl.h | 37 +- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 103 +++- .../hetero/algorithm_ranges_impl_hetero.h | 71 ++- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 56 +- .../hetero/dpcpp/parallel_backend_sycl_for.h | 209 +++++++ .../hetero/dpcpp/parallel_backend_sycl_fpga.h | 4 +- .../dpcpp/parallel_backend_sycl_merge.h | 2 +- .../dpcpp/parallel_backend_sycl_reduce.h | 2 +- .../dpcpp/parallel_backend_sycl_utils.h | 167 +++++ .../dpl/pstl/hetero/dpcpp/sycl_traits.h | 50 +- .../pstl/hetero/dpcpp/unseq_backend_sycl.h | 570 ++++++++++++++++-- .../dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h | 53 ++ .../dpl/pstl/hetero/histogram_impl_hetero.h | 4 +- .../dpl/pstl/hetero/numeric_impl_hetero.h | 7 +- include/oneapi/dpl/pstl/utils.h | 30 + .../device_copyable.pass.cpp | 66 +- .../alg.reverse/reverse.pass.cpp | 3 +- .../alg.reverse/reverse_copy.pass.cpp | 3 +- .../copy_move.pass.cpp | 6 +- .../alg.modifying.operations/fill.pass.cpp | 4 +- .../generate.pass.cpp | 4 +- .../alg.modifying.operations/replace.pass.cpp | 3 +- .../replace_copy.pass.cpp | 7 +- .../alg.modifying.operations/rotate.pass.cpp | 4 +- .../rotate_copy.pass.cpp | 4 +- .../shift_left_right.pass.cpp | 39 +- .../swap_ranges.pass.cpp | 3 +- .../transform_binary.pass.cpp | 4 +- .../transform_unary.pass.cpp | 3 +- .../alg.nonmodifying/for_each.pass.cpp | 3 +- .../alg.nonmodifying/transform_if.pass.cpp | 26 +- .../numeric.ops/adjacent_difference.pass.cpp | 3 +- test/support/utils.h | 23 + 35 files changed, 1376 insertions(+), 217 deletions(-) create mode 100644 include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h diff --git a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h index 4d7b81e6a2e..4fc274f2445 100644 --- a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h +++ b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h @@ -27,7 +27,7 @@ namespace oneapi::dpl::experimental::kt::gpu::esimd::__impl { //------------------------------------------------------------------------ -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation //------------------------------------------------------------------------ template , _ExecutionPolicy&& __exec, _For auto __future_obj = oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf.all_view()); + unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, decltype(__buf.all_view())>{ + __f, static_cast(__n)}, + __n, __buf.all_view()); return __future_obj; } @@ -67,7 +69,9 @@ __pattern_walk2_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, 
__buf1.all_view(), __buf2.all_view()); + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__buf1.all_view()), + decltype(__buf2.all_view())>{__f, static_cast(__n)}, + __n, __buf1.all_view(), __buf2.all_view()); return __future.__make_future(__first2 + __n); } @@ -91,10 +95,12 @@ __pattern_walk3_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator3>(); auto __buf3 = __keep3(__first3, __first3 + __n); - auto __future = - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, - __buf1.all_view(), __buf2.all_view(), __buf3.all_view()); + auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__buf1.all_view()), + decltype(__buf2.all_view()), decltype(__buf3.all_view())>{ + __f, static_cast(__n)}, + __n, __buf1.all_view(), __buf2.all_view(), __buf3.all_view()); return __future.__make_future(__first3 + __n); } diff --git a/include/oneapi/dpl/internal/binary_search_impl.h b/include/oneapi/dpl/internal/binary_search_impl.h index ef01be4b161..a0c4c80b5d2 100644 --- a/include/oneapi/dpl/internal/binary_search_impl.h +++ b/include/oneapi/dpl/internal/binary_search_impl.h @@ -37,13 +37,19 @@ enum class search_algorithm binary_search }; -template -struct custom_brick +#if _ONEDPL_BACKEND_SYCL +template +struct __custom_brick : oneapi::dpl::unseq_backend::walk_scalar_base<_Range> { Comp comp; T size; bool use_32bit_indexing; + __custom_brick(Comp comp, T size, bool use_32bit_indexing) + : comp(std::move(comp)), size(size), use_32bit_indexing(use_32bit_indexing) + { + } + template void search_impl(_ItemId idx, _Acc acc) const @@ -68,17 +74,23 @@ struct custom_brick get<2>(acc[idx]) = (value != end_orig) && (get<1>(acc[idx]) == get<0>(acc[value])); } } - - template + template void - operator()(_ItemId idx, _Acc acc) const + __scalar_path_impl(_IsFull, _ItemId idx, _Acc acc) const { if (use_32bit_indexing) search_impl(idx, acc); else search_impl(idx, acc); } + template + void + operator()(_IsFull __is_full, _ItemId idx, _Acc acc) const + { + __scalar_path_impl(__is_full, idx, acc); + } }; +#endif template @@ -155,7 +167,8 @@ lower_bound_impl(__internal::__hetero_tag<_BackendTag>, Policy&& policy, InputIt const bool use_32bit_indexing = size <= std::numeric_limits::max(); __bknd::__parallel_for( _BackendTag{}, ::std::forward(policy), - custom_brick{comp, size, use_32bit_indexing}, + __custom_brick{ + comp, size, use_32bit_indexing}, value_size, zip_vw) .__deferrable_wait(); return result + value_size; @@ -187,7 +200,8 @@ upper_bound_impl(__internal::__hetero_tag<_BackendTag>, Policy&& policy, InputIt const bool use_32bit_indexing = size <= std::numeric_limits::max(); __bknd::__parallel_for( _BackendTag{}, std::forward(policy), - custom_brick{comp, size, use_32bit_indexing}, + __custom_brick{ + comp, size, use_32bit_indexing}, value_size, zip_vw) .__deferrable_wait(); return result + value_size; @@ -217,10 +231,11 @@ binary_search_impl(__internal::__hetero_tag<_BackendTag>, Policy&& policy, Input auto result_buf = keep_result(result, result + value_size); auto zip_vw = make_zip_view(input_buf.all_view(), value_buf.all_view(), result_buf.all_view()); const bool use_32bit_indexing = size <= 
std::numeric_limits::max(); - __bknd::__parallel_for(_BackendTag{}, std::forward(policy), - custom_brick{ - comp, size, use_32bit_indexing}, - value_size, zip_vw) + __bknd::__parallel_for( + _BackendTag{}, std::forward(policy), + __custom_brick{ + comp, size, use_32bit_indexing}, + value_size, zip_vw) .__deferrable_wait(); return result + value_size; } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 65bf99c8777..6c858cefa92 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -57,7 +57,10 @@ __pattern_walk1(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIt auto __buf = __keep(__first, __last); oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, __exec, unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf.all_view()) + _BackendTag{}, __exec, + unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, decltype(__buf.all_view())>{ + __f, static_cast(__n)}, + __n, __buf.all_view()) .__deferrable_wait(); } @@ -82,6 +85,14 @@ __pattern_walk1_n(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _F // TODO: A tag _WaitMode is used for provide a patterns call pipeline, where the last one should be synchronous // Probably it should be re-designed by a pipeline approach, when a pattern returns some sync objects // and ones are combined into a "pipeline" (probably like Range pipeline) +// +// A note on access mode types below: the __vector_path_impl in unseq_backend::walk2_vectors_or_scalars only respects +// the default template arguments: +// __acc_mode1 = __par_backend_hetero::access_mode::read +// __acc_mode2 = __par_backend_hetero::access_mode::write +// For any provided _Function object, the default access modes should be respected even if other access modes are +// required due to dependency / synchronization issues. 
For a detailed explanation, see: +// https://github.com/uxlfoundation/oneDPL/issues/1272 template , _ExecutionPolicy&& __exec, _ForwardIt auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf1.all_view(), __buf2.all_view()); + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__buf1.all_view()), + decltype(__buf2.all_view())>{__f, static_cast(__n)}, + __n, __buf1.all_view(), __buf2.all_view()); // Call no wait, wait or deferrable wait depending on _WaitMode __future.wait(_WaitMode{}); @@ -130,16 +143,39 @@ _ForwardIterator2 __pattern_swap(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _Function __f) { - return __pattern_walk2( - __tag, ::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __f); + const auto __n = __last1 - __first1; + if (__n == 0) + return __first2; + + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator1>(); + auto __buf1 = __keep1(__first1, __last1); + + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator2>(); + auto __buf2 = __keep2(__first2, __first2 + __n); + + auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::__brick_swap<_ExecutionPolicy, _Function, decltype(__buf1.all_view()), + decltype(__buf2.all_view())>{__f, static_cast(__n)}, + __n, __buf1.all_view(), __buf2.all_view()); + __future.wait(__par_backend_hetero::__deferrable_mode{}); + return __first2 + __n; } //------------------------------------------------------------------------ // walk3 //------------------------------------------------------------------------ +// A note on access mode types below: the __vector_path_impl in unseq_backend::walk3_vectors_or_scalars only respects +// the default template arguments: +// __acc_mode1 = __par_backend_hetero::access_mode::read +// __acc_mode2 = __par_backend_hetero::access_mode::read +// __acc_mode3 __par_backend_hetero::access_mode::write +// For any provided _Function object, the default access modes should be respected even if other access modes are +// required due to dependency / synchronization issues. 
For a detailed explanation, see: +// https://github.com/uxlfoundation/oneDPL/issues/1272 template , _ExecutionPolicy&& __exec, _ForwardIt auto __keep3 = oneapi::dpl::__ranges::__get_sycl_range<__acc_mode3, _ForwardIterator3>(); auto __buf3 = __keep3(__first3, __first3 + __n); - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, - __buf1.all_view(), __buf2.all_view(), __buf3.all_view()) + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__buf1.all_view()), + decltype(__buf2.all_view()), decltype(__buf3.all_view())>{ + __f, static_cast(__n)}, + __n, __buf1.all_view(), __buf2.all_view(), __buf3.all_view()) .__deferrable_wait(); return __first3 + __n; @@ -253,7 +292,9 @@ __pattern_walk2_transform_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& _ForwardIterator1 __last1, _ForwardIterator2 __first2, _Function __func) { // Require `read_write` access mode for output sequence to force a copy in for host iterators to capture incoming - // values of the output sequence for elements where the predicate is false. + // values of the output sequence for elements where the predicate is false. We never actually read from the output + // sequence, so there is no risk when ran with the vectorized path of walk2_vector_or_scalars. For more info, + // please see the comment above __pattern_walk2 and https://github.com/uxlfoundation/oneDPL/issues/1272. return __pattern_walk2( __tag, @@ -273,7 +314,9 @@ __pattern_walk3_transform_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& _Function __func) { // Require `read_write` access mode for output sequence to force a copy in for host iterators to capture incoming - // values of the output sequence for elements where the predicate is false. + // values of the output sequence for elements where the predicate is false. We never actually read from the output + // sequence, so there is no risk when ran with the vectorized path of walk3_vector_or_scalars. For more info, + // please see the comment above __pattern_walk3 and https://github.com/uxlfoundation/oneDPL/issues/1272. return __pattern_walk3<_BackendTag, __par_backend_hetero::access_mode::read, __par_backend_hetero::access_mode::read, __par_backend_hetero::access_mode::read_write>( __tag, @@ -1036,6 +1079,9 @@ __pattern_unique(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _It // The temporary buffer is constructed from a range, therefore it's destructor will not block, therefore // we must call __pattern_walk2 in a way which provides blocking synchronization for this pattern. + // We never actually write to the sequence, so there is no risk when ran with the vectorized path of + // walk2_vector_or_scalars. For more info, please see the comment above __pattern_walk2 and + // https://github.com/uxlfoundation/oneDPL/issues/1272. 
return __pattern_walk2( @@ -1564,9 +1610,10 @@ __pattern_reverse(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator>(); auto __buf = __keep(__first, __last); oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::__reverse_functor::difference_type>{__n}, __n / 2, - __buf.all_view()) + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::__reverse_functor::difference_type, + decltype(__buf.all_view())>{__n}, + __n / 2, __buf.all_view()) .__deferrable_wait(); } @@ -1590,8 +1637,9 @@ __pattern_reverse_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Bi oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator>(); auto __buf2 = __keep2(__result, __result + __n); oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::__reverse_copy::difference_type>{__n}, + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::__reverse_copy::difference_type, + decltype(__buf1.all_view()), decltype(__buf2.all_view())>{__n}, __n, __buf1.all_view(), __buf2.all_view()) .__deferrable_wait(); @@ -1632,18 +1680,20 @@ __pattern_rotate(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator const auto __shift = __new_first - __first; oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__rotate_wrapper>(__exec), - unseq_backend::__rotate_copy::difference_type>{__n, __shift}, __n, - __buf.all_view(), __temp_rng_w); + unseq_backend::__rotate_copy::difference_type, + decltype(__buf.all_view()), decltype(__temp_rng_w)>{__n, __shift}, + __n, __buf.all_view(), __temp_rng_w); //An explicit wait isn't required here because we are working with a temporary sycl::buffer and sycl accessors and //SYCL runtime makes a dependency graph to prevent the races between two __parallel_for patterns. 
using _Function = __brick_move<__hetero_tag<_BackendTag>, _ExecutionPolicy>; - auto __brick = unseq_backend::walk_n<_ExecutionPolicy, _Function>{_Function{}}; - auto __temp_rng_rw = oneapi::dpl::__ranges::all_view<_Tp, __par_backend_hetero::access_mode::read_write>(__temp_buf.get_buffer()); - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __brick, + auto __brick = + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__temp_rng_rw), + decltype(__buf.all_view())>{_Function{}, static_cast(__n)}; + oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __brick, __n, __temp_rng_rw, __buf.all_view()) .__deferrable_wait(); @@ -1677,8 +1727,8 @@ __pattern_rotate_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Bid oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::__rotate_copy::difference_type>{__n, - __shift}, + unseq_backend::__rotate_copy::difference_type, + decltype(__buf1.all_view()), decltype(__buf2.all_view())>{__n, __shift}, __n, __buf1.all_view(), __buf2.all_view()) .__deferrable_wait(); @@ -1936,19 +1986,22 @@ __pattern_shift_left(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Rang if (__n >= __mid) { using _Function = __brick_move<__hetero_tag<_BackendTag>, _ExecutionPolicy>; - auto __brick = oneapi::dpl::unseq_backend::walk_n<_ExecutionPolicy, _Function>{_Function{}}; //TODO: to consider use just "read" access mode for a source range and just "write" - for a destination range. auto __src = oneapi::dpl::__ranges::drop_view_simple<_Range, _DiffType>(__rng, __n); auto __dst = oneapi::dpl::__ranges::take_view_simple<_Range, _DiffType>(__rng, __size_res); + auto __brick = + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__src), decltype(__dst)>{ + _Function{}, static_cast(__size_res)}; + oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __brick, __size_res, __src, __dst) .__deferrable_wait(); } else //2. n < size/2; 'n' parallel copying { - auto __brick = unseq_backend::__brick_shift_left<_ExecutionPolicy, _DiffType>{__size, __n}; + auto __brick = unseq_backend::__brick_shift_left<_ExecutionPolicy, _DiffType, decltype(__rng)>{__size, __n}; oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__shift_left_right>( diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 65d15ebf26a..4dcd93367a7 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -58,10 +58,35 @@ __pattern_walk_n(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Function const _Size __n = std::min({_Size(__rngs.size())...}); if (__n > 0) { - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, - ::std::forward<_Ranges>(__rngs)...) 
- .__deferrable_wait(); + constexpr std::size_t __num_ranges = sizeof...(_Ranges); + static_assert(__num_ranges <= 3, "__pattern_walk_n only supports up to three packed range parameters"); + if constexpr (__num_ranges == 1) + { + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, std::decay_t<_Ranges>...>{ + __f, static_cast(__n)}, + __n, std::forward<_Ranges>(__rngs)...) + .__deferrable_wait(); + } + else if constexpr (__num_ranges == 2) + { + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, std::decay_t<_Ranges>...>{ + __f, static_cast(__n)}, + __n, std::forward<_Ranges>(__rngs)...) + .__deferrable_wait(); + } + else + { + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, std::decay_t<_Ranges>...>{ + __f, static_cast(__n)}, + __n, std::forward<_Ranges>(__rngs)...) + .__deferrable_wait(); + } } return __n; } @@ -149,20 +174,27 @@ __pattern_swap(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Rang { if (__rng1.size() <= __rng2.size()) { - oneapi::dpl::__internal::__ranges::__pattern_walk_n( - __tag, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__swap1_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - __f, __rng1, __rng2); - return __rng1.size(); + const std::size_t __n = __rng1.size(); + auto __exec1 = oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__swap1_wrapper>( + std::forward<_ExecutionPolicy>(__exec)); + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::move(__exec1), + unseq_backend::__brick_swap, std::decay_t<_Range2>>{ + __f, __n}, + __n, __rng1, __rng2) + .__deferrable_wait(); + return __n; } - - oneapi::dpl::__internal::__ranges::__pattern_walk_n( - __tag, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__swap2_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - __f, __rng2, __rng1); - return __rng2.size(); + const std::size_t __n = __rng2.size(); + auto __exec2 = + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__swap2_wrapper>(std::forward<_ExecutionPolicy>(__exec)); + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::move(__exec2), + unseq_backend::__brick_swap, std::decay_t<_Range1>>{__f, + __n}, + __n, __rng2, __rng1) + .__deferrable_wait(); + return __n; } //------------------------------------------------------------------------ @@ -627,8 +659,9 @@ __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_wrapper>( std::forward<_ExecutionPolicy>(__exec)), - unseq_backend::walk_n<_ExecutionPolicy, _CopyBrick>{_CopyBrick{}}, __n, std::forward<_Range1>(__rng), - std::forward<_Range2>(__result)) + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _CopyBrick, std::decay_t<_Range1>, + std::decay_t<_Range2>>{_CopyBrick{}, static_cast(__n)}, + __n, std::forward<_Range1>(__rng), std::forward<_Range2>(__result)) .get(); return 1; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index a1fcc1f8e15..be38682049e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -36,6 +36,7 @@ #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" +#include "parallel_backend_sycl_for.h" #include "parallel_backend_sycl_reduce.h" #include "parallel_backend_sycl_merge.h" #include "parallel_backend_sycl_merge_sort.h" @@ -218,57 +219,11 @@ class __scan_single_wg_dynamic_kernel; template class __scan_copy_single_wg_kernel; -//------------------------------------------------------------------------ -// parallel_for - async pattern -//------------------------------------------------------------------------ - -// Use the trick with incomplete type and partial specialization to deduce the kernel name -// as the parameter pack that can be empty (for unnamed kernels) or contain exactly one -// type (for explicitly specified name by the user) -template -struct __parallel_for_submitter; - -template -struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> -{ - template - auto - operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const - { - assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); - _PRINT_INFO_IN_DEBUG_MODE(__exec); - auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) { - //get an access to data under SYCL buffer: - oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - - __cgh.parallel_for<_Name...>(sycl::range(__count), [=](sycl::item __item_id) { - auto __idx = __item_id.get_linear_id(); - __brick(__idx, __rngs...); - }); - }); - return __future(__event); - } -}; - -//General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, -//for some algorithms happens that size of processing range is n, but amount of iterations is n/2. -template -auto -__parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Fp __brick, _Index __count, - _Ranges&&... 
__rngs) -{ - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _ForKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<_CustomName>; - - return __parallel_for_submitter<_ForKernel>()(::std::forward<_ExecutionPolicy>(__exec), __brick, __count, - ::std::forward<_Ranges>(__rngs)...); -} - //------------------------------------------------------------------------ // parallel_transform_scan - async pattern //------------------------------------------------------------------------ -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_scan_submitter; @@ -2174,7 +2129,7 @@ struct __partial_merge_kernel } }; -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_partial_sort_submitter; @@ -2412,7 +2367,8 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ oneapi::dpl::__par_backend_hetero::__parallel_for( oneapi::dpl::__internal::__device_backend_tag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), __intermediate_result_end, + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n), _Range2>(__binary_op, __n), + __intermediate_result_end, oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __intermediate_result_end), std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) @@ -2458,7 +2414,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ oneapi::dpl::__internal::__device_backend_tag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( std::forward<_ExecutionPolicy>(__exec)), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end), _Range4>( __binary_op, __intermediate_result_end), __result_end, oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __result_end), diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h new file mode 100644 index 00000000000..24811a15a06 --- /dev/null +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h @@ -0,0 +1,209 @@ +// -*- C++ -*- +//===-- parallel_backend_sycl_for.h ---------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_FOR_H +#define _ONEDPL_PARALLEL_BACKEND_SYCL_FOR_H + +#include +#include +#include +#include +#include + +#include "sycl_defs.h" +#include "parallel_backend_sycl_utils.h" +#include "execution_sycl_defs.h" +#include "unseq_backend_sycl.h" +#include "utils_ranges_sycl.h" + +#include "sycl_traits.h" //SYCL traits specialization for some oneDPL types. + +namespace oneapi +{ +namespace dpl +{ +namespace __par_backend_hetero +{ + +template +class __parallel_for_small_kernel; + +template +class __parallel_for_large_kernel; + +//------------------------------------------------------------------------ +// parallel_for - async pattern +//------------------------------------------------------------------------ + +// Use the trick with incomplete type and partial specialization to deduce the kernel name +// as the parameter pack that can be empty (for unnamed kernels) or contain exactly one +// type (for explicitly specified name by the user) +template +struct __parallel_for_small_submitter; + +template +struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const + { + assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); + _PRINT_INFO_IN_DEBUG_MODE(__exec); + auto __event = __exec.queue().submit([__rngs..., __brick, __count](sycl::handler& __cgh) { + //get an access to data under SYCL buffer: + oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); + + __cgh.parallel_for<_Name...>(sycl::range(__count), [=](sycl::item __item_id) { + const std::size_t __idx = __item_id.get_linear_id(); + // For small inputs, do not vectorize or perform multiple iterations per work item. Spread input evenly + // across compute units. + __brick.__scalar_path_impl(std::true_type{}, __idx, __rngs...); + }); + }); + return __future(__event); + } +}; + +template +struct __parallel_for_large_submitter; + +template +struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _RangeTypes...> +{ + // Limit the work-group size to 512 which has empirically yielded the best results across different architectures. + static constexpr std::uint16_t __max_work_group_size = 512; + + // SPIR-V compilation targets show best performance with a stride of the sub-group size. + // Other compilation targets perform best with a work-group size stride. This utility can only be called from the + // device. 
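    // Editor's illustration of the stride math below (hypothetical numbers, not part of the
    // patch): with __work_group_size = 512, a sub-group size of 32, __iters_per_work_item = 4
    // and __adj_elements_per_work_item = 2, the SPIR-V branch gives each work item a start of
    // __sub_group_start_idx + 2 * __sub_group_local_id and a stride of 2 * 32 = 64 elements,
    // so each iteration of a sub-group touches one contiguous 64-element span. The other
    // branch strides by 512 * 2 = 1024 elements, i.e. a full work-group apart, which is the
    // preferred access pattern for non-SPIR-V targets.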
+ static inline std::tuple + __stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item, + std::size_t __adj_elements_per_work_item, std::size_t __work_group_size) + { + const std::size_t __work_group_id = __item.get_group().get_group_linear_id(); + if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) + { + const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group(); + const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + + const std::size_t __sub_group_start_idx = + __iters_per_work_item * __adj_elements_per_work_item * + (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); + const bool __is_full_sub_group = + __sub_group_start_idx + __iters_per_work_item * __adj_elements_per_work_item * __sub_group_size <= + __count; + const std::size_t __work_item_idx = + __sub_group_start_idx + __adj_elements_per_work_item * __sub_group_local_id; + return std::tuple(__work_item_idx, __adj_elements_per_work_item * __sub_group_size, __is_full_sub_group); + } + else + { + const std::size_t __work_group_start_idx = + __work_group_id * __work_group_size * __iters_per_work_item * __adj_elements_per_work_item; + const std::size_t __work_item_idx = + __work_group_start_idx + __item.get_local_linear_id() * __adj_elements_per_work_item; + const bool __is_full_work_group = + __work_group_start_idx + __iters_per_work_item * __work_group_size * __adj_elements_per_work_item <= + __count; + return std::tuple(__work_item_idx, __work_group_size * __adj_elements_per_work_item, __is_full_work_group); + } + } + + // Once there is enough work to launch a group on each compute unit with our chosen __iters_per_item, + // then we should start using this code path. + template + static std::size_t + __estimate_best_start_size(const _ExecutionPolicy& __exec, _Fp __brick) + { + const std::size_t __work_group_size = + oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); + const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + return __work_group_size * _Fp::__preferred_iters_per_item * __max_cu; + } + + template + auto + operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const + { + assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) 
> 0); + const std::size_t __work_group_size = + oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); + _PRINT_INFO_IN_DEBUG_MODE(__exec); + auto __event = __exec.queue().submit([__rngs..., __brick, __work_group_size, __count](sycl::handler& __cgh) { + //get an access to data under SYCL buffer: + oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); + constexpr std::uint8_t __iters_per_work_item = _Fp::__preferred_iters_per_item; + constexpr std::uint8_t __vector_size = _Fp::__preferred_vector_size; + const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div( + __count, (__work_group_size * __vector_size * __iters_per_work_item)); + __cgh.parallel_for<_Name...>( + sycl::nd_range(sycl::range<1>(__num_groups * __work_group_size), sycl::range<1>(__work_group_size)), + [=](sycl::nd_item __item) { + const auto [__idx, __stride, __is_full] = + __stride_recommender(__item, __count, __iters_per_work_item, __vector_size, __work_group_size); + __strided_loop<__iters_per_work_item> __execute_loop{static_cast(__count)}; + if (__is_full) + { + __execute_loop(std::true_type{}, __idx, __stride, __brick, __rngs...); + } + // If we are not full, then take this branch only if there is work to process. + else if (__idx < __count) + { + __execute_loop(std::false_type{}, __idx, __stride, __brick, __rngs...); + } + }); + }); + return __future(__event); + } +}; + +//General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, +//for some algorithms happens that size of processing range is n, but amount of iterations is n/2. +template +auto +__parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Fp __brick, _Index __count, + _Ranges&&... __rngs) +{ + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + using _ForKernelSmall = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_small_kernel<_CustomName>>; + using _ForKernelLarge = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_large_kernel<_CustomName>>; + + using __small_submitter = __parallel_for_small_submitter<_ForKernelSmall>; + using __large_submitter = __parallel_for_large_submitter<_ForKernelLarge, _Ranges...>; + // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a + // single kernel that worsen performance for small cases. If the number of iterations of the large submitter is 1, + // then only compile the basic kernel as the two versions are effectively the same. 
+ if constexpr (_Fp::__preferred_iters_per_item > 1 || _Fp::__preferred_vector_size > 1) + { + if (__count >= __large_submitter::__estimate_best_start_size(__exec, __brick)) + { + return __large_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } + } + return __small_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); +} + +} // namespace __par_backend_hetero +} // namespace dpl +} // namespace oneapi + +#endif diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h index 9ff4fa2c80b..ef124405434 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h @@ -48,7 +48,7 @@ namespace __par_backend_hetero //General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, //for some algorithms happens that size of processing range is n, but amount of iterations is n/2. -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_for_fpga_submitter; @@ -70,7 +70,7 @@ struct __parallel_for_fpga_submitter<__internal::__optional_kernel_name<_Name... #pragma unroll(::std::decay <_ExecutionPolicy>::type::unroll_factor) for (auto __idx = 0; __idx < __count; ++__idx) { - __brick(__idx, __rngs...); + __brick.__scalar_path_impl(std::true_type{}, __idx, __rngs...); } }); }); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 280e3e20b13..6c97f0a55d4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -197,7 +197,7 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, const _I return {__rng1_idx, __rng2_idx}; } -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment for __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_merge_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index 0d2d49ed6d1..fcd1b3e83cf 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -111,7 +111,7 @@ __device_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Size _ //------------------------------------------------------------------------ // parallel_transform_reduce - async patterns -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation //------------------------------------------------------------------------ // Parallel_transform_reduce for a small arrays using a single work group. 
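To make the dispatch in the new parallel_backend_sycl_for.h above concrete, the sketch below replays its two formulas in plain C++: the large-kernel threshold of __estimate_best_start_size (one work-group doing __preferred_iters_per_item iterations per compute unit) and the nd_range sizing used by the large submitter. The device numbers (512-item work-groups, 64 compute units), the element count, and the iters/vector values (typical for 32-bit data under the 16-bytes-per-item tuning introduced later in this patch) are illustrative assumptions, not values taken from the patch.

#include <cstddef>
#include <iostream>

// Editor's sketch of the small/large parallel_for dispatch arithmetic; hypothetical device values.
int main()
{
    const std::size_t work_group_size = 512; // capped by __max_work_group_size
    const std::size_t max_cu = 64;           // hypothetical compute-unit count
    const std::size_t iters_per_item = 4;    // stands in for _Fp::__preferred_iters_per_item
    const std::size_t vector_size = 1;       // stands in for _Fp::__preferred_vector_size

    // Mirrors __estimate_best_start_size: enough elements to give every compute unit a
    // full work-group performing iters_per_item iterations each.
    const std::size_t large_path_threshold = work_group_size * iters_per_item * max_cu; // 131072

    const std::size_t count = 1 << 20; // hypothetical one-million-element input
    if (count >= large_path_threshold)
    {
        // Work-group count used to build the large submitter's nd_range.
        const std::size_t elems_per_group = work_group_size * vector_size * iters_per_item; // 2048
        const std::size_t num_groups = (count + elems_per_group - 1) / elems_per_group;     // 512
        std::cout << "large kernel: " << num_groups << " work-groups of " << work_group_size << "\n";
    }
    else
    {
        // Small path: one work item per element, spread evenly across the device.
        std::cout << "small kernel: one work-item per element\n";
    }
}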
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index c19538168e2..25b50bd20ea 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -907,6 +907,173 @@ __bypass_sycl_kernel_not_supported(const sycl::exception& __e) throw; } +struct __scalar_load_op +{ + oneapi::dpl::__internal::__pstl_assign __assigner; + template + void + operator()(_IdxType1 __idx_source, _IdxType2 __idx_dest, _SourceAcc __source_acc, _DestAcc __dest_acc) const + { + __assigner(__source_acc[__idx_source], __dest_acc[__idx_dest]); + } +}; + +template +struct __vector_load +{ + static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported"); + std::size_t __full_range_size; + template + void + operator()(/*__is_full*/ std::true_type, _IdxType __start_idx, _LoadOp __load_op, _Rngs&&... __rngs) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __vec_size; ++__i) + __load_op(__start_idx + __i, __i, __rngs...); + } + + template + void + operator()(/*__is_full*/ std::false_type, _IdxType __start_idx, _LoadOp __load_op, _Rngs&&... __rngs) const + { + std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__full_range_size - __start_idx}); + for (std::uint8_t __i = 0; __i < __elements; ++__i) + __load_op(__start_idx + __i, __i, __rngs...); + } +}; + +template +struct __scalar_store_transform_op +{ + _TransformOp __transform; + // Unary transformations into an output buffer + template + void + operator()(_IdxType1 __idx_source, _IdxType2 __idx_dest, _SourceAcc __source_acc, _DestAcc __dest_acc) const + { + __transform(__source_acc[__idx_source], __dest_acc[__idx_dest]); + } + // Binary transformations into an output buffer + template + void + operator()(_IdxType1 __idx_source, _IdxType2 __idx_dest, _Source1Acc __source1_acc, _Source2Acc __source2_acc, + _DestAcc __dest_acc) const + { + __transform(__source1_acc[__idx_source], __source2_acc[__idx_source], __dest_acc[__idx_dest]); + } +}; + +// TODO: Consider unifying the implementations of __vector_walk, __vector_load, __vector_store, and potentially +// __strided_loop with some common, generic utility +template +struct __vector_walk +{ + static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported"); + std::size_t __full_range_size; + + template + void + operator()(std::true_type, _IdxType __idx, _WalkFunction __f, _Rngs&&... __rngs) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __vec_size; ++__i) + { + __f(__rngs[__idx + __i]...); + } + } + // For a non-full vector path, process it sequentially. This will always be the last sub or work group + // if it does not evenly divide into input + template + void + operator()(std::false_type, _IdxType __idx, _WalkFunction __f, _Rngs&&... __rngs) const + { + std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__full_range_size - __idx}); + for (std::uint8_t __i = 0; __i < __elements; ++__i) + { + __f(__rngs[__idx + __i]...); + } + } +}; + +template +struct __vector_store +{ + static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported"); + std::size_t __full_range_size; + + template + void + operator()(std::true_type, _IdxType __start_idx, _StoreOp __store_op, _Rngs&&... 
__rngs) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __vec_size; ++__i) + __store_op(__i, __start_idx + __i, __rngs...); + } + template + void + operator()(std::false_type, _IdxType __start_idx, _StoreOp __store_op, _Rngs&&... __rngs) const + { + std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__full_range_size - __start_idx}); + for (std::uint8_t __i = 0; __i < __elements; ++__i) + __store_op(__i, __start_idx + __i, __rngs...); + } +}; + +template +struct __vector_reverse +{ + static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported"); + template + void + operator()(/*__is_full*/ std::true_type, const _Idx /*__elements_to_process*/, _Array __array) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __vec_size / 2; ++__i) + std::swap(__array[__i], __array[__vec_size - __i - 1]); + } + template + void + operator()(/*__is_full*/ std::false_type, const _Idx __elements_to_process, _Array __array) const + { + for (std::uint8_t __i = 0; __i < __elements_to_process / 2; ++__i) + std::swap(__array[__i], __array[__elements_to_process - __i - 1]); + } +}; + +// Processes a loop with a given stride. Intended to be used with sub-group / work-group strides for good memory access patterns +// (potentially with vectorization) +template +struct __strided_loop +{ + std::size_t __full_range_size; + template + void + operator()(/*__is_full*/ std::true_type, _IdxType __idx, std::uint16_t __stride, _LoopBodyOp __loop_body_op, + _Ranges&&... __rngs) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __num_strides; ++__i) + { + __loop_body_op(std::true_type{}, __idx, __rngs...); + __idx += __stride; + } + } + template + void + operator()(/*__is_full*/ std::false_type, _IdxType __idx, std::uint16_t __stride, _LoopBodyOp __loop_body_op, + _Ranges&&... 
__rngs) const + { + // Constrain the number of iterations as much as possible and then pass the knowledge that we are not a full loop to the body operation + const std::uint8_t __adjusted_iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__full_range_size - __idx, __stride); + for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i) + { + __loop_body_op(std::false_type{}, __idx, __rngs...); + __idx += __stride; + } + } +}; + } // namespace __par_backend_hetero } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index 54ea69bae42..ac2e8da30f8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -368,7 +368,16 @@ namespace oneapi::dpl::unseq_backend template struct walk_n; -template +template +struct walk1_vector_or_scalar; + +template +struct walk2_vectors_or_scalars; + +template +struct walk3_vectors_or_scalars; + +template struct walk_adjacent_difference; template class __brick_set_op; -template +template struct __brick_reduce_idx; } // namespace oneapi::dpl::unseq_backend @@ -429,9 +438,30 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backen { }; -template +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::walk1_vector_or_scalar, + _ExecutionPolicy, _F, _Range)> + : oneapi::dpl::__internal::__are_all_device_copyable<_F> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::walk2_vectors_or_scalars, + _ExecutionPolicy, _F, _Range1, _Range2)> + : oneapi::dpl::__internal::__are_all_device_copyable<_F> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::walk3_vectors_or_scalars, + _ExecutionPolicy, _F, _Range1, _Range2, _Range3)> + : oneapi::dpl::__internal::__are_all_device_copyable<_F> +{ +}; + +template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::walk_adjacent_difference, - _ExecutionPolicy, _F)> + _ExecutionPolicy, _F, _Range1, _Range2)> : oneapi::dpl::__internal::__are_all_device_copyable<_F> { }; @@ -543,9 +573,9 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backen { }; -template +template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::__brick_reduce_idx, _BinaryOperator, - _Size)> + _Size, _Range)> : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryOperator, _Size> { }; @@ -555,8 +585,8 @@ namespace oneapi::dpl::internal enum class search_algorithm; -template -struct custom_brick; +template +struct __custom_brick; template struct replace_if_fun; @@ -575,8 +605,8 @@ class transform_if_stencil_fun; } // namespace oneapi::dpl::internal -template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::internal::custom_brick, Comp, T, func)> +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::internal::__custom_brick, Comp, T, _Range, func)> : oneapi::dpl::__internal::__are_all_device_copyable { }; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index e0b57260ee0..9b8da494208 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -22,6 +22,7 @@ #include "../../onedpl_config.h" #include 
"../../utils.h" #include "sycl_defs.h" +#include "utils_ranges_sycl.h" #define _ONEDPL_SYCL_KNOWN_IDENTITY_PRESENT \ (_ONEDPL_SYCL2020_KNOWN_IDENTITY_PRESENT || _ONEDPL_LIBSYCL_KNOWN_IDENTITY_PRESENT) @@ -112,6 +113,190 @@ struct walk_n } }; +// Base class which establishes tuning parameters including vectorization / scalar path decider at compile time +// for walk / for based algorithms +template +struct walk_vector_or_scalar_base +{ + private: + using _ValueTypes = std::tuple...>; + constexpr static std::uint8_t __min_type_size = oneapi::dpl::__internal::__min_nested_type_size<_ValueTypes>::value; + // Empirically determined 'bytes-in-flight' to maximize bandwidth utilization + constexpr static std::uint8_t __bytes_per_item = 16; + // Maximum size supported by compilers to generate vector instructions + constexpr static std::uint8_t __max_vector_size = 4; + + public: + constexpr static bool __can_vectorize = + (oneapi::dpl::__ranges::__is_vectorizable_range>::value && ...) && + (std::is_fundamental_v> && ...) && __min_type_size < 4; + // Vectorize for small types, so we generate 128-byte load / stores in a sub-group + constexpr static std::uint8_t __preferred_vector_size = + __can_vectorize ? oneapi::dpl::__internal::__dpl_ceiling_div(__max_vector_size, __min_type_size) : 1; + constexpr static std::uint8_t __preferred_iters_per_item = + __bytes_per_item / (__min_type_size * __preferred_vector_size); + + protected: + using __vec_load_t = oneapi::dpl::__par_backend_hetero::__vector_load<__preferred_vector_size>; + using __vec_store_t = oneapi::dpl::__par_backend_hetero::__vector_store<__preferred_vector_size>; + using __vec_reverse_t = oneapi::dpl::__par_backend_hetero::__vector_reverse<__preferred_vector_size>; + using __vec_walk_t = oneapi::dpl::__par_backend_hetero::__vector_walk<__preferred_vector_size>; +}; + +// Path that intentionally disables vectorization for algorithms with a scattered access pattern (e.g. binary_search) +template +struct walk_scalar_base +{ + private: + using _ValueTypes = std::tuple...>; + constexpr static std::uint8_t __min_type_size = oneapi::dpl::__internal::__min_nested_type_size<_ValueTypes>::value; + constexpr static std::uint8_t __bytes_per_item = 16; + + public: + constexpr static bool __can_vectorize = false; + // With no vectorization, the vector size is 1 + constexpr static std::uint8_t __preferred_vector_size = 1; + // To achieve full bandwidth utilization, multiple iterations need to be processed by a work item + constexpr static std::uint8_t __preferred_iters_per_item = + __bytes_per_item / (__min_type_size * __preferred_vector_size); +}; + +template +struct walk1_vector_or_scalar : public walk_vector_or_scalar_base<_Range> +{ + private: + using __base_t = walk_vector_or_scalar_base<_Range>; + _F __f; + std::size_t __n; + + public: + walk1_vector_or_scalar(_F __f, std::size_t __n) : __f(std::move(__f)), __n(__n) {} + + template + void + __vector_path_impl(_IsFull __is_full, const std::size_t __idx, _Range __rng) const + { + typename __base_t::__vec_walk_t{__n}(__is_full, __idx, __f, __rng); + } + + // _IsFull is ignored here. We assume that boundary checking has been already performed for this index. 
+ template + void + __scalar_path_impl(_IsFull, const std::size_t __idx, _Range __rng) const + { + __f(__rng[__idx]); + } + + template + void + operator()(_IsFull __is_full, const std::size_t __idx, _Range __rng) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng); + else + __scalar_path_impl(__is_full, __idx, __rng); + } +}; + +template +struct walk2_vectors_or_scalars : public walk_vector_or_scalar_base<_Range1, _Range2> +{ + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; + _F __f; + std::size_t __n; + + public: + walk2_vectors_or_scalars(_F __f, std::size_t __n) : __f(std::move(__f)), __n(__n) {} + + template + void + __vector_path_impl(_IsFull __is_full, const std::size_t __idx, _Range1 __rng1, _Range2 __rng2) const + { + using _ValueType1 = oneapi::dpl::__internal::__value_t<_Range1>; + _ValueType1 __rng1_vector[__base_t::__preferred_vector_size]; + // 1. Load input into a vector + typename __base_t::__vec_load_t{__n}(__is_full, __idx, oneapi::dpl::__par_backend_hetero::__scalar_load_op{}, + __rng1, __rng1_vector); + // 2. Apply functor to vector and store into global memory + typename __base_t::__vec_store_t{__n}(__is_full, __idx, + oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op<_F>{__f}, + __rng1_vector, __rng2); + } + + // _IsFull is ignored here. We assume that boundary checking has been already performed for this index. + template + void + __scalar_path_impl(_IsFull, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2) const + { + + __f(__rng1[__idx], __rng2[__idx]); + } + + template + void + operator()(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); + } +}; + +template +struct walk3_vectors_or_scalars : public walk_vector_or_scalar_base<_Range1, _Range2, _Range3> +{ + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2, _Range3>; + _F __f; + std::size_t __n; + + public: + walk3_vectors_or_scalars(_F __f, std::size_t __n) : __f(std::move(__f)), __n(__n) {} + + template + void + __vector_path_impl(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2, _Range3 __rng3) const + { + using _ValueType1 = oneapi::dpl::__internal::__value_t<_Range1>; + using _ValueType2 = oneapi::dpl::__internal::__value_t<_Range2>; + + _ValueType1 __rng1_vector[__base_t::__preferred_vector_size]; + _ValueType2 __rng2_vector[__base_t::__preferred_vector_size]; + + typename __base_t::__vec_load_t __vec_load{__n}; + typename __base_t::__vec_store_t __vec_store{__n}; + oneapi::dpl::__par_backend_hetero::__scalar_load_op __load_op; + + // 1. Load inputs into vectors + __vec_load(__is_full, __idx, __load_op, __rng1, __rng1_vector); + __vec_load(__is_full, __idx, __load_op, __rng2, __rng2_vector); + // 2. Apply binary functor to vector and store into global memory + __vec_store(__is_full, __idx, oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op<_F>{__f}, + __rng1_vector, __rng2_vector, __rng3); + } + + // _IsFull is ignored here. We assume that boundary checking has been already performed for this index. 
+ template + void + __scalar_path_impl(_IsFull, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2, _Range3 __rng3) const + { + + __f(__rng1[__idx], __rng2[__idx], __rng3[__idx]); + } + + template + void + operator()(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2, _Range3 __rng3) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2, __rng3); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2, __rng3); + } +}; + // If read accessor returns temporary value then __no_op returns lvalue reference to it. // After temporary value destroying it will be a reference on invalid object. // So let's don't call functor in case of __no_op @@ -132,22 +317,56 @@ struct walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op> // walk_adjacent_difference //------------------------------------------------------------------------ -template -struct walk_adjacent_difference +template +struct walk_adjacent_difference : public walk_vector_or_scalar_base<_Range1, _Range2> { + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; _F __f; + std::size_t __n; + oneapi::dpl::__internal::__pstl_assign __assigner; - template + public: + walk_adjacent_difference(_F __f, std::size_t __n) : __f(std::move(__f)), __n(__n) {} + + template void - operator()(const _ItemId __idx, const _Acc1& _acc_src, _Acc2& _acc_dst) const + __scalar_path_impl(_IsFull, const _ItemId __idx, const _Range1 __rng1, _Range2 __rng2) const { - using ::std::get; - // just copy an element if it is the first one if (__idx == 0) - _acc_dst[__idx] = _acc_src[__idx]; + __assigner(__rng1[__idx], __rng2[__idx]); else - __f(_acc_src[__idx + (-1)], _acc_src[__idx], _acc_dst[__idx]); + __f(__rng1[__idx + (-1)], __rng1[__idx], __rng2[__idx]); + } + template + void + __vector_path_impl(_IsFull __is_full, const _ItemId __idx, const _Range1 __rng1, _Range2 __rng2) const + { + using _ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + _ValueType __rng1_vector[__base_t::__preferred_vector_size + 1]; + // 1. Establish a vector of __preferred_vector_size + 1 where a scalar load is performed on the first element + // followed by a vector load of the specified length. + __assigner(__idx != 0 ? __rng1[__idx - 1] : __rng1[0], __rng1_vector[0]); + typename __base_t::__vec_load_t{__n}(__is_full, __idx, oneapi::dpl::__par_backend_hetero::__scalar_load_op{}, + __rng1, &__rng1_vector[1]); + // 2. Perform a vector store of __preferred_vector_size adjacent differences. + typename __base_t::__vec_store_t{__n}(__is_full, __idx, + oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op<_F>{__f}, + __rng1_vector, &__rng1_vector[1], __rng2); + // A dummy value is first written to global memory followed by an overwrite for the first index. Pulling the vector loads / stores into an if branch + // to better handle this results in performance degradation. 
+ if (__idx == 0) + __assigner(__rng1_vector[0], __rng2[0]); + } + template + void + operator()(_IsFull __is_full, const _ItemId __idx, const _Range1 __rng1, _Range2 __rng2) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); } }; @@ -927,47 +1146,203 @@ struct __brick_includes //------------------------------------------------------------------------ // reverse //------------------------------------------------------------------------ -template -struct __reverse_functor +template +struct __reverse_functor : public walk_vector_or_scalar_base<_Range> { + private: + using __base_t = walk_vector_or_scalar_base<_Range>; + using _ValueType = oneapi::dpl::__internal::__value_t<_Range>; _Size __size; - template + + public: + __reverse_functor(_Size __size) : __size(__size) {} + + template + void + __vector_path_impl(_IsFull, const std::size_t __left_start_idx, _Range __rng) const + { + const std::size_t __n = __size; + + // In the below implementation, we see that _IsFull is ignored in favor of std::true_type{} in all cases. + // This relaxation is due to the fact that in-place reverse launches work only over the first half of the + // buffer. As long as __size >= __vec_size there is no risk of an OOB accesses or a race condition. There may + // exist a single point of double processing between left and right vectors in the last work-item which + // reverses middle elements. This extra processing of elements <= __vec_size is more performant than applying + // additional branching (such as in reverse_copy). + + const std::size_t __right_start_idx = __size - __left_start_idx - __base_t::__preferred_vector_size; + + _ValueType __rng_left_vector[__base_t::__preferred_vector_size]; + _ValueType __rng_right_vector[__base_t::__preferred_vector_size]; + + typename __base_t::__vec_load_t __vec_load{__n}; + typename __base_t::__vec_reverse_t __vec_reverse; + typename __base_t::__vec_store_t __vec_store{__n}; + oneapi::dpl::__par_backend_hetero::__scalar_load_op __load_op; + oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op + __store_op; + + // 1. Load two vectors that we want to swap: one from the left half of the buffer and one from the right + __vec_load(std::true_type{}, __left_start_idx, __load_op, __rng, __rng_left_vector); + __vec_load(std::true_type{}, __right_start_idx, __load_op, __rng, __rng_right_vector); + // 2. Reverse vectors in registers. Note that due to indices we have chosen, there will always be a full + // vector of elements to load + __vec_reverse(std::true_type{}, __left_start_idx, __rng_left_vector); + __vec_reverse(std::true_type{}, __right_start_idx, __rng_right_vector); + // 3. 
Store the left-half vector to the corresponding right-half indices and vice versa + __vec_store(std::true_type{}, __right_start_idx, __store_op, __rng_left_vector, __rng); + __vec_store(std::true_type{}, __left_start_idx, __store_op, __rng_right_vector, __rng); + } + template void - operator()(const _Idx __idx, _Accessor& __acc) const + __scalar_path_impl(_IsFull, const std::size_t __idx, _Range __rng) const { using ::std::swap; - swap(__acc[__idx], __acc[__size - __idx - 1]); + swap(__rng[__idx], __rng[__size - __idx - 1]); + } + template + void + operator()(_IsFull __is_full, const std::size_t __idx, _Range __rng) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng); + else + __scalar_path_impl(__is_full, __idx, __rng); } }; //------------------------------------------------------------------------ // reverse_copy //------------------------------------------------------------------------ -template -struct __reverse_copy +template +struct __reverse_copy : public walk_vector_or_scalar_base<_Range1, _Range2> { + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; + using _ValueType = oneapi::dpl::__internal::__value_t<_Range1>; _Size __size; - template + oneapi::dpl::__internal::__pstl_assign __assigner; + + public: + __reverse_copy(_Size __size) : __size(__size) {} + + template + void + __scalar_path_impl(_IsFull, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + __rng2[__idx] = __rng1[__size - __idx - 1]; + } + template + void + __vector_path_impl(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + const std::size_t __n = __size; + const std::size_t __remaining_elements = __n - __idx; + const std::uint8_t __elements_to_process = + std::min(static_cast(__base_t::__preferred_vector_size), __remaining_elements); + const std::size_t __output_start = __size - __idx - __elements_to_process; + // 1. Load vector to reverse + _ValueType __rng1_vector[__base_t::__preferred_vector_size]; + typename __base_t::__vec_load_t{__n}(__is_full, __idx, oneapi::dpl::__par_backend_hetero::__scalar_load_op{}, + __rng1, __rng1_vector); + // 2. Reverse in registers + typename __base_t::__vec_reverse_t{}(__is_full, __elements_to_process, __rng1_vector); + // 3. Flip the location of the vector in the output buffer + if constexpr (_IsFull::value) + { + typename __base_t::__vec_store_t{__n}(std::true_type{}, __output_start, + oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op< + oneapi::dpl::__internal::__pstl_assign>{}, + __rng1_vector, __rng2); + } + else + { + // The non-full case is processed manually here due to the translation of indices in the reverse operation. + // The last few elements in the buffer are reversed into the beginning of the buffer. However, + // __vector_store would believe that we always have a full vector length of elements due to the starting + // index having greater than __preferred_vector_size elements until the end of the buffer. 
+ for (std::uint8_t __i = 0; __i < __elements_to_process; ++__i) + __assigner(__rng1_vector[__i], __rng2[__output_start + __i]); + } + } + template void - operator()(const _Idx __idx, const _AccessorSrc& __acc1, _AccessorDst& __acc2) const + operator()(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const { - __acc2[__idx] = __acc1[__size - __idx - 1]; + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); } }; //------------------------------------------------------------------------ // rotate_copy //------------------------------------------------------------------------ -template -struct __rotate_copy +template +struct __rotate_copy : public walk_vector_or_scalar_base<_Range1, _Range2> { + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; + using _ValueType = oneapi::dpl::__internal::__value_t<_Range1>; _Size __size; _Size __shift; - template + oneapi::dpl::__internal::__pstl_assign __assigner; + + public: + __rotate_copy(_Size __size, _Size __shift) : __size(__size), __shift(__shift) {} + + template void - operator()(const _Idx __idx, const _AccessorSrc& __acc1, _AccessorDst& __acc2) const + __vector_path_impl(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const { - __acc2[__idx] = __acc1[(__shift + __idx) % __size]; + const std::size_t __shifted_idx = __shift + __idx; + const std::size_t __wrapped_idx = __shifted_idx % __size; + const std::size_t __n = __size; + _ValueType __rng1_vector[__base_t::__preferred_vector_size]; + //1. Vectorize loads only if we know the wrap around point is beyond the current vector elements to process + if (__wrapped_idx + __base_t::__preferred_vector_size <= __n) + { + typename __base_t::__vec_load_t{__n}( + __is_full, __wrapped_idx, oneapi::dpl::__par_backend_hetero::__scalar_load_op{}, __rng1, __rng1_vector); + } + else + { + // A single point of non-contiguity within the rotation operation. Manually process two loops here: + // the first before the wraparound point and the second after. + const std::size_t __remaining_elements = __n - __idx; + const std::uint8_t __elements_to_process = + std::min(std::size_t{__base_t::__preferred_vector_size}, __remaining_elements); + // __n - __wrapped_idx can safely fit into a uint8_t due to the condition check above. + const std::uint8_t __loop1_elements = + std::min(__elements_to_process, static_cast(__n - __wrapped_idx)); + const std::uint8_t __loop2_elements = __elements_to_process - __loop1_elements; + std::uint8_t __i = 0; + for (__i = 0; __i < __loop1_elements; ++__i) + __assigner(__rng1[__wrapped_idx + __i], __rng1_vector[__i]); + for (std::uint8_t __j = 0; __j < __loop2_elements; ++__j) + __assigner(__rng1[__j], __rng1_vector[__i + __j]); + } + // 2. 
Store the rotation + typename __base_t::__vec_store_t{__n}( + __is_full, __idx, + oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op{}, + __rng1_vector, __rng2); + } + template + void + __scalar_path_impl(_IsFull, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + __rng2[__idx] = __rng1[(__shift + __idx) % __size]; + } + template + void + operator()(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); } }; @@ -1041,15 +1416,73 @@ class __brick_set_op } }; -template +template struct __brick_shift_left { + private: + using _ValueType = oneapi::dpl::__internal::__value_t<_Range>; + // Maximum size supported by compilers to generate vector instructions + constexpr static std::uint8_t __max_vector_size = 4; + + public: + // Multiple iterations per item are manually processed in the brick with a nd-range strided approach. + constexpr static std::uint8_t __preferred_iters_per_item = 1; + constexpr static bool __can_vectorize = + oneapi::dpl::__ranges::__is_vectorizable_range>::value && + std::is_fundamental_v<_ValueType> && sizeof(_ValueType) < 4; + constexpr static std::uint8_t __preferred_vector_size = + __can_vectorize ? oneapi::dpl::__internal::__dpl_ceiling_div(__max_vector_size, sizeof(_ValueType)) : 1; + _DiffType __size; _DiffType __n; - template + template + void + __vector_path_impl(_IsFull __is_full, const _ItemId __idx, _Range __rng) const + { + const std::size_t __unsigned_size = __size; + const _DiffType __i = __idx - __n; + oneapi::dpl::__par_backend_hetero::__vector_load<__preferred_vector_size> __vec_load{__unsigned_size}; + oneapi::dpl::__par_backend_hetero::__vector_store<__preferred_vector_size> __vec_store{__unsigned_size}; + oneapi::dpl::__par_backend_hetero::__scalar_load_op __load_op; + oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op + __store_op; + for (_DiffType __k = __n; __k < __size; __k += __n) + { + const _DiffType __read_offset = __k + __idx; + const _DiffType __write_offset = __k + __i; + if constexpr (_IsFull::value) + { + if (__read_offset + __preferred_vector_size <= __size) + { + _ValueType __rng_vector[__preferred_vector_size]; + __vec_load(std::true_type{}, __read_offset, __load_op, __rng, __rng_vector); + __vec_store(std::true_type{}, __write_offset, __store_op, __rng_vector, __rng); + } + else if (__read_offset < __size) + { + const std::size_t __num_remaining = __size - __read_offset; + for (_DiffType __j = 0; __j < __num_remaining; ++__j) + __rng[__write_offset + __j] = __rng[__read_offset + __j]; + } + } + else + { + // Some items within a sub-group may still have a full vector length to process even if _IsFull is + // false by intentional design of __stride_recommender. While these are vectorizable, this will result + // in branch divergence and masked execution of both vectorized and serial paths for all items in the + // sub-group which may worsen performance. Instead, have each item in the sub-group process its work + // serially. 
+ for (_DiffType __j = 0; __j < std::min(std::size_t{__preferred_vector_size}, __n - __idx); ++__j) + if (__read_offset + __j < __size) + __rng[__write_offset + __j] = __rng[__read_offset + __j]; + } + } + } + + template void - operator()(const _ItemId __idx, _Range&& __rng) const + __scalar_path_impl(_IsFull, const _ItemId __idx, _Range __rng) const { const _DiffType __i = __idx - __n; //loop invariant for (_DiffType __k = __n; __k < __size; __k += __n) @@ -1058,6 +1491,16 @@ struct __brick_shift_left __rng[__k + __i] = ::std::move(__rng[__k + __idx]); } } + + template + void + operator()(_IsFull __is_full, const _ItemId __idx, _Range __rng) const + { + if constexpr (__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng); + else + __scalar_path_impl(__is_full, __idx, __rng); + } }; struct __brick_assign_key_position @@ -1074,14 +1517,14 @@ struct __brick_assign_key_position }; // reduce the values in a segment associated with a key -template -struct __brick_reduce_idx +template +struct __brick_reduce_idx : public walk_scalar_base<_Range> { __brick_reduce_idx(const _BinaryOperator& __b, const _Size __n_) : __binary_op(__b), __n(__n_) {} - template + template auto - reduce(_Idx __segment_begin, _Idx __segment_end, const _Values& __values) const + reduce(std::size_t __segment_begin, std::size_t __segment_end, const _Values& __values) const { using __ret_type = oneapi::dpl::__internal::__decay_with_tuple_specialization_t; __ret_type __res = __values[__segment_begin]; @@ -1090,23 +1533,82 @@ struct __brick_reduce_idx __res = __binary_op(__res, __values[__segment_begin]); return __res; } - - template + template void - operator()(const _ItemId __idx, const _ReduceIdx& __segment_starts, const _Values& __values, - _OutValues& __out_values) const + __scalar_path_impl(_IsFull, const _ItemId __idx, const _ReduceIdx& __segment_starts, const _Values& __values, + _OutValues& __out_values) const { using __value_type = decltype(__segment_starts[__idx]); __value_type __segment_end = (__idx == __segment_starts.size() - 1) ? __value_type(__n) : __segment_starts[__idx + 1]; __out_values[__idx] = reduce(__segment_starts[__idx], __segment_end, __values); } + template + void + operator()(_IsFull __is_full, const _ItemId __idx, const _ReduceIdx& __segment_starts, const _Values& __values, + _OutValues& __out_values) const + { + __scalar_path_impl(__is_full, __idx, __segment_starts, __values, __out_values); + } private: _BinaryOperator __binary_op; _Size __n; }; +// std::swap_ranges is unique in that both sets of provided ranges will be modified. Due to this, +// we define a separate functor from __walk2_vectors_or_scalars with a customized vectorization path. +template +struct __brick_swap : public walk_vector_or_scalar_base<_Range1, _Range2> +{ + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; + _F __f; + std::size_t __n; + + public: + __brick_swap(_F __f, std::size_t __n) : __f(std::move(__f)), __n(__n) {} + + template + void + __vector_path_impl(_IsFull __is_full, const std::size_t __idx, _Range1 __rng1, _Range2 __rng2) const + { + // Copies are used in the vector path of swap due to the restriction to fundamental types. + using _ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + _ValueType __rng_vector[__base_t::__preferred_vector_size]; + typename __base_t::__vec_load_t __vec_load{__n}; + typename __base_t::__vec_store_t __vec_store{__n}; + // 1. Load elements from __rng1. 
+ __vec_load(__is_full, __idx, oneapi::dpl::__par_backend_hetero::__scalar_load_op{}, __rng1, __rng_vector); + // 2. Swap the __rng1 elements in the vector with __rng2 elements from global memory. Note the store operation + // updates __rng_vector due to the swap functor. + __vec_store(__is_full, __idx, oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op<_F>{__f}, + __rng_vector, __rng2); + // 3. Store __rng2 elements in the vector into __rng1. + __vec_store( + __is_full, __idx, + oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op{}, + __rng_vector, __rng1); + } + + template + void + __scalar_path_impl(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + __f(__rng1[__idx], __rng2[__idx]); + } + + template + void + operator()(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); + } +}; + } // namespace unseq_backend } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h index 4d16ae4508b..23b39c055c7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h @@ -18,8 +18,12 @@ #include #include +#if _ONEDPL_CPP20_RANGES_PRESENT && _ONEDPL_CPP20_CONCEPTS_PRESENT +#include // std::ranges::contiguous_range +#endif #include "../../utils_ranges.h" +#include "../../ranges_defs.h" // contiguous_range and contiguous_iterator from nanorange #include "../../iterator_impl.h" #include "sycl_iterator.h" #include "sycl_defs.h" @@ -754,6 +758,55 @@ __select_backend(const execution::fpga_policy<_Factor, _KernelName>&, _Ranges&&. } #endif +// TODO: At some point with C++20, we should implement this with concepts to more easily support the less common edge +// cases (e.g. a pipe over a counting iterator which is non-contiguous). For now, vectorization is primarily based on +// range contiguity with specialization for internal views and guard views over internal iterators. +// The following cases enable vectorization for a range: +// 1. With C++20 concepts, the range satisfies std::ranges::contiguous_range. +// 2. With C++17 nanoranges, the range satisfies __nanorange::nano::ranges::contiguous_range. Note that a view over +// a SYCL buffer satisfies this concept along with pipe views that maintain access contiguity. +// 3. The range is a guard view over an iterator with no global memory access: counting_iterator and discard_iterator +// 4. 
The range is one of our internal, vectorizable range types: drop_view_simple, take_view_simple, or +// transform_view_simple + +// Base case: check contiguous range properties +template +struct __is_vectorizable_range +{ + constexpr static bool value = +#if _ONEDPL_CPP20_RANGES_PRESENT && _ONEDPL_CPP20_CONCEPTS_PRESENT + std::ranges::contiguous_range<_Rng> || +#endif + __nanorange::nano::ranges::contiguous_range<_Rng>; +}; + +// Basic guard view specializations - views which are not contiguous but do not interact with global memory +template +struct __is_vectorizable_range>> : std::true_type +{ +}; + +template <> +struct __is_vectorizable_range> : std::true_type +{ +}; + +// Recursive view specializations - internal views which we need to search inwards to identify if it is vectorizable +template +struct __is_vectorizable_range> : __is_vectorizable_range<_Rng> +{ +}; + +template +struct __is_vectorizable_range> : __is_vectorizable_range<_Rng> +{ +}; + +template +struct __is_vectorizable_range> : __is_vectorizable_range<_Rng> +{ +}; + } // namespace __ranges } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h index 6970f57f4d7..937608bfc39 100644 --- a/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h @@ -143,7 +143,9 @@ __pattern_histogram(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Rando auto __init_event = oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__hist_fill_zeros_wrapper>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, decltype(__fill_func)>{__fill_func}, __num_bins, __bins); + unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, decltype(__fill_func), decltype(__bins)>{ + __fill_func, static_cast(__num_bins)}, + __num_bins, __bins); if (__n > 0) { diff --git a/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h index d040e828eef..cf9533f8113 100644 --- a/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h @@ -263,9 +263,12 @@ __pattern_adjacent_difference(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy& oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator2>(); auto __buf2 = __keep2(__d_first, __d_last); - using _Function = unseq_backend::walk_adjacent_difference<_ExecutionPolicy, decltype(__fn)>; + using _Function = + unseq_backend::walk_adjacent_difference<_ExecutionPolicy, decltype(__fn), decltype(__buf1.all_view()), + decltype(__buf2.all_view())>; - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, __exec, _Function{__fn}, __n, + oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, __exec, + _Function{__fn, static_cast(__n)}, __n, __buf1.all_view(), __buf2.all_view()) .__deferrable_wait(); } diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 9c32178a78c..b0e41931482 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -25,6 +25,7 @@ #include #include #include +#include #if _ONEDPL_BACKEND_SYCL # include "hetero/dpcpp/sycl_defs.h" @@ -784,6 +785,35 @@ union __lazy_ctor_storage } }; +// To implement __min_nested_type_size, a general utility with an internal tuple +// specialization, we need to forward declare our internal tuple first as tuple_impl.h +// 
already includes this header. +template +struct tuple; + +// Returns the smallest type within a set of potentially nested template types. This function +// recursively explores std::tuple and oneapi::dpl::__internal::tuple for the smallest type. +// For all other types, its size is used directly. +// E.g. If we consider the type: T = tuple, int, double>, +// then __min_nested_type_size::value returns sizeof(short). +template +struct __min_nested_type_size +{ + constexpr static std::size_t value = sizeof(_T); +}; + +template +struct __min_nested_type_size> +{ + constexpr static std::size_t value = std::min({__min_nested_type_size<_Ts>::value...}); +}; + +template +struct __min_nested_type_size> +{ + constexpr static std::size_t value = std::min({__min_nested_type_size<_Ts>::value...}); +}; + } // namespace __internal } // namespace dpl } // namespace oneapi diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index da7f0ad5390..fe308318cad 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -45,10 +45,12 @@ test_device_copyable() static_assert(sycl::is_device_copyable_v, "constant_iterator_device_copyable is not device copyable"); - //custom_brick - static_assert(sycl::is_device_copyable_v>, - "custom_brick is not device copyable with device copyable types"); + //__custom_brick + static_assert( + sycl::is_device_copyable_v< + oneapi::dpl::internal::__custom_brick>, + "__custom_brick is not device copyable with device copyable types"); //replace_if_fun static_assert( sycl::is_device_copyable_v>, @@ -76,11 +78,23 @@ test_device_copyable() static_assert(sycl::is_device_copyable_v< oneapi::dpl::unseq_backend::walk_n>, "walk_n is not device copyable with device copyable types"); + //walk1_vector_or_scalar + static_assert(sycl::is_device_copyable_v>, + "walk1_vector_or_scalar is not device copyable with device copyable types"); + //walk2_vectors_or_scalars + static_assert(sycl::is_device_copyable_v>, + "walk2_vectors_or_scalars is not device copyable with device copyable types"); + //walk3_vectors_or_scalars + static_assert(sycl::is_device_copyable_v>, + "walk3_vectors_or_scalars is not device copyable with device copyable types"); //walk_adjacent_difference - static_assert( - sycl::is_device_copyable_v< - oneapi::dpl::unseq_backend::walk_adjacent_difference>, - "walk_adjacent_difference is not device copyable with device copyable types"); + static_assert(sycl::is_device_copyable_v>, + "walk_adjacent_difference is not device copyable with device copyable types"); //transform_reduce static_assert( sycl::is_device_copyable_v< @@ -148,8 +162,8 @@ test_device_copyable() int_device_copyable, int_device_copyable, std::true_type>>, "__brick_set_op is not device copyable with device copyable types"); // __brick_reduce_idx - static_assert(sycl::is_device_copyable_v< - oneapi::dpl::unseq_backend::__brick_reduce_idx>, + static_assert(sycl::is_device_copyable_v>, "__brick_reduce_idx is not device copyable with device copyable types"); //__gen_transform_input @@ -310,11 +324,11 @@ test_non_device_copyable() static_assert(!sycl::is_device_copyable_v, "iterator is device copyable"); static_assert(!sycl::is_device_copyable_v, "range_non_device_copyable is device copyable"); - //custom_brick - static_assert( - !sycl::is_device_copyable_v>, - "custom_brick is device copyable with non device copyable types"); + //__custom_brick + 
static_assert(!sycl::is_device_copyable_v>, + "__custom_brick is device copyable with non device copyable types"); //replace_if_fun static_assert(!sycl::is_device_copyable_v< oneapi::dpl::internal::replace_if_fun>, @@ -343,10 +357,26 @@ test_non_device_copyable() static_assert(!sycl::is_device_copyable_v< oneapi::dpl::unseq_backend::walk_n>, "walk_n is device copyable with non device copyable types"); + //walk1_vector_or_scalar + static_assert(!sycl::is_device_copyable_v>, + "walk1_vector_or_scalar is device copyable with non device copyable types"); + //walk2_vectors_or_scalars + static_assert( + !sycl::is_device_copyable_v< + oneapi::dpl::unseq_backend::walk2_vectors_or_scalars>, + "walk2_vectors_or_scalars is device copyable with non device copyable types"); + //walk3_vectors_or_scalars + static_assert(!sycl::is_device_copyable_v>, + "walk3_vectors_or_scalars is device copyable with non device copyable types"); //walk_adjacent_difference static_assert( !sycl::is_device_copyable_v< - oneapi::dpl::unseq_backend::walk_adjacent_difference>, + oneapi::dpl::unseq_backend::walk_adjacent_difference>, "walk_adjacent_difference is device copyable with non device copyable types"); //transform_reduce static_assert( @@ -415,8 +445,8 @@ test_non_device_copyable() int_device_copyable, int_device_copyable, std::true_type>>, "__brick_set_op is device copyable with non device copyable types"); //__brick_reduce_idx - static_assert(!sycl::is_device_copyable_v< - oneapi::dpl::unseq_backend::__brick_reduce_idx>, + static_assert(!sycl::is_device_copyable_v>, "__brick_reduce_idx is device copyable with non device copyable types"); //__gen_transform_input diff --git a/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse.pass.cpp index 13f03fdb3dd..41afe0693e1 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse.pass.cpp @@ -69,7 +69,7 @@ template void test() { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); Sequence actual(max_len); @@ -99,6 +99,7 @@ int main() { test(); + test(); test(); test(); test>(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse_copy.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse_copy.pass.cpp index 44f9dc6fe7c..cbe63db7e47 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse_copy.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse_copy.pass.cpp @@ -82,7 +82,7 @@ template void test() { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); Sequence actual(max_len); Sequence data(max_len, [](::std::size_t i) { return T1(i); }); @@ -100,6 +100,7 @@ main() { // clang-3.8 fails to correctly auto vectorize the loop in some cases of different types of container's elements, // for example: std::int32_t and std::int8_t. This issue isn't detected for clang-3.9 and newer versions. 
+ test(); test(); test(); test(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/copy_move.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/copy_move.pass.cpp index 9ad4cfc13bc..7dfc95b1485 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/copy_move.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/copy_move.pass.cpp @@ -125,8 +125,9 @@ template void test(T trash, Convert convert) { + size_t max_n = TestUtils::get_pattern_for_max_n(); // Try sequences of various lengths. - for (size_t n = 0; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + for (size_t n = 0; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { // count is number of output elements, plus a handful // more for sake of detecting buffer overruns. @@ -166,6 +167,9 @@ main() test(-666, [](size_t j) { return std::int32_t(j); }); test(-666.0, [](size_t j) { return float64_t(j); }); + test(42, [](size_t j) { return std::uint16_t(j); }); + test(42, [](size_t j) { return std::uint8_t(j); }); + #if !TEST_DPCPP_BACKEND_PRESENT /*TODO: copy support of a class with no default constructor*/ test>(Wrapper(-666.0), [](std::int32_t j) { return Wrapper(j); }); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/fill.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/fill.pass.cpp index 66c8e321627..6c707b62079 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/fill.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/fill.pass.cpp @@ -101,10 +101,12 @@ int main() { - const ::std::size_t N = 100000; + const std::size_t N = TestUtils::get_pattern_for_max_n(); for (::std::size_t n = 0; n < N; n = n < 16 ? n + 1 : size_t(3.1415 * n)) { + test_fill_by_type(n); + test_fill_by_type(n); test_fill_by_type(n); test_fill_by_type(n); } diff --git a/test/parallel_api/algorithm/alg.modifying.operations/generate.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/generate.pass.cpp index 3ee13107011..bcd65426526 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/generate.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/generate.pass.cpp @@ -82,7 +82,8 @@ template void test_generate_by_type() { - for (size_t n = 0; n <= 100000; n = n < 16 ? n + 1 : size_t(3.1415 * n)) + size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 0; n <= max_n; n = n < 16 ? 
n + 1 : size_t(3.1415 * n)) { Sequence in(n, [](size_t) -> T { return T(0); }); //fill by zero @@ -123,6 +124,7 @@ struct test_non_const_generate_n int main() { + test_generate_by_type(); test_generate_by_type(); test_generate_by_type(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/replace.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/replace.pass.cpp index 92820d2012f..91291851dcb 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/replace.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/replace.pass.cpp @@ -114,7 +114,7 @@ template void test(Pred pred) { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); const T1 value = T1(0); const T1 new_value = T1(666); @@ -162,6 +162,7 @@ int main() { test(oneapi::dpl::__internal::__equal_value(666)); + test([](const std::uint8_t& elem) { return elem % 3 < 2; }); test([](const std::uint16_t& elem) { return elem % 3 < 2; }); test([](const float64_t& elem) { return elem * elem - 3.5 * elem > 10; }); //test([](const copy_int& val) { return val.value / 5 > 2; }); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/replace_copy.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/replace_copy.pass.cpp index 50851e3d299..b89c3aa3f40 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/replace_copy.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/replace_copy.pass.cpp @@ -75,8 +75,9 @@ template void test(T trash, const T& old_value, const T& new_value, Predicate pred, Convert convert) { + const size_t max_n = TestUtils::get_pattern_for_max_n(); // Try sequences of various lengths. - for (size_t n = 0; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + for (size_t n = 0; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { Sequence in(n, [&](size_t k) -> T { return convert(n ^ k); }); Sequence out(n, [=](size_t) { return trash; }); @@ -123,6 +124,10 @@ main() test(-666, 42, 99, [](const std::int32_t& x) { return x != 42; }, [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? 42 : -1 - std::int32_t(j); }); + test(123, 42, 99, [](const std::uint8_t& x) { return x != 42; }, + [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? 42 : 255; }); + + #if !TEST_DPCPP_BACKEND_PRESENT test(Number(42, OddTag()), Number(2001, OddTag()), Number(2017, OddTag()), IsMultiple(3, OddTag()), [](std::int32_t j) { return ((j + 1) % 3 & 2) != 0 ? 
Number(2001, OddTag()) : Number(j, OddTag()); }); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/rotate.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/rotate.pass.cpp index 2147999ce3f..a6f4600c2a8 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/rotate.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/rotate.pass.cpp @@ -132,7 +132,7 @@ template void test() { - const std::int32_t max_len = 100000; + const std::int32_t max_len = TestUtils::get_pattern_for_max_n(); Sequence actual(max_len, [](::std::size_t i) { return T(i); }); Sequence data(max_len, [](::std::size_t i) { return T(i); }); @@ -154,6 +154,8 @@ test() int main() { + test(); + test(); test(); #if !TEST_DPCPP_BACKEND_PRESENT test>(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/rotate_copy.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/rotate_copy.pass.cpp index edf5f56651c..1df986deb6a 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/rotate_copy.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/rotate_copy.pass.cpp @@ -100,7 +100,7 @@ void test() { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); Sequence actual(max_len, [](::std::size_t i) { return T1(i); }); @@ -125,6 +125,8 @@ test() int main() { + test(); + test(); test(); test(); test(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp index c15a493a639..d12cc8ba107 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp @@ -1,5 +1,5 @@ // -*- C++ -*- -//===-- shift_left.pass.cpp -----------------------------------------------===// +//===-- shift_left_right.pass.cpp -----------------------------------------------===// // // Copyright (C) Intel Corporation // @@ -40,8 +40,11 @@ #include // omp_get_max_threads, omp_set_num_threads #endif -template -struct USM; +template +struct USMKernelName; + +template +struct BufferKernelName; struct test_shift { @@ -80,7 +83,9 @@ struct test_shift TestUtils::usm_data_transfer dt_helper(queue, first, m); auto ptr = dt_helper.get_data(); - auto het_res = algo(TestUtils::make_device_policy>(::std::forward(exec)), ptr, ptr + m, n); + auto het_res = + algo(TestUtils::make_device_policy>(std::forward(exec)), ptr, + ptr + m, n); _DiffType res_idx = het_res - ptr; //3.2 check result @@ -95,14 +100,14 @@ struct test_shift operator()(Policy&& exec, It first, typename ::std::iterator_traits::difference_type m, It first_exp, typename ::std::iterator_traits::difference_type n, Algo algo) { + using _ValueType = typename std::iterator_traits::value_type; + using _DiffType = typename std::iterator_traits::difference_type; + auto buffer_policy = TestUtils::make_device_policy>(exec); //1.1 run a test with hetero policy and host itertors - auto res = algo(::std::forward(exec), first, first + m, n); + auto res = algo(buffer_policy, first, first + m, n); //1.2 check result algo.check(res, first, m, first_exp, n); - using _ValueType = typename ::std::iterator_traits::value_type; - using _DiffType = typename ::std::iterator_traits::difference_type; - //2.1 run a test with hetero policy and hetero itertors _DiffType res_idx(0); {//scope for SYCL buffer lifetime @@ -112,7 +117,7 @@ struct test_shift auto het_begin = 
oneapi::dpl::begin(buf); - auto het_res = algo(::std::forward(exec), het_begin, het_begin + m, n); + auto het_res = algo(buffer_policy, het_begin, het_begin + m, n); res_idx = het_res - het_begin; } //2.2 check result @@ -121,7 +126,7 @@ struct test_shift #if _PSTL_SYCL_TEST_USM //3. run a test with hetero policy and USM shared/device memory pointers test_usm(exec, first, m, first_exp, n, algo); - test_usm(exec, first, m, first_exp, n, algo); + test_usm(std::forward(exec), first, m, first_exp, n, algo); #endif } #endif @@ -208,10 +213,10 @@ test_shift_by_type(Size m, Size n) TestUtils::Sequence in(m, [](::std::size_t v) -> T { return T(v); }); //fill data #ifdef _PSTL_TEST_SHIFT_LEFT - TestUtils::invoke_on_all_policies<0>()(test_shift(), in.begin(), m, orig.begin(), n, shift_left_algo{}); + TestUtils::invoke_on_all_policies()(test_shift(), in.begin(), m, orig.begin(), n, shift_left_algo{}); #endif #ifdef _PSTL_TEST_SHIFT_RIGHT - TestUtils::invoke_on_all_policies<1>()(test_shift(), in.begin(), m, orig.begin(), n, shift_right_algo{}); + TestUtils::invoke_on_all_policies()(test_shift(), in.begin(), m, orig.begin(), n, shift_right_algo{}); #endif } @@ -233,6 +238,16 @@ main() { test_shift_by_type(m, n); } +#if TEST_DPCPP_BACKEND_PRESENT + // Test both paths of the vectorized implementation in the SYCL backend. Use shift factors that will not divide + // into the vector size to assess edge case handling. + const std::size_t large_n = 1000000; + const std::size_t quarter_shift = 250111; + const std::size_t three_quarters_shift = 750203; + test_shift_by_type(large_n, quarter_shift); + test_shift_by_type(three_quarters_shift, large_n); + test_shift_by_type(large_n, quarter_shift); +#endif return TestUtils::done(); } diff --git a/test/parallel_api/algorithm/alg.modifying.operations/swap_ranges.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/swap_ranges.pass.cpp index afd10f55f50..f23ea69e05b 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/swap_ranges.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/swap_ranges.pass.cpp @@ -115,7 +115,7 @@ template void test() { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); Sequence data(max_len); Sequence actual(max_len); @@ -132,6 +132,7 @@ main() { test>(); test>(); + test(); test(); test(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/transform_binary.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/transform_binary.pass.cpp index 113c24be7c8..5deae6ea667 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/transform_binary.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/transform_binary.pass.cpp @@ -104,7 +104,7 @@ test(Predicate pred, _IteratorAdapter adap = {}) #if PSTL_USE_DEBUG && ONEDPL_USE_OPENMP_BACKEND 10000; #else - 100000; + TestUtils::get_pattern_for_max_n(); #endif for (size_t n = 0; n <= max_n; n = n <= 16 ? 
n + 1 : size_t(3.1415 * n)) { @@ -153,5 +153,7 @@ main() //test case for zip iterator test<50, std::int32_t, std::int32_t, std::int32_t>(TheOperationZip(1), _ZipIteratorAdapter{}); + test<60, std::uint16_t, std::uint16_t, std::int32_t>(TheOperation(1)); + return done(); } diff --git a/test/parallel_api/algorithm/alg.modifying.operations/transform_unary.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/transform_unary.pass.cpp index 20c1403f6b9..19fe906935c 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/transform_unary.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/transform_unary.pass.cpp @@ -84,7 +84,8 @@ template <::std::size_t CallNumber, typename Tin, typename Tout, typename _Op = void test() { - for (size_t n = 0; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 0; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { Sequence in(n, [](std::int32_t k) { return k % 5 != 1 ? 3 * k - 7 : 0; }); diff --git a/test/parallel_api/algorithm/alg.nonmodifying/for_each.pass.cpp b/test/parallel_api/algorithm/alg.nonmodifying/for_each.pass.cpp index 526c1c45fcf..c23fd3013de 100644 --- a/test/parallel_api/algorithm/alg.nonmodifying/for_each.pass.cpp +++ b/test/parallel_api/algorithm/alg.nonmodifying/for_each.pass.cpp @@ -85,7 +85,8 @@ template void test() { - for (size_t n = 0; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + const size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 0; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { Sequence in_out(n, Gen()); Sequence expected(n, Gen()); diff --git a/test/parallel_api/algorithm/alg.nonmodifying/transform_if.pass.cpp b/test/parallel_api/algorithm/alg.nonmodifying/transform_if.pass.cpp index 973ca3658f8..97198b72f76 100644 --- a/test/parallel_api/algorithm/alg.nonmodifying/transform_if.pass.cpp +++ b/test/parallel_api/algorithm/alg.nonmodifying/transform_if.pass.cpp @@ -162,10 +162,11 @@ void test() { const ::std::int64_t init_val = 999; - for (size_t n = 1; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + const size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 1; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { { - Sequence<_Type> in1(n, [=](size_t k) { return (3 * k); }); + Sequence<_Type> in1(n, [=](size_t k) { return (3 * k) % std::numeric_limits<_Type>::max(); }); Sequence<_Type> in2(n, [=](size_t k) { return k % 2 == 0 ? 1 : 0; }); Sequence<_Type> out(n, [=](size_t) { return init_val; }); @@ -178,7 +179,7 @@ test() #endif } { - Sequence<_Type> in1(n, [=](size_t k) { return k; }); + Sequence<_Type> in1(n, [=](size_t k) { return k % std::numeric_limits<_Type>::max(); }); Sequence<_Type> out(n, [=](size_t) { return init_val; }); invoke_on_all_policies<2>()(test_transform_if_unary<_Type>(), in1.begin(), in1.end(), out.begin(), @@ -196,10 +197,11 @@ void test_inplace() { const ::std::int64_t init_val = 999; - for (size_t n = 1; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + const size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 1; n <= max_n; n = n <= 16 ? 
n + 1 : size_t(3.1415 * n)) { { - Sequence<_Type> in1(n, [=](size_t k) { return k; }); + Sequence<_Type> in1(n, [=](size_t k) { return k % std::numeric_limits<_Type>::max(); }); Sequence<_Type> out(n, [=](size_t) { return 0; }); invoke_on_all_policies<4>()(test_transform_if_unary_inplace<_Type>(), in1.begin(), in1.end(), out.begin(), @@ -211,11 +213,15 @@ test_inplace() int main() { - test<::std::int32_t>(); - test<::std::int64_t>(); - - test_inplace<::std::int32_t>(); - test_inplace<::std::int64_t>(); + test(); + test(); + test(); + test(); + + test_inplace(); + test_inplace(); + test_inplace(); + test_inplace(); return done(); } diff --git a/test/parallel_api/numeric/numeric.ops/adjacent_difference.pass.cpp b/test/parallel_api/numeric/numeric.ops/adjacent_difference.pass.cpp index a617d3205d0..08b93265541 100644 --- a/test/parallel_api/numeric/numeric.ops/adjacent_difference.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/adjacent_difference.pass.cpp @@ -139,7 +139,7 @@ template void test(Pred pred) { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); const T2 value = T2(77); const T1 trash = T1(31); @@ -165,6 +165,7 @@ int main() { test([](std::uint32_t a, std::uint32_t b) { return a - b; }); + test([](std::uint16_t a, std::uint16_t b) { return a > b ? a - b : b - a; }); test([](std::int64_t a, std::int64_t b) { return a / (b + 1); }); test([](float32_t a, float32_t b) { return (a + b) / 2; }); #if !TEST_DPCPP_BACKEND_PRESENT diff --git a/test/support/utils.h b/test/support/utils.h index 666d5ca3646..af4dfdc8a61 100644 --- a/test/support/utils.h +++ b/test/support/utils.h @@ -1013,6 +1013,29 @@ generate_arithmetic_data(T* input, std::size_t size, std::uint32_t seed) input[j] = input[i]; } } + +// Utility that models __estimate_best_start_size in the SYCL backend parallel_for +// to ensure large enough inputs are used to test the large submitter path. +// A multiplier to the max n is added to ensure we get a few separate test inputs for +// this path. +std::size_t +get_pattern_for_max_n() +{ +#if TEST_DPCPP_BACKEND_PRESENT + sycl::queue q = TestUtils::get_test_queue(); + sycl::device d = q.get_device(); + constexpr std::size_t max_iters_per_item = 16; + constexpr std::size_t multiplier = 4; + constexpr std::size_t max_work_group_size = 512; + std::size_t __max_n = multiplier * max_iters_per_item * max_work_group_size * + d.get_info(); + __max_n = std::min(std::size_t{10000000}, __max_n); + return __max_n; +#else + return TestUtils::max_n; +#endif +} + } /* namespace TestUtils */ #endif // _UTILS_H
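The following host-side sketches illustrate, outside of the patch itself, the techniques the new vectorized bricks rely on; every helper name used in them (kVecSize, transform_chunk, and so on) is an illustrative stand-in rather than a oneDPL symbol. This first sketch models the net effect of the walk1_vector_or_scalar / walk2_vectors_or_scalars pattern for a transform-style operation: a chunk of up to __preferred_vector_size elements is loaded into a small local array, the operation is applied, and the results are written back, with a shortened tail when the chunk is not full.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

constexpr std::size_t kVecSize = 4; // illustrative stand-in for __preferred_vector_size

// Net effect of a transform-style walk brick on one chunk: load up to kVecSize
// elements into a small local array, apply the operation, store the results back.
template <typename T, typename UnaryOp>
void transform_chunk(const T* in, T* out, std::size_t idx, std::size_t n, UnaryOp op)
{
    T chunk[kVecSize];
    const std::size_t elems = std::min(kVecSize, n - idx); // shortened tail when the chunk is not full
    for (std::size_t i = 0; i < elems; ++i)                 // 1. load the chunk
        chunk[i] = in[idx + i];
    for (std::size_t i = 0; i < elems; ++i)                 // 2. transform and store
        out[idx + i] = op(chunk[i]);
}

int main()
{
    std::vector<int> in{1, 2, 3, 4, 5, 6, 7};
    std::vector<int> out(in.size());
    for (std::size_t idx = 0; idx < in.size(); idx += kVecSize)
        transform_chunk(in.data(), out.data(), idx, in.size(), [](int x) { return 2 * x; });
    for (int v : out)
        std::cout << v << ' '; // 2 4 6 8 10 12 14
    std::cout << '\n';
}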
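walk_adjacent_difference stages __preferred_vector_size + 1 values so that every element in a chunk has its left neighbour available locally, and writes a dummy first output that is then overwritten. The sketch below models that staging with plain loops standing in for __vector_load / __vector_store; it is a simplified host-side model, not the brick itself.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

constexpr std::size_t kVecSize = 4; // illustrative stand-in for __preferred_vector_size

void adjacent_difference_chunk(const std::vector<int>& in, std::vector<int>& out, std::size_t idx)
{
    const std::size_t n = in.size();
    const std::size_t elems = std::min(kVecSize, n - idx);
    int staged[kVecSize + 1];
    staged[0] = (idx != 0) ? in[idx - 1] : in[0]; // scalar load of the left neighbour
    for (std::size_t i = 0; i < elems; ++i)       // chunk load
        staged[i + 1] = in[idx + i];
    for (std::size_t i = 0; i < elems; ++i)       // difference of each element and its neighbour
        out[idx + i] = staged[i + 1] - staged[i];
    if (idx == 0)
        out[0] = staged[1]; // first element is copied, overwriting the dummy difference
}

int main()
{
    std::vector<int> in{3, 7, 12, 20, 21, 30};
    std::vector<int> out(in.size());
    for (std::size_t idx = 0; idx < in.size(); idx += kVecSize)
        adjacent_difference_chunk(in, out, idx);
    for (int v : out)
        std::cout << v << ' '; // 3 4 5 8 1 9
    std::cout << '\n';
}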
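__rotate_copy vectorizes its loads only when the wraparound point lies beyond the current chunk; otherwise it splits the load into a tail-then-head pair of loops. A minimal host-side model of that split, under the same illustrative kVecSize assumption:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

constexpr std::size_t kVecSize = 4; // illustrative stand-in for __preferred_vector_size

void rotate_copy_chunk(const std::vector<int>& in, std::vector<int>& out, std::size_t idx, std::size_t shift)
{
    const std::size_t n = in.size();
    const std::size_t elems = std::min(kVecSize, n - idx);
    const std::size_t wrapped = (shift + idx) % n;
    int chunk[kVecSize];
    if (wrapped + elems <= n) // the wrap point is beyond this chunk: contiguous load
    {
        for (std::size_t i = 0; i < elems; ++i)
            chunk[i] = in[wrapped + i];
    }
    else // split load: tail of the input, then its head
    {
        const std::size_t loop1 = n - wrapped;
        for (std::size_t i = 0; i < loop1; ++i)
            chunk[i] = in[wrapped + i];
        for (std::size_t j = 0; loop1 + j < elems; ++j)
            chunk[loop1 + j] = in[j];
    }
    for (std::size_t i = 0; i < elems; ++i) // contiguous store of the rotated chunk
        out[idx + i] = chunk[i];
}

int main()
{
    std::vector<int> in{0, 1, 2, 3, 4, 5, 6};
    std::vector<int> out(in.size());
    const std::size_t shift = 5;
    for (std::size_t idx = 0; idx < in.size(); idx += kVecSize)
        rotate_copy_chunk(in, out, idx, shift);
    for (int v : out)
        std::cout << v << ' '; // 5 6 0 1 2 3 4
    std::cout << '\n';
}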
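The vectorization gate added to utils_ranges_sycl.h reduces, in its base case, to range contiguity. Below is a minimal C++20 sketch of that base case only, assuming std::ranges::contiguous_range and omitting the nanorange fallback, the guard-view whitelist, and the recursive specializations for the internal views.

#include <list>
#include <ranges>
#include <span>
#include <vector>

// Simplified contiguity check; the real trait also whitelists non-contiguous views
// that never touch global memory (counting/discard iterators) and recurses into
// drop_view_simple, take_view_simple, and transform_view_simple.
template <typename Rng>
inline constexpr bool is_vectorizable_range_v = std::ranges::contiguous_range<Rng>;

static_assert(is_vectorizable_range_v<std::vector<int>>);
static_assert(is_vectorizable_range_v<std::span<const float>>);
static_assert(!is_vectorizable_range_v<std::list<int>>);

int main() { return 0; }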
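__min_nested_type_size can be exercised entirely at compile time. A trimmed-down sketch that handles only std::tuple (the real utility also specializes oneapi::dpl::__internal::tuple) reproduces the example given in its comment:

#include <algorithm>
#include <cstddef>
#include <tuple>

// Smallest sizeof() among a type and, recursively, the members of any nested std::tuple.
template <typename T>
struct min_nested_type_size
{
    constexpr static std::size_t value = sizeof(T);
};

template <typename... Ts>
struct min_nested_type_size<std::tuple<Ts...>>
{
    constexpr static std::size_t value = std::min({min_nested_type_size<Ts>::value...});
};

// Matches the documented example: the smallest nested type is short.
static_assert(min_nested_type_size<std::tuple<std::tuple<short, long>, int, double>>::value == sizeof(short));

int main() { return 0; }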
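get_pattern_for_max_n scales the maximum test size with the device's compute units so the large-submitter path is reached. A small arithmetic sketch of that sizing, assuming a hypothetical device with 32 compute units instead of querying SYCL:

#include <algorithm>
#include <cstddef>
#include <iostream>

int main()
{
    constexpr std::size_t max_iters_per_item = 16;
    constexpr std::size_t multiplier = 4;
    constexpr std::size_t max_work_group_size = 512;
    const std::size_t compute_units = 32; // hypothetical device value, not queried here
    std::size_t max_n = multiplier * max_iters_per_item * max_work_group_size * compute_units;
    max_n = std::min(std::size_t{10000000}, max_n); // cap mirrors the utility above
    std::cout << "max_n = " << max_n << '\n';        // 1048576 for this example device
}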