From 83c3741bdc4e5444d25728c3cd05291bd381d3cd Mon Sep 17 00:00:00 2001 From: Matthew Michel <106704043+mmichel11@users.noreply.github.com> Date: Fri, 31 Jan 2025 15:33:36 -0600 Subject: [PATCH] Re-implement SYCL backend `parallel_for` to improve bandwidth utilization (#1976) Signed-off-by: Matthew Michel --- .../kt/internal/esimd_radix_sort_submitters.h | 2 +- .../internal/async_impl/async_impl_hetero.h | 18 +- .../oneapi/dpl/internal/binary_search_impl.h | 37 +- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 103 +++- .../hetero/algorithm_ranges_impl_hetero.h | 71 ++- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 56 +- .../hetero/dpcpp/parallel_backend_sycl_for.h | 209 +++++++ .../hetero/dpcpp/parallel_backend_sycl_fpga.h | 4 +- .../dpcpp/parallel_backend_sycl_merge.h | 2 +- .../dpcpp/parallel_backend_sycl_reduce.h | 2 +- .../dpcpp/parallel_backend_sycl_utils.h | 167 +++++ .../dpl/pstl/hetero/dpcpp/sycl_traits.h | 50 +- .../pstl/hetero/dpcpp/unseq_backend_sycl.h | 570 ++++++++++++++++-- .../dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h | 53 ++ .../dpl/pstl/hetero/histogram_impl_hetero.h | 4 +- .../dpl/pstl/hetero/numeric_impl_hetero.h | 7 +- include/oneapi/dpl/pstl/utils.h | 30 + .../device_copyable.pass.cpp | 66 +- .../alg.reverse/reverse.pass.cpp | 3 +- .../alg.reverse/reverse_copy.pass.cpp | 3 +- .../copy_move.pass.cpp | 6 +- .../alg.modifying.operations/fill.pass.cpp | 4 +- .../generate.pass.cpp | 4 +- .../alg.modifying.operations/replace.pass.cpp | 3 +- .../replace_copy.pass.cpp | 7 +- .../alg.modifying.operations/rotate.pass.cpp | 4 +- .../rotate_copy.pass.cpp | 4 +- .../shift_left_right.pass.cpp | 39 +- .../swap_ranges.pass.cpp | 3 +- .../transform_binary.pass.cpp | 4 +- .../transform_unary.pass.cpp | 3 +- .../alg.nonmodifying/for_each.pass.cpp | 3 +- .../alg.nonmodifying/transform_if.pass.cpp | 26 +- .../numeric.ops/adjacent_difference.pass.cpp | 3 +- test/support/utils.h | 23 + 35 files changed, 1376 insertions(+), 217 deletions(-) create mode 100644 include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h diff --git a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h index 4d7b81e6a2e..4fc274f2445 100644 --- a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h +++ b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h @@ -27,7 +27,7 @@ namespace oneapi::dpl::experimental::kt::gpu::esimd::__impl { //------------------------------------------------------------------------ -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation //------------------------------------------------------------------------ template , _ExecutionPolicy&& __exec, _For auto __future_obj = oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf.all_view()); + unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, decltype(__buf.all_view())>{ + __f, static_cast(__n)}, + __n, __buf.all_view()); return __future_obj; } @@ -67,7 +69,9 @@ __pattern_walk2_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, 
__buf1.all_view(), __buf2.all_view()); + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__buf1.all_view()), + decltype(__buf2.all_view())>{__f, static_cast(__n)}, + __n, __buf1.all_view(), __buf2.all_view()); return __future.__make_future(__first2 + __n); } @@ -91,10 +95,12 @@ __pattern_walk3_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator3>(); auto __buf3 = __keep3(__first3, __first3 + __n); - auto __future = - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, - __buf1.all_view(), __buf2.all_view(), __buf3.all_view()); + auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__buf1.all_view()), + decltype(__buf2.all_view()), decltype(__buf3.all_view())>{ + __f, static_cast(__n)}, + __n, __buf1.all_view(), __buf2.all_view(), __buf3.all_view()); return __future.__make_future(__first3 + __n); } diff --git a/include/oneapi/dpl/internal/binary_search_impl.h b/include/oneapi/dpl/internal/binary_search_impl.h index ef01be4b161..a0c4c80b5d2 100644 --- a/include/oneapi/dpl/internal/binary_search_impl.h +++ b/include/oneapi/dpl/internal/binary_search_impl.h @@ -37,13 +37,19 @@ enum class search_algorithm binary_search }; -template -struct custom_brick +#if _ONEDPL_BACKEND_SYCL +template +struct __custom_brick : oneapi::dpl::unseq_backend::walk_scalar_base<_Range> { Comp comp; T size; bool use_32bit_indexing; + __custom_brick(Comp comp, T size, bool use_32bit_indexing) + : comp(std::move(comp)), size(size), use_32bit_indexing(use_32bit_indexing) + { + } + template void search_impl(_ItemId idx, _Acc acc) const @@ -68,17 +74,23 @@ struct custom_brick get<2>(acc[idx]) = (value != end_orig) && (get<1>(acc[idx]) == get<0>(acc[value])); } } - - template + template void - operator()(_ItemId idx, _Acc acc) const + __scalar_path_impl(_IsFull, _ItemId idx, _Acc acc) const { if (use_32bit_indexing) search_impl(idx, acc); else search_impl(idx, acc); } + template + void + operator()(_IsFull __is_full, _ItemId idx, _Acc acc) const + { + __scalar_path_impl(__is_full, idx, acc); + } }; +#endif template @@ -155,7 +167,8 @@ lower_bound_impl(__internal::__hetero_tag<_BackendTag>, Policy&& policy, InputIt const bool use_32bit_indexing = size <= std::numeric_limits::max(); __bknd::__parallel_for( _BackendTag{}, ::std::forward(policy), - custom_brick{comp, size, use_32bit_indexing}, + __custom_brick{ + comp, size, use_32bit_indexing}, value_size, zip_vw) .__deferrable_wait(); return result + value_size; @@ -187,7 +200,8 @@ upper_bound_impl(__internal::__hetero_tag<_BackendTag>, Policy&& policy, InputIt const bool use_32bit_indexing = size <= std::numeric_limits::max(); __bknd::__parallel_for( _BackendTag{}, std::forward(policy), - custom_brick{comp, size, use_32bit_indexing}, + __custom_brick{ + comp, size, use_32bit_indexing}, value_size, zip_vw) .__deferrable_wait(); return result + value_size; @@ -217,10 +231,11 @@ binary_search_impl(__internal::__hetero_tag<_BackendTag>, Policy&& policy, Input auto result_buf = keep_result(result, result + value_size); auto zip_vw = make_zip_view(input_buf.all_view(), value_buf.all_view(), result_buf.all_view()); const bool use_32bit_indexing = size <= 
std::numeric_limits::max(); - __bknd::__parallel_for(_BackendTag{}, std::forward(policy), - custom_brick{ - comp, size, use_32bit_indexing}, - value_size, zip_vw) + __bknd::__parallel_for( + _BackendTag{}, std::forward(policy), + __custom_brick{ + comp, size, use_32bit_indexing}, + value_size, zip_vw) .__deferrable_wait(); return result + value_size; } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 65bf99c8777..6c858cefa92 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -57,7 +57,10 @@ __pattern_walk1(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIt auto __buf = __keep(__first, __last); oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, __exec, unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf.all_view()) + _BackendTag{}, __exec, + unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, decltype(__buf.all_view())>{ + __f, static_cast(__n)}, + __n, __buf.all_view()) .__deferrable_wait(); } @@ -82,6 +85,14 @@ __pattern_walk1_n(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _F // TODO: A tag _WaitMode is used for provide a patterns call pipeline, where the last one should be synchronous // Probably it should be re-designed by a pipeline approach, when a pattern returns some sync objects // and ones are combined into a "pipeline" (probably like Range pipeline) +// +// A note on access mode types below: the __vector_path_impl in unseq_backend::walk2_vectors_or_scalars only respects +// the default template arguments: +// __acc_mode1 = __par_backend_hetero::access_mode::read +// __acc_mode2 = __par_backend_hetero::access_mode::write +// For any provided _Function object, the default access modes should be respected even if other access modes are +// required due to dependency / synchronization issues. 
For a detailed explanation, see: +// https://github.com/uxlfoundation/oneDPL/issues/1272 template , _ExecutionPolicy&& __exec, _ForwardIt auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf1.all_view(), __buf2.all_view()); + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__buf1.all_view()), + decltype(__buf2.all_view())>{__f, static_cast(__n)}, + __n, __buf1.all_view(), __buf2.all_view()); // Call no wait, wait or deferrable wait depending on _WaitMode __future.wait(_WaitMode{}); @@ -130,16 +143,39 @@ _ForwardIterator2 __pattern_swap(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _Function __f) { - return __pattern_walk2( - __tag, ::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __f); + const auto __n = __last1 - __first1; + if (__n == 0) + return __first2; + + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator1>(); + auto __buf1 = __keep1(__first1, __last1); + + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator2>(); + auto __buf2 = __keep2(__first2, __first2 + __n); + + auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::__brick_swap<_ExecutionPolicy, _Function, decltype(__buf1.all_view()), + decltype(__buf2.all_view())>{__f, static_cast(__n)}, + __n, __buf1.all_view(), __buf2.all_view()); + __future.wait(__par_backend_hetero::__deferrable_mode{}); + return __first2 + __n; } //------------------------------------------------------------------------ // walk3 //------------------------------------------------------------------------ +// A note on access mode types below: the __vector_path_impl in unseq_backend::walk3_vectors_or_scalars only respects +// the default template arguments: +// __acc_mode1 = __par_backend_hetero::access_mode::read +// __acc_mode2 = __par_backend_hetero::access_mode::read +// __acc_mode3 __par_backend_hetero::access_mode::write +// For any provided _Function object, the default access modes should be respected even if other access modes are +// required due to dependency / synchronization issues. 
For a detailed explanation, see: +// https://github.com/uxlfoundation/oneDPL/issues/1272 template , _ExecutionPolicy&& __exec, _ForwardIt auto __keep3 = oneapi::dpl::__ranges::__get_sycl_range<__acc_mode3, _ForwardIterator3>(); auto __buf3 = __keep3(__first3, __first3 + __n); - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, - __buf1.all_view(), __buf2.all_view(), __buf3.all_view()) + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__buf1.all_view()), + decltype(__buf2.all_view()), decltype(__buf3.all_view())>{ + __f, static_cast(__n)}, + __n, __buf1.all_view(), __buf2.all_view(), __buf3.all_view()) .__deferrable_wait(); return __first3 + __n; @@ -253,7 +292,9 @@ __pattern_walk2_transform_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& _ForwardIterator1 __last1, _ForwardIterator2 __first2, _Function __func) { // Require `read_write` access mode for output sequence to force a copy in for host iterators to capture incoming - // values of the output sequence for elements where the predicate is false. + // values of the output sequence for elements where the predicate is false. We never actually read from the output + // sequence, so there is no risk when ran with the vectorized path of walk2_vector_or_scalars. For more info, + // please see the comment above __pattern_walk2 and https://github.com/uxlfoundation/oneDPL/issues/1272. return __pattern_walk2( __tag, @@ -273,7 +314,9 @@ __pattern_walk3_transform_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& _Function __func) { // Require `read_write` access mode for output sequence to force a copy in for host iterators to capture incoming - // values of the output sequence for elements where the predicate is false. + // values of the output sequence for elements where the predicate is false. We never actually read from the output + // sequence, so there is no risk when ran with the vectorized path of walk3_vector_or_scalars. For more info, + // please see the comment above __pattern_walk3 and https://github.com/uxlfoundation/oneDPL/issues/1272. return __pattern_walk3<_BackendTag, __par_backend_hetero::access_mode::read, __par_backend_hetero::access_mode::read, __par_backend_hetero::access_mode::read_write>( __tag, @@ -1036,6 +1079,9 @@ __pattern_unique(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _It // The temporary buffer is constructed from a range, therefore it's destructor will not block, therefore // we must call __pattern_walk2 in a way which provides blocking synchronization for this pattern. + // We never actually write to the sequence, so there is no risk when ran with the vectorized path of + // walk2_vector_or_scalars. For more info, please see the comment above __pattern_walk2 and + // https://github.com/uxlfoundation/oneDPL/issues/1272. 
return __pattern_walk2( @@ -1564,9 +1610,10 @@ __pattern_reverse(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator>(); auto __buf = __keep(__first, __last); oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::__reverse_functor::difference_type>{__n}, __n / 2, - __buf.all_view()) + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::__reverse_functor::difference_type, + decltype(__buf.all_view())>{__n}, + __n / 2, __buf.all_view()) .__deferrable_wait(); } @@ -1590,8 +1637,9 @@ __pattern_reverse_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Bi oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator>(); auto __buf2 = __keep2(__result, __result + __n); oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::__reverse_copy::difference_type>{__n}, + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::__reverse_copy::difference_type, + decltype(__buf1.all_view()), decltype(__buf2.all_view())>{__n}, __n, __buf1.all_view(), __buf2.all_view()) .__deferrable_wait(); @@ -1632,18 +1680,20 @@ __pattern_rotate(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator const auto __shift = __new_first - __first; oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__rotate_wrapper>(__exec), - unseq_backend::__rotate_copy::difference_type>{__n, __shift}, __n, - __buf.all_view(), __temp_rng_w); + unseq_backend::__rotate_copy::difference_type, + decltype(__buf.all_view()), decltype(__temp_rng_w)>{__n, __shift}, + __n, __buf.all_view(), __temp_rng_w); //An explicit wait isn't required here because we are working with a temporary sycl::buffer and sycl accessors and //SYCL runtime makes a dependency graph to prevent the races between two __parallel_for patterns. 
using _Function = __brick_move<__hetero_tag<_BackendTag>, _ExecutionPolicy>; - auto __brick = unseq_backend::walk_n<_ExecutionPolicy, _Function>{_Function{}}; - auto __temp_rng_rw = oneapi::dpl::__ranges::all_view<_Tp, __par_backend_hetero::access_mode::read_write>(__temp_buf.get_buffer()); - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __brick, + auto __brick = + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__temp_rng_rw), + decltype(__buf.all_view())>{_Function{}, static_cast(__n)}; + oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __brick, __n, __temp_rng_rw, __buf.all_view()) .__deferrable_wait(); @@ -1677,8 +1727,8 @@ __pattern_rotate_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Bid oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::__rotate_copy::difference_type>{__n, - __shift}, + unseq_backend::__rotate_copy::difference_type, + decltype(__buf1.all_view()), decltype(__buf2.all_view())>{__n, __shift}, __n, __buf1.all_view(), __buf2.all_view()) .__deferrable_wait(); @@ -1936,19 +1986,22 @@ __pattern_shift_left(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Rang if (__n >= __mid) { using _Function = __brick_move<__hetero_tag<_BackendTag>, _ExecutionPolicy>; - auto __brick = oneapi::dpl::unseq_backend::walk_n<_ExecutionPolicy, _Function>{_Function{}}; //TODO: to consider use just "read" access mode for a source range and just "write" - for a destination range. auto __src = oneapi::dpl::__ranges::drop_view_simple<_Range, _DiffType>(__rng, __n); auto __dst = oneapi::dpl::__ranges::take_view_simple<_Range, _DiffType>(__rng, __size_res); + auto __brick = + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__src), decltype(__dst)>{ + _Function{}, static_cast(__size_res)}; + oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __brick, __size_res, __src, __dst) .__deferrable_wait(); } else //2. n < size/2; 'n' parallel copying { - auto __brick = unseq_backend::__brick_shift_left<_ExecutionPolicy, _DiffType>{__size, __n}; + auto __brick = unseq_backend::__brick_shift_left<_ExecutionPolicy, _DiffType, decltype(__rng)>{__size, __n}; oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__shift_left_right>( diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 65d15ebf26a..4dcd93367a7 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -58,10 +58,35 @@ __pattern_walk_n(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Function const _Size __n = std::min({_Size(__rngs.size())...}); if (__n > 0) { - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, - ::std::forward<_Ranges>(__rngs)...) 
- .__deferrable_wait(); + constexpr std::size_t __num_ranges = sizeof...(_Ranges); + static_assert(__num_ranges <= 3, "__pattern_walk_n only supports up to three packed range parameters"); + if constexpr (__num_ranges == 1) + { + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, std::decay_t<_Ranges>...>{ + __f, static_cast(__n)}, + __n, std::forward<_Ranges>(__rngs)...) + .__deferrable_wait(); + } + else if constexpr (__num_ranges == 2) + { + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, std::decay_t<_Ranges>...>{ + __f, static_cast(__n)}, + __n, std::forward<_Ranges>(__rngs)...) + .__deferrable_wait(); + } + else + { + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, std::decay_t<_Ranges>...>{ + __f, static_cast(__n)}, + __n, std::forward<_Ranges>(__rngs)...) + .__deferrable_wait(); + } } return __n; } @@ -149,20 +174,27 @@ __pattern_swap(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Rang { if (__rng1.size() <= __rng2.size()) { - oneapi::dpl::__internal::__ranges::__pattern_walk_n( - __tag, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__swap1_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - __f, __rng1, __rng2); - return __rng1.size(); + const std::size_t __n = __rng1.size(); + auto __exec1 = oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__swap1_wrapper>( + std::forward<_ExecutionPolicy>(__exec)); + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::move(__exec1), + unseq_backend::__brick_swap, std::decay_t<_Range2>>{ + __f, __n}, + __n, __rng1, __rng2) + .__deferrable_wait(); + return __n; } - - oneapi::dpl::__internal::__ranges::__pattern_walk_n( - __tag, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__swap2_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - __f, __rng2, __rng1); - return __rng2.size(); + const std::size_t __n = __rng2.size(); + auto __exec2 = + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__swap2_wrapper>(std::forward<_ExecutionPolicy>(__exec)); + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::move(__exec2), + unseq_backend::__brick_swap, std::decay_t<_Range1>>{__f, + __n}, + __n, __rng2, __rng1) + .__deferrable_wait(); + return __n; } //------------------------------------------------------------------------ @@ -627,8 +659,9 @@ __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_wrapper>( std::forward<_ExecutionPolicy>(__exec)), - unseq_backend::walk_n<_ExecutionPolicy, _CopyBrick>{_CopyBrick{}}, __n, std::forward<_Range1>(__rng), - std::forward<_Range2>(__result)) + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _CopyBrick, std::decay_t<_Range1>, + std::decay_t<_Range2>>{_CopyBrick{}, static_cast(__n)}, + __n, std::forward<_Range1>(__rng), std::forward<_Range2>(__result)) .get(); return 1; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index a1fcc1f8e15..be38682049e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -36,6 +36,7 @@ #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" +#include "parallel_backend_sycl_for.h" #include "parallel_backend_sycl_reduce.h" #include "parallel_backend_sycl_merge.h" #include "parallel_backend_sycl_merge_sort.h" @@ -218,57 +219,11 @@ class __scan_single_wg_dynamic_kernel; template class __scan_copy_single_wg_kernel; -//------------------------------------------------------------------------ -// parallel_for - async pattern -//------------------------------------------------------------------------ - -// Use the trick with incomplete type and partial specialization to deduce the kernel name -// as the parameter pack that can be empty (for unnamed kernels) or contain exactly one -// type (for explicitly specified name by the user) -template -struct __parallel_for_submitter; - -template -struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> -{ - template - auto - operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const - { - assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); - _PRINT_INFO_IN_DEBUG_MODE(__exec); - auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) { - //get an access to data under SYCL buffer: - oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - - __cgh.parallel_for<_Name...>(sycl::range(__count), [=](sycl::item __item_id) { - auto __idx = __item_id.get_linear_id(); - __brick(__idx, __rngs...); - }); - }); - return __future(__event); - } -}; - -//General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, -//for some algorithms happens that size of processing range is n, but amount of iterations is n/2. -template -auto -__parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Fp __brick, _Index __count, - _Ranges&&... 
__rngs) -{ - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _ForKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<_CustomName>; - - return __parallel_for_submitter<_ForKernel>()(::std::forward<_ExecutionPolicy>(__exec), __brick, __count, - ::std::forward<_Ranges>(__rngs)...); -} - //------------------------------------------------------------------------ // parallel_transform_scan - async pattern //------------------------------------------------------------------------ -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_scan_submitter; @@ -2174,7 +2129,7 @@ struct __partial_merge_kernel } }; -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_partial_sort_submitter; @@ -2412,7 +2367,8 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ oneapi::dpl::__par_backend_hetero::__parallel_for( oneapi::dpl::__internal::__device_backend_tag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), __intermediate_result_end, + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n), _Range2>(__binary_op, __n), + __intermediate_result_end, oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __intermediate_result_end), std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) @@ -2458,7 +2414,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ oneapi::dpl::__internal::__device_backend_tag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( std::forward<_ExecutionPolicy>(__exec)), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end), _Range4>( __binary_op, __intermediate_result_end), __result_end, oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __result_end), diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h new file mode 100644 index 00000000000..24811a15a06 --- /dev/null +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h @@ -0,0 +1,209 @@ +// -*- C++ -*- +//===-- parallel_backend_sycl_for.h ---------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_FOR_H +#define _ONEDPL_PARALLEL_BACKEND_SYCL_FOR_H + +#include +#include +#include +#include +#include + +#include "sycl_defs.h" +#include "parallel_backend_sycl_utils.h" +#include "execution_sycl_defs.h" +#include "unseq_backend_sycl.h" +#include "utils_ranges_sycl.h" + +#include "sycl_traits.h" //SYCL traits specialization for some oneDPL types. + +namespace oneapi +{ +namespace dpl +{ +namespace __par_backend_hetero +{ + +template +class __parallel_for_small_kernel; + +template +class __parallel_for_large_kernel; + +//------------------------------------------------------------------------ +// parallel_for - async pattern +//------------------------------------------------------------------------ + +// Use the trick with incomplete type and partial specialization to deduce the kernel name +// as the parameter pack that can be empty (for unnamed kernels) or contain exactly one +// type (for explicitly specified name by the user) +template +struct __parallel_for_small_submitter; + +template +struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const + { + assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); + _PRINT_INFO_IN_DEBUG_MODE(__exec); + auto __event = __exec.queue().submit([__rngs..., __brick, __count](sycl::handler& __cgh) { + //get an access to data under SYCL buffer: + oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); + + __cgh.parallel_for<_Name...>(sycl::range(__count), [=](sycl::item __item_id) { + const std::size_t __idx = __item_id.get_linear_id(); + // For small inputs, do not vectorize or perform multiple iterations per work item. Spread input evenly + // across compute units. + __brick.__scalar_path_impl(std::true_type{}, __idx, __rngs...); + }); + }); + return __future(__event); + } +}; + +template +struct __parallel_for_large_submitter; + +template +struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _RangeTypes...> +{ + // Limit the work-group size to 512 which has empirically yielded the best results across different architectures. + static constexpr std::uint16_t __max_work_group_size = 512; + + // SPIR-V compilation targets show best performance with a stride of the sub-group size. + // Other compilation targets perform best with a work-group size stride. This utility can only be called from the + // device. 
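    // Editor's illustration of the stride math below (hypothetical numbers, not part of the
    // patch): with __work_group_size = 512, a sub-group size of 32, __iters_per_work_item = 4
    // and __adj_elements_per_work_item = 2, the SPIR-V branch gives each work item a start of
    // __sub_group_start_idx + 2 * __sub_group_local_id and a stride of 2 * 32 = 64 elements,
    // so each iteration of a sub-group touches one contiguous 64-element span. The other
    // branch strides by 512 * 2 = 1024 elements, i.e. a full work-group apart, which is the
    // preferred access pattern for non-SPIR-V targets.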
+ static inline std::tuple + __stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item, + std::size_t __adj_elements_per_work_item, std::size_t __work_group_size) + { + const std::size_t __work_group_id = __item.get_group().get_group_linear_id(); + if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) + { + const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group(); + const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + + const std::size_t __sub_group_start_idx = + __iters_per_work_item * __adj_elements_per_work_item * + (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); + const bool __is_full_sub_group = + __sub_group_start_idx + __iters_per_work_item * __adj_elements_per_work_item * __sub_group_size <= + __count; + const std::size_t __work_item_idx = + __sub_group_start_idx + __adj_elements_per_work_item * __sub_group_local_id; + return std::tuple(__work_item_idx, __adj_elements_per_work_item * __sub_group_size, __is_full_sub_group); + } + else + { + const std::size_t __work_group_start_idx = + __work_group_id * __work_group_size * __iters_per_work_item * __adj_elements_per_work_item; + const std::size_t __work_item_idx = + __work_group_start_idx + __item.get_local_linear_id() * __adj_elements_per_work_item; + const bool __is_full_work_group = + __work_group_start_idx + __iters_per_work_item * __work_group_size * __adj_elements_per_work_item <= + __count; + return std::tuple(__work_item_idx, __work_group_size * __adj_elements_per_work_item, __is_full_work_group); + } + } + + // Once there is enough work to launch a group on each compute unit with our chosen __iters_per_item, + // then we should start using this code path. + template + static std::size_t + __estimate_best_start_size(const _ExecutionPolicy& __exec, _Fp __brick) + { + const std::size_t __work_group_size = + oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); + const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + return __work_group_size * _Fp::__preferred_iters_per_item * __max_cu; + } + + template + auto + operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const + { + assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) 
> 0); + const std::size_t __work_group_size = + oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); + _PRINT_INFO_IN_DEBUG_MODE(__exec); + auto __event = __exec.queue().submit([__rngs..., __brick, __work_group_size, __count](sycl::handler& __cgh) { + //get an access to data under SYCL buffer: + oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); + constexpr std::uint8_t __iters_per_work_item = _Fp::__preferred_iters_per_item; + constexpr std::uint8_t __vector_size = _Fp::__preferred_vector_size; + const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div( + __count, (__work_group_size * __vector_size * __iters_per_work_item)); + __cgh.parallel_for<_Name...>( + sycl::nd_range(sycl::range<1>(__num_groups * __work_group_size), sycl::range<1>(__work_group_size)), + [=](sycl::nd_item __item) { + const auto [__idx, __stride, __is_full] = + __stride_recommender(__item, __count, __iters_per_work_item, __vector_size, __work_group_size); + __strided_loop<__iters_per_work_item> __execute_loop{static_cast(__count)}; + if (__is_full) + { + __execute_loop(std::true_type{}, __idx, __stride, __brick, __rngs...); + } + // If we are not full, then take this branch only if there is work to process. + else if (__idx < __count) + { + __execute_loop(std::false_type{}, __idx, __stride, __brick, __rngs...); + } + }); + }); + return __future(__event); + } +}; + +//General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, +//for some algorithms happens that size of processing range is n, but amount of iterations is n/2. +template +auto +__parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Fp __brick, _Index __count, + _Ranges&&... __rngs) +{ + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + using _ForKernelSmall = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_small_kernel<_CustomName>>; + using _ForKernelLarge = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_large_kernel<_CustomName>>; + + using __small_submitter = __parallel_for_small_submitter<_ForKernelSmall>; + using __large_submitter = __parallel_for_large_submitter<_ForKernelLarge, _Ranges...>; + // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a + // single kernel that worsen performance for small cases. If the number of iterations of the large submitter is 1, + // then only compile the basic kernel as the two versions are effectively the same. 
+ if constexpr (_Fp::__preferred_iters_per_item > 1 || _Fp::__preferred_vector_size > 1) + { + if (__count >= __large_submitter::__estimate_best_start_size(__exec, __brick)) + { + return __large_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } + } + return __small_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); +} + +} // namespace __par_backend_hetero +} // namespace dpl +} // namespace oneapi + +#endif diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h index 9ff4fa2c80b..ef124405434 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h @@ -48,7 +48,7 @@ namespace __par_backend_hetero //General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, //for some algorithms happens that size of processing range is n, but amount of iterations is n/2. -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_for_fpga_submitter; @@ -70,7 +70,7 @@ struct __parallel_for_fpga_submitter<__internal::__optional_kernel_name<_Name... #pragma unroll(::std::decay <_ExecutionPolicy>::type::unroll_factor) for (auto __idx = 0; __idx < __count; ++__idx) { - __brick(__idx, __rngs...); + __brick.__scalar_path_impl(std::true_type{}, __idx, __rngs...); } }); }); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 280e3e20b13..6c97f0a55d4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -197,7 +197,7 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, const _I return {__rng1_idx, __rng2_idx}; } -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment for __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_merge_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index 0d2d49ed6d1..fcd1b3e83cf 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -111,7 +111,7 @@ __device_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Size _ //------------------------------------------------------------------------ // parallel_transform_reduce - async patterns -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation //------------------------------------------------------------------------ // Parallel_transform_reduce for a small arrays using a single work group. 
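To make the dispatch in the new parallel_backend_sycl_for.h above concrete, the sketch below replays its two formulas in plain C++: the large-kernel threshold of __estimate_best_start_size (one work-group doing __preferred_iters_per_item iterations per compute unit) and the nd_range sizing used by the large submitter. The device numbers (512-item work-groups, 64 compute units), the element count, and the iters/vector values (typical for 32-bit data under the 16-bytes-per-item tuning introduced later in this patch) are illustrative assumptions, not values taken from the patch.

#include <cstddef>
#include <iostream>

// Editor's sketch of the small/large parallel_for dispatch arithmetic; hypothetical device values.
int main()
{
    const std::size_t work_group_size = 512; // capped by __max_work_group_size
    const std::size_t max_cu = 64;           // hypothetical compute-unit count
    const std::size_t iters_per_item = 4;    // stands in for _Fp::__preferred_iters_per_item
    const std::size_t vector_size = 1;       // stands in for _Fp::__preferred_vector_size

    // Mirrors __estimate_best_start_size: enough elements to give every compute unit a
    // full work-group performing iters_per_item iterations each.
    const std::size_t large_path_threshold = work_group_size * iters_per_item * max_cu; // 131072

    const std::size_t count = 1 << 20; // hypothetical one-million-element input
    if (count >= large_path_threshold)
    {
        // Work-group count used to build the large submitter's nd_range.
        const std::size_t elems_per_group = work_group_size * vector_size * iters_per_item; // 2048
        const std::size_t num_groups = (count + elems_per_group - 1) / elems_per_group;     // 512
        std::cout << "large kernel: " << num_groups << " work-groups of " << work_group_size << "\n";
    }
    else
    {
        // Small path: one work item per element, spread evenly across the device.
        std::cout << "small kernel: one work-item per element\n";
    }
}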
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index c19538168e2..25b50bd20ea 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -907,6 +907,173 @@ __bypass_sycl_kernel_not_supported(const sycl::exception& __e) throw; } +struct __scalar_load_op +{ + oneapi::dpl::__internal::__pstl_assign __assigner; + template + void + operator()(_IdxType1 __idx_source, _IdxType2 __idx_dest, _SourceAcc __source_acc, _DestAcc __dest_acc) const + { + __assigner(__source_acc[__idx_source], __dest_acc[__idx_dest]); + } +}; + +template +struct __vector_load +{ + static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported"); + std::size_t __full_range_size; + template + void + operator()(/*__is_full*/ std::true_type, _IdxType __start_idx, _LoadOp __load_op, _Rngs&&... __rngs) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __vec_size; ++__i) + __load_op(__start_idx + __i, __i, __rngs...); + } + + template + void + operator()(/*__is_full*/ std::false_type, _IdxType __start_idx, _LoadOp __load_op, _Rngs&&... __rngs) const + { + std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__full_range_size - __start_idx}); + for (std::uint8_t __i = 0; __i < __elements; ++__i) + __load_op(__start_idx + __i, __i, __rngs...); + } +}; + +template +struct __scalar_store_transform_op +{ + _TransformOp __transform; + // Unary transformations into an output buffer + template + void + operator()(_IdxType1 __idx_source, _IdxType2 __idx_dest, _SourceAcc __source_acc, _DestAcc __dest_acc) const + { + __transform(__source_acc[__idx_source], __dest_acc[__idx_dest]); + } + // Binary transformations into an output buffer + template + void + operator()(_IdxType1 __idx_source, _IdxType2 __idx_dest, _Source1Acc __source1_acc, _Source2Acc __source2_acc, + _DestAcc __dest_acc) const + { + __transform(__source1_acc[__idx_source], __source2_acc[__idx_source], __dest_acc[__idx_dest]); + } +}; + +// TODO: Consider unifying the implementations of __vector_walk, __vector_load, __vector_store, and potentially +// __strided_loop with some common, generic utility +template +struct __vector_walk +{ + static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported"); + std::size_t __full_range_size; + + template + void + operator()(std::true_type, _IdxType __idx, _WalkFunction __f, _Rngs&&... __rngs) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __vec_size; ++__i) + { + __f(__rngs[__idx + __i]...); + } + } + // For a non-full vector path, process it sequentially. This will always be the last sub or work group + // if it does not evenly divide into input + template + void + operator()(std::false_type, _IdxType __idx, _WalkFunction __f, _Rngs&&... __rngs) const + { + std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__full_range_size - __idx}); + for (std::uint8_t __i = 0; __i < __elements; ++__i) + { + __f(__rngs[__idx + __i]...); + } + } +}; + +template +struct __vector_store +{ + static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported"); + std::size_t __full_range_size; + + template + void + operator()(std::true_type, _IdxType __start_idx, _StoreOp __store_op, _Rngs&&... 
__rngs) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __vec_size; ++__i) + __store_op(__i, __start_idx + __i, __rngs...); + } + template + void + operator()(std::false_type, _IdxType __start_idx, _StoreOp __store_op, _Rngs&&... __rngs) const + { + std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__full_range_size - __start_idx}); + for (std::uint8_t __i = 0; __i < __elements; ++__i) + __store_op(__i, __start_idx + __i, __rngs...); + } +}; + +template +struct __vector_reverse +{ + static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported"); + template + void + operator()(/*__is_full*/ std::true_type, const _Idx /*__elements_to_process*/, _Array __array) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __vec_size / 2; ++__i) + std::swap(__array[__i], __array[__vec_size - __i - 1]); + } + template + void + operator()(/*__is_full*/ std::false_type, const _Idx __elements_to_process, _Array __array) const + { + for (std::uint8_t __i = 0; __i < __elements_to_process / 2; ++__i) + std::swap(__array[__i], __array[__elements_to_process - __i - 1]); + } +}; + +// Processes a loop with a given stride. Intended to be used with sub-group / work-group strides for good memory access patterns +// (potentially with vectorization) +template +struct __strided_loop +{ + std::size_t __full_range_size; + template + void + operator()(/*__is_full*/ std::true_type, _IdxType __idx, std::uint16_t __stride, _LoopBodyOp __loop_body_op, + _Ranges&&... __rngs) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __num_strides; ++__i) + { + __loop_body_op(std::true_type{}, __idx, __rngs...); + __idx += __stride; + } + } + template + void + operator()(/*__is_full*/ std::false_type, _IdxType __idx, std::uint16_t __stride, _LoopBodyOp __loop_body_op, + _Ranges&&... 
__rngs) const + { + // Constrain the number of iterations as much as possible and then pass the knowledge that we are not a full loop to the body operation + const std::uint8_t __adjusted_iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__full_range_size - __idx, __stride); + for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i) + { + __loop_body_op(std::false_type{}, __idx, __rngs...); + __idx += __stride; + } + } +}; + } // namespace __par_backend_hetero } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index 54ea69bae42..ac2e8da30f8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -368,7 +368,16 @@ namespace oneapi::dpl::unseq_backend template struct walk_n; -template +template +struct walk1_vector_or_scalar; + +template +struct walk2_vectors_or_scalars; + +template +struct walk3_vectors_or_scalars; + +template struct walk_adjacent_difference; template class __brick_set_op; -template +template struct __brick_reduce_idx; } // namespace oneapi::dpl::unseq_backend @@ -429,9 +438,30 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backen { }; -template +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::walk1_vector_or_scalar, + _ExecutionPolicy, _F, _Range)> + : oneapi::dpl::__internal::__are_all_device_copyable<_F> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::walk2_vectors_or_scalars, + _ExecutionPolicy, _F, _Range1, _Range2)> + : oneapi::dpl::__internal::__are_all_device_copyable<_F> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::walk3_vectors_or_scalars, + _ExecutionPolicy, _F, _Range1, _Range2, _Range3)> + : oneapi::dpl::__internal::__are_all_device_copyable<_F> +{ +}; + +template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::walk_adjacent_difference, - _ExecutionPolicy, _F)> + _ExecutionPolicy, _F, _Range1, _Range2)> : oneapi::dpl::__internal::__are_all_device_copyable<_F> { }; @@ -543,9 +573,9 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backen { }; -template +template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::__brick_reduce_idx, _BinaryOperator, - _Size)> + _Size, _Range)> : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryOperator, _Size> { }; @@ -555,8 +585,8 @@ namespace oneapi::dpl::internal enum class search_algorithm; -template -struct custom_brick; +template +struct __custom_brick; template struct replace_if_fun; @@ -575,8 +605,8 @@ class transform_if_stencil_fun; } // namespace oneapi::dpl::internal -template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::internal::custom_brick, Comp, T, func)> +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::internal::__custom_brick, Comp, T, _Range, func)> : oneapi::dpl::__internal::__are_all_device_copyable { }; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index e0b57260ee0..9b8da494208 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -22,6 +22,7 @@ #include "../../onedpl_config.h" #include 
"../../utils.h" #include "sycl_defs.h" +#include "utils_ranges_sycl.h" #define _ONEDPL_SYCL_KNOWN_IDENTITY_PRESENT \ (_ONEDPL_SYCL2020_KNOWN_IDENTITY_PRESENT || _ONEDPL_LIBSYCL_KNOWN_IDENTITY_PRESENT) @@ -112,6 +113,190 @@ struct walk_n } }; +// Base class which establishes tuning parameters including vectorization / scalar path decider at compile time +// for walk / for based algorithms +template +struct walk_vector_or_scalar_base +{ + private: + using _ValueTypes = std::tuple...>; + constexpr static std::uint8_t __min_type_size = oneapi::dpl::__internal::__min_nested_type_size<_ValueTypes>::value; + // Empirically determined 'bytes-in-flight' to maximize bandwidth utilization + constexpr static std::uint8_t __bytes_per_item = 16; + // Maximum size supported by compilers to generate vector instructions + constexpr static std::uint8_t __max_vector_size = 4; + + public: + constexpr static bool __can_vectorize = + (oneapi::dpl::__ranges::__is_vectorizable_range>::value && ...) && + (std::is_fundamental_v> && ...) && __min_type_size < 4; + // Vectorize for small types, so we generate 128-byte load / stores in a sub-group + constexpr static std::uint8_t __preferred_vector_size = + __can_vectorize ? oneapi::dpl::__internal::__dpl_ceiling_div(__max_vector_size, __min_type_size) : 1; + constexpr static std::uint8_t __preferred_iters_per_item = + __bytes_per_item / (__min_type_size * __preferred_vector_size); + + protected: + using __vec_load_t = oneapi::dpl::__par_backend_hetero::__vector_load<__preferred_vector_size>; + using __vec_store_t = oneapi::dpl::__par_backend_hetero::__vector_store<__preferred_vector_size>; + using __vec_reverse_t = oneapi::dpl::__par_backend_hetero::__vector_reverse<__preferred_vector_size>; + using __vec_walk_t = oneapi::dpl::__par_backend_hetero::__vector_walk<__preferred_vector_size>; +}; + +// Path that intentionally disables vectorization for algorithms with a scattered access pattern (e.g. binary_search) +template +struct walk_scalar_base +{ + private: + using _ValueTypes = std::tuple...>; + constexpr static std::uint8_t __min_type_size = oneapi::dpl::__internal::__min_nested_type_size<_ValueTypes>::value; + constexpr static std::uint8_t __bytes_per_item = 16; + + public: + constexpr static bool __can_vectorize = false; + // With no vectorization, the vector size is 1 + constexpr static std::uint8_t __preferred_vector_size = 1; + // To achieve full bandwidth utilization, multiple iterations need to be processed by a work item + constexpr static std::uint8_t __preferred_iters_per_item = + __bytes_per_item / (__min_type_size * __preferred_vector_size); +}; + +template +struct walk1_vector_or_scalar : public walk_vector_or_scalar_base<_Range> +{ + private: + using __base_t = walk_vector_or_scalar_base<_Range>; + _F __f; + std::size_t __n; + + public: + walk1_vector_or_scalar(_F __f, std::size_t __n) : __f(std::move(__f)), __n(__n) {} + + template + void + __vector_path_impl(_IsFull __is_full, const std::size_t __idx, _Range __rng) const + { + typename __base_t::__vec_walk_t{__n}(__is_full, __idx, __f, __rng); + } + + // _IsFull is ignored here. We assume that boundary checking has been already performed for this index. 
+ template + void + __scalar_path_impl(_IsFull, const std::size_t __idx, _Range __rng) const + { + __f(__rng[__idx]); + } + + template + void + operator()(_IsFull __is_full, const std::size_t __idx, _Range __rng) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng); + else + __scalar_path_impl(__is_full, __idx, __rng); + } +}; + +template +struct walk2_vectors_or_scalars : public walk_vector_or_scalar_base<_Range1, _Range2> +{ + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; + _F __f; + std::size_t __n; + + public: + walk2_vectors_or_scalars(_F __f, std::size_t __n) : __f(std::move(__f)), __n(__n) {} + + template + void + __vector_path_impl(_IsFull __is_full, const std::size_t __idx, _Range1 __rng1, _Range2 __rng2) const + { + using _ValueType1 = oneapi::dpl::__internal::__value_t<_Range1>; + _ValueType1 __rng1_vector[__base_t::__preferred_vector_size]; + // 1. Load input into a vector + typename __base_t::__vec_load_t{__n}(__is_full, __idx, oneapi::dpl::__par_backend_hetero::__scalar_load_op{}, + __rng1, __rng1_vector); + // 2. Apply functor to vector and store into global memory + typename __base_t::__vec_store_t{__n}(__is_full, __idx, + oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op<_F>{__f}, + __rng1_vector, __rng2); + } + + // _IsFull is ignored here. We assume that boundary checking has been already performed for this index. + template + void + __scalar_path_impl(_IsFull, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2) const + { + + __f(__rng1[__idx], __rng2[__idx]); + } + + template + void + operator()(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); + } +}; + +template +struct walk3_vectors_or_scalars : public walk_vector_or_scalar_base<_Range1, _Range2, _Range3> +{ + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2, _Range3>; + _F __f; + std::size_t __n; + + public: + walk3_vectors_or_scalars(_F __f, std::size_t __n) : __f(std::move(__f)), __n(__n) {} + + template + void + __vector_path_impl(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2, _Range3 __rng3) const + { + using _ValueType1 = oneapi::dpl::__internal::__value_t<_Range1>; + using _ValueType2 = oneapi::dpl::__internal::__value_t<_Range2>; + + _ValueType1 __rng1_vector[__base_t::__preferred_vector_size]; + _ValueType2 __rng2_vector[__base_t::__preferred_vector_size]; + + typename __base_t::__vec_load_t __vec_load{__n}; + typename __base_t::__vec_store_t __vec_store{__n}; + oneapi::dpl::__par_backend_hetero::__scalar_load_op __load_op; + + // 1. Load inputs into vectors + __vec_load(__is_full, __idx, __load_op, __rng1, __rng1_vector); + __vec_load(__is_full, __idx, __load_op, __rng2, __rng2_vector); + // 2. Apply binary functor to vector and store into global memory + __vec_store(__is_full, __idx, oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op<_F>{__f}, + __rng1_vector, __rng2_vector, __rng3); + } + + // _IsFull is ignored here. We assume that boundary checking has been already performed for this index. 
+ template + void + __scalar_path_impl(_IsFull, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2, _Range3 __rng3) const + { + + __f(__rng1[__idx], __rng2[__idx], __rng3[__idx]); + } + + template + void + operator()(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2, _Range3 __rng3) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2, __rng3); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2, __rng3); + } +}; + // If read accessor returns temporary value then __no_op returns lvalue reference to it. // After temporary value destroying it will be a reference on invalid object. // So let's don't call functor in case of __no_op @@ -132,22 +317,56 @@ struct walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op> // walk_adjacent_difference //------------------------------------------------------------------------ -template -struct walk_adjacent_difference +template +struct walk_adjacent_difference : public walk_vector_or_scalar_base<_Range1, _Range2> { + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; _F __f; + std::size_t __n; + oneapi::dpl::__internal::__pstl_assign __assigner; - template + public: + walk_adjacent_difference(_F __f, std::size_t __n) : __f(std::move(__f)), __n(__n) {} + + template void - operator()(const _ItemId __idx, const _Acc1& _acc_src, _Acc2& _acc_dst) const + __scalar_path_impl(_IsFull, const _ItemId __idx, const _Range1 __rng1, _Range2 __rng2) const { - using ::std::get; - // just copy an element if it is the first one if (__idx == 0) - _acc_dst[__idx] = _acc_src[__idx]; + __assigner(__rng1[__idx], __rng2[__idx]); else - __f(_acc_src[__idx + (-1)], _acc_src[__idx], _acc_dst[__idx]); + __f(__rng1[__idx + (-1)], __rng1[__idx], __rng2[__idx]); + } + template + void + __vector_path_impl(_IsFull __is_full, const _ItemId __idx, const _Range1 __rng1, _Range2 __rng2) const + { + using _ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + _ValueType __rng1_vector[__base_t::__preferred_vector_size + 1]; + // 1. Establish a vector of __preferred_vector_size + 1 where a scalar load is performed on the first element + // followed by a vector load of the specified length. + __assigner(__idx != 0 ? __rng1[__idx - 1] : __rng1[0], __rng1_vector[0]); + typename __base_t::__vec_load_t{__n}(__is_full, __idx, oneapi::dpl::__par_backend_hetero::__scalar_load_op{}, + __rng1, &__rng1_vector[1]); + // 2. Perform a vector store of __preferred_vector_size adjacent differences. + typename __base_t::__vec_store_t{__n}(__is_full, __idx, + oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op<_F>{__f}, + __rng1_vector, &__rng1_vector[1], __rng2); + // A dummy value is first written to global memory followed by an overwrite for the first index. Pulling the vector loads / stores into an if branch + // to better handle this results in performance degradation. 
+ if (__idx == 0) + __assigner(__rng1_vector[0], __rng2[0]); + } + template + void + operator()(_IsFull __is_full, const _ItemId __idx, const _Range1 __rng1, _Range2 __rng2) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); } }; @@ -927,47 +1146,203 @@ struct __brick_includes //------------------------------------------------------------------------ // reverse //------------------------------------------------------------------------ -template -struct __reverse_functor +template +struct __reverse_functor : public walk_vector_or_scalar_base<_Range> { + private: + using __base_t = walk_vector_or_scalar_base<_Range>; + using _ValueType = oneapi::dpl::__internal::__value_t<_Range>; _Size __size; - template + + public: + __reverse_functor(_Size __size) : __size(__size) {} + + template + void + __vector_path_impl(_IsFull, const std::size_t __left_start_idx, _Range __rng) const + { + const std::size_t __n = __size; + + // In the below implementation, we see that _IsFull is ignored in favor of std::true_type{} in all cases. + // This relaxation is due to the fact that in-place reverse launches work only over the first half of the + // buffer. As long as __size >= __vec_size there is no risk of an OOB accesses or a race condition. There may + // exist a single point of double processing between left and right vectors in the last work-item which + // reverses middle elements. This extra processing of elements <= __vec_size is more performant than applying + // additional branching (such as in reverse_copy). + + const std::size_t __right_start_idx = __size - __left_start_idx - __base_t::__preferred_vector_size; + + _ValueType __rng_left_vector[__base_t::__preferred_vector_size]; + _ValueType __rng_right_vector[__base_t::__preferred_vector_size]; + + typename __base_t::__vec_load_t __vec_load{__n}; + typename __base_t::__vec_reverse_t __vec_reverse; + typename __base_t::__vec_store_t __vec_store{__n}; + oneapi::dpl::__par_backend_hetero::__scalar_load_op __load_op; + oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op + __store_op; + + // 1. Load two vectors that we want to swap: one from the left half of the buffer and one from the right + __vec_load(std::true_type{}, __left_start_idx, __load_op, __rng, __rng_left_vector); + __vec_load(std::true_type{}, __right_start_idx, __load_op, __rng, __rng_right_vector); + // 2. Reverse vectors in registers. Note that due to indices we have chosen, there will always be a full + // vector of elements to load + __vec_reverse(std::true_type{}, __left_start_idx, __rng_left_vector); + __vec_reverse(std::true_type{}, __right_start_idx, __rng_right_vector); + // 3. 
Store the left-half vector to the corresponding right-half indices and vice versa + __vec_store(std::true_type{}, __right_start_idx, __store_op, __rng_left_vector, __rng); + __vec_store(std::true_type{}, __left_start_idx, __store_op, __rng_right_vector, __rng); + } + template void - operator()(const _Idx __idx, _Accessor& __acc) const + __scalar_path_impl(_IsFull, const std::size_t __idx, _Range __rng) const { using ::std::swap; - swap(__acc[__idx], __acc[__size - __idx - 1]); + swap(__rng[__idx], __rng[__size - __idx - 1]); + } + template + void + operator()(_IsFull __is_full, const std::size_t __idx, _Range __rng) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng); + else + __scalar_path_impl(__is_full, __idx, __rng); } }; //------------------------------------------------------------------------ // reverse_copy //------------------------------------------------------------------------ -template -struct __reverse_copy +template +struct __reverse_copy : public walk_vector_or_scalar_base<_Range1, _Range2> { + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; + using _ValueType = oneapi::dpl::__internal::__value_t<_Range1>; _Size __size; - template + oneapi::dpl::__internal::__pstl_assign __assigner; + + public: + __reverse_copy(_Size __size) : __size(__size) {} + + template + void + __scalar_path_impl(_IsFull, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + __rng2[__idx] = __rng1[__size - __idx - 1]; + } + template + void + __vector_path_impl(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + const std::size_t __n = __size; + const std::size_t __remaining_elements = __n - __idx; + const std::uint8_t __elements_to_process = + std::min(static_cast(__base_t::__preferred_vector_size), __remaining_elements); + const std::size_t __output_start = __size - __idx - __elements_to_process; + // 1. Load vector to reverse + _ValueType __rng1_vector[__base_t::__preferred_vector_size]; + typename __base_t::__vec_load_t{__n}(__is_full, __idx, oneapi::dpl::__par_backend_hetero::__scalar_load_op{}, + __rng1, __rng1_vector); + // 2. Reverse in registers + typename __base_t::__vec_reverse_t{}(__is_full, __elements_to_process, __rng1_vector); + // 3. Flip the location of the vector in the output buffer + if constexpr (_IsFull::value) + { + typename __base_t::__vec_store_t{__n}(std::true_type{}, __output_start, + oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op< + oneapi::dpl::__internal::__pstl_assign>{}, + __rng1_vector, __rng2); + } + else + { + // The non-full case is processed manually here due to the translation of indices in the reverse operation. + // The last few elements in the buffer are reversed into the beginning of the buffer. However, + // __vector_store would believe that we always have a full vector length of elements due to the starting + // index having greater than __preferred_vector_size elements until the end of the buffer. 
+ for (std::uint8_t __i = 0; __i < __elements_to_process; ++__i) + __assigner(__rng1_vector[__i], __rng2[__output_start + __i]); + } + } + template void - operator()(const _Idx __idx, const _AccessorSrc& __acc1, _AccessorDst& __acc2) const + operator()(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const { - __acc2[__idx] = __acc1[__size - __idx - 1]; + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); } }; //------------------------------------------------------------------------ // rotate_copy //------------------------------------------------------------------------ -template -struct __rotate_copy +template +struct __rotate_copy : public walk_vector_or_scalar_base<_Range1, _Range2> { + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; + using _ValueType = oneapi::dpl::__internal::__value_t<_Range1>; _Size __size; _Size __shift; - template + oneapi::dpl::__internal::__pstl_assign __assigner; + + public: + __rotate_copy(_Size __size, _Size __shift) : __size(__size), __shift(__shift) {} + + template void - operator()(const _Idx __idx, const _AccessorSrc& __acc1, _AccessorDst& __acc2) const + __vector_path_impl(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const { - __acc2[__idx] = __acc1[(__shift + __idx) % __size]; + const std::size_t __shifted_idx = __shift + __idx; + const std::size_t __wrapped_idx = __shifted_idx % __size; + const std::size_t __n = __size; + _ValueType __rng1_vector[__base_t::__preferred_vector_size]; + //1. Vectorize loads only if we know the wrap around point is beyond the current vector elements to process + if (__wrapped_idx + __base_t::__preferred_vector_size <= __n) + { + typename __base_t::__vec_load_t{__n}( + __is_full, __wrapped_idx, oneapi::dpl::__par_backend_hetero::__scalar_load_op{}, __rng1, __rng1_vector); + } + else + { + // A single point of non-contiguity within the rotation operation. Manually process two loops here: + // the first before the wraparound point and the second after. + const std::size_t __remaining_elements = __n - __idx; + const std::uint8_t __elements_to_process = + std::min(std::size_t{__base_t::__preferred_vector_size}, __remaining_elements); + // __n - __wrapped_idx can safely fit into a uint8_t due to the condition check above. + const std::uint8_t __loop1_elements = + std::min(__elements_to_process, static_cast(__n - __wrapped_idx)); + const std::uint8_t __loop2_elements = __elements_to_process - __loop1_elements; + std::uint8_t __i = 0; + for (__i = 0; __i < __loop1_elements; ++__i) + __assigner(__rng1[__wrapped_idx + __i], __rng1_vector[__i]); + for (std::uint8_t __j = 0; __j < __loop2_elements; ++__j) + __assigner(__rng1[__j], __rng1_vector[__i + __j]); + } + // 2. 
Store the rotation + typename __base_t::__vec_store_t{__n}( + __is_full, __idx, + oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op{}, + __rng1_vector, __rng2); + } + template + void + __scalar_path_impl(_IsFull, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + __rng2[__idx] = __rng1[(__shift + __idx) % __size]; + } + template + void + operator()(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); } }; @@ -1041,15 +1416,73 @@ class __brick_set_op } }; -template +template struct __brick_shift_left { + private: + using _ValueType = oneapi::dpl::__internal::__value_t<_Range>; + // Maximum size supported by compilers to generate vector instructions + constexpr static std::uint8_t __max_vector_size = 4; + + public: + // Multiple iterations per item are manually processed in the brick with a nd-range strided approach. + constexpr static std::uint8_t __preferred_iters_per_item = 1; + constexpr static bool __can_vectorize = + oneapi::dpl::__ranges::__is_vectorizable_range>::value && + std::is_fundamental_v<_ValueType> && sizeof(_ValueType) < 4; + constexpr static std::uint8_t __preferred_vector_size = + __can_vectorize ? oneapi::dpl::__internal::__dpl_ceiling_div(__max_vector_size, sizeof(_ValueType)) : 1; + _DiffType __size; _DiffType __n; - template + template + void + __vector_path_impl(_IsFull __is_full, const _ItemId __idx, _Range __rng) const + { + const std::size_t __unsigned_size = __size; + const _DiffType __i = __idx - __n; + oneapi::dpl::__par_backend_hetero::__vector_load<__preferred_vector_size> __vec_load{__unsigned_size}; + oneapi::dpl::__par_backend_hetero::__vector_store<__preferred_vector_size> __vec_store{__unsigned_size}; + oneapi::dpl::__par_backend_hetero::__scalar_load_op __load_op; + oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op + __store_op; + for (_DiffType __k = __n; __k < __size; __k += __n) + { + const _DiffType __read_offset = __k + __idx; + const _DiffType __write_offset = __k + __i; + if constexpr (_IsFull::value) + { + if (__read_offset + __preferred_vector_size <= __size) + { + _ValueType __rng_vector[__preferred_vector_size]; + __vec_load(std::true_type{}, __read_offset, __load_op, __rng, __rng_vector); + __vec_store(std::true_type{}, __write_offset, __store_op, __rng_vector, __rng); + } + else if (__read_offset < __size) + { + const std::size_t __num_remaining = __size - __read_offset; + for (_DiffType __j = 0; __j < __num_remaining; ++__j) + __rng[__write_offset + __j] = __rng[__read_offset + __j]; + } + } + else + { + // Some items within a sub-group may still have a full vector length to process even if _IsFull is + // false by intentional design of __stride_recommender. While these are vectorizable, this will result + // in branch divergence and masked execution of both vectorized and serial paths for all items in the + // sub-group which may worsen performance. Instead, have each item in the sub-group process its work + // serially. 
+ for (_DiffType __j = 0; __j < std::min(std::size_t{__preferred_vector_size}, __n - __idx); ++__j) + if (__read_offset + __j < __size) + __rng[__write_offset + __j] = __rng[__read_offset + __j]; + } + } + } + + template void - operator()(const _ItemId __idx, _Range&& __rng) const + __scalar_path_impl(_IsFull, const _ItemId __idx, _Range __rng) const { const _DiffType __i = __idx - __n; //loop invariant for (_DiffType __k = __n; __k < __size; __k += __n) @@ -1058,6 +1491,16 @@ struct __brick_shift_left __rng[__k + __i] = ::std::move(__rng[__k + __idx]); } } + + template + void + operator()(_IsFull __is_full, const _ItemId __idx, _Range __rng) const + { + if constexpr (__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng); + else + __scalar_path_impl(__is_full, __idx, __rng); + } }; struct __brick_assign_key_position @@ -1074,14 +1517,14 @@ struct __brick_assign_key_position }; // reduce the values in a segment associated with a key -template -struct __brick_reduce_idx +template +struct __brick_reduce_idx : public walk_scalar_base<_Range> { __brick_reduce_idx(const _BinaryOperator& __b, const _Size __n_) : __binary_op(__b), __n(__n_) {} - template + template auto - reduce(_Idx __segment_begin, _Idx __segment_end, const _Values& __values) const + reduce(std::size_t __segment_begin, std::size_t __segment_end, const _Values& __values) const { using __ret_type = oneapi::dpl::__internal::__decay_with_tuple_specialization_t; __ret_type __res = __values[__segment_begin]; @@ -1090,23 +1533,82 @@ struct __brick_reduce_idx __res = __binary_op(__res, __values[__segment_begin]); return __res; } - - template + template void - operator()(const _ItemId __idx, const _ReduceIdx& __segment_starts, const _Values& __values, - _OutValues& __out_values) const + __scalar_path_impl(_IsFull, const _ItemId __idx, const _ReduceIdx& __segment_starts, const _Values& __values, + _OutValues& __out_values) const { using __value_type = decltype(__segment_starts[__idx]); __value_type __segment_end = (__idx == __segment_starts.size() - 1) ? __value_type(__n) : __segment_starts[__idx + 1]; __out_values[__idx] = reduce(__segment_starts[__idx], __segment_end, __values); } + template + void + operator()(_IsFull __is_full, const _ItemId __idx, const _ReduceIdx& __segment_starts, const _Values& __values, + _OutValues& __out_values) const + { + __scalar_path_impl(__is_full, __idx, __segment_starts, __values, __out_values); + } private: _BinaryOperator __binary_op; _Size __n; }; +// std::swap_ranges is unique in that both sets of provided ranges will be modified. Due to this, +// we define a separate functor from __walk2_vectors_or_scalars with a customized vectorization path. +template +struct __brick_swap : public walk_vector_or_scalar_base<_Range1, _Range2> +{ + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; + _F __f; + std::size_t __n; + + public: + __brick_swap(_F __f, std::size_t __n) : __f(std::move(__f)), __n(__n) {} + + template + void + __vector_path_impl(_IsFull __is_full, const std::size_t __idx, _Range1 __rng1, _Range2 __rng2) const + { + // Copies are used in the vector path of swap due to the restriction to fundamental types. + using _ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + _ValueType __rng_vector[__base_t::__preferred_vector_size]; + typename __base_t::__vec_load_t __vec_load{__n}; + typename __base_t::__vec_store_t __vec_store{__n}; + // 1. Load elements from __rng1. 
+ __vec_load(__is_full, __idx, oneapi::dpl::__par_backend_hetero::__scalar_load_op{}, __rng1, __rng_vector); + // 2. Swap the __rng1 elements in the vector with __rng2 elements from global memory. Note the store operation + // updates __rng_vector due to the swap functor. + __vec_store(__is_full, __idx, oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op<_F>{__f}, + __rng_vector, __rng2); + // 3. Store __rng2 elements in the vector into __rng1. + __vec_store( + __is_full, __idx, + oneapi::dpl::__par_backend_hetero::__scalar_store_transform_op{}, + __rng_vector, __rng1); + } + + template + void + __scalar_path_impl(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + __f(__rng1[__idx], __rng2[__idx]); + } + + template + void + operator()(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); + } +}; + } // namespace unseq_backend } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h index 4d16ae4508b..23b39c055c7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h @@ -18,8 +18,12 @@ #include #include +#if _ONEDPL_CPP20_RANGES_PRESENT && _ONEDPL_CPP20_CONCEPTS_PRESENT +#include // std::ranges::contiguous_range +#endif #include "../../utils_ranges.h" +#include "../../ranges_defs.h" // contiguous_range and contiguous_iterator from nanorange #include "../../iterator_impl.h" #include "sycl_iterator.h" #include "sycl_defs.h" @@ -754,6 +758,55 @@ __select_backend(const execution::fpga_policy<_Factor, _KernelName>&, _Ranges&&. } #endif +// TODO: At some point with C++20, we should implement this with concepts to more easily support the less common edge +// cases (e.g. a pipe over a counting iterator which is non-contiguous). For now, vectorization is primarily based on +// range contiguity with specialization for internal views and guard views over internal iterators. +// The following cases enable vectorization for a range: +// 1. With C++20 concepts, the range satisfies std::ranges::contiguous_range. +// 2. With C++17 nanoranges, the range satisfies __nanorange::nano::ranges::contiguous_range. Note that a view over +// a SYCL buffer satisfies this concept along with pipe views that maintain access contiguity. +// 3. The range is a guard view over an iterator with no global memory access: counting_iterator and discard_iterator +// 4. 
The range is one of our internal, vectorizable range types: drop_view_simple, take_view_simple, or +// transform_view_simple + +// Base case: check contiguous range properties +template +struct __is_vectorizable_range +{ + constexpr static bool value = +#if _ONEDPL_CPP20_RANGES_PRESENT && _ONEDPL_CPP20_CONCEPTS_PRESENT + std::ranges::contiguous_range<_Rng> || +#endif + __nanorange::nano::ranges::contiguous_range<_Rng>; +}; + +// Basic guard view specializations - views which are not contiguous but do not interact with global memory +template +struct __is_vectorizable_range>> : std::true_type +{ +}; + +template <> +struct __is_vectorizable_range> : std::true_type +{ +}; + +// Recursive view specializations - internal views which we need to search inwards to identify if it is vectorizable +template +struct __is_vectorizable_range> : __is_vectorizable_range<_Rng> +{ +}; + +template +struct __is_vectorizable_range> : __is_vectorizable_range<_Rng> +{ +}; + +template +struct __is_vectorizable_range> : __is_vectorizable_range<_Rng> +{ +}; + } // namespace __ranges } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h index 6970f57f4d7..937608bfc39 100644 --- a/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h @@ -143,7 +143,9 @@ __pattern_histogram(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Rando auto __init_event = oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__hist_fill_zeros_wrapper>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, decltype(__fill_func)>{__fill_func}, __num_bins, __bins); + unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, decltype(__fill_func), decltype(__bins)>{ + __fill_func, static_cast(__num_bins)}, + __num_bins, __bins); if (__n > 0) { diff --git a/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h index d040e828eef..cf9533f8113 100644 --- a/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h @@ -263,9 +263,12 @@ __pattern_adjacent_difference(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy& oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator2>(); auto __buf2 = __keep2(__d_first, __d_last); - using _Function = unseq_backend::walk_adjacent_difference<_ExecutionPolicy, decltype(__fn)>; + using _Function = + unseq_backend::walk_adjacent_difference<_ExecutionPolicy, decltype(__fn), decltype(__buf1.all_view()), + decltype(__buf2.all_view())>; - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, __exec, _Function{__fn}, __n, + oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, __exec, + _Function{__fn, static_cast(__n)}, __n, __buf1.all_view(), __buf2.all_view()) .__deferrable_wait(); } diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 9c32178a78c..b0e41931482 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -25,6 +25,7 @@ #include #include #include +#include #if _ONEDPL_BACKEND_SYCL # include "hetero/dpcpp/sycl_defs.h" @@ -784,6 +785,35 @@ union __lazy_ctor_storage } }; +// To implement __min_nested_type_size, a general utility with an internal tuple +// specialization, we need to forward declare our internal tuple first as tuple_impl.h +// 
already includes this header. +template +struct tuple; + +// Returns the smallest type within a set of potentially nested template types. This function +// recursively explores std::tuple and oneapi::dpl::__internal::tuple for the smallest type. +// For all other types, its size is used directly. +// E.g. If we consider the type: T = tuple, int, double>, +// then __min_nested_type_size::value returns sizeof(short). +template +struct __min_nested_type_size +{ + constexpr static std::size_t value = sizeof(_T); +}; + +template +struct __min_nested_type_size> +{ + constexpr static std::size_t value = std::min({__min_nested_type_size<_Ts>::value...}); +}; + +template +struct __min_nested_type_size> +{ + constexpr static std::size_t value = std::min({__min_nested_type_size<_Ts>::value...}); +}; + } // namespace __internal } // namespace dpl } // namespace oneapi diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index da7f0ad5390..fe308318cad 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -45,10 +45,12 @@ test_device_copyable() static_assert(sycl::is_device_copyable_v, "constant_iterator_device_copyable is not device copyable"); - //custom_brick - static_assert(sycl::is_device_copyable_v>, - "custom_brick is not device copyable with device copyable types"); + //__custom_brick + static_assert( + sycl::is_device_copyable_v< + oneapi::dpl::internal::__custom_brick>, + "__custom_brick is not device copyable with device copyable types"); //replace_if_fun static_assert( sycl::is_device_copyable_v>, @@ -76,11 +78,23 @@ test_device_copyable() static_assert(sycl::is_device_copyable_v< oneapi::dpl::unseq_backend::walk_n>, "walk_n is not device copyable with device copyable types"); + //walk1_vector_or_scalar + static_assert(sycl::is_device_copyable_v>, + "walk1_vector_or_scalar is not device copyable with device copyable types"); + //walk2_vectors_or_scalars + static_assert(sycl::is_device_copyable_v>, + "walk2_vectors_or_scalars is not device copyable with device copyable types"); + //walk3_vectors_or_scalars + static_assert(sycl::is_device_copyable_v>, + "walk3_vectors_or_scalars is not device copyable with device copyable types"); //walk_adjacent_difference - static_assert( - sycl::is_device_copyable_v< - oneapi::dpl::unseq_backend::walk_adjacent_difference>, - "walk_adjacent_difference is not device copyable with device copyable types"); + static_assert(sycl::is_device_copyable_v>, + "walk_adjacent_difference is not device copyable with device copyable types"); //transform_reduce static_assert( sycl::is_device_copyable_v< @@ -148,8 +162,8 @@ test_device_copyable() int_device_copyable, int_device_copyable, std::true_type>>, "__brick_set_op is not device copyable with device copyable types"); // __brick_reduce_idx - static_assert(sycl::is_device_copyable_v< - oneapi::dpl::unseq_backend::__brick_reduce_idx>, + static_assert(sycl::is_device_copyable_v>, "__brick_reduce_idx is not device copyable with device copyable types"); //__gen_transform_input @@ -310,11 +324,11 @@ test_non_device_copyable() static_assert(!sycl::is_device_copyable_v, "iterator is device copyable"); static_assert(!sycl::is_device_copyable_v, "range_non_device_copyable is device copyable"); - //custom_brick - static_assert( - !sycl::is_device_copyable_v>, - "custom_brick is device copyable with non device copyable types"); + //__custom_brick + 
static_assert(!sycl::is_device_copyable_v>, + "__custom_brick is device copyable with non device copyable types"); //replace_if_fun static_assert(!sycl::is_device_copyable_v< oneapi::dpl::internal::replace_if_fun>, @@ -343,10 +357,26 @@ test_non_device_copyable() static_assert(!sycl::is_device_copyable_v< oneapi::dpl::unseq_backend::walk_n>, "walk_n is device copyable with non device copyable types"); + //walk1_vector_or_scalar + static_assert(!sycl::is_device_copyable_v>, + "walk1_vector_or_scalar is device copyable with non device copyable types"); + //walk2_vectors_or_scalars + static_assert( + !sycl::is_device_copyable_v< + oneapi::dpl::unseq_backend::walk2_vectors_or_scalars>, + "walk2_vectors_or_scalars is device copyable with non device copyable types"); + //walk3_vectors_or_scalars + static_assert(!sycl::is_device_copyable_v>, + "walk3_vectors_or_scalars is device copyable with non device copyable types"); //walk_adjacent_difference static_assert( !sycl::is_device_copyable_v< - oneapi::dpl::unseq_backend::walk_adjacent_difference>, + oneapi::dpl::unseq_backend::walk_adjacent_difference>, "walk_adjacent_difference is device copyable with non device copyable types"); //transform_reduce static_assert( @@ -415,8 +445,8 @@ test_non_device_copyable() int_device_copyable, int_device_copyable, std::true_type>>, "__brick_set_op is device copyable with non device copyable types"); //__brick_reduce_idx - static_assert(!sycl::is_device_copyable_v< - oneapi::dpl::unseq_backend::__brick_reduce_idx>, + static_assert(!sycl::is_device_copyable_v>, "__brick_reduce_idx is device copyable with non device copyable types"); //__gen_transform_input diff --git a/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse.pass.cpp index 13f03fdb3dd..41afe0693e1 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse.pass.cpp @@ -69,7 +69,7 @@ template void test() { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); Sequence actual(max_len); @@ -99,6 +99,7 @@ int main() { test(); + test(); test(); test(); test>(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse_copy.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse_copy.pass.cpp index 44f9dc6fe7c..cbe63db7e47 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse_copy.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse_copy.pass.cpp @@ -82,7 +82,7 @@ template void test() { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); Sequence actual(max_len); Sequence data(max_len, [](::std::size_t i) { return T1(i); }); @@ -100,6 +100,7 @@ main() { // clang-3.8 fails to correctly auto vectorize the loop in some cases of different types of container's elements, // for example: std::int32_t and std::int8_t. This issue isn't detected for clang-3.9 and newer versions. 
+ test(); test(); test(); test(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/copy_move.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/copy_move.pass.cpp index 9ad4cfc13bc..7dfc95b1485 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/copy_move.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/copy_move.pass.cpp @@ -125,8 +125,9 @@ template void test(T trash, Convert convert) { + size_t max_n = TestUtils::get_pattern_for_max_n(); // Try sequences of various lengths. - for (size_t n = 0; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + for (size_t n = 0; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { // count is number of output elements, plus a handful // more for sake of detecting buffer overruns. @@ -166,6 +167,9 @@ main() test(-666, [](size_t j) { return std::int32_t(j); }); test(-666.0, [](size_t j) { return float64_t(j); }); + test(42, [](size_t j) { return std::uint16_t(j); }); + test(42, [](size_t j) { return std::uint8_t(j); }); + #if !TEST_DPCPP_BACKEND_PRESENT /*TODO: copy support of a class with no default constructor*/ test>(Wrapper(-666.0), [](std::int32_t j) { return Wrapper(j); }); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/fill.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/fill.pass.cpp index 66c8e321627..6c707b62079 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/fill.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/fill.pass.cpp @@ -101,10 +101,12 @@ int main() { - const ::std::size_t N = 100000; + const std::size_t N = TestUtils::get_pattern_for_max_n(); for (::std::size_t n = 0; n < N; n = n < 16 ? n + 1 : size_t(3.1415 * n)) { + test_fill_by_type(n); + test_fill_by_type(n); test_fill_by_type(n); test_fill_by_type(n); } diff --git a/test/parallel_api/algorithm/alg.modifying.operations/generate.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/generate.pass.cpp index 3ee13107011..bcd65426526 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/generate.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/generate.pass.cpp @@ -82,7 +82,8 @@ template void test_generate_by_type() { - for (size_t n = 0; n <= 100000; n = n < 16 ? n + 1 : size_t(3.1415 * n)) + size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 0; n <= max_n; n = n < 16 ? 
n + 1 : size_t(3.1415 * n)) { Sequence in(n, [](size_t) -> T { return T(0); }); //fill by zero @@ -123,6 +124,7 @@ struct test_non_const_generate_n int main() { + test_generate_by_type(); test_generate_by_type(); test_generate_by_type(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/replace.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/replace.pass.cpp index 92820d2012f..91291851dcb 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/replace.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/replace.pass.cpp @@ -114,7 +114,7 @@ template void test(Pred pred) { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); const T1 value = T1(0); const T1 new_value = T1(666); @@ -162,6 +162,7 @@ int main() { test(oneapi::dpl::__internal::__equal_value(666)); + test([](const std::uint8_t& elem) { return elem % 3 < 2; }); test([](const std::uint16_t& elem) { return elem % 3 < 2; }); test([](const float64_t& elem) { return elem * elem - 3.5 * elem > 10; }); //test([](const copy_int& val) { return val.value / 5 > 2; }); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/replace_copy.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/replace_copy.pass.cpp index 50851e3d299..b89c3aa3f40 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/replace_copy.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/replace_copy.pass.cpp @@ -75,8 +75,9 @@ template void test(T trash, const T& old_value, const T& new_value, Predicate pred, Convert convert) { + const size_t max_n = TestUtils::get_pattern_for_max_n(); // Try sequences of various lengths. - for (size_t n = 0; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + for (size_t n = 0; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { Sequence in(n, [&](size_t k) -> T { return convert(n ^ k); }); Sequence out(n, [=](size_t) { return trash; }); @@ -123,6 +124,10 @@ main() test(-666, 42, 99, [](const std::int32_t& x) { return x != 42; }, [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? 42 : -1 - std::int32_t(j); }); + test(123, 42, 99, [](const std::uint8_t& x) { return x != 42; }, + [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? 42 : 255; }); + + #if !TEST_DPCPP_BACKEND_PRESENT test(Number(42, OddTag()), Number(2001, OddTag()), Number(2017, OddTag()), IsMultiple(3, OddTag()), [](std::int32_t j) { return ((j + 1) % 3 & 2) != 0 ? 
Number(2001, OddTag()) : Number(j, OddTag()); }); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/rotate.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/rotate.pass.cpp index 2147999ce3f..a6f4600c2a8 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/rotate.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/rotate.pass.cpp @@ -132,7 +132,7 @@ template void test() { - const std::int32_t max_len = 100000; + const std::int32_t max_len = TestUtils::get_pattern_for_max_n(); Sequence actual(max_len, [](::std::size_t i) { return T(i); }); Sequence data(max_len, [](::std::size_t i) { return T(i); }); @@ -154,6 +154,8 @@ test() int main() { + test(); + test(); test(); #if !TEST_DPCPP_BACKEND_PRESENT test>(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/rotate_copy.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/rotate_copy.pass.cpp index edf5f56651c..1df986deb6a 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/rotate_copy.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/rotate_copy.pass.cpp @@ -100,7 +100,7 @@ void test() { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); Sequence actual(max_len, [](::std::size_t i) { return T1(i); }); @@ -125,6 +125,8 @@ test() int main() { + test(); + test(); test(); test(); test(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp index c15a493a639..d12cc8ba107 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp @@ -1,5 +1,5 @@ // -*- C++ -*- -//===-- shift_left.pass.cpp -----------------------------------------------===// +//===-- shift_left_right.pass.cpp -----------------------------------------------===// // // Copyright (C) Intel Corporation // @@ -40,8 +40,11 @@ #include // omp_get_max_threads, omp_set_num_threads #endif -template -struct USM; +template +struct USMKernelName; + +template +struct BufferKernelName; struct test_shift { @@ -80,7 +83,9 @@ struct test_shift TestUtils::usm_data_transfer dt_helper(queue, first, m); auto ptr = dt_helper.get_data(); - auto het_res = algo(TestUtils::make_device_policy>(::std::forward(exec)), ptr, ptr + m, n); + auto het_res = + algo(TestUtils::make_device_policy>(std::forward(exec)), ptr, + ptr + m, n); _DiffType res_idx = het_res - ptr; //3.2 check result @@ -95,14 +100,14 @@ struct test_shift operator()(Policy&& exec, It first, typename ::std::iterator_traits::difference_type m, It first_exp, typename ::std::iterator_traits::difference_type n, Algo algo) { + using _ValueType = typename std::iterator_traits::value_type; + using _DiffType = typename std::iterator_traits::difference_type; + auto buffer_policy = TestUtils::make_device_policy>(exec); //1.1 run a test with hetero policy and host itertors - auto res = algo(::std::forward(exec), first, first + m, n); + auto res = algo(buffer_policy, first, first + m, n); //1.2 check result algo.check(res, first, m, first_exp, n); - using _ValueType = typename ::std::iterator_traits::value_type; - using _DiffType = typename ::std::iterator_traits::difference_type; - //2.1 run a test with hetero policy and hetero itertors _DiffType res_idx(0); {//scope for SYCL buffer lifetime @@ -112,7 +117,7 @@ struct test_shift auto het_begin = 
oneapi::dpl::begin(buf); - auto het_res = algo(::std::forward(exec), het_begin, het_begin + m, n); + auto het_res = algo(buffer_policy, het_begin, het_begin + m, n); res_idx = het_res - het_begin; } //2.2 check result @@ -121,7 +126,7 @@ struct test_shift #if _PSTL_SYCL_TEST_USM //3. run a test with hetero policy and USM shared/device memory pointers test_usm(exec, first, m, first_exp, n, algo); - test_usm(exec, first, m, first_exp, n, algo); + test_usm(std::forward(exec), first, m, first_exp, n, algo); #endif } #endif @@ -208,10 +213,10 @@ test_shift_by_type(Size m, Size n) TestUtils::Sequence in(m, [](::std::size_t v) -> T { return T(v); }); //fill data #ifdef _PSTL_TEST_SHIFT_LEFT - TestUtils::invoke_on_all_policies<0>()(test_shift(), in.begin(), m, orig.begin(), n, shift_left_algo{}); + TestUtils::invoke_on_all_policies()(test_shift(), in.begin(), m, orig.begin(), n, shift_left_algo{}); #endif #ifdef _PSTL_TEST_SHIFT_RIGHT - TestUtils::invoke_on_all_policies<1>()(test_shift(), in.begin(), m, orig.begin(), n, shift_right_algo{}); + TestUtils::invoke_on_all_policies()(test_shift(), in.begin(), m, orig.begin(), n, shift_right_algo{}); #endif } @@ -233,6 +238,16 @@ main() { test_shift_by_type(m, n); } +#if TEST_DPCPP_BACKEND_PRESENT + // Test both paths of the vectorized implementation in the SYCL backend. Use shift factors that will not divide + // into the vector size to assess edge case handling. + const std::size_t large_n = 1000000; + const std::size_t quarter_shift = 250111; + const std::size_t three_quarters_shift = 750203; + test_shift_by_type(large_n, quarter_shift); + test_shift_by_type(three_quarters_shift, large_n); + test_shift_by_type(large_n, quarter_shift); +#endif return TestUtils::done(); } diff --git a/test/parallel_api/algorithm/alg.modifying.operations/swap_ranges.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/swap_ranges.pass.cpp index afd10f55f50..f23ea69e05b 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/swap_ranges.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/swap_ranges.pass.cpp @@ -115,7 +115,7 @@ template void test() { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); Sequence data(max_len); Sequence actual(max_len); @@ -132,6 +132,7 @@ main() { test>(); test>(); + test(); test(); test(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/transform_binary.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/transform_binary.pass.cpp index 113c24be7c8..5deae6ea667 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/transform_binary.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/transform_binary.pass.cpp @@ -104,7 +104,7 @@ test(Predicate pred, _IteratorAdapter adap = {}) #if PSTL_USE_DEBUG && ONEDPL_USE_OPENMP_BACKEND 10000; #else - 100000; + TestUtils::get_pattern_for_max_n(); #endif for (size_t n = 0; n <= max_n; n = n <= 16 ? 
n + 1 : size_t(3.1415 * n)) { @@ -153,5 +153,7 @@ main() //test case for zip iterator test<50, std::int32_t, std::int32_t, std::int32_t>(TheOperationZip(1), _ZipIteratorAdapter{}); + test<60, std::uint16_t, std::uint16_t, std::int32_t>(TheOperation(1)); + return done(); } diff --git a/test/parallel_api/algorithm/alg.modifying.operations/transform_unary.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/transform_unary.pass.cpp index 20c1403f6b9..19fe906935c 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/transform_unary.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/transform_unary.pass.cpp @@ -84,7 +84,8 @@ template <::std::size_t CallNumber, typename Tin, typename Tout, typename _Op = void test() { - for (size_t n = 0; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 0; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { Sequence in(n, [](std::int32_t k) { return k % 5 != 1 ? 3 * k - 7 : 0; }); diff --git a/test/parallel_api/algorithm/alg.nonmodifying/for_each.pass.cpp b/test/parallel_api/algorithm/alg.nonmodifying/for_each.pass.cpp index 526c1c45fcf..c23fd3013de 100644 --- a/test/parallel_api/algorithm/alg.nonmodifying/for_each.pass.cpp +++ b/test/parallel_api/algorithm/alg.nonmodifying/for_each.pass.cpp @@ -85,7 +85,8 @@ template void test() { - for (size_t n = 0; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + const size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 0; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { Sequence in_out(n, Gen()); Sequence expected(n, Gen()); diff --git a/test/parallel_api/algorithm/alg.nonmodifying/transform_if.pass.cpp b/test/parallel_api/algorithm/alg.nonmodifying/transform_if.pass.cpp index 973ca3658f8..97198b72f76 100644 --- a/test/parallel_api/algorithm/alg.nonmodifying/transform_if.pass.cpp +++ b/test/parallel_api/algorithm/alg.nonmodifying/transform_if.pass.cpp @@ -162,10 +162,11 @@ void test() { const ::std::int64_t init_val = 999; - for (size_t n = 1; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + const size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 1; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { { - Sequence<_Type> in1(n, [=](size_t k) { return (3 * k); }); + Sequence<_Type> in1(n, [=](size_t k) { return (3 * k) % std::numeric_limits<_Type>::max(); }); Sequence<_Type> in2(n, [=](size_t k) { return k % 2 == 0 ? 1 : 0; }); Sequence<_Type> out(n, [=](size_t) { return init_val; }); @@ -178,7 +179,7 @@ test() #endif } { - Sequence<_Type> in1(n, [=](size_t k) { return k; }); + Sequence<_Type> in1(n, [=](size_t k) { return k % std::numeric_limits<_Type>::max(); }); Sequence<_Type> out(n, [=](size_t) { return init_val; }); invoke_on_all_policies<2>()(test_transform_if_unary<_Type>(), in1.begin(), in1.end(), out.begin(), @@ -196,10 +197,11 @@ void test_inplace() { const ::std::int64_t init_val = 999; - for (size_t n = 1; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + const size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 1; n <= max_n; n = n <= 16 ? 
n + 1 : size_t(3.1415 * n)) { { - Sequence<_Type> in1(n, [=](size_t k) { return k; }); + Sequence<_Type> in1(n, [=](size_t k) { return k % std::numeric_limits<_Type>::max(); }); Sequence<_Type> out(n, [=](size_t) { return 0; }); invoke_on_all_policies<4>()(test_transform_if_unary_inplace<_Type>(), in1.begin(), in1.end(), out.begin(), @@ -211,11 +213,15 @@ test_inplace() int main() { - test<::std::int32_t>(); - test<::std::int64_t>(); - - test_inplace<::std::int32_t>(); - test_inplace<::std::int64_t>(); + test(); + test(); + test(); + test(); + + test_inplace(); + test_inplace(); + test_inplace(); + test_inplace(); return done(); } diff --git a/test/parallel_api/numeric/numeric.ops/adjacent_difference.pass.cpp b/test/parallel_api/numeric/numeric.ops/adjacent_difference.pass.cpp index a617d3205d0..08b93265541 100644 --- a/test/parallel_api/numeric/numeric.ops/adjacent_difference.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/adjacent_difference.pass.cpp @@ -139,7 +139,7 @@ template void test(Pred pred) { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); const T2 value = T2(77); const T1 trash = T1(31); @@ -165,6 +165,7 @@ int main() { test([](std::uint32_t a, std::uint32_t b) { return a - b; }); + test([](std::uint16_t a, std::uint16_t b) { return a > b ? a - b : b - a; }); test([](std::int64_t a, std::int64_t b) { return a / (b + 1); }); test([](float32_t a, float32_t b) { return (a + b) / 2; }); #if !TEST_DPCPP_BACKEND_PRESENT diff --git a/test/support/utils.h b/test/support/utils.h index 666d5ca3646..af4dfdc8a61 100644 --- a/test/support/utils.h +++ b/test/support/utils.h @@ -1013,6 +1013,29 @@ generate_arithmetic_data(T* input, std::size_t size, std::uint32_t seed) input[j] = input[i]; } } + +// Utility that models __estimate_best_start_size in the SYCL backend parallel_for +// to ensure large enough inputs are used to test the large submitter path. +// A multiplier to the max n is added to ensure we get a few separate test inputs for +// this path. +std::size_t +get_pattern_for_max_n() +{ +#if TEST_DPCPP_BACKEND_PRESENT + sycl::queue q = TestUtils::get_test_queue(); + sycl::device d = q.get_device(); + constexpr std::size_t max_iters_per_item = 16; + constexpr std::size_t multiplier = 4; + constexpr std::size_t max_work_group_size = 512; + std::size_t __max_n = multiplier * max_iters_per_item * max_work_group_size * + d.get_info(); + __max_n = std::min(std::size_t{10000000}, __max_n); + return __max_n; +#else + return TestUtils::max_n; +#endif +} + } /* namespace TestUtils */ #endif // _UTILS_H
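The following host-side sketches illustrate, outside of the patch itself, the techniques the new vectorized bricks rely on; every helper name used in them (kVecSize, transform_chunk, and so on) is an illustrative stand-in rather than a oneDPL symbol. This first sketch models the net effect of the walk1_vector_or_scalar / walk2_vectors_or_scalars pattern for a transform-style operation: a chunk of up to __preferred_vector_size elements is loaded into a small local array, the operation is applied, and the results are written back, with a shortened tail when the chunk is not full.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

constexpr std::size_t kVecSize = 4; // illustrative stand-in for __preferred_vector_size

// Net effect of a transform-style walk brick on one chunk: load up to kVecSize
// elements into a small local array, apply the operation, store the results back.
template <typename T, typename UnaryOp>
void transform_chunk(const T* in, T* out, std::size_t idx, std::size_t n, UnaryOp op)
{
    T chunk[kVecSize];
    const std::size_t elems = std::min(kVecSize, n - idx); // shortened tail when the chunk is not full
    for (std::size_t i = 0; i < elems; ++i)                 // 1. load the chunk
        chunk[i] = in[idx + i];
    for (std::size_t i = 0; i < elems; ++i)                 // 2. transform and store
        out[idx + i] = op(chunk[i]);
}

int main()
{
    std::vector<int> in{1, 2, 3, 4, 5, 6, 7};
    std::vector<int> out(in.size());
    for (std::size_t idx = 0; idx < in.size(); idx += kVecSize)
        transform_chunk(in.data(), out.data(), idx, in.size(), [](int x) { return 2 * x; });
    for (int v : out)
        std::cout << v << ' '; // 2 4 6 8 10 12 14
    std::cout << '\n';
}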
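walk_adjacent_difference stages __preferred_vector_size + 1 values so that every element in a chunk has its left neighbour available locally, and writes a dummy first output that is then overwritten. The sketch below models that staging with plain loops standing in for __vector_load / __vector_store; it is a simplified host-side model, not the brick itself.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

constexpr std::size_t kVecSize = 4; // illustrative stand-in for __preferred_vector_size

void adjacent_difference_chunk(const std::vector<int>& in, std::vector<int>& out, std::size_t idx)
{
    const std::size_t n = in.size();
    const std::size_t elems = std::min(kVecSize, n - idx);
    int staged[kVecSize + 1];
    staged[0] = (idx != 0) ? in[idx - 1] : in[0]; // scalar load of the left neighbour
    for (std::size_t i = 0; i < elems; ++i)       // chunk load
        staged[i + 1] = in[idx + i];
    for (std::size_t i = 0; i < elems; ++i)       // difference of each element and its neighbour
        out[idx + i] = staged[i + 1] - staged[i];
    if (idx == 0)
        out[0] = staged[1]; // first element is copied, overwriting the dummy difference
}

int main()
{
    std::vector<int> in{3, 7, 12, 20, 21, 30};
    std::vector<int> out(in.size());
    for (std::size_t idx = 0; idx < in.size(); idx += kVecSize)
        adjacent_difference_chunk(in, out, idx);
    for (int v : out)
        std::cout << v << ' '; // 3 4 5 8 1 9
    std::cout << '\n';
}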
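__rotate_copy vectorizes its loads only when the wraparound point lies beyond the current chunk; otherwise it splits the load into a tail-then-head pair of loops. A minimal host-side model of that split, under the same illustrative kVecSize assumption:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

constexpr std::size_t kVecSize = 4; // illustrative stand-in for __preferred_vector_size

void rotate_copy_chunk(const std::vector<int>& in, std::vector<int>& out, std::size_t idx, std::size_t shift)
{
    const std::size_t n = in.size();
    const std::size_t elems = std::min(kVecSize, n - idx);
    const std::size_t wrapped = (shift + idx) % n;
    int chunk[kVecSize];
    if (wrapped + elems <= n) // the wrap point is beyond this chunk: contiguous load
    {
        for (std::size_t i = 0; i < elems; ++i)
            chunk[i] = in[wrapped + i];
    }
    else // split load: tail of the input, then its head
    {
        const std::size_t loop1 = n - wrapped;
        for (std::size_t i = 0; i < loop1; ++i)
            chunk[i] = in[wrapped + i];
        for (std::size_t j = 0; loop1 + j < elems; ++j)
            chunk[loop1 + j] = in[j];
    }
    for (std::size_t i = 0; i < elems; ++i) // contiguous store of the rotated chunk
        out[idx + i] = chunk[i];
}

int main()
{
    std::vector<int> in{0, 1, 2, 3, 4, 5, 6};
    std::vector<int> out(in.size());
    const std::size_t shift = 5;
    for (std::size_t idx = 0; idx < in.size(); idx += kVecSize)
        rotate_copy_chunk(in, out, idx, shift);
    for (int v : out)
        std::cout << v << ' '; // 5 6 0 1 2 3 4
    std::cout << '\n';
}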
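The vectorization gate added to utils_ranges_sycl.h reduces, in its base case, to range contiguity. Below is a minimal C++20 sketch of that base case only, assuming std::ranges::contiguous_range and omitting the nanorange fallback, the guard-view whitelist, and the recursive specializations for the internal views.

#include <list>
#include <ranges>
#include <span>
#include <vector>

// Simplified contiguity check; the real trait also whitelists non-contiguous views
// that never touch global memory (counting/discard iterators) and recurses into
// drop_view_simple, take_view_simple, and transform_view_simple.
template <typename Rng>
inline constexpr bool is_vectorizable_range_v = std::ranges::contiguous_range<Rng>;

static_assert(is_vectorizable_range_v<std::vector<int>>);
static_assert(is_vectorizable_range_v<std::span<const float>>);
static_assert(!is_vectorizable_range_v<std::list<int>>);

int main() { return 0; }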
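__min_nested_type_size can be exercised entirely at compile time. A trimmed-down sketch that handles only std::tuple (the real utility also specializes oneapi::dpl::__internal::tuple) reproduces the example given in its comment:

#include <algorithm>
#include <cstddef>
#include <tuple>

// Smallest sizeof() among a type and, recursively, the members of any nested std::tuple.
template <typename T>
struct min_nested_type_size
{
    constexpr static std::size_t value = sizeof(T);
};

template <typename... Ts>
struct min_nested_type_size<std::tuple<Ts...>>
{
    constexpr static std::size_t value = std::min({min_nested_type_size<Ts>::value...});
};

// Matches the documented example: the smallest nested type is short.
static_assert(min_nested_type_size<std::tuple<std::tuple<short, long>, int, double>>::value == sizeof(short));

int main() { return 0; }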
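get_pattern_for_max_n scales the maximum test size with the device's compute units so the large-submitter path is reached. A small arithmetic sketch of that sizing, assuming a hypothetical device with 32 compute units instead of querying SYCL:

#include <algorithm>
#include <cstddef>
#include <iostream>

int main()
{
    constexpr std::size_t max_iters_per_item = 16;
    constexpr std::size_t multiplier = 4;
    constexpr std::size_t max_work_group_size = 512;
    const std::size_t compute_units = 32; // hypothetical device value, not queried here
    std::size_t max_n = multiplier * max_iters_per_item * max_work_group_size * compute_units;
    max_n = std::min(std::size_t{10000000}, max_n); // cap mirrors the utility above
    std::cout << "max_n = " << max_n << '\n';        // 1048576 for this example device
}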