uxlfoundation · dmitriy-sobolev · Jan 22, 2025 · Jan 6, 2025
diff --git a/include/oneapi/dpl/internal/scan_by_segment_impl.h b/include/oneapi/dpl/internal/scan_by_segment_impl.h
@@ -168,7 +168,7 @@ struct __sycl_scan_by_segment_impl
             __cgh.use_kernel_bundle(__seg_scan_wg_kernel.get_kernel_bundle());
 #endif
             __cgh.parallel_for<_SegScanWgKernel>(
-#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
+#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
                 __seg_scan_wg_kernel,
 #endif
                 sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
@@ -272,7 +272,7 @@ struct __sycl_scan_by_segment_impl
                 __cgh.use_kernel_bundle(__seg_scan_prefix_kernel.get_kernel_bundle());
 #endif
                 __cgh.parallel_for<_SegScanPrefixKernel>(
-#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
+#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
                     __seg_scan_prefix_kernel,
 #endif
                     sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -330,7 +330,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
             __cgh.use_kernel_bundle(__kernel_1.get_kernel_bundle());
 #endif
             __cgh.parallel_for<_LocalScanKernel>(
-#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
+#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
                 __kernel_1,
 #endif
                 sycl::nd_range<1>(__n_groups * __wgroup_size, __wgroup_size), [=](sycl::nd_item<1> __item) {
@@ -351,7 +351,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
                 __cgh.use_kernel_bundle(__kernel_2.get_kernel_bundle());
 #endif
                 __cgh.parallel_for<_GroupScanKernel>(
-#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
+#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
                     __kernel_2,
 #endif
                     // TODO: try to balance work between several workgroups instead of one

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
@@ -202,7 +202,7 @@ __radix_sort_count_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments, :
         __hdl.use_kernel_bundle(__kernel.get_kernel_bundle());
 #endif
         __hdl.parallel_for<_KernelName>(
-#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
+#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
             __kernel,
 #endif
             sycl::nd_range<1>(__segments * __wg_size, __wg_size), [=](sycl::nd_item<1> __self_item) {
@@ -303,7 +303,7 @@ __radix_sort_scan_submit(_ExecutionPolicy&& __exec, ::std::size_t __scan_wg_size
         __hdl.use_kernel_bundle(__kernel.get_kernel_bundle());
 #endif
         __hdl.parallel_for<_KernelName>(
-#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
+#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
             __kernel,
 #endif
             sycl::nd_range<1>(__radix_states * __scan_wg_size, __scan_wg_size), [=](sycl::nd_item<1> __self_item) {
@@ -548,7 +548,7 @@ __radix_sort_reorder_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments,
         __hdl.use_kernel_bundle(__kernel.get_kernel_bundle());
 #endif
         __hdl.parallel_for<_KernelName>(
-#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
+#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
             __kernel,
 #endif
             //Each SYCL work group processes one data segment.

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
@@ -374,7 +374,7 @@ struct __parallel_transform_reduce_impl
                 __cgh.use_kernel_bundle(__kernel.get_kernel_bundle());
 #endif
                 __cgh.parallel_for<_ReduceKernel>(
-#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
+#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
                     __kernel,
 #endif
                     sycl::nd_range<1>(sycl::range<1>(__n_groups * __work_group_size),

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h
@@ -169,7 +169,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
 #endif
         __cgh.parallel_for<_SegReduceCountKernel>(
             sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](
-#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
+#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
                                                                               __seg_reduce_count_kernel,
 #endif
                                                                               sycl::nd_item<1> __item) {
@@ -206,7 +206,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
         __cgh.use_kernel_bundle(__seg_reduce_offset_kernel.get_kernel_bundle());
 #endif
         __cgh.parallel_for<_SegReduceOffsetKernel>(
-#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
+#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
             __seg_reduce_offset_kernel,
 #endif
             sycl::nd_range<1>{__wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
@@ -229,7 +229,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
         __cgh.use_kernel_bundle(__seg_reduce_wg_kernel.get_kernel_bundle());
 #endif
         __cgh.parallel_for<_SegReduceWgKernel>(
-#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
+#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
             __seg_reduce_wg_kernel,
 #endif
             sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
@@ -352,7 +352,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
             __cgh.use_kernel_bundle(__seg_reduce_prefix_kernel.get_kernel_bundle());
 #endif
             __cgh.parallel_for<_SegReducePrefixKernel>(
-#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
+#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
                 __seg_reduce_prefix_kernel,
 #endif
                 sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -255,6 +255,7 @@ using __kernel_name_generator =
     _BaseName<_CustomName>;
 #endif
 
+#if _ONEDPL_COMPILE_KERNEL
 template <typename... _KernelNames>
 class __kernel_compiler
 {
@@ -287,7 +288,7 @@ class __kernel_compiler
     {
         return __kernel_array_type{__kernel_bundle.get_kernel(__kernel_ids[_Ip])...};
     }
-#else
+#elif _ONEDPL_LIBSYCL_PROGRAM_PRESENT
     template <typename _Exec>
     static auto
     __compile(_Exec&& __exec)
@@ -300,6 +301,7 @@ class __kernel_compiler
     }
 #endif
 };
+#endif // _ONEDPL_COMPILE_KERNEL
 
 #if _ONEDPL_DEBUG_SYCL
 template <typename _Policy>

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h
@@ -42,6 +42,10 @@
 #endif
 #define _ONEDPL_LIBSYCL_VERSION_LESS_THAN(_VERSION) (_ONEDPL_LIBSYCL_VERSION && _ONEDPL_LIBSYCL_VERSION < _VERSION)
 
+#if defined(ACPP_VERSION_MAJOR) && defined(ACPP_VERSION_MINOR) && defined(ACPP_VERSION_PATCH)
+#    define _ONEDPL_ACPP_VERSION (ACPP_VERSION_MAJOR * 10000 + ACPP_VERSION_MINOR * 100 + ACPP_VERSION_PATCH)
+#endif
+
 #if _ONEDPL_FPGA_DEVICE
 #    if _ONEDPL_LIBSYCL_VERSION >= 50400
 #        include <sycl/ext/intel/fpga_extensions.hpp>
@@ -59,7 +63,9 @@
 #define _ONEDPL_SYCL2020_COLLECTIVES_PRESENT                  (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
 #define _ONEDPL_SYCL2020_BUFFER_SIZE_PRESENT                  (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
 #define _ONEDPL_SYCL2020_ACCESSOR_SIZE_PRESENT                (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
-#define _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT                (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
+// Kernel bundle support is not expected in ACPP, see https://github.com/AdaptiveCpp/AdaptiveCpp/issues/1296.
+#define _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT                                                                        \
+    (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300) && !_ONEDPL_ACPP_VERSION)
 #define _ONEDPL_SYCL2020_KNOWN_IDENTITY_PRESENT               (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
 #define _ONEDPL_SYCL2020_FUNCTIONAL_OBJECTS_PRESENT           (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
 #define _ONEDPL_SYCL2020_REQD_SUB_GROUP_SIZE_PRESENT          (!_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
@@ -83,10 +89,23 @@
 
 // Feature macros for DPC++ SYCL runtime library alternatives to non-supported SYCL 2020 features
 #define _ONEDPL_LIBSYCL_COLLECTIVES_PRESENT                   (_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
+#define _ONEDPL_LIBSYCL_PROGRAM_PRESENT                       (_ONEDPL_LIBSYCL_VERSION_LESS_THAN(50300))
 #define _ONEDPL_LIBSYCL_KNOWN_IDENTITY_PRESENT                (_ONEDPL_LIBSYCL_VERSION == 50200)
 #define _ONEDPL_LIBSYCL_SUB_GROUP_MASK_PRESENT                                                                         \
     (SYCL_EXT_ONEAPI_SUB_GROUP_MASK >= 1 && _ONEDPL_LIBSYCL_VERSION >= 50700)
 
+// Compilation of a kernel is requiried to obtain valid work_group_size
+// when target devices are CPU or FPGA emulator. Since CPU and GPU devices
+// cannot be distinguished during compilation, the macro is enabled by default.
+#define _ONEDPL_CAN_COMPILE_KERNEL (_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT || _ONEDPL_LIBSYCL_PROGRAM_PRESENT)
+#if !defined(_ONEDPL_COMPILE_KERNEL)
+#    define _ONEDPL_COMPILE_KERNEL _ONEDPL_CAN_COMPILE_KERNEL
+#else
+#    if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_CAN_COMPILE_KERNEL
+#        error "No SYCL kernel compilation method available (neither SYCL 2020 kernel bundle nor other alternatives)."
+#    endif
+#endif
+
 #define _ONEDPL_SYCL_DEVICE_COPYABLE_SPECIALIZATION_BROKEN (_ONEDPL_LIBSYCL_VERSION_LESS_THAN(70100))
 
 // Macro to check if we are compiling for SPIR-V devices. This macro must only be used within

diff --git a/include/oneapi/dpl/pstl/onedpl_config.h b/include/oneapi/dpl/pstl/onedpl_config.h
@@ -334,13 +334,6 @@
 #        define _ONEDPL_USE_GROUP_ALGOS 1
 #    endif
 
-// Compilation of a kernel is requiried to obtain valid work_group_size
-// when target devices are CPU or FPGA emulator. Since CPU and GPU devices
-// cannot be distinguished during compilation, the macro is enabled by default.
-#    if !defined(_ONEDPL_COMPILE_KERNEL)
-#        define _ONEDPL_COMPILE_KERNEL 1
-#    endif
-
 #    define _ONEDPL_BUILT_IN_STABLE_NAME_PRESENT __has_builtin(__builtin_sycl_unique_stable_name)
 #endif // _ONEDPL_BACKEND_SYCL