Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove support of old compilers from oneDPL code #1678

Draft
wants to merge 14 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 9 additions & 22 deletions include/oneapi/dpl/internal/reduce_by_segment_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,15 +281,11 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy
auto __seg_end_identification = __exec.queue().submit([&](sycl::handler& __cgh) {
oneapi::dpl::__ranges::__require_access(__cgh, __keys);
auto __seg_ends_acc = __seg_ends.template get_access<sycl::access_mode::write>(__cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
# if _ONEDPL_COMPILE_KERNEL
__cgh.use_kernel_bundle(__seg_reduce_count_kernel.get_kernel_bundle());
#endif
# endif
__cgh.parallel_for<_SegReduceCountKernel>(
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
__seg_reduce_count_kernel,
#endif
sycl::nd_item<1> __item) {
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
auto __group = __item.get_group();
::std::size_t __group_id = __item.get_group(0);
::std::size_t __local_id = __item.get_local_id(0);
Expand Down Expand Up @@ -319,13 +315,10 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy
__cgh.depends_on(__seg_end_identification);
auto __seg_ends_acc = __seg_ends.template get_access<sycl::access_mode::read>(__cgh);
auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access<sycl::access_mode::read_write>(__cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
# if _ONEDPL_COMPILE_KERNEL
__cgh.use_kernel_bundle(__seg_reduce_offset_kernel.get_kernel_bundle());
#endif
# endif
__cgh.parallel_for<_SegReduceOffsetKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
__seg_reduce_offset_kernel,
#endif
sycl::nd_range<1>{__wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
auto __beg = __dpl_sycl::__get_accessor_ptr(__seg_ends_acc);
auto __out_beg = __dpl_sycl::__get_accessor_ptr(__seg_ends_scan_acc);
Expand All @@ -342,13 +335,10 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy
auto __partials_acc = __partials.template get_access<sycl::access_mode::read_write>(__cgh);
auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access<sycl::access_mode::read>(__cgh);
__dpl_sycl::__local_accessor<__val_type> __loc_acc(2 * __wgroup_size, __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
# if _ONEDPL_COMPILE_KERNEL
__cgh.use_kernel_bundle(__seg_reduce_wg_kernel.get_kernel_bundle());
#endif
# endif
__cgh.parallel_for<_SegReduceWgKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
__seg_reduce_wg_kernel,
#endif
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
::std::array<__val_type, __vals_per_item> __loc_partials;

Expand Down Expand Up @@ -465,13 +455,10 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy
__dpl_sycl::__local_accessor<__diff_type> __loc_seg_ends_acc(__wgroup_size, __cgh);

__cgh.depends_on(__wg_reduce);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
# if _ONEDPL_COMPILE_KERNEL
__cgh.use_kernel_bundle(__seg_reduce_prefix_kernel.get_kernel_bundle());
#endif
# endif
__cgh.parallel_for<_SegReducePrefixKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
__seg_reduce_prefix_kernel,
#endif
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
auto __group = __item.get_group();
::std::int64_t __group_id = __item.get_group(0);
Expand Down
14 changes: 4 additions & 10 deletions include/oneapi/dpl/internal/scan_by_segment_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,13 +164,10 @@ struct __sycl_scan_by_segment_impl

__dpl_sycl::__local_accessor<__val_type> __loc_acc(2 * __wgroup_size, __cgh);

#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
# if _ONEDPL_COMPILE_KERNEL
__cgh.use_kernel_bundle(__seg_scan_wg_kernel.get_kernel_bundle());
#endif
# endif
__cgh.parallel_for<_SegScanWgKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
__seg_scan_wg_kernel,
#endif
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
__val_type __accumulator = __identity;

Expand Down Expand Up @@ -268,13 +265,10 @@ struct __sycl_scan_by_segment_impl
__dpl_sycl::__local_accessor<__val_type> __loc_partials_acc(__wgroup_size, __cgh);

__dpl_sycl::__local_accessor<bool> __loc_seg_ends_acc(__wgroup_size, __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
# if _ONEDPL_COMPILE_KERNEL
__cgh.use_kernel_bundle(__seg_scan_prefix_kernel.get_kernel_bundle());
#endif
# endif
__cgh.parallel_for<_SegScanPrefixKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
__seg_scan_prefix_kernel,
#endif
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
auto __group = __item.get_group();
::std::size_t __group_id = __item.get_group(0);
Expand Down
10 changes: 2 additions & 8 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -321,13 +321,10 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); //get an access to data under SYCL buffer
auto __temp_acc = __result_and_scratch.__get_scratch_acc(__cgh);
__dpl_sycl::__local_accessor<_Type> __local_acc(__wgroup_size, __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL
__cgh.use_kernel_bundle(__kernel_1.get_kernel_bundle());
#endif
__cgh.parallel_for<_LocalScanKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
__kernel_1,
#endif
sycl::nd_range<1>(__n_groups * __wgroup_size, __wgroup_size), [=](sycl::nd_item<1> __item) {
auto __temp_ptr = __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__temp_acc);
__local_scan(__item, __n, __local_acc, __rng1, __rng2, __temp_ptr, __size_per_wg, __wgroup_size,
Expand All @@ -342,13 +339,10 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
__cgh.depends_on(__submit_event);
auto __temp_acc = __result_and_scratch.__get_scratch_acc(__cgh);
__dpl_sycl::__local_accessor<_Type> __local_acc(__wgroup_size, __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL
__cgh.use_kernel_bundle(__kernel_2.get_kernel_bundle());
#endif
__cgh.parallel_for<_GroupScanKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
__kernel_2,
#endif
// TODO: try to balance work between several workgroups instead of one
sycl::nd_range<1>(__wgroup_size, __wgroup_size), [=](sycl::nd_item<1> __item) {
auto __temp_ptr = __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__temp_acc);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,13 +198,10 @@ __radix_sort_count_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments, :
oneapi::dpl::__ranges::__require_access(__hdl, __val_rng, __count_rng);
// an accessor per work-group with value counters from each work-item
auto __count_lacc = __dpl_sycl::__local_accessor<_CountT>(__wg_size * __radix_states, __hdl);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL
__hdl.use_kernel_bundle(__kernel.get_kernel_bundle());
#endif
__hdl.parallel_for<_KernelName>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
__kernel,
#endif
sycl::nd_range<1>(__segments * __wg_size, __wg_size), [=](sycl::nd_item<1> __self_item) {
// item info
const ::std::size_t __self_lidx = __self_item.get_local_id(0);
Expand Down Expand Up @@ -299,13 +296,10 @@ __radix_sort_scan_submit(_ExecutionPolicy&& __exec, ::std::size_t __scan_wg_size
__hdl.depends_on(__dependency_event);
// access the counters for all work groups
oneapi::dpl::__ranges::__require_access(__hdl, __count_rng);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL
__hdl.use_kernel_bundle(__kernel.get_kernel_bundle());
#endif
__hdl.parallel_for<_KernelName>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
__kernel,
#endif
sycl::nd_range<1>(__radix_states * __scan_wg_size, __scan_wg_size), [=](sycl::nd_item<1> __self_item) {
// find borders of a region with a specific bucket id
sycl::global_ptr<_CountT> __begin = __count_rng.begin() + __scan_size * __self_item.get_group(0);
Expand Down Expand Up @@ -346,7 +340,6 @@ enum class __peer_prefix_algo
template <std::uint32_t __radix_states, typename _OffsetT, __peer_prefix_algo _Algo>
struct __peer_prefix_helper;

#if (_ONEDPL_LIBSYCL_VERSION >= 50700)
template <std::uint32_t __radix_states, typename _OffsetT>
struct __peer_prefix_helper<__radix_states, _OffsetT, __peer_prefix_algo::atomic_fetch_or>
{
Expand Down Expand Up @@ -390,7 +383,6 @@ struct __peer_prefix_helper<__radix_states, _OffsetT, __peer_prefix_algo::atomic
return __offset;
}
};
#endif // (_ONEDPL_LIBSYCL_VERSION >= 50700)

template <std::uint32_t __radix_states, typename _OffsetT>
struct __peer_prefix_helper<__radix_states, _OffsetT, __peer_prefix_algo::scan_then_broadcast>
Expand Down Expand Up @@ -544,13 +536,10 @@ __radix_sort_reorder_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments,

typename _PeerHelper::_TempStorageT __peer_temp(1, __hdl);

#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL
__hdl.use_kernel_bundle(__kernel.get_kernel_bundle());
#endif
__hdl.parallel_for<_KernelName>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
__kernel,
#endif
//Each SYCL work group processes one data segment.
sycl::nd_range<1>(__segments * __sg_size, __sg_size), [=](sycl::nd_item<1> __self_item) {

Expand Down Expand Up @@ -607,7 +596,11 @@ __radix_sort_reorder_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments,
if (__residual > 0)
{
//_ValueT may not have a default constructor, so we create just a storage via union type
union __storage { _ValueT __v; __storage(){} } __in_val;
union __storage
{
_ValueT __v;
__storage() {}
} __in_val;

::std::uint32_t __bucket = __radix_states; // greater than any actual radix state
if (__self_lidx < __residual)
Expand Down Expand Up @@ -728,10 +721,8 @@ struct __parallel_radix_sort_iteration
{
#if _ONEDPL_SYCL_SUB_GROUP_MASK_PRESENT
constexpr auto __peer_algorithm = __peer_prefix_algo::subgroup_ballot;
#elif _ONEDPL_LIBSYCL_VERSION >= 50700
constexpr auto __peer_algorithm = __peer_prefix_algo::atomic_fetch_or;
#else
constexpr auto __peer_algorithm = __peer_prefix_algo::scan_then_broadcast;
constexpr auto __peer_algorithm = __peer_prefix_algo::atomic_fetch_or;
#endif // _ONEDPL_SYCL_SUB_GROUP_MASK_PRESENT

__reorder_event =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -365,13 +365,10 @@ struct __parallel_transform_reduce_impl
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
std::size_t __local_mem_size = __reduce_pattern.local_mem_req(__work_group_size);
__dpl_sycl::__local_accessor<_Tp> __temp_local(sycl::range<1>(__local_mem_size), __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL
__cgh.use_kernel_bundle(__kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_ReduceKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
__kernel,
#endif
sycl::nd_range<1>(sycl::range<1>(__n_groups * __work_group_size),
sycl::range<1>(__work_group_size)),
[=](sycl::nd_item<1> __item_id) {
Expand Down
24 changes: 0 additions & 24 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,19 +108,12 @@ __supports_sub_group_size(const _ExecutionPolicy& __exec, std::size_t __target_s
// Kernel run-time information helpers
//-----------------------------------------------------------------------------

// 20201214 value corresponds to Intel(R) oneAPI C++ Compiler Classic 2021.1.2 Patch release
#define _USE_KERNEL_DEVICE_SPECIFIC_API (__SYCL_COMPILER_VERSION > 20201214) || (_ONEDPL_LIBSYCL_VERSION >= 50700)

template <typename _ExecutionPolicy>
::std::size_t
__kernel_work_group_size(const _ExecutionPolicy& __policy, const sycl::kernel& __kernel)
{
const sycl::device& __device = __policy.queue().get_device();
#if _USE_KERNEL_DEVICE_SPECIFIC_API
return __kernel.template get_info<sycl::info::kernel_device_specific::work_group_size>(__device);
#else
return __kernel.template get_work_group_info<sycl::info::kernel_work_group::work_group_size>(__device);
#endif
}

template <typename _ExecutionPolicy>
Expand All @@ -130,17 +123,13 @@ __kernel_sub_group_size(const _ExecutionPolicy& __policy, const sycl::kernel& __
const sycl::device& __device = __policy.queue().get_device();
[[maybe_unused]] const ::std::size_t __wg_size = __kernel_work_group_size(__policy, __kernel);
const ::std::uint32_t __sg_size =
#if _USE_KERNEL_DEVICE_SPECIFIC_API
__kernel.template get_info<sycl::info::kernel_device_specific::max_sub_group_size>(
__device
# if _ONEDPL_LIBSYCL_VERSION < 60000
,
sycl::range<3> { __wg_size, 1, 1 }
# endif
);
#else
__kernel.template get_sub_group_info<sycl::info::kernel_sub_group::max_sub_group_size>(
__device, sycl::range<3>{__wg_size, 1, 1});
#endif
return __sg_size;
}
Expand Down Expand Up @@ -267,7 +256,6 @@ class __kernel_compiler
static_assert(__kernel_count > 0, "At least one kernel name should be provided");

public:
#if _ONEDPL_KERNEL_BUNDLE_PRESENT
template <typename _Exec>
static auto
__compile(_Exec&& __exec)
Expand All @@ -290,18 +278,6 @@ class __kernel_compiler
{
return __kernel_array_type{__kernel_bundle.get_kernel(__kernel_ids[_Ip])...};
}
#else
template <typename _Exec>
static auto
__compile(_Exec&& __exec)
{
sycl::program __program(__exec.queue().get_context());

using __return_type = std::conditional_t<(__kernel_count > 1), __kernel_array_type, sycl::kernel>;
return __return_type{
(__program.build_with_kernel_type<_KernelNames>(), __program.get_kernel<_KernelNames>())...};
}
#endif
};

#if _ONEDPL_DEBUG_SYCL
Expand Down
Loading
Loading