[SYCL][E2E] Rewrite Tests Containing Deprecated Overloads #3 (#16775)
The overloads for single_task and parallel_for in the
sycl_ext_oneapi_kernel_properties extension are being deprecated, as
mentioned in #14785, so I'm rewriting the
tests containing such overloads so that they can still run after the
deprecation.
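
For context, here is the general shape of the rewrite as a minimal sketch. The names (IncrementKernel, IncrementID, Data) are hypothetical rather than lifted from any test below, and it assumes a device that supports a sub-group size of 32: instead of passing a properties object to parallel_for, the kernel functor now reports its properties through a get(properties_tag) member function.

// Minimal sketch of the rewrite pattern; not taken verbatim from any test.
#include <sycl/sycl.hpp>

namespace syclex = sycl::ext::oneapi::experimental;

// Before (deprecated overload): properties were passed directly to parallel_for:
//   syclex::properties Props{syclex::sub_group_size<32>};
//   CGH.parallel_for<class K>(NdRange, Props,
//                             [=](sycl::nd_item<1> It) { /* ... */ });

// After: the kernel functor advertises the same properties itself.
struct IncrementKernel {
  int *mData;
  IncrementKernel(int *Data) : mData(Data) {}

  // The kernel body, unchanged from what the lambda used to do.
  void operator()(sycl::nd_item<1> It) const {
    mData[It.get_global_linear_id()] += 1;
  }
  // Properties are now exposed through this member function.
  auto get(syclex::properties_tag) const {
    return syclex::properties{syclex::sub_group_size<32>};
  }
};

int main() {
  sycl::queue Q;
  int *Data = sycl::malloc_shared<int>(64, Q);
  for (int i = 0; i < 64; ++i)
    Data[i] = i;
  Q.submit([&](sycl::handler &CGH) {
     CGH.parallel_for<class IncrementID>(sycl::nd_range<1>(64, 32),
                                         IncrementKernel(Data));
   }).wait();
  sycl::free(Data, Q);
  return 0;
}

The same idea covers properties only known at runtime: the functor stores the properties object in a member and returns it from get(), which is what the ClusterLaunch tests below do.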

---------

Signed-off-by: Hu, Peisen <[email protected]>
HPS-1 authored Feb 4, 2025
1 parent 10f3889 commit 70f7543
Showing 15 changed files with 303 additions and 261 deletions.
14 changes: 11 additions & 3 deletions sycl/test-e2e/Basic/kernel_max_wg_size.cpp
@@ -29,6 +29,16 @@ __attribute__((noinline)) void f(int *result, nd_item<1> &index) {
result[index.get_global_id()] = index.get_global_id();
}

struct KernelFunctor {
int *mResult;
KernelFunctor(int *result) : mResult(result) {}

void operator()(nd_item<1> index) const { f(mResult, index); }
auto get(syclex::properties_tag) const {
return syclex::properties{intelex::grf_size<256>};
}
};

int main() {
queue myQueue;
auto myContext = myQueue.get_context();
@@ -46,11 +56,9 @@ int main() {
nd_range myRange{range{maxWgSize}, range{maxWgSize}};

int *result = sycl::malloc_shared<int>(maxWgSize, myQueue);
syclex::properties kernelProperties{intelex::grf_size<256>};
myQueue.submit([&](handler &cgh) {
cgh.use_kernel_bundle(myBundle);
cgh.parallel_for<MyKernel>(myRange, kernelProperties,
([=](nd_item<1> index) { f(result, index); }));
cgh.parallel_for<MyKernel>(myRange, KernelFunctor(result));
});

myQueue.wait();
37 changes: 0 additions & 37 deletions sycl/test-e2e/Basic/sub_group_size_prop.cpp
@@ -44,33 +44,12 @@ void test(queue &Queue, const std::vector<size_t> SupportedSGSizes) {
return;
}

auto Props = ext::oneapi::experimental::properties{
ext::oneapi::experimental::sub_group_size<SGSize>};

nd_range<1> NdRange(SGSize * 4, SGSize * 2);

size_t ReadSubGroupSize = 0;
{
buffer ReadSubGroupSizeBuf(&ReadSubGroupSize, range(1));

Queue.submit([&](handler &CGH) {
accessor ReadSubGroupSizeBufAcc{ReadSubGroupSizeBuf, CGH,
sycl::write_only, sycl::no_init};

CGH.parallel_for<SubGroupKernel<Variant::Function, SGSize>>(
NdRange, Props, [=](nd_item<1> NdItem) {
auto SG = NdItem.get_sub_group();
if (NdItem.get_global_linear_id() == 0)
ReadSubGroupSizeBufAcc[0] = SG.get_local_linear_range();
});
});
}
assert(ReadSubGroupSize == SGSize && "Failed check for function.");

ReadSubGroupSize = 0;
{
buffer ReadSubGroupSizeBuf(&ReadSubGroupSize, range(1));

Queue.submit([&](handler &CGH) {
accessor ReadSubGroupSizeBufAcc{ReadSubGroupSizeBuf, CGH,
sycl::write_only, sycl::no_init};
@@ -81,22 +60,6 @@ void test(queue &Queue, const std::vector<size_t> SupportedSGSizes) {
});
}
assert(ReadSubGroupSize == SGSize && "Failed check for functor.");

ReadSubGroupSize = 0;
{
buffer ReadSubGroupSizeBuf(&ReadSubGroupSize, range(1));

Queue.submit([&](handler &CGH) {
accessor ReadSubGroupSizeBufAcc{ReadSubGroupSizeBuf, CGH,
sycl::write_only, sycl::no_init};
KernelFunctorWithSGSizeProp<SGSize> KernelFunctor{ReadSubGroupSizeBufAcc};

CGH.parallel_for<SubGroupKernel<Variant::Functor, SGSize>>(NdRange, Props,
KernelFunctor);
});
}
assert(ReadSubGroupSize == SGSize &&
"Failed check for functor and properties.");
}

int main() {
79 changes: 47 additions & 32 deletions sycl/test-e2e/ClusterLaunch/cluster_launch_parallel_for.cpp
@@ -10,6 +10,49 @@

#include <string>

template <int Dim, typename T> struct KernelFunctor {
int *mCorrectResultFlag;
T mClusterLaunchProperty;
sycl::range<Dim> mClusterRange;
KernelFunctor(int *CorrectResultFlag, T ClusterLaunchProperty,
sycl::range<Dim> ClusterRange)
: mCorrectResultFlag(CorrectResultFlag),
mClusterLaunchProperty(ClusterLaunchProperty),
mClusterRange(ClusterRange) {}

void operator()(sycl::nd_item<Dim> It) const {
uint32_t ClusterDimX, ClusterDimY, ClusterDimZ;
// Temporary solution till cluster group class is implemented
#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_CUDA_ARCH__) && \
(__SYCL_CUDA_ARCH__ >= 900)
asm volatile("\n\t"
"mov.u32 %0, %%cluster_nctaid.x; \n\t"
"mov.u32 %1, %%cluster_nctaid.y; \n\t"
"mov.u32 %2, %%cluster_nctaid.z; \n\t"
: "=r"(ClusterDimZ), "=r"(ClusterDimY), "=r"(ClusterDimX));
#endif
if constexpr (Dim == 1) {
if (ClusterDimZ == mClusterRange[0] && ClusterDimY == 1 &&
ClusterDimX == 1) {
*mCorrectResultFlag = 1;
}
} else if constexpr (Dim == 2) {
if (ClusterDimZ == mClusterRange[1] && ClusterDimY == mClusterRange[0] &&
ClusterDimX == 1) {
*mCorrectResultFlag = 1;
}
} else {
if (ClusterDimZ == mClusterRange[2] && ClusterDimY == mClusterRange[1] &&
ClusterDimX == mClusterRange[0]) {
*mCorrectResultFlag = 1;
}
}
}
auto get(sycl::ext::oneapi::experimental::properties_tag) const {
return mClusterLaunchProperty;
}
};

template <int Dim>
int test_cluster_launch_parallel_for(sycl::queue &Queue,
sycl::range<Dim> GlobalRange,
@@ -25,38 +68,10 @@ int test_cluster_launch_parallel_for(sycl::queue &Queue,

Queue
.submit([&](sycl::handler &CGH) {
CGH.parallel_for(sycl::nd_range<Dim>(GlobalRange, LocalRange),
ClusterLaunchProperty, [=](sycl::nd_item<Dim> It) {
uint32_t ClusterDimX, ClusterDimY, ClusterDimZ;
// Temporary solution till cluster group class is implemented
#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_CUDA_ARCH__) && \
(__SYCL_CUDA_ARCH__ >= 900)
asm volatile("\n\t"
"mov.u32 %0, %%cluster_nctaid.x; \n\t"
"mov.u32 %1, %%cluster_nctaid.y; \n\t"
"mov.u32 %2, %%cluster_nctaid.z; \n\t"
: "=r"(ClusterDimZ), "=r"(ClusterDimY),
"=r"(ClusterDimX));
#endif
if constexpr (Dim == 1) {
if (ClusterDimZ == ClusterRange[0] &&
ClusterDimY == 1 && ClusterDimX == 1) {
*CorrectResultFlag = 1;
}
} else if constexpr (Dim == 2) {
if (ClusterDimZ == ClusterRange[1] &&
ClusterDimY == ClusterRange[0] &&
ClusterDimX == 1) {
*CorrectResultFlag = 1;
}
} else {
if (ClusterDimZ == ClusterRange[2] &&
ClusterDimY == ClusterRange[1] &&
ClusterDimX == ClusterRange[0]) {
*CorrectResultFlag = 1;
}
}
});
CGH.parallel_for(
sycl::nd_range<Dim>(GlobalRange, LocalRange),
KernelFunctor<Dim, decltype(ClusterLaunchProperty)>(
CorrectResultFlag, ClusterLaunchProperty, ClusterRange));
})
.wait_and_throw();

@@ -24,6 +24,22 @@ template <typename T> void dummy_kernel(T *Input, int N, sycl::nd_item<1> It) {
#endif
}

template <typename T1, typename T2> struct KernelFunctor {
T1 mAcc;
T2 mClusterLaunchProperty;
KernelFunctor(T2 ClusterLaunchProperty, T1 Acc)
: mClusterLaunchProperty(ClusterLaunchProperty), mAcc(Acc) {}

void operator()(sycl::nd_item<1> It) const {
dummy_kernel(
mAcc.template get_multi_ptr<sycl::access::decorated::yes>().get(), 4096,
It);
}
auto get(sycl::ext::oneapi::experimental::properties_tag) const {
return mClusterLaunchProperty;
}
};

int main() {

std::vector<int> HostArray(4096, -20);
@@ -46,13 +62,8 @@ int main() {
cuda::cluster_size ClusterDims(sycl::range{2});
properties ClusterLaunchProperty{ClusterDims};
auto Acc = Buff.template get_access<sycl::access::mode::read_write>(CGH);
CGH.parallel_for(
sycl::nd_range({4096}, {32}), ClusterLaunchProperty,
[=](sycl::nd_item<1> It) {
dummy_kernel(
Acc.get_multi_ptr<sycl::access::decorated::yes>().get(), 4096,
It);
});
CGH.parallel_for(sycl::nd_range({4096}, {32}),
KernelFunctor(ClusterLaunchProperty, Acc));
});
Queue.submit([&](sycl::handler &CGH) {
auto Acc = Buff.template get_access<sycl::access::mode::read_write>(CGH);
13 changes: 11 additions & 2 deletions sycl/test-e2e/DeviceCodeSplit/grf.cpp
@@ -67,6 +67,15 @@ bool checkResult(const std::vector<float> &A, int Inc) {
return true;
}

template <typename T1, typename T2> struct KernelFunctor {
T1 mPA;
T2 mProp;
KernelFunctor(T1 PA, T2 Prop) : mPA(PA), mProp(Prop) {}

void operator()(id<1> i) const { mPA[i] += 2; }
auto get(properties_tag) const { return mProp; }
};

int main(void) {
constexpr unsigned Size = 32;
constexpr unsigned VL = 16;
@@ -122,8 +131,8 @@ int main(void) {

auto e = q.submit([&](handler &cgh) {
auto PA = bufa.get_access<access::mode::read_write>(cgh);
cgh.parallel_for<class SYCLKernelSpecifiedGRF>(
Size, prop, [=](id<1> i) { PA[i] += 2; });
cgh.parallel_for<class SYCLKernelSpecifiedGRF>(Size,
KernelFunctor(PA, prop));
});
e.wait();
} catch (sycl::exception const &e) {
68 changes: 0 additions & 68 deletions sycl/test-e2e/Graph/Inputs/sub_group_prop.cpp
@@ -39,49 +39,13 @@ void test(queue &Queue, const std::vector<size_t> SupportedSGSizes) {
return;
}

auto Props = ext::oneapi::experimental::properties{
ext::oneapi::experimental::sub_group_size<SGSize>};

nd_range<1> NdRange(SGSize * 4, SGSize * 2);

size_t ReadSubGroupSize = 0;
{
buffer ReadSubGroupSizeBuf(&ReadSubGroupSize, range(1));
ReadSubGroupSizeBuf.set_write_back(false);

{
exp_ext::command_graph Graph{
Queue.get_context(),
Queue.get_device(),
{exp_ext::property::graph::assume_buffer_outlives_graph{}}};

add_node(Graph, Queue, [&](handler &CGH) {
accessor ReadSubGroupSizeBufAcc{ReadSubGroupSizeBuf, CGH,
sycl::write_only, sycl::no_init};

CGH.parallel_for<SubGroupKernel<Variant::Function, SGSize>>(
NdRange, Props, [=](nd_item<1> NdItem) {
auto SG = NdItem.get_sub_group();
if (NdItem.get_global_linear_id() == 0)
ReadSubGroupSizeBufAcc[0] = SG.get_local_linear_range();
});
});

auto ExecGraph = Graph.finalize();
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
Queue.wait_and_throw();
}

host_accessor HostAcc(ReadSubGroupSizeBuf);
ReadSubGroupSize = HostAcc[0];
}
assert(ReadSubGroupSize == SGSize && "Failed check for function.");

ReadSubGroupSize = 0;
{
buffer ReadSubGroupSizeBuf(&ReadSubGroupSize, range(1));
ReadSubGroupSizeBuf.set_write_back(false);

{
exp_ext::command_graph Graph{
Queue.get_context(),
@@ -107,38 +71,6 @@ void test(queue &Queue, const std::vector<size_t> SupportedSGSizes) {
ReadSubGroupSize = HostAcc[0];
}
assert(ReadSubGroupSize == SGSize && "Failed check for functor.");

ReadSubGroupSize = 0;
{
buffer ReadSubGroupSizeBuf(&ReadSubGroupSize, range(1));
ReadSubGroupSizeBuf.set_write_back(false);

{
exp_ext::command_graph Graph{
Queue.get_context(),
Queue.get_device(),
{exp_ext::property::graph::assume_buffer_outlives_graph{}}};

add_node(Graph, Queue, [&](handler &CGH) {
accessor ReadSubGroupSizeBufAcc{ReadSubGroupSizeBuf, CGH,
sycl::write_only, sycl::no_init};
KernelFunctorWithSGSizeProp<SGSize> KernelFunctor{
ReadSubGroupSizeBufAcc};

CGH.parallel_for<SubGroupKernel<Variant::Functor, SGSize>>(
NdRange, Props, KernelFunctor);
});

auto ExecGraph = Graph.finalize();
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
Queue.wait_and_throw();
}

host_accessor HostAcc(ReadSubGroupSizeBuf);
ReadSubGroupSize = HostAcc[0];
}
assert(ReadSubGroupSize == SGSize &&
"Failed check for functor and properties.");
}

int main() {
27 changes: 20 additions & 7 deletions sycl/test-e2e/VirtualFunctions/misc/group-barrier.cpp
@@ -101,6 +101,24 @@ class MultiplyOp : public BaseOp {
}
};

template <typename T1, typename T2, typename T3> struct KernelFunctor {
T1 mDeviceStorage;
T2 mDataAcc;
T3 mLocalAcc;
KernelFunctor(T1 DeviceStorage, T2 DataAcc, T3 LocalAcc)
: mDeviceStorage(DeviceStorage), mDataAcc(DataAcc), mLocalAcc(LocalAcc) {}

void operator()(sycl::nd_item<1> It) const {
auto *Ptr = mDeviceStorage->template getAs<BaseOp>();
mDataAcc[It.get_global_id()] = Ptr->apply(
mLocalAcc.template get_multi_ptr<sycl::access::decorated::no>().get(),
It.get_group());
}
auto get(oneapi::properties_tag) const {
return oneapi::properties{oneapi::assume_indirect_calls};
}
};

int main() try {
using storage_t = obj_storage_t<SumOp, MultiplyOp>;

@@ -113,7 +131,6 @@ int main() try {
sycl::range G{16};
sycl::range L{4};

constexpr oneapi::properties props{oneapi::assume_indirect_calls};
for (unsigned TestCase = 0; TestCase < 2; ++TestCase) {
sycl::buffer<int> DataStorage(G);

Expand All @@ -126,12 +143,8 @@ int main() try {
q.submit([&](sycl::handler &CGH) {
sycl::accessor DataAcc(DataStorage, CGH, sycl::read_write);
sycl::local_accessor<int> LocalAcc(L, CGH);
CGH.parallel_for(sycl::nd_range{G, L}, props, [=](auto It) {
auto *Ptr = DeviceStorage->getAs<BaseOp>();
DataAcc[It.get_global_id()] = Ptr->apply(
LocalAcc.get_multi_ptr<sycl::access::decorated::no>().get(),
It.get_group());
});
CGH.parallel_for(sycl::nd_range{G, L},
KernelFunctor(DeviceStorage, DataAcc, LocalAcc));
}).wait_and_throw();

auto *Ptr = HostStorage.construct</* ret type = */ BaseOp>(TestCase);