14 inline namespace _V1 {
20 size_t &NWorkGroups) {
21 size_t WGSize = MaxWGSize;
22 if (NWorkItems <= WGSize) {
26 NWorkGroups = NWorkItems / WGSize;
27 size_t Rem = NWorkItems % WGSize;
32 size_t NWorkGroupsAlt = NWorkItems / Rem;
33 size_t RemAlt = NWorkItems % Rem;
34 if (RemAlt == 0 && NWorkGroupsAlt <= MaxWGSize) {
39 NWorkGroups = NWorkGroupsAlt;
54 std::shared_ptr<sycl::detail::queue_impl> Queue) {
62 if (Queue ==
nullptr) {
65 device Dev = Queue->get_device();
66 uint32_t NumThreads = Dev.
get_info<sycl::info::device::max_compute_units>();
69 if (Dev.
is_gpu() && Dev.
get_info<sycl::info::device::host_unified_memory>())
76 size_t LocalMemBytesPerWorkItem) {
77 device Dev = Queue->get_device();
78 size_t MaxWGSize = Dev.
get_info<sycl::info::device::max_work_group_size>();
80 size_t WGSizePerMem = MaxWGSize * 2;
81 size_t WGSize = MaxWGSize;
82 if (LocalMemBytesPerWorkItem != 0) {
83 size_t MemSize = Dev.
get_info<sycl::info::device::local_mem_size>();
84 WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;
89 if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
91 WGSize = (std::min)(WGSizePerMem, WGSize);
106 if (WGSize >= 4 && WGSizePerMem < MaxWGSize * 2) {
116 size_t LocalMemBytesPerWorkItem) {
124 if (Queue ==
nullptr) {
127 device Dev = Queue->get_device();
136 using PrefWGConfig = sycl::detail::SYCLConfig<
137 sycl::detail::SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE>;
140 if (CPUMaxWGSize == 0)
142 size_t DevMaxWGSize =
143 Dev.
get_info<sycl::info::device::max_work_group_size>();
144 return std::min(CPUMaxWGSize, DevMaxWGSize);
150 size_t DevMaxWGSize =
151 Dev.
get_info<sycl::info::device::max_work_group_size>();
158 size_t DevMaxWGSize =
159 Dev.
get_info<sycl::info::device::max_work_group_size>();
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
bool is_accelerator() const
Check if device is an accelerator device.
bool is_gpu() const
Check if device is a GPU device.
detail::is_device_info_desc< Param >::return_type get_info() const
Queries this SYCL device for the information requested by the template parameter Param.
bool is_cpu() const
Check if device is a CPU device.
constexpr tuple_element< I, tuple< Types... > >::type & get(sycl::detail::tuple< Types... > &Arg) noexcept
uint32_t reduGetMaxNumConcurrentWorkGroups(std::shared_ptr< queue_impl > Queue)
size_t reduGetMaxWGSize(std::shared_ptr< queue_impl > Queue, size_t LocalMemBytesPerWorkItem)
size_t reduGetPreferredWGSize(std::shared_ptr< queue_impl > &Queue, size_t LocalMemBytesPerWorkItem)
size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, size_t &NWorkGroups)