15 inline namespace _V1 {
21 size_t &NWorkGroups) {
22 size_t WGSize = MaxWGSize;
23 if (NWorkItems <= WGSize) {
27 NWorkGroups = NWorkItems / WGSize;
28 size_t Rem = NWorkItems % WGSize;
33 size_t NWorkGroupsAlt = NWorkItems / Rem;
34 size_t RemAlt = NWorkItems % Rem;
35 if (RemAlt == 0 && NWorkGroupsAlt <= MaxWGSize) {
40 NWorkGroups = NWorkGroupsAlt;
55 std::shared_ptr<sycl::detail::queue_impl> Queue) {
63 if (Queue ==
nullptr) {
66 device Dev = Queue->get_device();
67 uint32_t NumThreads = Dev.
get_info<sycl::info::device::max_compute_units>();
70 if (Dev.
is_gpu() && Dev.
get_info<sycl::info::device::host_unified_memory>())
77 size_t LocalMemBytesPerWorkItem) {
78 device Dev = Queue->get_device();
79 size_t MaxWGSize = Dev.
get_info<sycl::info::device::max_work_group_size>();
81 size_t WGSizePerMem = MaxWGSize * 2;
82 size_t WGSize = MaxWGSize;
83 if (LocalMemBytesPerWorkItem != 0) {
84 size_t MemSize = Dev.
get_info<sycl::info::device::local_mem_size>();
85 WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;
90 if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
92 WGSize = (std::min)(WGSizePerMem, WGSize);
107 if (WGSize >= 4 && WGSizePerMem < MaxWGSize * 2) {
117 size_t LocalMemBytesPerWorkItem) {
125 if (Queue ==
nullptr) {
128 device Dev = Queue->get_device();
137 using PrefWGConfig = sycl::detail::SYCLConfig<
138 sycl::detail::SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE>;
141 if (CPUMaxWGSize == 0)
143 size_t DevMaxWGSize =
144 Dev.
get_info<sycl::info::device::max_work_group_size>();
145 return std::min(CPUMaxWGSize, DevMaxWGSize);
151 size_t DevMaxWGSize =
152 Dev.
get_info<sycl::info::device::max_work_group_size>();
159 size_t DevMaxWGSize =
160 Dev.
get_info<sycl::info::device::max_work_group_size>();
171 std::shared_ptr<int> &Counter) {
172 auto EventImpl = std::make_shared<detail::event_impl>(Queue);
174 EventImpl->setStateIncomplete();
176 &EventImpl->getHandleRef(), EventImpl);
177 CGH.
depends_on(createSyclObjFromImpl<event>(EventImpl));
static void fill_usm(void *DstMem, QueueImplPtr Queue, size_t Len, const std::vector< unsigned char > &Pattern, std::vector< ur_event_handle_t > DepEvents, ur_event_handle_t *OutEvent, const detail::EventImplPtr &OutEventImpl)
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
bool is_accelerator() const
Check if device is an accelerator device.
bool is_gpu() const
Check if device is a GPU device.
detail::is_device_info_desc< Param >::return_type get_info() const
Queries this SYCL device for the information requested by the template parameter Param.
bool is_cpu() const
Check if device is a CPU device.
Command group handler class.
void depends_on(event Event)
Registers event dependencies on this command group.
constexpr tuple_element< I, tuple< Types... > >::type & get(sycl::detail::tuple< Types... > &Arg) noexcept
decltype(Obj::impl) const & getSyclObjImpl(const Obj &SyclObject)
uint32_t reduGetMaxNumConcurrentWorkGroups(std::shared_ptr< queue_impl > Queue)
size_t reduGetMaxWGSize(std::shared_ptr< queue_impl > Queue, size_t LocalMemBytesPerWorkItem)
size_t reduGetPreferredWGSize(std::shared_ptr< queue_impl > &Queue, size_t LocalMemBytesPerWorkItem)
void addCounterInit(handler &CGH, std::shared_ptr< sycl::detail::queue_impl > &Queue, std::shared_ptr< int > &Counter)
size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, size_t &NWorkGroups)