40 #include <type_traits>
41 #include <unordered_map>
50 #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
57 #if defined(__linux__)
65 #error "Only support Windows and Linux."
73 sycl::ext::oneapi::group_local_memory_for_overwrite<AllocT>(
74 sycl::ext::oneapi::this_work_item::get_work_group<3>());
75 auto *As = *As_multi_ptr;
79 namespace experimental {
105 : _data(data), _pitch(pitch), _x(x), _y(y) {}
121 size_t _pitch, _x, _y;
124 namespace experimental {
125 #ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES
126 class image_mem_wrapper;
133 image_mem_wrapper *dest,
const sycl::id<3> &dest_id,
147 #ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES
148 experimental::image_mem_wrapper *image_bindless{
nullptr};
162 template <
class T, memory_region Memory,
size_t Dimension>
class accessor;
167 ? sycl::access::address_space::local_space
168 : sycl::access::address_space::global_space;
172 ? sycl::access_mode::read
179 template <
size_t Dimension = 1>
196 return ((x) + 31) & ~(0x1F);
206 static inline void *
malloc(
size_t &pitch,
size_t x,
size_t y,
size_t z,
209 return malloc(pitch * y * z, q);
224 return q.
fill(dev_ptr, pattern, count);
236 return q.
memset(dev_ptr, value, size);
246 template <
typename T>
247 static inline std::vector<sycl::event>
249 std::vector<sycl::event> event_list;
251 unsigned char *data_surface = (
unsigned char *)data.
get_data_ptr();
252 for (
size_t z = 0; z < size.
get(2); ++z) {
253 unsigned char *data_ptr = data_surface;
254 for (
size_t y = 0; y < size.
get(1); ++y) {
255 event_list.push_back(detail::fill<T>(q, data_ptr, value, size.
get(0)));
258 data_surface += slice;
273 template <
typename T>
275 size_t pitch,
const T &value,
276 size_t x,
size_t y) {
291 case sycl::usm::alloc::unknown:
293 case sycl::usm::alloc::device:
295 case sycl::usm::alloc::shared:
296 case sycl::usm::alloc::host:
304 using namespace experimental;
317 const std::vector<sycl::event> &dep_events = {}) {
320 return q.
memcpy(to_ptr, from_ptr, size, dep_events);
326 return slice * (size.
get(2) - 1) + pitch * (size.
get(1) - 1) + size.
get(0);
330 return slice *
id.get(2) + pitch *
id.get(1) +
id.get(0);
338 const std::vector<sycl::event> &_deps;
342 : _buf(
std::
malloc(size)), _size(size), _q(q), _deps(deps) {}
357 static inline std::vector<sycl::event>
361 const std::vector<sycl::event> &dep_events = {}) {
363 std::vector<sycl::event> event_list;
365 size_t to_slice = to_range.
get(1) * to_range.
get(0);
366 size_t from_slice = from_range.
get(1) * from_range.
get(0);
367 unsigned char *to_surface =
368 (
unsigned char *)to_ptr +
get_offset(to_id, to_slice, to_range.
get(0));
369 const unsigned char *from_surface =
370 (
const unsigned char *)from_ptr +
373 if (to_slice == from_slice && to_slice == size.
get(1) * size.
get(0)) {
374 return {
memcpy(q, to_surface, from_surface, to_slice * size.
get(2),
377 using namespace experimental;
379 size_t size_slice = size.
get(1) * size.
get(0);
382 for (
size_t z = 0;
z < size.
get(2); ++
z) {
383 unsigned char *to_ptr = to_surface;
384 const unsigned char *from_ptr = from_surface;
385 if (to_range.
get(0) == from_range.
get(0) &&
386 to_range.
get(0) == size.
get(0)) {
387 event_list.push_back(
388 memcpy(q, to_ptr, from_ptr, size_slice, dep_events));
390 for (
size_t y = 0;
y < size.
get(1); ++
y) {
391 event_list.push_back(
392 memcpy(q, to_ptr, from_ptr, size.
get(0), dep_events));
393 to_ptr += to_range.
get(0);
394 from_ptr += from_range.
get(0);
397 to_surface += to_slice;
398 from_surface += from_slice;
404 std::vector<sycl::event> host_events;
405 if (to_slice == size_slice) {
408 memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
413 memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
417 std::vector<sycl::event>{memcpy(q, buf.get_ptr(), to_surface,
418 buf.get_size(), dep_events)});
421 event_list.push_back(
422 memcpy(q, to_surface, buf.get_ptr(), buf.get_size(), host_events));
430 memcpy(q, to_surface, buf.get_ptr(), to_range, from_range,
433 std::vector<sycl::event>{memcpy(q, buf.get_ptr(), from_surface,
434 buf.get_size(), dep_events)});
439 cgh.depends_on(dep_events);
440 cgh.parallel_for<class memcpy_3d_detail>(size, [=](sycl::id<3> id) {
441 to_surface[get_offset(id, to_slice, to_range.get(0))] =
442 from_surface[get_offset(id, from_slice, from_range.get(0))];
447 throw std::runtime_error(
"[SYCLcompat] memcpy: invalid direction value");
453 static inline std::vector<sycl::event>
463 static inline std::vector<sycl::event>
465 size_t from_pitch,
size_t x,
size_t y) {
483 namespace experimental {
485 static inline std::vector<sycl::event>
489 #ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES
490 if (param.
to.image_bindless !=
nullptr &&
491 param.
from.image_bindless !=
nullptr) {
492 throw std::runtime_error(
493 "[SYCLcompat] memcpy: Unsupported bindless_image API.");
495 std::vector<sycl::event> event_list;
497 to.set_data_ptr(buf.
get_ptr());
500 from.set_data_ptr(buf.
get_ptr());
505 }
else if (param.
to.image_bindless !=
nullptr) {
506 throw std::runtime_error(
507 "[SYCLcompat] memcpy: Unsupported bindless_image API.");
509 param.
to.image_bindless, param.
to.
pos,
511 }
else if (param.
from.image_bindless !=
nullptr) {
512 throw std::runtime_error(
513 "[SYCLcompat] memcpy: Unsupported bindless_image API.");
519 if (param.
to.
image !=
nullptr) {
520 throw std::runtime_error(
"[SYCLcompat] memcpy: Unsupported image API.");
524 throw std::runtime_error(
"[SYCLcompat] memcpy: Unsupported image API.");
537 static inline void *
malloc(
size_t num_bytes,
547 template <
typename T>
566 template <
typename T>
585 template <
typename T>
611 static inline void *
malloc(
size_t &pitch,
size_t x,
size_t y,
648 const std::vector<sycl::event> &events,
652 cgh.depends_on(events);
653 cgh.host_task([=]() {
654 for (auto p : pointers)
671 static void memcpy(
void *to_ptr,
const void *from_ptr,
size_t size,
700 template <
typename T>
705 static_cast<const void *
>(from_ptr), count *
sizeof(T));
718 template <
typename T>
723 static_cast<const void *
>(from_ptr), count *
sizeof(T))
741 static inline void memcpy(
void *to_ptr,
size_t to_pitch,
const void *from_ptr,
742 size_t from_pitch,
size_t x,
size_t y,
763 const void *from_ptr,
size_t from_pitch,
766 auto events =
detail::memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y);
805 auto events =
detail::memcpy(q, to, to_pos, from, from_pos, size);
819 static void inline fill(
void *dev_ptr,
const T &pattern,
size_t count,
842 namespace experimental {
875 static void memset(
void *dev_ptr,
int value,
size_t size,
886 static inline void memset_d16(
void *dev_ptr,
unsigned short value,
size_t size,
888 detail::fill<unsigned short>(q, dev_ptr, value, size).wait();
897 static inline void memset_d32(
void *dev_ptr,
unsigned int value,
size_t size,
899 detail::fill<unsigned int>(q, dev_ptr, value, size).wait();
923 return detail::fill<unsigned short>(q, dev_ptr, value, size);
936 return detail::fill<unsigned int>(q, dev_ptr, value, size);
947 static inline void memset(
void *ptr,
size_t pitch,
int val,
size_t x,
size_t y,
960 static inline void memset_d16(
void *ptr,
size_t pitch,
unsigned short val,
974 static inline void memset_d32(
void *ptr,
size_t pitch,
unsigned int val,
993 auto events = detail::memset<unsigned char>(q, ptr, pitch, val, x, y);
1055 auto events = detail::memset<unsigned char>(q, pitch, val, size);
1060 template <
class T, memory_region Memory,
size_t Dimension>
class accessor;
1061 template <
class T, memory_region Memory>
class accessor<T, Memory, 3> {
1068 : _data(data), _range(in_range) {}
1069 template <memory_region M = Memory>
1072 :
accessor(acc, acc.get_range()) {}
1074 :
accessor(acc.get_pointer(), in_range) {}
1086 template <
class T, memory_region Memory>
class accessor<T, Memory, 2> {
1093 : _data(data), _range(in_range) {}
1094 template <memory_region Mem = Memory>
1097 :
accessor(acc, acc.get_range()) {}
1099 :
accessor(acc.get_pointer(), in_range) {}
1102 return _data + _range.get(1) * index;
1113 template <
class T, memory_region Memory,
size_t Dimension>
class device_memory {
1125 std::initializer_list<value_t> &&init_list,
1128 assert(init_list.size() <= in_range.
size());
1130 std::memset(_host_ptr, 0, _size);
1131 std::memcpy(_host_ptr, init_list.begin(), init_list.size() *
sizeof(T));
1135 template <
size_t Dim = Dimension>
1137 const typename std::enable_if<Dim == 2,
sycl::range<2>>::type &in_range,
1138 std::initializer_list<std::initializer_list<value_t>> &&init_list,
1141 assert(init_list.size() <= in_range[0]);
1143 std::memset(_host_ptr, 0, _size);
1144 auto tmp_data = _host_ptr;
1145 for (
auto sub_list : init_list) {
1146 assert(sub_list.size() <= in_range[1]);
1147 std::memcpy(tmp_data, sub_list.begin(), sub_list.size() *
sizeof(T));
1148 tmp_data += in_range[1];
1155 : _size(range_in.size() * sizeof(T)), _range(range_in), _reference(false),
1156 _host_ptr(nullptr), _device_ptr(nullptr), _q(q) {
1160 "device memory region should be global, constant or shared");
1168 template <
class... Args,
size_t Dim = Dimension,
1169 typename = std::enable_if_t<
sizeof...(Args) == Dim>>
1175 template <
class... Args>
1180 if (_device_ptr && !_reference)
1219 template <
size_t Dim = Dimension>
1220 typename std::enable_if<Dim == 1, T>::type &
operator[](
size_t index) {
1222 return _device_ptr[index];
1227 template <
size_t Dim = Dimension>
1228 typename std::enable_if<Dim != 1, syclcompat_accessor_t>::type
1236 : _size(size), _range(size / sizeof(T)), _reference(true),
1237 _device_ptr(memory_ptr), _q(q) {}
1245 #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
1263 template <
class T, memory_region Memory>
1273 :
base(
sycl::range<1>(1), {val}, q) {}
1279 template <
class T,
size_t Dimension>
1281 template <
class T,
size_t Dimension>
1283 template <
class T,
size_t Dimension>
1290 device_pointer = (memory_type != sycl::usm::alloc::unknown) ? ptr :
nullptr;
1291 host_pointer = (memory_type != sycl::usm::alloc::unknown) &&
1292 (memory_type != sycl::usm::alloc::device)
1311 const void *device_pointer =
nullptr;
1312 const void *host_pointer =
nullptr;
1313 unsigned int device_id = 0;
size_t get(int dimension) const
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barrie...
void wait()
Wait for the event.
Command group handler class.
void depends_on(event Event)
Registers event dependencies on this command group.
std::enable_if_t< detail::check_fn_signature< std::remove_reference_t< FuncT >, void()>::value||detail::check_fn_signature< std::remove_reference_t< FuncT >, void(interop_handle)>::value > host_task(FuncT &&Func)
Enqueues a command to the SYCL runtime to invoke Func once.
A unique identifier of an item in an index space.
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
void wait(const detail::code_location &CodeLoc=detail::code_location::current())
Performs a blocking wait for the completion of all enqueued tasks in the queue.
event memcpy(void *Dest, const void *Src, size_t Count, const detail::code_location &CodeLoc=detail::code_location::current())
Copies data from one memory region to another, each is either a host pointer or a pointer within USM ...
event fill(void *Ptr, const T &Pattern, size_t Count, const detail::code_location &CodeLoc=detail::code_location::current())
Fills the specified memory with the specified pattern.
event memset(void *Ptr, int Value, size_t Count, const detail::code_location &CodeLoc=detail::code_location::current())
Fills the memory pointed by a USM pointer with the value specified.
device get_device() const
context get_context() const
std::enable_if_t< std::is_invocable_r_v< void, T, handler & >, event > submit(T CGF, const detail::code_location &CodeLoc=detail::code_location::current())
Submits a command group function object to the queue, in order to be scheduled for execution on the d...
Defines the iteration domain of either a single work-group in a parallel dispatch,...
accessor(const accessor_t &acc, const sycl::range< 2 > &in_range)
pointer_t get_ptr() const
pointer_t operator[](size_t index) const
typename memory_t::template accessor_t< 2 > accessor_t
accessor(typename std::enable_if< Mem !=memory_region::local, const accessor_t >::type &acc)
typename memory_t::pointer_t pointer_t
typename memory_t::element_t element_t
accessor(pointer_t data, const sycl::range< 2 > &in_range)
accessor(pointer_t data, const sycl::range< 3 > &in_range)
accessor< T, Memory, 2 > operator[](size_t index) const
typename memory_t::template accessor_t< 3 > accessor_t
typename memory_t::element_t element_t
accessor(typename std::enable_if< M !=memory_region::local, const accessor_t >::type &acc)
typename memory_t::pointer_t pointer_t
accessor(const accessor_t &acc, const sycl::range< 3 > &in_range)
pointer_t get_ptr() const
accessor used as device function parameter.
unsigned int get_device_id(const sycl::device &dev)
static dev_mgr & instance()
Returns the instance of device manager singleton.
host_buffer(size_t size, sycl::queue q, const std::vector< sycl::event > &deps)
typename std::conditional_t< Memory==memory_region::constant, const T, T > element_t
typename std::remove_cv_t< T > value_t
static constexpr size_t type_size
typename std::conditional_t< target==target::local, sycl::local_accessor< T, Dimension >, sycl::accessor< T, Dimension, mode > > accessor_t
static constexpr sycl::access_mode mode
static constexpr sycl::access::address_space asp
void queues_wait_and_throw()
typename detail::memory_traits< Memory, T >::template accessor_t< 0 > accessor_t
device_memory(const value_t &val, sycl::queue q=get_default_queue())
Constructor with initial value.
device_memory(sycl::queue q=get_default_queue())
Default constructor.
typename base::value_t value_t
Device variable with address space of shared or global.
std::enable_if< Dim !=1, syclcompat_accessor_t >::type get_access(sycl::handler &cgh)
Get compat_accessor with dimension info for the device memory object when usm is used and dimension i...
device_memory(const sycl::range< Dimension > &range_in, sycl::queue q=get_default_queue())
Constructor with range.
syclcompat::accessor< T, Memory, Dimension > syclcompat_accessor_t
size_t get_size()
Get the device memory object size in bytes.
device_memory(Args... Arguments)
Constructor with range.
device_memory(const typename std::enable_if< Dim==2, sycl::range< 2 >>::type &in_range, std::initializer_list< std::initializer_list< value_t >> &&init_list, sycl::queue q=get_default_queue())
Constructor of 2-D array with initializer list.
std::enable_if< Dim==1, T >::type & operator[](size_t index)
value_t * get_ptr(sycl::queue q)
void init(sycl::queue q)
Allocate memory with specified queue, and init memory if has initial value.
device_memory(const sycl::range< Dimension > &in_range, std::initializer_list< value_t > &&init_list, sycl::queue q=get_default_queue())
Constructor of 1-D array with initializer list.
device_memory(sycl::queue q=get_default_queue())
void init()
Allocate memory with the queue specified in the constuctor, and init memory if has initial value.
void assign(value_t *src, size_t size)
The variable is assigned to a device pointer.
typename detail::memory_traits< Memory, T >::template accessor_t< Dimension > accessor_t
device_memory(Args... Arguments, sycl::queue q)
Constructor with range and queue.
typename detail::memory_traits< Memory, T >::value_t value_t
Pitched 2D/3D memory data.
pitched_data(void *data, size_t pitch, size_t x, size_t y)
void set_pitch(size_t pitch)
void set_data_ptr(void *data)
const void * get_host_pointer()
void init(const void *ptr, sycl::queue q=get_default_queue())
const void * get_device_pointer()
sycl::usm::alloc get_memory_type()
unsigned int get_device_id()
__ESIMD_API std::enable_if_t<(RegionT::length *sizeof(typename RegionT::element_type) >=2)> wait(sycl::ext::intel::esimd::simd_view< T, RegionT > value)
Create explicit scoreboard dependency to avoid device code motion across this call and preserve the v...
usm::alloc get_pointer_type(const void *ptr, const context &ctxt)
Query the allocation type from a USM pointer.
void * malloc_shared(size_t size, const device &dev, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
constexpr mode_tag_t< access_mode::read_write > read_write
auto auto autodecltype(x) z
device get_pointer_device(const void *ptr, const context &ctxt)
Queries the device against which the pointer was allocated Throws an exception with errc::invalid err...
void * malloc_device(size_t size, const device &dev, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
void * malloc_host(size_t size, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
void free(void *ptr, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
static experimental::memcpy_direction deduce_memcpy_direction(sycl::queue q, void *to_ptr, const void *from_ptr)
static pointer_access_attribute get_pointer_attribute(sycl::queue q, const void *ptr)
static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, size_t size, const std::vector< sycl::event > &dep_events={})
static constexpr size_t get_pitch(size_t x)
Calculate pitch (padded length of major dimension x) by rounding up to multiple of 32.
static sycl::event combine_events(std::vector< sycl::event > &events, sycl::queue q)
static sycl::event memset(sycl::queue q, void *dev_ptr, int value, size_t size)
Set value to the first size bytes starting from dev_ptr in q.
static size_t get_offset(sycl::id< 3 > id, size_t slice, size_t pitch)
static size_t get_copy_range(sycl::range< 3 > size, size_t slice, size_t pitch)
static void * malloc(size_t size, sycl::queue q)
static sycl::event fill(sycl::queue q, void *dev_ptr, const T &pattern, size_t count)
Set pattern to the first count elements of type T starting from dev_ptr.
static std::vector< sycl::event > memcpy(sycl::queue q, const experimental::memcpy_parameter ¶m)
static pitched_data to_pitched_data(image_matrix *image)
static void memcpy(const memcpy_parameter ¶m, sycl::queue q=get_default_queue())
[UNSUPPORTED] Synchronously copies 2D/3D memory data specified by param .
static void memcpy_async(const memcpy_parameter ¶m, sycl::queue q=get_default_queue())
[UNSUPPORTED] Asynchronously copies 2D/3D memory data specified by param
static sycl::event memset_d16_async(void *dev_ptr, unsigned short value, size_t size, sycl::queue q=get_default_queue())
Sets 2 bytes data value to the first size elements starting from dev_ptr in q asynchronously.
static void memcpy(void *to_ptr, const void *from_ptr, size_t size, sycl::queue q=get_default_queue())
Synchronously copies size bytes from the address specified by from_ptr to the address specified by to...
static void free(void *ptr, sycl::queue q=get_default_queue())
Free the memory ptr on the default queue without synchronizing.
static sycl::queue get_default_queue()
Util function to get the default queue of current device in device manager.
static device_ext & get_current_device()
Util function to get the current device.
static void memset_d32(void *dev_ptr, unsigned int value, size_t size, sycl::queue q=get_default_queue())
Sets 4 bytes data value to the first size elements starting from dev_ptr in q synchronously.
static void * malloc_host(size_t num_bytes, sycl::queue q=get_default_queue())
Allocate memory block on the host.
typename type_identity< T >::type type_identity_t
static sycl::event memcpy_async(void *to_ptr, const void *from_ptr, size_t size, sycl::queue q=get_default_queue())
Asynchronously copies size bytes from the address specified by from_ptr to the address specified by t...
static sycl::event memset_async(void *dev_ptr, int value, size_t size, sycl::queue q=get_default_queue())
Sets 1 byte data value to the first size elements starting from dev_ptr in q asynchronously.
static void fill(void *dev_ptr, const T &pattern, size_t count, sycl::queue q=get_default_queue())
Synchronously sets pattern to the first count elements starting from dev_ptr.
static void * malloc_shared(size_t num_bytes, sycl::queue q=get_default_queue())
Allocate memory block of usm_shared memory.
static void * malloc(size_t &pitch, size_t x, size_t y, sycl::queue q=get_default_queue())
Allocate memory block for 2D array on the device.
static sycl::event fill_async(void *dev_ptr, const T &pattern, size_t count, sycl::queue q=get_default_queue())
Asynchronously sets pattern to the first count elements starting from dev_ptr.
static void wait_and_free(void *ptr, sycl::queue q=get_default_queue())
Wait on the queue q and free the memory ptr.
sycl::event enqueue_free(const std::vector< void * > &pointers, const std::vector< sycl::event > &events, sycl::queue q=get_default_queue())
Enqueues the release of all pointers in /p pointers on the /p q.
static sycl::event memset_d32_async(void *dev_ptr, unsigned int value, size_t size, sycl::queue q=get_default_queue())
Sets 4 bytes data value to the first size elements starting from dev_ptr in q asynchronously.
static void memset_d16(void *dev_ptr, unsigned short value, size_t size, sycl::queue q=get_default_queue())
Sets 2 bytes data value to the first size elements starting from dev_ptr in q synchronously.
static void memset(void *dev_ptr, int value, size_t size, sycl::queue q=get_default_queue())
Synchronously sets value to the first size bytes starting from dev_ptr.
Memory copy parameters for 2D/3D memory data.
syclcompat::experimental::memcpy_direction direction