40 #include <type_traits>
41 #include <unordered_map>
49 #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
56 #if defined(__linux__)
64 #error "Only support Windows and Linux."
72 sycl::ext::oneapi::group_local_memory_for_overwrite<AllocT>(
73 sycl::ext::oneapi::this_work_item::get_work_group<3>());
74 auto *As = *As_multi_ptr;
104 : _data(data), _pitch(pitch), _x(x), _y(y) {}
120 size_t _pitch, _x, _y;
125 template <
class T, memory_region Memory,
size_t Dimension>
class accessor;
130 ? sycl::access::address_space::local_space
131 : sycl::access::address_space::global_space;
135 ? sycl::access_mode::read
142 template <
size_t Dimension = 1>
159 return ((x) + 31) & ~(0x1F);
169 static inline void *
malloc(
size_t &pitch,
size_t x,
size_t y,
size_t z,
172 return malloc(pitch * y * z, q);
187 return q.
fill(dev_ptr, pattern, count);
199 return q.
memset(dev_ptr, value, size);
209 template <
typename T>
210 static inline std::vector<sycl::event>
212 std::vector<sycl::event> event_list;
214 unsigned char *data_surface = (
unsigned char *)data.
get_data_ptr();
215 for (
size_t z = 0; z < size.
get(2); ++z) {
216 unsigned char *data_ptr = data_surface;
217 for (
size_t y = 0; y < size.
get(1); ++y) {
218 event_list.push_back(detail::fill<T>(q, data_ptr, value, size.
get(0)));
221 data_surface += slice;
236 template <
typename T>
238 size_t pitch,
const T &value,
239 size_t x,
size_t y) {
254 case sycl::usm::alloc::unknown:
256 case sycl::usm::alloc::device:
258 case sycl::usm::alloc::shared:
259 case sycl::usm::alloc::host:
265 const void *from_ptr) {
285 const std::vector<sycl::event> &dep_events = {}) {
288 return q.
memcpy(to_ptr, from_ptr, size, dep_events);
294 return slice * (size.
get(2) - 1) + pitch * (size.
get(1) - 1) + size.
get(0);
298 return slice *
id.get(2) + pitch *
id.get(1) +
id.get(0);
303 static inline std::vector<sycl::event>
307 const std::vector<sycl::event> &dep_events = {}) {
313 const std::vector<sycl::event> &_deps;
317 const std::vector<sycl::event> &deps)
318 : _buf(
std::
malloc(size)), _size(size), _q(q), _deps(deps) {}
319 void *get_ptr()
const {
return _buf; }
320 size_t get_size()
const {
return _size; }
330 std::vector<sycl::event> event_list;
332 size_t to_slice = to_range.
get(1) * to_range.
get(0);
333 size_t from_slice = from_range.
get(1) * from_range.
get(0);
334 unsigned char *to_surface =
335 (
unsigned char *)to_ptr +
get_offset(to_id, to_slice, to_range.
get(0));
336 const unsigned char *from_surface =
337 (
const unsigned char *)from_ptr +
340 if (to_slice == from_slice && to_slice == size.
get(1) * size.
get(0)) {
341 return {
memcpy(q, to_surface, from_surface, to_slice * size.
get(2),
345 size_t size_slice = size.
get(1) * size.
get(0);
348 for (
size_t z = 0;
z < size.
get(2); ++
z) {
349 unsigned char *to_ptr = to_surface;
350 const unsigned char *from_ptr = from_surface;
351 if (to_range.
get(0) == from_range.
get(0) &&
352 to_range.
get(0) == size.
get(0)) {
353 event_list.push_back(
354 memcpy(q, to_ptr, from_ptr, size_slice, dep_events));
356 for (
size_t y = 0;
y < size.
get(1); ++
y) {
357 event_list.push_back(
358 memcpy(q, to_ptr, from_ptr, size.
get(0), dep_events));
359 to_ptr += to_range.
get(0);
360 from_ptr += from_range.
get(0);
363 to_surface += to_slice;
364 from_surface += from_slice;
370 std::vector<sycl::event> host_events;
371 if (to_slice == size_slice) {
374 memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
379 memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
383 std::vector<sycl::event>{memcpy(q, buf.get_ptr(), to_surface,
384 buf.get_size(), dep_events)});
387 event_list.push_back(
388 memcpy(q, to_surface, buf.get_ptr(), buf.get_size(), host_events));
396 memcpy(q, to_surface, buf.get_ptr(), to_range, from_range,
399 std::vector<sycl::event>{memcpy(q, buf.get_ptr(), from_surface,
400 buf.get_size(), dep_events)});
405 cgh.depends_on(dep_events);
406 cgh.parallel_for<class memcpy_3d_detail>(size, [=](sycl::id<3> id) {
407 to_surface[get_offset(id, to_slice, to_range.get(0))] =
408 from_surface[get_offset(id, from_slice, from_range.get(0))];
413 throw std::runtime_error(
"[SYCLcompat] memcpy: invalid direction value");
419 static inline std::vector<sycl::event>
429 static inline std::vector<sycl::event>
431 size_t from_pitch,
size_t x,
size_t y) {
453 static inline void *
malloc(
size_t num_bytes,
463 template <
typename T>
482 template <
typename T>
501 template <
typename T>
527 static inline void *
malloc(
size_t &pitch,
size_t x,
size_t y,
550 const std::vector<sycl::event> &events,
554 cgh.depends_on(events);
555 cgh.host_task([=]() {
556 for (auto p : pointers)
573 static void memcpy(
void *to_ptr,
const void *from_ptr,
size_t size,
602 template <
typename T>
607 static_cast<const void *
>(from_ptr), count *
sizeof(T));
620 template <
typename T>
625 static_cast<const void *
>(from_ptr), count *
sizeof(T))
643 static inline void memcpy(
void *to_ptr,
size_t to_pitch,
const void *from_ptr,
644 size_t from_pitch,
size_t x,
size_t y,
665 const void *from_ptr,
size_t from_pitch,
668 auto events =
detail::memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch,
x,
y);
707 auto events =
detail::memcpy(q, to, to_pos, from, from_pos, size);
721 static void inline fill(
void *dev_ptr,
const T &pattern,
size_t count,
752 static void memset(
void *dev_ptr,
int value,
size_t size,
763 static inline void memset_d16(
void *dev_ptr,
unsigned short value,
size_t size,
765 detail::fill<unsigned short>(q, dev_ptr, value, size).wait();
774 static inline void memset_d32(
void *dev_ptr,
unsigned int value,
size_t size,
776 detail::fill<unsigned int>(q, dev_ptr, value, size).wait();
800 return detail::fill<unsigned short>(q, dev_ptr, value, size);
813 return detail::fill<unsigned int>(q, dev_ptr, value, size);
824 static inline void memset(
void *ptr,
size_t pitch,
int val,
size_t x,
size_t y,
837 static inline void memset_d16(
void *ptr,
size_t pitch,
unsigned short val,
851 static inline void memset_d32(
void *ptr,
size_t pitch,
unsigned int val,
870 auto events = detail::memset<unsigned char>(q, ptr, pitch, val,
x,
y);
932 auto events = detail::memset<unsigned char>(q, pitch, val, size);
937 template <
class T, memory_region Memory,
size_t Dimension>
class accessor;
938 template <
class T, memory_region Memory>
class accessor<T, Memory, 3> {
945 : _data(data), _range(in_range) {}
946 template <memory_region M = Memory>
947 accessor(
typename std::enable_if<M != memory_region::local,
951 :
accessor(acc.get_pointer(), in_range) {}
963 template <
class T, memory_region Memory>
class accessor<T, Memory, 2> {
970 : _data(data), _range(in_range) {}
971 template <memory_region Mem = Memory>
972 accessor(
typename std::enable_if<Mem != memory_region::local,
976 :
accessor(acc.get_pointer(), in_range) {}
979 return _data + _range.get(1) * index;
990 template <
class T, memory_region Memory,
size_t Dimension>
class device_memory {
1002 std::initializer_list<value_t> &&init_list,
1005 assert(init_list.size() <= in_range.
size());
1007 std::memset(_host_ptr, 0, _size);
1008 std::memcpy(_host_ptr, init_list.begin(), init_list.size() *
sizeof(T));
1012 template <
size_t Dim = Dimension>
1014 const typename std::enable_if<Dim == 2,
sycl::range<2>>::type &in_range,
1015 std::initializer_list<std::initializer_list<value_t>> &&init_list,
1018 assert(init_list.size() <= in_range[0]);
1020 std::memset(_host_ptr, 0, _size);
1021 auto tmp_data = _host_ptr;
1022 for (
auto sub_list : init_list) {
1023 assert(sub_list.size() <= in_range[1]);
1024 std::memcpy(tmp_data, sub_list.begin(), sub_list.size() *
sizeof(T));
1025 tmp_data += in_range[1];
1032 : _size(range_in.size() * sizeof(T)), _range(range_in), _reference(false),
1033 _host_ptr(nullptr), _device_ptr(nullptr), _q(q) {
1034 static_assert((Memory == memory_region::global) ||
1035 (Memory == memory_region::constant) ||
1036 (Memory == memory_region::usm_shared),
1037 "device memory region should be global, constant or shared");
1039 detail::dev_mgr::instance();
1045 template <
class... Args,
size_t Dim = Dimension,
1046 typename = std::enable_if_t<
sizeof...(Args) == Dim>>
1052 template <
class... Args>
1057 if (_device_ptr && !_reference)
1096 template <
size_t Dim = Dimension>
1097 typename std::enable_if<Dim == 1, T>::type &
operator[](
size_t index) {
1099 return _device_ptr[index];
1104 template <
size_t Dim = Dimension>
1105 typename std::enable_if<Dim != 1, syclcompat_accessor_t>::type
1113 : _size(size), _range(size / sizeof(T)), _reference(true),
1114 _device_ptr(memory_ptr), _q(q) {}
1117 if (Memory == memory_region::usm_shared) {
1122 #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
1123 if (Memory == memory_region::constant) {
1137 value_t *_device_ptr;
1140 template <
class T, memory_region Memory>
1150 :
base(
sycl::range<1>(1), {val}, q) {}
1156 template <
class T,
size_t Dimension>
1158 template <
class T,
size_t Dimension>
1160 template <
class T,
size_t Dimension>
1167 device_pointer = (memory_type != sycl::usm::alloc::unknown) ? ptr :
nullptr;
1168 host_pointer = (memory_type != sycl::usm::alloc::unknown) &&
1169 (memory_type != sycl::usm::alloc::device)
1173 device_id = detail::dev_mgr::instance().get_device_id(device_obj);
1188 const void *device_pointer =
nullptr;
1189 const void *host_pointer =
nullptr;
1190 unsigned int device_id = 0;
size_t get(int dimension) const
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barrie...
Command group handler class.
void depends_on(event Event)
Registers event dependencies on this command group.
std::enable_if_t< detail::check_fn_signature< std::remove_reference_t< FuncT >, void()>::value||detail::check_fn_signature< std::remove_reference_t< FuncT >, void(interop_handle)>::value > host_task(FuncT &&Func)
Enqueues a command to the SYCL runtime to invoke Func once.
A unique identifier of an item in an index space.
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
event memcpy(void *Dest, const void *Src, size_t Count, const detail::code_location &CodeLoc=detail::code_location::current())
Copies data from one memory region to another, each is either a host pointer or a pointer within USM ...
event fill(void *Ptr, const T &Pattern, size_t Count, const detail::code_location &CodeLoc=detail::code_location::current())
Fills the specified memory with the specified pattern.
event memset(void *Ptr, int Value, size_t Count, const detail::code_location &CodeLoc=detail::code_location::current())
Fills the memory pointed by a USM pointer with the value specified.
device get_device() const
context get_context() const
std::enable_if_t< std::is_invocable_r_v< void, T, handler & >, event > submit(T CGF, const detail::code_location &CodeLoc=detail::code_location::current())
Submits a command group function object to the queue, in order to be scheduled for execution on the d...
Defines the iteration domain of either a single work-group in a parallel dispatch,...
accessor(const accessor_t &acc, const sycl::range< 2 > &in_range)
pointer_t get_ptr() const
pointer_t operator[](size_t index) const
typename memory_t::template accessor_t< 2 > accessor_t
accessor(typename std::enable_if< Mem !=memory_region::local, const accessor_t >::type &acc)
typename memory_t::pointer_t pointer_t
typename memory_t::element_t element_t
accessor(pointer_t data, const sycl::range< 2 > &in_range)
accessor(pointer_t data, const sycl::range< 3 > &in_range)
accessor< T, Memory, 2 > operator[](size_t index) const
typename memory_t::template accessor_t< 3 > accessor_t
typename memory_t::element_t element_t
accessor(typename std::enable_if< M !=memory_region::local, const accessor_t >::type &acc)
typename memory_t::pointer_t pointer_t
accessor(const accessor_t &acc, const sycl::range< 3 > &in_range)
pointer_t get_ptr() const
accessor used as device function parameter.
typename std::conditional_t< Memory==memory_region::constant, const T, T > element_t
typename std::remove_cv_t< T > value_t
static constexpr size_t type_size
typename std::conditional_t< target==target::local, sycl::local_accessor< T, Dimension >, sycl::accessor< T, Dimension, mode > > accessor_t
static constexpr sycl::access_mode mode
static constexpr sycl::access::address_space asp
typename detail::memory_traits< Memory, T >::template accessor_t< 0 > accessor_t
device_memory(const value_t &val, sycl::queue q=get_default_queue())
Constructor with initial value.
device_memory(sycl::queue q=get_default_queue())
Default constructor.
typename base::value_t value_t
Device variable with address space of shared or global.
std::enable_if< Dim !=1, syclcompat_accessor_t >::type get_access(sycl::handler &cgh)
Get compat_accessor with dimension info for the device memory object when usm is used and dimension i...
device_memory(const sycl::range< Dimension > &range_in, sycl::queue q=get_default_queue())
Constructor with range.
size_t get_size()
Get the device memory object size in bytes.
device_memory(Args... Arguments)
Constructor with range.
device_memory(const typename std::enable_if< Dim==2, sycl::range< 2 >>::type &in_range, std::initializer_list< std::initializer_list< value_t >> &&init_list, sycl::queue q=get_default_queue())
Constructor of 2-D array with initializer list.
std::enable_if< Dim==1, T >::type & operator[](size_t index)
value_t * get_ptr(sycl::queue q)
void init(sycl::queue q)
Allocate memory with specified queue, and init memory if has initial value.
device_memory(const sycl::range< Dimension > &in_range, std::initializer_list< value_t > &&init_list, sycl::queue q=get_default_queue())
Constructor of 1-D array with initializer list.
device_memory(sycl::queue q=get_default_queue())
void init()
Allocate memory with the queue specified in the constuctor, and init memory if has initial value.
void assign(value_t *src, size_t size)
The variable is assigned to a device pointer.
typename detail::memory_traits< Memory, T >::template accessor_t< Dimension > accessor_t
device_memory(Args... Arguments, sycl::queue q)
Constructor with range and queue.
typename detail::memory_traits< Memory, T >::value_t value_t
Pitched 2D/3D memory data.
pitched_data(void *data, size_t pitch, size_t x, size_t y)
void set_pitch(size_t pitch)
void set_data_ptr(void *data)
const void * get_host_pointer()
void init(const void *ptr, sycl::queue q=get_default_queue())
const void * get_device_pointer()
sycl::usm::alloc get_memory_type()
unsigned int get_device_id()
__ESIMD_API std::enable_if_t<(RegionT::length *sizeof(typename RegionT::element_type) >=2)> wait(sycl::ext::intel::esimd::simd_view< T, RegionT > value)
Create explicit scoreboard dependency to avoid device code motion across this call and preserve the v...
usm::alloc get_pointer_type(const void *ptr, const context &ctxt)
Query the allocation type from a USM pointer.
void * malloc_shared(size_t size, const device &dev, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
constexpr mode_tag_t< access_mode::read_write > read_write
auto auto autodecltype(x) z
device get_pointer_device(const void *ptr, const context &ctxt)
Queries the device against which the pointer was allocated Throws an invalid_object_error if ptr is a...
void * malloc_device(size_t size, const device &dev, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
void * malloc_host(size_t size, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
void free(void *ptr, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
static pointer_access_attribute get_pointer_attribute(sycl::queue q, const void *ptr)
static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, size_t size, const std::vector< sycl::event > &dep_events={})
static constexpr size_t get_pitch(size_t x)
Calculate pitch (padded length of major dimension x) by rounding up to multiple of 32.
static sycl::event combine_events(std::vector< sycl::event > &events, sycl::queue q)
static sycl::event memset(sycl::queue q, void *dev_ptr, int value, size_t size)
Set value to the first size bytes starting from dev_ptr in q.
static size_t get_offset(sycl::id< 3 > id, size_t slice, size_t pitch)
static memcpy_direction deduce_memcpy_direction(sycl::queue q, void *to_ptr, const void *from_ptr)
static size_t get_copy_range(sycl::range< 3 > size, size_t slice, size_t pitch)
static void * malloc(size_t size, sycl::queue q)
static sycl::event fill(sycl::queue q, void *dev_ptr, const T &pattern, size_t count)
Set pattern to the first count elements of type T starting from dev_ptr.
static void memcpy(void *to_ptr, const void *from_ptr, size_t size, sycl::queue q=get_default_queue())
Synchronously copies size bytes from the address specified by from_ptr to the address specified by to...
static sycl::event memset_d32_async(void *ptr, size_t pitch, unsigned int val, size_t x, size_t y, sycl::queue q=get_default_queue())
Sets 4 bytes data val to the pitched 2D memory region pointed by ptr in q asynchronously.
static void free(void *ptr, sycl::queue q=get_default_queue())
free
static sycl::queue get_default_queue()
Util function to get the default queue of current device in device manager.
static device_ext & get_current_device()
Util function to get the current device.
static void memset_d16(void *ptr, size_t pitch, unsigned short val, size_t x, size_t y, sycl::queue q=get_default_queue())
Sets 2 bytes data val to the pitched 2D memory region pointed by ptr in q synchronously.
static void memset(pitched_data pitch, int val, sycl::range< 3 > size, sycl::queue q=get_default_queue())
Sets value to the 3D memory region specified by pitch in q.
static void * malloc_host(size_t num_bytes, sycl::queue q=get_default_queue())
Allocate memory block on the host.
typename type_identity< T >::type type_identity_t
static void fill(void *dev_ptr, const T &pattern, size_t count, sycl::queue q=get_default_queue())
Synchronously sets pattern to the first count elements starting from dev_ptr.
static void * malloc_shared(size_t num_bytes, sycl::queue q=get_default_queue())
Allocate memory block of usm_shared memory.
static sycl::event memset_async(pitched_data pitch, int val, sycl::range< 3 > size, sycl::queue q=get_default_queue())
Sets value to the 3D memory region specified by pitch in q.
sycl::event free_async(const std::vector< void * > &pointers, const std::vector< sycl::event > &events, sycl::queue q=get_default_queue())
Free the device memory pointed by a batch of pointers in pointers which are related to q after events...
static sycl::event memcpy_async(pitched_data to, sycl::id< 3 > to_pos, pitched_data from, sycl::id< 3 > from_pos, sycl::range< 3 > size, sycl::queue q=get_default_queue())
Asynchronously copies a subset of a 3D matrix specified by to to another 3D matrix specified by from.
static void * malloc(size_t &pitch, size_t x, size_t y, sycl::queue q=get_default_queue())
Allocate memory block for 2D array on the device.
static sycl::event fill_async(void *dev_ptr, const T &pattern, size_t count, sycl::queue q=get_default_queue())
Asynchronously sets pattern to the first count elements starting from dev_ptr.
static void memcpy(pitched_data to, sycl::id< 3 > to_pos, pitched_data from, sycl::id< 3 > from_pos, sycl::range< 3 > size, sycl::queue q=get_default_queue())
Synchronously copies a subset of a 3D matrix specified by to to another 3D matrix specified by from.
static sycl::event memset_d16_async(void *ptr, size_t pitch, unsigned short val, size_t x, size_t y, sycl::queue q=get_default_queue())
Sets 2 bytes data val to the pitched 2D memory region pointed by ptr in q asynchronously.
static void memset_d32(void *ptr, size_t pitch, unsigned int val, size_t x, size_t y, sycl::queue q=get_default_queue())
Sets 4 bytes data val to the pitched 2D memory region pointed by ptr in q synchronously.