DPC++ Runtime
Runtime libraries for oneAPI DPC++
memory.hpp
Go to the documentation of this file.
1 /***************************************************************************
2  *
3  * Copyright (C) Codeplay Software Ltd.
4  *
5  * Part of the LLVM Project, under the Apache License v2.0 with LLVM
6  * Exceptions. See https://llvm.org/LICENSE.txt for license information.
7  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  *
15  * SYCL compatibility extension
16  *
17  * memory.hpp
18  *
19  * Description:
20  * memory functionality for the SYCL compatibility extension
21  **************************************************************************/
22 
23 // The original source was under the license below:
24 //==---- memory.hpp -------------------------------*- C++ -*----------------==//
25 //
26 // Copyright (C) Intel Corporation
27 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
28 // See https://llvm.org/LICENSE.txt for license information.
29 //
30 //===----------------------------------------------------------------------===//
31 
32 #pragma once
33 
34 #include <cassert>
35 #include <cstdint>
36 #include <cstring>
37 #include <map>
38 #include <mutex>
39 #include <thread>
40 #include <type_traits>
41 #include <unordered_map>
42 #include <utility>
43 
44 #include <sycl/builtins.hpp>
46 #include <sycl/group.hpp>
47 #include <sycl/usm.hpp>
48 
49 #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
51 #endif
52 
53 #include <syclcompat/device.hpp>
54 #include <syclcompat/traits.hpp>
55 
56 #if defined(__linux__)
57 #include <sys/mman.h>
58 #elif defined(_WIN64)
59 #ifndef NOMINMAX
60 #define NOMINMAX
61 #endif
62 #include <windows.h>
63 #else
64 #error "Only support Windows and Linux."
65 #endif
66 
67 namespace syclcompat {
68 
69 template <typename AllocT> auto *local_mem() {
71  As_multi_ptr =
72  sycl::ext::oneapi::group_local_memory_for_overwrite<AllocT>(
73  sycl::ext::oneapi::this_work_item::get_work_group<3>());
74  auto *As = *As_multi_ptr;
75  return As;
76 }
77 
78 namespace detail {
84  automatic
85 };
86 } // namespace detail
87 
88 enum class memory_region {
89  global = 0, // device global memory
90  constant, // device read-only memory
91  local, // device local memory
92  usm_shared, // memory which can be accessed by host and device
93 };
94 
95 enum class target { device, local };
96 
97 using byte_t = uint8_t;
98 
101 public:
102  pitched_data() : pitched_data(nullptr, 0, 0, 0) {}
103  pitched_data(void *data, size_t pitch, size_t x, size_t y)
104  : _data(data), _pitch(pitch), _x(x), _y(y) {}
105 
106  void *get_data_ptr() { return _data; }
107  void set_data_ptr(void *data) { _data = data; }
108 
109  size_t get_pitch() { return _pitch; }
110  void set_pitch(size_t pitch) { _pitch = pitch; }
111 
112  size_t get_x() { return _x; }
113  void set_x(size_t x) { _x = x; };
114 
115  size_t get_y() { return _y; }
116  void set_y(size_t y) { _y = y; }
117 
118 private:
119  void *_data;
120  size_t _pitch, _x, _y;
121 };
122 
123 namespace detail {
124 
125 template <class T, memory_region Memory, size_t Dimension> class accessor;
126 template <memory_region Memory, class T = byte_t> class memory_traits {
127 public:
128  static constexpr sycl::access::address_space asp =
129  (Memory == memory_region::local)
130  ? sycl::access::address_space::local_space
131  : sycl::access::address_space::global_space;
132  static constexpr target target =
134  static constexpr sycl::access_mode mode = (Memory == memory_region::constant)
135  ? sycl::access_mode::read
137  static constexpr size_t type_size = sizeof(T);
138  using element_t =
139  typename std::conditional_t<Memory == memory_region::constant, const T,
140  T>;
141  using value_t = typename std::remove_cv_t<T>;
142  template <size_t Dimension = 1>
143  using accessor_t =
144  typename std::conditional_t<target == target::local,
147  using pointer_t = T *;
148 };
149 
150 static inline void *malloc(size_t size, sycl::queue q) {
151  return sycl::malloc_device(size, q.get_device(), q.get_context());
152 }
153 
158 static inline constexpr size_t get_pitch(size_t x) {
159  return ((x) + 31) & ~(0x1F);
160 }
161 
169 static inline void *malloc(size_t &pitch, size_t x, size_t y, size_t z,
170  sycl::queue q) {
171  pitch = get_pitch(x);
172  return malloc(pitch * y * z, q);
173 }
174 
184 template <class T>
185 static inline sycl::event fill(sycl::queue q, void *dev_ptr, const T &pattern,
186  size_t count) {
187  return q.fill(dev_ptr, pattern, count);
188 }
189 
197 static inline sycl::event memset(sycl::queue q, void *dev_ptr, int value,
198  size_t size) {
199  return q.memset(dev_ptr, value, size);
200 }
201 
209 template <typename T>
210 static inline std::vector<sycl::event>
211 memset(sycl::queue q, pitched_data data, const T &value, sycl::range<3> size) {
212  std::vector<sycl::event> event_list;
213  size_t slice = data.get_pitch() * data.get_y();
214  unsigned char *data_surface = (unsigned char *)data.get_data_ptr();
215  for (size_t z = 0; z < size.get(2); ++z) {
216  unsigned char *data_ptr = data_surface;
217  for (size_t y = 0; y < size.get(1); ++y) {
218  event_list.push_back(detail::fill<T>(q, data_ptr, value, size.get(0)));
219  data_ptr += data.get_pitch();
220  }
221  data_surface += slice;
222  }
223  return event_list;
224 }
225 
236 template <typename T>
237 static inline std::vector<sycl::event> memset(sycl::queue q, void *ptr,
238  size_t pitch, const T &value,
239  size_t x, size_t y) {
240  return memset(q, pitched_data(ptr, pitch, x, 1), value,
241  sycl::range<3>(x, y, 1));
242 }
243 
245  host_only = 0,
246  device_only,
247  host_device,
248  end
249 };
250 
252  const void *ptr) {
253  switch (sycl::get_pointer_type(ptr, q.get_context())) {
254  case sycl::usm::alloc::unknown:
256  case sycl::usm::alloc::device:
258  case sycl::usm::alloc::shared:
259  case sycl::usm::alloc::host:
261  }
262 }
263 
265  const void *from_ptr) {
266  // table[to_attribute][from_attribute]
267  static const memcpy_direction
268  direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
269  [static_cast<unsigned>(pointer_access_attribute::end)] = {
279  return direction_table[static_cast<unsigned>(get_pointer_attribute(
280  q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
281 }
282 
283 static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr,
284  size_t size,
285  const std::vector<sycl::event> &dep_events = {}) {
286  if (!size)
287  return sycl::event{};
288  return q.memcpy(to_ptr, from_ptr, size, dep_events);
289 }
290 
291 // Get actual copy range and make sure it will not exceed range.
292 static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
293  size_t pitch) {
294  return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
295 }
296 
297 static inline size_t get_offset(sycl::id<3> id, size_t slice, size_t pitch) {
298  return slice * id.get(2) + pitch * id.get(1) + id.get(0);
299 }
300 
303 static inline std::vector<sycl::event>
304 memcpy(sycl::queue q, void *to_ptr, const void *from_ptr,
305  sycl::range<3> to_range, sycl::range<3> from_range, sycl::id<3> to_id,
306  sycl::id<3> from_id, sycl::range<3> size,
307  const std::vector<sycl::event> &dep_events = {}) {
308  // RAII for host pointer
309  class host_buffer {
310  void *_buf;
311  size_t _size;
312  sycl::queue _q;
313  const std::vector<sycl::event> &_deps; // free operation depends
314 
315  public:
316  host_buffer(size_t size, sycl::queue q,
317  const std::vector<sycl::event> &deps)
318  : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
319  void *get_ptr() const { return _buf; }
320  size_t get_size() const { return _size; }
321  ~host_buffer() {
322  if (_buf) {
323  _q.submit([&](sycl::handler &cgh) {
324  cgh.depends_on(_deps);
325  cgh.host_task([buf = _buf] { std::free(buf); });
326  });
327  }
328  }
329  };
330  std::vector<sycl::event> event_list;
331 
332  size_t to_slice = to_range.get(1) * to_range.get(0);
333  size_t from_slice = from_range.get(1) * from_range.get(0);
334  unsigned char *to_surface =
335  (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
336  const unsigned char *from_surface =
337  (const unsigned char *)from_ptr +
338  get_offset(from_id, from_slice, from_range.get(0));
339 
340  if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) {
341  return {memcpy(q, to_surface, from_surface, to_slice * size.get(2),
342  dep_events)};
343  }
344  memcpy_direction direction = deduce_memcpy_direction(q, to_ptr, from_ptr);
345  size_t size_slice = size.get(1) * size.get(0);
346  switch (direction) {
347  case host_to_host:
348  for (size_t z = 0; z < size.get(2); ++z) {
349  unsigned char *to_ptr = to_surface;
350  const unsigned char *from_ptr = from_surface;
351  if (to_range.get(0) == from_range.get(0) &&
352  to_range.get(0) == size.get(0)) {
353  event_list.push_back(
354  memcpy(q, to_ptr, from_ptr, size_slice, dep_events));
355  } else {
356  for (size_t y = 0; y < size.get(1); ++y) {
357  event_list.push_back(
358  memcpy(q, to_ptr, from_ptr, size.get(0), dep_events));
359  to_ptr += to_range.get(0);
360  from_ptr += from_range.get(0);
361  }
362  }
363  to_surface += to_slice;
364  from_surface += from_slice;
365  }
366  break;
367  case host_to_device: {
368  host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
369  event_list);
370  std::vector<sycl::event> host_events;
371  if (to_slice == size_slice) {
372  // Copy host data to a temp host buffer with the shape of target.
373  host_events =
374  memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
375  sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, dep_events);
376  } else {
377  // Copy host data to a temp host buffer with the shape of target.
378  host_events =
379  memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
380  sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
381  // If has padding data, not sure whether it is useless. So fill
382  // temp buffer with it.
383  std::vector<sycl::event>{memcpy(q, buf.get_ptr(), to_surface,
384  buf.get_size(), dep_events)});
385  }
386  // Copy from temp host buffer to device with only one submit.
387  event_list.push_back(
388  memcpy(q, to_surface, buf.get_ptr(), buf.get_size(), host_events));
389  break;
390  }
391  case device_to_host: {
392  host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
393  event_list);
394  // Copy from host temp buffer to host target with reshaping.
395  event_list =
396  memcpy(q, to_surface, buf.get_ptr(), to_range, from_range,
397  sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
398  // Copy from device to temp host buffer with only one submit.
399  std::vector<sycl::event>{memcpy(q, buf.get_ptr(), from_surface,
400  buf.get_size(), dep_events)});
401  break;
402  }
403  case device_to_device:
404  event_list.push_back(q.submit([&](sycl::handler &cgh) {
405  cgh.depends_on(dep_events);
406  cgh.parallel_for<class memcpy_3d_detail>(size, [=](sycl::id<3> id) {
407  to_surface[get_offset(id, to_slice, to_range.get(0))] =
408  from_surface[get_offset(id, from_slice, from_range.get(0))];
409  });
410  }));
411  break;
412  default:
413  throw std::runtime_error("[SYCLcompat] memcpy: invalid direction value");
414  }
415  return event_list;
416 }
417 
419 static inline std::vector<sycl::event>
421  sycl::id<3> from_id, sycl::range<3> size) {
422  return memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
423  sycl::range<3>(to.get_pitch(), to.get_y(), 1),
424  sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id,
425  from_id, size);
426 }
427 
429 static inline std::vector<sycl::event>
430 memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, size_t to_pitch,
431  size_t from_pitch, size_t x, size_t y) {
432  return memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
433  sycl::range<3>(from_pitch, y, 1), sycl::id<3>(0, 0, 0),
434  sycl::id<3>(0, 0, 0), sycl::range<3>(x, y, 1));
435 }
436 
437 // Takes a std::vector<sycl::event> & returns a single event
438 // which simply depends on all of them
439 static sycl::event combine_events(std::vector<sycl::event> &events,
440  sycl::queue q) {
441  return q.submit([&events](sycl::handler &cgh) {
442  cgh.depends_on(events);
443  cgh.host_task([]() {});
444  });
445 }
446 
447 } // namespace detail
448 
453 static inline void *malloc(size_t num_bytes,
455  return detail::malloc(num_bytes, q);
456 }
457 
463 template <typename T>
464 static inline T *malloc(size_t count, sycl::queue q = get_default_queue()) {
465  return static_cast<T *>(detail::malloc(count * sizeof(T), q));
466 }
467 
472 static inline void *malloc_host(size_t num_bytes,
474  return sycl::malloc_host(num_bytes, q);
475 }
476 
482 template <typename T>
483 static inline T *malloc_host(size_t count,
485  return static_cast<T *>(sycl::malloc_host(count * sizeof(T), q));
486 }
487 
492 static inline void *malloc_shared(size_t num_bytes,
494  return sycl::malloc_shared(num_bytes, q);
495 }
496 
501 template <typename T>
502 static inline T *malloc_shared(size_t count,
504  return static_cast<T *>(sycl::malloc_shared(count * sizeof(T), q));
505 }
506 
511 static inline pitched_data malloc(sycl::range<3> size,
513  pitched_data pitch(nullptr, 0, size.get(0), size.get(1));
514  size_t pitch_size;
515  pitch.set_data_ptr(
516  detail::malloc(pitch_size, size.get(0), size.get(1), size.get(2), q));
517  pitch.set_pitch(pitch_size);
518  return pitch;
519 }
520 
527 static inline void *malloc(size_t &pitch, size_t x, size_t y,
529  return detail::malloc(pitch, x, y, 1, q);
530 }
531 
536 static inline void free(void *ptr, sycl::queue q = get_default_queue()) {
537  if (ptr) {
538  sycl::free(ptr, q.get_context());
539  }
540 }
541 
548 // Can't be static due to the friend declaration in the memory header.
549 inline sycl::event free_async(const std::vector<void *> &pointers,
550  const std::vector<sycl::event> &events,
552  auto event = q.submit(
553  [&pointers, &events, ctxt = q.get_context()](sycl::handler &cgh) {
554  cgh.depends_on(events);
555  cgh.host_task([=]() {
556  for (auto p : pointers)
557  sycl::free(p, ctxt);
558  });
559  });
560  get_current_device().add_event(event);
561  return event;
562 }
563 
573 static void memcpy(void *to_ptr, const void *from_ptr, size_t size,
575  detail::memcpy(q, to_ptr, from_ptr, size).wait();
576 }
577 
587 static sycl::event memcpy_async(void *to_ptr, const void *from_ptr, size_t size,
589  return detail::memcpy(q, to_ptr, from_ptr, size);
590 }
591 
602 template <typename T>
603 static sycl::event
605  size_t count, sycl::queue q = get_default_queue()) {
606  return detail::memcpy(q, static_cast<void *>(to_ptr),
607  static_cast<const void *>(from_ptr), count * sizeof(T));
608 }
609 
620 template <typename T>
621 static void memcpy(type_identity_t<T> *to_ptr,
622  const type_identity_t<T> *from_ptr, size_t count,
624  detail::memcpy(q, static_cast<void *>(to_ptr),
625  static_cast<const void *>(from_ptr), count * sizeof(T))
626  .wait();
627 }
628 
643 static inline void memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
644  size_t from_pitch, size_t x, size_t y,
647  detail::memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y));
648 }
649 
664 static inline sycl::event memcpy_async(void *to_ptr, size_t to_pitch,
665  const void *from_ptr, size_t from_pitch,
666  size_t x, size_t y,
668  auto events = detail::memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y);
669  return detail::combine_events(events, q);
670 }
671 
675 // The function will return after the copy is completed.
684 static inline void memcpy(pitched_data to, sycl::id<3> to_pos,
685  pitched_data from, sycl::id<3> from_pos,
686  sycl::range<3> size,
688  sycl::event::wait(detail::memcpy(q, to, to_pos, from, from_pos, size));
689 }
690 
704  pitched_data from, sycl::id<3> from_pos,
705  sycl::range<3> size,
707  auto events = detail::memcpy(q, to, to_pos, from, from_pos, size);
708  return detail::combine_events(events, q);
709 }
710 
720 template <class T>
721 static void inline fill(void *dev_ptr, const T &pattern, size_t count,
723  detail::fill(q, dev_ptr, pattern, count).wait();
724 }
725 
737 template <class T>
738 static sycl::event inline fill_async(void *dev_ptr, const T &pattern,
739  size_t count,
741  return detail::fill(q, dev_ptr, pattern, count);
742 }
743 
752 static void memset(void *dev_ptr, int value, size_t size,
754  detail::memset(q, dev_ptr, value, size).wait();
755 }
756 
763 static inline void memset_d16(void *dev_ptr, unsigned short value, size_t size,
765  detail::fill<unsigned short>(q, dev_ptr, value, size).wait();
766 }
767 
774 static inline void memset_d32(void *dev_ptr, unsigned int value, size_t size,
776  detail::fill<unsigned int>(q, dev_ptr, value, size).wait();
777 }
778 
785 static inline sycl::event memset_async(void *dev_ptr, int value, size_t size,
787  return detail::memset(q, dev_ptr, value, size);
788 }
789 
797 static inline sycl::event
798 memset_d16_async(void *dev_ptr, unsigned short value, size_t size,
800  return detail::fill<unsigned short>(q, dev_ptr, value, size);
801 }
802 
810 static inline sycl::event
811 memset_d32_async(void *dev_ptr, unsigned int value, size_t size,
813  return detail::fill<unsigned int>(q, dev_ptr, value, size);
814 }
815 
824 static inline void memset(void *ptr, size_t pitch, int val, size_t x, size_t y,
826  sycl::event::wait(detail::memset<unsigned char>(q, ptr, pitch, val, x, y));
827 }
828 
837 static inline void memset_d16(void *ptr, size_t pitch, unsigned short val,
838  size_t x, size_t y,
840  sycl::event::wait(detail::memset(q, ptr, pitch, val, x, y));
841 }
842 
851 static inline void memset_d32(void *ptr, size_t pitch, unsigned int val,
852  size_t x, size_t y,
854  sycl::event::wait(detail::memset(q, ptr, pitch, val, x, y));
855 }
856 
866 static inline sycl::event memset_async(void *ptr, size_t pitch, int val,
867  size_t x, size_t y,
869 
870  auto events = detail::memset<unsigned char>(q, ptr, pitch, val, x, y);
871  return detail::combine_events(events, q);
872 }
873 
883 static inline sycl::event
884 memset_d16_async(void *ptr, size_t pitch, unsigned short val, size_t x,
885  size_t y, sycl::queue q = get_default_queue()) {
886  auto events = detail::memset(q, ptr, pitch, val, x, y);
887  return detail::combine_events(events, q);
888 }
889 
899 static inline sycl::event
900 memset_d32_async(void *ptr, size_t pitch, unsigned int val, size_t x, size_t y,
902  auto events = detail::memset(q, ptr, pitch, val, x, y);
903  return detail::combine_events(events, q);
904 }
905 
915 static inline void memset(pitched_data pitch, int val, sycl::range<3> size,
917  sycl::event::wait(detail::memset<unsigned char>(q, pitch, val, size));
918 }
919 
929 static inline sycl::event memset_async(pitched_data pitch, int val,
930  sycl::range<3> size,
932  auto events = detail::memset<unsigned char>(q, pitch, val, size);
933  return detail::combine_events(events, q);
934 }
935 
937 template <class T, memory_region Memory, size_t Dimension> class accessor;
938 template <class T, memory_region Memory> class accessor<T, Memory, 3> {
939 public:
941  using element_t = typename memory_t::element_t;
942  using pointer_t = typename memory_t::pointer_t;
943  using accessor_t = typename memory_t::template accessor_t<3>;
944  accessor(pointer_t data, const sycl::range<3> &in_range)
945  : _data(data), _range(in_range) {}
946  template <memory_region M = Memory>
947  accessor(typename std::enable_if<M != memory_region::local,
948  const accessor_t>::type &acc)
949  : accessor(acc, acc.get_range()) {}
950  accessor(const accessor_t &acc, const sycl::range<3> &in_range)
951  : accessor(acc.get_pointer(), in_range) {}
952  accessor<T, Memory, 2> operator[](size_t index) const {
953  sycl::range<2> sub(_range.get(1), _range.get(2));
954  return accessor<T, Memory, 2>(_data + index * sub.size(), sub);
955  }
956 
957  pointer_t get_ptr() const { return _data; }
958 
959 private:
960  pointer_t _data;
961  sycl::range<3> _range;
962 };
963 template <class T, memory_region Memory> class accessor<T, Memory, 2> {
964 public:
966  using element_t = typename memory_t::element_t;
967  using pointer_t = typename memory_t::pointer_t;
968  using accessor_t = typename memory_t::template accessor_t<2>;
969  accessor(pointer_t data, const sycl::range<2> &in_range)
970  : _data(data), _range(in_range) {}
971  template <memory_region Mem = Memory>
972  accessor(typename std::enable_if<Mem != memory_region::local,
973  const accessor_t>::type &acc)
974  : accessor(acc, acc.get_range()) {}
975  accessor(const accessor_t &acc, const sycl::range<2> &in_range)
976  : accessor(acc.get_pointer(), in_range) {}
977 
978  pointer_t operator[](size_t index) const {
979  return _data + _range.get(1) * index;
980  }
981 
982  pointer_t get_ptr() const { return _data; }
983 
984 private:
985  pointer_t _data;
986  sycl::range<2> _range;
987 };
988 
990 template <class T, memory_region Memory, size_t Dimension> class device_memory {
991 public:
992  using accessor_t =
996 
998  : device_memory(sycl::range<Dimension>(1), q) {}
999 
1002  std::initializer_list<value_t> &&init_list,
1004  : device_memory(in_range, q) {
1005  assert(init_list.size() <= in_range.size());
1006  _host_ptr = (value_t *)std::malloc(_size);
1007  std::memset(_host_ptr, 0, _size);
1008  std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T));
1009  }
1010 
1012  template <size_t Dim = Dimension>
1014  const typename std::enable_if<Dim == 2, sycl::range<2>>::type &in_range,
1015  std::initializer_list<std::initializer_list<value_t>> &&init_list,
1017  : device_memory(in_range, q) {
1018  assert(init_list.size() <= in_range[0]);
1019  _host_ptr = (value_t *)std::malloc(_size);
1020  std::memset(_host_ptr, 0, _size);
1021  auto tmp_data = _host_ptr;
1022  for (auto sub_list : init_list) {
1023  assert(sub_list.size() <= in_range[1]);
1024  std::memcpy(tmp_data, sub_list.begin(), sub_list.size() * sizeof(T));
1025  tmp_data += in_range[1];
1026  }
1027  }
1028 
1032  : _size(range_in.size() * sizeof(T)), _range(range_in), _reference(false),
1033  _host_ptr(nullptr), _device_ptr(nullptr), _q(q) {
1034  static_assert((Memory == memory_region::global) ||
1035  (Memory == memory_region::constant) ||
1036  (Memory == memory_region::usm_shared),
1037  "device memory region should be global, constant or shared");
1038  // Make sure that singleton class dev_mgr will destruct later than this.
1039  detail::dev_mgr::instance();
1040  }
1041 
1043  // enable_if_t SFINAE to avoid ambiguity with
1044  // device_memory(Args... Arguments, sycl::queue q)
1045  template <class... Args, size_t Dim = Dimension,
1046  typename = std::enable_if_t<sizeof...(Args) == Dim>>
1047  device_memory(Args... Arguments)
1048  : device_memory(sycl::range<Dimension>(Arguments...),
1049  get_default_queue()) {}
1050 
1052  template <class... Args>
1053  device_memory(Args... Arguments, sycl::queue q)
1054  : device_memory(sycl::range<Dimension>(Arguments...), q) {}
1055 
1057  if (_device_ptr && !_reference)
1058  syclcompat::free(_device_ptr, _q);
1059  if (_host_ptr)
1060  std::free(_host_ptr);
1061  }
1062 
1065  void init() { init(_q); }
1068  void init(sycl::queue q) {
1069  if (_device_ptr)
1070  return;
1071  if (!_size)
1072  return;
1073  allocate_device(q);
1074  if (_host_ptr)
1075  detail::memcpy(q, _device_ptr, _host_ptr, _size);
1076  }
1077 
1079  void assign(value_t *src, size_t size) {
1080  this->~device_memory();
1081  new (this) device_memory(src, size, _q);
1082  }
1083 
1084  // Get memory pointer of the memory object, a device USM pointer.
1085  value_t *get_ptr() { return get_ptr(_q); }
1086 
1087  // Get memory pointer of the memory object, a device USM pointer.
1089  init(q);
1090  return _device_ptr;
1091  }
1092 
1094  size_t get_size() { return _size; }
1095 
1096  template <size_t Dim = Dimension>
1097  typename std::enable_if<Dim == 1, T>::type &operator[](size_t index) {
1098  init();
1099  return _device_ptr[index];
1100  }
1101 
1104  template <size_t Dim = Dimension>
1105  typename std::enable_if<Dim != 1, syclcompat_accessor_t>::type
1107  return syclcompat_accessor_t((T *)_device_ptr, _range);
1108  }
1109 
1110 private:
1111  device_memory(value_t *memory_ptr, size_t size,
1113  : _size(size), _range(size / sizeof(T)), _reference(true),
1114  _device_ptr(memory_ptr), _q(q) {}
1115 
1116  void allocate_device(sycl::queue q) {
1117  if (Memory == memory_region::usm_shared) {
1118  _device_ptr = (value_t *)sycl::malloc_shared(_size, q.get_device(),
1119  q.get_context());
1120  return;
1121  }
1122 #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
1123  if (Memory == memory_region::constant) {
1124  _device_ptr = (value_t *)sycl::malloc_device(
1125  _size, q.get_device(), q.get_context(),
1127  return;
1128  }
1129 #endif
1130  _device_ptr = (value_t *)detail::malloc(_size, q);
1131  }
1132 
1133  size_t _size;
1134  sycl::range<Dimension> _range;
1135  bool _reference;
1136  value_t *_host_ptr;
1137  value_t *_device_ptr;
1138  sycl::queue _q;
1139 };
1140 template <class T, memory_region Memory>
1141 class device_memory<T, Memory, 0> : public device_memory<T, Memory, 1> {
1142 public:
1144  using value_t = typename base::value_t;
1145  using accessor_t =
1147 
1150  : base(sycl::range<1>(1), {val}, q) {}
1151 
1154 };
1155 
1156 template <class T, size_t Dimension>
1158 template <class T, size_t Dimension>
1160 template <class T, size_t Dimension>
1162 
1164 public:
1165  void init(const void *ptr, sycl::queue q = get_default_queue()) {
1166  memory_type = sycl::get_pointer_type(ptr, q.get_context());
1167  device_pointer = (memory_type != sycl::usm::alloc::unknown) ? ptr : nullptr;
1168  host_pointer = (memory_type != sycl::usm::alloc::unknown) &&
1169  (memory_type != sycl::usm::alloc::device)
1170  ? ptr
1171  : nullptr;
1172  sycl::device device_obj = sycl::get_pointer_device(ptr, q.get_context());
1173  device_id = detail::dev_mgr::instance().get_device_id(device_obj);
1174  }
1175 
1176  sycl::usm::alloc get_memory_type() { return memory_type; }
1177 
1178  const void *get_device_pointer() { return device_pointer; }
1179 
1180  const void *get_host_pointer() { return host_pointer; }
1181 
1182  bool is_memory_shared() { return memory_type == sycl::usm::alloc::shared; }
1183 
1184  unsigned int get_device_id() { return device_id; }
1185 
1186 private:
1187  sycl::usm::alloc memory_type = sycl::usm::alloc::unknown;
1188  const void *device_pointer = nullptr;
1189  const void *host_pointer = nullptr;
1190  unsigned int device_id = 0;
1191 };
1192 
1193 } // namespace syclcompat
size_t get(int dimension) const
Definition: array.hpp:62
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:64
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barrie...
Definition: event.hpp:44
Command group handler class.
Definition: handler.hpp:458
void depends_on(event Event)
Registers event dependencies on this command group.
Definition: handler.cpp:1427
std::enable_if_t< detail::check_fn_signature< std::remove_reference_t< FuncT >, void()>::value||detail::check_fn_signature< std::remove_reference_t< FuncT >, void(interop_handle)>::value > host_task(FuncT &&Func)
Enqueues a command to the SYCL runtime to invoke Func once.
Definition: handler.hpp:2040
A unique identifier of an item in an index space.
Definition: id.hpp:36
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
Definition: queue.hpp:111
event memcpy(void *Dest, const void *Src, size_t Count, const detail::code_location &CodeLoc=detail::code_location::current())
Copies data from one memory region to another, each is either a host pointer or a pointer within USM ...
Definition: queue.cpp:125
event fill(void *Ptr, const T &Pattern, size_t Count, const detail::code_location &CodeLoc=detail::code_location::current())
Fills the specified memory with the specified pattern.
Definition: queue.hpp:486
event memset(void *Ptr, int Value, size_t Count, const detail::code_location &CodeLoc=detail::code_location::current())
Fills the memory pointed by a USM pointer with the value specified.
Definition: queue.cpp:106
device get_device() const
Definition: queue.cpp:76
context get_context() const
Definition: queue.cpp:74
std::enable_if_t< std::is_invocable_r_v< void, T, handler & >, event > submit(T CGF, const detail::code_location &CodeLoc=detail::code_location::current())
Submits a command group function object to the queue, in order to be scheduled for execution on the d...
Definition: queue.hpp:346
Defines the iteration domain of either a single work-group in a parallel dispatch,...
Definition: range.hpp:26
size_t size() const
Definition: range.hpp:56
accessor(const accessor_t &acc, const sycl::range< 2 > &in_range)
Definition: memory.hpp:975
pointer_t operator[](size_t index) const
Definition: memory.hpp:978
typename memory_t::template accessor_t< 2 > accessor_t
Definition: memory.hpp:968
accessor(typename std::enable_if< Mem !=memory_region::local, const accessor_t >::type &acc)
Definition: memory.hpp:972
typename memory_t::pointer_t pointer_t
Definition: memory.hpp:967
typename memory_t::element_t element_t
Definition: memory.hpp:966
accessor(pointer_t data, const sycl::range< 2 > &in_range)
Definition: memory.hpp:969
accessor(pointer_t data, const sycl::range< 3 > &in_range)
Definition: memory.hpp:944
accessor< T, Memory, 2 > operator[](size_t index) const
Definition: memory.hpp:952
typename memory_t::template accessor_t< 3 > accessor_t
Definition: memory.hpp:943
typename memory_t::element_t element_t
Definition: memory.hpp:941
accessor(typename std::enable_if< M !=memory_region::local, const accessor_t >::type &acc)
Definition: memory.hpp:947
typename memory_t::pointer_t pointer_t
Definition: memory.hpp:942
accessor(const accessor_t &acc, const sycl::range< 3 > &in_range)
Definition: memory.hpp:950
accessor used as device function parameter.
Definition: memory.hpp:937
typename std::conditional_t< Memory==memory_region::constant, const T, T > element_t
Definition: memory.hpp:140
typename std::remove_cv_t< T > value_t
Definition: memory.hpp:141
static constexpr size_t type_size
Definition: memory.hpp:137
typename std::conditional_t< target==target::local, sycl::local_accessor< T, Dimension >, sycl::accessor< T, Dimension, mode > > accessor_t
Definition: memory.hpp:146
static constexpr sycl::access_mode mode
Definition: memory.hpp:134
static constexpr sycl::access::address_space asp
Definition: memory.hpp:128
typename detail::memory_traits< Memory, T >::template accessor_t< 0 > accessor_t
Definition: memory.hpp:1146
device_memory(const value_t &val, sycl::queue q=get_default_queue())
Constructor with initial value.
Definition: memory.hpp:1149
device_memory(sycl::queue q=get_default_queue())
Default constructor.
Definition: memory.hpp:1153
Device variable with address space of shared or global.
Definition: memory.hpp:990
std::enable_if< Dim !=1, syclcompat_accessor_t >::type get_access(sycl::handler &cgh)
Get compat_accessor with dimension info for the device memory object when usm is used and dimension i...
Definition: memory.hpp:1106
device_memory(const sycl::range< Dimension > &range_in, sycl::queue q=get_default_queue())
Constructor with range.
Definition: memory.hpp:1030
size_t get_size()
Get the device memory object size in bytes.
Definition: memory.hpp:1094
device_memory(Args... Arguments)
Constructor with range.
Definition: memory.hpp:1047
device_memory(const typename std::enable_if< Dim==2, sycl::range< 2 >>::type &in_range, std::initializer_list< std::initializer_list< value_t >> &&init_list, sycl::queue q=get_default_queue())
Constructor of 2-D array with initializer list.
Definition: memory.hpp:1013
std::enable_if< Dim==1, T >::type & operator[](size_t index)
Definition: memory.hpp:1097
value_t * get_ptr(sycl::queue q)
Definition: memory.hpp:1088
void init(sycl::queue q)
Allocate memory with specified queue, and init memory if has initial value.
Definition: memory.hpp:1068
device_memory(const sycl::range< Dimension > &in_range, std::initializer_list< value_t > &&init_list, sycl::queue q=get_default_queue())
Constructor of 1-D array with initializer list.
Definition: memory.hpp:1001
device_memory(sycl::queue q=get_default_queue())
Definition: memory.hpp:997
void init()
Allocate memory with the queue specified in the constuctor, and init memory if has initial value.
Definition: memory.hpp:1065
void assign(value_t *src, size_t size)
The variable is assigned to a device pointer.
Definition: memory.hpp:1079
typename detail::memory_traits< Memory, T >::template accessor_t< Dimension > accessor_t
Definition: memory.hpp:993
device_memory(Args... Arguments, sycl::queue q)
Constructor with range and queue.
Definition: memory.hpp:1053
typename detail::memory_traits< Memory, T >::value_t value_t
Definition: memory.hpp:994
Pitched 2D/3D memory data.
Definition: memory.hpp:100
void set_y(size_t y)
Definition: memory.hpp:116
pitched_data(void *data, size_t pitch, size_t x, size_t y)
Definition: memory.hpp:103
void set_x(size_t x)
Definition: memory.hpp:113
void set_pitch(size_t pitch)
Definition: memory.hpp:110
void set_data_ptr(void *data)
Definition: memory.hpp:107
const void * get_host_pointer()
Definition: memory.hpp:1180
void init(const void *ptr, sycl::queue q=get_default_queue())
Definition: memory.hpp:1165
const void * get_device_pointer()
Definition: memory.hpp:1178
sycl::usm::alloc get_memory_type()
Definition: memory.hpp:1176
unsigned int get_device_id()
Definition: memory.hpp:1184
__ESIMD_API std::enable_if_t<(RegionT::length *sizeof(typename RegionT::element_type) >=2)> wait(sycl::ext::intel::esimd::simd_view< T, RegionT > value)
Create explicit scoreboard dependency to avoid device code motion across this call and preserve the v...
Definition: memory.hpp:435
access::mode access_mode
Definition: access.hpp:72
usm::alloc get_pointer_type(const void *ptr, const context &ctxt)
Query the allocation type from a USM pointer.
Definition: usm_impl.cpp:575
void * malloc_shared(size_t size, const device &dev, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
Definition: usm_impl.cpp:408
constexpr mode_tag_t< access_mode::read_write > read_write
Definition: access.hpp:85
auto auto autodecltype(x) z
device get_pointer_device(const void *ptr, const context &ctxt)
Queries the device against which the pointer was allocated Throws an invalid_object_error if ptr is a...
Definition: usm_impl.cpp:626
void * malloc_device(size_t size, const device &dev, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
Definition: usm_impl.cpp:320
autodecltype(x) x
void * malloc_host(size_t size, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
Definition: usm_impl.cpp:383
void free(void *ptr, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
Definition: usm_impl.cpp:374
Definition: access.hpp:18
static pointer_access_attribute get_pointer_attribute(sycl::queue q, const void *ptr)
Definition: memory.hpp:251
static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, size_t size, const std::vector< sycl::event > &dep_events={})
Definition: memory.hpp:283
static constexpr size_t get_pitch(size_t x)
Calculate pitch (padded length of major dimension x) by rounding up to multiple of 32.
Definition: memory.hpp:158
static sycl::event combine_events(std::vector< sycl::event > &events, sycl::queue q)
Definition: memory.hpp:439
static sycl::event memset(sycl::queue q, void *dev_ptr, int value, size_t size)
Set value to the first size bytes starting from dev_ptr in q.
Definition: memory.hpp:197
static size_t get_offset(sycl::id< 3 > id, size_t slice, size_t pitch)
Definition: memory.hpp:297
static memcpy_direction deduce_memcpy_direction(sycl::queue q, void *to_ptr, const void *from_ptr)
Definition: memory.hpp:264
static size_t get_copy_range(sycl::range< 3 > size, size_t slice, size_t pitch)
Definition: memory.hpp:292
static void * malloc(size_t size, sycl::queue q)
Definition: memory.hpp:150
static sycl::event fill(sycl::queue q, void *dev_ptr, const T &pattern, size_t count)
Set pattern to the first count elements of type T starting from dev_ptr.
Definition: memory.hpp:185
static void memcpy(void *to_ptr, const void *from_ptr, size_t size, sycl::queue q=get_default_queue())
Synchronously copies size bytes from the address specified by from_ptr to the address specified by to...
Definition: memory.hpp:573
static sycl::event memset_d32_async(void *ptr, size_t pitch, unsigned int val, size_t x, size_t y, sycl::queue q=get_default_queue())
Sets 4 bytes data val to the pitched 2D memory region pointed by ptr in q asynchronously.
Definition: memory.hpp:900
uint8_t byte_t
Definition: memory.hpp:97
static void free(void *ptr, sycl::queue q=get_default_queue())
free
Definition: memory.hpp:536
static sycl::queue get_default_queue()
Util function to get the default queue of current device in device manager.
Definition: device.hpp:744
auto * local_mem()
Definition: memory.hpp:69
static device_ext & get_current_device()
Util function to get the current device.
Definition: device.hpp:772
static void memset_d16(void *ptr, size_t pitch, unsigned short val, size_t x, size_t y, sycl::queue q=get_default_queue())
Sets 2 bytes data val to the pitched 2D memory region pointed by ptr in q synchronously.
Definition: memory.hpp:837
static void memset(pitched_data pitch, int val, sycl::range< 3 > size, sycl::queue q=get_default_queue())
Sets value to the 3D memory region specified by pitch in q.
Definition: memory.hpp:915
static void * malloc_host(size_t num_bytes, sycl::queue q=get_default_queue())
Allocate memory block on the host.
Definition: memory.hpp:472
typename type_identity< T >::type type_identity_t
Definition: traits.hpp:35
static void fill(void *dev_ptr, const T &pattern, size_t count, sycl::queue q=get_default_queue())
Synchronously sets pattern to the first count elements starting from dev_ptr.
Definition: memory.hpp:721
static void * malloc_shared(size_t num_bytes, sycl::queue q=get_default_queue())
Allocate memory block of usm_shared memory.
Definition: memory.hpp:492
static sycl::event memset_async(pitched_data pitch, int val, sycl::range< 3 > size, sycl::queue q=get_default_queue())
Sets value to the 3D memory region specified by pitch in q.
Definition: memory.hpp:929
sycl::event free_async(const std::vector< void * > &pointers, const std::vector< sycl::event > &events, sycl::queue q=get_default_queue())
Free the device memory pointed by a batch of pointers in pointers which are related to q after events...
Definition: memory.hpp:549
static sycl::event memcpy_async(pitched_data to, sycl::id< 3 > to_pos, pitched_data from, sycl::id< 3 > from_pos, sycl::range< 3 > size, sycl::queue q=get_default_queue())
Asynchronously copies a subset of a 3D matrix specified by to to another 3D matrix specified by from.
Definition: memory.hpp:703
static void * malloc(size_t &pitch, size_t x, size_t y, sycl::queue q=get_default_queue())
Allocate memory block for 2D array on the device.
Definition: memory.hpp:527
static sycl::event fill_async(void *dev_ptr, const T &pattern, size_t count, sycl::queue q=get_default_queue())
Asynchronously sets pattern to the first count elements starting from dev_ptr.
Definition: memory.hpp:738
static void memcpy(pitched_data to, sycl::id< 3 > to_pos, pitched_data from, sycl::id< 3 > from_pos, sycl::range< 3 > size, sycl::queue q=get_default_queue())
Synchronously copies a subset of a 3D matrix specified by to to another 3D matrix specified by from.
Definition: memory.hpp:684
static sycl::event memset_d16_async(void *ptr, size_t pitch, unsigned short val, size_t x, size_t y, sycl::queue q=get_default_queue())
Sets 2 bytes data val to the pitched 2D memory region pointed by ptr in q asynchronously.
Definition: memory.hpp:884
static void memset_d32(void *ptr, size_t pitch, unsigned int val, size_t x, size_t y, sycl::queue q=get_default_queue())
Sets 4 bytes data val to the pitched 2D memory region pointed by ptr in q synchronously.
Definition: memory.hpp:851