DPC++ Runtime
Runtime libraries for oneAPI DPC++
memory.hpp
Go to the documentation of this file.
1 /***************************************************************************
2  *
3  * Copyright (C) Codeplay Software Ltd.
4  *
5  * Part of the LLVM Project, under the Apache License v2.0 with LLVM
6  * Exceptions. See https://llvm.org/LICENSE.txt for license information.
7  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  *
15  * SYCL compatibility extension
16  *
17  * memory.hpp
18  *
19  * Description:
20  * memory functionality for the SYCL compatibility extension
21  **************************************************************************/
22 
23 // The original source was under the license below:
24 //==---- memory.hpp -------------------------------*- C++ -*----------------==//
25 //
26 // Copyright (C) Intel Corporation
27 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
28 // See https://llvm.org/LICENSE.txt for license information.
29 //
30 //===----------------------------------------------------------------------===//
31 
32 #pragma once
33 
34 #include <cassert>
35 #include <cstdint>
36 #include <cstring>
37 #include <map>
38 #include <mutex>
39 #include <thread>
40 #include <type_traits>
41 #include <unordered_map>
42 #include <utility>
43 
44 #include <sycl/builtins.hpp>
47 #include <sycl/usm.hpp>
48 
49 #include <syclcompat/device.hpp>
50 #include <syclcompat/traits.hpp>
51 
52 #if defined(__linux__)
53 #include <sys/mman.h>
54 #elif defined(_WIN64)
55 #define NOMINMAX
56 #include <windows.h>
57 #else
58 #error "Only support Windows and Linux."
59 #endif
60 
61 namespace syclcompat {
62 
63 template <typename AllocT> auto *local_mem() {
65  As_multi_ptr = sycl::ext::oneapi::group_local_memory<AllocT>(
66  sycl::ext::oneapi::experimental::this_nd_item<3>().get_group());
67  auto *As = *As_multi_ptr;
68  return As;
69 }
70 
71 namespace detail {
77  automatic
78 };
79 } // namespace detail
80 
81 enum class memory_region {
82  global = 0, // device global memory
83  constant, // device read-only memory
84  local, // device local memory
85  usm_shared, // memory which can be accessed by host and device
86 };
87 
88 enum class target { device, local };
89 
90 using byte_t = uint8_t;
91 
93 class pitched_data {
94 public:
95  pitched_data() : pitched_data(nullptr, 0, 0, 0) {}
96  pitched_data(void *data, size_t pitch, size_t x, size_t y)
97  : _data(data), _pitch(pitch), _x(x), _y(y) {}
98 
99  void *get_data_ptr() { return _data; }
100  void set_data_ptr(void *data) { _data = data; }
101 
102  size_t get_pitch() { return _pitch; }
103  void set_pitch(size_t pitch) { _pitch = pitch; }
104 
105  size_t get_x() { return _x; }
106  void set_x(size_t x) { _x = x; };
107 
108  size_t get_y() { return _y; }
109  void set_y(size_t y) { _y = y; }
110 
111 private:
112  void *_data;
113  size_t _pitch, _x, _y;
114 };
115 
116 namespace detail {
117 
118 template <class T, memory_region Memory, size_t Dimension> class accessor;
119 template <memory_region Memory, class T = byte_t> class memory_traits {
120 public:
121  static constexpr sycl::access::address_space asp =
122  (Memory == memory_region::local)
123  ? sycl::access::address_space::local_space
124  : sycl::access::address_space::global_space;
125  static constexpr syclcompat::target target = (Memory == memory_region::local)
128  static constexpr sycl::access_mode mode = (Memory == memory_region::constant)
129  ? sycl::access_mode::read
131  static constexpr size_t type_size = sizeof(T);
132  using element_t =
133  typename std::conditional_t<Memory == memory_region::constant, const T,
134  T>;
135  using value_t = typename std::remove_cv_t<T>;
136  template <size_t Dimension = 1>
137  using accessor_t =
138  typename std::conditional_t<target == syclcompat::target::local,
141  using pointer_t = T *;
142 };
143 
144 static inline void *malloc(size_t size, sycl::queue q) {
145  return sycl::malloc_device(size, q.get_device(), q.get_context());
146 }
147 
152 static inline constexpr size_t get_pitch(size_t x) {
153  return ((x) + 31) & ~(0x1F);
154 }
155 
156 static inline void *malloc(size_t &pitch, size_t x, size_t y, size_t z,
157  sycl::queue q) {
158  pitch = get_pitch(x);
159  return malloc(pitch * y * z, q);
160 }
161 
171 template <class T>
172 static inline sycl::event fill(sycl::queue q, void *dev_ptr, const T &pattern,
173  size_t count) {
174  return q.fill(dev_ptr, pattern, count);
175 }
176 
184 static inline sycl::event memset(sycl::queue q, void *dev_ptr, int value,
185  size_t size) {
186  return q.memset(dev_ptr, value, size);
187 }
188 
197 static inline std::vector<sycl::event> memset(sycl::queue q, pitched_data data,
198  int value, sycl::range<3> size) {
199  std::vector<sycl::event> event_list;
200  size_t slice = data.get_pitch() * data.get_y();
201  unsigned char *data_surface = (unsigned char *)data.get_data_ptr();
202  for (size_t z = 0; z < size.get(2); ++z) {
203  unsigned char *data_ptr = data_surface;
204  for (size_t y = 0; y < size.get(1); ++y) {
205  event_list.push_back(memset(q, data_ptr, value, size.get(0)));
206  data_ptr += data.get_pitch();
207  }
208  data_surface += slice;
209  }
210  return event_list;
211 }
212 
214 static inline std::vector<sycl::event>
215 memset(sycl::queue q, void *ptr, size_t pitch, int val, size_t x, size_t y) {
216  return memset(q, pitched_data(ptr, pitch, x, 1), val,
217  sycl::range<3>(x, y, 1));
218 }
219 
221  host_only = 0,
222  device_only,
223  host_device,
224  end
225 };
226 
228  const void *ptr) {
229  switch (sycl::get_pointer_type(ptr, q.get_context())) {
230  case sycl::usm::alloc::unknown:
232  case sycl::usm::alloc::device:
234  case sycl::usm::alloc::shared:
235  case sycl::usm::alloc::host:
237  }
238 }
239 
241  const void *from_ptr) {
242  // table[to_attribute][from_attribute]
243  static const memcpy_direction
244  direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
245  [static_cast<unsigned>(pointer_access_attribute::end)] = {
255  return direction_table[static_cast<unsigned>(get_pointer_attribute(
256  q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
257 }
258 
259 static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr,
260  size_t size,
261  const std::vector<sycl::event> &dep_events = {}) {
262  if (!size)
263  return sycl::event{};
264  return q.memcpy(to_ptr, from_ptr, size, dep_events);
265 }
266 
267 // Get actual copy range and make sure it will not exceed range.
268 static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
269  size_t pitch) {
270  return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
271 }
272 
273 static inline size_t get_offset(sycl::id<3> id, size_t slice, size_t pitch) {
274  return slice * id.get(2) + pitch * id.get(1) + id.get(0);
275 }
276 
279 static inline std::vector<sycl::event>
280 memcpy(sycl::queue q, void *to_ptr, const void *from_ptr,
281  sycl::range<3> to_range, sycl::range<3> from_range, sycl::id<3> to_id,
282  sycl::id<3> from_id, sycl::range<3> size,
283  const std::vector<sycl::event> &dep_events = {}) {
284  // RAII for host pointer
285  class host_buffer {
286  void *_buf;
287  size_t _size;
288  sycl::queue _q;
289  const std::vector<sycl::event> &_deps; // free operation depends
290 
291  public:
292  host_buffer(size_t size, sycl::queue q,
293  const std::vector<sycl::event> &deps)
294  : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
295  void *get_ptr() const { return _buf; }
296  size_t get_size() const { return _size; }
297  ~host_buffer() {
298  if (_buf) {
299  _q.submit([&](sycl::handler &cgh) {
300  cgh.depends_on(_deps);
301  cgh.host_task([buf = _buf] { std::free(buf); });
302  });
303  }
304  }
305  };
306  std::vector<sycl::event> event_list;
307 
308  size_t to_slice = to_range.get(1) * to_range.get(0);
309  size_t from_slice = from_range.get(1) * from_range.get(0);
310  unsigned char *to_surface =
311  (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
312  const unsigned char *from_surface =
313  (const unsigned char *)from_ptr +
314  get_offset(from_id, from_slice, from_range.get(0));
315 
316  if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) {
317  return {memcpy(q, to_surface, from_surface, to_slice * size.get(2),
318  dep_events)};
319  }
320  memcpy_direction direction = deduce_memcpy_direction(q, to_ptr, from_ptr);
321  size_t size_slice = size.get(1) * size.get(0);
322  switch (direction) {
323  case host_to_host:
324  for (size_t z = 0; z < size.get(2); ++z) {
325  unsigned char *to_ptr = to_surface;
326  const unsigned char *from_ptr = from_surface;
327  if (to_range.get(0) == from_range.get(0) &&
328  to_range.get(0) == size.get(0)) {
329  event_list.push_back(
330  memcpy(q, to_ptr, from_ptr, size_slice, dep_events));
331  } else {
332  for (size_t y = 0; y < size.get(1); ++y) {
333  event_list.push_back(
334  memcpy(q, to_ptr, from_ptr, size.get(0), dep_events));
335  to_ptr += to_range.get(0);
336  from_ptr += from_range.get(0);
337  }
338  }
339  to_surface += to_slice;
340  from_surface += from_slice;
341  }
342  break;
343  case host_to_device: {
344  host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
345  event_list);
346  std::vector<sycl::event> host_events;
347  if (to_slice == size_slice) {
348  // Copy host data to a temp host buffer with the shape of target.
349  host_events =
350  memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
351  sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, dep_events);
352  } else {
353  // Copy host data to a temp host buffer with the shape of target.
354  host_events =
355  memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
356  sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
357  // If has padding data, not sure whether it is useless. So fill
358  // temp buffer with it.
359  std::vector<sycl::event>{memcpy(q, buf.get_ptr(), to_surface,
360  buf.get_size(), dep_events)});
361  }
362  // Copy from temp host buffer to device with only one submit.
363  event_list.push_back(
364  memcpy(q, to_surface, buf.get_ptr(), buf.get_size(), host_events));
365  break;
366  }
367  case device_to_host: {
368  host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
369  event_list);
370  // Copy from host temp buffer to host target with reshaping.
371  event_list =
372  memcpy(q, to_surface, buf.get_ptr(), to_range, from_range,
373  sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
374  // Copy from device to temp host buffer with only one submit.
375  std::vector<sycl::event>{memcpy(q, buf.get_ptr(), from_surface,
376  buf.get_size(), dep_events)});
377  break;
378  }
379  case device_to_device:
380  event_list.push_back(q.submit([&](sycl::handler &cgh) {
381  cgh.depends_on(dep_events);
382  cgh.parallel_for<class memcpy_3d_detail>(size, [=](sycl::id<3> id) {
383  to_surface[get_offset(id, to_slice, to_range.get(0))] =
384  from_surface[get_offset(id, from_slice, from_range.get(0))];
385  });
386  }));
387  break;
388  default:
389  throw std::runtime_error("syclcompat::"
390  "memcpy: invalid direction value");
391  }
392  return event_list;
393 }
394 
396 static inline std::vector<sycl::event>
398  sycl::id<3> from_id, sycl::range<3> size) {
399  return memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
400  sycl::range<3>(to.get_pitch(), to.get_y(), 1),
401  sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id,
402  from_id, size);
403 }
404 
406 static inline std::vector<sycl::event>
407 memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, size_t to_pitch,
408  size_t from_pitch, size_t x, size_t y) {
409  return memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
410  sycl::range<3>(from_pitch, y, 1), sycl::id<3>(0, 0, 0),
411  sycl::id<3>(0, 0, 0), sycl::range<3>(x, y, 1));
412 }
413 
414 // Takes a std::vector<sycl::event> & returns a single event
415 // which simply depends on all of them
416 static sycl::event combine_events(std::vector<sycl::event> &events,
417  sycl::queue q) {
418  return q.submit([&events](sycl::handler &cgh) {
419  cgh.depends_on(events);
420  cgh.host_task([]() {});
421  });
422 }
423 
424 } // namespace detail
425 
430 static inline void *malloc(size_t num_bytes,
432  return detail::malloc(num_bytes, q);
433 }
434 
440 template <typename T>
441 static inline T *malloc(size_t count, sycl::queue q = get_default_queue()) {
442  return static_cast<T *>(detail::malloc(count * sizeof(T), q));
443 }
444 
449 static inline void *malloc_host(size_t num_bytes,
451  return sycl::malloc_host(num_bytes, q);
452 }
453 
459 template <typename T>
460 static inline T *malloc_host(size_t count,
462  return static_cast<T *>(sycl::malloc_host(count * sizeof(T), q));
463 }
464 
469 static inline void *malloc_shared(size_t num_bytes,
471  return sycl::malloc_shared(num_bytes, q);
472 }
473 
478 template <typename T>
479 static inline T *malloc_shared(size_t count,
481  return static_cast<T *>(sycl::malloc_shared(count * sizeof(T), q));
482 }
483 
488 static inline pitched_data malloc(sycl::range<3> size,
490  pitched_data pitch(nullptr, 0, size.get(0), size.get(1));
491  size_t pitch_size;
492  pitch.set_data_ptr(
493  detail::malloc(pitch_size, size.get(0), size.get(1), size.get(2), q));
494  pitch.set_pitch(pitch_size);
495  return pitch;
496 }
497 
504 static inline void *malloc(size_t &pitch, size_t x, size_t y,
506  return detail::malloc(pitch, x, y, 1, q);
507 }
508 
513 static inline void free(void *ptr, sycl::queue q = get_default_queue()) {
514  if (ptr) {
515  sycl::free(ptr, q.get_context());
516  }
517 }
518 
525 inline sycl::event free_async(const std::vector<void *> &pointers,
526  const std::vector<sycl::event> &events,
528  auto event = q.submit(
529  [&pointers, &events, ctxt = q.get_context()](sycl::handler &cgh) {
530  cgh.depends_on(events);
531  cgh.host_task([=]() {
532  for (auto p : pointers)
533  sycl::free(p, ctxt);
534  });
535  });
536  get_current_device().add_event(event);
537  return event;
538 }
539 
549 static void memcpy(void *to_ptr, const void *from_ptr, size_t size,
551  detail::memcpy(q, to_ptr, from_ptr, size).wait();
552 }
553 
563 static sycl::event memcpy_async(void *to_ptr, const void *from_ptr, size_t size,
565  return detail::memcpy(q, to_ptr, from_ptr, size);
566 }
567 
578 template <typename T>
579 static sycl::event
581  size_t count, sycl::queue q = get_default_queue()) {
582  return detail::memcpy(q, static_cast<void *>(to_ptr),
583  static_cast<const void *>(from_ptr), count * sizeof(T));
584 }
585 
596 template <typename T>
597 static void memcpy(type_identity_t<T> *to_ptr,
598  const type_identity_t<T> *from_ptr, size_t count,
600  detail::memcpy(q, static_cast<void *>(to_ptr),
601  static_cast<const void *>(from_ptr), count * sizeof(T))
602  .wait();
603 }
604 
619 static inline void memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
620  size_t from_pitch, size_t x, size_t y,
623  detail::memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y));
624 }
625 
640 static inline sycl::event memcpy_async(void *to_ptr, size_t to_pitch,
641  const void *from_ptr, size_t from_pitch,
642  size_t x, size_t y,
644  auto events = detail::memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y);
645  return detail::combine_events(events, q);
646 }
647 
651 // The function will return after the copy is completed.
660 static inline void memcpy(pitched_data to, sycl::id<3> to_pos,
661  pitched_data from, sycl::id<3> from_pos,
662  sycl::range<3> size,
664  sycl::event::wait(detail::memcpy(q, to, to_pos, from, from_pos, size));
665 }
666 
680  pitched_data from, sycl::id<3> from_pos,
681  sycl::range<3> size,
683  auto events = detail::memcpy(q, to, to_pos, from, from_pos, size);
684  return detail::combine_events(events, q);
685 }
686 
696 template <class T>
697 static void inline fill(void *dev_ptr, const T &pattern, size_t count,
699  detail::fill(q, dev_ptr, pattern, count).wait();
700 }
701 
713 template <class T>
714 static sycl::event inline fill_async(void *dev_ptr, const T &pattern,
715  size_t count,
717  return detail::fill(q, dev_ptr, pattern, count);
718 }
719 
728 static void memset(void *dev_ptr, int value, size_t size,
730  detail::memset(q, dev_ptr, value, size).wait();
731 }
732 
741 static sycl::event memset_async(void *dev_ptr, int value, size_t size,
743  return detail::memset(q, dev_ptr, value, size);
744 }
745 
758 static inline void memset(void *ptr, size_t pitch, int val, size_t x, size_t y,
760  sycl::event::wait(detail::memset(q, ptr, pitch, val, x, y));
761 }
762 
775 static inline sycl::event memset_async(void *ptr, size_t pitch, int val,
776  size_t x, size_t y,
778  auto events = detail::memset(q, ptr, pitch, val, x, y);
779  return detail::combine_events(events, q);
780 }
781 
791 static inline void memset(pitched_data pitch, int val, sycl::range<3> size,
793  sycl::event::wait(detail::memset(q, pitch, val, size));
794 }
795 
805 static inline sycl::event memset_async(pitched_data pitch, int val,
806  sycl::range<3> size,
808  auto events = detail::memset(q, pitch, val, size);
809  return detail::combine_events(events, q);
810 }
811 
813 template <class T, memory_region Memory, size_t Dimension> class accessor;
814 template <class T, memory_region Memory> class accessor<T, Memory, 3> {
815 public:
817  using element_t = typename memory_t::element_t;
818  using pointer_t = typename memory_t::pointer_t;
819  using accessor_t = typename memory_t::template accessor_t<3>;
820  accessor(pointer_t data, const sycl::range<3> &in_range)
821  : _data(data), _range(in_range) {}
822  template <memory_region M = Memory>
823  accessor(typename std::enable_if<M != memory_region::local,
824  const accessor_t>::type &acc)
825  : accessor(acc, acc.get_range()) {}
826  accessor(const accessor_t &acc, const sycl::range<3> &in_range)
827  : accessor(acc.get_pointer(), in_range) {}
828  accessor<T, Memory, 2> operator[](size_t index) const {
829  sycl::range<2> sub(_range.get(1), _range.get(2));
830  return accessor<T, Memory, 2>(_data + index * sub.size(), sub);
831  }
832 
833  pointer_t get_ptr() const { return _data; }
834 
835 private:
836  pointer_t _data;
837  sycl::range<3> _range;
838 };
839 template <class T, memory_region Memory> class accessor<T, Memory, 2> {
840 public:
842  using element_t = typename memory_t::element_t;
843  using pointer_t = typename memory_t::pointer_t;
844  using accessor_t = typename memory_t::template accessor_t<2>;
845  accessor(pointer_t data, const sycl::range<2> &in_range)
846  : _data(data), _range(in_range) {}
847  template <memory_region M = Memory>
848  accessor(typename std::enable_if<M != memory_region::local,
849  const accessor_t>::type &acc)
850  : accessor(acc, acc.get_range()) {}
851  accessor(const accessor_t &acc, const sycl::range<2> &in_range)
852  : accessor(acc.get_pointer(), in_range) {}
853 
854  pointer_t operator[](size_t index) const {
855  return _data + _range.get(1) * index;
856  }
857 
858  pointer_t get_ptr() const { return _data; }
859 
860 private:
861  pointer_t _data;
862  sycl::range<2> _range;
863 };
864 
866 template <class T, memory_region Memory, size_t Dimension> class device_memory {
867 public:
868  using accessor_t =
872 
874  : device_memory(sycl::range<Dimension>(1), q) {}
875 
878  std::initializer_list<value_t> &&init_list,
880  : device_memory(in_range, q) {
881  assert(init_list.size() <= in_range.size());
882  _host_ptr = (value_t *)std::malloc(_size);
883  std::memset(_host_ptr, 0, _size);
884  std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T));
885  }
886 
888  template <size_t D = Dimension>
890  const typename std::enable_if<D == 2, sycl::range<2>>::type &in_range,
891  std::initializer_list<std::initializer_list<value_t>> &&init_list,
893  : device_memory(in_range, q) {
894  assert(init_list.size() <= in_range[0]);
895  _host_ptr = (value_t *)std::malloc(_size);
896  std::memset(_host_ptr, 0, _size);
897  auto tmp_data = _host_ptr;
898  for (auto sub_list : init_list) {
899  assert(sub_list.size() <= in_range[1]);
900  std::memcpy(tmp_data, sub_list.begin(), sub_list.size() * sizeof(T));
901  tmp_data += in_range[1];
902  }
903  }
904 
908  : _size(range_in.size() * sizeof(T)), _range(range_in), _reference(false),
909  _host_ptr(nullptr), _device_ptr(nullptr), _q(q) {
910  static_assert((Memory == memory_region::global) ||
911  (Memory == memory_region::constant) ||
912  (Memory == memory_region::usm_shared),
913  "device memory region should be global, constant or shared");
914  // Make sure that singleton class dev_mgr will destruct later than this.
915  detail::dev_mgr::instance();
916  }
917 
919  // enable_if_t SFINAE to avoid ambiguity with
920  // device_memory(Args... Arguments, sycl::queue q)
921  template <class... Args, size_t D = Dimension,
922  typename = std::enable_if_t<sizeof...(Args) == D>>
923  device_memory(Args... Arguments)
924  : device_memory(sycl::range<Dimension>(Arguments...),
925  get_default_queue()) {}
926 
928  template <class... Args>
929  device_memory(Args... Arguments, sycl::queue q)
930  : device_memory(sycl::range<Dimension>(Arguments...), q) {}
931 
933  if (_device_ptr && !_reference)
934  syclcompat::free(_device_ptr, _q);
935  if (_host_ptr)
936  std::free(_host_ptr);
937  }
938 
940  void init() { init(_q); }
943  void init(sycl::queue q) {
944  if (_device_ptr)
945  return;
946  if (!_size)
947  return;
948  allocate_device(q);
949  if (_host_ptr)
950  detail::memcpy(q, _device_ptr, _host_ptr, _size);
951  }
952 
954  void assign(value_t *src, size_t size) {
955  this->~device_memory();
956  new (this) device_memory(src, size, _q);
957  }
958 
961  value_t *get_ptr() { return get_ptr(_q); }
965  init(q);
966  return _device_ptr;
967  }
968 
970  size_t get_size() { return _size; }
971 
972  template <size_t D = Dimension>
973  typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
974  init();
975  return _device_ptr[index];
976  }
977 
980  template <size_t D = Dimension>
981  typename std::enable_if<D != 1, compat_accessor_t>::type
983  return compat_accessor_t((T *)_device_ptr, _range);
984  }
985 
986 private:
987  device_memory(value_t *memory_ptr, size_t size,
989  : _size(size), _range(size / sizeof(T)), _reference(true),
990  _device_ptr(memory_ptr), _q(q) {}
991 
992  void allocate_device(sycl::queue q) {
993  if (Memory == memory_region::usm_shared) {
994  _device_ptr = (value_t *)sycl::malloc_shared(_size, q.get_device(),
995  q.get_context());
996  return;
997  }
998 #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
999  if (Memory == memory_region::constant) {
1000  _device_ptr = (value_t *)sycl::malloc_device(
1001  _size, q.get_device(), q.get_context(),
1003  return;
1004  }
1005 #endif
1006  _device_ptr = (value_t *)detail::malloc(_size, q);
1007  }
1008 
1009  size_t _size;
1010  sycl::range<Dimension> _range;
1011  bool _reference;
1012  value_t *_host_ptr;
1013  value_t *_device_ptr;
1014  sycl::queue _q;
1015 };
1016 template <class T, memory_region Memory>
1017 class device_memory<T, Memory, 0> : public device_memory<T, Memory, 1> {
1018 public:
1020  using value_t = typename base::value_t;
1021  using accessor_t =
1023 
1025  device_memory(const value_t &val) : base(sycl::range<1>(1), {val}) {}
1026 
1029 };
1030 
1031 template <class T, size_t Dimension>
1033 template <class T, size_t Dimension>
1035 template <class T, size_t Dimension>
1037 
1039 public:
1040  void init(const void *ptr, sycl::queue q = get_default_queue()) {
1041  memory_type = sycl::get_pointer_type(ptr, q.get_context());
1042  device_pointer = (memory_type != sycl::usm::alloc::unknown) ? ptr : nullptr;
1043  host_pointer = (memory_type != sycl::usm::alloc::unknown) &&
1044  (memory_type != sycl::usm::alloc::device)
1045  ? ptr
1046  : nullptr;
1047  sycl::device device_obj = sycl::get_pointer_device(ptr, q.get_context());
1048  device_id = detail::dev_mgr::instance().get_device_id(device_obj);
1049  }
1050 
1051  sycl::usm::alloc get_memory_type() { return memory_type; }
1052 
1053  const void *get_device_pointer() { return device_pointer; }
1054 
1055  const void *get_host_pointer() { return host_pointer; }
1056 
1057  bool is_memory_shared() { return memory_type == sycl::usm::alloc::shared; }
1058 
1059  unsigned int get_device_id() { return device_id; }
1060 
1061 private:
1062  sycl::usm::alloc memory_type = sycl::usm::alloc::unknown;
1063  const void *device_pointer = nullptr;
1064  const void *host_pointer = nullptr;
1065  unsigned int device_id = 0;
1066 };
1067 
1068 } // namespace syclcompat
size_t get(int dimension) const
Definition: array.hpp:70
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:59
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barrie...
Definition: event.hpp:44
Command group handler class.
Definition: handler.hpp:454
void depends_on(event Event)
Registers event dependencies on this command group.
Definition: handler.cpp:1246
std::enable_if_t< detail::check_fn_signature< std::remove_reference_t< FuncT >, void()>::value||detail::check_fn_signature< std::remove_reference_t< FuncT >, void(interop_handle)>::value > host_task(FuncT &&Func)
Enqueues a command to the SYCL runtime to invoke Func once.
Definition: handler.hpp:1950
A unique identifier of an item in an index space.
Definition: id.hpp:36
Provides constructors for address space qualified and non address space qualified pointers to allow i...
Definition: multi_ptr.hpp:83
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
Definition: queue.hpp:119
event memcpy(void *Dest, const void *Src, size_t Count, const detail::code_location &CodeLoc=detail::code_location::current())
Copies data from one memory region to another, each is either a host pointer or a pointer within USM ...
Definition: queue.cpp:126
event fill(void *Ptr, const T &Pattern, size_t Count, const detail::code_location &CodeLoc=detail::code_location::current())
Fills the specified memory with the specified pattern.
Definition: queue.hpp:487
event memset(void *Ptr, int Value, size_t Count, const detail::code_location &CodeLoc=detail::code_location::current())
Fills the memory pointed by a USM pointer with the value specified.
Definition: queue.cpp:107
device get_device() const
Definition: queue.cpp:77
context get_context() const
Definition: queue.cpp:75
std::enable_if_t< std::is_invocable_r_v< void, T, handler & >, event > submit(T CGF, const detail::code_location &CodeLoc=detail::code_location::current())
Submits a command group function object to the queue, in order to be scheduled for execution on the d...
Definition: queue.hpp:347
Defines the iteration domain of either a single work-group in a parallel dispatch,...
Definition: range.hpp:26
size_t size() const
Definition: range.hpp:56
accessor(const accessor_t &acc, const sycl::range< 2 > &in_range)
Definition: memory.hpp:851
pointer_t operator[](size_t index) const
Definition: memory.hpp:854
typename memory_t::template accessor_t< 2 > accessor_t
Definition: memory.hpp:844
typename memory_t::pointer_t pointer_t
Definition: memory.hpp:843
typename memory_t::element_t element_t
Definition: memory.hpp:842
accessor(typename std::enable_if< M !=memory_region::local, const accessor_t >::type &acc)
Definition: memory.hpp:848
accessor(pointer_t data, const sycl::range< 2 > &in_range)
Definition: memory.hpp:845
accessor(pointer_t data, const sycl::range< 3 > &in_range)
Definition: memory.hpp:820
accessor< T, Memory, 2 > operator[](size_t index) const
Definition: memory.hpp:828
typename memory_t::template accessor_t< 3 > accessor_t
Definition: memory.hpp:819
typename memory_t::element_t element_t
Definition: memory.hpp:817
accessor(typename std::enable_if< M !=memory_region::local, const accessor_t >::type &acc)
Definition: memory.hpp:823
typename memory_t::pointer_t pointer_t
Definition: memory.hpp:818
accessor(const accessor_t &acc, const sycl::range< 3 > &in_range)
Definition: memory.hpp:826
accessor used as device function parameter.
Definition: memory.hpp:813
typename std::conditional_t< Memory==memory_region::constant, const T, T > element_t
Definition: memory.hpp:134
typename std::remove_cv_t< T > value_t
Definition: memory.hpp:135
static constexpr size_t type_size
Definition: memory.hpp:131
typename std::conditional_t< target==syclcompat::target::local, sycl::local_accessor< T, Dimension >, sycl::accessor< T, Dimension, mode > > accessor_t
Definition: memory.hpp:140
static constexpr sycl::access_mode mode
Definition: memory.hpp:128
static constexpr sycl::access::address_space asp
Definition: memory.hpp:121
typename detail::memory_traits< Memory, T >::template accessor_t< 0 > accessor_t
Definition: memory.hpp:1022
device_memory(const value_t &val)
Constructor with initial value.
Definition: memory.hpp:1025
Device variable with address space of shared or global.
Definition: memory.hpp:866
std::enable_if< D !=1, compat_accessor_t >::type get_access(sycl::handler &cgh)
Get compat_accessor with dimension info for the device memory object when usm is used and dimension i...
Definition: memory.hpp:982
value_t * get_ptr()
Get memory pointer of the memory object, which is virtual pointer when usm is not used,...
Definition: memory.hpp:961
std::enable_if< D==1, T >::type & operator[](size_t index)
Definition: memory.hpp:973
device_memory(const sycl::range< Dimension > &range_in, sycl::queue q=get_default_queue())
Constructor with range.
Definition: memory.hpp:906
size_t get_size()
Get the device memory object size in bytes.
Definition: memory.hpp:970
device_memory(Args... Arguments)
Constructor with range.
Definition: memory.hpp:923
device_memory(const typename std::enable_if< D==2, sycl::range< 2 >>::type &in_range, std::initializer_list< std::initializer_list< value_t >> &&init_list, sycl::queue q=get_default_queue())
Constructor of 2-D array with initializer list.
Definition: memory.hpp:889
value_t * get_ptr(sycl::queue q)
Get memory pointer of the memory object, which is virtual pointer when usm is not used,...
Definition: memory.hpp:964
void init(sycl::queue q)
Allocate memory with specified queue, and init memory if has initial value.
Definition: memory.hpp:943
device_memory(const sycl::range< Dimension > &in_range, std::initializer_list< value_t > &&init_list, sycl::queue q=get_default_queue())
Constructor of 1-D array with initializer list.
Definition: memory.hpp:877
device_memory(sycl::queue q=get_default_queue())
Definition: memory.hpp:873
void init()
Allocate memory with default queue, and init memory if has initial value.
Definition: memory.hpp:940
void assign(value_t *src, size_t size)
The variable is assigned to a device pointer.
Definition: memory.hpp:954
typename detail::memory_traits< Memory, T >::template accessor_t< Dimension > accessor_t
Definition: memory.hpp:869
device_memory(Args... Arguments, sycl::queue q)
Constructor with range and queue.
Definition: memory.hpp:929
typename detail::memory_traits< Memory, T >::value_t value_t
Definition: memory.hpp:870
Pitched 2D/3D memory data.
Definition: memory.hpp:93
void set_y(size_t y)
Definition: memory.hpp:109
pitched_data(void *data, size_t pitch, size_t x, size_t y)
Definition: memory.hpp:96
void set_x(size_t x)
Definition: memory.hpp:106
void set_pitch(size_t pitch)
Definition: memory.hpp:103
void set_data_ptr(void *data)
Definition: memory.hpp:100
const void * get_host_pointer()
Definition: memory.hpp:1055
void init(const void *ptr, sycl::queue q=get_default_queue())
Definition: memory.hpp:1040
const void * get_device_pointer()
Definition: memory.hpp:1053
sycl::usm::alloc get_memory_type()
Definition: memory.hpp:1051
unsigned int get_device_id()
Definition: memory.hpp:1059
__ESIMD_API std::enable_if_t<(RegionT::length *sizeof(typename RegionT::element_type) >=2)> wait(sycl::ext::intel::esimd::simd_view< T, RegionT > value)
Create explicit scoreboard dependency to avoid device code motion across this call and preserve the v...
Definition: memory.hpp:428
access::mode access_mode
Definition: access.hpp:72
usm::alloc get_pointer_type(const void *ptr, const context &ctxt)
Query the allocation type from a USM pointer.
Definition: usm_impl.cpp:574
void * malloc_shared(size_t size, const device &dev, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
Definition: usm_impl.cpp:407
constexpr mode_tag_t< access_mode::read_write > read_write
Definition: access.hpp:85
y y maxval[j] maxval c[j] c[j] z
device get_pointer_device(const void *ptr, const context &ctxt)
Queries the device against which the pointer was allocated Throws an invalid_object_error if ptr is a...
Definition: usm_impl.cpp:625
void * malloc_device(size_t size, const device &dev, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
Definition: usm_impl.cpp:319
void * malloc_host(size_t size, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
Definition: usm_impl.cpp:382
typename detail::LegacyPointerTypes< ElementType, Space >::pointer_t pointer_t
Definition: multi_ptr.hpp:758
void free(void *ptr, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
Definition: usm_impl.cpp:373
Definition: access.hpp:18
static pointer_access_attribute get_pointer_attribute(sycl::queue q, const void *ptr)
Definition: memory.hpp:227
static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, size_t size, const std::vector< sycl::event > &dep_events={})
Definition: memory.hpp:259
static constexpr size_t get_pitch(size_t x)
Calculate pitch (padded length of major dimension x) by rounding up to multiple of 32.
Definition: memory.hpp:152
static sycl::event combine_events(std::vector< sycl::event > &events, sycl::queue q)
Definition: memory.hpp:416
static sycl::event memset(sycl::queue q, void *dev_ptr, int value, size_t size)
Set value to the first size bytes starting from dev_ptr in q.
Definition: memory.hpp:184
static size_t get_offset(sycl::id< 3 > id, size_t slice, size_t pitch)
Definition: memory.hpp:273
static memcpy_direction deduce_memcpy_direction(sycl::queue q, void *to_ptr, const void *from_ptr)
Definition: memory.hpp:240
static size_t get_copy_range(sycl::range< 3 > size, size_t slice, size_t pitch)
Definition: memory.hpp:268
static void * malloc(size_t size, sycl::queue q)
Definition: memory.hpp:144
static sycl::event fill(sycl::queue q, void *dev_ptr, const T &pattern, size_t count)
Set pattern to the first count elements of type T starting from dev_ptr.
Definition: memory.hpp:172
static void memcpy(void *to_ptr, const void *from_ptr, size_t size, sycl::queue q=get_default_queue())
Synchronously copies size bytes from the address specified by from_ptr to the address specified by to...
Definition: memory.hpp:549
uint8_t byte_t
Definition: memory.hpp:90
static void free(void *ptr, sycl::queue q=get_default_queue())
free
Definition: memory.hpp:513
static sycl::queue get_default_queue()
Util function to get the default queue of current device in device manager.
Definition: device.hpp:516
auto * local_mem()
Definition: memory.hpp:63
static device_ext & get_current_device()
Util function to get the current device.
Definition: device.hpp:529
static void memset(pitched_data pitch, int val, sycl::range< 3 > size, sycl::queue q=get_default_queue())
Sets value to the 3D memory region specified by pitch in q.
Definition: memory.hpp:791
static void * malloc_host(size_t num_bytes, sycl::queue q=get_default_queue())
Allocate memory block on the host.
Definition: memory.hpp:449
typename type_identity< T >::type type_identity_t
Definition: traits.hpp:35
static void fill(void *dev_ptr, const T &pattern, size_t count, sycl::queue q=get_default_queue())
Synchronously sets pattern to the first count elements starting from dev_ptr.
Definition: memory.hpp:697
static void * malloc_shared(size_t num_bytes, sycl::queue q=get_default_queue())
Allocate memory block of usm_shared memory.
Definition: memory.hpp:469
static sycl::event memset_async(pitched_data pitch, int val, sycl::range< 3 > size, sycl::queue q=get_default_queue())
Sets value to the 3D memory region specified by pitch in q.
Definition: memory.hpp:805
sycl::event free_async(const std::vector< void * > &pointers, const std::vector< sycl::event > &events, sycl::queue q=get_default_queue())
Free the device memory pointed by a batch of pointers in pointers which are related to q after events...
Definition: memory.hpp:525
static sycl::event memcpy_async(pitched_data to, sycl::id< 3 > to_pos, pitched_data from, sycl::id< 3 > from_pos, sycl::range< 3 > size, sycl::queue q=get_default_queue())
Asynchronously copies a subset of a 3D matrix specified by to to another 3D matrix specified by from.
Definition: memory.hpp:679
static void * malloc(size_t &pitch, size_t x, size_t y, sycl::queue q=get_default_queue())
Allocate memory block for 2D array on the device.
Definition: memory.hpp:504
static sycl::event fill_async(void *dev_ptr, const T &pattern, size_t count, sycl::queue q=get_default_queue())
Asynchronously sets pattern to the first count elements starting from dev_ptr.
Definition: memory.hpp:714
static void memcpy(pitched_data to, sycl::id< 3 > to_pos, pitched_data from, sycl::id< 3 > from_pos, sycl::range< 3 > size, sycl::queue q=get_default_queue())
Synchronously copies a subset of a 3D matrix specified by to to another 3D matrix specified by from.
Definition: memory.hpp:660