DPC++ Runtime
Runtime libraries for oneAPI DPC++
memory.hpp
Go to the documentation of this file.
1 /***************************************************************************
2  *
3  * Copyright (C) Codeplay Software Ltd.
4  *
5  * Part of the LLVM Project, under the Apache License v2.0 with LLVM
6  * Exceptions. See https://llvm.org/LICENSE.txt for license information.
7  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  *
15  * SYCL compatibility extension
16  *
17  * memory.hpp
18  *
19  * Description:
20  * memory functionality for the SYCL compatibility extension
21  **************************************************************************/
22 
23 // The original source was under the license below:
24 //==---- memory.hpp -------------------------------*- C++ -*----------------==//
25 //
26 // Copyright (C) Intel Corporation
27 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
28 // See https://llvm.org/LICENSE.txt for license information.
29 //
30 //===----------------------------------------------------------------------===//
31 
32 #pragma once
33 
34 #include <cassert>
35 #include <cstdint>
36 #include <cstring>
37 #include <map>
38 #include <mutex>
39 #include <thread>
40 #include <type_traits>
41 #include <unordered_map>
42 #include <utility>
43 
44 #include <sycl/builtins.hpp>
47 #include <sycl/group.hpp>
48 #include <sycl/usm.hpp>
49 
50 #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
52 #endif
53 
54 #include <syclcompat/device.hpp>
55 #include <syclcompat/traits.hpp>
56 
57 #if defined(__linux__)
58 #include <sys/mman.h>
59 #elif defined(_WIN64)
60 #ifndef NOMINMAX
61 #define NOMINMAX
62 #endif
63 #include <windows.h>
64 #else
65 #error "Only support Windows and Linux."
66 #endif
67 
68 namespace syclcompat {
69 
70 template <typename AllocT> auto *local_mem() {
72  As_multi_ptr =
73  sycl::ext::oneapi::group_local_memory_for_overwrite<AllocT>(
74  sycl::ext::oneapi::this_work_item::get_work_group<3>());
75  auto *As = *As_multi_ptr;
76  return As;
77 }
78 
79 namespace experimental {
85  automatic
86 };
87 }
88 
89 enum class memory_region {
90  global = 0, // device global memory
91  constant, // device read-only memory
92  local, // device local memory
93  usm_shared, // memory which can be accessed by host and device
94 };
95 
96 enum class target { device, local };
97 
98 using byte_t = uint8_t;
99 
102 public:
103  pitched_data() : pitched_data(nullptr, 0, 0, 0) {}
104  pitched_data(void *data, size_t pitch, size_t x, size_t y)
105  : _data(data), _pitch(pitch), _x(x), _y(y) {}
106 
107  void *get_data_ptr() { return _data; }
108  void set_data_ptr(void *data) { _data = data; }
109 
110  size_t get_pitch() { return _pitch; }
111  void set_pitch(size_t pitch) { _pitch = pitch; }
112 
113  size_t get_x() { return _x; }
114  void set_x(size_t x) { _x = x; };
115 
116  size_t get_y() { return _y; }
117  void set_y(size_t y) { _y = y; }
118 
119 private:
120  void *_data;
121  size_t _pitch, _x, _y;
122 };
123 
124 namespace experimental {
125 #ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES
126 class image_mem_wrapper;
127 namespace detail {
128 static sycl::event memcpy(const image_mem_wrapper *src,
129  const sycl::id<3> &src_id, pitched_data &dest,
130  const sycl::id<3> &dest_id,
131  const sycl::range<3> &copy_extend, sycl::queue q);
132 static sycl::event memcpy(const pitched_data src, const sycl::id<3> &src_id,
133  image_mem_wrapper *dest, const sycl::id<3> &dest_id,
134  const sycl::range<3> &copy_extend, sycl::queue q);
135 } // namespace detail
136 #endif
137 class image_matrix;
138 namespace detail {
139 static pitched_data to_pitched_data(image_matrix *image);
140 }
141 
144  struct data_wrapper {
147 #ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES
148  experimental::image_mem_wrapper *image_bindless{nullptr};
149 #endif
150  image_matrix *image{nullptr};
151  };
157 };
158 } // namespace experimental
159 
160 namespace detail {
161 
162 template <class T, memory_region Memory, size_t Dimension> class accessor;
163 template <memory_region Memory, class T = byte_t> class memory_traits {
164 public:
165  static constexpr sycl::access::address_space asp =
166  (Memory == memory_region::local)
167  ? sycl::access::address_space::local_space
168  : sycl::access::address_space::global_space;
169  static constexpr target target =
171  static constexpr sycl::access_mode mode = (Memory == memory_region::constant)
172  ? sycl::access_mode::read
174  static constexpr size_t type_size = sizeof(T);
175  using element_t =
176  typename std::conditional_t<Memory == memory_region::constant, const T,
177  T>;
178  using value_t = typename std::remove_cv_t<T>;
179  template <size_t Dimension = 1>
180  using accessor_t =
181  typename std::conditional_t<target == target::local,
184  using pointer_t = T *;
185 };
186 
187 static inline void *malloc(size_t size, sycl::queue q) {
188  return sycl::malloc_device(size, q.get_device(), q.get_context());
189 }
190 
195 static inline constexpr size_t get_pitch(size_t x) {
196  return ((x) + 31) & ~(0x1F);
197 }
198 
206 static inline void *malloc(size_t &pitch, size_t x, size_t y, size_t z,
207  sycl::queue q) {
208  pitch = get_pitch(x);
209  return malloc(pitch * y * z, q);
210 }
211 
221 template <class T>
222 static inline sycl::event fill(sycl::queue q, void *dev_ptr, const T &pattern,
223  size_t count) {
224  return q.fill(dev_ptr, pattern, count);
225 }
226 
234 static inline sycl::event memset(sycl::queue q, void *dev_ptr, int value,
235  size_t size) {
236  return q.memset(dev_ptr, value, size);
237 }
238 
246 template <typename T>
247 static inline std::vector<sycl::event>
248 memset(sycl::queue q, pitched_data data, const T &value, sycl::range<3> size) {
249  std::vector<sycl::event> event_list;
250  size_t slice = data.get_pitch() * data.get_y();
251  unsigned char *data_surface = (unsigned char *)data.get_data_ptr();
252  for (size_t z = 0; z < size.get(2); ++z) {
253  unsigned char *data_ptr = data_surface;
254  for (size_t y = 0; y < size.get(1); ++y) {
255  event_list.push_back(detail::fill<T>(q, data_ptr, value, size.get(0)));
256  data_ptr += data.get_pitch();
257  }
258  data_surface += slice;
259  }
260  return event_list;
261 }
262 
273 template <typename T>
274 static inline std::vector<sycl::event> memset(sycl::queue q, void *ptr,
275  size_t pitch, const T &value,
276  size_t x, size_t y) {
277  return memset(q, pitched_data(ptr, pitch, x, 1), value,
278  sycl::range<3>(x, y, 1));
279 }
280 
282  host_only = 0,
283  device_only,
284  host_device,
285  end
286 };
287 
289  const void *ptr) {
290  switch (sycl::get_pointer_type(ptr, q.get_context())) {
291  case sycl::usm::alloc::unknown:
293  case sycl::usm::alloc::device:
295  case sycl::usm::alloc::shared:
296  case sycl::usm::alloc::host:
298  }
299 }
300 
302 deduce_memcpy_direction(sycl::queue q, void *to_ptr, const void *from_ptr) {
303  // table[to_attribute][from_attribute]
304  using namespace experimental; // for memcpy_direction
305  static const memcpy_direction
306  direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
307  [static_cast<unsigned>(pointer_access_attribute::end)] = {
311  return direction_table[static_cast<unsigned>(get_pointer_attribute(
312  q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
313 }
314 
315 static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr,
316  size_t size,
317  const std::vector<sycl::event> &dep_events = {}) {
318  if (!size)
319  return sycl::event{};
320  return q.memcpy(to_ptr, from_ptr, size, dep_events);
321 }
322 
323 // Get actual copy range and make sure it will not exceed range.
324 static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
325  size_t pitch) {
326  return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
327 }
328 
329 static inline size_t get_offset(sycl::id<3> id, size_t slice, size_t pitch) {
330  return slice * id.get(2) + pitch * id.get(1) + id.get(0);
331 }
332 
333 // RAII for host pointer
334 class host_buffer {
335  void *_buf;
336  size_t _size;
337  sycl::queue _q;
338  const std::vector<sycl::event> &_deps; // free operation depends
339 
340 public:
341  host_buffer(size_t size, sycl::queue q, const std::vector<sycl::event> &deps)
342  : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
343  void *get_ptr() const { return _buf; }
344  size_t get_size() const { return _size; }
346  if (_buf) {
347  _q.submit([&](sycl::handler &cgh) {
348  cgh.depends_on(_deps);
349  cgh.host_task([buf = _buf] { std::free(buf); });
350  });
351  }
352  }
353 };
354 
357 static inline std::vector<sycl::event>
358 memcpy(sycl::queue q, void *to_ptr, const void *from_ptr,
359  sycl::range<3> to_range, sycl::range<3> from_range, sycl::id<3> to_id,
360  sycl::id<3> from_id, sycl::range<3> size,
361  const std::vector<sycl::event> &dep_events = {}) {
362 
363  std::vector<sycl::event> event_list;
364 
365  size_t to_slice = to_range.get(1) * to_range.get(0);
366  size_t from_slice = from_range.get(1) * from_range.get(0);
367  unsigned char *to_surface =
368  (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
369  const unsigned char *from_surface =
370  (const unsigned char *)from_ptr +
371  get_offset(from_id, from_slice, from_range.get(0));
372 
373  if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) {
374  return {memcpy(q, to_surface, from_surface, to_slice * size.get(2),
375  dep_events)};
376  }
377  using namespace experimental; // for memcpy_direction
378  memcpy_direction direction = deduce_memcpy_direction(q, to_ptr, from_ptr);
379  size_t size_slice = size.get(1) * size.get(0);
380  switch (direction) {
381  case host_to_host:
382  for (size_t z = 0; z < size.get(2); ++z) {
383  unsigned char *to_ptr = to_surface;
384  const unsigned char *from_ptr = from_surface;
385  if (to_range.get(0) == from_range.get(0) &&
386  to_range.get(0) == size.get(0)) {
387  event_list.push_back(
388  memcpy(q, to_ptr, from_ptr, size_slice, dep_events));
389  } else {
390  for (size_t y = 0; y < size.get(1); ++y) {
391  event_list.push_back(
392  memcpy(q, to_ptr, from_ptr, size.get(0), dep_events));
393  to_ptr += to_range.get(0);
394  from_ptr += from_range.get(0);
395  }
396  }
397  to_surface += to_slice;
398  from_surface += from_slice;
399  }
400  break;
401  case host_to_device: {
402  host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
403  event_list);
404  std::vector<sycl::event> host_events;
405  if (to_slice == size_slice) {
406  // Copy host data to a temp host buffer with the shape of target.
407  host_events =
408  memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
409  sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, dep_events);
410  } else {
411  // Copy host data to a temp host buffer with the shape of target.
412  host_events =
413  memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
414  sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
415  // If has padding data, not sure whether it is useless. So fill
416  // temp buffer with it.
417  std::vector<sycl::event>{memcpy(q, buf.get_ptr(), to_surface,
418  buf.get_size(), dep_events)});
419  }
420  // Copy from temp host buffer to device with only one submit.
421  event_list.push_back(
422  memcpy(q, to_surface, buf.get_ptr(), buf.get_size(), host_events));
423  break;
424  }
425  case device_to_host: {
426  host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
427  event_list);
428  // Copy from host temp buffer to host target with reshaping.
429  event_list =
430  memcpy(q, to_surface, buf.get_ptr(), to_range, from_range,
431  sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
432  // Copy from device to temp host buffer with only one submit.
433  std::vector<sycl::event>{memcpy(q, buf.get_ptr(), from_surface,
434  buf.get_size(), dep_events)});
435  break;
436  }
437  case device_to_device:
438  event_list.push_back(q.submit([&](sycl::handler &cgh) {
439  cgh.depends_on(dep_events);
440  cgh.parallel_for<class memcpy_3d_detail>(size, [=](sycl::id<3> id) {
441  to_surface[get_offset(id, to_slice, to_range.get(0))] =
442  from_surface[get_offset(id, from_slice, from_range.get(0))];
443  });
444  }));
445  break;
446  default:
447  throw std::runtime_error("[SYCLcompat] memcpy: invalid direction value");
448  }
449  return event_list;
450 }
451 
453 static inline std::vector<sycl::event>
455  sycl::id<3> from_id, sycl::range<3> size) {
456  return memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
457  sycl::range<3>(to.get_pitch(), to.get_y(), 1),
458  sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id,
459  from_id, size);
460 }
461 
463 static inline std::vector<sycl::event>
464 memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, size_t to_pitch,
465  size_t from_pitch, size_t x, size_t y) {
466  return memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
467  sycl::range<3>(from_pitch, y, 1), sycl::id<3>(0, 0, 0),
468  sycl::id<3>(0, 0, 0), sycl::range<3>(x, y, 1));
469 }
470 
471 // Takes a std::vector<sycl::event> & returns a single event
472 // which simply depends on all of them
473 static sycl::event combine_events(std::vector<sycl::event> &events,
474  sycl::queue q) {
475  return q.submit([&events](sycl::handler &cgh) {
476  cgh.depends_on(events);
477  cgh.host_task([]() {});
478  });
479 }
480 
481 } // namespace detail
482 
483 namespace experimental {
484 namespace detail {
485 static inline std::vector<sycl::event>
487  auto to = param.to.pitched;
488  auto from = param.from.pitched;
489 #ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES
490  if (param.to.image_bindless != nullptr &&
491  param.from.image_bindless != nullptr) {
492  throw std::runtime_error(
493  "[SYCLcompat] memcpy: Unsupported bindless_image API.");
494  // TODO: Need change logic when sycl support image_mem to image_mem copy.
495  std::vector<sycl::event> event_list;
496  syclcompat::detail::host_buffer buf(param.size.size(), q, event_list);
497  to.set_data_ptr(buf.get_ptr());
498  experimental::detail::memcpy(param.from.image_bindless, param.from.pos, to,
499  sycl::id<3>(0, 0, 0), param.size, q);
500  from.set_data_ptr(buf.get_ptr());
501  event_list.push_back(experimental::detail::memcpy(
502  from, sycl::id<3>(0, 0, 0), param.to.image_bindless, param.to.pos,
503  param.size, q));
504  return event_list;
505  } else if (param.to.image_bindless != nullptr) {
506  throw std::runtime_error(
507  "[SYCLcompat] memcpy: Unsupported bindless_image API.");
508  return {experimental::detail::memcpy(from, param.from.pos,
509  param.to.image_bindless, param.to.pos,
510  param.size, q)};
511  } else if (param.from.image_bindless != nullptr) {
512  throw std::runtime_error(
513  "[SYCLcompat] memcpy: Unsupported bindless_image API.");
514  return {experimental::detail::memcpy(param.from.image_bindless,
515  param.from.pos, to, param.to.pos,
516  param.size, q)};
517  }
518 #endif
519  if (param.to.image != nullptr) {
520  throw std::runtime_error("[SYCLcompat] memcpy: Unsupported image API.");
522  }
523  if (param.from.image != nullptr) {
524  throw std::runtime_error("[SYCLcompat] memcpy: Unsupported image API.");
526  }
527  return syclcompat::detail::memcpy(q, to, param.to.pos, from, param.from.pos,
528  param.size);
529 }
530 } // namespace detail
531 } // namespace experimental
532 
537 static inline void *malloc(size_t num_bytes,
539  return detail::malloc(num_bytes, q);
540 }
541 
547 template <typename T>
548 static inline T *malloc(size_t count, sycl::queue q = get_default_queue()) {
549  return static_cast<T *>(detail::malloc(count * sizeof(T), q));
550 }
551 
556 static inline void *malloc_host(size_t num_bytes,
558  return sycl::malloc_host(num_bytes, q);
559 }
560 
566 template <typename T>
567 static inline T *malloc_host(size_t count,
569  return static_cast<T *>(sycl::malloc_host(count * sizeof(T), q));
570 }
571 
576 static inline void *malloc_shared(size_t num_bytes,
578  return sycl::malloc_shared(num_bytes, q);
579 }
580 
585 template <typename T>
586 static inline T *malloc_shared(size_t count,
588  return static_cast<T *>(sycl::malloc_shared(count * sizeof(T), q));
589 }
590 
595 static inline pitched_data malloc(sycl::range<3> size,
597  pitched_data pitch(nullptr, 0, size.get(0), size.get(1));
598  size_t pitch_size;
599  pitch.set_data_ptr(
600  detail::malloc(pitch_size, size.get(0), size.get(1), size.get(2), q));
601  pitch.set_pitch(pitch_size);
602  return pitch;
603 }
604 
611 static inline void *malloc(size_t &pitch, size_t x, size_t y,
613  return detail::malloc(pitch, x, y, 1, q);
614 }
615 
620 static inline void wait_and_free(void *ptr,
623  q.wait();
624  if (ptr) {
625  sycl::free(ptr, q);
626  }
627 }
628 
632 static inline void free(void *ptr, sycl::queue q = get_default_queue()) {
633  if (ptr) {
634  sycl::free(ptr, q);
635  }
636 }
637 
646 // Can't be static due to the friend declaration in the memory header.
647 inline sycl::event enqueue_free(const std::vector<void *> &pointers,
648  const std::vector<sycl::event> &events,
650  auto event = q.submit(
651  [&pointers, &events, ctxt = q.get_context()](sycl::handler &cgh) {
652  cgh.depends_on(events);
653  cgh.host_task([=]() {
654  for (auto p : pointers)
655  sycl::free(p, ctxt);
656  });
657  });
658  get_current_device().add_event(event);
659  return event;
660 }
661 
671 static void memcpy(void *to_ptr, const void *from_ptr, size_t size,
673  detail::memcpy(q, to_ptr, from_ptr, size).wait();
674 }
675 
685 static sycl::event memcpy_async(void *to_ptr, const void *from_ptr, size_t size,
687  return detail::memcpy(q, to_ptr, from_ptr, size);
688 }
689 
700 template <typename T>
701 static sycl::event
703  size_t count, sycl::queue q = get_default_queue()) {
704  return detail::memcpy(q, static_cast<void *>(to_ptr),
705  static_cast<const void *>(from_ptr), count * sizeof(T));
706 }
707 
718 template <typename T>
719 static void memcpy(type_identity_t<T> *to_ptr,
720  const type_identity_t<T> *from_ptr, size_t count,
722  detail::memcpy(q, static_cast<void *>(to_ptr),
723  static_cast<const void *>(from_ptr), count * sizeof(T))
724  .wait();
725 }
726 
741 static inline void memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
742  size_t from_pitch, size_t x, size_t y,
745  detail::memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y));
746 }
747 
762 static inline sycl::event memcpy_async(void *to_ptr, size_t to_pitch,
763  const void *from_ptr, size_t from_pitch,
764  size_t x, size_t y,
766  auto events = detail::memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y);
767  return detail::combine_events(events, q);
768 }
769 
773 // The function will return after the copy is completed.
782 static inline void memcpy(pitched_data to, sycl::id<3> to_pos,
783  pitched_data from, sycl::id<3> from_pos,
784  sycl::range<3> size,
786  sycl::event::wait(detail::memcpy(q, to, to_pos, from, from_pos, size));
787 }
788 
802  pitched_data from, sycl::id<3> from_pos,
803  sycl::range<3> size,
805  auto events = detail::memcpy(q, to, to_pos, from, from_pos, size);
806  return detail::combine_events(events, q);
807 }
808 
818 template <class T>
819 static void inline fill(void *dev_ptr, const T &pattern, size_t count,
821  detail::fill(q, dev_ptr, pattern, count).wait();
822 }
823 
835 template <class T>
836 static sycl::event inline fill_async(void *dev_ptr, const T &pattern,
837  size_t count,
839  return detail::fill(q, dev_ptr, pattern, count);
840 }
841 
842 namespace experimental {
843 
850 static inline void memcpy(const memcpy_parameter &param,
853 }
854 
861 static inline void memcpy_async(const memcpy_parameter &param,
864 }
865 } // namespace experimental
866 
875 static void memset(void *dev_ptr, int value, size_t size,
877  detail::memset(q, dev_ptr, value, size).wait();
878 }
879 
886 static inline void memset_d16(void *dev_ptr, unsigned short value, size_t size,
888  detail::fill<unsigned short>(q, dev_ptr, value, size).wait();
889 }
890 
897 static inline void memset_d32(void *dev_ptr, unsigned int value, size_t size,
899  detail::fill<unsigned int>(q, dev_ptr, value, size).wait();
900 }
901 
908 static inline sycl::event memset_async(void *dev_ptr, int value, size_t size,
910  return detail::memset(q, dev_ptr, value, size);
911 }
912 
920 static inline sycl::event
921 memset_d16_async(void *dev_ptr, unsigned short value, size_t size,
923  return detail::fill<unsigned short>(q, dev_ptr, value, size);
924 }
925 
933 static inline sycl::event
934 memset_d32_async(void *dev_ptr, unsigned int value, size_t size,
936  return detail::fill<unsigned int>(q, dev_ptr, value, size);
937 }
938 
947 static inline void memset(void *ptr, size_t pitch, int val, size_t x, size_t y,
949  sycl::event::wait(detail::memset<unsigned char>(q, ptr, pitch, val, x, y));
950 }
951 
960 static inline void memset_d16(void *ptr, size_t pitch, unsigned short val,
961  size_t x, size_t y,
963  sycl::event::wait(detail::memset(q, ptr, pitch, val, x, y));
964 }
965 
974 static inline void memset_d32(void *ptr, size_t pitch, unsigned int val,
975  size_t x, size_t y,
977  sycl::event::wait(detail::memset(q, ptr, pitch, val, x, y));
978 }
979 
989 static inline sycl::event memset_async(void *ptr, size_t pitch, int val,
990  size_t x, size_t y,
992 
993  auto events = detail::memset<unsigned char>(q, ptr, pitch, val, x, y);
994  return detail::combine_events(events, q);
995 }
996 
1006 static inline sycl::event
1007 memset_d16_async(void *ptr, size_t pitch, unsigned short val, size_t x,
1008  size_t y, sycl::queue q = get_default_queue()) {
1009  auto events = detail::memset(q, ptr, pitch, val, x, y);
1010  return detail::combine_events(events, q);
1011 }
1012 
1022 static inline sycl::event
1023 memset_d32_async(void *ptr, size_t pitch, unsigned int val, size_t x, size_t y,
1025  auto events = detail::memset(q, ptr, pitch, val, x, y);
1026  return detail::combine_events(events, q);
1027 }
1028 
1038 static inline void memset(pitched_data pitch, int val, sycl::range<3> size,
1040  sycl::event::wait(detail::memset<unsigned char>(q, pitch, val, size));
1041 }
1042 
1052 static inline sycl::event memset_async(pitched_data pitch, int val,
1053  sycl::range<3> size,
1055  auto events = detail::memset<unsigned char>(q, pitch, val, size);
1056  return detail::combine_events(events, q);
1057 }
1058 
1060 template <class T, memory_region Memory, size_t Dimension> class accessor;
1061 template <class T, memory_region Memory> class accessor<T, Memory, 3> {
1062 public:
1064  using element_t = typename memory_t::element_t;
1065  using pointer_t = typename memory_t::pointer_t;
1066  using accessor_t = typename memory_t::template accessor_t<3>;
1067  accessor(pointer_t data, const sycl::range<3> &in_range)
1068  : _data(data), _range(in_range) {}
1069  template <memory_region M = Memory>
1070  accessor(typename std::enable_if<M != memory_region::local,
1071  const accessor_t>::type &acc)
1072  : accessor(acc, acc.get_range()) {}
1073  accessor(const accessor_t &acc, const sycl::range<3> &in_range)
1074  : accessor(acc.get_pointer(), in_range) {}
1075  accessor<T, Memory, 2> operator[](size_t index) const {
1076  sycl::range<2> sub(_range.get(1), _range.get(2));
1077  return accessor<T, Memory, 2>(_data + index * sub.size(), sub);
1078  }
1079 
1080  pointer_t get_ptr() const { return _data; }
1081 
1082 private:
1083  pointer_t _data;
1084  sycl::range<3> _range;
1085 };
1086 template <class T, memory_region Memory> class accessor<T, Memory, 2> {
1087 public:
1089  using element_t = typename memory_t::element_t;
1090  using pointer_t = typename memory_t::pointer_t;
1091  using accessor_t = typename memory_t::template accessor_t<2>;
1092  accessor(pointer_t data, const sycl::range<2> &in_range)
1093  : _data(data), _range(in_range) {}
1094  template <memory_region Mem = Memory>
1095  accessor(typename std::enable_if<Mem != memory_region::local,
1096  const accessor_t>::type &acc)
1097  : accessor(acc, acc.get_range()) {}
1098  accessor(const accessor_t &acc, const sycl::range<2> &in_range)
1099  : accessor(acc.get_pointer(), in_range) {}
1100 
1101  pointer_t operator[](size_t index) const {
1102  return _data + _range.get(1) * index;
1103  }
1104 
1105  pointer_t get_ptr() const { return _data; }
1106 
1107 private:
1108  pointer_t _data;
1109  sycl::range<2> _range;
1110 };
1111 
1113 template <class T, memory_region Memory, size_t Dimension> class device_memory {
1114 public:
1115  using accessor_t =
1119 
1121  : device_memory(sycl::range<Dimension>(1), q) {}
1122 
1125  std::initializer_list<value_t> &&init_list,
1127  : device_memory(in_range, q) {
1128  assert(init_list.size() <= in_range.size());
1129  _host_ptr = (value_t *)std::malloc(_size);
1130  std::memset(_host_ptr, 0, _size);
1131  std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T));
1132  }
1133 
1135  template <size_t Dim = Dimension>
1137  const typename std::enable_if<Dim == 2, sycl::range<2>>::type &in_range,
1138  std::initializer_list<std::initializer_list<value_t>> &&init_list,
1140  : device_memory(in_range, q) {
1141  assert(init_list.size() <= in_range[0]);
1142  _host_ptr = (value_t *)std::malloc(_size);
1143  std::memset(_host_ptr, 0, _size);
1144  auto tmp_data = _host_ptr;
1145  for (auto sub_list : init_list) {
1146  assert(sub_list.size() <= in_range[1]);
1147  std::memcpy(tmp_data, sub_list.begin(), sub_list.size() * sizeof(T));
1148  tmp_data += in_range[1];
1149  }
1150  }
1151 
1155  : _size(range_in.size() * sizeof(T)), _range(range_in), _reference(false),
1156  _host_ptr(nullptr), _device_ptr(nullptr), _q(q) {
1157  static_assert((Memory == memory_region::global) ||
1158  (Memory == memory_region::constant) ||
1159  (Memory == memory_region::usm_shared),
1160  "device memory region should be global, constant or shared");
1161  // Make sure that singleton class dev_mgr will destruct later than this.
1163  }
1164 
1166  // enable_if_t SFINAE to avoid ambiguity with
1167  // device_memory(Args... Arguments, sycl::queue q)
1168  template <class... Args, size_t Dim = Dimension,
1169  typename = std::enable_if_t<sizeof...(Args) == Dim>>
1170  device_memory(Args... Arguments)
1171  : device_memory(sycl::range<Dimension>(Arguments...),
1172  get_default_queue()) {}
1173 
1175  template <class... Args>
1176  device_memory(Args... Arguments, sycl::queue q)
1177  : device_memory(sycl::range<Dimension>(Arguments...), q) {}
1178 
1180  if (_device_ptr && !_reference)
1181  syclcompat::free(_device_ptr, _q);
1182  if (_host_ptr)
1183  std::free(_host_ptr);
1184  }
1185 
1188  void init() { init(_q); }
1191  void init(sycl::queue q) {
1192  if (_device_ptr)
1193  return;
1194  if (!_size)
1195  return;
1196  allocate_device(q);
1197  if (_host_ptr)
1198  detail::memcpy(q, _device_ptr, _host_ptr, _size);
1199  }
1200 
1202  void assign(value_t *src, size_t size) {
1203  this->~device_memory();
1204  new (this) device_memory(src, size, _q);
1205  }
1206 
1207  // Get memory pointer of the memory object, a device USM pointer.
1208  value_t *get_ptr() { return get_ptr(_q); }
1209 
1210  // Get memory pointer of the memory object, a device USM pointer.
1212  init(q);
1213  return _device_ptr;
1214  }
1215 
1217  size_t get_size() { return _size; }
1218 
1219  template <size_t Dim = Dimension>
1220  typename std::enable_if<Dim == 1, T>::type &operator[](size_t index) {
1221  init();
1222  return _device_ptr[index];
1223  }
1224 
1227  template <size_t Dim = Dimension>
1228  typename std::enable_if<Dim != 1, syclcompat_accessor_t>::type
1230  return syclcompat_accessor_t((T *)_device_ptr, _range);
1231  }
1232 
1233 private:
1234  device_memory(value_t *memory_ptr, size_t size,
1236  : _size(size), _range(size / sizeof(T)), _reference(true),
1237  _device_ptr(memory_ptr), _q(q) {}
1238 
1239  void allocate_device(sycl::queue q) {
1240  if (Memory == memory_region::usm_shared) {
1241  _device_ptr = (value_t *)sycl::malloc_shared(_size, q.get_device(),
1242  q.get_context());
1243  return;
1244  }
1245 #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
1246  if (Memory == memory_region::constant) {
1247  _device_ptr = (value_t *)sycl::malloc_device(
1248  _size, q.get_device(), q.get_context(),
1250  return;
1251  }
1252 #endif
1253  _device_ptr = (value_t *)detail::malloc(_size, q);
1254  }
1255 
1256  size_t _size;
1257  sycl::range<Dimension> _range;
1258  bool _reference;
1259  value_t *_host_ptr;
1260  value_t *_device_ptr;
1261  sycl::queue _q;
1262 };
1263 template <class T, memory_region Memory>
1264 class device_memory<T, Memory, 0> : public device_memory<T, Memory, 1> {
1265 public:
1267  using value_t = typename base::value_t;
1268  using accessor_t =
1270 
1273  : base(sycl::range<1>(1), {val}, q) {}
1274 
1277 };
1278 
1279 template <class T, size_t Dimension>
1281 template <class T, size_t Dimension>
1283 template <class T, size_t Dimension>
1285 
1287 public:
1288  void init(const void *ptr, sycl::queue q = get_default_queue()) {
1289  memory_type = sycl::get_pointer_type(ptr, q.get_context());
1290  device_pointer = (memory_type != sycl::usm::alloc::unknown) ? ptr : nullptr;
1291  host_pointer = (memory_type != sycl::usm::alloc::unknown) &&
1292  (memory_type != sycl::usm::alloc::device)
1293  ? ptr
1294  : nullptr;
1295  sycl::device device_obj = sycl::get_pointer_device(ptr, q.get_context());
1296  device_id = detail::dev_mgr::instance().get_device_id(device_obj);
1297  }
1298 
1299  sycl::usm::alloc get_memory_type() { return memory_type; }
1300 
1301  const void *get_device_pointer() { return device_pointer; }
1302 
1303  const void *get_host_pointer() { return host_pointer; }
1304 
1305  bool is_memory_shared() { return memory_type == sycl::usm::alloc::shared; }
1306 
1307  unsigned int get_device_id() { return device_id; }
1308 
1309 private:
1310  sycl::usm::alloc memory_type = sycl::usm::alloc::unknown;
1311  const void *device_pointer = nullptr;
1312  const void *host_pointer = nullptr;
1313  unsigned int device_id = 0;
1314 };
1315 
1316 } // namespace syclcompat
size_t get(int dimension) const
Definition: array.hpp:62
The SYCL device class encapsulates a single SYCL device on which kernels may be executed.
Definition: device.hpp:64
An event object can be used to synchronize memory transfers, enqueues of kernels and signaling barrie...
Definition: event.hpp:44
void wait()
Wait for the event.
Definition: event.cpp:41
Command group handler class.
Definition: handler.hpp:467
void depends_on(event Event)
Registers event dependencies on this command group.
Definition: handler.cpp:1529
std::enable_if_t< detail::check_fn_signature< std::remove_reference_t< FuncT >, void()>::value||detail::check_fn_signature< std::remove_reference_t< FuncT >, void(interop_handle)>::value > host_task(FuncT &&Func)
Enqueues a command to the SYCL runtime to invoke Func once.
Definition: handler.hpp:2085
A unique identifier of an item in an index space.
Definition: id.hpp:36
Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
Definition: queue.hpp:110
void wait(const detail::code_location &CodeLoc=detail::code_location::current())
Performs a blocking wait for the completion of all enqueued tasks in the queue.
Definition: queue.hpp:429
event memcpy(void *Dest, const void *Src, size_t Count, const detail::code_location &CodeLoc=detail::code_location::current())
Copies data from one memory region to another, each is either a host pointer or a pointer within USM ...
Definition: queue.cpp:122
event fill(void *Ptr, const T &Pattern, size_t Count, const detail::code_location &CodeLoc=detail::code_location::current())
Fills the specified memory with the specified pattern.
Definition: queue.hpp:484
event memset(void *Ptr, int Value, size_t Count, const detail::code_location &CodeLoc=detail::code_location::current())
Fills the memory pointed by a USM pointer with the value specified.
Definition: queue.cpp:101
device get_device() const
Definition: queue.cpp:77
context get_context() const
Definition: queue.cpp:75
std::enable_if_t< std::is_invocable_r_v< void, T, handler & >, event > submit(T CGF, const detail::code_location &CodeLoc=detail::code_location::current())
Submits a command group function object to the queue, in order to be scheduled for execution on the d...
Definition: queue.hpp:340
Defines the iteration domain of either a single work-group in a parallel dispatch,...
Definition: range.hpp:26
size_t size() const
Definition: range.hpp:56
accessor(const accessor_t &acc, const sycl::range< 2 > &in_range)
Definition: memory.hpp:1098
pointer_t operator[](size_t index) const
Definition: memory.hpp:1101
typename memory_t::template accessor_t< 2 > accessor_t
Definition: memory.hpp:1091
accessor(typename std::enable_if< Mem !=memory_region::local, const accessor_t >::type &acc)
Definition: memory.hpp:1095
typename memory_t::pointer_t pointer_t
Definition: memory.hpp:1090
typename memory_t::element_t element_t
Definition: memory.hpp:1089
accessor(pointer_t data, const sycl::range< 2 > &in_range)
Definition: memory.hpp:1092
accessor(pointer_t data, const sycl::range< 3 > &in_range)
Definition: memory.hpp:1067
accessor< T, Memory, 2 > operator[](size_t index) const
Definition: memory.hpp:1075
typename memory_t::template accessor_t< 3 > accessor_t
Definition: memory.hpp:1066
typename memory_t::element_t element_t
Definition: memory.hpp:1064
accessor(typename std::enable_if< M !=memory_region::local, const accessor_t >::type &acc)
Definition: memory.hpp:1070
typename memory_t::pointer_t pointer_t
Definition: memory.hpp:1065
accessor(const accessor_t &acc, const sycl::range< 3 > &in_range)
Definition: memory.hpp:1073
accessor used as device function parameter.
Definition: memory.hpp:1060
unsigned int get_device_id(const sycl::device &dev)
Definition: device.hpp:739
static dev_mgr & instance()
Returns the instance of device manager singleton.
Definition: device.hpp:813
host_buffer(size_t size, sycl::queue q, const std::vector< sycl::event > &deps)
Definition: memory.hpp:341
typename std::conditional_t< Memory==memory_region::constant, const T, T > element_t
Definition: memory.hpp:177
typename std::remove_cv_t< T > value_t
Definition: memory.hpp:178
static constexpr size_t type_size
Definition: memory.hpp:174
typename std::conditional_t< target==target::local, sycl::local_accessor< T, Dimension >, sycl::accessor< T, Dimension, mode > > accessor_t
Definition: memory.hpp:183
static constexpr sycl::access_mode mode
Definition: memory.hpp:171
static constexpr sycl::access::address_space asp
Definition: memory.hpp:165
void queues_wait_and_throw()
Definition: device.hpp:566
typename detail::memory_traits< Memory, T >::template accessor_t< 0 > accessor_t
Definition: memory.hpp:1269
device_memory(const value_t &val, sycl::queue q=get_default_queue())
Constructor with initial value.
Definition: memory.hpp:1272
device_memory(sycl::queue q=get_default_queue())
Default constructor.
Definition: memory.hpp:1276
Device variable with address space of shared or global.
Definition: memory.hpp:1113
std::enable_if< Dim !=1, syclcompat_accessor_t >::type get_access(sycl::handler &cgh)
Get compat_accessor with dimension info for the device memory object when usm is used and dimension i...
Definition: memory.hpp:1229
device_memory(const sycl::range< Dimension > &range_in, sycl::queue q=get_default_queue())
Constructor with range.
Definition: memory.hpp:1153
syclcompat::accessor< T, Memory, Dimension > syclcompat_accessor_t
Definition: memory.hpp:1118
size_t get_size()
Get the device memory object size in bytes.
Definition: memory.hpp:1217
device_memory(Args... Arguments)
Constructor with range.
Definition: memory.hpp:1170
device_memory(const typename std::enable_if< Dim==2, sycl::range< 2 >>::type &in_range, std::initializer_list< std::initializer_list< value_t >> &&init_list, sycl::queue q=get_default_queue())
Constructor of 2-D array with initializer list.
Definition: memory.hpp:1136
std::enable_if< Dim==1, T >::type & operator[](size_t index)
Definition: memory.hpp:1220
value_t * get_ptr(sycl::queue q)
Definition: memory.hpp:1211
void init(sycl::queue q)
Allocate memory with specified queue, and init memory if has initial value.
Definition: memory.hpp:1191
device_memory(const sycl::range< Dimension > &in_range, std::initializer_list< value_t > &&init_list, sycl::queue q=get_default_queue())
Constructor of 1-D array with initializer list.
Definition: memory.hpp:1124
device_memory(sycl::queue q=get_default_queue())
Definition: memory.hpp:1120
void init()
Allocate memory with the queue specified in the constuctor, and init memory if has initial value.
Definition: memory.hpp:1188
void assign(value_t *src, size_t size)
The variable is assigned to a device pointer.
Definition: memory.hpp:1202
typename detail::memory_traits< Memory, T >::template accessor_t< Dimension > accessor_t
Definition: memory.hpp:1116
device_memory(Args... Arguments, sycl::queue q)
Constructor with range and queue.
Definition: memory.hpp:1176
typename detail::memory_traits< Memory, T >::value_t value_t
Definition: memory.hpp:1117
Pitched 2D/3D memory data.
Definition: memory.hpp:101
void set_y(size_t y)
Definition: memory.hpp:117
pitched_data(void *data, size_t pitch, size_t x, size_t y)
Definition: memory.hpp:104
void set_x(size_t x)
Definition: memory.hpp:114
void set_pitch(size_t pitch)
Definition: memory.hpp:111
void set_data_ptr(void *data)
Definition: memory.hpp:108
const void * get_host_pointer()
Definition: memory.hpp:1303
void init(const void *ptr, sycl::queue q=get_default_queue())
Definition: memory.hpp:1288
const void * get_device_pointer()
Definition: memory.hpp:1301
sycl::usm::alloc get_memory_type()
Definition: memory.hpp:1299
unsigned int get_device_id()
Definition: memory.hpp:1307
__ESIMD_API std::enable_if_t<(RegionT::length *sizeof(typename RegionT::element_type) >=2)> wait(sycl::ext::intel::esimd::simd_view< T, RegionT > value)
Create explicit scoreboard dependency to avoid device code motion across this call and preserve the v...
Definition: memory.hpp:226
access::mode access_mode
Definition: access.hpp:72
usm::alloc get_pointer_type(const void *ptr, const context &ctxt)
Query the allocation type from a USM pointer.
Definition: usm_impl.cpp:544
void * malloc_shared(size_t size, const device &dev, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
Definition: usm_impl.cpp:377
constexpr mode_tag_t< access_mode::read_write > read_write
Definition: access.hpp:85
auto auto autodecltype(x) z
device get_pointer_device(const void *ptr, const context &ctxt)
Queries the device against which the pointer was allocated Throws an exception with errc::invalid err...
Definition: usm_impl.cpp:592
void * malloc_device(size_t size, const device &dev, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
Definition: usm_impl.cpp:289
void * malloc_host(size_t size, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
Definition: usm_impl.cpp:352
void free(void *ptr, const context &ctxt, const detail::code_location &CodeLoc=detail::code_location::current())
Definition: usm_impl.cpp:343
Definition: access.hpp:18
static experimental::memcpy_direction deduce_memcpy_direction(sycl::queue q, void *to_ptr, const void *from_ptr)
Definition: memory.hpp:302
static pointer_access_attribute get_pointer_attribute(sycl::queue q, const void *ptr)
Definition: memory.hpp:288
static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, size_t size, const std::vector< sycl::event > &dep_events={})
Definition: memory.hpp:315
static constexpr size_t get_pitch(size_t x)
Calculate pitch (padded length of major dimension x) by rounding up to multiple of 32.
Definition: memory.hpp:195
static sycl::event combine_events(std::vector< sycl::event > &events, sycl::queue q)
Definition: memory.hpp:473
static sycl::event memset(sycl::queue q, void *dev_ptr, int value, size_t size)
Set value to the first size bytes starting from dev_ptr in q.
Definition: memory.hpp:234
static size_t get_offset(sycl::id< 3 > id, size_t slice, size_t pitch)
Definition: memory.hpp:329
static size_t get_copy_range(sycl::range< 3 > size, size_t slice, size_t pitch)
Definition: memory.hpp:324
static void * malloc(size_t size, sycl::queue q)
Definition: memory.hpp:187
static sycl::event fill(sycl::queue q, void *dev_ptr, const T &pattern, size_t count)
Set pattern to the first count elements of type T starting from dev_ptr.
Definition: memory.hpp:222
static std::vector< sycl::event > memcpy(sycl::queue q, const experimental::memcpy_parameter &param)
Definition: memory.hpp:486
static pitched_data to_pitched_data(image_matrix *image)
static void memcpy(const memcpy_parameter &param, sycl::queue q=get_default_queue())
[UNSUPPORTED] Synchronously copies 2D/3D memory data specified by param .
Definition: memory.hpp:850
static void memcpy_async(const memcpy_parameter &param, sycl::queue q=get_default_queue())
[UNSUPPORTED] Asynchronously copies 2D/3D memory data specified by param
Definition: memory.hpp:861
static sycl::event memset_d16_async(void *dev_ptr, unsigned short value, size_t size, sycl::queue q=get_default_queue())
Sets 2 bytes data value to the first size elements starting from dev_ptr in q asynchronously.
Definition: memory.hpp:921
static void memcpy(void *to_ptr, const void *from_ptr, size_t size, sycl::queue q=get_default_queue())
Synchronously copies size bytes from the address specified by from_ptr to the address specified by to...
Definition: memory.hpp:671
uint8_t byte_t
Definition: memory.hpp:98
static void free(void *ptr, sycl::queue q=get_default_queue())
Free the memory ptr on the default queue without synchronizing.
Definition: memory.hpp:632
static sycl::queue get_default_queue()
Util function to get the default queue of current device in device manager.
Definition: device.hpp:872
auto * local_mem()
Definition: memory.hpp:70
static device_ext & get_current_device()
Util function to get the current device.
Definition: device.hpp:900
static void memset_d32(void *dev_ptr, unsigned int value, size_t size, sycl::queue q=get_default_queue())
Sets 4 bytes data value to the first size elements starting from dev_ptr in q synchronously.
Definition: memory.hpp:897
static void * malloc_host(size_t num_bytes, sycl::queue q=get_default_queue())
Allocate memory block on the host.
Definition: memory.hpp:556
typename type_identity< T >::type type_identity_t
Definition: traits.hpp:35
static sycl::event memcpy_async(void *to_ptr, const void *from_ptr, size_t size, sycl::queue q=get_default_queue())
Asynchronously copies size bytes from the address specified by from_ptr to the address specified by t...
Definition: memory.hpp:685
static sycl::event memset_async(void *dev_ptr, int value, size_t size, sycl::queue q=get_default_queue())
Sets 1 byte data value to the first size elements starting from dev_ptr in q asynchronously.
Definition: memory.hpp:908
static void fill(void *dev_ptr, const T &pattern, size_t count, sycl::queue q=get_default_queue())
Synchronously sets pattern to the first count elements starting from dev_ptr.
Definition: memory.hpp:819
static void * malloc_shared(size_t num_bytes, sycl::queue q=get_default_queue())
Allocate memory block of usm_shared memory.
Definition: memory.hpp:576
static void * malloc(size_t &pitch, size_t x, size_t y, sycl::queue q=get_default_queue())
Allocate memory block for 2D array on the device.
Definition: memory.hpp:611
static sycl::event fill_async(void *dev_ptr, const T &pattern, size_t count, sycl::queue q=get_default_queue())
Asynchronously sets pattern to the first count elements starting from dev_ptr.
Definition: memory.hpp:836
static void wait_and_free(void *ptr, sycl::queue q=get_default_queue())
Wait on the queue q and free the memory ptr.
Definition: memory.hpp:620
sycl::event enqueue_free(const std::vector< void * > &pointers, const std::vector< sycl::event > &events, sycl::queue q=get_default_queue())
Enqueues the release of all pointers in /p pointers on the /p q.
Definition: memory.hpp:647
static sycl::event memset_d32_async(void *dev_ptr, unsigned int value, size_t size, sycl::queue q=get_default_queue())
Sets 4 bytes data value to the first size elements starting from dev_ptr in q asynchronously.
Definition: memory.hpp:934
static void memset_d16(void *dev_ptr, unsigned short value, size_t size, sycl::queue q=get_default_queue())
Sets 2 bytes data value to the first size elements starting from dev_ptr in q synchronously.
Definition: memory.hpp:886
static void memset(void *dev_ptr, int value, size_t size, sycl::queue q=get_default_queue())
Synchronously sets value to the first size bytes starting from dev_ptr.
Definition: memory.hpp:875
Memory copy parameters for 2D/3D memory data.
Definition: memory.hpp:143
syclcompat::experimental::memcpy_direction direction
Definition: memory.hpp:155