DPC++ Runtime
Runtime libraries for oneAPI DPC++
group_load_store.hpp
//==---- group_load_store.hpp --- SYCL extension for group loads/stores ----==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// Implements the sycl_ext_oneapi_group_load_store extension.

#pragma once

#include <sycl/ext/oneapi/properties/properties.hpp>
#include <sycl/group_barrier.hpp>
#include <sycl/sycl_span.hpp>

#include <cstring>
namespace sycl {
inline namespace _V1 {
namespace ext::oneapi::experimental {

enum class data_placement_enum { blocked, striped };

struct data_placement_key
    : detail::compile_time_property_key<detail::PropKind::DataPlacement> {
  template <data_placement_enum Placement>
  using value_t =
      property_value<data_placement_key,
                     // TODO: Extension uses data_placement_enum directly here.
                     std::integral_constant<int, static_cast<int>(Placement)>>;
};

template <data_placement_enum Placement>
inline constexpr data_placement_key::value_t<Placement> data_placement;

inline constexpr data_placement_key::value_t<data_placement_enum::blocked>
    data_placement_blocked;
inline constexpr data_placement_key::value_t<data_placement_enum::striped>
    data_placement_striped;

struct contiguous_memory_key
    : detail::compile_time_property_key<detail::PropKind::ContiguousMemory> {
  using value_t = property_value<contiguous_memory_key>;
};

inline constexpr contiguous_memory_key::value_t contiguous_memory;

struct full_group_key
    : detail::compile_time_property_key<detail::PropKind::FullGroup> {
  using value_t = property_value<full_group_key>;
};

inline constexpr full_group_key::value_t full_group;
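// A minimal usage sketch for the compile-time properties above, assuming a
// hypothetical sub-group `sg` and global pointer `in`; group_load itself is
// defined further below:
//
//   float v[4];
//   group_load(sg, in, sycl::span<float, 4>(v, 4),
//              properties(data_placement_striped, contiguous_memory,
//                         full_group));
//
// When no data_placement property is specified, blocked placement is assumed.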
namespace detail {
struct naive_key : detail::compile_time_property_key<detail::PropKind::Naive> {
  using value_t = property_value<naive_key>;
};
inline constexpr naive_key::value_t naive;
using namespace sycl::detail;
} // namespace detail

#ifdef __SYCL_DEVICE_ONLY__
namespace detail {
// Compile-time checks that the iterator and element types are usable with the
// group load/store APIs.
template <typename InputIteratorT, typename OutputElemT>
inline constexpr bool verify_load_types =
    std::is_same_v<
        typename std::iterator_traits<InputIteratorT>::iterator_category,
        std::random_access_iterator_tag> &&
    std::is_convertible_v<remove_decoration_t<typename std::iterator_traits<
                              InputIteratorT>::value_type>,
                          OutputElemT> &&
    std::is_trivially_copyable_v<remove_decoration_t<
        typename std::iterator_traits<InputIteratorT>::value_type>> &&
    std::is_default_constructible_v<remove_decoration_t<
        typename std::iterator_traits<InputIteratorT>::value_type>> &&
    std::is_trivially_copyable_v<OutputElemT> &&
    std::is_default_constructible_v<OutputElemT>;

template <typename InputElemT, typename OutputIteratorT>
inline constexpr bool verify_store_types =
    std::is_same_v<
        typename std::iterator_traits<OutputIteratorT>::iterator_category,
        std::random_access_iterator_tag> &&
    std::is_convertible_v<InputElemT,
                          remove_decoration_t<typename std::iterator_traits<
                              OutputIteratorT>::value_type>> &&
    std::is_trivially_copyable_v<remove_decoration_t<
        typename std::iterator_traits<OutputIteratorT>::value_type>> &&
    std::is_default_constructible_v<remove_decoration_t<
        typename std::iterator_traits<OutputIteratorT>::value_type>> &&
    std::is_trivially_copyable_v<InputElemT> &&
    std::is_default_constructible_v<InputElemT>;
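// For instance, a raw (USM) pointer or an accessor iterator over a trivially
// copyable element type is expected to satisfy the checks above, while e.g. a
// non-random-access iterator (such as one over a linked list) or an element
// type that is not trivially copyable does not.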
template <typename Properties> constexpr bool isBlocked(Properties properties) {
  if constexpr (properties.template has_property<data_placement_key>())
    return properties.template get_property<data_placement_key>() ==
           data_placement_blocked;
  else
    return true;
}

template <bool IsBlocked, int VEC_OR_ARRAY_SIZE, typename GroupTy>
int get_mem_idx(GroupTy g, int vec_or_array_idx) {
  if constexpr (IsBlocked)
    return g.get_local_linear_id() * VEC_OR_ARRAY_SIZE + vec_or_array_idx;
  else
    return g.get_local_linear_id() +
           g.get_local_linear_range() * vec_or_array_idx;
}
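// For illustration, with a hypothetical group of local range 8 and
// VEC_OR_ARRAY_SIZE == 4, work-item 2 maps its four elements to these memory
// indices:
//   blocked: 2 * 4 + {0,1,2,3}  ->  8, 9, 10, 11  (contiguous per work-item)
//   striped: 2 + 8 * {0,1,2,3}  ->  2, 10, 18, 26 (interleaved across items)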

// SPIR-V extension:
// https://github.com/KhronosGroup/SPIRV-Registry/blob/main/extensions/INTEL/SPV_INTEL_subgroups.asciidoc,
// however it doesn't describe limitations/requirements. Those seem to be
// listed in the Intel OpenCL extensions for sub-groups:
// https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroups.html
// https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroups_char.html
// https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroups_long.html
// https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroups_short.html
// Reads require 4-byte alignment, writes 16-byte alignment. Supported
// sizes:
//
// +------------+-------------+
// | block type | # of blocks |
// +------------+-------------+
// | uchar      | 1,2,4,8,16  |
// | ushort     | 1,2,4,8     |
// | uint       | 1,2,4,8     |
// | ulong      | 1,2,4,8     |
// +------------+-------------+
//
// Utility type traits below are used to map user type to one of the block
// read/write types above.

template <typename IteratorT, std::size_t ElementsPerWorkItem, bool Blocked>
struct BlockInfo {
  using value_type =
      remove_decoration_t<typename std::iterator_traits<IteratorT>::value_type>;

  static constexpr int block_size =
      sizeof(value_type) * (Blocked ? ElementsPerWorkItem : 1);
  static constexpr int num_blocks = Blocked ? 1 : ElementsPerWorkItem;
  // There is an overload in the table above that could be used for the block
  // operation:
  static constexpr bool has_builtin =
      detail::is_power_of_two(block_size) &&
      detail::is_power_of_two(num_blocks) && block_size <= 8 &&
      (num_blocks <= 8 || (num_blocks == 16 && block_size == 1));
};
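// Illustrative example of this mapping: for IteratorT = int *,
// ElementsPerWorkItem == 4 and Blocked == true, block_size is
// sizeof(int) * 4 == 16 and num_blocks is 1, so has_builtin is false
// (block_size exceeds 8). With Blocked == false the same iterator gives
// block_size == 4 (uint) and num_blocks == 4, which is in the table above, so
// has_builtin is true.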

template <typename BlockInfoTy> struct BlockTypeInfo;

template <typename IteratorT, std::size_t ElementsPerWorkItem, bool Blocked>
struct BlockTypeInfo<BlockInfo<IteratorT, ElementsPerWorkItem, Blocked>> {
  using BlockInfoTy = BlockInfo<IteratorT, ElementsPerWorkItem, Blocked>;
  static_assert(BlockInfoTy::has_builtin);

  using block_type = detail::cl_unsigned<BlockInfoTy::block_size>;

  using block_pointer_elem_type = std::conditional_t<
      std::is_const_v<std::remove_reference_t<
          typename std::iterator_traits<IteratorT>::reference>>,
      std::add_const_t<block_type>, block_type>;

  using block_pointer_type = typename detail::DecoratedType<
      block_pointer_elem_type, access::address_space::global_space>::type *;
  using block_op_type = std::conditional_t<
      BlockInfoTy::num_blocks == 1, block_type,
      detail::ConvertToOpenCLType_t<vec<block_type, BlockInfoTy::num_blocks>>>;
};
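// Continuing the illustrative example: for a non-const int * iterator with
// ElementsPerWorkItem == 4 and Blocked == false, block_type is a 4-byte
// unsigned integer, block_pointer_type is a global-address-space pointer to
// that type, and block_op_type is the OpenCL representation of a vector of
// four such blocks.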

// Returns either a pointer suitable to use in a block read/write builtin or
// nullptr if some legality conditions aren't satisfied.
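// Summarizing the checks below: a usable pointer is only returned when
// targeting SPIR-V, when the block size/count fits the table above, when the
// full_group property is asserted, when the iterator is a raw pointer or
// multi_ptr (or a contiguous_memory iterator whose &*iter is such a pointer)
// into global or generic memory, and when the address satisfies
// RequiredAlign.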
template <int RequiredAlign, std::size_t ElementsPerWorkItem,
          typename IteratorT, typename Properties>
auto get_block_op_ptr(IteratorT iter, [[maybe_unused]] Properties props) {
  using value_type =
      remove_decoration_t<typename std::iterator_traits<IteratorT>::value_type>;
  using iter_no_cv = std::remove_cv_t<IteratorT>;

  constexpr bool blocked = detail::isBlocked(props);
  using BlkInfo = BlockInfo<IteratorT, ElementsPerWorkItem, blocked>;

#if defined(__SPIR__)
  // TODO: What about non-Intel SPIR-V devices?
  constexpr bool is_spir = true;
#else
  constexpr bool is_spir = false;
#endif

  if constexpr (!is_spir || !BlkInfo::has_builtin) {
    return nullptr;
  } else if constexpr (!props.template has_property<full_group_key>()) {
    return nullptr;
  } else if constexpr (detail::is_multi_ptr_v<IteratorT>) {
    return get_block_op_ptr<RequiredAlign, ElementsPerWorkItem>(
        iter.get_decorated(), props);
  } else if constexpr (!std::is_pointer_v<iter_no_cv>) {
    if constexpr (props.template has_property<contiguous_memory_key>())
      return get_block_op_ptr<RequiredAlign, ElementsPerWorkItem>(&*iter,
                                                                  props);
    else
      return nullptr;
  } else {
    // Load/store to/from nullptr would be UB; this assume allows the compiler
    // to optimize the IR further.
    __builtin_assume(iter != nullptr);

    // No early return as that would mess up return type deduction.
    bool is_aligned = alignof(value_type) >= RequiredAlign ||
                      reinterpret_cast<uintptr_t>(iter) % RequiredAlign == 0;

    constexpr auto AS = detail::deduce_AS<iter_no_cv>::value;
    using block_pointer_type =
        typename BlockTypeInfo<BlkInfo>::block_pointer_type;
    if constexpr (AS == access::address_space::global_space) {
      return is_aligned ? reinterpret_cast<block_pointer_type>(iter) : nullptr;
    } else if constexpr (AS == access::address_space::generic_space) {
      return is_aligned
                 ? reinterpret_cast<block_pointer_type>(
                       __SYCL_GenericCastToPtrExplicit_ToGlobal<value_type>(
                           iter))
                 : nullptr;
    } else {
      return nullptr;
    }
  }
}
} // namespace detail

// Load API span overload.
template <typename Group, typename InputIteratorT, typename OutputT,
          std::size_t ElementsPerWorkItem,
          typename Properties = decltype(properties())>
std::enable_if_t<detail::verify_load_types<InputIteratorT, OutputT> &&
                 detail::is_generic_group_v<Group>>
group_load(Group g, InputIteratorT in_ptr,
           span<OutputT, ElementsPerWorkItem> out, Properties props = {}) {
  constexpr bool blocked = detail::isBlocked(props);
  using use_naive =
      detail::merged_properties_t<Properties,
                                  decltype(properties(detail::naive))>;

  if constexpr (props.template has_property<detail::naive_key>()) {
    group_barrier(g);
    for (int i = 0; i < out.size(); ++i)
      out[i] = in_ptr[detail::get_mem_idx<blocked, ElementsPerWorkItem>(g, i)];
    group_barrier(g);
    return;
  } else if constexpr (!std::is_same_v<Group, sycl::sub_group>) {
    return group_load(g, in_ptr, out, use_naive{});
  } else {
    auto ptr =
        detail::get_block_op_ptr<4 /* load align */, ElementsPerWorkItem>(
            in_ptr, props);
    if (!ptr)
      return group_load(g, in_ptr, out, use_naive{});

    if constexpr (!std::is_same_v<std::nullptr_t, decltype(ptr)>) {
      // Do optimized load.
      using value_type = remove_decoration_t<
          typename std::iterator_traits<InputIteratorT>::value_type>;

      auto load = __spirv_SubgroupBlockReadINTEL<
          typename detail::BlockTypeInfo<detail::BlockInfo<
              InputIteratorT, ElementsPerWorkItem, blocked>>::block_op_type>(
          ptr);

      // TODO: accessor_iterator's value_type is weird, so we need
      // `std::remove_const_t` below:
      //
      //   static_assert(
      //       std::is_same_v<
      //           typename std::iterator_traits<
      //               sycl::detail::accessor_iterator<const int, 1>>::value_type,
      //           const int>);
      //
      // yet
      //
      //   static_assert(
      //       std::is_same_v<
      //           typename std::iterator_traits<const int *>::value_type, int>);

      if constexpr (std::is_same_v<std::remove_const_t<value_type>, OutputT>) {
        static_assert(sizeof(load) == out.size_bytes());
        std::memcpy(out.begin(), &load, out.size_bytes());
      } else {
        std::remove_const_t<value_type> values[ElementsPerWorkItem];
        static_assert(sizeof(load) == sizeof(values));
        std::memcpy(values, &load, sizeof(values));

        // Note: can't `memcpy` directly into `out` because that might bypass
        // an implicit conversion required by the specification.
        for (int i = 0; i < ElementsPerWorkItem; ++i)
          out[i] = values[i];
      }

      return;
    }
  }
}

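// A minimal usage sketch for the span overload above, assuming a hypothetical
// nd_item `item`, accessor `in_acc` and group-uniform element offset `offset`:
//
//   int data[2];
//   group_load(item.get_sub_group(), in_acc.begin() + offset,
//              sycl::span<int, 2>(data, 2),
//              properties(contiguous_memory, full_group));
//
// Without any properties the call is still valid; it simply takes the
// per-element path, since the block-read builtin is only attempted when
// full_group is asserted (see get_block_op_ptr above).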
// Store API span overload.
template <typename Group, typename InputT, std::size_t ElementsPerWorkItem,
          typename OutputIteratorT,
          typename Properties = decltype(properties())>
std::enable_if_t<detail::verify_store_types<InputT, OutputIteratorT> &&
                 detail::is_generic_group_v<Group>>
group_store(Group g, const span<InputT, ElementsPerWorkItem> in,
            OutputIteratorT out_ptr, Properties props = {}) {
  constexpr bool blocked = detail::isBlocked(props);
  using use_naive =
      detail::merged_properties_t<Properties,
                                  decltype(properties(detail::naive))>;

  if constexpr (props.template has_property<detail::naive_key>()) {
    group_barrier(g);
    for (int i = 0; i < in.size(); ++i)
      out_ptr[detail::get_mem_idx<blocked, ElementsPerWorkItem>(g, i)] = in[i];
    group_barrier(g);
    return;
  } else if constexpr (!std::is_same_v<Group, sycl::sub_group>) {
    return group_store(g, in, out_ptr, use_naive{});
  } else {
    auto ptr =
        detail::get_block_op_ptr<16 /* store align */, ElementsPerWorkItem>(
            out_ptr, props);
    if (!ptr)
      return group_store(g, in, out_ptr, use_naive{});

    if constexpr (!std::is_same_v<std::nullptr_t, decltype(ptr)>) {
      // Do optimized store.
      std::remove_const_t<remove_decoration_t<
          typename std::iterator_traits<OutputIteratorT>::value_type>>
          values[ElementsPerWorkItem];

      for (int i = 0; i < ElementsPerWorkItem; ++i) {
        // Including implicit conversion.
        values[i] = in[i];
      }

      __spirv_SubgroupBlockWriteINTEL(
          ptr,
          sycl::bit_cast<typename detail::BlockTypeInfo<detail::BlockInfo<
              OutputIteratorT, ElementsPerWorkItem, blocked>>::block_op_type>(
              values));
    }
  }
}

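// A matching store sketch, reusing the hypothetical names from the group_load
// example above:
//
//   group_store(item.get_sub_group(), sycl::span<int, 2>(data, 2),
//               out_acc.begin() + offset, properties(full_group));
//
// Block writes require 16-byte alignment versus 4 bytes for block reads (see
// the table in the detail section above), so a store is more likely to take
// the naive path.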
// Load API scalar.
template <typename Group, typename InputIteratorT, typename OutputT,
          typename Properties = decltype(properties())>
std::enable_if_t<detail::verify_load_types<InputIteratorT, OutputT> &&
                 detail::is_generic_group_v<Group>>
group_load(Group g, InputIteratorT in_ptr, OutputT &out,
           Properties properties = {}) {
  group_load(g, in_ptr, span<OutputT, 1>(&out, 1), properties);
}

// Store API scalar.
template <typename Group, typename InputT, typename OutputIteratorT,
          typename Properties = decltype(properties())>
std::enable_if_t<detail::verify_store_types<InputT, OutputIteratorT> &&
                 detail::is_generic_group_v<Group>>
group_store(Group g, const InputT &in, OutputIteratorT out_ptr,
            Properties properties = {}) {
  group_store(g, span<const InputT, 1>(&in, 1), out_ptr, properties);
}

// Load API sycl::vec overload.
template <typename Group, typename InputIteratorT, typename OutputT, int N,
          typename Properties = decltype(properties())>
std::enable_if_t<detail::verify_load_types<InputIteratorT, OutputT> &&
                 detail::is_generic_group_v<Group>>
group_load(Group g, InputIteratorT in_ptr, sycl::vec<OutputT, N> &out,
           Properties properties = {}) {
  group_load(g, in_ptr, span<OutputT, N>(&out[0], N), properties);
}

// Store API sycl::vec overload.
template <typename Group, typename InputT, int N, typename OutputIteratorT,
          typename Properties = decltype(properties())>
std::enable_if_t<detail::verify_store_types<InputT, OutputIteratorT> &&
                 detail::is_generic_group_v<Group>>
group_store(Group g, const sycl::vec<InputT, N> &in, OutputIteratorT out_ptr,
            Properties properties = {}) {
  group_store(g, span<const InputT, N>(&in[0], N), out_ptr, properties);
}

#else
template <typename... Args> void group_load(Args...) {
  throw sycl::exception(
      std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()),
      "Group loads/stores are not supported on host.");
}
template <typename... Args> void group_store(Args...) {
  throw sycl::exception(
      std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()),
      "Group loads/stores are not supported on host.");
}
#endif
} // namespace ext::oneapi::experimental
} // namespace _V1
} // namespace sycl