DPC++ Runtime
Runtime libraries for oneAPI DPC++
memory_intrin.hpp
Go to the documentation of this file.
1 //==------------ memory_intrin.hpp - DPC++ Explicit SIMD API ---------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // Declares Explicit SIMD intrinsics used to implement working with
9 // the SIMD classes objects.
10 //===----------------------------------------------------------------------===//
11 
13 
14 #pragma once
15 
16 #include <sycl/accessor.hpp>
20 #include <sycl/types.hpp>
21 
22 #include <cstdint>
23 
24 #ifndef __SYCL_DEVICE_ONLY__
25 // ESIMD_CPU Emulation support using esimd_cpu plugin
26 
27 #include <sycl/backend_types.hpp>
28 #include <sycl/detail/pi.hpp>
31 
32 // Channel Mask Array for scaled-gather/scatter
33 const std::array<__ESIMD_NS::rgba_channel, 4> ChannelMaskArray{
34  __ESIMD_NS::rgba_channel::R, __ESIMD_NS::rgba_channel::G,
35  __ESIMD_NS::rgba_channel::B, __ESIMD_NS::rgba_channel::A};
36 
37 #endif // ifndef __SYCL_DEVICE_ONLY__
38 
39 namespace sycl {
41 namespace ext::intel::esimd::detail {
42 
43 // Provides access to sycl accessor class' private members.
// Provides access to sycl accessor class' private members.
// ESIMD intrinsics need the raw pointer / native image object behind an
// accessor; this proxy is the single friend-style gateway to them.
class AccessorPrivateProxy {
public:
  /// Returns what ESIMD memory intrinsics operate on for the given accessor:
  /// on device, the native image object (image accessors) or the qualified
  /// raw pointer (buffer accessors); on host, the accessor itself, so the
  /// emulation layer can inspect it.
  template <typename AccessorTy>
  static auto getQualifiedPtrOrImageObj(const AccessorTy &Acc) {
#ifdef __SYCL_DEVICE_ONLY__
    if constexpr (sycl::detail::acc_properties::is_image_accessor_v<AccessorTy>)
      return Acc.getNativeImageObj();
    else
      return Acc.getQualifiedPtr();
#else // __SYCL_DEVICE_ONLY__
    return Acc;
#endif // __SYCL_DEVICE_ONLY__
  }

#ifndef __SYCL_DEVICE_ONLY__
  /// Host-only helper: returns the raw data pointer of a host accessor.
  static void *getPtr(const sycl::detail::AccessorBaseHost &Acc) {
    return Acc.getPtr();
  }
#endif // __SYCL_DEVICE_ONLY__
};
64 
/// Returns the hardware encoding of the elements-per-address count used by
/// gather/scatter intrinsics: log2 of \p ElemsPerAddr.
/// Only 1, 2 and 4 are accepted; other values are rejected at compile time
/// by the enable_if constraint on the template.
template <int ElemsPerAddr,
          typename = std::enable_if_t<(ElemsPerAddr == 1 || ElemsPerAddr == 2 ||
                                       ElemsPerAddr == 4)>>
constexpr unsigned int ElemsPerAddrEncoding() {
  // encoding requires log2 of ElemsPerAddr
  if constexpr (ElemsPerAddr == 1)
    return 0;
  else if constexpr (ElemsPerAddr == 2)
    return 1;
  else
    // ElemsPerAddr == 4 is guaranteed by the enable_if constraint; a plain
    // `else` ensures every path returns (the original could fall off the
    // end of the function, tripping -Wreturn-type).
    return 2;
}
79 
/// Inverse of ElemsPerAddrEncoding(): decodes an encoded count back to the
/// actual number of elements per address, i.e. 2 ^ \p ElemsPerAddrEncoded.
constexpr unsigned int ElemsPerAddrDecoding(unsigned int ElemsPerAddrEncoded) {
  // The decoded value is two raised to the encoded power.
  return 1u << ElemsPerAddrEncoded;
}
84 
85 } // namespace ext::intel::esimd::detail
86 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
87 } // namespace sycl
88 
// flat_read does flat-address gather
//
// Gathers up to NumBlk (encoded) elements of type Ty from each of the N
// flat addresses in `addrs`, under the per-lane predicate `pred`.
// Declared-only on device (real intrinsic); the body below is the host
// emulation.
template <typename Ty, int N, int NumBlk = 0, int ElemsPerAddr = 0>
__ESIMD_INTRIN
    __ESIMD_DNS::vector_type_t<Ty,
                               N * __ESIMD_DNS::ElemsPerAddrDecoding(NumBlk)>
    __esimd_svm_gather(__ESIMD_DNS::vector_type_t<uint64_t, N> addrs,
                       __ESIMD_DNS::simd_mask_storage_t<N> pred = 1)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  auto NumBlkDecoded = __ESIMD_DNS::ElemsPerAddrDecoding(NumBlk);
  // Lanes whose predicate is off keep this zero initial value.
  __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_DNS::ElemsPerAddrDecoding(NumBlk)>
      V = 0;
  auto ElemsPerAddrDecoded = __ESIMD_DNS::ElemsPerAddrDecoding(ElemsPerAddr);
  // For 2-byte elements the effective per-address element count is halved —
  // presumably matching the hardware's packed 2-byte access; TODO confirm.
  if (sizeof(Ty) == 2)
    ElemsPerAddrDecoded = ElemsPerAddrDecoded / 2;

  for (int I = 0; I < N; I++) {
    if (pred[I]) {
      Ty *Addr = reinterpret_cast<Ty *>(addrs[I]);
      if (sizeof(Ty) <= 2) {
        // 1-/2-byte elements: each lane's block is stored contiguously.
        for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddrDecoded; J++)
          V[I * NumBlkDecoded + J] = *(Addr + J);
      } else {
        // 4-byte and wider elements: transposed layout — block index J is
        // the major dimension, lane index I the minor one.
        for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddrDecoded; J++)
          V[J * N + I] = *(Addr + J);
      }
    }
  }
  return V;
}
#endif // __SYCL_DEVICE_ONLY__
122 
// flat_write does flat-address scatter
//
// Mirror of __esimd_svm_gather: writes up to NumBlk (encoded) elements per
// lane from `vals` to the N flat addresses in `addrs`, under predicate
// `pred`. `vals` uses the same per-size layout the gather produces
// (contiguous per lane for 1-/2-byte elements, transposed otherwise).
template <typename Ty, int N, int NumBlk = 0, int ElemsPerAddr = 0>
__ESIMD_INTRIN void __esimd_svm_scatter(
    __ESIMD_DNS::vector_type_t<uint64_t, N> addrs,
    __ESIMD_DNS::vector_type_t<Ty,
                               N * __ESIMD_DNS::ElemsPerAddrDecoding(NumBlk)>
        vals,
    __ESIMD_DNS::simd_mask_storage_t<N> pred = 1)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  auto NumBlkDecoded = __ESIMD_DNS::ElemsPerAddrDecoding(NumBlk);
  auto ElemsPerAddrDecoded = __ESIMD_DNS::ElemsPerAddrDecoding(ElemsPerAddr);
  // 2-byte elements: halve the effective per-address count (see gather).
  if (sizeof(Ty) == 2)
    ElemsPerAddrDecoded = ElemsPerAddrDecoded / 2;

  for (int I = 0; I < N; I++) {
    if (pred[I]) {
      Ty *Addr = reinterpret_cast<Ty *>(addrs[I]);
      if (sizeof(Ty) <= 2) {
        // 1-/2-byte elements: lane's block is contiguous in `vals`.
        for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddrDecoded; J++)
          *(Addr + J) = vals[I * NumBlkDecoded + J];
      } else {
        // Wider elements: transposed layout (block-major, lane-minor).
        for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddrDecoded; J++)
          *(Addr + J) = vals[J * N + I];
      }
    }
  }
}
#endif // __SYCL_DEVICE_ONLY__
154 
155 // flat_block_read reads a block of data from one flat address
156 template <typename Ty, int N>
157 __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N>
158 __esimd_svm_block_ld_unaligned(uint64_t addr)
159 #ifdef __SYCL_DEVICE_ONLY__
160  ;
161 #else
162 {
163  __ESIMD_DNS::vector_type_t<Ty, N> V;
164 
165  for (int I = 0; I < N; I++) {
166  Ty *Addr = reinterpret_cast<Ty *>(addr + I * sizeof(Ty));
167  V[I] = *Addr;
168  }
169  return V;
170 }
171 #endif // __SYCL_DEVICE_ONLY__
172 
173 // Read a block of data from the given address. Address must be 16-byte aligned.
174 template <typename Ty, int N>
175 __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N>
176 __esimd_svm_block_ld(uint64_t addr)
177 #ifdef __SYCL_DEVICE_ONLY__
178  ;
179 #else
180 {
181  __ESIMD_DNS::vector_type_t<Ty, N> V;
182 
183  for (int I = 0; I < N; I++) {
184  Ty *Addr = reinterpret_cast<Ty *>(addr + I * sizeof(Ty));
185  V[I] = *Addr;
186  }
187  return V;
188 }
189 #endif // __SYCL_DEVICE_ONLY__
190 
191 // flat_block_write writes a block of data using one flat address
192 template <typename Ty, int N>
193 __ESIMD_INTRIN void __esimd_svm_block_st(uint64_t addr,
194  __ESIMD_DNS::vector_type_t<Ty, N> vals)
195 #ifdef __SYCL_DEVICE_ONLY__
196  ;
197 #else
198 {
199  for (int I = 0; I < N; I++) {
200  Ty *Addr = reinterpret_cast<Ty *>(addr + I * sizeof(Ty));
201  *Addr = vals[I];
202  }
203 }
204 #endif // __SYCL_DEVICE_ONLY__
205 
// Reads a block of data from given surface at given offset.
// `offset` is in bytes (unlike __esimd_oword_ld, which takes owords).
// Out-of-bounds reads from a regular surface yield 0.
template <typename Ty, int N, typename SurfIndAliasTy, int32_t IsModified = 0>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N>
__esimd_oword_ld_unaligned(SurfIndAliasTy surf_ind, uint32_t offset)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  __ESIMD_DNS::vector_type_t<Ty, N> retv;
  // NOTE(review): the initializer is truncated in this rendering —
  // presumably sycl::detail::getESIMDDeviceInterface(); confirm in repo.
  sycl::detail::ESIMDDeviceInterface *I =

  if (surf_ind == __ESIMD_NS::detail::SLM_BTI) {
    // O-word/Block load for Shared Local Memory
    // __ESIMD_NS::detail::SLM_BTI is special binding table index for SLM
    char *SlmBase = I->__cm_emu_get_slm_ptr();
    for (int i = 0; i < N; ++i) {
      Ty *SlmAddr = reinterpret_cast<Ty *>(offset + SlmBase);
      retv[i] = *SlmAddr;
      offset += sizeof(Ty);
    }
  } else {
    // O-word/Block load for regular surface indexed by surf_ind
    char *readBase;
    uint32_t width;
    std::mutex *mutexLock;

    I->sycl_get_cm_buffer_params_ptr(surf_ind, &readBase, &width, &mutexLock);

    // Serialize with other emulated accesses to the same buffer.
    std::lock_guard<std::mutex> lock(*mutexLock);

    for (int idx = 0; idx < N; idx++) {
      if (offset >= width) {
        // Reading past the surface end returns zero instead of faulting.
        retv[idx] = 0;
      } else {
        retv[idx] = *((Ty *)(readBase + offset));
      }
      offset += (uint32_t)sizeof(Ty);
    }
  }
  return retv;
}
#endif // __SYCL_DEVICE_ONLY__
249 
// Writes given block of data to a surface with given index at given offset.
// `offset` is in owords (16 bytes) — it is converted to bytes below.
// Writes past the surface end are silently dropped.
template <typename Ty, int N, typename SurfIndAliasTy>
__ESIMD_INTRIN void __esimd_oword_st(SurfIndAliasTy surf_ind, uint32_t offset,
                                     __ESIMD_DNS::vector_type_t<Ty, N> vals)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  // Convert oword units to a byte offset (1 oword == 16 bytes).
  offset <<= 4;

  // NOTE(review): initializer truncated in this rendering — presumably
  // sycl::detail::getESIMDDeviceInterface(); confirm in repo.
  sycl::detail::ESIMDDeviceInterface *I =
  if (surf_ind == __ESIMD_NS::detail::SLM_BTI) {
    // O-word/Block store for Shared Local Memory
    // __ESIMD_NS::detail::SLM_BTI is special binding table index for SLM
    char *SlmBase = I->__cm_emu_get_slm_ptr();
    for (int i = 0; i < N; ++i) {
      Ty *SlmAddr = reinterpret_cast<Ty *>(offset + SlmBase);
      *SlmAddr = vals[i];
      offset += sizeof(Ty);
    }
  } else {
    // O-word/Block store for regular surface indexed by surf_ind
    char *writeBase;
    uint32_t width;
    std::mutex *mutexLock;

    I->sycl_get_cm_buffer_params_ptr(surf_ind, &writeBase, &width, &mutexLock);

    // Serialize with other emulated accesses to the same buffer.
    std::lock_guard<std::mutex> lock(*mutexLock);

    for (int idx = 0; idx < N; idx++) {
      if (offset < width) {
        *((Ty *)(writeBase + offset)) = vals[idx];
      } else {
        // Remaining elements would all be out of bounds — stop writing.
        break;
      }
      offset += (uint32_t)sizeof(Ty);
    }

    // TODO : Optimize
    I->cm_fence_ptr();
  }
}
#endif // __SYCL_DEVICE_ONLY__
295 
// flat_read4 does flat-address gather4
//
// For each RGBA channel enabled in Mask (traversed in R, G, B, A order),
// gathers one Ty element per predicated lane from addrs[I] + channel byte
// offset. The result is channel-major: all lanes of the first enabled
// channel, then the next, etc. Disabled-predicate lanes stay 0.
template <typename Ty, int N, __ESIMD_NS::rgba_channel_mask Mask>
__ESIMD_DNS::vector_type_t<Ty, N * get_num_channels_enabled(Mask)>
    __ESIMD_INTRIN
    __esimd_svm_gather4_scaled(__ESIMD_DNS::vector_type_t<uint64_t, N> addrs,
                               __ESIMD_DNS::simd_mask_storage_t<N> pred = 1)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  __ESIMD_DNS::vector_type_t<Ty, N * get_num_channels_enabled(Mask)> V = 0;
  // `Next` indexes the packed result across enabled channels only.
  unsigned int Next = 0;
  uint64_t Offset = 0;

  for (const auto &channel : ChannelMaskArray) {
    if (__ESIMD_NS::is_channel_enabled(Mask, channel)) {
      for (int I = 0; I < N; I++, Next++) {
        if (pred[I]) {
          Ty *Addr = reinterpret_cast<Ty *>(addrs[I] + Offset);
          V[Next] = *Addr;
        }
      }
    }
    // Each successive channel lives sizeof(Ty) bytes further from the
    // lane's base address (advanced even for disabled channels).
    Offset += (uint64_t)sizeof(Ty);
  }

  return V;
}
#endif // __SYCL_DEVICE_ONLY__
325 
326 // flat_write does flat-address scatter
327 template <typename Ty, int N, __ESIMD_NS::rgba_channel_mask Mask>
328 __ESIMD_INTRIN void __esimd_svm_scatter4_scaled(
329  __ESIMD_DNS::vector_type_t<uint64_t, N> addrs,
330  __ESIMD_DNS::vector_type_t<Ty, N * get_num_channels_enabled(Mask)> vals,
331  __ESIMD_DNS::simd_mask_storage_t<N> pred = 1)
332 #ifdef __SYCL_DEVICE_ONLY__
333  ;
334 #else
335 {
336  __ESIMD_DNS::vector_type_t<Ty, N * get_num_channels_enabled(Mask)> V;
337  unsigned int Next = 0;
338  uint64_t Offset = 0;
339 
340  for (const auto &channel : ChannelMaskArray) {
341  if (__ESIMD_NS::is_channel_enabled(Mask, channel)) {
342  for (int I = 0; I < N; I++, Next++) {
343  if (pred[I]) {
344  Ty *Addr = reinterpret_cast<Ty *>(addrs[I] + Offset);
345  *Addr = vals[Next];
346  }
347  }
348  }
349  Offset += (uint64_t)sizeof(Ty);
350  }
351 }
352 #endif // __SYCL_DEVICE_ONLY__
353 
354 // Low-level surface-based gather. Collects elements located at given offsets in
355 // a surface and returns them as a single \ref simd object. Element can be
356 // 1, 2 or 4-byte value, but is always returned as a 4-byte value within the
357 // resulting simd object, with upper 2 or 3 bytes undefined.
358 // Template (compile-time constant) parameters:
359 // @tparam Ty - element type; can only be a 4-byte integer or \c float,
360 // @tparam N - the number of elements
361 // @tparam SurfIndAliasTy - "surface index alias" type - internal type in the
362 // accessor used to denote the surface
363 // @tparam TySizeLog2 - Log2 of the number of bytes read per element:
364 // 0 - 1 byte, 1 - 2 bytes, 2 - 4 bytes
365 // @tparam Scale - offset scaling factor; must be zero currently
366 // @tparam L1H - L1 cache hint
367 // @tparam L3H - L3 cache hint
368 //
369 // Formal parameters:
370 // @param surf_ind - the surface index, taken from the SYCL memory object
371 // @param global_offset - offset added to each individual element's offset to
372 // compute actual memory access offset for that element
373 // @param elem_offsets - per-element offsets
374 //
template <typename Ty, int N, typename SurfIndAliasTy, int TySizeLog2,
          int16_t Scale = 0>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N>
__esimd_gather_scaled2(SurfIndAliasTy surf_ind, uint32_t global_offset,
                       __ESIMD_DNS::vector_type_t<uint32_t, N> elem_offsets)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  // Host emulation is not implemented for this intrinsic: validate the
  // template parameters at compile time, then report unsupported-on-host.
  static_assert(N == 1 || N == 8 || N == 16 || N == 32);
  static_assert(TySizeLog2 <= 2 && Scale == 0);
  static_assert(std::is_integral_v<Ty> || TySizeLog2 == 2);
  __ESIMD_UNSUPPORTED_ON_HOST;
}
#endif // __SYCL_DEVICE_ONLY__
390 
391 // Low-level surface-based scatter. Writes elements of a \ref simd object into a
392 // surface at given offsets. Element can be a 1, 2 or 4-byte value, but it is
393 // always represented as a 4-byte value within the input simd object,
394 // unused (not written) upper bytes are ignored.
395 // Template (compile-time constant) parameters:
396 // @tparam Ty - element type; can only be a 4-byte integer or \c float,
397 // @tparam N - the number of elements to write
398 // @tparam SurfIndAliasTy - "surface index alias" type - internal type in the
399 // accessor used to denote the surface
400 // @tparam TySizeLog2 - Log2 of the number of bytes written per element:
401 // 0 - 1 byte, 1 - 2 bytes, 2 - 4 bytes
402 // @tparam Scale - offset scale; only 0 is supported for now
403 // @tparam L1H - L1 cache hint
404 // @tparam L3H - L3 cache hint
405 //
406 // Formal parameters:
407 // @param pred - per-element predicates; elements with zero corresponding
408 // predicates are not written
409 // @param surf_ind - the surface index, taken from the SYCL memory object
410 // @param global_offset - offset added to each individual element's offset to
411 // compute actual memory access offset for that element
412 // @param elem_offsets - per-element offsets
413 // @param vals - values to write
414 //
template <typename Ty, int N, typename SurfIndAliasTy, int TySizeLog2,
          int16_t Scale = 0>
__ESIMD_INTRIN void
__esimd_scatter_scaled(__ESIMD_DNS::simd_mask_storage_t<N> pred,
                       SurfIndAliasTy surf_ind, uint32_t global_offset,
                       __ESIMD_DNS::vector_type_t<uint32_t, N> elem_offsets,
                       __ESIMD_DNS::vector_type_t<Ty, N> vals)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  static_assert(N == 1 || N == 8 || N == 16 || N == 32);
  static_assert(TySizeLog2 <= 2);
  static_assert(std::is_integral_v<Ty> || TySizeLog2 == 2);

  // determine the original element's type size (as __esimd_scatter_scaled
  // requires vals to be a vector of 4-byte integers)
  constexpr size_t OrigSize = __ESIMD_DNS::ElemsPerAddrDecoding(TySizeLog2);
  using RestoredTy = __ESIMD_DNS::uint_type_t<OrigSize>;

  // NOTE(review): initializer truncated in this rendering — presumably
  // sycl::detail::getESIMDDeviceInterface(); confirm in repo.
  sycl::detail::ESIMDDeviceInterface *I =

  // Narrow (or bitcast) the 4-byte-per-lane `vals` down to the element
  // size actually stored in memory.
  __ESIMD_DNS::vector_type_t<RestoredTy, N> TypeAdjustedVals;
  if constexpr (OrigSize == 4) {
    TypeAdjustedVals = __ESIMD_DNS::bitcast<RestoredTy, Ty, N>(vals);
  } else {
    static_assert(OrigSize == 1 || OrigSize == 2);
    TypeAdjustedVals = __ESIMD_DNS::convert_vector<RestoredTy, Ty, N>(vals);
  }

  if (surf_ind == __ESIMD_NS::detail::SLM_BTI) {
    // Scattered-store for Shared Local Memory
    // __ESIMD_NS::detail::SLM_BTI is special binding table index for SLM
    assert(global_offset == 0);
    char *SlmBase = I->__cm_emu_get_slm_ptr();
    for (int i = 0; i < N; ++i) {
      if (pred[i]) {
        RestoredTy *addr =
            reinterpret_cast<RestoredTy *>(elem_offsets[i] + SlmBase);
        *addr = TypeAdjustedVals[i];
      }
    }
  } else {
    // Scattered-store for regular surface indexed by surf_ind
    char *writeBase;
    uint32_t width;
    std::mutex *mutexLock;

    I->sycl_get_cm_buffer_params_ptr(surf_ind, &writeBase, &width, &mutexLock);
    writeBase += global_offset;

    // Serialize with other emulated accesses to the same buffer.
    std::lock_guard<std::mutex> lock(*mutexLock);

    for (int idx = 0; idx < N; idx++) {
      if (pred[idx]) {
        RestoredTy *addr =
            reinterpret_cast<RestoredTy *>(elem_offsets[idx] + writeBase);
        *addr = TypeAdjustedVals[idx];
      }
    }

    // TODO : Optimize
    I->cm_fence_ptr();
  }
}
#endif // __SYCL_DEVICE_ONLY__
482 
// flat_atomic: flat-address atomic
//
// Zero-source atomic (load / inc / dec) applied per lane to the flat
// addresses in `addrs` under predicate `pred`. Returns the pre-operation
// values; lanes with a zero predicate keep the initial value 0.
template <__ESIMD_NS::atomic_op Op, typename Ty, int N>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N>
__esimd_svm_atomic0(__ESIMD_DNS::vector_type_t<uint64_t, N> addrs,
                    __ESIMD_DNS::simd_mask_storage_t<N> pred)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  __ESIMD_DNS::vector_type_t<Ty, N> Oldval = 0;

  for (int AddrIdx = 0; AddrIdx < N; AddrIdx += 1) {
    if (pred[AddrIdx] == 0) {
      // Skip Oldval vector elements corresponding to
      // predicates whose value is zero
      continue;
    }
    if constexpr (Op == __ESIMD_NS::atomic_op::load) {
      Oldval[AddrIdx] = __ESIMD_DNS::atomic_load<Ty>((Ty *)addrs[AddrIdx]);
    } else if constexpr (Op == __ESIMD_NS::atomic_op::inc) {
      Oldval[AddrIdx] =
          __ESIMD_DNS::atomic_add<Ty>((Ty *)addrs[AddrIdx], static_cast<Ty>(1));
    } else if constexpr (Op == __ESIMD_NS::atomic_op::dec) {
      Oldval[AddrIdx] =
          __ESIMD_DNS::atomic_sub<Ty>((Ty *)addrs[AddrIdx], static_cast<Ty>(1));
    }
    // NOTE(review): any other Op silently leaves the lane at 0; consider
    // rejecting unsupported ops at compile time.
  }
  return Oldval;
}
#endif // __SYCL_DEVICE_ONLY__
513 
// One-source flat-address atomic: applies Op with operand src0[lane] to
// each predicated lane's address and returns the pre-operation values.
// NOTE(review): `Oldval` is NOT zero-initialized here (unlike
// __esimd_svm_atomic0), so predicated-off lanes return indeterminate data.
template <__ESIMD_NS::atomic_op Op, typename Ty, int N>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N>
__esimd_svm_atomic1(__ESIMD_DNS::vector_type_t<uint64_t, N> addrs,
                    __ESIMD_DNS::vector_type_t<Ty, N> src0,
                    __ESIMD_DNS::simd_mask_storage_t<N> pred)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  __ESIMD_DNS::vector_type_t<Ty, N> Oldval;

  for (int AddrIdx = 0; AddrIdx < N; AddrIdx++) {
    if (pred[AddrIdx] == 0) {
      // Skip Output vector elements corresponding to
      // predicates whose value is zero
      continue;
    }

    // Integer and floating-point variants of the same operation share one
    // emulation helper (e.g. add/fadd both map to atomic_add).
    if constexpr (Op == __ESIMD_NS::atomic_op::store) {
      Oldval[AddrIdx] =
          __ESIMD_DNS::atomic_store<Ty>((Ty *)addrs[AddrIdx], src0[AddrIdx]);
    } else if constexpr ((Op == __ESIMD_NS::atomic_op::add) ||
                         (Op == __ESIMD_NS::atomic_op::fadd)) {
      Oldval[AddrIdx] =
          __ESIMD_DNS::atomic_add<Ty>((Ty *)addrs[AddrIdx], src0[AddrIdx]);
    } else if constexpr ((Op == __ESIMD_NS::atomic_op::sub) ||
                         (Op == __ESIMD_NS::atomic_op::fsub)) {
      Oldval[AddrIdx] =
          __ESIMD_DNS::atomic_sub<Ty>((Ty *)addrs[AddrIdx], src0[AddrIdx]);
    } else if constexpr ((Op == __ESIMD_NS::atomic_op::minsint) ||
                         (Op == __ESIMD_NS::atomic_op::min) ||
                         (Op == __ESIMD_NS::atomic_op::fmin)) {
      Oldval[AddrIdx] =
          __ESIMD_DNS::atomic_min<Ty>((Ty *)addrs[AddrIdx], src0[AddrIdx]);
    } else if constexpr ((Op == __ESIMD_NS::atomic_op::maxsint) ||
                         (Op == __ESIMD_NS::atomic_op::max) ||
                         (Op == __ESIMD_NS::atomic_op::fmax)) {
      Oldval[AddrIdx] =
          __ESIMD_DNS::atomic_max<Ty>((Ty *)addrs[AddrIdx], src0[AddrIdx]);
    } else if constexpr (Op == __ESIMD_NS::atomic_op::bit_and) {
      Oldval[AddrIdx] =
          __ESIMD_DNS::atomic_and<Ty>((Ty *)addrs[AddrIdx], src0[AddrIdx]);
    } else if constexpr (Op == __ESIMD_NS::atomic_op::bit_or) {
      Oldval[AddrIdx] =
          __ESIMD_DNS::atomic_or<Ty>((Ty *)addrs[AddrIdx], src0[AddrIdx]);
    } else if constexpr (Op == __ESIMD_NS::atomic_op::bit_xor) {
      Oldval[AddrIdx] =
          __ESIMD_DNS::atomic_xor<Ty>((Ty *)addrs[AddrIdx], src0[AddrIdx]);
    }
  }

  return Oldval;
}
#endif // __SYCL_DEVICE_ONLY__
568 
// Two-source flat-address atomic: compare-and-exchange only.
// src0 is the expected/compare value, src1 the replacement; returns the
// pre-operation values. NOTE(review): `Oldval` is not zero-initialized,
// so predicated-off lanes return indeterminate data.
template <__ESIMD_NS::atomic_op Op, typename Ty, int N>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N>
__esimd_svm_atomic2(__ESIMD_DNS::vector_type_t<uint64_t, N> addrs,
                    __ESIMD_DNS::vector_type_t<Ty, N> src0,
                    __ESIMD_DNS::vector_type_t<Ty, N> src1,
                    __ESIMD_DNS::simd_mask_storage_t<N> pred)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  __ESIMD_DNS::vector_type_t<Ty, N> Oldval;

  for (int AddrIdx = 0; AddrIdx < N; AddrIdx++) {
    if (pred[AddrIdx] == 0) {
      // Skip Output vector elements corresponding to
      // predicates whose value is zero
      continue;
    }
    // Only the (f)cmpxchg ops are valid for the two-source form.
    static_assert((Op == __ESIMD_NS::atomic_op::cmpxchg) ||
                  (Op == __ESIMD_NS::atomic_op::fcmpxchg));
    Oldval[AddrIdx] = __ESIMD_DNS::atomic_cmpxchg((Ty *)addrs[AddrIdx],
                                                  src0[AddrIdx], src1[AddrIdx]);
  }
  return Oldval;
}
#endif // __SYCL_DEVICE_ONLY__
595 
// Initializes the emulated Shared Local Memory pool to `size` bytes.
__ESIMD_INTRIN void __esimd_slm_init(uint32_t size)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  sycl::detail::getESIMDDeviceInterface()->cm_slm_init_ptr(size);
}
#endif // __SYCL_DEVICE_ONLY__
604 
// esimd_barrier, generic group barrier
__ESIMD_INTRIN void __esimd_barrier()
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  // Host emulation delegates to the CM emulation layer's barrier.
  sycl::detail::getESIMDDeviceInterface()->cm_barrier_ptr();
}
#endif // __SYCL_DEVICE_ONLY__
614 
// slm_fence sets the SLM read/write order
// `cntl` selects the fence behavior on device; it is ignored by the host
// emulation below.
__ESIMD_INTRIN void __esimd_fence(uint8_t cntl)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  // CM_EMU's 'cm_fence' is NOP. Disabled.
  // sycl::detail::getESIMDDeviceInterface()->cm_fence_ptr();
}
#endif // __SYCL_DEVICE_ONLY__
626 
// Scaled gather from a surface.
// Gathers one Ty element per predicated lane from the surface (or SLM when
// surf_ind == SLM_BTI) at global_offset + addrs[lane]. Lanes with a zero
// predicate return 0.
// NOTE(review): TySizeLog2/Scale are not used by this host emulation —
// full Ty elements are read regardless; confirm intended semantics.
template <typename Ty, int N, typename SurfIndAliasTy, int TySizeLog2,
          int16_t Scale = 0>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N>
__esimd_gather_scaled(__ESIMD_DNS::simd_mask_storage_t<N> pred,
                      SurfIndAliasTy surf_ind, uint32_t global_offset,
                      __ESIMD_DNS::vector_type_t<uint32_t, N> addrs)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  __ESIMD_DNS::vector_type_t<Ty, N> retv = 0;
  // NOTE(review): initializer truncated in this rendering — presumably
  // sycl::detail::getESIMDDeviceInterface(); confirm in repo.
  sycl::detail::ESIMDDeviceInterface *I =
  if (surf_ind == __ESIMD_NS::detail::SLM_BTI) {
    // Scattered-load for Shared Local Memory
    // __ESIMD_NS::detail::SLM_BTI is special binding table index for SLM
    assert(global_offset == 0);
    char *SlmBase = I->__cm_emu_get_slm_ptr();
    for (int i = 0; i < N; ++i) {
      if (pred[i]) {
        Ty *addr = reinterpret_cast<Ty *>(addrs[i] + SlmBase);
        retv[i] = *addr;
      }
    }
  } else {
    // Scattered-load for regular surface indexed by surf_ind
    char *readBase;
    uint32_t width;
    std::mutex *mutexLock;

    I->sycl_get_cm_buffer_params_ptr(surf_ind, &readBase, &width, &mutexLock);
    readBase += global_offset;

    // Serialize with other emulated accesses to the same buffer.
    std::lock_guard<std::mutex> lock(*mutexLock);

    for (int idx = 0; idx < N; idx++) {
      if (pred[idx]) {
        Ty *addr = reinterpret_cast<Ty *>(addrs[idx] + readBase);
        retv[idx] = *addr;
      }
    }

    // TODO : Optimize
    I->cm_fence_ptr();
  }

  return retv;
}
#endif // __SYCL_DEVICE_ONLY__
677 
678 // Predicated (masked) scaled gather from a surface.
679 //
680 // Template (compile-time constant) parameters:
681 // @tparam Ty - element type
682 // @tparam N - the number of elements to read
683 // @tparam SurfIndAliasTy - "surface index alias" type - internal type in the
684 // accessor used to denote the surface
685 // @tparam TySizeLog2 - Log2 of the number of bytes written per element:
686 // 0 - 1 byte, 1 - 2 bytes, 2 - 4 bytes
687 // @tparam Scale - offset scale; only 0 is supported for now
688 //
689 // Formal parameters:
690 // @param surf_ind - the surface index, taken from the SYCL memory object
691 // @param global_offset - offset added to each individual element's offset to
692 // compute actual memory access offset for that element
693 // @param offsets - per-element offsets
694 // @param pred - per-element predicates; elements with zero corresponding
695 // predicates are not written
696 // @return - elements read ("gathered") from memory
697 
template <typename Ty, int N, typename SurfIndAliasTy, int TySizeLog2,
          int16_t Scale = 0>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N>
__esimd_gather_masked_scaled2(SurfIndAliasTy surf_ind, uint32_t global_offset,
                              __ESIMD_DNS::vector_type_t<uint32_t, N> offsets,
                              __ESIMD_DNS::simd_mask_storage_t<N> pred)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  static_assert(Scale == 0);

  // determine the original element's type size (as __esimd_scatter_scaled
  // requires vals to be a vector of 4-byte integers)
  constexpr size_t OrigSize = __ESIMD_DNS::ElemsPerAddrDecoding(TySizeLog2);
  using RestoredTy = __ESIMD_DNS::uint_type_t<OrigSize>;

  // Gather into the memory-sized unsigned type, then widen/bitcast to Ty
  // at the end. Predicated-off lanes stay 0.
  __ESIMD_DNS::vector_type_t<RestoredTy, N> retv = 0;
  // NOTE(review): initializer truncated in this rendering — presumably
  // sycl::detail::getESIMDDeviceInterface(); confirm in repo.
  sycl::detail::ESIMDDeviceInterface *I =

  if (surf_ind == __ESIMD_NS::detail::SLM_BTI) {
    // __ESIMD_NS::detail::SLM_BTI is special binding table index for SLM
    assert(global_offset == 0);
    char *SlmBase = I->__cm_emu_get_slm_ptr();
    for (int idx = 0; idx < N; ++idx) {
      if (pred[idx]) {
        RestoredTy *addr =
            reinterpret_cast<RestoredTy *>(offsets[idx] + SlmBase);
        retv[idx] = *addr;
      }
    }
  } else {
    char *readBase;
    uint32_t width;
    std::mutex *mutexLock;

    I->sycl_get_cm_buffer_params_ptr(surf_ind, &readBase, &width, &mutexLock);

    readBase += global_offset;
    // Serialize with other emulated accesses to the same buffer.
    std::lock_guard<std::mutex> lock(*mutexLock);
    for (int idx = 0; idx < N; idx++) {
      if (pred[idx]) {
        RestoredTy *addr =
            reinterpret_cast<RestoredTy *>(offsets[idx] + readBase);
        retv[idx] = *addr;
      }
    }

    // TODO : Optimize
    I->cm_fence_ptr();
  }

  // Restore the caller-visible element type: same-size elements are
  // bitcast; narrower memory elements are value-converted (zero-extended).
  if constexpr (OrigSize == 4) {
    return __ESIMD_DNS::bitcast<Ty, RestoredTy, N>(retv);
  } else {
    return __ESIMD_DNS::convert_vector<Ty, RestoredTy, N>(retv);
  }
}
#endif // __SYCL_DEVICE_ONLY__
758 
// Reads a block of data from given surface at given offset, offset must be
// 16-byte-aligned.
// `addr` is in owords (16 bytes) — converted to bytes below. Out-of-bounds
// reads from a regular surface yield 0.
template <typename Ty, int N, typename SurfIndAliasTy, int32_t IsModified = 0>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N>
__esimd_oword_ld(SurfIndAliasTy surf_ind, uint32_t addr)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  // Convert oword units to a byte offset (1 oword == 16 bytes).
  addr <<= 4;

  __ESIMD_DNS::vector_type_t<Ty, N> retv;
  // NOTE(review): initializer truncated in this rendering — presumably
  // sycl::detail::getESIMDDeviceInterface(); confirm in repo.
  sycl::detail::ESIMDDeviceInterface *I =

  if (surf_ind == __ESIMD_NS::detail::SLM_BTI) {
    // O-word/Block load for Shared Local Memory
    // __ESIMD_NS::detail::SLM_BTI is special binding table index for SLM
    char *SlmBase = I->__cm_emu_get_slm_ptr();
    for (int i = 0; i < N; ++i) {
      Ty *SlmAddr = reinterpret_cast<Ty *>(addr + SlmBase);
      retv[i] = *SlmAddr;
      addr += sizeof(Ty);
    }
  } else {
    // O-word/Block load for regular surface indexed by surf_ind
    char *readBase;
    uint32_t width;
    std::mutex *mutexLock;

    I->sycl_get_cm_buffer_params_ptr(surf_ind, &readBase, &width, &mutexLock);

    // Serialize with other emulated accesses to the same buffer.
    std::lock_guard<std::mutex> lock(*mutexLock);

    for (int idx = 0; idx < N; idx++) {
      if (addr >= width) {
        // Reading past the surface end returns zero instead of faulting.
        retv[idx] = 0;
      } else {
        retv[idx] = *((Ty *)(readBase + addr));
      }
      addr += (uint32_t)sizeof(Ty);
    }
  }
  return retv;
}
#endif // __SYCL_DEVICE_ONLY__
805 
// gather4 scaled masked from a surface/SLM
// Channel-major gather (R, G, B, A order) like __esimd_svm_gather4_scaled,
// but addressed via a surface/SLM base plus per-lane offsets.
template <typename Ty, int N, __ESIMD_NS::rgba_channel_mask Mask,
          typename SurfIndAliasTy, int16_t Scale = 0>
__ESIMD_INTRIN
    __ESIMD_DNS::vector_type_t<Ty, N * get_num_channels_enabled(Mask)>
    __esimd_gather4_masked_scaled2(
        SurfIndAliasTy surf_ind, int global_offset,
        __ESIMD_DNS::vector_type_t<uint32_t, N> offsets,
        __ESIMD_DNS::simd_mask_storage_t<N> pred)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  __ESIMD_DNS::vector_type_t<Ty, N * get_num_channels_enabled(Mask)> retv = 0;
  // NOTE(review): initializer truncated in this rendering — presumably
  // sycl::detail::getESIMDDeviceInterface(); confirm in repo.
  sycl::detail::ESIMDDeviceInterface *I =
  char *ReadBase;
  unsigned int Next = 0;

  if (surf_ind == __ESIMD_NS::detail::SLM_BTI) {
    ReadBase = I->__cm_emu_get_slm_ptr();
  } else {
    uint32_t width;
    std::mutex *mutexLock;
    I->sycl_get_cm_buffer_params_ptr(surf_ind, &ReadBase, &width, &mutexLock);
    // NOTE(review): this lock_guard is destroyed at the end of this block,
    // so the gather loop below runs WITHOUT the buffer lock held — unlike
    // the other surface accessors in this file. Likely a bug.
    std::lock_guard<std::mutex> lock(*mutexLock);
  }

  ReadBase += global_offset;

  // NOTE(review): the loop variable `I` below shadows the device-interface
  // pointer `I` above; harmless here but confusing — consider renaming.
  for (const auto &channel : ChannelMaskArray) {
    if (__ESIMD_NS::is_channel_enabled(Mask, channel)) {
      for (int I = 0; I < N; I++, Next++) {
        if (pred[I]) {
          Ty *Addr = reinterpret_cast<Ty *>(ReadBase + offsets[I]);
          retv[Next] = *Addr;
        }
      }
    }
    // Advance the base by one element per channel (enabled or not).
    ReadBase += (uint64_t)sizeof(Ty);
  }

  return retv;
}
#endif // __SYCL_DEVICE_ONLY__
851 
// scatter4 scaled to a surface/SLM
// Channel-major scatter (R, G, B, A order) — mirror of
// __esimd_gather4_masked_scaled2.
template <typename Ty, int N, typename SurfIndAliasTy,
          __ESIMD_NS::rgba_channel_mask Mask, int16_t Scale = 0>
__ESIMD_INTRIN void __esimd_scatter4_scaled(
    __ESIMD_DNS::simd_mask_storage_t<N> pred, SurfIndAliasTy surf_ind,
    int global_offset, __ESIMD_DNS::vector_type_t<uint32_t, N> offsets,
    __ESIMD_DNS::vector_type_t<Ty, N * get_num_channels_enabled(Mask)> vals)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  // NOTE(review): initializer truncated in this rendering — presumably
  // sycl::detail::getESIMDDeviceInterface(); confirm in repo.
  sycl::detail::ESIMDDeviceInterface *I =
  char *WriteBase;
  unsigned int Next = 0;

  if (surf_ind == __ESIMD_NS::detail::SLM_BTI) {
    WriteBase = I->__cm_emu_get_slm_ptr();
  } else {
    uint32_t width;
    std::mutex *mutexLock;
    I->sycl_get_cm_buffer_params_ptr(surf_ind, &WriteBase, &width, &mutexLock);
    // NOTE(review): this lock_guard is destroyed at the end of this block,
    // so the scatter loop below runs WITHOUT the buffer lock held — unlike
    // the other surface accessors in this file. Likely a bug.
    std::lock_guard<std::mutex> lock(*mutexLock);
  }

  WriteBase += global_offset;

  for (const auto &channel : ChannelMaskArray) {
    if (__ESIMD_NS::is_channel_enabled(Mask, channel)) {
      for (int I = 0; I < N; I++, Next++) {
        if (pred[I]) {
          Ty *Addr = reinterpret_cast<Ty *>(WriteBase + offsets[I]);
          *Addr = vals[Next];
        }
      }
    }
    // Advance the base by one element per channel (enabled or not).
    WriteBase += (uint64_t)sizeof(Ty);
  }
}
#endif // __SYCL_DEVICE_ONLY__
892 
893 // Surface-based atomic operations
894 template <__ESIMD_NS::atomic_op Op, typename Ty, int N, typename SurfIndAliasTy>
895 __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N>
896 __esimd_dword_atomic0(__ESIMD_DNS::simd_mask_storage_t<N> pred,
897  SurfIndAliasTy surf_ind,
898  __ESIMD_DNS::vector_type_t<uint32_t, N> addrs)
899 #ifdef __SYCL_DEVICE_ONLY__
900  ;
901 #else
902 {
903  __ESIMD_DNS::vector_type_t<Ty, N> retv;
904 
905  if (surf_ind == __ESIMD_NS::detail::SLM_BTI) {
906  char *WriteBase =
907  sycl::detail::getESIMDDeviceInterface()->__cm_emu_get_slm_ptr();
908 
909  for (int i = 0; i < N; i++) {
910  if (pred[i]) {
911  Ty *p = reinterpret_cast<Ty *>(addrs[i] + WriteBase);
912 
913  switch (Op) {
914  case __ESIMD_NS::atomic_op::inc:
915  retv[i] = __ESIMD_DNS::atomic_add<Ty>(p, 1);
916  break;
917  default:
918  __ESIMD_UNSUPPORTED_ON_HOST;
919  }
920  }
921  }
922  } else {
923  __ESIMD_UNSUPPORTED_ON_HOST;
924  }
925  return retv;
926 }
927 #endif // __SYCL_DEVICE_ONLY__
928 
// One-source surface-based DWORD atomic. Not implemented in host
// emulation — always reports unsupported-on-host.
template <__ESIMD_NS::atomic_op Op, typename Ty, int N, typename SurfIndAliasTy>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N>
__esimd_dword_atomic1(__ESIMD_DNS::simd_mask_storage_t<N> pred,
                      SurfIndAliasTy surf_ind,
                      __ESIMD_DNS::vector_type_t<uint32_t, N> addrs,
                      __ESIMD_DNS::vector_type_t<Ty, N> src0)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  __ESIMD_UNSUPPORTED_ON_HOST;
}
#endif // __SYCL_DEVICE_ONLY__
942 
// Two-source (compare-exchange style) surface-based DWORD atomic. Not
// implemented in host emulation — always reports unsupported-on-host.
template <__ESIMD_NS::atomic_op Op, typename Ty, int N, typename SurfIndAliasTy>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N>
__esimd_dword_atomic2(__ESIMD_DNS::simd_mask_storage_t<N> pred,
                      SurfIndAliasTy surf_ind,
                      __ESIMD_DNS::vector_type_t<uint32_t, N> addrs,
                      __ESIMD_DNS::vector_type_t<Ty, N> src0,
                      __ESIMD_DNS::vector_type_t<Ty, N> src1)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else
{
  __ESIMD_UNSUPPORTED_ON_HOST;
}
#endif // __SYCL_DEVICE_ONLY__
957 
958 // Media block load.
959 //
960 // @tparam Ty the element data type.
961 // @tparam M the hight of the 2D block.
962 // @tparam N the width of the 2D block.
963 // @tparam Modifier top/bottom field surface access control.
964 // @tparam TACC type of the surface handle.
965 // @tparam Plane planar surface index.
966 // @tparam BlockWidth the width of the return block.
967 // @param handle the surface handle.
968 // @param x X-coordinate of the left upper rectangle corner in BYTES.
969 // @param y Y-coordinate of the left upper rectangle corner in ROWS.
970 //
971 // @return the linearized 2D block data read from surface.
972 //
973 template <typename Ty, int M, int N, int Modifier, typename TACC, int Plane,
974  int BlockWidth>
975 __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, M * N>
976 __esimd_media_ld(TACC handle, unsigned x, unsigned y)
977 #ifdef __SYCL_DEVICE_ONLY__
978  ;
979 #else
980 {
981  __ESIMD_DNS::vector_type_t<Ty, M * N> vals;
982  char *readBase;
983  uint32_t bpp;
984  uint32_t imgWidth;
985  uint32_t imgHeight;
986  std::mutex *mutexLock;
987 
988  assert((handle != __ESIMD_NS::detail::SLM_BTI) &&
989  "__esimd_media_ld cannot access SLM");
990 
991  sycl::detail::getESIMDDeviceInterface()->sycl_get_cm_image_params_ptr(
992  handle, &readBase, &imgWidth, &imgHeight, &bpp, &mutexLock);
993 
994  std::lock_guard<std::mutex> lock(*mutexLock);
995 
996  int x_pos_a, y_pos_a, offset, index;
997 
998  // TODO : Remove intermediate 'in' matrix
999  std::vector<std::vector<Ty>> in(M, std::vector<Ty>(N));
1000  int R = M;
1001  int C = N;
1002  for (int i = 0; i < R; i++) {
1003  for (int j = 0; j < C; j++) {
1004  x_pos_a = x + j * sizeof(Ty);
1005  { y_pos_a = y + i; }
1006  // We should check the boundary condition based on sizeof(Ty), x_pos_a is
1007  // 0-based Note: Use a signed variable; otherwise sizeof(Ty) is unsigned
1008  if ((x_pos_a + sizeof(Ty)) > imgWidth) {
1009  // If we're trying to read outside the boundary, limit the value of
1010  // x_pos_a Assumption -- We don't this situation:
1011  // x_pos_a width's boundary
1012  // | |
1013  // <---type(Ty)--->
1014  // At most x_pos_a+sizeof(Ty) is exactly at the boundary.
1015  x_pos_a = imgWidth;
1016  }
1017  if (y_pos_a > imgHeight - 1) {
1018  y_pos_a = imgHeight - 1;
1019  }
1020  if (y_pos_a < 0) {
1021  y_pos_a = 0;
1022  }
1023  {
1024  if (x_pos_a < 0) {
1025  // Need to align x position to bbp
1026  int offset = x % bpp;
1027  x_pos_a -= offset;
1028  }
1029  while (x_pos_a < 0) {
1030  // If we're trying to read outside the left boundary, increase x_pos_a
1031  x_pos_a += bpp;
1032  }
1033  }
1034 
1035  if (x_pos_a >= imgWidth) {
1036  {
1037  x_pos_a = x_pos_a - bpp;
1038  for (uint byte_count = 0; byte_count < sizeof(Ty); byte_count++) {
1039  if (x_pos_a >= imgWidth) {
1040  x_pos_a = x_pos_a - bpp;
1041  }
1042  offset = y_pos_a * imgWidth + x_pos_a;
1043 
1044  /*
1045  If destination size per element is less then or equal pixel size
1046  of the surface move the pixel value accross the destination
1047  elements. If destination size per element is greater then pixel
1048  size of the surface replicate pixel value in the destination
1049  element.
1050  */
1051  if (sizeof(Ty) <= bpp) {
1052  for (uint bpp_count = 0; j < C && bpp_count < bpp;
1053  j++, bpp_count += sizeof(Ty)) {
1054  in[i][j] = *((Ty *)(readBase + offset + bpp_count));
1055  }
1056  j--;
1057  break;
1058  } else {
1059  // ((unsigned char*)in.get_addr(i*C+j))[byte_count] = *((unsigned
1060  // char*)((char*)buff_iter->p + offset));
1061  unsigned char *pTempBase =
1062  ((unsigned char *)in[i].data()) + j * sizeof(Ty);
1063  pTempBase[byte_count] = *((unsigned char *)(readBase + offset));
1064  }
1065 
1066  x_pos_a = x_pos_a + 1;
1067  }
1068  x_pos_a = imgWidth;
1069  }
1070  } else {
1071  offset = y_pos_a * imgWidth + x_pos_a;
1072  { in[i][j] = *((Ty *)(readBase + offset)); }
1073  }
1074  }
1075  }
1076 
1077  for (auto i = 0, k = 0; i < M; i++) {
1078  for (auto j = 0; j < N; j++) {
1079  vals[k++] = in[i][j];
1080  }
1081  }
1082 
1083  return vals;
1084 }
1085 #endif // __SYCL_DEVICE_ONLY__
1086 
1087 // Media block store
1088 //
1089 // @tparam Ty the element data type.
1090 // @tparam M the hight of the 2D block.
1091 // @tparam N the width of the 2D block.
1092 // @tparam Modifier top/bottom field surface access control.
1093 // @tparam TACC type of the surface handle.
1094 // @tparam Plane planar surface index.
1095 // @tparam BlockWidth the width of the return block.
1096 // @param handle the surface handle.
1097 // @param x X-coordinate of the left upper rectangle corner in BYTES.
1098 // @param y Y-coordinate of the left upper rectangle corner in ROWS.
1099 // @param vals the linearized 2D block data to be written to surface.
1100 //
1101 template <typename Ty, int M, int N, int Modifier, typename TACC, int Plane,
1102  int BlockWidth>
1103 __ESIMD_INTRIN void __esimd_media_st(TACC handle, unsigned x, unsigned y,
1104  __ESIMD_DNS::vector_type_t<Ty, M * N> vals)
1105 #ifdef __SYCL_DEVICE_ONLY__
1106  ;
1107 #else
1108 {
1109  sycl::detail::ESIMDDeviceInterface *I =
1111 
1112  char *writeBase;
1113  uint32_t bpp;
1114  uint32_t imgWidth;
1115  uint32_t imgHeight;
1116  std::mutex *mutexLock;
1117 
1118  assert((handle != __ESIMD_NS::detail::SLM_BTI) &&
1119  "__esimd_media_ld cannot access SLM");
1120 
1121  I->sycl_get_cm_image_params_ptr(handle, &writeBase, &imgWidth, &imgHeight,
1122  &bpp, &mutexLock);
1123 
1124  int x_pos_a, y_pos_a, offset;
1125 
1126  assert((x % 4) == 0);
1127  assert((N * sizeof(Ty)) % 4 == 0);
1128 
1129  // TODO : Remove intermediate 'out' matrix
1130  std::vector<std::vector<Ty>> out(M, std::vector<Ty>(N));
1131 
1132  std::lock_guard<std::mutex> lock(*mutexLock);
1133 
1134  for (int i = 0, k = 0; i < M; i++) {
1135  for (int j = 0; j < N; j++) {
1136  out[i][j] = vals[k++];
1137  }
1138  }
1139 
1140  for (int i = 0; i < M; i++) {
1141  for (int j = 0; j < N; j++) {
1142  x_pos_a = x + j * sizeof(Ty);
1143  { y_pos_a = y + i; }
1144  if ((int)x_pos_a < 0) {
1145  continue;
1146  }
1147  if ((int)y_pos_a < 0) {
1148  continue;
1149  }
1150  if ((int)(x_pos_a + sizeof(Ty)) > imgWidth) {
1151  continue;
1152  }
1153 
1154  if ((int)y_pos_a > imgHeight - 1) {
1155  continue;
1156  }
1157  offset = y_pos_a * imgWidth + x_pos_a;
1158  *((Ty *)(writeBase + offset)) = out[i][j];
1159  }
1160  }
1161 
1162  // TODO : Optimize
1163  I->cm_fence_ptr();
1164 }
1165 #endif // __SYCL_DEVICE_ONLY__
1166 
1167 // getter methods returning surface index are not available when stateless
1168 // memory accesses are enforced.
1169 #ifndef __ESIMD_FORCE_STATELESS_MEM
1170 
1171 // \brief Converts given value to a surface index.
1172 // The input must always be a result of
1173 // detail::AccessorPrivateProxy::getQualifiedPtrOrImageObj(acc)
1174 // where acc is a buffer or image accessor. If the result is, say, 'obj', then
1175 // 'obj' is really a value of the surface index kept in a differently typed
1176 // accessor field. Front-end compilation time type of 'obj' is either
1177 // ConcreteASPtrType (detail::DecoratedType<DataT, AS>::type *), for a buffer
1178 // or
1179 // image{1,2,3}d_t OpenCL type for an image
1180 // But when doing code generation, FE replaces e.g. '__read_only image2d_t' FE
1181 // type with '%opencl.image2d_ro_t addrspace(1) *' LLVM type or a Target
1182 // Extension Type if using opaque pointers. These types can neither be
1183 // reinterpret_cast'ed from pointer to intptr_t (because they are not a pointer
1184 // at FE translation time), nor can they be bit_cast'ed to intptr_t (because
1185 // they are not trivially copyable). This function takes advantage of the fact
1186 // that in SPIR-V 'obj' is always a pointer, where we can do ptr to uint32_t
1187 // conversion. This function can be called only from the device code, as
1188 // accessor => memory handle translation for host is different.
1189 // @param acc the SYCL accessor.
1190 // Returns the binding table index value.
template <typename MemObjTy>
ESIMD_INLINE __ESIMD_NS::SurfaceIndex __esimd_get_surface_index(MemObjTy obj)
#ifdef __SYCL_DEVICE_ONLY__
{
  // In SPIR-V 'obj' is always a pointer (see the comment above), so a
  // pointer-to-uint conversion recovers the binding table index the
  // accessor carries.
  return __spirv_ConvertPtrToU<MemObjTy, uint32_t>(obj);
}
#else // __SYCL_DEVICE_ONLY__
{
  // On host (esimd_emulator), ask the plugin for the CM surface index
  // associated with the accessor's underlying memory object.
  return sycl::detail::getESIMDDeviceInterface()->sycl_get_cm_surface_index_ptr(
      __ESIMD_DNS::AccessorPrivateProxy::getPtr(obj));
}
#endif // __SYCL_DEVICE_ONLY__
1203 
1204 #endif // !__ESIMD_FORCE_STATELESS_MEM
1205 
sycl::_V1::ext::intel::esimd::rgba_channel_mask
rgba_channel_mask
Represents a pixel's channel mask - all possible combinations of enabled channels.
Definition: common.hpp:105
esimd_emulator_device_interface.hpp
sycl::_V1::ext::oneapi::bit_and
std::bit_and< T > bit_and
Definition: functional.hpp:24
sycl::_V1::ext::oneapi::experimental::obj
global_pointer_t obj
Definition: annotated_arg.hpp:76
common.hpp
__SYCL_INLINE_VER_NAMESPACE
#define __SYCL_INLINE_VER_NAMESPACE(X)
Definition: defines_elementary.hpp:11
types.hpp
sycl::_V1::ext::oneapi::bit_xor
std::bit_xor< T > bit_xor
Definition: functional.hpp:23
sycl
---— Error handling, matching OpenCL plugin semantics.
Definition: access.hpp:14
max
simd< _Tp, _Abi > max(const simd< _Tp, _Abi > &, const simd< _Tp, _Abi > &) noexcept
pi.hpp
sycl::_V1::detail::getESIMDDeviceInterface
ESIMDDeviceInterface * getESIMDDeviceInterface()
Definition: esimd_emulator_device_interface.cpp:25
sycl::_V1::ext::oneapi::fmax
std::enable_if_t< detail::is_bf16_storage_type< T >::value, T > fmax(T x, T y)
Definition: bf16_storage_builtins.hpp:60
sycl::_V1::ext::intel::esimd::rgba_channel::R
@ R
sycl::_V1::ext::intel::experimental::esimd::bfn_t::x
@ x
sycl::_V1::ext::intel::esimd::SurfaceIndex
unsigned int SurfaceIndex
Surface index type.
Definition: common.hpp:64
types.hpp
sycl::_V1::ext::oneapi::fmin
std::enable_if_t< detail::is_bf16_storage_type< T >::value, T > fmin(T x, T y)
Definition: bf16_storage_builtins.hpp:49
sycl::_V1::atomic_fence
static void atomic_fence(memory_order order, memory_scope scope)
Definition: atomic_fence.hpp:22
util.hpp
sycl::_V1::ext::intel::esimd::detail::SLM_BTI
static constexpr SurfaceIndex SLM_BTI
Definition: common.hpp:98
sycl::_V1::ext::intel::experimental::esimd::bfn_t::y
@ y
sycl::_V1::dec
constexpr stream_manipulator dec
Definition: stream.hpp:744
accessor.hpp
atomic_intrin.hpp
backend_types.hpp
sycl::_V1::ext::intel::esimd::uint
unsigned int uint
Definition: common.hpp:43
sycl::_V1::ext::intel::esimd::is_channel_enabled
constexpr int is_channel_enabled(rgba_channel_mask M, rgba_channel Ch)
Definition: common.hpp:123
sycl::_V1::detail::device_global_map::add
void add(const void *DeviceGlobalPtr, const char *UniqueId)
Definition: device_global_map.cpp:15
sycl::_V1::ext::intel::esimd::get_num_channels_enabled
constexpr int get_num_channels_enabled(rgba_channel_mask M)
Definition: common.hpp:128
min
simd< _Tp, _Abi > min(const simd< _Tp, _Abi > &, const simd< _Tp, _Abi > &) noexcept
sycl::_V1::ext::oneapi::bit_or
std::bit_or< T > bit_or
Definition: functional.hpp:22