XeTLA v0.3.6
Intel® Xe Templates for Linear Algebra - API Definition Document
 
memory.hpp
/*******************************************************************************
* Copyright (c) 2022-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once

namespace gpu::xetla {

namespace detail {

/// @brief lookup table for cache hint.
constexpr __ESIMD_ENS::cache_hint get_cache_hint(gpu::xetla::cache_hint ch) {
    switch (ch) {
        case gpu::xetla::cache_hint::none: return __ESIMD_ENS::cache_hint::none;
        case gpu::xetla::cache_hint::uncached:
            return __ESIMD_ENS::cache_hint::uncached;
        case gpu::xetla::cache_hint::cached:
            return __ESIMD_ENS::cache_hint::cached;
        case gpu::xetla::cache_hint::write_back:
            return __ESIMD_ENS::cache_hint::write_back;
        case gpu::xetla::cache_hint::write_through:
            return __ESIMD_ENS::cache_hint::write_through;
        case gpu::xetla::cache_hint::streaming:
            return __ESIMD_ENS::cache_hint::streaming;
        case gpu::xetla::cache_hint::read_invalidate:
            return __ESIMD_ENS::cache_hint::read_invalidate;
    }
}

/// @brief lookup table for data size.
constexpr __ESIMD_ENS::lsc_data_size get_data_size(gpu::xetla::data_size ds) {
    switch (ds) {
        case gpu::xetla::data_size::default_size:
            return __ESIMD_ENS::lsc_data_size::default_size;
        case gpu::xetla::data_size::u8: return __ESIMD_ENS::lsc_data_size::u8;
        case gpu::xetla::data_size::u16: return __ESIMD_ENS::lsc_data_size::u16;
        case gpu::xetla::data_size::u32: return __ESIMD_ENS::lsc_data_size::u32;
        case gpu::xetla::data_size::u64: return __ESIMD_ENS::lsc_data_size::u64;
        case gpu::xetla::data_size::u8u32:
            return __ESIMD_ENS::lsc_data_size::u8u32;
        case gpu::xetla::data_size::u16u32:
            return __ESIMD_ENS::lsc_data_size::u16u32;
        case gpu::xetla::data_size::u16u32h:
            return __ESIMD_ENS::lsc_data_size::u16u32h;
    }
}

/// @brief lookup table for memory kind.
constexpr __ESIMD_ENS::lsc_memory_kind get_memory_kind(
        gpu::xetla::memory_kind mk) {
    switch (mk) {
        case gpu::xetla::memory_kind::untyped_global:
            return __ESIMD_ENS::lsc_memory_kind::untyped_global;
        case gpu::xetla::memory_kind::untyped_global_low_pri:
            return __ESIMD_ENS::lsc_memory_kind::untyped_global_low_pri;
        case gpu::xetla::memory_kind::typed_global:
            return __ESIMD_ENS::lsc_memory_kind::typed_global;
        case gpu::xetla::memory_kind::shared_local:
            return __ESIMD_ENS::lsc_memory_kind::shared_local;
    }
}

/// @brief lookup table for fence op.
constexpr __ESIMD_ENS::lsc_fence_op get_fence_op(gpu::xetla::fence_op fo) {
    switch (fo) {
        case gpu::xetla::fence_op::none: return __ESIMD_ENS::lsc_fence_op::none;
        case gpu::xetla::fence_op::evict:
            return __ESIMD_ENS::lsc_fence_op::evict;
        case gpu::xetla::fence_op::invalidate:
            return __ESIMD_ENS::lsc_fence_op::invalidate;
        case gpu::xetla::fence_op::discard:
            return __ESIMD_ENS::lsc_fence_op::discard;
        case gpu::xetla::fence_op::clean:
            return __ESIMD_ENS::lsc_fence_op::clean;
        case gpu::xetla::fence_op::flushl2:
            return __ESIMD_ENS::lsc_fence_op::flushl3;
    }
}

/// @brief lookup table for fence scope.
constexpr __ESIMD_ENS::lsc_scope get_fence_scope(gpu::xetla::fence_scope fs) {
    switch (fs) {
        case gpu::xetla::fence_scope::group:
            return __ESIMD_ENS::lsc_scope::group;
        case gpu::xetla::fence_scope::local:
            return __ESIMD_ENS::lsc_scope::local;
        case gpu::xetla::fence_scope::tile: return __ESIMD_ENS::lsc_scope::tile;
        case gpu::xetla::fence_scope::gpu: return __ESIMD_ENS::lsc_scope::gpu;
        case gpu::xetla::fence_scope::gpus: return __ESIMD_ENS::lsc_scope::gpus;
        case gpu::xetla::fence_scope::system:
            return __ESIMD_ENS::lsc_scope::system;
        case gpu::xetla::fence_scope::sysacq:
            return __ESIMD_ENS::lsc_scope::sysacq;
    }
}

/// @brief lookup table for atomic op.
constexpr __ESIMD_NS::atomic_op get_atomic_op(gpu::xetla::atomic_op ao) {
    switch (ao) {
        case gpu::xetla::atomic_op::iinc: return __ESIMD_NS::atomic_op::inc;
        case gpu::xetla::atomic_op::idec: return __ESIMD_NS::atomic_op::dec;
        case gpu::xetla::atomic_op::iadd: return __ESIMD_NS::atomic_op::add;
        case gpu::xetla::atomic_op::isub: return __ESIMD_NS::atomic_op::sub;
        case gpu::xetla::atomic_op::smin: return __ESIMD_NS::atomic_op::smin;
        case gpu::xetla::atomic_op::smax: return __ESIMD_NS::atomic_op::smax;
        case gpu::xetla::atomic_op::umin: return __ESIMD_NS::atomic_op::umin;
        case gpu::xetla::atomic_op::umax: return __ESIMD_NS::atomic_op::umax;
        case gpu::xetla::atomic_op::cmpxchg:
            return __ESIMD_NS::atomic_op::cmpxchg;
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
        case gpu::xetla::atomic_op::fadd: return __ESIMD_NS::atomic_op::fadd;
        case gpu::xetla::atomic_op::fsub: return __ESIMD_NS::atomic_op::fsub;
        case gpu::xetla::atomic_op::fmin: return __ESIMD_NS::atomic_op::fmin;
        case gpu::xetla::atomic_op::fmax: return __ESIMD_NS::atomic_op::fmax;
        case gpu::xetla::atomic_op::fcmpxchg:
            return __ESIMD_NS::atomic_op::fcmpwr;
#pragma clang diagnostic pop
        case gpu::xetla::atomic_op::bit_and:
            return __ESIMD_NS::atomic_op::bit_and;
        case gpu::xetla::atomic_op::bit_or:
            return __ESIMD_NS::atomic_op::bit_or;
        case gpu::xetla::atomic_op::bit_xor:
            return __ESIMD_NS::atomic_op::bit_xor;
        case gpu::xetla::atomic_op::load: return __ESIMD_NS::atomic_op::load;
        case gpu::xetla::atomic_op::store: return __ESIMD_NS::atomic_op::store;
    }
}
} // namespace detail

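// Illustrative compile-time checks, not part of the original header: because
// the detail lookup tables above are constexpr, each XeTLA-to-ESIMD enum
// mapping can be verified with a static_assert at namespace scope, e.g.:
static_assert(detail::get_cache_hint(cache_hint::write_back)
                == __ESIMD_ENS::cache_hint::write_back,
        "cache_hint must map to the same-named ESIMD hint");
static_assert(detail::get_data_size(data_size::u32)
                == __ESIMD_ENS::lsc_data_size::u32,
        "data_size must map to the same-named LSC data size");
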
/// @brief Stateless scattered prefetch.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::cached,
        cache_hint L2H = cache_hint::cached, int N>
__XETLA_API void xetla_prefetch_global(
        Ty *p, xetla_vector<uint32_t, N> offsets, xetla_mask<N> pred = 1) {
    using T = native_type_t<Ty>;
    __ESIMD_ENS::lsc_prefetch<T, NElts, gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H), N>((T *)p, offsets, pred);
}

/// @brief Stateless block prefetch.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::cached,
        cache_hint L2H = cache_hint::cached>
__XETLA_API void xetla_prefetch_global(Ty *p, uint64_t offset = 0) {
    using T = native_type_t<Ty>;
    __ESIMD_ENS::lsc_prefetch<T, NElts, gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H)>(
            (T *)p + (offset / sizeof(T)));
}

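// Illustrative usage sketch, not part of the original header: warm L1/L2 for
// 32 consecutive floats before they are loaded. `src` is a hypothetical
// global-memory pointer; the block-prefetch offset is in bytes.
inline void example_prefetch_row(float *src) {
    xetla_prefetch_global<float, 16>(src, 0);
    xetla_prefetch_global<float, 16>(src, 16 * sizeof(float));
}
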
/// @brief Stateless scattered load.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none,
        int N, typename Toffset = uint32_t>
__XETLA_API xetla_vector<Ty, N * NElts> xetla_load_global(
        Ty *p, xetla_vector<Toffset, N> offsets, xetla_mask<N> pred = 1) {
    using T = native_type_t<Ty>;
    DEBUG_INVOKE(dbg_level::core,
            core::general_1d<gpu_arch::Xe,
                    Ty>::template check_restriction<NElts, N>(offsets,
                    (uint64_t)p));

    return __ESIMD_ENS::lsc_gather<T, NElts,
            gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H), N>((T *)p, offsets, pred);
}

/// @brief Stateless block load.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector<Ty, NElts> xetla_load_global(
        Ty *p, uint64_t offset = 0) {
    using T = native_type_t<Ty>;
    DEBUG_INVOKE(dbg_level::core,
            core::general_1d<gpu_arch::Xe,
                    Ty>::template check_restriction<NElts>(offset,
                    (uint64_t)p));

    return __ESIMD_ENS::lsc_block_load<T, NElts,
            gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H)>(
            (T *)p + (offset / sizeof(T)));
}

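// Illustrative usage sketch, not part of the original header: gather 16
// floats through per-lane byte offsets, then load the same range again as a
// contiguous block. `src` is a hypothetical global-memory pointer.
inline xetla_vector<float, 16> example_load_row(float *src) {
    // Per-lane byte offsets 0, 4, 8, ..., 60 select consecutive floats.
    xetla_vector<uint32_t, 16> byte_offsets(0, sizeof(float));
    xetla_vector<float, 16> gathered = xetla_load_global<float, 1,
            data_size::default_size, cache_hint::cached, cache_hint::cached,
            16>(src, byte_offsets);
    // The same contiguous range as a single block load of 16 elements.
    xetla_vector<float, 16> block = xetla_load_global<float, 16>(src, 0);
    return gathered + block;
}
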
/// @brief Stateless scattered store.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none,
        int N, typename Toffset = uint32_t>
__XETLA_API void xetla_store_global(Ty *p, xetla_vector<Toffset, N> offsets,
        xetla_vector<Ty, N * NElts> vals, xetla_mask<N> pred = 1) {
    using T = native_type_t<Ty>;
    __ESIMD_ENS::lsc_scatter<T, NElts, gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H), N>(
            (T *)p, offsets, vals, pred);
}

/// @brief Stateless block store.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API void xetla_store_global(
        Ty *p, uint64_t offset, xetla_vector<Ty, NElts> vals) {
    using T = native_type_t<Ty>;
    __ESIMD_ENS::lsc_block_store<T, NElts,
            gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H)>(
            (T *)p + (offset / sizeof(T)), vals);
}

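// Illustrative usage sketch, not part of the original header: scatter one
// float per lane through byte offsets, then overwrite the same range with a
// single contiguous block store. `dst` is a hypothetical global pointer.
inline void example_store_row(float *dst, xetla_vector<float, 16> vals) {
    xetla_vector<uint32_t, 16> byte_offsets(0, sizeof(float));
    xetla_store_global<float, 1, data_size::default_size,
            cache_hint::write_back, cache_hint::write_back, 16>(
            dst, byte_offsets, vals);
    xetla_store_global<float, 16>(dst, 0, vals);
}
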
/// @brief Stateless scattered atomic (0 src).
template <atomic_op Op, typename T, int N,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector<T, N> xetla_atomic_global(
        T *p, xetla_vector<uint32_t, N> offsets, xetla_mask<N> pred) {
    static_assert(!(is_internal_type<T>::value),
            "The internal types are not yet supported!");
    return __ESIMD_ENS::lsc_atomic_update<gpu::xetla::detail::get_atomic_op(Op),
            T, N, gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H)>(p, offsets, pred);
}

/// @brief Stateless scattered atomic (1 src).
template <atomic_op Op, typename T, int N,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector<T, N> xetla_atomic_global(T *p,
        xetla_vector<uint32_t, N> offsets, xetla_vector<T, N> src0,
        xetla_mask<N> pred) {
    static_assert(!(is_internal_type<T>::value),
            "The internal types are not yet supported!");
    return __ESIMD_ENS::lsc_atomic_update<gpu::xetla::detail::get_atomic_op(Op),
            T, N, gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H)>(p, offsets, src0, pred);
}

/// @brief Stateless scattered atomic (2 src).
template <atomic_op Op, typename T, int N,
        data_size DS = data_size::default_size,
        cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector<T, N> xetla_atomic_global(T *p,
        xetla_vector<uint32_t, N> offsets, xetla_vector<T, N> src0,
        xetla_vector<T, N> src1, xetla_mask<N> pred) {
    static_assert(!(is_internal_type<T>::value),
            "The internal types are not yet supported!");
    return __ESIMD_ENS::lsc_atomic_update<gpu::xetla::detail::get_atomic_op(Op),
            T, N, gpu::xetla::detail::get_data_size(DS),
            gpu::xetla::detail::get_cache_hint(L1H),
            gpu::xetla::detail::get_cache_hint(L2H)>(
            p, offsets, src0, src1, pred);
}
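
// Illustrative usage sketch, not part of the original header: per-lane atomic
// float add of 1.0f into 16 global accumulators; fadd takes a single source
// operand, so the one-source overload applies. `acc` is hypothetical.
inline xetla_vector<float, 16> example_atomic_accumulate(float *acc) {
    xetla_vector<uint32_t, 16> byte_offsets(0, sizeof(float));
    xetla_vector<float, 16> ones(1.0f);
    xetla_mask<16> pred = 1;
    return xetla_atomic_global<atomic_op::fadd, float, 16>(
            acc, byte_offsets, ones, pred);
}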

/// @brief Declare per-work-group SLM size.
template <uint32_t SLMSize>
__XETLA_API void xetla_local_init() {
    if constexpr (SLMSize != 0) { __ESIMD_NS::slm_init(SLMSize); }
}

/// @brief SLM scattered load.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size, int N>
__XETLA_API xetla_vector<Ty, N * NElts> xetla_load_local(
        xetla_vector<uint32_t, N> offsets, xetla_mask<N> pred = 1) {
    using T = native_type_t<Ty>;
    DEBUG_INVOKE(dbg_level::core,
            core::general_1d<gpu_arch::Xe,
                    Ty>::template check_restriction<NElts, N>(offsets));

    return __ESIMD_ENS::lsc_slm_gather<T, NElts,
            gpu::xetla::detail::get_data_size(DS), N>(
            xetla_cvt<uint64_t, uint32_t>(offsets), pred);
}

/// @brief SLM block load.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size>
__XETLA_API xetla_vector<Ty, NElts> xetla_load_local(uint32_t offset) {
    using T = native_type_t<Ty>;
    DEBUG_INVOKE(dbg_level::core,
            core::general_1d<gpu_arch::Xe,
                    Ty>::template check_restriction<NElts>((uint64_t)offset));

    return __ESIMD_ENS::lsc_slm_block_load<T, NElts,
            gpu::xetla::detail::get_data_size(DS)>(offset);
}

/// @brief SLM scattered store.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size, int N>
__XETLA_API void xetla_store_local(xetla_vector<uint32_t, N> offsets,
        xetla_vector<Ty, N * NElts> vals, xetla_mask<N> pred = 1) {
    using T = native_type_t<Ty>;
    DEBUG_INVOKE(dbg_level::core,
            core::general_1d<gpu_arch::Xe, Ty>::template check_restriction<
                    NElts, N, uint32_t>(offsets));

    __ESIMD_ENS::lsc_slm_scatter<T, NElts,
            gpu::xetla::detail::get_data_size(DS), N>(offsets, vals, pred);
}

/// @brief SLM block store.
template <typename Ty, uint8_t NElts = 1,
        data_size DS = data_size::default_size>
__XETLA_API void xetla_store_local(
        uint32_t offset, xetla_vector<Ty, NElts> vals) {
    using T = native_type_t<Ty>;
    DEBUG_INVOKE(dbg_level::core,
            core::general_1d<gpu_arch::Xe,
                    Ty>::template check_restriction<NElts>(offset));

    __ESIMD_ENS::lsc_slm_block_store<T, NElts,
            gpu::xetla::detail::get_data_size(DS)>(offset, vals);
}

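// Illustrative usage sketch, not part of the original header: stage a vector
// through shared local memory with a block store followed by a block load.
// Assumes xetla_local_init<>() reserved sufficient SLM for the work-group.
inline xetla_vector<float, 16> example_slm_roundtrip(
        xetla_vector<float, 16> vals) {
    constexpr uint32_t slm_byte_offset = 0;
    xetla_store_local<float, 16>(slm_byte_offset, vals);
    return xetla_load_local<float, 16>(slm_byte_offset);
}
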
/// @brief SLM scattered atomic (0 src).
template <atomic_op Op, typename T, int N,
        data_size DS = data_size::default_size>
__XETLA_API xetla_vector<T, N> xetla_atomic_local(
        xetla_vector<uint32_t, N> offsets, xetla_mask<N> pred) {
    static_assert(!(is_internal_type<T>::value),
            "The internal types are not yet supported!");
    return __ESIMD_ENS::lsc_slm_atomic_update<gpu::xetla::detail::get_atomic_op(
                                                      Op),
            T, N, gpu::xetla::detail::get_data_size(DS)>(offsets, pred);
}

/// @brief SLM scattered atomic (1 src).
template <atomic_op Op, typename T, int N,
        data_size DS = data_size::default_size>
__XETLA_API xetla_vector<T, N> xetla_atomic_local(
        xetla_vector<uint32_t, N> offsets, xetla_vector<T, N> src0,
        xetla_mask<N> pred) {
    static_assert(!(is_internal_type<T>::value),
            "The internal types are not yet supported!");
    return __ESIMD_ENS::lsc_slm_atomic_update<gpu::xetla::detail::get_atomic_op(
                                                      Op),
            T, N, gpu::xetla::detail::get_data_size(DS)>(offsets, src0, pred);
}

/// @brief SLM scattered atomic (2 src).
template <atomic_op Op, typename T, int N,
        data_size DS = data_size::default_size>
__XETLA_API xetla_vector<T, N> xetla_atomic_local(
        xetla_vector<uint32_t, N> offsets, xetla_vector<T, N> src0,
        xetla_vector<T, N> src1, xetla_mask<N> pred) {
    static_assert(!(is_internal_type<T>::value),
            "The internal types are not yet supported!");
    return __ESIMD_ENS::lsc_slm_atomic_update<gpu::xetla::detail::get_atomic_op(
                                                      Op),
            T, N, gpu::xetla::detail::get_data_size(DS)>(
            offsets, src0, src1, pred);
}

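// Illustrative usage sketch, not part of the original header: per-lane
// unsigned-int atomic add into 16 SLM counters via the one-source overload.
inline xetla_vector<uint32_t, 16> example_slm_atomic_add(
        xetla_vector<uint32_t, 16> src0) {
    xetla_vector<uint32_t, 16> byte_offsets(0, sizeof(uint32_t));
    xetla_mask<16> pred = 1;
    return xetla_atomic_local<atomic_op::iadd, uint32_t, 16>(
            byte_offsets, src0, pred);
}
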
/// @brief Memory fence.
template <memory_kind Kind = memory_kind::untyped_global,
        fence_op FenceOp = fence_op::none,
        fence_scope Scope = fence_scope::group, int N = 16>
__XETLA_API void xetla_fence(xetla_mask<N> pred = 1) {
    __ESIMD_ENS::lsc_fence<gpu::xetla::detail::get_memory_kind(Kind),
            gpu::xetla::detail::get_fence_op(FenceOp),
            gpu::xetla::detail::get_fence_scope(Scope), N>(pred);
}
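
// Illustrative usage sketch, not part of the original header: make prior
// global writes visible to the whole work-group; a thread barrier (e.g.,
// __ESIMD_NS::barrier()) would typically follow in kernel code.
inline void example_fence_global() {
    xetla_fence<memory_kind::untyped_global, fence_op::none,
            fence_scope::group>();
}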

} // namespace gpu::xetla

#define __XETLA_API
Definition common.hpp:43
#define DEBUG_INVOKE(level,...)
Definition debug.hpp:180
typename native_type< T >::type native_type_t
Return the native data type of T.
Definition base_types.hpp:106
__ESIMD_NS::simd< native_type_t< Ty >, N > xetla_vector
wrapper for xetla_vector.
Definition base_types.hpp:149
__ESIMD_NS::simd_mask< N > xetla_mask
wrapper for xetla_mask.
Definition base_types.hpp:165
__XETLA_API xetla_vector< T, N > xetla_atomic_local(xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred)
SLM scattered atomic (0 src).
Definition memory.hpp:568
__XETLA_API void xetla_fence(xetla_mask< N > pred=1)
Memory fence.
Definition memory.hpp:638
__XETLA_API void xetla_prefetch_global(Ty *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1)
Stateless scattered prefetch.
Definition memory.hpp:187
__XETLA_API void xetla_local_init()
Declare per-work-group slm size.
Definition memory.hpp:443
__XETLA_API xetla_vector< Ty, N *NElts > xetla_load_global(Ty *p, xetla_vector< Toffset, N > offsets, xetla_mask< N > pred=1)
Stateless scattered load.
Definition memory.hpp:245
__XETLA_API xetla_vector< Ty, N *NElts > xetla_load_local(xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1)
SLM scattered load.
Definition memory.hpp:464
__XETLA_API xetla_vector< T, N > xetla_atomic_global(T *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred)
Stateless scattered atomic (0 src).
Definition memory.hpp:371
__XETLA_API void xetla_store_local(xetla_vector< uint32_t, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1)
SLM scattered store.
Definition memory.hpp:518
__XETLA_API void xetla_store_global(Ty *p, xetla_vector< Toffset, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1)
Stateless scattered store.
Definition memory.hpp:316
constexpr __ESIMD_ENS::lsc_scope get_fence_scope(gpu::xetla::fence_scope fs)
lookup table for fence scope.
Definition memory.hpp:111
constexpr __ESIMD_NS::atomic_op get_atomic_op(gpu::xetla::atomic_op ao)
lookup table for atomic op.
Definition memory.hpp:130
constexpr __ESIMD_ENS::lsc_data_size get_data_size(gpu::xetla::data_size ds)
lookup table for data size.
Definition memory.hpp:55
constexpr __ESIMD_ENS::lsc_fence_op get_fence_op(gpu::xetla::fence_op fo)
lookup table for fence op.
Definition memory.hpp:92
constexpr __ESIMD_ENS::lsc_memory_kind get_memory_kind(gpu::xetla::memory_kind mk)
lookup table for memory kind.
Definition memory.hpp:75
constexpr __ESIMD_ENS::cache_hint get_cache_hint(gpu::xetla::cache_hint ch)
lookup table for cache hint.
Definition memory.hpp:34
cache_hint
L1 or L2 cache hint kinds.
Definition common.hpp:89
data_size
Data size or format to read or store.
Definition common.hpp:100
@ u16u32h
load 16b into the high 16 bits of 32b; store the opposite
@ u16u32
load 16b, zero extend to 32b; store the opposite
@ u8u32
load 8b, zero extend to 32b; store the opposite
fence_op
The xetla_fence operation to apply to caches.
Definition common.hpp:120
@ clean
dirty lines are written to memory, but retained in cache
@ flushl2
flush only the L2 cache
@ discard
direct and clean lines are discarded w/o eviction
@ invalidate
invalidate all clean lines
@ evict
dirty lines evicted and invalidated from L1
@ none
no operation
fence_scope
The scope that xetla_fence operation should apply to.
Definition common.hpp:130
@ gpu
entire GPU, flush out to the GPUs LLC
@ tile
tile, flush out to several DSSs
@ gpus
all GPUs in the system, flush out to memory shared by all GPUs
@ sysacq
the entire system memory space, with system-acquire semantics
@ system
the entire system memory space
@ local
flush out to the local scope
@ group
flush out to the threadgroup's scope
memory_kind
The specific LSC shared function to fence with xetla_fence.
Definition common.hpp:112
@ typed_global
typed global memory
@ untyped_global_low_pri
low-priority untyped global memory
@ shared_local
shared local memory
@ untyped_global
untyped global memory
atomic_op
Represents an atomic operation.
Definition common.hpp:142
@ umin
Atomic store the unsigned int min of src1 and memory data and return the old value.
@ fsub
Atomic float subtract of src1 from memory data and return the old value.
@ bit_or
Atomic store the bitwise OR of src1 and memory data and return the old value.
@ iadd
Atomic signed int add of src1 to memory data and return the old value.
@ smin
Atomic store the signed int min of src1 and memory data and return the old value.
@ cmpxchg
Atomic bit-compare of src1_X and memory data; replace with src1_Y if equal. Returns the old value.
@ fmax
Atomic store the float max of src1 and memory data and return the old value.
@ fadd
Atomic float add of src1 to memory data and return the old value.
@ idec
Atomic decrement of memory data and return the old value.
@ umax
Atomic store the unsigned int max of src1 and memory data and return the old value.
@ store
Atomic store untyped data to memory.
@ fmin
Atomic store the float min of src1 and memory data and return the old value.
@ bit_and
Atomic store the bitwise AND of src1 and memory data and return the old value.
@ iinc
Atomic increment of memory data and return the old value.
@ smax
Atomic store the signed int max of src1 and memory data and return the old value.
@ bit_xor
Atomic store the bitwise XOR of src1 and memory data and return the old value.
@ isub
Atomic signed int subtract of src1 from memory data and return the old value.
@ fcmpxchg
Atomic float compare of src1_X and memory data; replace with src1_Y if equal. Returns the old value.
@ load
Atomic read of the memory data value, without modifying the data.
is_internal_type
Used to check if the type is an xetla internal data type.
Definition base_types.hpp:67