|
| template<uint8_t NbarCount> |
| __XETLA_API void | xetla_nbarrier_init () |
| | Initialize the number of named barrier index for a kernel.
|
| |
| __XETLA_API void | named_barrier_signal (uint8_t barrier_id, uint8_t producer_consumer_mode, uint32_t num_producers, uint32_t num_consumers) |
| | Perform signal operation for the given named barrier id.
|
| |
| __XETLA_API void | named_barrier_wait (uint8_t barrier_id) |
| | Wait on a named barrier.
|
| |
| template<typename T0 , typename T1 , int SZ, typename U , class Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T0, SZ > | xetla_shl (xetla_vector< T1, SZ > src0, U src1, Sat sat={}) |
| | Shift left operation (vector version)
|
| |
| template<typename T0 , typename T1 , typename T2 , class Sat = xetla_saturation_off_tag> |
| std::remove_const< T0 >::type | xetla_shl (T1 src0, T2 src1, Sat sat={}) |
| | Shift left operation (scalar version)
|
| |
| template<typename T0 , typename T1 , int SZ, typename U , class Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T0, SZ > | xetla_shr (xetla_vector< T1, SZ > src0, U src1, Sat sat={}) |
| | Shift right operation (vector version)
|
| |
| template<typename T0 , typename T1 , typename T2 , class Sat = xetla_saturation_off_tag> |
| __XETLA_API std::remove_const< T0 >::type | xetla_shr (T1 src0, T2 src1, Sat sat={}) |
| | Shift right operation (scalar version)
|
| |
| template<typename T0 , typename T1 , int SZ> |
| __XETLA_API xetla_vector< T0, SZ > | xetla_rol (xetla_vector< T1, SZ > src0, xetla_vector< T1, SZ > src1) |
| | Rotate left operation with two vector inputs.
|
| |
| template<typename T0 , typename T1 , int SZ, typename U > |
| __XETLA_API std::enable_if_t< std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< U >::value &&is_xetla_scalar< U >::value, xetla_vector< T0, SZ > > | xetla_rol (xetla_vector< T1, SZ > src0, U src1) |
| | Rotate left operation with a vector and a scalar inputs.
|
| |
| template<typename T0 , typename T1 , typename T2 > |
| __XETLA_API std::enable_if_t< std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< T2 >::value, remove_const_t< T0 > > | xetla_rol (T1 src0, T2 src1) |
| | Rotate left operation with two scalar inputs.
|
| |
| template<typename T0 , typename T1 , int SZ> |
| __XETLA_API xetla_vector< T0, SZ > | xetla_ror (xetla_vector< T1, SZ > src0, xetla_vector< T1, SZ > src1) |
| | Rotate right operation with two vector inputs.
|
| |
| template<typename T0 , typename T1 , int SZ, typename U > |
| __XETLA_API std::enable_if_t< std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< U >::value &&is_xetla_scalar< U >::value, xetla_vector< T0, SZ > > | xetla_ror (xetla_vector< T1, SZ > src0, U src1) |
| | Rotate right operation with a vector and a scalar inputs.
|
| |
| template<typename T0 , typename T1 , typename T2 > |
| __XETLA_API std::enable_if_t< std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< T2 >::value, remove_const_t< T0 > > | xetla_ror (T1 src0, T2 src1) |
| | Rotate right operation with two scalar inputs.
|
| |
| template<typename T0 , typename T1 , int SZ, typename U , class Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T0, SZ > | xetla_lsr (xetla_vector< T1, SZ > src0, U src1, Sat sat={}) |
| | Logical Shift Right (vector version)
|
| |
| template<typename T0 , typename T1 , typename T2 , class Sat = xetla_saturation_off_tag> |
| __XETLA_API std::remove_const< T0 >::type | xetla_lsr (T1 src0, T2 src1, Sat sat={}) |
| | Logical Shift Right (scalar version)
|
| |
| template<typename T0 , typename T1 , int SZ, typename U , class Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T0, SZ > | xetla_asr (xetla_vector< T1, SZ > src0, U src1, Sat sat={}) |
| | Arithmetical Shift Right (vector version)
|
| |
| template<typename T0 , typename T1 , typename T2 , class Sat = xetla_saturation_off_tag> |
| __XETLA_API std::remove_const< T0 >::type | xetla_asr (T1 src0, T2 src1, Sat sat={}) |
| | Arithmetical Shift Right (scalar version)
|
| |
| template<int N> |
| __XETLA_API uint32_t | xetla_pack_mask (xetla_mask< N > src0) |
| | Pack a xetla_mask into a single unsigned 32-bit integer value.
|
| |
| template<int N> |
| __XETLA_API xetla_mask< N > | xetla_unpack_mask (uint32_t src0) |
| | Unpack an unsigned 32-bit integer value into a xetla_mask.
|
| |
| void | xetla_wait (uint16_t val) |
| |
| template<typename T_dst , typename T_src , int N> |
| __XETLA_API std::enable_if_t<!(is_internal_type< T_dst >::value) &&!(is_internal_type< T_src >::value), xetla_vector< T_dst, N > > | xetla_cvt (xetla_vector< T_src, N > src) |
| | xetla explicit data conversion for standard data types(integer,float,half)
|
| |
| template<typename T_dst , typename T_src , int N> |
| __XETLA_API std::enable_if_t< std::is_same< T_dst, bf16 >::value &&std::is_same< T_src, float >::value, xetla_vector< T_dst, N > > | xetla_cvt (xetla_vector< T_src, N > src) |
| | xetla explicit data conversion, fp32->bf16.
|
| |
| template<typename T_dst , typename T_src , int N> |
| __XETLA_API std::enable_if_t< std::is_same< T_dst, float >::value &&std::is_same< T_src, bf16 >::value, xetla_vector< T_dst, N > > | xetla_cvt (xetla_vector< T_src, N > src) |
| | xetla explicit data conversion, bf16->fp32.
|
| |
| template<typename T_dst , typename T_src , int N> |
| __XETLA_API std::enable_if_t< std::is_same< T_dst, tf32 >::value &&std::is_same< T_src, float >::value, xetla_vector< T_dst, N > > | xetla_cvt (xetla_vector< T_src, N > src) |
| | xetla explicit data conversion, fp32->tf32.
|
| |
| template<typename T_dst , typename T_src , int N> |
| __XETLA_API std::enable_if_t< std::is_same< T_dst, float >::value &&std::is_same< T_src, tf32 >::value, xetla_vector< T_dst, N > > | xetla_cvt (xetla_vector< T_src, N > src) |
| | xetla explicit data conversion, tf32->fp32.
|
| |
| template<typename T_dst , typename T_src , int N> |
| __XETLA_API std::enable_if_t< std::is_same< T_dst, fp16 >::value &&std::is_same< T_src, int32_t >::value, xetla_vector< T_dst, N > > | xetla_cvt (xetla_vector< T_src, N > src, float scaling_value) |
| | xetpp explicit data conversion with scaling, int32->fp16.
|
| |
| template<typename T_dst , typename T_src , int N> |
| __XETLA_API std::enable_if_t< std::is_same< T_dst, int8_t >::value &&std::is_same< T_src, int32_t >::value, xetla_vector< T_dst, N > > | xetla_cvt (xetla_vector< T_src, N > src, float scaling_value) |
| | xetpp explicit data conversion with re-quantization, int32->int8.
|
| |
| template<typename T_dst , typename T_src , int N> |
| __XETLA_API std::enable_if_t< std::is_same< T_dst, int8_t >::value &&std::is_same< T_src, float >::value, xetla_vector< T_dst, N > > | xetla_cvt (xetla_vector< T_src, N > src, float scaling_value) |
| | xetpp explicit data conversion with scaling and quantization, float32->int8.
|
| |
| template<typename T_dst , typename T_src , int N> |
| __XETLA_API std::enable_if_t< std::is_same< T_dst, T_src >::value &&is_internal_type< T_src >::value, xetla_vector< T_dst, N > > | xetla_cvt (xetla_vector< T_src, N > src) |
| | xetla explicit data conversion, same type.
|
| |
| template<typename T0 , typename T1 , int SZ> |
| __XETLA_API xetla_vector< T0, SZ > | xetla_abs (xetla_vector< T1, SZ > src0) |
| | Get absolute value (vector version)
|
| |
| template<typename T0 , typename T1 > |
| std::enable_if_t<!std::is_same< remove_const_t< T0 >, remove_const_t< T1 > >::value, remove_const_t< T0 > > __XETLA_API | xetla_abs (T1 src0) |
| | Get absolute value (scalar version)
|
| |
| template<typename T1 , int SZ> |
| __XETLA_API xetla_vector< T1, SZ > | xetla_abs (xetla_vector< T1, SZ > src0) |
| | Get absolute value (vector version).
|
| |
| template<typename T1 > |
| __XETLA_API std::remove_const< T1 >::type | xetla_abs (T1 src0) |
| | Get absolute value (scalar version).
|
| |
| template<typename T , int SZ, typename Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T, SZ > | xetla_max (xetla_vector< T, SZ > src0, xetla_vector< T, SZ > src1, Sat sat={}) |
| | Selects component-wise the maximum of the two vectors.
|
| |
| template<typename T , int SZ, typename Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T, SZ > | xetla_max (xetla_vector< T, SZ > src0, T src1, Sat sat={}) |
| | Selects maximums for each element of the input vector and a scalar.
|
| |
| template<typename T , int SZ, typename Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T, SZ > | xetla_max (T src0, xetla_vector< T, SZ > src1, Sat sat={}) |
| | Selects maximums for each element of the input scalar and a vector.
|
| |
| template<typename T , typename Sat = xetla_saturation_off_tag> |
| __XETLA_API T | xetla_max (T src0, T src1, Sat sat={}) |
| | Selects maximum between two scalar values.
|
| |
| template<typename T , int SZ, typename Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T, SZ > | xetla_min (xetla_vector< T, SZ > src0, xetla_vector< T, SZ > src1, Sat sat={}) |
| | Selects component-wise the minimum of the two vectors.
|
| |
| template<typename T , int SZ, typename Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T, SZ > | xetla_min (xetla_vector< T, SZ > src0, T src1, Sat sat={}) |
| | Selects minimums for each element of the input vector and a scalar.
|
| |
| template<typename T , int SZ, typename Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T, SZ > | xetla_min (T src0, xetla_vector< T, SZ > src1, Sat sat={}) |
| | Selects minimums for each element of the input scalar and a vector.
|
| |
| template<typename T , typename Sat = xetla_saturation_off_tag> |
| __XETLA_API T | xetla_min (T src0, T src1, Sat sat={}) |
| | Selects minimum between two scalar values.
|
| |
| template<class T , int SZ, typename Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T, SZ > | xetla_exp (xetla_vector< T, SZ > src, Sat sat={}) |
| | Calculate exponent value for each element of the input vector, the base is e.
|
| |
| template<class T , typename Sat = xetla_saturation_off_tag> |
| __XETLA_API T | xetla_exp (T src, Sat sat={}) |
| | Calculate exponent value of a scalar, the base is e.
|
| |
| template<class T , int SZ, typename Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T, SZ > | xetla_exp2 (xetla_vector< T, SZ > src, Sat sat={}) |
| | Calculate exponent value for each element of the input vector, the base is 2.
|
| |
| template<class T , typename Sat = xetla_saturation_off_tag> |
| __XETLA_API T | xetla_exp2 (T src, Sat sat={}) |
| | Calculate exponent value of a scalar, the base is 2.
|
| |
| template<typename T , int SZ, typename Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T, SZ > | xetla_inv (xetla_vector< T, SZ > src, Sat sat={}) |
| | Calculate the inversion, i.e.
|
| |
| template<typename T , typename Sat = xetla_saturation_off_tag> |
| __XETLA_API T | xetla_inv (T src, Sat sat={}) |
| | Calculate the inversion, i.e.
|
| |
| template<typename T , int SZ, typename Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T, SZ > | xetla_sqrt (xetla_vector< T, SZ > src, Sat sat={}) |
| | Calculate the square root, i.e.
|
| |
| template<typename T , typename Sat = xetla_saturation_off_tag> |
| __XETLA_API T | xetla_sqrt (T src, Sat sat={}) |
| | Calculate the square root, i.e.
|
| |
| template<typename T , int SZ, typename Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T, SZ > | xetla_sqrt_ieee (xetla_vector< T, SZ > src, Sat sat={}) |
| | Calculate the square root, i.e.
|
| |
| template<typename T , typename Sat = xetla_saturation_off_tag> |
| __XETLA_API T | xetla_sqrt_ieee (T src, Sat sat={}) |
| | Calculate the square root, i.e.
|
| |
| template<typename T , int SZ, typename Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T, SZ > | xetla_rsqrt (xetla_vector< T, SZ > src, Sat sat={}) |
| | Calculate the inversion of square root, i.e.
|
| |
| template<typename T , typename Sat = xetla_saturation_off_tag> |
| __XETLA_API T | xetla_rsqrt (T src, Sat sat={}) |
| | Calculate the inversion of square root, i.e.
|
| |
| template<typename T , int SZ> |
| __XETLA_API xetla_vector< T, SZ > | xetla_tanh (xetla_vector< T, SZ > src) |
| | Calculate the tanh (vector version).
|
| |
| template<typename T > |
| __XETLA_API T | xetla_tanh (T src) |
| | Calculate the tanh (scalar version).
|
| |
| template<typename T , int SZ> |
| __XETLA_API xetla_vector< T, SZ > | xetla_add_c (xetla_vector< T, SZ > src0, xetla_vector< T, SZ > src1, xetla_vector_ref< T, SZ > __REF__ carry) |
| | Add two unsigned integer vectors, return the result and in-place update the carry.
|
| |
| template<typename T , int SZ> |
| __XETLA_API xetla_vector< T, SZ > | xetla_add_c (xetla_vector< T, SZ > src0, T src1, xetla_vector_ref< T, SZ > __REF__ carry) |
| | Add one unsigned integer vectors with a scalar, return the result and in-place update the carry.
|
| |
| template<typename T0 , typename T1 , typename T2 , int SZ> |
| __XETLA_API xetla_vector< T0, SZ > | xetla_imul (xetla_vector_ref< T0, SZ > __REF__ lo, xetla_vector< T1, SZ > src0, T2 src1) |
| | Multiply src0 with src1, return the hi part and in-place update the lo part.
|
| |
| template<typename T0 , typename T1 , int SZ, reduce_op BinaryOperation> |
| __XETLA_API T0 | xetla_reduce (xetla_vector< T1, SZ > v) |
| | Performs reduction over elements of the input vector.
|
| |
| template<typename T , int SZ> |
| __XETLA_API xetla_vector< T, SZ > | xetla_rnde (xetla_vector< T, SZ > src0) |
| | Get rounded value.
|
| |
| template<typename T1 , typename T0 , int SZ, typename Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T1, SZ > | xetla_add (xetla_vector< T0, SZ > src0, xetla_vector< T0, SZ > src1, Sat sat={}) |
| | Adds two vectors with saturation The source operands must be both of floating-point type.
|
| |
| template<typename T1 , typename T0 , int SZ> |
| __XETLA_API xetla_vector< T1, SZ > | xetla_sat (xetla_vector< T0, SZ > src) |
| | Saturation function.
|
| |
| template<argument_type src1_precision, argument_type src2_precision, int systolic_depth, int repeat_count, typename T , typename T1 , typename T2 , int N, int N1, int N2, typename Sat = xetla_saturation_off_tag> |
| __XETLA_API xetla_vector< T, N > | xetla_mma (xetla_vector< T, N > src0, xetla_vector< T1, N1 > src1, xetla_vector< T2, N2 > src2, Sat sat={}) |
| | description of xetla mma perform matrix multiply add operation
|
| |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached, int N> |
| __XETLA_API void | xetla_prefetch_global (Ty *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1) |
| | Stateless scattered prefetch.
|
| |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached> |
| __XETLA_API void | xetla_prefetch_global (Ty *p, uint64_t offset=0) |
| | Stateless block prefetch (transposed gather with 1 channel).
|
| |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, int N, typename Toffset = uint32_t> |
| __XETLA_API xetla_vector< Ty, N *NElts > | xetla_load_global (Ty *p, xetla_vector< Toffset, N > offsets, xetla_mask< N > pred=1) |
| | Stateless scattered load.
|
| |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none> |
| __XETLA_API xetla_vector< Ty, NElts > | xetla_load_global (Ty *p, uint64_t offset=0) |
| | Stateless block load (transposed gather with 1 channel).
|
| |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, int N, typename Toffset = uint32_t> |
| __XETLA_API void | xetla_store_global (Ty *p, xetla_vector< Toffset, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1) |
| | Stateless scattered store.
|
| |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none> |
| __XETLA_API void | xetla_store_global (Ty *p, uint64_t offset, xetla_vector< Ty, NElts > vals) |
| | Stateless block store (transposed scatter with 1 channel).
|
| |
| template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none> |
| __XETLA_API xetla_vector< T, N > | xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred) |
| | Stateless scattered atomic (0 src).
|
| |
| template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none> |
| __XETLA_API xetla_vector< T, N > | xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_mask< N > pred) |
| | Stateless scattered atomic (1 src).
|
| |
| template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none> |
| __XETLA_API xetla_vector< T, N > | xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_vector< T, N > src1, xetla_mask< N > pred) |
| | Stateless scattered atomic (2 src).
|
| |
| template<uint32_t SLMSize> |
| __XETLA_API void | xetla_local_init () |
| | Declare per-work-group slm size.
|
| |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, int N> |
| __XETLA_API xetla_vector< Ty, N *NElts > | xetla_load_local (xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1) |
| | SLM scattered load.
|
| |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size> |
| __XETLA_API xetla_vector< Ty, NElts > | xetla_load_local (uint32_t offset) |
| | SLM block load.
|
| |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, int N> |
| __XETLA_API void | xetla_store_local (xetla_vector< uint32_t, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1) |
| | SLM scattered store.
|
| |
| template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size> |
| __XETLA_API void | xetla_store_local (uint32_t offset, xetla_vector< Ty, NElts > vals) |
| | SLM block store (transposed SLM scatter with 1 channel).
|
| |
| template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size> |
| __XETLA_API xetla_vector< T, N > | xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred) |
| | SLM scattered atomic (0 src).
|
| |
| template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size> |
| __XETLA_API xetla_vector< T, N > | xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_mask< N > pred) |
| | SLM scattered atomic (1 src).
|
| |
| template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size> |
| __XETLA_API xetla_vector< T, N > | xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_vector< T, N > src1, xetla_mask< N > pred) |
| | SLM scattered atomic (2 src).
|
| |
| template<memory_kind Kind = memory_kind::untyped_global, fence_op FenceOp = fence_op::none, fence_scope Scope = fence_scope::group, int N = 16> |
| __XETLA_API void | xetla_fence (xetla_mask< N > pred=1) |
| | Memory fence.
|
| |
| template<typename T1 , uint32_t n1, typename T2 , uint32_t n2, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numDst, uint8_t isEOT = 0, uint8_t isSendc = 0, int N = 16> |
| __XETLA_API void | xetla_raw_send (xetla_vector_ref< T1, n1 > __REF__ msgDst, xetla_vector< T2, n2 > msgSrc0, uint32_t exDesc, uint32_t msgDesc, xetla_mask< N > mask=1) |
| | Raw send with one source operand and one destination operand.
|
| |
| template<typename T1 , uint32_t n1, typename T2 , uint32_t n2, typename T3 , uint32_t n3, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numSrc1, uint8_t numDst, uint8_t isEOT = 0, uint8_t isSendc = 0, int N = 16> |
| __XETLA_API void | xetla_raw_send (xetla_vector_ref< T1, n1 > __REF__ msgDst, xetla_vector< T2, n2 > msgSrc0, xetla_vector< T3, n3 > msgSrc1, uint32_t exDesc, uint32_t msgDesc, xetla_mask< N > mask=1) |
| | Raw send with two source operands and one destination operand.
|
| |
| template<typename T1 , uint32_t n1, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t isEOT = 0, uint8_t isSendc = 0, int N = 16> |
| __XETLA_API void | xetla_raw_send (xetla_vector< T1, n1 > msgSrc0, uint32_t exDesc, uint32_t msgDesc, xetla_mask< N > mask=1) |
| | Raw send with one source operand and no return.
|
| |
| template<typename T1 , uint32_t n1, typename T2 , uint32_t n2, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numSrc1, uint8_t isEOT = 0, uint8_t isSendc = 0, int N = 16> |
| __XETLA_API void | xetla_raw_send (xetla_vector< T1, n1 > msgSrc0, xetla_vector< T2, n2 > msgSrc1, uint32_t exDesc, uint32_t msgDesc, xetla_mask< N > mask=1) |
| | Raw send with two source operands and no return.
|
| |
| template<typename kernel_t > |
| void | slm_barrier_init () |
| | Initial the local memory size and named barrier count with kernel_t.
|
| |
| template<uint32_t slm_size, uint32_t nbarrier_count> |
| void | slm_barrier_init () |
| | Initial the local memory size and named barrier count.
|
| |
| int | clz (int x) |
| | Host side utility function to compute number of leading zeros in the binary representation.
|
| |
| int | find_log2 (int x) |
| | Host side utility function to compute log2 function.
|
| |
| __XETLA_API xetla_vector< uint32_t, 4 > | get_time_stamp () |
| | Returns time stamp.
|
| |
| template<typename Ty , int N> |
| __XETLA_API xetla_vector< Ty, N > | xetla_vector_gen (int InitVal, int Step) |
| | xetla_vector generation.
|
| |
| template<uint32_t N> |
| __XETLA_API xetla_mask_int< N > | xetla_mask_int_gen (uint32_t mask_val) |
| |
| template<typename dtype_acc , uint32_t N, uint32_t num_flag = 4, typename dtype_mask = uint8_t> |
| __XETLA_API xetla_vector< dtype_acc, N > | drop_out (xetla_vector< dtype_acc, N > in, xetla_vector< dtype_mask, N > mask, dtype_acc scale) |
| |
| template<reduce_op reduce_kind, typename dtype , int size> |
| __XETLA_API std::enable_if_t< reduce_kind==reduce_op::sum, xetla_vector< dtype, size > > | reduce_helper (xetla_vector< dtype, size > a, xetla_vector< dtype, size > b) |
| |
| template<reduce_op reduce_kind, typename dtype , int size> |
| __XETLA_API std::enable_if_t< reduce_kind==reduce_op::prod, xetla_vector< dtype, size > > | reduce_helper (xetla_vector< dtype, size > a, xetla_vector< dtype, size > b) |
| |
| template<reduce_op reduce_kind, typename dtype , int size> |
| __XETLA_API std::enable_if_t< reduce_kind==reduce_op::max, xetla_vector< dtype, size > > | reduce_helper (xetla_vector< dtype, size > a, xetla_vector< dtype, size > b) |
| |
| template<reduce_op reduce_kind, typename dtype , int size> |
| __XETLA_API std::enable_if_t< reduce_kind==reduce_op::min, xetla_vector< dtype, size > > | reduce_helper (xetla_vector< dtype, size > a, xetla_vector< dtype, size > b) |
| |
| template<reduce_op reduce_kind, typename dtype , int N_x, int N_y> |
| __XETLA_API std::enable_if_t< N_y==1, xetla_vector< dtype, N_x > > | recur_row_reduce (xetla_vector< dtype, N_x > in) |
| |
| template<reduce_op reduce_kind, typename dtype , int N_x, int N_y> |
| __XETLA_API std::enable_if_t<(N_y > 1), xetla_vector< dtype, N_x > > | recur_row_reduce (xetla_vector< dtype, N_x *N_y > in) |
| |
| template<reduce_op reduce_kind, typename dtype , int N_x, int N_y> |
| __XETLA_API std::enable_if_t< N_x==1, xetla_vector< dtype, N_y > > | recur_col_reduce (xetla_vector< dtype, N_y > in) |
| |
| template<reduce_op reduce_kind, typename dtype , int N_x, int N_y> |
| __XETLA_API std::enable_if_t<(N_x > 1), xetla_vector< dtype, N_y > > | recur_col_reduce (xetla_vector< dtype, N_x *N_y > in) |
| |
| __XETLA_API uint32_t | get_2d_group_linear_id (sycl::nd_item< 3 > &item) |
| | get linear group id of the last two dimensions.
|
| |
| template<typename Ty , uint32_t block_width = 1, uint32_t block_height = 1, uint8_t array_len = 1> |
| __XETLA_API void | xetla_fill_tdesc (xetla_tdescriptor_ref tdesc, Ty *p, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y) |
| | Tensor descriptor construction(global memory version).
|
| |
| template<typename Ty > |
| __XETLA_API void | xetla_fill_tdesc (xetla_tdescriptor_ref tdesc, uint32_t base_address, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y) |
| | Tensor descriptor construction(local memory version).
|
| |
| template<typename Ty , uint32_t block_width = 1, uint32_t block_height = 1, uint8_t array_len = 1> |
| __XETLA_API xetla_tdescriptor | xetla_get_tdesc (Ty *p, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y) |
| | Generate a new tensor descriptor(global memory version).
|
| |
| template<typename Ty > |
| __XETLA_API xetla_tdescriptor | xetla_get_tdesc (uint32_t base_address, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y) |
| | Generate a new tensor descriptor(local memory version).
|
| |
| __XETLA_API void | xetla_update_tdesc_offsetx (xetla_tdescriptor_ref tdesc, int32_t doffset_x) |
| | Update the x coordinate in the given tensor descriptor.
|
| |
| __XETLA_API void | xetla_update_tdesc_offsety (xetla_tdescriptor_ref tdesc, int32_t doffset_y) |
| | Update the y coordinate in the given tensor descriptor.
|
| |
| template<typename Ty , uint32_t N, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, bool transpose = false, bool transform = false, gpu_arch arch_tag = gpu_arch::Xe> |
| __XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, xetla_vector< Ty, N > > | xetla_tload_global (xetla_tdescriptor tdesc) |
| | Tensor load API.
|
| |
| template<typename Ty , uint32_t N, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, gpu_arch arch_tag = gpu_arch::Xe> |
| __XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void > | xetla_tstore_global (xetla_tdescriptor tdesc, xetla_vector< Ty, N > data) |
| | Tensor store API.
|
| |
| template<typename Ty , cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached, gpu_arch arch_tag = gpu_arch::Xe> |
| __XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void > | xetla_tprefetch_global (xetla_tdescriptor tdesc) |
| | Tensor prefetch API.
|
| |
| template<typename Ty , uint32_t N, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, atomic_op Op, gpu_arch arch_tag = gpu_arch::Xe, typename Toffset = uint32_t> |
| __XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void > | xetla_tatomic_store_global (uint64_t base_address, xetla_vector< Toffset, N > offset, xetla_vector< Ty, N > data, xetla_mask< N > pred=1) |
| | Tensor atomic store API.
|
| |