XeTLA v0.3.6
IntelĀ® Xe Templates for Linear Algebra - API Definition Document
 
Loading...
Searching...
No Matches
gpu::xetla Namespace Reference

Namespaces

namespace  core
 
namespace  decision_tree_rule
 
namespace  detail
 
namespace  group
 
namespace  impl
 
namespace  kernel
 
namespace  subgroup
 

Classes

struct  arch_attr_t
 
struct  arch_attr_t< gpu_arch::Xe >
 
struct  decision_tree_optimizer
 
struct  dict_t
 
struct  dropout_fwd_t
 
struct  dummy_optimizer
 
struct  elem_t
 
struct  elem_t_t
 
struct  elem_v_t
 
struct  fallback_optimizer
 
struct  FastDivMod
 Fast division + modulus operation Host code pre-computes values to avoid expensive operations in kernel code. More...
 
struct  get_uint_type
 Get the unit representation based on Size. More...
 
struct  int4x2
 xetla 4bits data packed as 8bits data type. More...
 
struct  is_floating_point
 Used to check if the type is floating_point. More...
 
struct  is_host_callable
 
struct  is_host_callable< T, std::enable_if_t< T::host_callable==true > >
 
struct  is_integral
 Used to check if the type is floating_point. More...
 
struct  is_internal_type
 Used to check if the type is xetla internal data type. More...
 
struct  is_internal_type< int4x2 >
 Used to check if the type is xetla internal data type. More...
 
struct  load_store_attr_t
 
struct  load_store_attr_t< msg_type::block_2d, gpu_arch::Xe >
 
struct  mem_base_t
 
struct  mem_base_t< dtype_, mem_space::global >
 
struct  mem_base_t< dtype_, mem_space::local >
 
struct  mem_coord_t
 
struct  mem_coord_t< 2 >
 
struct  mem_desc_t
 
struct  mem_desc_t< dtype_, layout_, space_, alignment_, 2 >
 
struct  mem_shape_t
 
struct  mem_shape_t< 2 >
 
struct  mma_attr_t
 
struct  mma_attr_t< gpu_arch::Xe >
 
struct  native_type
 Set the native data type of T. More...
 
struct  native_type< int4x2 >
 Set uint8_t as the native data type of int4x2. More...
 
struct  param_adaptor
 
struct  param_adaptor< param_adaptor_tag::kernel, dict_t_ >
 
struct  param_adaptor< param_adaptor_tag::work_group_epilogue, dict_t_ >
 
struct  param_adaptor< param_adaptor_tag::work_group_gemm, dict_t_ >
 
struct  param_adaptor_base
 
struct  param_optimizer
 
struct  param_optimizer< param_optimizer_tag::kernel, dict_t_ >
 
struct  param_optimizer< param_optimizer_tag::work_group, dict_t_ >
 
struct  param_optimizer_base
 
struct  register_attr_t
 
struct  register_attr_t< grf_num_mode, gpu_arch::Xe >
 
struct  shape
 
struct  uint_type
 Get the unit representation of type T. More...
 
struct  work_group_t
 Define a workgroup scope for a specific problem shape. More...
 
struct  xetla_nbarrier_t
 xetla nbarrier definition API. More...
 
struct  xetla_rand_t
 
class  xetla_saturation_off_tag
 
class  xetla_saturation_on_tag
 

Concepts

concept  xetla_vector_ref
 Workaround for ESIMD vector(1D) ref type.
 
concept  xetla_matrix_ref
 Workaround for ESIMD matrix(2D) ref type.
 

Typedefs

using bf16 = sycl::ext::oneapi::bfloat16
 xetla bf16 data type.
 
using fp16 = sycl::half
 xetla fp16 data type.
 
using tf32 = sycl::ext::intel::experimental::esimd::tfloat32
 xetla tf32 data type.
 
template<typename T >
using native_type_t = typename native_type< T >::type
 Return the native data type of T.
 
template<typename T >
using uint_type_t = typename uint_type< T >::type
 Return the uint representation of type T.
 
template<int Size>
using get_uint_type_t = typename get_uint_type< Size >::type
 Return the uint representation based on Size.
 
template<typename Ty , uint32_t N>
using xetla_vector = __ESIMD_NS::simd< native_type_t< Ty >, N >
 wrapper for xetla_vector.
 
using xetla_tdescriptor = xetla_vector< uint32_t, 16 >
 Description of nd tensor descriptor for load and store.
 
template<uint32_t N>
using xetla_mask = __ESIMD_NS::simd_mask< N >
 wrapper for xetla_mask.
 
template<uint32_t N>
using xetla_mask_int = __ESIMD_NS::simd_mask< N >
 wrapper for xetla_mask_int.
 
template<typename T >
using is_xetla_scalar = typename __ESIMD_DNS::is_esimd_scalar< T >
 
template<auto d_ = impl::meta_impl_base_default, typename T = decltype(d_)>
using meta_value = impl::meta_value_impl< T, d_ >
 
template<auto d_ = impl::meta_impl_base_default, typename T = decltype(d_)>
using meta_value_t = typename meta_value< d_, T >::type
 
template<typename T = impl::meta_impl_base::_default>
using meta_type = impl::meta_type_impl< T >
 
template<typename T >
using meta_type_t = typename T::type
 
using default_param_t = dict_t<>::template update_dict_t< detail::param_dtype_bf16_bf16_bf16 >::template update_dict_t< detail::param_memlayout_rrr >::template update_dict_t< detail::param_memalignment_8_8_8 >::template update_dict_t< detail::param_memspace_ggg >::template update_dict_t< detail::param_performance_default >::template update_dict_t< detail::param_runtime_default >::template update_t< elem_t_t< tune_key::data_type_acc, float >, elem_v_t< tune_key::global_kslicing_ratio, 1UL, uint32_t >, elem_v_t< tune_key::local_kslicing_ratio, 1UL, uint32_t >, elem_t_t< tune_key::wg_tile_shape, shape< 256, 256 > >, elem_t_t< tune_key::sg_tile_shape, shape< 64, 32 > >, elem_v_t< tune_key::param_optimizer_type, tune_key_value::param_optimizer_dummy > >
 

Enumerations

enum class  gpu_arch : uint8_t { Xe = 0 }
 
enum class  grf_mode : uint8_t { normal = 0 , double_grf = 1 }
 
enum class  mem_layout : uint8_t { row_major = 0 , col_major = 1 }
 
enum class  mem_space : uint8_t { global = 0 , local = 1 }
 
enum class  msg_type : uint8_t {
  block_2d = 0 , block_1d = 1 , scatter = 2 , atomic_add = 3 ,
  unaligned_2d = 4
}
 
enum class  cache_hint : uint8_t {
  none = 0 , uncached = 1 , cached = 2 , write_back = 3 ,
  write_through = 4 , streaming = 5 , read_invalidate = 6
}
 L1 or L2 cache hint kinds. More...
 
enum class  data_size : uint8_t {
  default_size = 0 , u8 = 1 , u16 = 2 , u32 = 3 ,
  u64 = 4 , u8u32 = 5 , u16u32 = 6 , u16u32h = 7
}
 Data size or format to read or store. More...
 
enum class  memory_kind : uint8_t { untyped_global = 0 , untyped_global_low_pri = 1 , typed_global = 2 , shared_local = 3 }
 The specific LSC shared function to fence with xetla_fence. More...
 
enum class  fence_op : uint8_t {
  none = 0 , evict = 1 , invalidate = 2 , discard = 3 ,
  clean = 4 , flushl2 = 5
}
 The xetla_fence operation to apply to caches. More...
 
enum class  fence_scope : uint8_t {
  group = 0 , local = 1 , tile = 2 , gpu = 3 ,
  gpus = 4 , system = 5 , sysacq = 6
}
 The scope that xetla_fence operation should apply to. More...
 
enum class  atomic_op : uint8_t {
  iinc = 0x0 , idec = 0x1 , iadd = 0x2 , isub = 0x3 ,
  smin = 0x4 , smax = 0x5 , cmpxchg = 0x6 , fadd = 0x7 ,
  fsub = 0x8 , fmin = 0x9 , fmax = 0xa , fcmpxchg = 0xb ,
  umin = 0xc , umax = 0xd , bit_and = 0xe , bit_or = 0xf ,
  bit_xor = 0x10 , load = 0x11 , store = 0x12
}
 Represents an atomic operation. More...
 
enum class  argument_type : uint8_t {
  U1 = 0 , S1 = 1 , U2 = 2 , S2 = 3 ,
  U4 = 4 , S4 = 5 , U8 = 6 , S8 = 7 ,
  BF16 = 8 , FP16 = 9 , TF32 = 12 , DF = 13 ,
  NUM_ARG_TYPES = 14
}
 xetla dpas argument typ More...
 
enum class  reduce_op : uint8_t { sum = 0 , prod = 1 , min = 2 , max = 3 }
 xetla reduce op More...
 
enum class  reg_layout : uint8_t {
  linear = 0 , tiled = 1 , vnni_tiled = 2 , transpose_tiled = 3 ,
  vnni_tiled_col_major = 4
}
 tile layout in register linear: linear layout with one tile tiled: 2d block stacked in raster order vnni_tiled: vnni pack with 2d block and 2d block stacked in raster order for dword and qword, there is no impact for word, two rows are interleaved, i.e. More...
 
enum class  store_op : uint8_t {
  normal = 0 , atomic_fadd = 1 , atomic_iadd = 2 , scattered_transpose = 3 ,
  block_1d = 4
}
 
enum class  mma_engine : uint8_t { xmx = 0 , fpu = 1 }
 
enum class  memory_op : uint8_t { load = 0 , store = 1 }
 
enum class  tdesc_update_dir : uint8_t { x_dir = 0 , y_dir = 1 }
 
enum class  post_kind : uint8_t {
  none = 0 , relu = 1 , gelu = 2 , gelu_bwd_w = 3 ,
  sigmoid = 4 , tanh = 5
}
 
enum class  pre_kind : uint8_t { none = 0 , bias_add = 1 , res_add = 2 }
 
enum class  offset_mode : uint8_t { const_offset = 0 , cyclic_offset = 1 , acyclic_offset = 2 }
 
enum class  nbarrier_role : uint8_t { nbarrier_role::producer_consumer = 0 , nbarrier_role::producer = 1 , nbarrier_role::consumer = 2 }
 
enum class  ln_fwd_fused_kind : uint8_t {
  none = 0 , bias_dropout_resAdd_ln = 1 , ln_dropout = 2 , bias_rng_dropout_resAdd_ln = 3 ,
  ln_rng_dropout = 4
}
 
enum class  ln_bwd_fused_kind : uint8_t { none = 0 , bias_dropout_resAdd_ln = 1 , ln_dropout_gradAdd = 2 , ln_dropout = 3 }
 
enum class  reduction_fused_kind : uint8_t { none = 0 , bias_gelu_w_bwd = 1 , bias_dropout_bwd = 2 }
 
enum class  tune_key : uint8_t {
  data_type_a , memory_layout_a , memory_alignment_a , memory_space_a ,
  data_type_b , memory_layout_b , memory_alignment_b , memory_space_b ,
  data_type_c , memory_layout_c , memory_alignment_c , memory_space_c ,
  data_type_acc , global_kslicing_ratio , local_kslicing_ratio , wg_tile_shape ,
  wg_tile_k , sg_tile_shape , pre_processing , prefetch_distance ,
  periodic_sync_interval , mma_engine , gpu_arch , epilogue_policy ,
  dispatch_policy , group_swizzle_policy , param_optimizer_type , source_location
}
 
enum class  tune_key_value : uint8_t {
  pre_processing_default , pre_processing_mata_neg_filter , dispatch_policy_default , dispatch_policy_kslicing ,
  dispatch_policy_stream_k , param_optimizer_dummy , param_optimizer_decision_tree
}
 
enum class  param_optimizer_tag : uint8_t { kernel , work_group }
 
enum class  param_adaptor_tag : uint8_t { kernel , work_group_gemm , work_group_epilogue }
 

Functions

template<uint8_t NbarCount>
__XETLA_API void xetla_nbarrier_init ()
 Initialize the number of named barrier index for a kernel.
 
__XETLA_API void named_barrier_signal (uint8_t barrier_id, uint8_t producer_consumer_mode, uint32_t num_producers, uint32_t num_consumers)
 Perform signal operation for the given named barrier id.
 
__XETLA_API void named_barrier_wait (uint8_t barrier_id)
 Wait on a named barrier.
 
template<typename T0 , typename T1 , int SZ, typename U , class Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T0, SZ > xetla_shl (xetla_vector< T1, SZ > src0, U src1, Sat sat={})
 Shift left operation (vector version)
 
template<typename T0 , typename T1 , typename T2 , class Sat = xetla_saturation_off_tag>
std::remove_const< T0 >::type xetla_shl (T1 src0, T2 src1, Sat sat={})
 Shift left operation (scalar version)
 
template<typename T0 , typename T1 , int SZ, typename U , class Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T0, SZ > xetla_shr (xetla_vector< T1, SZ > src0, U src1, Sat sat={})
 Shift right operation (vector version)
 
template<typename T0 , typename T1 , typename T2 , class Sat = xetla_saturation_off_tag>
__XETLA_API std::remove_const< T0 >::type xetla_shr (T1 src0, T2 src1, Sat sat={})
 Shift right operation (scalar version)
 
template<typename T0 , typename T1 , int SZ>
__XETLA_API xetla_vector< T0, SZ > xetla_rol (xetla_vector< T1, SZ > src0, xetla_vector< T1, SZ > src1)
 Rotate left operation with two vector inputs.
 
template<typename T0 , typename T1 , int SZ, typename U >
__XETLA_API std::enable_if_t< std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< U >::value &&is_xetla_scalar< U >::value, xetla_vector< T0, SZ > > xetla_rol (xetla_vector< T1, SZ > src0, U src1)
 Rotate left operation with a vector and a scalar inputs.
 
template<typename T0 , typename T1 , typename T2 >
__XETLA_API std::enable_if_t< std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< T2 >::value, remove_const_t< T0 > > xetla_rol (T1 src0, T2 src1)
 Rotate left operation with two scalar inputs.
 
template<typename T0 , typename T1 , int SZ>
__XETLA_API xetla_vector< T0, SZ > xetla_ror (xetla_vector< T1, SZ > src0, xetla_vector< T1, SZ > src1)
 Rotate right operation with two vector inputs.
 
template<typename T0 , typename T1 , int SZ, typename U >
__XETLA_API std::enable_if_t< std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< U >::value &&is_xetla_scalar< U >::value, xetla_vector< T0, SZ > > xetla_ror (xetla_vector< T1, SZ > src0, U src1)
 Rotate right operation with a vector and a scalar inputs.
 
template<typename T0 , typename T1 , typename T2 >
__XETLA_API std::enable_if_t< std::is_integral< T0 >::value &&std::is_integral< T1 >::value &&std::is_integral< T2 >::value, remove_const_t< T0 > > xetla_ror (T1 src0, T2 src1)
 Rotate right operation with two scalar inputs.
 
template<typename T0 , typename T1 , int SZ, typename U , class Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T0, SZ > xetla_lsr (xetla_vector< T1, SZ > src0, U src1, Sat sat={})
 Logical Shift Right (vector version)
 
template<typename T0 , typename T1 , typename T2 , class Sat = xetla_saturation_off_tag>
__XETLA_API std::remove_const< T0 >::type xetla_lsr (T1 src0, T2 src1, Sat sat={})
 Logical Shift Right (scalar version)
 
template<typename T0 , typename T1 , int SZ, typename U , class Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T0, SZ > xetla_asr (xetla_vector< T1, SZ > src0, U src1, Sat sat={})
 Arithmetical Shift Right (vector version)
 
template<typename T0 , typename T1 , typename T2 , class Sat = xetla_saturation_off_tag>
__XETLA_API std::remove_const< T0 >::type xetla_asr (T1 src0, T2 src1, Sat sat={})
 Arithmetical Shift Right (scalar version)
 
template<int N>
__XETLA_API uint32_t xetla_pack_mask (xetla_mask< N > src0)
 Pack a xetla_mask into a single unsigned 32-bit integer value.
 
template<int N>
__XETLA_API xetla_mask< N > xetla_unpack_mask (uint32_t src0)
 Unpack an unsigned 32-bit integer value into a xetla_mask.
 
void xetla_wait (uint16_t val)
 
template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t<!(is_internal_type< T_dst >::value) &&!(is_internal_type< T_src >::value), xetla_vector< T_dst, N > > xetla_cvt (xetla_vector< T_src, N > src)
 xetla explicit data conversion for standard data types(integer,float,half)
 
template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, bf16 >::value &&std::is_same< T_src, float >::value, xetla_vector< T_dst, N > > xetla_cvt (xetla_vector< T_src, N > src)
 xetla explicit data conversion, fp32->bf16.
 
template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, float >::value &&std::is_same< T_src, bf16 >::value, xetla_vector< T_dst, N > > xetla_cvt (xetla_vector< T_src, N > src)
 xetla explicit data conversion, bf16->fp32.
 
template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, tf32 >::value &&std::is_same< T_src, float >::value, xetla_vector< T_dst, N > > xetla_cvt (xetla_vector< T_src, N > src)
 xetla explicit data conversion, fp32->tf32.
 
template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, float >::value &&std::is_same< T_src, tf32 >::value, xetla_vector< T_dst, N > > xetla_cvt (xetla_vector< T_src, N > src)
 xetla explicit data conversion, tf32->fp32.
 
template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, fp16 >::value &&std::is_same< T_src, int32_t >::value, xetla_vector< T_dst, N > > xetla_cvt (xetla_vector< T_src, N > src, float scaling_value)
 xetpp explicit data conversion with scaling, int32->fp16.
 
template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, int8_t >::value &&std::is_same< T_src, int32_t >::value, xetla_vector< T_dst, N > > xetla_cvt (xetla_vector< T_src, N > src, float scaling_value)
 xetpp explicit data conversion with re-quantization, int32->int8.
 
template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, int8_t >::value &&std::is_same< T_src, float >::value, xetla_vector< T_dst, N > > xetla_cvt (xetla_vector< T_src, N > src, float scaling_value)
 xetpp explicit data conversion with scaling and quantization, float32->int8.
 
template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, T_src >::value &&is_internal_type< T_src >::value, xetla_vector< T_dst, N > > xetla_cvt (xetla_vector< T_src, N > src)
 xetla explicit data conversion, same type.
 
template<typename T0 , typename T1 , int SZ>
__XETLA_API xetla_vector< T0, SZ > xetla_abs (xetla_vector< T1, SZ > src0)
 Get absolute value (vector version)
 
template<typename T0 , typename T1 >
std::enable_if_t<!std::is_same< remove_const_t< T0 >, remove_const_t< T1 > >::value, remove_const_t< T0 > > __XETLA_API xetla_abs (T1 src0)
 Get absolute value (scalar version)
 
template<typename T1 , int SZ>
__XETLA_API xetla_vector< T1, SZ > xetla_abs (xetla_vector< T1, SZ > src0)
 Get absolute value (vector version).
 
template<typename T1 >
__XETLA_API std::remove_const< T1 >::type xetla_abs (T1 src0)
 Get absolute value (scalar version).
 
template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ > xetla_max (xetla_vector< T, SZ > src0, xetla_vector< T, SZ > src1, Sat sat={})
 Selects component-wise the maximum of the two vectors.
 
template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ > xetla_max (xetla_vector< T, SZ > src0, T src1, Sat sat={})
 Selects maximums for each element of the input vector and a scalar.
 
template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ > xetla_max (T src0, xetla_vector< T, SZ > src1, Sat sat={})
 Selects maximums for each element of the input scalar and a vector.
 
template<typename T , typename Sat = xetla_saturation_off_tag>
__XETLA_APIxetla_max (T src0, T src1, Sat sat={})
 Selects maximum between two scalar values.
 
template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ > xetla_min (xetla_vector< T, SZ > src0, xetla_vector< T, SZ > src1, Sat sat={})
 Selects component-wise the minimum of the two vectors.
 
template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ > xetla_min (xetla_vector< T, SZ > src0, T src1, Sat sat={})
 Selects minimums for each element of the input vector and a scalar.
 
template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ > xetla_min (T src0, xetla_vector< T, SZ > src1, Sat sat={})
 Selects minimums for each element of the input scalar and a vector.
 
template<typename T , typename Sat = xetla_saturation_off_tag>
__XETLA_APIxetla_min (T src0, T src1, Sat sat={})
 Selects minimum between two scalar values.
 
template<class T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ > xetla_exp (xetla_vector< T, SZ > src, Sat sat={})
 Calculate exponent value for each element of the input vector, the base is e.
 
template<class T , typename Sat = xetla_saturation_off_tag>
__XETLA_APIxetla_exp (T src, Sat sat={})
 Calculate exponent value of a scalar, the base is e.
 
template<class T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ > xetla_exp2 (xetla_vector< T, SZ > src, Sat sat={})
 Calculate exponent value for each element of the input vector, the base is 2.
 
template<class T , typename Sat = xetla_saturation_off_tag>
__XETLA_APIxetla_exp2 (T src, Sat sat={})
 Calculate exponent value of a scalar, the base is 2.
 
template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ > xetla_inv (xetla_vector< T, SZ > src, Sat sat={})
 Calculate the inversion, i.e.
 
template<typename T , typename Sat = xetla_saturation_off_tag>
__XETLA_APIxetla_inv (T src, Sat sat={})
 Calculate the inversion, i.e.
 
template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ > xetla_sqrt (xetla_vector< T, SZ > src, Sat sat={})
 Calculate the square root, i.e.
 
template<typename T , typename Sat = xetla_saturation_off_tag>
__XETLA_APIxetla_sqrt (T src, Sat sat={})
 Calculate the square root, i.e.
 
template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ > xetla_sqrt_ieee (xetla_vector< T, SZ > src, Sat sat={})
 Calculate the square root, i.e.
 
template<typename T , typename Sat = xetla_saturation_off_tag>
__XETLA_APIxetla_sqrt_ieee (T src, Sat sat={})
 Calculate the square root, i.e.
 
template<typename T , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, SZ > xetla_rsqrt (xetla_vector< T, SZ > src, Sat sat={})
 Calculate the inversion of square root, i.e.
 
template<typename T , typename Sat = xetla_saturation_off_tag>
__XETLA_APIxetla_rsqrt (T src, Sat sat={})
 Calculate the inversion of square root, i.e.
 
template<typename T , int SZ>
__XETLA_API xetla_vector< T, SZ > xetla_tanh (xetla_vector< T, SZ > src)
 Calculate the tanh (vector version).
 
template<typename T >
__XETLA_APIxetla_tanh (T src)
 Calculate the tanh (scalar version).
 
template<typename T , int SZ>
__XETLA_API xetla_vector< T, SZ > xetla_add_c (xetla_vector< T, SZ > src0, xetla_vector< T, SZ > src1, xetla_vector_ref< T, SZ > __REF__ carry)
 Add two unsigned integer vectors, return the result and in-place update the carry.
 
template<typename T , int SZ>
__XETLA_API xetla_vector< T, SZ > xetla_add_c (xetla_vector< T, SZ > src0, T src1, xetla_vector_ref< T, SZ > __REF__ carry)
 Add one unsigned integer vectors with a scalar, return the result and in-place update the carry.
 
template<typename T0 , typename T1 , typename T2 , int SZ>
__XETLA_API xetla_vector< T0, SZ > xetla_imul (xetla_vector_ref< T0, SZ > __REF__ lo, xetla_vector< T1, SZ > src0, T2 src1)
 Multiply src0 with src1, return the hi part and in-place update the lo part.
 
template<typename T0 , typename T1 , int SZ, reduce_op BinaryOperation>
__XETLA_API T0 xetla_reduce (xetla_vector< T1, SZ > v)
 Performs reduction over elements of the input vector.
 
template<typename T , int SZ>
__XETLA_API xetla_vector< T, SZ > xetla_rnde (xetla_vector< T, SZ > src0)
 Get rounded value.
 
template<typename T1 , typename T0 , int SZ, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T1, SZ > xetla_add (xetla_vector< T0, SZ > src0, xetla_vector< T0, SZ > src1, Sat sat={})
 Adds two vectors with saturation The source operands must be both of floating-point type.
 
template<typename T1 , typename T0 , int SZ>
__XETLA_API xetla_vector< T1, SZ > xetla_sat (xetla_vector< T0, SZ > src)
 Saturation function.
 
template<argument_type src1_precision, argument_type src2_precision, int systolic_depth, int repeat_count, typename T , typename T1 , typename T2 , int N, int N1, int N2, typename Sat = xetla_saturation_off_tag>
__XETLA_API xetla_vector< T, N > xetla_mma (xetla_vector< T, N > src0, xetla_vector< T1, N1 > src1, xetla_vector< T2, N2 > src2, Sat sat={})
 description of xetla mma perform matrix multiply add operation
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached, int N>
__XETLA_API void xetla_prefetch_global (Ty *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1)
 Stateless scattered prefetch.
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached>
__XETLA_API void xetla_prefetch_global (Ty *p, uint64_t offset=0)
 Stateless block prefetch (transposed gather with 1 channel).
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, int N, typename Toffset = uint32_t>
__XETLA_API xetla_vector< Ty, N *NElts > xetla_load_global (Ty *p, xetla_vector< Toffset, N > offsets, xetla_mask< N > pred=1)
 Stateless scattered load.
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< Ty, NElts > xetla_load_global (Ty *p, uint64_t offset=0)
 Stateless block load (transposed gather with 1 channel).
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, int N, typename Toffset = uint32_t>
__XETLA_API void xetla_store_global (Ty *p, xetla_vector< Toffset, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1)
 Stateless scattered store.
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API void xetla_store_global (Ty *p, uint64_t offset, xetla_vector< Ty, NElts > vals)
 Stateless block store (transposed scatter with 1 channel).
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< T, N > xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred)
 Stateless scattered atomic (0 src).
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< T, N > xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_mask< N > pred)
 Stateless scattered atomic (1 src).
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none>
__XETLA_API xetla_vector< T, N > xetla_atomic_global (T *p, xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_vector< T, N > src1, xetla_mask< N > pred)
 Stateless scattered atomic (2 src).
 
template<uint32_t SLMSize>
__XETLA_API void xetla_local_init ()
 Declare per-work-group slm size.
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, int N>
__XETLA_API xetla_vector< Ty, N *NElts > xetla_load_local (xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred=1)
 SLM scattered load.
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< Ty, NElts > xetla_load_local (uint32_t offset)
 SLM block load.
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size, int N>
__XETLA_API void xetla_store_local (xetla_vector< uint32_t, N > offsets, xetla_vector< Ty, N *NElts > vals, xetla_mask< N > pred=1)
 SLM scattered store.
 
template<typename Ty , uint8_t NElts = 1, data_size DS = data_size::default_size>
__XETLA_API void xetla_store_local (uint32_t offset, xetla_vector< Ty, NElts > vals)
 SLM block store (transposed SLM scatter with 1 channel).
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< T, N > xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_mask< N > pred)
 SLM scattered atomic (0 src).
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< T, N > xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_mask< N > pred)
 SLM scattered atomic (1 src).
 
template<atomic_op Op, typename T , int N, data_size DS = data_size::default_size>
__XETLA_API xetla_vector< T, N > xetla_atomic_local (xetla_vector< uint32_t, N > offsets, xetla_vector< T, N > src0, xetla_vector< T, N > src1, xetla_mask< N > pred)
 SLM scattered atomic (2 src).
 
template<memory_kind Kind = memory_kind::untyped_global, fence_op FenceOp = fence_op::none, fence_scope Scope = fence_scope::group, int N = 16>
__XETLA_API void xetla_fence (xetla_mask< N > pred=1)
 Memory fence.
 
template<typename T1 , uint32_t n1, typename T2 , uint32_t n2, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numDst, uint8_t isEOT = 0, uint8_t isSendc = 0, int N = 16>
__XETLA_API void xetla_raw_send (xetla_vector_ref< T1, n1 > __REF__ msgDst, xetla_vector< T2, n2 > msgSrc0, uint32_t exDesc, uint32_t msgDesc, xetla_mask< N > mask=1)
 Raw send with one source operand and one destination operand.
 
template<typename T1 , uint32_t n1, typename T2 , uint32_t n2, typename T3 , uint32_t n3, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numSrc1, uint8_t numDst, uint8_t isEOT = 0, uint8_t isSendc = 0, int N = 16>
__XETLA_API void xetla_raw_send (xetla_vector_ref< T1, n1 > __REF__ msgDst, xetla_vector< T2, n2 > msgSrc0, xetla_vector< T3, n3 > msgSrc1, uint32_t exDesc, uint32_t msgDesc, xetla_mask< N > mask=1)
 Raw send with two source operands and one destination operand.
 
template<typename T1 , uint32_t n1, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t isEOT = 0, uint8_t isSendc = 0, int N = 16>
__XETLA_API void xetla_raw_send (xetla_vector< T1, n1 > msgSrc0, uint32_t exDesc, uint32_t msgDesc, xetla_mask< N > mask=1)
 Raw send with one source operand and no return.
 
template<typename T1 , uint32_t n1, typename T2 , uint32_t n2, uint8_t execSize, uint8_t sfid, uint8_t numSrc0, uint8_t numSrc1, uint8_t isEOT = 0, uint8_t isSendc = 0, int N = 16>
__XETLA_API void xetla_raw_send (xetla_vector< T1, n1 > msgSrc0, xetla_vector< T2, n2 > msgSrc1, uint32_t exDesc, uint32_t msgDesc, xetla_mask< N > mask=1)
 Raw send with two source operands and no return.
 
template<typename kernel_t >
void slm_barrier_init ()
 Initial the local memory size and named barrier count with kernel_t.
 
template<uint32_t slm_size, uint32_t nbarrier_count>
void slm_barrier_init ()
 Initial the local memory size and named barrier count.
 
int clz (int x)
 Host side utility function to compute number of leading zeros in the binary representation.
 
int find_log2 (int x)
 Host side utility function to compute log2 function.
 
__XETLA_API xetla_vector< uint32_t, 4 > get_time_stamp ()
 Returns time stamp.
 
template<typename Ty , int N>
__XETLA_API xetla_vector< Ty, N > xetla_vector_gen (int InitVal, int Step)
 xetla_vector generation.
 
template<uint32_t N>
__XETLA_API xetla_mask_int< N > xetla_mask_int_gen (uint32_t mask_val)
 
template<typename dtype_acc , uint32_t N, uint32_t num_flag = 4, typename dtype_mask = uint8_t>
__XETLA_API xetla_vector< dtype_acc, N > drop_out (xetla_vector< dtype_acc, N > in, xetla_vector< dtype_mask, N > mask, dtype_acc scale)
 
template<reduce_op reduce_kind, typename dtype , int size>
__XETLA_API std::enable_if_t< reduce_kind==reduce_op::sum, xetla_vector< dtype, size > > reduce_helper (xetla_vector< dtype, size > a, xetla_vector< dtype, size > b)
 
template<reduce_op reduce_kind, typename dtype , int size>
__XETLA_API std::enable_if_t< reduce_kind==reduce_op::prod, xetla_vector< dtype, size > > reduce_helper (xetla_vector< dtype, size > a, xetla_vector< dtype, size > b)
 
template<reduce_op reduce_kind, typename dtype , int size>
__XETLA_API std::enable_if_t< reduce_kind==reduce_op::max, xetla_vector< dtype, size > > reduce_helper (xetla_vector< dtype, size > a, xetla_vector< dtype, size > b)
 
template<reduce_op reduce_kind, typename dtype , int size>
__XETLA_API std::enable_if_t< reduce_kind==reduce_op::min, xetla_vector< dtype, size > > reduce_helper (xetla_vector< dtype, size > a, xetla_vector< dtype, size > b)
 
template<reduce_op reduce_kind, typename dtype , int N_x, int N_y>
__XETLA_API std::enable_if_t< N_y==1, xetla_vector< dtype, N_x > > recur_row_reduce (xetla_vector< dtype, N_x > in)
 
template<reduce_op reduce_kind, typename dtype , int N_x, int N_y>
__XETLA_API std::enable_if_t<(N_y > 1), xetla_vector< dtype, N_x > > recur_row_reduce (xetla_vector< dtype, N_x *N_y > in)
 
template<reduce_op reduce_kind, typename dtype , int N_x, int N_y>
__XETLA_API std::enable_if_t< N_x==1, xetla_vector< dtype, N_y > > recur_col_reduce (xetla_vector< dtype, N_y > in)
 
template<reduce_op reduce_kind, typename dtype , int N_x, int N_y>
__XETLA_API std::enable_if_t<(N_x > 1), xetla_vector< dtype, N_y > > recur_col_reduce (xetla_vector< dtype, N_x *N_y > in)
 
__XETLA_API uint32_t get_2d_group_linear_id (sycl::nd_item< 3 > &item)
 get linear group id of the last two dimensions.
 
template<typename Ty , uint32_t block_width = 1, uint32_t block_height = 1, uint8_t array_len = 1>
__XETLA_API void xetla_fill_tdesc (xetla_tdescriptor_ref tdesc, Ty *p, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y)
 Tensor descriptor construction(global memory version).
 
template<typename Ty >
__XETLA_API void xetla_fill_tdesc (xetla_tdescriptor_ref tdesc, uint32_t base_address, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y)
 Tensor descriptor construction(local memory version).
 
template<typename Ty , uint32_t block_width = 1, uint32_t block_height = 1, uint8_t array_len = 1>
__XETLA_API xetla_tdescriptor xetla_get_tdesc (Ty *p, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y)
 Generate a new tensor descriptor(global memory version).
 
template<typename Ty >
__XETLA_API xetla_tdescriptor xetla_get_tdesc (uint32_t base_address, int tensor_width, int tensor_height, int tensor_pitch, int offset_x, int offset_y)
 Generate a new tensor descriptor(local memory version).
 
__XETLA_API void xetla_update_tdesc_offsetx (xetla_tdescriptor_ref tdesc, int32_t doffset_x)
 Update the x coordinate in the given tensor descriptor.
 
__XETLA_API void xetla_update_tdesc_offsety (xetla_tdescriptor_ref tdesc, int32_t doffset_y)
 Update the y coordinate in the given tensor descriptor.
 
template<typename Ty , uint32_t N, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, bool transpose = false, bool transform = false, gpu_arch arch_tag = gpu_arch::Xe>
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, xetla_vector< Ty, N > > xetla_tload_global (xetla_tdescriptor tdesc)
 Tensor load API.
 
template<typename Ty , uint32_t N, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, gpu_arch arch_tag = gpu_arch::Xe>
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void > xetla_tstore_global (xetla_tdescriptor tdesc, xetla_vector< Ty, N > data)
 Tensor store API.
 
template<typename Ty , cache_hint L1H = cache_hint::cached, cache_hint L2H = cache_hint::cached, gpu_arch arch_tag = gpu_arch::Xe>
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void > xetla_tprefetch_global (xetla_tdescriptor tdesc)
 Tensor prefetch API.
 
template<typename Ty , uint32_t N, cache_hint L1H = cache_hint::none, cache_hint L2H = cache_hint::none, atomic_op Op, gpu_arch arch_tag = gpu_arch::Xe, typename Toffset = uint32_t>
__XETLA_API std::enable_if_t< arch_tag==gpu_arch::Xe, void > xetla_tatomic_store_global (uint64_t base_address, xetla_vector< Toffset, N > offset, xetla_vector< Ty, N > data, xetla_mask< N > pred=1)
 Tensor atomic store API.
 

Typedef Documentation

◆ default_param_t

using gpu::xetla::default_param_t = typedef dict_t<>::template update_dict_t< detail::param_dtype_bf16_bf16_bf16>::template update_dict_t<detail:: param_memlayout_rrr>::template update_dict_t<detail:: param_memalignment_8_8_8>::template update_dict_t<detail:: param_memspace_ggg>::template update_dict_t<detail:: param_performance_default>::template update_dict_t<detail:: param_runtime_default>:: template update_t<elem_t_t<tune_key::data_type_acc, float>, elem_v_t<tune_key::global_kslicing_ratio, 1UL, uint32_t>, elem_v_t<tune_key::local_kslicing_ratio, 1UL, uint32_t>, elem_t_t<tune_key::wg_tile_shape, shape<256, 256> >, elem_t_t<tune_key::sg_tile_shape, shape<64, 32> >, elem_v_t<tune_key::param_optimizer_type, tune_key_value::param_optimizer_dummy> >

◆ is_xetla_scalar

template<typename T >
using gpu::xetla::is_xetla_scalar = typedef typename __ESIMD_DNS::is_esimd_scalar<T>

◆ meta_type

template<typename T = impl::meta_impl_base::_default>
using gpu::xetla::meta_type = typedef impl::meta_type_impl<T>

◆ meta_type_t

template<typename T >
using gpu::xetla::meta_type_t = typedef typename T::type

◆ meta_value

template<auto d_ = impl::meta_impl_base_default, typename T = decltype(d_)>
using gpu::xetla::meta_value = typedef impl::meta_value_impl<T, d_>

◆ meta_value_t

template<auto d_ = impl::meta_impl_base_default, typename T = decltype(d_)>
using gpu::xetla::meta_value_t = typedef typename meta_value<d_, T>::type

Enumeration Type Documentation

◆ argument_type

enum class gpu::xetla::argument_type : uint8_t
strong

xetla dpas argument typ

Enumerator
U1 
S1 
U2 
S2 
U4 
S4 
U8 
S8 
BF16 
FP16 
TF32 
DF 
NUM_ARG_TYPES 

◆ atomic_op

enum class gpu::xetla::atomic_op : uint8_t
strong

Represents an atomic operation.

Operations always return the old value(s) of the target memory location(s) as it was before the operation was applied.

Enumerator
iinc 

Atomic increment of memory data and return the old value. see

idec 

Atomic decrement of memory data and return the old value. see

iadd 

Atomic signed int add of src1 from memory data and return the old value. see

isub 

Atomic signed int subtract of src1 from memory data and return the old value. see

smin 

Atomic store the signed int min of src1 and memory data and return the old value. see

smax 

Atomic store the signed int max of src1 and memory data and return the old value. see

cmpxchg 

Atomic bit-compare src1_X and memory data and replace if equal with src1_Y. Returns the old value. see

fadd 

Atomic float add of src1 from memory data and return the old value. see

fsub 

Atomic float subtract of src1 from memory data and return the old value. see

fmin 

Atomic store the float min of src1 and memory data and return the old value. see

fmax 

Atomic store the float max of src1 and memory data and return the old value. see

fcmpxchg 

Atomic float compare src1_X and memory data and replace if equal with src1_Y. Returns the old value. see

umin 

Atomic store the unsigned int min of src1 and memory data and return the old value. see

umax 

Atomic store the unsigned int max of src1 and memory data and return the old value. see

bit_and 

Atomic store the bitwise AND of src1 and memory data and return the old value. see

bit_or 

Atomic store the bitwise OR of src1 and memory data and return the old value. see

bit_xor 

Atomic store the bitwise XOR of src1 and memory data and return the old value. see

load 

Atomic read of the memory data value, without modifying the data. see

store 

Atomic store untyped data to memory. see

◆ cache_hint

enum class gpu::xetla::cache_hint : uint8_t
strong

L1 or L2 cache hint kinds.

Enumerator
none 
uncached 
cached 
write_back 
write_through 
streaming 
read_invalidate 

◆ data_size

enum class gpu::xetla::data_size : uint8_t
strong

Data size or format to read or store.

Enumerator
default_size 
u8 
u16 
u32 
u64 
u8u32 
u16u32 

load 8b, zero extend to 32b; store the opposite

u16u32h 

load 16b, zero extend to 32b; store the opposite

◆ fence_op

enum class gpu::xetla::fence_op : uint8_t
strong

The xetla_fence operation to apply to caches.

Enumerator
none 
evict 

no operation

invalidate 

dirty lines evicted and invalidated from L1

discard 

invalidate all clean lines

clean 

direct and clean lines are discarded w/o eviction

flushl2 

dirty lines are written to memory, but retained in cache

in clean state

◆ fence_scope

enum class gpu::xetla::fence_scope : uint8_t
strong

The scope that xetla_fence operation should apply to.

Enumerator
group 
local 

flush out to the threadgroup's scope

tile 

flush out to the local scope

gpu 

tile, flush out to several DSSs

gpus 

entire GPU, flush out to the GPUs LLC

system 

all GPUs in the system, flush out to memory shared by all GPUs

sysacq 

the entire system memory space

◆ gpu_arch

enum class gpu::xetla::gpu_arch : uint8_t
strong
Enumerator
Xe 

◆ grf_mode

enum class gpu::xetla::grf_mode : uint8_t
strong
Enumerator
normal 
double_grf 

◆ ln_bwd_fused_kind

enum class gpu::xetla::ln_bwd_fused_kind : uint8_t
strong
Enumerator
none 
bias_dropout_resAdd_ln 
ln_dropout_gradAdd 
ln_dropout 

◆ ln_fwd_fused_kind

enum class gpu::xetla::ln_fwd_fused_kind : uint8_t
strong
Enumerator
none 
bias_dropout_resAdd_ln 
ln_dropout 
bias_rng_dropout_resAdd_ln 
ln_rng_dropout 

◆ mem_layout

enum class gpu::xetla::mem_layout : uint8_t
strong
Enumerator
row_major 
col_major 

◆ mem_space

enum class gpu::xetla::mem_space : uint8_t
strong
Enumerator
global 
local 

◆ memory_kind

enum class gpu::xetla::memory_kind : uint8_t
strong

The specific LSC shared function to fence with xetla_fence.

Enumerator
untyped_global 
untyped_global_low_pri 

untyped global memory

typed_global 

low-priority untyped global memory

shared_local 

typed global memory

◆ memory_op

enum class gpu::xetla::memory_op : uint8_t
strong
Enumerator
load 
store 

◆ mma_engine

enum class gpu::xetla::mma_engine : uint8_t
strong
Enumerator
xmx 
fpu 

◆ msg_type

enum class gpu::xetla::msg_type : uint8_t
strong
Enumerator
block_2d 
block_1d 
scatter 
atomic_add 
unaligned_2d 

◆ offset_mode

enum class gpu::xetla::offset_mode : uint8_t
strong
Enumerator
const_offset 
cyclic_offset 
acyclic_offset 

◆ param_adaptor_tag

enum class gpu::xetla::param_adaptor_tag : uint8_t
strong
Enumerator
kernel 
work_group_gemm 
work_group_epilogue 

◆ param_optimizer_tag

enum class gpu::xetla::param_optimizer_tag : uint8_t
strong
Enumerator
kernel 
work_group 

◆ post_kind

enum class gpu::xetla::post_kind : uint8_t
strong
Enumerator
none 
relu 
gelu 
gelu_bwd_w 
sigmoid 
tanh 

◆ pre_kind

enum class gpu::xetla::pre_kind : uint8_t
strong
Enumerator
none 
bias_add 
res_add 

◆ reduce_op

enum class gpu::xetla::reduce_op : uint8_t
strong

xetla reduce op

Enumerator
sum 
prod 
min 
max 

◆ reduction_fused_kind

enum class gpu::xetla::reduction_fused_kind : uint8_t
strong
Enumerator
none 
bias_gelu_w_bwd 
bias_dropout_bwd 

◆ reg_layout

enum class gpu::xetla::reg_layout : uint8_t
strong

tile layout in register linear: linear layout with one tile tiled: 2d block stacked in raster order vnni_tiled: vnni pack with 2d block and 2d block stacked in raster order for dword and qword, there is no impact for word, two rows are interleaved, i.e.

a0 b0 c0 d0 ==> a0 a1 b0 b1 c0 c1 d0 d1 a1 b1 c1 d1 for byte, four rows are interleaved and formed one row, i.e. a0 b0 c0 d0 ==> a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3 d0 d1 d2 d3 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3

Enumerator
linear 
tiled 
vnni_tiled 
transpose_tiled 
vnni_tiled_col_major 

this is vnni tiled format, but for each block, they are stored in col major order

◆ store_op

enum class gpu::xetla::store_op : uint8_t
strong
Enumerator
normal 
atomic_fadd 
atomic_iadd 
scattered_transpose 
block_1d 

◆ tdesc_update_dir

enum class gpu::xetla::tdesc_update_dir : uint8_t
strong
Enumerator
x_dir 
y_dir 

◆ tune_key

enum class gpu::xetla::tune_key : uint8_t
strong
Enumerator
data_type_a 
memory_layout_a 
memory_alignment_a 
memory_space_a 
data_type_b 
memory_layout_b 
memory_alignment_b 
memory_space_b 
data_type_c 
memory_layout_c 
memory_alignment_c 
memory_space_c 
data_type_acc 
global_kslicing_ratio 
local_kslicing_ratio 
wg_tile_shape 
wg_tile_k 
sg_tile_shape 
pre_processing 
prefetch_distance 
periodic_sync_interval 
mma_engine 
gpu_arch 
epilogue_policy 
dispatch_policy 
group_swizzle_policy 
param_optimizer_type 
source_location 

◆ tune_key_value

enum class gpu::xetla::tune_key_value : uint8_t
strong
Enumerator
pre_processing_default 
pre_processing_mata_neg_filter 
dispatch_policy_default 
dispatch_policy_kslicing 
dispatch_policy_stream_k 
param_optimizer_dummy 
param_optimizer_decision_tree 

Function Documentation

◆ clz()

int gpu::xetla::clz ( int  x)
inline

Host side utility function to compute number of leading zeros in the binary representation.

◆ find_log2()

int gpu::xetla::find_log2 ( int  x)
inline

Host side utility function to compute log2 function.

◆ reduce_helper() [1/3]

template<reduce_op reduce_kind, typename dtype , int size>
__XETLA_API std::enable_if_t< reduce_kind==reduce_op::prod, xetla_vector< dtype, size > > gpu::xetla::reduce_helper ( xetla_vector< dtype, size >  a,
xetla_vector< dtype, size >  b 
)

◆ reduce_helper() [2/3]

template<reduce_op reduce_kind, typename dtype , int size>
__XETLA_API std::enable_if_t< reduce_kind==reduce_op::max, xetla_vector< dtype, size > > gpu::xetla::reduce_helper ( xetla_vector< dtype, size >  a,
xetla_vector< dtype, size >  b 
)

◆ reduce_helper() [3/3]

template<reduce_op reduce_kind, typename dtype , int size>
__XETLA_API std::enable_if_t< reduce_kind==reduce_op::min, xetla_vector< dtype, size > > gpu::xetla::reduce_helper ( xetla_vector< dtype, size >  a,
xetla_vector< dtype, size >  b 
)

◆ slm_barrier_init() [1/2]

template<typename kernel_t >
void gpu::xetla::slm_barrier_init ( )

Initial the local memory size and named barrier count with kernel_t.

Template Parameters
kernel_tIs XeTLA kernel level functor.

◆ slm_barrier_init() [2/2]

template<uint32_t slm_size, uint32_t nbarrier_count>
void gpu::xetla::slm_barrier_init ( )

Initial the local memory size and named barrier count.

Template Parameters
slm_sizeIs the local memory size required.
nbarrier_countIs the named barrier count required.

◆ xetla_cvt() [1/7]

template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, bf16 >::value &&std::is_same< T_src, float >::value, xetla_vector< T_dst, N > > gpu::xetla::xetla_cvt ( xetla_vector< T_src, N >  src)

xetla explicit data conversion, fp32->bf16.

Template Parameters
T_dstis the float32 data type.
T_srcis the bfloat16 data type.
Nis the element number in xetla_vector.

◆ xetla_cvt() [2/7]

template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, float >::value &&std::is_same< T_src, bf16 >::value, xetla_vector< T_dst, N > > gpu::xetla::xetla_cvt ( xetla_vector< T_src, N >  src)

xetla explicit data conversion, bf16->fp32.

Template Parameters
T_dstis the bfloat16 data type.
T_srcis the float32 data type.
Nis the element number in xetla_vector.

◆ xetla_cvt() [3/7]

template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, tf32 >::value &&std::is_same< T_src, float >::value, xetla_vector< T_dst, N > > gpu::xetla::xetla_cvt ( xetla_vector< T_src, N >  src)

xetla explicit data conversion, fp32->tf32.

Template Parameters
T_dstis the float32 data type.
T_srcis the tensor_float32 data type.
Nis the element number in xetla_vector.

◆ xetla_cvt() [4/7]

template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, float >::value &&std::is_same< T_src, tf32 >::value, xetla_vector< T_dst, N > > gpu::xetla::xetla_cvt ( xetla_vector< T_src, N >  src)

xetla explicit data conversion, tf32->fp32.

Template Parameters
T_dstis the tensor_float32 data type.
T_srcis the float32 data type.
Nis the element number in xetla_vector.

◆ xetla_cvt() [5/7]

template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, T_src >::value &&is_internal_type< T_src >::value, xetla_vector< T_dst, N > > gpu::xetla::xetla_cvt ( xetla_vector< T_src, N >  src)

xetla explicit data conversion, same type.

Template Parameters
T_dstis the dst data type.
T_srcis the src data type.
Nis the element number in xetla_vector.

◆ xetla_cvt() [6/7]

template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, int8_t >::value &&std::is_same< T_src, int32_t >::value, xetla_vector< T_dst, N > > gpu::xetla::xetla_cvt ( xetla_vector< T_src, N >  src,
float  scaling_value 
)

xetpp explicit data conversion with re-quantization, int32->int8.

Template Parameters
T_dstis the int32 data type.
T_srcis the int8 data type.
Nis the element number in xetla_vector.

◆ xetla_cvt() [7/7]

template<typename T_dst , typename T_src , int N>
__XETLA_API std::enable_if_t< std::is_same< T_dst, int8_t >::value &&std::is_same< T_src, float >::value, xetla_vector< T_dst, N > > gpu::xetla::xetla_cvt ( xetla_vector< T_src, N >  src,
float  scaling_value 
)

xetpp explicit data conversion with scaling and quantization, float32->int8.

Template Parameters
T_dstis the int8 data type.
T_srcis the float32 data type.
Nis the element number in xetla_vector.

◆ xetla_wait()

void gpu::xetla::xetla_wait ( uint16_t  val)
inline