DPC++ Runtime
Runtime libraries for oneAPI DPC++
cl::sycl::ext::oneapi::detail Namespace Reference

Classes

class  __sycl_reduction_aux_kernel
 
class  __sycl_reduction_main_kernel
 These are the forward declaration for the classes that help to create names for additional kernels. More...
 
struct  AreAllButLastReductions
 Predicate returning true if all template type parameters except the last one are reductions. More...
 
struct  AreAllButLastReductions< T >
 Helper specialization of AreAllButLastReductions for one element only. More...
 
class  atomic_ref_base
 
class  atomic_ref_impl
 
class  atomic_ref_impl< T *, DefaultOrder, DefaultScope, AddressSpace >
 
class  atomic_ref_impl< T, DefaultOrder, DefaultScope, AddressSpace, typename detail::enable_if_t< std::is_floating_point< T >::value > >
 
class  atomic_ref_impl< T, DefaultOrder, DefaultScope, AddressSpace, typename detail::enable_if_t< std::is_integral< T >::value > >
 
struct  bit_equal
 
struct  bit_equal< double >
 
struct  bit_equal< float >
 
struct  bit_equal< T, typename detail::enable_if_t< std::is_integral< T >::value > >
 
class  combiner
 Use CRTP to avoid redefining shorthand operators in terms of combine. More...
 
class  default_reduction_algorithm
 Types representing specific reduction algorithms Enables reduction_impl_algo to take additional algorithm-specific templates. More...
 
struct  EmptyReductionPredicate
 
class  filter_selector_impl
 
struct  FilterElement
 
struct  get_reduction_aux_kernel_name_t
 
struct  get_reduction_aux_kernel_name_t< sycl::detail::auto_name, Type, B1, B2, T3 >
 
struct  get_reduction_main_kernel_name_t
 Helper structs to get additional kernel name types based on given Name and additional template parameters helping to distinguish kernels. More...
 
struct  get_reduction_main_kernel_name_t< sycl::detail::auto_name, Type, B1, B2, T3 >
 
struct  is_bf16_storage_type
 
struct  is_bf16_storage_type< uint16_t >
 
struct  is_bf16_storage_type< uint32_t >
 
struct  is_bf16_storage_type< vec< uint16_t, N > >
 
struct  is_bf16_storage_type< vec< uint32_t, N > >
 
struct  IsArrayReduction
 
struct  IsNonUsmReductionPredicate
 
struct  IsScalarReduction
 
struct  IsValidAtomicRefType
 
struct  joint_matrix_load_impl
 
struct  joint_matrix_load_impl< T, Use, NumRows, NumCols, Layout, Space, typename std::enable_if_t< Layout==sycl::ext::oneapi::experimental::matrix::matrix_layout::row_major||Layout==sycl::ext::oneapi::experimental::matrix::matrix_layout::col_major > >
 
struct  joint_matrix_mad_impl
 
struct  joint_matrix_mad_impl< T1, T2, M, K, N, LayoutA, LayoutB, LayoutC, typename std::enable_if_t<(LayoutA==sycl::ext::oneapi::experimental::matrix::matrix_layout::row_major||LayoutA==sycl::ext::oneapi::experimental::matrix::matrix_layout::col_major) &&(LayoutB==sycl::ext::oneapi::experimental::matrix::matrix_layout::row_major||LayoutB==sycl::ext::oneapi::experimental::matrix::matrix_layout::col_major) &&(LayoutC==sycl::ext::oneapi::experimental::matrix::matrix_layout::row_major||LayoutC==sycl::ext::oneapi::experimental::matrix::matrix_layout::col_major)> >
 
struct  joint_matrix_store_impl
 
struct  joint_matrix_store_impl< T, NumRows, NumCols, Layout, Space, typename std::enable_if_t< Layout==sycl::ext::oneapi::experimental::matrix::matrix_layout::row_major||Layout==sycl::ext::oneapi::experimental::matrix::matrix_layout::col_major > >
 
struct  memory_order_traits
 
struct  memory_order_traits< memory_order::acq_rel >
 
struct  memory_order_traits< memory_order::relaxed >
 
struct  memory_order_traits< memory_order::seq_cst >
 
class  reducer
 Class that is used to represent objects that are passed to user's lambda functions and representing users' reduction variable. More...
 
class  reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&Extent==1 &&View==false &&!IsKnownIdentityOp< T, BinaryOperation >::value > >
 Specialization of the generic class 'reducer'. More...
 
class  reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&Extent==1 &&View==false &&IsKnownIdentityOp< T, BinaryOperation >::value > >
 Specialization of the generic class 'reducer'. More...
 
class  reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==0 &&View==true > >
 Component of 'reducer' class for array reductions, representing a single element of the span (as returned by the subscript operator). More...
 
class  reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==1 &&View==false &&!IsKnownIdentityOp< T, BinaryOperation >::value > >
 Specialization of 'reducer' class for array reductions exposing the subscript operator. More...
 
class  reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, enable_if_t< Dims==1 &&View==false &&IsKnownIdentityOp< T, BinaryOperation >::value > >
 Specialization of 'reducer' class for array reductions accepting a span in cases where the identity value is known. More...
 
struct  ReducerTraits
 Helper class for accessing reducer-defined types in CRTP May prove to be useful for other things later. More...
 
struct  ReducerTraits< reducer< T, BinaryOperation, Dims, Extent, Algorithm, View, Subst > >
 
class  reduction_impl
 This class encapsulates the reduction variable/accessor, the reduction operator and an optional operator identity. More...
 
class  reduction_impl_algo
 Templated class for implementations of specific reduction algorithms. More...
 
class  reduction_impl_algo< T, BinaryOperation, Dims, Extent, default_reduction_algorithm< IsUSM, IsPlaceholder, AccessorDims > >
 Original reduction algorithm is the default. More...
 
class  reduction_impl_base
 Base non-template class which is a base class for all reduction implementation classes. More...
 
class  reduction_impl_common
 Templated class for common functionality of all reduction implementation classes. More...
 

Typedefs

using memory_order = cl::sycl::ext::oneapi::memory_order
 
using memory_scope = cl::sycl::ext::oneapi::memory_scope
 
template<cl::sycl::access::address_space AS>
using IsValidAtomicAddressSpace = bool_constant< AS==access::address_space::global_space||AS==access::address_space::local_space||AS==access::address_space::global_device_space >
 
template<memory_order Order>
using IsValidDefaultOrder = bool_constant< Order==memory_order::relaxed||Order==memory_order::acq_rel||Order==memory_order::seq_cst >
 
template<typename T , class BinaryOperation >
using IsReduOptForFastAtomicFetch = bool_constant< sycl::detail::is_sgeninteger< T >::value &&sycl::detail::IsValidAtomicType< T >::value &&(sycl::detail::IsPlus< T, BinaryOperation >::value||sycl::detail::IsMinimum< T, BinaryOperation >::value||sycl::detail::IsMaximum< T, BinaryOperation >::value||sycl::detail::IsBitOR< T, BinaryOperation >::value||sycl::detail::IsBitXOR< T, BinaryOperation >::value||sycl::detail::IsBitAND< T, BinaryOperation >::value)>
 
template<typename T , class BinaryOperation >
using IsReduOptForAtomic64Add = bool_constant< sycl::detail::IsPlus< T, BinaryOperation >::value &&sycl::detail::is_sgenfloat< T >::value &&(sizeof(T)==4||sizeof(T)==8)>
 
template<typename T , class BinaryOperation >
using IsReduOptForFastReduce = bool_constant<((sycl::detail::is_sgeninteger< T >::value &&(sizeof(T)==4||sizeof(T)==8))||sycl::detail::is_sgenfloat< T >::value) &&(sycl::detail::IsPlus< T, BinaryOperation >::value||sycl::detail::IsMinimum< T, BinaryOperation >::value||sycl::detail::IsMaximum< T, BinaryOperation >::value)>
 
template<typename... Ts>
using ReduTupleT = sycl::detail::tuple< Ts... >
 
typedef struct sycl::detail::device_filter filter
 

Functions

template<typename KernelName , typename KernelType , int Dims, class Reduction >
void reduCGFunc (handler &CGH, KernelType KernelFunc, const range< Dims > &Range, size_t MaxWGSize, uint32_t NumConcurrentWorkGroups, Reduction &Redu)
 
template<typename KernelName , typename KernelType , int Dims, class Reduction >
enable_if_t< Reduction::has_atomic_add_float64 > reduCGFuncAtomic64 (handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, Reduction &Redu)
 
template<typename KernelName , typename KernelType , int Dims, class Reduction >
enable_if_t< Reduction::has_fast_atomics > reduCGFunc (handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, Reduction &Redu)
 
template<typename KernelName , typename KernelType , int Dims, class Reduction >
enable_if_t<!Reduction::has_fast_atomics > reduCGFunc (handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, Reduction &Redu)
 
template<typename KernelName , typename KernelType , class Reduction >
enable_if_t<!Reduction::has_fast_atomics, size_t > reduAuxCGFunc (handler &CGH, size_t NWorkItems, size_t MaxWGSize, Reduction &Redu)
 Implements a command group function that enqueues a kernel that does one iteration of reduction of elements in each of work-groups. More...
 
template<typename KernelName , typename KernelType , int Dims, typename... Reductions, size_t... Is>
void reduCGFunc (handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
 
template<typename KernelName , typename KernelType , typename... Reductions, size_t... Is>
size_t reduAuxCGFunc (handler &CGH, size_t NWorkItems, size_t MaxWGSize, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
 
template<typename KernelName , class Reduction >
std::enable_if_t<!Reduction::is_usm > reduSaveFinalResultToUserMem (handler &CGH, Reduction &Redu)
 Copies the final reduction result kept in read-write accessor to user's accessor. More...
 
template<typename KernelName , class Reduction >
std::enable_if_t< Reduction::is_usm > reduSaveFinalResultToUserMem (handler &CGH, Reduction &Redu)
 Copies the final reduction result kept in read-write accessor to user's USM memory. More...
 
template<typename... Reduction, size_t... Is>
std::shared_ptr< eventreduSaveFinalResultToUserMem (std::shared_ptr< detail::queue_impl > Queue, bool IsHost, std::tuple< Reduction... > &ReduTuple, std::index_sequence< Is... >)
 Creates additional kernels that copy the accumulated/final results from reductions accessors to either user's accessor or user's USM memory. More...
 
template<typename Reduction , typename... RestT>
std::enable_if_t<!Reduction::is_usm > reduSaveFinalResultToUserMemHelper (std::vector< event > &Events, std::shared_ptr< detail::queue_impl > Queue, bool IsHost, Reduction &Redu, RestT... Rest)
 
uint32_t reduGetMaxNumConcurrentWorkGroups (std::shared_ptr< queue_impl > Queue)
 
size_t reduGetMaxWGSize (std::shared_ptr< queue_impl > Queue, size_t LocalMemBytesPerWorkItem)
 
template<typename... ReductionT, size_t... Is>
size_t reduGetMemPerWorkItem (std::tuple< ReductionT... > &ReduTuple, std::index_sequence< Is... >)
 
template<typename TupleT , std::size_t... Is>
std::tuple< std::tuple_element_t< Is, TupleT >... > tuple_select_elements (TupleT Tuple, std::index_sequence< Is... >)
 Utility function: for the given tuple. More...
 
 __SYCL2020_DEPRECATED ("use 'sycl::detail::getStdMemoryOrder(sycl::memory_order)' instead") static inline const expr std
 
constexpr memory_order getLoadOrder (memory_order order)
 
template<sycl::ext::oneapi::experimental::matrix::matrix_layout Layout>
constexpr int get_layout_id ()
 
template<>
constexpr int get_layout_id< sycl::ext::oneapi::experimental::matrix::matrix_layout::row_major > ()
 
template<>
constexpr int get_layout_id< sycl::ext::oneapi::experimental::matrix::matrix_layout::col_major > ()
 
template<sycl::ext::oneapi::experimental::matrix::matrix_layout LayoutA, sycl::ext::oneapi::experimental::matrix::matrix_layout LayoutB>
constexpr int get_layout_pair_id ()
 
template<>
constexpr int get_layout_pair_id< sycl::ext::oneapi::experimental::matrix::matrix_layout::row_major, sycl::ext::oneapi::experimental::matrix::matrix_layout::row_major > ()
 
template<>
constexpr int get_layout_pair_id< sycl::ext::oneapi::experimental::matrix::matrix_layout::row_major, sycl::ext::oneapi::experimental::matrix::matrix_layout::col_major > ()
 
template<>
constexpr int get_layout_pair_id< sycl::ext::oneapi::experimental::matrix::matrix_layout::col_major, sycl::ext::oneapi::experimental::matrix::matrix_layout::row_major > ()
 
template<>
constexpr int get_layout_pair_id< sycl::ext::oneapi::experimental::matrix::matrix_layout::col_major, sycl::ext::oneapi::experimental::matrix::matrix_layout::col_major > ()
 
template<typename... Ts>
ReduTupleT< Ts... > makeReduTupleT (Ts... Elements)
 
size_t reduComputeWGSize (size_t NWorkItems, size_t MaxWGSize, size_t &NWorkGroups)
 
template<typename KernelFunc , int Dims, typename ReducerT >
void reductionLoop (const range< Dims > &Range, ReducerT &Reducer, const nd_item< 1 > &NdId, KernelFunc &F)
 Called in device code. More...
 
template<typename KernelName , typename KernelType , int Dims, class Reduction >
std::enable_if_t< Reduction::has_fast_atomics > reduCGFuncImpl (handler &CGH, KernelType KernelFunc, const range< Dims > &Range, const nd_range< 1 > &NDRange, Reduction &Redu)
 
template<typename KernelName , typename KernelType , int Dims, class Reduction >
std::enable_if_t<!Reduction::has_fast_atomics &&Reduction::has_fast_reduce > reduCGFuncImpl (handler &CGH, KernelType KernelFunc, const range< Dims > &Range, const nd_range< 1 > &NDRange, Reduction &Redu)
 
template<typename KernelName , typename KernelType , int Dims, class Reduction >
std::enable_if_t<!Reduction::has_fast_atomics &&!Reduction::has_fast_reduce > reduCGFuncImpl (handler &CGH, KernelType KernelFunc, const range< Dims > &Range, const nd_range< 1 > &NDRange, Reduction &Redu)
 
template<typename KernelName , typename KernelType , int Dims, class Reduction , bool IsPow2WG>
enable_if_t< Reduction::has_fast_reduce &&Reduction::has_fast_atomics > reduCGFuncImpl (handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, Reduction &, typename Reduction::rw_accessor_type Out)
 Implements a command group function that enqueues a kernel that calls user's lambda function KernelFunc and also does one iteration of reduction of elements computed in user's lambda function. More...
 
template<typename KernelName , typename KernelType , int Dims, class Reduction , bool IsPow2WG>
enable_if_t<!Reduction::has_fast_reduce &&Reduction::has_fast_atomics > reduCGFuncImpl (handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, Reduction &, typename Reduction::rw_accessor_type Out)
 Implements a command group function that enqueues a kernel that calls user's lambda function KernelFunc and also does one iteration of reduction of elements computed in user's lambda function. More...
 
template<typename KernelName , typename KernelType , int Dims, class Reduction , bool IsPow2WG>
enable_if_t< Reduction::has_fast_reduce &&!Reduction::has_fast_atomics > reduCGFuncImpl (handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, Reduction &Redu, typename Reduction::rw_accessor_type Out)
 Implements a command group function that enqueues a kernel that calls user's lambda function and does one iteration of reduction of elements in each of work-groups. More...
 
template<typename KernelName , typename KernelType , int Dims, class Reduction , bool IsPow2WG>
enable_if_t<!Reduction::has_fast_reduce &&!Reduction::has_fast_atomics > reduCGFuncImpl (handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, Reduction &Redu, typename Reduction::rw_accessor_type Out)
 Implements a command group function that enqueues a kernel that calls user's lambda function. More...
 
template<typename KernelName , typename KernelType , bool UniformWG, class Reduction , typename InputT , typename OutputT >
enable_if_t< Reduction::has_fast_reduce &&!Reduction::has_fast_atomics > reduAuxCGFuncImpl (handler &CGH, size_t NWorkItems, size_t NWorkGroups, size_t WGSize, Reduction &Redu, InputT In, OutputT Out)
 Implements a command group function that enqueues a kernel that does one iteration of reduction of elements in each of work-groups. More...
 
template<typename KernelName , typename KernelType , bool UniformPow2WG, class Reduction , typename InputT , typename OutputT >
enable_if_t<!Reduction::has_fast_reduce &&!Reduction::has_fast_atomics > reduAuxCGFuncImpl (handler &CGH, size_t NWorkItems, size_t NWorkGroups, size_t WGSize, Reduction &Redu, InputT In, OutputT Out)
 Implements a command group function that enqueues a kernel that does one iteration of reduction of elements in each of work-groups. More...
 
template<typename... Reductions, size_t... Is>
auto createReduLocalAccs (size_t Size, handler &CGH, std::index_sequence< Is... >)
 For the given 'Reductions' types pack and indices enumerating only the reductions for which a local accessors are needed, this function creates those local accessors and returns a tuple consisting of them. More...
 
template<bool IsOneWG, typename... Reductions, size_t... Is>
auto createReduOutAccs (size_t NWorkGroups, handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
 For the given 'Reductions' types pack and indices enumerating them this function either creates new temporary accessors for partial sums (if IsOneWG is false) or returns user's accessor/USM-pointer if (IsOneWG is true). More...
 
template<typename... Reductions, size_t... Is>
auto getReadAccsToPreviousPartialReds (handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
 For the given 'Reductions' types pack and indices enumerating them this function returns accessors to buffers holding partial sums generated in the previous kernel invocation. More...
 
template<typename... Reductions, size_t... Is>
ReduTupleT< typename Reductions::result_type... > getReduIdentities (std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
 
template<typename... Reductions, size_t... Is>
ReduTupleT< typename Reductions::binary_operation... > getReduBOPs (std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
 
template<typename... Reductions, size_t... Is>
std::array< bool, sizeof...(Reductions)> getInitToIdentityProperties (std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
 
template<typename... Reductions, size_t... Is>
std::tuple< typename Reductions::reducer_type... > createReducers (ReduTupleT< typename Reductions::result_type... > Identities, ReduTupleT< typename Reductions::binary_operation... > BOPsTuple, std::index_sequence< Is... >)
 
template<typename KernelType , int Dims, typename... ReducerT, size_t... Is>
void callReduUserKernelFunc (KernelType KernelFunc, nd_item< Dims > NDIt, std::tuple< ReducerT... > &Reducers, std::index_sequence< Is... >)
 
template<bool Pow2WG, typename... LocalAccT, typename... ReducerT, typename... ResultT, size_t... Is>
void initReduLocalAccs (size_t LID, size_t WGSize, ReduTupleT< LocalAccT... > LocalAccs, const std::tuple< ReducerT... > &Reducers, ReduTupleT< ResultT... > Identities, std::index_sequence< Is... >)
 
template<bool UniformPow2WG, typename... LocalAccT, typename... InputAccT, typename... ResultT, size_t... Is>
void initReduLocalAccs (size_t LID, size_t GID, size_t NWorkItems, size_t WGSize, ReduTupleT< InputAccT... > LocalAccs, ReduTupleT< LocalAccT... > InputAccs, ReduTupleT< ResultT... > Identities, std::index_sequence< Is... >)
 
template<typename... LocalAccT, typename... BOPsT, size_t... Is>
void reduceReduLocalAccs (size_t IndexA, size_t IndexB, ReduTupleT< LocalAccT... > LocalAccs, ReduTupleT< BOPsT... > BOPs, std::index_sequence< Is... >)
 
template<bool Pow2WG, bool IsOneWG, typename... Reductions, typename... OutAccT, typename... LocalAccT, typename... BOPsT, typename... Ts, size_t... Is>
void writeReduSumsToOutAccs (size_t OutAccIndex, size_t WGSize, std::tuple< Reductions... > *, ReduTupleT< OutAccT... > OutAccs, ReduTupleT< LocalAccT... > LocalAccs, ReduTupleT< BOPsT... > BOPs, ReduTupleT< Ts... > IdentityVals, std::array< bool, sizeof...(Reductions)> IsInitializeToIdentity, std::index_sequence< Is... >)
 
constexpr std::index_sequence concat_sequences (std::index_sequence<>)
 
template<size_t I>
constexpr std::index_sequence< I > concat_sequences (std::index_sequence< I >)
 
template<size_t... Is, size_t... Js>
constexpr std::index_sequence< Is..., Js... > concat_sequences (std::index_sequence< Is... >, std::index_sequence< Js... >)
 
template<size_t... Is, size_t... Js, class... Rs>
constexpr auto concat_sequences (std::index_sequence< Is... >, std::index_sequence< Js... >, Rs...)
 
template<typename... T, typename FunctorT , size_t... Is, std::enable_if_t<(sizeof...(Is) > 0), int > Z = 0>
constexpr auto filterSequenceHelper (FunctorT, std::index_sequence< Is... >)
 For each index 'I' from the given indices pack 'Is' this function initially creates a number of short index_sequences, where each of such short index sequences is either empty (if the given Functor returns false for the type T[I]) or 1 element 'I' (otherwise). More...
 
template<typename... T, typename FunctorT , size_t... Is>
constexpr auto filterSequence (FunctorT F, std::index_sequence< Is... > Indices)
 For each index 'I' from the given indices pack 'Is' this function returns an index sequence consisting of only those 'I's for which the 'FunctorT' applied to 'T[I]' returns true. More...
 
template<bool Pow2WG, bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT, typename... OutAccT, typename... ReducerT, typename... Ts, typename... BOPsT, size_t... Is>
void reduCGFuncImplScalar (nd_item< Dims > NDIt, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, std::tuple< ReducerT... > &ReducersTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... > ReduIndices)
 All scalar reductions are processed together; there is one loop of log2(N) steps, and each reduction uses its own storage. More...
 
template<bool Pow2WG, bool IsOneWG, typename Reduction , int Dims, typename LocalAccT , typename OutAccT , typename ReducerT , typename T , typename BOPT >
void reduCGFuncImplArrayHelper (nd_item< Dims > NDIt, LocalAccT LocalReds, OutAccT Out, ReducerT &Reducer, T Identity, BOPT BOp, bool IsInitializeToIdentity)
 Each array reduction is processed separately. More...
 
template<bool Pow2WG, bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT, typename... OutAccT, typename... ReducerT, typename... Ts, typename... BOPsT, size_t... Is>
void reduCGFuncImplArray (nd_item< Dims > NDIt, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, std::tuple< ReducerT... > &ReducersTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... >)
 
template<typename KernelName , bool Pow2WG, bool IsOneWG, typename KernelType , int Dims, typename... Reductions, size_t... Is>
void reduCGFuncImpl (handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... > ReduIndices)
 
template<typename KernelName , typename KernelType , int Dims, class Reduction >
std::enable_if_t< Reduction::has_atomic_add_float64 > reduCGFuncImplAtomic64 (handler &CGH, KernelType KernelFunc, const nd_range< Dims > &Range, Reduction &, typename Reduction::rw_accessor_type Out)
 
void associateReduAccsWithHandlerHelper (handler &)
 
template<typename ReductionT >
void associateReduAccsWithHandlerHelper (handler &CGH, ReductionT &Redu)
 
template<typename ReductionT , typename... RestT, enable_if_t<(sizeof...(RestT) > 0), int > Z = 0>
void associateReduAccsWithHandlerHelper (handler &CGH, ReductionT &Redu, RestT &... Rest)
 
template<typename... Reductions, size_t... Is>
void associateReduAccsWithHandler (handler &CGH, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... >)
 
template<bool UniformPow2WG, bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT, typename... InAccT, typename... OutAccT, typename... Ts, typename... BOPsT, size_t... Is>
void reduAuxCGFuncImplScalar (nd_item< Dims > NDIt, size_t LID, size_t GID, size_t NWorkItems, size_t WGSize, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< InAccT... > InAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... > ReduIndices)
 All scalar reductions are processed together; there is one loop of log2(N) steps, and each reduction uses its own storage. More...
 
template<bool UniformPow2WG, bool IsOneWG, typename Reduction , int Dims, typename LocalAccT , typename InAccT , typename OutAccT , typename T , typename BOPT >
void reduAuxCGFuncImplArrayHelper (nd_item< Dims > NDIt, size_t LID, size_t GID, size_t NWorkItems, size_t WGSize, LocalAccT LocalReds, InAccT In, OutAccT Out, T Identity, BOPT BOp, bool IsInitializeToIdentity)
 
template<bool UniformPow2WG, bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT, typename... InAccT, typename... OutAccT, typename... Ts, typename... BOPsT, size_t... Is>
void reduAuxCGFuncImplArray (nd_item< Dims > NDIt, size_t LID, size_t GID, size_t NWorkItems, size_t WGSize, ReduTupleT< LocalAccT... > LocalAccsTuple, ReduTupleT< InAccT... > InAccsTuple, ReduTupleT< OutAccT... > OutAccsTuple, ReduTupleT< Ts... > IdentitiesTuple, ReduTupleT< BOPsT... > BOPsTuple, std::array< bool, sizeof...(Reductions)> InitToIdentityProps, std::index_sequence< Is... >)
 
template<typename KernelName , typename KernelType , bool UniformPow2WG, bool IsOneWG, typename... Reductions, size_t... Is>
void reduAuxCGFuncImpl (handler &CGH, size_t NWorkItems, size_t NWorkGroups, size_t WGSize, std::tuple< Reductions... > &ReduTuple, std::index_sequence< Is... > ReduIndices)
 
void reduSaveFinalResultToUserMemHelper (std::vector< event > &, std::shared_ptr< detail::queue_impl >, bool)
 
template<typename Reduction , typename... RestT>
std::enable_if_t< Reduction::is_usm > reduSaveFinalResultToUserMemHelper (std::vector< event > &Events, std::shared_ptr< detail::queue_impl > Queue, bool IsHost, Reduction &, RestT... Rest)
 
template<typename Reduction >
size_t reduGetMemPerWorkItemHelper (Reduction &)
 
template<typename Reduction , typename... RestT>
size_t reduGetMemPerWorkItemHelper (Reduction &, RestT... Rest)
 
std::vector< std::string > tokenize (const std::string &Filter, const std::string &Delim)
 
filter create_filter (const std::string &Input)
 
uint32_t reduGetMaxNumConcurrentWorkGroups (std::shared_ptr< sycl::detail::queue_impl > Queue)
 
size_t reduGetMaxWGSize (std::shared_ptr< sycl::detail::queue_impl > Queue, size_t LocalMemBytesPerWorkItem)
 

Typedef Documentation

◆ filter

◆ IsReduOptForAtomic64Add

template<typename T , class BinaryOperation >
using cl::sycl::ext::oneapi::detail::IsReduOptForAtomic64Add = typedef bool_constant<sycl::detail::IsPlus<T, BinaryOperation>::value && sycl::detail::is_sgenfloat<T>::value && (sizeof(T) == 4 || sizeof(T) == 8)>

Definition at line 76 of file reduction.hpp.

◆ IsReduOptForFastAtomicFetch

template<typename T , class BinaryOperation >
using cl::sycl::ext::oneapi::detail::IsReduOptForFastAtomicFetch = typedef bool_constant<sycl::detail::is_sgeninteger<T>::value && sycl::detail::IsValidAtomicType<T>::value && (sycl::detail::IsPlus<T, BinaryOperation>::value || sycl::detail::IsMinimum<T, BinaryOperation>::value || sycl::detail::IsMaximum<T, BinaryOperation>::value || sycl::detail::IsBitOR<T, BinaryOperation>::value || sycl::detail::IsBitXOR<T, BinaryOperation>::value || sycl::detail::IsBitAND<T, BinaryOperation>::value)>

Definition at line 54 of file reduction.hpp.

◆ IsReduOptForFastReduce

template<typename T , class BinaryOperation >
using cl::sycl::ext::oneapi::detail::IsReduOptForFastReduce = typedef bool_constant<((sycl::detail::is_sgeninteger<T>::value && (sizeof(T) == 4 || sizeof(T) == 8)) || sycl::detail::is_sgenfloat<T>::value) && (sycl::detail::IsPlus<T, BinaryOperation>::value || sycl::detail::IsMinimum<T, BinaryOperation>::value || sycl::detail::IsMaximum<T, BinaryOperation>::value)>

Definition at line 94 of file reduction.hpp.

◆ IsValidAtomicAddressSpace

template<cl::sycl::access::address_space AS>
using cl::sycl::ext::oneapi::detail::IsValidAtomicAddressSpace = typedef bool_constant<AS == access::address_space::global_space || AS == access::address_space::local_space || AS == access::address_space::global_device_space>

Definition at line 50 of file atomic_ref.hpp.

◆ IsValidDefaultOrder

template<memory_order Order>
using cl::sycl::ext::oneapi::detail::IsValidDefaultOrder = typedef bool_constant<Order == memory_order::relaxed || Order == memory_order::acq_rel || Order == memory_order::seq_cst>

Definition at line 56 of file atomic_ref.hpp.

◆ memory_order

using cl::sycl::ext::oneapi::detail::memory_order = typedef cl::sycl::ext::oneapi::memory_order

Definition at line 33 of file atomic_ref.hpp.

◆ memory_scope

using cl::sycl::ext::oneapi::detail::memory_scope = typedef cl::sycl::ext::oneapi::memory_scope

Definition at line 34 of file atomic_ref.hpp.

◆ ReduTupleT

template<typename... Ts>
using cl::sycl::ext::oneapi::detail::ReduTupleT = typedef sycl::detail::tuple<Ts...>

Definition at line 101 of file reduction.hpp.

Function Documentation

◆ __SYCL2020_DEPRECATED()

cl::sycl::ext::oneapi::detail::__SYCL2020_DEPRECATED ( "use 'sycl::detail::getStdMemoryOrder(sycl::memory_order)' instead ) const

Definition at line 59 of file atomic_enums.hpp.

References __SYCL2020_DEPRECATED.

◆ associateReduAccsWithHandler()

template<typename... Reductions, size_t... Is>
void cl::sycl::ext::oneapi::detail::associateReduAccsWithHandler ( handler CGH,
std::tuple< Reductions... > &  ReduTuple,
std::index_sequence< Is... >   
)

Definition at line 2289 of file reduction.hpp.

References associateReduAccsWithHandlerHelper().

Referenced by reduAuxCGFuncImpl().

◆ associateReduAccsWithHandlerHelper() [1/3]

void cl::sycl::ext::oneapi::detail::associateReduAccsWithHandlerHelper ( handler )
inline

Definition at line 2273 of file reduction.hpp.

◆ associateReduAccsWithHandlerHelper() [2/3]

template<typename ReductionT >
void cl::sycl::ext::oneapi::detail::associateReduAccsWithHandlerHelper ( handler CGH,
ReductionT &  Redu 
)

Definition at line 2276 of file reduction.hpp.

◆ associateReduAccsWithHandlerHelper() [3/3]

template<typename ReductionT , typename... RestT, enable_if_t<(sizeof...(RestT) > 0), int > Z = 0>
void cl::sycl::ext::oneapi::detail::associateReduAccsWithHandlerHelper ( handler CGH,
ReductionT &  Redu,
RestT &...  Rest 
)

Definition at line 2282 of file reduction.hpp.

Referenced by associateReduAccsWithHandler().

◆ callReduUserKernelFunc()

template<typename KernelType , int Dims, typename... ReducerT, size_t... Is>
void cl::sycl::ext::oneapi::detail::callReduUserKernelFunc ( KernelType  KernelFunc,
nd_item< Dims >  NDIt,
std::tuple< ReducerT... > &  Reducers,
std::index_sequence< Is... >   
)

Definition at line 1829 of file reduction.hpp.

Referenced by reduCGFuncImpl().

◆ concat_sequences() [1/4]

template<size_t I>
constexpr std::index_sequence<I> cl::sycl::ext::oneapi::detail::concat_sequences ( std::index_sequence< I >  )
constexpr

Definition at line 1933 of file reduction.hpp.

◆ concat_sequences() [2/4]

template<size_t... Is, size_t... Js>
constexpr std::index_sequence<Is..., Js...> cl::sycl::ext::oneapi::detail::concat_sequences ( std::index_sequence< Is... >  ,
std::index_sequence< Js... >   
)
constexpr

Definition at line 1940 of file reduction.hpp.

◆ concat_sequences() [3/4]

template<size_t... Is, size_t... Js, class... Rs>
constexpr auto cl::sycl::ext::oneapi::detail::concat_sequences ( std::index_sequence< Is... >  ,
std::index_sequence< Js... >  ,
Rs...   
)
constexpr

Definition at line 1946 of file reduction.hpp.

Referenced by filterSequenceHelper().

◆ concat_sequences() [4/4]

constexpr std::index_sequence cl::sycl::ext::oneapi::detail::concat_sequences ( std::index_sequence<>  )
constexpr

Definition at line 1927 of file reduction.hpp.

◆ create_filter()

◆ createReducers()

template<typename... Reductions, size_t... Is>
std::tuple<typename Reductions::reducer_type...> cl::sycl::ext::oneapi::detail::createReducers ( ReduTupleT< typename Reductions::result_type... >  Identities,
ReduTupleT< typename Reductions::binary_operation... >  BOPsTuple,
std::index_sequence< Is... >   
)

Definition at line 1821 of file reduction.hpp.

Referenced by reduCGFuncImpl().

◆ createReduLocalAccs()

template<typename... Reductions, size_t... Is>
auto cl::sycl::ext::oneapi::detail::createReduLocalAccs ( size_t  Size,
handler CGH,
std::index_sequence< Is... >   
)

For the given 'Reductions' types pack and indices enumerating only the reductions for which a local accessors are needed, this function creates those local accessors and returns a tuple consisting of them.

Definition at line 1767 of file reduction.hpp.

References makeReduTupleT().

Referenced by reduAuxCGFuncImpl(), and reduCGFuncImpl().

◆ createReduOutAccs()

template<bool IsOneWG, typename... Reductions, size_t... Is>
auto cl::sycl::ext::oneapi::detail::createReduOutAccs ( size_t  NWorkGroups,
handler CGH,
std::tuple< Reductions... > &  ReduTuple,
std::index_sequence< Is... >   
)

For the given 'Reductions' types pack and indices enumerating them this function either creates new temporary accessors for partial sums (if IsOneWG is false) or returns user's accessor/USM-pointer if (IsOneWG is true).

Definition at line 1778 of file reduction.hpp.

References makeReduTupleT().

◆ filterSequence()

template<typename... T, typename FunctorT , size_t... Is>
constexpr auto cl::sycl::ext::oneapi::detail::filterSequence ( FunctorT  F,
std::index_sequence< Is... >  Indices 
)
constexpr

For each index 'I' from the given indices pack 'Is' this function returns an index sequence consisting of only those 'I's for which the 'FunctorT' applied to 'T[I]' returns true.

Definition at line 1989 of file reduction.hpp.

References filterSequenceHelper().

Referenced by reduAuxCGFuncImpl(), and reduCGFuncImpl().

◆ filterSequenceHelper()

template<typename... T, typename FunctorT , size_t... Is, std::enable_if_t<(sizeof...(Is) > 0), int > Z = 0>
constexpr auto cl::sycl::ext::oneapi::detail::filterSequenceHelper ( FunctorT  ,
std::index_sequence< Is... >   
)
constexpr

For each index 'I' from the given indices pack 'Is' this function initially creates a number of short index_sequences, where each of such short index sequences is either empty (if the given Functor returns false for the type T[I]) or 1 element 'I' (otherwise).

After that this function concatenates those short sequences into one and returns the result sequence.

Definition at line 1973 of file reduction.hpp.

References concat_sequences().

Referenced by filterSequence().

◆ get_layout_id()

template<sycl::ext::oneapi::experimental::matrix::matrix_layout Layout>
constexpr int cl::sycl::ext::oneapi::detail::get_layout_id ( )
constexpr

◆ get_layout_id< sycl::ext::oneapi::experimental::matrix::matrix_layout::col_major >()

◆ get_layout_id< sycl::ext::oneapi::experimental::matrix::matrix_layout::row_major >()

◆ get_layout_pair_id()

template<sycl::ext::oneapi::experimental::matrix::matrix_layout LayoutA, sycl::ext::oneapi::experimental::matrix::matrix_layout LayoutB>
constexpr int cl::sycl::ext::oneapi::detail::get_layout_pair_id ( )
constexpr

◆ get_layout_pair_id< sycl::ext::oneapi::experimental::matrix::matrix_layout::col_major, sycl::ext::oneapi::experimental::matrix::matrix_layout::col_major >()

◆ get_layout_pair_id< sycl::ext::oneapi::experimental::matrix::matrix_layout::col_major, sycl::ext::oneapi::experimental::matrix::matrix_layout::row_major >()

◆ get_layout_pair_id< sycl::ext::oneapi::experimental::matrix::matrix_layout::row_major, sycl::ext::oneapi::experimental::matrix::matrix_layout::col_major >()

◆ get_layout_pair_id< sycl::ext::oneapi::experimental::matrix::matrix_layout::row_major, sycl::ext::oneapi::experimental::matrix::matrix_layout::row_major >()

◆ getInitToIdentityProperties()

template<typename... Reductions, size_t... Is>
std::array<bool, sizeof...(Reductions)> cl::sycl::ext::oneapi::detail::getInitToIdentityProperties ( std::tuple< Reductions... > &  ReduTuple,
std::index_sequence< Is... >   
)

Definition at line 1814 of file reduction.hpp.

Referenced by reduAuxCGFuncImpl(), and reduCGFuncImpl().

◆ getLoadOrder()

constexpr memory_order cl::sycl::ext::oneapi::detail::getLoadOrder ( memory_order  order)
inlineconstexpr

Definition at line 75 of file atomic_ref.hpp.

Referenced by cl::sycl::detail::atomic_ref_impl< T, DefaultOrder, DefaultScope, AddressSpace, typename detail::enable_if_t< std::is_floating_point< T >::value > >::fetch_add(), cl::sycl::ext::oneapi::detail::atomic_ref_impl< T, DefaultOrder, DefaultScope, AddressSpace, typename detail::enable_if_t< std::is_floating_point< T >::value > >::fetch_add(), cl::sycl::detail::atomic_ref_impl< T *, DefaultOrder, DefaultScope, AddressSpace >::fetch_add(), cl::sycl::ext::oneapi::detail::atomic_ref_impl< T *, DefaultOrder, DefaultScope, AddressSpace >::fetch_add(), cl::sycl::ext::oneapi::detail::atomic_ref_impl< T, DefaultOrder, DefaultScope, AddressSpace, typename detail::enable_if_t< std::is_integral< T >::value > >::fetch_max(), cl::sycl::detail::atomic_ref_impl< T, DefaultOrder, DefaultScope, AddressSpace, typename detail::enable_if_t< std::is_integral< T >::value > >::fetch_max(), cl::sycl::detail::atomic_ref_impl< T, DefaultOrder, DefaultScope, AddressSpace, typename detail::enable_if_t< std::is_floating_point< T >::value > >::fetch_max(), cl::sycl::ext::oneapi::detail::atomic_ref_impl< T, DefaultOrder, DefaultScope, AddressSpace, typename detail::enable_if_t< std::is_floating_point< T >::value > >::fetch_max(), cl::sycl::ext::oneapi::detail::atomic_ref_impl< T, DefaultOrder, DefaultScope, AddressSpace, typename detail::enable_if_t< std::is_integral< T >::value > >::fetch_min(), cl::sycl::detail::atomic_ref_impl< T, DefaultOrder, DefaultScope, AddressSpace, typename detail::enable_if_t< std::is_integral< T >::value > >::fetch_min(), cl::sycl::ext::oneapi::detail::atomic_ref_impl< T, DefaultOrder, DefaultScope, AddressSpace, typename detail::enable_if_t< std::is_floating_point< T >::value > >::fetch_min(), cl::sycl::detail::atomic_ref_impl< T, DefaultOrder, DefaultScope, AddressSpace, typename detail::enable_if_t< std::is_floating_point< T >::value > >::fetch_min(), cl::sycl::detail::atomic_ref_impl< T, DefaultOrder, DefaultScope, AddressSpace, typename detail::enable_if_t< std::is_floating_point< T >::value > >::fetch_sub(), cl::sycl::ext::oneapi::detail::atomic_ref_impl< T, DefaultOrder, DefaultScope, AddressSpace, typename detail::enable_if_t< std::is_floating_point< T >::value > >::fetch_sub(), cl::sycl::detail::atomic_ref_impl< T *, DefaultOrder, DefaultScope, AddressSpace >::fetch_sub(), and cl::sycl::ext::oneapi::detail::atomic_ref_impl< T *, DefaultOrder, DefaultScope, AddressSpace >::fetch_sub().

◆ getReadAccsToPreviousPartialReds()

template<typename... Reductions, size_t... Is>
auto cl::sycl::ext::oneapi::detail::getReadAccsToPreviousPartialReds ( handler CGH,
std::tuple< Reductions... > &  ReduTuple,
std::index_sequence< Is... >   
)

For the given 'Reductions' types pack and indices enumerating them this function returns accessors to buffers holding partial sums generated in the previous kernel invocation.

Definition at line 1792 of file reduction.hpp.

References makeReduTupleT().

Referenced by reduAuxCGFuncImpl().

◆ getReduBOPs()

template<typename... Reductions, size_t... Is>
ReduTupleT<typename Reductions::binary_operation...> cl::sycl::ext::oneapi::detail::getReduBOPs ( std::tuple< Reductions... > &  ReduTuple,
std::index_sequence< Is... >   
)

Definition at line 1808 of file reduction.hpp.

Referenced by reduAuxCGFuncImpl(), and reduCGFuncImpl().

◆ getReduIdentities()

template<typename... Reductions, size_t... Is>
ReduTupleT<typename Reductions::result_type...> cl::sycl::ext::oneapi::detail::getReduIdentities ( std::tuple< Reductions... > &  ReduTuple,
std::index_sequence< Is... >   
)

Definition at line 1801 of file reduction.hpp.

Referenced by reduAuxCGFuncImpl(), and reduCGFuncImpl().

◆ initReduLocalAccs() [1/2]

template<bool UniformPow2WG, typename... LocalAccT, typename... InputAccT, typename... ResultT, size_t... Is>
void cl::sycl::ext::oneapi::detail::initReduLocalAccs ( size_t  LID,
size_t  GID,
size_t  NWorkItems,
size_t  WGSize,
ReduTupleT< InputAccT... >  LocalAccs,
ReduTupleT< LocalAccT... >  InputAccs,
ReduTupleT< ResultT... >  Identities,
std::index_sequence< Is... >   
)

Definition at line 1855 of file reduction.hpp.

References cl::sycl::detail::make_tuple(), and cl::sycl::detail::tie().

◆ initReduLocalAccs() [2/2]

template<bool Pow2WG, typename... LocalAccT, typename... ReducerT, typename... ResultT, size_t... Is>
void cl::sycl::ext::oneapi::detail::initReduLocalAccs ( size_t  LID,
size_t  WGSize,
ReduTupleT< LocalAccT... >  LocalAccs,
const std::tuple< ReducerT... > &  Reducers,
ReduTupleT< ResultT... >  Identities,
std::index_sequence< Is... >   
)

Definition at line 1837 of file reduction.hpp.

References cl::sycl::detail::make_tuple(), and cl::sycl::detail::tie().

◆ makeReduTupleT()

template<typename... Ts>
ReduTupleT<Ts...> cl::sycl::ext::oneapi::detail::makeReduTupleT ( Ts...  Elements)

◆ reduAuxCGFunc() [1/2]

template<typename KernelName , typename KernelType , class Reduction >
enable_if_t<!Reduction::has_fast_atomics, size_t > cl::sycl::ext::oneapi::detail::reduAuxCGFunc ( handler CGH,
size_t  NWorkItems,
size_t  MaxWGSize,
Reduction &  Redu 
)

Implements a command group function that enqueues a kernel that does one iteration of reduction of elements in each of work-groups.

At the end of each work-group the partial sum is written to a global buffer. The function returns the number of the newly generated partial sums.

Definition at line 1696 of file reduction.hpp.

References reduComputeWGSize().

◆ reduAuxCGFunc() [2/2]

template<typename KernelName , typename KernelType , typename... Reductions, size_t... Is>
size_t cl::sycl::ext::oneapi::detail::reduAuxCGFunc ( handler CGH,
size_t  NWorkItems,
size_t  MaxWGSize,
std::tuple< Reductions... > &  ReduTuple,
std::index_sequence< Is... >  ReduIndices 
)

Definition at line 2491 of file reduction.hpp.

References reduComputeWGSize().

◆ reduAuxCGFuncImpl() [1/3]

template<typename KernelName , typename KernelType , bool UniformWG, class Reduction , typename InputT , typename OutputT >
enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics> cl::sycl::ext::oneapi::detail::reduAuxCGFuncImpl ( handler CGH,
size_t  NWorkItems,
size_t  NWorkGroups,
size_t  WGSize,
Reduction &  Redu,
InputT  In,
OutputT  Out 
)

Implements a command group function that enqueues a kernel that does one iteration of reduction of elements in each of work-groups.

This version uses ext::oneapi::reduce() algorithm to reduce elements in each of work-groups. At the end of each work-groups the partial sum is written to a global buffer.

Briefly: aux kernel, ext::oneapi::reduce(), reproducible results, FP + ADD/MIN/MAX

Definition at line 1587 of file reduction.hpp.

References cl::sycl::handler::parallel_for(), and reduce().

◆ reduAuxCGFuncImpl() [2/3]

template<typename KernelName , typename KernelType , bool UniformPow2WG, class Reduction , typename InputT , typename OutputT >
enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics> cl::sycl::ext::oneapi::detail::reduAuxCGFuncImpl ( handler CGH,
size_t  NWorkItems,
size_t  NWorkGroups,
size_t  WGSize,
Reduction &  Redu,
InputT  In,
OutputT  Out 
)

Implements a command group function that enqueues a kernel that does one iteration of reduction of elements in each of work-groups.

This version uses tree-reduction algorithm to reduce elements in each of work-groups. At the end of each work-group the partial sum is written to a global buffer.

Briefly: aux kernel, tree-reduction, CUSTOM types/ops.

Definition at line 1626 of file reduction.hpp.

References cl::sycl::handler::parallel_for().

◆ reduAuxCGFuncImpl() [3/3]

template<typename KernelName , typename KernelType , bool UniformPow2WG, bool IsOneWG, typename... Reductions, size_t... Is>
void cl::sycl::ext::oneapi::detail::reduAuxCGFuncImpl ( handler CGH,
size_t  NWorkItems,
size_t  NWorkGroups,
size_t  WGSize,
std::tuple< Reductions... > &  ReduTuple,
std::index_sequence< Is... >  ReduIndices 
)

◆ reduAuxCGFuncImplArray()

template<bool UniformPow2WG, bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT, typename... InAccT, typename... OutAccT, typename... Ts, typename... BOPsT, size_t... Is>
void cl::sycl::ext::oneapi::detail::reduAuxCGFuncImplArray ( nd_item< Dims >  NDIt,
size_t  LID,
size_t  GID,
size_t  NWorkItems,
size_t  WGSize,
ReduTupleT< LocalAccT... >  LocalAccsTuple,
ReduTupleT< InAccT... >  InAccsTuple,
ReduTupleT< OutAccT... >  OutAccsTuple,
ReduTupleT< Ts... >  IdentitiesTuple,
ReduTupleT< BOPsT... >  BOPsTuple,
std::array< bool, sizeof...(Reductions)>  InitToIdentityProps,
std::index_sequence< Is... >   
)

Definition at line 2416 of file reduction.hpp.

References reduAuxCGFuncImplArrayHelper().

Referenced by reduAuxCGFuncImpl().

◆ reduAuxCGFuncImplArrayHelper()

template<bool UniformPow2WG, bool IsOneWG, typename Reduction , int Dims, typename LocalAccT , typename InAccT , typename OutAccT , typename T , typename BOPT >
void cl::sycl::ext::oneapi::detail::reduAuxCGFuncImplArrayHelper ( nd_item< Dims >  NDIt,
size_t  LID,
size_t  GID,
size_t  NWorkItems,
size_t  WGSize,
LocalAccT  LocalReds,
InAccT  In,
OutAccT  Out,
T  Identity,
BOPT  BOp,
bool  IsInitializeToIdentity 
)

◆ reduAuxCGFuncImplScalar()

template<bool UniformPow2WG, bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT, typename... InAccT, typename... OutAccT, typename... Ts, typename... BOPsT, size_t... Is>
void cl::sycl::ext::oneapi::detail::reduAuxCGFuncImplScalar ( nd_item< Dims >  NDIt,
size_t  LID,
size_t  GID,
size_t  NWorkItems,
size_t  WGSize,
ReduTupleT< LocalAccT... >  LocalAccsTuple,
ReduTupleT< InAccT... >  InAccsTuple,
ReduTupleT< OutAccT... >  OutAccsTuple,
ReduTupleT< Ts... >  IdentitiesTuple,
ReduTupleT< BOPsT... >  BOPsTuple,
std::array< bool, sizeof...(Reductions)>  InitToIdentityProps,
std::index_sequence< Is... >  ReduIndices 
)

All scalar reductions are processed together; there is one loop of log2(N) steps, and each reduction uses its own storage.

Definition at line 2300 of file reduction.hpp.

References cl::sycl::nd_item< dimensions >::barrier(), cl::sycl::nd_item< dimensions >::get_group_linear_id(), and reduceReduLocalAccs().

Referenced by reduAuxCGFuncImpl().

◆ reduceReduLocalAccs()

template<typename... LocalAccT, typename... BOPsT, size_t... Is>
void cl::sycl::ext::oneapi::detail::reduceReduLocalAccs ( size_t  IndexA,
size_t  IndexB,
ReduTupleT< LocalAccT... >  LocalAccs,
ReduTupleT< BOPsT... >  BOPs,
std::index_sequence< Is... >   
)

◆ reduCGFunc() [1/4]

template<typename KernelName , typename KernelType , int Dims, class Reduction >
enable_if_t< Reduction::has_fast_atomics > cl::sycl::ext::oneapi::detail::reduCGFunc ( handler CGH,
KernelType  KernelFunc,
const nd_range< Dims > &  Range,
Reduction &  Redu 
)

◆ reduCGFunc() [2/4]

template<typename KernelName , typename KernelType , int Dims, class Reduction >
enable_if_t<!Reduction::has_fast_atomics > cl::sycl::ext::oneapi::detail::reduCGFunc ( handler CGH,
KernelType  KernelFunc,
const nd_range< Dims > &  Range,
Reduction &  Redu 
)

◆ reduCGFunc() [3/4]

template<typename KernelName , typename KernelType , int Dims, typename... Reductions, size_t... Is>
void cl::sycl::ext::oneapi::detail::reduCGFunc ( handler CGH,
KernelType  KernelFunc,
const nd_range< Dims > &  Range,
std::tuple< Reductions... > &  ReduTuple,
std::index_sequence< Is... >  ReduIndices 
)

◆ reduCGFunc() [4/4]

template<typename KernelName , typename KernelType , int Dims, class Reduction >
void cl::sycl::ext::oneapi::detail::reduCGFunc ( handler CGH,
KernelType  KernelFunc,
const range< Dims > &  Range,
size_t  MaxWGSize,
uint32_t  NumConcurrentWorkGroups,
Reduction &  Redu 
)

Definition at line 1280 of file reduction.hpp.

References cl::sycl::range< dimensions >::size().

◆ reduCGFuncAtomic64()

template<typename KernelName , typename KernelType , int Dims, class Reduction >
enable_if_t< Reduction::has_atomic_add_float64 > cl::sycl::ext::oneapi::detail::reduCGFuncAtomic64 ( handler CGH,
KernelType  KernelFunc,
const nd_range< Dims > &  Range,
Reduction &  Redu 
)

Definition at line 2265 of file reduction.hpp.

◆ reduCGFuncImpl() [1/8]

template<typename KernelName , typename KernelType , int Dims, class Reduction , bool IsPow2WG>
enable_if_t<Reduction::has_fast_reduce && Reduction::has_fast_atomics> cl::sycl::ext::oneapi::detail::reduCGFuncImpl ( handler CGH,
KernelType  KernelFunc,
const nd_range< Dims > &  Range,
Reduction &  ,
typename Reduction::rw_accessor_type  Out 
)

Implements a command group function that enqueues a kernel that calls user's lambda function KernelFunc and also does one iteration of reduction of elements computed in user's lambda function.

This version uses ext::oneapi::reduce() algorithm to reduce elements in each of work-groups, then it calls fast SYCL atomic operations to update the given reduction variable Out.

Briefly: calls user's lambda, ext::oneapi::reduce() + atomic, INT + ADD/MIN/MAX.

Definition at line 1308 of file reduction.hpp.

References cl::sycl::handler::parallel_for(), and reduce().

◆ reduCGFuncImpl() [2/8]

template<typename KernelName , typename KernelType , int Dims, class Reduction , bool IsPow2WG>
enable_if_t<!Reduction::has_fast_reduce && Reduction::has_fast_atomics> cl::sycl::ext::oneapi::detail::reduCGFuncImpl ( handler CGH,
KernelType  KernelFunc,
const nd_range< Dims > &  Range,
Reduction &  ,
typename Reduction::rw_accessor_type  Out 
)

Implements a command group function that enqueues a kernel that calls user's lambda function KernelFunc and also does one iteration of reduction of elements computed in user's lambda function.

This version uses tree-reduction algorithm to reduce elements in each of work-groups, then it calls fast SYCL atomic operations to update user's reduction variable.

Briefly: calls user's lambda, tree-reduction + atomic, INT + AND/OR/XOR.

Definition at line 1339 of file reduction.hpp.

References cl::sycl::nd_range< dimensions >::get_local_range(), and cl::sycl::handler::parallel_for().

◆ reduCGFuncImpl() [3/8]

template<typename KernelName , typename KernelType , int Dims, class Reduction , bool IsPow2WG>
enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics> cl::sycl::ext::oneapi::detail::reduCGFuncImpl ( handler CGH,
KernelType  KernelFunc,
const nd_range< Dims > &  Range,
Reduction &  Redu,
typename Reduction::rw_accessor_type  Out 
)

Implements a command group function that enqueues a kernel that calls user's lambda function and does one iteration of reduction of elements in each of work-groups.

This version uses ext::oneapi::reduce() algorithm to reduce elements in each of work-groups. At the end of each work-groups the partial sum is written to a global buffer.

Briefly: user's lambda, ext::oneapi::reduce(), FP + ADD/MIN/MAX.

Definition at line 1444 of file reduction.hpp.

References cl::sycl::nd_range< dimensions >::get_group_range(), cl::sycl::handler::parallel_for(), and reduce().

◆ reduCGFuncImpl() [4/8]

template<typename KernelName , typename KernelType , int Dims, class Reduction , bool IsPow2WG>
enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics> cl::sycl::ext::oneapi::detail::reduCGFuncImpl ( handler CGH,
KernelType  KernelFunc,
const nd_range< Dims > &  Range,
Reduction &  Redu,
typename Reduction::rw_accessor_type  Out 
)

Implements a command group function that enqueues a kernel that calls user's lambda function.

Parameters
KernelFuncand does one iteration of reduction of elements in each of work-groups. This version uses tree-reduction algorithm to reduce elements in each of work-groups. At the end of each work-group the partial sum is written to a global buffer.

Briefly: user's lambda, tree-reduction, CUSTOM types/ops.

Definition at line 1485 of file reduction.hpp.

References cl::sycl::nd_range< dimensions >::get_group_range(), cl::sycl::nd_range< dimensions >::get_local_range(), and cl::sycl::handler::parallel_for().

◆ reduCGFuncImpl() [5/8]

template<typename KernelName , bool Pow2WG, bool IsOneWG, typename KernelType , int Dims, typename... Reductions, size_t... Is>
void cl::sycl::ext::oneapi::detail::reduCGFuncImpl ( handler CGH,
KernelType  KernelFunc,
const nd_range< Dims > &  Range,
std::tuple< Reductions... > &  ReduTuple,
std::index_sequence< Is... >  ReduIndices 
)

◆ reduCGFuncImpl() [6/8]

template<typename KernelName , typename KernelType , int Dims, class Reduction >
std::enable_if_t<Reduction::has_fast_atomics> cl::sycl::ext::oneapi::detail::reduCGFuncImpl ( handler CGH,
KernelType  KernelFunc,
const range< Dims > &  Range,
const nd_range< 1 > &  NDRange,
Reduction &  Redu 
)

◆ reduCGFuncImpl() [7/8]

template<typename KernelName , typename KernelType , int Dims, class Reduction >
std::enable_if_t<!Reduction::has_fast_atomics && Reduction::has_fast_reduce> cl::sycl::ext::oneapi::detail::reduCGFuncImpl ( handler CGH,
KernelType  KernelFunc,
const range< Dims > &  Range,
const nd_range< 1 > &  NDRange,
Reduction &  Redu 
)

◆ reduCGFuncImpl() [8/8]

template<typename KernelName , typename KernelType , int Dims, class Reduction >
std::enable_if_t<!Reduction::has_fast_atomics && !Reduction::has_fast_reduce> cl::sycl::ext::oneapi::detail::reduCGFuncImpl ( handler CGH,
KernelType  KernelFunc,
const range< Dims > &  Range,
const nd_range< 1 > &  NDRange,
Reduction &  Redu 
)

◆ reduCGFuncImplArray()

template<bool Pow2WG, bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT, typename... OutAccT, typename... ReducerT, typename... Ts, typename... BOPsT, size_t... Is>
void cl::sycl::ext::oneapi::detail::reduCGFuncImplArray ( nd_item< Dims >  NDIt,
ReduTupleT< LocalAccT... >  LocalAccsTuple,
ReduTupleT< OutAccT... >  OutAccsTuple,
std::tuple< ReducerT... > &  ReducersTuple,
ReduTupleT< Ts... >  IdentitiesTuple,
ReduTupleT< BOPsT... >  BOPsTuple,
std::array< bool, sizeof...(Reductions)>  InitToIdentityProps,
std::index_sequence< Is... >   
)

Definition at line 2114 of file reduction.hpp.

References reduCGFuncImplArrayHelper().

Referenced by reduCGFuncImpl().

◆ reduCGFuncImplArrayHelper()

template<bool Pow2WG, bool IsOneWG, typename Reduction , int Dims, typename LocalAccT , typename OutAccT , typename ReducerT , typename T , typename BOPT >
void cl::sycl::ext::oneapi::detail::reduCGFuncImplArrayHelper ( nd_item< Dims >  NDIt,
LocalAccT  LocalReds,
OutAccT  Out,
ReducerT &  Reducer,
T  Identity,
BOPT  BOp,
bool  IsInitializeToIdentity 
)

◆ reduCGFuncImplAtomic64()

template<typename KernelName , typename KernelType , int Dims, class Reduction >
std::enable_if_t<Reduction::has_atomic_add_float64> cl::sycl::ext::oneapi::detail::reduCGFuncImplAtomic64 ( handler CGH,
KernelType  KernelFunc,
const nd_range< Dims > &  Range,
Reduction &  ,
typename Reduction::rw_accessor_type  Out 
)

◆ reduCGFuncImplScalar()

template<bool Pow2WG, bool IsOneWG, typename... Reductions, int Dims, typename... LocalAccT, typename... OutAccT, typename... ReducerT, typename... Ts, typename... BOPsT, size_t... Is>
void cl::sycl::ext::oneapi::detail::reduCGFuncImplScalar ( nd_item< Dims >  NDIt,
ReduTupleT< LocalAccT... >  LocalAccsTuple,
ReduTupleT< OutAccT... >  OutAccsTuple,
std::tuple< ReducerT... > &  ReducersTuple,
ReduTupleT< Ts... >  IdentitiesTuple,
ReduTupleT< BOPsT... >  BOPsTuple,
std::array< bool, sizeof...(Reductions)>  InitToIdentityProps,
std::index_sequence< Is... >  ReduIndices 
)

All scalar reductions are processed together; there is one loop of log2(N) steps, and each reduction uses its own storage.

Definition at line 2012 of file reduction.hpp.

References cl::sycl::nd_item< dimensions >::barrier(), cl::sycl::nd_item< dimensions >::get_group_linear_id(), cl::sycl::nd_item< dimensions >::get_local_linear_id(), cl::sycl::nd_item< dimensions >::get_local_range(), and reduceReduLocalAccs().

Referenced by reduCGFuncImpl().

◆ reduComputeWGSize()

size_t cl::sycl::ext::oneapi::detail::reduComputeWGSize ( size_t  NWorkItems,
size_t  MaxWGSize,
size_t &  NWorkGroups 
)

Definition at line 20 of file reduction.cpp.

Referenced by reduAuxCGFunc().

◆ reductionLoop()

template<typename KernelFunc , int Dims, typename ReducerT >
void cl::sycl::ext::oneapi::detail::reductionLoop ( const range< Dims > &  Range,
ReducerT &  Reducer,
const nd_item< 1 > &  NdId,
KernelFunc F 
)

Called in device code.

This function iterates through the index space Range using stride equal to the global range specified in NdId, which gives much better performance than using stride equal to 1. For each of the index the given F function/functor is called and the reduction value hold in Reducer is accumulated in those calls.

Definition at line 1042 of file reduction.hpp.

References cl::sycl::nd_item< dimensions >::get_global_id(), cl::sycl::nd_item< dimensions >::get_global_range(), cl::sycl::detail::getDelinearizedId(), and cl::sycl::range< dimensions >::size().

Referenced by reduCGFuncImpl().

◆ reduGetMaxNumConcurrentWorkGroups() [1/2]

uint32_t cl::sycl::ext::oneapi::detail::reduGetMaxNumConcurrentWorkGroups ( std::shared_ptr< queue_impl Queue)

◆ reduGetMaxNumConcurrentWorkGroups() [2/2]

uint32_t cl::sycl::ext::oneapi::detail::reduGetMaxNumConcurrentWorkGroups ( std::shared_ptr< sycl::detail::queue_impl Queue)

Definition at line 54 of file reduction.cpp.

References cl::sycl::device::get_info(), and cl::sycl::device::is_gpu().

◆ reduGetMaxWGSize() [1/2]

size_t cl::sycl::ext::oneapi::detail::reduGetMaxWGSize ( std::shared_ptr< queue_impl Queue,
size_t  LocalMemBytesPerWorkItem 
)

◆ reduGetMaxWGSize() [2/2]

size_t cl::sycl::ext::oneapi::detail::reduGetMaxWGSize ( std::shared_ptr< sycl::detail::queue_impl Queue,
size_t  LocalMemBytesPerWorkItem 
)

Definition at line 68 of file reduction.cpp.

References cl::sycl::device::get_info().

◆ reduGetMemPerWorkItem()

template<typename... ReductionT, size_t... Is>
size_t cl::sycl::ext::oneapi::detail::reduGetMemPerWorkItem ( std::tuple< ReductionT... > &  ReduTuple,
std::index_sequence< Is... >   
)

Definition at line 2577 of file reduction.hpp.

References reduGetMemPerWorkItemHelper().

Referenced by cl::sycl::handler::parallel_for().

◆ reduGetMemPerWorkItemHelper() [1/2]

template<typename Reduction >
size_t cl::sycl::ext::oneapi::detail::reduGetMemPerWorkItemHelper ( Reduction &  )

Definition at line 2566 of file reduction.hpp.

◆ reduGetMemPerWorkItemHelper() [2/2]

template<typename Reduction , typename... RestT>
size_t cl::sycl::ext::oneapi::detail::reduGetMemPerWorkItemHelper ( Reduction &  ,
RestT...  Rest 
)

Definition at line 2571 of file reduction.hpp.

Referenced by reduGetMemPerWorkItem().

◆ reduSaveFinalResultToUserMem() [1/3]

template<typename KernelName , class Reduction >
std::enable_if_t<!Reduction::is_usm > cl::sycl::ext::oneapi::detail::reduSaveFinalResultToUserMem ( handler CGH,
Reduction &  Redu 
)

Copies the final reduction result kept in read-write accessor to user's accessor.

This method is not called for user's read-write accessors requiring update-write to it.

Definition at line 1731 of file reduction.hpp.

References cl::sycl::handler::copy().

◆ reduSaveFinalResultToUserMem() [2/3]

template<typename KernelName , class Reduction >
std::enable_if_t< Reduction::is_usm > cl::sycl::ext::oneapi::detail::reduSaveFinalResultToUserMem ( handler CGH,
Reduction &  Redu 
)

Copies the final reduction result kept in read-write accessor to user's USM memory.

Definition at line 1747 of file reduction.hpp.

References cl::sycl::handler::single_task().

◆ reduSaveFinalResultToUserMem() [3/3]

template<typename... Reduction, size_t... Is>
std::shared_ptr< event > cl::sycl::ext::oneapi::detail::reduSaveFinalResultToUserMem ( std::shared_ptr< detail::queue_impl Queue,
bool  IsHost,
std::tuple< Reduction... > &  ReduTuple,
std::index_sequence< Is... >   
)

Creates additional kernels that copy the accumulated/final results from reductions accessors to either user's accessor or user's USM memory.

Returns the event to the last kernel copying data or nullptr if no additional kernels created.

Definition at line 2555 of file reduction.hpp.

References reduSaveFinalResultToUserMemHelper().

Referenced by cl::sycl::handler::parallel_for().

◆ reduSaveFinalResultToUserMemHelper() [1/3]

void cl::sycl::ext::oneapi::detail::reduSaveFinalResultToUserMemHelper ( std::vector< event > &  ,
std::shared_ptr< detail::queue_impl ,
bool   
)
inline

Definition at line 2518 of file reduction.hpp.

◆ reduSaveFinalResultToUserMemHelper() [2/3]

template<typename Reduction , typename... RestT>
std::enable_if_t<Reduction::is_usm> cl::sycl::ext::oneapi::detail::reduSaveFinalResultToUserMemHelper ( std::vector< event > &  Events,
std::shared_ptr< detail::queue_impl Queue,
bool  IsHost,
Reduction &  ,
RestT...  Rest 
)

◆ reduSaveFinalResultToUserMemHelper() [3/3]

template<typename Reduction , typename... RestT>
std::enable_if_t<!Reduction::is_usm > cl::sycl::ext::oneapi::detail::reduSaveFinalResultToUserMemHelper ( std::vector< event > &  Events,
std::shared_ptr< detail::queue_impl Queue,
bool  IsHost,
Reduction &  Redu,
RestT...  Rest 
)

◆ tokenize()

std::vector<std::string> cl::sycl::ext::oneapi::detail::tokenize ( const std::string &  Filter,
const std::string &  Delim 
)

◆ tuple_select_elements()

template<typename TupleT , std::size_t... Is>
std::tuple< std::tuple_element_t< Is, TupleT >... > cl::sycl::ext::oneapi::detail::tuple_select_elements ( TupleT  Tuple,
std::index_sequence< Is... >   
)

Utility function: for the given tuple.

Parameters
Tuplethe function returns a new tuple consisting of only elements indexed by the index sequence.

Definition at line 2586 of file reduction.hpp.

Referenced by cl::sycl::handler::parallel_for().

◆ writeReduSumsToOutAccs()

template<bool Pow2WG, bool IsOneWG, typename... Reductions, typename... OutAccT, typename... LocalAccT, typename... BOPsT, typename... Ts, size_t... Is>
void cl::sycl::ext::oneapi::detail::writeReduSumsToOutAccs ( size_t  OutAccIndex,
size_t  WGSize,
std::tuple< Reductions... > *  ,
ReduTupleT< OutAccT... >  OutAccs,
ReduTupleT< LocalAccT... >  LocalAccs,
ReduTupleT< BOPsT... >  BOPs,
ReduTupleT< Ts... >  IdentityVals,
std::array< bool, sizeof...(Reductions)>  IsInitializeToIdentity,
std::index_sequence< Is... >   
)

Definition at line 1894 of file reduction.hpp.

References cl::sycl::detail::make_tuple(), and cl::sycl::detail::tie().