18 #include <type_traits>
21 inline namespace _V1 {
22 namespace ext::oneapi::experimental {
28 sycl::detail::memcpy(&res, &
x[start],
sizeof(uint32_t));
36 std::enable_if_t<std::is_same_v<T, bfloat16>,
bool>
isnan(T x) {
38 return (((XBits & 0x7F80) == 0x7F80) && (XBits & 0x7F)) ? true :
false;
43 for (
size_t i = 0; i < N; i++) {
50 std::enable_if_t<std::is_same_v<T, bfloat16>, T>
fabs(T x) {
51 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \
52 (__SYCL_CUDA_ARCH__ >= 800)
59 x = ((XBits & SignMask) == SignMask)
71 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \
72 (__SYCL_CUDA_ARCH__ >= 800)
73 for (
size_t i = 0; i < N / 2; i++) {
75 sycl::detail::memcpy(&res[i * 2], &partial_res,
sizeof(uint32_t));
84 for (
size_t i = 0; i < N; i++) {
93 std::enable_if_t<std::is_same_v<T, bfloat16>, T>
fmin(T x, T y) {
94 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \
95 (__SYCL_CUDA_ARCH__ >= 800)
110 if (((XBits | YBits) ==
116 return (
x <
y) ?
x :
y;
125 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \
126 (__SYCL_CUDA_ARCH__ >= 800)
127 for (
size_t i = 0; i < N / 2; i++) {
130 sycl::detail::memcpy(&res[i * 2], &partial_res,
sizeof(uint32_t));
141 for (
size_t i = 0; i < N; i++) {
142 res[i] =
fmin(
x[i],
y[i]);
149 template <
typename T>
150 std::enable_if_t<std::is_same_v<T, bfloat16>, T>
fmax(T x, T y) {
151 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \
152 (__SYCL_CUDA_ARCH__ >= 800)
167 if (((XBits | YBits) ==
172 return (
x >
y) ?
x :
y;
181 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \
182 (__SYCL_CUDA_ARCH__ >= 800)
183 for (
size_t i = 0; i < N / 2; i++) {
186 sycl::detail::memcpy(&res[i * 2], &partial_res,
sizeof(uint32_t));
197 for (
size_t i = 0; i < N; i++) {
198 res[i] =
fmax(
x[i],
y[i]);
205 template <
typename T>
206 std::enable_if_t<std::is_same_v<T, bfloat16>, T>
fma(T x, T y, T z) {
207 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \
208 (__SYCL_CUDA_ARCH__ >= 800)
224 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \
225 (__SYCL_CUDA_ARCH__ >= 800)
226 for (
size_t i = 0; i < N / 2; i++) {
230 sycl::detail::memcpy(&res[i * 2], &partial_res,
sizeof(uint32_t));
243 for (
size_t i = 0; i < N; i++) {
244 res[i] =
fma(
x[i],
y[i],
z[i]);
251 #define BFLOAT16_MATH_FP32_WRAPPERS(op) \
252 template <typename T> \
253 std::enable_if_t<std::is_same<T, bfloat16>::value, T> op(T x) { \
254 return sycl::ext::oneapi::bfloat16{sycl::op(float{x})}; \
257 #define BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(op) \
258 template <size_t N> \
259 sycl::marray<bfloat16, N> op(sycl::marray<bfloat16, N> x) { \
260 sycl::marray<bfloat16, N> res; \
261 for (size_t i = 0; i < N; i++) { \
// Remove the two wrapper-generating helper macros (#define'd earlier in this
// listing) so that they are no longer defined past this point.
296 #undef BFLOAT16_MATH_FP32_WRAPPERS
297 #undef BFLOAT16_MATH_FP32_WRAPPERS_MARRAY
#define BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(op)
#define BFLOAT16_MATH_FP32_WRAPPERS(op)
Provides a cross-platform math array class template that works on SYCL devices as well as in host C++ code.
__ESIMD_API simd< T, N > rsqrt(simd< T, N > src, Sat sat={})
Square root reciprocal - calculates 1/sqrt(x).
__ESIMD_API simd< T, N > log2(simd< T, N > src, Sat sat={})
Logarithm base 2.
__ESIMD_API simd< T, N > exp2(simd< T, N > src, Sat sat={})
Exponent base 2.
bfloat16 bitsToBfloat16(const Bfloat16StorageT Value)
Bfloat16StorageT bfloat16ToBits(const bfloat16 &Value)
uint16_t Bfloat16StorageT
uint32_t to_uint32_t(sycl::marray< bfloat16, N > x, size_t start)
__DPCPP_SYCL_EXTERNAL _SYCL_EXT_CPLX_INLINE_VISIBILITY std::enable_if_t< is_genfloat< _Tp >::value, complex< _Tp > > sin(const complex< _Tp > &__x)
__DPCPP_SYCL_EXTERNAL _SYCL_EXT_CPLX_INLINE_VISIBILITY std::enable_if_t< is_genfloat< _Tp >::value, complex< _Tp > > cos(const complex< _Tp > &__x)
__DPCPP_SYCL_EXTERNAL _SYCL_EXT_CPLX_INLINE_VISIBILITY std::enable_if_t< is_genfloat< _Tp >::value, complex< _Tp > > sqrt(const complex< _Tp > &__x)
std::enable_if_t< std::is_same_v< T, bfloat16 >, bool > isnan(T x)
std::enable_if_t< std::is_same_v< T, bfloat16 >, T > fabs(T x)
__DPCPP_SYCL_EXTERNAL _SYCL_EXT_CPLX_INLINE_VISIBILITY std::enable_if_t< is_genfloat< _Tp >::value, complex< _Tp > > exp(const complex< _Tp > &__x)
__DPCPP_SYCL_EXTERNAL _SYCL_EXT_CPLX_INLINE_VISIBILITY std::enable_if_t< is_genfloat< _Tp >::value, complex< _Tp > > log(const complex< _Tp > &__x)
std::enable_if_t< std::is_same_v< T, bfloat16 >, T > fmin(T x, T y)
std::enable_if_t< std::is_same_v< T, bfloat16 >, T > fma(T x, T y, T z)
__DPCPP_SYCL_EXTERNAL _SYCL_EXT_CPLX_INLINE_VISIBILITY std::enable_if_t< is_genfloat< _Tp >::value, complex< _Tp > > log10(const complex< _Tp > &__x)
std::enable_if_t< std::is_same_v< T, bfloat16 >, T > fmax(T x, T y)
auto auto autodecltype(x) z