13 #if defined(__SYCL_RT_OS_LINUX)
16 #if defined(__x86_64__) || defined(__i386__)
19 #elif defined(__SYCL_RT_OS_WINDOWS)
21 #elif defined(__SYCL_RT_OS_DARWIN)
22 #if defined(__x86_64__) || defined(__i386__)
28 inline namespace _V1 {
// The helper below is compiled only for x86 / x86-64 targets.
31 #if defined(__x86_64__) || defined(__i386__)
// Runs a CPUID query for leaf `Type` and sub-leaf `SubType` (0 when omitted)
// and stores the four 32-bit results in CPUInfo[0..3]. CPUInfo must therefore
// point to at least four writable uint32_t elements.
// NOTE(review): both intrinsics used here are documented to deliver
// EAX, EBX, ECX, EDX in that order, which would make CPUInfo[2] the ECX
// value - confirm against the toolchain documentation.
33 static void cpuid(uint32_t *CPUInfo, uint32_t Type, uint32_t SubType = 0) {
34 #if defined(__SYCL_RT_OS_LINUX) || defined(__SYCL_RT_OS_DARWIN)
// Linux and Darwin: each result is written directly into the given element.
35 __cpuid_count(Type, SubType, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
36 #elif defined(__SYCL_RT_OS_WINDOWS)
// Windows: __cpuidex receives the output array as an int pointer, hence the
// cast from uint32_t *.
37 __cpuidex(
reinterpret_cast<int *
>(CPUInfo), Type, SubType);
// Message text and error code (PI_ERROR_INVALID_DEVICE) reported for the
// max_clock_frequency query on the host device. The statement that consumes
// these two arguments begins outside this excerpt.
44 "max_clock_frequency parameter is not supported for host device",
45 PI_ERROR_INVALID_DEVICE);
// Cache line size query: x86 targets read the value from CPUID.
50 #if defined(__x86_64__) || defined(__i386__)
// Query extended leaf 0x80000006 and return the low 8 bits of CPUInfo[2].
// NOTE(review): per the CPUID documentation this field is the L2 cache line
// size in bytes - confirm.
52 cpuid(CPUInfo, 0x80000006);
53 return CPUInfo[2] & 0xff;
// Non-x86 Linux builds that define _SC_LEVEL2_DCACHE_LINESIZE ask sysconf
// instead. How lineSize is validated and returned is not visible here.
54 #elif defined(__SYCL_RT_OS_LINUX) && defined(_SC_LEVEL2_DCACHE_LINESIZE)
55 long lineSize = sysconf(_SC_LEVEL2_DCACHE_LINESIZE);
// Cache size query: x86 targets read the value from CPUID.
64 #if defined(__x86_64__) || defined(__i386__)
// Same extended leaf as the line-size query. The upper 16 bits of CPUInfo[2]
// are widened to uint64_t before the multiplication by 1024, so the product
// is computed in 64-bit arithmetic.
// NOTE(review): the field is presumably the L2 size in KiB, making the result
// a byte count - confirm against the CPUID documentation.
66 cpuid(CPUInfo, 0x80000006);
67 return static_cast<uint64_t
>(CPUInfo[2] >> 16) * 1024;
// Non-x86 Linux builds that define _SC_LEVEL2_DCACHE_SIZE ask sysconf instead.
// How cacheSize is validated and returned is not visible here.
68 #elif defined(__SYCL_RT_OS_LINUX) && defined(_SC_LEVEL2_DCACHE_SIZE)
69 long cacheSize = sysconf(_SC_LEVEL2_DCACHE_SIZE);
// Fixed fallback value of 16 * 1024. The condition under which this return is
// reached lies outside this excerpt.
74 return static_cast<uint64_t
>(16 * 1024);
// Native vector width lookup, x86 path. TIndex is converted to an unsigned
// index into one of the 7-entry tables below; the table is picked by the
// widest instruction set that is detected.
// NOTE(review): no bounds check on Index is visible here - confirm that
// TIndex can only take values 0..6.
79 #if defined(__x86_64__) || defined(__i386__)
80 uint32_t Index =
static_cast<uint32_t
>(TIndex);
// One table per instruction set, all indexed the same way.
// NOTE(review): entries presumably follow the element-type order of TIndex
// (its definition is not visible here); the last entry is 0 in every table.
83 static constexpr uint32_t VECTOR_WIDTH_SSE42[] = {16, 8, 4, 2, 4, 2, 0};
85 static constexpr uint32_t VECTOR_WIDTH_AVX[] = {16, 8, 4, 2, 8, 4, 0};
87 static constexpr uint32_t VECTOR_WIDTH_AVX2[] = {32, 16, 8, 4, 8, 4, 0};
89 static constexpr uint32_t VECTOR_WIDTH_AVX512[] = {64, 32, 16, 8, 16, 8, 0};
// Linux and Darwin: use the compiler's runtime CPU feature check, testing from
// the widest instruction set to the narrowest so the first match wins.
91 #if defined(__SYCL_RT_OS_LINUX) || defined(__SYCL_RT_OS_DARWIN)
92 if (__builtin_cpu_supports(
"avx512f"))
93 return VECTOR_WIDTH_AVX512[Index];
94 if (__builtin_cpu_supports(
"avx2"))
95 return VECTOR_WIDTH_AVX2[Index];
96 if (__builtin_cpu_supports(
"avx"))
97 return VECTOR_WIDTH_AVX[Index];
// Windows: test individual feature bits of Info, again widest first.
// NOTE(review): the code that fills Info is not visible in this excerpt;
// presumably it holds CPUID output (bits 16 and 5 of Info[1] for AVX-512F and
// AVX2, bit 28 of Info[2] for AVX) - verify which leaf each check reads.
98 #elif defined(__SYCL_RT_OS_WINDOWS)
107 if (Info[1] & (1 << 16))
108 return VECTOR_WIDTH_AVX512[Index];
111 if (Info[1] & (1 << 5))
112 return VECTOR_WIDTH_AVX2[Index];
117 if (Info[2] & (1 << 28))
118 return VECTOR_WIDTH_AVX[Index];
// Used when none of the wider instruction set checks above returned.
121 return VECTOR_WIDTH_SSE42[Index];
// Native vector width lookup, ARM NEON path: a single fixed table indexed by
// TIndex. Its values are identical to the x86 SSE4.2 table.
// NOTE(review): no bounds check on Index is visible here - confirm that
// TIndex can only take values 0..6.
123 #elif defined(__ARM_NEON)
124 uint32_t Index =
static_cast<uint32_t
>(TIndex);
127 static constexpr uint32_t VECTOR_WIDTH_NEON[] = {16, 8, 4, 2, 4, 2, 0};
128 return VECTOR_WIDTH_NEON[Index];
// Prefetch loop over the byte range [Ptr, Ptr + NumBytes).
// The mask clears the low address bits of a pointer.
// NOTE(review): this is only a correct alignment mask when CacheLineSize is a
// power of two, and a CacheLineSize of 0 would leave the loop below without
// progress - confirm how CacheLineSize is obtained (not visible here).
139 const size_t CacheLineMask = ~(CacheLineSize - 1);
// One-past-the-end address of the requested range.
140 const char *PtrEnd = Ptr + NumBytes;
// Round Ptr down by masking its integer value. The right-hand operand of '&'
// is on a line missing from this excerpt (presumably CacheLineMask).
143 Ptr =
reinterpret_cast<const char *
>(
reinterpret_cast<size_t>(Ptr) &
// Issue one prefetch per CacheLineSize step until PtrEnd is reached.
145 for (; Ptr < PtrEnd; Ptr += CacheLineSize) {
// Only Linux and Windows branches are visible in this excerpt.
146 #if defined(__SYCL_RT_OS_LINUX)
147 __builtin_prefetch(Ptr);
148 #elif defined(__SYCL_RT_OS_WINDOWS)
// _MM_HINT_T0 is the locality hint passed to the SSE prefetch intrinsic.
149 _mm_prefetch(Ptr, _MM_HINT_T0);