13 #if defined(__SYCL_RT_OS_LINUX)
16 #if defined(__x86_64__) || defined(__i386__)
19 #elif defined(__SYCL_RT_OS_WINDOWS)
27 #if defined(__x86_64__) || defined(__i386__)
// File-local helper that issues a CPUID query for leaf `Type` and sub-leaf
// `SubType` (default 0) through the platform's compiler intrinsic.
// CPUInfo must point to at least four uint32_t elements: the Linux branch
// writes CPUInfo[0] through CPUInfo[3] explicitly.
29 static void cpuid(uint32_t *CPUInfo, uint32_t Type, uint32_t SubType = 0) {
30 #if defined(__SYCL_RT_OS_LINUX)
// Linux build: each output is passed as a separate array element.
31 __cpuid_count(Type, SubType, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
32 #elif defined(__SYCL_RT_OS_WINDOWS)
// Windows build: the uint32_t buffer is reinterpreted as int* to match the
// pointer type this call is given here.
33 __cpuidex(
reinterpret_cast<int *
>(CPUInfo), Type, SubType);
// Returns a uint32_t clock-frequency value. On x86 the visible code builds a
// text buffer from three CPUID leaves and parses a number that precedes "Hz".
38 uint32_t PlatformUtil::getMaxClockFrequency() {
// NOTE(review): the two lines below are the trailing arguments of a call whose
// opening line is not part of this excerpt. The message suggests an error is
// reported for the host device; if that call throws unconditionally, the
// parsing code that follows is unreachable -- confirm against the full file.
40 "max_clock_frequency parameter is not supported for host device",
41 PI_ERROR_INVALID_DEVICE);
42 #if defined(__x86_64__) || defined(__i386__)
// Zero-filled buffer sized for three CPUInfo-sized chunks plus one extra byte,
// so the text stays NUL-terminated after the copies below.
44 std::string Buff(
sizeof(CPUInfo) * 3 + 1, 0);
// Walk CPUID leaves 0x80000002..0x80000004 (the processor brand string leaves
// per the vendor CPUID documentation). The cpuid() call that fills CPUInfo on
// each iteration is not visible in this excerpt.
47 for (uint32_t i = 0x80000002; i <= 0x80000004; i++) {
// Append the raw bytes of CPUInfo to Buff at the current Offset.
49 std::copy(
reinterpret_cast<char *
>(CPUInfo),
50 reinterpret_cast<char *
>(CPUInfo) +
sizeof(CPUInfo),
51 Buff.begin() + Offset);
52 Offset +=
sizeof(CPUInfo);
// Locate the last occurrence of "Hz" in the collected text.
54 std::size_t Found = Buff.rfind(
"Hz");
// "Hz" not present; the statement executed in that case is not visible here.
56 if (Found == std::string::npos)
// Drop "Hz" and everything after it.
59 Buff = Buff.substr(0, Found);
// Dispatch on the character immediately before "Hz" (presumably a unit prefix
// such as 'M' or 'G'; the case labels are not visible in this excerpt).
// NOTE(review): if "Hz" were found at index 0, Buff would be empty here and
// Buff.size() - 1 would wrap around -- confirm whether that can happen.
61 switch (Buff[Buff.size() - 1]) {
// Keep only the text from the last space to the end of the buffer.
// NOTE(review): if no space is present, rfind returns npos and substr throws
// std::out_of_range -- confirm the input always contains a space.
69 Buff = Buff.substr(Buff.rfind(
' '), Buff.length());
// Scale Freq (declared and initialised on lines not visible here) by the
// parsed number; std::stod skips the leading whitespace itself.
70 Freq *= std::stod(Buff);
// Returns a cache line size as uint32_t, using CPUID on x86 and sysconf() on
// Linux builds that define _SC_LEVEL2_DCACHE_LINESIZE.
76 uint32_t PlatformUtil::getMemCacheLineSize() {
77 #if defined(__x86_64__) || defined(__i386__)
// Query CPUID leaf 0x80000006 and return the low 8 bits of the third result
// word (CPUInfo[2]). The declaration of CPUInfo is not visible in this excerpt.
79 cpuid(CPUInfo, 0x80000006);
80 return CPUInfo[2] & 0xff;
81 #elif defined(__SYCL_RT_OS_LINUX) && defined(_SC_LEVEL2_DCACHE_LINESIZE)
// Non-x86 Linux: ask the C library for the L2 data cache line size. How the
// result is validated and what is returned otherwise is not visible here.
82 long lineSize = sysconf(_SC_LEVEL2_DCACHE_LINESIZE);
// Returns a cache size as uint64_t, using CPUID on x86 and sysconf() on Linux
// builds that define _SC_LEVEL2_DCACHE_SIZE.
90 uint64_t PlatformUtil::getMemCacheSize() {
91 #if defined(__x86_64__) || defined(__i386__)
// Query CPUID leaf 0x80000006, take the upper 16 bits of the third result word
// and multiply by 1024 (presumably converting a KB count to bytes -- confirm
// against the CPUID documentation). The widening cast happens before the
// multiplication, so the product is computed in 64 bits.
93 cpuid(CPUInfo, 0x80000006);
94 return static_cast<uint64_t
>(CPUInfo[2] >> 16) * 1024;
95 #elif defined(__SYCL_RT_OS_LINUX) && defined(_SC_LEVEL2_DCACHE_SIZE)
// Non-x86 Linux: ask the C library for the L2 data cache size. The handling of
// the result is not visible in this excerpt.
96 long cacheSize = sysconf(_SC_LEVEL2_DCACHE_SIZE);
// Constant 16 KiB result; presumably the fallback when no branch above
// returned -- the intervening lines are not visible, so confirm.
101 return static_cast<uint64_t
>(16 * 1024);
// NOTE(review): the signature of the enclosing function is not part of this
// excerpt. The visible body maps TIndex to a vector width by picking a lookup
// table according to the CPU features detected.
106 #if defined(__x86_64__) || defined(__i386__)
// TIndex is converted to an integer and used directly as the table subscript.
// NOTE(review): no bounds check on Index is visible here; each table has 7
// entries -- confirm TIndex can never exceed 6.
107 uint32_t Index =
static_cast<uint32_t
>(TIndex);
// One table per instruction-set level, all with the same 7-slot layout
// (presumably one slot per element type in TIndex order -- confirm against the
// TypeIndex declaration, which is not visible here).
110 static constexpr uint32_t VECTOR_WIDTH_SSE42[] = {16, 8, 4, 2, 4, 2, 0};
112 static constexpr uint32_t VECTOR_WIDTH_AVX[] = {16, 8, 4, 2, 8, 4, 0};
114 static constexpr uint32_t VECTOR_WIDTH_AVX2[] = {32, 16, 8, 4, 8, 4, 0};
116 static constexpr uint32_t VECTOR_WIDTH_AVX512[] = {64, 32, 16, 8, 16, 8, 0};
118 #if defined(__SYCL_RT_OS_LINUX)
// Linux: probe features with the compiler builtin, widest first, and return
// from the first table whose feature is reported as supported.
119 if (__builtin_cpu_supports(
"avx512f"))
120 return VECTOR_WIDTH_AVX512[Index];
121 if (__builtin_cpu_supports(
"avx2"))
122 return VECTOR_WIDTH_AVX2[Index];
123 if (__builtin_cpu_supports(
"avx"))
124 return VECTOR_WIDTH_AVX[Index];
125 #elif defined(__SYCL_RT_OS_WINDOWS)
// Windows: test individual bits of Info, again widest first. The lines that
// declare and fill Info are not visible in this excerpt (presumably cpuid()
// results, with the bits being the AVX-512F / AVX2 / AVX feature flags --
// confirm against the full file and the CPUID documentation).
134 if (Info[1] & (1 << 16))
135 return VECTOR_WIDTH_AVX512[Index];
138 if (Info[1] & (1 << 5))
139 return VECTOR_WIDTH_AVX2[Index];
144 if (Info[2] & (1 << 28))
145 return VECTOR_WIDTH_AVX[Index];
// Returns from the SSE4.2 table; presumably the fallback when none of the
// checks above matched (the closing #endif is not visible here).
148 return VECTOR_WIDTH_SSE42[Index];
150 #elif defined(__ARM_NEON)
// ARM NEON build: a single table, no runtime feature probing.
151 uint32_t Index =
static_cast<uint32_t
>(TIndex);
154 static constexpr uint32_t VECTOR_WIDTH_NEON[] = {16, 8, 4, 2, 4, 2, 0};
155 return VECTOR_WIDTH_NEON[Index];
// NOTE(review): the signature of the enclosing function is not part of this
// excerpt. The visible body issues one prefetch per cache line over the byte
// range that starts at Ptr and spans NumBytes.
165 const size_t CacheLineSize = PlatformUtil::getMemCacheLineSize();
// Mask that clears the low address bits; this only rounds down to a line
// boundary when CacheLineSize is a power of two (assumed -- confirm).
// NOTE(review): a CacheLineSize of 0 would make the mask 0 and the loop below
// would never advance -- confirm getMemCacheLineSize() cannot return 0.
166 const size_t CacheLineMask = ~(CacheLineSize - 1);
// One-past-the-end of the requested range, computed before Ptr is realigned.
167 const char *PtrEnd = Ptr + NumBytes;
// Round Ptr down by masking its address with CacheLineMask.
170 Ptr =
reinterpret_cast<const char *
>(
171 reinterpret_cast<size_t>(Ptr) & CacheLineMask);
// Step through the range one cache line at a time.
172 for (; Ptr < PtrEnd; Ptr += CacheLineSize) {
173 #if defined(__SYCL_RT_OS_LINUX)
// Linux build: compiler builtin with its default arguments.
174 __builtin_prefetch(Ptr);
175 #elif defined(__SYCL_RT_OS_WINDOWS)
// Windows build: prefetch intrinsic with the _MM_HINT_T0 hint.
176 _mm_prefetch(Ptr, _MM_HINT_T0);