__clang_cuda_texture_intrinsics.h
1 /*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  *
9  * This header provides in-header implementations for NVCC's built-in
10  * __nv_tex_surf_handler() which is used by CUDA's texture-related headers. The
11  * built-in is unusual as it's actually a set of function overloads that use the
12  * first string literal argument as one of the overload parameters.
13  */
14 #ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__
15 #define __CLANG_CUDA_TEXTURE_INTRINSICS_H__
16 #ifndef __CUDA__
17 #error "This file is for CUDA compilation only."
18 #endif
19 
20 // __nv_tex_surf_handler() is provided by this header as a macro.
21 #define __nv_tex_surf_handler(__op, __ptr, ...) \
22  ::__cuda_tex::__tex_fetch< \
23  ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash(__op)>>(__ptr, \
24  __VA_ARGS__)
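//
// A minimal usage sketch (illustrative only; the function and variable names
// below are hypothetical -- real call sites live in CUDA's texture headers):
//
//   __device__ float4 __sample_1d(cudaTextureObject_t __obj, float __x) {
//     float4 __ret;
//     // Expands to ::__cuda_tex::__tex_fetch<
//     //     ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash("__tex1D_v2")>>(&__ret, __obj, __x)
//     __nv_tex_surf_handler("__tex1D_v2", &__ret, __obj, __x);
//     return __ret;
//   }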
25 
26 #pragma push_macro("__ASM_OUT")
27 #pragma push_macro("__ASM_OUTP")
28 #pragma push_macro("__Args")
29 #pragma push_macro("__ID")
30 #pragma push_macro("__IDV")
31 #pragma push_macro("__IMPL_2DGATHER")
32 #pragma push_macro("__IMPL_ALIAS")
33 #pragma push_macro("__IMPL_ALIASI")
34 #pragma push_macro("__IMPL_F1")
35 #pragma push_macro("__IMPL_F3")
36 #pragma push_macro("__IMPL_F3N")
37 #pragma push_macro("__IMPL_F3S")
38 #pragma push_macro("__IMPL_S")
39 #pragma push_macro("__IMPL_S3")
40 #pragma push_macro("__IMPL_S3I")
41 #pragma push_macro("__IMPL_S3N")
42 #pragma push_macro("__IMPL_S3NI")
43 #pragma push_macro("__IMPL_S3S")
44 #pragma push_macro("__IMPL_S3SI")
45 #pragma push_macro("__IMPL_SI")
46 #pragma push_macro("__L")
47 #pragma push_macro("__STRIP_PARENS")
48 
49 // Put all functions into an anonymous namespace so they have internal linkage.
50 // The device-only functions here must be internal in order to avoid ODR
51 // violations in case they are used from files compiled with
52 // -fgpu-rdc. E.g. a library and an app using it may be built with different
53 // versions of this header file.
54 namespace {
55 
56 // Put the implementation into its own namespace so we don't pollute the TU.
57 namespace __cuda_tex {
58 
59 // First, we need a perfect hash function and a few constexpr helper functions
60 // for converting a string literal into a numeric value which can be used to
61 // parametrize a template. We cannot use string literals for that as that would
62 // require C++20.
63 //
64 // The hash function was generated with 'gperf' and then manually converted into
65 // its constexpr equivalent.
66 //
67 // NOTE: the perfect hashing scheme comes with an inherent self-test. If the hash
68 // function has a collision for any of the texture operations, the compilation
69 // will fail due to an attempt to redefine a tag with the same value. If the
70 // header compiles, then the hash function is good enough for the job.
71 
72 constexpr int __tex_len(const char *s) {
73  return (s[0] == 0) ? 0
74  : (s[1] == 0) ? 1
75  : (s[2] == 0) ? 2
76  : (s[3] == 0) ? 3
77  : (s[4] == 0) ? 4
78  : (s[5] == 0) ? 5
79  : (s[6] == 0) ? 6
80  : (s[7] == 0) ? 7
81  : (s[8] == 0) ? 8
82  : (s[9] == 0) ? 9
83  : (s[10] == 0) ? 10
84  : (s[11] == 0) ? 11
85  : (s[12] == 0) ? 12
86  : (s[13] == 0) ? 13
87  : (s[14] == 0) ? 14
88  : (s[15] == 0) ? 15
89  : (s[16] == 0) ? 16
90  : (s[17] == 0) ? 17
91  : (s[18] == 0) ? 18
92  : (s[19] == 0) ? 19
93  : (s[20] == 0) ? 20
94  : (s[21] == 0) ? 21
95  : (s[22] == 0) ? 22
96  : (s[23] == 0) ? 23
97  : (s[24] == 0) ? 24
98  : (s[25] == 0) ? 25
99  : (s[26] == 0) ? 26
100  : (s[27] == 0) ? 27
101  : (s[28] == 0) ? 28
102  : (s[29] == 0) ? 29
103  : (s[30] == 0) ? 30
104  : (s[31] == 0) ? 31
105  : 32;
106 }
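// For example, __tex_len("__tex1D_v2") evaluates to 10 (illustration only);
// lengths are clamped at 32 characters.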
107 
108 constexpr int __tex_hash_map(int c) {
109  return (c == 49) ? 10
110  : (c == 50) ? 0
111  : (c == 51) ? 100
112  : (c == 52) ? 30
113  : (c == 67) ? 10
114  : (c == 68) ? 0
115  : (c == 69) ? 25
116  : (c == 72) ? 70
117  : (c == 77) ? 0
118  : (c == 96) ? 44
119  : (c == 99) ? 10
120  : (c == 100) ? 5
121  : (c == 101) ? 60
122  : (c == 102) ? 40
123  : (c == 103) ? 70
124  : (c == 104) ? 25
125  : (c == 112) ? 0
126  : (c == 114) ? 45
127  : (c == 117) ? 5
128  : (c == 118) ? 85
129  : (c == 120) ? 20
130  : 225;
131 }
132 
133 constexpr int __tex_op_hash(const char *str) {
134  return __tex_len(str) + __tex_hash_map(str[7] + 1) + __tex_hash_map(str[6]) +
135  __tex_hash_map(str[5]) + __tex_hash_map(str[__tex_len(str) - 1]);
136 }
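//
// Worked example (hand-computed from the tables above, for illustration only):
//   __tex_op_hash("__tex1D_v2")
//     == __tex_len("__tex1D_v2")         // 10
//      + __tex_hash_map('_' + 1 /* 96 */) // 44
//      + __tex_hash_map('D' /* 68 */)     //  0
//      + __tex_hash_map('1' /* 49 */)     // 10
//      + __tex_hash_map('2' /* 50 */)     //  0
//     == 64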
137 
138 // Tag type to identify particular texture operation.
139 template <int N> struct __Tag;
140 #define __ID(__op) __Tag<__tex_op_hash(__op)>
141 // Tags for variants of a particular operation. E.g. tex2Dgather can translate
142 // into 4 different instructions.
143 #define __IDV(__op, __variant) \
144  __Tag<10000 + __tex_op_hash(__op) * 100 + __variant>
145 
146 // Helper classes for figuring out key data types for derived types.
147 // E.g. char2 has __base_t = char, __fetch_t = int4
148 template <class> struct __TypeInfoT;
149 // Type info for the fundamental types.
150 template <> struct __TypeInfoT<float> {
151  using __base_t = float;
152  using __fetch_t = float4;
153 };
154 template <> struct __TypeInfoT<char> {
155  using __base_t = char;
156  using __fetch_t = int4;
157 };
158 template <> struct __TypeInfoT<signed char> {
159  using __base_t = signed char;
160  using __fetch_t = int4;
161 };
162 template <> struct __TypeInfoT<unsigned char> {
163  using __base_t = unsigned char;
164  using __fetch_t = uint4;
165 };
166 template <> struct __TypeInfoT<short> {
167  using __base_t = short;
168  using __fetch_t = int4;
169 };
170 template <> struct __TypeInfoT<unsigned short> {
171  using __base_t = unsigned short;
172  using __fetch_t = uint4;
173 };
174 template <> struct __TypeInfoT<int> {
175  using __base_t = int;
176  using __fetch_t = int4;
177 };
178 template <> struct __TypeInfoT<unsigned int> {
179  using __base_t = unsigned int;
180  using __fetch_t = uint4;
181 };
182 
183 // Derived base/fetch types for N-element vectors.
184 template <class __T> struct __TypeInfoT {
185  using __base_t = decltype(__T::x);
186  using __fetch_t = typename __TypeInfoT<__base_t>::__fetch_t;
187 };
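//
// Illustrative consequences of the mappings above (not part of the original
// header): decltype(float2::x) is float, so the generic __TypeInfoT definition
// above peels off the element type and yields
//   __TypeInfoT<float2>::__base_t  == float
//   __TypeInfoT<float2>::__fetch_t == float4
// and, likewise, __TypeInfoT<int2>::__fetch_t == int4.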
188 
189 // Classes that implement specific texture ops.
190 template <class __op> struct __tex_fetch_v4;
191 
192 // Helper macros to strip parens from a macro argument.
193 #define __Args(...) __VA_ARGS__
194 #define __STRIP_PARENS(__X) __X
195 #define __L(__X) __STRIP_PARENS(__Args __X)
196 
197 // Construct inline assembly output args.
198 // Results are stored in the temp var __r.
199 // The 'is resident' bool is pointed to by __ir (used by the sparse variants below).
200 // Asm args for the return value -- a 4-element vector.
201 #define __ASM_OUT(__t) \
202  ("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w))
203 // .. possibly combined with a predicate.
204 #define __ASM_OUTP(__t) (__L(__ASM_OUT(__t)), "=h"(*__ir))
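//
// For reference, the expansions look roughly like this (illustration only):
//   __ASM_OUT("r")  -> ("=r"(__r.x), "=r"(__r.y), "=r"(__r.z), "=r"(__r.w))
//   __ASM_OUTP("r") -> ("=r"(__r.x), "=r"(__r.y), "=r"(__r.z), "=r"(__r.w), "=h"(*__ir))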
205 
206 // Implements a single variant of a texture fetch instruction.
207 #define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args) \
208  template <> \
209  __device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) { \
210  __rt __r; \
211  asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args)); \
212  return __r; \
213  }
214 
215 // Implements texture fetch instructions for int4/uint4/float4 data types.
216 #define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
217  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
218  __ASM_OUT("r"), __asm_args) \
219  __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
220  __ASM_OUT("r"), __asm_args) \
221  __IMPL_F1(float4, float4, __args, \
222  __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUT("f"), \
223  __asm_args)
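//
// For illustration, the int4 member generated by __IMPL_F3 for the 1D fetch
// instantiated further below expands to roughly this (sketch, not verbatim):
//
//   template <>
//   __device__ int4 __run<int4>(cudaTextureObject_t __obj, float __x) {
//     int4 __r;
//     asm("tex.1d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5}];"
//         : "=r"(__r.x), "=r"(__r.y), "=r"(__r.z), "=r"(__r.w)
//         : "l"(__obj), "f"(__x));
//     return __r;
//   }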
224 // Implements 'sparse' texture fetch instructions for int4/uint4/float4 data
225 // types. Similar to above, but returns a boolean 'isPresent' value in addition
226 // to the texture data.
227 #define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
228  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
229  __ASM_OUTP("r"), __asm_args) \
230  __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
231  __ASM_OUTP("r"), __asm_args) \
232  __IMPL_F1(float4, float4, __args, \
233  __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUTP("f"), \
234  __asm_args)
235 
236 // Similar to F3, but for integer data which is returned as normalized floats.
237 // Only instantiates fetch functions for int4/uint4.
238 #define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
239  __IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
240  __ASM_OUT("r"), __asm_args) \
241  __IMPL_F1(float4, uint4, __args, \
242  __asm_op ".u32." __ctype "\t" __asm_op_args, __ASM_OUT("r"), \
243  __asm_args)
244 
245 // Instantiates __tex_fetch_v4 with regular fetch functions.
246 #define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
247  template <> struct __tex_fetch_v4<__op> { \
248  template <class T> \
249  __device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
250  __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
251  }
252 
253 // Same, but for sparse ops. Only available on sm_60+
254 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
255 #define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, \
256  __asm_args) \
257  template <> struct __tex_fetch_v4<__op> { \
258  template <class T> \
259  __device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
260  __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
261  }
262 #else
263 #define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
264 #endif
265 
266 // Same, but for normalized float ops.
267 #define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args, \
268  __asm_args) \
269  template <> struct __tex_fetch_v4<__op> { \
270  template <class T> \
271  __device__ static float4 __run(cudaTextureObject_t __obj, __L(__args)); \
272  __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
273  }
274 
275 // Regular and normalized float ops share a lot of similarities. This macro
276 // instantiates both variants -- normal for __op and normalized for __opn.
277 #define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
278  __asm_args) \
279  __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args); \
280  __IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
281 
282 // Convenience macros which convert a string literal __op into a __Tag.
283 #define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
284  __IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
285 #define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
286  __IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
287 #define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
288  __IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
289 #define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
290  __asm_args) \
291  __IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \
292  __asm_args)
293 
294 // CUDA headers have some 'legacy' texture operations that duplicate
295 // functionality. So, we just inherit the implementation instead of defining a copy.
296 #define __IMPL_ALIASI(__op, __opn) \
297  template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {}
298 #define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn))
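//
// E.g. __IMPL_ALIAS("__itex1D", "__tex1D_v2") expands to roughly
// (illustration only):
//   template <>
//   struct __tex_fetch_v4<__Tag<__tex_op_hash("__itex1D")>>
//       : __tex_fetch_v4<__Tag<__tex_op_hash("__tex1D_v2")>> {};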
299 
300 // Now we can instantiate everything we need for each specific texture fetch
301 // variant.
302 __IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32",
303  "{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x)));
304 __IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4",
305  "s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x)));
306 __IMPL_ALIAS("__itex1D", "__tex1D_v2");
307 __IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2");
308 
309 __IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2",
310  (float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32",
311  "{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};",
312  ("f"(__x), "f"(__dPdx), "f"(__dPdy)));
313 __IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2");
314 
315 __IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2",
316  (float __x, int __layer), "tex.a1d.v4", "f32",
317  "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x)));
318 __IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2");
319 
320 __IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2",
321  (float __x, int __layer, float __dPdx, float __dPdy),
322  "tex.grad.a1d.v4", "f32",
323  "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};",
324  ("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy)));
325 __IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2");
326 
327 __IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2",
328  (float __x, int __layer, float __level), "tex.level.a1d.v4", "f32",
329  "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
330  ("r"(__layer), "f"(__x), "f"(__level)));
331 __IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2");
332 
333 __IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level),
334  "tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;",
335  ("f"(__x), "f"(__level)));
336 __IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2");
337 
338 // 2D
339 __IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4",
340  "f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));
341 __IMPL_ALIAS("__itex2D", "__tex2D_v2");
342 
343 __IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir),
344  "{.reg .pred %%p0;\n\t"
345  "tex.2d.v4",
346  "f32",
347  "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
348  " selp.u16 %4, 1, 0, %%p0; }",
349  ("f"(__x), "f"(__y)));
350 
351 __IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2",
352  (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy),
353  "tex.grad.2d.v4", "f32",
354  "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};",
355  ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
356  "f"(__dPdy->y)));
357 __IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2");
358 
359 __IMPL_S3S("__itex2DGrad_sparse",
360  (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy,
361  unsigned char *__ir),
362  "{.reg .pred %%p0;\n\t"
363  "tex.grad.2d.v4",
364  "f32",
365  "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
366  "selp.u16 %4, 1, 0, %%p0; }",
367  ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
368  "f"(__dPdy->y)));
369 
370 __IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2",
371  (float __x, float __y, int __layer), "tex.a2d.v4", "f32",
372  "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
373  ("r"(__layer), "f"(__x), "f"(__y)));
374 __IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2");
375 
376 __IMPL_S3S("__itex2DLayered_sparse",
377  (float __x, float __y, int __layer, unsigned char *__ir),
378  "{.reg .pred %%p0;\n\t"
379  "tex.a2d.v4",
380  "f32",
381  "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
382  "selp.u16 %4, 1, 0, %%p0; }",
383  ("r"(__layer), "f"(__x), "f"(__y)));
384 
385 __IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2",
386  (float __x, float __y, int __layer, const float2 *__dPdx,
387  const float2 *__dPdy),
388  "tex.grad.a2d.v4", "f32",
389  "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};",
390  ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
391  "f"(__dPdy->x), "f"(__dPdy->y)));
392 __IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2");
393 
394 __IMPL_S3S(
395  "__itex2DLayeredGrad_sparse",
396  (float __x, float __y, int __layer, const float2 *__dPdx,
397  const float2 *__dPdy, unsigned char *__ir),
398  "{.reg .pred %%p0;\n\t"
399  "tex.grad.a2d.v4",
400  "f32",
401  "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t"
402  "selp.u16 %4, 1, 0, %%p0; }",
403  ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
404  "f"(__dPdy->x), "f"(__dPdy->y)));
405 
406 __IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2",
407  (float __x, float __y, int __layer, float __level), "tex.level.a2d.v4",
408  "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
409  ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
410 __IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2");
411 
412 __IMPL_S3S("__itex2DLayeredLod_sparse",
413  (float __x, float __y, int __layer, float __level,
414  unsigned char *__ir),
415  "{.reg .pred %%p0;\n\t"
416  "tex.level.a2d.v4",
417  "f32",
418  "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
419  "selp.u16 %4, 1, 0, %%p0; }",
420  ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
421 
422 __IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2",
423  (float __x, float __y, float __level), "tex.level.2d.v4", "f32",
424  "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
425  ("f"(__x), "f"(__y), "f"(__level)));
426 __IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2");
427 
428 __IMPL_S3S("__itex2DLod_sparse",
429  (float __x, float __y, float __level, unsigned char *__ir),
430  "{.reg .pred %%p0;\n\t"
431  "tex.level.2d.v4",
432  "f32",
433  "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
434  "selp.u16 %4, 1, 0, %%p0; }",
435  ("f"(__x), "f"(__y), "f"(__level)));
436 
437 // 2D gather is special. Unlike other variants that translate into exactly one
438 // asm instruction, it uses one of the four different instructions selected by
439 // __comp. We implement each instruction variant separately, and dispatch the
440 // right one from the manually implemented 'umbrella' fetch.
441 #define __IMPL_2DGATHER(variant, instr) \
442  __IMPL_SI(__IDV("__tex2Dgather_v2", variant), \
443  __IDV("__tex2Dgather_rmnf_v2", variant), \
444  (float __x, float __y, int __comp), instr, "f32", \
445  "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y))); \
446  __IMPL_ALIASI(__IDV("__itex2Dgather", variant), \
447  __IDV("__tex2Dgather_v2", variant)); \
448  __IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant), \
449  (float __x, float __y, unsigned char *__ir, int __comp), \
450  "{.reg .pred %%p0;\n\t" instr, "f32", \
451  "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t" \
452  "selp.u16 %4, 1, 0, %%p0; }", \
453  ("f"(__x), "f"(__y)));
454 __IMPL_2DGATHER(0, "tld4.r.2d.v4");
455 __IMPL_2DGATHER(1, "tld4.g.2d.v4");
456 __IMPL_2DGATHER(2, "tld4.b.2d.v4");
457 __IMPL_2DGATHER(3, "tld4.a.2d.v4");
458 
459 // Umbrella dispatcher -- calls into the specific 2Dgather variant.
460 template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> {
461  template <class __T>
462  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
463  int __comp) {
464  switch (__comp) {
465  case 0:
466  return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>(
467  __obj, __x, __y, __comp);
468  case 1:
469  return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>(
470  __obj, __x, __y, __comp);
471  case 2:
472  return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>(
473  __obj, __x, __y, __comp);
474  case 3:
475  return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>(
476  __obj, __x, __y, __comp);
477  }
478  }
479 };
480 __IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2");
481 
482 template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> {
483  template <class __T>
484  __device__ static float4 __run(cudaTextureObject_t __obj, float __x,
485  float __y, int __comp) {
486  switch (__comp) {
487  case 0:
488  return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>(
489  __obj, __x, __y, __comp);
490  case 1:
491  return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>(
492  __obj, __x, __y, __comp);
493  case 2:
494  return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>(
495  __obj, __x, __y, __comp);
496  case 3:
497  return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>(
498  __obj, __x, __y, __comp);
499  }
500  }
501 };
502 
503 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
504 template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> {
505  template <class __T>
506  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
507  unsigned char *__ir, int __comp) {
508  switch (__comp) {
509  case 0:
510  return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>(
511  __obj, __x, __y, __ir, __comp);
512  case 1:
513  return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>(
514  __obj, __x, __y, __ir, __comp);
515  case 2:
516  return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>(
517  __obj, __x, __y, __ir, __comp);
518  case 3:
519  return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>(
520  __obj, __x, __y, __ir, __comp);
521  }
522  }
523 };
524 #endif
525 
526 // 3D
527 __IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z),
528  "tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
529  ("f"(__x), "f"(__y), "f"(__z)));
530 __IMPL_ALIAS("__itex3D", "__tex3D_v2");
531 
532 __IMPL_S3S("__itex3D_sparse",
533  (float __x, float __y, float __z, unsigned char *__ir),
534  "{.reg .pred %%p0;\n\t"
535  "tex.3d.v4",
536  "f32",
537  "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
538  "selp.u16 %4, 1, 0, %%p0; }",
539  ("f"(__x), "f"(__y), "f"(__z)));
540 
541 __IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2",
542  (float __x, float __y, float __z, const float4 *__dPdx,
543  const float4 *__dPdy),
544  "tex.grad.3d.v4", "f32",
545  "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
546  "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
547  ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
548  "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
549 __IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2");
550 
551 __IMPL_S3S("__itex3DGrad_sparse",
552  (float __x, float __y, float __z, const float4 *__dPdx,
553  const float4 *__dPdy, unsigned char *__ir),
554  "{.reg .pred %%p0;\n\t"
555  "tex.grad.3d.v4",
556  "f32",
557  "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
558  "{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
559  "selp.u16 %4, 1, 0, %%p0; }",
560  ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
561  "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
562 
563 __IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2",
564  (float __x, float __y, float __z, float __level), "tex.level.3d.v4",
565  "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
566  ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
567 __IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2");
568 
569 __IMPL_S3S("__itex3DLod_sparse",
570  (float __x, float __y, float __z, float __level,
571  unsigned char *__ir),
572  "{.reg .pred %%p0;\n\t"
573  "tex.level.3d.v4",
574  "f32",
575  "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
576  "selp.u16 %4, 1, 0, %%p0; }",
577  ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
578 
579 // Cubemap
580 __IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2",
581  (float __x, float __y, float __z), "tex.cube.v4", "f32",
582  "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
583  ("f"(__x), "f"(__y), "f"(__z)));
584 __IMPL_ALIAS("__itexCubemap", "__texCubemap_v2");
585 
586 __IMPL_S3S("__itexCubemap_sparse",
587  (float __x, float __y, float __z, unsigned char *__ir),
588  "{.reg .pred %%p0;\n\t"
589  "tex.cube.v4",
590  "f32",
591  "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
592  "selp.u16 %4, 1, 0, %%p0; }",
593  ("f"(__x), "f"(__y), "f"(__z)));
594 
595 __IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2",
596  (float __x, float __y, float __z, const float4 *__dPdx,
597  const float4 *__dPdy),
598  "tex.grad.cube.v4", "f32",
599  "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
600  "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
601  ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
602  "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
603 __IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2");
604 
605 __IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2",
606  (float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32",
607  "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];",
608  ("r"(__layer), "f"(__x), "f"(__y), "f"(__z)));
609 __IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2");
610 
611 __IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2",
612  (float __x, float __y, float __z, int __layer, const float4 *__dPdx,
613  const float4 *__dPdy),
614  "tex.grad.acube.v4", "f32",
615  "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
616  "{%9, %10, %11, %11}, {%12, %13, %14, %14};",
617  ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x),
618  "f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y),
619  "f"(__dPdy->z)));
620 __IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2");
621 
622 __IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2",
623  (float __x, float __y, float __z, int __layer, float __level),
624  "tex.level.acube.v4", "f32",
625  "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;",
626  ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level)));
627 __IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2");
628 
629 __IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2",
630  (float __x, float __y, float __z, float __level), "tex.level.cube.v4",
631  "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
632  ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
633 __IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2");
634 
635 // Helper class for extracting a slice of data from V4 fetch results.
636 template <class __DestT, class __SrcT> struct __convert {
637  template <int __NElements = sizeof(__DestT) /
638  sizeof(typename __TypeInfoT<__DestT>::__base_t)>
639  __device__ static __DestT __run(__SrcT __v);
640  template <> __device__ static __DestT __run<1>(__SrcT __v) { return {__v.x}; }
641  template <> __device__ static __DestT __run<2>(__SrcT __v) {
642  return {__v.x, __v.y};
643  }
644  template <> __device__ static __DestT __run<3>(__SrcT __v) {
645  return {__v.x, __v.y, __v.z};
646  }
647  template <> __device__ static __DestT __run<4>(__SrcT __v) {
648  return {__v.x, __v.y, __v.z, __v.w};
649  }
650 };
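//
// E.g. __convert<float2, float4>::__run(__v) picks the __NElements == 2
// specialization and returns {__v.x, __v.y} (illustration only).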
651 
652 // These are the top-level function overloads the __nv_tex_surf_handler macro
653 // expands to. Each overload deals with one of the several ways it
654 // is called by CUDA headers. In the end, each of the overloads does the same
655 // job -- it figures out which `__tex_fetch_v4::__run` variant should be used to
656 // fetch texture data and which `__convert::__run` is needed to convert it into
657 // the appropriate return type.
658 
659 // __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...);
660 // Data type and return type are based on ret.
661 template <class __op, class __T, class... __Args>
662 __device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
663  __Args... __args) {
664  using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
665  *__ptr = __convert<__T, __FetchT>::__run(
666  __tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
667 }
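//
// A hand-traced example (illustrative, with a hypothetical uchar4 result): for
// __nv_tex_surf_handler("__tex2D_v2", &__ret, __obj, __x, __y) with uchar4
// __ret, this overload deduces __FetchT = uint4, calls
// __tex_fetch_v4<__ID("__tex2D_v2")>::__run<uint4>(__obj, __x, __y), and then
// narrows the result with __convert<uchar4, uint4>::__run.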
668 
669 #if CUDA_VERSION < 12000
670 // texture<> objects get magically converted into a texture reference. However,
671 // there's no way to convert them to cudaTextureObject_t at the C++ level. So, we
672 // cheat a bit and use inline assembly to do it. It costs us an extra register
673 // and a move, but that is easy for ptxas to optimize away.
674 template <class __T>
675 __device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) {
676  cudaTextureObject_t __obj;
677  asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle));
678  return __obj;
679 }
680 
681 // __nv_tex_surf_handler ("__tex...", &ret, textureReference, args...);
682 // Data type and return type are based on ret.
683 template <class __op, class __T, class __HandleT, class... __Args>
684 __device__ static void __tex_fetch(__T *__ptr, __HandleT __handle,
685  __Args... __args) {
686  using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
687  *__ptr = __convert<__T, __FetchT>::__run(
688  __tex_fetch_v4<__op>::template __run<__FetchT>(
689  __tex_handle_to_obj(__handle), __args...));
690 }
691 
692 // __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
693 // cudaReadModeNormalizedFloat fetches always return float4.
694 template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
695 __device__ static void
696 __tex_fetch(__DataT *, __RetT *__ptr,
697  texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle,
698  __Args... __args) {
699  using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
700  *__ptr = __convert<__RetT, float4>::__run(
701  __tex_fetch_v4<__op>::template __run<__FetchT>(
702  __tex_handle_to_obj(__handle), __args...));
703 }
704 
705 // __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
706 // For cudaReadModeElementType, the fetch return type is based on type_dummy.
707 template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
708 __device__ static void
709 __tex_fetch(__DataT *, __RetT *__ptr,
710  texture<__DataT, __TexT, cudaReadModeElementType> __handle,
711  __Args... __args) {
712  using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
713  *__ptr = __convert<__RetT, __FetchT>::__run(
714  __tex_fetch_v4<__op>::template __run<__FetchT>(
715  __tex_handle_to_obj(__handle), __args...));
716 }
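//
// A hedged sketch of the legacy texture-reference path (hypothetical caller;
// the op name and types are chosen only for illustration):
//
//   texture<float, cudaTextureType1D, cudaReadModeElementType> __tex_ref;
//   __device__ float __legacy_fetch(float __x) {
//     float __ret, __type_dummy;
//     __nv_tex_surf_handler("__tex1D_v2", &__type_dummy, &__ret, __tex_ref, __x);
//     return __ret;
//   }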
717 #endif // CUDA_VERSION
718 } // namespace __cuda_tex
719 } // namespace
720 #pragma pop_macro("__ASM_OUT")
721 #pragma pop_macro("__ASM_OUTP")
722 #pragma pop_macro("__Args")
723 #pragma pop_macro("__ID")
724 #pragma pop_macro("__IDV")
725 #pragma pop_macro("__IMPL_2DGATHER")
726 #pragma pop_macro("__IMPL_ALIAS")
727 #pragma pop_macro("__IMPL_ALIASI")
728 #pragma pop_macro("__IMPL_F1")
729 #pragma pop_macro("__IMPL_F3")
730 #pragma pop_macro("__IMPL_F3N")
731 #pragma pop_macro("__IMPL_F3S")
732 #pragma pop_macro("__IMPL_S")
733 #pragma pop_macro("__IMPL_S3")
734 #pragma pop_macro("__IMPL_S3I")
735 #pragma pop_macro("__IMPL_S3N")
736 #pragma pop_macro("__IMPL_S3NI")
737 #pragma pop_macro("__IMPL_S3S")
738 #pragma pop_macro("__IMPL_S3SI")
739 #pragma pop_macro("__IMPL_SI")
740 #pragma pop_macro("__L")
741 #pragma pop_macro("__STRIP_PARENS")
742 #endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__