tvm
builtin.h
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
33 #ifndef TVM_TIR_BUILTIN_H_
34 #define TVM_TIR_BUILTIN_H_
35 
36 #include <tvm/ir/op.h>
37 #include <tvm/tirx/expr.h>
38 
39 namespace tvm {
40 namespace tirx {
41 
43 namespace builtin {
47 TVM_DLL const Op& ret();
51 TVM_DLL const Op& thread_return();
55 TVM_DLL const Op& continue_loop();
59 TVM_DLL const Op& break_loop();
63 TVM_DLL const Op& reinterpret();
64 
68 TVM_DLL const Op& likely();
69 
77 TVM_DLL const Op& filter();
78 
87 TVM_DLL const Op& selector();
88 
92 TVM_DLL const Op& bitwise_and();
93 
97 TVM_DLL const Op& bitwise_or();
98 
102 TVM_DLL const Op& bitwise_xor();
103 
107 TVM_DLL const Op& bitwise_not();
108 
112 TVM_DLL const Op& shift_left();
113 
117 TVM_DLL const Op& shift_right();
118 
128 TVM_DLL const Op& large_uint_imm();
129 
136 TVM_DLL const Op& q_multiply_shift();
137 
150 TVM_DLL const Op& address_of();
151 
159 TVM_DLL const Op& if_then_else();
160 
168 TVM_DLL const Op& isnullptr();
169 
173 TVM_DLL const Op& isnan();
174 
178 TVM_DLL const Op& popcount();
179 
187 TVM_DLL const Op& fma();
188 
201 TVM_DLL const Op& call_extern();
202 
215 TVM_DLL const Op& call_pure_extern();
216 
227 TVM_DLL const Op& call_llvm_intrin();
228 
239 TVM_DLL const Op& call_llvm_pure_intrin();
240 
250 TVM_DLL const Op& call_spirv_pure_glsl450();
251 
252 // TODO(tvm-team) revisit the builtins below
253 // some of them can simply become ops with special codegen attr.
257 TVM_DLL const Op& prefetch();
258 
274 TVM_DLL const Op& tvm_access_ptr();
275 
280 TVM_DLL const Op& tvm_static_handle();
281 
286 TVM_DLL const Op& tvm_context_id();
287 
295 TVM_DLL const Op& tvm_tuple();
296 
304 TVM_DLL const Op& handle_add_byte_offset();
305 
314 TVM_DLL const Op& tvm_struct_get();
315 
324 TVM_DLL const Op& tvm_struct_set();
325 
332 TVM_DLL const Op& lookup_param();
333 
341 TVM_DLL const Op& tvm_throw_last_error();
342 
352 TVM_DLL const Op& tvm_stack_alloca();
353 
364 TVM_DLL const Op& tvm_stack_make_shape();
365 
385 TVM_DLL const Op& tvm_stack_make_array();
386 
399 TVM_DLL const Op& tvm_call_packed();
400 
410 TVM_DLL const Op& tvm_call_cpacked();
411 
423 TVM_DLL const Op& tvm_call_trace_packed();
424 
429 TVM_DLL const Op& tvm_thread_invariant();
430 
447 TVM_DLL const Op& tvm_call_packed_lowered();
448 
462 TVM_DLL const Op& tvm_call_cpacked_lowered();
463 
482 
491 TVM_DLL const Op& tvm_storage_sync();
492 
524 TVM_DLL const Op& tvm_warp_shuffle();
525 TVM_DLL const Op& tvm_warp_shuffle_up();
526 TVM_DLL const Op& tvm_warp_shuffle_down();
527 TVM_DLL const Op& tvm_warp_shuffle_xor();
528 TVM_DLL const Op& tvm_warp_activemask();
529 
534 TVM_DLL const Op& tvm_global_barrier_kinit();
535 
547 TVM_DLL const Op& tvm_thread_allreduce();
548 
549 // Metal SimdGroup matrix intrinsics
550 
560 
569 TVM_DLL const Op& simdgroup_load();
570 
579 TVM_DLL const Op& simdgroup_store();
580 
590 
591 // Metal cooperative_tensor intrinsics (MetalPerformancePrimitives / Metal 4)
592 
599 TVM_DLL const Op& cooperative_tensor_fill();
600 
611 TVM_DLL const Op& cooperative_tensor_load();
612 
623 TVM_DLL const Op& cooperative_tensor_store();
624 
635 
636 // TODO(tvm-team) replace the usage of the vector operations by Shuffle.
640 TVM_DLL const Op& vectorhigh();
641 
645 TVM_DLL const Op& vectorlow();
646 
650 TVM_DLL const Op& vectorcombine();
651 
655 TVM_DLL const Op& dp4a();
656 
660 TVM_DLL const Op& atomic_add();
664 TVM_DLL const Op& nd_mem_alloc_with_scope();
665 
669 TVM_DLL const Op& texture2d_store();
670 
674 TVM_DLL const Op& texture2d_load();
675 
687 TVM_DLL const Op& dma_copy();
688 
695 TVM_DLL const Op& dma_wait();
696 
706 TVM_DLL const Op& dma_start_group();
707 
719 TVM_DLL const Op& dma_end_group();
720 
728 TVM_DLL const Op& assume();
729 
736 TVM_DLL const Op& undef();
737 
741 TVM_DLL const Op& start_profile_intrinsic();
742 
746 TVM_DLL const Op& end_profile_intrinsic();
747 
759 TVM_DLL const Op& anylist_getitem();
760 
772 TVM_DLL const Op& anylist_resetitem();
773 
786 
791 
796 TVM_DLL const Op& vscale();
797 
804 TVM_DLL const Op& get_active_lane_mask();
805 
807 TVM_DLL const Op& ignore_loop_partition();
813 TVM_DLL const Op& buffer_offset();
814 
816 enum TVMStructFieldKind : int {
817  // DLTensor fields
830  // TVMValue field
836  // Generic int64 array element access: ((int64_t*)buf)[index]
838 };
839 
843 TVM_DLL const Op& print_buffer();
844 
854 TVM_DLL const Op& timer_init_cuda();
855 
868 TVM_DLL const Op& timer_start_cuda();
869 
881 TVM_DLL const Op& timer_end_cuda();
882 
894 TVM_DLL const Op& timer_finalize_cuda();
895 
899 TVM_DLL const Op& cuda_atomic_add();
900 
904 TVM_DLL const Op& cuda_thread_fence();
905 
912 TVM_DLL const Op& cuda_warp_reduce();
913 
920 TVM_DLL const Op& cuda_cta_reduce();
921 
929 TVM_DLL const Op& cuda_copy_bytes();
930 
934 TVM_DLL const Op& cuda_warp_sync();
935 
939 TVM_DLL const Op& cuda_cta_sync();
940 
944 TVM_DLL const Op& cuda_grid_sync();
945 
950 TVM_DLL const Op& cuda_thread_rank();
951 
955 TVM_DLL const Op& cuda_half2float();
956 
960 TVM_DLL const Op& cuda_bfloat162float();
961 
965 TVM_DLL const Op& cuda_float22half2();
966 
971 
975 TVM_DLL const Op& cuda_runtime_instr_desc();
976 
980 TVM_DLL const Op& cuda_half8tofloat8();
981 
985 TVM_DLL const Op& cuda_float8tohalf8();
986 
990 TVM_DLL const Op& cuda_syncthreads_and();
991 
995 TVM_DLL const Op& cuda_syncthreads_or();
996 
1000 TVM_DLL const Op& cuda_nano_sleep();
1001 
1005 TVM_DLL const Op& cuda_atomic_cas();
1006 
1010 TVM_DLL const Op& cuda_printf();
1011 
1015 TVM_DLL const Op& cuda_ldg();
1016 
1020 TVM_DLL const Op& cuda_get_tmem_addr();
1021 
1025 TVM_DLL const Op& ptx_exp2();
1026 
1030 TVM_DLL const Op& ptx_rcp();
1031 
1035 TVM_DLL const Op& ptx_any_sync();
1036 
1040 TVM_DLL const Op& ptx_reduce3_max_f32();
1041 
1045 TVM_DLL const Op& ptx_reduce3_min_f32();
1046 
1050 TVM_DLL const Op& ptx_add_packed_f32x2();
1051 
1055 TVM_DLL const Op& ptx_sub_packed_f32x2();
1056 
1060 TVM_DLL const Op& ptx_mul_packed_f32x2();
1061 
1065 TVM_DLL const Op& ptx_fma_packed_f32x2();
1066 
1067 } // namespace builtin
1068 } // namespace tirx
1069 } // namespace tvm
1070 #endif // TVM_TIR_BUILTIN_H_
Managed reference class to OpNode.
Definition: op.h:131
Primitive operators(builtin intrinsics) and registry for them.
const Op & q_multiply_shift()
Execute a multiplication between two Q-numbers x and y followed by a right shift s The default roundi...
const Op & tvm_warp_shuffle_up()
const Op & cuda_thread_rank()
tvm intrinsic that returns cooperative_groups::thread_rank() for the enclosing CTA (linear thread ind...
const Op & ptx_reduce3_min_f32()
tvm intrinsic for PTX 3-input min instruction (sm_100a+)
const Op & dma_end_group()
End a group of DMA copies.
const Op & tvm_stack_make_array()
Allocate a Tensor(DLTensor) on stack, return the handle.
const Op & vscale()
Get the target's vscale value. It will be lowered to llvm.vscale intrinsic (https://llvm....
const Op & simdgroup_multiply_accumulate()
tvm intrinsic for multiply and accumulate two matrices in simdgroup
const Op & tvm_call_cpacked_lowered()
Lowered version of call c-packed, the space of value and type codes are explicitly allocated.
const Op & cuda_thread_fence()
tvm intrinsic for cuda thread fence instruction
const Op & cuda_runtime_instr_desc()
tvm intrinsic to modify runtime instruction descriptor
const Op & tvm_access_ptr()
Get head access address with memory access pattern info.
const Op & dp4a()
Dot product of two int8x4 vectors and add an optional accumulator.
const Op & bitwise_xor()
Bitwise xor operator.
const Op & tvm_thread_allreduce()
See pesudo code.
const Op & buffer_offset()
Get the element offset of a buffer given logical indices.
const Op & cuda_atomic_cas()
tvm intrinsic for cuda atomic compare and swap instruction
const Op & cuda_bfloat162float()
tvm intrinsic for cuda bfloat16 to float conversion
const Op & tvm_call_packed_lowered()
Lowered version of call packed, the space of value and type codes are explicitly allocated.
const Op & address_of()
Returns the address of an element in the buffer (see pseudocode below).
const Op & undef()
Returns an initialized but arbitrary value.
const Op & tvm_call_trace_packed()
See pesudo code.
const Op & tvm_call_cpacked()
See pesudo code.
const Op & tvm_context_id()
Return a unique context id, used for hint of workspace separation. Different context id ganrantees no...
const Op & ptx_any_sync()
tvm intrinsic for PTX warp-wide any predicate (__any_sync)
const Op & texture2d_store()
Store to texture 2d memory.
const Op & anylist_setitem_call_cpacked()
Same as anylist_setitem_call_packed but use C calling convention.
const Op & cuda_warp_reduce()
Warp-level butterfly shuffle-XOR reduction.
const Op & timer_end_cuda()
tvm intrinsic for ending the timer for profiling a specific event, and storing profiling result in a ...
const Op & ptx_rcp()
tvm intrinsic for PTX fast reciprocal approximation (rcp.approx.ftz.f32)
const Op & tvm_call_trace_packed_lowered()
Lowered version of trace intrinsic, the space of value and type codes are explicitly allocated....
const Op & timer_start_cuda()
tvm intrinsic for starting the timer for profiling a specific event, and storing profiling result in ...
const Op & simdgroup_load()
tvm intrinsic for loading data from device memory or threadgroup memory to simdgroup.
const Op & cuda_cta_reduce()
CTA-wide reduction via warp shuffle + shared memory.
const Op & cooperative_tensor_multiply_accumulate()
Multiply and accumulate two matrices using cooperative_tensor (MetalPerformancePrimitives matmul2d).
const Op & tvm_tuple()
tvm_tuple is not an actual function and cannot codegen. It is used to represent tuple structure in va...
const Op & cooperative_tensor_store()
Store data from a cooperative_tensor to device or threadgroup memory.
const Op & call_extern()
Call an extern C function with given name and signature from the types of args in the runtime environ...
const Op & texture2d_load()
Load from texture 2d memory.
const Op & cuda_get_tmem_addr()
tvm intrinsic for cuda tmem address calculation
const Op & tvm_warp_activemask()
const Op & tvm_call_packed()
See pesudo code.
const Op & cuda_warp_sync()
tvm intrinsic for cuda warp sync instruction
const Op & lookup_param()
See pseudo code Type lookup_param(ffi::String param_name) { return __tvm_param__param_name; }.
const Op & continue_loop()
Loop continue.
const Op & cuda_syncthreads_or()
tvm intrinsic for cuda syncthreads_or instruction
const Op & cuda_trap_when_assert_failed()
tvm intrinsic to trap when an assertion failed (cond == false)
const Op & prefetch()
same signature as llvm.prefetch
const Op & cuda_float22half2()
tvm intrinsic for a helper converting float2 to half2 with rounding
const Op & timer_init_cuda()
tvm intrinsic for initializing the CUDA profiler, and store profiling result in a buffer.
const Op & tvm_struct_set()
See pesudo code.
const Op & selector()
Analysis-only active-thread selector.
const Op & cuda_grid_sync()
tvm intrinsic for cuda grid-wide sync (cooperative groups)
const Op & thread_return()
Return from a GPU thread.
const Op & get_active_lane_mask()
Calculate a predicate mask given an upper bound (limit) and a current value (base).
const Op & if_then_else()
Same as select, used for unsafe memory access.
const Op & cuda_printf()
tvm intrinsic for cuda printf instruction
const Op & ret()
Return value.
const Op & end_profile_intrinsic()
Profiling intrinsic.
const Op & ptx_exp2()
tvm intrinsic for PTX fast exp2 approximation (ex2.approx.ftz.f32)
const Op & call_spirv_pure_glsl450()
Call an SPIRV pure GLSL450 intrinsic.
const Op & bitwise_not()
Bitwise not operator.
const Op & cooperative_tensor_load()
Load data from device or threadgroup memory into a cooperative_tensor.
const Op & tvm_stack_alloca()
See pesudo code.
const Op & ptx_sub_packed_f32x2()
tvm intrinsic for PTX packed subtract instruction (sm_100a+)
const Op & break_loop()
Loop break.
const Op & bitwise_and()
Bitwise and operator.
const Op & print_buffer()
Print the content of a buffer during runtime.
const Op & assume()
Provide a true statement that can be used for simplifications.
const Op & large_uint_imm()
See pesudo code.
const Op & tvm_thread_invariant()
Mark a condition to be thread invariant. This means the condition must be the same for all threads.
const Op & start_profile_intrinsic()
Profiling intrinsic.
const Op & cuda_cta_sync()
tvm intrinsic for cuda block-wide sync (syncthreads)
const Op & handle_add_byte_offset()
See pesudo code.
const Op & call_llvm_pure_intrin()
Call an LLVM pure intrinsic with a given intrinsic id and signature from the types of args in the run...
const Op & filter()
Thread-set filter predicate. Used as the condition of an IfThenElse to narrow the active thread set A...
const Op & tvm_global_barrier_kinit()
Initialize the global barrier. Call this at beginning of kernel that need global barrier.
const Op & fma()
Fused multiply add.
const Op & cuda_half2float()
tvm intrinsic for cuda half to float conversion
const Op & cuda_half8tofloat8()
tvm intrinsic to convert 8 half2 lanes to 8 float2 lanes
const Op & call_pure_extern()
Call an pure extern C function with given name and signature from the types of args in the runtime en...
const Op & tvm_throw_last_error()
See pesudo code.
const Op & shift_left()
Left shift.
const Op & tvm_warp_shuffle_down()
const Op & dma_start_group()
Start a group of DMA copies.
const Op & cuda_ldg()
tvm intrinsic for cuda ldg instruction
const Op & cuda_copy_bytes()
Typed load/store copy of num_bytes bytes.
const Op & dma_copy()
Initiate a non-blocking DMA copy from source to destination.
const Op & tvm_storage_sync()
See pseudo code.
const Op & call_llvm_intrin()
Call an LLVM intrinsic with a given intrinsic id and signature from the types of args in the runtime ...
const Op & likely()
Marks a condition is likely going to happen.
const Op & make_filled_simdgroup_matrix()
tvm intrinsic for initializing and simdgroup with given value.
const Op & ignore_loop_partition()
Annotate a predicate not be considered as target condition of loop partition.
const Op & vectorhigh()
Get the high level half of the vector.
const Op & anylist_setitem_call_packed()
Set an item into any list by running packed function call.
const Op & cuda_nano_sleep()
tvm intrinsic for cuda nano sleep instruction
const Op & simdgroup_store()
tvm intrinsic for storing data from simdgroup to device memory or threadgroup memory.
const Op & shift_right()
Right shift.
const Op & anylist_getitem()
Get a item from any list and return it.
const Op & ptx_fma_packed_f32x2()
tvm intrinsic for PTX packed FMA instruction (sm_100a+)
const Op & cooperative_tensor_fill()
Fill a cooperative_tensor with a given value.
const Op & timer_finalize_cuda()
tvm intrinsic for finalize the timer for profiling, and storing profiling result in a buffer.
const Op & tvm_static_handle()
Create a function local static handle that iniitalizes to nullptr. can be used to cache function loca...
const Op & vectorcombine()
Concat two vectors.
const Op & tvm_warp_shuffle_xor()
TVMStructFieldKind
The kind of structure field info used in intrinsic.
Definition: builtin.h:816
@ kDLTensorNDim
Definition: builtin.h:822
@ kTVMFFIAnyUnionValue
Definition: builtin.h:834
@ kTVMFFIAnyTypeIndex
Definition: builtin.h:832
@ kDLTensorDeviceId
Definition: builtin.h:827
@ kDLTensorKindBound_
Definition: builtin.h:829
@ kDLTensorTypeLanes
Definition: builtin.h:825
@ kTVMValueKindBound_
Definition: builtin.h:835
@ kTVMValueContent
Definition: builtin.h:831
@ kDLTensorStrides
Definition: builtin.h:821
@ kDLTensorTypeCode
Definition: builtin.h:823
@ kDLTensorAddr
Definition: builtin.h:818
@ kDLTensorByteOffset
Definition: builtin.h:826
@ kDLTensorShape
Definition: builtin.h:820
@ kInt64ArrayElem
Definition: builtin.h:837
@ kDLTensorDeviceType
Definition: builtin.h:828
@ kTVMFFIAnyZeroPadding
Definition: builtin.h:833
@ kDLTensorData
Definition: builtin.h:819
@ kDLTensorTypeBits
Definition: builtin.h:824
const Op & tvm_stack_make_shape()
Allocate a shape tuple on stack, return the handle.
const Op & isnullptr()
See pesudo code.
const Op & ptx_reduce3_max_f32()
tvm intrinsic for PTX 3-input max instruction (sm_100a+)
const Op & dma_wait()
Wait until the number of DMA groups in flight is less than or equal to some maximum.
const Op & atomic_add()
atomic add instruction, corresponding e.g. to atomicAdd in CUDA
const Op & ptx_mul_packed_f32x2()
tvm intrinsic for PTX packed multiply instruction (sm_100a+)
const Op & tvm_struct_get()
See pesudo code.
const Op & reinterpret()
Reinterpret the value using the target type.
const Op & isnan()
Check if value is nan.
const Op & vectorlow()
Get the low-level half of the vector.
const Op & cuda_syncthreads_and()
tvm intrinsic for cuda syncthreads_and instruction
const Op & cuda_float8tohalf8()
tvm intrinsic to convert 8 float2 lanes to 8 half2 lanes with rounding
const Op & cuda_atomic_add()
tvm intrinsic for cuda atomic add instruction
const Op & ptx_add_packed_f32x2()
tvm intrinsic for PTX packed add instruction (sm_100a+)
const Op & tvm_warp_shuffle()
See pseudo code.
const Op & anylist_resetitem()
Reset and clear a item in any list.
const Op & popcount()
Popcount.
const Op & nd_mem_alloc_with_scope()
Create an Nd memory allocation with storage scope.
const Op & bitwise_or()
Bitwise or operator.
An object that builds and maintains block scope and StmtSref mapping for Dependence analysis.
Definition: analyzer.h:37
TIR expressions.