tvm
builtin.h
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
33 #ifndef TVM_TIR_BUILTIN_H_
34 #define TVM_TIR_BUILTIN_H_
35 
36 #include <tvm/ir/op.h>
37 #include <tvm/tir/expr.h>
38 
39 namespace tvm {
40 namespace tir {
41 
43 namespace builtin {
47 TVM_DLL const Op& ret();
51 TVM_DLL const Op& thread_return();
55 TVM_DLL const Op& continue_loop();
59 TVM_DLL const Op& break_loop();
63 TVM_DLL const Op& reinterpret();
64 
68 TVM_DLL const Op& likely();
69 
73 TVM_DLL const Op& bitwise_and();
74 
78 TVM_DLL const Op& bitwise_or();
79 
83 TVM_DLL const Op& bitwise_xor();
84 
88 TVM_DLL const Op& bitwise_not();
89 
93 TVM_DLL const Op& shift_left();
94 
98 TVM_DLL const Op& shift_right();
99 
109 TVM_DLL const Op& large_uint_imm();
110 
117 TVM_DLL const Op& q_multiply_shift();
118 
131 TVM_DLL const Op& address_of();
132 
140 TVM_DLL const Op& if_then_else();
141 
149 TVM_DLL const Op& isnullptr();
150 
154 TVM_DLL const Op& isnan();
155 
159 TVM_DLL const Op& popcount();
160 
168 TVM_DLL const Op& fma();
169 
182 TVM_DLL const Op& call_extern();
183 
196 TVM_DLL const Op& call_pure_extern();
197 
208 TVM_DLL const Op& call_llvm_intrin();
209 
220 TVM_DLL const Op& call_llvm_pure_intrin();
221 
231 TVM_DLL const Op& call_spirv_pure_glsl450();
232 
233 // TODO(tvm-team) revisit the builtins below
234 // some of them can simply become ops with special codegen attr.
238 TVM_DLL const Op& prefetch();
239 
255 TVM_DLL const Op& tvm_access_ptr();
256 
261 TVM_DLL const Op& tvm_static_handle();
262 
267 TVM_DLL const Op& tvm_context_id();
268 
276 TVM_DLL const Op& tvm_tuple();
277 
285 TVM_DLL const Op& handle_add_byte_offset();
286 
295 TVM_DLL const Op& tvm_struct_get();
296 
305 TVM_DLL const Op& tvm_struct_set();
306 
313 TVM_DLL const Op& lookup_param();
314 
322 TVM_DLL const Op& tvm_throw_last_error();
323 
333 TVM_DLL const Op& tvm_stack_alloca();
334 
345 TVM_DLL const Op& tvm_stack_make_shape();
346 
366 TVM_DLL const Op& tvm_stack_make_array();
367 
380 TVM_DLL const Op& tvm_call_packed();
381 
391 TVM_DLL const Op& tvm_call_cpacked();
392 
404 TVM_DLL const Op& tvm_call_trace_packed();
405 
417 TVM_DLL const Op& tvm_thread_context();
418 
423 TVM_DLL const Op& tvm_thread_invariant();
424 
441 TVM_DLL const Op& tvm_call_packed_lowered();
442 
456 TVM_DLL const Op& tvm_call_cpacked_lowered();
457 
476 
485 TVM_DLL const Op& tvm_storage_sync();
486 
518 TVM_DLL const Op& tvm_warp_shuffle();
519 TVM_DLL const Op& tvm_warp_shuffle_up();
520 TVM_DLL const Op& tvm_warp_shuffle_down();
521 TVM_DLL const Op& tvm_warp_activemask();
522 
527 TVM_DLL const Op& tvm_global_barrier_kinit();
528 
540 TVM_DLL const Op& tvm_thread_allreduce();
541 
542 // TODO(tvm-team) TensorCore specific intrinsics should be directly registered under
543 // cuda. namespace and used through op.
556 TVM_DLL const Op& tvm_load_matrix_sync();
557 
569 TVM_DLL const Op& tvm_mma_sync();
570 
582 TVM_DLL const Op& tvm_bmma_sync();
583 
594 TVM_DLL const Op& tvm_fill_fragment();
595 
607 TVM_DLL const Op& tvm_store_matrix_sync();
608 
618 TVM_DLL const Op& ptx_mma();
619 
624 TVM_DLL const Op& ptx_ldg32();
625 
630 TVM_DLL const Op& ptx_ldg32();
631 
643 TVM_DLL const Op& ptx_mma_sp();
644 
652 TVM_DLL const Op& ptx_ldmatrix();
653 
663 TVM_DLL const Op& ptx_cp_async();
664 
675 TVM_DLL const Op& ptx_cp_async_bulk();
676 
684 TVM_DLL const Op& ptx_commit_group();
685 TVM_DLL const Op& ptx_wait_group();
686 
693 TVM_DLL const Op& ptx_cp_async_barrier();
694 
702 
709 TVM_DLL const Op& ptx_arrive_barrier();
710 
718 
725 TVM_DLL const Op& ptx_wait_barrier();
726 
733 TVM_DLL const Op& create_barriers();
734 
747 TVM_DLL const Op& mma_store();
748 
760 TVM_DLL const Op& mma_fill();
761 
762 // Metal SimdGroup matrix intrinsics
763 
773 
782 TVM_DLL const Op& simdgroup_load();
783 
792 TVM_DLL const Op& simdgroup_store();
793 
803 
804 // TODO(tvm-team) replace the usage of the vector operations by Shuffle.
808 TVM_DLL const Op& vectorhigh();
809 
813 TVM_DLL const Op& vectorlow();
814 
818 TVM_DLL const Op& vectorcombine();
819 
823 TVM_DLL const Op& dp4a();
824 
828 TVM_DLL const Op& atomic_add();
832 TVM_DLL const Op& nd_mem_alloc_with_scope();
833 
837 TVM_DLL const Op& texture2d_store();
838 
842 TVM_DLL const Op& texture2d_load();
843 
855 TVM_DLL const Op& dma_copy();
856 
863 TVM_DLL const Op& dma_wait();
864 
874 TVM_DLL const Op& dma_start_group();
875 
887 TVM_DLL const Op& dma_end_group();
888 
896 TVM_DLL const Op& assume();
897 
904 TVM_DLL const Op& undef();
905 
909 TVM_DLL const Op& start_profile_intrinsic();
910 
914 TVM_DLL const Op& end_profile_intrinsic();
915 
927 TVM_DLL const Op& anylist_getitem();
928 
940 TVM_DLL const Op& anylist_resetitem();
941 
954 
959 
964 TVM_DLL const Op& vscale();
965 
972 TVM_DLL const Op& get_active_lane_mask();
973 
975 TVM_DLL const Op& ignore_loop_partition();
976 
978 enum TVMStructFieldKind : int {
979  // array head address
992  // TVMValue field
998 };
999 } // namespace builtin
1000 } // namespace tir
1001 } // namespace tvm
1002 #endif // TVM_TIR_BUILTIN_H_
Managed reference class to OpNode.
Definition: op.h:131
Primitive operators(builtin intrinsics) and registry for them.
const Op & tvm_call_packed_lowered()
Lowered version of call packed, the space of value and type codes are explicitly allocated.
const Op & tvm_thread_invariant()
Mark a condition to be thread invariant. This means the condition must be the same for all threads.
const Op & bitwise_not()
Bitwise not operator.
const Op & assume()
Provide a true statement that can be used for simplifications.
const Op & q_multiply_shift()
Execute a multiplication between two Q-numbers x and y followed by a right shift s The default roundi...
const Op & tvm_mma_sync()
tvm intrinsic for tensor core mma_sync operators.
const Op & bitwise_xor()
Bitwise xor operator.
const Op & bitwise_and()
Bitwise and operator.
const Op & dma_wait()
Wait until the number of DMA groups in flight is less than or equal to some maximum.
const Op & tvm_stack_alloca()
See pesudo code.
const Op & handle_add_byte_offset()
See pesudo code.
const Op & thread_return()
Return from a GPU thread.
const Op & simdgroup_store()
tvm intrinsic for storing data from simdgroup to device memory or threadgroup memory.
const Op & fma()
Fused multiply add.
const Op & tvm_call_cpacked()
See pesudo code.
const Op & ptx_commit_group()
tvm intrinsics for ptx async copy commit and wait.
const Op & popcount()
Popcount.
const Op & tvm_context_id()
Return a unique context id, used for hint of workspace separation. Different context id ganrantees no...
const Op & likely()
Marks a condition is likely going to happen.
const Op & tvm_struct_get()
See pesudo code.
const Op & shift_left()
Left shift.
const Op & tvm_stack_make_array()
Allocate a Tensor(DLTensor) on stack, return the handle.
const Op & ptx_cp_async_barrier()
tvm intrinsics for ptx async copy barrier using cp.async.mbarrier.arrive
const Op & tvm_call_packed()
See pesudo code.
const Op & create_barriers()
tvm intrinsics to create N barriers
const Op & mma_fill()
tvm intrinsic for zero-initializing an MMA accumulation register. For example, if each thread in a wa...
const Op & vectorcombine()
Concat two vectors.
const Op & tvm_call_cpacked_lowered()
Lowered version of call c-packed, the space of value and type codes are explicitly allocated.
const Op & large_uint_imm()
See pesudo code.
const Op & ptx_mma_sp()
tvm intrinsic for sparse tensor core ptx instructions.
const Op & dma_end_group()
End a group of DMA copies.
const Op & anylist_setitem_call_packed()
Set an item into any list by running packed function call.
const Op & tvm_access_ptr()
Get head access address with memory access pattern info.
const Op & vectorhigh()
Get the high level half of the vector.
const Op & break_loop()
Loop break.
const Op & prefetch()
same signature as llvm.prefetch
const Op & ptx_cp_async_bulk()
tvm intrinsics for ptx async copy from global to shared memory using cp.async.bulk
const Op & bitwise_or()
Bitwise or operator.
const Op & dp4a()
Dot product of two int8x4 vectors and add an optional accumulator.
const Op & tvm_fill_fragment()
tvm intrinsic for tensor core fill_fragment operators.
const Op & simdgroup_multiply_accumulate()
tvm intrinsic for multiply and accumulate two matrices in simdgroup
const Op & call_extern()
Call an extern C function with given name and signature from the types of args in the runtime environ...
const Op & tvm_static_handle()
Create a function local static handle that iniitalizes to nullptr. can be used to cache function loca...
const Op & tvm_thread_context()
See pesudo code Mark the content as thread local context, can get optimized by only call the call onc...
const Op & end_profile_intrinsic()
Profiling intrinsic.
const Op & tvm_struct_set()
See pesudo code.
const Op & texture2d_store()
Store to texture 2d memory.
const Op & isnan()
Check if value is nan.
const Op & address_of()
Returns the address of an element in the buffer (see pseudocode below).
const Op & simdgroup_load()
tvm intrinsic for loading data from device memory or threadgroup memory to simdgroup.
const Op & mma_store()
tvm intrinsic for storing the result of PTX MMA into a destination pointer. For example,...
const Op & undef()
Returns an initialized but arbitrary value.
const Op & ptx_ldg32()
tvm intrinsic for ptx predicate load with 32-bit data type.
const Op & reinterpret()
Reinterpret the value using the target type.
const Op & ptx_cp_async()
tvm intrinsics for ptx async copy from global to shared memory using cp.async
const Op & vectorlow()
Get the low-level half of the vector.
const Op & call_llvm_intrin()
Call an LLVM intrinsic with a given intrinsic id and signature from the types of args in the runtime ...
const Op & ptx_wait_barrier()
tvm intrinsics for ptx barrier wait using mbarrier.try_wait
const Op & tvm_bmma_sync()
tvm intrinsic for tensor core bmma_sync operators.
const Op & call_llvm_pure_intrin()
Call an LLVM pure intrinsic with a given intrinsic id and signature from the types of args in the run...
const Op & anylist_setitem_call_cpacked()
Same as anylist_setitem_call_packed but use C calling convention.
const Op & tvm_storage_sync()
See pseudo code.
const Op & tvm_throw_last_error()
See pesudo code.
const Op & tvm_load_matrix_sync()
tvm intrinsic for tensor core load operators.
const Op & nd_mem_alloc_with_scope()
Create an Nd memory allocation with storage scope.
const Op & tvm_thread_allreduce()
See pesudo code.
const Op & isnullptr()
See pesudo code.
const Op & start_profile_intrinsic()
Profiling intrinsic.
const Op & tvm_call_trace_packed_lowered()
Lowered version of trace intrinsic, the space of value and type codes are explicitly allocated....
const Op & anylist_resetitem()
Reset and clear a item in any list.
const Op & vscale()
Get the target's vscale value. It will be lowered to llvm.vscale intrinsic (https://llvm....
const Op & ptx_arrive_barrier_expect_tx()
tvm intrinsic for ptx barrier arrival with expect tx using mbarrier.arrive.expect_tx
const Op & tvm_tuple()
tvm_tuple is not an actual function and cannot codegen. It is used to represent tuple structure in va...
const Op & atomic_add()
atomic add instruction, corresponding e.g. to atomicAdd in CUDA
const Op & anylist_getitem()
Get a item from any list and return it.
const Op & tvm_stack_make_shape()
Allocate a shape tuple on stack, return the handle.
const Op & ptx_arrive_barrier()
tvm intrinsics for ptx barrier arrival using mbarrier.arrive
const Op & call_spirv_pure_glsl450()
Call an SPIRV pure GLSL450 intrinsic.
const Op & tvm_call_trace_packed()
See pesudo code.
const Op & tvm_global_barrier_kinit()
Initialize the global barrier. Call this at beginning of kernel that need global barrier.
const Op & tvm_warp_shuffle()
See pseudo code.
const Op & ptx_init_barrier_thread_count()
tvm intrinsics for ptx barrier initialization of thread count using mbarrier.init
const Op & ignore_loop_partition()
Annotate a predicate not be considered as target condition of loop partition.
const Op & dma_start_group()
Start a group of DMA copies.
const Op & get_active_lane_mask()
Calculate a predicate mask given an upper bound (limit) and a current value (base).
TVMStructFieldKind
The kind of structure field info used in intrinsic.
Definition: builtin.h:978
@ kArrAddr
Definition: builtin.h:980
@ kArrTypeLanes
Definition: builtin.h:987
@ kArrTypeBits
Definition: builtin.h:986
@ kArrKindBound_
Definition: builtin.h:991
@ kArrShape
Definition: builtin.h:982
@ kArrTypeCode
Definition: builtin.h:985
@ kTVMValueContent
Definition: builtin.h:993
@ kTVMFFIAnyTypeIndex
Definition: builtin.h:994
@ kArrData
Definition: builtin.h:981
@ kArrDeviceId
Definition: builtin.h:989
@ kTVMFFIAnyUnionValue
Definition: builtin.h:996
@ kArrStrides
Definition: builtin.h:983
@ kTVMValueKindBound_
Definition: builtin.h:997
@ kTVMFFIAnyZeroPadding
Definition: builtin.h:995
@ kArrDeviceType
Definition: builtin.h:990
@ kArrNDim
Definition: builtin.h:984
@ kArrByteOffset
Definition: builtin.h:988
const Op & continue_loop()
Loop continue.
const Op & ptx_mma()
tvm intrinsic for ptx tensor core mma instructions.
const Op & tvm_warp_shuffle_up()
const Op & call_pure_extern()
Call an pure extern C function with given name and signature from the types of args in the runtime en...
const Op & texture2d_load()
Load from texture 2d memory.
const Op & if_then_else()
Same as select, used for unsafe memory access.
const Op & ret()
Return value.
const Op & ptx_ldmatrix()
tvm intrinsic for ptx load matrix from shared memory.
const Op & ptx_wait_group()
const Op & shift_right()
Right shift.
const Op & lookup_param()
See pseudo code Type lookup_param(ffi::String param_name) { return __tvm_param__param_name; }.
const Op & tvm_warp_shuffle_down()
const Op & make_filled_simdgroup_matrix()
tvm intrinsic for initializing and simdgroup with given value.
const Op & tvm_warp_activemask()
const Op & tvm_store_matrix_sync()
tvm intrinsic for tensor core store operators.
const Op & dma_copy()
Initiate a non-blocking DMA copy from source to destination.
Performance counters for profiling via the PAPI library.
Definition: analyzer.h:37
TIR expressions.