tvm
cuda.h
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
24 #ifndef TVM_TIRX_TARGET_BUILTIN_CUDA_H_
25 #define TVM_TIRX_TARGET_BUILTIN_CUDA_H_
26 
27 #include <tvm/tirx/expr.h>
28 #include <tvm/tirx/op.h>
29 
30 namespace tvm {
31 namespace tirx {
32 namespace builtin {
33 
34 // TODO(tvm-team) TensorCore specific intrinsics should be directly registered under
35 // cuda. namespace and used through op.
48 TVM_DLL const Op& tvm_load_matrix_sync();
49 
61 TVM_DLL const Op& tvm_mma_sync();
62 
74 TVM_DLL const Op& tvm_bmma_sync();
75 
86 TVM_DLL const Op& tvm_fill_fragment();
87 
99 TVM_DLL const Op& tvm_store_matrix_sync();
100 
110 TVM_DLL const Op& ptx_mma();
111 
118 TVM_DLL const Op& ptx_mma_legacy();
119 TVM_DLL const Op& ptx_ldmatrix_legacy();
120 TVM_DLL const Op& mma_store_legacy();
121 TVM_DLL const Op& mma_fill_legacy();
122 
127 TVM_DLL const Op& ptx_ldg32();
128 
133 TVM_DLL const Op& ptx_ldg32();
134 
146 TVM_DLL const Op& ptx_mma_sp();
147 
155 TVM_DLL const Op& ptx_ldmatrix();
156 
166 TVM_DLL const Op& ptx_cp_async();
167 
179 TVM_DLL const Op& ptx_cp_async_bulk();
180 
190 
198 TVM_DLL const Op& ptx_cp_async_commit_group();
199 TVM_DLL const Op& ptx_cp_async_wait_group();
200 
208 
214 TVM_DLL const Op& ptx_fence();
215 
221 TVM_DLL const Op& ptx_fence_proxy_async();
222 
228 TVM_DLL const Op& ptx_mbarrier_init();
229 
237 TVM_DLL const Op& ptx_mbarrier_arrive();
238 
249 
255 TVM_DLL const Op& ptx_mbarrier_try_wait();
256 
262 TVM_DLL const Op& ptx_bar_arrive();
263 
269 TVM_DLL const Op& ptx_bar_sync();
270 
282 
294 
306 
318 
327 
334 
341 
348 
354 TVM_DLL const Op& ptx_barrier_cluster_wait();
355 
361 TVM_DLL const Op& ptx_elect_sync();
362 
368 TVM_DLL const Op& ptx_fence_mbarrier_init();
369 
375 TVM_DLL const Op& ptx_fetch_register();
376 
389 TVM_DLL const Op& mma_store();
390 
402 TVM_DLL const Op& mma_fill();
403 
410 
416 TVM_DLL const Op& ptx_wgmma_noop_barrier();
417 
424 TVM_DLL const Op& ptx_wgmma_mma_async_ss();
425 
432 TVM_DLL const Op& ptx_wgmma_mma_async_rs();
433 
439 TVM_DLL const Op& ptx_wgmma_fence();
440 
446 TVM_DLL const Op& ptx_wgmma_commit_group();
447 
453 TVM_DLL const Op& ptx_wgmma_wait_group();
454 
460 TVM_DLL const Op& ptx_stmatrix();
461 
465 TVM_DLL const Op& ptx_setmaxnreg();
466 
472 TVM_DLL const Op& ptx_ld_global_acquire();
473 
479 TVM_DLL const Op& ptx_tcgen05_alloc();
480 
486 TVM_DLL const Op& ptx_tcgen05_dealloc();
487 
494 
501 
508 
514 TVM_DLL const Op& ptx_tcgen05_ld();
515 
521 TVM_DLL const Op& ptx_tcgen05_st();
522 
528 TVM_DLL const Op& ptx_tcgen05_wait_ld();
529 
535 TVM_DLL const Op& ptx_tcgen05_wait_st();
536 
543 
552 
562 
568 TVM_DLL const Op& ptx_tcgen05_mma();
569 
576 
582 TVM_DLL const Op& ptx_tcgen05_mma_sp();
583 
590 
596 TVM_DLL const Op& ptx_tcgen05_commit();
597 
603 TVM_DLL const Op& ptx_tcgen05_cp();
604 
610 TVM_DLL const Op& ptx_tcgen05_shift();
611 
617 TVM_DLL const Op& ptx_map_shared_rank();
618 
624 TVM_DLL const Op& cuda_func_call();
625 
631 TVM_DLL const Op& nvshmem_my_pe();
632 
638 TVM_DLL const Op& nvshmem_n_pes();
639 
645 TVM_DLL const Op& nvshmem_getmem_nbi();
646 
652 TVM_DLL const Op& nvshmem_putmem_nbi();
653 
659 TVM_DLL const Op& nvshmem_getmem_nbi_warp();
660 
666 TVM_DLL const Op& nvshmem_putmem_nbi_warp();
667 
673 TVM_DLL const Op& nvshmem_getmem_nbi_block();
674 
680 TVM_DLL const Op& nvshmem_putmem_nbi_block();
681 
687 TVM_DLL const Op& nvshmem_signal_op();
688 
694 TVM_DLL const Op& nvshmem_wait_until();
695 
701 TVM_DLL const Op& nvshmem_quiet();
702 
709 TVM_DLL const Op& nvshmem_putmem_signal_nbi();
710 
718 
726 
732 TVM_DLL const Op& nvshmem_fence();
733 
739 TVM_DLL const Op& nvshmem_barrier_all();
740 
741 } // namespace builtin
742 } // namespace tirx
743 } // namespace tvm
744 
745 #endif // TVM_TIRX_TARGET_BUILTIN_CUDA_H_
Managed reference class to OpNode.
Definition: op.h:131
const Op & ptx_mbarrier_arrive()
tvm instrinsics to call mbarrier.arrive.shared::cta.b64 or mapa.shared::cluster.u32 mbarrier....
const Op & ptx_mma_legacy()
ptx mma / ldmatrix / mma_store / mma_fill variants that take (ptr_var, offset) pairs (not a folded ac...
const Op & ptx_fetch_register()
tvm instrinsics to fetch PTX pre-defined registers
const Op & ptx_tcgen05_cp()
tvm instrinsics to call tcgen05.cp.cta_group
const Op & ptx_barrier_cluster_wait()
tvm instrinsics to call barrier.cluster.wait.{acquire}{.aligned}
const Op & ptx_tcgen05_fence_after_thread_sync()
tvm instrinsics to call tcgen05.fence::after_thread_sync;
const Op & mma_store_legacy()
const Op & nvshmem_my_pe()
nvshmem intrinsics for nvshmem_my_pe() operation.
const Op & ptx_ld_global_acquire()
tvm intrinsic to call ld.global.acquire.gpu.b32
const Op & ptx_map_shared_rank()
tvm instrinsics to call map_shared_rank
const Op & ptx_tcgen05_wait_ld()
tvm instrinsics to call tcgen05.wait::ld.sync.aligned;
const Op & ptx_cp_async()
tvm intrinsics for ptx async copy from global to shared memory using cp.async
const Op & ptx_elect_sync()
tvm instrinsics to call elect.sync _|p, membermask and return the predicate
const Op & ptx_tcgen05_mma_sp()
tvm intrinsic to call tcgen05.mma.sp.cta_group.kind without block scaling.
const Op & tvm_fill_fragment()
tvm intrinsic for tensor core fill_fragment operators.
const Op & tvm_mma_sync()
tvm intrinsic for tensor core mma_sync operators.
const Op & nvshmem_putmem_nbi()
nvshmem intrinsics for nvshmem_putmem_nbi() operation.
const Op & ptx_stmatrix()
tvm intrinsic to call stmatrix.sync.aligned.m8n8.num{.trans}.shared.b16 [p], r;
const Op & ptx_tcgen05_fence_before_thread_sync()
tvm instrinsics to call tcgen05.fence::before_thread_sync;
const Op & ptx_tcgen05_shift()
tvm instrinsics to call tcgen05.shift.cta_group.down
const Op & ptx_cp_async_bulk_shared_to_cluster()
tvm intrinsics for ptx async bulk copy from shared::cta to shared::cluster
const Op & tvm_bmma_sync()
tvm intrinsic for tensor core bmma_sync operators.
const Op & ptx_wgmma_encode_matrix_descriptor()
tvm intrinsic to encode matrix descriptor for wgmma instructions.
const Op & ptx_tcgen05_mma()
tvm intrinsic to call tcgen05.mma.cta_group.kind without block scaling.
const Op & nvshmem_wait_until()
nvshmem intrinsics for nvshmem_FuncParam{TYPENAME}_wait_until() operation.
const Op & ptx_cp_async_bulk_tensor_shared_to_global()
tvm instrinsics to call cp.async.bulk.tensor.dim.global.shared::cta.tile。bulk_group
const Op & cuda_func_call()
tvm instrinsics to call a CUDA function. Source code is provided as a string.
const Op & ptx_mbarrier_try_wait()
tvm instrinsics to call mbarrier.try_wait.parity repeatedly until it returns true
const Op & ptx_bar_arrive()
tvm instrinsics to call bar.arrive a, b
const Op & ptx_ldmatrix()
tvm intrinsic for ptx load matrix from shared memory.
const Op & ptx_tcgen05_encode_instr_descriptor_block_scaled()
tvm intrinsic to encode instruction descriptor for tcgen05 MMA block scaled.
const Op & ptx_mma_sp()
tvm intrinsic for sparse tensor core ptx instructions.
const Op & ptx_tcgen05_commit()
tvm instrinsics to call tcgen05.commit.cta_group
const Op & nvshmem_getmem_nbi_block()
nvshmem intrinsics for nvshmemx_getmem_nbi_block() operation.
const Op & ptx_cp_async_bulk_wait_group()
tvm instrinsics to call cp.async.bulk.wait_group{.read} N
const Op & ptx_tcgen05_relinquish_alloc_permit()
tvm instrinsics to call tcgen05.relinquish_alloc_permit.cta_group.sync.aligned;
const Op & tvm_store_matrix_sync()
tvm intrinsic for tensor core store operators.
const Op & ptx_ldmatrix_legacy()
const Op & ptx_tcgen05_encode_matrix_descriptor()
tvm intrinsic to encode matrix descriptor for tcgen05 instructions.
const Op & ptx_tcgen05_st()
tvm instrinsics to call tcgen05.st.sync.aligned;
const Op & ptx_wgmma_noop_barrier()
tvm intrinsic to call "" : "+r"(reg) :: "memory"
const Op & nvshmem_quiet()
nvshmem intrinsics for nvshmem_quiet() operation.
const Op & mma_store()
tvm intrinsic for storing the result of PTX MMA into a destination pointer. For example,...
const Op & nvshmem_getmem_nbi()
nvshmem intrinsics for nvshmem_getmem_nbi() operation.
const Op & ptx_wgmma_mma_async_ss()
tvm intrinsic to call wgmma.mma_async.sync.aligned.shape.dtype.atype.btype where both A and B are in ...
const Op & ptx_mbarrier_init()
tvm instrinsics to call mbarrier.init.shared::cta.b64
const Op & ptx_cp_async_bulk_tensor_tile_gather4_global_to_cluster()
tvm intrinsic to call cp.async.bulk.tensor.dim.shared::cluster.global.tile::gather4....
const Op & ptx_fence_proxy_async()
PTX fence.proxy.async instruction: fence.proxy.async[.{space}].
const Op & ptx_tcgen05_ld()
tvm instrinsics to call tcgen05.ld.sync.aligned;
const Op & ptx_cp_async_bulk_tensor_shared_to_global_reduce()
tvm instrinsics to call cp.reduce.async.bulk.tensor.dim.dst.src.redOp
const Op & nvshmem_putmem_nbi_block()
nvshmem intrinsics for nvshmemx_putmem_nbi_block() operation.
const Op & ptx_wgmma_wait_group()
tvm intrinsic to call wgmma.wait_group.sync.aligned;
const Op & ptx_barrier_cluster_arrive()
tvm instrinsics to call barrier.cluster.arrive{.sem}{.aligned}
const Op & ptx_wgmma_fence()
tvm intrinsic to call wgmma.fence.sync.aligned;
const Op & nvshmem_barrier_all()
nvshmem intrinsics for nvshmem_barrier_all() operation.
const Op & nvshmem_fence()
nvshmem intrinsics for nvshmem_fence() operation.
const Op & ptx_cp_async_bulk()
tvm intrinsics for ptx async copy from global to shared memory using cp.async.bulk
const Op & ptx_tcgen05_mma_sp_block_scale()
tvm intrinsic to call tcgen05.mma.sp.cta_group.kind.block_scale{.scale_vec_size}
const Op & nvshmem_putmem_signal_nbi()
nvshmem intrinsics for nvshmemx_putmem_signal_nbi() operation.
const Op & ptx_tcgen05_encode_instr_descriptor()
tvm intrinsic to encode instruction descriptor for tcgen05 MMA.
const Op & ptx_cp_async_bulk_commit_group()
tvm instrinsics to call cp.async.bulk.commit_group
const Op & ptx_cp_async_wait_group()
const Op & nvshmem_putmem_signal_nbi_warp()
nvshmem intrinsics for nvshmemx_putmem_signal_nbi_warp() operation.
const Op & ptx_fence_mbarrier_init()
PTX fence.mbarrier_init.release.cluster instruction.
const Op & nvshmem_putmem_nbi_warp()
nvshmem intrinsics for nvshmemx_putmem_nbi_warp() operation.
const Op & ptx_ldg32()
tvm intrinsic for ptx predicate load with 32-bit data type.
const Op & mma_fill_legacy()
const Op & ptx_tcgen05_mma_block_scale()
tvm intrinsic to call tcgen05.mma.cta_group.kind.block_scale{.scale_vec_size}
const Op & ptx_setmaxnreg()
tvm intrinsic to call setmaxnreg.action.sync.aligned.u32 imm-reg-count
const Op & ptx_mma()
tvm intrinsic for ptx tensor core mma instructions.
const Op & ptx_cp_async_mbarrier_arrive()
tvm intrinsics for ptx async copy barrier using cp.async.mbarrier.arrive
const Op & nvshmem_putmem_signal_nbi_block()
nvshmem intrinsics for nvshmemx_putmem_signal_nbi_block() operation.
const Op & ptx_cp_async_bulk_tensor_global_to_cluster()
tvm instrinsics to call cp.async.bulk.tensor.dim.shared::cluster.global.tile.mbarrier::complete_tx::b...
const Op & nvshmem_signal_op()
nvshmem intrinsics for nvshmemx_signal_op() operation.
const Op & nvshmem_getmem_nbi_warp()
nvshmem intrinsics for nvshmemx_getmem_nbi_warp() operation.
const Op & ptx_cp_async_bulk_tensor_global_to_cluster_prefetch()
tvm instrinsics to call cp.async.bulk.prefetch.tensor.dim.L2.global.tile
const Op & ptx_tcgen05_wait_st()
tvm instrinsics to call tcgen05.wait::st.sync.aligned;
const Op & ptx_tcgen05_alloc()
tvm instrinsics to call tcgen05.alloc.cta_group.sync.aligned;
const Op & mma_fill()
tvm intrinsic for zero-initializing an MMA accumulation register. For example, if each thread in a wa...
const Op & tvm_load_matrix_sync()
tvm intrinsic for tensor core load operators.
const Op & ptx_wgmma_mma_async_rs()
tvm intrinsic to call wgmma.mma_async.sync.aligned.shape.dtype.atype.btype where A is in register and...
const Op & ptx_bar_sync()
tvm instrinsics to call bar.sync a, {b}
const Op & ptx_mbarrier_arrive_expect_tx()
tvm instrinsics to call mbarrier.arrive.expect_tx.shared.b64 or mapa.shared::cluster....
const Op & ptx_cp_async_commit_group()
tvm intrinsics for ptx async copy commit and wait.
const Op & ptx_wgmma_commit_group()
tvm intrinsic to call wgmma.commit_group.sync.aligned;
const Op & nvshmem_n_pes()
nvshmem intrinsics for nvshmem_n_pes() operation.
const Op & ptx_fence()
PTX fence instruction: fence.{sem}.{scope}.
const Op & ptx_tcgen05_dealloc()
tvm instrinsics to call tcgen05.dealloc.cta_group.sync.aligned;
An object that builds and maintains block scope and StmtSref mapping for Dependence analysis.
Definition: analyzer.h:37
TIR expressions.
Common operators defined for Expr.