tvm
Namespaces | Functions
cuda.h File Reference
#include <tvm/tirx/expr.h>
#include <tvm/tirx/op.h>
Include dependency graph for cuda.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Namespaces

 tvm
 An object that builds and maintains block scope and StmtSref mapping for Dependence analysis.
 
 tvm::tirx
 
 tvm::tirx::builtin
 Collection of builtin intrinsics as ops.
 

Functions

const Op & tvm::tirx::builtin::tvm_load_matrix_sync ()
 tvm intrinsic for tensor core load operators. More...
 
const Op & tvm::tirx::builtin::tvm_mma_sync ()
 tvm intrinsic for tensor core mma_sync operators. More...
 
const Op & tvm::tirx::builtin::tvm_bmma_sync ()
 tvm intrinsic for tensor core bmma_sync operators. More...
 
const Op & tvm::tirx::builtin::tvm_fill_fragment ()
 tvm intrinsic for tensor core fill_fragment operators. More...
 
const Op & tvm::tirx::builtin::tvm_store_matrix_sync ()
 tvm intrinsic for tensor core store operators. More...
 
const Op & tvm::tirx::builtin::ptx_mma ()
 tvm intrinsic for ptx tensor core mma instructions. More...
 
const Op & tvm::tirx::builtin::ptx_mma_legacy ()
 ptx mma / ldmatrix / mma_store / mma_fill variants that take (ptr_var, offset) pairs (not a folded access_ptr Call). Codegen emits ptr + offset C pointer arithmetic; lower_warp_memory rewrites the offset's group component to its thread-local index. More...
 
const Op & tvm::tirx::builtin::ptx_ldmatrix_legacy ()
 
const Op & tvm::tirx::builtin::mma_store_legacy ()
 
const Op & tvm::tirx::builtin::mma_fill_legacy ()
 
const Op & tvm::tirx::builtin::ptx_ldg32 ()
 tvm intrinsic for ptx predicate load with 32-bit data type. More...
 
const Op & tvm::tirx::builtin::ptx_mma_sp ()
 tvm intrinsic for sparse tensor core ptx instructions. More...
 
const Op & tvm::tirx::builtin::ptx_ldmatrix ()
 tvm intrinsic for ptx load matrix from shared memory. More...
 
const Op & tvm::tirx::builtin::ptx_cp_async ()
 tvm intrinsics for ptx async copy from global to shared memory using cp.async More...
 
const Op & tvm::tirx::builtin::ptx_cp_async_bulk ()
 tvm intrinsics for ptx async copy from global to shared memory using cp.async.bulk More...
 
const Op & tvm::tirx::builtin::ptx_cp_async_bulk_shared_to_cluster ()
 tvm intrinsics for ptx async bulk copy from shared::cta to shared::cluster More...
 
const Op & tvm::tirx::builtin::ptx_cp_async_commit_group ()
 tvm intrinsics for ptx async copy commit and wait. More...
 
const Op & tvm::tirx::builtin::ptx_cp_async_wait_group ()
 
const Op & tvm::tirx::builtin::ptx_cp_async_mbarrier_arrive ()
 tvm intrinsics for ptx async copy barrier using cp.async.mbarrier.arrive More...
 
const Op & tvm::tirx::builtin::ptx_fence ()
 PTX fence instruction: fence.{sem}.{scope}. More...
 
const Op & tvm::tirx::builtin::ptx_fence_proxy_async ()
 PTX fence.proxy.async instruction: fence.proxy.async[.{space}]. More...
 
const Op & tvm::tirx::builtin::ptx_mbarrier_init ()
 tvm instrinsics to call mbarrier.init.shared::cta.b64 More...
 
const Op & tvm::tirx::builtin::ptx_mbarrier_arrive ()
 tvm instrinsics to call mbarrier.arrive.shared::cta.b64 or mapa.shared::cluster.u32 mbarrier.arrive.shared::cluster.b64 More...
 
const Op & tvm::tirx::builtin::ptx_mbarrier_arrive_expect_tx ()
 tvm instrinsics to call mbarrier.arrive.expect_tx.shared.b64 or mapa.shared::cluster.u32 mbarrier.arrive.expect_tx.shared.b64 More...
 
const Op & tvm::tirx::builtin::ptx_mbarrier_try_wait ()
 tvm instrinsics to call mbarrier.try_wait.parity repeatedly until it returns true More...
 
const Op & tvm::tirx::builtin::ptx_bar_arrive ()
 tvm instrinsics to call bar.arrive a, b More...
 
const Op & tvm::tirx::builtin::ptx_bar_sync ()
 tvm instrinsics to call bar.sync a, {b} More...
 
const Op & tvm::tirx::builtin::ptx_cp_async_bulk_tensor_global_to_cluster ()
 tvm instrinsics to call cp.async.bulk.tensor.dim.shared::cluster.global.tile.mbarrier::complete_tx::bytes More...
 
const Op & tvm::tirx::builtin::ptx_cp_async_bulk_tensor_tile_gather4_global_to_cluster ()
 tvm intrinsic to call cp.async.bulk.tensor.dim.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes More...
 
const Op & tvm::tirx::builtin::ptx_cp_async_bulk_tensor_shared_to_global ()
 tvm instrinsics to call cp.async.bulk.tensor.dim.global.shared::cta.tile。bulk_group More...
 
const Op & tvm::tirx::builtin::ptx_cp_async_bulk_tensor_global_to_cluster_prefetch ()
 tvm instrinsics to call cp.async.bulk.prefetch.tensor.dim.L2.global.tile More...
 
const Op & tvm::tirx::builtin::ptx_cp_async_bulk_tensor_shared_to_global_reduce ()
 tvm instrinsics to call cp.reduce.async.bulk.tensor.dim.dst.src.redOp More...
 
const Op & tvm::tirx::builtin::ptx_cp_async_bulk_commit_group ()
 tvm instrinsics to call cp.async.bulk.commit_group More...
 
const Op & tvm::tirx::builtin::ptx_cp_async_bulk_wait_group ()
 tvm instrinsics to call cp.async.bulk.wait_group{.read} N More...
 
const Op & tvm::tirx::builtin::ptx_barrier_cluster_arrive ()
 tvm instrinsics to call barrier.cluster.arrive{.sem}{.aligned} More...
 
const Op & tvm::tirx::builtin::ptx_barrier_cluster_wait ()
 tvm instrinsics to call barrier.cluster.wait.{acquire}{.aligned} More...
 
const Op & tvm::tirx::builtin::ptx_elect_sync ()
 tvm instrinsics to call elect.sync _|p, membermask and return the predicate More...
 
const Op & tvm::tirx::builtin::ptx_fence_mbarrier_init ()
 PTX fence.mbarrier_init.release.cluster instruction. More...
 
const Op & tvm::tirx::builtin::ptx_fetch_register ()
 tvm instrinsics to fetch PTX pre-defined registers More...
 
const Op & tvm::tirx::builtin::mma_store ()
 tvm intrinsic for storing the result of PTX MMA into a destination pointer. For example, if each thread in a warp of size 32 has 4 elements from the result of m16xn8xk16 MMA in its registers, this intrinsic can be used to store the result in a 16x8 region in shared or global memory. More...
 
const Op & tvm::tirx::builtin::mma_fill ()
 tvm intrinsic for zero-initializing an MMA accumulation register. For example, if each thread in a warp of size 32 has 8 elements from the A matrix in m16xn8xk16 MMA in its registers, this intrinsic can be used to zero-initialize its 4 accumulation registers. More...
 
const Op & tvm::tirx::builtin::ptx_wgmma_encode_matrix_descriptor ()
 tvm intrinsic to encode matrix descriptor for wgmma instructions. More...
 
const Op & tvm::tirx::builtin::ptx_wgmma_noop_barrier ()
 tvm intrinsic to call "" : "+r"(reg) :: "memory" More...
 
const Op & tvm::tirx::builtin::ptx_wgmma_mma_async_ss ()
 tvm intrinsic to call wgmma.mma_async.sync.aligned.shape.dtype.atype.btype where both A and B are in shared memory. More...
 
const Op & tvm::tirx::builtin::ptx_wgmma_mma_async_rs ()
 tvm intrinsic to call wgmma.mma_async.sync.aligned.shape.dtype.atype.btype where A is in register and B is in shared memory. More...
 
const Op & tvm::tirx::builtin::ptx_wgmma_fence ()
 tvm intrinsic to call wgmma.fence.sync.aligned; More...
 
const Op & tvm::tirx::builtin::ptx_wgmma_commit_group ()
 tvm intrinsic to call wgmma.commit_group.sync.aligned; More...
 
const Op & tvm::tirx::builtin::ptx_wgmma_wait_group ()
 tvm intrinsic to call wgmma.wait_group.sync.aligned; More...
 
const Op & tvm::tirx::builtin::ptx_stmatrix ()
 tvm intrinsic to call stmatrix.sync.aligned.m8n8.num{.trans}.shared.b16 [p], r; More...
 
const Op & tvm::tirx::builtin::ptx_setmaxnreg ()
 tvm intrinsic to call setmaxnreg.action.sync.aligned.u32 imm-reg-count More...
 
const Op & tvm::tirx::builtin::ptx_ld_global_acquire ()
 tvm intrinsic to call ld.global.acquire.gpu.b32 More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_alloc ()
 tvm instrinsics to call tcgen05.alloc.cta_group.sync.aligned; More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_dealloc ()
 tvm instrinsics to call tcgen05.dealloc.cta_group.sync.aligned; More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_relinquish_alloc_permit ()
 tvm instrinsics to call tcgen05.relinquish_alloc_permit.cta_group.sync.aligned; More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_fence_before_thread_sync ()
 tvm instrinsics to call tcgen05.fence::before_thread_sync; More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_fence_after_thread_sync ()
 tvm instrinsics to call tcgen05.fence::after_thread_sync; More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_ld ()
 tvm instrinsics to call tcgen05.ld.sync.aligned; More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_st ()
 tvm instrinsics to call tcgen05.st.sync.aligned; More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_wait_ld ()
 tvm instrinsics to call tcgen05.wait::ld.sync.aligned; More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_wait_st ()
 tvm instrinsics to call tcgen05.wait::st.sync.aligned; More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_encode_matrix_descriptor ()
 tvm intrinsic to encode matrix descriptor for tcgen05 instructions. More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_encode_instr_descriptor ()
 tvm intrinsic to encode instruction descriptor for tcgen05 MMA. More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_encode_instr_descriptor_block_scaled ()
 tvm intrinsic to encode instruction descriptor for tcgen05 MMA block scaled. More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_mma ()
 tvm intrinsic to call tcgen05.mma.cta_group.kind without block scaling. More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_mma_block_scale ()
 tvm intrinsic to call tcgen05.mma.cta_group.kind.block_scale{.scale_vec_size} More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_mma_sp ()
 tvm intrinsic to call tcgen05.mma.sp.cta_group.kind without block scaling. More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_mma_sp_block_scale ()
 tvm intrinsic to call tcgen05.mma.sp.cta_group.kind.block_scale{.scale_vec_size} More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_commit ()
 tvm instrinsics to call tcgen05.commit.cta_group More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_cp ()
 tvm instrinsics to call tcgen05.cp.cta_group More...
 
const Op & tvm::tirx::builtin::ptx_tcgen05_shift ()
 tvm instrinsics to call tcgen05.shift.cta_group.down More...
 
const Op & tvm::tirx::builtin::ptx_map_shared_rank ()
 tvm instrinsics to call map_shared_rank More...
 
const Op & tvm::tirx::builtin::cuda_func_call ()
 tvm instrinsics to call a CUDA function. Source code is provided as a string. More...
 
const Op & tvm::tirx::builtin::nvshmem_my_pe ()
 nvshmem intrinsics for nvshmem_my_pe() operation. More...
 
const Op & tvm::tirx::builtin::nvshmem_n_pes ()
 nvshmem intrinsics for nvshmem_n_pes() operation. More...
 
const Op & tvm::tirx::builtin::nvshmem_getmem_nbi ()
 nvshmem intrinsics for nvshmem_getmem_nbi() operation. More...
 
const Op & tvm::tirx::builtin::nvshmem_putmem_nbi ()
 nvshmem intrinsics for nvshmem_putmem_nbi() operation. More...
 
const Op & tvm::tirx::builtin::nvshmem_getmem_nbi_warp ()
 nvshmem intrinsics for nvshmemx_getmem_nbi_warp() operation. More...
 
const Op & tvm::tirx::builtin::nvshmem_putmem_nbi_warp ()
 nvshmem intrinsics for nvshmemx_putmem_nbi_warp() operation. More...
 
const Op & tvm::tirx::builtin::nvshmem_getmem_nbi_block ()
 nvshmem intrinsics for nvshmemx_getmem_nbi_block() operation. More...
 
const Op & tvm::tirx::builtin::nvshmem_putmem_nbi_block ()
 nvshmem intrinsics for nvshmemx_putmem_nbi_block() operation. More...
 
const Op & tvm::tirx::builtin::nvshmem_signal_op ()
 nvshmem intrinsics for nvshmemx_signal_op() operation. More...
 
const Op & tvm::tirx::builtin::nvshmem_wait_until ()
 nvshmem intrinsics for nvshmem_FuncParam{TYPENAME}_wait_until() operation. More...
 
const Op & tvm::tirx::builtin::nvshmem_quiet ()
 nvshmem intrinsics for nvshmem_quiet() operation. More...
 
const Op & tvm::tirx::builtin::nvshmem_putmem_signal_nbi ()
 nvshmem intrinsics for nvshmemx_putmem_signal_nbi() operation. More...
 
const Op & tvm::tirx::builtin::nvshmem_putmem_signal_nbi_warp ()
 nvshmem intrinsics for nvshmemx_putmem_signal_nbi_warp() operation. More...
 
const Op & tvm::tirx::builtin::nvshmem_putmem_signal_nbi_block ()
 nvshmem intrinsics for nvshmemx_putmem_signal_nbi_block() operation. More...
 
const Op & tvm::tirx::builtin::nvshmem_fence ()
 nvshmem intrinsics for nvshmem_fence() operation. More...
 
const Op & tvm::tirx::builtin::nvshmem_barrier_all ()
 nvshmem intrinsics for nvshmem_barrier_all() operation. More...