#include <tvm/tirx/expr.h>
#include <tvm/tirx/op.h>

Include dependency graph for cuda.h:

This graph shows which files directly or indirectly include this file:

Namespaces
	tvm
	An object that builds and maintains block scope and StmtSref mapping for Dependence analysis.

	tvm::tirx

	tvm::tirx::builtin
	Collection of builtin intrinsics as ops.

Functions
const Op &	tvm::tirx::builtin::tvm_load_matrix_sync ()
	tvm intrinsic for tensor core load operators. More...

const Op &	tvm::tirx::builtin::tvm_mma_sync ()
	tvm intrinsic for tensor core mma_sync operators. More...

const Op &	tvm::tirx::builtin::tvm_bmma_sync ()
	tvm intrinsic for tensor core bmma_sync operators. More...

const Op &	tvm::tirx::builtin::tvm_fill_fragment ()
	tvm intrinsic for tensor core fill_fragment operators. More...

const Op &	tvm::tirx::builtin::tvm_store_matrix_sync ()
	tvm intrinsic for tensor core store operators. More...

const Op &	tvm::tirx::builtin::ptx_mma ()
	tvm intrinsic for ptx tensor core mma instructions. More...

const Op &	tvm::tirx::builtin::ptx_mma_legacy ()
	ptx mma / ldmatrix / mma_store / mma_fill variants that take `(ptr_var, offset)` pairs (not a folded access_ptr Call). Codegen emits `ptr + offset` C pointer arithmetic; `lower_warp_memory` rewrites the offset's group component to its thread-local index. More...

const Op &	tvm::tirx::builtin::ptx_ldmatrix_legacy ()

const Op &	tvm::tirx::builtin::mma_store_legacy ()

const Op &	tvm::tirx::builtin::mma_fill_legacy ()

const Op &	tvm::tirx::builtin::ptx_ldg32 ()
	tvm intrinsic for ptx predicate load with 32-bit data type. More...

const Op &	tvm::tirx::builtin::ptx_mma_sp ()
	tvm intrinsic for sparse tensor core ptx instructions. More...

const Op &	tvm::tirx::builtin::ptx_ldmatrix ()
	tvm intrinsic for ptx load matrix from shared memory. More...

const Op &	tvm::tirx::builtin::ptx_cp_async ()
	tvm intrinsics for ptx async copy from global to shared memory using cp.async More...

const Op &	tvm::tirx::builtin::ptx_cp_async_bulk ()
	tvm intrinsics for ptx async copy from global to shared memory using cp.async.bulk More...

const Op &	tvm::tirx::builtin::ptx_cp_async_bulk_shared_to_cluster ()
	tvm intrinsics for ptx async bulk copy from shared::cta to shared::cluster More...

const Op &	tvm::tirx::builtin::ptx_cp_async_commit_group ()
	tvm intrinsics for ptx async copy commit and wait. More...

const Op &	tvm::tirx::builtin::ptx_cp_async_wait_group ()

const Op &	tvm::tirx::builtin::ptx_cp_async_mbarrier_arrive ()
	tvm intrinsics for ptx async copy barrier using cp.async.mbarrier.arrive More...

const Op &	tvm::tirx::builtin::ptx_fence ()
	PTX fence instruction: fence.{sem}.{scope}. More...

const Op &	tvm::tirx::builtin::ptx_fence_proxy_async ()
	PTX fence.proxy.async instruction: fence.proxy.async[.{space}]. More...

const Op &	tvm::tirx::builtin::ptx_mbarrier_init ()
	tvm instrinsics to call mbarrier.init.shared::cta.b64 More...

const Op &	tvm::tirx::builtin::ptx_mbarrier_arrive ()
	tvm instrinsics to call mbarrier.arrive.shared::cta.b64 or `mapa.shared::cluster.u32` `mbarrier.arrive.shared::cluster.b64` More...

const Op &	tvm::tirx::builtin::ptx_mbarrier_arrive_expect_tx ()
	tvm instrinsics to call mbarrier.arrive.expect_tx.shared.b64 or `mapa.shared::cluster.u32` `mbarrier.arrive.expect_tx.shared.b64` More...

const Op &	tvm::tirx::builtin::ptx_mbarrier_try_wait ()
	tvm instrinsics to call mbarrier.try_wait.parity repeatedly until it returns true More...

const Op &	tvm::tirx::builtin::ptx_bar_arrive ()
	tvm instrinsics to call bar.arrive a, b More...

const Op &	tvm::tirx::builtin::ptx_bar_sync ()
	tvm instrinsics to call bar.sync a, {b} More...

const Op &	tvm::tirx::builtin::ptx_cp_async_bulk_tensor_global_to_cluster ()
	tvm instrinsics to call cp.async.bulk.tensor.dim.shared::cluster.global.tile.mbarrier::complete_tx::bytes More...

const Op &	tvm::tirx::builtin::ptx_cp_async_bulk_tensor_tile_gather4_global_to_cluster ()
	tvm intrinsic to call cp.async.bulk.tensor.dim.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes More...

const Op &	tvm::tirx::builtin::ptx_cp_async_bulk_tensor_shared_to_global ()
	tvm instrinsics to call cp.async.bulk.tensor.dim.global.shared::cta.tile。bulk_group More...

const Op &	tvm::tirx::builtin::ptx_cp_async_bulk_tensor_global_to_cluster_prefetch ()
	tvm instrinsics to call cp.async.bulk.prefetch.tensor.dim.L2.global.tile More...

const Op &	tvm::tirx::builtin::ptx_cp_async_bulk_tensor_shared_to_global_reduce ()
	tvm instrinsics to call cp.reduce.async.bulk.tensor.dim.dst.src.redOp More...

const Op &	tvm::tirx::builtin::ptx_cp_async_bulk_commit_group ()
	tvm instrinsics to call cp.async.bulk.commit_group More...

const Op &	tvm::tirx::builtin::ptx_cp_async_bulk_wait_group ()
	tvm instrinsics to call cp.async.bulk.wait_group{.read} N More...

const Op &	tvm::tirx::builtin::ptx_barrier_cluster_arrive ()
	tvm instrinsics to call barrier.cluster.arrive{.sem}{.aligned} More...

const Op &	tvm::tirx::builtin::ptx_barrier_cluster_wait ()
	tvm instrinsics to call barrier.cluster.wait.{acquire}{.aligned} More...

const Op &	tvm::tirx::builtin::ptx_elect_sync ()
	tvm instrinsics to call elect.sync _\|p, membermask and return the predicate More...

const Op &	tvm::tirx::builtin::ptx_fence_mbarrier_init ()
	PTX fence.mbarrier_init.release.cluster instruction. More...

const Op &	tvm::tirx::builtin::ptx_fetch_register ()
	tvm instrinsics to fetch PTX pre-defined registers More...

const Op &	tvm::tirx::builtin::mma_store ()
	tvm intrinsic for storing the result of PTX MMA into a destination pointer. For example, if each thread in a warp of size 32 has 4 elements from the result of m16xn8xk16 MMA in its registers, this intrinsic can be used to store the result in a 16x8 region in shared or global memory. More...

const Op &	tvm::tirx::builtin::mma_fill ()
	tvm intrinsic for zero-initializing an MMA accumulation register. For example, if each thread in a warp of size 32 has 8 elements from the A matrix in m16xn8xk16 MMA in its registers, this intrinsic can be used to zero-initialize its 4 accumulation registers. More...

const Op &	tvm::tirx::builtin::ptx_wgmma_encode_matrix_descriptor ()
	tvm intrinsic to encode matrix descriptor for wgmma instructions. More...

const Op &	tvm::tirx::builtin::ptx_wgmma_noop_barrier ()
	tvm intrinsic to call "" : "+r"(reg) :: "memory" More...

const Op &	tvm::tirx::builtin::ptx_wgmma_mma_async_ss ()
	tvm intrinsic to call wgmma.mma_async.sync.aligned.shape.dtype.atype.btype where both A and B are in shared memory. More...

const Op &	tvm::tirx::builtin::ptx_wgmma_mma_async_rs ()
	tvm intrinsic to call wgmma.mma_async.sync.aligned.shape.dtype.atype.btype where A is in register and B is in shared memory. More...

const Op &	tvm::tirx::builtin::ptx_wgmma_fence ()
	tvm intrinsic to call wgmma.fence.sync.aligned; More...

const Op &	tvm::tirx::builtin::ptx_wgmma_commit_group ()
	tvm intrinsic to call wgmma.commit_group.sync.aligned; More...

const Op &	tvm::tirx::builtin::ptx_wgmma_wait_group ()
	tvm intrinsic to call wgmma.wait_group.sync.aligned; More...

const Op &	tvm::tirx::builtin::ptx_stmatrix ()
	tvm intrinsic to call stmatrix.sync.aligned.m8n8.num{.trans}.shared.b16 [p], r; More...

const Op &	tvm::tirx::builtin::ptx_setmaxnreg ()
	tvm intrinsic to call setmaxnreg.action.sync.aligned.u32 imm-reg-count More...

const Op &	tvm::tirx::builtin::ptx_ld_global_acquire ()
	tvm intrinsic to call ld.global.acquire.gpu.b32 More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_alloc ()
	tvm instrinsics to call tcgen05.alloc.cta_group.sync.aligned; More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_dealloc ()
	tvm instrinsics to call tcgen05.dealloc.cta_group.sync.aligned; More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_relinquish_alloc_permit ()
	tvm instrinsics to call tcgen05.relinquish_alloc_permit.cta_group.sync.aligned; More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_fence_before_thread_sync ()
	tvm instrinsics to call tcgen05.fence::before_thread_sync; More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_fence_after_thread_sync ()
	tvm instrinsics to call tcgen05.fence::after_thread_sync; More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_ld ()
	tvm instrinsics to call tcgen05.ld.sync.aligned; More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_st ()
	tvm instrinsics to call tcgen05.st.sync.aligned; More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_wait_ld ()
	tvm instrinsics to call tcgen05.wait::ld.sync.aligned; More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_wait_st ()
	tvm instrinsics to call tcgen05.wait::st.sync.aligned; More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_encode_matrix_descriptor ()
	tvm intrinsic to encode matrix descriptor for tcgen05 instructions. More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_encode_instr_descriptor ()
	tvm intrinsic to encode instruction descriptor for tcgen05 MMA. More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_encode_instr_descriptor_block_scaled ()
	tvm intrinsic to encode instruction descriptor for tcgen05 MMA block scaled. More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_mma ()
	tvm intrinsic to call tcgen05.mma.cta_group.kind without block scaling. More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_mma_block_scale ()
	tvm intrinsic to call tcgen05.mma.cta_group.kind.block_scale{.scale_vec_size} More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_mma_sp ()
	tvm intrinsic to call tcgen05.mma.sp.cta_group.kind without block scaling. More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_mma_sp_block_scale ()
	tvm intrinsic to call tcgen05.mma.sp.cta_group.kind.block_scale{.scale_vec_size} More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_commit ()
	tvm instrinsics to call tcgen05.commit.cta_group More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_cp ()
	tvm instrinsics to call tcgen05.cp.cta_group More...

const Op &	tvm::tirx::builtin::ptx_tcgen05_shift ()
	tvm instrinsics to call tcgen05.shift.cta_group.down More...

const Op &	tvm::tirx::builtin::ptx_map_shared_rank ()
	tvm instrinsics to call map_shared_rank More...

const Op &	tvm::tirx::builtin::cuda_func_call ()
	tvm instrinsics to call a CUDA function. Source code is provided as a string. More...

const Op &	tvm::tirx::builtin::nvshmem_my_pe ()
	nvshmem intrinsics for nvshmem_my_pe() operation. More...

const Op &	tvm::tirx::builtin::nvshmem_n_pes ()
	nvshmem intrinsics for nvshmem_n_pes() operation. More...

const Op &	tvm::tirx::builtin::nvshmem_getmem_nbi ()
	nvshmem intrinsics for nvshmem_getmem_nbi() operation. More...

const Op &	tvm::tirx::builtin::nvshmem_putmem_nbi ()
	nvshmem intrinsics for nvshmem_putmem_nbi() operation. More...

const Op &	tvm::tirx::builtin::nvshmem_getmem_nbi_warp ()
	nvshmem intrinsics for nvshmemx_getmem_nbi_warp() operation. More...

const Op &	tvm::tirx::builtin::nvshmem_putmem_nbi_warp ()
	nvshmem intrinsics for nvshmemx_putmem_nbi_warp() operation. More...

const Op &	tvm::tirx::builtin::nvshmem_getmem_nbi_block ()
	nvshmem intrinsics for nvshmemx_getmem_nbi_block() operation. More...

const Op &	tvm::tirx::builtin::nvshmem_putmem_nbi_block ()
	nvshmem intrinsics for nvshmemx_putmem_nbi_block() operation. More...

const Op &	tvm::tirx::builtin::nvshmem_signal_op ()
	nvshmem intrinsics for nvshmemx_signal_op() operation. More...

const Op &	tvm::tirx::builtin::nvshmem_wait_until ()
	nvshmem intrinsics for nvshmem_FuncParam{TYPENAME}_wait_until() operation. More...

const Op &	tvm::tirx::builtin::nvshmem_quiet ()
	nvshmem intrinsics for nvshmem_quiet() operation. More...

const Op &	tvm::tirx::builtin::nvshmem_putmem_signal_nbi ()
	nvshmem intrinsics for nvshmemx_putmem_signal_nbi() operation. More...

const Op &	tvm::tirx::builtin::nvshmem_putmem_signal_nbi_warp ()
	nvshmem intrinsics for nvshmemx_putmem_signal_nbi_warp() operation. More...

const Op &	tvm::tirx::builtin::nvshmem_putmem_signal_nbi_block ()
	nvshmem intrinsics for nvshmemx_putmem_signal_nbi_block() operation. More...

const Op &	tvm::tirx::builtin::nvshmem_fence ()
	nvshmem intrinsics for nvshmem_fence() operation. More...

const Op &	tvm::tirx::builtin::nvshmem_barrier_all ()
	nvshmem intrinsics for nvshmem_barrier_all() operation. More...

Namespaces

Functions