24 #ifndef TVM_TIRX_TARGET_BUILTIN_CUDA_H_
25 #define TVM_TIRX_TARGET_BUILTIN_CUDA_H_
Managed reference class to OpNode.
Definition: op.h:131
const Op & ptx_mbarrier_arrive()
tvm intrinsic to call mbarrier.arrive.shared::cta.b64 or mapa.shared::cluster.u32 mbarrier....
const Op & ptx_mma_legacy()
ptx mma / ldmatrix / mma_store / mma_fill variants that take (ptr_var, offset) pairs (not a folded ac...
const Op & ptx_fetch_register()
tvm intrinsic to fetch PTX pre-defined registers
const Op & ptx_tcgen05_cp()
tvm intrinsic to call tcgen05.cp.cta_group
const Op & ptx_barrier_cluster_wait()
tvm intrinsic to call barrier.cluster.wait.{acquire}{.aligned}
const Op & ptx_tcgen05_fence_after_thread_sync()
tvm intrinsic to call tcgen05.fence::after_thread_sync;
const Op & mma_store_legacy()
const Op & nvshmem_my_pe()
nvshmem intrinsics for nvshmem_my_pe() operation.
const Op & ptx_ld_global_acquire()
tvm intrinsic to call ld.global.acquire.gpu.b32
const Op & ptx_map_shared_rank()
tvm intrinsic to call map_shared_rank
const Op & ptx_tcgen05_wait_ld()
tvm intrinsic to call tcgen05.wait::ld.sync.aligned;
const Op & ptx_cp_async()
tvm intrinsics for ptx async copy from global to shared memory using cp.async
const Op & ptx_elect_sync()
tvm intrinsic to call elect.sync _|p, membermask and return the predicate
const Op & ptx_tcgen05_mma_sp()
tvm intrinsic to call tcgen05.mma.sp.cta_group.kind without block scaling.
const Op & tvm_fill_fragment()
tvm intrinsic for tensor core fill_fragment operators.
const Op & tvm_mma_sync()
tvm intrinsic for tensor core mma_sync operators.
const Op & nvshmem_putmem_nbi()
nvshmem intrinsics for nvshmem_putmem_nbi() operation.
const Op & ptx_stmatrix()
tvm intrinsic to call stmatrix.sync.aligned.m8n8.num{.trans}.shared.b16 [p], r;
const Op & ptx_tcgen05_fence_before_thread_sync()
tvm intrinsic to call tcgen05.fence::before_thread_sync;
const Op & ptx_tcgen05_shift()
tvm intrinsic to call tcgen05.shift.cta_group.down
const Op & ptx_cp_async_bulk_shared_to_cluster()
tvm intrinsics for ptx async bulk copy from shared::cta to shared::cluster
const Op & tvm_bmma_sync()
tvm intrinsic for tensor core bmma_sync operators.
const Op & ptx_wgmma_encode_matrix_descriptor()
tvm intrinsic to encode matrix descriptor for wgmma instructions.
const Op & ptx_tcgen05_mma()
tvm intrinsic to call tcgen05.mma.cta_group.kind without block scaling.
const Op & nvshmem_wait_until()
nvshmem intrinsics for nvshmem_FuncParam{TYPENAME}_wait_until() operation.
const Op & ptx_cp_async_bulk_tensor_shared_to_global()
tvm intrinsic to call cp.async.bulk.tensor.dim.global.shared::cta.tile.bulk_group
const Op & cuda_func_call()
tvm intrinsic to call a CUDA function. Source code is provided as a string.
const Op & ptx_mbarrier_try_wait()
tvm intrinsic to call mbarrier.try_wait.parity repeatedly until it returns true
const Op & ptx_bar_arrive()
tvm intrinsic to call bar.arrive a, b
const Op & ptx_ldmatrix()
tvm intrinsic for ptx load matrix from shared memory.
const Op & ptx_tcgen05_encode_instr_descriptor_block_scaled()
tvm intrinsic to encode instruction descriptor for tcgen05 MMA block scaled.
const Op & ptx_mma_sp()
tvm intrinsic for sparse tensor core ptx instructions.
const Op & ptx_tcgen05_commit()
tvm intrinsic to call tcgen05.commit.cta_group
const Op & nvshmem_getmem_nbi_block()
nvshmem intrinsics for nvshmemx_getmem_nbi_block() operation.
const Op & ptx_cp_async_bulk_wait_group()
tvm intrinsic to call cp.async.bulk.wait_group{.read} N
const Op & ptx_tcgen05_relinquish_alloc_permit()
tvm intrinsic to call tcgen05.relinquish_alloc_permit.cta_group.sync.aligned;
const Op & tvm_store_matrix_sync()
tvm intrinsic for tensor core store operators.
const Op & ptx_ldmatrix_legacy()
const Op & ptx_tcgen05_encode_matrix_descriptor()
tvm intrinsic to encode matrix descriptor for tcgen05 instructions.
const Op & ptx_tcgen05_st()
tvm intrinsic to call tcgen05.st.sync.aligned;
const Op & ptx_wgmma_noop_barrier()
tvm intrinsic to call "" : "+r"(reg) :: "memory"
const Op & nvshmem_quiet()
nvshmem intrinsics for nvshmem_quiet() operation.
const Op & mma_store()
tvm intrinsic for storing the result of PTX MMA into a destination pointer. For example,...
const Op & nvshmem_getmem_nbi()
nvshmem intrinsics for nvshmem_getmem_nbi() operation.
const Op & ptx_wgmma_mma_async_ss()
tvm intrinsic to call wgmma.mma_async.sync.aligned.shape.dtype.atype.btype where both A and B are in ...
const Op & ptx_mbarrier_init()
tvm intrinsic to call mbarrier.init.shared::cta.b64
const Op & ptx_cp_async_bulk_tensor_tile_gather4_global_to_cluster()
tvm intrinsic to call cp.async.bulk.tensor.dim.shared::cluster.global.tile::gather4....
const Op & ptx_fence_proxy_async()
PTX fence.proxy.async instruction: fence.proxy.async[.{space}].
const Op & ptx_tcgen05_ld()
tvm intrinsic to call tcgen05.ld.sync.aligned;
const Op & ptx_cp_async_bulk_tensor_shared_to_global_reduce()
tvm intrinsic to call cp.reduce.async.bulk.tensor.dim.dst.src.redOp
const Op & nvshmem_putmem_nbi_block()
nvshmem intrinsics for nvshmemx_putmem_nbi_block() operation.
const Op & ptx_wgmma_wait_group()
tvm intrinsic to call wgmma.wait_group.sync.aligned;
const Op & ptx_barrier_cluster_arrive()
tvm intrinsic to call barrier.cluster.arrive{.sem}{.aligned}
const Op & ptx_wgmma_fence()
tvm intrinsic to call wgmma.fence.sync.aligned;
const Op & nvshmem_barrier_all()
nvshmem intrinsics for nvshmem_barrier_all() operation.
const Op & nvshmem_fence()
nvshmem intrinsics for nvshmem_fence() operation.
const Op & ptx_cp_async_bulk()
tvm intrinsics for ptx async copy from global to shared memory using cp.async.bulk
const Op & ptx_tcgen05_mma_sp_block_scale()
tvm intrinsic to call tcgen05.mma.sp.cta_group.kind.block_scale{.scale_vec_size}
const Op & nvshmem_putmem_signal_nbi()
nvshmem intrinsics for nvshmemx_putmem_signal_nbi() operation.
const Op & ptx_tcgen05_encode_instr_descriptor()
tvm intrinsic to encode instruction descriptor for tcgen05 MMA.
const Op & ptx_cp_async_bulk_commit_group()
tvm intrinsic to call cp.async.bulk.commit_group
const Op & ptx_cp_async_wait_group()
const Op & nvshmem_putmem_signal_nbi_warp()
nvshmem intrinsics for nvshmemx_putmem_signal_nbi_warp() operation.
const Op & ptx_fence_mbarrier_init()
PTX fence.mbarrier_init.release.cluster instruction.
const Op & nvshmem_putmem_nbi_warp()
nvshmem intrinsics for nvshmemx_putmem_nbi_warp() operation.
const Op & ptx_ldg32()
tvm intrinsic for ptx predicate load with 32-bit data type.
const Op & mma_fill_legacy()
const Op & ptx_tcgen05_mma_block_scale()
tvm intrinsic to call tcgen05.mma.cta_group.kind.block_scale{.scale_vec_size}
const Op & ptx_setmaxnreg()
tvm intrinsic to call setmaxnreg.action.sync.aligned.u32 imm-reg-count
const Op & ptx_mma()
tvm intrinsic for ptx tensor core mma instructions.
const Op & ptx_cp_async_mbarrier_arrive()
tvm intrinsics for ptx async copy barrier using cp.async.mbarrier.arrive
const Op & nvshmem_putmem_signal_nbi_block()
nvshmem intrinsics for nvshmemx_putmem_signal_nbi_block() operation.
const Op & ptx_cp_async_bulk_tensor_global_to_cluster()
tvm intrinsic to call cp.async.bulk.tensor.dim.shared::cluster.global.tile.mbarrier::complete_tx::b...
const Op & nvshmem_signal_op()
nvshmem intrinsics for nvshmemx_signal_op() operation.
const Op & nvshmem_getmem_nbi_warp()
nvshmem intrinsics for nvshmemx_getmem_nbi_warp() operation.
const Op & ptx_cp_async_bulk_tensor_global_to_cluster_prefetch()
tvm intrinsic to call cp.async.bulk.prefetch.tensor.dim.L2.global.tile
const Op & ptx_tcgen05_wait_st()
tvm intrinsic to call tcgen05.wait::st.sync.aligned;
const Op & ptx_tcgen05_alloc()
tvm intrinsic to call tcgen05.alloc.cta_group.sync.aligned;
const Op & mma_fill()
tvm intrinsic for zero-initializing an MMA accumulation register. For example, if each thread in a wa...
const Op & tvm_load_matrix_sync()
tvm intrinsic for tensor core load operators.
const Op & ptx_wgmma_mma_async_rs()
tvm intrinsic to call wgmma.mma_async.sync.aligned.shape.dtype.atype.btype where A is in register and...
const Op & ptx_bar_sync()
tvm intrinsic to call bar.sync a, {b}
const Op & ptx_mbarrier_arrive_expect_tx()
tvm intrinsic to call mbarrier.arrive.expect_tx.shared.b64 or mapa.shared::cluster....
const Op & ptx_cp_async_commit_group()
tvm intrinsics for ptx async copy commit and wait.
const Op & ptx_wgmma_commit_group()
tvm intrinsic to call wgmma.commit_group.sync.aligned;
const Op & nvshmem_n_pes()
nvshmem intrinsics for nvshmem_n_pes() operation.
const Op & ptx_fence()
PTX fence instruction: fence.{sem}.{scope}.
const Op & ptx_tcgen05_dealloc()
tvm intrinsic to call tcgen05.dealloc.cta_group.sync.aligned;
An object that builds and maintains block scope and StmtSref mapping for Dependence analysis.
Definition: analyzer.h:37
Common operators defined for Expr.