api/doxygen/cuda_8h_source.html

 /*

  * Licensed to the Apache Software Foundation (ASF) under one

  * or more contributor license agreements.  See the NOTICE file

  * distributed with this work for additional information

  * regarding copyright ownership.  The ASF licenses this file

  * to you under the Apache License, Version 2.0 (the

  * "License"); you may not use this file except in compliance

  * with the License.  You may obtain a copy of the License at

  *

  *   http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing,

  * software distributed under the License is distributed on an

  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

  * KIND, either express or implied.  See the License for the

  * specific language governing permissions and limitations

  * under the License.

  */


 #ifndef TVM_TIRX_TARGET_BUILTIN_CUDA_H_

 #define TVM_TIRX_TARGET_BUILTIN_CUDA_H_


 #include <tvm/tirx/expr.h>

 #include <tvm/tirx/op.h>


 namespace tvm {
 namespace tirx {
 /*!
  * \brief Accessors for the CUDA-target builtin ops (TensorCore, PTX and nvshmem intrinsics).
  *
  * Every function returns a reference to an Op. The one-line descriptions below are the
  * brief descriptions published for each symbol; where the published text is truncated,
  * the comment says so instead of guessing the remainder.
  */
 namespace builtin {

 // TODO(tvm-team) TensorCore specific intrinsics should be directly registered under
 //                cuda. namespace and used through op.

 /*! \brief tvm intrinsic for tensor core load operators. */
 TVM_DLL const Op& tvm_load_matrix_sync();

 /*! \brief tvm intrinsic for tensor core mma_sync operators. */
 TVM_DLL const Op& tvm_mma_sync();

 /*! \brief tvm intrinsic for tensor core bmma_sync operators. */
 TVM_DLL const Op& tvm_bmma_sync();

 /*! \brief tvm intrinsic for tensor core fill_fragment operators. */
 TVM_DLL const Op& tvm_fill_fragment();

 /*! \brief tvm intrinsic for tensor core store operators. */
 TVM_DLL const Op& tvm_store_matrix_sync();

 /*! \brief tvm intrinsic for ptx tensor core mma instructions. */
 TVM_DLL const Op& ptx_mma();

 /*!
  * \brief ptx mma / ldmatrix / mma_store / mma_fill variants that take
  *        (ptr_var, offset) pairs.
  * \note The published description of this group is truncated; consult the
  *       implementation for the exact argument layout.
  */
 TVM_DLL const Op& ptx_mma_legacy();
 /*! \brief ldmatrix member of the (ptr_var, offset) variant group; see ptx_mma_legacy. */
 TVM_DLL const Op& ptx_ldmatrix_legacy();
 /*! \brief mma_store member of the (ptr_var, offset) variant group; see ptx_mma_legacy. */
 TVM_DLL const Op& mma_store_legacy();
 /*! \brief mma_fill member of the (ptr_var, offset) variant group; see ptx_mma_legacy. */
 TVM_DLL const Op& mma_fill_legacy();

 /*! \brief tvm intrinsic for ptx predicate load with 32-bit data type. */
 TVM_DLL const Op& ptx_ldg32();

 /*! \brief tvm intrinsic for sparse tensor core ptx instructions. */
 TVM_DLL const Op& ptx_mma_sp();

 /*! \brief tvm intrinsic for ptx load matrix from shared memory. */
 TVM_DLL const Op& ptx_ldmatrix();

 /*! \brief tvm intrinsic for ptx async copy from global to shared memory using cp.async */
 TVM_DLL const Op& ptx_cp_async();

 /*! \brief tvm intrinsic for ptx async copy from global to shared memory using cp.async.bulk */
 TVM_DLL const Op& ptx_cp_async_bulk();

 /*! \brief tvm intrinsic for ptx async bulk copy from shared::cta to shared::cluster */
 TVM_DLL const Op& ptx_cp_async_bulk_shared_to_cluster();

 /*! \brief tvm intrinsics for ptx async copy commit and wait (commit half of the pair). */
 TVM_DLL const Op& ptx_cp_async_commit_group();
 /*! \brief tvm intrinsics for ptx async copy commit and wait (wait half of the pair). */
 TVM_DLL const Op& ptx_cp_async_wait_group();

 /*! \brief tvm intrinsic for ptx async copy barrier using cp.async.mbarrier.arrive */
 TVM_DLL const Op& ptx_cp_async_mbarrier_arrive();

 /*! \brief PTX fence instruction: fence.{sem}.{scope}. */
 TVM_DLL const Op& ptx_fence();

 /*! \brief PTX fence.proxy.async instruction: fence.proxy.async[.{space}]. */
 TVM_DLL const Op& ptx_fence_proxy_async();

 /*! \brief tvm intrinsic to call mbarrier.init.shared::cta.b64 */
 TVM_DLL const Op& ptx_mbarrier_init();

 /*!
  * \brief tvm intrinsic to call mbarrier.arrive.shared::cta.b64 or a
  *        mapa.shared::cluster.u32 based form.
  * \note The published description is truncated after "mapa.shared::cluster.u32 mbarrier".
  */
 TVM_DLL const Op& ptx_mbarrier_arrive();

 /*!
  * \brief tvm intrinsic to call mbarrier.arrive.expect_tx.shared.b64 or a
  *        mapa.shared::cluster based form.
  * \note The published description is truncated after "mapa.shared::cluster".
  */
 TVM_DLL const Op& ptx_mbarrier_arrive_expect_tx();

 /*! \brief tvm intrinsic to call mbarrier.try_wait.parity repeatedly until it returns true */
 TVM_DLL const Op& ptx_mbarrier_try_wait();

 /*! \brief tvm intrinsic to call bar.arrive a, b */
 TVM_DLL const Op& ptx_bar_arrive();

 /*! \brief tvm intrinsic to call bar.sync a, {b} */
 TVM_DLL const Op& ptx_bar_sync();

 /*!
  * \brief tvm intrinsic to call
  *        cp.async.bulk.tensor.dim.shared::cluster.global.tile.mbarrier::complete_tx
  * \note The published instruction string is truncated after "complete_tx::b".
  */
 TVM_DLL const Op& ptx_cp_async_bulk_tensor_global_to_cluster();

 /*!
  * \brief tvm intrinsic to call cp.async.bulk.tensor.dim.shared::cluster.global.tile::gather4
  * \note The published instruction string is truncated after "tile::gather4".
  */
 TVM_DLL const Op& ptx_cp_async_bulk_tensor_tile_gather4_global_to_cluster();

 /*! \brief tvm intrinsic to call cp.async.bulk.tensor.dim.global.shared::cta.tile.bulk_group */
 TVM_DLL const Op& ptx_cp_async_bulk_tensor_shared_to_global();

 /*! \brief tvm intrinsic to call cp.async.bulk.prefetch.tensor.dim.L2.global.tile */
 TVM_DLL const Op& ptx_cp_async_bulk_tensor_global_to_cluster_prefetch();

 /*! \brief tvm intrinsic to call cp.reduce.async.bulk.tensor.dim.dst.src.redOp */
 TVM_DLL const Op& ptx_cp_async_bulk_tensor_shared_to_global_reduce();

 /*! \brief tvm intrinsic to call cp.async.bulk.commit_group */
 TVM_DLL const Op& ptx_cp_async_bulk_commit_group();

 /*! \brief tvm intrinsic to call cp.async.bulk.wait_group{.read} N */
 TVM_DLL const Op& ptx_cp_async_bulk_wait_group();

 /*! \brief tvm intrinsic to call barrier.cluster.arrive{.sem}{.aligned} */
 TVM_DLL const Op& ptx_barrier_cluster_arrive();

 /*! \brief tvm intrinsic to call barrier.cluster.wait.{acquire}{.aligned} */
 TVM_DLL const Op& ptx_barrier_cluster_wait();

 /*! \brief tvm intrinsic to call elect.sync _|p, membermask and return the predicate */
 TVM_DLL const Op& ptx_elect_sync();

 /*! \brief PTX fence.mbarrier_init.release.cluster instruction. */
 TVM_DLL const Op& ptx_fence_mbarrier_init();

 /*! \brief tvm intrinsic to fetch PTX pre-defined registers */
 TVM_DLL const Op& ptx_fetch_register();

 /*!
  * \brief tvm intrinsic for storing the result of PTX MMA into a destination pointer.
  * \note The published description continues with an example that is truncated.
  */
 TVM_DLL const Op& mma_store();

 /*!
  * \brief tvm intrinsic for zero-initializing an MMA accumulation register.
  * \note The published description continues with an example that is truncated.
  */
 TVM_DLL const Op& mma_fill();

 /*! \brief tvm intrinsic to encode matrix descriptor for wgmma instructions. */
 TVM_DLL const Op& ptx_wgmma_encode_matrix_descriptor();

 /*! \brief tvm intrinsic to call "" : "+r"(reg) :: "memory" */
 TVM_DLL const Op& ptx_wgmma_noop_barrier();

 /*!
  * \brief tvm intrinsic to call wgmma.mma_async.sync.aligned.shape.dtype.atype.btype
  *        (the "ss" form).
  * \note The published description ("where both A and B are in ...") is truncated.
  */
 TVM_DLL const Op& ptx_wgmma_mma_async_ss();

 /*!
  * \brief tvm intrinsic to call wgmma.mma_async.sync.aligned.shape.dtype.atype.btype
  *        where A is in register (the "rs" form).
  * \note The published description is truncated after "A is in register and".
  */
 TVM_DLL const Op& ptx_wgmma_mma_async_rs();

 /*! \brief tvm intrinsic to call wgmma.fence.sync.aligned; */
 TVM_DLL const Op& ptx_wgmma_fence();

 /*! \brief tvm intrinsic to call wgmma.commit_group.sync.aligned; */
 TVM_DLL const Op& ptx_wgmma_commit_group();

 /*! \brief tvm intrinsic to call wgmma.wait_group.sync.aligned; */
 TVM_DLL const Op& ptx_wgmma_wait_group();

 /*! \brief tvm intrinsic to call stmatrix.sync.aligned.m8n8.num{.trans}.shared.b16 [p], r; */
 TVM_DLL const Op& ptx_stmatrix();

 /*! \brief tvm intrinsic to call setmaxnreg.action.sync.aligned.u32 imm-reg-count */
 TVM_DLL const Op& ptx_setmaxnreg();

 /*! \brief tvm intrinsic to call ld.global.acquire.gpu.b32 */
 TVM_DLL const Op& ptx_ld_global_acquire();

 /*! \brief tvm intrinsic to call tcgen05.alloc.cta_group.sync.aligned; */
 TVM_DLL const Op& ptx_tcgen05_alloc();

 /*! \brief tvm intrinsic to call tcgen05.dealloc.cta_group.sync.aligned; */
 TVM_DLL const Op& ptx_tcgen05_dealloc();

 /*! \brief tvm intrinsic to call tcgen05.relinquish_alloc_permit.cta_group.sync.aligned; */
 TVM_DLL const Op& ptx_tcgen05_relinquish_alloc_permit();

 /*! \brief tvm intrinsic to call tcgen05.fence::before_thread_sync; */
 TVM_DLL const Op& ptx_tcgen05_fence_before_thread_sync();

 /*! \brief tvm intrinsic to call tcgen05.fence::after_thread_sync; */
 TVM_DLL const Op& ptx_tcgen05_fence_after_thread_sync();

 /*! \brief tvm intrinsic to call tcgen05.ld.sync.aligned; */
 TVM_DLL const Op& ptx_tcgen05_ld();

 /*! \brief tvm intrinsic to call tcgen05.st.sync.aligned; */
 TVM_DLL const Op& ptx_tcgen05_st();

 /*! \brief tvm intrinsic to call tcgen05.wait::ld.sync.aligned; */
 TVM_DLL const Op& ptx_tcgen05_wait_ld();

 /*! \brief tvm intrinsic to call tcgen05.wait::st.sync.aligned; */
 TVM_DLL const Op& ptx_tcgen05_wait_st();

 /*! \brief tvm intrinsic to encode matrix descriptor for tcgen05 instructions. */
 TVM_DLL const Op& ptx_tcgen05_encode_matrix_descriptor();

 /*! \brief tvm intrinsic to encode instruction descriptor for tcgen05 MMA. */
 TVM_DLL const Op& ptx_tcgen05_encode_instr_descriptor();

 /*! \brief tvm intrinsic to encode instruction descriptor for tcgen05 MMA block scaled. */
 TVM_DLL const Op& ptx_tcgen05_encode_instr_descriptor_block_scaled();

 /*! \brief tvm intrinsic to call tcgen05.mma.cta_group.kind without block scaling. */
 TVM_DLL const Op& ptx_tcgen05_mma();

 /*! \brief tvm intrinsic to call tcgen05.mma.cta_group.kind.block_scale{.scale_vec_size} */
 TVM_DLL const Op& ptx_tcgen05_mma_block_scale();

 /*! \brief tvm intrinsic to call tcgen05.mma.sp.cta_group.kind without block scaling. */
 TVM_DLL const Op& ptx_tcgen05_mma_sp();

 /*! \brief tvm intrinsic to call tcgen05.mma.sp.cta_group.kind.block_scale{.scale_vec_size} */
 TVM_DLL const Op& ptx_tcgen05_mma_sp_block_scale();

 /*! \brief tvm intrinsic to call tcgen05.commit.cta_group */
 TVM_DLL const Op& ptx_tcgen05_commit();

 /*! \brief tvm intrinsic to call tcgen05.cp.cta_group */
 TVM_DLL const Op& ptx_tcgen05_cp();

 /*! \brief tvm intrinsic to call tcgen05.shift.cta_group.down */
 TVM_DLL const Op& ptx_tcgen05_shift();

 /*! \brief tvm intrinsic to call map_shared_rank */
 TVM_DLL const Op& ptx_map_shared_rank();

 /*! \brief tvm intrinsic to call a CUDA function. Source code is provided as a string. */
 TVM_DLL const Op& cuda_func_call();

 /*! \brief nvshmem intrinsic for nvshmem_my_pe() operation. */
 TVM_DLL const Op& nvshmem_my_pe();

 /*! \brief nvshmem intrinsic for nvshmem_n_pes() operation. */
 TVM_DLL const Op& nvshmem_n_pes();

 /*! \brief nvshmem intrinsic for nvshmem_getmem_nbi() operation. */
 TVM_DLL const Op& nvshmem_getmem_nbi();

 /*! \brief nvshmem intrinsic for nvshmem_putmem_nbi() operation. */
 TVM_DLL const Op& nvshmem_putmem_nbi();

 /*! \brief nvshmem intrinsic for nvshmemx_getmem_nbi_warp() operation. */
 TVM_DLL const Op& nvshmem_getmem_nbi_warp();

 /*! \brief nvshmem intrinsic for nvshmemx_putmem_nbi_warp() operation. */
 TVM_DLL const Op& nvshmem_putmem_nbi_warp();

 /*! \brief nvshmem intrinsic for nvshmemx_getmem_nbi_block() operation. */
 TVM_DLL const Op& nvshmem_getmem_nbi_block();

 /*! \brief nvshmem intrinsic for nvshmemx_putmem_nbi_block() operation. */
 TVM_DLL const Op& nvshmem_putmem_nbi_block();

 /*! \brief nvshmem intrinsic for nvshmemx_signal_op() operation. */
 TVM_DLL const Op& nvshmem_signal_op();

 /*! \brief nvshmem intrinsic for nvshmem_FuncParam{TYPENAME}_wait_until() operation. */
 TVM_DLL const Op& nvshmem_wait_until();

 /*! \brief nvshmem intrinsic for nvshmem_quiet() operation. */
 TVM_DLL const Op& nvshmem_quiet();

 /*! \brief nvshmem intrinsic for nvshmemx_putmem_signal_nbi() operation. */
 TVM_DLL const Op& nvshmem_putmem_signal_nbi();

 /*! \brief nvshmem intrinsic for nvshmemx_putmem_signal_nbi_warp() operation. */
 TVM_DLL const Op& nvshmem_putmem_signal_nbi_warp();

 /*! \brief nvshmem intrinsic for nvshmemx_putmem_signal_nbi_block() operation. */
 TVM_DLL const Op& nvshmem_putmem_signal_nbi_block();

 /*! \brief nvshmem intrinsic for nvshmem_fence() operation. */
 TVM_DLL const Op& nvshmem_fence();

 /*! \brief nvshmem intrinsic for nvshmem_barrier_all() operation. */
 TVM_DLL const Op& nvshmem_barrier_all();

 }  // namespace builtin
 }  // namespace tirx
 }  // namespace tvm


 #endif  // TVM_TIRX_TARGET_BUILTIN_CUDA_H_

tvm::Op
Managed reference class to OpNode.
Definition: op.h:131

tvm::tirx::builtin::ptx_mbarrier_arrive
const Op & ptx_mbarrier_arrive()
tvm intrinsic to call mbarrier.arrive.shared::cta.b64 or mapa.shared::cluster.u32 mbarrier....

tvm::tirx::builtin::ptx_mma_legacy
const Op & ptx_mma_legacy()
ptx mma / ldmatrix / mma_store / mma_fill variants that take (ptr_var, offset) pairs (not a folded ac...

tvm::tirx::builtin::ptx_fetch_register
const Op & ptx_fetch_register()
tvm intrinsic to fetch PTX pre-defined registers

tvm::tirx::builtin::ptx_tcgen05_cp
const Op & ptx_tcgen05_cp()
tvm intrinsic to call tcgen05.cp.cta_group

tvm::tirx::builtin::ptx_barrier_cluster_wait
const Op & ptx_barrier_cluster_wait()
tvm intrinsic to call barrier.cluster.wait.{acquire}{.aligned}

tvm::tirx::builtin::ptx_tcgen05_fence_after_thread_sync
const Op & ptx_tcgen05_fence_after_thread_sync()
tvm intrinsic to call tcgen05.fence::after_thread_sync;

tvm::tirx::builtin::mma_store_legacy
const Op & mma_store_legacy()

tvm::tirx::builtin::nvshmem_my_pe
const Op & nvshmem_my_pe()
nvshmem intrinsics for nvshmem_my_pe() operation.

tvm::tirx::builtin::ptx_ld_global_acquire
const Op & ptx_ld_global_acquire()
tvm intrinsic to call ld.global.acquire.gpu.b32

tvm::tirx::builtin::ptx_map_shared_rank
const Op & ptx_map_shared_rank()
tvm intrinsic to call map_shared_rank

tvm::tirx::builtin::ptx_tcgen05_wait_ld
const Op & ptx_tcgen05_wait_ld()
tvm intrinsic to call tcgen05.wait::ld.sync.aligned;

tvm::tirx::builtin::ptx_cp_async
const Op & ptx_cp_async()
tvm intrinsics for ptx async copy from global to shared memory using cp.async

tvm::tirx::builtin::ptx_elect_sync
const Op & ptx_elect_sync()
tvm intrinsic to call elect.sync _|p, membermask and return the predicate

tvm::tirx::builtin::ptx_tcgen05_mma_sp
const Op & ptx_tcgen05_mma_sp()
tvm intrinsic to call tcgen05.mma.sp.cta_group.kind without block scaling.

tvm::tirx::builtin::tvm_fill_fragment
const Op & tvm_fill_fragment()
tvm intrinsic for tensor core fill_fragment operators.

tvm::tirx::builtin::tvm_mma_sync
const Op & tvm_mma_sync()
tvm intrinsic for tensor core mma_sync operators.

tvm::tirx::builtin::nvshmem_putmem_nbi
const Op & nvshmem_putmem_nbi()
nvshmem intrinsics for nvshmem_putmem_nbi() operation.

tvm::tirx::builtin::ptx_stmatrix
const Op & ptx_stmatrix()
tvm intrinsic to call stmatrix.sync.aligned.m8n8.num{.trans}.shared.b16 [p], r;

tvm::tirx::builtin::ptx_tcgen05_fence_before_thread_sync
const Op & ptx_tcgen05_fence_before_thread_sync()
tvm intrinsic to call tcgen05.fence::before_thread_sync;

tvm::tirx::builtin::ptx_tcgen05_shift
const Op & ptx_tcgen05_shift()
tvm intrinsic to call tcgen05.shift.cta_group.down

tvm::tirx::builtin::ptx_cp_async_bulk_shared_to_cluster
const Op & ptx_cp_async_bulk_shared_to_cluster()
tvm intrinsics for ptx async bulk copy from shared::cta to shared::cluster

tvm::tirx::builtin::tvm_bmma_sync
const Op & tvm_bmma_sync()
tvm intrinsic for tensor core bmma_sync operators.

tvm::tirx::builtin::ptx_wgmma_encode_matrix_descriptor
const Op & ptx_wgmma_encode_matrix_descriptor()
tvm intrinsic to encode matrix descriptor for wgmma instructions.

tvm::tirx::builtin::ptx_tcgen05_mma
const Op & ptx_tcgen05_mma()
tvm intrinsic to call tcgen05.mma.cta_group.kind without block scaling.

tvm::tirx::builtin::nvshmem_wait_until
const Op & nvshmem_wait_until()
nvshmem intrinsics for nvshmem_FuncParam{TYPENAME}_wait_until() operation.

tvm::tirx::builtin::ptx_cp_async_bulk_tensor_shared_to_global
const Op & ptx_cp_async_bulk_tensor_shared_to_global()
tvm intrinsic to call cp.async.bulk.tensor.dim.global.shared::cta.tile.bulk_group

tvm::tirx::builtin::cuda_func_call
const Op & cuda_func_call()
tvm intrinsic to call a CUDA function. Source code is provided as a string.

tvm::tirx::builtin::ptx_mbarrier_try_wait
const Op & ptx_mbarrier_try_wait()
tvm intrinsic to call mbarrier.try_wait.parity repeatedly until it returns true

tvm::tirx::builtin::ptx_bar_arrive
const Op & ptx_bar_arrive()
tvm intrinsic to call bar.arrive a, b

tvm::tirx::builtin::ptx_ldmatrix
const Op & ptx_ldmatrix()
tvm intrinsic for ptx load matrix from shared memory.

tvm::tirx::builtin::ptx_tcgen05_encode_instr_descriptor_block_scaled
const Op & ptx_tcgen05_encode_instr_descriptor_block_scaled()
tvm intrinsic to encode instruction descriptor for tcgen05 MMA block scaled.

tvm::tirx::builtin::ptx_mma_sp
const Op & ptx_mma_sp()
tvm intrinsic for sparse tensor core ptx instructions.

tvm::tirx::builtin::ptx_tcgen05_commit
const Op & ptx_tcgen05_commit()
tvm intrinsic to call tcgen05.commit.cta_group

tvm::tirx::builtin::nvshmem_getmem_nbi_block
const Op & nvshmem_getmem_nbi_block()
nvshmem intrinsics for nvshmemx_getmem_nbi_block() operation.

tvm::tirx::builtin::ptx_cp_async_bulk_wait_group
const Op & ptx_cp_async_bulk_wait_group()
tvm intrinsic to call cp.async.bulk.wait_group{.read} N

tvm::tirx::builtin::ptx_tcgen05_relinquish_alloc_permit
const Op & ptx_tcgen05_relinquish_alloc_permit()
tvm intrinsic to call tcgen05.relinquish_alloc_permit.cta_group.sync.aligned;

tvm::tirx::builtin::tvm_store_matrix_sync
const Op & tvm_store_matrix_sync()
tvm intrinsic for tensor core store operators.

tvm::tirx::builtin::ptx_ldmatrix_legacy
const Op & ptx_ldmatrix_legacy()

tvm::tirx::builtin::ptx_tcgen05_encode_matrix_descriptor
const Op & ptx_tcgen05_encode_matrix_descriptor()
tvm intrinsic to encode matrix descriptor for tcgen05 instructions.

tvm::tirx::builtin::ptx_tcgen05_st
const Op & ptx_tcgen05_st()
tvm intrinsic to call tcgen05.st.sync.aligned;

tvm::tirx::builtin::ptx_wgmma_noop_barrier
const Op & ptx_wgmma_noop_barrier()
tvm intrinsic to call "" : "+r"(reg) :: "memory"

tvm::tirx::builtin::nvshmem_quiet
const Op & nvshmem_quiet()
nvshmem intrinsics for nvshmem_quiet() operation.

tvm::tirx::builtin::mma_store
const Op & mma_store()
tvm intrinsic for storing the result of PTX MMA into a destination pointer. For example,...

tvm::tirx::builtin::nvshmem_getmem_nbi
const Op & nvshmem_getmem_nbi()
nvshmem intrinsics for nvshmem_getmem_nbi() operation.

tvm::tirx::builtin::ptx_wgmma_mma_async_ss
const Op & ptx_wgmma_mma_async_ss()
tvm intrinsic to call wgmma.mma_async.sync.aligned.shape.dtype.atype.btype where both A and B are in ...

tvm::tirx::builtin::ptx_mbarrier_init
const Op & ptx_mbarrier_init()
tvm intrinsic to call mbarrier.init.shared::cta.b64

tvm::tirx::builtin::ptx_cp_async_bulk_tensor_tile_gather4_global_to_cluster
const Op & ptx_cp_async_bulk_tensor_tile_gather4_global_to_cluster()
tvm intrinsic to call cp.async.bulk.tensor.dim.shared::cluster.global.tile::gather4....

tvm::tirx::builtin::ptx_fence_proxy_async
const Op & ptx_fence_proxy_async()
PTX fence.proxy.async instruction: fence.proxy.async[.{space}].

tvm::tirx::builtin::ptx_tcgen05_ld
const Op & ptx_tcgen05_ld()
tvm intrinsic to call tcgen05.ld.sync.aligned;

tvm::tirx::builtin::ptx_cp_async_bulk_tensor_shared_to_global_reduce
const Op & ptx_cp_async_bulk_tensor_shared_to_global_reduce()
tvm intrinsic to call cp.reduce.async.bulk.tensor.dim.dst.src.redOp

tvm::tirx::builtin::nvshmem_putmem_nbi_block
const Op & nvshmem_putmem_nbi_block()
nvshmem intrinsics for nvshmemx_putmem_nbi_block() operation.

tvm::tirx::builtin::ptx_wgmma_wait_group
const Op & ptx_wgmma_wait_group()
tvm intrinsic to call wgmma.wait_group.sync.aligned;

tvm::tirx::builtin::ptx_barrier_cluster_arrive
const Op & ptx_barrier_cluster_arrive()
tvm intrinsic to call barrier.cluster.arrive{.sem}{.aligned}

tvm::tirx::builtin::ptx_wgmma_fence
const Op & ptx_wgmma_fence()
tvm intrinsic to call wgmma.fence.sync.aligned;

tvm::tirx::builtin::nvshmem_barrier_all
const Op & nvshmem_barrier_all()
nvshmem intrinsics for nvshmem_barrier_all() operation.

tvm::tirx::builtin::nvshmem_fence
const Op & nvshmem_fence()
nvshmem intrinsics for nvshmem_fence() operation.

tvm::tirx::builtin::ptx_cp_async_bulk
const Op & ptx_cp_async_bulk()
tvm intrinsics for ptx async copy from global to shared memory using cp.async.bulk

tvm::tirx::builtin::ptx_tcgen05_mma_sp_block_scale
const Op & ptx_tcgen05_mma_sp_block_scale()
tvm intrinsic to call tcgen05.mma.sp.cta_group.kind.block_scale{.scale_vec_size}

tvm::tirx::builtin::nvshmem_putmem_signal_nbi
const Op & nvshmem_putmem_signal_nbi()
nvshmem intrinsics for nvshmemx_putmem_signal_nbi() operation.

tvm::tirx::builtin::ptx_tcgen05_encode_instr_descriptor
const Op & ptx_tcgen05_encode_instr_descriptor()
tvm intrinsic to encode instruction descriptor for tcgen05 MMA.

tvm::tirx::builtin::ptx_cp_async_bulk_commit_group
const Op & ptx_cp_async_bulk_commit_group()
tvm intrinsic to call cp.async.bulk.commit_group

tvm::tirx::builtin::ptx_cp_async_wait_group
const Op & ptx_cp_async_wait_group()

tvm::tirx::builtin::nvshmem_putmem_signal_nbi_warp
const Op & nvshmem_putmem_signal_nbi_warp()
nvshmem intrinsics for nvshmemx_putmem_signal_nbi_warp() operation.

tvm::tirx::builtin::ptx_fence_mbarrier_init
const Op & ptx_fence_mbarrier_init()
PTX fence.mbarrier_init.release.cluster instruction.

tvm::tirx::builtin::nvshmem_putmem_nbi_warp
const Op & nvshmem_putmem_nbi_warp()
nvshmem intrinsics for nvshmemx_putmem_nbi_warp() operation.

tvm::tirx::builtin::ptx_ldg32
const Op & ptx_ldg32()
tvm intrinsic for ptx predicate load with 32-bit data type.

tvm::tirx::builtin::mma_fill_legacy
const Op & mma_fill_legacy()

tvm::tirx::builtin::ptx_tcgen05_mma_block_scale
const Op & ptx_tcgen05_mma_block_scale()
tvm intrinsic to call tcgen05.mma.cta_group.kind.block_scale{.scale_vec_size}

tvm::tirx::builtin::ptx_setmaxnreg
const Op & ptx_setmaxnreg()
tvm intrinsic to call setmaxnreg.action.sync.aligned.u32 imm-reg-count

tvm::tirx::builtin::ptx_mma
const Op & ptx_mma()
tvm intrinsic for ptx tensor core mma instructions.

tvm::tirx::builtin::ptx_cp_async_mbarrier_arrive
const Op & ptx_cp_async_mbarrier_arrive()
tvm intrinsics for ptx async copy barrier using cp.async.mbarrier.arrive

tvm::tirx::builtin::nvshmem_putmem_signal_nbi_block
const Op & nvshmem_putmem_signal_nbi_block()
nvshmem intrinsics for nvshmemx_putmem_signal_nbi_block() operation.

tvm::tirx::builtin::ptx_cp_async_bulk_tensor_global_to_cluster
const Op & ptx_cp_async_bulk_tensor_global_to_cluster()
tvm intrinsic to call cp.async.bulk.tensor.dim.shared::cluster.global.tile.mbarrier::complete_tx::b...

tvm::tirx::builtin::nvshmem_signal_op
const Op & nvshmem_signal_op()
nvshmem intrinsics for nvshmemx_signal_op() operation.

tvm::tirx::builtin::nvshmem_getmem_nbi_warp
const Op & nvshmem_getmem_nbi_warp()
nvshmem intrinsics for nvshmemx_getmem_nbi_warp() operation.

tvm::tirx::builtin::ptx_cp_async_bulk_tensor_global_to_cluster_prefetch
const Op & ptx_cp_async_bulk_tensor_global_to_cluster_prefetch()
tvm intrinsic to call cp.async.bulk.prefetch.tensor.dim.L2.global.tile

tvm::tirx::builtin::ptx_tcgen05_wait_st
const Op & ptx_tcgen05_wait_st()
tvm intrinsic to call tcgen05.wait::st.sync.aligned;

tvm::tirx::builtin::ptx_tcgen05_alloc
const Op & ptx_tcgen05_alloc()
tvm intrinsic to call tcgen05.alloc.cta_group.sync.aligned;

tvm::tirx::builtin::mma_fill
const Op & mma_fill()
tvm intrinsic for zero-initializing an MMA accumulation register. For example, if each thread in a wa...

tvm::tirx::builtin::tvm_load_matrix_sync
const Op & tvm_load_matrix_sync()
tvm intrinsic for tensor core load operators.

tvm::tirx::builtin::ptx_wgmma_mma_async_rs
const Op & ptx_wgmma_mma_async_rs()
tvm intrinsic to call wgmma.mma_async.sync.aligned.shape.dtype.atype.btype where A is in register and...

tvm::tirx::builtin::ptx_bar_sync
const Op & ptx_bar_sync()
tvm intrinsic to call bar.sync a, {b}

tvm::tirx::builtin::ptx_mbarrier_arrive_expect_tx
const Op & ptx_mbarrier_arrive_expect_tx()
tvm intrinsic to call mbarrier.arrive.expect_tx.shared.b64 or mapa.shared::cluster....

tvm::tirx::builtin::ptx_cp_async_commit_group
const Op & ptx_cp_async_commit_group()
tvm intrinsics for ptx async copy commit and wait.

tvm::tirx::builtin::ptx_wgmma_commit_group
const Op & ptx_wgmma_commit_group()
tvm intrinsic to call wgmma.commit_group.sync.aligned;

tvm::tirx::builtin::nvshmem_n_pes
const Op & nvshmem_n_pes()
nvshmem intrinsics for nvshmem_n_pes() operation.

tvm::tirx::builtin::ptx_fence
const Op & ptx_fence()
PTX fence instruction: fence.{sem}.{scope}.

tvm::tirx::builtin::ptx_tcgen05_dealloc
const Op & ptx_tcgen05_dealloc()
tvm intrinsic to call tcgen05.dealloc.cta_group.sync.aligned;

tvm
An object that builds and maintains block scope and StmtSref mapping for Dependence analysis.
Definition: analyzer.h:37

expr.h
TIR expressions.

op.h
Common operators defined for Expr.