24 #ifndef TVM_TOPI_CUDA_DENSE_H_ 25 #define TVM_TOPI_CUDA_DENSE_H_ 56 ICHECK_EQ(data->shape.size(), 2) <<
"dense requires 2-D data";
57 ICHECK_EQ(weight->shape.size(), 2) <<
"dense requires 2-D weight";
59 ICHECK_EQ(bias->shape.size(), 1) <<
"dense requires 1-D bias";
62 auto batch = data->shape[0];
63 auto in_dim = data->shape[1];
64 auto out_dim = weight->shape[0];
66 if (target->GetLibs().count(
"cublas")) {
67 ICHECK_EQ(data->dtype, out_dtype) <<
"Mixed precision not supported.";
71 {batch, out_dim}, [&](
Var i,
Var j) {
return mm(i, j) + bias(j); },
"tensor",
kBroadcast);
89 if (target->kind->name ==
"cuda" && target->GetLibs().count(
"cublas")) {
100 auto num_thread = 64;
103 s[
dense].split(k, num_thread, &ko, &kf);
104 auto dense_f = s.rfactor(
dense, kf)[0];
107 if (detail::contains(s->outputs,
dense->op)) {
110 out = outs[0]->op.output(0);
120 s[
dense].bind(tx, thread_x);
121 s[dense_f].compute_at(s[
dense], tx);
122 s[
dense].set_store_predicate(static_cast<PrimExpr>(thread_x) == 0);
123 s[out].set_store_predicate(static_cast<PrimExpr>(thread_x) == 0);
126 std::function<void(Operation)> traverse;
130 if (!detail::contains(s->outputs, op)) {
131 s[op].compute_inline();
133 for (
auto tensor : op->InputTensors()) {
134 if (tensor->op->InputTensors().size() > 0) {
135 traverse(tensor->op);
138 }
else if (op->tag ==
"dense") {
140 auto dense = op.output(0);
143 LOG(ERROR) <<
"Unsupported operator " << op->tag;
147 traverse(outs[0]->op);
154 #endif // TVM_TOPI_CUDA_DENSE_H_ IterVar thread_axis(Range dom, std::string tag)
Create a new IterVar that represents an axis in thread.
Schedule for extern followed by injective ops.
Global schedule container For operations and all the operations they depend on. The schedule per Oper...
Definition: schedule.h:318
Schedule schedule_dense(const Target &target, const Array< Tensor > &outs)
Create a rocm schedule for dense.
Definition: dense.h:88
Schedule create_schedule(Array< Operation > ops)
Create a schedule for array of ops(and their dependencies).
Definition: schedule.h:695
Utility functions for handling arrays.
runtime implementation for LibTorch/TorchScript.
Definition: analyzer.h:36
Tensor expression language DSL.
Definition: extracted_task.h:33
Operation that produces tensors.
Definition: tensor.h:47
a named variable in TIR
Definition: var.h:88
Iteration Variable, represents an iteration over an integer interval.
Definition: var.h:308
void push_back(const T &item)
push a new item to the back of the list
Definition: array.h:457
Collection of Schedule pass functions.
Tensor cublas_matmul(const Tensor &lhs, const Tensor &rhs, bool transa, bool transb)
Create an op that multiplies lhs and rhs with cuBLAS.
Definition: cublas.h:46
constexpr auto kBroadcast
Definition: tags.h:36
Range container.
Definition: expr.h:715
tvm::te::Tensor dense(const tvm::te::Tensor &data, const tvm::te::Tensor &weight, const tvm::te::Tensor &bias, const DataType &out_dtype)
Creates an operation that calculates data * weight^T + bias.
Definition: dense.h:48
bool defined() const
Definition: object.h:544
Runtime primitive data type.
Definition: data_type.h:41
Array, container representing a contiguous sequence of ObjectRefs.
Definition: array.h:289
External function interface to cuBLAS libraries.
A Compute op that compute a tensor on certain domain.
Definition: operation.h:226
Array< IterVar > axis
IterVar on each axis.
Definition: operation.h:207
IterVar reduce_axis(Range dom, std::string name="rv")
Create a new IterVar for reduction operations.
Managed reference class to TargetNode.
Definition: target.h:183
Tensor structure representing a possible input, or intermediate computation result.
Definition: tensor.h:102
Operation node can generate one or multiple Tensors.
bool is_broadcast(std::string tag)
Definition: tags.h:47
Tensor compute(Array< PrimExpr > shape, FCompute fcompute, std::string name="tensor", std::string tag="", Map< String, ObjectRef > attrs={})
Construct a new tensor by computing over shape, using the computation rule: result_tensor[axis] = fcompute(axis)
const ObjectType * as() const
Try to downcast the internal Object to a raw pointer of a corresponding type.
Definition: object.h:865
Schedule schedule_extern(const Target &target, const Array< Tensor > &outs)
Schedule an extern op followed by injective operations.
Definition: extern.h:48
Generic function that can be specialized on a per target basis.
tvm::te::Tensor dense_cuda(const Target &target, const tvm::te::Tensor &data, const tvm::te::Tensor &weight, const tvm::te::Tensor &bias, const DataType &out_dtype)
Implementation of dense for CUDA backend.
Definition: dense.h:53