/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file cuda/dense.h
 * \brief CUDA schedule for dense operation
 */
#ifndef TVM_TOPI_CUDA_DENSE_H_
#define TVM_TOPI_CUDA_DENSE_H_

#include <tvm/target/target.h>
#include <tvm/te/operation.h>
#include <tvm/te/schedule_pass.h>
#include <tvm/topi/contrib/cublas.h>
#include <tvm/topi/detail/array_utils.h>
#include <tvm/topi/generic/extern.h>
#include <tvm/topi/nn/dense.h>
#include <tvm/topi/tags.h>

#include <functional>  // for std::function used by the schedule traversal below

namespace tvm {
namespace topi {

using namespace tvm::te;

namespace cuda {
/*!
 * \brief Implementation of dense for CUDA backend
 *
 * \param target The target device
 * \param data Tensor with shape [batch, in_dim]
 * \param weight Tensor with shape [out_dim, in_dim]
 * \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor()
 * \param out_dtype Output data type. Used for mixed precision.
 *
 * \return Tensor with shape [batch, out_dim]
 */
inline tvm::te::Tensor dense_cuda(const Target& target, const tvm::te::Tensor& data,
                                  const tvm::te::Tensor& weight, const tvm::te::Tensor& bias,
                                  const DataType& out_dtype) {
  ICHECK_EQ(data->shape.size(), 2) << "dense requires 2-D data";
  ICHECK_EQ(weight->shape.size(), 2) << "dense requires 2-D weight";
  if (bias.defined()) {
    ICHECK_EQ(bias->shape.size(), 1) << "dense requires 1-D bias";
  }

  auto batch = data->shape[0];
  auto in_dim = data->shape[1];
  auto out_dim = weight->shape[0];

  if (target->GetLibs().count("cublas")) {
    ICHECK_EQ(data->dtype, out_dtype) << "Mixed precision not supported.";
    auto mm = topi::contrib::cublas_matmul(data, weight, false, true);
    if (bias.defined()) {
      mm = tvm::te::compute(
          {batch, out_dim}, [&](Var i, Var j) { return mm(i, j) + bias(j); }, "tensor", kBroadcast);
    }

    return mm;
  } else {
    return topi::nn::dense(data, weight, bias, out_dtype);
  }
}
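
/*
 * Illustrative usage sketch (not part of the original header): calling
 * dense_cuda on placeholder tensors. The shapes, tensor names, and the
 * target string are assumptions chosen for the example.
 *
 * \code
 * tvm::Target target("cuda -libs=cublas");  // cuBLAS path taken when "cublas" is in -libs
 * auto data = tvm::te::placeholder({16, 128}, tvm::DataType::Float(32), "data");
 * auto weight = tvm::te::placeholder({64, 128}, tvm::DataType::Float(32), "weight");
 * auto bias = tvm::te::placeholder({64}, tvm::DataType::Float(32), "bias");
 * // Result shape: [16, 64], computing data * weight^T + bias
 * auto out = dense_cuda(target, data, weight, bias, tvm::DataType::Float(32));
 * \endcode
 */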

/*!
 * \brief Create a CUDA schedule for dense
 *
 * \param target The target to generate a schedule for.
 * \param outs The output tensors.
 *
 * \return A schedule for the given ops.
 */
inline Schedule schedule_dense(const Target& target, const Array<Tensor>& outs) {
  if (target->kind->name == "cuda" && target->GetLibs().count("cublas")) {
    return topi::generic::schedule_extern(target, outs);
  }

  Array<Operation> out_ops;
  for (auto t : outs) {
    out_ops.push_back(t->op);
  }
  auto s = create_schedule(out_ops);

  auto _schedule = [&](const Tensor& dense) {
    auto num_thread = 64;
    // Split the reduction axis and rfactor so each thread computes a partial
    // result; the partials are then combined across threadIdx.x.
    auto k = dense->op.as<ComputeOpNode>()->reduce_axis[0];
    IterVar ko, kf;
    s[dense].split(k, num_thread, &ko, &kf);
    auto dense_f = s.rfactor(dense, kf)[0];

    Tensor out;
    if (detail::contains(s->outputs, dense->op)) {
      out = dense;
    } else {
      out = outs[0]->op.output(0);
      s[dense].compute_at(s[out], s[out]->op.as<ComputeOpNode>()->axis[1]);
    }
    // One CUDA block per output element: batch on blockIdx.y, out_dim on blockIdx.x.
    s[out].bind(s[out]->op.as<ComputeOpNode>()->axis[0],
                tvm::te::thread_axis(Range(), "blockIdx.y"));
    s[out].bind(s[out]->op.as<ComputeOpNode>()->axis[1],
                tvm::te::thread_axis(Range(), "blockIdx.x"));

    auto tx = s[dense]->op.as<ComputeOpNode>()->reduce_axis[0];
    auto thread_x = tvm::te::thread_axis(Range(), "threadIdx.x");
    s[dense].bind(tx, thread_x);
    s[dense_f].compute_at(s[dense], tx);
    // Only thread 0 holds the final reduction result, so predicate the stores.
    s[dense].set_store_predicate(static_cast<PrimExpr>(thread_x) == 0);
    s[out].set_store_predicate(static_cast<PrimExpr>(thread_x) == 0);
  };

  std::function<void(Operation)> traverse;
  traverse = [&](const Operation& op) {
    // Inline all one-to-one-mapping operators except the last stage (output).
    if (is_broadcast(op->tag)) {
      if (!detail::contains(s->outputs, op)) {
        s[op].compute_inline();
      }
      for (auto tensor : op->InputTensors()) {
        if (tensor->op->InputTensors().size() > 0) {
          traverse(tensor->op);
        }
      }
    } else if (op->tag == "dense") {
      // Reached the dense op: apply the split/rfactor schedule defined above.
      auto dense = op.output(0);
      _schedule(dense);
    } else {
      LOG(ERROR) << "Unsupported operator " << op->tag;
    }
  };

  traverse(outs[0]->op);
  return s;
}
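
/*
 * Illustrative usage sketch (not part of the original header). It assumes the
 * Target `target` and output tensor `out` from the dense_cuda example above.
 *
 * \code
 * auto sched = schedule_dense(target, {out});
 * // `sched` can then be passed to the usual lowering/build pipeline.
 * \endcode
 */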

}  // namespace cuda
}  // namespace topi
}  // namespace tvm
#endif  // TVM_TOPI_CUDA_DENSE_H_