tvm
reduction.h
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
24 #ifndef TVM_TOPI_CUDA_REDUCTION_H_
25 #define TVM_TOPI_CUDA_REDUCTION_H_
26 
28 #include <tvm/te/operation.h>
29 #include <tvm/te/schedule_pass.h>
30 #include <tvm/topi/detail/fuse.h>
31 #include <tvm/topi/tags.h>
32 
33 namespace tvm {
34 namespace topi {
35 
36 using namespace tvm::te;
37 
38 namespace cuda {
// NOTE(review): the preceding signature lines are missing from this extracted
// view; per the declaration elsewhere in the docs, the full signature is:
//   Schedule ScheduleReduce(const Target& target, Operation op, Schedule sch,
//                           bool is_idx_reduce = false)
// Schedules a single (idx-)reduction op for GPU-style targets and returns the
// updated schedule. -- TODO confirm against the unextracted header.
                       bool is_idx_reduce = false) {
  Tensor data_out;
  Tensor data_in;

  if (!is_idx_reduce) {
    // Plain commutative reduce: `op` itself produces the reduced tensor.
    data_in = op->InputTensors()[0];
    data_out = op.output(0);
  } else {
    // Index reduce (e.g. argmax): the temporary reduce tensor is `op`'s input.
    data_out = op->InputTensors()[0];
  }

  auto out_stage = sch[data_out];
  ICHECK_GT(out_stage->op.as<ComputeOpNode>()->reduce_axis.size(), 0)
      << "reduce_axis must be greater than zero";

  bool all_reduce;
  int num_thread;
  IterVar block_x, thread_x, thread_y;

  if (out_stage->op.as<ComputeOpNode>()->axis.size() > 0) {
    // Some spatial axes survive the reduction: use a 2-D thread layout
    // (threadIdx.x over the reduction, threadIdx.y over the spatial part).
    all_reduce = false;
    num_thread = 32;
    if (target->kind->name == "opencl" || target->kind->name == "metal") {
      // Without this, CL_INVALID_WORK_GROUP_SIZE occurs with python tests.
      // Don't know why.
      num_thread = 16;
    }
    block_x = tvm::te::thread_axis(Range(), "blockIdx.x");
    thread_x = tvm::te::thread_axis(Range(0, num_thread), "threadIdx.x");
    thread_y = tvm::te::thread_axis(Range(0, num_thread), "threadIdx.y");
  } else {
    // Everything is reduced to a scalar: use as many threads as the target
    // allows along threadIdx.x only.
    all_reduce = true;
    num_thread = target->GetAttr<Integer>("max_num_threads").value().IntValue();
    thread_x = tvm::te::thread_axis(Range(0, num_thread), "threadIdx.x");
  }

  // Fuse all reduction axes into one, split off `num_thread` inner
  // iterations, and rfactor on the inner part so each thread accumulates an
  // independent partial result that is combined afterwards.
  auto fused_reduce = detail::Fuse(out_stage, out_stage->op.as<ComputeOpNode>()->reduce_axis);

  IterVar ko, ki;
  out_stage.split(fused_reduce, num_thread, &ko, &ki);
  auto data_out_rf = sch.rfactor(data_out, ki)[0];
  // After rfactor the (single) remaining reduce axis is bound to threadIdx.x.
  auto tx = out_stage->op.as<ComputeOpNode>()->reduce_axis[0];
  out_stage.bind(tx, thread_x);
  sch[data_out_rf].compute_at(out_stage, tx);

  Tensor real_output;
  Tensor temp_idx_input, temp_val_input;
  if (is_idx_reduce) {
    // For idx-reduce the final result is `op`'s own output, while the
    // upstream op produces the temporary (index, value) pair.
    real_output = op.output(0);
    temp_idx_input = data_out->op.output(0);
    temp_val_input = data_out->op.output(1);
  } else {
    real_output = data_out;
  }

  auto stage_real = sch[real_output];
  if (!all_reduce) {
    // Fuse and split the axis
    auto fused_outer = detail::Fuse(stage_real, stage_real->op.as<ComputeOpNode>()->axis);
    IterVar bx, outer_in;
    stage_real.split(fused_outer, num_thread, &bx, &outer_in);

    // Bind the axes to threads and blocks
    stage_real.bind(outer_in, thread_y);
    stage_real.bind(bx, block_x);
    if (is_idx_reduce) {
      sch[temp_idx_input].compute_at(stage_real, outer_in);
      sch[temp_val_input].compute_at(stage_real, outer_in);
    }
  } else {
    if (is_idx_reduce) {
      sch[temp_idx_input].compute_at(stage_real, stage_real->op.as<ComputeOpNode>()->axis[0]);
      sch[temp_val_input].compute_at(stage_real, stage_real->op.as<ComputeOpNode>()->axis[0]);
    }
  }

  // Only the thread holding the final accumulated value stores the output.
  stage_real.set_store_predicate(static_cast<PrimExpr>(thread_x) == 0);
  return sch;
}
130 
  // NOTE(review): the signature line is missing from this extracted view; per
  // the declaration elsewhere in the docs it is
  //   void TraverseBeforeReduce(Schedule s, Operation op)
  // Recursively walks toward the inputs, inlining injective ops. -- TODO
  // confirm against the unextracted header.
  if (op->IsInstance<PlaceholderOpNode>()) {
    // Placeholders are graph inputs; nothing to schedule.
    return;
  } else if (is_injective(op->tag)) {
    // Injective (elementwise-like) producers are computed inline into their
    // consumers, then their own inputs are traversed the same way.
    s[op].compute_inline();
    for (auto tensor : op->InputTensors()) {
      TraverseBeforeReduce(s, tensor->op);
    }
  } else {
    // Anything else upstream of the reduction is not handled by this schedule.
    LOG(ERROR) << "Unsupported operator " << op->tag;
  }
}
150 
159 void TraverseAfterReduce(const Target& target, Schedule s, Operation op) {
160  if (is_broadcast(op->tag)) {
161  LOG(ERROR) << "Elementwise op after reduce is not yet supported";
162  } else if (op->tag == kCommReduce) {
163  ScheduleReduce(target, op, s, false);
164  for (auto tensor : op->InputTensors()) {
165  TraverseBeforeReduce(s, tensor->op);
166  }
167  } else if (op->tag == kCommReduceIdx) {
168  ScheduleReduce(target, op, s, true);
169  for (auto tensor : op->InputTensors()[0]->op->InputTensors()) {
170  TraverseBeforeReduce(s, tensor->op);
171  }
172  } else {
173  LOG(ERROR) << "Unsupported operator " << op->tag;
174  }
175 }
176 
  // NOTE(review): the signature line is missing from this extracted view; per
  // the declaration elsewhere in the docs it is
  //   Schedule schedule_reduce(const Target& target, Array<Tensor> outs)
  // Entry point: builds a schedule for a single reduce output. -- TODO
  // confirm against the unextracted header.
  ICHECK_EQ(outs.size(), 1) << "outs must have size 1";
  // Gather the output operations and create a fresh schedule over them.
  Array<Operation> out_ops;
  for (auto t : outs) {
    out_ops.push_back(t->op);
  }
  auto s = create_schedule(out_ops);
  // Schedule the reduction itself, then inline everything upstream of it.
  TraverseAfterReduce(target, s, outs[0]->op);
  return s;
}
195 
196 } // namespace cuda
197 } // namespace topi
198 } // namespace tvm
199 #endif // TVM_TOPI_CUDA_REDUCTION_H_
Container of constant int that adds more constructors.
Definition: expr.h:632
int64_t IntValue() const
convert to int64_t
Definition: expr.h:669
Reference to PrimExprNode.
Definition: expr.h:115
Range container
Definition: expr.h:725
Managed reference class to TargetNode.
Definition: target.h:200
Array, container representing a contiguous sequence of ObjectRefs.
Definition: array.h:289
void push_back(const T &item)
push a new item to the back of the list
Definition: array.h:457
size_t size() const
Definition: array.h:420
bool IsInstance() const
Definition: object.h:874
Array< IterVar > axis
IterVar on each axis.
Definition: operation.h:207
Array< IterVar > reduce_axis
IterVar on each reduction axis, if the body is a Reduce.
Definition: operation.h:209
A Compute op that compute a tensor on certain domain.
Definition: operation.h:226
Managed reference to FuseNode.
Definition: schedule.h:826
virtual Array< Tensor > InputTensors() const =0
List all the input Tensors.
std::string tag
optional tag of the operation
Definition: operation.h:61
Operation that produces tensors.
Definition: tensor.h:47
Tensor output(size_t i) const
get the i-th output of the operation.
A placeholder op represents an input placeholder.
Definition: operation.h:152
Global schedule container For operations and all the operations they depend on. The schedule per Oper...
Definition: schedule.h:326
Array< Tensor > rfactor(const Tensor &tensor, const IterVar &axis, int factor_axis=0)
Factor a reduction axis in tensor's schedule to be an explicit axis. This will create a new stage tha...
Tensor structure representing a possible input, or intermediate computation result.
Definition: tensor.h:102
Iteration Variable, represents an iteration over an integer interval.
Definition: var.h:315
Fuse operation.
Generic function that can be specialized on a per-target basis.
Tensor expression language DSL.
Definition: extracted_task.h:33
Schedule create_schedule(Array< Operation > ops)
Create a schedule for array of ops(and their dependencies).
Definition: schedule.h:702
IterVar thread_axis(Range dom, std::string tag)
Create a new IterVar that represents an axis in thread.
IterVar reduce_axis(Range dom, std::string name="rv")
Create a new IterVar for reduction operations.
Schedule ScheduleReduce(const Target &target, Operation op, Schedule sch, bool is_idx_reduce=false)
Schedule a given reduce operation.
Definition: reduction.h:50
void TraverseAfterReduce(const Target &target, Schedule s, Operation op)
Schedule a reduce op, then invoke TraverseBeforeReduce on each of the op's inputs.
Definition: reduction.h:159
void TraverseBeforeReduce(Schedule s, Operation op)
Recursively traverse operator inputs, setting injective inputs to be computed inline.
Definition: reduction.h:138
Schedule schedule_reduce(const Target &target, Array< Tensor > outs)
Create a CUDA schedule for a reduce operation.
Definition: reduction.h:47
bool is_injective(std::string tag)
Definition: tags.h:51
constexpr auto kCommReduce
Definition: tags.h:34
constexpr auto kCommReduceIdx
Definition: tags.h:35
bool is_broadcast(std::string tag)
Definition: tags.h:47
runtime implementation for LibTorch/TorchScript.
Definition: analyzer.h:36
Operation node can generate one or multiple Tensors.
Collection of Schedule pass functions.
External function interface to rocBLAS libraries.