24 #ifndef TVM_TOPI_CUDA_REDUCTION_H_    25 #define TVM_TOPI_CUDA_REDUCTION_H_    51                         bool is_idx_reduce = 
false) {
    62   auto out_stage = sch[data_out];
    64       << 
"reduce_axis must be greater than zero";
    68   IterVar block_x, thread_x, thread_y;
    73     if (target->kind->name == 
"opencl" || target->kind->name == 
"metal") {
    83     num_thread = target->GetAttr<
Integer>(
"max_num_threads").value().
IntValue();
    90   out_stage.split(fused_reduce, num_thread, &ko, &ki);
    91   auto data_out_rf = sch.
rfactor(data_out, ki)[0];
    93   out_stage.bind(tx, thread_x);
    94   sch[data_out_rf].compute_at(out_stage, tx);
    97   Tensor temp_idx_input, temp_val_input;
    99     real_output = op.
output(0);
   100     temp_idx_input = data_out->op.output(0);
   101     temp_val_input = data_out->op.output(1);
   103     real_output = data_out;
   106   auto stage_real = sch[real_output];
   111     stage_real.split(fused_outer, num_thread, &bx, &outer_in);
   114     stage_real.bind(outer_in, thread_y);
   115     stage_real.bind(bx, block_x);
   117       sch[temp_idx_input].compute_at(stage_real, outer_in);
   118       sch[temp_val_input].compute_at(stage_real, outer_in);
   122       sch[temp_idx_input].compute_at(stage_real, stage_real->op.as<
ComputeOpNode>()->
axis[0]);
   123       sch[temp_val_input].compute_at(stage_real, stage_real->op.as<
ComputeOpNode>()->
axis[0]);
   127   stage_real.set_store_predicate(static_cast<PrimExpr>(thread_x) == 0);
   142     s[op].compute_inline();
   147     LOG(ERROR) << 
"Unsupported operator " << op->
tag;
   161     LOG(ERROR) << 
"Elementwise op after reduce is not yet supported";
   169     for (
auto tensor : op->
InputTensors()[0]->op->InputTensors()) {
   173     LOG(ERROR) << 
"Unsupported operator " << op->
tag;
   186   ICHECK_EQ(outs.
size(), 1) << 
"outs must have size 1";
   188   for (
auto t : outs) {
   199 #endif  // TVM_TOPI_CUDA_REDUCTION_H_ IterVar thread_axis(Range dom, std::string tag)
Create a new IterVar that represents an axis in thread. 
void TraverseBeforeReduce(Schedule s, Operation op)
Recursively traverse operator inputs, setting injective inputs to be computed inline. 
Definition: reduction.h:138
Tensor output(size_t i) const
get the i-th output of the operation. 
Global schedule container for operations and all the operations they depend on. The schedule per Oper...
Definition: schedule.h:317
constexpr auto kCommReduceIdx
Definition: tags.h:35
Schedule ScheduleReduce(const Target &target, Operation op, Schedule sch, bool is_idx_reduce=false)
Schedule a given reduce operation. 
Definition: reduction.h:50
Schedule create_schedule(Array< Operation > ops)
Create a schedule for array of ops(and their dependencies). 
Definition: schedule.h:654
std::string tag
optional tag of the operation 
Definition: operation.h:61
runtime implementation for LibTorch/TorchScript. 
Definition: analyzer.h:36
Tensor expression language DSL. 
Definition: extracted_task.h:33
Operation that produces tensors. 
Definition: tensor.h:47
bool is_injective(std::string tag)
Definition: tags.h:51
Iteration Variable, represents an iteration over an integer interval. 
Definition: var.h:301
A placeholder op represents an input placeholder. 
Definition: operation.h:152
void push_back(const T &item)
push a new item to the back of the list 
Definition: array.h:457
bool IsInstance() const
Definition: object.h:829
Collection of Schedule pass functions. 
Range container. 
Definition: expr.h:713
virtual Array< Tensor > InputTensors() const =0
List all the input Tensors. 
size_t size() const
Definition: array.h:420
int64_t IntValue() const
convert to int64_t 
Definition: expr.h:657
Array, container representing a contiguous sequence of ObjectRefs. 
Definition: array.h:289
A Compute op that compute a tensor on certain domain. 
Definition: operation.h:226
Array< IterVar > axis
IterVar on each axis. 
Definition: operation.h:207
IterVar reduce_axis(Range dom, std::string name="rv")
Create a new IterVar for reduction operations. 
constexpr auto kCommReduce
Definition: tags.h:34
Managed reference class to TargetNode. 
Definition: target.h:183
Tensor structure representing a possible input, or intermediate computation result. 
Definition: tensor.h:102
Operation node can generate one or multiple Tensors. 
Managed reference to FuseNode. 
Definition: schedule.h:774
Array< Tensor > rfactor(const Tensor &tensor, const IterVar &axis, int factor_axis=0)
Factor a reduction axis in tensor's schedule to be an explicit axis. This will create a new stage tha...
bool is_broadcast(std::string tag)
Definition: tags.h:47
Schedule schedule_reduce(const Target &target, Array< Tensor > outs)
Create a CUDA schedule for a reduce operation. 
Definition: reduction.h:47
void TraverseAfterReduce(const Target &target, Schedule s, Operation op)
Schedule a reduce op, then invoke TraverseBeforeReduce on each of the op's inputs. 
Definition: reduction.h:159
Array< IterVar > reduce_axis
IterVar on each reduction axis, if the body is a Reduce. 
Definition: operation.h:209
Generic function that can be specialized on a per target basis. 
Container of constant int that adds more constructors. 
Definition: expr.h:620