.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "how_to/work_with_schedules/tuple_inputs.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        Click :ref:`here <sphx_glr_download_how_to_work_with_schedules_tuple_inputs.py>`
        to download the full example code

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_how_to_work_with_schedules_tuple_inputs.py:


Compute and Reduce with Tuple Inputs
====================================
**Author**: `Ziheng Jiang `_

Often we want to compute multiple outputs with the same shape within a single
loop, or perform a reduction that involves multiple values, such as
:code:`argmax`. These problems can be addressed with tuple inputs.

In this tutorial, we will introduce the usage of tuple inputs in TVM.

.. GENERATED FROM PYTHON SOURCE LINES 28-35

.. code-block:: default


    from __future__ import absolute_import, print_function

    import tvm
    from tvm import te
    import numpy as np


.. GENERATED FROM PYTHON SOURCE LINES 41-47

Describe Batchwise Computation
------------------------------
For operators whose outputs have the same shape, we can put them together as
the inputs of :any:`te.compute` if we want them to be scheduled together in
the subsequent schedule procedure.

.. GENERATED FROM PYTHON SOURCE LINES 47-57

.. code-block:: default


    n = te.var("n")
    m = te.var("m")
    A0 = te.placeholder((m, n), name="A0")
    A1 = te.placeholder((m, n), name="A1")
    B0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A1[i, j] * 3), name="B")

    # The generated IR code would be:
    s = te.create_schedule(B0.op)
    print(tvm.lower(s, [A0, A1, B0, B1], simple_mode=True))


.. rst-class:: sphx-glr-script-out

.. code-block:: none

    @main = primfn(A0_1: handle, A1_1: handle, B_2: handle, B_3: handle) -> ()
      attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
      buffers = {A0: Buffer(A0_2: Pointer(float32), float32, [(stride: int32*m: int32)], [], type="auto"),
                 A1: Buffer(A1_2: Pointer(float32), float32, [(stride_1: int32*m)], [], type="auto"),
                 B: Buffer(B_4: Pointer(float32), float32, [(stride_2: int32*m)], [], type="auto"),
                 B_1: Buffer(B_5: Pointer(float32), float32, [(stride_3: int32*m)], [], type="auto")}
      buffer_map = {A0_1: A0, A1_1: A1, B_2: B, B_3: B_1}
      preflattened_buffer_map = {A0_1: A0_3: Buffer(A0_2, float32, [m, n: int32], [stride, stride_4: int32], type="auto"), A1_1: A1_3: Buffer(A1_2, float32, [m, n], [stride_1, stride_5: int32], type="auto"), B_2: B_6: Buffer(B_4, float32, [m, n], [stride_2, stride_6: int32], type="auto"), B_3: B_7: Buffer(B_5, float32, [m, n], [stride_3, stride_7: int32], type="auto")} {
      for (i: int32, 0, m) {
        for (j: int32, 0, n) {
          B[((i*stride_2) + (j*stride_6))] = (A0[((i*stride) + (j*stride_4))] + 2f32)
          B_1[((i*stride_3) + (j*stride_7))] = (A1[((i*stride_1) + (j*stride_5))]*3f32)
        }
      }
    }
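Though not part of the original example, a minimal sketch like the following can
compile the schedule above and check both outputs against NumPy. It assumes a
TVM build with the LLVM backend enabled; the concrete shape and target are
illustrative choices only:

.. code-block:: python

    # A minimal sketch, assuming an LLVM-enabled TVM build; the (4, 5) shape
    # and the "llvm" target are illustrative, not part of the original example.
    func = tvm.build(s, [A0, A1, B0, B1], target="llvm")
    a0_np = np.random.uniform(size=(4, 5)).astype("float32")
    a1_np = np.random.uniform(size=(4, 5)).astype("float32")
    b0_nd = tvm.nd.array(np.zeros((4, 5), dtype="float32"))
    b1_nd = tvm.nd.array(np.zeros((4, 5), dtype="float32"))
    func(tvm.nd.array(a0_np), tvm.nd.array(a1_np), b0_nd, b1_nd)
    # Both outputs are produced by the single fused loop nest shown above.
    np.testing.assert_allclose(b0_nd.numpy(), a0_np + 2)
    np.testing.assert_allclose(b1_nd.numpy(), a1_np * 3)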
.. GENERATED FROM PYTHON SOURCE LINES 58-67

.. _reduction-with-tuple-inputs:

Describe Reduction with Collaborative Inputs
--------------------------------------------
Sometimes we require multiple inputs to express a reduction operator,
and the inputs collaborate with each other, e.g. :code:`argmax`.
In the reduction procedure, :code:`argmax` needs to compare the values of the
operands and also keep the index of the operand. This can be expressed with
:py:func:`te.comm_reducer` as below:

.. GENERATED FROM PYTHON SOURCE LINES 67-96

.. code-block:: default


    # x and y are the operands of the reduction; each of them is a tuple of
    # index and value.
    def fcombine(x, y):
        lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])
        rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
        return lhs, rhs


    # our identity element also needs to be a tuple, so `fidentity` accepts
    # two types as inputs.
    def fidentity(t0, t1):
        return tvm.tir.const(-1, t0), tvm.te.min_value(t1)


    argmax = te.comm_reducer(fcombine, fidentity, name="argmax")

    # describe the reduction computation
    m = te.var("m")
    n = te.var("n")
    idx = te.placeholder((m, n), name="idx", dtype="int32")
    val = te.placeholder((m, n), name="val", dtype="int32")
    k = te.reduce_axis((0, n), "k")
    T0, T1 = te.compute((m,), lambda i: argmax((idx[i, k], val[i, k]), axis=k), name="T")

    # the generated IR code would be:
    s = te.create_schedule(T0.op)
    print(tvm.lower(s, [idx, val, T0, T1], simple_mode=True))


.. rst-class:: sphx-glr-script-out

.. code-block:: none

    @main = primfn(idx_1: handle, val_1: handle, T_2: handle, T_3: handle) -> ()
      attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
      buffers = {idx: Buffer(idx_2: Pointer(int32), int32, [(stride: int32*m: int32)], [], type="auto"),
                 val: Buffer(val_2: Pointer(int32), int32, [(stride_1: int32*m)], [], type="auto"),
                 T: Buffer(T_4: Pointer(int32), int32, [(stride_2: int32*m)], [], type="auto"),
                 T_1: Buffer(T_5: Pointer(int32), int32, [(stride_3: int32*m)], [], type="auto")}
      buffer_map = {idx_1: idx, val_1: val, T_2: T, T_3: T_1}
      preflattened_buffer_map = {idx_1: idx_3: Buffer(idx_2, int32, [m, n: int32], [stride, stride_4: int32], type="auto"), val_1: val_3: Buffer(val_2, int32, [m, n], [stride_1, stride_5: int32], type="auto"), T_2: T_6: Buffer(T_4, int32, [m], [stride_2], type="auto"), T_3: T_7: Buffer(T_5, int32, [m], [stride_3], type="auto")} {
      for (i: int32, 0, m) {
        T[(i*stride_2)] = -1
        T_1[(i*stride_3)] = -2147483648
        for (k: int32, 0, n) {
          T[(i*stride_2)] = @tir.if_then_else((val[((i*stride_1) + (k*stride_5))] <= T_1[(i*stride_3)]), T[(i*stride_2)], idx[((i*stride) + (k*stride_4))], dtype=int32)
          T_1[(i*stride_3)] = @tir.if_then_else((val[((i*stride_1) + (k*stride_5))] <= T_1[(i*stride_3)]), T_1[(i*stride_3)], val[((i*stride_1) + (k*stride_5))], dtype=int32)
        }
      }
    }


.. GENERATED FROM PYTHON SOURCE LINES 97-101

.. note::

  For readers who are not familiar with reduction, please refer to
  :ref:`general-reduction`.
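As a sanity check that is not part of the original example, the reduction above
can be compiled and compared against NumPy's :code:`argmax`, again assuming an
LLVM-enabled TVM build; the shapes and target are illustrative:

.. code-block:: python

    # A minimal sketch, assuming an LLVM-enabled TVM build; shapes and target
    # are illustrative. The `x[1] >= y[1]` rule in `fcombine` keeps the first
    # maximal element, which matches NumPy's argmax tie-breaking.
    func = tvm.build(s, [idx, val, T0, T1], target="llvm")
    rows, cols = 4, 7
    idx_np = np.tile(np.arange(cols, dtype="int32"), (rows, 1))
    val_np = np.random.randint(0, 100, size=(rows, cols)).astype("int32")
    t0_nd = tvm.nd.array(np.zeros((rows,), dtype="int32"))
    t1_nd = tvm.nd.array(np.zeros((rows,), dtype="int32"))
    func(tvm.nd.array(idx_np), tvm.nd.array(val_np), t0_nd, t1_nd)
    np.testing.assert_allclose(t0_nd.numpy(), np.argmax(val_np, axis=1))
    np.testing.assert_allclose(t1_nd.numpy(), np.max(val_np, axis=1))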
.. GENERATED FROM PYTHON SOURCE LINES 103-108

Schedule Operation with Tuple Inputs
------------------------------------
It is worth mentioning that although you get multiple outputs from one batch
operation, they can only be scheduled together in terms of operation.

.. GENERATED FROM PYTHON SOURCE LINES 108-121

.. code-block:: default


    n = te.var("n")
    m = te.var("m")
    A0 = te.placeholder((m, n), name="A0")
    B0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A0[i, j] * 3), name="B")
    A1 = te.placeholder((m, n), name="A1")
    C = te.compute((m, n), lambda i, j: A1[i, j] + B0[i, j], name="C")

    s = te.create_schedule(C.op)
    s[B0].compute_at(s[C], C.op.axis[0])
    # as you can see in the below generated IR code:
    print(tvm.lower(s, [A0, A1, C], simple_mode=True))


.. rst-class:: sphx-glr-script-out

.. code-block:: none

    @main = primfn(A0_1: handle, A1_1: handle, C_1: handle) -> ()
      attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
      buffers = {A0: Buffer(A0_2: Pointer(float32), float32, [(stride: int32*m: int32)], [], type="auto"),
                 A1: Buffer(A1_2: Pointer(float32), float32, [(stride_1: int32*m)], [], type="auto"),
                 C: Buffer(C_2: Pointer(float32), float32, [(stride_2: int32*m)], [], type="auto")}
      buffer_map = {A0_1: A0, A1_1: A1, C_1: C}
      preflattened_buffer_map = {A0_1: A0_3: Buffer(A0_2, float32, [m, n: int32], [stride, stride_3: int32], type="auto"), A1_1: A1_3: Buffer(A1_2, float32, [m, n], [stride_1, stride_4: int32], type="auto"), C_1: C_3: Buffer(C_2, float32, [m, n], [stride_2, stride_5: int32], type="auto")} {
      allocate(B.v0: Pointer(global float32), float32, [n]), storage_scope = global;
      allocate(B.v1: Pointer(global float32), float32, [n]), storage_scope = global;
      for (i: int32, 0, m) {
        for (j: int32, 0, n) {
          B.v0_1: Buffer(B.v0, float32, [n], [])[j] = (A0[((i*stride) + (j*stride_3))] + 2f32)
          B.v1_1: Buffer(B.v1, float32, [n], [])[j] = (A0[((i*stride) + (j*stride_3))]*3f32)
        }
        for (j_1: int32, 0, n) {
          C[((i*stride_2) + (j_1*stride_5))] = (A1[((i*stride_1) + (j_1*stride_4))] + B.v0_1[j_1])
        }
      }
    }


.. GENERATED FROM PYTHON SOURCE LINES 122-129

Summary
-------
This tutorial introduces the usage of tuple-input operations.

- Describe normal batchwise computation.
- Describe a reduction operation with tuple inputs.
- Notice that you can only schedule computation in terms of operation, not tensor.


.. _sphx_glr_download_how_to_work_with_schedules_tuple_inputs.py:

.. only:: html

  .. container:: sphx-glr-footer sphx-glr-footer-example

    .. container:: sphx-glr-download sphx-glr-download-python

      :download:`Download Python source code: tuple_inputs.py <tuple_inputs.py>`

    .. container:: sphx-glr-download sphx-glr-download-jupyter

      :download:`Download Jupyter notebook: tuple_inputs.ipynb <tuple_inputs.ipynb>`

.. only:: html

  .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery `_