# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""
.. _tutorial-bring-your-own-codegen:

Bring Your Own Codegen: NPU Backend Example
===========================================

This tutorial shows how to integrate a custom hardware backend with TVM's
BYOC (Bring Your Own Codegen) framework, using the bundled example NPU
backend (a CPU emulation; no real hardware is required) as the working
example.  It covers the key concepts needed to offload operations to a
custom accelerator: pattern registration, graph partitioning, codegen, and
runtime dispatch.

NPUs are purpose-built accelerators designed around a fixed set of operations
common in neural network inference, such as matrix multiplication, convolution,
and activation functions.
The example backend's runtime is a *stub*: it logs the dispatch decisions an
NPU would make (memory tier, execution engine, fusion) but performs no real
computation, so output buffers are uninitialized.  Assertions in this tutorial
therefore check shapes, not values.  When you replace the runtime with your
hardware SDK calls, the same flow produces real results.

**Prerequisites**: Build TVM with ``USE_EXAMPLE_NPU_CODEGEN=ON`` and
``USE_EXAMPLE_NPU_RUNTIME=ON``.
"""

######################################################################
# Overview of the BYOC Flow
# -------------------------
#
# The BYOC framework lets you plug a custom backend into TVM's compilation
# pipeline in four steps:
#
# 1. **Register patterns** - describe which sequences of Relax ops the
#    backend can handle.
# 2. **Partition the graph** - group matched ops into composite functions.
# 3. **Run codegen** - lower composite functions to backend-specific
#    representation (JSON graph for the example NPU).
# 4. **Execute** - the runtime dispatches composite functions to the
#    registered backend runtime.
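#
# These passes compose like any other Relax transforms; as a preview of the
# code run in Steps 3 and 4 below, the whole flow can be written as a single
# ``tvm.transform.Sequential`` (a sketch using the names defined later in
# this tutorial)::
#
#     pipeline = tvm.transform.Sequential(
#         [
#             FuseOpsByPattern(patterns, bind_constants=False, annotate_codegen=True),
#             MergeCompositeFunctions(),
#             RunCodegen(),
#         ]
#     )
#     mod = pipeline(mod)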

######################################################################
# Step 1: Import the backend to register its patterns
# ---------------------------------------------------
#
# Importing the module is enough to register all supported patterns with
# TVM's pattern registry.
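#
# Backend pattern files such as ``patterns.py`` typically build dataflow
# patterns with ``tvm.relax.dpl`` and register them under the backend's
# prefix.  A rough, illustrative sketch of such a registration (not the
# exact contents of ``patterns.py``; the pattern name and entry format are
# simplified)::
#
#     from tvm.relax.dpl import is_op, wildcard
#     from tvm.relax.backend.pattern_registry import register_patterns
#
#     matmul_pat = is_op("relax.matmul")(wildcard(), wildcard())
#     register_patterns([("example_npu.matmul", matmul_pat)])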

import numpy as np

import tvm
import tvm.relax.backend.contrib.example_npu  # registers patterns
from tvm import relax
from tvm.relax.backend.pattern_registry import get_patterns_with_prefix
from tvm.relax.transform import FuseOpsByPattern, MergeCompositeFunctions, RunCodegen
from tvm.script import relax as R

has_example_npu_codegen = tvm.get_global_func("relax.ext.example_npu", True)
has_example_npu_runtime = tvm.get_global_func("runtime.ExampleNPUJSONRuntimeCreate", True)
has_example_npu = has_example_npu_codegen and has_example_npu_runtime

target = tvm.target.Target("llvm")

patterns = get_patterns_with_prefix("example_npu")
print("Registered patterns:", [p.name for p in patterns])

######################################################################
# Step 2: Define a model
# ----------------------
#
# We use a simple MatMul + ReLU module to illustrate the flow.


@tvm.script.ir_module
class MatmulReLU:
    @R.function
    def main(
        x: R.Tensor((2, 4), "float32"),
        w: R.Tensor((4, 8), "float32"),
    ) -> R.Tensor((2, 8), "float32"):
        with R.dataflow():
            y = relax.op.matmul(x, w)
            z = relax.op.nn.relu(y)
            R.output(z)
        return z


######################################################################
# Step 3: Partition the graph
# ---------------------------
#
# ``FuseOpsByPattern`` groups ops that match a registered pattern into
# composite functions, controlled by two flags:
#
# - ``bind_constants=False`` keeps weights as function arguments instead
#   of baking them in, so the host stays in charge of parameter
#   ownership.
# - ``annotate_codegen=True`` tags each composite with its backend name
#   (``example_npu``); without this tag, ``RunCodegen`` has no way to
#   route the composite to a backend.
#
# ``MergeCompositeFunctions`` then groups adjacent composites that
# target the same backend, so each group becomes a single external
# call.  Note the distinction between composites and groups: an
# ``op_a + op_b`` chain collapses into a single *composite* only if a
# fused pattern (e.g. ``matmul_relu_fused``) was registered for it;
# otherwise each op remains its own composite, although
# ``MergeCompositeFunctions`` can still place both in one group and
# hand them to the backend as a single call.

mod = MatmulReLU
mod = FuseOpsByPattern(patterns, bind_constants=False, annotate_codegen=True)(mod)
mod = MergeCompositeFunctions()(mod)
print("After partitioning:")
print(mod)
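
######################################################################
# In the printed module, each offloaded group appears as a function
# annotated with ``R.func_attr({"Codegen": "example_npu"})``; inside it,
# every matched pattern is a nested function annotated with ``"Composite"``
# set to the pattern name.  The ``Codegen`` attribute is what ``RunCodegen``
# uses in the next step to decide which functions to hand to the backend.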

######################################################################
# Step 4: Run codegen
# -------------------
#
# ``RunCodegen`` lowers each annotated composite function to the backend's
# serialization format.  For the example NPU this produces a JSON graph
# that the C++ runtime can execute.
#
# Steps 4 and 5 require TVM to be built with ``USE_EXAMPLE_NPU_CODEGEN=ON``
# and ``USE_EXAMPLE_NPU_RUNTIME=ON``.

if has_example_npu:
    mod = RunCodegen()(mod)
    print("After codegen:")
    print(mod)
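
    # After codegen, the generated backend artifacts are attached to the
    # IRModule as the "external_mods" attribute (a list of runtime.Module).
    # Printing it is a quick sanity check that codegen produced something for
    # the example NPU backend.
    ext_mods = mod.get_attr("external_mods")
    if ext_mods is not None:
        print("External modules:", [m.type_key for m in ext_mods])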

    ######################################################################
    # Step 5: Build and run
    # ---------------------
    #
    # Build the module for the host target, create a virtual machine, and
    # execute the compiled function.

    np.random.seed(0)
    x_np = np.random.randn(2, 4).astype("float32")
    w_np = np.random.randn(4, 8).astype("float32")

    with tvm.transform.PassContext(opt_level=3):
        built = relax.build(mod, target)

    vm = relax.VirtualMachine(built, tvm.cpu())
    result = vm["main"](tvm.runtime.tensor(x_np, tvm.cpu()), tvm.runtime.tensor(w_np, tvm.cpu()))

    assert result.numpy().shape == (2, 8)
    print("Execution completed. Output shape:", result.numpy().shape)

######################################################################
# Step 6: Conv2D + ReLU
# ---------------------
#
# The same flow applies to convolution workloads.  ``patterns.py``
# registers a fused ``conv2d + relu`` pattern in addition to the
# standalone ``conv2d`` pattern; because the fused pattern takes
# priority during matching, both ops are offloaded as a single
# composite function.


@tvm.script.ir_module
class Conv2dReLU:
    @R.function
    def main(
        x: R.Tensor((1, 3, 32, 32), "float32"),
        w: R.Tensor((16, 3, 3, 3), "float32"),
    ) -> R.Tensor((1, 16, 30, 30), "float32"):
        with R.dataflow():
            y = relax.op.nn.conv2d(x, w)
            z = relax.op.nn.relu(y)
            R.output(z)
        return z


if has_example_npu:
    mod2 = Conv2dReLU
    mod2 = FuseOpsByPattern(patterns, bind_constants=False, annotate_codegen=True)(mod2)
    mod2 = MergeCompositeFunctions()(mod2)
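
    # Printing the partitioned module confirms that conv2d and relu were
    # matched together and offloaded as a single composite function.
    print("Conv2dReLU after partitioning:")
    print(mod2)
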
    mod2 = RunCodegen()(mod2)

    with tvm.transform.PassContext(opt_level=3):
        built2 = relax.build(mod2, target)

    x2_np = np.random.randn(1, 3, 32, 32).astype("float32")
    w2_np = np.random.randn(16, 3, 3, 3).astype("float32")

    vm2 = relax.VirtualMachine(built2, tvm.cpu())
    result2 = vm2["main"](
        tvm.runtime.tensor(x2_np, tvm.cpu()), tvm.runtime.tensor(w2_np, tvm.cpu())
    )
    assert result2.numpy().shape == (1, 16, 30, 30)
    print("Conv2dReLU output shape:", result2.numpy().shape)

######################################################################
# Next steps
# ----------
#
# To build a real NPU backend using this example as a starting point:
#
# - Replace ``example_npu_runtime.cc`` with your hardware SDK calls.
# - Extend ``patterns.py`` with the ops your hardware supports.
# - Add a C++ codegen under ``src/relax/backend/contrib/`` if your
#   hardware requires a non-JSON serialization format.
# - Add your cmake module under ``cmake/modules/contrib/`` following
#   the pattern in ``cmake/modules/contrib/ExampleNPU.cmake``.
