# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""
.. _deploy_export_and_load_executable:

Export and Load Relax Executables
=================================

This tutorial walks through exporting a compiled Relax module to a shared
object, loading it back into the TVM runtime, and running the result either
interactively or from a standalone script. Along the way it shows how to turn
Relax (or imported PyTorch / ONNX) programs into deployable artifacts using
the ``tvm.relax`` APIs.

.. note::
   This tutorial uses PyTorch as the source format, but the export/load workflow
   is the same for ONNX models. For ONNX, use ``from_onnx(model, keep_params_in_input=True)``
   instead of ``from_exported_program()``, then follow the same steps for building,
   exporting, and loading.
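
   A minimal sketch of the ONNX path (assuming ``model`` is an already-loaded
   ``onnx.ModelProto``):

   .. code-block:: python

      from tvm import relax
      from tvm.relax.frontend.onnx import from_onnx

      mod = from_onnx(model, keep_params_in_input=True)
      mod, params = relax.frontend.detach_params(mod)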
"""

######################################################################
# Introduction
# ------------
# TVM builds Relax programs into ``tvm.runtime.Executable`` objects. These
# contain VM bytecode, compiled kernels, and constants. By exporting the
# executable with :py:meth:`export_library`, you obtain a shared library (for
# example ``.so`` on Linux) that can be shipped to another machine, uploaded
# via RPC, or loaded back later with the TVM runtime. This tutorial shows the
# exact steps end-to-end and explains what files are produced along the way.

import os
from pathlib import Path

try:
    import torch
    from torch.export import export
except ImportError:  # pragma: no cover
    torch = None  # type: ignore
    export = None  # type: ignore


######################################################################
# Prepare a Torch MLP and Convert to Relax
# ----------------------------------------
# We start with a small PyTorch MLP so the example remains lightweight. The
# model is exported to a :py:class:`torch.export.ExportedProgram` and then
# translated into a Relax ``IRModule``.

import tvm
from tvm import relax
from tvm.relax.frontend.torch import from_exported_program

# Check dependencies first
IS_IN_CI = os.getenv("CI", "").lower() == "true"
HAS_TORCH = torch is not None
RUN_EXAMPLE = HAS_TORCH and not IS_IN_CI


if HAS_TORCH:

    class TorchMLP(torch.nn.Module):
        def __init__(self) -> None:
            super().__init__()
            self.net = torch.nn.Sequential(
                torch.nn.Flatten(),
                torch.nn.Linear(28 * 28, 128),
                torch.nn.ReLU(),
                torch.nn.Linear(128, 10),
            )

        def forward(self, data: torch.Tensor) -> torch.Tensor:  # type: ignore[override]
            return self.net(data)

else:  # pragma: no cover
    TorchMLP = None  # type: ignore[misc, assignment]

if not RUN_EXAMPLE:
    print("Skip model conversion because PyTorch is unavailable or we are in CI.")

if RUN_EXAMPLE:
    torch_model = TorchMLP().eval()
    example_args = (torch.randn(1, 1, 28, 28, dtype=torch.float32),)

    with torch.no_grad():
        exported_program = export(torch_model, example_args)

    mod = from_exported_program(exported_program, keep_params_as_input=True)

    # Separate model parameters so they can be bound later (or stored on disk).
    mod, params = relax.frontend.detach_params(mod)

    print("Imported Relax module:")
    mod.show()
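
    # ``detach_params`` returns a dict keyed by Relax function name; each value
    # is the list of parameter tensors in the positional order that function
    # expects at call time.
    print("Number of parameters for 'main':", len(params["main"]))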


######################################################################
# Build and Export with ``export_library``
# ----------------------------------------
# We build for ``llvm`` to generate CPU code and then export the resulting
# executable. Passing ``workspace_dir`` keeps the intermediate packaging files,
# which is useful to inspect what was produced.

TARGET = tvm.target.Target("llvm")
ARTIFACT_DIR = Path("relax_export_artifacts")
ARTIFACT_DIR.mkdir(exist_ok=True)

if RUN_EXAMPLE:
    # Apply the default Relax compilation pipeline before building.
    pipeline = relax.get_pipeline()
    with TARGET:
        built_mod = pipeline(mod)

    # Build without embedded parameters; we will pass them at runtime.
    executable = relax.build(built_mod, target=TARGET)

    library_path = ARTIFACT_DIR / "mlp_cpu.so"
    executable.export_library(str(library_path), workspace_dir=str(ARTIFACT_DIR))

    print(f"Exported runtime library to: {library_path}")

    # The workspace directory now contains the shared object and supporting files.
    produced_files = sorted(p.name for p in ARTIFACT_DIR.iterdir())
    print("Artifacts saved:")
    for name in produced_files:
        print(f"  - {name}")

    # Generated files:
    #   - ``mlp_cpu.so``: The main deployable shared library containing VM bytecode,
    #     compiled kernels, and constants. Note: Since parameters are passed at runtime,
    #     you will also need to save a separate parameters file (see next section).
    #   - Intermediate object files (``devc.o``, ``lib0.o``, etc.) are kept in the
    #     workspace for inspection but are not required for deployment.
    #
    #   Note: Additional files like ``*.params``, ``*.metadata.json``, or ``*.imports``
    #   may appear in specific configurations but are typically embedded into the
    #   shared library or only generated when needed.
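
    # A quick smoke test before shipping: confirm the exported library loads
    # back on the build machine (loading and running it is covered next).
    _reloaded = tvm.runtime.load_module(str(library_path))
    print("Reload check:", type(_reloaded).__name__)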


######################################################################
# Load the Exported Library and Run It
# ------------------------------------
# Once the shared object is produced, we can reload it back into the TVM runtime
# on any machine with a compatible instruction set. The Relax VM consumes the
# runtime module directly.

if RUN_EXAMPLE:
    loaded_rt_mod = tvm.runtime.load_module(str(library_path))
    dev = tvm.cpu(0)
    vm = relax.VirtualMachine(loaded_rt_mod, dev)

    # Prepare input data
    input_tensor = torch.randn(1, 1, 28, 28, dtype=torch.float32)
    vm_input = tvm.runtime.tensor(input_tensor.numpy(), dev)

    # Prepare parameters (allocate on target device)
    vm_params = [tvm.runtime.tensor(p, dev) for p in params["main"]]

    # Run inference: pass input data followed by all parameters
    tvm_output = vm["main"](vm_input, *vm_params)

    # Tuple outputs come back as an indexable container rather than a Python
    # tuple or list, so duck-type on ``numpy``: a bare tensor has it, a
    # container does not. Models imported from PyTorch typically return tuples
    # (even for single outputs); ONNX models may return a single tensor.
    result_tensor = tvm_output if hasattr(tvm_output, "numpy") else tvm_output[0]

    print("VM output shape:", result_tensor.shape)
    print("VM output type:", type(tvm_output), "->", type(result_tensor))

    # You can still inspect the executable after reloading.
    print("Executable stats:\n", loaded_rt_mod["stats"]())


######################################################################
# Save Parameters for Deployment
# -------------------------------
# Since parameters are passed at runtime (not embedded in the ``.so``), we must
# save them separately for deployment. This is a required step to use the model
# on other machines or in standalone scripts.

import numpy as np

if RUN_EXAMPLE:
    # Save parameters to disk
    params_path = ARTIFACT_DIR / "model_params.npz"
    param_arrays = {f"p_{i}": p.numpy() for i, p in enumerate(params["main"])}
    np.savez(str(params_path), **param_arrays)
    print(f"Saved parameters to: {params_path}")

# Note: Alternatively, you can embed parameters directly into the ``.so`` to
# create a single-file deployment. Use ``keep_params_as_input=False`` when
# importing from PyTorch:
#
# .. code-block:: python
#
#    mod = from_exported_program(exported_program, keep_params_as_input=False)
#    # Parameters are now embedded as constants in the module
#    with TARGET:
#        built_mod = relax.get_pipeline()(mod)
#    executable = relax.build(built_mod, target=TARGET)
#    # At runtime: vm["main"](input_tensor)  # no separate params to pass
#
# This creates a single-file deployment (only the ``.so`` is needed), but you
# lose the flexibility to swap parameters without recompiling. For most
# production workflows, separating code and parameters (as shown above) is
# preferred for flexibility.


######################################################################
# Loading and Running the Exported Model
# --------------------------------------
# To use the exported model on another machine or in a standalone script, load
# both the ``.so`` library and the parameters file. The complete example below
# can be saved as ``run_mlp.py``.
#
# To make it executable from the command line:
#
# .. code-block:: bash
#
#    chmod +x run_mlp.py
#    ./run_mlp.py  # Run it like a regular program
#
# Complete script:
#
# .. code-block:: python
#
#    #!/usr/bin/env python3
#    import numpy as np
#    import tvm
#    from tvm import relax
#
#    # Step 1: Load the compiled library
#    lib = tvm.runtime.load_module("relax_export_artifacts/mlp_cpu.so")
#
#    # Step 2: Create Virtual Machine
#    device = tvm.cpu(0)
#    vm = relax.VirtualMachine(lib, device)
#
#    # Step 3: Load parameters from the .npz file
#    params_npz = np.load("relax_export_artifacts/model_params.npz")
#    params = [tvm.runtime.tensor(params_npz[f"p_{i}"], device)
#              for i in range(len(params_npz.files))]
#
#    # Step 4: Prepare input data
#    data = np.random.randn(1, 1, 28, 28).astype("float32")
#    input_tensor = tvm.runtime.tensor(data, device)
#
#    # Step 5: Run inference (pass input followed by all parameters)
#    output = vm["main"](input_tensor, *params)
#
#    # Step 6: Extract result (output may be a tuple container or single tensor)
#    # PyTorch models typically return tuples; ONNX models may return one tensor
#    result = output if hasattr(output, "numpy") else output[0]
#
#    print("Prediction shape:", result.shape)
#    print("Predicted class:", np.argmax(result.numpy()))
#
# **Running on GPU:**
# To run on GPU instead of CPU, make the following changes:
#
# 1. **Compile for GPU** (in the "Build and Export" section above):
#
#    .. code-block:: python
#
#       TARGET = tvm.target.Target("cuda")  # Change from "llvm" to "cuda"
#
# 2. **Use GPU device in the script**:
#
#    .. code-block:: python
#
#       device = tvm.cuda(0)  # Use CUDA device instead of CPU
#       vm = relax.VirtualMachine(lib, device)
#
#       # Load parameters to GPU
#       params = [tvm.runtime.tensor(params_npz[f"p_{i}"], device)  # Note: device parameter
#                 for i in range(len(params_npz))]
#
#       # Prepare input on GPU
#       input_tensor = tvm.runtime.tensor(data, device)  # Note: device parameter
#
#    The rest of the script remains the same. All tensors (parameters and inputs)
#    must be allocated on the same device (GPU) as the compiled model.
#
# **Deployment Checklist:**
# When moving to another host (via RPC or SCP), you must copy **both** files:
#   1. ``mlp_cpu.so`` (or ``mlp_cuda.so`` for GPU) - The compiled model code
#   2. ``model_params.npz`` - The model parameters (serialized as NumPy arrays)
#
# The remote machine needs both files in the same directory. The script above
# assumes they are in ``relax_export_artifacts/`` relative to the script location.
# Adjust the paths as needed for your deployment. For GPU deployment, ensure the
# target machine has compatible CUDA drivers and the model was compiled for the
# same GPU architecture.
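#
# A tiny pre-flight check before copying (a sketch using this tutorial's paths):
#
# .. code-block:: python
#
#    from pathlib import Path
#
#    for name in ["mlp_cpu.so", "model_params.npz"]:
#        path = Path("relax_export_artifacts") / name
#        assert path.exists(), f"missing deployment artifact: {path}"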


######################################################################
# Deploying to Remote Devices
# ---------------------------
# To deploy the exported model to a remote ARM Linux device (e.g., Raspberry Pi),
# you can use TVM's RPC mechanism to cross-compile, upload, and run the model
# remotely. This workflow is useful when:
#
# - The target device has limited resources for compilation
# - You want to fine-tune performance by running on the actual hardware
# - You need to deploy to embedded devices
#
# See :doc:`cross_compilation_and_rpc </how_to/tutorials/cross_compilation_and_rpc>`
# for a comprehensive guide on:
#
# - Setting up TVM runtime on the remote device
# - Starting an RPC server on the device
# - Cross-compiling for ARM targets (e.g., ``llvm -mtriple=aarch64-linux-gnu``)
# - Uploading exported libraries via RPC
# - Running inference remotely
#
# Quick example for ARM deployment workflow:
#
# .. code-block:: python
#
#    import tvm.rpc as rpc
#    from tvm import relax
#
#    # Step 1: Cross-compile for ARM target (on local machine)
#    TARGET = tvm.target.Target("llvm -mtriple=aarch64-linux-gnu")
#    executable = relax.build(built_mod, target=TARGET)
#    # Link with a cross toolchain so the .so actually targets ARM
#    executable.export_library("mlp_arm.so", cc="aarch64-linux-gnu-g++")
#
#    # Step 2: Connect to remote device RPC server
#    remote = rpc.connect("192.168.1.100", 9090)  # Device IP and RPC port
#
#    # Step 3: Upload the compiled library and parameters
#    remote.upload("mlp_arm.so")
#    remote.upload("model_params.npz")
#
#    # Step 4: Load and run on remote device
#    lib = remote.load_module("mlp_arm.so")
#    vm = relax.VirtualMachine(lib, remote.cpu())
#    # ... prepare input and params, then run inference
#
# The key difference is using an ARM target triple during compilation and
# uploading files via RPC instead of copying them directly.
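#
# A hedged sketch of the elided last step above, reusing the ``p_0``, ``p_1``,
# ... key convention from the standalone script; creating a tensor with a
# remote device handle copies the data to the device over RPC:
#
# .. code-block:: python
#
#    import numpy as np
#
#    dev = remote.cpu()
#    params_npz = np.load("model_params.npz")
#    params = [tvm.runtime.tensor(params_npz[f"p_{i}"], dev)
#              for i in range(len(params_npz.files))]
#    data = tvm.runtime.tensor(
#        np.random.randn(1, 1, 28, 28).astype("float32"), dev)
#    output = vm["main"](data, *params)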


######################################################################
# FAQ
# ---
# **Can I run the ``.so`` as a standalone executable (like ``./mlp_cpu.so``)?**
#     No. The ``.so`` file is a shared library, not a standalone executable binary.
#     You cannot run it directly from the terminal. It must be loaded through a TVM
#     runtime program (as shown in the "Loading and Running" section above). The
#     ``.so`` bundles VM bytecode and compiled kernels, but still requires the TVM
#     runtime to execute.
#
# **Which devices can run the exported library?**
#     The machine must match the ISA of the target you compiled for (plain
#     ``llvm``, i.e. the host CPU, in this example). As long as the target
#     triple, runtime ABI, and available devices line up, you can move the
#     artifact between machines. For heterogeneous builds (CPU plus GPU), ship
#     the extra device libraries as well.
#
# **What about the ``.params`` and ``metadata.json`` files?**
#     These auxiliary files are only generated in specific configurations. In this
#     tutorial, since we pass parameters at runtime, they are not generated. When
#     they do appear, they may be kept alongside the ``.so`` for inspection, but
#     the essential content is typically embedded in the shared object itself, so
#     deploying the ``.so`` alone is usually sufficient.
